sheetmask 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sheetmask-0.1.0/.gitignore +9 -0
- sheetmask-0.1.0/PKG-INFO +114 -0
- sheetmask-0.1.0/README.md +99 -0
- sheetmask-0.1.0/docs/plans/2026-02-20-excel-anonymizer.md +997 -0
- sheetmask-0.1.0/pyproject.toml +37 -0
- sheetmask-0.1.0/src/sheetmask/__init__.py +3 -0
- sheetmask-0.1.0/src/sheetmask/analyzer.py +211 -0
- sheetmask-0.1.0/src/sheetmask/cli.py +229 -0
- sheetmask-0.1.0/src/sheetmask/entity_mapper.py +105 -0
- sheetmask-0.1.0/src/sheetmask/executor.py +250 -0
- sheetmask-0.1.0/src/sheetmask/filename_parser.py +334 -0
- sheetmask-0.1.0/src/sheetmask/multi_analyzer.py +349 -0
- sheetmask-0.1.0/src/sheetmask/rules.py +115 -0
- sheetmask-0.1.0/tests/__init__.py +0 -0
- sheetmask-0.1.0/tests/create_fixtures.py +236 -0
- sheetmask-0.1.0/tests/fixtures/2024-Q4 Team Roster.xlsx +0 -0
- sheetmask-0.1.0/tests/fixtures/Dec-24 Revenue Report.xlsx +0 -0
- sheetmask-0.1.0/tests/fixtures/revenue_report_config.py +39 -0
- sheetmask-0.1.0/tests/fixtures/team_roster_config.py +22 -0
- sheetmask-0.1.0/tests/test_cli.py +126 -0
- sheetmask-0.1.0/tests/test_entity_mapper.py +45 -0
- sheetmask-0.1.0/tests/test_filename_parser.py +25 -0
- sheetmask-0.1.0/tests/test_integration.py +412 -0
- sheetmask-0.1.0/tests/test_rules.py +73 -0
- sheetmask-0.1.0/uv.lock +642 -0
sheetmask-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sheetmask
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI for creating PII-safe Excel test fixtures
|
|
5
|
+
Project-URL: Repository, https://github.com/daniel-butler/sheetmask
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: faker>=20.0.0
|
|
9
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
10
|
+
Requires-Dist: pandas>=2.0.0
|
|
11
|
+
Requires-Dist: pyxlsb>=1.0.10
|
|
12
|
+
Requires-Dist: rich>=13.0.0
|
|
13
|
+
Requires-Dist: typer>=0.15.0
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# sheetmask
|
|
17
|
+
|
|
18
|
+
Turn a real Excel file into a safe test fixture — fake names, fake numbers, real structure.
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install git+https://github.com/daniel-butler/sheetmask.git
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
uv add git+https://github.com/daniel-butler/sheetmask.git
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quickstart
|
|
31
|
+
|
|
32
|
+
1. Run `analyze` on your file. It prints a prompt describing the columns and sample data — copy it.
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
sheetmask analyze "Q4 Expense Report.xlsx"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
2. Paste the prompt into Claude or ChatGPT. Save the config it returns:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
# q4_expense_config.py
|
|
42
|
+
from sheetmask import PercentageVarianceRule, PreserveRelationshipRule
|
|
43
|
+
|
|
44
|
+
config = {
|
|
45
|
+
"version": "1.0.0",
|
|
46
|
+
"sheets_to_keep": ["Expenses"],
|
|
47
|
+
"entity_columns": {
|
|
48
|
+
"Employee Name": "PERSON",
|
|
49
|
+
"Department": "ORGANIZATION",
|
|
50
|
+
"Manager": "PERSON",
|
|
51
|
+
},
|
|
52
|
+
"numeric_rules": {
|
|
53
|
+
"Reimbursement": PercentageVarianceRule(variance_pct=0.2),
|
|
54
|
+
"Net Amount": PreserveRelationshipRule(
|
|
55
|
+
formula="context['Reimbursement'] - context['Deduction']",
|
|
56
|
+
dependent_columns=["Reimbursement", "Deduction"],
|
|
57
|
+
),
|
|
58
|
+
},
|
|
59
|
+
"preserve_columns": ["Date", "Category"],
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
3. Run `process`. The output lands beside the original.
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
sheetmask process "Q4 Expense Report.xlsx" --config q4_expense_config.py
|
|
67
|
+
# Output: Q4 Expense Report_SYNTHETIC.xlsx
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Reference
|
|
71
|
+
|
|
72
|
+
### Entity types
|
|
73
|
+
|
|
74
|
+
Each unique value maps to the same fake value throughout the file, so relationships between rows stay intact.
|
|
75
|
+
|
|
76
|
+
| Type | Generates |
|
|
77
|
+
|------|-----------|
|
|
78
|
+
| `PERSON` | Full name |
|
|
79
|
+
| `PERSON_FIRST_NAME` | First name only |
|
|
80
|
+
| `PERSON_LAST_NAME` | Last name only |
|
|
81
|
+
| `ORGANIZATION` | Company name |
|
|
82
|
+
| `EMAIL_ADDRESS` | Email address |
|
|
83
|
+
| `PHONE_NUMBER` | Phone number |
|
|
84
|
+
| `PROJECT_NAME` | Project name |
|
|
85
|
+
| `LOCATION` | City, State |
|
|
86
|
+
|
|
87
|
+
### Numeric rules
|
|
88
|
+
|
|
89
|
+
**`PercentageVarianceRule`** replaces each value with a random number within ±`variance_pct` of the original, where `variance_pct` is a fraction (`0.15` means ±15%). Use it for independent figures.
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
"Headcount": PercentageVarianceRule(variance_pct=0.15)
|
|
93
|
+
# 100 becomes a random number between 85 and 115.
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**`PreserveRelationshipRule`** derives a value from other already-anonymized columns. Use it wherever one column is computed from others, so the arithmetic stays consistent.
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
"Gross Margin": PreserveRelationshipRule(
|
|
100
|
+
formula="context['Revenue'] - context['Cost']",
|
|
101
|
+
dependent_columns=["Revenue", "Cost"],
|
|
102
|
+
)
|
|
103
|
+
# Gross Margin will always equal anonymized Revenue minus anonymized Cost.
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### All commands
|
|
107
|
+
|
|
108
|
+
| Command | Description |
|
|
109
|
+
|---------|-------------|
|
|
110
|
+
| `sheetmask analyze <file>` | Analyze file and print LLM prompt |
|
|
111
|
+
| `sheetmask analyze <file> -o prompt.txt` | Save LLM prompt to a file |
|
|
112
|
+
| `sheetmask analyze-multi f1 f2 f3` | Analyze multiple files for shared schema patterns |
|
|
113
|
+
| `sheetmask process <file> --config config.py` | Anonymize file using config |
|
|
114
|
+
| `sheetmask process <file> out.xlsx --config config.py --seed 42` | Write to named output with fixed random seed |
|
|
sheetmask-0.1.0/README.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# sheetmask
|
|
2
|
+
|
|
3
|
+
Turn a real Excel file into a safe test fixture — fake names, fake numbers, real structure.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install git+https://github.com/daniel-butler/sheetmask.git
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv add git+https://github.com/daniel-butler/sheetmask.git
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Quickstart
|
|
16
|
+
|
|
17
|
+
1. Run `analyze` on your file. It prints a prompt describing the columns and sample data — copy it.
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
sheetmask analyze "Q4 Expense Report.xlsx"
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
2. Paste the prompt into Claude or ChatGPT. Save the config it returns:
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
# q4_expense_config.py
|
|
27
|
+
from sheetmask import PercentageVarianceRule, PreserveRelationshipRule
|
|
28
|
+
|
|
29
|
+
config = {
|
|
30
|
+
"version": "1.0.0",
|
|
31
|
+
"sheets_to_keep": ["Expenses"],
|
|
32
|
+
"entity_columns": {
|
|
33
|
+
"Employee Name": "PERSON",
|
|
34
|
+
"Department": "ORGANIZATION",
|
|
35
|
+
"Manager": "PERSON",
|
|
36
|
+
},
|
|
37
|
+
"numeric_rules": {
|
|
38
|
+
"Reimbursement": PercentageVarianceRule(variance_pct=0.2),
|
|
39
|
+
"Net Amount": PreserveRelationshipRule(
|
|
40
|
+
formula="context['Reimbursement'] - context['Deduction']",
|
|
41
|
+
dependent_columns=["Reimbursement", "Deduction"],
|
|
42
|
+
),
|
|
43
|
+
},
|
|
44
|
+
"preserve_columns": ["Date", "Category"],
|
|
45
|
+
}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
3. Run `process`. The output lands beside the original.
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
sheetmask process "Q4 Expense Report.xlsx" --config q4_expense_config.py
|
|
52
|
+
# Output: Q4 Expense Report_SYNTHETIC.xlsx
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Reference
|
|
56
|
+
|
|
57
|
+
### Entity types
|
|
58
|
+
|
|
59
|
+
Each unique value maps to the same fake value throughout the file, so relationships between rows stay intact.
|
|
60
|
+
|
|
61
|
+
| Type | Generates |
|
|
62
|
+
|------|-----------|
|
|
63
|
+
| `PERSON` | Full name |
|
|
64
|
+
| `PERSON_FIRST_NAME` | First name only |
|
|
65
|
+
| `PERSON_LAST_NAME` | Last name only |
|
|
66
|
+
| `ORGANIZATION` | Company name |
|
|
67
|
+
| `EMAIL_ADDRESS` | Email address |
|
|
68
|
+
| `PHONE_NUMBER` | Phone number |
|
|
69
|
+
| `PROJECT_NAME` | Project name |
|
|
70
|
+
| `LOCATION` | City, State |
|
|
71
|
+
|
|
72
|
+
### Numeric rules
|
|
73
|
+
|
|
74
|
+
**`PercentageVarianceRule`** replaces each value with a random number within ±`variance_pct` of the original, where `variance_pct` is a fraction (`0.15` means ±15%). Use it for independent figures.
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
"Headcount": PercentageVarianceRule(variance_pct=0.15)
|
|
78
|
+
# 100 becomes a random number between 85 and 115.
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**`PreserveRelationshipRule`** derives a value from other already-anonymized columns. Use it wherever one column is computed from others, so the arithmetic stays consistent.
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
"Gross Margin": PreserveRelationshipRule(
|
|
85
|
+
formula="context['Revenue'] - context['Cost']",
|
|
86
|
+
dependent_columns=["Revenue", "Cost"],
|
|
87
|
+
)
|
|
88
|
+
# Gross Margin will always equal anonymized Revenue minus anonymized Cost.
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### All commands
|
|
92
|
+
|
|
93
|
+
| Command | Description |
|
|
94
|
+
|---------|-------------|
|
|
95
|
+
| `sheetmask analyze <file>` | Analyze file and print LLM prompt |
|
|
96
|
+
| `sheetmask analyze <file> -o prompt.txt` | Save LLM prompt to a file |
|
|
97
|
+
| `sheetmask analyze-multi f1 f2 f3` | Analyze multiple files for shared schema patterns |
|
|
98
|
+
| `sheetmask process <file> --config config.py` | Anonymize file using config |
|
|
99
|
+
| `sheetmask process <file> out.xlsx --config config.py --seed 42` | Write to named output with fixed random seed |
|