cloakdata 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cloakdata-1.0.0/PKG-INFO +190 -0
- cloakdata-1.0.0/README.md +179 -0
- cloakdata-1.0.0/pyproject.toml +20 -0
- cloakdata-1.0.0/setup.cfg +4 -0
- cloakdata-1.0.0/src/cloakdata/__init__.py +4 -0
- cloakdata-1.0.0/src/cloakdata/core.py +699 -0
- cloakdata-1.0.0/src/cloakdata/py.typed +0 -0
- cloakdata-1.0.0/src/cloakdata/validate.py +39 -0
- cloakdata-1.0.0/src/cloakdata.egg-info/PKG-INFO +190 -0
- cloakdata-1.0.0/src/cloakdata.egg-info/SOURCES.txt +11 -0
- cloakdata-1.0.0/src/cloakdata.egg-info/dependency_links.txt +1 -0
- cloakdata-1.0.0/src/cloakdata.egg-info/requires.txt +2 -0
- cloakdata-1.0.0/src/cloakdata.egg-info/top_level.txt +1 -0
cloakdata-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cloakdata
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A lightweight library for anonymizing tabular datasets using Polars
|
|
5
|
+
Author: Jeferson Peter
|
|
6
|
+
Keywords: anonymization,data privacy,polars,etl,data masking
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: polars>=1.31.0
|
|
10
|
+
Requires-Dist: loguru>=0.7.3
|
|
11
|
+
|
|
12
|
+
# Data Anonymizer Script
|
|
13
|
+
|
|
14
|
+
This project is designed to anonymize sensitive data using configurable methods in Polars.
|
|
15
|
+
|
|
16
|
+
## 📦 Features
|
|
17
|
+
|
|
18
|
+
- Full masking
|
|
19
|
+
- Email masking
|
|
20
|
+
- Phone number masking
|
|
21
|
+
- Replace with static values
|
|
22
|
+
- Replace by substring or dictionary
|
|
23
|
+
- Sequential numeric and alphabetical replacement
|
|
24
|
+
- Truncation
|
|
25
|
+
- Initials extraction
|
|
26
|
+
- Age and date generalization
|
|
27
|
+
- Random choice substitution
|
|
28
|
+
- Fake numeric generation
|
|
29
|
+
- Column shuffling
|
|
30
|
+
- Date offset
|
|
31
|
+
- Conditional anonymization
|
|
32
|
+
|
|
33
|
+
## ⚙️ How it works
|
|
34
|
+
|
|
35
|
+
1. The script reads a CSV file into a Polars DataFrame.
|
|
36
|
+
2. It loads a JSON config describing which columns to anonymize and how.
|
|
37
|
+
3. Each rule is applied and the resulting DataFrame is written to output.
|
|
38
|
+
|
|
39
|
+
## 🧪 Example Config
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"columns": {
|
|
44
|
+
"name": "initials_only",
|
|
45
|
+
"email": "mask_email",
|
|
46
|
+
"phone": "mask_number",
|
|
47
|
+
"cpf": {
|
|
48
|
+
"method": "replace_with_fake",
|
|
49
|
+
"params": {
|
|
50
|
+
"digits": 11
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"username": {
|
|
54
|
+
"method": "replace_by_contains",
|
|
55
|
+
"params": {
|
|
56
|
+
"mapping": {
|
|
57
|
+
"admin": "user",
|
|
58
|
+
"root": "guest"
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"status": {
|
|
63
|
+
"method": "replace_by_dict",
|
|
64
|
+
"params": {
|
|
65
|
+
"mapping": {
|
|
66
|
+
"active": "A",
|
|
67
|
+
"inactive": "I"
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
"id_seq": {
|
|
72
|
+
"method": "sequential_numeric",
|
|
73
|
+
"params": {
|
|
74
|
+
"prefix": "ID"
|
|
75
|
+
}
|
|
76
|
+
},
|
|
77
|
+
"ref_code": {
|
|
78
|
+
"method": "sequential_alpha",
|
|
79
|
+
"params": {
|
|
80
|
+
"prefix": "REF"
|
|
81
|
+
}
|
|
82
|
+
},
|
|
83
|
+
"comments": {
|
|
84
|
+
"method": "truncate",
|
|
85
|
+
"params": {
|
|
86
|
+
"length": 5
|
|
87
|
+
}
|
|
88
|
+
},
|
|
89
|
+
"age": "generalize_age",
|
|
90
|
+
"birth_date": {
|
|
91
|
+
"method": "generalize_date",
|
|
92
|
+
"params": {
|
|
93
|
+
"mode": "month_year"
|
|
94
|
+
}
|
|
95
|
+
},
|
|
96
|
+
"state": {
|
|
97
|
+
"method": "random_choice",
|
|
98
|
+
"params": {
|
|
99
|
+
"choices": [
|
|
100
|
+
"SP",
|
|
101
|
+
"RJ",
|
|
102
|
+
"MG",
|
|
103
|
+
"BA"
|
|
104
|
+
]
|
|
105
|
+
}
|
|
106
|
+
},
|
|
107
|
+
"last_access": {
|
|
108
|
+
"method": "date_offset",
|
|
109
|
+
"params": {
|
|
110
|
+
"min_days": -2,
|
|
111
|
+
"max_days": 2
|
|
112
|
+
}
|
|
113
|
+
},
|
|
114
|
+
"feedback": "shuffle"
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## ๐ง Conditional Rules
|
|
120
|
+
|
|
121
|
+
You can also apply rules based on other column values:
|
|
122
|
+
|
|
123
|
+
```json
|
|
124
|
+
"cpf": {
|
|
125
|
+
"method": "replace_with_fake",
|
|
126
|
+
"params": {
|
|
127
|
+
"digits": 11
|
|
128
|
+
},
|
|
129
|
+
"condition": {
|
|
130
|
+
"column": "status",
|
|
131
|
+
"operator": "equals",
|
|
132
|
+
"value": "active"
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## โ๏ธ Supported Condition Operators
|
|
138
|
+
|
|
139
|
+
| Operator | Description |
|
|
140
|
+
|----------------|----------------------------------------|
|
|
141
|
+
| equals | Equal to |
|
|
142
|
+
| not_equals | Not equal to |
|
|
143
|
+
| in | Value in list |
|
|
144
|
+
| not_in | Value not in list |
|
|
145
|
+
| gt | Greater than |
|
|
146
|
+
| gte | Greater than or equal to |
|
|
147
|
+
| lt | Less than |
|
|
148
|
+
| lte | Less than or equal to |
|
|
149
|
+
| contains | Substring exists in string |
|
|
150
|
+
| not_contains | Substring does not exist in string |
|
|
151
|
+
|
|
152
|
+
## ๐ Project Structure
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
.
|
|
156
|
+
โโโ main.py # Entry point to run anonymization
|
|
157
|
+
โโโ anonymizer.py # Core logic for applying anonymization rules
|
|
158
|
+
โโโ config.json # Example configuration file
|
|
159
|
+
โโโ sensitive_data.csv # Input file to be anonymized
|
|
160
|
+
โโโ README.md # Project documentation
|
|
161
|
+
โโโ requirements.txt # Project dependencies
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## ๐ ๏ธ Requirements
|
|
165
|
+
|
|
166
|
+
- Python 3.12+
|
|
167
|
+
- [Polars](https://pola.rs/) >= 1.31.0
|
|
168
|
+
- Create a virtual environment:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
python -m venv .venv
|
|
172
|
+
source .venv/bin/activate # or .venv\Scripts\activate on Windows
|
|
173
|
+
pip install -r requirements.txt
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## ๐ Run the script
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
python main.py
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Make sure to update paths for input CSV and config JSON as needed.
|
|
183
|
+
|
|
184
|
+
## ๐ฎ Possible Future Features
|
|
185
|
+
|
|
186
|
+
- Hashing support for specific fields
|
|
187
|
+
- Redaction rules using regex
|
|
188
|
+
- Support for nested or JSON-style fields
|
|
189
|
+
- CLI interface with rich options
|
|
190
|
+
- Parallel processing for large datasets
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Data Anonymizer Script
|
|
2
|
+
|
|
3
|
+
This project is designed to anonymize sensitive data using configurable methods in Polars.
|
|
4
|
+
|
|
5
|
+
## ๐ฆ Features
|
|
6
|
+
|
|
7
|
+
- Full masking
|
|
8
|
+
- Email masking
|
|
9
|
+
- Phone number masking
|
|
10
|
+
- Replace with static values
|
|
11
|
+
- Replace by substring or dictionary
|
|
12
|
+
- Sequential numeric and alphabetical replacement
|
|
13
|
+
- Truncation
|
|
14
|
+
- Initials extraction
|
|
15
|
+
- Age and date generalization
|
|
16
|
+
- Random choice substitution
|
|
17
|
+
- Fake numeric generation
|
|
18
|
+
- Column shuffling
|
|
19
|
+
- Date offset
|
|
20
|
+
- Conditional anonymization
|
|
21
|
+
|
|
22
|
+
## โ๏ธ How it works
|
|
23
|
+
|
|
24
|
+
1. The script reads a CSV file into a Polars DataFrame.
|
|
25
|
+
2. It loads a JSON config describing which columns to anonymize and how.
|
|
26
|
+
3. Each rule is applied and the resulting DataFrame is written to output.
|
|
27
|
+
|
|
28
|
+
## ๐งช Example Config
|
|
29
|
+
|
|
30
|
+
```json
|
|
31
|
+
{
|
|
32
|
+
"columns": {
|
|
33
|
+
"name": "initials_only",
|
|
34
|
+
"email": "mask_email",
|
|
35
|
+
"phone": "mask_number",
|
|
36
|
+
"cpf": {
|
|
37
|
+
"method": "replace_with_fake",
|
|
38
|
+
"params": {
|
|
39
|
+
"digits": 11
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"username": {
|
|
43
|
+
"method": "replace_by_contains",
|
|
44
|
+
"params": {
|
|
45
|
+
"mapping": {
|
|
46
|
+
"admin": "user",
|
|
47
|
+
"root": "guest"
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"status": {
|
|
52
|
+
"method": "replace_by_dict",
|
|
53
|
+
"params": {
|
|
54
|
+
"mapping": {
|
|
55
|
+
"active": "A",
|
|
56
|
+
"inactive": "I"
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
},
|
|
60
|
+
"id_seq": {
|
|
61
|
+
"method": "sequential_numeric",
|
|
62
|
+
"params": {
|
|
63
|
+
"prefix": "ID"
|
|
64
|
+
}
|
|
65
|
+
},
|
|
66
|
+
"ref_code": {
|
|
67
|
+
"method": "sequential_alpha",
|
|
68
|
+
"params": {
|
|
69
|
+
"prefix": "REF"
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"comments": {
|
|
73
|
+
"method": "truncate",
|
|
74
|
+
"params": {
|
|
75
|
+
"length": 5
|
|
76
|
+
}
|
|
77
|
+
},
|
|
78
|
+
"age": "generalize_age",
|
|
79
|
+
"birth_date": {
|
|
80
|
+
"method": "generalize_date",
|
|
81
|
+
"params": {
|
|
82
|
+
"mode": "month_year"
|
|
83
|
+
}
|
|
84
|
+
},
|
|
85
|
+
"state": {
|
|
86
|
+
"method": "random_choice",
|
|
87
|
+
"params": {
|
|
88
|
+
"choices": [
|
|
89
|
+
"SP",
|
|
90
|
+
"RJ",
|
|
91
|
+
"MG",
|
|
92
|
+
"BA"
|
|
93
|
+
]
|
|
94
|
+
}
|
|
95
|
+
},
|
|
96
|
+
"last_access": {
|
|
97
|
+
"method": "date_offset",
|
|
98
|
+
"params": {
|
|
99
|
+
"min_days": -2,
|
|
100
|
+
"max_days": 2
|
|
101
|
+
}
|
|
102
|
+
},
|
|
103
|
+
"feedback": "shuffle"
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## ๐ง Conditional Rules
|
|
109
|
+
|
|
110
|
+
You can also apply rules based on other column values:
|
|
111
|
+
|
|
112
|
+
```json
|
|
113
|
+
"cpf": {
|
|
114
|
+
"method": "replace_with_fake",
|
|
115
|
+
"params": {
|
|
116
|
+
"digits": 11
|
|
117
|
+
},
|
|
118
|
+
"condition": {
|
|
119
|
+
"column": "status",
|
|
120
|
+
"operator": "equals",
|
|
121
|
+
"value": "active"
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## โ๏ธ Supported Condition Operators
|
|
127
|
+
|
|
128
|
+
| Operator | Description |
|
|
129
|
+
|----------------|----------------------------------------|
|
|
130
|
+
| equals | Equal to |
|
|
131
|
+
| not_equals | Not equal to |
|
|
132
|
+
| in | Value in list |
|
|
133
|
+
| not_in | Value not in list |
|
|
134
|
+
| gt | Greater than |
|
|
135
|
+
| gte | Greater than or equal to |
|
|
136
|
+
| lt | Less than |
|
|
137
|
+
| lte | Less than or equal to |
|
|
138
|
+
| contains | Substring exists in string |
|
|
139
|
+
| not_contains | Substring does not exist in string |
|
|
140
|
+
|
|
141
|
+
## ๐ Project Structure
|
|
142
|
+
|
|
143
|
+
```
|
|
144
|
+
.
|
|
145
|
+
โโโ main.py # Entry point to run anonymization
|
|
146
|
+
โโโ anonymizer.py # Core logic for applying anonymization rules
|
|
147
|
+
โโโ config.json # Example configuration file
|
|
148
|
+
โโโ sensitive_data.csv # Input file to be anonymized
|
|
149
|
+
โโโ README.md # Project documentation
|
|
150
|
+
โโโ requirements.txt # Project dependencies
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## ๐ ๏ธ Requirements
|
|
154
|
+
|
|
155
|
+
- Python 3.12+
|
|
156
|
+
- [Polars](https://pola.rs/) >= 1.31.0
|
|
157
|
+
- Create a virtual environment:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
python -m venv .venv
|
|
161
|
+
source .venv/bin/activate # or .venv\Scripts\activate on Windows
|
|
162
|
+
pip install -r requirements.txt
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
## ๐ Run the script
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
python main.py
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Make sure to update paths for input CSV and config JSON as needed.
|
|
172
|
+
|
|
173
|
+
## ๐ฎ Possible Future Features
|
|
174
|
+
|
|
175
|
+
- Hashing support for specific fields
|
|
176
|
+
- Redaction rules using regex
|
|
177
|
+
- Support for nested or JSON-style fields
|
|
178
|
+
- CLI interface with rich options
|
|
179
|
+
- Parallel processing for large datasets
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "cloakdata"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "A lightweight library for anonymizing tabular datasets using Polars"
|
|
5
|
+
keywords = ["anonymization", "data privacy", "polars", "etl", "data masking"]
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Jeferson Peter" }
|
|
8
|
+
]
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"polars>=1.31.0",
|
|
13
|
+
"loguru>=0.7.3"
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[tool.setuptools]
|
|
17
|
+
package-dir = {"" = "src"}
|
|
18
|
+
|
|
19
|
+
[tool.setuptools.packages.find]
|
|
20
|
+
where = ["src"]
|
|
@@ -0,0 +1,699 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import string
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
import inspect
|
|
7
|
+
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AnonymizationMethods:
|
|
12
|
+
"""
|
|
13
|
+
A collection of static methods for anonymizing or masking sensitive data in Polars DataFrames.
|
|
14
|
+
|
|
15
|
+
This class provides various anonymization strategies such as full masking, email obfuscation,
|
|
16
|
+
data generalization, conditional replacement, pseudonymization, and more.
|
|
17
|
+
|
|
18
|
+
Each method returns a `pl.Expr` that can be applied to a column in a Polars DataFrame.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
@staticmethod
|
|
22
|
+
def full_mask(_df: pl.DataFrame, col: str, _params: dict) -> pl.Expr:
|
|
23
|
+
"""
|
|
24
|
+
Fully masks all values in the specified column with a fixed placeholder.
|
|
25
|
+
|
|
26
|
+
Parameters:
|
|
27
|
+
_df (pl.DataFrame): The input DataFrame (not used in this method).
|
|
28
|
+
col (str): The name of the column to be masked.
|
|
29
|
+
_params (dict): Parameters dictionary (not used in this method).
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
pl.Expr: An expression that replaces all values in the column with "*****".
|
|
33
|
+
"""
|
|
34
|
+
return pl.lit("*****").alias(col)
|
|
35
|
+
|
|
36
|
+
@staticmethod
|
|
37
|
+
def mask_email(_df: pl.DataFrame, col: str, _params: dict) -> pl.Expr:
|
|
38
|
+
"""
|
|
39
|
+
Masks the local part of email addresses in the specified column, keeping the domain.
|
|
40
|
+
|
|
41
|
+
Example:
|
|
42
|
+
"john.doe@example.com" โ "xxxxx@example.com"
|
|
43
|
+
"invalid_email" โ "xxxxx@hidden.com"
|
|
44
|
+
|
|
45
|
+
Parameters:
|
|
46
|
+
_df (pl.DataFrame): The input DataFrame (not used in this method).
|
|
47
|
+
col (str): The name of the column containing email addresses.
|
|
48
|
+
_params (dict): Parameters dictionary (not used in this method).
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
pl.Expr: An expression that masks email addresses while preserving the domain.
|
|
52
|
+
"""
|
|
53
|
+
return (
|
|
54
|
+
pl.when(pl.col(col).str.contains("@"))
|
|
55
|
+
.then(pl.lit("xxxxx@") + pl.col(col).str.split("@").list.get(1))
|
|
56
|
+
.otherwise(pl.lit("xxxxx@hidden.com"))
|
|
57
|
+
.alias(col)
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
@staticmethod
|
|
61
|
+
def mask_number(_df: pl.DataFrame, col: str, _params: dict) -> pl.Expr:
|
|
62
|
+
"""
|
|
63
|
+
Masks part of a numeric string in the specified column, keeping the first few characters.
|
|
64
|
+
|
|
65
|
+
Example:
|
|
66
|
+
"123456789" โ "123*****"
|
|
67
|
+
|
|
68
|
+
Parameters:
|
|
69
|
+
_df (pl.DataFrame): The input DataFrame (not used in this method).
|
|
70
|
+
col (str): The name of the column to be masked.
|
|
71
|
+
_params (dict): Parameters dictionary (not used in this method).
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
pl.Expr: An expression that preserves the first 3 characters and masks the rest.
|
|
75
|
+
"""
|
|
76
|
+
return (
|
|
77
|
+
pl.col(col).cast(pl.Utf8).str.slice(0, 3) + pl.lit("*****")
|
|
78
|
+
).alias(col)
|
|
79
|
+
|
|
80
|
+
@staticmethod
|
|
81
|
+
def replace_with_value(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
82
|
+
"""
|
|
83
|
+
Replaces all values in the specified column with a static value.
|
|
84
|
+
|
|
85
|
+
Parameters:
|
|
86
|
+
_df (pl.DataFrame): The input DataFrame (not used in this method).
|
|
87
|
+
col (str): The name of the column to be replaced.
|
|
88
|
+
params (dict): Dictionary containing the key "value" with the replacement string.
|
|
89
|
+
If not provided, defaults to "Unknow".
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
pl.Expr: An expression that replaces all values with the specified static value.
|
|
93
|
+
"""
|
|
94
|
+
return pl.lit(params.get("value", "Unknow")).alias(col)
|
|
95
|
+
|
|
96
|
+
@staticmethod
|
|
97
|
+
def replace_by_contains(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
98
|
+
"""
|
|
99
|
+
Replaces values in the column based on whether they contain specific substrings.
|
|
100
|
+
|
|
101
|
+
Parameters:
|
|
102
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
103
|
+
col (str): The name of the column to be processed.
|
|
104
|
+
params (dict): Dictionary with one of the following:
|
|
105
|
+
- "mapping" (dict): Keys are substrings to look for, values are replacements.
|
|
106
|
+
- OR "substr" (str) and "replacement" (str): fallback single rule if no mapping is provided.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
pl.Expr: An expression that replaces values based on substring matching.
|
|
110
|
+
"""
|
|
111
|
+
mapping = params.get("mapping") or {
|
|
112
|
+
params.get("substr", ""): params.get("replacement", "Unknow")
|
|
113
|
+
}
|
|
114
|
+
expr = pl.col(col)
|
|
115
|
+
for substr, replacement in mapping.items():
|
|
116
|
+
expr = pl.when(expr.cast(pl.Utf8).str.contains(substr)).then(pl.lit(replacement)).otherwise(expr)
|
|
117
|
+
return expr.alias(col)
|
|
118
|
+
|
|
119
|
+
@staticmethod
|
|
120
|
+
def replace_exact(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
121
|
+
"""
|
|
122
|
+
Replaces values in the column that exactly match a given set of keys.
|
|
123
|
+
|
|
124
|
+
Parameters:
|
|
125
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
126
|
+
col (str): The name of the column to be processed.
|
|
127
|
+
params (dict): Dictionary containing a "mapping" key with a dict of
|
|
128
|
+
{original_value: replacement_value}.
|
|
129
|
+
|
|
130
|
+
Returns:
|
|
131
|
+
pl.Expr: An expression that performs exact value replacements.
|
|
132
|
+
"""
|
|
133
|
+
expr = pl.col(col).cast(pl.Utf8)
|
|
134
|
+
for old, new in params.get("mapping", {}).items():
|
|
135
|
+
expr = pl.when(expr == old).then(pl.lit(new)).otherwise(expr)
|
|
136
|
+
return expr.alias(col)
|
|
137
|
+
|
|
138
|
+
@staticmethod
|
|
139
|
+
def sequential_numeric(df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
140
|
+
"""
|
|
141
|
+
Replaces unique values in the column with sequentially numbered strings.
|
|
142
|
+
|
|
143
|
+
Example:
|
|
144
|
+
"Alice", "Bob", "Alice" โ "val 1", "val 2", "val 1"
|
|
145
|
+
|
|
146
|
+
Parameters:
|
|
147
|
+
df (pl.DataFrame): The input DataFrame, used to extract unique values.
|
|
148
|
+
col (str): The name of the column to be pseudonymized.
|
|
149
|
+
params (dict): Optional parameters:
|
|
150
|
+
- "prefix" (str): A prefix to add to the generated values (default: "val").
|
|
151
|
+
|
|
152
|
+
Returns:
|
|
153
|
+
pl.Expr: An expression replacing values with numeric pseudonyms.
|
|
154
|
+
"""
|
|
155
|
+
unique_vals = df.select(pl.col(col).unique()).to_series().to_list()
|
|
156
|
+
mapping = {val: f"{params.get("prefix", "val")} {i + 1}" for i, val in enumerate(unique_vals)}
|
|
157
|
+
return pl.col(col).replace(mapping).alias(col)
|
|
158
|
+
|
|
159
|
+
@staticmethod
|
|
160
|
+
def sequential_alpha(df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
161
|
+
"""
|
|
162
|
+
Replaces unique values in the column with alphabetically indexed pseudonyms.
|
|
163
|
+
|
|
164
|
+
Example:
|
|
165
|
+
"Alice", "Bob", "Alice" โ "val A", "val B", "val A"
|
|
166
|
+
|
|
167
|
+
Parameters:
|
|
168
|
+
df (pl.DataFrame): The input DataFrame, used to extract unique values.
|
|
169
|
+
col (str): The name of the column to be pseudonymized.
|
|
170
|
+
params (dict): Optional parameters:
|
|
171
|
+
- "prefix" (str): A prefix to add to the generated values (default: "val").
|
|
172
|
+
|
|
173
|
+
Returns:
|
|
174
|
+
pl.Expr: An expression replacing values with alphabetic pseudonyms (A, B, ..., Z, AA, AB, ...).
|
|
175
|
+
"""
|
|
176
|
+
def num_to_alpha(n: int) -> str:
|
|
177
|
+
result = ""
|
|
178
|
+
while n >= 0:
|
|
179
|
+
result = chr(65 + (n % 26)) + result
|
|
180
|
+
n = n // 26 - 1
|
|
181
|
+
return result
|
|
182
|
+
|
|
183
|
+
unique_vals = df.select(pl.col(col).unique()).to_series().to_list()
|
|
184
|
+
mapping = {val: f"{params.get("prefix", "val")} {num_to_alpha(i)}" for i, val in enumerate(unique_vals)}
|
|
185
|
+
return pl.col(col).replace(mapping).alias(col)
|
|
186
|
+
|
|
187
|
+
@staticmethod
|
|
188
|
+
def truncate(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
189
|
+
"""
|
|
190
|
+
Truncates string values in the column to a fixed length.
|
|
191
|
+
|
|
192
|
+
Example:
|
|
193
|
+
"Alexander" with length=4 โ "Alex"
|
|
194
|
+
|
|
195
|
+
Parameters:
|
|
196
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
197
|
+
col (str): The name of the column to be truncated.
|
|
198
|
+
params (dict): Parameters containing:
|
|
199
|
+
- "length" (int): The maximum number of characters to retain (default: 4).
|
|
200
|
+
|
|
201
|
+
Returns:
|
|
202
|
+
pl.Expr: An expression that truncates each string to the specified length.
|
|
203
|
+
"""
|
|
204
|
+
return pl.col(col).cast(pl.Utf8).str.slice(0, params.get("length", 4)).alias(col)
|
|
205
|
+
|
|
206
|
+
@staticmethod
|
|
207
|
+
def initials_only(_df: pl.DataFrame, col: str, _params: dict) -> pl.Expr:
|
|
208
|
+
"""
|
|
209
|
+
Converts full names into initials. For example, "John Doe" becomes "J.D."
|
|
210
|
+
|
|
211
|
+
Parameters:
|
|
212
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
213
|
+
col (str): The name of the column containing full names.
|
|
214
|
+
_params (dict): Parameters dictionary (not used in this method).
|
|
215
|
+
|
|
216
|
+
Returns:
|
|
217
|
+
pl.Expr: An expression that converts names to initials format.
|
|
218
|
+
"""
|
|
219
|
+
return (
|
|
220
|
+
pl.col(col)
|
|
221
|
+
.cast(pl.Utf8)
|
|
222
|
+
.map_elements(
|
|
223
|
+
lambda x: "".join([n[0].upper() + "." for n in str(x).split() if n]),
|
|
224
|
+
return_dtype=pl.Utf8
|
|
225
|
+
)
|
|
226
|
+
.alias(col)
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
@staticmethod
|
|
230
|
+
def generalize_age(_df: pl.DataFrame, col: str, _params: dict) -> pl.Expr:
|
|
231
|
+
"""
|
|
232
|
+
Generalizes age values into 10-year intervals.
|
|
233
|
+
|
|
234
|
+
Example:
|
|
235
|
+
25 โ "20-29"
|
|
236
|
+
41 โ "40-49"
|
|
237
|
+
|
|
238
|
+
Parameters:
|
|
239
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
240
|
+
col (str): The name of the column containing age values.
|
|
241
|
+
_params (dict): Parameters dictionary (not used in this method).
|
|
242
|
+
|
|
243
|
+
Returns:
|
|
244
|
+
pl.Expr: An expression that converts numeric ages into age groups.
|
|
245
|
+
"""
|
|
246
|
+
base = (pl.col(col).cast(pl.Int64) // 10) * 10
|
|
247
|
+
return (
|
|
248
|
+
(base.cast(pl.Utf8) + pl.lit("-") + (base + 9).cast(pl.Utf8))
|
|
249
|
+
.alias(col)
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
@staticmethod
|
|
253
|
+
def generalize_date(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
254
|
+
"""
|
|
255
|
+
Generalizes a date column by reducing its granularity (e.g., to month or year).
|
|
256
|
+
|
|
257
|
+
Parameters:
|
|
258
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
259
|
+
col (str): The name of the column containing date strings in "YYYY-MM-DD" format.
|
|
260
|
+
params (dict): Dictionary containing:
|
|
261
|
+
- "mode" (str): Either "month_year" to keep "YYYY-MM", or "year" to keep "YYYY".
|
|
262
|
+
Defaults to "month_year".
|
|
263
|
+
|
|
264
|
+
Returns:
|
|
265
|
+
pl.Expr: An expression that truncates the date based on the selected mode.
|
|
266
|
+
"""
|
|
267
|
+
mode = params.get("mode", "month_year")
|
|
268
|
+
if mode == "month_year":
|
|
269
|
+
return pl.col(col).str.slice(0, 7).alias(col)
|
|
270
|
+
elif mode == "year":
|
|
271
|
+
return pl.col(col).str.slice(0, 4).alias(col)
|
|
272
|
+
else:
|
|
273
|
+
return pl.lit("invalid_mode").alias(col)
|
|
274
|
+
|
|
275
|
+
@staticmethod
|
|
276
|
+
def random_choice(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
277
|
+
"""
|
|
278
|
+
Replaces each value in the column with a random choice from a predefined list.
|
|
279
|
+
|
|
280
|
+
Example:
|
|
281
|
+
Original: "A", "B", "C"
|
|
282
|
+
After: "X", "Y", "X" (randomly assigned)
|
|
283
|
+
|
|
284
|
+
Parameters:
|
|
285
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
286
|
+
col (str): The name of the column to anonymize.
|
|
287
|
+
params (dict): Dictionary containing:
|
|
288
|
+
- "choices" (list): List of possible values to randomly assign. Defaults to ["X", "Y"].
|
|
289
|
+
|
|
290
|
+
Returns:
|
|
291
|
+
pl.Expr: An expression that replaces values with random selections from the list.
|
|
292
|
+
"""
|
|
293
|
+
choices = params.get("choices", ["X", "Y"])
|
|
294
|
+
return (
|
|
295
|
+
pl.col(col)
|
|
296
|
+
.map_elements(lambda _: random.choice(choices), return_dtype=pl.Utf8)
|
|
297
|
+
.alias(col)
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
@staticmethod
|
|
301
|
+
def replace_with_fake(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
302
|
+
"""
|
|
303
|
+
Replaces each value in the column with a randomly generated fake number (e.g., CPF, ID).
|
|
304
|
+
|
|
305
|
+
Example:
|
|
306
|
+
Original: "123456789" โ "80239485711" (random 11-digit string)
|
|
307
|
+
|
|
308
|
+
Parameters:
|
|
309
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
310
|
+
col (str): The name of the column to anonymize.
|
|
311
|
+
params (dict): Dictionary containing:
|
|
312
|
+
- "digits" (int): Number of digits to generate (default: 11).
|
|
313
|
+
|
|
314
|
+
Returns:
|
|
315
|
+
pl.Expr: An expression that replaces values with random digit strings.
|
|
316
|
+
"""
|
|
317
|
+
return (
|
|
318
|
+
pl.col(col)
|
|
319
|
+
.map_elements(lambda _: "".join(random.choices(string.digits, k=params.get("digits", 11))),
|
|
320
|
+
return_dtype=pl.Utf8)
|
|
321
|
+
.alias(col)
|
|
322
|
+
)
|
|
323
|
+
|
|
324
|
+
@staticmethod
|
|
325
|
+
def shuffle(_df: pl.DataFrame, col: str, _params: dict) -> pl.Expr:
|
|
326
|
+
"""
|
|
327
|
+
Randomly shuffles the values in the specified column.
|
|
328
|
+
|
|
329
|
+
Note:
|
|
330
|
+
This method preserves the original values but reorders them randomly.
|
|
331
|
+
|
|
332
|
+
Parameters:
|
|
333
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
334
|
+
col (str): The name of the column to shuffle.
|
|
335
|
+
_params (dict): Parameters dictionary (not used in this method).
|
|
336
|
+
|
|
337
|
+
Returns:
|
|
338
|
+
pl.Expr: An expression that shuffles the column values.
|
|
339
|
+
"""
|
|
340
|
+
return pl.col(col).shuffle().alias(col)
|
|
341
|
+
|
|
342
|
+
@staticmethod
|
|
343
|
+
def date_offset(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
344
|
+
"""
|
|
345
|
+
Applies a random date offset (in days) to each value in the column.
|
|
346
|
+
|
|
347
|
+
Example:
|
|
348
|
+
"2025-07-20" โ "2025-07-18" (random within range)
|
|
349
|
+
|
|
350
|
+
Parameters:
|
|
351
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
352
|
+
col (str): The name of the column containing date strings (format "YYYY-MM-DD").
|
|
353
|
+
params (dict): Dictionary containing:
|
|
354
|
+
- "min_days" (int): Minimum number of days to shift (default: -3).
|
|
355
|
+
- "max_days" (int): Maximum number of days to shift (default: 3).
|
|
356
|
+
|
|
357
|
+
Returns:
|
|
358
|
+
pl.Expr: An expression that offsets dates randomly within the given range.
|
|
359
|
+
"""
|
|
360
|
+
def shift(date_str: str) -> str:
|
|
361
|
+
try:
|
|
362
|
+
min_days = params.get("min_days", -3)
|
|
363
|
+
max_days = params.get("max_days", 3)
|
|
364
|
+
d = datetime.strptime(date_str, "%Y-%m-%d")
|
|
365
|
+
offset = timedelta(days=random.randint(min_days, max_days))
|
|
366
|
+
return (d + offset).strftime("%Y-%m-%d")
|
|
367
|
+
except:
|
|
368
|
+
return "invalid"
|
|
369
|
+
|
|
370
|
+
return (
|
|
371
|
+
pl.col(col)
|
|
372
|
+
.cast(pl.Utf8)
|
|
373
|
+
.map_elements(shift, return_dtype=pl.Utf8)
|
|
374
|
+
.alias(col)
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
@staticmethod
|
|
378
|
+
def coalesce_cols(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
379
|
+
"""
|
|
380
|
+
Returns the first non-null value from a list of columns and assigns it to the target column.
|
|
381
|
+
|
|
382
|
+
Example:
|
|
383
|
+
If column "A" is null, but "B" has a value, it will use "B". Follows the order of the list.
|
|
384
|
+
|
|
385
|
+
Parameters:
|
|
386
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
387
|
+
col (str): The name of the resulting column.
|
|
388
|
+
params (dict): Dictionary containing:
|
|
389
|
+
- "columns" (list): List of column names to coalesce (in order of priority).
|
|
390
|
+
|
|
391
|
+
Returns:
|
|
392
|
+
pl.Expr: An expression that returns the first non-null value among the given columns.
|
|
393
|
+
|
|
394
|
+
Raises:
|
|
395
|
+
ValueError: If the "columns" parameter is not provided.
|
|
396
|
+
"""
|
|
397
|
+
cols = params.get("columns", [])
|
|
398
|
+
if not cols:
|
|
399
|
+
raise ValueError("โ 'columns' param is required for 'coalesce_cols'")
|
|
400
|
+
return pl.coalesce([pl.col(c) for c in cols]).alias(col)
|
|
401
|
+
|
|
402
|
+
@staticmethod
|
|
403
|
+
def split_name_parts(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
404
|
+
"""
|
|
405
|
+
Splits a full name string and extracts either the first or last part.
|
|
406
|
+
|
|
407
|
+
Example:
|
|
408
|
+
"John Doe Smith" with part="first" โ "John"
|
|
409
|
+
"John Doe Smith" with part="last" โ "Smith"
|
|
410
|
+
|
|
411
|
+
Parameters:
|
|
412
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
413
|
+
col (str): The name of the column containing full names.
|
|
414
|
+
params (dict): Dictionary containing:
|
|
415
|
+
- "part" (str): Must be either "first" or "last". Defaults to "first".
|
|
416
|
+
|
|
417
|
+
Returns:
|
|
418
|
+
pl.Expr: An expression that extracts the desired part of the name.
|
|
419
|
+
"""
|
|
420
|
+
part = params.get("part", "first")
|
|
421
|
+
|
|
422
|
+
if part == "first":
|
|
423
|
+
return pl.col(col).cast(pl.Utf8).str.split(" ").list.get(0).alias(col)
|
|
424
|
+
elif part == "last":
|
|
425
|
+
return pl.col(col).cast(pl.Utf8).str.split(" ").list.get(-1).alias(col)
|
|
426
|
+
else:
|
|
427
|
+
return pl.lit("").alias(col)
|
|
428
|
+
|
|
429
|
+
@staticmethod
|
|
430
|
+
def generalize_number_range(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
|
|
431
|
+
"""
|
|
432
|
+
Generalizes numeric values into intervals of fixed size (e.g., 0-9, 10-19, etc.).
|
|
433
|
+
|
|
434
|
+
Example:
|
|
435
|
+
Value: 23, interval: 10 โ "20-29"
|
|
436
|
+
|
|
437
|
+
Parameters:
|
|
438
|
+
_df (pl.DataFrame): The input DataFrame (not used directly).
|
|
439
|
+
col (str): The name of the column with numeric values.
|
|
440
|
+
params (dict): Dictionary containing:
|
|
441
|
+
- "interval" (int): Size of each numeric range (default: 10).
|
|
442
|
+
|
|
443
|
+
Returns:
|
|
444
|
+
pl.Expr: An expression that groups numbers into interval buckets.
|
|
445
|
+
"""
|
|
446
|
+
interval = params.get("interval", 10)
|
|
447
|
+
base = (pl.col(col).cast(pl.Int64) // interval) * interval
|
|
448
|
+
return (base.cast(pl.Utf8) + pl.lit("-") + (base + interval - 1).cast(pl.Utf8)).alias(col)
|
|
449
|
+
|
|
450
|
+
@staticmethod
def mask_partial(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
    """
    Partially masks values by keeping the beginning and end visible, and masking the middle.

    Example:
        Value: "abcdef", visible_start: 2, visible_end: 2 -> "ab**ef"

    Parameters:
        _df (pl.DataFrame): The input DataFrame (not used directly).
        col (str): The name of the column to mask.
        params (dict): Dictionary containing:
            - "visible_start" (int): Number of visible characters at the start (default: 2).
            - "visible_end" (int): Number of visible characters at the end (default: 2).
            - "mask_char" (str): Character used for masking (default: "*").

    Returns:
        pl.Expr: An expression that partially masks each string value.
    """
    visible_start = params.get("visible_start", 2)
    visible_end = params.get("visible_end", 2)
    mask_char = params.get("mask_char", "*")

    def _mask(value: str) -> str:
        # Values too short to hide anything are returned unchanged.
        hidden = len(value) - visible_start - visible_end
        if hidden <= 0:
            return value
        # Slice the tail explicitly: value[-visible_end:] with
        # visible_end == 0 is value[-0:], i.e. the WHOLE string, which
        # previously leaked the full value appended after the mask.
        tail = value[len(value) - visible_end:] if visible_end > 0 else ""
        return value[:visible_start] + mask_char * hidden + tail

    return (
        pl.col(col)
        .cast(pl.Utf8)
        .map_elements(_mask, return_dtype=pl.Utf8)
        .alias(col)
    )
@staticmethod
def round_number(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
    """
    Round numeric values in the column to a given number of decimal places.

    Example:
        3.14159 with digits=2 -> 3.14

    Parameters:
        _df (pl.DataFrame): The input DataFrame (not used directly).
        col (str): The name of the numeric column to round.
        params (dict): Dictionary containing:
            - "digits" (int): Number of decimal places to keep (default: 0).

    Returns:
        pl.Expr: An expression that rounds numbers to the specified precision.
    """
    precision = params.get("digits", 0)
    return pl.col(col).cast(pl.Float64).round(precision).alias(col)
@staticmethod
def round_date(_df: pl.DataFrame, col: str, params: dict) -> pl.Expr:
    """
    Round date values down to the start of the month or year.

    Example:
        "2025-07-29" with mode="month" -> "2025-07-01"
        "2025-07-29" with mode="year"  -> "2025-01-01"

    Parameters:
        _df (pl.DataFrame): The input DataFrame (not used directly).
        col (str): Column containing date strings in "YYYY-MM-DD" format.
        params (dict): Dictionary containing:
            - "mode" (str): "month" or "year"; any other value (default
              "day") leaves the string untouched.

    Returns:
        pl.Expr: An expression that returns rounded date strings, or the
        literal "invalid" for values that fail to parse.
    """
    mode = params.get("mode", "day")

    def _floor(raw: str) -> str:
        # Any parse/format failure collapses to the sentinel "invalid".
        try:
            parsed = datetime.strptime(raw, "%Y-%m-%d")
            if mode == "month":
                return parsed.replace(day=1).strftime("%Y-%m-%d")
            if mode == "year":
                return parsed.replace(month=1, day=1).strftime("%Y-%m-%d")
            return raw
        except Exception:
            return "invalid"

    return (
        pl.col(col)
        .cast(pl.Utf8)
        .map_elements(_floor, return_dtype=pl.Utf8)
        .alias(col)
    )
@staticmethod
def apply_conditioned_expr(col: str, expr: pl.Expr, condition: dict) -> pl.Expr:
    """
    Applies an expression only to rows that satisfy a given condition.

    If the condition is not met, the original value is kept.

    Parameters:
        col (str): The name of the target column being transformed.
        expr (pl.Expr): The transformation expression to apply conditionally.
        condition (dict): A dictionary defining the condition with:
            - "column" (str): Column to evaluate.
            - "operator" (str): One of ["equals", "not_equals", "in", "not_in", "gt", "gte",
                                 "lt", "lte", "contains", "not_contains"]. Default "equals".
            - "value" (any): The value to compare against.

    Returns:
        pl.Expr: The resulting expression with conditional logic applied.
        If "column" or "value" is missing, the condition is silently
        ignored and `expr` is returned unconditionally.

    Raises:
        ValueError: If an unsupported operator is provided.
    """
    condition_col = condition.get("column")
    operator = condition.get("operator", "equals")
    value = condition.get("value")

    if not condition_col or value is None:
        return expr

    col_expr = pl.col(condition_col)

    # Cast the condition column so the comparison is type-stable.
    if isinstance(value, (int, float)):
        col_expr = col_expr.cast(pl.Float64 if isinstance(value, float) else pl.Int64)
    elif isinstance(value, str):
        col_expr = col_expr.cast(pl.Utf8)
    elif isinstance(value, list) and all(isinstance(v, str) for v in value):
        col_expr = col_expr.cast(pl.Utf8)

    # Build only the requested comparison. The previous dict literal
    # constructed EVERY operator eagerly, so operators inapplicable to
    # the value's type (e.g. str.contains with a number, is_in with a
    # scalar) could raise even when a different operator was selected.
    builders = {
        "equals": lambda: col_expr == value,
        "not_equals": lambda: col_expr != value,
        "in": lambda: col_expr.is_in(value),
        "not_in": lambda: ~col_expr.is_in(value),
        "gt": lambda: col_expr > value,
        "gte": lambda: col_expr >= value,
        "lt": lambda: col_expr < value,
        "lte": lambda: col_expr <= value,
        "contains": lambda: col_expr.cast(pl.Utf8).str.contains(value),
        "not_contains": lambda: ~col_expr.cast(pl.Utf8).str.contains(value),
    }

    builder = builders.get(operator)
    if builder is None:
        raise ValueError(f"Unsupported operator: {operator}")

    return pl.when(builder()).then(expr).otherwise(pl.col(col)).alias(col)
@classmethod
def anonymize(cls, df: pl.DataFrame, config: dict) -> pl.DataFrame:
    """
    Applies one or more anonymization methods to a Polars DataFrame based on a given configuration.

    The configuration allows defining one or more anonymization strategies per column,
    optionally using conditions to apply them selectively.

    Parameters:
        df (pl.DataFrame): The original input DataFrame.
        config (dict): A dictionary with the following structure:
            {
                "columns": {
                    "column_name": "method_name" | {
                        "method": "method_name",
                        "params": { ... },
                        "condition": { ... }
                    } | [ ... multiple rules, applied in order ... ]
                }
            }

    Special Cases:
        - If a column method is "drop", the column will be removed.
        - If a method includes a "condition", it will only be applied where the condition is satisfied.
        - Columns not found in the DataFrame will be skipped unless used in a conditional rule,
          in which case a `null` column will be added before applying the condition.

    Returns:
        pl.DataFrame: A new DataFrame with the applied anonymization rules.
    """
    logger.info("🔐 Starting anonymization process...")
    dispatch_map = cls.build_dispatch_map()

    dropped_cols = [col for col, rule in config["columns"].items()
                    if isinstance(rule, dict) and rule.get("method") == "drop"]

    if dropped_cols:
        logger.warning(f"⚠️ Dropping columns: {dropped_cols}")
        df = df.drop(dropped_cols)

    processed = 0
    for col, rule in config["columns"].items():
        # Dropped columns are fully handled above; re-visiting them here
        # used to emit a misleading "Skipping unknown column" warning.
        if col in dropped_cols:
            continue

        column_exists = col in df.columns
        has_condition = (
            isinstance(rule, dict) and "condition" in rule
        ) or (
            isinstance(rule, list) and any(
                isinstance(r, dict) and "condition" in r for r in rule
            )
        )

        if not column_exists:
            if has_condition:
                logger.info(f"➕ Column '{col}' not found → adding as null to apply conditional rule.")
                df = df.with_columns(pl.lit(None).alias(col))
            else:
                logger.warning(f"⏭️ Skipping unknown column: {col}")
                continue

        rule_list = [rule] if isinstance(rule, (str, dict)) else rule
        applied_any = False

        for r in rule_list:
            method, params = (r, {}) if isinstance(r, str) else (r.get("method"), r.get("params", {}))
            condition = r.get("condition") if isinstance(r, dict) else None

            if method not in dispatch_map:
                logger.error(f"❌ Unknown method '{method}' for column '{col}'. Skipping.")
                continue

            logger.debug(f"🔧 Applying method '{method}' to column '{col}'"
                         f"{' with condition' if condition else ''}")
            expr = dispatch_map[method](df, col, params)
            expr = cls.apply_conditioned_expr(col, expr, condition) if condition else expr
            # Apply each rule immediately so the next rule in the list
            # sees this rule's result. Previously every dispatched
            # expression was built from the original pl.col(col) and
            # `current_expr` was overwritten per iteration, so with a
            # list of rules only the LAST one ever took effect.
            df = df.with_columns(expr.alias(col))
            applied_any = True

        if applied_any:
            processed += 1

    logger.success(f"✅ Anonymization complete. {processed} column(s) processed.")
    return df
@classmethod
def build_dispatch_map(cls):
    """
    Builds a mapping between method names (as strings) and their corresponding
    anonymization functions defined in this class.

    This is used internally by `anonymize()` to dynamically dispatch method
    calls based on the configuration.

    Returns:
        dict: A dictionary where keys are method names and values are callables
        with signature (df: pl.DataFrame, col: str, params: dict) -> pl.Expr.
    """
    # Infrastructure methods that must never be dispatchable from config.
    excluded = {"apply_conditioned_expr", "anonymize", "build_dispatch_map"}
    dispatch = {}
    for name, func in inspect.getmembers(cls, predicate=inspect.isfunction):
        if name.startswith("_") or name in excluded:
            continue
        # The functions already have the (df, col, params) signature, so
        # they are registered directly — no wrapper lambda is needed.
        dispatch[name] = func
    return dispatch
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from loguru import logger
|
|
2
|
+
|
|
3
|
+
def validate_config(config: dict, dispatch_map: dict) -> None:
    """
    Validates the anonymization configuration.

    Parameters:
        config (dict): The loaded configuration dictionary.
        dispatch_map (dict): Dictionary of available anonymization methods.

    Raises:
        ValueError: If any method is invalid or the structure is inconsistent.
    """
    logger.info("🔍 Validating anonymization config...")

    if "columns" not in config:
        raise ValueError("❌ Config is missing required 'columns' section.")

    for col_name, rule in config["columns"].items():
        if isinstance(rule, (str, dict)):
            rules = [rule]
        elif isinstance(rule, list):
            rules = rule
        else:
            # Previously a rule of any other type (e.g. an int or None)
            # fell through to `rules = rule` and raised an opaque
            # TypeError while iterating instead of the documented error.
            raise ValueError(f"❌ Invalid rule format for column '{col_name}': must be str, dict, or list")

        for r in rules:
            if isinstance(r, str):
                method = r
            elif isinstance(r, dict):
                method = r.get("method")
            else:
                raise ValueError(f"❌ Invalid rule format for column '{col_name}': must be str, dict, or list")

            if method == "drop":
                logger.debug(f"🗑️ Column '{col_name}' marked to be dropped.")
                continue

            if method not in dispatch_map:
                raise ValueError(f"❌ Method '{method}' for column '{col_name}' is not a valid anonymization method.")

            logger.debug(f"✅ Column '{col_name}': method '{method}' is valid.")

    logger.success("✅ Configuration validation passed.")
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cloakdata
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A lightweight library for anonymizing tabular datasets using Polars
|
|
5
|
+
Author: Jeferson Peter
|
|
6
|
+
Keywords: anonymization,data privacy,polars,etl,data masking
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: polars>=1.31.0
|
|
10
|
+
Requires-Dist: loguru>=0.7.3
|
|
11
|
+
|
|
12
|
+
# Data Anonymizer Script
|
|
13
|
+
|
|
14
|
+
This project is designed to anonymize sensitive data using configurable methods in Polars.
|
|
15
|
+
|
|
16
|
+
## 📦 Features
|
|
17
|
+
|
|
18
|
+
- Full masking
|
|
19
|
+
- Email masking
|
|
20
|
+
- Phone number masking
|
|
21
|
+
- Replace with static values
|
|
22
|
+
- Replace by substring or dictionary
|
|
23
|
+
- Sequential numeric and alphabetical replacement
|
|
24
|
+
- Truncation
|
|
25
|
+
- Initials extraction
|
|
26
|
+
- Age and date generalization
|
|
27
|
+
- Random choice substitution
|
|
28
|
+
- Fake numeric generation
|
|
29
|
+
- Column shuffling
|
|
30
|
+
- Date offset
|
|
31
|
+
- Conditional anonymization
|
|
32
|
+
|
|
33
|
+
## ⚙️ How it works
|
|
34
|
+
|
|
35
|
+
1. The script reads a CSV file into a Polars DataFrame.
|
|
36
|
+
2. It loads a JSON config describing which columns to anonymize and how.
|
|
37
|
+
3. Each rule is applied and the resulting DataFrame is written to output.
|
|
38
|
+
|
|
39
|
+
## 🧪 Example Config
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"columns": {
|
|
44
|
+
"name": "initials_only",
|
|
45
|
+
"email": "mask_email",
|
|
46
|
+
"phone": "mask_number",
|
|
47
|
+
"cpf": {
|
|
48
|
+
"method": "replace_with_fake",
|
|
49
|
+
"params": {
|
|
50
|
+
"digits": 11
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
"username": {
|
|
54
|
+
"method": "replace_by_contains",
|
|
55
|
+
"params": {
|
|
56
|
+
"mapping": {
|
|
57
|
+
"admin": "user",
|
|
58
|
+
"root": "guest"
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
"status": {
|
|
63
|
+
"method": "replace_by_dict",
|
|
64
|
+
"params": {
|
|
65
|
+
"mapping": {
|
|
66
|
+
"active": "A",
|
|
67
|
+
"inactive": "I"
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
"id_seq": {
|
|
72
|
+
"method": "sequential_numeric",
|
|
73
|
+
"params": {
|
|
74
|
+
"prefix": "ID"
|
|
75
|
+
}
|
|
76
|
+
},
|
|
77
|
+
"ref_code": {
|
|
78
|
+
"method": "sequential_alpha",
|
|
79
|
+
"params": {
|
|
80
|
+
"prefix": "REF"
|
|
81
|
+
}
|
|
82
|
+
},
|
|
83
|
+
"comments": {
|
|
84
|
+
"method": "truncate",
|
|
85
|
+
"params": {
|
|
86
|
+
"length": 5
|
|
87
|
+
}
|
|
88
|
+
},
|
|
89
|
+
"age": "generalize_age",
|
|
90
|
+
"birth_date": {
|
|
91
|
+
"method": "generalize_date",
|
|
92
|
+
"params": {
|
|
93
|
+
"mode": "month_year"
|
|
94
|
+
}
|
|
95
|
+
},
|
|
96
|
+
"state": {
|
|
97
|
+
"method": "random_choice",
|
|
98
|
+
"params": {
|
|
99
|
+
"choices": [
|
|
100
|
+
"SP",
|
|
101
|
+
"RJ",
|
|
102
|
+
"MG",
|
|
103
|
+
"BA"
|
|
104
|
+
]
|
|
105
|
+
}
|
|
106
|
+
},
|
|
107
|
+
"last_access": {
|
|
108
|
+
"method": "date_offset",
|
|
109
|
+
"params": {
|
|
110
|
+
"min_days": -2,
|
|
111
|
+
"max_days": 2
|
|
112
|
+
}
|
|
113
|
+
},
|
|
114
|
+
"feedback": "shuffle"
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## 🧠 Conditional Rules
|
|
120
|
+
|
|
121
|
+
You can also apply rules based on other column values:
|
|
122
|
+
|
|
123
|
+
```json
|
|
124
|
+
"cpf": {
|
|
125
|
+
"method": "replace_with_fake",
|
|
126
|
+
"params": {
|
|
127
|
+
"digits": 11
|
|
128
|
+
},
|
|
129
|
+
"condition": {
|
|
130
|
+
"column": "status",
|
|
131
|
+
"operator": "equals",
|
|
132
|
+
"value": "active"
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## ⚖️ Supported Condition Operators
|
|
138
|
+
|
|
139
|
+
| Operator | Description |
|
|
140
|
+
|----------------|----------------------------------------|
|
|
141
|
+
| equals | Equal to |
|
|
142
|
+
| not_equals | Not equal to |
|
|
143
|
+
| in | Value in list |
|
|
144
|
+
| not_in | Value not in list |
|
|
145
|
+
| gt | Greater than |
|
|
146
|
+
| gte | Greater than or equal to |
|
|
147
|
+
| lt | Less than |
|
|
148
|
+
| lte | Less than or equal to |
|
|
149
|
+
| contains | Substring exists in string |
|
|
150
|
+
| not_contains | Substring does not exist in string |
|
|
151
|
+
|
|
152
|
+
## 📁 Project Structure
|
|
153
|
+
|
|
154
|
+
```
|
|
155
|
+
.
|
|
156
|
+
├── main.py # Entry point to run anonymization
|
|
157
|
+
├── anonymizer.py # Core logic for applying anonymization rules
|
|
158
|
+
├── config.json # Example configuration file
|
|
159
|
+
├── sensitive_data.csv # Input file to be anonymized
|
|
160
|
+
├── README.md # Project documentation
|
|
161
|
+
└── requirements.txt # Project dependencies
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## 🛠️ Requirements
|
|
165
|
+
|
|
166
|
+
- Python 3.12+
|
|
167
|
+
- [Polars](https://pola.rs/) >= 1.31.0
|
|
168
|
+
- Create a virtual environment:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
python -m venv .venv
|
|
172
|
+
source .venv/bin/activate # or .venv\Scripts\activate on Windows
|
|
173
|
+
pip install -r requirements.txt
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## 🚀 Run the script
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
python main.py
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Make sure to update paths for input CSV and config JSON as needed.
|
|
183
|
+
|
|
184
|
+
## 🔮 Possible Future Features
|
|
185
|
+
|
|
186
|
+
- Hashing support for specific fields
|
|
187
|
+
- Redaction rules using regex
|
|
188
|
+
- Support for nested or JSON-style fields
|
|
189
|
+
- CLI interface with rich options
|
|
190
|
+
- Parallel processing for large datasets
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/cloakdata/__init__.py
|
|
4
|
+
src/cloakdata/core.py
|
|
5
|
+
src/cloakdata/py.typed
|
|
6
|
+
src/cloakdata/validate.py
|
|
7
|
+
src/cloakdata.egg-info/PKG-INFO
|
|
8
|
+
src/cloakdata.egg-info/SOURCES.txt
|
|
9
|
+
src/cloakdata.egg-info/dependency_links.txt
|
|
10
|
+
src/cloakdata.egg-info/requires.txt
|
|
11
|
+
src/cloakdata.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
cloakdata
|