datalock 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalock-1.0.0.dist-info/METADATA +1004 -0
- datalock-1.0.0.dist-info/RECORD +60 -0
- datalock-1.0.0.dist-info/WHEEL +5 -0
- datalock-1.0.0.dist-info/entry_points.txt +2 -0
- datalock-1.0.0.dist-info/licenses/LICENSE +632 -0
- datalock-1.0.0.dist-info/top_level.txt +1 -0
- logus/__init__.py +1967 -0
- logus/_defaults.py +6 -0
- logus/adapters/__init__.py +17 -0
- logus/adapters/db_adapter.py +831 -0
- logus/adapters/pandas_adapter.py +496 -0
- logus/adapters/polars_adapter.py +849 -0
- logus/adapters/sql_adapter.py +655 -0
- logus/analytics.py +1119 -0
- logus/asymmetric.py +335 -0
- logus/check.py +188 -0
- logus/cli.py +232 -0
- logus/contract.py +711 -0
- logus/core.py +516 -0
- logus/detectors/__init__.py +19 -0
- logus/detectors/fast_scan.py +431 -0
- logus/detectors/pii_detector.py +687 -0
- logus/detectors/sensitive_detector.py +369 -0
- logus/detectors/text_detector.py +183 -0
- logus/expr.py +429 -0
- logus/generators/__init__.py +21 -0
- logus/generators/identity_mocker.py +291 -0
- logus/generators/tabular_generative.py +620 -0
- logus/io_big.py +939 -0
- logus/lgs.py +280 -0
- logus/lineage.py +463 -0
- logus/link.py +121 -0
- logus/maskers/__init__.py +14 -0
- logus/maskers/date_masker.py +158 -0
- logus/maskers/hashing.py +229 -0
- logus/maskers/text_masker.py +140 -0
- logus/maskers/truncation.py +203 -0
- logus/metrics/__init__.py +63 -0
- logus/metrics/differential_privacy.py +666 -0
- logus/metrics/fidelity.py +600 -0
- logus/metrics/kanonymity.py +486 -0
- logus/metrics/risk_score.py +351 -0
- logus/metrics/tcloseness.py +387 -0
- logus/metrics/utility.py +440 -0
- logus/mockers/__init__.py +6 -0
- logus/mockers/category_mocker.py +181 -0
- logus/mockers/numeric_mocker.py +217 -0
- logus/privacy_score.py +255 -0
- logus/processor.py +386 -0
- logus/py.typed +0 -0
- logus/reports/__init__.py +5 -0
- logus/reports/audit_report.py +256 -0
- logus/reports/compliance_report.py +331 -0
- logus/secure_file.py +1850 -0
- logus/sql_transpiler.py +390 -0
- logus/utils/__init__.py +8 -0
- logus/utils/frames.py +51 -0
- logus/utils/salt.py +118 -0
- logus/utils/secret_str.py +84 -0
- logus/validate.py +594 -0
|
@@ -0,0 +1,1004 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datalock
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Privacy-by-Design para dados tabulares — LGPD compliance em Python.
|
|
5
|
+
Author-email: Leonardo Borges <leonardoborges6947@gmail.com>
|
|
6
|
+
License: GNU AGPLv3
|
|
7
|
+
Project-URL: Homepage, https://github.com/logus-lgpd/logus-lgpd
|
|
8
|
+
Project-URL: Documentation, https://github.com/logus-lgpd/logus-lgpd#readme
|
|
9
|
+
Project-URL: Changelog, https://github.com/logus-lgpd/logus-lgpd/blob/main/CHANGELOG.md
|
|
10
|
+
Project-URL: Source Code, https://github.com/logus-lgpd/logus-lgpd
|
|
11
|
+
Keywords: lgpd,privacy,pii,masking,anonymization,polars,pandas,data-privacy,gdpr,lgs,aes-256,encryption,hmac,pseudonymization,data-quality,k-anonymity,differential-privacy,compliance,brasil,cpf,cnpj
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
16
|
+
Classifier: Intended Audience :: Healthcare Industry
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Security :: Cryptography
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Classifier: Natural Language :: Portuguese (Brazilian)
|
|
26
|
+
Classifier: Topic :: Database
|
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
|
+
Classifier: Topic :: Security
|
|
29
|
+
Classifier: Operating System :: OS Independent
|
|
30
|
+
Classifier: Environment :: Console
|
|
31
|
+
Classifier: Framework :: Jupyter
|
|
32
|
+
Requires-Python: >=3.10
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
License-File: LICENSE
|
|
35
|
+
Requires-Dist: polars>=1.0.0
|
|
36
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
37
|
+
Requires-Dist: cryptography>=41.0.0
|
|
38
|
+
Requires-Dist: numpy>=1.24.0
|
|
39
|
+
Requires-Dist: pandas>=2.0.0
|
|
40
|
+
Provides-Extra: excel
|
|
41
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "excel"
|
|
42
|
+
Provides-Extra: synthetic
|
|
43
|
+
Requires-Dist: ctgan>=0.9.0; extra == "synthetic"
|
|
44
|
+
Requires-Dist: faker>=18.0.0; extra == "synthetic"
|
|
45
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == "synthetic"
|
|
46
|
+
Provides-Extra: sql
|
|
47
|
+
Requires-Dist: duckdb>=0.10.0; extra == "sql"
|
|
48
|
+
Requires-Dist: sqlalchemy>=2.0.0; extra == "sql"
|
|
49
|
+
Provides-Extra: full
|
|
50
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "full"
|
|
51
|
+
Requires-Dist: duckdb>=0.10.0; extra == "full"
|
|
52
|
+
Requires-Dist: sqlalchemy>=2.0.0; extra == "full"
|
|
53
|
+
Requires-Dist: ctgan>=0.9.0; extra == "full"
|
|
54
|
+
Requires-Dist: faker>=18.0.0; extra == "full"
|
|
55
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == "full"
|
|
56
|
+
Provides-Extra: dev
|
|
57
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
58
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
59
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
60
|
+
Requires-Dist: mypy>=1.8.0; extra == "dev"
|
|
61
|
+
Dynamic: license-file
|
|
62
|
+
|
|
63
|
+
# logus-lgpd
|
|
64
|
+
|
|
65
|
+
**logus-lgpd** is a Python library for privacy-by-design with tabular data.
|
|
66
|
+
It provides LGPD-compliant PII detection and masking, AES-256-GCM encrypted
|
|
67
|
+
file storage (`.lgs` format), and a SQL-like DSL for data manipulation built
|
|
68
|
+
on top of Polars.
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
pip install logus-lgpd
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
import logus as lg
|
|
76
|
+
import os
|
|
77
|
+
|
|
78
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
79
|
+
KEY = os.environ["LOGUS_KEY"]
|
|
80
|
+
|
|
81
|
+
df = lg.read("clientes.csv") # any format → pl.DataFrame
|
|
82
|
+
df_safe = lg.mask(df, salt=SALT) # detect + mask PII (LGPD)
|
|
83
|
+
lg.store(df_safe, "clientes.lgs", key=KEY) # AES-256-GCM encrypted
|
|
84
|
+
df_back = lg.read("clientes.lgs", key=KEY) # decrypt and read back
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## What logus-lgpd does
|
|
90
|
+
|
|
91
|
+
| Capability | Function |
|
|
92
|
+
|---|---|
|
|
93
|
+
| Read any tabular format | `lg.read()` |
|
|
94
|
+
| Detect PII automatically | `lg.scan()` |
|
|
95
|
+
| Mask PII (HMAC-SHA256) | `lg.mask()` |
|
|
96
|
+
| Save with AES-256-GCM | `lg.store()` |
|
|
97
|
+
| Expressive data manipulation | `lg.where()`, `lg.groupby()`, `lg.add_column()` |
|
|
98
|
+
| Full pipeline in one call | `lg.process()` |
|
|
99
|
+
| Data quality validation | `lg.validate()` |
|
|
100
|
+
| Database with masking | `lg.db()` |
|
|
101
|
+
| Privacy metrics | `lg.check.kanon()`, `lg.check.risk()` |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Installation
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Core (Polars + pandas + pyarrow + cryptography + numpy)
|
|
109
|
+
pip install logus-lgpd
|
|
110
|
+
|
|
111
|
+
# With SQL via DuckDB (lg.sql, lg.db)
|
|
112
|
+
pip install "logus-lgpd[sql]"
|
|
113
|
+
|
|
114
|
+
# With Excel support (.xlsx, .ods)
|
|
115
|
+
pip install "logus-lgpd[excel]"
|
|
116
|
+
|
|
117
|
+
# With synthetic data generation (lg.clone, lg.sandbox)
|
|
118
|
+
pip install "logus-lgpd[synthetic]"
|
|
119
|
+
|
|
120
|
+
# Everything
|
|
121
|
+
pip install "logus-lgpd[full]"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Requires:** Python ≥ 3.10, Polars ≥ 1.0, pandas ≥ 2.0, pyarrow ≥ 14.0, cryptography ≥ 41.0, numpy ≥ 1.24
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Quick Start
|
|
129
|
+
|
|
130
|
+
### 1. Read any file format
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
import logus as lg
|
|
134
|
+
|
|
135
|
+
# All return pl.DataFrame; auto-detects encoding
|
|
136
|
+
df = lg.read("clientes.csv")
|
|
137
|
+
df = lg.read("clientes.parquet")
|
|
138
|
+
df = lg.read("clientes.xlsx") # pip install "logus-lgpd[excel]"
|
|
139
|
+
df = lg.read("clientes.lgs", key=KEY) # decrypt .lgs
|
|
140
|
+
|
|
141
|
+
# CSV with non-default separator and encoding
|
|
142
|
+
df = lg.read("clientes.csv", sep=";", encoding="latin-1")
|
|
143
|
+
|
|
144
|
+
# Partial read for large files (no OOM)
|
|
145
|
+
df = lg.read("big.parquet", head=100_000)
|
|
146
|
+
df = lg.read("big.parquet", sample=500_000) # random row groups
|
|
147
|
+
info = lg.read("big.parquet", header_only=True) # schema + shape, zero data read
|
|
148
|
+
df = lg.read("big.parquet", n_chunks=5, chunks=[2, 4])
|
|
149
|
+
for chunk in lg.read("big.parquet", n_chunks=10, iter_chunks=True):
|
|
150
|
+
process(chunk) # never loads full file
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### 2. Detect and mask PII (LGPD)
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import logus as lg
|
|
157
|
+
import os
|
|
158
|
+
|
|
159
|
+
SALT = os.environ["LOGUS_SALT"] # HMAC key — store in env, never hardcode
|
|
160
|
+
|
|
161
|
+
# Detect PII columns automatically
|
|
162
|
+
reports = lg.scan(df)
|
|
163
|
+
for col, r in reports.items():
|
|
164
|
+
print(f"{col}: {r.pii_type.value} risk={r.risk_level.value} → {r.mask_strategy.value}")
|
|
165
|
+
# cpf: cpf risk=high → hash
|
|
166
|
+
# email: email risk=high → hash
|
|
167
|
+
# nome: nome risk=medium → redact
|
|
168
|
+
# cep: cep risk=low → truncate (01310-XXX)
|
|
169
|
+
# data_nasc: data_nascimento risk=medium → generalize_date (1985-03-XX)
|
|
170
|
+
|
|
171
|
+
# Mask PII — preserves input type (pd→pd, pl→pl)
|
|
172
|
+
df_safe = lg.mask(df, salt=SALT)
|
|
173
|
+
df_safe = lg.mask(df, salt=SALT, columns=["cpf", "email"]) # only these
|
|
174
|
+
df_safe = lg.mask(df, salt=SALT, exclude=["uf"]) # all except uf
|
|
175
|
+
df_safe = lg.mask(df, salt=SALT, risk="high") # only high-risk PII
|
|
176
|
+
|
|
177
|
+
# Deterministic: same value + same salt → same token (essential for JOINs)
|
|
178
|
+
# CPF "111.444.777-35" → always "3f2a8b1c9d4e7f0a" with the same SALT
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### 3. Save and read encrypted (.lgs)
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
import logus as lg
|
|
185
|
+
import os
|
|
186
|
+
|
|
187
|
+
KEY = os.environ["LOGUS_KEY"] # AES-256 key — different from SALT
|
|
188
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
189
|
+
|
|
190
|
+
# Save
|
|
191
|
+
lg.store(df, "clientes.lgs", key=KEY) # encrypt, data as-is
|
|
192
|
+
lg.store(df, "clientes.lgs", key=KEY, salt=SALT) # mask + encrypt
|
|
193
|
+
lg.store({"clients": df1, "orders": df2}, "base.lgs", key=KEY) # multi-frame
|
|
194
|
+
|
|
195
|
+
# Read metadata WITHOUT decrypting the payload
|
|
196
|
+
info = lg.inspect("clientes.lgs", key=KEY)
|
|
197
|
+
# {"version":"2.1","shape":[150000,12],"columns":["cpf","nome",...],
|
|
198
|
+
# "column_stats":{"cpf":{"n_nulls":0,"n_unique":150000},...},
|
|
199
|
+
# "content_type":"masked_dataframe","encryption":"AES-256-GCM"}
|
|
200
|
+
|
|
201
|
+
# Read
|
|
202
|
+
df = lg.read("clientes.lgs", key=KEY)
|
|
203
|
+
frames = lg.read("base.lgs", key=KEY) # dict[str, pl.DataFrame]
|
|
204
|
+
df_cli = lg.read("base.lgs", key=KEY, frame="clients")
|
|
205
|
+
|
|
206
|
+
# OO interface
|
|
207
|
+
with lg.open("clientes.lgs", key=KEY) as f:
|
|
208
|
+
df = f.read()
|
|
209
|
+
info = f.info()
|
|
210
|
+
frames = f.frames()
|
|
211
|
+
f.write(df_updated)
|
|
212
|
+
f.add_frame("novos", df_new)
|
|
213
|
+
|
|
214
|
+
# Rotate the encryption key (without exposing the data as a plaintext file)
|
|
215
|
+
lg.rekey("clientes.lgs", old_key=OLD_KEY, new_key=NEW_KEY)
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### 4. Manipulate data (SQL-like DSL)
|
|
219
|
+
|
|
220
|
+
All functions work with `pd.DataFrame`, `pl.DataFrame`, and `pl.LazyFrame`.
|
|
221
|
+
They preserve the input type — Polars in, Polars out.
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
import logus as lg
|
|
225
|
+
|
|
226
|
+
# WHERE
|
|
227
|
+
df = lg.where(df, uf="SP")
|
|
228
|
+
df = lg.where(df, uf=["SP", "RJ", "MG"]) # isin
|
|
229
|
+
df = lg.where(df, renda_mensal=(">", 5_000)) # operator
|
|
230
|
+
df = lg.where(df, renda_mensal=(5_000, 15_000)) # between
|
|
231
|
+
df = lg.where(df, nome=("contains", "Silva")) # string contains
|
|
232
|
+
df = lg.where(df, lg.col("renda") > lg.col("media")) # expression
|
|
233
|
+
df = lg.where(df, uf="SP", tipo_pessoa="PF") # AND (multiple kwargs)
|
|
234
|
+
|
|
235
|
+
# SELECT / DROP / RENAME
|
|
236
|
+
df = lg.select(df, ["cpf", "renda", "uf"])
|
|
237
|
+
df = lg.drop(df, "coluna_inutil")
|
|
238
|
+
df = lg.rename(df, {"cpf": "documento"})
|
|
239
|
+
|
|
240
|
+
# ORDER BY
|
|
241
|
+
df = lg.sort(df, "renda_mensal", desc=True)
|
|
242
|
+
df = lg.sort(df, ["uf", "renda"], ascending=[True, False])
|
|
243
|
+
|
|
244
|
+
# GROUP BY with HAVING, ORDER BY, LIMIT
|
|
245
|
+
resultado = lg.groupby(df, "uf", {
|
|
246
|
+
"n": ("*", "count"),
|
|
247
|
+
"media": ("renda_mensal", "mean"),
|
|
248
|
+
"total": ("renda_mensal", "sum"),
|
|
249
|
+
"unicos": ("cpf", "n_unique"),
|
|
250
|
+
}, having={"n": (">", 100)}, sort="media", desc=True, limit=10)
|
|
251
|
+
|
|
252
|
+
# ADD COLUMN with expressions, CASE WHEN, window functions
|
|
253
|
+
df = lg.add_column(df,
|
|
254
|
+
imposto = lg.col("renda_mensal") * 0.275,
|
|
255
|
+
faixa = lg.when(lg.col("renda_mensal") > 10_000, "alta")
|
|
256
|
+
.when(lg.col("renda_mensal") > 5_000, "media")
|
|
257
|
+
.otherwise("baixa"),
|
|
258
|
+
rank_uf = lg.col("renda_mensal").rank("dense", descending=True).over("uf"),
|
|
259
|
+
media_uf = lg.col("renda_mensal").mean().over("uf"),
|
|
260
|
+
nome_lower = lg.col("nome").str.to_lowercase(),
|
|
261
|
+
ano_nasc = lg.col("data_nasc").str.to_date("%Y-%m-%d").dt.year(),
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
# DISTINCT
|
|
265
|
+
df = lg.unique(df, "cpf") # one row per CPF
|
|
266
|
+
df = lg.unique(df, ["uf", "tipo"], keep="first")
|
|
267
|
+
|
|
268
|
+
# PIVOT / MELT
|
|
269
|
+
pv = lg.pivot(df, index="uf", columns="tipo", values="renda", aggfunc="mean")
|
|
270
|
+
ml = lg.melt(df, id_cols=["uf"], value_cols=["renda_jan", "renda_fev"])
|
|
271
|
+
|
|
272
|
+
# TOP N per group
|
|
273
|
+
top3 = lg.top_n(df, 3, "renda_mensal", group_by="uf")
|
|
274
|
+
|
|
275
|
+
# FILL NULL / CAST
|
|
276
|
+
df = lg.fill_null(df, {"renda": 0, "uf": "DESCONHECIDO"})
|
|
277
|
+
df = lg.cast(df, {"renda": "float32", "inadimplente": "bool"})
|
|
278
|
+
|
|
279
|
+
# CONCAT
|
|
280
|
+
df_all = lg.concat([df_jan, df_fev, df_mar])
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### 5. lg.col() — full Polars expression API
|
|
284
|
+
|
|
285
|
+
`lg.col` is literally `polars.col`. All 200+ Polars methods are available:
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
lg.col("renda").round(2)
|
|
289
|
+
lg.col("renda").log(base=10)
|
|
290
|
+
lg.col("nome").str.to_lowercase()
|
|
291
|
+
lg.col("email").str.split("@").list.last()
|
|
292
|
+
lg.col("cpf").str.replace_all(r"[\.\-]", "")
|
|
293
|
+
lg.col("data").dt.year()
|
|
294
|
+
lg.col("data").dt.truncate("1mo")
|
|
295
|
+
lg.col("renda").mean().over("uf") # window function
|
|
296
|
+
lg.col("renda").rank("dense", descending=True).over("uf")
|
|
297
|
+
lg.col("renda").rolling_mean(window_size=3)
|
|
298
|
+
lg.col("renda").cum_sum()
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
### 6. Fluent pipeline
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
import logus as lg
|
|
305
|
+
import os
|
|
306
|
+
|
|
307
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
308
|
+
KEY = os.environ["LOGUS_KEY"]
|
|
309
|
+
|
|
310
|
+
result = (
|
|
311
|
+
lg.pipe("clientes.parquet")
|
|
312
|
+
.where(uf="SP", tipo_pessoa="PF")
|
|
313
|
+
.add_column(
|
|
314
|
+
imposto = lg.col("renda_mensal") * 0.275,
|
|
315
|
+
faixa = lg.when(lg.col("renda_mensal") > 10_000, "alta")
|
|
316
|
+
.when(lg.col("renda_mensal") > 5_000, "media")
|
|
317
|
+
.otherwise("baixa"),
|
|
318
|
+
)
|
|
319
|
+
.mask(salt=SALT)
|
|
320
|
+
.groupby("faixa", {"n": ("*", "count"), "media": ("renda_mensal", "mean")})
|
|
321
|
+
.sort("media", desc=True)
|
|
322
|
+
.collect() # → pl.DataFrame
|
|
323
|
+
)
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
### 7. Full pipeline in one call
|
|
327
|
+
|
|
328
|
+
```python
|
|
329
|
+
import logus as lg
|
|
330
|
+
import os
|
|
331
|
+
|
|
332
|
+
result = lg.process(
|
|
333
|
+
"clientes.csv",
|
|
334
|
+
salt=os.environ["LOGUS_SALT"],
|
|
335
|
+
key=os.environ["LOGUS_KEY"],
|
|
336
|
+
output="clientes_safe.lgs",
|
|
337
|
+
overwrite=True,
|
|
338
|
+
where={"uf": ["SP", "RJ", "MG"]},
|
|
339
|
+
rules={
|
|
340
|
+
"cpf": {"not_null": True},
|
|
341
|
+
"renda_mensal": {"min": 0, "max": 500_000},
|
|
342
|
+
"email": {"contains": "@"},
|
|
343
|
+
},
|
|
344
|
+
verbose=True,
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
print(f"Rows: {result.n_rows:,}")
|
|
348
|
+
print(f"PII columns: {result.pii_columns}")
|
|
349
|
+
print(f"Privacy score: {result.privacy_score}/100")
|
|
350
|
+
print(f"Validation: {result.validation.passed}")
|
|
351
|
+
print(f"Saved to: {result.output_path}")
|
|
352
|
+
result.print_summary()
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
### 8. SQL via DuckDB
|
|
356
|
+
|
|
357
|
+
```python
|
|
358
|
+
import logus as lg
|
|
359
|
+
|
|
360
|
+
# pip install "logus-lgpd[sql]"
|
|
361
|
+
|
|
362
|
+
# SQL on DataFrames
|
|
363
|
+
result = lg.sql(
|
|
364
|
+
"SELECT uf, AVG(renda_mensal) AS media, COUNT(*) AS n "
|
|
365
|
+
"FROM df GROUP BY uf HAVING n > 100 ORDER BY media DESC",
|
|
366
|
+
df=df,
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
# JOIN two DataFrames via SQL
|
|
370
|
+
result = lg.sql(
|
|
371
|
+
"SELECT c.uf, SUM(p.valor) AS total "
|
|
372
|
+
"FROM clientes c JOIN pedidos p ON c.cpf = p.cpf "
|
|
373
|
+
"GROUP BY c.uf",
|
|
374
|
+
clientes=df_clientes,
|
|
375
|
+
pedidos=df_pedidos,
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
# SQL on Parquet files (DuckDB reads natively)
|
|
379
|
+
result = lg.sql("SELECT * FROM read_parquet('big.parquet') WHERE uf='SP' LIMIT 1000")
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
### 9. Database connection
|
|
383
|
+
|
|
384
|
+
```python
|
|
385
|
+
import logus as lg
|
|
386
|
+
import os
|
|
387
|
+
|
|
388
|
+
# pip install "logus-lgpd[sql]"
|
|
389
|
+
|
|
390
|
+
banco = lg.db(
|
|
391
|
+
"postgresql://user:pass@host:5432/db",
|
|
392
|
+
salt=os.environ["LOGUS_SALT"],
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
# Read with masking
|
|
396
|
+
df = lg.read(banco, "clientes")
|
|
397
|
+
df = lg.read(banco, "clientes", sample=10_000) # TABLESAMPLE BERNOULLI
|
|
398
|
+
df = lg.read(banco, "SELECT * FROM clientes WHERE uf='SP'")
|
|
399
|
+
|
|
400
|
+
# Write masked data back
|
|
401
|
+
banco.write(df_safe, "clientes_masked", if_exists="replace")
|
|
402
|
+
lg.write(df_safe, banco, "clientes_masked") # alternative syntax
|
|
403
|
+
|
|
404
|
+
# Explore
|
|
405
|
+
print(banco.tables())
|
|
406
|
+
print(banco.schema("clientes"))
|
|
407
|
+
sample = banco.sample_table("clientes", n=5)
|
|
408
|
+
|
|
409
|
+
# Context manager closes pool automatically
|
|
410
|
+
with lg.db("postgresql://...", salt=os.environ["LOGUS_SALT"]) as banco:
|
|
411
|
+
df = banco.read("clientes")
|
|
412
|
+
|
|
413
|
+
# Dialects: postgresql, mysql, sqlite, sqlserver, bigquery, snowflake, duckdb
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
### 10. Data quality validation
|
|
417
|
+
|
|
418
|
+
```python
|
|
419
|
+
import logus as lg
|
|
420
|
+
|
|
421
|
+
result = lg.validate(df, {
|
|
422
|
+
"cpf": {"not_null": True, "unique": True},
|
|
423
|
+
"email": {"not_null": True, "contains": "@"},
|
|
424
|
+
"renda_mensal": {"min": 0, "max": 500_000, "not_null": True},
|
|
425
|
+
"uf": {"in": ["SP","RJ","MG","RS","BA","PR","SC","GO","PE","CE"]},
|
|
426
|
+
"cep": {"matches": r"^\d{5}-\d{3}$"},
|
|
427
|
+
})
|
|
428
|
+
|
|
429
|
+
result.print_report() # formatted table
|
|
430
|
+
result.passed # True / False
|
|
431
|
+
result.score # 0.93 (proportion of rules passed)
|
|
432
|
+
result.raise_if_failed() # raises ValueError with details if any rule failed
|
|
433
|
+
|
|
434
|
+
# Fluent interface
|
|
435
|
+
lg.expect(df, "renda_mensal").not_null().between(0, 500_000).validate()
|
|
436
|
+
lg.expect(df, "email").contains("@").min_length(5).validate()
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
### 11. Streaming for large files
|
|
440
|
+
|
|
441
|
+
```python
|
|
442
|
+
import logus as lg
|
|
443
|
+
import os
|
|
444
|
+
|
|
445
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
446
|
+
|
|
447
|
+
# Yields pl.DataFrame chunks — never loads full file into memory
|
|
448
|
+
for chunk in lg.stream("grande.csv", salt=SALT, chunksize=50_000):
|
|
449
|
+
save_to_database(chunk)
|
|
450
|
+
|
|
451
|
+
# With progress callback
|
|
452
|
+
def progress(chunk_n, rows_done, total_estimate):
|
|
453
|
+
print(f"Chunk {chunk_n}: {rows_done:,} / ~{total_estimate:,} rows")
|
|
454
|
+
|
|
455
|
+
for chunk in lg.stream("grande.parquet", salt=SALT, on_progress=progress):
|
|
456
|
+
process(chunk)
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
### 12. Privacy metrics
|
|
460
|
+
|
|
461
|
+
```python
|
|
462
|
+
import logus as lg
|
|
463
|
+
|
|
464
|
+
# k-anonymity (ANPD recommends k ≥ 5)
|
|
465
|
+
report = lg.check.kanon(df, quasi_identifiers=["uf", "faixa_etaria", "escolaridade"])
|
|
466
|
+
print(f"k={report.k_anonymity.k_value} compliant={report.compliant_anpd}")
|
|
467
|
+
|
|
468
|
+
# Re-identification risk score (0=safe, 1=high risk)
|
|
469
|
+
report = lg.check.risk(df_safe, quasi_identifiers=["uf", "faixa_etaria"])
|
|
470
|
+
print(f"risk={report.risk_score:.2f} level={report.risk_level}")
|
|
471
|
+
|
|
472
|
+
# Utility preservation after masking
|
|
473
|
+
report = lg.check.utility(df_original, df_masked)
|
|
474
|
+
print(f"utility={report.overall_score:.0%}")
|
|
475
|
+
|
|
476
|
+
# Differential privacy
|
|
477
|
+
dp = lg.check.dp(epsilon=1.0)
|
|
478
|
+
noisy_mean = dp.laplace(df["renda"].mean(), sensitivity=df["renda"].max())
|
|
479
|
+
|
|
480
|
+
# Privacy score (composite 0–100)
|
|
481
|
+
profile = lg.profile(df)
|
|
482
|
+
print(f"Privacy score: {profile['privacy_score']['total']}/100 [{profile['privacy_score']['grade']}]")
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
### 13. SQL transpiler
|
|
486
|
+
|
|
487
|
+
```python
|
|
488
|
+
import logus as lg
import os
|
|
489
|
+
|
|
490
|
+
reports = lg.scan(df)
|
|
491
|
+
|
|
492
|
+
# Transform SELECT to mask PII inline (data never leaves the database)
|
|
493
|
+
safe_sql = lg.mask_sql(
|
|
494
|
+
"SELECT cpf, nome, email, renda_mensal, uf FROM clientes WHERE uf = 'SP'",
|
|
495
|
+
reports=reports,
|
|
496
|
+
dialect="postgresql",
|
|
497
|
+
salt=os.environ["LOGUS_SALT"],
|
|
498
|
+
)
|
|
499
|
+
# SELECT
|
|
500
|
+
# encode(hmac(cpf::text, 'salt', 'sha256'), 'hex') AS cpf,
|
|
501
|
+
# 'REDACTED' AS nome,
|
|
502
|
+
# encode(hmac(email::text, 'salt', 'sha256'), 'hex') AS email,
|
|
503
|
+
# renda_mensal,
|
|
504
|
+
# uf
|
|
505
|
+
# FROM clientes WHERE uf = 'SP'
|
|
506
|
+
|
|
507
|
+
# Generate CREATE VIEW
|
|
508
|
+
view_sql = lg.generate_view(df, "clientes", reports=reports, dialect="postgresql")
|
|
509
|
+
```
|
|
510
|
+
|
|
511
|
+
### 14. Data lineage
|
|
512
|
+
|
|
513
|
+
```python
|
|
514
|
+
import logus as lg
|
|
515
|
+
import os
|
|
516
|
+
|
|
517
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
518
|
+
|
|
519
|
+
# Automatic with lg.process()
|
|
520
|
+
result = lg.process(df, salt=SALT, track_lineage=True)
|
|
521
|
+
print(result.lineage.summary())
|
|
522
|
+
|
|
523
|
+
# Manual
|
|
524
|
+
tracker = lg.lineage.start("pipeline_crm")
|
|
525
|
+
tracker.origin(df, "clientes.parquet", format="parquet")
|
|
526
|
+
df_filtered = lg.where(df, uf="SP")
|
|
527
|
+
tracker.transform(df_filtered, operation="filter", detail="uf='SP'")
|
|
528
|
+
df_safe = lg.mask(df_filtered, salt=SALT)
|
|
529
|
+
tracker.mask(df_safe, columns=["cpf", "email"], salt=SALT)
|
|
530
|
+
tracker.export("output.lgs", format="lgs", encrypted=True)
|
|
531
|
+
|
|
532
|
+
tracker.to_json("lineage.json") # save lineage record
|
|
533
|
+
tracker.to_openlineage("openlineage.json") # OpenLineage-compatible format
|
|
534
|
+
|
|
535
|
+
# Context manager (prints summary automatically)
|
|
536
|
+
with lg.lineage.session("pipeline") as lin:
|
|
537
|
+
lin.origin(df, "source.csv")
|
|
538
|
+
lin.mask(df_safe, columns=["cpf", "email"], salt=SALT)
|
|
539
|
+
```
|
|
540
|
+
|
|
541
|
+
---
|
|
542
|
+
|
|
543
|
+
## Use Cases
|
|
544
|
+
|
|
545
|
+
### Masking CPFs in production logs
|
|
546
|
+
|
|
547
|
+
```python
|
|
548
|
+
import logus as lg
|
|
549
|
+
import os
|
|
550
|
+
|
|
551
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
552
|
+
KEY = os.environ["LOGUS_KEY"]
|
|
553
|
+
|
|
554
|
+
# One-time: mask production data
|
|
555
|
+
df_raw = lg.read("producao.parquet")
|
|
556
|
+
df_safe = lg.mask(df_raw, salt=SALT, verbose=True)
|
|
557
|
+
lg.store(df_safe, "producao_safe.lgs", key=KEY)
|
|
558
|
+
|
|
559
|
+
# Analytics team works with masked data
|
|
560
|
+
df = lg.read("producao_safe.lgs", key=KEY)
|
|
561
|
+
# CPFs are now: "3f2a8b1c9d4e7f0a" — tokens, not real values
|
|
562
|
+
```
|
|
563
|
+
|
|
564
|
+
### JOIN between masked tables (deterministic tokens)
|
|
565
|
+
|
|
566
|
+
```python
|
|
567
|
+
import logus as lg
|
|
568
|
+
import os
|
|
569
|
+
|
|
570
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
571
|
+
|
|
572
|
+
# Same CPF → same token in both tables → JOIN works
|
|
573
|
+
df_c_safe = lg.mask(df_clientes, salt=SALT)
|
|
574
|
+
df_p_safe = lg.mask(df_pedidos, salt=SALT)
|
|
575
|
+
|
|
576
|
+
result = lg.join(df_c_safe, df_p_safe, on="cpf")
|
|
577
|
+
# OR let logus apply the same SALT to both automatically:
|
|
578
|
+
result = lg.join(df_clientes, df_pedidos, on="cpf", salt=SALT)
|
|
579
|
+
```
|
|
580
|
+
|
|
581
|
+
### Processing a 10GB CSV without OOM
|
|
582
|
+
|
|
583
|
+
```python
|
|
584
|
+
import logus as lg
|
|
585
|
+
import os
|
|
586
|
+
|
|
587
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
588
|
+
|
|
589
|
+
# Option 1: stream in chunks
|
|
590
|
+
total_rows = 0
|
|
591
|
+
for chunk in lg.stream("big.csv", salt=SALT, chunksize=100_000):
|
|
592
|
+
save_to_db(chunk)
|
|
593
|
+
total_rows += len(chunk)
|
|
594
|
+
print(f"{total_rows:,} rows processed")
|
|
595
|
+
|
|
596
|
+
# Option 2: partial read for exploration
|
|
597
|
+
schema = lg.read("big.csv", header_only=True) # instant, zero data
|
|
598
|
+
sample = lg.read("big.csv", head=10_000) # first 10k rows
|
|
599
|
+
# For repeated access, convert to Parquet once:
|
|
600
|
+
df_full = lg.read("big.csv")
|
|
601
|
+
lg.write(df_full, "big.parquet") # subsequent reads 6× faster
|
|
602
|
+
```
|
|
603
|
+
|
|
604
|
+
### LGPD compliance check before sharing data
|
|
605
|
+
|
|
606
|
+
```python
|
|
607
|
+
import logus as lg
import os
|
|
608
|
+
|
|
609
|
+
df = lg.read("clientes.parquet")
|
|
610
|
+
|
|
611
|
+
# Full diagnostic
|
|
612
|
+
profile = lg.profile(df)
|
|
613
|
+
score = profile["privacy_score"]
|
|
614
|
+
print(f"Privacy Score: {score['total']}/100 [{score['grade']}]")
|
|
615
|
+
print(f"PII columns: {profile['pii_columns']}")
|
|
616
|
+
print(f"Recommendation: {score['recommendation']}")
|
|
617
|
+
|
|
618
|
+
# If score is acceptable, share safely
|
|
619
|
+
if score["total"] >= 75:
|
|
620
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
621
|
+
KEY = os.environ["LOGUS_KEY"]
|
|
622
|
+
df_safe = lg.mask(df, salt=SALT)
|
|
623
|
+
lg.store(df_safe, "clientes_para_parceiro.lgs", key=KEY)
|
|
624
|
+
```
|
|
625
|
+
|
|
626
|
+
### Database masking without data leaving the server
|
|
627
|
+
|
|
628
|
+
```python
|
|
629
|
+
import logus as lg
|
|
630
|
+
import os
|
|
631
|
+
|
|
632
|
+
# pip install "logus-lgpd[sql]"
|
|
633
|
+
|
|
634
|
+
# Option 1: Create masked view in the database
|
|
635
|
+
banco = lg.db("postgresql://user:pass@host/db", salt=os.environ["LOGUS_SALT"])
|
|
636
|
+
result = banco.create_masked_view("clientes") # creates clientes_masked view
|
|
637
|
+
# Now devs use: SELECT * FROM clientes_masked
|
|
638
|
+
|
|
639
|
+
# Option 2: Generate SQL to run yourself
|
|
640
|
+
reports = lg.scan(lg.read(banco, "clientes", head=500)) # sample for detection
|
|
641
|
+
view_sql = lg.link.sql(None, reports, table="clientes", dialect="postgresql")
|
|
642
|
+
print(view_sql) # → CREATE OR REPLACE VIEW clientes_masked AS ...
|
|
643
|
+
```
|
|
644
|
+
|
|
645
|
+
---
|
|
646
|
+
|
|
647
|
+
## API Reference
|
|
648
|
+
|
|
649
|
+
### I/O Functions
|
|
650
|
+
|
|
651
|
+
| Function | Signature | Returns |
|
|
652
|
+
|---|---|---|
|
|
653
|
+
| `lg.read` | `(source, *, key, salt, head, sample, n_chunks, chunks, iter_chunks, header_only, columns, ...)` | `pl.DataFrame \| dict \| Generator` |
|
|
654
|
+
| `lg.write` | `(df, path_or_conn, table=None, **kw)` | `None` |
|
|
655
|
+
| `lg.store` | `(source, path, *, key, salt, anonymize, compress, overwrite, metadata)` | `dict` |
|
|
656
|
+
| `lg.stream` | `(source, *, salt, chunksize, on_progress)` | `Generator[pl.DataFrame]` |
|
|
657
|
+
| `lg.open` | `(path, *, key, salt, compress)` | `LGSFile` |
|
|
658
|
+
| `lg.inspect` | `(path, *, key)` | `dict` |
|
|
659
|
+
| `lg.rekey` | `(path, *, old_key, new_key, output_path)` | `dict` |
|
|
660
|
+
| `lg.db` | `(uri, *, salt, dialect, pool_size)` | `DatabaseConnection` |
|
|
661
|
+
|
|
662
|
+
### Privacy Functions
|
|
663
|
+
|
|
664
|
+
| Function | Signature | Returns |
|
|
665
|
+
|---|---|---|
|
|
666
|
+
| `lg.scan` | `(source, *, key, sample_size, threshold)` | `Dict[str, ColumnReport]` |
|
|
667
|
+
| `lg.mask` | `(df, *, salt, columns, exclude, risk, strict, verbose)` | same type as input |
|
|
668
|
+
| `lg.diff` | `(original, masked, *, sample_size)` | `dict` |
|
|
669
|
+
| `lg.profile` | `(source, *, key, sample_size)` | `dict` |
|
|
670
|
+
| `lg.join` | `(left, right, on, *, salt, how)` | `pd.DataFrame` |
|
|
671
|
+
| `lg.process` | `(source, *, salt, key, output, where, rules, verbose, ...)` | `ProcessResult` |
|
|
672
|
+
| `lg.validate` | `(df, rules, *, severity, warn_only)` | `ValidationReport` |
|
|
673
|
+
| `lg.expect` | `(df, column)` | `_ColumnExpectation` (fluent) |
|
|
674
|
+
|
|
675
|
+
### Manipulation Functions
|
|
676
|
+
|
|
677
|
+
All accept `pd.DataFrame`, `pl.DataFrame`, and `pl.LazyFrame`. All preserve input type.
|
|
678
|
+
|
|
679
|
+
| Function | SQL equivalent |
|
|
680
|
+
|---|---|
|
|
681
|
+
| `lg.where(df, **kwargs)` | `WHERE` |
|
|
682
|
+
| `lg.select(df, cols)` | `SELECT col1, col2` |
|
|
683
|
+
| `lg.drop(df, cols)` | `SELECT * EXCEPT(col)` |
|
|
684
|
+
| `lg.rename(df, mapping)` | `SELECT col AS new_name` |
|
|
685
|
+
| `lg.sort(df, by, desc=)` | `ORDER BY` |
|
|
686
|
+
| `lg.groupby(df, by, agg)` | `GROUP BY` |
|
|
687
|
+
| `lg.add_column(df, **exprs)` | `SELECT *, expr AS name` |
|
|
688
|
+
| `lg.when(cond, val).otherwise(d)` | `CASE WHEN` |
|
|
689
|
+
| `lg.unique(df, subset)` | `SELECT DISTINCT` |
|
|
690
|
+
| `lg.head(df, n)` | `LIMIT N` |
|
|
691
|
+
| `lg.top_n(df, n, by, group_by=)` | `RANK() OVER (PARTITION BY ...)` |
|
|
692
|
+
| `lg.concat(frames)` | `UNION ALL` |
|
|
693
|
+
| `lg.pivot(df, ...)` | `PIVOT` |
|
|
694
|
+
| `lg.melt(df, ...)` | `UNPIVOT` |
|
|
695
|
+
| `lg.fill_null(df, value)` | `COALESCE` |
|
|
696
|
+
| `lg.cast(df, schema)` | `CAST(col AS type)` |
|
|
697
|
+
|
|
698
|
+
### Aliases
|
|
699
|
+
|
|
700
|
+
| Alias | Points to |
|
|
701
|
+
|---|---|
|
|
702
|
+
| `lg.q` | `lg.where` |
|
|
703
|
+
| `lg.filter_` | `lg.where` |
|
|
704
|
+
| `lg.order_by` | `lg.sort` |
|
|
705
|
+
| `lg.group_by` | `lg.groupby` |
|
|
706
|
+
| `lg.distinct` | `lg.unique` |
|
|
707
|
+
| `lg.union_all` | `lg.concat` |
|
|
708
|
+
| `lg.limit` | `lg.head` |
|
|
709
|
+
| `lg.unpivot` | `lg.melt` |
|
|
710
|
+
| `lg.fillna` | `lg.fill_null` |
|
|
711
|
+
| `lg.coalesce` | `lg.fill_null` |
|
|
712
|
+
| `lg.assign` | `lg.add_column` |
|
|
713
|
+
| `lg.save` | `lg.store` |
|
|
714
|
+
| `lg.load` | `lg.read` |
|
|
715
|
+
|
|
716
|
+
---
|
|
717
|
+
|
|
718
|
+
## Configuration
|
|
719
|
+
|
|
720
|
+
```python
|
|
721
|
+
import logus as lg
|
|
722
|
+
import os
|
|
723
|
+
|
|
724
|
+
# Set global defaults at application startup
|
|
725
|
+
lg.configure(
|
|
726
|
+
default_salt = os.environ["LOGUS_SALT"], # used when salt= not passed to mask()
|
|
727
|
+
audit_path = "./audit/", # auto-create audit trail
|
|
728
|
+
)
|
|
729
|
+
|
|
730
|
+
# Generate secrets (do this once, save to .env)
|
|
731
|
+
SALT = lg.generate_salt() # 256-bit, base64-encoded
|
|
732
|
+
KEY = lg.generate_salt() # different value from SALT
|
|
733
|
+
print(f"LOGUS_SALT={SALT}")
|
|
734
|
+
print(f"LOGUS_KEY={KEY}")
|
|
735
|
+
```
|
|
736
|
+
|
|
737
|
+
---
|
|
738
|
+
|
|
739
|
+
## The .lgs Format
|
|
740
|
+
|
|
741
|
+
`.lgs` is a binary container for encrypted tabular data:
|
|
742
|
+
|
|
743
|
+
- **Encryption:** AES-256-GCM (NIST SP 800-38D) — confidentiality + integrity
|
|
744
|
+
- **Key derivation:** HKDF-SHA256 (RFC 5869) — unique DEK per file
|
|
745
|
+
- **Payload:** Parquet with zstd compression (or lz4 for speed)
|
|
746
|
+
- **Header (v2.1):** JSON with schema, column stats, and LGPD metadata — readable without decryption via `lg.inspect()`
|
|
747
|
+
- **Integrity:** HMAC-SHA256 over all preceding bytes of the file (stored as the trailing `FILE_HMAC`) — detects tampering
|
|
748
|
+
|
|
749
|
+
```
|
|
750
|
+
[5 bytes] MAGIC = b"LOGUS"
|
|
751
|
+
[1 byte] VERSION = 0x02
|
|
752
|
+
[1 byte] CIPHER = 0x01 (AES-256-GCM)
|
|
753
|
+
[32 bytes] SALT_KDF — unique per file
|
|
754
|
+
[12 bytes] NONCE — for header encryption
|
|
755
|
+
[4 bytes] HEADER_LEN
|
|
756
|
+
[N+16 bytes] HEADER_CT+TAG — encrypted JSON metadata
|
|
757
|
+
[12 bytes] NONCE — for payload
|
|
758
|
+
[M+16 bytes] PAYLOAD_CT+TAG — encrypted Parquet
|
|
759
|
+
[32 bytes] FILE_HMAC
|
|
760
|
+
```
|
|
761
|
+
|
|
762
|
+
---
|
|
763
|
+
|
|
764
|
+
## Requirements
|
|
765
|
+
|
|
766
|
+
| Package | Min version | Purpose |
|
|
767
|
+
|---|---|---|
|
|
768
|
+
| `polars` | 1.0.0 | Data engine (required) |
|
|
769
|
+
| `pandas` | 2.0.0 | Fallback + Excel/SAS/SPSS formats (required) |
|
|
770
|
+
| `pyarrow` | 14.0.0 | Parquet I/O (required) |
|
|
771
|
+
| `cryptography` | 41.0.0 | AES-256-GCM, HKDF (required) |
|
|
772
|
+
| `numpy` | 1.24.0 | Numeric operations (required) |
|
|
773
|
+
| `duckdb` | 0.10.0 | `lg.sql()`, `lg.db()` (optional: `[sql]`) |
|
|
774
|
+
| `sqlalchemy` | 2.0.0 | `lg.db()` fallback (optional: `[sql]`) |
|
|
775
|
+
| `openpyxl` | 3.1.0 | Excel read/write (optional: `[excel]`) |
|
|
776
|
+
| `ctgan` | 0.9.0 | `lg.clone()` synthetic data (optional: `[synthetic]`) |
|
|
777
|
+
|
|
778
|
+
---
|
|
779
|
+
|
|
780
|
+
## License
|
|
781
|
+
|
|
782
|
+
GNU Affero General Public License v3 (AGPL-3.0)
|
|
783
|
+
|
|
784
|
+
---
|
|
785
|
+
|
|
786
|
+
## Changelog
|
|
787
|
+
|
|
788
|
+
See [CHANGELOG.md](CHANGELOG.md).
|
|
789
|
+
|
|
790
|
+
| Version | Highlights |
|
|
791
|
+
|---|---|
|
|
792
|
+
| 1.0.5 | `lg.read()` big-data params (head/sample/chunks/iter_chunks), `lg.db()`, mask(LazyFrame), stream via scan_csv batches, CSV sidecar index, 199 tests |
|
|
793
|
+
| 1.1.0 | `lg.validate()`, `lg.mask_sql()`, `lg.lineage`, privacy score in profile(), `lg.process()` |
|
|
794
|
+
| 1.0.4 | `polars>=1.0.0` required, FastPIIScanner (9× faster), `__init__.py` cleaned |
|
|
795
|
+
|
|
796
|
+
---
|
|
797
|
+
|
|
798
|
+
## Novidades v1.0.5
|
|
799
|
+
|
|
800
|
+
### 1. `lg.contract()` — Contrato de Dados
|
|
801
|
+
|
|
802
|
+
```python
|
|
803
|
+
import logus as lg
|
|
804
|
+
import os
|
|
805
|
+
|
|
806
|
+
SALT = os.environ["LOGUS_SALT"]
|
|
807
|
+
|
|
808
|
+
contrato = lg.contract({
|
|
809
|
+
"cpf": {"type": "str", "not_null": True, "unique": True,
|
|
810
|
+
"pii": "CPF", "mask": "hash"},
|
|
811
|
+
"renda": {"type": "float", "min": 0, "max": 500_000,
|
|
812
|
+
"pii": "numerico", "mask": "mock_numeric"},
|
|
813
|
+
"uf": {"type": "str", "in": ["SP","RJ","MG","RS","BA","PR","SC","GO","PE","CE"]},
|
|
814
|
+
"email": {"type": "str", "contains": "@",
|
|
815
|
+
"pii": "email", "mask": "hash"},
|
|
816
|
+
}, name="clientes_v2", version="2.0")
|
|
817
|
+
|
|
818
|
+
# Aplica: valida → mascara → retorna tudo em uma chamada
|
|
819
|
+
result = contrato.apply(df, salt=SALT)
|
|
820
|
+
result.raise_if_failed() # levanta ValueError se alguma regra falhou
|
|
821
|
+
df_safe = result.df # DataFrame mascarado
|
|
822
|
+
|
|
823
|
+
# Versionar e comparar
|
|
824
|
+
contrato.save("schema/clientes_v2.contract.json")
|
|
825
|
+
c_v1 = lg.DataContract.load("schema/clientes_v1.contract.json")
|
|
826
|
+
diff = contrato.diff(c_v1)
|
|
827
|
+
print(diff.has_breaking_changes)
|
|
828
|
+
print(diff.report())
|
|
829
|
+
|
|
830
|
+
# Exportar como JSON Schema (para DPOs e equipes não-Python)
|
|
831
|
+
schema = contrato.to_json_schema() # dicionário JSON Schema draft-07
|
|
832
|
+
```
|
|
833
|
+
|
|
834
|
+
### 2. Padrões PII customizados
|
|
835
|
+
|
|
836
|
+
```python
|
|
837
|
+
import logus as lg
|
|
838
|
+
from logus.detectors.fast_scan import FastPIIScanner
|
|
839
|
+
|
|
840
|
+
# Detecta identificadores proprietários da sua empresa
|
|
841
|
+
reports = lg.scan(df, custom_patterns={
|
|
842
|
+
"num_contrato": r"^CTR-[0-9]{8}$",
|
|
843
|
+
"matricula": r"^[0-9]{6}-[A-Z]$",
|
|
844
|
+
"protocolo": r"^PROT-[0-9]{4}-[0-9]{6}$",
|
|
845
|
+
})
|
|
846
|
+
```
|
|
847
|
+
|
|
848
|
+
### 3. Criptografia assimétrica no `.lgs`
|
|
849
|
+
|
|
850
|
+
```python
|
|
851
|
+
import logus as lg
|
|
852
|
+
|
|
853
|
+
# Gera par de chaves (uma vez por usuário/serviço)
|
|
854
|
+
priv, pub = lg.generate_keypair("ec") # P-256 — recomendado
|
|
855
|
+
priv, pub = lg.generate_keypair("rsa", 4096)
|
|
856
|
+
|
|
857
|
+
lg.save_keypair(priv, "chave.pem", pub, "chave_publica.pem")
|
|
858
|
+
pub = lg.load_public_key("chave_publica.pem")
|
|
859
|
+
priv = lg.load_private_key("chave.pem")
|
|
860
|
+
|
|
861
|
+
# Remetente: cifra com a chave pública do destinatário
|
|
862
|
+
lg.store(df, "dados.lgs", public_key=pub)
|
|
863
|
+
|
|
864
|
+
# Destinatário: decifra com sua chave privada
|
|
865
|
+
df = lg.read("dados.lgs", private_key=priv)
|
|
866
|
+
|
|
867
|
+
# Multi-recipient: uma vez para N destinatários
|
|
868
|
+
from logus.asymmetric import encrypt_dek_multi, decrypt_dek_from_list
|
|
869
|
+
encs = encrypt_dek_multi(dek, [pub_ana, pub_bruno, pub_carlos])
|
|
870
|
+
dek = decrypt_dek_from_list(encs, priv_ana)
|
|
871
|
+
```
|
|
872
|
+
|
|
873
|
+
### 4. Expiração de arquivo (LGPD Art. 16)
|
|
874
|
+
|
|
875
|
+
```python
|
|
876
|
+
import logus as lg
|
|
877
|
+
|
|
878
|
+
# Arquivo recusa leitura após a data
|
|
879
|
+
lg.store(df, "dados_campanha.lgs", key=KEY, expires_at="2025-12-31")
|
|
880
|
+
|
|
881
|
+
# Depois da data:
|
|
882
|
+
lg.read("dados_campanha.lgs", key=KEY) # → ExpiredFileError
|
|
883
|
+
```
|
|
884
|
+
|
|
885
|
+
### 5. PII em JSON aninhado (`pl.Struct` e `pl.List`)
|
|
886
|
+
|
|
887
|
+
```python
|
|
888
|
+
import logus as lg
|
|
889
|
+
import polars as pl
|
|
890
|
+
from logus.detectors.fast_scan import FastPIIScanner
|
|
891
|
+
|
|
892
|
+
# Polars 1.x: dados JSON em colunas estruturadas
|
|
893
|
+
df = pl.DataFrame({
|
|
894
|
+
"pessoa": [{"cpf": "111.444.777-35", "nome": "Ana Silva"}, ...],
|
|
895
|
+
"emails": [["ana@empresa.com", "ana@gmail.com"], ...],
|
|
896
|
+
})
|
|
897
|
+
|
|
898
|
+
# FastPIIScanner desaninha e detecta automaticamente
|
|
899
|
+
reports = FastPIIScanner().detect_dict(df)
|
|
900
|
+
# → {"pessoa.cpf": ColumnReport, "pessoa.nome": ColumnReport, "emails[]": ColumnReport}
|
|
901
|
+
```
|
|
902
|
+
|
|
903
|
+
### 6. Auto-detecção de `.env`
|
|
904
|
+
|
|
905
|
+
```python
|
|
906
|
+
import logus as lg
|
|
907
|
+
|
|
908
|
+
# Carrega LOGUS_SALT e LOGUS_KEY do arquivo .env automaticamente
|
|
909
|
+
lg.configure(load_dotenv=True) # procura .env no diretório atual
|
|
910
|
+
lg.configure(load_dotenv=True, dotenv_path="config/.env")
|
|
911
|
+
|
|
912
|
+
# Depois disso: lg.mask(df) usa LOGUS_SALT sem precisar de salt= explícito
|
|
913
|
+
```
|
|
914
|
+
|
|
915
|
+
### 7. `banco.create_table()` e `banco.upsert()`
|
|
916
|
+
|
|
917
|
+
```python
|
|
918
|
+
import logus as lg
|
|
919
|
+
import os
|
|
920
|
+
|
|
921
|
+
banco = lg.db("postgresql://user:pass@host/db", salt=os.environ["LOGUS_SALT"])
|
|
922
|
+
|
|
923
|
+
# Cria tabela a partir do schema do DataFrame
|
|
924
|
+
banco.create_table(df, "clientes")
|
|
925
|
+
banco.create_table(df, "clientes", if_exists="replace")
|
|
926
|
+
|
|
927
|
+
# Upsert — INSERT ... ON CONFLICT ... DO UPDATE
|
|
928
|
+
banco.upsert(df_new, "clientes", on="cpf")
|
|
929
|
+
banco.upsert(df_new, "clientes", on=["cpf", "data"])
|
|
930
|
+
|
|
931
|
+
# Também via lg.write()
|
|
932
|
+
lg.write(df_safe, banco, "clientes_masked")
|
|
933
|
+
```
|
|
934
|
+
|
|
935
|
+
### 8. Relatório de Conformidade LGPD (HTML/PDF)
|
|
936
|
+
|
|
937
|
+
```python
|
|
938
|
+
import logus as lg
|
|
939
|
+
|
|
940
|
+
reports = lg.scan(df)
|
|
941
|
+
report = lg.compliance_report(
|
|
942
|
+
df, reports,
|
|
943
|
+
dataset_name = "Clientes Q1 2025",
|
|
944
|
+
organization = "Empresa XYZ",
|
|
945
|
+
)
|
|
946
|
+
|
|
947
|
+
report.to_html("lgpd_jan2025.html") # relatório HTML completo
|
|
948
|
+
report.to_pdf("lgpd_jan2025.pdf") # PDF (pip install weasyprint)
|
|
949
|
+
report.to_json("lgpd_jan2025.json") # JSON serializado
|
|
950
|
+
print(report.to_text()) # texto simples, sempre disponível
|
|
951
|
+
```
|
|
952
|
+
|
|
953
|
+
### 9. `lg.validate_schema()` + `lg.save_rules()` / `lg.load_rules()`
|
|
954
|
+
|
|
955
|
+
```python
|
|
956
|
+
import logus as lg
|
|
957
|
+
|
|
958
|
+
# Valida o schema (estrutura) antes de processar os dados
|
|
959
|
+
result = lg.validate_schema(df,
|
|
960
|
+
required_columns = ["cpf", "renda_mensal", "uf"],
|
|
961
|
+
forbidden_columns= ["senha", "token"],
|
|
962
|
+
min_rows = 1,
|
|
963
|
+
max_rows = 10_000_000,
|
|
964
|
+
)
|
|
965
|
+
result.raise_if_failed()
|
|
966
|
+
|
|
967
|
+
# Versionar regras de validação como JSON
|
|
968
|
+
lg.save_rules({
|
|
969
|
+
"cpf": {"not_null": True, "unique": True},
|
|
970
|
+
"renda_mensal": {"min": 0, "max": 500_000},
|
|
971
|
+
"email": {"contains": "@"},
|
|
972
|
+
}, "regras/clientes_v2.json")
|
|
973
|
+
|
|
974
|
+
rules = lg.load_rules("regras/clientes_v2.json")
|
|
975
|
+
result = lg.validate(df, rules)
|
|
976
|
+
result.print_report()
|
|
977
|
+
```
|
|
978
|
+
|
|
979
|
+
### 10. `lg.shift()`, `lg.lag()`, `lg.lead()`, `lg.explode()`
|
|
980
|
+
|
|
981
|
+
```python
|
|
982
|
+
import logus as lg
|
|
983
|
+
import polars as pl
|
|
984
|
+
|
|
985
|
+
# Séries temporais: lag e lead
|
|
986
|
+
df_ts = pl.DataFrame({"renda": [5000., 5200., 4800., 5100., 5300.]})
|
|
987
|
+
|
|
988
|
+
lg.shift(df_ts, 1) # valor do período anterior
|
|
989
|
+
lg.shift(df_ts, -1) # valor do período seguinte
|
|
990
|
+
lg.lag(df_ts, 3, columns="renda") # alias — lag de 3 períodos
|
|
991
|
+
lg.lead(df_ts, 1) # alias — próximo valor
|
|
992
|
+
lg.shift(df_ts, 1, fill_value=0.0) # preenche nulos com 0
|
|
993
|
+
|
|
994
|
+
# Dados aninhados: expande listas em linhas
|
|
995
|
+
df_tags = pl.DataFrame({
|
|
996
|
+
"id": [1, 2, 3],
|
|
997
|
+
"tags": [["lgpd", "privacidade"], ["python", "polars"], ["dados"]],
|
|
998
|
+
})
|
|
999
|
+
lg.explode(df_tags, "tags")
|
|
1000
|
+
# Resultado: 5 linhas — uma por tag
|
|
1001
|
+
|
|
1002
|
+
# Múltiplas colunas simultâneas
|
|
1003
|
+
lg.explode(df, ["tags", "scores"]) # scores e tags devem ter o mesmo comprimento
|
|
1004
|
+
```
|