datalock 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. datalock-1.0.0.dist-info/METADATA +1004 -0
  2. datalock-1.0.0.dist-info/RECORD +60 -0
  3. datalock-1.0.0.dist-info/WHEEL +5 -0
  4. datalock-1.0.0.dist-info/entry_points.txt +2 -0
  5. datalock-1.0.0.dist-info/licenses/LICENSE +632 -0
  6. datalock-1.0.0.dist-info/top_level.txt +1 -0
  7. logus/__init__.py +1967 -0
  8. logus/_defaults.py +6 -0
  9. logus/adapters/__init__.py +17 -0
  10. logus/adapters/db_adapter.py +831 -0
  11. logus/adapters/pandas_adapter.py +496 -0
  12. logus/adapters/polars_adapter.py +849 -0
  13. logus/adapters/sql_adapter.py +655 -0
  14. logus/analytics.py +1119 -0
  15. logus/asymmetric.py +335 -0
  16. logus/check.py +188 -0
  17. logus/cli.py +232 -0
  18. logus/contract.py +711 -0
  19. logus/core.py +516 -0
  20. logus/detectors/__init__.py +19 -0
  21. logus/detectors/fast_scan.py +431 -0
  22. logus/detectors/pii_detector.py +687 -0
  23. logus/detectors/sensitive_detector.py +369 -0
  24. logus/detectors/text_detector.py +183 -0
  25. logus/expr.py +429 -0
  26. logus/generators/__init__.py +21 -0
  27. logus/generators/identity_mocker.py +291 -0
  28. logus/generators/tabular_generative.py +620 -0
  29. logus/io_big.py +939 -0
  30. logus/lgs.py +280 -0
  31. logus/lineage.py +463 -0
  32. logus/link.py +121 -0
  33. logus/maskers/__init__.py +14 -0
  34. logus/maskers/date_masker.py +158 -0
  35. logus/maskers/hashing.py +229 -0
  36. logus/maskers/text_masker.py +140 -0
  37. logus/maskers/truncation.py +203 -0
  38. logus/metrics/__init__.py +63 -0
  39. logus/metrics/differential_privacy.py +666 -0
  40. logus/metrics/fidelity.py +600 -0
  41. logus/metrics/kanonymity.py +486 -0
  42. logus/metrics/risk_score.py +351 -0
  43. logus/metrics/tcloseness.py +387 -0
  44. logus/metrics/utility.py +440 -0
  45. logus/mockers/__init__.py +6 -0
  46. logus/mockers/category_mocker.py +181 -0
  47. logus/mockers/numeric_mocker.py +217 -0
  48. logus/privacy_score.py +255 -0
  49. logus/processor.py +386 -0
  50. logus/py.typed +0 -0
  51. logus/reports/__init__.py +5 -0
  52. logus/reports/audit_report.py +256 -0
  53. logus/reports/compliance_report.py +331 -0
  54. logus/secure_file.py +1850 -0
  55. logus/sql_transpiler.py +390 -0
  56. logus/utils/__init__.py +8 -0
  57. logus/utils/frames.py +51 -0
  58. logus/utils/salt.py +118 -0
  59. logus/utils/secret_str.py +84 -0
  60. logus/validate.py +594 -0
@@ -0,0 +1,1004 @@
1
+ Metadata-Version: 2.4
2
+ Name: datalock
3
+ Version: 1.0.0
4
+ Summary: Privacy-by-Design para dados tabulares — LGPD compliance em Python.
5
+ Author-email: Leonardo Borges <leonardoborges6947@gmail.com>
6
+ License: GNU AGPLv3
7
+ Project-URL: Homepage, https://github.com/logus-lgpd/logus-lgpd
8
+ Project-URL: Documentation, https://github.com/logus-lgpd/logus-lgpd#readme
9
+ Project-URL: Changelog, https://github.com/logus-lgpd/logus-lgpd/blob/main/CHANGELOG.md
10
+ Project-URL: Source Code, https://github.com/logus-lgpd/logus-lgpd
11
+ Keywords: lgpd,privacy,pii,masking,anonymization,polars,pandas,data-privacy,gdpr,lgs,aes-256,encryption,hmac,pseudonymization,data-quality,k-anonymity,differential-privacy,compliance,brasil,cpf,cnpj
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Intended Audience :: Financial and Insurance Industry
16
+ Classifier: Intended Audience :: Healthcare Industry
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: License :: OSI Approved :: GNU Affero General Public License v3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Security :: Cryptography
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Classifier: Typing :: Typed
25
+ Classifier: Natural Language :: Portuguese (Brazilian)
26
+ Classifier: Topic :: Database
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Classifier: Topic :: Security
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Environment :: Console
31
+ Classifier: Framework :: Jupyter
32
+ Requires-Python: >=3.10
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Requires-Dist: polars>=1.0.0
36
+ Requires-Dist: pyarrow>=14.0.0
37
+ Requires-Dist: cryptography>=41.0.0
38
+ Requires-Dist: numpy>=1.24.0
39
+ Requires-Dist: pandas>=2.0.0
40
+ Provides-Extra: excel
41
+ Requires-Dist: openpyxl>=3.1.0; extra == "excel"
42
+ Provides-Extra: synthetic
43
+ Requires-Dist: ctgan>=0.9.0; extra == "synthetic"
44
+ Requires-Dist: faker>=18.0.0; extra == "synthetic"
45
+ Requires-Dist: scikit-learn>=1.3.0; extra == "synthetic"
46
+ Provides-Extra: sql
47
+ Requires-Dist: duckdb>=0.10.0; extra == "sql"
48
+ Requires-Dist: sqlalchemy>=2.0.0; extra == "sql"
49
+ Provides-Extra: full
50
+ Requires-Dist: openpyxl>=3.1.0; extra == "full"
51
+ Requires-Dist: duckdb>=0.10.0; extra == "full"
52
+ Requires-Dist: sqlalchemy>=2.0.0; extra == "full"
53
+ Requires-Dist: ctgan>=0.9.0; extra == "full"
54
+ Requires-Dist: faker>=18.0.0; extra == "full"
55
+ Requires-Dist: scikit-learn>=1.3.0; extra == "full"
56
+ Provides-Extra: dev
57
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
58
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
59
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
60
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
61
+ Dynamic: license-file
62
+
63
+ # logus-lgpd
64
+
65
+ **logus-lgpd** is a Python library for privacy-by-design with tabular data.
66
+ It provides LGPD-compliant PII detection and masking, AES-256-GCM encrypted
67
+ file storage (`.lgs` format), and a SQL-like DSL for data manipulation built
68
+ on top of Polars.
69
+
70
+ ```
71
+ pip install datalock
72
+ ```
73
+
74
+ ```python
75
+ import logus as lg
76
+ import os
77
+
78
+ SALT = os.environ["LOGUS_SALT"]
79
+ KEY = os.environ["LOGUS_KEY"]
80
+
81
+ df = lg.read("clientes.csv") # any format → pl.DataFrame
82
+ df_safe = lg.mask(df, salt=SALT) # detect + mask PII (LGPD)
83
+ lg.store(df_safe, "clientes.lgs", key=KEY) # AES-256-GCM encrypted
84
+ df_back = lg.read("clientes.lgs", key=KEY) # decrypt and read back
85
+ ```
86
+
87
+ ---
88
+
89
+ ## What logus-lgpd does
90
+
91
+ | Capability | Function |
92
+ |---|---|
93
+ | Read any tabular format | `lg.read()` |
94
+ | Detect PII automatically | `lg.scan()` |
95
+ | Mask PII (HMAC-SHA256) | `lg.mask()` |
96
+ | Save with AES-256-GCM | `lg.store()` |
97
+ | Expressive data manipulation | `lg.where()`, `lg.groupby()`, `lg.add_column()` |
98
+ | Full pipeline in one call | `lg.process()` |
99
+ | Data quality validation | `lg.validate()` |
100
+ | Database with masking | `lg.db()` |
101
+ | Privacy metrics | `lg.check.kanon()`, `lg.check.risk()` |
102
+
103
+ ---
104
+
105
+ ## Installation
106
+
107
+ ```bash
108
+ # Core (Polars + pandas + pyarrow + cryptography)
109
+ pip install datalock
110
+
111
+ # With SQL via DuckDB (lg.sql, lg.db)
112
+ pip install "datalock[sql]"
113
+
114
+ # With Excel support (.xlsx, .ods)
115
+ pip install "datalock[excel]"
116
+
117
+ # With synthetic data generation (lg.clone, lg.sandbox)
118
+ pip install "datalock[synthetic]"
119
+
120
+ # Everything
121
+ pip install "datalock[full]"
122
+ ```
123
+
124
+ **Requires:** Python ≥ 3.10, Polars ≥ 1.0, pandas ≥ 2.0, pyarrow ≥ 14.0
125
+
126
+ ---
127
+
128
+ ## Quick Start
129
+
130
+ ### 1. Read any file format
131
+
132
+ ```python
133
+ import logus as lg
134
+
135
+ # All return pl.DataFrame; auto-detects encoding
136
+ df = lg.read("clientes.csv")
137
+ df = lg.read("clientes.parquet")
138
+ df = lg.read("clientes.xlsx") # pip install "logus-lgpd[excel]"
139
+ df = lg.read("clientes.lgs", key=KEY) # decrypt .lgs
140
+
141
+ # CSV with non-default separator and encoding
142
+ df = lg.read("clientes.csv", sep=";", encoding="latin-1")
143
+
144
+ # Partial read for large files (no OOM)
145
+ df = lg.read("big.parquet", head=100_000)
146
+ df = lg.read("big.parquet", sample=500_000) # random row groups
147
+ info = lg.read("big.parquet", header_only=True) # schema + shape, zero data read
148
+ df = lg.read("big.parquet", n_chunks=5, chunks=[2, 4])
149
+ for chunk in lg.read("big.parquet", n_chunks=10, iter_chunks=True):
150
+     process(chunk) # never loads full file
151
+ ```
152
+
153
+ ### 2. Detect and mask PII (LGPD)
154
+
155
+ ```python
156
+ import logus as lg
157
+ import os
158
+
159
+ SALT = os.environ["LOGUS_SALT"] # HMAC key — store in env, never hardcode
160
+
161
+ # Detect PII columns automatically
162
+ reports = lg.scan(df)
163
+ for col, r in reports.items():
164
+     print(f"{col}: {r.pii_type.value} risk={r.risk_level.value} → {r.mask_strategy.value}")
165
+ # cpf: cpf risk=high → hash
166
+ # email: email risk=high → hash
167
+ # nome: nome risk=medium → redact
168
+ # cep: cep risk=low → truncate (01310-XXX)
169
+ # data_nasc:data_nascimento risk=medium → generalize_date (1985-03-XX)
170
+
171
+ # Mask PII — preserves input type (pd→pd, pl→pl)
172
+ df_safe = lg.mask(df, salt=SALT)
173
+ df_safe = lg.mask(df, salt=SALT, columns=["cpf", "email"]) # only these
174
+ df_safe = lg.mask(df, salt=SALT, exclude=["uf"]) # all except uf
175
+ df_safe = lg.mask(df, salt=SALT, risk="high") # only high-risk PII
176
+
177
+ # Deterministic: same value + same salt → same token (essential for JOINs)
178
+ # CPF "111.444.777-35" → always "3f2a8b1c9d4e7f0a" with the same SALT
179
+ ```
180
+
181
+ ### 3. Save and read encrypted (.lgs)
182
+
183
+ ```python
184
+ import logus as lg
185
+ import os
186
+
187
+ KEY = os.environ["LOGUS_KEY"] # AES-256 key — different from SALT
188
+ SALT = os.environ["LOGUS_SALT"]
189
+
190
+ # Save
191
+ lg.store(df, "clientes.lgs", key=KEY) # encrypt, data as-is
192
+ lg.store(df, "clientes.lgs", key=KEY, salt=SALT) # mask + encrypt
193
+ lg.store({"clients": df1, "orders": df2}, "base.lgs", key=KEY) # multi-frame
194
+
195
+ # Read metadata WITHOUT decrypting the payload
196
+ info = lg.inspect("clientes.lgs", key=KEY)
197
+ # {"version":"2.1","shape":[150000,12],"columns":["cpf","nome",...],
198
+ # "column_stats":{"cpf":{"n_nulls":0,"n_unique":150000},...},
199
+ # "content_type":"masked_dataframe","encryption":"AES-256-GCM"}
200
+
201
+ # Read
202
+ df = lg.read("clientes.lgs", key=KEY)
203
+ frames = lg.read("base.lgs", key=KEY) # dict[str, pl.DataFrame]
204
+ df_cli = lg.read("base.lgs", key=KEY, frame="clients")
205
+
206
+ # OO interface
207
+ with lg.open("clientes.lgs", key=KEY) as f:
208
+     df = f.read()
209
+     info = f.info()
210
+     frames = f.frames()
211
+     f.write(df_updated)
212
+     f.add_frame("novos", df_new)
213
+
214
+ # Rotate encryption key (without exposing data as file)
215
+ lg.rekey("clientes.lgs", old_key=OLD_KEY, new_key=NEW_KEY)
216
+ ```
217
+
218
+ ### 4. Manipulate data (SQL-like DSL)
219
+
220
+ All functions work with `pd.DataFrame`, `pl.DataFrame`, and `pl.LazyFrame`.
221
+ They preserve the input type — Polars in, Polars out.
222
+
223
+ ```python
224
+ import logus as lg
225
+
226
+ # WHERE
227
+ df = lg.where(df, uf="SP")
228
+ df = lg.where(df, uf=["SP", "RJ", "MG"]) # isin
229
+ df = lg.where(df, renda_mensal=(">", 5_000)) # operator
230
+ df = lg.where(df, renda_mensal=(5_000, 15_000)) # between
231
+ df = lg.where(df, nome=("contains", "Silva")) # string contains
232
+ df = lg.where(df, lg.col("renda") > lg.col("media")) # expression
233
+ df = lg.where(df, uf="SP", tipo_pessoa="PF") # AND (multiple kwargs)
234
+
235
+ # SELECT / DROP / RENAME
236
+ df = lg.select(df, ["cpf", "renda", "uf"])
237
+ df = lg.drop(df, "coluna_inutil")
238
+ df = lg.rename(df, {"cpf": "documento"})
239
+
240
+ # ORDER BY
241
+ df = lg.sort(df, "renda_mensal", desc=True)
242
+ df = lg.sort(df, ["uf", "renda"], ascending=[True, False])
243
+
244
+ # GROUP BY with HAVING, ORDER BY, LIMIT
245
+ resultado = lg.groupby(df, "uf", {
246
+ "n": ("*", "count"),
247
+ "media": ("renda_mensal", "mean"),
248
+ "total": ("renda_mensal", "sum"),
249
+ "unicos": ("cpf", "n_unique"),
250
+ }, having={"n": (">", 100)}, sort="media", desc=True, limit=10)
251
+
252
+ # ADD COLUMN with expressions, CASE WHEN, window functions
253
+ df = lg.add_column(df,
254
+ imposto = lg.col("renda_mensal") * 0.275,
255
+ faixa = lg.when(lg.col("renda_mensal") > 10_000, "alta")
256
+ .when(lg.col("renda_mensal") > 5_000, "media")
257
+ .otherwise("baixa"),
258
+ rank_uf = lg.col("renda_mensal").rank("dense", descending=True).over("uf"),
259
+ media_uf = lg.col("renda_mensal").mean().over("uf"),
260
+ nome_lower = lg.col("nome").str.to_lowercase(),
261
+ ano_nasc = lg.col("data_nasc").str.to_date("%Y-%m-%d").dt.year(),
262
+ )
263
+
264
+ # DISTINCT
265
+ df = lg.unique(df, "cpf") # one row per CPF
266
+ df = lg.unique(df, ["uf", "tipo"], keep="first")
267
+
268
+ # PIVOT / MELT
269
+ pv = lg.pivot(df, index="uf", columns="tipo", values="renda", aggfunc="mean")
270
+ ml = lg.melt(df, id_cols=["uf"], value_cols=["renda_jan", "renda_fev"])
271
+
272
+ # TOP N per group
273
+ top3 = lg.top_n(df, 3, "renda_mensal", group_by="uf")
274
+
275
+ # FILL NULL / CAST
276
+ df = lg.fill_null(df, {"renda": 0, "uf": "DESCONHECIDO"})
277
+ df = lg.cast(df, {"renda": "float32", "inadimplente": "bool"})
278
+
279
+ # CONCAT
280
+ df_all = lg.concat([df_jan, df_fev, df_mar])
281
+ ```
282
+
283
+ ### 5. lg.col() — full Polars expression API
284
+
285
+ `lg.col` is literally `polars.col`. All 200+ Polars methods are available:
286
+
287
+ ```python
288
+ lg.col("renda").round(2)
289
+ lg.col("renda").log(base=10)
290
+ lg.col("nome").str.to_lowercase()
291
+ lg.col("email").str.split("@").list.last()
292
+ lg.col("cpf").str.replace_all(r"[\.\-]", "")
293
+ lg.col("data").dt.year()
294
+ lg.col("data").dt.truncate("1mo")
295
+ lg.col("renda").mean().over("uf") # window function
296
+ lg.col("renda").rank("dense", descending=True).over("uf")
297
+ lg.col("renda").rolling_mean(window_size=3)
298
+ lg.col("renda").cum_sum()
299
+ ```
300
+
301
+ ### 6. Fluent pipeline
302
+
303
+ ```python
304
+ import logus as lg
305
+ import os
306
+
307
+ SALT = os.environ["LOGUS_SALT"]
308
+ KEY = os.environ["LOGUS_KEY"]
309
+
310
+ result = (
311
+ lg.pipe("clientes.parquet")
312
+ .where(uf="SP", tipo_pessoa="PF")
313
+ .add_column(
314
+ imposto = lg.col("renda_mensal") * 0.275,
315
+ faixa = lg.when(lg.col("renda_mensal") > 10_000, "alta")
316
+ .when(lg.col("renda_mensal") > 5_000, "media")
317
+ .otherwise("baixa"),
318
+ )
319
+ .mask(salt=SALT)
320
+ .groupby("faixa", {"n": ("*", "count"), "media": ("renda_mensal", "mean")})
321
+ .sort("media", desc=True)
322
+ .collect() # → pl.DataFrame
323
+ )
324
+ ```
325
+
326
+ ### 7. Full pipeline in one call
327
+
328
+ ```python
329
+ import logus as lg
330
+ import os
331
+
332
+ result = lg.process(
333
+ "clientes.csv",
334
+ salt=os.environ["LOGUS_SALT"],
335
+ key=os.environ["LOGUS_KEY"],
336
+ output="clientes_safe.lgs",
337
+ overwrite=True,
338
+ where={"uf": ["SP", "RJ", "MG"]},
339
+ rules={
340
+ "cpf": {"not_null": True},
341
+ "renda_mensal": {"min": 0, "max": 500_000},
342
+ "email": {"contains": "@"},
343
+ },
344
+ verbose=True,
345
+ )
346
+
347
+ print(f"Rows: {result.n_rows:,}")
348
+ print(f"PII columns: {result.pii_columns}")
349
+ print(f"Privacy score: {result.privacy_score}/100")
350
+ print(f"Validation: {result.validation.passed}")
351
+ print(f"Saved to: {result.output_path}")
352
+ result.print_summary()
353
+ ```
354
+
355
+ ### 8. SQL via DuckDB
356
+
357
+ ```python
358
+ import logus as lg
359
+
360
+ # pip install "logus-lgpd[sql]"
361
+
362
+ # SQL on DataFrames
363
+ result = lg.sql(
364
+ "SELECT uf, AVG(renda_mensal) AS media, COUNT(*) AS n "
365
+ "FROM df GROUP BY uf HAVING n > 100 ORDER BY media DESC",
366
+ df=df,
367
+ )
368
+
369
+ # JOIN two DataFrames via SQL
370
+ result = lg.sql(
371
+ "SELECT c.uf, SUM(p.valor) AS total "
372
+ "FROM clientes c JOIN pedidos p ON c.cpf = p.cpf "
373
+ "GROUP BY c.uf",
374
+ clientes=df_clientes,
375
+ pedidos=df_pedidos,
376
+ )
377
+
378
+ # SQL on Parquet files (DuckDB reads natively)
379
+ result = lg.sql("SELECT * FROM read_parquet('big.parquet') WHERE uf='SP' LIMIT 1000")
380
+ ```
381
+
382
+ ### 9. Database connection
383
+
384
+ ```python
385
+ import logus as lg
386
+ import os
387
+
388
+ # pip install "logus-lgpd[sql]"
389
+
390
+ banco = lg.db(
391
+ "postgresql://user:pass@host:5432/db",
392
+ salt=os.environ["LOGUS_SALT"],
393
+ )
394
+
395
+ # Read with masking
396
+ df = lg.read(banco, "clientes")
397
+ df = lg.read(banco, "clientes", sample=10_000) # TABLESAMPLE BERNOULLI
398
+ df = lg.read(banco, "SELECT * FROM clientes WHERE uf='SP'")
399
+
400
+ # Write masked data back
401
+ banco.write(df_safe, "clientes_masked", if_exists="replace")
402
+ lg.write(df_safe, banco, "clientes_masked") # alternative syntax
403
+
404
+ # Explore
405
+ print(banco.tables())
406
+ print(banco.schema("clientes"))
407
+ sample = banco.sample_table("clientes", n=5)
408
+
409
+ # Context manager closes pool automatically
410
+ with lg.db("postgresql://...", salt=SALT) as banco:
411
+     df = banco.read("clientes")
412
+
413
+ # Dialects: postgresql, mysql, sqlite, sqlserver, bigquery, snowflake, duckdb
414
+ ```
415
+
416
+ ### 10. Data quality validation
417
+
418
+ ```python
419
+ import logus as lg
420
+
421
+ result = lg.validate(df, {
422
+ "cpf": {"not_null": True, "unique": True},
423
+ "email": {"not_null": True, "contains": "@"},
424
+ "renda_mensal": {"min": 0, "max": 500_000, "not_null": True},
425
+ "uf": {"in": ["SP","RJ","MG","RS","BA","PR","SC","GO","PE","CE"]},
426
+ "cep": {"matches": r"^\d{5}-\d{3}$"},
427
+ })
428
+
429
+ result.print_report() # formatted table
430
+ result.passed # True / False
431
+ result.score # 0.93 (proportion of rules passed)
432
+ result.raise_if_failed() # raises ValueError with details if any rule failed
433
+
434
+ # Fluent interface
435
+ lg.expect(df, "renda_mensal").not_null().between(0, 500_000).validate()
436
+ lg.expect(df, "email").contains("@").min_length(5).validate()
437
+ ```
438
+
439
+ ### 11. Streaming for large files
440
+
441
+ ```python
442
+ import logus as lg
443
+ import os
444
+
445
+ SALT = os.environ["LOGUS_SALT"]
446
+
447
+ # Yields pl.DataFrame chunks — never loads full file into memory
448
+ for chunk in lg.stream("grande.csv", salt=SALT, chunksize=50_000):
449
+     save_to_database(chunk)
450
+
451
+ # With progress callback
452
+ def progress(chunk_n, rows_done, total_estimate):
453
+     print(f"Chunk {chunk_n}: {rows_done:,} / ~{total_estimate:,} rows")
454
+
455
+ for chunk in lg.stream("grande.parquet", salt=SALT, on_progress=progress):
456
+     process(chunk)
457
+ ```
458
+
459
+ ### 12. Privacy metrics
460
+
461
+ ```python
462
+ import logus as lg
463
+
464
+ # k-anonymity (ANPD recommends k ≥ 5)
465
+ report = lg.check.kanon(df, quasi_identifiers=["uf", "faixa_etaria", "escolaridade"])
466
+ print(f"k={report.k_anonymity.k_value} compliant={report.compliant_anpd}")
467
+
468
+ # Re-identification risk score (0=safe, 1=high risk)
469
+ report = lg.check.risk(df_safe, quasi_identifiers=["uf", "faixa_etaria"])
470
+ print(f"risk={report.risk_score:.2f} level={report.risk_level}")
471
+
472
+ # Utility preservation after masking
473
+ report = lg.check.utility(df_original, df_masked)
474
+ print(f"utility={report.overall_score:.0%}")
475
+
476
+ # Differential privacy
477
+ dp = lg.check.dp(epsilon=1.0)
478
+ noisy_mean = dp.laplace(df["renda"].mean(), sensitivity=df["renda"].max())
479
+
480
+ # Privacy score (composite 0–100)
481
+ profile = lg.profile(df)
482
+ print(f"Privacy score: {profile['privacy_score']['total']}/100 [{profile['privacy_score']['grade']}]")
483
+ ```
484
+
485
+ ### 13. SQL transpiler
486
+
487
+ ```python
488
+ import logus as lg
489
+
490
+ reports = lg.scan(df)
491
+
492
+ # Transform SELECT to mask PII inline (data never leaves the database)
493
+ safe_sql = lg.mask_sql(
494
+ "SELECT cpf, nome, email, renda_mensal, uf FROM clientes WHERE uf = 'SP'",
495
+ reports=reports,
496
+ dialect="postgresql",
497
+ salt=os.environ["LOGUS_SALT"],
498
+ )
499
+ # SELECT
500
+ # encode(hmac(cpf::text, 'salt', 'sha256'), 'hex') AS cpf,
501
+ # 'REDACTED' AS nome,
502
+ # encode(hmac(email::text, 'salt', 'sha256'), 'hex') AS email,
503
+ # renda_mensal,
504
+ # uf
505
+ # FROM clientes WHERE uf = 'SP'
506
+
507
+ # Generate CREATE VIEW
508
+ view_sql = lg.generate_view(df, "clientes", reports=reports, dialect="postgresql")
509
+ ```
510
+
511
+ ### 14. Data lineage
512
+
513
+ ```python
514
+ import logus as lg
515
+ import os
516
+
517
+ SALT = os.environ["LOGUS_SALT"]
518
+
519
+ # Automatic with lg.process()
520
+ result = lg.process(df, salt=SALT, track_lineage=True)
521
+ print(result.lineage.summary())
522
+
523
+ # Manual
524
+ tracker = lg.lineage.start("pipeline_crm")
525
+ tracker.origin(df, "clientes.parquet", format="parquet")
526
+ df_filtered = lg.where(df, uf="SP")
527
+ tracker.transform(df_filtered, operation="filter", detail="uf='SP'")
528
+ df_safe = lg.mask(df_filtered, salt=SALT)
529
+ tracker.mask(df_safe, columns=list(reports.keys()), salt=SALT)
530
+ tracker.export("output.lgs", format="lgs", encrypted=True)
531
+
532
+ tracker.to_json("lineage.json") # save lineage record
533
+ tracker.to_openlineage("openlineage.json") # OpenLineage-compatible format
534
+
535
+ # Context manager (prints summary automatically)
536
+ with lg.lineage.session("pipeline") as lin:
537
+     lin.origin(df, "source.csv")
538
+     lin.mask(df_safe, columns=["cpf", "email"], salt=SALT)
539
+ ```
540
+
541
+ ---
542
+
543
+ ## Use Cases
544
+
545
+ ### Masking CPFs in production logs
546
+
547
+ ```python
548
+ import logus as lg
549
+ import os
550
+
551
+ SALT = os.environ["LOGUS_SALT"]
552
+ KEY = os.environ["LOGUS_KEY"]
553
+
554
+ # One-time: mask production data
555
+ df_raw = lg.read("producao.parquet")
556
+ df_safe = lg.mask(df_raw, salt=SALT, verbose=True)
557
+ lg.store(df_safe, "producao_safe.lgs", key=KEY)
558
+
559
+ # Analytics team works with masked data
560
+ df = lg.read("producao_safe.lgs", key=KEY)
561
+ # CPFs are now: "3f2a8b1c9d4e7f0a" — tokens, not real values
562
+ ```
563
+
564
+ ### JOIN between masked tables (deterministic tokens)
565
+
566
+ ```python
567
+ import logus as lg
568
+ import os
569
+
570
+ SALT = os.environ["LOGUS_SALT"]
571
+
572
+ # Same CPF → same token in both tables → JOIN works
573
+ df_c_safe = lg.mask(df_clientes, salt=SALT)
574
+ df_p_safe = lg.mask(df_pedidos, salt=SALT)
575
+
576
+ result = lg.join(df_c_safe, df_p_safe, on="cpf")
577
+ # OR let logus apply the same SALT to both automatically:
578
+ result = lg.join(df_clientes, df_pedidos, on="cpf", salt=SALT)
579
+ ```
580
+
581
+ ### Processing a 10GB CSV without OOM
582
+
583
+ ```python
584
+ import logus as lg
585
+ import os
586
+
587
+ SALT = os.environ["LOGUS_SALT"]
588
+
589
+ # Option 1: stream in chunks
590
+ total_rows = 0
591
+ for chunk in lg.stream("big.csv", salt=SALT, chunksize=100_000):
592
+     save_to_db(chunk)
593
+     total_rows += len(chunk)
594
+ print(f"{total_rows:,} rows processed")
595
+
596
+ # Option 2: partial read for exploration
597
+ schema = lg.read("big.csv", header_only=True) # instant, zero data
598
+ sample = lg.read("big.csv", head=10_000) # first 10k rows
599
+ # For repeated access, convert to Parquet once:
600
+ df_full = lg.read("big.csv")
601
+ lg.write(df_full, "big.parquet") # subsequent reads 6× faster
602
+ ```
603
+
604
+ ### LGPD compliance check before sharing data
605
+
606
+ ```python
607
+ import logus as lg
608
+
609
+ df = lg.read("clientes.parquet")
610
+
611
+ # Full diagnostic
612
+ profile = lg.profile(df)
613
+ score = profile["privacy_score"]
614
+ print(f"Privacy Score: {score['total']}/100 [{score['grade']}]")
615
+ print(f"PII columns: {profile['pii_columns']}")
616
+ print(f"Recommendation: {score['recommendation']}")
617
+
618
+ # If score is acceptable, share safely
619
+ if score["total"] >= 75:
620
+     SALT = os.environ["LOGUS_SALT"]
621
+     KEY = os.environ["LOGUS_KEY"]
622
+     df_safe = lg.mask(df, salt=SALT)
623
+     lg.store(df_safe, "clientes_para_parceiro.lgs", key=KEY)
624
+ ```
625
+
626
+ ### Database masking without data leaving the server
627
+
628
+ ```python
629
+ import logus as lg
630
+ import os
631
+
632
+ # pip install "logus-lgpd[sql]"
633
+
634
+ # Option 1: Create masked view in the database
635
+ banco = lg.db("postgresql://user:pass@host/db", salt=os.environ["LOGUS_SALT"])
636
+ result = banco.create_masked_view("clientes") # creates clientes_masked view
637
+ # Now devs use: SELECT * FROM clientes_masked
638
+
639
+ # Option 2: Generate SQL to run yourself
640
+ reports = lg.scan(lg.read(banco, "clientes", head=500)) # sample for detection
641
+ view_sql = lg.link.sql(None, reports, table="clientes", dialect="postgresql")
642
+ print(view_sql) # → CREATE OR REPLACE VIEW clientes_masked AS ...
643
+ ```
644
+
645
+ ---
646
+
647
+ ## API Reference
648
+
649
+ ### I/O Functions
650
+
651
+ | Function | Signature | Returns |
652
+ |---|---|---|
653
+ | `lg.read` | `(source, *, key, salt, head, sample, n_chunks, chunks, iter_chunks, header_only, columns, ...)` | `pl.DataFrame \| dict \| Generator` |
654
+ | `lg.write` | `(df, path_or_conn, table=None, **kw)` | `None` |
655
+ | `lg.store` | `(source, path, *, key, salt, anonymize, compress, overwrite, metadata)` | `dict` |
656
+ | `lg.stream` | `(source, *, salt, chunksize, on_progress)` | `Generator[pl.DataFrame]` |
657
+ | `lg.open` | `(path, *, key, salt, compress)` | `LGSFile` |
658
+ | `lg.inspect` | `(path, *, key)` | `dict` |
659
+ | `lg.rekey` | `(path, *, old_key, new_key, output_path)` | `dict` |
660
+ | `lg.db` | `(uri, *, salt, dialect, pool_size)` | `DatabaseConnection` |
661
+
662
+ ### Privacy Functions
663
+
664
+ | Function | Signature | Returns |
665
+ |---|---|---|
666
+ | `lg.scan` | `(source, *, key, sample_size, threshold)` | `Dict[str, ColumnReport]` |
667
+ | `lg.mask` | `(df, *, salt, columns, exclude, risk, strict, verbose)` | same type as input |
668
+ | `lg.diff` | `(original, masked, *, sample_size)` | `dict` |
669
+ | `lg.profile` | `(source, *, key, sample_size)` | `dict` |
670
+ | `lg.join` | `(left, right, on, *, salt, how)` | `pd.DataFrame` |
671
+ | `lg.process` | `(source, *, salt, key, output, where, rules, verbose, ...)` | `ProcessResult` |
672
+ | `lg.validate` | `(df, rules, *, severity, warn_only)` | `ValidationReport` |
673
+ | `lg.expect` | `(df, column)` | `_ColumnExpectation` (fluent) |
674
+
675
+ ### Manipulation Functions
676
+
677
+ All accept `pd.DataFrame`, `pl.DataFrame`, and `pl.LazyFrame`. All preserve input type.
678
+
679
+ | Function | SQL equivalent |
680
+ |---|---|
681
+ | `lg.where(df, **kwargs)` | `WHERE` |
682
+ | `lg.select(df, cols)` | `SELECT col1, col2` |
683
+ | `lg.drop(df, cols)` | `SELECT * EXCEPT(col)` |
684
+ | `lg.rename(df, mapping)` | `SELECT col AS new_name` |
685
+ | `lg.sort(df, by, desc=)` | `ORDER BY` |
686
+ | `lg.groupby(df, by, agg)` | `GROUP BY` |
687
+ | `lg.add_column(df, **exprs)` | `SELECT *, expr AS name` |
688
+ | `lg.when(cond, val).otherwise(d)` | `CASE WHEN` |
689
+ | `lg.unique(df, subset)` | `SELECT DISTINCT` |
690
+ | `lg.head(df, n)` | `LIMIT N` |
691
+ | `lg.top_n(df, n, by, group_by=)` | `RANK() OVER (PARTITION BY ...)` |
692
+ | `lg.concat(frames)` | `UNION ALL` |
693
+ | `lg.pivot(df, ...)` | `PIVOT` |
694
+ | `lg.melt(df, ...)` | `UNPIVOT` |
695
+ | `lg.fill_null(df, value)` | `COALESCE` |
696
+ | `lg.cast(df, schema)` | `CAST(col AS type)` |
697
+
698
+ ### Aliases
699
+
700
+ | Alias | Points to |
701
+ |---|---|
702
+ | `lg.q` | `lg.where` |
703
+ | `lg.filter_` | `lg.where` |
704
+ | `lg.order_by` | `lg.sort` |
705
+ | `lg.group_by` | `lg.groupby` |
706
+ | `lg.distinct` | `lg.unique` |
707
+ | `lg.union_all` | `lg.concat` |
708
+ | `lg.limit` | `lg.head` |
709
+ | `lg.unpivot` | `lg.melt` |
710
+ | `lg.fillna` | `lg.fill_null` |
711
+ | `lg.coalesce` | `lg.fill_null` |
712
+ | `lg.assign` | `lg.add_column` |
713
+ | `lg.save` | `lg.store` |
714
+ | `lg.load` | `lg.read` |
715
+
716
+ ---
717
+
718
+ ## Configuration
719
+
720
+ ```python
721
+ import logus as lg
722
+ import os
723
+
724
+ # Set global defaults at application startup
725
+ lg.configure(
726
+ default_salt = os.environ["LOGUS_SALT"], # used when salt= not passed to mask()
727
+ audit_path = "./audit/", # auto-create audit trail
728
+ )
729
+
730
+ # Generate secrets (do this once, save to .env)
731
+ SALT = lg.generate_salt() # 256-bit, base64-encoded
732
+ KEY = lg.generate_salt() # different value from SALT
733
+ print(f"LOGUS_SALT={SALT}")
734
+ print(f"LOGUS_KEY={KEY}")
735
+ ```
736
+
737
+ ---
738
+
739
+ ## The .lgs Format
740
+
741
+ `.lgs` is a binary container for encrypted tabular data:
742
+
743
+ - **Encryption:** AES-256-GCM (NIST SP 800-38D) — confidentiality + integrity
744
+ - **Key derivation:** HKDF-SHA256 (RFC 5869) — unique DEK per file
745
+ - **Payload:** Parquet with zstd compression (or lz4 for speed)
746
+ - **Header (v2.1):** JSON with schema, column stats, and LGPD metadata — encrypted separately from the payload, so `lg.inspect()` can read it (with the key) without decrypting the payload
747
+ - **Integrity:** HMAC-SHA256 over the entire file — detects tampering
748
+
749
+ ```
750
+ [5 bytes] MAGIC = b"LOGUS"
751
+ [1 byte] VERSION = 0x02
752
+ [1 byte] CIPHER = 0x01 (AES-256-GCM)
753
+ [32 bytes] SALT_KDF — unique per file
754
+ [12 bytes] NONCE — for header encryption
755
+ [4 bytes] HEADER_LEN
756
+ [N+16 bytes] HEADER_CT+TAG — encrypted JSON metadata
757
+ [12 bytes] NONCE — for payload
758
+ [M+16 bytes] PAYLOAD_CT+TAG — encrypted Parquet
759
+ [32 bytes] FILE_HMAC
760
+ ```
761
+
762
+ ---
763
+
764
+ ## Requirements
765
+
766
+ | Package | Min version | Purpose |
767
+ |---|---|---|
768
+ | `polars` | 1.0.0 | Data engine (required) |
769
+ | `pandas` | 2.0.0 | Fallback + Excel/SAS/SPSS formats (required) |
770
+ | `pyarrow` | 14.0.0 | Parquet I/O (required) |
771
+ | `cryptography` | 41.0.0 | AES-256-GCM, HKDF (required) |
772
+ | `numpy` | 1.24.0 | Numeric operations (required) |
773
+ | `duckdb` | 0.10.0 | `lg.sql()`, `lg.db()` (optional: `[sql]`) |
774
+ | `sqlalchemy` | 2.0.0 | `lg.db()` fallback (optional: `[sql]`) |
775
+ | `openpyxl` | 3.1.0 | Excel read/write (optional: `[excel]`) |
776
+ | `ctgan` | 0.9.0 | `lg.clone()` synthetic data (optional: `[synthetic]`) |
777
+
778
+ ---
779
+
780
+ ## License
781
+
782
+ GNU Affero General Public License v3 (AGPL-3.0)
783
+
784
+ ---
785
+
786
+ ## Changelog
787
+
788
+ See [CHANGELOG.md](CHANGELOG.md).
789
+
790
+ | Version | Highlights |
791
+ |---|---|
792
+ | 1.0.5 | `lg.read()` big-data params (head/sample/chunks/iter_chunks), `lg.db()`, mask(LazyFrame), stream via scan_csv batches, CSV sidecar index, 199 tests |
793
+ | 1.1.0 | `lg.validate()`, `lg.mask_sql()`, `lg.lineage`, privacy score in profile(), `lg.process()` |
794
+ | 1.0.4 | `polars>=1.0.0` required, FastPIIScanner (9× faster), `__init__.py` cleaned |
795
+
796
+ ---
797
+
798
+ ## What's new in v1.0.5
799
+
800
+ ### 1. `lg.contract()` — Data Contract
801
+
802
+ ```python
803
+ import logus as lg
804
+ import os
805
+
806
+ SALT = os.environ["LOGUS_SALT"]
807
+
808
+ contrato = lg.contract({
809
+ "cpf": {"type": "str", "not_null": True, "unique": True,
810
+ "pii": "CPF", "mask": "hash"},
811
+ "renda": {"type": "float", "min": 0, "max": 500_000,
812
+ "pii": "numerico", "mask": "mock_numeric"},
813
+ "uf": {"type": "str", "in": ["SP","RJ","MG","RS","BA","PR","SC","GO","PE","CE"]},
814
+ "email": {"type": "str", "contains": "@",
815
+ "pii": "email", "mask": "hash"},
816
+ }, name="clientes_v2", version="2.0")
817
+
818
+ # Aplica: valida → mascara → retorna tudo em uma chamada
819
+ result = contrato.apply(df, salt=SALT)
820
+ result.raise_if_failed() # levanta ValueError se alguma regra falhou
821
+ df_safe = result.df # DataFrame mascarado
822
+
823
+ # Versionar e comparar
824
+ contrato.save("schema/clientes_v2.contract.json")
825
+ c_v1 = lg.DataContract.load("schema/clientes_v1.contract.json")
826
+ diff = contrato.diff(c_v1)
827
+ print(diff.has_breaking_changes)
828
+ print(diff.report())
829
+
830
+ # Exportar como JSON Schema (para DPOs e equipes não-Python)
831
+ schema = contrato.to_json_schema() # dicionário JSON Schema draft-07
832
+ ```
833
+
834
+ ### 2. Custom PII patterns
835
+
836
+ ```python
837
+ import logus as lg
838
+ from logus.detectors.fast_scan import FastPIIScanner
839
+
840
+ # Detecta identificadores proprietários da sua empresa
841
+ reports = lg.scan(df, custom_patterns={
842
+ "num_contrato": r"^CTR-[0-9]{8}$",
843
+ "matricula": r"^[0-9]{6}-[A-Z]$",
844
+ "protocolo": r"^PROT-[0-9]{4}-[0-9]{6}$",
845
+ })
846
+ ```
847
+
848
+ ### 3. Asymmetric encryption in `.lgs`
849
+
850
+ ```python
851
+ import logus as lg
852
+
853
+ # Gera par de chaves (uma vez por usuário/serviço)
854
+ priv, pub = lg.generate_keypair("ec") # P-256 — recomendado
855
+ priv, pub = lg.generate_keypair("rsa", 4096)
856
+
857
+ lg.save_keypair(priv, "chave.pem", pub, "chave_publica.pem")
858
+ pub = lg.load_public_key("chave_publica.pem")
859
+ priv = lg.load_private_key("chave.pem")
860
+
861
+ # Remetente: cifra com a chave pública do destinatário
862
+ lg.store(df, "dados.lgs", public_key=pub)
863
+
864
+ # Destinatário: decifra com sua chave privada
865
+ df = lg.read("dados.lgs", private_key=priv)
866
+
867
+ # Multi-recipient: uma vez para N destinatários
868
+ from logus.asymmetric import encrypt_dek_multi, decrypt_dek_from_list
869
+ encs = encrypt_dek_multi(dek, [pub_ana, pub_bruno, pub_carlos])
870
+ dek = decrypt_dek_from_list(encs, priv_ana)
871
+ ```
872
+
873
+ ### 4. File expiration (LGPD Art. 16)
874
+
875
+ ```python
876
+ import logus as lg
877
+
878
+ # Arquivo recusa leitura após a data
879
+ lg.store(df, "dados_campanha.lgs", key=KEY, expires_at="2025-12-31")
880
+
881
+ # Depois da data:
882
+ lg.read("dados_campanha.lgs", key=KEY) # → ExpiredFileError
883
+ ```
884
+
885
+ ### 5. PII in nested JSON (`pl.Struct` and `pl.List`)
886
+
887
+ ```python
888
+ import logus as lg
889
+ import polars as pl
890
+ from logus.detectors.fast_scan import FastPIIScanner
891
+
892
+ # Polars 1.x: dados JSON em colunas estruturadas
893
+ df = pl.DataFrame({
894
+ "pessoa": [{"cpf": "111.444.777-35", "nome": "Ana Silva"}, ...],
895
+ "emails": [["ana@empresa.com", "ana@gmail.com"], ...],
896
+ })
897
+
898
+ # FastPIIScanner desaninha e detecta automaticamente
899
+ reports = FastPIIScanner().detect_dict(df)
900
+ # → {"pessoa.cpf": ColumnReport, "pessoa.nome": ColumnReport, "emails[]": ColumnReport}
901
+ ```
902
+
903
+ ### 6. `.env` auto-detection
904
+
905
+ ```python
906
+ import logus as lg
907
+
908
+ # Carrega LOGUS_SALT e LOGUS_KEY do arquivo .env automaticamente
909
+ lg.configure(load_dotenv=True) # procura .env no diretório atual
910
+ lg.configure(load_dotenv=True, dotenv_path="config/.env")
911
+
912
+ # Depois disso: lg.mask(df) usa LOGUS_SALT sem precisar de salt= explícito
913
+ ```
914
+
915
+ ### 7. `banco.create_table()` e `banco.upsert()`
916
+
917
+ ```python
918
+ import logus as lg
919
+ import os
920
+
921
+ banco = lg.db("postgresql://user:pass@host/db", salt=os.environ["LOGUS_SALT"])
922
+
923
+ # Cria tabela a partir do schema do DataFrame
924
+ banco.create_table(df, "clientes")
925
+ banco.create_table(df, "clientes", if_exists="replace")
926
+
927
+ # Upsert — INSERT ... ON CONFLICT UPDATE
928
+ banco.upsert(df_new, "clientes", on="cpf")
929
+ banco.upsert(df_new, "clientes", on=["cpf", "data"])
930
+
931
+ # Também via lg.write()
932
+ lg.write(df_safe, banco, "clientes_masked")
933
+ ```
934
+
935
+ ### 8. LGPD Compliance Report (HTML/PDF)
936
+
937
+ ```python
938
+ import logus as lg
939
+
940
+ reports = lg.scan(df)
941
+ report = lg.compliance_report(
942
+ df, reports,
943
+ dataset_name = "Clientes Q1 2025",
944
+ organization = "Empresa XYZ",
945
+ )
946
+
947
+ report.to_html("lgpd_jan2025.html") # relatório HTML completo
948
+ report.to_pdf("lgpd_jan2025.pdf") # PDF (pip install weasyprint)
949
+ report.to_json("lgpd_jan2025.json") # JSON serializado
950
+ print(report.to_text()) # texto simples, sempre disponível
951
+ ```
952
+
953
+ ### 9. `lg.validate_schema()` + `lg.save_rules()` / `lg.load_rules()`
954
+
955
+ ```python
956
+ import logus as lg
957
+
958
+ # Valida o schema (estrutura) antes de processar os dados
959
+ result = lg.validate_schema(df,
960
+ required_columns = ["cpf", "renda_mensal", "uf"],
961
+ forbidden_columns= ["senha", "token"],
962
+ min_rows = 1,
963
+ max_rows = 10_000_000,
964
+ )
965
+ result.raise_if_failed()
966
+
967
+ # Versionar regras de validação como JSON
968
+ lg.save_rules({
969
+ "cpf": {"not_null": True, "unique": True},
970
+ "renda_mensal": {"min": 0, "max": 500_000},
971
+ "email": {"contains": "@"},
972
+ }, "regras/clientes_v2.json")
973
+
974
+ rules = lg.load_rules("regras/clientes_v2.json")
975
+ result = lg.validate(df, rules)
976
+ result.print_report()
977
+ ```
978
+
979
+ ### 10. `lg.shift()`, `lg.lag()`, `lg.lead()`, `lg.explode()`
980
+
981
+ ```python
982
+ import logus as lg
983
+ import polars as pl
984
+
985
+ # Séries temporais: lag e lead
986
+ df_ts = pl.DataFrame({"renda": [5000., 5200., 4800., 5100., 5300.]})
987
+
988
+ lg.shift(df_ts, 1) # valor do período anterior
989
+ lg.shift(df_ts, -1) # valor do período seguinte
990
+ lg.lag(df_ts, 3, columns="renda") # alias — lag de 3 períodos
991
+ lg.lead(df_ts, 1) # alias — próximo valor
992
+ lg.shift(df_ts, 1, fill_value=0.0) # preenche nulos com 0
993
+
994
+ # Dados aninhados: expande listas em linhas
995
+ df_tags = pl.DataFrame({
996
+ "id": [1, 2, 3],
997
+ "tags": [["lgpd", "privacidade"], ["python", "polars"], ["dados"]],
998
+ })
999
+ lg.explode(df_tags, "tags")
1000
+ # Resultado: 5 linhas — uma por tag
1001
+
1002
+ # Múltiplas colunas simultâneas
1003
+ lg.explode(df, ["tags", "scores"]) # scores e tags devem ter o mesmo comprimento
1004
+ ```