csvw-safe 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. csvw_safe-0.0.1/LICENSE +19 -0
  2. csvw_safe-0.0.1/PKG-INFO +682 -0
  3. csvw_safe-0.0.1/README.md +651 -0
  4. csvw_safe-0.0.1/pyproject.toml +95 -0
  5. csvw_safe-0.0.1/setup.cfg +4 -0
  6. csvw_safe-0.0.1/src/csvw_safe/__init__.py +50 -0
  7. csvw_safe-0.0.1/src/csvw_safe/assert_same_structure.py +133 -0
  8. csvw_safe-0.0.1/src/csvw_safe/constants.py +79 -0
  9. csvw_safe-0.0.1/src/csvw_safe/csvw_to_opendp_context.py +173 -0
  10. csvw_safe-0.0.1/src/csvw_safe/csvw_to_opendp_margins.py +124 -0
  11. csvw_safe-0.0.1/src/csvw_safe/csvw_to_smartnoise_sql.py +261 -0
  12. csvw_safe-0.0.1/src/csvw_safe/datatypes.py +272 -0
  13. csvw_safe-0.0.1/src/csvw_safe/generate_series.py +434 -0
  14. csvw_safe-0.0.1/src/csvw_safe/make_dummy_from_metadata.py +260 -0
  15. csvw_safe-0.0.1/src/csvw_safe/make_metadata_from_data.py +837 -0
  16. csvw_safe-0.0.1/src/csvw_safe/metadata_structure.py +522 -0
  17. csvw_safe-0.0.1/src/csvw_safe/utils.py +179 -0
  18. csvw_safe-0.0.1/src/csvw_safe/validate_metadata.py +50 -0
  19. csvw_safe-0.0.1/src/csvw_safe/validate_metadata_shacl.py +100 -0
  20. csvw_safe-0.0.1/src/csvw_safe.egg-info/PKG-INFO +682 -0
  21. csvw_safe-0.0.1/src/csvw_safe.egg-info/SOURCES.txt +35 -0
  22. csvw_safe-0.0.1/src/csvw_safe.egg-info/dependency_links.txt +1 -0
  23. csvw_safe-0.0.1/src/csvw_safe.egg-info/requires.txt +18 -0
  24. csvw_safe-0.0.1/src/csvw_safe.egg-info/top_level.txt +1 -0
  25. csvw_safe-0.0.1/tests/test_csvw_to_opendp_context.py +229 -0
  26. csvw_safe-0.0.1/tests/test_csvw_to_opendp_margins.py +136 -0
  27. csvw_safe-0.0.1/tests/test_csvw_to_smartnoise_sql.py +174 -0
  28. csvw_safe-0.0.1/tests/test_datatypes.py +161 -0
  29. csvw_safe-0.0.1/tests/test_generate_series.py +401 -0
  30. csvw_safe-0.0.1/tests/test_make_dummy.py +207 -0
  31. csvw_safe-0.0.1/tests/test_make_metadata_contributions.py +211 -0
  32. csvw_safe-0.0.1/tests/test_make_metadata_main.py +231 -0
  33. csvw_safe-0.0.1/tests/test_make_metadata_synth.py +140 -0
  34. csvw_safe-0.0.1/tests/test_penguin_dataset.py +275 -0
  35. csvw_safe-0.0.1/tests/test_same_structure.py +75 -0
  36. csvw_safe-0.0.1/tests/test_validate_metadata.py +359 -0
  37. csvw_safe-0.0.1/tests/test_validate_metadata_shacl.py +44 -0
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2018 The Python Packaging Authority
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,682 @@
1
+ Metadata-Version: 2.4
2
+ Name: csvw-safe
3
+ Version: 0.0.1
4
+ Summary: Safe processing utilities for CSVW data
5
+ Author-email: "DSCC, FSO" <dscc@example.com>
6
+ Project-URL: Homepage, https://github.com/dsccadminch/csvw-safe-library/csvw-safe
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: numpy>=1.21
14
+ Requires-Dist: pandas>=1.3
15
+ Requires-Dist: pydantic>=2.12.5
16
+ Requires-Dist: pyyaml>=6.0.3
17
+ Requires-Dist: polars==1.32.0
18
+ Requires-Dist: pyarrow>=23.0.1
19
+ Requires-Dist: opendp>=0.12
20
+ Requires-Dist: rdflib>=6.0
21
+ Requires-Dist: pyshacl>=0.17.2
22
+ Provides-Extra: dev
23
+ Requires-Dist: ruff>=0.6.0; extra == "dev"
24
+ Requires-Dist: mypy==1.15.0; extra == "dev"
25
+ Requires-Dist: coverage==7.6.12; extra == "dev"
26
+ Requires-Dist: pytest==8.3.3; extra == "dev"
27
+ Requires-Dist: pytest-cov==6.0.0; extra == "dev"
28
+ Requires-Dist: types-PyYAML==6.0.12.20250402; extra == "dev"
29
+ Requires-Dist: markdown==3.8.2; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # CSVW-SAFE Utility Library
33
+
34
+ This library provides Python utilities for generating, validating, and testing CSVW-SAFE metadata and associated dummy datasets for differential privacy (DP) development and safe data modeling workflows.
35
+
36
+ It includes five main scripts:
37
+
38
+ 1. `make_metadata_from_data.py`
39
+ 2. `make_dummy_from_metadata.py`
40
+ 3. `validate_metadata.py`
41
+ 4. `validate_metadata_shacl.py` (requires `pyshacl`)
42
+ 5. `assert_same_structure.py`
43
+
44
+ ![Overview](../images/csvwsafe_workflow_1.png)
45
+
46
+ In addition, two other scripts are available for conversion of csvw-safe metadata to smartnoise sql and opendp libraries:
47
+ 6. `csvw_to_smartnoise_sql.py` converts the metadata to the format expected in smartnoise-sql
48
+ 7. `csvw_to_opendp_context.py` prepares a context object for opendp with margin and information extracted from csvw-metadata format.
49
+
50
+ ![Overview](../images/csvwsafe_workflow_2.png)
51
+
52
+ **NOTES**:
53
+ - These scripts assist safe data modeling workflows; they DO NOT replace governance decisions on what is public information or not.
54
+ - IMPORTANT: Automatically generated metadata may contain sensitive information — MANUAL REVIEW IS ALWAYS REQUIRED before further steps.
55
+
56
+ For a description of [CSVW-SAFE metadata, see here](https://github.com/dscc-admin-ch/csvw-safe/blob/update_readme/README.md).
57
+
58
+ ---
59
+
60
+ ## Installation
61
+
62
+ Install Python 3.9 or later, then run
63
+
64
+ ```bash
65
+ pip install csvw-safe
66
+ ```
67
+
68
+ or for development:
69
+ ```
70
+ git clone https://github.com/dscc-admin-ch/csvw-safe-library.git
71
+ cd csvw-safe-library
72
+ pip install -e .[dev]
73
+ ```
74
+
75
+ For testing:
76
+ ```
77
+ cd csvw-safe-library
78
+ pip install -e .[dev]
79
+ pytest --cov=csvw_safe --cov-report=term-missing tests/
80
+ ```
81
+
82
+ ## Learn via example
83
+
84
+ To get to know the library with examples, see the [notebook on the extended penguin dataset](https://github.com/dscc-admin-ch/csvw-safe/blob/update_readme/csvw-safe-library/examples/Use-Library.ipynb) and the associated outputs in the [metadata example folder](https://github.com/dscc-admin-ch/csvw-safe/tree/update_readme/csvw-safe-library/examples/metadata).
85
+
86
+
87
+ ## Scripts Overview
88
+
89
+ ### 1. **`make_metadata_from_data.py`**
90
+ #### Purpose
91
+
92
+ Automatically generate baseline CSVW-SAFE metadata from an existing dataset.
93
+
94
+ This script infers:
95
+ - Column datatypes
96
+ - Nullability and missingness rates
97
+ - Numeric bounds (min/max)
98
+ - Optional continuous partitions
99
+ - Contribution constraints (DP-oriented metadata)
100
+ - Optional column dependencies
101
+ - Optional column grouping metadata
102
+
103
+ **Important**: This tool is for automated metadata *drafting only*. All outputs must be manually reviewed (and properties can be removed) before publication.
104
+
105
+ The script first builds a pydantic `TableMetadata` model and then serialises it to a json-ld via a `to_dict()` method. See [TableMetadata.md](https://github.com/dscc-admin-ch/csvw-safe/blob/main/csvw-safe-library/TableMetadata.md) for more detailed explanation on the inner workings.
106
+
107
+ #### Differential Privacy (DP) Contribution Levels
108
+
109
+ The script provides flexibility in defining the level of detail for DP metadata.
110
+
111
+ **Warning**: Increasing the level of detail (i.e., more granular contribution definitions) can increase the risk of privacy leakage.
112
+ It is strongly recommended to:
113
+ - Choose the lowest level of detail sufficient for your use case
114
+ - Carefully review and validate the generated metadata
115
+
116
+ Four contribution levels are supported: `table`, `table_with_keys`, `column` and `partition`. By default the contribution level is `default_contributions_level=table`. If a different level is required for a column, it can be given via the argument `fine_contributions_level` (see CLI usage examples below).
117
+
118
+
119
+ ##### 1. `table` level
120
+
121
+ Defines DP constraints only at the table level.
122
+
123
+ Characteristics:
124
+ - Only table-level DP properties are specified
125
+ - Column metadata is minimal and includes:
126
+ - `name`
127
+ - `datatype`
128
+ - `required`
129
+ - `privacy_id`
130
+ - `nullable_proportion`
131
+ - `minimum` / `maximum` (if applicable)
132
+ - No:
133
+ - `public_keys_values` properties on column
134
+ - `ColumnGroup` class
135
+ - `Partition` class
136
+
137
+ Use case:
138
+ - When only global dataset-level privacy guarantees are required
139
+ - Safest option in terms of minimizing privacy leakage risk
140
+
141
+ ##### 2. `table_with_keys` level
142
+
143
+ As `table` level but with keys on categorical columns and ColumnGroup.
144
+
145
+ Use case:
146
+ - As `table` with keys being public information (like months in year, hours in day).
147
+
148
+ ##### 3. `column` level
149
+
150
+ Defines DP constraints at both the table_with_keys and column levels.
151
+
152
+ Requirements:
153
+ - `privacy_unit` must be specified to compute contribution bounds
154
+
155
+ Characteristics:
156
+ - Includes all `table`-level information
157
+ - Adds per-column DP properties (maximum contribution when grouping by the column):
158
+ - `max_length`
159
+ - `max_groups_per_unit`
160
+ - `max_contributions`
161
+ - For categorical columns:
162
+ - Extracts `public_keys_values` (set of possible values)
163
+ - Introduces column groups (`ColumnGroupMetadata`):
164
+ - Represent combinations of columns
165
+ - Include:
166
+ - `public_keys_values` (combinations of values)
167
+ - DP parameters for grouped contributions (maximum contribution when grouping by the group of columns)
168
+
169
+ Not included:
170
+ - No `Partition` objects
171
+
172
+ Use case:
173
+ - When per-column and multi-column contribution constraints are needed
174
+ - Balanced trade-off between utility and privacy
175
+
176
+
177
+ ##### 4. `partition` level
178
+
179
+ Defines DP constraints at the table_with_keys, column, and partition levels.
180
+
181
+ Characteristics:
182
+ - Includes all `column`-level information
183
+ - Introduces explicit `Partition` objects
184
+ - DP parameters are defined at:
185
+ - Table level (global bounds)
186
+ - Partition level (fine-grained bounds)
187
+
188
+ Partition behavior:
189
+ - Each `Partition` specifies:
190
+ - A predicate (categorical value or continuous range)
191
+ - DP parameters (maximum contribution in the partition):
192
+ - `max_length`
193
+ - `max_groups_per_unit`
194
+ - `max_contributions`
195
+ - These parameters represent the maximum contribution of a privacy unit within that specific partition
196
+
197
+ Continuous columns:
198
+ - If bounds (`minimum`, `maximum`) are provided:
199
+ - The column is divided into partitions (e.g., ranges)
200
+ - Each partition is assigned its own DP constraints
201
+
202
+ Use case:
203
+ - When fine-grained control over contributions is required
204
+ - Highest expressiveness, but also highest privacy risk
205
+
206
+
207
+ ##### Summary
208
+
209
+ | Level | Scope | Risk Level |
210
+ |------------|-----------------------|-------------|
211
+ | `table` | Table only | Lowest |
212
+ | `table_with_keys` | Table with keys in categorical columns | Medium |
213
+ | `column` | Table + Column | Medium |
214
+ | `partition` | Table + Column + Partition | Highest |
215
+
216
+ Start with the `table` level and only increase granularity if required.
217
+ Always validate that all included information is already public.
218
+
219
+ #### CLI Usage Examples
220
+
221
+ ```bash
222
+ # Basic usage
223
+ python make_metadata_from_data.py data.csv --privacy_unit user_id
224
+ ```
225
+
226
+ It is possible to compute dependencies (bigger, depends on, etc) between columns with
227
+ ```bash
228
+ # Enable dependency detection (default: True)
229
+ python make_metadata_from_data.py data.csv \
230
+ --privacy_unit user_id \
231
+ --with_dependencies True
232
+ ```
233
+
234
+ It is also possible to describe partitions level of continuous data if public bounds are provided
235
+ ```bash
236
+ # Add continuous partitions
237
+ python make_metadata_from_data.py data.csv \
238
+ --privacy_unit user_id \
239
+ --continuous_partitions '{"age": [0, 18, 30, 50, 100]}'
240
+ ```
241
+
242
+ It is also possible to describe group of columns information (like after grouping by a list of columns) to have their metadata
243
+ ```bash
244
+ # Define column groups
245
+ python make_metadata_from_data.py data.csv \
246
+ --privacy_unit user_id \
247
+ --column_groups '[["age", "income"], ["city", "country"]]'
248
+ ```
249
+
250
+ ```bash
251
+ # Set default contribution level
252
+ python make_metadata_from_data.py data.csv \
253
+ --privacy_unit user_id \
254
+ --default_contributions_level table
255
+
256
+ # Column-specific contribution overrides
257
+ python make_metadata_from_data.py data.csv \
258
+ --privacy_unit user_id \
259
+ --fine_contributions_level '{"age": "column", "income": "partition"}'
260
+ ```
261
+
262
+ Save output to specific file
263
+ ```bash
264
+ python make_metadata_from_data.py data.csv \
265
+ --privacy_unit user_id \
266
+ --output my_metadata.json
267
+ ```
268
+
269
+ Notes
270
+ - Datetime columns are automatically inferred using pandas.to_datetime.
271
+ - Numeric bounds are computed only for non-string columns.
272
+ - Contribution levels control per-privacy-unit contribution constraints.
273
+ - Dependency detection may increase runtime on large datasets.
274
+ - Output is a JSON-serializable CSVW-SAFE metadata structure.
275
+
276
+ #### Future plans:
277
+ - Allow a DP vs non-DP mode (with/without DP attributes)
278
+ - Allow finer contribution level description (for now column level is very broad)
279
+
280
+ ### 2. **`make_dummy_from_metadata.py`**
281
+
282
+ #### Purpose
283
+
284
+ Generate a synthetic dummy dataset from CSVW-SAFE metadata.
285
+
286
+ The generator creates structured data that follows the declared metadata constraints, including:
287
+ - Column datatypes
288
+ - Numeric and categorical partitions
289
+ - Optional dependency structure between columns
290
+ - Nullable proportions
291
+ - Column-group constraints (when provided)
292
+
293
+ **Important**: This tool produces synthetic structural data only.
294
+ It does not preserve semantic meaning or real-world correlations beyond what is encoded in metadata.
295
+
296
+ #### Output Guarantees
297
+
298
+ The generated dataset:
299
+ - Respects declared column schema (datatypes)
300
+ - Respects partition definitions (categorical + continuous)
301
+ - Respects numeric bounds when defined
302
+ - Applies nullable proportions per column
303
+ - Optionally respects column-group partition constraints
304
+ - Produces reproducible results via random seed
305
+
306
+ #### Typical Use Cases
307
+
308
+ - Unit testing of CSVW-SAFE and DP pipelines
309
+ - Schema validation without real data access
310
+ - Debugging metadata-driven transformations
311
+ - Synthetic data generation for integration tests
312
+
313
+ #### CLI Usage Examples
314
+ Basic example with 100 rows:
315
+ ```bash
316
+ # Basic
317
+ python make_dummy_from_metadata.py metadata.json --output dummy.csv
318
+ ```
319
+
320
+ Set a seed (seed=42) and a number of rows (rows=1000) for a reproducible example:
321
+ ```bash
322
+ python make_dummy_from_metadata.py metadata.json \
323
+ --rows 1000 \
324
+ --seed 42 \
325
+ --output dummy.csv
326
+ ```
327
+
328
+ ### 3. **`validate_metadata.py`**
329
+
330
+ #### Purpose
331
+
332
+ Validate a CSVW-SAFE metadata file against the internal metadata schema.
333
+
334
+ This tool ensures that a metadata file is structurally correct and conforms to the expected CSVW-SAFE specification as defined by the internal `TableMetadata` model.
335
+
336
+ It is primarily used as a validation step before using metadata for:
337
+ - dummy dataset generation
338
+ - DP pipeline configuration
339
+ - downstream schema-driven processing
340
+
341
+
342
+ This validator performs schema-level validation only, including:
343
+ - Required fields presence
344
+ - Type correctness
345
+ - Structural consistency of metadata objects
346
+ - Compatibility with the `TableMetadata` model
347
+
348
+ Validation is implemented via a Pydantic model (`TableMetadata.from_dict`). See [TableMetadata.md](https://github.com/dscc-admin-ch/csvw-safe/blob/main/csvw-safe-library/TableMetadata.md) for more detailed explanation of the underlying pydantic model used to validate the metadata.
349
+
350
+ Output behaviour:
351
+ - If metadata is valid → script exits silently (no output)
352
+ - If metadata is invalid → raises a validation exception and exits with error
353
+
354
+ #### CLI Usage
355
+ ```bash
356
+ python validate_metadata.py metadata.json
357
+ ```
358
+
359
+
360
+ ### 4. **`validate_metadata_shacl.py`**
361
+
362
+ #### Purpose
363
+
364
+ Validate CSVW-SAFE metadata using a SHACL constraint schema.
365
+
366
+ This tool performs structural validation of metadata expressed in JSON-LD format against a SHACL shapes graph defined in Turtle format.
367
+
368
+ It is the most strict validation layer in the CSVW-SAFE toolchain, intended to ensure full compliance with RDF-based constraints.
369
+
370
+ #### Validation Scope
371
+
372
+ This validator checks:
373
+ - RDF structural consistency of metadata (JSON-LD parsing)
374
+ - Constraint satisfaction against SHACL shapes
375
+ - Class/property-level restrictions defined in the schema
376
+ - Cross-field structural rules defined in the SHACL graph
377
+
378
+ > Unlike `validate_metadata.py`, this tool performs formal SHACL validation, not just schema validation.
379
+
380
+
381
+ Python usage
382
+ ```bash
383
+ python validate_metadata_shacl.py metadata.jsonld shapes.ttl
384
+ ```
385
+
386
+ Validation output
387
+ On success: SHACL validation SUCCESSFUL
388
+ On failure: SHACL validation FAILED with a <detailed SHACL report>
389
+
390
+
391
+ Typical Use Cases
392
+ - Formal compliance validation of CSVW-SAFE metadata
393
+ - CI/CD enforcement of metadata correctness
394
+ - Pre-deployment validation in RDF-based pipelines
395
+ - Ensuring compatibility with external SHACL-aware systems
396
+
397
+ Notes
398
+ - Metadata must be valid JSON-LD RDF
399
+ - SHACL shapes must be valid Turtle RDF
400
+ - This is the strictest validation layer
401
+ - More expressive than Pydantic-based validation (validate_metadata.py)
402
+
403
+ ### 5. **`assert_same_structure.py`**
404
+
405
+ #### Purpose
406
+
407
+ Verify that a generated dummy CSV preserves the structural properties of an original dataset under the CSVW-SAFE assumptions.
408
+
409
+ This tool ensures that synthetic data produced by `make_dummy_from_metadata.py` remains schema-compatible with the original dataset used to derive metadata.
410
+
411
+ This validator checks structure only. It does not assess statistical similarity or data realism.
412
+
413
+
414
+ The tool checks:
415
+ - Column names and ordering
416
+ - Inferred CSVW-SAFE datatypes
417
+ - Nullability constraints (required vs optional columns)
418
+ - Optional categorical domain compatibility (subset check)
419
+
420
+ It does not check:
421
+ - Statistical similarity between datasets
422
+ - Distributional properties
423
+ - Correlation structure
424
+ - Semantic correctness of values
425
+
426
+
427
+ #### Core Validation Logic
428
+
429
+ Ensures that both datasets share identical schema:
430
+
431
+ - Same column names
432
+ - Same column ordering
433
+
434
+ Each column is type-checked using CSVW-SAFE inference:
435
+
436
+ - Datatypes are inferred via `infer_xmlschema_datatype`
437
+ - Integer subtype differences are tolerated (e.g., small vs large integer variants)
438
+
439
+ Validates whether required/optional status is preserved:
440
+
441
+ - A column is considered required if it has no missing values
442
+ - Both datasets must agree on required vs optional status per column
443
+
444
+ If enabled, ensures:
445
+
446
+ - All values in dummy dataset are a subset of original dataset values
447
+ - Uses `is_categorical()` to detect categorical columns
448
+
449
+ #### CLI Usage
450
+
451
+ ```bash
452
+ python assert_same_structure.py original.csv dummy.csv
453
+ ```
454
+
455
+ Skip categorical validation
456
+ ```
457
+ python assert_same_structure.py original.csv dummy.csv --no-categories
458
+ ```
459
+
460
+ Typical Use Cases
461
+ - Validate synthetic dataset generation correctness
462
+ - Regression testing for metadata-driven pipelines
463
+ - Ensuring structural integrity in DP synthetic data workflows
464
+ - Debugging mismatches between metadata and generated datasets
465
+ Notes
466
+ - This tool is intentionally strict on schema alignment but lenient on integer type variations
467
+ - Designed to validate synthetic structural fidelity, not realism
468
+ - Works best in combination with: make_metadata_from_data.py and make_dummy_from_metadata.py
469
+
470
+ ### 6. **`csvw_to_smartnoise_sql.py`**
471
+
472
+ #### Purpose
473
+
474
+ Convert CSVW-SAFE metadata into the format expected by SmartNoise SQL.
475
+
476
+ This script transforms a CSVW-SAFE JSON metadata file into a SmartNoise-compatible YAML configuration, enabling direct use in differential privacy queries with SmartNoise SQL.
477
+
478
+ The script maps CSVW-SAFE metadata into SmartNoise SQL structure:
479
+
480
+ - Table-level privacy constraints:
481
+ - `max_contributions` → `max_ids`
482
+ - Column definitions:
483
+ - Datatypes (converted to SmartNoise types)
484
+ - Nullability
485
+ - Value bounds (`minimum` / `maximum` → `lower` / `upper`)
486
+ - Privacy identifier (`privacy_id` → `private_id`)
487
+ - Optional DP configuration parameters:
488
+ - sampling, clamping, censoring, DPSU
489
+
490
+ #### Output Structure
491
+
492
+ The generated YAML follows SmartNoise SQL format:
493
+
494
+ ```yaml
495
+ "":
496
+ schema_name:
497
+ table_name:
498
+ max_ids: ...
499
+ rows: ...
500
+ sample_max_ids: ...
501
+ censor_dims: ...
502
+ clamp_counts: ...
503
+ clamp_columns: ...
504
+ use_dpsu: ...
505
+ column_name:
506
+ name: ...
507
+ type: ...
508
+ nullable: ...
509
+ lower: ...
510
+ upper: ...
511
+ private_id: ...
512
+ ```
513
+
514
+ #### CLI Usage
515
+
516
+ Basic conversion
517
+ ```bash
518
+ python csvw_to_smartnoise_sql.py \
519
+ --input metadata.json \
520
+ --output snsql_metadata.yaml
521
+ ```
522
+
523
+ With custom schema and table
524
+ ```bash
525
+ python csvw_to_smartnoise_sql.py \
526
+ --input metadata.json \
527
+ --output snsql_metadata.yaml \
528
+ --schema MySchema \
529
+ --table MyTable
530
+ ```
531
+
532
+ With DP configuration options
533
+ ```bash
534
+ python csvw_to_smartnoise_sql.py \
535
+ --input metadata.json \
536
+ --output snsql_metadata.yaml \
537
+ --sample_max_ids True \
538
+ --censor_dims True \
539
+ --clamp_columns True
540
+ ```
541
+
542
+ ### 7. **`csvw_to_opendp_context.py`**
543
+
544
+ #### Purpose
545
+
546
+ Create an OpenDP `Context` from CSVW-SAFE metadata and a dataset.
547
+
548
+ This script bridges CSVW-SAFE metadata with the OpenDP library by:
549
+ - Converting metadata into OpenDP margins
550
+ - Defining privacy units and privacy loss
551
+ - Building a ready-to-use OpenDP `Context` for DP queries
552
+
553
+ The resulting OpenDP `Context` includes:
554
+
555
+ - Privacy unit (based on `max_contributions`)
556
+ - Privacy loss:
557
+ - ε-DP (Laplace)
558
+ - ρ-DP / zCDP (Gaussian)
559
+ - Margins derived from CSVW metadata
560
+ - Dataset (as a Polars LazyFrame)
561
+
562
+
563
+ #### Supported Privacy Models
564
+
565
+ | Model | Parameter |
566
+ |------|----------|
567
+ | Laplace DP | `epsilon` |
568
+ | Gaussian / zCDP | `rho` |
569
+ | Approximate DP | `delta` |
570
+
571
+ > You must provide either `epsilon` OR `rho`, not both.
572
+
573
+
574
+ #### CLI Usage
575
+
576
+ Basic conversion
577
+ ```bash
578
+ import polars as pl
579
+ from csvw_safe.csvw_to_opendp_context import csvw_to_opendp_context
580
+
581
+ data = pl.scan_csv("data.csv")
582
+
583
+ context = csvw_to_opendp_context(
584
+ csvw_meta=metadata,
585
+ data=data,
586
+ epsilon=1.0,
587
+ )
588
+ ```
589
+
590
+
591
+ ## Typical Workflow
592
+
593
+ ### Via CLI
594
+ 1. Generate baseline metadata from the original dataset:
595
+ ```
596
+ python make_metadata_from_data.py data.csv --privacy_unit user_id
597
+ ```
598
+
599
+ 2. Review manually with a data expert and approve metadata for safety and governance compliance.
600
+ Optionally, after removing private information, run (to validate metadata format)
601
+ ```
602
+ python validate_metadata.py metadata.json
603
+ ```
604
+ and with shacl constraints:
605
+ ```
606
+ python validate_metadata_shacl.py metadata.json csvw-safe-constraints.ttl
607
+ ```
608
+
609
+ 3. Generate a dummy dataset from the approved metadata:
610
+ ```
611
+ python make_dummy_from_metadata.py metadata.json --rows 1000 --output dummy.csv
612
+ ```
613
+
614
+ 4. Verify that the dummy matches the original structure:
615
+ ```
616
+ python assert_same_structure.py data.csv dummy.csv
617
+ ```
618
+
619
+
620
+ ### Python API Workflow
621
+ ```
622
+ import pandas as pd
623
+ from csvw_safe.make_metadata_from_data import make_metadata_from_data
624
+
625
+ df = pd.read_csv("data.csv")
626
+
627
+ # Generate metadata
628
+ metadata = make_metadata_from_data(df, csv_url="data.csv", individual_col="user_id")
629
+
630
+ ```
631
+ MANUAL REVIEW OF METADATA. VERIFY ONLY PUBLIC INFORMATION. REMOVE OTHERWISE.
632
+
633
+ ```
634
+ from csvw_safe.validate_metadata import validate_metadata
635
+ from csvw_safe.validate_metadata_shacl import validate_metadata_shacl
636
+ from csvw_safe.make_dummy_from_metadata import make_dummy_from_metadata
637
+ from csvw_safe.assert_same_structure import assert_same_structure
638
+
639
+ # Validate metadata
640
+ errors = validate_metadata(metadata)
641
+ errors = validate_metadata_shacl(metadata)
642
+
643
+ # Generate dummy dataset
644
+ dummy_df = make_dummy_from_metadata(metadata, nb_rows=500)
645
+
646
+ # Assert structure
647
+ assert_same_structure(df, dummy_df)
648
+ ```
649
+
650
+
651
+
652
+ # Directory Structure
653
+
654
+ ```
655
+ examples/
656
+ └─ Notebooks.ipynb # Example notebooks demonstrating CSVW-SAFE workflows
657
+
658
+ src/csvw_safe/
659
+ ├─ __init__.py # Package initializer for CSVW-SAFE library
660
+
661
+ ├─ make_metadata_from_data.py # Generate CSVW-SAFE metadata automatically from a dataset
662
+ ├─ make_dummy_from_metadata.py # Generate synthetic dummy datasets from CSVW-SAFE metadata
663
+ ├─ validate_metadata.py # Validate metadata using internal schema (TableMetadata model)
664
+ ├─ validate_metadata_shacl.py # Validate metadata using SHACL constraints via RDF graphs
665
+ ├─ assert_same_structure.py # Compare original and dummy CSVs for structural consistency
666
+
667
+ ├─ csvw_to_opendp_context.py # Convert CSVW-SAFE metadata into OpenDP analysis context
668
+ ├─ csvw_to_opendp_margins.py # Translate CSVW-SAFE metadata into OpenDP margin definitions
669
+ ├─ csvw_to_smartnoise_sql.py # Convert CSVW-SAFE metadata into SmartNoise SQL format
670
+
671
+ ├─ generate_series.py # Generate synthetic column values based on metadata rules
672
+ ├─ metadata_structure.py # Core data models defining CSVW-SAFE metadata schema
673
+ ├─ constants.py # Shared constants used across metadata pipeline
674
+ ├─ datatypes.py # Datatype inference and CSVW-SAFE type utilities
675
+ └─ utils.py # General helper utilities for metadata processing
676
+ tests/ # Unit and integration tests for CSVW-SAFE library
677
+
678
+ pyproject.toml # Project configuration and dependencies
679
+ README.md # Project overview and documentation entry point
680
+ run_linter.sh # Script to run linting and style checks
681
+ ```
682
+