csvw-safe 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- csvw_safe-0.0.1/LICENSE +19 -0
- csvw_safe-0.0.1/PKG-INFO +682 -0
- csvw_safe-0.0.1/README.md +651 -0
- csvw_safe-0.0.1/pyproject.toml +95 -0
- csvw_safe-0.0.1/setup.cfg +4 -0
- csvw_safe-0.0.1/src/csvw_safe/__init__.py +50 -0
- csvw_safe-0.0.1/src/csvw_safe/assert_same_structure.py +133 -0
- csvw_safe-0.0.1/src/csvw_safe/constants.py +79 -0
- csvw_safe-0.0.1/src/csvw_safe/csvw_to_opendp_context.py +173 -0
- csvw_safe-0.0.1/src/csvw_safe/csvw_to_opendp_margins.py +124 -0
- csvw_safe-0.0.1/src/csvw_safe/csvw_to_smartnoise_sql.py +261 -0
- csvw_safe-0.0.1/src/csvw_safe/datatypes.py +272 -0
- csvw_safe-0.0.1/src/csvw_safe/generate_series.py +434 -0
- csvw_safe-0.0.1/src/csvw_safe/make_dummy_from_metadata.py +260 -0
- csvw_safe-0.0.1/src/csvw_safe/make_metadata_from_data.py +837 -0
- csvw_safe-0.0.1/src/csvw_safe/metadata_structure.py +522 -0
- csvw_safe-0.0.1/src/csvw_safe/utils.py +179 -0
- csvw_safe-0.0.1/src/csvw_safe/validate_metadata.py +50 -0
- csvw_safe-0.0.1/src/csvw_safe/validate_metadata_shacl.py +100 -0
- csvw_safe-0.0.1/src/csvw_safe.egg-info/PKG-INFO +682 -0
- csvw_safe-0.0.1/src/csvw_safe.egg-info/SOURCES.txt +35 -0
- csvw_safe-0.0.1/src/csvw_safe.egg-info/dependency_links.txt +1 -0
- csvw_safe-0.0.1/src/csvw_safe.egg-info/requires.txt +18 -0
- csvw_safe-0.0.1/src/csvw_safe.egg-info/top_level.txt +1 -0
- csvw_safe-0.0.1/tests/test_csvw_to_opendp_context.py +229 -0
- csvw_safe-0.0.1/tests/test_csvw_to_opendp_margins.py +136 -0
- csvw_safe-0.0.1/tests/test_csvw_to_smartnoise_sql.py +174 -0
- csvw_safe-0.0.1/tests/test_datatypes.py +161 -0
- csvw_safe-0.0.1/tests/test_generate_series.py +401 -0
- csvw_safe-0.0.1/tests/test_make_dummy.py +207 -0
- csvw_safe-0.0.1/tests/test_make_metadata_contributions.py +211 -0
- csvw_safe-0.0.1/tests/test_make_metadata_main.py +231 -0
- csvw_safe-0.0.1/tests/test_make_metadata_synth.py +140 -0
- csvw_safe-0.0.1/tests/test_penguin_dataset.py +275 -0
- csvw_safe-0.0.1/tests/test_same_structure.py +75 -0
- csvw_safe-0.0.1/tests/test_validate_metadata.py +359 -0
- csvw_safe-0.0.1/tests/test_validate_metadata_shacl.py +44 -0
csvw_safe-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2018 The Python Packaging Authority
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
csvw_safe-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,682 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: csvw-safe
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Safe processing utilities for CSVW data
|
|
5
|
+
Author-email: "DSCC, FSO" <dscc@example.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/dsccadminch/csvw-safe-library/csvw-safe
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: numpy>=1.21
|
|
14
|
+
Requires-Dist: pandas>=1.3
|
|
15
|
+
Requires-Dist: pydantic>=2.12.5
|
|
16
|
+
Requires-Dist: pyyaml>=6.0.3
|
|
17
|
+
Requires-Dist: polars==1.32.0
|
|
18
|
+
Requires-Dist: pyarrow>=23.0.1
|
|
19
|
+
Requires-Dist: opendp>=0.12
|
|
20
|
+
Requires-Dist: rdflib>=6.0
|
|
21
|
+
Requires-Dist: pyshacl>=0.17.2
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: ruff>=0.6.0; extra == "dev"
|
|
24
|
+
Requires-Dist: mypy==1.15.0; extra == "dev"
|
|
25
|
+
Requires-Dist: coverage==7.6.12; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest==8.3.3; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov==6.0.0; extra == "dev"
|
|
28
|
+
Requires-Dist: types-PyYAML==6.0.12.20250402; extra == "dev"
|
|
29
|
+
Requires-Dist: markdown==3.8.2; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# CSVW-SAFE Utility Library
|
|
33
|
+
|
|
34
|
+
This library provides Python utilities for generating, validating, and testing CSVW-SAFE metadata and associated dummy datasets for differential privacy (DP) development and safe data modeling workflows.
|
|
35
|
+
|
|
36
|
+
It includes five main scripts:
|
|
37
|
+
|
|
38
|
+
1. `make_metadata_from_data.py`
|
|
39
|
+
2. `make_dummy_from_metadata.py`
|
|
40
|
+
3. `validate_metadata.py`
|
|
41
|
+
4. `validate_metadata_shacl.py` (requires `pyshacl`)
|
|
42
|
+
5. `assert_same_structure.py`
|
|
43
|
+
|
|
44
|
+

|
|
45
|
+
|
|
46
|
+
In addition, two other scripts are available for conversion of csvw-safe metadata to smartnoise sql and opendp libraries:
|
|
47
|
+
6. `csvw_to_smartnoise_sql.py` converts the metadata to the format expected in smartnoise-sql
|
|
48
|
+
7. `csvw_to_opendp_context.py` prepares a context object for opendp with margin and information extracted from csvw-metadata format.
|
|
49
|
+
|
|
50
|
+

|
|
51
|
+
|
|
52
|
+
**NOTES**:
|
|
53
|
+
- These scripts assist safe data modeling workflows; they DO NOT replace governance decisions on what is public information or not.
|
|
54
|
+
- IMPORTANT: Automatically generated metadata may contain sensitive information — MANUAL REVIEW IS ALWAYS REQUIRED before further steps.
|
|
55
|
+
|
|
56
|
+
For a description of [CSVW-SAFE metadata, see here](https://github.com/dscc-admin-ch/csvw-safe/blob/update_readme/README.md).
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
Install Python 3.11+ and
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install csvw-safe
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
or for development:
|
|
69
|
+
```
|
|
70
|
+
git clone https://github.com/dscc-admin-ch/csvw-safe-library.git
|
|
71
|
+
cd csvw-safe-library
|
|
72
|
+
pip install -e .[dev]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
For testing:
|
|
76
|
+
```
|
|
77
|
+
cd csvw-safe-library
|
|
78
|
+
pip install -e .[dev]
|
|
79
|
+
pytest --cov=csvw_safe --cov-report=term-missing tests/
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Learn via example
|
|
83
|
+
|
|
84
|
+
To get to know the library with examples, see the [notebook on the extended penguin dataset](https://github.com/dscc-admin-ch/csvw-safe/blob/update_readme/csvw-safe-library/examples/Use-Library.ipynb) and the associated outputs in [metadata example folder](https://github.com/dscc-admin-ch/csvw-safe/tree/update_readme/csvw-safe-library/examples/metadata).
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
## Scripts Overview
|
|
88
|
+
|
|
89
|
+
### 1. **`make_metadata_from_data.py`**
|
|
90
|
+
#### Purpose
|
|
91
|
+
|
|
92
|
+
Automatically generate baseline CSVW-SAFE metadata from an existing dataset.
|
|
93
|
+
|
|
94
|
+
This script infers:
|
|
95
|
+
- Column datatypes
|
|
96
|
+
- Nullability and missingness rates
|
|
97
|
+
- Numeric bounds (min/max)
|
|
98
|
+
- Optional continuous partitions
|
|
99
|
+
- Contribution constraints (DP-oriented metadata)
|
|
100
|
+
- Optional column dependencies
|
|
101
|
+
- Optional column grouping metadata
|
|
102
|
+
|
|
103
|
+
**Important**: This tool is for automated metadata *drafting only*. All outputs must be manually reviewed (and properties can be removed) before publication.
|
|
104
|
+
|
|
105
|
+
The script first builds a pydantic `TableMetadata` model and then serialises it to a json-ld via a `to_dict()` method. See [TableMetadata.md](https://github.com/dscc-admin-ch/csvw-safe/blob/main/csvw-safe-library/TableMetadata.md) for more detailed explanation on the inner workings.
|
|
106
|
+
|
|
107
|
+
#### Differential Privacy (DP) Contribution Levels
|
|
108
|
+
|
|
109
|
+
The script provides flexibility in defining the level of detail for DP metadata.
|
|
110
|
+
|
|
111
|
+
**Warning**: Increasing the level of detail (i.e., more granular contribution definitions) can increase the risk of privacy leakage.
|
|
112
|
+
It is strongly recommended to:
|
|
113
|
+
- Choose the lowest level of detail sufficient for your use case
|
|
114
|
+
- Carefully review and validate the generated metadata
|
|
115
|
+
|
|
116
|
+
Four contribution levels are supported: `table`, `table_with_keys`, `column` and `partition`. By default the contribution level is `default_contributions_level=table`. If a different level is required for a column, it can by given via the argument `fine_contributions_level` (see CLI usage examples below).
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
##### 1. `table` level
|
|
120
|
+
|
|
121
|
+
Defines DP constraints only at the table level.
|
|
122
|
+
|
|
123
|
+
Characteristics:
|
|
124
|
+
- Only table-level DP properties are specified
|
|
125
|
+
- Column metadata is minimal and includes:
|
|
126
|
+
- `name`
|
|
127
|
+
- `datatype`
|
|
128
|
+
- `required`
|
|
129
|
+
- `privacy_id`
|
|
130
|
+
- `nullable_proportion`
|
|
131
|
+
- `minimum` / `maximum` (if applicable)
|
|
132
|
+
- No:
|
|
133
|
+
- `public_keys_values` properties on column
|
|
134
|
+
- `ColumnGroup` class
|
|
135
|
+
- `Partition` class
|
|
136
|
+
|
|
137
|
+
Use case:
|
|
138
|
+
- When only global dataset-level privacy guarantees are required
|
|
139
|
+
- Safest option in terms of minimizing privacy leakage risk
|
|
140
|
+
|
|
141
|
+
##### 2. `table_with_keys` level
|
|
142
|
+
|
|
143
|
+
As `table` level but with keys on categorical columns and ColumnGroup.
|
|
144
|
+
|
|
145
|
+
Use case:
|
|
146
|
+
- As `table` with keys being public information (like months in year, hours in day).
|
|
147
|
+
|
|
148
|
+
##### 3. `column` level
|
|
149
|
+
|
|
150
|
+
Defines DP constraints at both the table_with_keys and column levels.
|
|
151
|
+
|
|
152
|
+
Requirements:
|
|
153
|
+
- `privacy_unit` must be specified to compute contribution bounds
|
|
154
|
+
|
|
155
|
+
Characteristics:
|
|
156
|
+
- Includes all `table`-level information
|
|
157
|
+
- Adds per-column DP properties (maximum contribution when grouping by the column):
|
|
158
|
+
- `max_length`
|
|
159
|
+
- `max_groups_per_unit`
|
|
160
|
+
- `max_contributions`
|
|
161
|
+
- For categorical columns:
|
|
162
|
+
- Extracts `public_keys_values` (set of possible values)
|
|
163
|
+
- Introduces column groups (`ColumnGroupMetadata`):
|
|
164
|
+
- Represent combinations of columns
|
|
165
|
+
- Include:
|
|
166
|
+
- `public_keys_values` (combinations of values)
|
|
167
|
+
- DP parameters for grouped contributions (maximum contribution when grouping by the group of columns)
|
|
168
|
+
|
|
169
|
+
Not included:
|
|
170
|
+
- No `Partition` objects
|
|
171
|
+
|
|
172
|
+
Use case:
|
|
173
|
+
- When per-column and multi-column contribution constraints are needed
|
|
174
|
+
- Balanced trade-off between utility and privacy
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
##### 4. `partition` level
|
|
178
|
+
|
|
179
|
+
Defines DP constraints at the table_with_keys, column, and partition levels.
|
|
180
|
+
|
|
181
|
+
Characteristics:
|
|
182
|
+
- Includes all `column`-level information
|
|
183
|
+
- Introduces explicit `Partition` objects
|
|
184
|
+
- DP parameters are defined at:
|
|
185
|
+
- Table level (global bounds)
|
|
186
|
+
- Partition level (fine-grained bounds)
|
|
187
|
+
|
|
188
|
+
Partition behavior:
|
|
189
|
+
- Each `Partition` specifies:
|
|
190
|
+
- A predicate (categorical value or continuous range)
|
|
191
|
+
- DP parameters (maximum contribution in the partition):
|
|
192
|
+
- `max_length`
|
|
193
|
+
- `max_groups_per_unit`
|
|
194
|
+
- `max_contributions`
|
|
195
|
+
- These parameters represent the maximum contribution of a privacy unit within that specific partition
|
|
196
|
+
|
|
197
|
+
Continuous columns:
|
|
198
|
+
- If bounds (`minimum`, `maximum`) are provided:
|
|
199
|
+
- The column is divided into partitions (e.g., ranges)
|
|
200
|
+
- Each partition is assigned its own DP constraints
|
|
201
|
+
|
|
202
|
+
Use case:
|
|
203
|
+
- When fine-grained control over contributions is required
|
|
204
|
+
- Highest expressiveness, but also highest privacy risk
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
##### Summary
|
|
208
|
+
|
|
209
|
+
| Level | Scope | Risk Level |
|
|
210
|
+
|------------|-----------------------|-------------|
|
|
211
|
+
| `table` | Table only | Lowest |
|
|
212
|
+
| `table_with_keys` | Table with keys in categorical columns | Medium |
|
|
213
|
+
| `column` | Table + Column | Medium |
|
|
214
|
+
| `partition` | Table + Column + Partition | Highest |
|
|
215
|
+
|
|
216
|
+
Start with the `table` level and only increase granularity if required.
|
|
217
|
+
Always validate that all included information is already public.
|
|
218
|
+
|
|
219
|
+
#### CLI Usage Examples
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
# Basic usage
|
|
223
|
+
python make_metadata_from_data.py data.csv --privacy_unit user_id
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
It is possible to compute dependencies (bigger, depends on, etc) between columns with
|
|
227
|
+
```bash
|
|
228
|
+
# Enable dependency detection (default: True)
|
|
229
|
+
python make_metadata_from_data.py data.csv \
|
|
230
|
+
--privacy_unit user_id \
|
|
231
|
+
--with_dependencies True
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
It is also possible to describe partitions level of continuous data if public bounds are provided
|
|
235
|
+
```bash
|
|
236
|
+
# Add continuous partitions
|
|
237
|
+
python make_metadata_from_data.py data.csv \
|
|
238
|
+
--privacy_unit user_id \
|
|
239
|
+
--continuous_partitions '{"age": [0, 18, 30, 50, 100]}'
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
It is also possible to describe group of columns information (like after grouping by a list of columns) to have their metadata
|
|
243
|
+
```bash
|
|
244
|
+
# Define column groups
|
|
245
|
+
python make_metadata_from_data.py data.csv \
|
|
246
|
+
--privacy_unit user_id \
|
|
247
|
+
--column_groups '[["age", "income"], ["city", "country"]]'
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
# Set default contribution level
|
|
252
|
+
python make_metadata_from_data.py data.csv \
|
|
253
|
+
--privacy_unit user_id \
|
|
254
|
+
--default_contributions_level table
|
|
255
|
+
|
|
256
|
+
# Column-specific contribution overrides
|
|
257
|
+
python make_metadata_from_data.py data.csv \
|
|
258
|
+
--privacy_unit user_id \
|
|
259
|
+
--fine_contributions_level '{"age": "column", "income": "partition"}'
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Save output to specific file
|
|
263
|
+
```bash
|
|
264
|
+
python make_metadata_from_data.py data.csv \
|
|
265
|
+
--privacy_unit user_id \
|
|
266
|
+
--output my_metadata.json
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
Notes
|
|
270
|
+
- Datetime columns are automatically inferred using pandas.to_datetime.
|
|
271
|
+
- Numeric bounds are computed only for non-string columns.
|
|
272
|
+
- Contribution levels control per-privacy-unit contribution constraints.
|
|
273
|
+
- Dependency detection may increase runtime on large datasets.
|
|
274
|
+
- Output is a JSON-serializable CSVW-SAFE metadata structure.
|
|
275
|
+
|
|
276
|
+
#### Future plans:
|
|
277
|
+
- Allow a DP vs non-DP mode (with/without) DP attributes
|
|
278
|
+
- Allow finer contribution level description (for now column level is very broad)
|
|
279
|
+
|
|
280
|
+
### 2. **`make_dummy_from_metadata.py`**
|
|
281
|
+
|
|
282
|
+
#### Purpose
|
|
283
|
+
|
|
284
|
+
Generate a synthetic dummy dataset from CSVW-SAFE metadata.
|
|
285
|
+
|
|
286
|
+
The generator creates structured data that follows the declared metadata constraints, including:
|
|
287
|
+
- Column datatypes
|
|
288
|
+
- Numeric and categorical partitions
|
|
289
|
+
- Optional dependency structure between columns
|
|
290
|
+
- Nullable proportions
|
|
291
|
+
- Column-group constraints (when provided)
|
|
292
|
+
|
|
293
|
+
**Important**: This tool produces synthetic structural data only.
|
|
294
|
+
It does not preserve semantic meaning or real-world correlations beyond what is encoded in metadata.
|
|
295
|
+
|
|
296
|
+
#### Output Guarantees
|
|
297
|
+
|
|
298
|
+
The generated dataset:
|
|
299
|
+
- Respects declared column schema (datatypes)
|
|
300
|
+
- Respects partition definitions (categorical + continuous)
|
|
301
|
+
- Respects numeric bounds when defined
|
|
302
|
+
- Applies nullable proportions per column
|
|
303
|
+
- Optionally respects column-group partition constraints
|
|
304
|
+
- Produces reproducible results via random seed
|
|
305
|
+
|
|
306
|
+
#### Typical Use Cases
|
|
307
|
+
|
|
308
|
+
- Unit testing of CSVW-SAFE and DP pipelines
|
|
309
|
+
- Schema validation without real data access
|
|
310
|
+
- Debugging metadata-driven transformations
|
|
311
|
+
- Synthetic data generation for integration tests
|
|
312
|
+
|
|
313
|
+
#### CLI Usage Examples
|
|
314
|
+
Basic example with 100 rows:
|
|
315
|
+
```bash
|
|
316
|
+
# Basic
|
|
317
|
+
python make_dummy_from_metadata.py metadata.json --output dummy.csv
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
Set a seed (seed=42) and a number of rows (rows=1000) for a reproducible example:
|
|
321
|
+
```bash
|
|
322
|
+
python make_dummy_from_metadata.py metadata.json \
|
|
323
|
+
--rows 1000 \
|
|
324
|
+
--seed 42 \
|
|
325
|
+
--output dummy.csv
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### 3. **`validate_metadata.py`**
|
|
329
|
+
|
|
330
|
+
#### Purpose
|
|
331
|
+
|
|
332
|
+
Validate a CSVW-SAFE metadata file against the internal metadata schema.
|
|
333
|
+
|
|
334
|
+
This tool ensures that a metadata file is structurally correct and conforms to the expected CSVW-SAFE specification as defined by the internal `TableMetadata` model.
|
|
335
|
+
|
|
336
|
+
It is primarily used as a validation step before using metadata for:
|
|
337
|
+
- dummy dataset generation
|
|
338
|
+
- DP pipeline configuration
|
|
339
|
+
- downstream schema-driven processing
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
This validator performs schema-level validation only, including:
|
|
343
|
+
- Required fields presence
|
|
344
|
+
- Type correctness
|
|
345
|
+
- Structural consistency of metadata objects
|
|
346
|
+
- Compatibility with the `TableMetadata` model
|
|
347
|
+
|
|
348
|
+
Validation is implemented via a Pydantic model (`TableMetadata.from_dict`). See [TableMetadata.md](https://github.com/dscc-admin-ch/csvw-safe/blob/main/csvw-safe-library/TableMetadata.md) for more detailed explanation of the underlying pydantic model used to validate the metadata.
|
|
349
|
+
|
|
350
|
+
Output behaviour:
|
|
351
|
+
- If metadata is valid → script exits silently (no output)
|
|
352
|
+
- If metadata is invalid → raises a validation exception and exits with error
|
|
353
|
+
|
|
354
|
+
#### CLI Usage
|
|
355
|
+
```bash
|
|
356
|
+
python validate_metadata.py metadata.json
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
### 4. **`validate_metadata_shacl.py`**
|
|
361
|
+
|
|
362
|
+
#### Purpose
|
|
363
|
+
|
|
364
|
+
Validate CSVW-SAFE metadata using a SHACL constraint schema.
|
|
365
|
+
|
|
366
|
+
This tool performs structural validation of metadata expressed in JSON-LD format against a SHACL shapes graph defined in Turtle format.
|
|
367
|
+
|
|
368
|
+
It is the most strict validation layer in the CSVW-SAFE toolchain, intended to ensure full compliance with RDF-based constraints.
|
|
369
|
+
|
|
370
|
+
#### Validation Scope
|
|
371
|
+
|
|
372
|
+
This validator checks:
|
|
373
|
+
- RDF structural consistency of metadata (JSON-LD parsing)
|
|
374
|
+
- Constraint satisfaction against SHACL shapes
|
|
375
|
+
- Class/property-level restrictions defined in the schema
|
|
376
|
+
- Cross-field structural rules defined in the SHACL graph
|
|
377
|
+
|
|
378
|
+
> Unlike `validate_metadata.py`, this tool performs formal SHACL validation, not just schema validation.
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
CLI usage
|
|
382
|
+
```bash
|
|
383
|
+
python validate_metadata_shacl.py metadata.jsonld shapes.ttl
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
Validation output
|
|
387
|
+
On success: SHACL validation SUCCESSFUL
|
|
388
|
+
On failure: SHACL validation FAILED with a <detailed SHACL report>
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
Typical Use Cases
|
|
392
|
+
- Formal compliance validation of CSVW-SAFE metadata
|
|
393
|
+
- CI/CD enforcement of metadata correctness
|
|
394
|
+
- Pre-deployment validation in RDF-based pipelines
|
|
395
|
+
- Ensuring compatibility with external SHACL-aware systems
|
|
396
|
+
|
|
397
|
+
Notes
|
|
398
|
+
- Metadata must be valid JSON-LD RDF
|
|
399
|
+
- SHACL shapes must be valid Turtle RDF
|
|
400
|
+
- This is the strictest validation layer
|
|
401
|
+
- More expressive than Pydantic-based validation (validate_metadata.py)
|
|
402
|
+
|
|
403
|
+
### 5. **`assert_same_structure.py`**
|
|
404
|
+
|
|
405
|
+
#### Purpose
|
|
406
|
+
|
|
407
|
+
Verify that a generated dummy CSV preserves the structural properties of an original dataset under the CSVW-SAFE assumptions.
|
|
408
|
+
|
|
409
|
+
This tool ensures that synthetic data produced by `make_dummy_from_metadata.py` remains schema-compatible with the original dataset used to derive metadata.
|
|
410
|
+
|
|
411
|
+
This validator checks structure only. It does not assess statistical similarity or data realism.
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
The tool checks:
|
|
415
|
+
- Column names and ordering
|
|
416
|
+
- Inferred CSVW-SAFE datatypes
|
|
417
|
+
- Nullability constraints (required vs optional columns)
|
|
418
|
+
- Optional categorical domain compatibility (subset check)
|
|
419
|
+
|
|
420
|
+
It does not check:
|
|
421
|
+
- Statistical similarity between datasets
|
|
422
|
+
- Distributional properties
|
|
423
|
+
- Correlation structure
|
|
424
|
+
- Semantic correctness of values
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
#### Core Validation Logic
|
|
428
|
+
|
|
429
|
+
Ensures that both datasets share identical schema:
|
|
430
|
+
|
|
431
|
+
- Same column names
|
|
432
|
+
- Same column ordering
|
|
433
|
+
|
|
434
|
+
Each column is type-checked using CSVW-SAFE inference:
|
|
435
|
+
|
|
436
|
+
- Datatypes are inferred via `infer_xmlschema_datatype`
|
|
437
|
+
- Integer subtype differences are tolerated (e.g., small vs large integer variants)
|
|
438
|
+
|
|
439
|
+
Validates whether required/optional status is preserved:
|
|
440
|
+
|
|
441
|
+
- A column is considered required if it has no missing values
|
|
442
|
+
- Both datasets must agree on required vs optional status per column
|
|
443
|
+
|
|
444
|
+
If enabled, ensures:
|
|
445
|
+
|
|
446
|
+
- All values in dummy dataset are a subset of original dataset values
|
|
447
|
+
- Uses `is_categorical()` to detect categorical columns
|
|
448
|
+
|
|
449
|
+
#### CLI Usage
|
|
450
|
+
|
|
451
|
+
```bash
|
|
452
|
+
python assert_same_structure.py original.csv dummy.csv
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
Skip categorical validation
|
|
456
|
+
```
|
|
457
|
+
python assert_same_structure.py original.csv dummy.csv --no-categories
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
Typical Use Cases
|
|
461
|
+
- Validate synthetic dataset generation correctness
|
|
462
|
+
- Regression testing for metadata-driven pipelines
|
|
463
|
+
- Ensuring structural integrity in DP synthetic data workflows
|
|
464
|
+
- Debugging mismatches between metadata and generated datasets
|
|
465
|
+
Notes
|
|
466
|
+
- This tool is intentionally strict on schema alignment but lenient on integer type variations
|
|
467
|
+
- Designed to validate synthetic structural fidelity, not realism
|
|
468
|
+
- Works best in combination with: make_metadata_from_data.py and make_dummy_from_metadata.py
|
|
469
|
+
|
|
470
|
+
### 6. **`csvw_to_smartnoise_sql.py`**
|
|
471
|
+
|
|
472
|
+
#### Purpose
|
|
473
|
+
|
|
474
|
+
Convert CSVW-SAFE metadata into the format expected by SmartNoise SQL.
|
|
475
|
+
|
|
476
|
+
This script transforms a CSVW-SAFE JSON metadata file into a SmartNoise-compatible YAML configuration, enabling direct use in differential privacy queries with SmartNoise SQL.
|
|
477
|
+
|
|
478
|
+
The script maps CSVW-SAFE metadata into SmartNoise SQL structure:
|
|
479
|
+
|
|
480
|
+
- Table-level privacy constraints:
|
|
481
|
+
- `max_contributions` → `max_ids`
|
|
482
|
+
- Column definitions:
|
|
483
|
+
- Datatypes (converted to SmartNoise types)
|
|
484
|
+
- Nullability
|
|
485
|
+
- Value bounds (`minimum` / `maximum` → `lower` / `upper`)
|
|
486
|
+
- Privacy identifier (`privacy_id` → `private_id`)
|
|
487
|
+
- Optional DP configuration parameters:
|
|
488
|
+
- sampling, clamping, censoring, DPSU
|
|
489
|
+
|
|
490
|
+
#### Output Structure
|
|
491
|
+
|
|
492
|
+
The generated YAML follows SmartNoise SQL format:
|
|
493
|
+
|
|
494
|
+
```yaml
|
|
495
|
+
"":
|
|
496
|
+
schema_name:
|
|
497
|
+
table_name:
|
|
498
|
+
max_ids: ...
|
|
499
|
+
rows: ...
|
|
500
|
+
sample_max_ids: ...
|
|
501
|
+
censor_dims: ...
|
|
502
|
+
clamp_counts: ...
|
|
503
|
+
clamp_columns: ...
|
|
504
|
+
use_dpsu: ...
|
|
505
|
+
column_name:
|
|
506
|
+
name: ...
|
|
507
|
+
type: ...
|
|
508
|
+
nullable: ...
|
|
509
|
+
lower: ...
|
|
510
|
+
upper: ...
|
|
511
|
+
private_id: ...
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
#### CLI Usage
|
|
515
|
+
|
|
516
|
+
Basic conversion
|
|
517
|
+
```bash
|
|
518
|
+
python csvw_to_smartnoise_sql.py \
|
|
519
|
+
--input metadata.json \
|
|
520
|
+
--output snsql_metadata.yaml
|
|
521
|
+
```
|
|
522
|
+
|
|
523
|
+
With custom schema and table
|
|
524
|
+
```bash
|
|
525
|
+
python csvw_to_smartnoise_sql.py \
|
|
526
|
+
--input metadata.json \
|
|
527
|
+
--output snsql_metadata.yaml \
|
|
528
|
+
--schema MySchema \
|
|
529
|
+
--table MyTable
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
With DP configuration options
|
|
533
|
+
```bash
|
|
534
|
+
python csvw_to_smartnoise_sql.py \
|
|
535
|
+
--input metadata.json \
|
|
536
|
+
--output snsql_metadata.yaml \
|
|
537
|
+
--sample_max_ids True \
|
|
538
|
+
--censor_dims True \
|
|
539
|
+
--clamp_columns True
|
|
540
|
+
```
|
|
541
|
+
|
|
542
|
+
### 7. **`csvw_to_opendp_context.py`**
|
|
543
|
+
|
|
544
|
+
#### Purpose
|
|
545
|
+
|
|
546
|
+
Create an OpenDP `Context` from CSVW-SAFE metadata and a dataset.
|
|
547
|
+
|
|
548
|
+
This script bridges CSVW-SAFE metadata with the OpenDP library by:
|
|
549
|
+
- Converting metadata into OpenDP margins
|
|
550
|
+
- Defining privacy units and privacy loss
|
|
551
|
+
- Building a ready-to-use OpenDP `Context` for DP queries
|
|
552
|
+
|
|
553
|
+
The resulting OpenDP `Context` includes:
|
|
554
|
+
|
|
555
|
+
- Privacy unit (based on `max_contributions`)
|
|
556
|
+
- Privacy loss:
|
|
557
|
+
- ε-DP (Laplace)
|
|
558
|
+
- ρ-DP / zCDP (Gaussian)
|
|
559
|
+
- Margins derived from CSVW metadata
|
|
560
|
+
- Dataset (as a Polars LazyFrame)
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
#### Supported Privacy Models
|
|
564
|
+
|
|
565
|
+
| Model | Parameter |
|
|
566
|
+
|------|----------|
|
|
567
|
+
| Laplace DP | `epsilon` |
|
|
568
|
+
| Gaussian / zCDP | `rho` |
|
|
569
|
+
| Approximate DP | `delta` |
|
|
570
|
+
|
|
571
|
+
> You must provide either `epsilon` OR `rho`, not both.
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
#### Python Usage
|
|
575
|
+
|
|
576
|
+
Basic conversion
|
|
577
|
+
```python
|
|
578
|
+
import polars as pl
|
|
579
|
+
from csvw_safe.csvw_to_opendp_context import csvw_to_opendp_context
|
|
580
|
+
|
|
581
|
+
data = pl.scan_csv("data.csv")
|
|
582
|
+
|
|
583
|
+
context = csvw_to_opendp_context(
|
|
584
|
+
csvw_meta=metadata,
|
|
585
|
+
data=data,
|
|
586
|
+
epsilon=1.0,
|
|
587
|
+
)
|
|
588
|
+
```
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
## Typical Workflow
|
|
592
|
+
|
|
593
|
+
### Via CLI
|
|
594
|
+
1. Generate baseline metadata from the original dataset:
|
|
595
|
+
```
|
|
596
|
+
python make_metadata_from_data.py data.csv --privacy_unit user_id --default_contributions_level column
|
|
597
|
+
```
|
|
598
|
+
|
|
599
|
+
2. Review manually with a data expert and approve metadata for safety and governance compliance.
|
|
600
|
+
Optionally, after removing private information, run (to validate the metadata format)
|
|
601
|
+
```
|
|
602
|
+
python validate_metadata.py metadata.json
|
|
603
|
+
```
|
|
604
|
+
and with shacl constraints:
|
|
605
|
+
```
|
|
606
|
+
python validate_metadata_shacl.py metadata.json csvw-safe-constraints.ttl
|
|
607
|
+
```
|
|
608
|
+
|
|
609
|
+
3. Generate a dummy dataset from the approved metadata:
|
|
610
|
+
```
|
|
611
|
+
python make_dummy_from_metadata.py metadata.json --rows 1000 --output dummy.csv
|
|
612
|
+
```
|
|
613
|
+
|
|
614
|
+
4. Verify that the dummy matches the original structure:
|
|
615
|
+
```
|
|
616
|
+
python assert_same_structure.py data.csv dummy.csv
|
|
617
|
+
```
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
### Python API Workflow
|
|
621
|
+
```
|
|
622
|
+
import pandas as pd
|
|
623
|
+
from csvw_safe.make_metadata_from_data import make_metadata_from_data
|
|
624
|
+
|
|
625
|
+
df = pd.read_csv("data.csv")
|
|
626
|
+
|
|
627
|
+
# Generate metadata
|
|
628
|
+
metadata = make_metadata_from_data(df, csv_url="data.csv", individual_col="user_id")
|
|
629
|
+
|
|
630
|
+
```
|
|
631
|
+
MANUAL REVIEW OF METADATA. VERIFY ONLY PUBLIC INFORMATION. REMOVE OTHERWISE.
|
|
632
|
+
|
|
633
|
+
```
|
|
634
|
+
from csvw_safe.validate_metadata import validate_metadata
|
|
635
|
+
from csvw_safe.validate_metadata_shacl import validate_metadata_shacl
|
|
636
|
+
from csvw_safe.make_dummy_from_metadata import make_dummy_from_metadata
|
|
637
|
+
from csvw_safe.assert_same_structure import assert_same_structure
|
|
638
|
+
|
|
639
|
+
# Validate metadata
|
|
640
|
+
errors = validate_metadata(metadata)
|
|
641
|
+
errors = validate_metadata_shacl(metadata)
|
|
642
|
+
|
|
643
|
+
# Generate dummy dataset
|
|
644
|
+
dummy_df = make_dummy_from_metadata(metadata, nb_rows=500)
|
|
645
|
+
|
|
646
|
+
# Assert structure
|
|
647
|
+
assert_same_structure(df, dummy_df)
|
|
648
|
+
```
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
|
|
652
|
+
# Directory Structure
|
|
653
|
+
|
|
654
|
+
```
|
|
655
|
+
examples/
|
|
656
|
+
└─ Notebooks.ipynb # Example notebooks demonstrating CSVW-SAFE workflows
|
|
657
|
+
|
|
658
|
+
src/csvw_safe/
|
|
659
|
+
├─ __init__.py # Package initializer for CSVW-SAFE library
|
|
660
|
+
|
|
661
|
+
├─ make_metadata_from_data.py # Generate CSVW-SAFE metadata automatically from a dataset
|
|
662
|
+
├─ make_dummy_from_metadata.py # Generate synthetic dummy datasets from CSVW-SAFE metadata
|
|
663
|
+
├─ validate_metadata.py # Validate metadata using internal schema (TableMetadata model)
|
|
664
|
+
├─ validate_metadata_shacl.py # Validate metadata using SHACL constraints via RDF graphs
|
|
665
|
+
├─ assert_same_structure.py # Compare original and dummy CSVs for structural consistency
|
|
666
|
+
|
|
667
|
+
├─ csvw_to_opendp_context.py # Convert CSVW-SAFE metadata into OpenDP analysis context
|
|
668
|
+
├─ csvw_to_opendp_margins.py # Translate CSVW-SAFE metadata into OpenDP margin definitions
|
|
669
|
+
├─ csvw_to_smartnoise_sql.py # Convert CSVW-SAFE metadata into SmartNoise SQL format
|
|
670
|
+
|
|
671
|
+
├─ generate_series.py # Generate synthetic column values based on metadata rules
|
|
672
|
+
├─ metadata_structure.py # Core data models defining CSVW-SAFE metadata schema
|
|
673
|
+
├─ constants.py # Shared constants used across metadata pipeline
|
|
674
|
+
├─ datatypes.py # Datatype inference and CSVW-SAFE type utilities
|
|
675
|
+
└─ utils.py # General helper utilities for metadata processing
|
|
676
|
+
tests/ # Unit and integration tests for CSVW-SAFE library
|
|
677
|
+
|
|
678
|
+
pyproject.toml # Project configuration and dependencies
|
|
679
|
+
README.md # Project overview and documentation entry point
|
|
680
|
+
run_linter.sh # Script to run linting and style checks
|
|
681
|
+
```
|
|
682
|
+
|