csvw-safe 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
csvw_safe/__init__.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ Top-level public interface for csvw_safe.
3
+
4
+ This module provides a simplified API by re-exporting the most commonly used
5
+ functions, classes, and constants for working with CSVW-style metadata.
6
+
7
+ It includes utilities to:
8
+
9
+ - Generate metadata from datasets
10
+ - Generate dummy datasets from metadata
11
+ - Validate metadata (standard and SHACL-based validation)
12
+ - Convert metadata to OpenDP and SmartNoise SQL contexts
13
+ - Assert structural equivalence between datasets
14
+ - Work with metadata models and datatypes
15
+ """
16
+
17
+ from .assert_same_structure import assert_same_structure
18
+ from .constants import COL_LIST, COL_NAME, MAXIMUM, MINIMUM, TABLE_SCHEMA
19
+ from .csvw_to_opendp_context import csvw_to_opendp_context
20
+ from .csvw_to_smartnoise_sql import csvw_to_smartnoise_sql
21
+ from .datatypes import XSD_GROUP_MAP, DataTypesGroups, to_pandas_dtype
22
+ from .make_dummy_from_metadata import make_dummy_from_metadata
23
+ from .make_metadata_from_data import make_metadata_from_data
24
+ from .metadata_structure import ColumnMetadata, TableMetadata
25
+ from .validate_metadata import validate_metadata
26
+ from .validate_metadata_shacl import validate_metadata_shacl
27
+
28
+ __all__ = [ # noqa: RUF022
29
+ # Core functionality
30
+ "assert_same_structure",
31
+ "csvw_to_opendp_context",
32
+ "csvw_to_smartnoise_sql",
33
+ "make_dummy_from_metadata",
34
+ "make_metadata_from_data",
35
+ "validate_metadata",
36
+ "validate_metadata_shacl",
37
+ # Metadata models
38
+ "TableMetadata",
39
+ "ColumnMetadata",
40
+ # Constants
41
+ "COL_LIST",
42
+ "COL_NAME",
43
+ "MAXIMUM",
44
+ "MINIMUM",
45
+ "TABLE_SCHEMA",
46
+ # Datatypes
47
+ "XSD_GROUP_MAP",
48
+ "DataTypesGroups",
49
+ "to_pandas_dtype",
50
+ ]
@@ -0,0 +1,133 @@
1
+ """
2
+ Utility script to verify that a generated dummy CSV preserves the structural.
3
+
4
+ properties of an original CSV dataset.
5
+
6
+ The script checks:
7
+ - column names and order
8
+ - inferred CSVW-SAFE datatypes
9
+ - nullability (required vs optional columns)
10
+ - optional categorical value compatibility
11
+
12
+ It does NOT check statistical similarity, only structural compatibility.
13
+ """
14
+
15
+ import argparse
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ import pandas as pd
20
+
21
+ from csvw_safe.datatypes import (
22
+ XSD_GROUP_MAP,
23
+ DataTypesGroups,
24
+ infer_xmlschema_datatype,
25
+ is_categorical,
26
+ )
27
+
28
+
29
def assert_same_structure(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    check_categories: bool = True,
) -> None:
    """
    Verify that two dataframes share the same structural schema.

    The function checks column names/order, inferred datatypes,
    nullability constraints, and optionally categorical value sets.
    It does NOT check statistical similarity.

    Parameters
    ----------
    df1 : pd.DataFrame
        Original dataframe.
    df2 : pd.DataFrame
        Dummy dataframe.
    check_categories : bool, default=True
        Whether to verify that categorical values in the dummy data
        are subsets of those in the original data.

    Raises
    ------
    AssertionError
        If any structural mismatch is detected.

    """
    # Columns: names and order must match exactly before any per-column check.
    if list(df1.columns) != list(df2.columns):
        raise AssertionError(
            f"Column names/order differ:\nOriginal: {list(df1.columns)}\nDummy: {list(df2.columns)}"
        )

    # Data types: compare inferred XML Schema datatypes per column.
    for col in df1.columns:
        dtype1 = infer_xmlschema_datatype(df1[col])
        dtype2 = infer_xmlschema_datatype(df2[col])

        group1 = XSD_GROUP_MAP.get(dtype1)
        group2 = XSD_GROUP_MAP.get(dtype2)

        # If both are integer types, accept subtype differences
        if group1 == DataTypesGroups.INTEGER and group2 == DataTypesGroups.INTEGER:
            continue

        if dtype1 != dtype2:
            raise AssertionError(f"Column '{col}' dtype mismatch: original={dtype1}, dummy={dtype2}")

    # Nullability: a column is "required" when it contains no missing values.
    for col in df1.columns:
        required1: bool = df1[col].notna().all()
        required2: bool = df2[col].notna().all()

        if required1 != required2:
            raise AssertionError(
                f"Column '{col}' nullability mismatch: original required={required1}, "
                f"dummy required={required2}"
            )

    # Categorical subset check: dummy values must not introduce new categories.
    if check_categories:
        cat_cols = [col for col in df1.columns if is_categorical(df1[col])]
        for col in cat_cols:
            vals1 = set(df1[col].dropna().unique())
            vals2 = set(df2[col].dropna().unique())

            if not vals2.issubset(vals1):
                raise AssertionError(
                    f"Column '{col}' dummy values {vals2} are not subset of original {vals1}"
                )
99
+
100
+
101
def main() -> None:
    """Command-line entry point for the CSV structure validator."""
    arg_parser = argparse.ArgumentParser(
        description="Assert that two CSV files match CSVW-SAFE structural properties"
    )
    arg_parser.add_argument("original_csv", type=str, help="Original CSV file")
    arg_parser.add_argument("dummy_csv", type=str, help="Dummy CSV file")
    arg_parser.add_argument(
        "--no-categories",
        action="store_true",
        help="Skip categorical subset validation",
    )
    cli_args = arg_parser.parse_args()

    # Load both datasets before comparing their structure.
    original_df = pd.read_csv(Path(cli_args.original_csv), parse_dates=True)
    dummy_df = pd.read_csv(Path(cli_args.dummy_csv), parse_dates=True)

    try:
        assert_same_structure(
            original_df,
            dummy_df,
            check_categories=not cli_args.no_categories,
        )
    except AssertionError as e:
        # Structural mismatch detected: report it and exit with status 1.
        print(f"Structure mismatch: {e}")  # noqa: T201
        sys.exit(1)
    except Exception as e:
        # Any other failure (bad path, parse error, ...): exit with status 2.
        print(f"ERROR: {e}")  # noqa: T201
        sys.exit(2)


if __name__ == "__main__":
    main()
csvw_safe/constants.py ADDED
@@ -0,0 +1,79 @@
1
+ """Defaults, constants and metadata objects for csvw-safe."""
2
+
3
+ import string
4
+ from enum import StrEnum
5
+ from pathlib import Path
6
+
7
+ # ============================================================
8
+ # CSVW
9
+ # ============================================================
10
+ CSVW_CONTEXT = "http://www.w3.org/ns/csvw"
11
+ COL_NAME = "name"
12
+ DATATYPE = "datatype"
13
+ REQUIRED = "required"
14
+ MINIMUM = "minimum"
15
+ MAXIMUM = "maximum"
16
+ TABLE_SCHEMA = "tableSchema"
17
+ COL_LIST = "columns"
18
+ COL_TYPE = "Column"
19
+ TABLE_TYPE = "Table"
20
+
21
+ # ============================================================
22
+ # CSVW_SAFE Namespaces
23
+ # ============================================================
24
+ CSVW_SAFE_CONTEXT = str((Path(__file__).resolve().parents[2] / "csvw-safe-context.jsonld").resolve()) # tmp
25
+
26
+ # Column groups / partitions
27
+ COLUMN_GROUP = "ColumnGroup"
28
+ PARTITION = "Partition"
29
+ COLUMNS_IN_GROUP = "columnsInGroup"
30
+ PUBLIC_PARTITIONS = "partitions"
31
+ KEY_VALUES = "keyValues"
32
+ EXHAUSTIVE_KEYS = "exhaustiveKeys"
33
+ INVARIANT_PUBLIC_KEYS = "invariantPublicKeys"
34
+ MAX_NUM_PARTITIONS = "maxNumPartitions"
35
+ PUBLIC_LENGTH = "publicLength"
36
+ PRIVACY_UNIT = "privacyUnit"
37
+ PRIVACY_ID = "privacyId"
38
+ ADD_INFO = "additionalInformation"
39
+
40
+ # Differential privacy bounds
41
+ MAX_LENGTH = "maxLength"
42
+ MAX_GROUPS = "maxGroupsPerUnit"
43
+ MAX_CONTRIB = "maxContributions"
44
+
45
+ # Partition predicates
46
+ PREDICATE = "predicate"
47
+ PARTITION_VALUE = "partitionValue"
48
+ LOWER_BOUND = "lowerBound"
49
+ UPPER_BOUND = "upperBound"
50
+
51
+ # Synthetic modeling
52
+ NULL_PROP = "nullableProportion"
53
+ ROW_DEP = "rowDependencies"
54
+ DEPENDS_ON = "dependsOn"
55
+ DEPENDENCY_TYPE = "dependencyType"
56
+ VALUE_MAP = "valueMap"
57
+
58
+
59
+ # ============================================================
60
+ # Make and generate metadata
61
+ # ============================================================
62
+ class DependencyType(StrEnum):
63
+ """Types of column dependency relationships."""
64
+
65
+ MAPPING = "mapping"
66
+ BIGGER = "bigger"
67
+ # SMALLER = "smaller" # redundant with bigger
68
+ FIXED = "fixedPerEntity"
69
+
70
+
71
# ============================================================
# Default Values
# ============================================================
DATE_LENGTH = 10  # YYYY-MM-DD only
# Range bounds are treated as inclusive by default.
DEFAULT_LOWER_INCLUSIVE = True
DEFAULT_UPPER_INCLUSIVE = True

DEFAULT_NUMBER_PARTITIONS = 10
# Alphabet for generated random strings: [a-zA-Z0-9].
RANDOM_STRINGS = list(string.ascii_lowercase + string.ascii_uppercase + string.digits)
@@ -0,0 +1,173 @@
1
+ """
2
+ Create an OpenDP Context from CSVW-SAFE metadata and a dataset.
3
+
4
+ This module:
5
+ - Converts CSVW-SAFE metadata into OpenDP margins
6
+ - Builds an OpenDP Context using a provided dataset
7
+ - Supports epsilon-based (Laplace) and rho-based (Gaussian) DP
8
+ - Exposes both a Python API and CLI
9
+
10
+ The resulting context can be used for differentially private queries.
11
+ """
12
+
13
+ from collections.abc import Sequence
14
+ from typing import Any, Union
15
+
16
+ import opendp.prelude as dp
17
+ import polars as pl
18
+ from opendp.extras.polars import Bound
19
+ from opendp.mod import Measure, Metric, enable_features
20
+
21
+ from csvw_safe.constants import MAX_CONTRIB # , PRIVACY_UNIT
22
+ from csvw_safe.csvw_to_opendp_margins import csvw_to_opendp_margins
23
+
24
+ enable_features("contrib")
25
+
26
+
27
def get_privacy_loss(
    epsilon: float | None = None,
    rho: float | None = None,
    delta: float | None = None,
) -> tuple[Measure, Any]:
    """
    Create an opendp privacy loss object.

    Exactly one of ``epsilon`` (pure/approximate DP) or ``rho`` (zCDP)
    must be provided; ``delta`` is optional in both cases.

    Parameters
    ----------
    epsilon : float, optional
        Privacy budget epsilon (for Laplace DP).
    rho : float, optional
        Privacy budget rho (for Gaussian / zCDP).
    delta : float, optional
        Privacy budget delta (if using approximate DP).

    Returns
    -------
    privacy_loss
        opendp privacy loss object

    Raises
    ------
    ValueError
        If neither epsilon nor rho is provided, or if both are provided.

    """
    if epsilon is None and rho is None:
        raise ValueError("Either epsilon or rho must be provided")

    if epsilon is not None and rho is not None:
        raise ValueError("Specify only one of epsilon or rho")

    if epsilon is not None:
        return dp.loss_of(epsilon=epsilon, delta=delta)

    return dp.loss_of(rho=rho, delta=delta)
65
+
66
+
67
def get_privacy_unit(
    csvw_meta: dict[str, Any], distance: str
) -> tuple[Metric, Union[float, Sequence[Bound]]]:
    """
    Construct an OpenDP privacy unit from CSVW-SAFE metadata.

    Parameters
    ----------
    csvw_meta : Dict[str, Any]
        CSVW-SAFE metadata dictionary. Must contain the ``maxContributions``
        bound (see ``MAX_CONTRIB`` in ``csvw_safe.constants``).
    distance : str
        Type of privacy distance metric to use (e.g. "contributions", "changes").

    Returns
    -------
    privacy_unit
        OpenDP privacy unit descriptor.

    Raises
    ------
    ValueError
        If the max-contributions bound is missing from the metadata,
        or if ``distance`` is not a supported distance type.

    """
    if MAX_CONTRIB not in csvw_meta:
        # Report the actual metadata key ("maxContributions") so users
        # know exactly what to add to their metadata.
        raise ValueError(f"Missing {MAX_CONTRIB} in metadata")

    max_contrib = csvw_meta[MAX_CONTRIB]

    kwargs: dict[str, Any] = {}

    # Map distance type → correct argument
    if distance == "contributions":
        kwargs["contributions"] = max_contrib
    elif distance == "changes":
        kwargs["changes"] = max_contrib
    # Other OpenDP distances (absolute, l1, l2) are intentionally not
    # exposed yet; extend this mapping when they are needed.
    else:
        raise ValueError(f"Unsupported distance type: {distance}")

    # identifier = csvw_meta.get(PRIVACY_UNIT)
    # if identifier is not None:
    #     kwargs["identifier"] = pl.col(identifier)  # TODO: investigate more

    return dp.unit_of(**kwargs)
112
+
113
+
114
def csvw_to_opendp_context(  # noqa: PLR0913
    csvw_meta: dict[str, Any],
    data: pl.LazyFrame,
    epsilon: float | None = None,
    rho: float | None = None,
    delta: float | None = None,
    split_evenly_over: int | None = None,
    split_by_weights: list[float] | None = None,
    distance: str = "contributions",
) -> dp.Context:
    """
    Create an OpenDP Context from CSVW-SAFE metadata and a dataset.

    Parameters
    ----------
    csvw_meta : Dict[str, Any]
        CSVW-SAFE metadata dictionary.
        Must include `csvw-safe.dp.maxContributions`.
    data : pl.LazyFrame
        Input dataset (recommended as LazyFrame).
    epsilon : float, optional
        Privacy budget epsilon (for Laplace DP).
    rho : float, optional
        Privacy budget rho (for Gaussian / zCDP).
    delta : float, optional
        Privacy budget delta (if using approximate DP).
    split_evenly_over : int
        Number of queries to split privacy budget across.
    split_by_weights: list[float]
        List of privacy budget weight by query.
    distance: str, default='contributions'
        Distance metric for privacy unit.

    Returns
    -------
    Context
        OpenDP Context object ready for queries.

    Raises
    ------
    ValueError
        If required metadata (max_contributions) is missing.
        If neither epsilon nor rho is provided.

    """
    # The two budget-splitting strategies are mutually exclusive.
    if split_evenly_over is not None and split_by_weights is not None:
        raise ValueError("Specify only one of split_evenly_over or split_by_weights")

    # Assemble the compositor arguments from the metadata and budget.
    context_kwargs: dict[str, Any] = {
        "data": data,
        "privacy_unit": get_privacy_unit(csvw_meta, distance),
        "privacy_loss": get_privacy_loss(epsilon, rho, delta),
        "margins": csvw_to_opendp_margins(csvw_meta),
    }
    if split_by_weights is None:
        context_kwargs["split_evenly_over"] = split_evenly_over
    else:
        context_kwargs["split_by_weights"] = split_by_weights

    return dp.Context.compositor(**context_kwargs)
@@ -0,0 +1,124 @@
1
+ """
2
+ Convert CSVW-SAFE JSON metadata into OpenDP margin descriptors.
3
+
4
+ This module provides:
5
+ - A function to translate CSVW-SAFE differential privacy metadata into
6
+ OpenDP `dp.polars.Margin` objects.
7
+ - A CLI for generating margin specifications from a JSON metadata file.
8
+
9
+ The resulting margins can be used in an OpenDP context, for example:
10
+
11
+ dp.Context.compositor(
12
+ data=...,
13
+ privacy_unit=dp.unit_of(contributions=...),
14
+ privacy_loss=dp.loss_of(epsilon=...),
15
+ margins=[...],
16
+ )
17
+ """
18
+
19
+ from typing import Any
20
+
21
+ from opendp.extras.polars import Margin
22
+
23
+ from csvw_safe.constants import (
24
+ ADD_INFO,
25
+ COL_LIST,
26
+ COL_NAME,
27
+ COLUMNS_IN_GROUP,
28
+ INVARIANT_PUBLIC_KEYS,
29
+ MAX_GROUPS,
30
+ MAX_LENGTH,
31
+ MAX_NUM_PARTITIONS,
32
+ PUBLIC_LENGTH,
33
+ TABLE_SCHEMA,
34
+ )
35
+
36
+
37
+ def get_margins(col_meta: dict[str, Any], by: list[str]) -> dict[str, Any]:
38
+ """
39
+ Build margin keyword arguments for a given column or column group.
40
+
41
+ Parameters
42
+ ----------
43
+ col_meta : Dict[str, Any]
44
+ Metadata describing a column or group of columns, including
45
+ differential privacy constraints (e.g., max_length, max_groups).
46
+ by : List[str]
47
+ Column name(s) to group by when defining the margin.
48
+
49
+ Returns
50
+ -------
51
+ Dict[str, Any]
52
+ Dictionary of keyword arguments suitable for constructing an
53
+ OpenDP Margin object.
54
+
55
+ """
56
+ margin_kwargs: dict[str, Any] = {"by": by}
57
+
58
+ # max_length per column
59
+ if MAX_LENGTH in col_meta:
60
+ margin_kwargs["max_length"] = col_meta[MAX_LENGTH]
61
+
62
+ # max_groups per column
63
+ if MAX_GROUPS in col_meta:
64
+ margin_kwargs["max_groups"] = col_meta[MAX_GROUPS]
65
+ elif MAX_NUM_PARTITIONS in col_meta:
66
+ margin_kwargs["max_groups"] = col_meta[MAX_NUM_PARTITIONS]
67
+
68
+ # Exhaustive partitions --> invariant keys
69
+ if col_meta.get(INVARIANT_PUBLIC_KEYS):
70
+ margin_kwargs["invariant"] = "keys"
71
+
72
+ if col_meta.get(PUBLIC_LENGTH):
73
+ margin_kwargs["invariant"] = "lengths"
74
+
75
+ return margin_kwargs
76
+
77
+
78
def csvw_to_opendp_margins(csvw_meta: dict[str, Any]) -> list["Margin"]:
    """
    Convert CSVW-SAFE metadata to a list of OpenDP Margin objects.

    Three kinds of margins are produced:
    - a table-level margin (no group-by) when table-level bounds exist,
    - one margin per column (grouped by that column),
    - one margin per declared column group (multi-column group-by).

    Parameters
    ----------
    csvw_meta : Dict[str, Any]
        CSVW-SAFE metadata dictionary. Must contain a table schema with
        a column list.

    Returns
    -------
    List["Margin"]
        List of OpenDP margin descriptors.

    Raises
    ------
    KeyError
        If the metadata lacks the table schema or column list, a column
        entry lacks a name, or a column group lacks its member list.

    """
    margins: list[Margin] = []

    # Table-level margins: non groupby queries (by=[], max_length=10, ...)
    # NOTE(review): no `by` is passed here — this relies on Margin's default
    # group-by being the empty grouping; confirm against the OpenDP version.
    margin_kwargs: dict[str, Any] = {}

    # Max length (for non count queries); falsy values (e.g. 0) are skipped.
    if csvw_meta.get(MAX_LENGTH, False):
        margin_kwargs["max_length"] = csvw_meta[MAX_LENGTH]

    # If length is public --> invariant lengths
    if csvw_meta.get(PUBLIC_LENGTH, False):
        margin_kwargs["invariant"] = "lengths"

    if margin_kwargs:
        margins.append(Margin(**margin_kwargs))

    # Column-level margins: groupby queries (by=['col_name'], max_length=100, ...)
    for col_meta in csvw_meta[TABLE_SCHEMA][COL_LIST]:
        margin_kwargs = get_margins(col_meta, by=[col_meta[COL_NAME]])
        margins.append(Margin(**margin_kwargs))

    # Multi-columns-level margins: groupby queries (by=['col_1', 'col_2'], max_length=100, ...)
    for cols_meta in csvw_meta.get(ADD_INFO, []):
        margin_kwargs = get_margins(cols_meta, by=cols_meta[COLUMNS_IN_GROUP])
        margins.append(Margin(**margin_kwargs))

    return margins