oissyntheticdata 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oissyntheticdata/__init__.py +47 -0
- oissyntheticdata/__main__.py +34 -0
- oissyntheticdata/_io.py +85 -0
- oissyntheticdata/_relational.py +188 -0
- oissyntheticdata/_synth.py +140 -0
- oissyntheticdata/_tree.py +163 -0
- oissyntheticdata-0.2.0.dist-info/METADATA +297 -0
- oissyntheticdata-0.2.0.dist-info/RECORD +12 -0
- oissyntheticdata-0.2.0.dist-info/WHEEL +5 -0
- oissyntheticdata-0.2.0.dist-info/entry_points.txt +3 -0
- oissyntheticdata-0.2.0.dist-info/licenses/LICENSE +21 -0
- oissyntheticdata-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
oissyntheticdata — pure-Python sequential CART synthesis, in the synthpop tradition.
|
|
4
|
+
|
|
5
|
+
Zero third-party dependencies (standard library only). Designed for secure
|
|
6
|
+
research environments: develop and debug your analysis on the synthetic data
|
|
7
|
+
off-site, then run the final code on the real data on-premises.
|
|
8
|
+
|
|
9
|
+
Single table
|
|
10
|
+
------------
|
|
11
|
+
import oissyntheticdata
|
|
12
|
+
oissyntheticdata.synthesize_file("real.csv", "synthetic.csv",
|
|
13
|
+
drop=["national_id"], min_leaf=5)
|
|
14
|
+
|
|
15
|
+
Related tables (referential integrity preserved)
|
|
16
|
+
-------------------------------------------------
|
|
17
|
+
oissyntheticdata.synthesize_relational_files(
|
|
18
|
+
{"inmates": "inmates.csv", "judgements": "judgements.csv"},
|
|
19
|
+
schema={
|
|
20
|
+
"inmates": {"key": "prisoner_id"},
|
|
21
|
+
"judgements": {"key": "judgement_id",
|
|
22
|
+
"parent": "inmates", "foreign_key": "prisoner_id"},
|
|
23
|
+
},
|
|
24
|
+
out_dir="out", min_leaf=5)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from ._io import read_table, write_table
|
|
28
|
+
from ._synth import synthesize
|
|
29
|
+
from ._relational import synthesize_relational, synthesize_relational_files
|
|
30
|
+
|
|
31
|
+
__version__ = "0.2.0"
|
|
32
|
+
__all__ = [
|
|
33
|
+
"read_table", "write_table", "synthesize", "synthesize_file",
|
|
34
|
+
"synthesize_relational", "synthesize_relational_files",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def synthesize_file(in_path, out_path, n=None, visit=None, drop=None,
                    min_leaf=5, max_depth=12, smoothing=0.0, seed=12345):
    """Synthesize one flat table from a CSV/XLSX file and save it as CSV.

    in_path  : input file handed to read_table.
    out_path : destination CSV handed to write_table.
    The remaining keyword arguments are forwarded unchanged to synthesize.

    Returns (row_count, column_count) of the table that was written.
    """
    src_header, src_columns = read_table(in_path)
    options = dict(n=n, visit=visit, drop=drop, min_leaf=min_leaf,
                   max_depth=max_depth, smoothing=smoothing, seed=seed)
    synth_header, synth_columns = synthesize(src_header, src_columns, **options)
    write_table(out_path, synth_header, synth_columns)
    if not synth_header:
        # Nothing was synthesized: there is no first column to measure.
        return 0, len(synth_header)
    return len(synth_columns[synth_header[0]]), len(synth_header)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Command-line interface: python -m oissyntheticdata real.csv -o synthetic.csv"""
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
import argparse
|
|
6
|
+
from . import synthesize_file, __version__
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main(argv=None):
    """Parse command-line options, run single-table synthesis, report the size.

    argv : list of argument strings, or None to let argparse use sys.argv[1:].

    Writes a one-line summary to stderr and returns None.
    """
    parser = argparse.ArgumentParser(
        prog="oissyntheticdata",
        description="Pure-Python sequential CART synthesis (synthpop tradition, zero deps).")
    # One table of (flags, settings); declaration order is preserved.
    options = (
        (("input",), dict(help="real CSV or XLSX file")),
        (("-o", "--output"), dict(default="synthetic.csv", help="output CSV path")),
        (("-n", "--rows"), dict(type=int, default=None, help="number of synthetic rows")),
        (("--drop",), dict(default="", help="comma-separated columns to exclude (e.g. identifiers)")),
        (("--visit",), dict(default="", help="comma-separated synthesis order (default: file order)")),
        (("--min-leaf",), dict(type=int, default=5, help="minimum real records per leaf/cell (k)")),
        (("--max-depth",), dict(type=int, default=12, help="maximum tree depth")),
        (("--smoothing",), dict(type=float, default=0.0, help="continuous jitter (0 = off)")),
        (("--seed",), dict(type=int, default=12345)),
        (("--version",), dict(action="version", version="oissyntheticdata " + __version__)),
    )
    for flags, settings in options:
        parser.add_argument(*flags, **settings)
    args = parser.parse_args(argv)

    def split_names(text):
        # "a, b,,c" -> ["a", "b", "c"]: trim each piece, discard empties.
        trimmed = (piece.strip() for piece in text.split(","))
        return [piece for piece in trimmed if piece]

    drop = split_names(args.drop)
    visit = split_names(args.visit) or None  # empty list means "file order"
    rows, cols = synthesize_file(
        args.input, args.output, n=args.rows, visit=visit, drop=drop,
        min_leaf=args.min_leaf, max_depth=args.max_depth,
        smoothing=args.smoothing, seed=args.seed)
    sys.stderr.write("[oissyntheticdata] wrote %d rows x %d cols -> %s\n" % (rows, cols, args.output))


if __name__ == "__main__":
    main()
|
oissyntheticdata/_io.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""oissyntheticdata._io — read CSV/XLSX and write CSV using ONLY the standard library."""
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import csv
|
|
6
|
+
import zipfile
|
|
7
|
+
import xml.etree.ElementTree as ET
|
|
8
|
+
|
|
9
|
+
MISSING_TOKENS = {"", "na", "n/a", ".", "nan", "null", "none"}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _col_index(ref):
    """Translate a cell reference such as "AB12" into a zero-based column index.

    Only the alphabetic characters of `ref` are used, read as a base-26 number
    with A=1 ... Z=26 (case-insensitive). A reference with no letters gives -1.
    """
    index = 0
    for ch in ref:
        if ch.isalpha():
            index = index * 26 + (ord(ch.upper()) - 64)
    return index - 1
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _read_xlsx(path):
    """Read the first worksheet of an XLSX workbook as a list of rows.

    Uses only zipfile and ElementTree. "xl/worksheets/sheet1.xml" is preferred;
    if it is absent, the alphabetically first XML under "xl/worksheets/" is
    read. Each row is a list of cell values padded with "" up to the right-most
    populated column of that row. Shared and inline strings are resolved; every
    other cell contributes the raw text of its <v> element.
    """
    main_ns = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
    ns = {"a": main_ns}
    text_tag = "{%s}t" % main_ns
    row_tag = "{%s}row" % main_ns

    def joined_text(element):
        # Concatenate every <t> descendant (rich-text runs are split up).
        return "".join(piece.text or "" for piece in element.iter(text_tag))

    with zipfile.ZipFile(path) as archive:
        members = archive.namelist()
        shared = []
        if "xl/sharedStrings.xml" in members:
            strings_root = ET.fromstring(archive.read("xl/sharedStrings.xml"))
            for item in strings_root.findall("a:si", ns):
                shared.append(joined_text(item))
        sheet = "xl/worksheets/sheet1.xml"
        if sheet not in members:
            worksheets = [m for m in members
                          if m.startswith("xl/worksheets/") and m.endswith(".xml")]
            worksheets.sort()
            sheet = worksheets[0]
        sheet_root = ET.fromstring(archive.read(sheet))

    rows = []
    for row in sheet_root.iter(row_tag):
        cells = {}
        last = -1
        for cell in row.findall("a:c", ns):
            ref = cell.get("r", "")
            # Cells without an "r" attribute are placed after those seen so far.
            col = _col_index(ref) if ref else len(cells)
            cell_type = cell.get("t")
            value_node = cell.find("a:v", ns)
            if cell_type == "s" and value_node is not None:
                text = shared[int(value_node.text)]
            elif cell_type == "inlineStr":
                inline = cell.find("a:is", ns)
                text = "" if inline is None else joined_text(inline)
            elif value_node is None:
                text = ""
            else:
                text = value_node.text
            cells[col] = "" if text is None else text
            if col > last:
                last = col
        rows.append([cells.get(i, "") for i in range(last + 1)])
    return rows
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _read_csv(path):
    """Return every row of the CSV at `path` as a list of string lists.

    The file is decoded as UTF-8 with an optional byte-order mark ("utf-8-sig")
    and opened with newline="" so the csv module handles line endings itself.
    """
    with open(path, "r", encoding="utf-8-sig", newline="") as handle:
        return list(csv.reader(handle))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _unique_names(names):
    """Return `names` with repeats renamed "x", "x.1", "x.2", ... so all differ."""
    taken = set()
    unique = []
    for name in names:
        candidate = name
        suffix = 0
        while candidate in taken:
            suffix += 1
            candidate = "%s.%d" % (name, suffix)
        taken.add(candidate)
        unique.append(candidate)
    return unique


def read_table(path):
    """Return (header: list[str], columns: dict[str, list[str]]).

    Files whose name ends in ".xlsx" or ".xls" go through the XLSX reader;
    everything else is read as CSV. Rows whose cells are all blank are
    discarded, the first remaining row is the header, and every value is
    converted with str() and stripped. Short rows are padded with "".

    Header names are made unique ("x", "x.1", ...). Without this, repeated
    names would share one list in `columns` and receive several values per
    data row, leaving that column longer than the others.

    NOTE(review): ".xls" is routed to the zip-based XLSX reader; a legacy
    binary .xls workbook is not a zip archive, so it is expected to fail there.
    """
    if path.lower().endswith((".xlsx", ".xls")):
        raw = _read_xlsx(path)
    else:
        raw = _read_csv(path)
    raw = [r for r in raw if any(str(c).strip() for c in r)]
    if not raw:
        return [], {}
    header = _unique_names([str(h).strip() for h in raw[0]])
    cols = {h: [] for h in header}
    for r in raw[1:]:
        for i, h in enumerate(header):
            cols[h].append(str(r[i]).strip() if i < len(r) else "")
    return header, cols
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def write_table(path, header, columns):
    """Write a column-oriented table to `path` as UTF-8 CSV.

    header  : column names, in output order.
    columns : dict name -> list of values; the row count is taken from the
              first header column (0 when `header` is empty).
    The header line is always written, even for an empty table.
    """
    row_count = len(columns[header[0]]) if header else 0
    with open(path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        # Generator keeps the per-row column lookups lazy, one row at a time.
        writer.writerows(
            [columns[name][i] for name in header] for i in range(row_count))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def is_missing(v):
    """Return True when `v`, stringified, trimmed and lower-cased, is a missing-value token."""
    token = str(v).strip().lower()
    return token in MISSING_TOKENS
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""oissyntheticdata._relational — multi-table (relational) synthesis.
|
|
3
|
+
|
|
4
|
+
Extends sequential CART synthesis to a parent -> child schema while keeping
|
|
5
|
+
**referential integrity** (every synthetic foreign key points at a synthetic
|
|
6
|
+
parent) and the **parent->child structure** (fan-out and attribute correlation).
|
|
7
|
+
|
|
8
|
+
For each table, in parent-before-child order:
|
|
9
|
+
1. Synthesize the table's attributes (sequential CART). For a child, the
|
|
10
|
+
parent's synthetic attributes are supplied as fixed predictors, so child
|
|
11
|
+
attributes are drawn conditioned on the parent they belong to.
|
|
12
|
+
2. Mint fresh surrogate primary keys (1..n) — real identifiers are never
|
|
13
|
+
reproduced.
|
|
14
|
+
3. For each child, a regression CART models the number of children per parent
|
|
15
|
+
from the parent's attributes (the fan-out), so realistic counts — and which
|
|
16
|
+
parents have many vs. few children — are preserved. Foreign keys are drawn
|
|
17
|
+
from the synthetic parent keys, guaranteeing valid joins.
|
|
18
|
+
|
|
19
|
+
Scope: a single-parent DAG (star / snowflake / chains). A table has at most one
|
|
20
|
+
parent; a parent may have many children; children may themselves be parents.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import random
|
|
24
|
+
from . import _io
|
|
25
|
+
from . import _tree
|
|
26
|
+
from ._synth import type_columns, stringify, synth_core, _to_float
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _topo_order(schema):
    """Return the schema's table names ordered so each parent precedes its children.

    The tables are swept repeatedly in schema order; a table is placed once it
    has no parent or its parent is already placed. At most len(schema) + 2
    sweeps are made.

    Raises ValueError if tables remain unplaced after that (a cycle, or a
    parent name that is not itself a table in the schema).
    """
    names = list(schema.keys())
    placed = set()
    ordered = []
    for _ in range(len(names) + 2):
        if len(ordered) >= len(names):
            break
        for name in names:
            if name in placed:
                continue
            parent = schema[name].get("parent")
            if parent is not None and parent not in placed:
                continue  # parent still pending; retry on a later sweep
            ordered.append(name)
            placed.add(name)
    if len(ordered) < len(names):
        raise ValueError("Cyclic or unresolved parent reference in schema.")
    return ordered
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _child_counts(parent_keys_real, child_fk_real):
    """Return, for each real parent key in order, its number of child rows as a float.

    A parent whose key never appears among the child foreign keys gets 0.0;
    foreign keys that match no parent are simply not counted anywhere.
    """
    tally = {}
    for fk in child_fk_real:
        if fk in tally:
            tally[fk] += 1
        else:
            tally[fk] = 1
    return [float(tally[pk]) if pk in tally else 0.0 for pk in parent_keys_real]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def synthesize_relational(tables, schema, n=None, drop=None,
                          min_leaf=5, max_depth=12, smoothing=0.0, seed=12345):
    """Synthesize a set of related tables, parents before children.

    tables : dict table_name -> (header, columns)  [from oissyntheticdata.read_table]
    schema : dict table_name -> {"key": pk,
                                 "parent": parent_table   (optional),
                                 "foreign_key": fk        (required if parent set)}
    n      : dict table_name -> rows for ROOT tables (children are sized by
             fan-out); a single int applies to all roots; None = each root's
             real row count.
    drop   : dict table_name -> [columns to exclude] (besides keys), or a flat
             list applied to every table.

    Primary keys are replaced by fresh surrogates "1".."n"; child foreign keys
    are taken from the synthetic parent keys.

    Returns dict table_name -> (out_header, out_columns).
    """
    rng = random.Random(seed)

    drop = drop or {}
    if isinstance(drop, (list, tuple, set)):
        drop = {t: list(drop) for t in tables}
    if n is None:
        n_map = {}
    elif isinstance(n, dict):
        n_map = n
    else:
        n_map = {t: n for t in tables}

    synth_attr = {}   # table -> typed dict of synthesized attribute columns
    synth_key = {}    # table -> list of surrogate pk strings
    is_num_of = {}    # table -> is_num map for its attributes
    results = {}

    for t in _topo_order(schema):
        header, cols = tables[t]
        spec = schema[t]
        pk = spec["key"]
        parent = spec.get("parent")
        fk = spec.get("foreign_key")

        # Keys are never modelled as attributes; they are minted / propagated.
        excluded = set(drop.get(t, [])) | {pk}
        if fk:
            excluded.add(fk)
        attrs = [c for c in header if c not in excluded]
        is_num, real = type_columns(cols, attrs)
        is_num_of[t] = is_num
        n_real = len(cols[header[0]])
        child_fk = None

        if parent is None:
            # -------- root table --------
            n_t = int(n_map.get(t, n_real))
            synth_attr[t] = synth_core(
                real, is_num, attrs, n_t, rng,
                min_leaf=min_leaf, max_depth=max_depth, smoothing=smoothing)
            synth_key[t] = [str(i + 1) for i in range(n_t)]
        else:
            # -------- child table --------
            parent_synth = synth_attr[parent]
            p_attrs = list(parent_synth.keys())
            p_is_num = is_num_of[parent]

            # Real parent attributes (typed) and a real-key -> row-index lookup.
            p_header, p_cols = tables[parent]
            _, p_real = type_columns(p_cols, p_attrs)
            p_pk_real = p_cols[schema[parent]["key"]]
            lookup = {k: i for i, k in enumerate(p_pk_real)}

            # Fan-out: number of real children per real parent.
            counts = _child_counts(p_pk_real, cols[fk])

            # Count model: counts ~ parent attributes (regression CART); with
            # no parent attributes, counts are resampled directly.
            parent_keys = synth_key[parent]
            n_parent = len(parent_keys)
            drawn = []
            if p_attrs:
                _tree.set_predictors({a: p_real[a] for a in p_attrs})
                croot = _tree.build_tree(
                    list(range(len(counts))), counts, p_attrs, p_is_num, "num",
                    min_leaf=min_leaf, max_depth=max_depth)
                for i in range(n_parent):
                    row = {a: parent_synth[a][i] for a in p_attrs}
                    c = _tree.sample_leaf(croot, row, p_is_num, rng, 0.0)
                    drawn.append(max(0, int(round(c if c is not None else 0))))
            else:
                pos = list(counts)
                for _ in range(n_parent):
                    drawn.append(max(0, int(round(rng.choice(pos)))))

            # Expand: one foreign key (plus the carried parent attributes,
            # prefixed "p__") per synthetic child row.
            child_fk = []
            parent_carry = {("p__" + a): [] for a in p_attrs}
            for i, key_i in enumerate(parent_keys):
                for _ in range(drawn[i]):
                    child_fk.append(key_i)
                    for a in p_attrs:
                        parent_carry["p__" + a].append(parent_synth[a][i])
            total = len(child_fk)

            # Real fitting data, restricted to child rows whose parent exists.
            fk_real = cols[fk]
            valid = [j for j in range(n_real) if fk_real[j] in lookup]
            child_real = {a: [real[a][j] for j in valid] for a in attrs}
            fixed_real = {
                ("p__" + a): [p_real[a][lookup[fk_real[j]]] for j in valid]
                for a in p_attrs
            }
            combined_is_num = dict(is_num)
            for a in p_attrs:
                combined_is_num["p__" + a] = p_is_num[a]

            synth_attr[t] = synth_core(
                child_real, combined_is_num, attrs, total, rng,
                fixed_real=fixed_real, fixed_synth=parent_carry,
                min_leaf=min_leaf, max_depth=max_depth, smoothing=smoothing)
            synth_key[t] = [str(i + 1) for i in range(total)]

        # ---- assemble this table's output in original column order ----
        # pk and fk columns stay in the output even though they are not "attrs".
        user_drop = set(drop.get(t, []))
        out_header = [c for c in header if c not in user_drop]
        strattr = stringify(synth_attr[t], list(synth_attr[t].keys()))
        nrows = len(synth_key[t])
        out_cols = {}
        for c in out_header:
            if c == pk:
                out_cols[c] = list(synth_key[t])
            elif parent and c == fk:
                out_cols[c] = list(child_fk)
            elif c in strattr:
                out_cols[c] = strattr[c]
            else:
                out_cols[c] = [""] * nrows
        results[t] = (out_header, out_cols)

    return results
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def synthesize_relational_files(paths, schema, out_dir=".", **kw):
    """Read CSV/XLSX tables, synthesize relationally, write synthetic_<name>.csv.

    paths   : dict table_name -> input file path
    schema  : relational schema, passed through to synthesize_relational
    out_dir : directory for the output files; created (with parents) if it
              does not exist yet, so a fresh out_dir such as "out" works.
    **kw    : forwarded unchanged to synthesize_relational

    Returns dict table_name -> (rows, cols).
    """
    import os
    tables = {t: _io.read_table(p) for t, p in paths.items()}
    res = synthesize_relational(tables, schema, **kw)
    # Created only after synthesis succeeded; "" means the current directory,
    # which os.makedirs would reject.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    summary = {}
    for t, (hdr, cols) in res.items():
        out_path = os.path.join(out_dir, "synthetic_%s.csv" % t)
        _io.write_table(out_path, hdr, cols)
        summary[t] = (len(cols[hdr[0]]) if hdr else 0, len(hdr))
    return summary
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""oissyntheticdata._synth — sequential CART synthesis (the synthpop paradigm).
|
|
3
|
+
|
|
4
|
+
Columns are synthesized one at a time in `visit` order. The first column is
|
|
5
|
+
drawn from its own (disclosure-controlled) marginal. Each later column is
|
|
6
|
+
synthesized by growing a CART that predicts it from the columns ALREADY
|
|
7
|
+
synthesized, fitted on the real data, then drawing a donor from the matching
|
|
8
|
+
leaf for every synthetic row. Because predictors at draw time are the
|
|
9
|
+
synthetic values, the joint distribution is built up sequentially.
|
|
10
|
+
|
|
11
|
+
Confidentiality:
|
|
12
|
+
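* `min_leaf` (k): tree splits never create a leaf with fewer than k real
records, and marginal cells seen fewer than k times are suppressed.
Caveat: when every marginal cell is rare, or a node holds fewer than
2k rows, all of its values are used as donors, so very small or highly
unique inputs are not protected by this rule alone.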
|
|
14
|
+
* `smoothing`: optional jitter on continuous donors so exact real values are
|
|
15
|
+
not echoed verbatim.
|
|
16
|
+
* direct identifiers should be dropped before synthesis (see `drop` arg).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import random
|
|
20
|
+
from . import _io
|
|
21
|
+
from . import _tree
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _is_numeric(values):
    """Return True when the column has at least one non-missing value and all of them parse as floats.

    Thousands separators (",") are removed before parsing. A column that is
    empty or entirely missing is reported as non-numeric.
    """
    saw_value = False
    for v in values:
        if _io.is_missing(v):
            continue
        saw_value = True
        try:
            float(str(v).replace(",", ""))
        except ValueError:
            return False
    return saw_value
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _to_float(v):
    """Parse `v` as a float after removing "," separators; return None if it does not parse."""
    try:
        number = float(str(v).replace(",", ""))
    except (ValueError, TypeError):
        number = None
    return number
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _marginal_draw(values, n, min_leaf, rng):
    """Draw n values with replacement from the empirical marginal of `values`.

    Values seen fewer than `min_leaf` times are removed from the pool first.
    If that would leave nothing, the pool falls back to every value.
    """
    frequency = {}
    for v in values:
        frequency[v] = frequency.get(v, 0) + 1
    frequent = [v for v in values if frequency[v] >= min_leaf]
    pool = frequent if frequent else list(values)  # everything rare -> use all
    drawn = []
    for _ in range(n):
        drawn.append(rng.choice(pool))
    return drawn
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def type_columns(columns, names):
    """Return (is_num, typed) for the given column names.

    is_num maps each name to whether the column is numeric. In `typed`,
    numeric columns hold floats with None for missing cells; all other columns
    hold strings with "" for missing cells.
    """
    is_num, typed = {}, {}
    for name in names:
        raw = columns[name]
        numeric = _is_numeric(raw)
        is_num[name] = numeric
        if numeric:
            converted = [None if _io.is_missing(v) else _to_float(v) for v in raw]
        else:
            converted = ["" if _io.is_missing(v) else str(v) for v in raw]
        typed[name] = converted
    return is_num, typed
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def stringify(typed, names):
    """Convert the named typed columns back into lists of CSV-ready strings.

    None becomes "", whole-number floats are written without a decimal part,
    other floats use "%.6g", and anything else goes through str().
    """
    def as_text(value):
        if value is None:
            return ""
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            return "%.6g" % value
        return str(value)

    return {name: [as_text(v) for v in typed[name]] for name in names}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def synth_core(real, is_num, visit, n, rng, fixed_real=None, fixed_synth=None,
               min_leaf=5, max_depth=12, smoothing=0.0):
    """Synthesize the `visit` columns one after another; return typed columns.

    real        : dict name -> typed list (real data used for fitting)
    is_num      : dict name -> bool, covering every visited AND fixed column
    visit       : column names, in synthesis order
    n           : number of synthetic rows to produce
    rng         : random.Random-like source used for every draw
    fixed_real  : dict name -> typed list aligned to the real rows; predictors
                  that are GIVEN rather than synthesized (e.g. a child row's
                  parent attributes)
    fixed_synth : dict name -> typed list aligned to the n synthetic rows; the
                  given predictor values for each synthetic row

    A column with no predictors available is drawn from its marginal; every
    other column is drawn from the leaves of a tree fitted on the real data.
    """
    fixed_real = fixed_real or {}
    fixed_synth = fixed_synth or {}
    fixed_names = list(fixed_real.keys())

    if visit:
        n_real = len(real[visit[0]])
    elif fixed_real:
        n_real = len(next(iter(fixed_real.values())))
    else:
        n_real = 0

    out = {c: [] for c in visit}
    done = []
    for c in visit:
        target_kind = "num" if is_num[c] else "cat"
        preds = fixed_names + done
        if preds:
            training = {p: (fixed_real[p] if p in fixed_real else real[p])
                        for p in preds}
            _tree.set_predictors(training)
            root = _tree.build_tree(list(range(n_real)), real[c], preds, is_num,
                                    target_kind,
                                    min_leaf=min_leaf, max_depth=max_depth)
            drawn = []
            for i in range(n):
                # Predictors at draw time are the given / already-synthesized values.
                row = {p: (fixed_synth[p][i] if p in fixed_synth else out[p][i])
                       for p in preds}
                drawn.append(_tree.sample_leaf(root, row, is_num, rng, smoothing))
            out[c] = drawn
        else:
            # First column with nothing to condition on: use its marginal.
            if is_num[c]:
                donors = [v for v in real[c] if v is not None]
            else:
                donors = real[c]
            out[c] = _marginal_draw(donors if donors else real[c], n, min_leaf, rng)
        done.append(c)
    return out
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def synthesize(header, columns, n=None, visit=None, drop=None,
               min_leaf=5, max_depth=12, smoothing=0.0, seed=12345):
    """Return (out_header, out_columns) of synthetic data for one flat table.

    header, columns : the real table, as returned by read_table
    n      : number of synthetic rows; None means the real row count
    visit  : synthesis order; None/empty means `header` order. Names that are
             not in `columns` or are listed in `drop` are skipped.
    drop   : column names to leave out entirely
    seed   : seed for the private random.Random used for all draws

    out_header is the effective visit order; out_columns holds strings.
    """
    rng = random.Random(seed)
    excluded = set(drop or [])
    candidates = visit or header
    order = [c for c in candidates if c in columns and c not in excluded]
    n_real = len(columns[header[0]]) if header else 0
    if n is None:
        n = n_real
    is_num, real = type_columns(columns, order)
    typed_out = synth_core(real, is_num, order, n, rng,
                           min_leaf=min_leaf, max_depth=max_depth,
                           smoothing=smoothing)
    return order, stringify(typed_out, order)
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""oissyntheticdata._tree — a small CART grown from scratch (no numpy/sklearn).
|
|
3
|
+
|
|
4
|
+
Each leaf keeps the list of REAL target values that reached it ("donors").
|
|
5
|
+
Synthesis samples from a leaf's donors rather than predicting a point value,
|
|
6
|
+
which reproduces the conditional distribution (Reiter, 2005). A minimum leaf
|
|
7
|
+
size (`min_leaf`) guarantees every donor pool blends >= k real records, so a
|
|
8
|
+
synthetic value is never traceable to one individual.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
import random
|
|
13
|
+
|
|
14
|
+
MAX_THRESHOLDS = 40 # cap numeric split candidates for speed
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _gini(counts, n):
    """Return the Gini impurity 1 - sum(p_k^2) for class counts over n records.

    counts : dict class -> count
    n      : total number of records; 0 yields 0.0
    """
    if n == 0:
        return 0.0
    squares = 0.0
    for count in counts.values():
        share = count / n
        squares += share * share
    return 1.0 - squares
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _sse(values):
    """Return the sum of squared deviations from the mean of `values`.

    None entries are ignored. Numeric target columns use None for missing
    cells, and those reach this function through the tree builder's impurity
    computation; summing them directly would raise TypeError. An input that is
    empty or contains only None yields 0.0.
    """
    present = [v for v in values if v is not None]
    if not present:
        return 0.0
    mean = sum(present) / len(present)
    return sum((v - mean) ** 2 for v in present)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _candidate_thresholds(vals):
    """Return candidate numeric split points for the values in `vals`.

    The candidates are midpoints between consecutive distinct values. When
    there are more than MAX_THRESHOLDS distinct values, an evenly strided
    subsample of them is used instead. Fewer than two distinct values give [].
    """
    distinct = sorted(set(vals))
    count = len(distinct)
    if count <= 1:
        return []
    if count > MAX_THRESHOLDS:
        stride = count / float(MAX_THRESHOLDS)
        distinct = [distinct[int(k * stride)] for k in range(1, MAX_THRESHOLDS)]
    return [(lo + hi) / 2.0 for lo, hi in zip(distinct, distinct[1:])]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class _Node(object):
    """One tree node: either a leaf holding donors or an internal binary split."""

    __slots__ = ("leaf", "donors", "feature", "kind", "threshold", "category",
                 "left", "right")

    def __init__(self):
        # A fresh node is a non-leaf with every other field unset; `kind`
        # later becomes 'num' or 'cat' for split nodes.
        self.leaf = False
        for field in ("donors", "feature", "kind", "threshold", "category",
                      "left", "right"):
            setattr(self, field, None)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _impurity(idx, y, target_kind):
    """Return the size-weighted impurity of target `y` over the rows in `idx`.

    For target_kind "cat" this is Gini impurity times the number of rows; for
    anything else it is the sum of squared errors of the selected values.
    """
    if target_kind != "cat":
        return _sse([y[i] for i in idx])
    tally = {}
    for i in idx:
        label = y[i]
        tally[label] = tally.get(label, 0) + 1
    size = len(idx)
    return _gini(tally, size) * size
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def build_tree(idx, y, predictors, num_cols, target_kind,
               min_leaf=5, max_depth=12, depth=0):
    """Recursively grow a CART over the rows in `idx` and return its root node.

    idx         : row indices belonging to this node
    y           : target list, indexed by row
    predictors  : list of feature names; their columns are read from the
                  module-level PRED, which must be installed first with
                  set_predictors()
    num_cols    : dict name -> bool (True if the predictor is numeric)
    target_kind : "cat" for Gini impurity, anything else for squared error
    min_leaf    : a split is accepted only if both sides keep >= min_leaf rows
    max_depth   : nodes at this depth become leaves
    depth       : current depth (used by the recursion)

    A node becomes a leaf, keeping the target values of its rows as donors,
    when it has fewer than 2 * min_leaf rows, max_depth is reached, the target
    is constant, there are no predictors, or no admissible split reduces
    impurity.

    Categorical split candidates are tried in first-seen row order. Iterating
    a set of strings would follow the per-process hash order, so ties between
    equally good splits could be broken differently from run to run.
    """
    node = _Node()
    distinct_y = set(y[i] for i in idx)
    if (len(idx) < 2 * min_leaf or depth >= max_depth or len(distinct_y) <= 1
            or not predictors):
        node.leaf = True
        node.donors = [y[i] for i in idx]
        return node

    base = _impurity(idx, y, target_kind)
    best = None  # (reduction, feature, kind, thr/cat, left_idx, right_idx)

    for f in predictors:
        col = PRED[f]
        if num_cols[f]:
            present = [col[i] for i in idx if col[i] is not None]
            for thr in _candidate_thresholds(present):
                # Missing predictor values always go to the right branch.
                left = [i for i in idx if col[i] is not None and col[i] <= thr]
                right = [i for i in idx if not (col[i] is not None and col[i] <= thr)]
                if len(left) < min_leaf or len(right) < min_leaf:
                    continue
                red = base - _impurity(left, y, target_kind) - _impurity(right, y, target_kind)
                if best is None or red > best[0]:
                    best = (red, f, "num", thr, left, right)
        else:
            # Distinct categories in order of first appearance (deterministic).
            cats, seen = [], set()
            for i in idx:
                value = col[i]
                if value not in seen:
                    seen.add(value)
                    cats.append(value)
            if len(cats) <= 1:
                continue
            for c in cats:
                left = [i for i in idx if col[i] == c]
                right = [i for i in idx if col[i] != c]
                if len(left) < min_leaf or len(right) < min_leaf:
                    continue
                red = base - _impurity(left, y, target_kind) - _impurity(right, y, target_kind)
                if best is None or red > best[0]:
                    best = (red, f, "cat", c, left, right)

    if best is None or best[0] <= 1e-12:
        node.leaf = True
        node.donors = [y[i] for i in idx]
        return node

    _, f, kind, val, left_idx, right_idx = best
    node.feature, node.kind = f, kind
    if kind == "num":
        node.threshold = val
    else:
        node.category = val
    node.left = build_tree(left_idx, y, predictors, num_cols, target_kind,
                           min_leaf, max_depth, depth + 1)
    node.right = build_tree(right_idx, y, predictors, num_cols, target_kind,
                            min_leaf, max_depth, depth + 1)
    return node
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Predictor columns for the fit currently in progress, keyed by feature name.
# Kept at module level so the recursive tree builder can index them by row
# without copying; set_predictors() swaps in the table for one fit.
PRED = {}


def set_predictors(pred_cols):
    """Install `pred_cols` (dict feature name -> column) as the module-level PRED.

    The dict is stored by reference, not copied, and replaces whatever an
    earlier call installed.
    """
    global PRED
    PRED = pred_cols
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def sample_leaf(node, row, num_cols, rng, smoothing=0.0):
    """Route a synthetic row (dict feature -> value) to a leaf and draw a donor.

    At a numeric split the row goes left when its value is present and
    <= threshold; at a categorical split it goes left when its value equals
    the split category. Missing or absent values therefore go right.

    With a non-zero `smoothing`, a float donor from a leaf holding more than
    two float donors with non-zero spread is jittered by Gaussian noise of
    standard deviation smoothing * (leaf standard deviation), then clamped to
    the leaf's own [min, max]. `num_cols` is accepted but not consulted here.
    """
    while not node.leaf:
        value = row.get(node.feature)
        if node.kind == "num":
            take_left = value is not None and value <= node.threshold
        else:
            take_left = value == node.category
        node = node.left if take_left else node.right

    pool = node.donors
    picked = rng.choice(pool)
    if not smoothing or picked is None or not isinstance(picked, float):
        return picked

    numeric = [d for d in pool if isinstance(d, float)]
    if len(numeric) > 2:
        mean = sum(numeric) / len(numeric)
        spread = math.sqrt(sum((x - mean) ** 2 for x in numeric) / len(numeric))
        if spread > 0:
            lo, hi = min(numeric), max(numeric)
            picked = min(hi, max(lo, picked + rng.gauss(0.0, smoothing * spread)))
    return picked
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: oissyntheticdata
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Zero-dependency sequential CART synthesis for secure research (synthpop tradition), with relational support. An OIS tool.
|
|
5
|
+
Author-email: Yohanan Ouaknine <yohanan.ouaknine@ois.co.il>
|
|
6
|
+
Maintainer-email: OIS <yohanan.ouaknine@ois.co.il>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Yohanan Ouaknine and OIS (https://ois.co.il)
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
|
|
29
|
+
Project-URL: Homepage, https://ois.co.il
|
|
30
|
+
Project-URL: Repository, https://github.com/yohananouaknine/oissyntheticdata
|
|
31
|
+
Project-URL: Issues, https://github.com/yohananouaknine/oissyntheticdata/issues
|
|
32
|
+
Keywords: synthetic data,synthpop,statistical disclosure control,CART,privacy,secure research,microdata,anonymization
|
|
33
|
+
Classifier: Development Status :: 4 - Beta
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
39
|
+
Classifier: Topic :: Security
|
|
40
|
+
Classifier: Operating System :: OS Independent
|
|
41
|
+
Requires-Python: >=3.7
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
License-File: LICENSE
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# oissyntheticdata
|
|
47
|
+
|
|
48
|
+
**Pure-Python sequential CART synthesis — in the `synthpop` tradition, with zero third-party dependencies.**
|
|
49
|
+
|
|
50
|
+
> An **OIS** tool · [ois.co.il](https://ois.co.il) · maintained by Dr Yohanan Ouaknine
|
|
51
|
+
> ([ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351))
|
|
52
|
+
|
|
53
|
+
`oissyntheticdata` generates a synthetic copy of a sensitive dataset that preserves the
|
|
54
|
+
*relationships between variables*, not just each column's marginal shape. It is
|
|
55
|
+
built for the secure-research workflow used by statistical agencies: **develop
|
|
56
|
+
and debug your analysis on the synthetic data off-site, then run the final code
|
|
57
|
+
on the real data on-premises and release only vetted aggregate results.**
|
|
58
|
+
|
|
59
|
+
It imports only the Python standard library (`csv`, `json`, `math`, `random`,
|
|
60
|
+
`statistics`, `zipfile`, `xml.etree`), so it can run inside a locked secure
|
|
61
|
+
environment with no `pip install` and is small enough to read and audit in full.
|
|
62
|
+
|
|
63
|
+
The approach was first deployed in a secure justice-research setting (a study of
|
|
64
|
+
terrorist recidivism after the 2011 Shalit prisoner exchange, run on-premises at
|
|
65
|
+
the Israel Prison Service under Research Committee authorization); this package
|
|
66
|
+
generalises and opens it. OIS offers deployment, validation, and training services
|
|
67
|
+
to government research units and academic researchers around the open core.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Why this exists
|
|
72
|
+
|
|
73
|
+
`oissyntheticdata` follows a well-established paradigm in statistical disclosure control. The
|
|
74
|
+
synthetic data is *test data* that should resemble the real data closely but is
|
|
75
|
+
never used for final inference; the code developed on it is what gets run on the
|
|
76
|
+
confidential data (Nowok, Raab & Dibben 2016; US Census Bureau SIPP Synthetic
|
|
77
|
+
Beta). `oissyntheticdata` is a dependency-free re-implementation of the core engine those
|
|
78
|
+
tools use — **sequential CART synthesis** (Reiter 2005) — packaged for locked
|
|
79
|
+
environments.
|
|
80
|
+
|
|
81
|
+
It complements a metadata-only synthesizer (which preserves each column's shape
|
|
82
|
+
but not the joint structure): `oissyntheticdata` fits on the real microdata on-premises
|
|
83
|
+
and therefore reproduces conditional relationships, at the cost of touching raw
|
|
84
|
+
records (so it must run inside the secure environment).
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## How it works (the engine)
|
|
89
|
+
|
|
90
|
+
Synthesis proceeds **one column at a time** in a chosen visit order:
|
|
91
|
+
|
|
92
|
+
1. **First column** — drawn from its own empirical marginal, with cells smaller
|
|
93
|
+
than `min_leaf` suppressed.
|
|
94
|
+
2. **Each later column `Y`** — a CART (classification tree if `Y` is categorical,
|
|
95
|
+
regression tree if continuous) is grown on the **real data** to predict `Y`
|
|
96
|
+
from the columns already synthesized. Every leaf keeps the list of *real*
|
|
97
|
+
`Y` values that reached it (its "donors").
|
|
98
|
+
3. **Drawing** — for each synthetic row, route it down the tree using the values
|
|
99
|
+
already generated for that row, reach a leaf, and **sample a donor** from that
|
|
100
|
+
leaf (optionally jittered for continuous columns). Sampling from donors — not
|
|
101
|
+
predicting a point — is what reproduces the conditional distribution.
|
|
102
|
+
|
|
103
|
+
Because each column is predicted from the previously synthesized columns, the
|
|
104
|
+
joint distribution is assembled sequentially (the standard `synthpop` approach).
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
visit: c1 -> c2 -> c3 -> ...
|
|
108
|
+
c1 ~ marginal(c1)
|
|
109
|
+
c2 ~ leaf_donor( CART(c2 ~ c1) , synthetic c1 )
|
|
110
|
+
c3 ~ leaf_donor( CART(c3 ~ c1,c2) , synthetic c1,c2 )
|
|
111
|
+
...
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Confidentiality model
|
|
117
|
+
|
|
118
|
+
- **`min_leaf` (k, default 5):** no leaf and no marginal cell is built from fewer
|
|
119
|
+
than `k` real records, so every drawn value comes from a pool of ≥ k individuals and is never
|
|
120
|
+
traceable to one person. This also caps tree depth and prevents the tree from
|
|
121
|
+
memorizing individuals.
|
|
122
|
+
- **`smoothing` (default 0):** optional Gaussian jitter on continuous donors,
|
|
123
|
+
bounded to the leaf's range, so exact real values are not echoed verbatim.
|
|
124
|
+
- **`drop`:** direct identifiers (national ID, names, record keys) should be
|
|
125
|
+
dropped before synthesis — `oissyntheticdata` does not attempt to anonymize them.
|
|
126
|
+
- **Only synthetic data leaves; the real data never does.** The intended use is
|
|
127
|
+
to take the synthetic file off-site for development and re-run final code on the
|
|
128
|
+
real data in place.
|
|
129
|
+
|
|
130
|
+
`oissyntheticdata` is a disclosure-control aid, not a formal privacy guarantee. For a
|
|
131
|
+
mathematical guarantee, combine it with differential privacy or apply output
|
|
132
|
+
checking (statistical disclosure control) to anything released.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Design decisions and trade-offs
|
|
137
|
+
|
|
138
|
+
The value of `oissyntheticdata` is in its design choices, which are deliberately narrow:
|
|
139
|
+
|
|
140
|
+
- **Where the synthesizer may run is a first-class concern.** `oissyntheticdata` fits on
|
|
141
|
+
real microdata to preserve joint structure, so it runs *on-premises*; only the
|
|
142
|
+
synthetic output leaves. A metadata-only synthesizer can run off-site but
|
|
143
|
+
preserves only per-column structure. Choosing fidelity-with-on-prem-execution
|
|
144
|
+
over portability-with-lower-fidelity is intentional, and the two roles are kept
|
|
145
|
+
as separate tools so the confidentiality reasoning stays explicit.
|
|
146
|
+
- **Donor-leaf sampling, not point prediction.** Drawing a real value from the
|
|
147
|
+
matching leaf reproduces the conditional distribution; predicting a mean would
|
|
148
|
+
not.
|
|
149
|
+
- **One confidentiality invariant.** `min_leaf` (`k`) applies the same `k`-record
|
|
150
|
+
floor to every marginal cell, tree leaf, fan-out estimate, and surrogate key,
|
|
151
|
+
instead of scattering ad hoc thresholds.
|
|
152
|
+
- **Relational by conditioning, not joining.** Children are synthesized
|
|
153
|
+
conditioned on the parent's synthetic attributes and linked by surrogate keys,
|
|
154
|
+
preserving referential integrity without materialising a real join.
|
|
155
|
+
- **Build on, don't reinvent.** The estimator is the established CART-synthesis
|
|
156
|
+
method; the new work is the dependency-free, auditable, relational realisation
|
|
157
|
+
for locked environments.
|
|
158
|
+
|
|
159
|
+
Scope boundaries are equally deliberate: single-parent schemas only, no enforced
|
|
160
|
+
high-order interactions or arithmetic identities, and no formal privacy guarantee
|
|
161
|
+
(see Limitations).
|
|
162
|
+
|
|
163
|
+
## Governance, support & contributing
|
|
164
|
+
|
|
165
|
+
`oissyntheticdata` is maintained in the open under the MIT license. Questions, bug reports,
|
|
166
|
+
and change proposals go through public GitHub Issues and Pull Requests; see
|
|
167
|
+
[`CONTRIBUTING.md`](CONTRIBUTING.md). Decisions are made by the maintainer(s)
|
|
168
|
+
listed in [`CITATION.cff`](CITATION.cff) via the public issue/PR process. There is
|
|
169
|
+
no private support channel — keeping development and discussion public is part of
|
|
170
|
+
the project's auditability goal. Releases are versioned and recorded in
|
|
171
|
+
[`CHANGELOG.md`](CHANGELOG.md).
|
|
172
|
+
|
|
173
|
+
## Generative AI disclosure
|
|
174
|
+
|
|
175
|
+
A generative AI assistant (Claude, Anthropic) was used to help draft and refactor
|
|
176
|
+
parts of the code and documentation. All output was reviewed, tested, and edited
|
|
177
|
+
by the author(s), who take full responsibility for the design, correctness, and
|
|
178
|
+
integrity of the software. The design decisions and abstractions above, and the
|
|
179
|
+
testing and documentation practices, are the author(s)' own. Contributors are
|
|
180
|
+
asked to disclose non-trivial AI assistance (see `CONTRIBUTING.md`).
|
|
181
|
+
|
|
182
|
+
## Install
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
pip install oissyntheticdata
|
|
186
|
+
# or, in a locked environment, just copy the oissyntheticdata/ folder next to your code
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
No dependencies. Python 3.7+.
|
|
190
|
+
|
|
191
|
+
## Usage
|
|
192
|
+
|
|
193
|
+
Command line:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
python -m oissyntheticdata real.csv -o synthetic.csv --drop national_id --min-leaf 5
|
|
197
|
+
python -m oissyntheticdata data.xlsx -o synthetic.csv --visit "age,offense,violent" --smoothing 0.5
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Library:
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
import oissyntheticdata
|
|
204
|
+
|
|
205
|
+
# one call
|
|
206
|
+
oissyntheticdata.synthesize_file("real.csv", "synthetic.csv",
|
|
207
|
+
drop=["national_id"], min_leaf=5)
|
|
208
|
+
|
|
209
|
+
# or step by step
|
|
210
|
+
header, cols = oissyntheticdata.read_table("real.xlsx")
|
|
211
|
+
out_header, out_cols = oissyntheticdata.synthesize(header, cols,
|
|
212
|
+
drop=["national_id"], min_leaf=5)
|
|
213
|
+
oissyntheticdata.write_table("synthetic.csv", out_header, out_cols)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Key parameters: `n` (rows, default = number of real rows), `visit` (column order),
|
|
217
|
+
`drop` (identifiers to exclude), `min_leaf` (k), `max_depth`, `smoothing`, `seed`.
|
|
218
|
+
|
|
219
|
+
### Related tables (multi-table synthesis)
|
|
220
|
+
|
|
221
|
+
For data split across linked tables (e.g. one row per inmate, many judgements per
|
|
222
|
+
inmate), `synthesize_relational` keeps **referential integrity** and the
|
|
223
|
+
**parent → child structure**:
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
import oissyntheticdata
|
|
227
|
+
|
|
228
|
+
oissyntheticdata.synthesize_relational_files(
|
|
229
|
+
{"inmates": "inmates.csv", "judgements": "judgements.csv"},
|
|
230
|
+
schema={
|
|
231
|
+
"inmates": {"key": "prisoner_id"},
|
|
232
|
+
"judgements": {"key": "judgement_id",
|
|
233
|
+
"parent": "inmates", "foreign_key": "prisoner_id"},
|
|
234
|
+
},
|
|
235
|
+
out_dir="out", min_leaf=5,
|
|
236
|
+
)
|
|
237
|
+
# -> out/synthetic_inmates.csv, out/synthetic_judgements.csv
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
How it works: the parent is synthesized first and given fresh surrogate keys; a
|
|
241
|
+
regression CART models how many children each parent has (the fan-out) from the
|
|
242
|
+
parent's attributes; and each child's attributes are synthesized **conditioned on
|
|
243
|
+
its parent's synthetic attributes**. The result: every synthetic foreign key
|
|
244
|
+
points at a synthetic parent (0 orphan joins), the number of children per parent
|
|
245
|
+
is realistic, and parent → child relationships survive (e.g. high-risk parents
|
|
246
|
+
keep their child-row patterns). Supports a single-parent DAG — star, snowflake,
|
|
247
|
+
and parent → child → grandchild chains.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Limitations
|
|
252
|
+
|
|
253
|
+
- Fits on real microdata, so **run it on-premises**; the synthetic *output* is
|
|
254
|
+
what you take off-site.
|
|
255
|
+
- Relational synthesis covers a single-parent DAG (star / snowflake / chains).
|
|
256
|
+
Many-to-many relationships and compound keys are not modelled — pre-join or
|
|
257
|
+
pre-resolve them to a surrogate key first.
|
|
258
|
+
- CART captures pairwise/low-order structure well; very high-order interactions
|
|
259
|
+
and exact arithmetic identities (e.g. `rate = a/b`) are not enforced.
|
|
260
|
+
- Pure Python: comfortable up to a few thousand rows × a few dozen columns; larger
|
|
261
|
+
data is slower than a compiled implementation.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## Lineage & sources
|
|
266
|
+
|
|
267
|
+
- Rubin, D.B. (1993). *Statistical disclosure limitation.* J. Official Statistics 9(2).
|
|
268
|
+
- Little, R.J.A. (1993). *Statistical analysis of masked data.* J. Official Statistics 9(2).
|
|
269
|
+
- Reiter, J.P. (2005). *Using CART to generate partially synthetic public use microdata.*
|
|
270
|
+
J. Official Statistics 21(3).
|
|
271
|
+
- Reiter, Oganian & Karr (2009). *Verification servers.* Comput. Stat. Data Anal. 53(4):1475–1482.
|
|
272
|
+
https://doi.org/10.1016/j.csda.2008.10.006
|
|
273
|
+
- Nowok, Raab & Dibben (2016). *synthpop: Bespoke Creation of Synthetic Data in R.*
|
|
274
|
+
J. Statistical Software 74(11). https://doi.org/10.18637/jss.v074.i11
|
|
275
|
+
- Drechsler, J. (2011). *Synthetic Datasets for Statistical Disclosure Control.* Springer.
|
|
276
|
+
- US Census Bureau, *SIPP Synthetic Beta* + Cornell Synthetic Data Server (synthetic
|
|
277
|
+
development data + validation on confidential files).
|
|
278
|
+
|
|
279
|
+
## Maintainer
|
|
280
|
+
|
|
281
|
+
Dr **Yohanan Ouaknine** — OIS ([ois.co.il](https://ois.co.il)),
|
|
282
|
+
[yohanan.ouaknine@ois.co.il](mailto:yohanan.ouaknine@ois.co.il),
|
|
283
|
+
[ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351).
|
|
284
|
+
Department of Criminology, Ashkelon Academic College; formerly Head of the
|
|
285
|
+
Research Branch, Israel Prison Service.
|
|
286
|
+
|
|
287
|
+
## License
|
|
288
|
+
|
|
289
|
+
MIT — see `LICENSE`.
|
|
290
|
+
|
|
291
|
+
## Citation
|
|
292
|
+
|
|
293
|
+
If you use `oissyntheticdata`, please cite this software (see `CITATION.cff`) and
|
|
294
|
+
the methodological lineage above (Reiter 2005; Nowok, Raab & Dibben 2016). The
|
|
295
|
+
method was first applied in Ouaknine, Elisha & Hasisi (2026), *The Effect of Mass
|
|
296
|
+
Prisoner Release on Terrorist Recidivism: A Propensity Score Analysis of the Shalit
|
|
297
|
+
Deal* (in publication).
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
oissyntheticdata/__init__.py,sha256=jBKFw02tnTZE8_CQfO8vxxsEcLujtwyrGbdmpqrF_EQ,1861
|
|
2
|
+
oissyntheticdata/__main__.py,sha256=7IP9flqUD4hlc97VAXW28XaI0KM8IfKI8xUhB5p1fOw,1767
|
|
3
|
+
oissyntheticdata/_io.py,sha256=Qd-gLzyn9-6KvyvHDPs_fjij9kR5I3tdU6y0kEmePJI,3029
|
|
4
|
+
oissyntheticdata/_relational.py,sha256=YZFTj2ixyinpcagF5yEygddKnqlqvSZLWR3Xs-UGxlo,8296
|
|
5
|
+
oissyntheticdata/_synth.py,sha256=WXpJkHv7aGohnLCWHV9Ct9MhvrZVjiUOOj_NpgSlK2U,5321
|
|
6
|
+
oissyntheticdata/_tree.py,sha256=vOePQ7xosqOpgG-_k6Op68O5nEHsfymyugMHlhv3cFg,5566
|
|
7
|
+
oissyntheticdata-0.2.0.dist-info/licenses/LICENSE,sha256=VRnrt9oHAA6oSVzz6w_OzRBg2WnvjsjgCENdpx9SiOk,1101
|
|
8
|
+
oissyntheticdata-0.2.0.dist-info/METADATA,sha256=lmhbOCO8004uHrGgf71rURCaJtvu7ZuqyzUEReSGWFk,14200
|
|
9
|
+
oissyntheticdata-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
oissyntheticdata-0.2.0.dist-info/entry_points.txt,sha256=hrXxwSv1b0C2gUhfVO-68zmh5Ot8oFmazt749kzG8Pw,107
|
|
11
|
+
oissyntheticdata-0.2.0.dist-info/top_level.txt,sha256=TjXJ9Cnt9DmspFooICOHMq1yNHg4ywh7v0UyX5N-JY4,17
|
|
12
|
+
oissyntheticdata-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yohanan Ouaknine and OIS (https://ois.co.il)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
oissyntheticdata
|