oissyntheticdata 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ oissyntheticdata — pure-Python sequential CART synthesis, in the synthpop tradition.
4
+
5
+ Zero third-party dependencies (standard library only). Designed for secure
6
+ research environments: develop and debug your analysis on the synthetic data
7
+ off-site, then run the final code on the real data on-premises.
8
+
9
+ Single table
10
+ ------------
11
+ import oissyntheticdata
12
+ oissyntheticdata.synthesize_file("real.csv", "synthetic.csv",
13
+ drop=["national_id"], min_leaf=5)
14
+
15
+ Related tables (referential integrity preserved)
16
+ -------------------------------------------------
17
+ oissyntheticdata.synthesize_relational_files(
18
+ {"inmates": "inmates.csv", "judgements": "judgements.csv"},
19
+ schema={
20
+ "inmates": {"key": "prisoner_id"},
21
+ "judgements": {"key": "judgement_id",
22
+ "parent": "inmates", "foreign_key": "prisoner_id"},
23
+ },
24
+ out_dir="out", min_leaf=5)
25
+ """
26
+
27
+ from ._io import read_table, write_table
28
+ from ._synth import synthesize
29
+ from ._relational import synthesize_relational, synthesize_relational_files
30
+
31
# Package version string; the command-line ``--version`` flag prints it.
__version__ = "0.2.0"
# Public names exported by ``from oissyntheticdata import *``.
__all__ = [
    "read_table", "write_table", "synthesize", "synthesize_file",
    "synthesize_relational", "synthesize_relational_files",
]
36
+
37
+
38
def synthesize_file(in_path, out_path, n=None, visit=None, drop=None,
                    min_leaf=5, max_depth=12, smoothing=0.0, seed=12345):
    """Synthesize one flat table from a CSV/XLSX file and save it as CSV.

    The input is loaded with ``read_table``, handed to ``synthesize`` together
    with the tuning options, and the synthetic table is written to
    ``out_path`` with ``write_table``.

    Returns a ``(row_count, column_count)`` tuple describing what was written.
    """
    header, cols = read_table(in_path)
    options = dict(n=n, visit=visit, drop=drop, min_leaf=min_leaf,
                   max_depth=max_depth, smoothing=smoothing, seed=seed)
    out_header, out_cols = synthesize(header, cols, **options)
    write_table(out_path, out_header, out_cols)
    if out_header:
        # Every output column has the same length; measure the first one.
        nrows = len(out_cols[out_header[0]])
    else:
        nrows = 0
    return nrows, len(out_header)
@@ -0,0 +1,34 @@
1
+ # -*- coding: utf-8 -*-
2
+ """Command-line interface: python -m oissyntheticdata real.csv -o synthetic.csv"""
3
+
4
+ import sys
5
+ import argparse
6
+ from . import synthesize_file, __version__
7
+
8
+
9
def main(argv=None):
    """Command-line entry point: parse options and synthesize one file.

    argv : list of argument strings, or None to let argparse read sys.argv.

    The comma-separated ``--drop`` / ``--visit`` values are split into lists
    (an empty ``--visit`` becomes None so the file order is used), the work is
    delegated to ``synthesize_file``, and a one-line summary is written to
    stderr.
    """
    parser = argparse.ArgumentParser(
        prog="oissyntheticdata",
        description="Pure-Python sequential CART synthesis (synthpop tradition, zero deps).")
    add = parser.add_argument
    add("input", help="real CSV or XLSX file")
    add("-o", "--output", default="synthetic.csv", help="output CSV path")
    add("-n", "--rows", type=int, default=None, help="number of synthetic rows")
    add("--drop", default="", help="comma-separated columns to exclude (e.g. identifiers)")
    add("--visit", default="", help="comma-separated synthesis order (default: file order)")
    add("--min-leaf", type=int, default=5, help="minimum real records per leaf/cell (k)")
    add("--max-depth", type=int, default=12, help="maximum tree depth")
    add("--smoothing", type=float, default=0.0, help="continuous jitter (0 = off)")
    add("--seed", type=int, default=12345)
    add("--version", action="version", version="oissyntheticdata " + __version__)
    args = parser.parse_args(argv)

    def split_names(text):
        # "a, b,,c" -> ["a", "b", "c"]; blank entries are discarded.
        stripped = (part.strip() for part in text.split(","))
        return [name for name in stripped if name]

    drop = split_names(args.drop)
    visit = split_names(args.visit) or None
    rows, cols = synthesize_file(
        args.input, args.output, n=args.rows, visit=visit, drop=drop,
        min_leaf=args.min_leaf, max_depth=args.max_depth,
        smoothing=args.smoothing, seed=args.seed)
    sys.stderr.write("[oissyntheticdata] wrote %d rows x %d cols -> %s\n" % (rows, cols, args.output))
31
+
32
+
33
# Run the command-line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,85 @@
1
+ # -*- coding: utf-8 -*-
2
+ """oissyntheticdata._io — read CSV/XLSX and write CSV using ONLY the standard library."""
3
+
4
+ import os
5
+ import csv
6
+ import zipfile
7
+ import xml.etree.ElementTree as ET
8
+
9
# Lower-cased, stripped cell texts that are treated as "no value".
MISSING_TOKENS = {
    "",
    ".",
    "n/a",
    "na",
    "nan",
    "none",
    "null",
}
10
+
11
+
12
+ def _col_index(ref):
13
+ letters = "".join(ch for ch in ref if ch.isalpha())
14
+ n = 0
15
+ for ch in letters:
16
+ n = n * 26 + (ord(ch.upper()) - 64)
17
+ return n - 1
18
+
19
+
20
+ def _read_xlsx(path):
21
+ ns = {"a": "http://schemas.openxmlformats.org/spreadsheetml/2006/main"}
22
+ T = "{http://schemas.openxmlformats.org/spreadsheetml/2006/main}t"
23
+ with zipfile.ZipFile(path) as z:
24
+ names = z.namelist()
25
+ shared = []
26
+ if "xl/sharedStrings.xml" in names:
27
+ root = ET.fromstring(z.read("xl/sharedStrings.xml"))
28
+ for si in root.findall("a:si", ns):
29
+ shared.append("".join(t.text or "" for t in si.iter(T)))
30
+ sheet = "xl/worksheets/sheet1.xml"
31
+ if sheet not in names:
32
+ sheet = sorted(n for n in names
33
+ if n.startswith("xl/worksheets/") and n.endswith(".xml"))[0]
34
+ root = ET.fromstring(z.read(sheet))
35
+ rows = []
36
+ for row in root.iter("{%s}row" % ns["a"]):
37
+ cells, maxi = {}, -1
38
+ for c in row.findall("a:c", ns):
39
+ ref = c.get("r", "")
40
+ idx = _col_index(ref) if ref else len(cells)
41
+ t = c.get("t")
42
+ v = c.find("a:v", ns)
43
+ if t == "s" and v is not None:
44
+ val = shared[int(v.text)]
45
+ elif t == "inlineStr":
46
+ is_ = c.find("a:is", ns)
47
+ val = "".join(x.text or "" for x in is_.iter(T)) if is_ is not None else ""
48
+ else:
49
+ val = v.text if v is not None else ""
50
+ cells[idx] = val if val is not None else ""
51
+ maxi = max(maxi, idx)
52
+ rows.append([cells.get(i, "") for i in range(maxi + 1)])
53
+ return rows
54
+
55
+
56
+ def _read_csv(path):
57
+ with open(path, "r", encoding="utf-8-sig", newline="") as f:
58
+ return [row for row in csv.reader(f)]
59
+
60
+
61
def _dedupe_header(names):
    """Return ``names`` with repeats renamed name_2, name_3, ... so all are unique."""
    reserved = set(names)   # names that genuinely occur in the file
    used = set()
    unique = []
    for name in names:
        candidate, k = name, 1
        # Skip generated names that are taken or that belong to a real column.
        while candidate in used or (k > 1 and candidate in reserved):
            k += 1
            candidate = "%s_%d" % (name, k)
        used.add(candidate)
        unique.append(candidate)
    return unique


def read_table(path):
    """Return (header: list[str], columns: dict[str, list[str]]).

    Files whose name ends in .xlsx/.xls go through the workbook reader,
    everything else through the CSV reader. Rows made only of blank cells are
    dropped; the first remaining row is the header. All values are stripped
    strings, and rows shorter than the header are padded with "".

    Repeated header names (including repeated blank headers) are renamed with
    ``_2``, ``_3``, ... suffixes. Without this, the repeats would share one
    dict entry that received several values per row, leaving that column
    longer than the others and corrupting everything downstream.

    An empty input yields ``([], {})``.
    """
    if path.lower().endswith((".xlsx", ".xls")):
        raw = _read_xlsx(path)
    else:
        raw = _read_csv(path)
    raw = [r for r in raw if any(str(c).strip() for c in r)]
    if not raw:
        return [], {}
    header = _dedupe_header([str(h).strip() for h in raw[0]])
    cols = {h: [] for h in header}
    for r in raw[1:]:
        for i, h in enumerate(header):
            cols[h].append(str(r[i]).strip() if i < len(r) else "")
    return header, cols
73
+
74
+
75
def write_table(path, header, columns):
    """Write a column-oriented table to ``path`` as UTF-8 CSV.

    header  : list of column names, in output order.
    columns : dict name -> list of values; the first header column's length
              determines how many data rows are written.

    The header row is always written, even when ``header`` is empty.
    """
    row_count = len(columns[header[0]]) if header else 0
    with open(path, "w", encoding="utf-8", newline="") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(
            [columns[name][r] for name in header] for r in range(row_count))
82
+
83
+
84
def is_missing(v):
    """Tell whether ``v`` stands for a missing value.

    The value is converted to text, stripped and lower-cased, then looked up
    in ``MISSING_TOKENS``.
    """
    token = str(v).strip().lower()
    return token in MISSING_TOKENS
@@ -0,0 +1,188 @@
1
+ # -*- coding: utf-8 -*-
2
+ """oissyntheticdata._relational — multi-table (relational) synthesis.
3
+
4
+ Extends sequential CART synthesis to a parent -> child schema while keeping
5
+ **referential integrity** (every synthetic foreign key points at a synthetic
6
+ parent) and the **parent->child structure** (fan-out and attribute correlation).
7
+
8
+ For each table, in parent-before-child order:
9
+ 1. Synthesize the table's attributes (sequential CART). For a child, the
10
+ parent's synthetic attributes are supplied as fixed predictors, so child
11
+ attributes are drawn conditioned on the parent they belong to.
12
+ 2. Mint fresh surrogate primary keys (1..n) — real identifiers are never
13
+ reproduced.
14
+ 3. For each child, a regression CART models the number of children per parent
15
+ from the parent's attributes (the fan-out), so realistic counts — and which
16
+ parents have many vs. few children — are preserved. Foreign keys are drawn
17
+ from the synthetic parent keys, guaranteeing valid joins.
18
+
19
+ Scope: a single-parent DAG (star / snowflake / chains). A table has at most one
20
+ parent; a parent may have many children; children may themselves be parents.
21
+ """
22
+
23
+ import random
24
+ from . import _io
25
+ from . import _tree
26
+ from ._synth import type_columns, stringify, synth_core, _to_float
27
+
28
+
29
+ def _topo_order(schema):
30
+ order, seen = [], set()
31
+ tables = list(schema.keys())
32
+ guard = 0
33
+ while len(order) < len(tables):
34
+ guard += 1
35
+ if guard > len(tables) + 2:
36
+ raise ValueError("Cyclic or unresolved parent reference in schema.")
37
+ for t in tables:
38
+ if t in seen:
39
+ continue
40
+ parent = schema[t].get("parent")
41
+ if parent is None or parent in seen:
42
+ order.append(t); seen.add(t)
43
+ return order
44
+
45
+
46
+ def _child_counts(parent_keys_real, child_fk_real):
47
+ """counts[parent_key] = number of real child rows with that foreign key."""
48
+ counts = {}
49
+ for k in child_fk_real:
50
+ counts[k] = counts.get(k, 0) + 1
51
+ return [float(counts.get(pk, 0)) for pk in parent_keys_real]
52
+
53
+
54
def synthesize_relational(tables, schema, n=None, drop=None,
                          min_leaf=5, max_depth=12, smoothing=0.0, seed=12345):
    """Synthesize a set of related tables.

    tables : dict table_name -> (header, columns)   [from oissyntheticdata.read_table]
    schema : dict table_name -> {"key": pk,
                                 "parent": parent_table (optional),
                                 "foreign_key": fk (required if parent set)}
    n      : dict table_name -> rows for ROOT tables (children sized by fan-out);
             a single int applies to all roots; None = each root's real row count.
    drop   : dict table_name -> [columns to exclude] (besides keys), or a flat
             list applied to every table.
    min_leaf, max_depth, smoothing : passed through to the tree builder /
             synth_core for every table.
    seed   : seeds the single random.Random shared by all tables.

    Returns dict table_name -> (out_header, out_columns).
    """
    # One generator for the whole run: tables are processed in a fixed order,
    # so the draws for every table come from the same seeded stream.
    rng = random.Random(seed)
    drop = drop or {}
    if isinstance(drop, (list, tuple, set)):
        # A flat list means "drop these columns from every table".
        drop = {t: list(drop) for t in tables}
    n_map = {} if n is None else (n if isinstance(n, dict) else {t: n for t in tables})

    synth_attr = {}    # table -> typed dict of synthesized attribute columns
    synth_key = {}     # table -> list of surrogate pk strings
    is_num_of = {}     # table -> is_num map for its attributes
    results = {}

    # Parents are handled before their children, so a child can always read
    # its parent's entries in synth_attr / synth_key / is_num_of.
    for t in _topo_order(schema):
        header, cols = tables[t]
        spec = schema[t]
        pk = spec["key"]
        parent = spec.get("parent")
        fk = spec.get("foreign_key")
        # Keys are never modelled as attributes: the pk is re-minted and the
        # fk is drawn from the synthetic parent keys.
        drop_t = set(drop.get(t, [])) | {pk}
        if fk:
            drop_t.add(fk)
        attrs = [c for c in header if c not in drop_t]
        is_num, real = type_columns(cols, attrs)
        is_num_of[t] = is_num
        n_real = len(cols[header[0]])

        if parent is None:
            # -------- root table --------
            n_t = int(n_map.get(t, n_real))
            out = synth_core(real, is_num, attrs, n_t, rng,
                             min_leaf=min_leaf, max_depth=max_depth, smoothing=smoothing)
            synth_attr[t] = out
            # Surrogate keys "1".."n_t"; real identifiers are not reused.
            synth_key[t] = [str(i + 1) for i in range(n_t)]
        else:
            # -------- child table --------
            p_attrs = list(synth_attr[parent].keys())
            p_is_num = is_num_of[parent]
            # real parent attributes, typed + a key->row lookup
            p_header, p_cols = tables[parent]
            _, p_real = type_columns(p_cols, p_attrs)
            p_pk_real = p_cols[schema[parent]["key"]]
            # If a real parent key repeats, the last row with that key wins.
            lookup = {k: i for i, k in enumerate(p_pk_real)}
            # fan-out: counts of real children per real parent
            counts = _child_counts(p_pk_real, cols[fk])

            # count model: counts ~ parent attributes (regression CART)
            n_parent = len(synth_key[parent])
            drawn = []
            if p_attrs:
                _tree.set_predictors({a: p_real[a] for a in p_attrs})
                croot = _tree.build_tree(list(range(len(counts))), counts, p_attrs,
                                         p_is_num, "num", min_leaf=min_leaf, max_depth=max_depth)
                for i in range(n_parent):
                    row = {a: synth_attr[parent][a][i] for a in p_attrs}
                    # Smoothing is fixed at 0.0 here, so each draw is a
                    # real observed child count from the matching leaf.
                    c = _tree.sample_leaf(croot, row, p_is_num, rng, 0.0)
                    drawn.append(max(0, int(round(c if c is not None else 0))))
            else:
                # No parent attributes to condition on: resample the real
                # per-parent counts directly.
                pos = [c for c in counts]
                for _ in range(n_parent):
                    drawn.append(max(0, int(round(rng.choice(pos)))))

            # expand: child foreign keys + the parent attrs carried to each child row
            child_fk, parent_carry = [], {("p__" + a): [] for a in p_attrs}
            for i in range(n_parent):
                key_i = synth_key[parent][i]
                for _ in range(drawn[i]):
                    child_fk.append(key_i)
                    for a in p_attrs:
                        parent_carry["p__" + a].append(synth_attr[parent][a][i])
            total = len(child_fk)

            # real fitting data, restricted to child rows whose parent exists
            valid = [j for j in range(n_real) if cols[fk][j] in lookup]
            child_real = {a: [real[a][j] for j in valid] for a in attrs}
            fixed_real = {("p__" + a): [p_real[a][lookup[cols[fk][j]]] for j in valid] for a in p_attrs}
            # Parent predictors are prefixed with "p__" so they cannot clash
            # with child attribute names.
            combined_is_num = dict(is_num)
            for a in p_attrs:
                combined_is_num["p__" + a] = p_is_num[a]

            out = synth_core(child_real, combined_is_num, attrs, total, rng,
                             fixed_real=fixed_real, fixed_synth=parent_carry,
                             min_leaf=min_leaf, max_depth=max_depth, smoothing=smoothing)
            synth_attr[t] = out
            synth_key[t] = [str(i + 1) for i in range(total)]
            results[t] = ("__child__", child_fk)     # stash fk for assembly

        # ---- assemble this table's output in original column order ----
        out_header = [c for c in header if c not in (set(drop.get(t, [])))]
        # keep pk and fk columns in output even though they aren't "attrs"
        out_cols = {}
        strattr = stringify(synth_attr[t], list(synth_attr[t].keys()))
        nrows = len(synth_key[t])
        for c in out_header:
            if c == pk:
                out_cols[c] = list(synth_key[t])
            elif parent and c == fk:
                out_cols[c] = list(results[t][1])
            elif c in strattr:
                out_cols[c] = strattr[c]
            else:
                # Column kept in the header but never synthesized: emit blanks.
                out_cols[c] = [""] * nrows
        # Replaces the temporary ("__child__", child_fk) stash, if any.
        results[t] = (out_header, out_cols)

    return results
172
+
173
+
174
def synthesize_relational_files(paths, schema, out_dir=".", **kw):
    """Read CSV/XLSX tables, synthesize relationally, write synthetic_<name>.csv.

    paths   : dict table_name -> input file path
    schema  : relational schema, forwarded to ``synthesize_relational``
    out_dir : directory receiving the output files; it is created (including
              missing parents) if it does not exist yet
    **kw    : extra options forwarded to ``synthesize_relational``

    Returns dict table_name -> (rows, cols).
    """
    import os
    tables = {t: _io.read_table(p) for t, p in paths.items()}
    res = synthesize_relational(tables, schema, **kw)
    # Create the target directory only once synthesis has succeeded. Without
    # this, a fresh out_dir (as in the package's own usage example) made the
    # first write fail with FileNotFoundError.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    summary = {}
    for t, (hdr, cols) in res.items():
        out_path = os.path.join(out_dir, "synthetic_%s.csv" % t)
        _io.write_table(out_path, hdr, cols)
        summary[t] = (len(cols[hdr[0]]) if hdr else 0, len(hdr))
    return summary
@@ -0,0 +1,140 @@
1
+ # -*- coding: utf-8 -*-
2
+ """oissyntheticdata._synth — sequential CART synthesis (the synthpop paradigm).
3
+
4
+ Columns are synthesized one at a time in `visit` order. The first column is
5
+ drawn from its own (disclosure-controlled) marginal. Each later column is
6
+ synthesized by growing a CART that predicts it from the columns ALREADY
7
+ synthesized, fitted on the real data, then drawing a donor from the matching
8
+ leaf for every synthetic row. Because predictors at draw time are the
9
+ synthetic values, the joint distribution is built up sequentially.
10
+
11
+ Confidentiality:
12
+ * `min_leaf` (k): no leaf / no marginal cell is built from fewer than k real
13
+ records, so a drawn value never isolates one person.
14
+ * `smoothing`: optional jitter on continuous donors so exact real values are
15
+ not echoed verbatim.
16
+ * direct identifiers should be dropped before synthesis (see `drop` arg).
17
+ """
18
+
19
+ import random
20
+ from . import _io
21
+ from . import _tree
22
+
23
+
24
def _is_numeric(values):
    """Decide whether a raw column should be treated as numeric.

    Missing entries (per ``_io.is_missing``) are ignored. The column is
    numeric when at least one entry remains and every remaining entry parses
    as a float once commas are removed.
    """
    saw_value = False
    for raw in values:
        if _io.is_missing(raw):
            continue
        saw_value = True
        try:
            float(str(raw).replace(",", ""))
        except ValueError:
            return False
    return saw_value
34
+
35
+
36
+ def _to_float(v):
37
+ try:
38
+ return float(str(v).replace(",", ""))
39
+ except (ValueError, TypeError):
40
+ return None
41
+
42
+
43
+ def _marginal_draw(values, n, min_leaf, rng):
44
+ """Sample n values from the empirical marginal, suppressing rare cells."""
45
+ counts = {}
46
+ for v in values:
47
+ counts[v] = counts.get(v, 0) + 1
48
+ pool = [v for v in values if counts[v] >= min_leaf]
49
+ if not pool: # everything rare -> fall back to all
50
+ pool = list(values)
51
+ return [rng.choice(pool) for _ in range(n)]
52
+
53
+
54
def type_columns(columns, names):
    """Classify and convert the named columns.

    Returns ``(is_num, typed)``: ``is_num[name]`` is True for numeric columns,
    and ``typed[name]`` holds floats (None where missing) for numeric columns
    or strings ("" where missing) for all others.
    """
    is_num = {}
    typed = {}
    for name in names:
        raw = columns[name]
        numeric = _is_numeric(raw)
        is_num[name] = numeric
        if numeric:
            converted = [None if _io.is_missing(v) else _to_float(v) for v in raw]
        else:
            converted = ["" if _io.is_missing(v) else str(v) for v in raw]
        typed[name] = converted
    return is_num, typed
66
+
67
+
68
def stringify(typed, names):
    """Render typed synthetic columns as CSV-ready strings.

    None becomes "", whole-number floats are written without a decimal point,
    other floats use ``%.6g``, and anything else goes through ``str``.
    """
    def render(value):
        if value is None:
            return ""
        if isinstance(value, float):
            if value.is_integer():
                return str(int(value))
            return "%.6g" % value
        return str(value)

    return {name: [render(v) for v in typed[name]] for name in names}
82
+
83
+
84
def synth_core(real, is_num, visit, n, rng, fixed_real=None, fixed_synth=None,
               min_leaf=5, max_depth=12, smoothing=0.0):
    """Synthesize the ``visit`` columns one after another; return typed output.

    real        : dict name->typed list (real data, used for fitting)
    is_num      : dict name->bool covering every visited AND fixed column
    visit       : column names in synthesis order
    n           : number of synthetic rows to produce
    rng         : random generator used for every draw
    fixed_real  : dict name->typed list aligned to the real rows -- predictors
                  that are GIVEN rather than synthesized.
    fixed_synth : dict name->typed list aligned to the n synthetic rows -- the
                  given predictor values for each synthetic row.

    A column with no predictors available is sampled from its marginal; every
    other column is drawn from a tree fitted on the real data and routed with
    the already-synthesized (and fixed) values.
    """
    fixed_real = fixed_real or {}
    fixed_synth = fixed_synth or {}
    given = list(fixed_real.keys())

    if visit:
        n_real = len(real[visit[0]])
    elif fixed_real:
        n_real = len(next(iter(fixed_real.values())))
    else:
        n_real = 0

    out = {c: [] for c in visit}
    finished = []
    for target in visit:
        kind = "num" if is_num[target] else "cat"
        predictors = given + finished
        if predictors:
            fit_columns = {}
            for p in predictors:
                fit_columns[p] = fixed_real[p] if p in fixed_real else real[p]
            _tree.set_predictors(fit_columns)
            root = _tree.build_tree(list(range(n_real)), real[target], predictors,
                                    is_num, kind,
                                    min_leaf=min_leaf, max_depth=max_depth)
            synthesized = []
            for i in range(n):
                # Route on the synthetic values of row i, not the real ones.
                row = {p: (fixed_synth[p][i] if p in fixed_synth else out[p][i])
                       for p in predictors}
                synthesized.append(_tree.sample_leaf(root, row, is_num, rng, smoothing))
            out[target] = synthesized
        else:
            if is_num[target]:
                donors = [v for v in real[target] if v is not None]
            else:
                donors = real[target]
            source = donors if donors else real[target]
            out[target] = _marginal_draw(source, n, min_leaf, rng)
        finished.append(target)
    return out
127
+
128
+
129
def synthesize(header, columns, n=None, visit=None, drop=None,
               min_leaf=5, max_depth=12, smoothing=0.0, seed=12345):
    """Produce ``(out_header, out_columns)`` of synthetic data for one flat table.

    header  : column names of the real table
    columns : dict name -> list of raw string values
    n       : number of synthetic rows (None = as many as the real table)
    visit   : synthesis order (None or empty = header order); names that are
              not in ``columns`` or are listed in ``drop`` are skipped
    drop    : columns to leave out entirely
    seed    : seed of the random generator used for all draws

    The returned header is the effective visit order and the columns are
    CSV-ready strings.
    """
    rng = random.Random(seed)
    excluded = set(drop or [])
    requested = visit or header
    order = [c for c in requested if c in columns and c not in excluded]
    n_real = len(columns[header[0]]) if header else 0
    if n is None:
        n = n_real
    is_num, real = type_columns(columns, order)
    typed_out = synth_core(real, is_num, order, n, rng, min_leaf=min_leaf,
                           max_depth=max_depth, smoothing=smoothing)
    return order, stringify(typed_out, order)
@@ -0,0 +1,163 @@
1
+ # -*- coding: utf-8 -*-
2
+ """oissyntheticdata._tree — a small CART grown from scratch (no numpy/sklearn).
3
+
4
+ Each leaf keeps the list of REAL target values that reached it ("donors").
5
+ Synthesis samples from a leaf's donors rather than predicting a point value,
6
+ which reproduces the conditional distribution (Reiter, 2005). A minimum leaf
7
+ size (`min_leaf`) guarantees every donor pool blends >= k real records, so a
8
+ synthetic value is never traceable to one individual.
9
+ """
10
+
11
+ import math
12
+ import random
13
+
14
# Numeric predictors with more distinct values than this are sub-sampled
# before split midpoints are computed (see _candidate_thresholds).
MAX_THRESHOLDS = 40          # cap numeric split candidates for speed
15
+
16
+
17
+ def _gini(counts, n):
18
+ if n == 0:
19
+ return 0.0
20
+ s = 0.0
21
+ for c in counts.values():
22
+ p = c / n
23
+ s += p * p
24
+ return 1.0 - s
25
+
26
+
27
+ def _sse(values):
28
+ n = len(values)
29
+ if n == 0:
30
+ return 0.0
31
+ m = sum(values) / n
32
+ return sum((v - m) ** 2 for v in values)
33
+
34
+
35
+ def _candidate_thresholds(vals):
36
+ uniq = sorted(set(vals))
37
+ if len(uniq) <= 1:
38
+ return []
39
+ if len(uniq) <= MAX_THRESHOLDS:
40
+ cuts = uniq
41
+ else:
42
+ step = len(uniq) / float(MAX_THRESHOLDS)
43
+ cuts = [uniq[int(i * step)] for i in range(1, MAX_THRESHOLDS)]
44
+ mids = []
45
+ for i in range(1, len(cuts)):
46
+ mids.append((cuts[i - 1] + cuts[i]) / 2.0)
47
+ return mids
48
+
49
+
50
+ class _Node(object):
51
+ __slots__ = ("leaf", "donors", "feature", "kind", "threshold", "category",
52
+ "left", "right")
53
+
54
+ def __init__(self):
55
+ self.leaf = False
56
+ self.donors = None
57
+ self.feature = None
58
+ self.kind = None # 'num' or 'cat'
59
+ self.threshold = None
60
+ self.category = None
61
+ self.left = None
62
+ self.right = None
63
+
64
+
65
def _impurity(idx, y, target_kind):
    """Impurity of the target values ``y[i]`` for the rows in ``idx``.

    For a "cat" target this is the Gini impurity multiplied by the number of
    rows; for any other target kind it is the sum of squared errors.
    """
    if target_kind != "cat":
        return _sse([y[i] for i in idx])
    tally = {}
    for i in idx:
        label = y[i]
        tally[label] = tally.get(label, 0) + 1
    size = len(idx)
    return _gini(tally, size) * size
73
+
74
+
75
def build_tree(idx, y, predictors, num_cols, target_kind,
               min_leaf=5, max_depth=12, depth=0):
    """Recursively grow a CART over the rows in ``idx`` and return its root.

    idx         : row indices reaching this node
    y           : target list, indexed by row
    predictors  : list of feature names; their columns are read from the
                  module-level ``PRED`` mapping, which must be installed first
    num_cols    : dict name->bool (True if numeric predictor)
    target_kind : "cat" for a Gini criterion, anything else for squared error
    min_leaf    : minimum number of rows on each side of a split
    max_depth   : depth at which growth stops
    depth       : current depth (used by the recursion)

    A node becomes a leaf, keeping its target values as donors, when it is too
    small to split, too deep, pure, has no predictors, or no admissible split
    reduces impurity.
    """
    node = _Node()
    distinct_y = set(y[i] for i in idx)
    if (len(idx) < 2 * min_leaf or depth >= max_depth or len(distinct_y) <= 1
            or not predictors):
        node.leaf = True
        node.donors = [y[i] for i in idx]
        return node

    base = _impurity(idx, y, target_kind)
    best = None   # (reduction, feature, kind, thr/cat, left_idx, right_idx)

    for f in predictors:
        col = PRED[f]
        if num_cols[f]:
            present = [col[i] for i in idx if col[i] is not None]
            for thr in _candidate_thresholds(present):
                # Rows with a missing predictor value always go right.
                left = [i for i in idx if col[i] is not None and col[i] <= thr]
                right = [i for i in idx if not (col[i] is not None and col[i] <= thr)]
                if len(left) < min_leaf or len(right) < min_leaf:
                    continue
                red = base - _impurity(left, y, target_kind) - _impurity(right, y, target_kind)
                if best is None or red > best[0]:
                    best = (red, f, "num", thr, left, right)
        else:
            cats = set(col[i] for i in idx)
            if len(cats) <= 1:
                continue
            # Iterate in a fixed order. Raw set order depends on string hash
            # randomization, so ties between equally good categories were
            # broken differently from run to run and the same seed could
            # produce different trees.
            for c in sorted(cats, key=str):
                left = [i for i in idx if col[i] == c]
                right = [i for i in idx if col[i] != c]
                if len(left) < min_leaf or len(right) < min_leaf:
                    continue
                red = base - _impurity(left, y, target_kind) - _impurity(right, y, target_kind)
                if best is None or red > best[0]:
                    best = (red, f, "cat", c, left, right)

    if best is None or best[0] <= 1e-12:
        node.leaf = True
        node.donors = [y[i] for i in idx]
        return node

    _, f, kind, val, left_idx, right_idx = best
    node.feature, node.kind = f, kind
    if kind == "num":
        node.threshold = val
    else:
        node.category = val
    node.left = build_tree(left_idx, y, predictors, num_cols, target_kind,
                           min_leaf, max_depth, depth + 1)
    node.right = build_tree(right_idx, y, predictors, num_cols, target_kind,
                            min_leaf, max_depth, depth + 1)
    return node
132
+
133
+
134
# PRED is module-level so the recursive builder can read predictor columns by
# row index without copying. set_predictors() installs it for one fit.
# It maps a feature name to that feature's column (a list indexed by row).
PRED = {}


def set_predictors(pred_cols):
    """Install ``pred_cols`` (dict name -> column list) as the active ``PRED``.

    The mapping is stored by reference, not copied. Each call replaces the
    previous mapping, so it must be made before every ``build_tree`` fit.
    """
    global PRED
    PRED = pred_cols
142
+
143
+
144
def sample_leaf(node, row, num_cols, rng, smoothing=0.0):
    """Walk a synthetic row down the tree and draw one donor from its leaf.

    node      : root of a fitted tree
    row       : dict feature -> value for the synthetic row
    num_cols  : accepted for signature compatibility; not used here
    rng       : random generator used for the draw (and the jitter)
    smoothing : when non-zero and the drawn donor is a float, Gaussian noise
                with standard deviation ``smoothing`` times the leaf's spread
                is added and the result is clamped to the leaf's
                [min, max]; skipped for leaves with two or fewer float
                donors or zero spread.

    At numeric splits a missing (None) value goes right.
    """
    current = node
    while not current.leaf:
        value = row.get(current.feature)
        if current.kind == "num":
            take_left = (value is not None and value <= current.threshold)
        else:
            take_left = (value == current.category)
        current = current.left if take_left else current.right

    pool = current.donors
    picked = rng.choice(pool)
    if not smoothing or picked is None or not isinstance(picked, float):
        return picked

    numeric = [d for d in pool if isinstance(d, float)]
    if len(numeric) <= 2:
        return picked
    mean = sum(numeric) / len(numeric)
    spread = math.sqrt(sum((x - mean) ** 2 for x in numeric) / len(numeric))
    if spread > 0:
        low, high = min(numeric), max(numeric)
        picked = min(high, max(low, picked + rng.gauss(0.0, smoothing * spread)))
    return picked
@@ -0,0 +1,297 @@
1
+ Metadata-Version: 2.4
2
+ Name: oissyntheticdata
3
+ Version: 0.2.0
4
+ Summary: Zero-dependency sequential CART synthesis for secure research (synthpop tradition), with relational support. An OIS tool.
5
+ Author-email: Yohanan Ouaknine <yohanan.ouaknine@ois.co.il>
6
+ Maintainer-email: OIS <yohanan.ouaknine@ois.co.il>
7
+ License: MIT License
8
+
9
+ Copyright (c) 2026 Yohanan Ouaknine and OIS (https://ois.co.il)
10
+
11
+ Permission is hereby granted, free of charge, to any person obtaining a copy
12
+ of this software and associated documentation files (the "Software"), to deal
13
+ in the Software without restriction, including without limitation the rights
14
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15
+ copies of the Software, and to permit persons to whom the Software is
16
+ furnished to do so, subject to the following conditions:
17
+
18
+ The above copyright notice and this permission notice shall be included in all
19
+ copies or substantial portions of the Software.
20
+
21
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27
+ SOFTWARE.
28
+
29
+ Project-URL: Homepage, https://ois.co.il
30
+ Project-URL: Repository, https://github.com/yohananouaknine/oissyntheticdata
31
+ Project-URL: Issues, https://github.com/yohananouaknine/oissyntheticdata/issues
32
+ Keywords: synthetic data,synthpop,statistical disclosure control,CART,privacy,secure research,microdata,anonymization
33
+ Classifier: Development Status :: 4 - Beta
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3 :: Only
38
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
39
+ Classifier: Topic :: Security
40
+ Classifier: Operating System :: OS Independent
41
+ Requires-Python: >=3.7
42
+ Description-Content-Type: text/markdown
43
+ License-File: LICENSE
44
+ Dynamic: license-file
45
+
46
+ # oissyntheticdata
47
+
48
+ **Pure-Python sequential CART synthesis — in the `synthpop` tradition, with zero third-party dependencies.**
49
+
50
+ > An **OIS** tool · [ois.co.il](https://ois.co.il) · maintained by Dr Yohanan Ouaknine
51
+ > ([ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351))
52
+
53
+ `oissyntheticdata` generates a synthetic copy of a sensitive dataset that preserves the
54
+ *relationships between variables*, not just each column's marginal shape. It is
55
+ built for the secure-research workflow used by statistical agencies: **develop
56
+ and debug your analysis on the synthetic data off-site, then run the final code
57
+ on the real data on-premises and release only vetted aggregate results.**
58
+
59
+ It imports only the Python standard library (`csv`, `json`, `math`, `random`,
60
+ `statistics`, `zipfile`, `xml.etree`), so it can run inside a locked secure
61
+ environment with no `pip install` and is small enough to read and audit in full.
62
+
63
+ The approach was first deployed in a secure justice-research setting (a study of
64
+ terrorist recidivism after the 2011 Shalit prisoner exchange, run on-premises at
65
+ the Israel Prison Service under Research Committee authorization); this package
66
+ generalises and opens it. OIS offers deployment, validation, and training services
67
+ to government research units and academic researchers around the open core.
68
+
69
+ ---
70
+
71
+ ## Why this exists
72
+
73
+ Developing on synthetic data and then running the final code on the real data is a well-established paradigm in statistical disclosure control. The
74
+ synthetic data is *test data* that should resemble the real data closely but is
75
+ never used for final inference; the code developed on it is what gets run on the
76
+ confidential data (Nowok, Raab & Dibben 2016; US Census Bureau SIPP Synthetic
77
+ Beta). `oissyntheticdata` is a dependency-free re-implementation of the core engine those
78
+ tools use — **sequential CART synthesis** (Reiter 2005) — packaged for locked
79
+ environments.
80
+
81
+ It complements a metadata-only synthesizer (which preserves each column's shape
82
+ but not the joint structure): `oissyntheticdata` fits on the real microdata on-premises
83
+ and therefore reproduces conditional relationships, at the cost of touching raw
84
+ records (so it must run inside the secure environment).
85
+
86
+ ---
87
+
88
+ ## How it works (the engine)
89
+
90
+ Synthesis proceeds **one column at a time** in a chosen visit order:
91
+
92
+ 1. **First column** — drawn from its own empirical marginal, with cells smaller
93
+ than `min_leaf` suppressed.
94
+ 2. **Each later column `Y`** — a CART (classification tree if `Y` is categorical,
95
+ regression tree if continuous) is grown on the **real data** to predict `Y`
96
+ from the columns already synthesized. Every leaf keeps the list of *real*
97
+ `Y` values that reached it (its "donors").
98
+ 3. **Drawing** — for each synthetic row, route it down the tree using the values
99
+ already generated for that row, reach a leaf, and **sample a donor** from that
100
+ leaf (optionally jittered for continuous columns). Sampling from donors — not
101
+ predicting a point — is what reproduces the conditional distribution.
102
+
103
+ Because each column is predicted from the previously synthesized columns, the
104
+ joint distribution is assembled sequentially (the standard `synthpop` approach).
105
+
106
+ ```
107
+ visit: c1 -> c2 -> c3 -> ...
108
+ c1 ~ marginal(c1)
109
+ c2 ~ leaf_donor( CART(c2 ~ c1) , synthetic c1 )
110
+ c3 ~ leaf_donor( CART(c3 ~ c1,c2) , synthetic c1,c2 )
111
+ ...
112
+ ```
113
+
114
+ ---
115
+
116
+ ## Confidentiality model
117
+
118
+ - **`min_leaf` (k, default 5):** no leaf and no marginal cell is built from fewer
119
+ than `k` real records, so every value is drawn from a pool of ≥ k individuals
120
+ (it is still one real donor's value unless `smoothing` is set). This also limits
121
+ how deep the tree can grow and keeps it from memorizing individuals.
122
+ - **`smoothing` (default 0):** optional Gaussian jitter on continuous donors,
123
+ clamped to the leaf's observed range, so most real values are not echoed verbatim (a draw clamped at the leaf's minimum or maximum still equals a real value).
124
+ - **`drop`:** direct identifiers (national ID, names, record keys) should be
125
+ dropped before synthesis — `oissyntheticdata` does not attempt to anonymize them.
126
+ - **Only synthetic data leaves; the real data never does.** The intended use is
127
+ to take the synthetic file off-site for development and re-run final code on the
128
+ real data in place.
129
+
130
+ `oissyntheticdata` is a disclosure-control aid, not a formal privacy guarantee. For a
131
+ mathematical guarantee, combine it with differential privacy or apply output
132
+ checking (statistical disclosure control) to anything released.
133
+
134
+ ---
135
+
136
+ ## Design decisions and trade-offs
137
+
138
+ The value of `oissyntheticdata` is in its design choices, which are deliberately narrow:
139
+
140
+ - **Where the synthesizer may run is a first-class concern.** `oissyntheticdata` fits on
141
+ real microdata to preserve joint structure, so it runs *on-premises*; only the
142
+ synthetic output leaves. A metadata-only synthesizer can run off-site but
143
+ preserves only per-column structure. Choosing fidelity-with-on-prem-execution
144
+ over portability-with-lower-fidelity is intentional, and the two roles are kept
145
+ as separate tools so the confidentiality reasoning stays explicit.
146
+ - **Donor-leaf sampling, not point prediction.** Drawing a real value from the
147
+ matching leaf reproduces the conditional distribution; predicting a mean would
148
+ not.
149
+ - **One confidentiality invariant.** `min_leaf` (`k`) applies the same `k`-record
150
+ floor to every marginal cell, tree leaf, fan-out estimate, and surrogate key,
151
+ instead of scattering ad hoc thresholds.
152
+ - **Relational by conditioning, not joining.** Children are synthesized
153
+ conditioned on the parent's synthetic attributes and linked by surrogate keys,
154
+ preserving referential integrity without materialising a real join.
155
+ - **Build on, don't reinvent.** The estimator is the established CART-synthesis
156
+ method; the new work is the dependency-free, auditable, relational realisation
157
+ for locked environments.
158
+
159
+ Scope boundaries are equally deliberate: single-parent schemas only, no enforced
160
+ high-order interactions or arithmetic identities, and no formal privacy guarantee
161
+ (see Limitations).
162
+
163
+ ## Governance, support & contributing
164
+
165
+ `oissyntheticdata` is maintained in the open under the MIT license. Questions, bug reports,
166
+ and change proposals go through public GitHub Issues and Pull Requests; see
167
+ [`CONTRIBUTING.md`](CONTRIBUTING.md). Decisions are made by the maintainer(s)
168
+ listed in [`CITATION.cff`](CITATION.cff) via the public issue/PR process. There is
169
+ no private support channel — keeping development and discussion public is part of
170
+ the project's auditability goal. Releases are versioned and recorded in
171
+ [`CHANGELOG.md`](CHANGELOG.md).
172
+
173
+ ## Generative AI disclosure
174
+
175
+ A generative AI assistant (Claude, Anthropic) was used to help draft and refactor
176
+ parts of the code and documentation. All output was reviewed, tested, and edited
177
+ by the author(s), who take full responsibility for the design, correctness, and
178
+ integrity of the software. The design decisions and abstractions above, and the
179
+ testing and documentation practices, are the author(s)' own. Contributors are
180
+ asked to disclose non-trivial AI assistance (see `CONTRIBUTING.md`).
181
+
182
+ ## Install
183
+
184
+ ```bash
185
+ pip install oissyntheticdata
186
+ # or, in a locked environment, just copy the oissyntheticdata/ folder next to your code
187
+ ```
188
+
189
+ No dependencies. Python 3.7+.
190
+
191
+ ## Usage
192
+
193
+ Command line:
194
+
195
+ ```bash
196
+ python -m oissyntheticdata real.csv -o synthetic.csv --drop national_id --min-leaf 5
197
+ python -m oissyntheticdata data.xlsx -o synthetic.csv --visit "age,offense,violent" --smoothing 0.5
198
+ ```
199
+
200
+ Library:
201
+
202
+ ```python
203
+ import oissyntheticdata
204
+
205
+ # one call
206
+ oissyntheticdata.synthesize_file("real.csv", "synthetic.csv",
207
+ drop=["national_id"], min_leaf=5)
208
+
209
+ # or step by step
210
+ header, cols = oissyntheticdata.read_table("real.xlsx")
211
+ out_header, out_cols = oissyntheticdata.synthesize(header, cols,
212
+ drop=["national_id"], min_leaf=5)
213
+ oissyntheticdata.write_table("synthetic.csv", out_header, out_cols)
214
+ ```
215
+
216
+ Key parameters: `n` (number of synthetic rows; defaults to the number of real rows), `visit` (column order),
217
+ `drop` (identifiers to exclude), `min_leaf` (k), `max_depth`, `smoothing`, `seed`.
218
+
219
+ ### Related tables (multi-table synthesis)
220
+
221
+ For data split across linked tables (e.g. one row per inmate, many judgements per
222
+ inmate), `synthesize_relational` keeps **referential integrity** and the
223
+ **parent → child structure**:
224
+
225
+ ```python
226
+ import oissyntheticdata
227
+
228
+ oissyntheticdata.synthesize_relational_files(
229
+ {"inmates": "inmates.csv", "judgements": "judgements.csv"},
230
+ schema={
231
+ "inmates": {"key": "prisoner_id"},
232
+ "judgements": {"key": "judgement_id",
233
+ "parent": "inmates", "foreign_key": "prisoner_id"},
234
+ },
235
+ out_dir="out", min_leaf=5,
236
+ )
237
+ # -> out/synthetic_inmates.csv, out/synthetic_judgements.csv
238
+ ```
239
+
240
+ How it works: the parent is synthesized first and given fresh surrogate keys; a
241
+ regression CART models how many children each parent has (the fan-out) from the
242
+ parent's attributes; and each child's attributes are synthesized **conditioned on
243
+ its parent's synthetic attributes**. The result: every synthetic foreign key
244
+ points at a synthetic parent (0 orphan joins), the number of children per parent
245
+ is realistic, and parent → child relationships survive (e.g. high-risk parents
246
+ keep their child-row patterns). Supports a single-parent DAG — star, snowflake,
247
+ and parent → child → grandchild chains.
248
+
249
+ ---
250
+
251
+ ## Limitations
252
+
253
+ - Fits on real microdata, so **run it on-premises**; the synthetic *output* is
254
+ what you take off-site.
255
+ - Relational synthesis covers a single-parent DAG (star / snowflake / chains).
256
+ Many-to-many relationships and compound keys are not modelled — pre-join or
257
+ pre-resolve them to a surrogate key first.
258
+ - CART captures pairwise/low-order structure well; very high-order interactions
259
+ and exact arithmetic identities (e.g. `rate = a/b`) are not enforced.
260
+ - Pure Python: comfortable up to a few thousand rows × a few dozen columns; larger
261
+ data is slower than a compiled implementation.
262
+
263
+ ---
264
+
265
+ ## Lineage & sources
266
+
267
+ - Rubin, D.B. (1993). *Statistical disclosure limitation.* J. Official Statistics 9(2).
268
+ - Little, R.J.A. (1993). *Statistical analysis of masked data.* J. Official Statistics 9(2).
269
+ - Reiter, J.P. (2005). *Using CART to generate partially synthetic public use microdata.*
270
+ J. Official Statistics 21(3).
271
+ - Reiter, Oganian & Karr (2009). *Verification servers.* Comput. Stat. Data Anal. 53(4):1475–1482.
272
+ https://doi.org/10.1016/j.csda.2008.10.006
273
+ - Nowok, Raab & Dibben (2016). *synthpop: Bespoke Creation of Synthetic Data in R.*
274
+ J. Statistical Software 74(11). https://doi.org/10.18637/jss.v074.i11
275
+ - Drechsler, J. (2011). *Synthetic Datasets for Statistical Disclosure Control.* Springer.
276
+ - US Census Bureau, *SIPP Synthetic Beta* + Cornell Synthetic Data Server (synthetic
277
+ development data + validation on confidential files).
278
+
279
+ ## Maintainer
280
+
281
+ Dr **Yohanan Ouaknine** — OIS ([ois.co.il](https://ois.co.il)),
282
+ [yohanan.ouaknine@ois.co.il](mailto:yohanan.ouaknine@ois.co.il),
283
+ [ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351).
284
+ Department of Criminology, Ashkelon Academic College; formerly Head of the
285
+ Research Branch, Israel Prison Service.
286
+
287
+ ## License
288
+
289
+ MIT — see `LICENSE`.
290
+
291
+ ## Citation
292
+
293
+ If you use `oissyntheticdata`, please cite this software (see `CITATION.cff`) and
294
+ the methodological lineage above (Reiter 2005; Nowok, Raab & Dibben 2016). The
295
+ method was first applied in Ouaknine, Elisha & Hasisi (2026), *The Effect of Mass
296
+ Prisoner Release on Terrorist Recidivism: A Propensity Score Analysis of the Shalit
297
+ Deal* (in publication).
@@ -0,0 +1,12 @@
1
+ oissyntheticdata/__init__.py,sha256=jBKFw02tnTZE8_CQfO8vxxsEcLujtwyrGbdmpqrF_EQ,1861
2
+ oissyntheticdata/__main__.py,sha256=7IP9flqUD4hlc97VAXW28XaI0KM8IfKI8xUhB5p1fOw,1767
3
+ oissyntheticdata/_io.py,sha256=Qd-gLzyn9-6KvyvHDPs_fjij9kR5I3tdU6y0kEmePJI,3029
4
+ oissyntheticdata/_relational.py,sha256=YZFTj2ixyinpcagF5yEygddKnqlqvSZLWR3Xs-UGxlo,8296
5
+ oissyntheticdata/_synth.py,sha256=WXpJkHv7aGohnLCWHV9Ct9MhvrZVjiUOOj_NpgSlK2U,5321
6
+ oissyntheticdata/_tree.py,sha256=vOePQ7xosqOpgG-_k6Op68O5nEHsfymyugMHlhv3cFg,5566
7
+ oissyntheticdata-0.2.0.dist-info/licenses/LICENSE,sha256=VRnrt9oHAA6oSVzz6w_OzRBg2WnvjsjgCENdpx9SiOk,1101
8
+ oissyntheticdata-0.2.0.dist-info/METADATA,sha256=lmhbOCO8004uHrGgf71rURCaJtvu7ZuqyzUEReSGWFk,14200
9
+ oissyntheticdata-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
10
+ oissyntheticdata-0.2.0.dist-info/entry_points.txt,sha256=hrXxwSv1b0C2gUhfVO-68zmh5Ot8oFmazt749kzG8Pw,107
11
+ oissyntheticdata-0.2.0.dist-info/top_level.txt,sha256=TjXJ9Cnt9DmspFooICOHMq1yNHg4ywh7v0UyX5N-JY4,17
12
+ oissyntheticdata-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ oissd = oissyntheticdata.__main__:main
3
+ oissyntheticdata = oissyntheticdata.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Yohanan Ouaknine and OIS (https://ois.co.il)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ oissyntheticdata