oissyntheticdata 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oissyntheticdata-0.2.0/LICENSE +21 -0
- oissyntheticdata-0.2.0/PKG-INFO +297 -0
- oissyntheticdata-0.2.0/README.md +252 -0
- oissyntheticdata-0.2.0/oissyntheticdata/__init__.py +47 -0
- oissyntheticdata-0.2.0/oissyntheticdata/__main__.py +34 -0
- oissyntheticdata-0.2.0/oissyntheticdata/_io.py +85 -0
- oissyntheticdata-0.2.0/oissyntheticdata/_relational.py +188 -0
- oissyntheticdata-0.2.0/oissyntheticdata/_synth.py +140 -0
- oissyntheticdata-0.2.0/oissyntheticdata/_tree.py +163 -0
- oissyntheticdata-0.2.0/oissyntheticdata.egg-info/PKG-INFO +297 -0
- oissyntheticdata-0.2.0/oissyntheticdata.egg-info/SOURCES.txt +15 -0
- oissyntheticdata-0.2.0/oissyntheticdata.egg-info/dependency_links.txt +1 -0
- oissyntheticdata-0.2.0/oissyntheticdata.egg-info/entry_points.txt +3 -0
- oissyntheticdata-0.2.0/oissyntheticdata.egg-info/top_level.txt +1 -0
- oissyntheticdata-0.2.0/pyproject.toml +41 -0
- oissyntheticdata-0.2.0/setup.cfg +4 -0
- oissyntheticdata-0.2.0/tests/test_synth.py +110 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Yohanan Ouaknine and OIS (https://ois.co.il)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: oissyntheticdata
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Zero-dependency sequential CART synthesis for secure research (synthpop tradition), with relational support. An OIS tool.
|
|
5
|
+
Author-email: Yohanan Ouaknine <yohanan.ouaknine@ois.co.il>
|
|
6
|
+
Maintainer-email: OIS <yohanan.ouaknine@ois.co.il>
|
|
7
|
+
License: MIT License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2026 Yohanan Ouaknine and OIS (https://ois.co.il)
|
|
10
|
+
|
|
11
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
12
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
13
|
+
in the Software without restriction, including without limitation the rights
|
|
14
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
15
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16
|
+
furnished to do so, subject to the following conditions:
|
|
17
|
+
|
|
18
|
+
The above copyright notice and this permission notice shall be included in all
|
|
19
|
+
copies or substantial portions of the Software.
|
|
20
|
+
|
|
21
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
22
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
23
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
24
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
25
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
26
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
27
|
+
SOFTWARE.
|
|
28
|
+
|
|
29
|
+
Project-URL: Homepage, https://ois.co.il
|
|
30
|
+
Project-URL: Repository, https://github.com/yohananouaknine/oissyntheticdata
|
|
31
|
+
Project-URL: Issues, https://github.com/yohananouaknine/oissyntheticdata/issues
|
|
32
|
+
Keywords: synthetic data,synthpop,statistical disclosure control,CART,privacy,secure research,microdata,anonymization
|
|
33
|
+
Classifier: Development Status :: 4 - Beta
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
39
|
+
Classifier: Topic :: Security
|
|
40
|
+
Classifier: Operating System :: OS Independent
|
|
41
|
+
Requires-Python: >=3.7
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
License-File: LICENSE
|
|
44
|
+
Dynamic: license-file
|
|
45
|
+
|
|
46
|
+
# oissyntheticdata
|
|
47
|
+
|
|
48
|
+
**Pure-Python sequential CART synthesis — in the `synthpop` tradition, with zero third-party dependencies.**
|
|
49
|
+
|
|
50
|
+
> An **OIS** tool · [ois.co.il](https://ois.co.il) · maintained by Dr Yohanan Ouaknine
|
|
51
|
+
> ([ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351))
|
|
52
|
+
|
|
53
|
+
`oissyntheticdata` generates a synthetic copy of a sensitive dataset that preserves the
|
|
54
|
+
*relationships between variables*, not just each column's marginal shape. It is
|
|
55
|
+
built for the secure-research workflow used by statistical agencies: **develop
|
|
56
|
+
and debug your analysis on the synthetic data off-site, then run the final code
|
|
57
|
+
on the real data on-premises and release only vetted aggregate results.**
|
|
58
|
+
|
|
59
|
+
It imports only the Python standard library (`csv`, `json`, `math`, `random`,
|
|
60
|
+
`statistics`, `zipfile`, `xml.etree`), so it can run inside a locked secure
|
|
61
|
+
environment with no `pip install` and is small enough to read and audit in full.
|
|
62
|
+
|
|
63
|
+
The approach was first deployed in a secure justice-research setting (a study of
|
|
64
|
+
terrorist recidivism after the 2011 Shalit prisoner exchange, run on-premises at
|
|
65
|
+
the Israel Prison Service under Research Committee authorization); this package
|
|
66
|
+
generalises and open-sources it. OIS offers deployment, validation, and training services
|
|
67
|
+
to government research units and academic researchers around the open core.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Why this exists
|
|
72
|
+
|
|
73
|
+
This follows a well-established paradigm in statistical disclosure control. The
|
|
74
|
+
synthetic data is *test data* that should resemble the real data closely but is
|
|
75
|
+
never used for final inference; the code developed on it is what gets run on the
|
|
76
|
+
confidential data (Nowok, Raab & Dibben 2016; US Census Bureau SIPP Synthetic
|
|
77
|
+
Beta). `oissyntheticdata` is a dependency-free re-implementation of the core engine those
|
|
78
|
+
tools use — **sequential CART synthesis** (Reiter 2005) — packaged for locked
|
|
79
|
+
environments.
|
|
80
|
+
|
|
81
|
+
It complements a metadata-only synthesizer (which preserves each column's shape
|
|
82
|
+
but not the joint structure): `oissyntheticdata` fits on the real microdata on-premises
|
|
83
|
+
and therefore reproduces conditional relationships, at the cost of touching raw
|
|
84
|
+
records (so it must run inside the secure environment).
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## How it works (the engine)
|
|
89
|
+
|
|
90
|
+
Synthesis proceeds **one column at a time** in a chosen visit order:
|
|
91
|
+
|
|
92
|
+
1. **First column** — drawn from its own empirical marginal, with cells smaller
|
|
93
|
+
than `min_leaf` suppressed.
|
|
94
|
+
2. **Each later column `Y`** — a CART (classification tree if `Y` is categorical,
|
|
95
|
+
regression tree if continuous) is grown on the **real data** to predict `Y`
|
|
96
|
+
from the columns already synthesized. Every leaf keeps the list of *real*
|
|
97
|
+
`Y` values that reached it (its "donors").
|
|
98
|
+
3. **Drawing** — for each synthetic row, route it down the tree using the values
|
|
99
|
+
already generated for that row, reach a leaf, and **sample a donor** from that
|
|
100
|
+
leaf (optionally jittered for continuous columns). Sampling from donors — not
|
|
101
|
+
predicting a point — is what reproduces the conditional distribution.
|
|
102
|
+
|
|
103
|
+
Because each column is predicted from the previously synthesized columns, the
|
|
104
|
+
joint distribution is assembled sequentially (the standard `synthpop` approach).
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
visit: c1 -> c2 -> c3 -> ...
|
|
108
|
+
c1 ~ marginal(c1)
|
|
109
|
+
c2 ~ leaf_donor( CART(c2 ~ c1) , synthetic c1 )
|
|
110
|
+
c3 ~ leaf_donor( CART(c3 ~ c1,c2) , synthetic c1,c2 )
|
|
111
|
+
...
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Confidentiality model
|
|
117
|
+
|
|
118
|
+
- **`min_leaf` (k, default 5):** no leaf and no marginal cell is built from fewer
|
|
119
|
+
than `k` real records, so every drawn value comes from a pool of ≥ k individuals and is never
|
|
120
|
+
traceable to one person. This also caps tree depth and prevents the tree from
|
|
121
|
+
memorizing individuals.
|
|
122
|
+
- **`smoothing` (default 0):** optional Gaussian jitter on continuous donors,
|
|
123
|
+
bounded to the leaf's range, so exact real values are not echoed verbatim.
|
|
124
|
+
- **`drop`:** direct identifiers (national ID, names, record keys) should be
|
|
125
|
+
dropped before synthesis — `oissyntheticdata` does not attempt to anonymize them.
|
|
126
|
+
- **Only synthetic data leaves; the real data never does.** The intended use is
|
|
127
|
+
to take the synthetic file off-site for development and re-run final code on the
|
|
128
|
+
real data in place.
|
|
129
|
+
|
|
130
|
+
`oissyntheticdata` is a disclosure-control aid, not a formal privacy guarantee. For a
|
|
131
|
+
mathematical guarantee, combine it with differential privacy or apply output
|
|
132
|
+
checking (statistical disclosure control) to anything released.
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Design decisions and trade-offs
|
|
137
|
+
|
|
138
|
+
The value of `oissyntheticdata` is in its design choices, which are deliberately narrow:
|
|
139
|
+
|
|
140
|
+
- **Where the synthesizer may run is a first-class concern.** `oissyntheticdata` fits on
|
|
141
|
+
real microdata to preserve joint structure, so it runs *on-premises*; only the
|
|
142
|
+
synthetic output leaves. A metadata-only synthesizer can run off-site but
|
|
143
|
+
preserves only per-column structure. Choosing fidelity-with-on-prem-execution
|
|
144
|
+
over portability-with-lower-fidelity is intentional, and the two roles are kept
|
|
145
|
+
as separate tools so the confidentiality reasoning stays explicit.
|
|
146
|
+
- **Donor-leaf sampling, not point prediction.** Drawing a real value from the
|
|
147
|
+
matching leaf reproduces the conditional distribution; predicting a mean would
|
|
148
|
+
not.
|
|
149
|
+
- **One confidentiality invariant.** `min_leaf` (`k`) applies the same `k`-record
|
|
150
|
+
floor to every marginal cell, tree leaf, fan-out estimate, and surrogate key,
|
|
151
|
+
instead of scattering ad hoc thresholds.
|
|
152
|
+
- **Relational by conditioning, not joining.** Children are synthesized
|
|
153
|
+
conditioned on the parent's synthetic attributes and linked by surrogate keys,
|
|
154
|
+
preserving referential integrity without materialising a real join.
|
|
155
|
+
- **Build on, don't reinvent.** The estimator is the established CART-synthesis
|
|
156
|
+
method; the new work is the dependency-free, auditable, relational realisation
|
|
157
|
+
for locked environments.
|
|
158
|
+
|
|
159
|
+
Scope boundaries are equally deliberate: single-parent schemas only, no enforced
|
|
160
|
+
high-order interactions or arithmetic identities, and no formal privacy guarantee
|
|
161
|
+
(see Limitations).
|
|
162
|
+
|
|
163
|
+
## Governance, support & contributing
|
|
164
|
+
|
|
165
|
+
`oissyntheticdata` is maintained in the open under the MIT license. Questions, bug reports,
|
|
166
|
+
and change proposals go through public GitHub Issues and Pull Requests; see
|
|
167
|
+
[`CONTRIBUTING.md`](CONTRIBUTING.md). Decisions are made by the maintainer(s)
|
|
168
|
+
listed in [`CITATION.cff`](CITATION.cff) via the public issue/PR process. There is
|
|
169
|
+
no private support channel — keeping development and discussion public is part of
|
|
170
|
+
the project's auditability goal. Releases are versioned and recorded in
|
|
171
|
+
[`CHANGELOG.md`](CHANGELOG.md).
|
|
172
|
+
|
|
173
|
+
## Generative AI disclosure
|
|
174
|
+
|
|
175
|
+
A generative AI assistant (Claude, Anthropic) was used to help draft and refactor
|
|
176
|
+
parts of the code and documentation. All output was reviewed, tested, and edited
|
|
177
|
+
by the author(s), who take full responsibility for the design, correctness, and
|
|
178
|
+
integrity of the software. The design decisions and abstractions above, and the
|
|
179
|
+
testing and documentation practices, are the author(s)' own. Contributors are
|
|
180
|
+
asked to disclose non-trivial AI assistance (see `CONTRIBUTING.md`).
|
|
181
|
+
|
|
182
|
+
## Install
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
pip install oissyntheticdata
|
|
186
|
+
# or, in a locked environment, just copy the oissyntheticdata/ folder next to your code
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
No dependencies. Python 3.7+.
|
|
190
|
+
|
|
191
|
+
## Usage
|
|
192
|
+
|
|
193
|
+
Command line:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
python -m oissyntheticdata real.csv -o synthetic.csv --drop national_id --min-leaf 5
|
|
197
|
+
python -m oissyntheticdata data.xlsx -o synthetic.csv --visit "age,offense,violent" --smoothing 0.5
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Library:
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
import oissyntheticdata
|
|
204
|
+
|
|
205
|
+
# one call
|
|
206
|
+
oissyntheticdata.synthesize_file("real.csv", "synthetic.csv",
|
|
207
|
+
drop=["national_id"], min_leaf=5)
|
|
208
|
+
|
|
209
|
+
# or step by step
|
|
210
|
+
header, cols = oissyntheticdata.read_table("real.xlsx")
|
|
211
|
+
out_header, out_cols = oissyntheticdata.synthesize(header, cols,
|
|
212
|
+
drop=["national_id"], min_leaf=5)
|
|
213
|
+
oissyntheticdata.write_table("synthetic.csv", out_header, out_cols)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Key parameters: `n` (number of synthetic rows; defaults to the number of real rows), `visit` (column order),
|
|
217
|
+
`drop` (identifiers to exclude), `min_leaf` (k), `max_depth`, `smoothing`, `seed`.
|
|
218
|
+
|
|
219
|
+
### Related tables (multi-table synthesis)
|
|
220
|
+
|
|
221
|
+
For data split across linked tables (e.g. one row per inmate, many judgements per
|
|
222
|
+
inmate), `synthesize_relational` keeps **referential integrity** and the
|
|
223
|
+
**parent → child structure**:
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
import oissyntheticdata
|
|
227
|
+
|
|
228
|
+
oissyntheticdata.synthesize_relational_files(
|
|
229
|
+
{"inmates": "inmates.csv", "judgements": "judgements.csv"},
|
|
230
|
+
schema={
|
|
231
|
+
"inmates": {"key": "prisoner_id"},
|
|
232
|
+
"judgements": {"key": "judgement_id",
|
|
233
|
+
"parent": "inmates", "foreign_key": "prisoner_id"},
|
|
234
|
+
},
|
|
235
|
+
out_dir="out", min_leaf=5,
|
|
236
|
+
)
|
|
237
|
+
# -> out/synthetic_inmates.csv, out/synthetic_judgements.csv
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
How it works: the parent is synthesized first and given fresh surrogate keys; a
|
|
241
|
+
regression CART models how many children each parent has (the fan-out) from the
|
|
242
|
+
parent's attributes; and each child's attributes are synthesized **conditioned on
|
|
243
|
+
its parent's synthetic attributes**. The result: every synthetic foreign key
|
|
244
|
+
points at a synthetic parent (0 orphan joins), the number of children per parent
|
|
245
|
+
is realistic, and parent → child relationships survive (e.g. high-risk parents
|
|
246
|
+
keep their child-row patterns). Supports a single-parent DAG — star, snowflake,
|
|
247
|
+
and parent → child → grandchild chains.
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Limitations
|
|
252
|
+
|
|
253
|
+
- Fits on real microdata, so **run it on-premises**; the synthetic *output* is
|
|
254
|
+
what you take off-site.
|
|
255
|
+
- Relational synthesis covers a single-parent DAG (star / snowflake / chains).
|
|
256
|
+
Many-to-many relationships and compound keys are not modelled — pre-join or
|
|
257
|
+
pre-resolve them to a surrogate key first.
|
|
258
|
+
- CART captures pairwise/low-order structure well; very high-order interactions
|
|
259
|
+
and exact arithmetic identities (e.g. `rate = a/b`) are not enforced.
|
|
260
|
+
- Pure Python: comfortable up to a few thousand rows × a few dozen columns; larger
|
|
261
|
+
data is slower than a compiled implementation.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## Lineage & sources
|
|
266
|
+
|
|
267
|
+
- Rubin, D.B. (1993). *Statistical disclosure limitation.* J. Official Statistics 9(2).
|
|
268
|
+
- Little, R.J.A. (1993). *Statistical analysis of masked data.* J. Official Statistics 9(2).
|
|
269
|
+
- Reiter, J.P. (2005). *Using CART to generate partially synthetic public use microdata.*
|
|
270
|
+
J. Official Statistics 21(3).
|
|
271
|
+
- Reiter, Oganian & Karr (2009). *Verification servers.* Comput. Stat. Data Anal. 53(4):1475–1482.
|
|
272
|
+
https://doi.org/10.1016/j.csda.2008.10.006
|
|
273
|
+
- Nowok, Raab & Dibben (2016). *synthpop: Bespoke Creation of Synthetic Data in R.*
|
|
274
|
+
J. Statistical Software 74(11). https://doi.org/10.18637/jss.v074.i11
|
|
275
|
+
- Drechsler, J. (2011). *Synthetic Datasets for Statistical Disclosure Control.* Springer.
|
|
276
|
+
- US Census Bureau, *SIPP Synthetic Beta* + Cornell Synthetic Data Server (synthetic
|
|
277
|
+
development data + validation on confidential files).
|
|
278
|
+
|
|
279
|
+
## Maintainer
|
|
280
|
+
|
|
281
|
+
Dr **Yohanan Ouaknine** — OIS ([ois.co.il](https://ois.co.il)),
|
|
282
|
+
[yohanan.ouaknine@ois.co.il](mailto:yohanan.ouaknine@ois.co.il),
|
|
283
|
+
[ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351).
|
|
284
|
+
Department of Criminology, Ashkelon Academic College; formerly Head of the
|
|
285
|
+
Research Branch, Israel Prison Service.
|
|
286
|
+
|
|
287
|
+
## License
|
|
288
|
+
|
|
289
|
+
MIT — see `LICENSE`.
|
|
290
|
+
|
|
291
|
+
## Citation
|
|
292
|
+
|
|
293
|
+
If you use `oissyntheticdata`, please cite this software (see `CITATION.cff`) and
|
|
294
|
+
the methodological lineage above (Reiter 2005; Nowok, Raab & Dibben 2016). The
|
|
295
|
+
method was first applied in Ouaknine, Elisha & Hasisi (2026), *The Effect of Mass
|
|
296
|
+
Prisoner Release on Terrorist Recidivism: A Propensity Score Analysis of the Shalit
|
|
297
|
+
Deal* (in publication).
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# oissyntheticdata
|
|
2
|
+
|
|
3
|
+
**Pure-Python sequential CART synthesis — in the `synthpop` tradition, with zero third-party dependencies.**
|
|
4
|
+
|
|
5
|
+
> An **OIS** tool · [ois.co.il](https://ois.co.il) · maintained by Dr Yohanan Ouaknine
|
|
6
|
+
> ([ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351))
|
|
7
|
+
|
|
8
|
+
`oissyntheticdata` generates a synthetic copy of a sensitive dataset that preserves the
|
|
9
|
+
*relationships between variables*, not just each column's marginal shape. It is
|
|
10
|
+
built for the secure-research workflow used by statistical agencies: **develop
|
|
11
|
+
and debug your analysis on the synthetic data off-site, then run the final code
|
|
12
|
+
on the real data on-premises and release only vetted aggregate results.**
|
|
13
|
+
|
|
14
|
+
It imports only the Python standard library (`csv`, `json`, `math`, `random`,
|
|
15
|
+
`statistics`, `zipfile`, `xml.etree`), so it can run inside a locked secure
|
|
16
|
+
environment with no `pip install` and is small enough to read and audit in full.
|
|
17
|
+
|
|
18
|
+
The approach was first deployed in a secure justice-research setting (a study of
|
|
19
|
+
terrorist recidivism after the 2011 Shalit prisoner exchange, run on-premises at
|
|
20
|
+
the Israel Prison Service under Research Committee authorization); this package
|
|
21
|
+
generalises and open-sources it. OIS offers deployment, validation, and training services
|
|
22
|
+
to government research units and academic researchers around the open core.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Why this exists
|
|
27
|
+
|
|
28
|
+
This follows a well-established paradigm in statistical disclosure control. The
|
|
29
|
+
synthetic data is *test data* that should resemble the real data closely but is
|
|
30
|
+
never used for final inference; the code developed on it is what gets run on the
|
|
31
|
+
confidential data (Nowok, Raab & Dibben 2016; US Census Bureau SIPP Synthetic
|
|
32
|
+
Beta). `oissyntheticdata` is a dependency-free re-implementation of the core engine those
|
|
33
|
+
tools use — **sequential CART synthesis** (Reiter 2005) — packaged for locked
|
|
34
|
+
environments.
|
|
35
|
+
|
|
36
|
+
It complements a metadata-only synthesizer (which preserves each column's shape
|
|
37
|
+
but not the joint structure): `oissyntheticdata` fits on the real microdata on-premises
|
|
38
|
+
and therefore reproduces conditional relationships, at the cost of touching raw
|
|
39
|
+
records (so it must run inside the secure environment).
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## How it works (the engine)
|
|
44
|
+
|
|
45
|
+
Synthesis proceeds **one column at a time** in a chosen visit order:
|
|
46
|
+
|
|
47
|
+
1. **First column** — drawn from its own empirical marginal, with cells smaller
|
|
48
|
+
than `min_leaf` suppressed.
|
|
49
|
+
2. **Each later column `Y`** — a CART (classification tree if `Y` is categorical,
|
|
50
|
+
regression tree if continuous) is grown on the **real data** to predict `Y`
|
|
51
|
+
from the columns already synthesized. Every leaf keeps the list of *real*
|
|
52
|
+
`Y` values that reached it (its "donors").
|
|
53
|
+
3. **Drawing** — for each synthetic row, route it down the tree using the values
|
|
54
|
+
already generated for that row, reach a leaf, and **sample a donor** from that
|
|
55
|
+
leaf (optionally jittered for continuous columns). Sampling from donors — not
|
|
56
|
+
predicting a point — is what reproduces the conditional distribution.
|
|
57
|
+
|
|
58
|
+
Because each column is predicted from the previously synthesized columns, the
|
|
59
|
+
joint distribution is assembled sequentially (the standard `synthpop` approach).
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
visit: c1 -> c2 -> c3 -> ...
|
|
63
|
+
c1 ~ marginal(c1)
|
|
64
|
+
c2 ~ leaf_donor( CART(c2 ~ c1) , synthetic c1 )
|
|
65
|
+
c3 ~ leaf_donor( CART(c3 ~ c1,c2) , synthetic c1,c2 )
|
|
66
|
+
...
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Confidentiality model
|
|
72
|
+
|
|
73
|
+
- **`min_leaf` (k, default 5):** no leaf and no marginal cell is built from fewer
|
|
74
|
+
than `k` real records, so every drawn value comes from a pool of ≥ k individuals and is never
|
|
75
|
+
traceable to one person. This also caps tree depth and prevents the tree from
|
|
76
|
+
memorizing individuals.
|
|
77
|
+
- **`smoothing` (default 0):** optional Gaussian jitter on continuous donors,
|
|
78
|
+
bounded to the leaf's range, so exact real values are not echoed verbatim.
|
|
79
|
+
- **`drop`:** direct identifiers (national ID, names, record keys) should be
|
|
80
|
+
dropped before synthesis — `oissyntheticdata` does not attempt to anonymize them.
|
|
81
|
+
- **Only synthetic data leaves; the real data never does.** The intended use is
|
|
82
|
+
to take the synthetic file off-site for development and re-run final code on the
|
|
83
|
+
real data in place.
|
|
84
|
+
|
|
85
|
+
`oissyntheticdata` is a disclosure-control aid, not a formal privacy guarantee. For a
|
|
86
|
+
mathematical guarantee, combine it with differential privacy or apply output
|
|
87
|
+
checking (statistical disclosure control) to anything released.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Design decisions and trade-offs
|
|
92
|
+
|
|
93
|
+
The value of `oissyntheticdata` is in its design choices, which are deliberately narrow:
|
|
94
|
+
|
|
95
|
+
- **Where the synthesizer may run is a first-class concern.** `oissyntheticdata` fits on
|
|
96
|
+
real microdata to preserve joint structure, so it runs *on-premises*; only the
|
|
97
|
+
synthetic output leaves. A metadata-only synthesizer can run off-site but
|
|
98
|
+
preserves only per-column structure. Choosing fidelity-with-on-prem-execution
|
|
99
|
+
over portability-with-lower-fidelity is intentional, and the two roles are kept
|
|
100
|
+
as separate tools so the confidentiality reasoning stays explicit.
|
|
101
|
+
- **Donor-leaf sampling, not point prediction.** Drawing a real value from the
|
|
102
|
+
matching leaf reproduces the conditional distribution; predicting a mean would
|
|
103
|
+
not.
|
|
104
|
+
- **One confidentiality invariant.** `min_leaf` (`k`) applies the same `k`-record
|
|
105
|
+
floor to every marginal cell, tree leaf, fan-out estimate, and surrogate key,
|
|
106
|
+
instead of scattering ad hoc thresholds.
|
|
107
|
+
- **Relational by conditioning, not joining.** Children are synthesized
|
|
108
|
+
conditioned on the parent's synthetic attributes and linked by surrogate keys,
|
|
109
|
+
preserving referential integrity without materialising a real join.
|
|
110
|
+
- **Build on, don't reinvent.** The estimator is the established CART-synthesis
|
|
111
|
+
method; the new work is the dependency-free, auditable, relational realisation
|
|
112
|
+
for locked environments.
|
|
113
|
+
|
|
114
|
+
Scope boundaries are equally deliberate: single-parent schemas only, no enforced
|
|
115
|
+
high-order interactions or arithmetic identities, and no formal privacy guarantee
|
|
116
|
+
(see Limitations).
|
|
117
|
+
|
|
118
|
+
## Governance, support & contributing
|
|
119
|
+
|
|
120
|
+
`oissyntheticdata` is maintained in the open under the MIT license. Questions, bug reports,
|
|
121
|
+
and change proposals go through public GitHub Issues and Pull Requests; see
|
|
122
|
+
[`CONTRIBUTING.md`](CONTRIBUTING.md). Decisions are made by the maintainer(s)
|
|
123
|
+
listed in [`CITATION.cff`](CITATION.cff) via the public issue/PR process. There is
|
|
124
|
+
no private support channel — keeping development and discussion public is part of
|
|
125
|
+
the project's auditability goal. Releases are versioned and recorded in
|
|
126
|
+
[`CHANGELOG.md`](CHANGELOG.md).
|
|
127
|
+
|
|
128
|
+
## Generative AI disclosure
|
|
129
|
+
|
|
130
|
+
A generative AI assistant (Claude, Anthropic) was used to help draft and refactor
|
|
131
|
+
parts of the code and documentation. All output was reviewed, tested, and edited
|
|
132
|
+
by the author(s), who take full responsibility for the design, correctness, and
|
|
133
|
+
integrity of the software. The design decisions and abstractions above, and the
|
|
134
|
+
testing and documentation practices, are the author(s)' own. Contributors are
|
|
135
|
+
asked to disclose non-trivial AI assistance (see `CONTRIBUTING.md`).
|
|
136
|
+
|
|
137
|
+
## Install
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
pip install oissyntheticdata
|
|
141
|
+
# or, in a locked environment, just copy the oissyntheticdata/ folder next to your code
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
No dependencies. Python 3.7+.
|
|
145
|
+
|
|
146
|
+
## Usage
|
|
147
|
+
|
|
148
|
+
Command line:
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
python -m oissyntheticdata real.csv -o synthetic.csv --drop national_id --min-leaf 5
|
|
152
|
+
python -m oissyntheticdata data.xlsx -o synthetic.csv --visit "age,offense,violent" --smoothing 0.5
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Library:
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
import oissyntheticdata
|
|
159
|
+
|
|
160
|
+
# one call
|
|
161
|
+
oissyntheticdata.synthesize_file("real.csv", "synthetic.csv",
|
|
162
|
+
drop=["national_id"], min_leaf=5)
|
|
163
|
+
|
|
164
|
+
# or step by step
|
|
165
|
+
header, cols = oissyntheticdata.read_table("real.xlsx")
|
|
166
|
+
out_header, out_cols = oissyntheticdata.synthesize(header, cols,
|
|
167
|
+
drop=["national_id"], min_leaf=5)
|
|
168
|
+
oissyntheticdata.write_table("synthetic.csv", out_header, out_cols)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Key parameters: `n` (number of synthetic rows; defaults to the number of real rows), `visit` (column order),
|
|
172
|
+
`drop` (identifiers to exclude), `min_leaf` (k), `max_depth`, `smoothing`, `seed`.
|
|
173
|
+
|
|
174
|
+
### Related tables (multi-table synthesis)
|
|
175
|
+
|
|
176
|
+
For data split across linked tables (e.g. one row per inmate, many judgements per
|
|
177
|
+
inmate), `synthesize_relational` keeps **referential integrity** and the
|
|
178
|
+
**parent → child structure**:
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
import oissyntheticdata
|
|
182
|
+
|
|
183
|
+
oissyntheticdata.synthesize_relational_files(
|
|
184
|
+
{"inmates": "inmates.csv", "judgements": "judgements.csv"},
|
|
185
|
+
schema={
|
|
186
|
+
"inmates": {"key": "prisoner_id"},
|
|
187
|
+
"judgements": {"key": "judgement_id",
|
|
188
|
+
"parent": "inmates", "foreign_key": "prisoner_id"},
|
|
189
|
+
},
|
|
190
|
+
out_dir="out", min_leaf=5,
|
|
191
|
+
)
|
|
192
|
+
# -> out/synthetic_inmates.csv, out/synthetic_judgements.csv
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
How it works: the parent is synthesized first and given fresh surrogate keys; a
|
|
196
|
+
regression CART models how many children each parent has (the fan-out) from the
|
|
197
|
+
parent's attributes; and each child's attributes are synthesized **conditioned on
|
|
198
|
+
its parent's synthetic attributes**. The result: every synthetic foreign key
|
|
199
|
+
points at a synthetic parent (0 orphan joins), the number of children per parent
|
|
200
|
+
is realistic, and parent → child relationships survive (e.g. high-risk parents
|
|
201
|
+
keep their child-row patterns). Supports a single-parent DAG — star, snowflake,
|
|
202
|
+
and parent → child → grandchild chains.
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Limitations
|
|
207
|
+
|
|
208
|
+
- Fits on real microdata, so **run it on-premises**; the synthetic *output* is
|
|
209
|
+
what you take off-site.
|
|
210
|
+
- Relational synthesis covers a single-parent DAG (star / snowflake / chains).
|
|
211
|
+
Many-to-many relationships and compound keys are not modelled — pre-join or
|
|
212
|
+
pre-resolve them to a surrogate key first.
|
|
213
|
+
- CART captures pairwise/low-order structure well; very high-order interactions
|
|
214
|
+
and exact arithmetic identities (e.g. `rate = a/b`) are not enforced.
|
|
215
|
+
- Pure Python: comfortable up to a few thousand rows × a few dozen columns; larger
|
|
216
|
+
data is slower than a compiled implementation.
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Lineage & sources
|
|
221
|
+
|
|
222
|
+
- Rubin, D.B. (1993). *Statistical disclosure limitation.* J. Official Statistics 9(2).
|
|
223
|
+
- Little, R.J.A. (1993). *Statistical analysis of masked data.* J. Official Statistics 9(2).
|
|
224
|
+
- Reiter, J.P. (2005). *Using CART to generate partially synthetic public use microdata.*
|
|
225
|
+
J. Official Statistics 21(3).
|
|
226
|
+
- Reiter, Oganian & Karr (2009). *Verification servers.* Comput. Stat. Data Anal. 53(4):1475–1482.
|
|
227
|
+
https://doi.org/10.1016/j.csda.2008.10.006
|
|
228
|
+
- Nowok, Raab & Dibben (2016). *synthpop: Bespoke Creation of Synthetic Data in R.*
|
|
229
|
+
J. Statistical Software 74(11). https://doi.org/10.18637/jss.v074.i11
|
|
230
|
+
- Drechsler, J. (2011). *Synthetic Datasets for Statistical Disclosure Control.* Springer.
|
|
231
|
+
- US Census Bureau, *SIPP Synthetic Beta* + Cornell Synthetic Data Server (synthetic
|
|
232
|
+
development data + validation on confidential files).
|
|
233
|
+
|
|
234
|
+
## Maintainer
|
|
235
|
+
|
|
236
|
+
Dr **Yohanan Ouaknine** — OIS ([ois.co.il](https://ois.co.il)),
|
|
237
|
+
[yohanan.ouaknine@ois.co.il](mailto:yohanan.ouaknine@ois.co.il),
|
|
238
|
+
[ORCID 0000-0002-4186-7351](https://orcid.org/0000-0002-4186-7351).
|
|
239
|
+
Department of Criminology, Ashkelon Academic College; formerly Head of the
|
|
240
|
+
Research Branch, Israel Prison Service.
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
MIT — see `LICENSE`.
|
|
245
|
+
|
|
246
|
+
## Citation
|
|
247
|
+
|
|
248
|
+
If you use `oissyntheticdata`, please cite this software (see `CITATION.cff`) and
|
|
249
|
+
the methodological lineage above (Reiter 2005; Nowok, Raab & Dibben 2016). The
|
|
250
|
+
method was first applied in Ouaknine, Elisha & Hasisi (2026), *The Effect of Mass
|
|
251
|
+
Prisoner Release on Terrorist Recidivism: A Propensity Score Analysis of the Shalit
|
|
252
|
+
Deal* (in publication).
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
oissyntheticdata — pure-Python sequential CART synthesis, in the synthpop tradition.
|
|
4
|
+
|
|
5
|
+
Zero third-party dependencies (standard library only). Designed for secure
|
|
6
|
+
research environments: develop and debug your analysis on the synthetic data
|
|
7
|
+
off-site, then run the final code on the real data on-premises.
|
|
8
|
+
|
|
9
|
+
Single table
|
|
10
|
+
------------
|
|
11
|
+
import oissyntheticdata
|
|
12
|
+
oissyntheticdata.synthesize_file("real.csv", "synthetic.csv",
|
|
13
|
+
drop=["national_id"], min_leaf=5)
|
|
14
|
+
|
|
15
|
+
Related tables (referential integrity preserved)
|
|
16
|
+
-------------------------------------------------
|
|
17
|
+
oissyntheticdata.synthesize_relational_files(
|
|
18
|
+
{"inmates": "inmates.csv", "judgements": "judgements.csv"},
|
|
19
|
+
schema={
|
|
20
|
+
"inmates": {"key": "prisoner_id"},
|
|
21
|
+
"judgements": {"key": "judgement_id",
|
|
22
|
+
"parent": "inmates", "foreign_key": "prisoner_id"},
|
|
23
|
+
},
|
|
24
|
+
out_dir="out", min_leaf=5)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from ._io import read_table, write_table
|
|
28
|
+
from ._synth import synthesize
|
|
29
|
+
from ._relational import synthesize_relational, synthesize_relational_files
|
|
30
|
+
|
|
31
|
+
__version__ = "0.2.0"
|
|
32
|
+
__all__ = [
|
|
33
|
+
"read_table", "write_table", "synthesize", "synthesize_file",
|
|
34
|
+
"synthesize_relational", "synthesize_relational_files",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def synthesize_file(in_path, out_path, n=None, visit=None, drop=None,
                    min_leaf=5, max_depth=12, smoothing=0.0, seed=12345):
    """Synthesize one flat table from a CSV/XLSX file and save it as CSV.

    The table at ``in_path`` is loaded with ``read_table``, handed to
    ``synthesize`` together with the tuning options (``n``, ``visit``,
    ``drop``, ``min_leaf``, ``max_depth``, ``smoothing``, ``seed``), and the
    result is stored at ``out_path`` with ``write_table``.

    Returns a ``(row_count, column_count)`` tuple for the written table. The
    row count is the length of the first output column, or 0 when the
    synthetic table has no columns at all.
    """
    source_header, source_cols = read_table(in_path)

    options = dict(n=n, visit=visit, drop=drop, min_leaf=min_leaf,
                   max_depth=max_depth, smoothing=smoothing, seed=seed)
    synth_header, synth_cols = synthesize(source_header, source_cols, **options)

    write_table(out_path, synth_header, synth_cols)

    # Every column has the same length, so the first one gives the row count.
    if synth_header:
        row_count = len(synth_cols[synth_header[0]])
    else:
        row_count = 0
    return row_count, len(synth_header)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""Command-line interface: python -m oissyntheticdata real.csv -o synthetic.csv"""
|
|
3
|
+
|
|
4
|
+
import sys
|
|
5
|
+
import argparse
|
|
6
|
+
from . import synthesize_file, __version__
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main(argv=None):
    """Run the command-line interface.

    ``argv`` is passed straight to ``ArgumentParser.parse_args``. The parsed
    options are forwarded to ``synthesize_file`` and a one-line summary of the
    written table is reported on standard error.
    """
    def split_names(raw):
        # Turn "a, b,,c" into ["a", "b", "c"]; blank entries are discarded.
        stripped = (piece.strip() for piece in raw.split(","))
        return [name for name in stripped if name]

    parser = argparse.ArgumentParser(
        prog="oissyntheticdata",
        description="Pure-Python sequential CART synthesis (synthpop tradition, zero deps).")

    # (flags, keyword arguments) in the order they appear in --help.
    option_specs = (
        (("input",), dict(help="real CSV or XLSX file")),
        (("-o", "--output"), dict(default="synthetic.csv", help="output CSV path")),
        (("-n", "--rows"), dict(type=int, default=None, help="number of synthetic rows")),
        (("--drop",), dict(default="", help="comma-separated columns to exclude (e.g. identifiers)")),
        (("--visit",), dict(default="", help="comma-separated synthesis order (default: file order)")),
        (("--min-leaf",), dict(type=int, default=5, help="minimum real records per leaf/cell (k)")),
        (("--max-depth",), dict(type=int, default=12, help="maximum tree depth")),
        (("--smoothing",), dict(type=float, default=0.0, help="continuous jitter (0 = off)")),
        (("--seed",), dict(type=int, default=12345)),
        (("--version",), dict(action="version", version="oissyntheticdata " + __version__)),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    args = parser.parse_args(argv)

    excluded = split_names(args.drop)
    order = split_names(args.visit)
    if not order:
        # No explicit order given: pass None rather than an empty list.
        order = None

    n_rows, n_cols = synthesize_file(
        args.input, args.output,
        n=args.rows, visit=order, drop=excluded,
        min_leaf=args.min_leaf, max_depth=args.max_depth,
        smoothing=args.smoothing, seed=args.seed)

    summary = "[oissyntheticdata] wrote %d rows x %d cols -> %s\n" % (n_rows, n_cols, args.output)
    sys.stderr.write(summary)


if __name__ == "__main__":
    main()
|