profilefoundry 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- profilefoundry-1.0.0/CITATION.cff +12 -0
- profilefoundry-1.0.0/LICENSE +34 -0
- profilefoundry-1.0.0/MANIFEST.in +2 -0
- profilefoundry-1.0.0/PKG-INFO +193 -0
- profilefoundry-1.0.0/README.md +149 -0
- profilefoundry-1.0.0/data/reference/MANIFEST.md +139 -0
- profilefoundry-1.0.0/data/reference/bootstrap/INDEX.json +17 -0
- profilefoundry-1.0.0/data/reference/bootstrap/SCHEMA.md +160 -0
- profilefoundry-1.0.0/data/reference/bootstrap/au/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/au/education.json +14 -0
- profilefoundry-1.0.0/data/reference/bootstrap/au/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/au/marital.json +32 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ca/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ca/education.json +14 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ca/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ca/marital.json +32 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ie/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ie/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/in/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/in/education.json +33 -0
- profilefoundry-1.0.0/data/reference/bootstrap/in/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/in/marital.json +37 -0
- profilefoundry-1.0.0/data/reference/bootstrap/nz/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/nz/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ph/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/ph/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/uk/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/uk/education.json +14 -0
- profilefoundry-1.0.0/data/reference/bootstrap/uk/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/uk/marital.json +32 -0
- profilefoundry-1.0.0/data/reference/bootstrap/uk/race_ethnicity.json +24 -0
- profilefoundry-1.0.0/data/reference/bootstrap/us/age_sex.json +15 -0
- profilefoundry-1.0.0/data/reference/bootstrap/us/education.json +14 -0
- profilefoundry-1.0.0/data/reference/bootstrap/us/household.json +22 -0
- profilefoundry-1.0.0/data/reference/bootstrap/us/income.json +38 -0
- profilefoundry-1.0.0/data/reference/bootstrap/us/marital.json +32 -0
- profilefoundry-1.0.0/data/reference/bootstrap/us/race_ethnicity.json +23 -0
- profilefoundry-1.0.0/data/reference/bootstrap/us/regions.json +65 -0
- profilefoundry-1.0.0/data/reference/geo/AU_cities.json +36 -0
- profilefoundry-1.0.0/data/reference/geo/CA_cities.json +43 -0
- profilefoundry-1.0.0/data/reference/geo/IE_cities.json +30 -0
- profilefoundry-1.0.0/data/reference/geo/IN_cities.json +71 -0
- profilefoundry-1.0.0/data/reference/geo/NZ_cities.json +30 -0
- profilefoundry-1.0.0/data/reference/geo/PH_cities.json +43 -0
- profilefoundry-1.0.0/data/reference/geo/UK_cities.json +51 -0
- profilefoundry-1.0.0/data/reference/geo/US_cities.json +71 -0
- profilefoundry-1.0.0/data/reference/names/IN_given.json +29 -0
- profilefoundry-1.0.0/data/reference/names/IN_surnames.json +44 -0
- profilefoundry-1.0.0/data/reference/names/PH_given.json +29 -0
- profilefoundry-1.0.0/data/reference/names/PH_surnames.json +25 -0
- profilefoundry-1.0.0/pyproject.toml +106 -0
- profilefoundry-1.0.0/setup.cfg +4 -0
- profilefoundry-1.0.0/src/profilefoundry/__init__.py +15 -0
- profilefoundry-1.0.0/src/profilefoundry/cli.py +361 -0
- profilefoundry-1.0.0/src/profilefoundry/data/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/data/loader.py +80 -0
- profilefoundry-1.0.0/src/profilefoundry/data/manifest_hash.py +54 -0
- profilefoundry-1.0.0/src/profilefoundry/data/paths.py +33 -0
- profilefoundry-1.0.0/src/profilefoundry/diversity/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/documents/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/address.py +659 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/backfill.py +810 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/contact_ids.py +1314 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/education_status.py +244 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/employer_names.py +180 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/factory.py +1178 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/finance_health.py +525 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/identity.py +316 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/locales/__init__.py +201 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/occupation.py +254 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/sampling.py +70 -0
- profilefoundry-1.0.0/src/profilefoundry/generate/seeding.py +38 -0
- profilefoundry-1.0.0/src/profilefoundry/io/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/io/hf_export.py +1454 -0
- profilefoundry-1.0.0/src/profilefoundry/linkage/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/linkage/employers.py +361 -0
- profilefoundry-1.0.0/src/profilefoundry/linkage/families.py +176 -0
- profilefoundry-1.0.0/src/profilefoundry/linkage/households.py +205 -0
- profilefoundry-1.0.0/src/profilefoundry/linkage/orchestrator.py +476 -0
- profilefoundry-1.0.0/src/profilefoundry/load/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/schema/__init__.py +78 -0
- profilefoundry-1.0.0/src/profilefoundry/schema/v0_1.py +457 -0
- profilefoundry-1.0.0/src/profilefoundry/validate/__init__.py +0 -0
- profilefoundry-1.0.0/src/profilefoundry/validate/consistency.py +105 -0
- profilefoundry-1.0.0/src/profilefoundry/validate/distributional.py +213 -0
- profilefoundry-1.0.0/src/profilefoundry/validate/leakage.py +376 -0
- profilefoundry-1.0.0/src/profilefoundry/validate/replay.py +130 -0
- profilefoundry-1.0.0/src/profilefoundry.egg-info/PKG-INFO +193 -0
- profilefoundry-1.0.0/src/profilefoundry.egg-info/SOURCES.txt +103 -0
- profilefoundry-1.0.0/src/profilefoundry.egg-info/dependency_links.txt +1 -0
- profilefoundry-1.0.0/src/profilefoundry.egg-info/entry_points.txt +2 -0
- profilefoundry-1.0.0/src/profilefoundry.egg-info/requires.txt +27 -0
- profilefoundry-1.0.0/src/profilefoundry.egg-info/top_level.txt +1 -0
- profilefoundry-1.0.0/tests/test_cli.py +277 -0
- profilefoundry-1.0.0/tests/test_export_normalized.py +622 -0
- profilefoundry-1.0.0/tests/test_factory.py +92 -0
- profilefoundry-1.0.0/tests/test_hf_release_current.py +94 -0
- profilefoundry-1.0.0/tests/test_linkage.py +146 -0
- profilefoundry-1.0.0/tests/test_loader.py +57 -0
- profilefoundry-1.0.0/tests/test_push_to_hf.py +96 -0
- profilefoundry-1.0.0/tests/test_reproducibility.py +61 -0
- profilefoundry-1.0.0/tests/test_update_leakage_report.py +142 -0
- profilefoundry-1.0.0/tests/test_validation.py +153 -0
- profilefoundry-1.0.0/tests/test_wikidata_ingest_robustness.py +87 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
cff-version: 1.2.0
|
|
2
|
+
message: "If you use ProfileFoundry, please cite the paper when available; until then, cite the repository."
|
|
3
|
+
type: software
|
|
4
|
+
title: "ProfileFoundry: An Audited Person-Object Substrate for Stateful NLP and Agent Evaluation"
|
|
5
|
+
authors:
|
|
6
|
+
- family-names: Selvam
|
|
7
|
+
given-names: Sriram
|
|
8
|
+
repository-code: "https://github.com/selvamsriram/ProfileFoundry"
|
|
9
|
+
url: "https://github.com/selvamsriram/ProfileFoundry"
|
|
10
|
+
version: "1.0.0"
|
|
11
|
+
date-released: "2026-06-14"
|
|
12
|
+
license: "LicenseRef-ProfileFoundry-Citation-1.0"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
ProfileFoundry Citation License 1.0
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Sriram Selvam
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to use,
|
|
7
|
+
copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
|
8
|
+
the Software, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
1. Attribution and citation. Any public use, redistribution, derivative work,
|
|
11
|
+
benchmark, dataset, paper, report, model card, data card, product
|
|
12
|
+
documentation, or repository that uses or includes the Software or generated
|
|
13
|
+
ProfileFoundry artifacts must provide reasonable attribution to
|
|
14
|
+
ProfileFoundry. Cite the ProfileFoundry paper when it is available. Until
|
|
15
|
+
then, cite the ProfileFoundry repository:
|
|
16
|
+
|
|
17
|
+
ProfileFoundry: An Audited Person-Object Substrate for Stateful NLP
|
|
18
|
+
and Agent Evaluation. https://github.com/selvamsriram/ProfileFoundry
|
|
19
|
+
|
|
20
|
+
2. Notice preservation. The above copyright notice, this license, and any
|
|
21
|
+
citation instructions distributed with the Software must be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
3. Reference data. Embedded reference data and generated datasets may carry
|
|
25
|
+
additional upstream attribution requirements. Those notices must be
|
|
26
|
+
preserved when redistributing derived artifacts.
|
|
27
|
+
|
|
28
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
29
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
30
|
+
FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
31
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
|
|
32
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM,
|
|
33
|
+
OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
34
|
+
SOFTWARE.
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: profilefoundry
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An open-source generator for structured, internally-consistent, linked, temporally coherent synthetic Person Objects.
|
|
5
|
+
Author: Sriram Selvam
|
|
6
|
+
License-Expression: LicenseRef-ProfileFoundry-Citation-1.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/selvamsriram/ProfileFoundry
|
|
8
|
+
Project-URL: Issues, https://github.com/selvamsriram/ProfileFoundry/issues
|
|
9
|
+
Keywords: synthetic-data,pii,person-object,linkage,evaluation,privacy
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pydantic>=2.6
|
|
20
|
+
Requires-Dist: faker>=24.0
|
|
21
|
+
Requires-Dist: numpy>=1.26
|
|
22
|
+
Requires-Dist: pandas>=2.1
|
|
23
|
+
Requires-Dist: pyarrow>=15.0
|
|
24
|
+
Requires-Dist: click>=8.1
|
|
25
|
+
Requires-Dist: rapidfuzz>=3.6
|
|
26
|
+
Requires-Dist: pybloom-live>=4.0
|
|
27
|
+
Requires-Dist: requests>=2.31
|
|
28
|
+
Requires-Dist: tqdm>=4.66
|
|
29
|
+
Requires-Dist: python-dotenv>=1.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-cov>=4.1; extra == "dev"
|
|
33
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
34
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
35
|
+
Requires-Dist: ipython>=8.20; extra == "dev"
|
|
36
|
+
Provides-Extra: viz
|
|
37
|
+
Requires-Dist: matplotlib>=3.8; extra == "viz"
|
|
38
|
+
Requires-Dist: seaborn>=0.13; extra == "viz"
|
|
39
|
+
Requires-Dist: plotly>=5.20; extra == "viz"
|
|
40
|
+
Provides-Extra: hf
|
|
41
|
+
Requires-Dist: datasets>=2.18; extra == "hf"
|
|
42
|
+
Requires-Dist: huggingface-hub>=0.23; extra == "hf"
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
|
|
45
|
+
# ProfileFoundry
|
|
46
|
+
|
|
47
|
+
> A structured, internally-consistent, linked, temporally coherent synthetic
|
|
48
|
+
> Person Object — and an open-source generator that produces them at scale.
|
|
49
|
+
|
|
50
|
+
This repository is the implementation home for ProfileFoundry, which builds
|
|
51
|
+
on the problem setting exposed by [PANORAMA](https://arxiv.org/abs/2505.12238)
|
|
52
|
+
(Selvam & Ghosh, 2025). It provides a deterministic generator, schema,
|
|
53
|
+
validation suite, release reports, and Hugging Face export tooling for the
|
|
54
|
+
`ProfileFoundry-Core-100K` dataset.
|
|
55
|
+
|
|
56
|
+
The ACL/NeurIPS-style paper draft lives in [`Paper/`](Paper/).
|
|
57
|
+
The vetted 100K release package is staged for Hugging Face at
|
|
58
|
+
[`srirxml/ProfileFoundry-Core-100K`](https://huggingface.co/datasets/srirxml/ProfileFoundry-Core-100K);
|
|
59
|
+
run `python scripts/verify_hf_release_current.py` with an `HF_TOKEN` before
|
|
60
|
+
calling the remote artifact current. The default HF viewer table is
|
|
61
|
+
`person_objects.parquet`, a complete one-row-per-person view with nested
|
|
62
|
+
sections encoded as JSON strings; the remaining parquet files expose the
|
|
63
|
+
normalized relational schema.
|
|
64
|
+
|
|
65
|
+
## Status (2026-05-24) — v1.0 release sprint
|
|
66
|
+
|
|
67
|
+
| Phase | Status |
|
|
68
|
+
|---|---|
|
|
69
|
+
| **0 — Foundation** (schema and package scaffolding) | ✅ complete |
|
|
70
|
+
| **1 — Reference data** (manifest + bootstrap + loader) | ✅ ref data shipped for 5 validation locales |
|
|
71
|
+
| **2 — Generator factory** (port + joint constraints + age-gating) | ✅ deterministic across processes (set-iteration bug fixed) |
|
|
72
|
+
| **3 — Linkage** (households, employers, family graph) | ✅ household orchestrator + family edges + employer pool |
|
|
73
|
+
| **4 — Temporal** (event taxonomy, backfill, replay) | ✅ replay-valid non-credit timeline + typed payload export |
|
|
74
|
+
| **5a — Validation** (KS gaps, consistency) | ✅ disclosed KS gaps (max 0.124) · 100% consistency |
|
|
75
|
+
| **5b — Leakage audit** (Wikidata Bloom, reserved-domain email audit, self-collision) | ✅ Wikidata Bloom + self-collision + reserved-domain email syntax audit |
|
|
76
|
+
| **6 — Scale + publish** (100K + HF export) | ✅ 100K release · complete viewer table · normalized parquet star schema · MANIFEST.json |
|
|
77
|
+
| **7 — Reproducibility** (verify fixture, manifest hash) | ✅ normalized fixture stable across processes; verify script + fixture |
|
|
78
|
+
| **8 — Paper draft** | ✅ submission draft revised |
|
|
79
|
+
| **9 — HF push + tag v1.0.0** | ⏳ HF upload target documented; checking that the remote is current requires `HF_TOKEN`; tag pending |
|
|
80
|
+
|
|
81
|
+
## Quick start
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pip install -e ".[dev]"
|
|
85
|
+
profilefoundry verify # smoke-test all 8 locales
|
|
86
|
+
profilefoundry person --locale US # one US Person as JSON
|
|
87
|
+
profilefoundry household --locale US # one linked household (multiple Persons)
|
|
88
|
+
profilefoundry scale --n 1000 --locale UK --out /tmp/uk.jsonl
|
|
89
|
+
profilefoundry scale-households --n 500 --locale CA --out /tmp/ca_households.jsonl
|
|
90
|
+
profilefoundry validate --n 300 # KS gaps + leakage + consistency
|
|
91
|
+
profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24 --skip-hibp # normalized parquet star schema
|
|
92
|
+
profilefoundry scale-smoke --sizes 1000,10000,100000 # timings
|
|
93
|
+
|
|
94
|
+
# Full v1.0 release run (100K profiles in ~4-5 minutes on a laptop):
|
|
95
|
+
python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose
|
|
96
|
+
# Reproducibility check:
|
|
97
|
+
python scripts/verify_reproducibility.py
|
|
98
|
+
# Hugging Face upload preflight / push:
|
|
99
|
+
python scripts/push_to_hf.py --dry-run
|
|
100
|
+
python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K
|
|
101
|
+
# Confirm the public HF manifest matches the vetted local release:
|
|
102
|
+
python scripts/verify_hf_release_current.py
|
|
103
|
+
# Build the Wikidata bloom (multi-hour due to WDQS rate limit):
|
|
104
|
+
python scripts/ingest_wikidata_persons.py --scope sample --sleep 65 --verbose
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Or without installing:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
PYTHONPATH=src python3 -m profilefoundry.cli scale --n 1000 --locale US --out /tmp/us.jsonl
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## CLI reference
|
|
114
|
+
|
|
115
|
+
All commands are available through `profilefoundry` after installation, or via
|
|
116
|
+
`PYTHONPATH=src python3 -m profilefoundry.cli` from a checkout.
|
|
117
|
+
|
|
118
|
+
| Command | Purpose | Common example |
|
|
119
|
+
|---|---|---|
|
|
120
|
+
| `profilefoundry verify` | Generate one profile per supported locale as a smoke test. | `profilefoundry verify` |
|
|
121
|
+
| `profilefoundry person` | Print one deterministic Person Object as JSON. | `profilefoundry person --locale US --seed 4321 --profile-seq 1` |
|
|
122
|
+
| `profilefoundry household` | Print one linked household as JSON. | `profilefoundry household --locale UK --seed 4321 --seq 7` |
|
|
123
|
+
| `profilefoundry scale` | Generate flat JSONL profiles for one locale. | `profilefoundry scale --n 1000 --locale CA --out /tmp/ca.jsonl` |
|
|
124
|
+
| `profilefoundry scale-households` | Generate linked household members as JSONL. | `profilefoundry scale-households --n 500 --locale AU --out /tmp/au_households.jsonl` |
|
|
125
|
+
| `profilefoundry validate` | Run distributional, leakage, replay, and consistency checks. | `profilefoundry validate --n 300 --locales US,UK,IN --skip-hibp` |
|
|
126
|
+
| `profilefoundry export` | Write JSONL, parquet tables, manifest, and dataset card. | `profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24T00:00:00Z --skip-hibp` |
|
|
127
|
+
| `profilefoundry scale-smoke` | Time the generator at several sizes. | `profilefoundry scale-smoke --sizes 1000,10000,100000` |
|
|
128
|
+
|
|
129
|
+
Script-level release utilities:
|
|
130
|
+
|
|
131
|
+
| Script | Purpose |
|
|
132
|
+
|---|---|
|
|
133
|
+
| `python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose` | Rebuild the 100K release artifact under `data/raw/profilefoundry-core-v1/` and reports under `reports/v1_release/`. |
|
|
134
|
+
| `python scripts/verify_reproducibility.py` | Confirm the pinned reproducibility fixture still matches. |
|
|
135
|
+
| `python scripts/verify_package_reference_data.py` | Confirm reference data is available from an installed wheel. |
|
|
136
|
+
| `python scripts/push_to_hf.py --dry-run` | Preview the Hugging Face upload. |
|
|
137
|
+
| `python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K` | Upload the vetted release artifact. |
|
|
138
|
+
| `python scripts/verify_hf_release_current.py` | Compare the local release manifest with the Hugging Face dataset. Requires `HF_TOKEN` for private or gated repos. |
|
|
139
|
+
| `python scripts/export_schema.py --check` | Verify the checked-in JSON Schema matches the Pydantic model. |
|
|
140
|
+
| `python scripts/ingest_us_acs.py` / `python scripts/ingest_uk_ons.py` | Refresh reference-data inputs from upstream sources. |
|
|
141
|
+
|
|
142
|
+
## Project layout
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
.
|
|
146
|
+
├── README.md # this file
|
|
147
|
+
├── pyproject.toml # package metadata, deps, ruff/pytest config
|
|
148
|
+
├── notes/
|
|
149
|
+
│ ├── BUGS.tsv # resolved/known issue ledger
|
|
150
|
+
│ └── RELEASE_PLAN.md # v1.0 release plan
|
|
151
|
+
├── schemas/
|
|
152
|
+
│ ├── README.md
|
|
153
|
+
│ └── person_v0_1.schema.json # auto-exported JSON Schema
|
|
154
|
+
├── src/profilefoundry/
|
|
155
|
+
│ ├── schema/ # Pydantic Person Object v0.1 (source of truth)
|
|
156
|
+
│ ├── data/ # reference-data loader (bootstrap ↔ derived)
|
|
157
|
+
│ ├── generate/ # factory + sampling + locale providers
|
|
158
|
+
│ ├── linkage/ # (phase 3) households, employers, families
|
|
159
|
+
│ ├── validate/ # (phase 5) invariants, distributions, leakage
|
|
160
|
+
│ ├── io/ # (phase 6) Hugging Face export
|
|
161
|
+
│ └── cli.py # `profilefoundry` entry point
|
|
162
|
+
├── data/reference/
|
|
163
|
+
│ ├── MANIFEST.md # every external source enumerated
|
|
164
|
+
│ └── bootstrap/ # committed minimum reference marginals
|
|
165
|
+
├── scripts/
|
|
166
|
+
│ ├── export_schema.py # regenerate JSON Schema
|
|
167
|
+
│ └── ingest_us_acs.py # pull richer ACS tables live (needs API key)
|
|
168
|
+
├── tests/ # pytest invariants & unit tests
|
|
169
|
+
└── Paper/ # ACL-style paper draft + figures
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Testing
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
PYTHONPATH=src python3 -m pytest tests/
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
The full pytest suite covers schema invariants (Pydantic model), reference-data
|
|
179
|
+
loading, factory smoke checks, determinism, plausibility (age, employment,
|
|
180
|
+
addresses, education), **adults-only enforcement** (no profiles under 18),
|
|
181
|
+
linkage (households, employers, family graph), validation (KS gaps, replay,
|
|
182
|
+
consistency, leakage), release-report freshness, normalized export quality, and
|
|
183
|
+
reproducibility.
|
|
184
|
+
|
|
185
|
+
## License
|
|
186
|
+
|
|
187
|
+
- Code and SDK: **ProfileFoundry Citation License 1.0** (see `LICENSE`).
|
|
188
|
+
Public uses and redistributions must cite the ProfileFoundry paper when
|
|
189
|
+
available, or this repository until then. Machine-readable citation metadata
|
|
190
|
+
is in `CITATION.cff`.
|
|
191
|
+
- Generated dataset: **CC-BY-4.0**.
|
|
192
|
+
- Embedded reference data retains its upstream license; see
|
|
193
|
+
`data/reference/MANIFEST.md`.
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
# ProfileFoundry
|
|
2
|
+
|
|
3
|
+
> A structured, internally-consistent, linked, temporally coherent synthetic
|
|
4
|
+
> Person Object — and an open-source generator that produces them at scale.
|
|
5
|
+
|
|
6
|
+
This repository is the implementation home for ProfileFoundry, which builds
|
|
7
|
+
on the problem setting exposed by [PANORAMA](https://arxiv.org/abs/2505.12238)
|
|
8
|
+
(Selvam & Ghosh, 2025). It provides a deterministic generator, schema,
|
|
9
|
+
validation suite, release reports, and Hugging Face export tooling for the
|
|
10
|
+
`ProfileFoundry-Core-100K` dataset.
|
|
11
|
+
|
|
12
|
+
The ACL/NeurIPS-style paper draft lives in [`Paper/`](Paper/).
|
|
13
|
+
The vetted 100K release package is staged for Hugging Face at
|
|
14
|
+
[`srirxml/ProfileFoundry-Core-100K`](https://huggingface.co/datasets/srirxml/ProfileFoundry-Core-100K);
|
|
15
|
+
run `python scripts/verify_hf_release_current.py` with an `HF_TOKEN` before
|
|
16
|
+
calling the remote artifact current. The default HF viewer table is
|
|
17
|
+
`person_objects.parquet`, a complete one-row-per-person view with nested
|
|
18
|
+
sections encoded as JSON strings; the remaining parquet files expose the
|
|
19
|
+
normalized relational schema.
|
|
20
|
+
|
|
21
|
+
## Status (2026-05-24) — v1.0 release sprint
|
|
22
|
+
|
|
23
|
+
| Phase | Status |
|
|
24
|
+
|---|---|
|
|
25
|
+
| **0 — Foundation** (schema and package scaffolding) | ✅ complete |
|
|
26
|
+
| **1 — Reference data** (manifest + bootstrap + loader) | ✅ ref data shipped for 5 validation locales |
|
|
27
|
+
| **2 — Generator factory** (port + joint constraints + age-gating) | ✅ deterministic across processes (set-iteration bug fixed) |
|
|
28
|
+
| **3 — Linkage** (households, employers, family graph) | ✅ household orchestrator + family edges + employer pool |
|
|
29
|
+
| **4 — Temporal** (event taxonomy, backfill, replay) | ✅ replay-valid non-credit timeline + typed payload export |
|
|
30
|
+
| **5a — Validation** (KS gaps, consistency) | ✅ disclosed KS gaps (max 0.124) · 100% consistency |
|
|
31
|
+
| **5b — Leakage audit** (Wikidata Bloom, reserved-domain email audit, self-collision) | ✅ Wikidata Bloom + self-collision + reserved-domain email syntax audit |
|
|
32
|
+
| **6 — Scale + publish** (100K + HF export) | ✅ 100K release · complete viewer table · normalized parquet star schema · MANIFEST.json |
|
|
33
|
+
| **7 — Reproducibility** (verify fixture, manifest hash) | ✅ normalized fixture stable across processes; verify script + fixture |
|
|
34
|
+
| **8 — Paper draft** | ✅ submission draft revised |
|
|
35
|
+
| **9 — HF push + tag v1.0.0** | ⏳ HF upload target documented; checking that the remote is current requires `HF_TOKEN`; tag pending |
|
|
36
|
+
|
|
37
|
+
## Quick start
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -e ".[dev]"
|
|
41
|
+
profilefoundry verify # smoke-test all 8 locales
|
|
42
|
+
profilefoundry person --locale US # one US Person as JSON
|
|
43
|
+
profilefoundry household --locale US # one linked household (multiple Persons)
|
|
44
|
+
profilefoundry scale --n 1000 --locale UK --out /tmp/uk.jsonl
|
|
45
|
+
profilefoundry scale-households --n 500 --locale CA --out /tmp/ca_households.jsonl
|
|
46
|
+
profilefoundry validate --n 300 # KS gaps + leakage + consistency
|
|
47
|
+
profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24 --skip-hibp # normalized parquet star schema
|
|
48
|
+
profilefoundry scale-smoke --sizes 1000,10000,100000 # timings
|
|
49
|
+
|
|
50
|
+
# Full v1.0 release run (100K profiles in ~4-5 minutes on a laptop):
|
|
51
|
+
python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose
|
|
52
|
+
# Reproducibility check:
|
|
53
|
+
python scripts/verify_reproducibility.py
|
|
54
|
+
# Hugging Face upload preflight / push:
|
|
55
|
+
python scripts/push_to_hf.py --dry-run
|
|
56
|
+
python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K
|
|
57
|
+
# Confirm the public HF manifest matches the vetted local release:
|
|
58
|
+
python scripts/verify_hf_release_current.py
|
|
59
|
+
# Build the Wikidata bloom (multi-hour due to WDQS rate limit):
|
|
60
|
+
python scripts/ingest_wikidata_persons.py --scope sample --sleep 65 --verbose
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Or without installing:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
PYTHONPATH=src python3 -m profilefoundry.cli scale --n 1000 --locale US --out /tmp/us.jsonl
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## CLI reference
|
|
70
|
+
|
|
71
|
+
All commands are available through `profilefoundry` after installation, or via
|
|
72
|
+
`PYTHONPATH=src python3 -m profilefoundry.cli` from a checkout.
|
|
73
|
+
|
|
74
|
+
| Command | Purpose | Common example |
|
|
75
|
+
|---|---|---|
|
|
76
|
+
| `profilefoundry verify` | Generate one profile per supported locale as a smoke test. | `profilefoundry verify` |
|
|
77
|
+
| `profilefoundry person` | Print one deterministic Person Object as JSON. | `profilefoundry person --locale US --seed 4321 --profile-seq 1` |
|
|
78
|
+
| `profilefoundry household` | Print one linked household as JSON. | `profilefoundry household --locale UK --seed 4321 --seq 7` |
|
|
79
|
+
| `profilefoundry scale` | Generate flat JSONL profiles for one locale. | `profilefoundry scale --n 1000 --locale CA --out /tmp/ca.jsonl` |
|
|
80
|
+
| `profilefoundry scale-households` | Generate linked household members as JSONL. | `profilefoundry scale-households --n 500 --locale AU --out /tmp/au_households.jsonl` |
|
|
81
|
+
| `profilefoundry validate` | Run distributional, leakage, replay, and consistency checks. | `profilefoundry validate --n 300 --locales US,UK,IN --skip-hibp` |
|
|
82
|
+
| `profilefoundry export` | Write JSONL, parquet tables, manifest, and dataset card. | `profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24T00:00:00Z --skip-hibp` |
|
|
83
|
+
| `profilefoundry scale-smoke` | Time the generator at several sizes. | `profilefoundry scale-smoke --sizes 1000,10000,100000` |
|
|
84
|
+
|
|
85
|
+
Script-level release utilities:
|
|
86
|
+
|
|
87
|
+
| Script | Purpose |
|
|
88
|
+
|---|---|
|
|
89
|
+
| `python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose` | Rebuild the 100K release artifact under `data/raw/profilefoundry-core-v1/` and reports under `reports/v1_release/`. |
|
|
90
|
+
| `python scripts/verify_reproducibility.py` | Confirm the pinned reproducibility fixture still matches. |
|
|
91
|
+
| `python scripts/verify_package_reference_data.py` | Confirm reference data is available from an installed wheel. |
|
|
92
|
+
| `python scripts/push_to_hf.py --dry-run` | Preview the Hugging Face upload. |
|
|
93
|
+
| `python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K` | Upload the vetted release artifact. |
|
|
94
|
+
| `python scripts/verify_hf_release_current.py` | Compare the local release manifest with the Hugging Face dataset. Requires `HF_TOKEN` for private or gated repos. |
|
|
95
|
+
| `python scripts/export_schema.py --check` | Verify the checked-in JSON Schema matches the Pydantic model. |
|
|
96
|
+
| `python scripts/ingest_us_acs.py` / `python scripts/ingest_uk_ons.py` | Refresh reference-data inputs from upstream sources. |
|
|
97
|
+
|
|
98
|
+
## Project layout
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
.
|
|
102
|
+
├── README.md # this file
|
|
103
|
+
├── pyproject.toml # package metadata, deps, ruff/pytest config
|
|
104
|
+
├── notes/
|
|
105
|
+
│ ├── BUGS.tsv # resolved/known issue ledger
|
|
106
|
+
│ └── RELEASE_PLAN.md # v1.0 release plan
|
|
107
|
+
├── schemas/
|
|
108
|
+
│ ├── README.md
|
|
109
|
+
│ └── person_v0_1.schema.json # auto-exported JSON Schema
|
|
110
|
+
├── src/profilefoundry/
|
|
111
|
+
│ ├── schema/ # Pydantic Person Object v0.1 (source of truth)
|
|
112
|
+
│ ├── data/ # reference-data loader (bootstrap ↔ derived)
|
|
113
|
+
│ ├── generate/ # factory + sampling + locale providers
|
|
114
|
+
│ ├── linkage/ # (phase 3) households, employers, families
|
|
115
|
+
│ ├── validate/ # (phase 5) invariants, distributions, leakage
|
|
116
|
+
│ ├── io/ # (phase 6) Hugging Face export
|
|
117
|
+
│ └── cli.py # `profilefoundry` entry point
|
|
118
|
+
├── data/reference/
|
|
119
|
+
│ ├── MANIFEST.md # every external source enumerated
|
|
120
|
+
│ └── bootstrap/ # committed minimum reference marginals
|
|
121
|
+
├── scripts/
|
|
122
|
+
│ ├── export_schema.py # regenerate JSON Schema
|
|
123
|
+
│ └── ingest_us_acs.py # pull richer ACS tables live (needs API key)
|
|
124
|
+
├── tests/ # pytest invariants & unit tests
|
|
125
|
+
└── Paper/ # ACL-style paper draft + figures
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Testing
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
PYTHONPATH=src python3 -m pytest tests/
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
The full pytest suite covers schema invariants (Pydantic model), reference-data
|
|
135
|
+
loading, factory smoke checks, determinism, plausibility (age, employment,
|
|
136
|
+
addresses, education), **adults-only enforcement** (no profiles under 18),
|
|
137
|
+
linkage (households, employers, family graph), validation (KS gaps, replay,
|
|
138
|
+
consistency, leakage), release-report freshness, normalized export quality, and
|
|
139
|
+
reproducibility.
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
- Code and SDK: **ProfileFoundry Citation License 1.0** (see `LICENSE`).
|
|
144
|
+
Public uses and redistributions must cite the ProfileFoundry paper when
|
|
145
|
+
available, or this repository until then. Machine-readable citation metadata
|
|
146
|
+
is in `CITATION.cff`.
|
|
147
|
+
- Generated dataset: **CC-BY-4.0**.
|
|
148
|
+
- Embedded reference data retains its upstream license; see
|
|
149
|
+
`data/reference/MANIFEST.md`.
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Reference-data manifest
|
|
2
|
+
|
|
3
|
+
This manifest enumerates every external statistical source the ProfileFoundry
|
|
4
|
+
generator and validator consume. Three tiers of data live in this tree:
|
|
5
|
+
|
|
6
|
+
1. **`bootstrap/`** — hand-curated reference marginals committed to git.
|
|
7
|
+
These are small JSON/YAML files extracted from public summary tables of
|
|
8
|
+
the listed sources. They exist so the generator can run end-to-end without
|
|
9
|
+
network access and so the realism guarantees are reproducible from the
|
|
10
|
+
repo alone. Every bootstrap file links back to its source URL and the
|
|
11
|
+
retrieval date.
|
|
12
|
+
|
|
13
|
+
2. **`derived/`** — Parquet tables produced by the `scripts/ingest_*.py`
|
|
14
|
+
scripts when they pull richer joint distributions live from authoritative
|
|
15
|
+
sources. These are **not** committed to git (see `.gitignore`); they are
|
|
16
|
+
regenerated locally and used by validators. The hashes are recorded in
|
|
17
|
+
`derived/HASHES.json` for reproducibility.
|
|
18
|
+
|
|
19
|
+
3. **`cache/`** — raw downloaded payloads. Also not committed.
|
|
20
|
+
|
|
21
|
+
When the live `derived/` tables are present, the generator and validator
|
|
22
|
+
prefer them. When they are absent (a fresh checkout), bootstrap is used.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Validation-tier locales (Q3)
|
|
27
|
+
|
|
28
|
+
The five locales below get full distributional validation. Reference data for
|
|
29
|
+
each must be present in `bootstrap/` and is auto-promoted to `derived/` when
|
|
30
|
+
the corresponding ingestion script runs successfully.
|
|
31
|
+
|
|
32
|
+
### US — United States
|
|
33
|
+
|
|
34
|
+
| Quantity | Source | URL | Retrieved |
|
|
35
|
+
|---|---|---|---|
|
|
36
|
+
| Age × sex × race × Hispanic origin | ACS 2023 1-year, Table B01001 + B03002 | `https://api.census.gov/data/2023/acs/acs1` | TBD |
|
|
37
|
+
| Education attainment by age × sex | ACS 2023, Table B15001 | (same API) | TBD |
|
|
38
|
+
| Occupation distribution | ACS 2023, Table C24010 | (same API) | TBD |
|
|
39
|
+
| Household income deciles | ACS 2023, Table B19001 | (same API) | TBD |
|
|
40
|
+
| Household type × size | ACS 2023, Table B11016 | (same API) | TBD |
|
|
41
|
+
| Marital status by age × sex | ACS 2023, Table B12002 | (same API) | TBD |
|
|
42
|
+
| Given names by year × sex | SSA National Names | `https://www.ssa.gov/oact/babynames/names.zip` | TBD |
|
|
43
|
+
| Surnames (frequency, race) | US Census 2010 Frequent Surnames | `https://www.census.gov/topics/population/genealogy/data/2010_surnames.html` | TBD |
|
|
44
|
+
| ZCTA-level population | Census 2020 ZCTA totals | `https://www.census.gov/data/datasets/2020/dec/2020-zcta.html` | TBD |
|
|
45
|
+
| Industry × employer size | County Business Patterns 2022 | `https://www.census.gov/programs-surveys/cbp/data/datasets.html` | TBD |
|
|
46
|
+
|
|
47
|
+
License: US federal-government work, public domain.
|
|
48
|
+
|
|
49
|
+
### UK — United Kingdom
|
|
50
|
+
|
|
51
|
+
| Quantity | Source | URL | Retrieved |
|
|
52
|
+
|---|---|---|---|
|
|
53
|
+
| Age × sex × ethnicity | ONS 2021 Census, TS021, TS003 | `https://www.ons.gov.uk/datasets/create` | TBD |
|
|
54
|
+
| Education attainment | ONS 2021 Census, TS067 | (ONS Create-a-Dataset) | TBD |
|
|
55
|
+
| Occupation (SOC2020) | ONS 2021 Census, TS063 | (ONS Create-a-Dataset) | TBD |
|
|
56
|
+
| Household income | ONS HBAI / Family Resources Survey, FY2022/23 | `https://www.gov.uk/government/collections/family-resources-survey` | TBD |
|
|
57
|
+
| Household composition | ONS 2021 Census, TS003 | (ONS Create-a-Dataset) | TBD |
|
|
58
|
+
| Marital/civil partnership status | ONS 2021 Census, TS002 | (ONS Create-a-Dataset) | TBD |
|
|
59
|
+
| Given names by year (E&W) | ONS Baby Names datasets, latest year | `https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/livebirths/datasets/babynamesenglandandwalesbabynamesstatisticsgirls` | TBD |
|
|
60
|
+
| Surnames | UK Office for National Statistics Surnames archive (or 2011 Census-derived list, public republications) | `https://www.ons.gov.uk/` | TBD |
|
|
61
|
+
|
|
62
|
+
License: Open Government Licence v3.0 (attribution required).
|
|
63
|
+
|
|
64
|
+
### IN — India
|
|
65
|
+
|
|
66
|
+
| Quantity | Source | URL | Retrieved |
|
|
67
|
+
|---|---|---|---|
|
|
68
|
+
| Age × sex by state | Census of India 2011, C-13 | `https://censusindia.gov.in/census.website/data/census-tables` | TBD |
|
|
69
|
+
| Religion / caste category breakdown | Census 2011 + NSS rounds | (Census website) | TBD |
|
|
70
|
+
| Education attainment | NSS 75th round (Education) | `https://www.mospi.gov.in/` | TBD |
|
|
71
|
+
| Occupation (NCO-2015) | NSO PLFS 2022–23 | `https://www.mospi.gov.in/web/mospi/download-tables-data` | TBD |
|
|
72
|
+
| Household monthly income (MPCE) | NSS HCES 2022–23 | (MoSPI) | TBD |
|
|
73
|
+
| Household composition | Census 2011, H-series | (Census website) | TBD |
|
|
74
|
+
| Marital status by age × sex | Census 2011, C-2 | (Census website) | TBD |
|
|
75
|
+
| Given names | Public birth-record corpora (Bihar, Karnataka, etc. open data portals) + transliterated SSA-Indian-diaspora cross | Multiple; see ingest script | TBD |
|
|
76
|
+
| Surnames | Indian community-wise surname lists (publicly compiled) | Multiple | TBD |
|
|
77
|
+
|
|
78
|
+
License: most Government of India open data is published under the
|
|
79
|
+
National Data Sharing and Accessibility Policy (NDSAP), open with attribution.
|
|
80
|
+
|
|
81
|
+
### CA — Canada
|
|
82
|
+
|
|
83
|
+
| Quantity | Source | URL | Retrieved |
|
|
84
|
+
|---|---|---|---|
|
|
85
|
+
| Age × sex × visible-minority status | StatCan 2021 Census, Table 98-10-0265 | `https://www12.statcan.gc.ca/census-recensement/2021/dp-pd/dt-td/` | TBD |
|
|
86
|
+
| Education attainment | StatCan 2021 Census, Table 98-10-0418 | (same) | TBD |
|
|
87
|
+
| Occupation (NOC 2021) | StatCan 2021 Census, Table 98-10-0434 | (same) | TBD |
|
|
88
|
+
| Household income deciles | StatCan LFS / Income Survey 2022 | `https://www150.statcan.gc.ca/` | TBD |
|
|
89
|
+
| Household composition | StatCan 2021 Census, Table 98-10-0124 | (same) | TBD |
|
|
90
|
+
| Marital status | StatCan 2021 Census, Table 98-10-0125 | (same) | TBD |
|
|
91
|
+
| Given / family names | Public open data + diaspora reference | TBD | TBD |
|
|
92
|
+
|
|
93
|
+
License: Statistics Canada Open Licence (attribution required).
|
|
94
|
+
|
|
95
|
+
### AU — Australia
|
|
96
|
+
|
|
97
|
+
| Quantity | Source | URL | Retrieved |
|
|
98
|
+
|---|---|---|---|
|
|
99
|
+
| Age × sex × ancestry | ABS 2021 Census, TableBuilder | `https://www.abs.gov.au/census/find-census-data/quickstats/2021` | TBD |
|
|
100
|
+
| Education | ABS 2021 Census, NEDU + HSCP | (TableBuilder) | TBD |
|
|
101
|
+
| Occupation (ANZSCO 2022) | ABS 2021 Census, OCCP | (TableBuilder) | TBD |
|
|
102
|
+
| Household income | ABS Survey of Income and Housing 2019–20 | `https://www.abs.gov.au/statistics/economy/finance/household-income-and-wealth-australia/latest-release` | TBD |
|
|
103
|
+
| Household composition | ABS 2021 Census, HHCD | (TableBuilder) | TBD |
|
|
104
|
+
| Marital status | ABS 2021 Census, MSTP | (TableBuilder) | TBD |
|
|
105
|
+
| Names | TBD | TBD | TBD |
|
|
106
|
+
|
|
107
|
+
License: ABS Creative Commons CC-BY 4.0 (attribution required).
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Light-validation locales (Q3)
|
|
112
|
+
|
|
113
|
+
These three are generated against the same engine but only have light
|
|
114
|
+
validation. Bootstrap marginals only; no Q12 KS thresholds enforced.
|
|
115
|
+
|
|
116
|
+
### IE — Ireland
|
|
117
|
+
- CSO Census 2022.
|
|
118
|
+
- `https://www.cso.ie/en/census/`
|
|
119
|
+
|
|
120
|
+
### NZ — New Zealand
|
|
121
|
+
- Stats NZ Census 2023.
|
|
122
|
+
- `https://www.stats.govt.nz/`
|
|
123
|
+
|
|
124
|
+
### PH — Philippines
|
|
125
|
+
- PSA 2020 Census.
|
|
126
|
+
- `https://psa.gov.ph/statistics/census`
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Update protocol
|
|
131
|
+
|
|
132
|
+
1. Modify `bootstrap/<locale>/<table>.json` (or add a new table) only if the
|
|
133
|
+
underlying source has been **re-pulled by the supervisor** with the URL +
|
|
134
|
+
date recorded above. Random hand-tuning of bootstrap values is forbidden
|
|
135
|
+
by `AGENTS.md` §6.
|
|
136
|
+
2. Live ingestion: run the locale's `scripts/ingest_*.py` script (e.g. `python scripts/ingest_us_acs.py`) to populate
|
|
137
|
+
`derived/<locale>/`. The script writes a hash into `derived/HASHES.json`.
|
|
138
|
+
3. Validators prefer `derived/` over `bootstrap/`; the generator does the
|
|
139
|
+
same so realism upgrades silently when you have richer tables available.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generated_by": "hand-curated as part of Phase 1 bootstrap",
|
|
3
|
+
"generated_at": "2026-05-20",
|
|
4
|
+
"validation_locales": ["US", "UK", "IN", "CA", "AU"],
|
|
5
|
+
"light_validation_locales": ["IE", "NZ", "PH"],
|
|
6
|
+
"tables_per_locale": {
|
|
7
|
+
"US": ["age_sex", "race_ethnicity", "education", "marital", "household", "income", "regions"],
|
|
8
|
+
"UK": ["age_sex", "household", "race_ethnicity", "education", "marital"],
|
|
9
|
+
"IN": ["age_sex", "household", "education", "marital"],
|
|
10
|
+
"CA": ["age_sex", "household", "education", "marital"],
|
|
11
|
+
"AU": ["age_sex", "household", "education", "marital"],
|
|
12
|
+
"IE": ["age_sex", "household"],
|
|
13
|
+
"NZ": ["age_sex", "household"],
|
|
14
|
+
"PH": ["age_sex", "household"]
|
|
15
|
+
},
|
|
16
|
+
"fallback_strategy": "For tables absent in a locale, the generator falls back to US values, then to hard-coded defaults. The validator records every fallback usage in its consistency report so the dataset card can disclose them."
|
|
17
|
+
}
|