profilefoundry 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. profilefoundry-1.0.0/CITATION.cff +12 -0
  2. profilefoundry-1.0.0/LICENSE +34 -0
  3. profilefoundry-1.0.0/MANIFEST.in +2 -0
  4. profilefoundry-1.0.0/PKG-INFO +193 -0
  5. profilefoundry-1.0.0/README.md +149 -0
  6. profilefoundry-1.0.0/data/reference/MANIFEST.md +139 -0
  7. profilefoundry-1.0.0/data/reference/bootstrap/INDEX.json +17 -0
  8. profilefoundry-1.0.0/data/reference/bootstrap/SCHEMA.md +160 -0
  9. profilefoundry-1.0.0/data/reference/bootstrap/au/age_sex.json +15 -0
  10. profilefoundry-1.0.0/data/reference/bootstrap/au/education.json +14 -0
  11. profilefoundry-1.0.0/data/reference/bootstrap/au/household.json +22 -0
  12. profilefoundry-1.0.0/data/reference/bootstrap/au/marital.json +32 -0
  13. profilefoundry-1.0.0/data/reference/bootstrap/ca/age_sex.json +15 -0
  14. profilefoundry-1.0.0/data/reference/bootstrap/ca/education.json +14 -0
  15. profilefoundry-1.0.0/data/reference/bootstrap/ca/household.json +22 -0
  16. profilefoundry-1.0.0/data/reference/bootstrap/ca/marital.json +32 -0
  17. profilefoundry-1.0.0/data/reference/bootstrap/ie/age_sex.json +15 -0
  18. profilefoundry-1.0.0/data/reference/bootstrap/ie/household.json +22 -0
  19. profilefoundry-1.0.0/data/reference/bootstrap/in/age_sex.json +15 -0
  20. profilefoundry-1.0.0/data/reference/bootstrap/in/education.json +33 -0
  21. profilefoundry-1.0.0/data/reference/bootstrap/in/household.json +22 -0
  22. profilefoundry-1.0.0/data/reference/bootstrap/in/marital.json +37 -0
  23. profilefoundry-1.0.0/data/reference/bootstrap/nz/age_sex.json +15 -0
  24. profilefoundry-1.0.0/data/reference/bootstrap/nz/household.json +22 -0
  25. profilefoundry-1.0.0/data/reference/bootstrap/ph/age_sex.json +15 -0
  26. profilefoundry-1.0.0/data/reference/bootstrap/ph/household.json +22 -0
  27. profilefoundry-1.0.0/data/reference/bootstrap/uk/age_sex.json +15 -0
  28. profilefoundry-1.0.0/data/reference/bootstrap/uk/education.json +14 -0
  29. profilefoundry-1.0.0/data/reference/bootstrap/uk/household.json +22 -0
  30. profilefoundry-1.0.0/data/reference/bootstrap/uk/marital.json +32 -0
  31. profilefoundry-1.0.0/data/reference/bootstrap/uk/race_ethnicity.json +24 -0
  32. profilefoundry-1.0.0/data/reference/bootstrap/us/age_sex.json +15 -0
  33. profilefoundry-1.0.0/data/reference/bootstrap/us/education.json +14 -0
  34. profilefoundry-1.0.0/data/reference/bootstrap/us/household.json +22 -0
  35. profilefoundry-1.0.0/data/reference/bootstrap/us/income.json +38 -0
  36. profilefoundry-1.0.0/data/reference/bootstrap/us/marital.json +32 -0
  37. profilefoundry-1.0.0/data/reference/bootstrap/us/race_ethnicity.json +23 -0
  38. profilefoundry-1.0.0/data/reference/bootstrap/us/regions.json +65 -0
  39. profilefoundry-1.0.0/data/reference/geo/AU_cities.json +36 -0
  40. profilefoundry-1.0.0/data/reference/geo/CA_cities.json +43 -0
  41. profilefoundry-1.0.0/data/reference/geo/IE_cities.json +30 -0
  42. profilefoundry-1.0.0/data/reference/geo/IN_cities.json +71 -0
  43. profilefoundry-1.0.0/data/reference/geo/NZ_cities.json +30 -0
  44. profilefoundry-1.0.0/data/reference/geo/PH_cities.json +43 -0
  45. profilefoundry-1.0.0/data/reference/geo/UK_cities.json +51 -0
  46. profilefoundry-1.0.0/data/reference/geo/US_cities.json +71 -0
  47. profilefoundry-1.0.0/data/reference/names/IN_given.json +29 -0
  48. profilefoundry-1.0.0/data/reference/names/IN_surnames.json +44 -0
  49. profilefoundry-1.0.0/data/reference/names/PH_given.json +29 -0
  50. profilefoundry-1.0.0/data/reference/names/PH_surnames.json +25 -0
  51. profilefoundry-1.0.0/pyproject.toml +106 -0
  52. profilefoundry-1.0.0/setup.cfg +4 -0
  53. profilefoundry-1.0.0/src/profilefoundry/__init__.py +15 -0
  54. profilefoundry-1.0.0/src/profilefoundry/cli.py +361 -0
  55. profilefoundry-1.0.0/src/profilefoundry/data/__init__.py +0 -0
  56. profilefoundry-1.0.0/src/profilefoundry/data/loader.py +80 -0
  57. profilefoundry-1.0.0/src/profilefoundry/data/manifest_hash.py +54 -0
  58. profilefoundry-1.0.0/src/profilefoundry/data/paths.py +33 -0
  59. profilefoundry-1.0.0/src/profilefoundry/diversity/__init__.py +0 -0
  60. profilefoundry-1.0.0/src/profilefoundry/documents/__init__.py +0 -0
  61. profilefoundry-1.0.0/src/profilefoundry/generate/__init__.py +0 -0
  62. profilefoundry-1.0.0/src/profilefoundry/generate/address.py +659 -0
  63. profilefoundry-1.0.0/src/profilefoundry/generate/backfill.py +810 -0
  64. profilefoundry-1.0.0/src/profilefoundry/generate/contact_ids.py +1314 -0
  65. profilefoundry-1.0.0/src/profilefoundry/generate/education_status.py +244 -0
  66. profilefoundry-1.0.0/src/profilefoundry/generate/employer_names.py +180 -0
  67. profilefoundry-1.0.0/src/profilefoundry/generate/factory.py +1178 -0
  68. profilefoundry-1.0.0/src/profilefoundry/generate/finance_health.py +525 -0
  69. profilefoundry-1.0.0/src/profilefoundry/generate/identity.py +316 -0
  70. profilefoundry-1.0.0/src/profilefoundry/generate/locales/__init__.py +201 -0
  71. profilefoundry-1.0.0/src/profilefoundry/generate/occupation.py +254 -0
  72. profilefoundry-1.0.0/src/profilefoundry/generate/sampling.py +70 -0
  73. profilefoundry-1.0.0/src/profilefoundry/generate/seeding.py +38 -0
  74. profilefoundry-1.0.0/src/profilefoundry/io/__init__.py +0 -0
  75. profilefoundry-1.0.0/src/profilefoundry/io/hf_export.py +1454 -0
  76. profilefoundry-1.0.0/src/profilefoundry/linkage/__init__.py +0 -0
  77. profilefoundry-1.0.0/src/profilefoundry/linkage/employers.py +361 -0
  78. profilefoundry-1.0.0/src/profilefoundry/linkage/families.py +176 -0
  79. profilefoundry-1.0.0/src/profilefoundry/linkage/households.py +205 -0
  80. profilefoundry-1.0.0/src/profilefoundry/linkage/orchestrator.py +476 -0
  81. profilefoundry-1.0.0/src/profilefoundry/load/__init__.py +0 -0
  82. profilefoundry-1.0.0/src/profilefoundry/schema/__init__.py +78 -0
  83. profilefoundry-1.0.0/src/profilefoundry/schema/v0_1.py +457 -0
  84. profilefoundry-1.0.0/src/profilefoundry/validate/__init__.py +0 -0
  85. profilefoundry-1.0.0/src/profilefoundry/validate/consistency.py +105 -0
  86. profilefoundry-1.0.0/src/profilefoundry/validate/distributional.py +213 -0
  87. profilefoundry-1.0.0/src/profilefoundry/validate/leakage.py +376 -0
  88. profilefoundry-1.0.0/src/profilefoundry/validate/replay.py +130 -0
  89. profilefoundry-1.0.0/src/profilefoundry.egg-info/PKG-INFO +193 -0
  90. profilefoundry-1.0.0/src/profilefoundry.egg-info/SOURCES.txt +103 -0
  91. profilefoundry-1.0.0/src/profilefoundry.egg-info/dependency_links.txt +1 -0
  92. profilefoundry-1.0.0/src/profilefoundry.egg-info/entry_points.txt +2 -0
  93. profilefoundry-1.0.0/src/profilefoundry.egg-info/requires.txt +27 -0
  94. profilefoundry-1.0.0/src/profilefoundry.egg-info/top_level.txt +1 -0
  95. profilefoundry-1.0.0/tests/test_cli.py +277 -0
  96. profilefoundry-1.0.0/tests/test_export_normalized.py +622 -0
  97. profilefoundry-1.0.0/tests/test_factory.py +92 -0
  98. profilefoundry-1.0.0/tests/test_hf_release_current.py +94 -0
  99. profilefoundry-1.0.0/tests/test_linkage.py +146 -0
  100. profilefoundry-1.0.0/tests/test_loader.py +57 -0
  101. profilefoundry-1.0.0/tests/test_push_to_hf.py +96 -0
  102. profilefoundry-1.0.0/tests/test_reproducibility.py +61 -0
  103. profilefoundry-1.0.0/tests/test_update_leakage_report.py +142 -0
  104. profilefoundry-1.0.0/tests/test_validation.py +153 -0
  105. profilefoundry-1.0.0/tests/test_wikidata_ingest_robustness.py +87 -0
@@ -0,0 +1,12 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use ProfileFoundry, please cite the paper when available; until then, cite the repository."
3
+ type: software
4
+ title: "ProfileFoundry: An Audited Person-Object Substrate for Stateful NLP and Agent Evaluation"
5
+ authors:
6
+ - family-names: Selvam
7
+ given-names: Sriram
8
+ repository-code: "https://github.com/selvamsriram/ProfileFoundry"
9
+ url: "https://github.com/selvamsriram/ProfileFoundry"
10
+ version: "1.0.0"
11
+ date-released: "2026-06-14"
12
+ license: "LicenseRef-ProfileFoundry-Citation-1.0"
@@ -0,0 +1,34 @@
1
+ ProfileFoundry Citation License 1.0
2
+
3
+ Copyright (c) 2026 Sriram Selvam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to use,
7
+ copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ the Software, subject to the following conditions:
9
+
10
+ 1. Attribution and citation. Any public use, redistribution, derivative work,
11
+ benchmark, dataset, paper, report, model card, data card, product
12
+ documentation, or repository that uses or includes the Software or generated
13
+ ProfileFoundry artifacts must provide reasonable attribution to
14
+ ProfileFoundry. Cite the ProfileFoundry paper when it is available. Until
15
+ then, cite the ProfileFoundry repository:
16
+
17
+ ProfileFoundry: An Audited Person-Object Substrate for Stateful NLP
18
+ and Agent Evaluation. https://github.com/selvamsriram/ProfileFoundry
19
+
20
+ 2. Notice preservation. The above copyright notice, this license, and any
21
+ citation instructions distributed with the Software must be included in all
22
+ copies or substantial portions of the Software.
23
+
24
+ 3. Reference data. Embedded reference data and generated datasets may carry
25
+ additional upstream attribution requirements. Those notices must be
26
+ preserved when redistributing derived artifacts.
27
+
28
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30
+ FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER
32
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM,
33
+ OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ include CITATION.cff
2
+ include LICENSE
@@ -0,0 +1,193 @@
1
+ Metadata-Version: 2.4
2
+ Name: profilefoundry
3
+ Version: 1.0.0
4
+ Summary: An open-source generator for structured, internally-consistent, linked, temporally coherent synthetic Person Objects.
5
+ Author: Sriram Selvam
6
+ License-Expression: LicenseRef-ProfileFoundry-Citation-1.0
7
+ Project-URL: Homepage, https://github.com/selvamsriram/ProfileFoundry
8
+ Project-URL: Issues, https://github.com/selvamsriram/ProfileFoundry/issues
9
+ Keywords: synthetic-data,pii,person-object,linkage,evaluation,privacy
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: pydantic>=2.6
20
+ Requires-Dist: faker>=24.0
21
+ Requires-Dist: numpy>=1.26
22
+ Requires-Dist: pandas>=2.1
23
+ Requires-Dist: pyarrow>=15.0
24
+ Requires-Dist: click>=8.1
25
+ Requires-Dist: rapidfuzz>=3.6
26
+ Requires-Dist: pybloom-live>=4.0
27
+ Requires-Dist: requests>=2.31
28
+ Requires-Dist: tqdm>=4.66
29
+ Requires-Dist: python-dotenv>=1.0
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=8.0; extra == "dev"
32
+ Requires-Dist: pytest-cov>=4.1; extra == "dev"
33
+ Requires-Dist: ruff>=0.4; extra == "dev"
34
+ Requires-Dist: mypy>=1.10; extra == "dev"
35
+ Requires-Dist: ipython>=8.20; extra == "dev"
36
+ Provides-Extra: viz
37
+ Requires-Dist: matplotlib>=3.8; extra == "viz"
38
+ Requires-Dist: seaborn>=0.13; extra == "viz"
39
+ Requires-Dist: plotly>=5.20; extra == "viz"
40
+ Provides-Extra: hf
41
+ Requires-Dist: datasets>=2.18; extra == "hf"
42
+ Requires-Dist: huggingface-hub>=0.23; extra == "hf"
43
+ Dynamic: license-file
44
+
45
+ # ProfileFoundry
46
+
47
+ > A structured, internally-consistent, linked, temporally coherent synthetic
48
+ > Person Object — and an open-source generator that produces them at scale.
49
+
50
+ This repository is the implementation home for ProfileFoundry, which builds
51
+ on the problem setting exposed by [PANORAMA](https://arxiv.org/abs/2505.12238)
52
+ (Selvam & Ghosh, 2025). It provides a deterministic generator, schema,
53
+ validation suite, release reports, and Hugging Face export tooling for the
54
+ `ProfileFoundry-Core-100K` dataset.
55
+
56
+ The ACL/NeurIPS-style paper draft lives in [`Paper/`](Paper/).
57
+ The vetted 100K release package is staged for Hugging Face at
58
+ [`srirxml/ProfileFoundry-Core-100K`](https://huggingface.co/datasets/srirxml/ProfileFoundry-Core-100K);
59
+ run `python scripts/verify_hf_release_current.py` with an `HF_TOKEN` before
60
+ calling the remote artifact current. The default HF viewer table is
61
+ `person_objects.parquet`, a complete one-row-per-person view with nested
62
+ sections encoded as JSON strings; the remaining parquet files expose the
63
+ normalized relational schema.
64
+
65
+ ## Status (2026-05-24) — v1.0 release sprint
66
+
67
+ | Phase | Status |
68
+ |---|---|
69
+ | **0 — Foundation** (schema and package scaffolding) | ✅ complete |
70
+ | **1 — Reference data** (manifest + bootstrap + loader) | ✅ reference data shipped for 5 validation locales |
71
+ | **2 — Generator factory** (port + joint constraints + age-gating) | ✅ deterministic across processes (set-iteration bug fixed) |
72
+ | **3 — Linkage** (households, employers, family graph) | ✅ household orchestrator + family edges + employer pool |
73
+ | **4 — Temporal** (event taxonomy, backfill, replay) | ✅ replay-valid non-credit timeline + typed payload export |
74
+ | **5a — Validation** (KS gaps, consistency) | ✅ disclosed KS gaps (max 0.124) · 100% consistency |
75
+ | **5b — Leakage audit** (Wikidata Bloom, reserved-domain email audit, self-collision) | ✅ Wikidata Bloom + self-collision + reserved-domain email syntax audit |
76
+ | **6 — Scale + publish** (100K + HF export) | ✅ 100K release · complete viewer table · normalized parquet star schema · MANIFEST.json |
77
+ | **7 — Reproducibility** (verify fixture, manifest hash) | ✅ normalized fixture stable across processes; verify script + fixture |
78
+ | **8 — Paper draft** | ✅ submission draft revised |
79
+ | **9 — HF push + tag v1.0.0** | ⏳ HF upload target documented; remote current check requires `HF_TOKEN`; tag pending |
80
+
81
+ ## Quick start
82
+
83
+ ```bash
84
+ pip install -e ".[dev]"
85
+ profilefoundry verify # smoke-test all 8 locales
86
+ profilefoundry person --locale US # one US Person as JSON
87
+ profilefoundry household --locale US # one linked household (multiple Persons)
88
+ profilefoundry scale --n 1000 --locale UK --out /tmp/uk.jsonl
89
+ profilefoundry scale-households --n 500 --locale CA --out /tmp/ca_households.jsonl
90
+ profilefoundry validate --n 300 # KS gaps + leakage + consistency
91
+ profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24 --skip-hibp # normalized parquet star schema
92
+ profilefoundry scale-smoke --sizes 1000,10000,100000 # timings
93
+
94
+ # Full v1.0 release run (100K profiles in ~4-5 minutes on a laptop):
95
+ python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose
96
+ # Reproducibility check:
97
+ python scripts/verify_reproducibility.py
98
+ # Hugging Face upload preflight / push:
99
+ python scripts/push_to_hf.py --dry-run
100
+ python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K
101
+ # Confirm the public HF manifest matches the vetted local release:
102
+ python scripts/verify_hf_release_current.py
103
+ # Build the Wikidata bloom (multi-hour due to WDQS rate limit):
104
+ python scripts/ingest_wikidata_persons.py --scope sample --sleep 65 --verbose
105
+ ```
106
+
107
+ Or without installing:
108
+
109
+ ```bash
110
+ PYTHONPATH=src python3 -m profilefoundry.cli scale --n 1000 --locale US --out /tmp/us.jsonl
111
+ ```
112
+
113
+ ## CLI reference
114
+
115
+ All commands are available through `profilefoundry` after installation, or via
116
+ `PYTHONPATH=src python3 -m profilefoundry.cli` from a checkout.
117
+
118
+ | Command | Purpose | Common example |
119
+ |---|---|---|
120
+ | `profilefoundry verify` | Generate one profile per supported locale as a smoke test. | `profilefoundry verify` |
121
+ | `profilefoundry person` | Print one deterministic Person Object as JSON. | `profilefoundry person --locale US --seed 4321 --profile-seq 1` |
122
+ | `profilefoundry household` | Print one linked household as JSON. | `profilefoundry household --locale UK --seed 4321 --seq 7` |
123
+ | `profilefoundry scale` | Generate flat JSONL profiles for one locale. | `profilefoundry scale --n 1000 --locale CA --out /tmp/ca.jsonl` |
124
+ | `profilefoundry scale-households` | Generate linked household members as JSONL. | `profilefoundry scale-households --n 500 --locale AU --out /tmp/au_households.jsonl` |
125
+ | `profilefoundry validate` | Run distributional, leakage, replay, and consistency checks. | `profilefoundry validate --n 300 --locales US,UK,IN --skip-hibp` |
126
+ | `profilefoundry export` | Write JSONL, parquet tables, manifest, and dataset card. | `profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24T00:00:00Z --skip-hibp` |
127
+ | `profilefoundry scale-smoke` | Time the generator at several sizes. | `profilefoundry scale-smoke --sizes 1000,10000,100000` |
128
+
129
+ Script-level release utilities:
130
+
131
+ | Script | Purpose |
132
+ |---|---|
133
+ | `python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose` | Rebuild the 100K release artifact under `data/raw/profilefoundry-core-v1/` and reports under `reports/v1_release/`. |
134
+ | `python scripts/verify_reproducibility.py` | Confirm the pinned reproducibility fixture still matches. |
135
+ | `python scripts/verify_package_reference_data.py` | Confirm reference data is available from an installed wheel. |
136
+ | `python scripts/push_to_hf.py --dry-run` | Preview the Hugging Face upload. |
137
+ | `python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K` | Upload the vetted release artifact. |
138
+ | `python scripts/verify_hf_release_current.py` | Compare the local release manifest with the Hugging Face dataset. Requires `HF_TOKEN` for private or gated repos. |
139
+ | `python scripts/export_schema.py --check` | Verify the checked-in JSON Schema matches the Pydantic model. |
140
+ | `python scripts/ingest_us_acs.py` / `python scripts/ingest_uk_ons.py` | Refresh reference-data inputs from upstream sources. |
141
+
142
+ ## Project layout
143
+
144
+ ```
145
+ .
146
+ ├── README.md # this file
147
+ ├── pyproject.toml # package metadata, deps, ruff/pytest config
148
+ ├── notes/
149
+ │ ├── BUGS.tsv # resolved/known issue ledger
150
+ │ └── RELEASE_PLAN.md # v1.0 release plan
151
+ ├── schemas/
152
+ │ ├── README.md
153
+ │ └── person_v0_1.schema.json # auto-exported JSON Schema
154
+ ├── src/profilefoundry/
155
+ │ ├── schema/ # Pydantic Person Object v0.1 (source of truth)
156
+ │ ├── data/ # reference-data loader (bootstrap ↔ derived)
157
+ │ ├── generate/ # factory + sampling + locale providers
158
+ │ ├── linkage/ # (phase 3) households, employers, families
159
+ │ ├── validate/ # (phase 5) invariants, distributions, leakage
160
+ │ ├── io/ # (phase 6) Hugging Face export
161
+ │ └── cli.py # `profilefoundry` entry point
162
+ ├── data/reference/
163
+ │ ├── MANIFEST.md # every external source enumerated
164
+ │ └── bootstrap/ # committed minimum reference marginals
165
+ ├── scripts/
166
+ │ ├── export_schema.py # regenerate JSON Schema
167
+ │ └── ingest_us_acs.py # pull richer ACS tables live (needs API key)
168
+ ├── tests/ # pytest invariants & unit tests
169
+ └── Paper/ # ACL-style paper draft + figures
170
+ ```
171
+
172
+ ## Testing
173
+
174
+ ```bash
175
+ PYTHONPATH=src python3 -m pytest tests/
176
+ ```
177
+
178
+ The full pytest suite covers schema invariants (Pydantic model), reference-data
179
+ loading, factory smoke checks, determinism, plausibility (age, employment,
180
+ addresses, education), **adults-only enforcement** (no profiles under 18),
181
+ linkage (households, employers, family graph), validation (KS gaps, replay,
182
+ consistency, leakage), release-report freshness, normalized export quality, and
183
+ reproducibility.
184
+
185
+ ## License
186
+
187
+ - Code and SDK: **ProfileFoundry Citation License 1.0** (see `LICENSE`).
188
+ Public uses and redistributions must cite the ProfileFoundry paper when
189
+ available, or this repository until then. Machine-readable citation metadata
190
+ is in `CITATION.cff`.
191
+ - Generated dataset: **CC-BY-4.0**.
192
+ - Embedded reference data retains its upstream license; see
193
+ `data/reference/MANIFEST.md`.
@@ -0,0 +1,149 @@
1
+ # ProfileFoundry
2
+
3
+ > A structured, internally-consistent, linked, temporally coherent synthetic
4
+ > Person Object — and an open-source generator that produces them at scale.
5
+
6
+ This repository is the implementation home for ProfileFoundry, which builds
7
+ on the problem setting exposed by [PANORAMA](https://arxiv.org/abs/2505.12238)
8
+ (Selvam & Ghosh, 2025). It provides a deterministic generator, schema,
9
+ validation suite, release reports, and Hugging Face export tooling for the
10
+ `ProfileFoundry-Core-100K` dataset.
11
+
12
+ The ACL/NeurIPS-style paper draft lives in [`Paper/`](Paper/).
13
+ The vetted 100K release package is staged for Hugging Face at
14
+ [`srirxml/ProfileFoundry-Core-100K`](https://huggingface.co/datasets/srirxml/ProfileFoundry-Core-100K);
15
+ run `python scripts/verify_hf_release_current.py` with an `HF_TOKEN` before
16
+ calling the remote artifact current. The default HF viewer table is
17
+ `person_objects.parquet`, a complete one-row-per-person view with nested
18
+ sections encoded as JSON strings; the remaining parquet files expose the
19
+ normalized relational schema.
20
+
21
+ ## Status (2026-05-24) — v1.0 release sprint
22
+
23
+ | Phase | Status |
24
+ |---|---|
25
+ | **0 — Foundation** (schema and package scaffolding) | ✅ complete |
26
+ | **1 — Reference data** (manifest + bootstrap + loader) | ✅ reference data shipped for 5 validation locales |
27
+ | **2 — Generator factory** (port + joint constraints + age-gating) | ✅ deterministic across processes (set-iteration bug fixed) |
28
+ | **3 — Linkage** (households, employers, family graph) | ✅ household orchestrator + family edges + employer pool |
29
+ | **4 — Temporal** (event taxonomy, backfill, replay) | ✅ replay-valid non-credit timeline + typed payload export |
30
+ | **5a — Validation** (KS gaps, consistency) | ✅ disclosed KS gaps (max 0.124) · 100% consistency |
31
+ | **5b — Leakage audit** (Wikidata Bloom, reserved-domain email audit, self-collision) | ✅ Wikidata Bloom + self-collision + reserved-domain email syntax audit |
32
+ | **6 — Scale + publish** (100K + HF export) | ✅ 100K release · complete viewer table · normalized parquet star schema · MANIFEST.json |
33
+ | **7 — Reproducibility** (verify fixture, manifest hash) | ✅ normalized fixture stable across processes; verify script + fixture |
34
+ | **8 — Paper draft** | ✅ submission draft revised |
35
+ | **9 — HF push + tag v1.0.0** | ⏳ HF upload target documented; remote current check requires `HF_TOKEN`; tag pending |
36
+
37
+ ## Quick start
38
+
39
+ ```bash
40
+ pip install -e ".[dev]"
41
+ profilefoundry verify # smoke-test all 8 locales
42
+ profilefoundry person --locale US # one US Person as JSON
43
+ profilefoundry household --locale US # one linked household (multiple Persons)
44
+ profilefoundry scale --n 1000 --locale UK --out /tmp/uk.jsonl
45
+ profilefoundry scale-households --n 500 --locale CA --out /tmp/ca_households.jsonl
46
+ profilefoundry validate --n 300 # KS gaps + leakage + consistency
47
+ profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24 --skip-hibp # normalized parquet star schema
48
+ profilefoundry scale-smoke --sizes 1000,10000,100000 # timings
49
+
50
+ # Full v1.0 release run (100K profiles in ~4-5 minutes on a laptop):
51
+ python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose
52
+ # Reproducibility check:
53
+ python scripts/verify_reproducibility.py
54
+ # Hugging Face upload preflight / push:
55
+ python scripts/push_to_hf.py --dry-run
56
+ python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K
57
+ # Confirm the public HF manifest matches the vetted local release:
58
+ python scripts/verify_hf_release_current.py
59
+ # Build the Wikidata bloom (multi-hour due to WDQS rate limit):
60
+ python scripts/ingest_wikidata_persons.py --scope sample --sleep 65 --verbose
61
+ ```
62
+
63
+ Or without installing:
64
+
65
+ ```bash
66
+ PYTHONPATH=src python3 -m profilefoundry.cli scale --n 1000 --locale US --out /tmp/us.jsonl
67
+ ```
68
+
69
+ ## CLI reference
70
+
71
+ All commands are available through `profilefoundry` after installation, or via
72
+ `PYTHONPATH=src python3 -m profilefoundry.cli` from a checkout.
73
+
74
+ | Command | Purpose | Common example |
75
+ |---|---|---|
76
+ | `profilefoundry verify` | Generate one profile per supported locale as a smoke test. | `profilefoundry verify` |
77
+ | `profilefoundry person` | Print one deterministic Person Object as JSON. | `profilefoundry person --locale US --seed 4321 --profile-seq 1` |
78
+ | `profilefoundry household` | Print one linked household as JSON. | `profilefoundry household --locale UK --seed 4321 --seq 7` |
79
+ | `profilefoundry scale` | Generate flat JSONL profiles for one locale. | `profilefoundry scale --n 1000 --locale CA --out /tmp/ca.jsonl` |
80
+ | `profilefoundry scale-households` | Generate linked household members as JSONL. | `profilefoundry scale-households --n 500 --locale AU --out /tmp/au_households.jsonl` |
81
+ | `profilefoundry validate` | Run distributional, leakage, replay, and consistency checks. | `profilefoundry validate --n 300 --locales US,UK,IN --skip-hibp` |
82
+ | `profilefoundry export` | Write JSONL, parquet tables, manifest, and dataset card. | `profilefoundry export --out /tmp/pf_core --n-per-locale 1000 --generation-date 2026-05-24 --exported-at 2026-05-24T00:00:00Z --skip-hibp` |
83
+ | `profilefoundry scale-smoke` | Time the generator at several sizes. | `profilefoundry scale-smoke --sizes 1000,10000,100000` |
84
+
85
+ Script-level release utilities:
86
+
87
+ | Script | Purpose |
88
+ |---|---|
89
+ | `python scripts/run_full_core.py --generation-date 2026-05-24 --skip-hibp --verbose` | Rebuild the 100K release artifact under `data/raw/profilefoundry-core-v1/` and reports under `reports/v1_release/`. |
90
+ | `python scripts/verify_reproducibility.py` | Confirm the pinned reproducibility fixture still matches. |
91
+ | `python scripts/verify_package_reference_data.py` | Confirm reference data is available from an installed wheel. |
92
+ | `python scripts/push_to_hf.py --dry-run` | Preview the Hugging Face upload. |
93
+ | `python scripts/push_to_hf.py --repo srirxml/ProfileFoundry-Core-100K` | Upload the vetted release artifact. |
94
+ | `python scripts/verify_hf_release_current.py` | Compare the local release manifest with the Hugging Face dataset. Requires `HF_TOKEN` for private or gated repos. |
95
+ | `python scripts/export_schema.py --check` | Verify the checked-in JSON Schema matches the Pydantic model. |
96
+ | `python scripts/ingest_us_acs.py` / `python scripts/ingest_uk_ons.py` | Refresh reference-data inputs from upstream sources. |
97
+
98
+ ## Project layout
99
+
100
+ ```
101
+ .
102
+ ├── README.md # this file
103
+ ├── pyproject.toml # package metadata, deps, ruff/pytest config
104
+ ├── notes/
105
+ │ ├── BUGS.tsv # resolved/known issue ledger
106
+ │ └── RELEASE_PLAN.md # v1.0 release plan
107
+ ├── schemas/
108
+ │ ├── README.md
109
+ │ └── person_v0_1.schema.json # auto-exported JSON Schema
110
+ ├── src/profilefoundry/
111
+ │ ├── schema/ # Pydantic Person Object v0.1 (source of truth)
112
+ │ ├── data/ # reference-data loader (bootstrap ↔ derived)
113
+ │ ├── generate/ # factory + sampling + locale providers
114
+ │ ├── linkage/ # (phase 3) households, employers, families
115
+ │ ├── validate/ # (phase 5) invariants, distributions, leakage
116
+ │ ├── io/ # (phase 6) Hugging Face export
117
+ │ └── cli.py # `profilefoundry` entry point
118
+ ├── data/reference/
119
+ │ ├── MANIFEST.md # every external source enumerated
120
+ │ └── bootstrap/ # committed minimum reference marginals
121
+ ├── scripts/
122
+ │ ├── export_schema.py # regenerate JSON Schema
123
+ │ └── ingest_us_acs.py # pull richer ACS tables live (needs API key)
124
+ ├── tests/ # pytest invariants & unit tests
125
+ └── Paper/ # ACL-style paper draft + figures
126
+ ```
127
+
128
+ ## Testing
129
+
130
+ ```bash
131
+ PYTHONPATH=src python3 -m pytest tests/
132
+ ```
133
+
134
+ The full pytest suite covers schema invariants (Pydantic model), reference-data
135
+ loading, factory smoke checks, determinism, plausibility (age, employment,
136
+ addresses, education), **adults-only enforcement** (no profiles under 18),
137
+ linkage (households, employers, family graph), validation (KS gaps, replay,
138
+ consistency, leakage), release-report freshness, normalized export quality, and
139
+ reproducibility.
140
+
141
+ ## License
142
+
143
+ - Code and SDK: **ProfileFoundry Citation License 1.0** (see `LICENSE`).
144
+ Public uses and redistributions must cite the ProfileFoundry paper when
145
+ available, or this repository until then. Machine-readable citation metadata
146
+ is in `CITATION.cff`.
147
+ - Generated dataset: **CC-BY-4.0**.
148
+ - Embedded reference data retains its upstream license; see
149
+ `data/reference/MANIFEST.md`.
@@ -0,0 +1,139 @@
1
+ # Reference-data manifest
2
+
3
+ This manifest enumerates every external statistical source the ProfileFoundry
4
+ generator and validator consume. Three tiers of data live in this tree:
5
+
6
+ 1. **`bootstrap/`** — hand-curated reference marginals committed to git.
7
+ These are small JSON/YAML files extracted from public summary tables of
8
+ the listed sources. They exist so the generator can run end-to-end without
9
+ network access and so the realism guarantees are reproducible from the
10
+ repo alone. Every bootstrap file links back to its source URL and the
11
+ retrieval date.
12
+
13
+ 2. **`derived/`** — Parquet tables produced by the `scripts/ingest_*.py`
14
+ scripts when they pull richer joint distributions live from authoritative
15
+ sources. These are **not** committed to git (see `.gitignore`); they are
16
+ regenerated locally and used by the generator and validators. Their hashes are recorded in
17
+ `derived/HASHES.json` for reproducibility.
18
+
19
+ 3. **`cache/`** — raw downloaded payloads. Also not committed.
20
+
21
+ When the live `derived/` tables are present, the generator and validator
22
+ prefer them. When they are absent (a fresh checkout), bootstrap is used.
23
+
24
+ ---
25
+
26
+ ## Validation-tier locales (Q3)
27
+
28
+ The five locales below get full distributional validation. Reference data for
29
+ each must be present in `bootstrap/` and is auto-promoted to `derived/` when
30
+ the corresponding ingestion script runs successfully.
31
+
32
+ ### US — United States
33
+
34
+ | Quantity | Source | URL | Retrieved |
35
+ |---|---|---|---|
36
+ | Age × sex × race × Hispanic origin | ACS 2023 1-year, Table B01001 + B03002 | `https://api.census.gov/data/2023/acs/acs1` | TBD |
37
+ | Education attainment by age × sex | ACS 2023, Table B15001 | (same API) | TBD |
38
+ | Occupation distribution | ACS 2023, Table C24010 | (same API) | TBD |
39
+ | Household income deciles | ACS 2023, Table B19001 | (same API) | TBD |
40
+ | Household type × size | ACS 2023, Table B11016 | (same API) | TBD |
41
+ | Marital status by age × sex | ACS 2023, Table B12002 | (same API) | TBD |
42
+ | Given names by year × sex | SSA National Names | `https://www.ssa.gov/oact/babynames/names.zip` | TBD |
43
+ | Surnames (frequency, race) | US Census 2010 Frequent Surnames | `https://www.census.gov/topics/population/genealogy/data/2010_surnames.html` | TBD |
44
+ | ZCTA-level population | Census 2020 ZCTA totals | `https://www.census.gov/data/datasets/2020/dec/2020-zcta.html` | TBD |
45
+ | Industry × employer size | County Business Patterns 2022 | `https://www.census.gov/programs-surveys/cbp/data/datasets.html` | TBD |
46
+
47
+ License: US federal-government work, public domain.
48
+
49
+ ### UK — United Kingdom
50
+
51
+ | Quantity | Source | URL | Retrieved |
52
+ |---|---|---|---|
53
+ | Age × sex × ethnicity | ONS 2021 Census, TS021, TS009 | `https://www.ons.gov.uk/datasets/create` | TBD |
54
+ | Education attainment | ONS 2021 Census, TS067 | (ONS Create-a-Dataset) | TBD |
55
+ | Occupation (SOC2020) | ONS 2021 Census, TS063 | (ONS Create-a-Dataset) | TBD |
56
+ | Household income | DWP HBAI / Family Resources Survey, FY2022/23 | `https://www.gov.uk/government/collections/family-resources-survey` | TBD |
57
+ | Household composition | ONS 2021 Census, TS003 | (ONS Create-a-Dataset) | TBD |
58
+ | Marital/civil partnership status | ONS 2021 Census, TS002 | (ONS Create-a-Dataset) | TBD |
59
+ | Given names by year (E&W) | ONS Baby Names datasets, latest year | `https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/livebirths/datasets/babynamesenglandandwalesbabynamesstatisticsgirls` | TBD |
60
+ | Surnames | UK Office for National Statistics Surnames archive (or 2011 Census derived list, public republications) | `https://www.ons.gov.uk/` | TBD |
61
+
62
+ License: Open Government Licence v3.0 (attribution required).
63
+
64
+ ### IN — India
65
+
66
+ | Quantity | Source | URL | Retrieved |
67
+ |---|---|---|---|
68
+ | Age × sex by state | Census of India 2011, C-13 | `https://censusindia.gov.in/census.website/data/census-tables` | TBD |
69
+ | Religion / caste category breakdown | Census 2011 + NSS rounds | (Census website) | TBD |
70
+ | Education attainment | NSS 75th round (Education) | `https://www.mospi.gov.in/` | TBD |
71
+ | Occupation (NCO-2015) | NSO PLFS 2022–23 | `https://www.mospi.gov.in/web/mospi/download-tables-data` | TBD |
72
+ | Household monthly income (MPCE) | NSS HCES 2022–23 | (MoSPI) | TBD |
73
+ | Household composition | Census 2011, H-series | (Census website) | TBD |
74
+ | Marital status by age × sex | Census 2011, C-2 | (Census website) | TBD |
75
+ | Given names | Public birth-record corpora (Bihar, Karnataka, etc. open data portals) + transliterated SSA Indian-diaspora cross-reference | Multiple; see ingest script | TBD |
76
+ | Surnames | Indian community-wise surname lists (publicly compiled) | Multiple | TBD |
77
+
78
+ License: most Government of India open data is published under the
79
+ National Data Sharing and Accessibility Policy (NDSAP), open with attribution.
80
+
81
+ ### CA — Canada
82
+
83
+ | Quantity | Source | URL | Retrieved |
84
+ |---|---|---|---|
85
+ | Age × sex × visible-minority status | StatCan 2021 Census, Table 98-10-0265 | `https://www12.statcan.gc.ca/census-recensement/2021/dp-pd/dt-td/` | TBD |
86
+ | Education attainment | StatCan 2021 Census, Table 98-10-0418 | (same) | TBD |
87
+ | Occupation (NOC 2021) | StatCan 2021 Census, Table 98-10-0434 | (same) | TBD |
88
+ | Household income deciles | StatCan LFS / Income Survey 2022 | `https://www150.statcan.gc.ca/` | TBD |
89
+ | Household composition | StatCan 2021 Census, Table 98-10-0124 | (same) | TBD |
90
+ | Marital status | StatCan 2021 Census, Table 98-10-0125 | (same) | TBD |
91
+ | Given / family names | Public open data + diaspora reference | TBD | TBD |
92
+
93
+ License: Statistics Canada Open Licence (attribution required).
94
+
95
+ ### AU — Australia
96
+
97
+ | Quantity | Source | URL | Retrieved |
98
+ |---|---|---|---|
99
+ | Age × sex × ancestry | ABS 2021 Census, TableBuilder | `https://www.abs.gov.au/census/find-census-data/quickstats/2021` | TBD |
100
+ | Education | ABS 2021 Census, NEDU + HSCP | (TableBuilder) | TBD |
101
+ | Occupation (ANZSCO 2022) | ABS 2021 Census, OCCP | (TableBuilder) | TBD |
102
+ | Household income | ABS Survey of Income and Housing 2019–20 | `https://www.abs.gov.au/statistics/economy/finance/household-income-and-wealth-australia/latest-release` | TBD |
103
+ | Household composition | ABS 2021 Census, HHCD | (TableBuilder) | TBD |
104
+ | Marital status | ABS 2021 Census, MSTP | (TableBuilder) | TBD |
105
+ | Names | TBD | TBD | TBD |
106
+
107
+ License: ABS Creative Commons CC-BY 4.0 (attribution required).
108
+
109
+ ---
110
+
111
+ ## Light-validation locales (Q3)
112
+
113
+ These three are generated against the same engine but only have light
114
+ validation. Bootstrap marginals only; no Q12 KS thresholds enforced.
115
+
116
+ ### IE — Ireland
117
+ - CSO Census 2022.
118
+ - `https://www.cso.ie/en/census/`
119
+
120
+ ### NZ — New Zealand
121
+ - Stats NZ Census 2023.
122
+ - `https://www.stats.govt.nz/`
123
+
124
+ ### PH — Philippines
125
+ - PSA 2020 Census.
126
+ - `https://psa.gov.ph/statistics/census`
127
+
128
+ ---
129
+
130
+ ## Update protocol
131
+
132
+ 1. Modify `bootstrap/<locale>/<table>.json` (or add a new table) only if the
133
+ underlying source has been **re-pulled by the supervisor** with the URL +
134
+ date recorded above. Random hand-tuning of bootstrap values is forbidden
135
+ by `AGENTS.md` §6.
136
+ 2. Live ingestion: run `python scripts/ingest_<locale>.py` to populate
137
+ `derived/<locale>/`. The script writes a hash into `derived/HASHES.json`.
138
+ 3. Validators prefer `derived/` over `bootstrap/`; the generator does the
139
+ same, so realism improves automatically whenever richer tables are available.
@@ -0,0 +1,17 @@
1
+ {
2
+ "generated_by": "hand-curated as part of Phase 1 bootstrap",
3
+ "generated_at": "2026-05-20",
4
+ "validation_locales": ["US", "UK", "IN", "CA", "AU"],
5
+ "light_validation_locales": ["IE", "NZ", "PH"],
6
+ "tables_per_locale": {
7
+ "US": ["age_sex", "race_ethnicity", "education", "marital", "household", "income", "regions"],
8
+ "UK": ["age_sex", "household", "race_ethnicity", "education", "marital"],
9
+ "IN": ["age_sex", "household", "education", "marital"],
10
+ "CA": ["age_sex", "household", "education", "marital"],
11
+ "AU": ["age_sex", "household", "education", "marital"],
12
+ "IE": ["age_sex", "household"],
13
+ "NZ": ["age_sex", "household"],
14
+ "PH": ["age_sex", "household"]
15
+ },
16
+ "fallback_strategy": "For tables absent in a locale, the generator falls back to US values, then to hard-coded defaults. The validator records every fallback usage in its consistency report so the dataset card can disclose them."
17
+ }