profilefoundry 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. profilefoundry/__init__.py +15 -0
  2. profilefoundry/cli.py +361 -0
  3. profilefoundry/data/__init__.py +0 -0
  4. profilefoundry/data/loader.py +80 -0
  5. profilefoundry/data/manifest_hash.py +54 -0
  6. profilefoundry/data/paths.py +33 -0
  7. profilefoundry/diversity/__init__.py +0 -0
  8. profilefoundry/documents/__init__.py +0 -0
  9. profilefoundry/generate/__init__.py +0 -0
  10. profilefoundry/generate/address.py +659 -0
  11. profilefoundry/generate/backfill.py +810 -0
  12. profilefoundry/generate/contact_ids.py +1314 -0
  13. profilefoundry/generate/education_status.py +244 -0
  14. profilefoundry/generate/employer_names.py +180 -0
  15. profilefoundry/generate/factory.py +1178 -0
  16. profilefoundry/generate/finance_health.py +525 -0
  17. profilefoundry/generate/identity.py +316 -0
  18. profilefoundry/generate/locales/__init__.py +201 -0
  19. profilefoundry/generate/occupation.py +254 -0
  20. profilefoundry/generate/sampling.py +70 -0
  21. profilefoundry/generate/seeding.py +38 -0
  22. profilefoundry/io/__init__.py +0 -0
  23. profilefoundry/io/hf_export.py +1454 -0
  24. profilefoundry/linkage/__init__.py +0 -0
  25. profilefoundry/linkage/employers.py +361 -0
  26. profilefoundry/linkage/families.py +176 -0
  27. profilefoundry/linkage/households.py +205 -0
  28. profilefoundry/linkage/orchestrator.py +476 -0
  29. profilefoundry/load/__init__.py +0 -0
  30. profilefoundry/schema/__init__.py +78 -0
  31. profilefoundry/schema/v0_1.py +457 -0
  32. profilefoundry/validate/__init__.py +0 -0
  33. profilefoundry/validate/consistency.py +105 -0
  34. profilefoundry/validate/distributional.py +213 -0
  35. profilefoundry/validate/leakage.py +376 -0
  36. profilefoundry/validate/replay.py +130 -0
  37. profilefoundry-1.0.0.data/data/data/reference/MANIFEST.md +139 -0
  38. profilefoundry-1.0.0.data/data/data/reference/bootstrap/INDEX.json +17 -0
  39. profilefoundry-1.0.0.data/data/data/reference/bootstrap/SCHEMA.md +160 -0
  40. profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/age_sex.json +15 -0
  41. profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/education.json +14 -0
  42. profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/household.json +22 -0
  43. profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/marital.json +32 -0
  44. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/age_sex.json +15 -0
  45. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/education.json +14 -0
  46. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/household.json +22 -0
  47. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/marital.json +32 -0
  48. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ie/age_sex.json +15 -0
  49. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ie/household.json +22 -0
  50. profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/age_sex.json +15 -0
  51. profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/education.json +33 -0
  52. profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/household.json +22 -0
  53. profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/marital.json +37 -0
  54. profilefoundry-1.0.0.data/data/data/reference/bootstrap/nz/age_sex.json +15 -0
  55. profilefoundry-1.0.0.data/data/data/reference/bootstrap/nz/household.json +22 -0
  56. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ph/age_sex.json +15 -0
  57. profilefoundry-1.0.0.data/data/data/reference/bootstrap/ph/household.json +22 -0
  58. profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/age_sex.json +15 -0
  59. profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/education.json +14 -0
  60. profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/household.json +22 -0
  61. profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/marital.json +32 -0
  62. profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/race_ethnicity.json +24 -0
  63. profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/age_sex.json +15 -0
  64. profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/education.json +14 -0
  65. profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/household.json +22 -0
  66. profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/income.json +38 -0
  67. profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/marital.json +32 -0
  68. profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/race_ethnicity.json +23 -0
  69. profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/regions.json +65 -0
  70. profilefoundry-1.0.0.data/data/data/reference/geo/AU_cities.json +36 -0
  71. profilefoundry-1.0.0.data/data/data/reference/geo/CA_cities.json +43 -0
  72. profilefoundry-1.0.0.data/data/data/reference/geo/IE_cities.json +30 -0
  73. profilefoundry-1.0.0.data/data/data/reference/geo/IN_cities.json +71 -0
  74. profilefoundry-1.0.0.data/data/data/reference/geo/NZ_cities.json +30 -0
  75. profilefoundry-1.0.0.data/data/data/reference/geo/PH_cities.json +43 -0
  76. profilefoundry-1.0.0.data/data/data/reference/geo/UK_cities.json +51 -0
  77. profilefoundry-1.0.0.data/data/data/reference/geo/US_cities.json +71 -0
  78. profilefoundry-1.0.0.data/data/data/reference/names/IN_given.json +29 -0
  79. profilefoundry-1.0.0.data/data/data/reference/names/IN_surnames.json +44 -0
  80. profilefoundry-1.0.0.data/data/data/reference/names/PH_given.json +29 -0
  81. profilefoundry-1.0.0.data/data/data/reference/names/PH_surnames.json +25 -0
  82. profilefoundry-1.0.0.data/data/share/profilefoundry/CITATION.cff +12 -0
  83. profilefoundry-1.0.0.dist-info/METADATA +193 -0
  84. profilefoundry-1.0.0.dist-info/RECORD +88 -0
  85. profilefoundry-1.0.0.dist-info/WHEEL +5 -0
  86. profilefoundry-1.0.0.dist-info/entry_points.txt +2 -0
  87. profilefoundry-1.0.0.dist-info/licenses/LICENSE +34 -0
  88. profilefoundry-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,15 @@
1
+ """ProfileFoundry — a generator + SDK for structured, internally-consistent,
2
+ linked, temporally coherent synthetic Person Objects.
3
+
4
+ The public surface area is intentionally tiny. See the project README for
5
+ worked examples.
6
+ """
7
+
8
+ __version__ = "1.0.0"
9
+
10
+ from profilefoundry.schema import ( # noqa: E402,F401
11
+ CURRENT_SCHEMA_VERSION,
12
+ Person,
13
+ )
14
+
15
+ __all__ = ["__version__", "CURRENT_SCHEMA_VERSION", "Person"]
profilefoundry/cli.py ADDED
@@ -0,0 +1,361 @@
1
+ """Tiny CLI for hand-driving the generator.
2
+
3
+ Examples::
4
+
5
+ profilefoundry person --locale US --seed 4321
6
+ profilefoundry scale --n 100 --locale US --seed 4321 --out /tmp/sample.jsonl
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import sys
13
+ from datetime import date
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ import click
18
+
19
+ # Auto-load .env at the repo root so secrets (HIBP_API_KEY, etc.) flow into
20
+ # any audit that needs them. Silent no-op if python-dotenv isn't installed.
21
+ try:
22
+ from dotenv import load_dotenv as _load_dotenv
23
+ _load_dotenv(Path(__file__).resolve().parents[2] / ".env")
24
+ except ImportError:
25
+ pass
26
+
27
+ from profilefoundry import __version__
28
+ from profilefoundry.generate.factory import make_person
29
+ from profilefoundry.schema import Locale
30
+
31
+ VALID_LOCALES = ("US", "UK", "CA", "AU", "NZ", "IE", "IN", "PH")
32
+
33
+
34
# Root click group: every subcommand in this module registers itself on it
# through ``@main.command(...)``. ``version_option`` wires ``--version`` to the
# package's ``__version__``. The function body is intentionally empty; the
# docstring is kept verbatim because click shows it as the CLI help text.
@click.group()
@click.version_option(__version__)
def main() -> None:
    """ProfileFoundry — synthetic Person Object generator."""
38
+
39
+
40
@main.command("person")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--profile-seq", type=int, default=1, show_default=True)
def person_cmd(locale: Locale, global_seed: int, profile_seq: int) -> None:
    """Generate one Person and print it as JSON to stdout."""
    # Build a single profile from the (locale, sequence, seed) triple, then
    # emit its pretty-printed JSON serialisation.
    person = make_person(
        locale=locale,
        profile_seq=profile_seq,
        global_seed=global_seed,
    )
    rendered = person.model_dump_json(indent=2)
    click.echo(rendered)
48
+
49
+
50
@main.command("scale")
@click.option("--n", "count", type=int, required=True, help="Number of profiles to generate.")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option(
    "--out",
    type=click.Path(dir_okay=False, writable=True, path_type=Path),
    default=None,
    help="Write JSONL output to this file (one Person per line). Default: stdout.",
)
def scale_cmd(count: int, locale: Locale, global_seed: int, out: Optional[Path]) -> None:
    """Generate ``--n`` profiles into a JSONL file or stdout."""
    # A Path is always truthy, so "was --out given" is just a None check.
    to_file = out is not None
    stream = out.open("w", encoding="utf-8") if to_file else sys.stdout
    try:
        # Profile sequence numbers are 1-based and run through ``count``.
        for profile_seq in range(1, count + 1):
            person = make_person(
                locale=locale,
                profile_seq=profile_seq,
                global_seed=global_seed,
            )
            stream.write(person.model_dump_json() + "\n")
    finally:
        # Only close what this command opened; stdout is left alone.
        if to_file:
            stream.close()
    if to_file:
        # The summary goes to stderr so it never mixes with JSONL on stdout.
        click.echo(f"wrote {count} profiles to {out}", err=True)
72
+
73
+
74
@main.command("household")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--seq", type=int, default=1, show_default=True, help="Household sequence number.")
def household_cmd(locale: Locale, global_seed: int, seq: int) -> None:
    """Generate one household (multiple linked Persons) and print as JSON."""
    # Imported lazily so the linkage package is only loaded by the commands
    # that use it.
    from profilefoundry.linkage.orchestrator import build_household

    bundle = build_household(locale=locale, household_seq=seq, global_seed=global_seed)

    household_id = bundle.spec.household_id
    composition_type = bundle.spec.composition_type
    # Round-trip each member through its own JSON dump so the payload holds
    # plain JSON-compatible values that ``json.dumps`` can serialise.
    members = []
    for member in bundle.persons:
        members.append(json.loads(member.model_dump_json()))

    payload = {
        "household_id": household_id,
        "composition_type": composition_type,
        "members": members,
    }
    click.echo(json.dumps(payload, indent=2))
89
+
90
+
91
@main.command("scale-households")
@click.option("--n", "count", type=int, required=True, help="Number of households to generate.")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--out", type=click.Path(dir_okay=False, writable=True, path_type=Path), default=None,
              help="Write JSONL output (one Person per line). Default: stdout.")
def scale_households_cmd(count: int, locale: Locale, global_seed: int, out: Optional[Path]) -> None:
    """Generate ``--n`` households (linked profiles) as JSONL."""
    from profilefoundry.linkage.orchestrator import iter_households

    # A Path is always truthy, so "was --out given" is just a None check.
    to_file = out is not None
    stream = out.open("w", encoding="utf-8") if to_file else sys.stdout
    written = 0
    try:
        # Households are flattened: each member becomes its own JSONL line.
        for household in iter_households(locale, count, global_seed):
            for member in household.persons:
                stream.write(member.model_dump_json() + "\n")
                written += 1
    finally:
        # Only close what this command opened; stdout is left alone.
        if to_file:
            stream.close()
    if to_file:
        # The summary goes to stderr so it never mixes with JSONL on stdout.
        click.echo(f"wrote {written} persons in {count} households to {out}", err=True)
113
+
114
+
115
@main.command("validate")
@click.option("--n", "count", type=int, default=500, show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--locales", default="US,UK,IN,CA,AU", show_default=True,
              help="Comma-separated locales to validate.")
@click.option(
    "--hibp/--skip-hibp",
    "use_hibp",
    default=False,
    show_default=True,
    help="Run live HIBP breached-account checks. Default validate mode is syntax-only.",
)
@click.option(
    "--hibp-max-emails",
    type=click.IntRange(min=1),
    default=100,
    show_default=True,
    help="Maximum unique emails to send to HIBP when --hibp is enabled.",
)
def validate_cmd(count: int, global_seed: int, locales: str, use_hibp: bool, hibp_max_emails: int) -> None:
    """Run Phase 5 validation suite (KS gaps, leakage, consistency)."""
    # For each requested locale this generates ``count`` profiles and prints:
    # the KS statistics from ``validate_locale``, the consistency pass rate
    # (plus the three most frequent failures), the self-collision count, the
    # wikidata leak count and the email check result.
    from profilefoundry.validate.consistency import aggregate_consistency
    from profilefoundry.validate.distributional import validate_locale
    from profilefoundry.validate.leakage import (
        email_kanonymity_check,
        self_collision_audit,
        wikidata_leakage_scan,
    )

    today = date.today()
    # Tolerate stray whitespace and empty entries ("US, UK," -> ["US", "UK"]).
    locale_list = [s.strip() for s in locales.split(",") if s.strip()]

    click.echo(f"Validating n={count}/locale across {locale_list}\n")
    for loc in locale_list:
        click.echo(f"--- {loc} ---")
        persons = [make_person(locale=loc, profile_seq=i, global_seed=global_seed) for i in range(1, count + 1)]
        ks = validate_locale(persons, loc, today=today)  # type: ignore[arg-type]

        # Each KS statistic may be None. The male and female age statistics are
        # checked independently so a missing female value prints "n/a" instead
        # of raising TypeError inside the ``:.3f`` format spec.
        if ks.age_ks_male is None:
            click.echo(" age_ks n/a")
        else:
            female = "n/a" if ks.age_ks_female is None else f"{ks.age_ks_female:.3f}"
            click.echo(f" age_ks male={ks.age_ks_male:.3f} female={female}")
        click.echo(f" edu_ks {ks.education_ks:.3f}" if ks.education_ks is not None else " edu_ks n/a")
        click.echo(f" marital_ks {ks.marital_ks:.3f}" if ks.marital_ks is not None else " marital_ks n/a")

        cr = aggregate_consistency(persons, today=today)
        click.echo(f" consistency {cr.n_pass}/{cr.n_total} ({100*cr.rate:.2f}%)")
        if cr.failures:
            # Three most frequent failure kinds, highest count first.
            top = sorted(cr.failures.items(), key=lambda kv: -kv[1])[:3]
            click.echo(f" top failures: {top}")

        scol = self_collision_audit(persons)
        click.echo(f" self-collisions {scol.n_collisions}")
        wd = wikidata_leakage_scan(persons)
        click.echo(f" wikidata leaks {wd.n_leaks}")
        em = email_kanonymity_check(
            persons,
            use_hibp=use_hibp,
            # The email cap is only passed when live HIBP checks are on.
            max_emails=hibp_max_emails if use_hibp else None,
        )
        if use_hibp:
            click.echo(
                " email leaks "
                f"{em.n_leaks} (mode={em.metadata.get('mode')}, "
                f"checked={em.metadata.get('n_checked_emails', 0)})"
            )
        else:
            click.echo(f" email-syntax bad {em.n_leaks}")
        click.echo("")
181
+
182
+
183
@main.command("export")
@click.option("--out", type=click.Path(file_okay=False, writable=True, path_type=Path), required=True,
              help="Output directory.")
@click.option("--n-per-locale", "n_per_locale", type=int, default=1000, show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--locales", default="US,UK,IN,CA,AU", show_default=True)
@click.option("--generation-date", type=click.DateTime(formats=["%Y-%m-%d"]), default=None,
              help="Pin generated profile dates as YYYY-MM-DD.")
@click.option("--exported-at", default=None,
              help="Pin release metadata timestamp as YYYY-MM-DD or ISO-8601 for byte-stable artifacts.")
@click.option("--skip-hibp", is_flag=True,
              help="Skip live HIBP email checks even when HIBP_API_KEY is configured.")
@click.option("--household-mode/--profile-mode", default=True, show_default=True,
              help="--household-mode generates linked households (Phase 3+).")
def export_cmd(
    out: Path,
    n_per_locale: int,
    global_seed: int,
    locales: str,
    generation_date,
    exported_at: str | None,
    skip_hibp: bool,
    household_mode: bool,
) -> None:
    """Export a multi-locale dataset (JSONL + Parquet + dataset card)."""
    # Pipeline, in order:
    #   1. generate profiles for every requested locale (household or
    #      standalone mode),
    #   2. compute summary stats,
    #   3. write profiles.jsonl plus one Parquet file per table,
    #   4. run per-locale validation and dataset-wide leakage checks,
    #   5. write the dataset card and the manifest.
    # ``generation_date`` is produced by ``click.DateTime`` and is either None
    # or an object whose ``.date()`` is used below.
    import time

    from profilefoundry.io.hf_export import (
        ExportStats,
        normalize_exported_at,
        write_addresses_parquet,
        write_dataset_card,
        write_education_parquet,
        write_employers_parquet,
        write_employment_parquet,
        write_events_parquet,
        write_flat_parquet,
        write_health_allergies_parquet,
        write_households_parquet,
        write_jsonl,
        write_manifest,
        write_person_objects_parquet,
        write_relationships_parquet,
        write_social_handles_parquet,
    )
    from profilefoundry.linkage.orchestrator import iter_households_for_profile_target
    from profilefoundry.validate.consistency import aggregate_consistency
    from profilefoundry.validate.distributional import validate_locale
    from profilefoundry.validate.leakage import (
        email_kanonymity_check,
        self_collision_audit,
        wikidata_leakage_scan,
    )

    out.mkdir(parents=True, exist_ok=True)
    # Tolerate stray whitespace and empty entries in the comma-separated list.
    loc_list = [s.strip() for s in locales.split(",") if s.strip()]
    # A pinned --generation-date overrides the current date.
    today = generation_date.date() if generation_date else date.today()
    stats = ExportStats(locales={})
    t0 = time.time()

    all_persons = []
    for loc in loc_list:
        click.echo(f"Generating {loc} (~{n_per_locale} profiles)...", err=True)
        loc_persons = []
        if household_mode:
            # Household mode: collect every member of each generated bundle.
            # NOTE(review): the "~" in the progress message suggests the
            # resulting count only approximates n_per_locale — confirm against
            # iter_households_for_profile_target.
            for b in iter_households_for_profile_target(loc, n_per_locale, global_seed, today=today):  # type: ignore[arg-type]
                loc_persons.extend(b.persons)
        else:
            # Profile mode: exactly n_per_locale standalone profiles, seq 1..n.
            for seq in range(1, n_per_locale + 1):
                loc_persons.append(make_person(locale=loc, profile_seq=seq, global_seed=global_seed, today=today))  # type: ignore[arg-type]
        all_persons.extend(loc_persons)
        stats.locales[loc] = len(loc_persons)

    # Summary counts derived from the in-memory Person objects.
    stats.n_profiles = len(all_persons)
    stats.n_events = sum(len(p.events) for p in all_persons)
    stats.n_addresses = sum(len(p.addresses) for p in all_persons)
    stats.n_employment_records = sum(len(p.employment) for p in all_persons)
    stats.n_education_records = sum(len(p.education) for p in all_persons)
    stats.n_social_handles = sum(len(p.contact.social_handles) for p in all_persons)
    stats.n_health_allergies = sum(len(p.health.allergies) for p in all_persons)
    # Distinct ids, counted via set comprehensions.
    stats.n_households = len({p.household_id for p in all_persons})
    stats.n_employers = len({e.employer_id for p in all_persons for e in p.employment})
    # Captured before any file is written, so this covers generation and the
    # stat computation above, not the writes or validation that follow.
    stats.duration_seconds = time.time() - t0
    stats.generation_date = today.isoformat()
    stats.exported_at = normalize_exported_at(exported_at)

    click.echo(f"Writing {stats.n_profiles} profiles → {out}", err=True)
    # Each writer's return value is recorded under its output file name and
    # later handed to ``write_manifest``.
    row_counts: dict[str, int] = {}
    row_counts["profiles.jsonl"] = write_jsonl(all_persons, out / "profiles.jsonl")
    row_counts["person_objects.parquet"] = write_person_objects_parquet(all_persons, out / "person_objects.parquet")
    row_counts["profiles.parquet"] = write_flat_parquet(all_persons, out / "profiles.parquet")
    row_counts["addresses.parquet"] = write_addresses_parquet(all_persons, out / "addresses.parquet")
    row_counts["employment.parquet"] = write_employment_parquet(all_persons, out / "employment.parquet")
    row_counts["education.parquet"] = write_education_parquet(all_persons, out / "education.parquet")
    row_counts["social_handles.parquet"] = write_social_handles_parquet(all_persons, out / "social_handles.parquet")
    row_counts["health_allergies.parquet"] = write_health_allergies_parquet(all_persons, out / "health_allergies.parquet")
    row_counts["events.parquet"] = write_events_parquet(all_persons, out / "events.parquet")
    row_counts["households.parquet"] = write_households_parquet(all_persons, out / "households.parquet")
    row_counts["employers.parquet"] = write_employers_parquet(all_persons, out / "employers.parquet")
    row_counts["relationships.parquet"] = write_relationships_parquet(all_persons, out / "relationships.parquet")
    # These stats are overwritten with the values the writers returned,
    # replacing the in-memory estimates computed above; n_relationships is
    # only available from its writer.
    stats.n_addresses = row_counts["addresses.parquet"]
    stats.n_social_handles = row_counts["social_handles.parquet"]
    stats.n_health_allergies = row_counts["health_allergies.parquet"]
    stats.n_relationships = row_counts["relationships.parquet"]

    # Per-locale validation snapshot for the dataset card.
    validation: dict = {}
    for loc in loc_list:
        loc_ps = [p for p in all_persons if p.locale == loc]
        if not loc_ps:
            # Nothing was generated for this locale; omit it from the card.
            continue
        ks = validate_locale(loc_ps, loc, today=today)  # type: ignore[arg-type]
        cr = aggregate_consistency(loc_ps, today=today)
        validation[loc] = {
            "age_ks_male": ks.age_ks_male,
            "age_ks_female": ks.age_ks_female,
            "education_ks": ks.education_ks,
            "marital_ks": ks.marital_ks,
            "consistency_rate": cr.rate,
        }

    # Leakage summary for the dataset card.
    leakage: dict = {}
    sc = self_collision_audit(all_persons)
    leakage["self_collision"] = {
        "n_profiles": sc.n_profiles,
        "n_collisions": sc.n_collisions,
        "mode": "exact",
    }
    wk = wikidata_leakage_scan(all_persons)
    leakage["wikidata"] = {
        "n_profiles": wk.n_profiles,
        "n_leaks": wk.n_leaks,
        "mode": wk.metadata.get("mode", "demo"),
    }
    # --skip-hibp forces use_hibp=False; otherwise None is passed.
    # NOTE(review): presumably None lets email_kanonymity_check decide from
    # its own configuration (e.g. HIBP_API_KEY) — confirm in validate/leakage.
    em = email_kanonymity_check(all_persons, use_hibp=False if skip_hibp else None)
    leakage["email"] = {
        "n_profiles": em.n_profiles,
        "n_leaks": em.n_leaks,
        "mode": em.metadata.get("mode", "syntax_only"),
    }

    write_dataset_card(out / "dataset_card.md", stats, validation, leakage)
    write_manifest(out, stats, row_counts)
    click.echo(f"Done ({stats.duration_seconds:.1f}s). Files written to {out}/", err=True)
328
+
329
+
330
@main.command("scale-smoke")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--sizes", default="1000,10000", show_default=True,
              help="Comma-separated profile counts to time.")
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
def scale_smoke_cmd(locale: Locale, sizes: str, global_seed: int) -> None:
    """Time the generator at several scale points."""
    # For every size in ``--sizes`` this generates that many profiles (the
    # results are discarded) and prints the elapsed time and throughput.
    import time

    # Skip empty entries (e.g. a trailing comma) the same way the other
    # commands parse ``--locales``, and report non-integers as a usage error
    # rather than a raw ValueError traceback.
    tokens = [tok.strip() for tok in sizes.split(",")]
    try:
        size_list = [int(tok) for tok in tokens if tok]
    except ValueError as exc:
        raise click.BadParameter(
            f"expected comma-separated integers, got {sizes!r}",
            param_hint="--sizes",
        ) from exc

    for n in size_list:
        # perf_counter is monotonic, so the measurement cannot go negative or
        # jump when the system clock is adjusted mid-run.
        t0 = time.perf_counter()
        for seq in range(1, n + 1):
            make_person(locale=locale, profile_seq=seq, global_seed=global_seed)
        dt = time.perf_counter() - t0
        # Guard against a zero interval (e.g. n == 0) before dividing.
        rate = n / dt if dt > 0 else 0
        click.echo(f" n={n:>8d} {dt:6.2f}s ({rate:.0f} profiles/s)")
346
+
347
+
348
@main.command("verify")
def verify_cmd() -> None:
    """Run a quick self-check: generate one profile per locale."""
    # Stops at the first locale that fails: the error goes to stderr and the
    # process exits with status 1.
    for code in VALID_LOCALES:
        try:
            person = make_person(locale=code, profile_seq=1, global_seed=4321)
            summary = (
                f"OK {code:>3} {person.profile_id} "
                f"{person.identity.given_name} {person.identity.family_name}"
            )
            click.echo(summary)
        except Exception as exc:  # noqa: BLE001
            click.echo(f"FAIL {code:>3} {exc}", err=True)
            sys.exit(1)
358
+
359
+
360
+ if __name__ == "__main__":
361
+ main()
File without changes
@@ -0,0 +1,80 @@
1
+ """Reference-data loader.
2
+
3
+ Resolves a (locale, table-name) pair into a parsed Python object. Prefers
4
+ ``data/reference/derived/<locale>/<table>.parquet`` (live-ingested) over
5
+ ``data/reference/bootstrap/<locale>/<table>.json`` (committed minimum).
6
+
7
+ The loader is intentionally cache-aware: a single Python process never reads
8
+ the same file twice.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import functools
14
+ import json
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from profilefoundry.data.paths import reference_root
19
+
20
+ REF_ROOT = reference_root()
21
+ BOOTSTRAP_ROOT = REF_ROOT / "bootstrap"
22
+ DERIVED_ROOT = REF_ROOT / "derived"
23
+
24
+ LOCALE_CODES = ("US", "UK", "CA", "AU", "NZ", "IE", "IN", "PH")
25
+
26
+
27
class ReferenceDataMissing(FileNotFoundError):
    """Raised when neither bootstrap nor derived data is available.

    Subclasses ``FileNotFoundError``, so callers that already handle
    ``FileNotFoundError`` / ``OSError`` also catch this.
    """
29
+
30
+
31
def locale_dir(locale: str, *, tier: str) -> Path:
    """Return the reference-data directory for ``locale`` within ``tier``.

    ``tier="bootstrap"`` selects the bootstrap tree; any other value selects
    the derived tree. The directory name is the lower-cased locale code.

    Raises:
        ValueError: if ``locale`` is not one of ``LOCALE_CODES``.
    """
    if locale in LOCALE_CODES:
        base = DERIVED_ROOT if tier != "bootstrap" else BOOTSTRAP_ROOT
        return base / locale.lower()
    raise ValueError(f"Unknown locale: {locale!r}; expected one of {LOCALE_CODES}")
36
+
37
+
38
@functools.lru_cache(maxsize=128)
def _read_bootstrap_envelope(boot_path: Path) -> dict[str, Any]:
    """Parse one bootstrap JSON envelope, at most once per path per process.

    Shared by :func:`load_table` and :func:`load_source_metadata` so that
    asking for both the ``data`` and the ``source`` of a table reads and
    parses the file a single time. Failures are not cached (``lru_cache``
    only stores return values), so a file that appears later is picked up.
    """
    return json.loads(boot_path.read_text(encoding="utf-8"))


@functools.lru_cache(maxsize=128)
def load_table(locale: str, table: str) -> dict[str, Any]:
    """Load a reference table for a locale.

    Resolution order:
    1. ``derived/<locale>/<table>.parquet`` (not implemented yet; will
       wrap pandas.read_parquet when richer ingestion lands).
    2. ``bootstrap/<locale>/<table>.json``.

    Returns the parsed ``data`` payload of the bootstrap envelope (an empty
    dict when the envelope has no ``data`` key). Source metadata can be
    retrieved via :func:`load_source_metadata`.

    The returned object is cached and shared between callers; treat it as
    read-only. To force a re-read from disk, clear this function's cache and
    ``_read_bootstrap_envelope``'s cache.

    Raises:
        ValueError: if ``locale`` is unknown (from :func:`locale_dir`).
        ReferenceDataMissing: if the bootstrap file does not exist.
    """
    boot_path = locale_dir(locale, tier="bootstrap") / f"{table}.json"
    if boot_path.exists():
        return _read_bootstrap_envelope(boot_path).get("data", {})

    raise ReferenceDataMissing(
        f"No reference data for locale={locale!r}, table={table!r}. "
        f"Looked in {boot_path}. Run scripts/ingest_{locale.lower()}.py "
        f"to populate live data, or add a bootstrap file."
    )


@functools.lru_cache(maxsize=128)
def load_source_metadata(locale: str, table: str) -> dict[str, Any]:
    """Return the ``source`` section of a table's bootstrap envelope.

    Yields an empty dict when the envelope has no ``source`` key. The result
    is cached and shared between callers; treat it as read-only.

    Raises:
        ValueError: if ``locale`` is unknown (from :func:`locale_dir`).
        ReferenceDataMissing: if the bootstrap file does not exist.
    """
    boot_path = locale_dir(locale, tier="bootstrap") / f"{table}.json"
    if not boot_path.exists():
        raise ReferenceDataMissing(f"No bootstrap file at {boot_path}")
    return _read_bootstrap_envelope(boot_path).get("source", {})
70
+
71
+
72
def available_tables(locale: str) -> list[str]:
    """List the bootstrap table names available for ``locale``, sorted.

    A table name is the stem of a ``*.json`` file directly inside the
    locale's bootstrap directory. Returns an empty list when that directory
    does not exist.
    """
    directory = locale_dir(locale, tier="bootstrap")
    if directory.is_dir():
        names = [entry.stem for entry in directory.glob("*.json")]
        names.sort()
        return names
    return []
77
+
78
+
79
def all_locales_with_data() -> dict[str, list[str]]:
    """Map every code in ``LOCALE_CODES`` to its available bootstrap tables."""
    inventory: dict[str, list[str]] = {}
    for code in LOCALE_CODES:
        inventory[code] = available_tables(code)
    return inventory
@@ -0,0 +1,54 @@
1
+ """Reference data manifest hash (PF-053).
2
+
3
+ Computes a deterministic hash of all bootstrap reference files used by the
4
+ generator. Stamped into ``Person.generation.reference_manifest_hash`` so a
5
+ re-run with the same global_seed but a different reference-data version is
6
+ detectable.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import functools
12
+ import hashlib
13
+ from pathlib import Path
14
+
15
+ from profilefoundry.data.paths import reference_root
16
+
17
+ REF_ROOT = reference_root()
18
+
19
+
20
+ # Subtrees that influence generation output. Audit-only artifacts (leakage
21
+ # bloom, ingestion cache) live elsewhere under data/reference/ and are
22
+ # deliberately excluded so the hash stays stable across leakage ingests.
23
+ _HASHED_SUBTREES = ("bootstrap", "names", "geo")
24
+
25
+
26
@functools.lru_cache(maxsize=1)
def reference_manifest_hash() -> str:
    """blake2b digest of every committed reference-data file the generator reads.

    Returns a 16-char hex digest derived from the sorted list of
    (relative_path, file_bytes) tuples under ``data/reference/{bootstrap,
    names, geo}/``. Audit-only data (``leakage/``, ``cache/``, ``derived/``)
    is excluded so re-running the leakage ingest does not change the hash.
    Cached for the lifetime of the process.

    Files are ordered by the components of their path relative to the
    reference root, compared as plain case-sensitive strings. Sorting the
    ``Path`` objects themselves would follow the platform's case-folding
    rules (case-insensitive on Windows), letting the same data hash
    differently on different operating systems. On POSIX this ordering is
    the same as before, so existing hashes there are unchanged.

    When the reference root does not exist, the digest of empty input is
    returned. A file that cannot be read contributes a fixed placeholder
    instead of aborting the hash.
    """
    h = hashlib.blake2b(digest_size=8)
    if not REF_ROOT.exists():
        return h.hexdigest()

    # (relative path components, absolute path) for every hashed file.
    entries: list[tuple[tuple[str, ...], Path]] = []
    for sub in _HASHED_SUBTREES:
        root = REF_ROOT / sub
        if not root.exists():
            continue
        entries.extend(
            (p.relative_to(REF_ROOT).parts, p) for p in root.rglob("*") if p.is_file()
        )

    # Sort on the string components only, never on the Path objects.
    for _parts, p in sorted(entries, key=lambda entry: entry[0]):
        rel = p.relative_to(REF_ROOT).as_posix()
        h.update(rel.encode("utf-8"))
        # Separators keep (name, content) boundaries unambiguous in the stream.
        h.update(b"\x00")
        try:
            h.update(p.read_bytes())
        except OSError:
            h.update(b"<unreadable>")
        h.update(b"\x01")
    return h.hexdigest()
@@ -0,0 +1,33 @@
1
+ """Runtime paths for ProfileFoundry reference data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
REFERENCE_ROOT_ENV = "PROFILEFOUNDRY_REFERENCE_ROOT"


def reference_root() -> Path:
    """Locate the reference-data root for both source and installed runs.

    A non-empty ``PROFILEFOUNDRY_REFERENCE_ROOT`` wins and is returned with
    ``~`` expanded and the path resolved. Otherwise the first existing
    ``data/reference`` directory is returned, searching in this order: the
    interpreter prefix (installed wheel), two levels above this module
    (``pip --target`` install), three levels above this module (source
    checkout). When none exists, the source-checkout location is returned
    anyway.
    """
    override = os.environ.get(REFERENCE_ROOT_ENV)
    if override:
        return Path(override).expanduser().resolve()

    here = Path(__file__).resolve()
    bases = (
        Path(sys.prefix),  # installed wheel
        here.parents[2],  # pip --target install
        here.parents[3],  # source checkout
    )
    roots = [base / "data" / "reference" for base in bases]
    # First location that exists, falling back to the last candidate.
    return next((root for root in roots if root.exists()), roots[-1])
File without changes
File without changes
File without changes