profilefoundry 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- profilefoundry/__init__.py +15 -0
- profilefoundry/cli.py +361 -0
- profilefoundry/data/__init__.py +0 -0
- profilefoundry/data/loader.py +80 -0
- profilefoundry/data/manifest_hash.py +54 -0
- profilefoundry/data/paths.py +33 -0
- profilefoundry/diversity/__init__.py +0 -0
- profilefoundry/documents/__init__.py +0 -0
- profilefoundry/generate/__init__.py +0 -0
- profilefoundry/generate/address.py +659 -0
- profilefoundry/generate/backfill.py +810 -0
- profilefoundry/generate/contact_ids.py +1314 -0
- profilefoundry/generate/education_status.py +244 -0
- profilefoundry/generate/employer_names.py +180 -0
- profilefoundry/generate/factory.py +1178 -0
- profilefoundry/generate/finance_health.py +525 -0
- profilefoundry/generate/identity.py +316 -0
- profilefoundry/generate/locales/__init__.py +201 -0
- profilefoundry/generate/occupation.py +254 -0
- profilefoundry/generate/sampling.py +70 -0
- profilefoundry/generate/seeding.py +38 -0
- profilefoundry/io/__init__.py +0 -0
- profilefoundry/io/hf_export.py +1454 -0
- profilefoundry/linkage/__init__.py +0 -0
- profilefoundry/linkage/employers.py +361 -0
- profilefoundry/linkage/families.py +176 -0
- profilefoundry/linkage/households.py +205 -0
- profilefoundry/linkage/orchestrator.py +476 -0
- profilefoundry/load/__init__.py +0 -0
- profilefoundry/schema/__init__.py +78 -0
- profilefoundry/schema/v0_1.py +457 -0
- profilefoundry/validate/__init__.py +0 -0
- profilefoundry/validate/consistency.py +105 -0
- profilefoundry/validate/distributional.py +213 -0
- profilefoundry/validate/leakage.py +376 -0
- profilefoundry/validate/replay.py +130 -0
- profilefoundry-1.0.0.data/data/data/reference/MANIFEST.md +139 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/INDEX.json +17 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/SCHEMA.md +160 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/education.json +14 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/au/marital.json +32 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/education.json +14 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ca/marital.json +32 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ie/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ie/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/education.json +33 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/in/marital.json +37 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/nz/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/nz/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ph/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/ph/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/education.json +14 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/marital.json +32 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/uk/race_ethnicity.json +24 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/age_sex.json +15 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/education.json +14 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/household.json +22 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/income.json +38 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/marital.json +32 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/race_ethnicity.json +23 -0
- profilefoundry-1.0.0.data/data/data/reference/bootstrap/us/regions.json +65 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/AU_cities.json +36 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/CA_cities.json +43 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/IE_cities.json +30 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/IN_cities.json +71 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/NZ_cities.json +30 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/PH_cities.json +43 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/UK_cities.json +51 -0
- profilefoundry-1.0.0.data/data/data/reference/geo/US_cities.json +71 -0
- profilefoundry-1.0.0.data/data/data/reference/names/IN_given.json +29 -0
- profilefoundry-1.0.0.data/data/data/reference/names/IN_surnames.json +44 -0
- profilefoundry-1.0.0.data/data/data/reference/names/PH_given.json +29 -0
- profilefoundry-1.0.0.data/data/data/reference/names/PH_surnames.json +25 -0
- profilefoundry-1.0.0.data/data/share/profilefoundry/CITATION.cff +12 -0
- profilefoundry-1.0.0.dist-info/METADATA +193 -0
- profilefoundry-1.0.0.dist-info/RECORD +88 -0
- profilefoundry-1.0.0.dist-info/WHEEL +5 -0
- profilefoundry-1.0.0.dist-info/entry_points.txt +2 -0
- profilefoundry-1.0.0.dist-info/licenses/LICENSE +34 -0
- profilefoundry-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""ProfileFoundry — a generator + SDK for structured, internally-consistent,
|
|
2
|
+
linked, temporally coherent synthetic Person Objects.
|
|
3
|
+
|
|
4
|
+
The public surface area is intentionally tiny. See the project README for
|
|
5
|
+
worked examples.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "1.0.0"
|
|
9
|
+
|
|
10
|
+
from profilefoundry.schema import ( # noqa: E402,F401
|
|
11
|
+
CURRENT_SCHEMA_VERSION,
|
|
12
|
+
Person,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
__all__ = ["__version__", "CURRENT_SCHEMA_VERSION", "Person"]
|
profilefoundry/cli.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
"""Tiny CLI for hand-driving the generator.
|
|
2
|
+
|
|
3
|
+
Examples::
|
|
4
|
+
|
|
5
|
+
profilefoundry person --locale US --seed 4321
|
|
6
|
+
profilefoundry scale --n 100 --locale US --seed 4321 --out /tmp/sample.jsonl
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from datetime import date
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
import click
|
|
18
|
+
|
|
19
|
+
# Auto-load .env at the repo root so secrets (HIBP_API_KEY, etc.) flow into
# any audit that needs them. Silent no-op if python-dotenv isn't installed.
# NOTE(review): parents[2] is two levels above the package directory. For a
# wheel installed into site-packages that is the directory *containing*
# site-packages, not a repo root — confirm that is acceptable.
try:
    from dotenv import load_dotenv as _load_dotenv
    _load_dotenv(Path(__file__).resolve().parents[2] / ".env")
except ImportError:
    # python-dotenv is optional; without it the environment is used as-is.
    pass
|
|
26
|
+
|
|
27
|
+
from profilefoundry import __version__
|
|
28
|
+
from profilefoundry.generate.factory import make_person
|
|
29
|
+
from profilefoundry.schema import Locale
|
|
30
|
+
|
|
31
|
+
VALID_LOCALES = ("US", "UK", "CA", "AU", "NZ", "IE", "IN", "PH")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Root click group: every sub-command below registers itself on ``main``.
# The docstring doubles as the ``--help`` text, and ``--version`` prints
# the package ``__version__``.
@click.group()
@click.version_option(__version__)
def main() -> None:
    """ProfileFoundry — synthetic Person Object generator."""
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@main.command("person")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--profile-seq", type=int, default=1, show_default=True)
def person_cmd(locale: Locale, global_seed: int, profile_seq: int) -> None:
    """Generate one Person and print it as JSON to stdout."""
    # Build the single profile identified by (locale, profile_seq, global_seed).
    person = make_person(
        locale=locale,
        profile_seq=profile_seq,
        global_seed=global_seed,
    )
    # Pretty-print with a two-space indent; this command is meant for humans.
    rendered = person.model_dump_json(indent=2)
    click.echo(rendered)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@main.command("scale")
@click.option("--n", "count", type=int, required=True, help="Number of profiles to generate.")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option(
    "--out",
    type=click.Path(dir_okay=False, writable=True, path_type=Path),
    default=None,
    help="Write JSONL output to this file (one Person per line). Default: stdout.",
)
def scale_cmd(count: int, locale: Locale, global_seed: int, out: Optional[Path]) -> None:
    """Generate ``--n`` profiles into a JSONL file or stdout."""
    # Decide once whether we own a file handle that must be closed afterwards.
    to_file = bool(out)
    stream = out.open("w", encoding="utf-8") if to_file else sys.stdout
    try:
        # Profile sequence numbers are 1-based and contiguous.
        for profile_seq in range(1, count + 1):
            person = make_person(locale=locale, profile_seq=profile_seq, global_seed=global_seed)
            stream.write(person.model_dump_json() + "\n")
    finally:
        # Never close stdout; only close the file we opened ourselves.
        if to_file:
            stream.close()
    if to_file:
        # Status goes to stderr so it cannot pollute piped JSONL.
        click.echo(f"wrote {count} profiles to {out}", err=True)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@main.command("household")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--seq", type=int, default=1, show_default=True, help="Household sequence number.")
def household_cmd(locale: Locale, global_seed: int, seq: int) -> None:
    """Generate one household (multiple linked Persons) and print as JSON."""
    # Imported lazily so the linkage package is only loaded when this command runs.
    from profilefoundry.linkage.orchestrator import build_household

    household = build_household(locale=locale, household_seq=seq, global_seed=global_seed)

    payload = {}
    payload["household_id"] = household.spec.household_id
    payload["composition_type"] = household.spec.composition_type
    # Round-trip each member through its JSON form so the payload holds only
    # plain JSON-compatible values before the final json.dumps.
    members = []
    for member in household.persons:
        members.append(json.loads(member.model_dump_json()))
    payload["members"] = members

    click.echo(json.dumps(payload, indent=2))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@main.command("scale-households")
@click.option("--n", "count", type=int, required=True, help="Number of households to generate.")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option(
    "--out",
    type=click.Path(dir_okay=False, writable=True, path_type=Path),
    default=None,
    help="Write JSONL output (one Person per line). Default: stdout.",
)
def scale_households_cmd(count: int, locale: Locale, global_seed: int, out: Optional[Path]) -> None:
    """Generate ``--n`` households (linked profiles) as JSONL."""
    # Imported lazily so the linkage package is only loaded when this command runs.
    from profilefoundry.linkage.orchestrator import iter_households

    to_file = bool(out)
    stream = out.open("w", encoding="utf-8") if to_file else sys.stdout
    written = 0
    try:
        for household in iter_households(locale, count, global_seed):
            # Households are flattened: one JSON line per member.
            for member in household.persons:
                stream.write(member.model_dump_json() + "\n")
                written += 1
    finally:
        # Only close a handle we opened; stdout stays open.
        if to_file:
            stream.close()
    if to_file:
        click.echo(f"wrote {written} persons in {count} households to {out}", err=True)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _format_ks(value: Optional[float]) -> str:
    """Render a KS statistic with three decimals, or ``n/a`` when it is ``None``."""
    return "n/a" if value is None else f"{value:.3f}"


@main.command("validate")
@click.option("--n", "count", type=int, default=500, show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--locales", default="US,UK,IN,CA,AU", show_default=True,
              help="Comma-separated locales to validate.")
@click.option(
    "--hibp/--skip-hibp",
    "use_hibp",
    default=False,
    show_default=True,
    help="Run live HIBP breached-account checks. Default validate mode is syntax-only.",
)
@click.option(
    "--hibp-max-emails",
    type=click.IntRange(min=1),
    default=100,
    show_default=True,
    help="Maximum unique emails to send to HIBP when --hibp is enabled.",
)
def validate_cmd(count: int, global_seed: int, locales: str, use_hibp: bool, hibp_max_emails: int) -> None:
    """Run Phase 5 validation suite (KS gaps, leakage, consistency)."""
    # Validation modules are imported lazily so other commands stay light.
    from profilefoundry.validate.consistency import aggregate_consistency
    from profilefoundry.validate.distributional import validate_locale
    from profilefoundry.validate.leakage import (
        email_kanonymity_check,
        self_collision_audit,
        wikidata_leakage_scan,
    )

    # One reference date for the whole run so every check sees the same "today".
    today = date.today()
    # Tolerate stray whitespace and empty entries such as a trailing comma.
    locale_list = [s.strip() for s in locales.split(",") if s.strip()]

    click.echo(f"Validating n={count}/locale across {locale_list}\n")
    for loc in locale_list:
        click.echo(f"--- {loc} ---")
        persons = [make_person(locale=loc, profile_seq=i, global_seed=global_seed) for i in range(1, count + 1)]

        # Distributional checks. Each statistic may be None, so every value is
        # formatted independently; previously the female value was formatted
        # after checking only the male one, which raised TypeError when it
        # alone was missing.
        ks = validate_locale(persons, loc, today=today)  # type: ignore[arg-type]
        if ks.age_ks_male is None and ks.age_ks_female is None:
            click.echo(" age_ks n/a")
        else:
            click.echo(f" age_ks male={_format_ks(ks.age_ks_male)} female={_format_ks(ks.age_ks_female)}")
        click.echo(f" edu_ks {_format_ks(ks.education_ks)}")
        click.echo(f" marital_ks {_format_ks(ks.marital_ks)}")

        # Internal-consistency pass rate plus the three most frequent failures.
        cr = aggregate_consistency(persons, today=today)
        click.echo(f" consistency {cr.n_pass}/{cr.n_total} ({100*cr.rate:.2f}%)")
        if cr.failures:
            top = sorted(cr.failures.items(), key=lambda kv: -kv[1])[:3]
            click.echo(f" top failures: {top}")

        # Leakage audits.
        scol = self_collision_audit(persons)
        click.echo(f" self-collisions {scol.n_collisions}")
        wd = wikidata_leakage_scan(persons)
        click.echo(f" wikidata leaks {wd.n_leaks}")
        em = email_kanonymity_check(
            persons,
            use_hibp=use_hibp,
            # The cap only applies to live HIBP lookups.
            max_emails=hibp_max_emails if use_hibp else None,
        )
        if use_hibp:
            click.echo(
                " email leaks "
                f"{em.n_leaks} (mode={em.metadata.get('mode')}, "
                f"checked={em.metadata.get('n_checked_emails', 0)})"
            )
        else:
            click.echo(f" email-syntax bad {em.n_leaks}")
        click.echo("")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@main.command("export")
@click.option("--out", type=click.Path(file_okay=False, writable=True, path_type=Path), required=True,
              help="Output directory.")
@click.option("--n-per-locale", "n_per_locale", type=int, default=1000, show_default=True)
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
@click.option("--locales", default="US,UK,IN,CA,AU", show_default=True)
@click.option("--generation-date", type=click.DateTime(formats=["%Y-%m-%d"]), default=None,
              help="Pin generated profile dates as YYYY-MM-DD.")
@click.option("--exported-at", default=None,
              help="Pin release metadata timestamp as YYYY-MM-DD or ISO-8601 for byte-stable artifacts.")
@click.option("--skip-hibp", is_flag=True,
              help="Skip live HIBP email checks even when HIBP_API_KEY is configured.")
@click.option("--household-mode/--profile-mode", default=True, show_default=True,
              help="--household-mode generates linked households (Phase 3+).")
def export_cmd(
    out: Path,
    n_per_locale: int,
    global_seed: int,
    locales: str,
    generation_date,  # datetime parsed by click.DateTime, or None when not pinned
    exported_at: str | None,
    skip_hibp: bool,
    household_mode: bool,
) -> None:
    """Export a multi-locale dataset (JSONL + Parquet + dataset card)."""
    import time

    from profilefoundry.io.hf_export import (
        ExportStats,
        normalize_exported_at,
        write_addresses_parquet,
        write_dataset_card,
        write_education_parquet,
        write_employers_parquet,
        write_employment_parquet,
        write_events_parquet,
        write_flat_parquet,
        write_health_allergies_parquet,
        write_households_parquet,
        write_jsonl,
        write_manifest,
        write_person_objects_parquet,
        write_relationships_parquet,
        write_social_handles_parquet,
    )
    from profilefoundry.linkage.orchestrator import iter_households_for_profile_target
    from profilefoundry.validate.consistency import aggregate_consistency
    from profilefoundry.validate.distributional import validate_locale
    from profilefoundry.validate.leakage import (
        email_kanonymity_check,
        self_collision_audit,
        wikidata_leakage_scan,
    )

    loc_list = [s.strip() for s in locales.split(",") if s.strip()]
    # Fail fast on a mistyped locale: otherwise the error only surfaces after
    # the output directory exists and every earlier locale has been generated.
    unknown = [loc for loc in loc_list if loc not in VALID_LOCALES]
    if unknown:
        raise click.BadParameter(
            f"unknown locale(s) {', '.join(unknown)}; expected a subset of {', '.join(VALID_LOCALES)}",
            param_hint="--locales",
        )

    out.mkdir(parents=True, exist_ok=True)
    # A pinned generation date makes the profile content reproducible.
    today = generation_date.date() if generation_date else date.today()
    stats = ExportStats(locales={})
    t0 = time.time()

    all_persons = []
    for loc in loc_list:
        click.echo(f"Generating {loc} (~{n_per_locale} profiles)...", err=True)
        loc_persons = []
        if household_mode:
            for b in iter_households_for_profile_target(loc, n_per_locale, global_seed, today=today):  # type: ignore[arg-type]
                loc_persons.extend(b.persons)
        else:
            for seq in range(1, n_per_locale + 1):
                loc_persons.append(make_person(locale=loc, profile_seq=seq, global_seed=global_seed, today=today))  # type: ignore[arg-type]
        all_persons.extend(loc_persons)
        stats.locales[loc] = len(loc_persons)

    # Statistics derived directly from the in-memory profiles.
    stats.n_profiles = len(all_persons)
    stats.n_events = sum(len(p.events) for p in all_persons)
    stats.n_employment_records = sum(len(p.employment) for p in all_persons)
    stats.n_education_records = sum(len(p.education) for p in all_persons)
    stats.n_households = len({p.household_id for p in all_persons})
    stats.n_employers = len({e.employer_id for p in all_persons for e in p.employment})
    # Covers generation only; the writes below are not included.
    stats.duration_seconds = time.time() - t0
    stats.generation_date = today.isoformat()
    stats.exported_at = normalize_exported_at(exported_at)

    click.echo(f"Writing {stats.n_profiles} profiles → {out}", err=True)
    # File name -> writer, in the order the artifacts are produced. Each writer
    # returns the row count that is recorded for its file.
    writers = (
        ("profiles.jsonl", write_jsonl),
        ("person_objects.parquet", write_person_objects_parquet),
        ("profiles.parquet", write_flat_parquet),
        ("addresses.parquet", write_addresses_parquet),
        ("employment.parquet", write_employment_parquet),
        ("education.parquet", write_education_parquet),
        ("social_handles.parquet", write_social_handles_parquet),
        ("health_allergies.parquet", write_health_allergies_parquet),
        ("events.parquet", write_events_parquet),
        ("households.parquet", write_households_parquet),
        ("employers.parquet", write_employers_parquet),
        ("relationships.parquet", write_relationships_parquet),
    )
    row_counts: dict[str, int] = {}
    for file_name, writer in writers:
        row_counts[file_name] = writer(all_persons, out / file_name)

    # These four statistics come from the rows actually written. Earlier
    # in-memory sums for the first three were always overwritten here, so
    # they are no longer computed.
    stats.n_addresses = row_counts["addresses.parquet"]
    stats.n_social_handles = row_counts["social_handles.parquet"]
    stats.n_health_allergies = row_counts["health_allergies.parquet"]
    stats.n_relationships = row_counts["relationships.parquet"]

    # Per-locale validation snapshot for the dataset card.
    validation: dict = {}
    for loc in loc_list:
        loc_ps = [p for p in all_persons if p.locale == loc]
        if not loc_ps:
            continue
        ks = validate_locale(loc_ps, loc, today=today)  # type: ignore[arg-type]
        cr = aggregate_consistency(loc_ps, today=today)
        validation[loc] = {
            "age_ks_male": ks.age_ks_male,
            "age_ks_female": ks.age_ks_female,
            "education_ks": ks.education_ks,
            "marital_ks": ks.marital_ks,
            "consistency_rate": cr.rate,
        }

    # Leakage summary for the dataset card.
    leakage: dict = {}
    sc = self_collision_audit(all_persons)
    leakage["self_collision"] = {
        "n_profiles": sc.n_profiles,
        "n_collisions": sc.n_collisions,
        "mode": "exact",
    }
    wk = wikidata_leakage_scan(all_persons)
    leakage["wikidata"] = {
        "n_profiles": wk.n_profiles,
        "n_leaks": wk.n_leaks,
        "mode": wk.metadata.get("mode", "demo"),
    }
    # use_hibp=None leaves the live-vs-syntax decision to the checker;
    # --skip-hibp forces it off.
    em = email_kanonymity_check(all_persons, use_hibp=False if skip_hibp else None)
    leakage["email"] = {
        "n_profiles": em.n_profiles,
        "n_leaks": em.n_leaks,
        "mode": em.metadata.get("mode", "syntax_only"),
    }

    write_dataset_card(out / "dataset_card.md", stats, validation, leakage)
    write_manifest(out, stats, row_counts)
    click.echo(f"Done ({stats.duration_seconds:.1f}s). Files written to {out}/", err=True)
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
@main.command("scale-smoke")
@click.option("--locale", type=click.Choice(VALID_LOCALES), default="US", show_default=True)
@click.option("--sizes", default="1000,10000", show_default=True,
              help="Comma-separated profile counts to time.")
@click.option("--seed", "global_seed", type=int, default=4321, show_default=True)
def scale_smoke_cmd(locale: Locale, sizes: str, global_seed: int) -> None:
    """Time the generator at several scale points."""
    import time

    # Parse like the other comma-separated options: ignore empty entries
    # (e.g. a trailing comma) and report a bad token as a usage error
    # instead of an uncaught ValueError traceback.
    size_list = []
    for raw in sizes.split(","):
        token = raw.strip()
        if not token:
            continue
        try:
            size_list.append(int(token))
        except ValueError:
            raise click.BadParameter(
                f"{token!r} is not an integer profile count", param_hint="--sizes"
            ) from None

    for n in size_list:
        # perf_counter is monotonic, so a wall-clock adjustment mid-run
        # cannot distort the measured interval.
        t0 = time.perf_counter()
        for seq in range(1, n + 1):
            make_person(locale=locale, profile_seq=seq, global_seed=global_seed)
        dt = time.perf_counter() - t0
        # Guard against a zero interval for tiny n.
        rate = n / dt if dt > 0 else 0
        click.echo(f" n={n:>8d} {dt:6.2f}s ({rate:.0f} profiles/s)")
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
@main.command("verify")
def verify_cmd() -> None:
    """Run a quick self-check: generate one profile per locale."""
    # Check every locale before exiting so one broken locale does not hide
    # the status of the ones after it. The exit code is still 1 on failure.
    failed = False
    for loc in VALID_LOCALES:
        try:
            p = make_person(locale=loc, profile_seq=1, global_seed=4321)
            click.echo(f"OK {loc:>3} {p.profile_id} {p.identity.given_name} {p.identity.family_name}")
        except Exception as exc:  # noqa: BLE001
            # Deliberately broad: any generation error counts as a failed check.
            click.echo(f"FAIL {loc:>3} {exc}", err=True)
            failed = True
    if failed:
        sys.exit(1)
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
# Allow running this module directly as a script; the click group does the
# argument parsing and dispatch.
if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Reference-data loader.
|
|
2
|
+
|
|
3
|
+
Resolves a (locale, table-name) pair into a parsed Python object. Prefers
|
|
4
|
+
``data/reference/derived/<locale>/<table>.parquet`` (live-ingested) over
|
|
5
|
+
``data/reference/bootstrap/<locale>/<table>.json`` (committed minimum).
|
|
6
|
+
|
|
7
|
+
The loader is intentionally cache-aware: a single Python process never reads
|
|
8
|
+
the same file twice.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import functools
|
|
14
|
+
import json
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from profilefoundry.data.paths import reference_root
|
|
19
|
+
|
|
20
|
+
REF_ROOT = reference_root()
|
|
21
|
+
BOOTSTRAP_ROOT = REF_ROOT / "bootstrap"
|
|
22
|
+
DERIVED_ROOT = REF_ROOT / "derived"
|
|
23
|
+
|
|
24
|
+
LOCALE_CODES = ("US", "UK", "CA", "AU", "NZ", "IE", "IN", "PH")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ReferenceDataMissing(FileNotFoundError):
    """Signal that a requested reference table could not be found on disk.

    Subclasses ``FileNotFoundError`` so callers that already handle missing
    files keep working without knowing about this type.
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def locale_dir(locale: str, *, tier: str) -> Path:
    """Return the directory holding reference tables for *locale* in *tier*.

    Args:
        locale: One of ``LOCALE_CODES`` (upper-case, e.g. ``"US"``).
        tier: ``"bootstrap"`` or ``"derived"``.

    Returns:
        ``<tier root>/<locale lower-cased>``. The directory is not required
        to exist.

    Raises:
        ValueError: If *locale* or *tier* is not recognised.
    """
    if locale not in LOCALE_CODES:
        raise ValueError(f"Unknown locale: {locale!r}; expected one of {LOCALE_CODES}")
    # Resolve the tier explicitly: previously any value other than
    # "bootstrap", including a typo, silently selected the derived root.
    roots = {"bootstrap": BOOTSTRAP_ROOT, "derived": DERIVED_ROOT}
    try:
        root = roots[tier]
    except KeyError:
        raise ValueError(f"Unknown tier: {tier!r}; expected one of {tuple(roots)}") from None
    # On-disk locale directories are lower-case ("us", "uk", ...).
    return root / locale.lower()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@functools.lru_cache(maxsize=128)
def load_table(locale: str, table: str) -> dict[str, Any]:
    """Load a reference table for a locale.

    Resolution order:
    1. ``derived/<locale>/<table>.parquet`` (not implemented yet; will
       wrap pandas.read_parquet when richer ingestion lands).
    2. ``bootstrap/<locale>/<table>.json``.

    Returns the parsed ``data`` payload of the bootstrap envelope (an empty
    dict when the envelope has no ``data`` key). Source metadata can be
    retrieved via :func:`load_source_metadata`.

    The result is memoised per ``(locale, table)``, so every caller receives
    the same dict object and must treat it as read-only.

    Raises:
        ValueError: If *locale* is not a known locale code.
        ReferenceDataMissing: If the bootstrap file does not exist.
    """
    boot_path = locale_dir(locale, tier="bootstrap") / f"{table}.json"
    # Read first and handle absence afterwards (EAFP): no window between an
    # existence check and the read in which the file could disappear.
    try:
        raw = boot_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # Failures are not cached by lru_cache, so a file added later in the
        # same process is still picked up.
        raise ReferenceDataMissing(
            f"No reference data for locale={locale!r}, table={table!r}. "
            f"Looked in {boot_path}. Run scripts/ingest_{locale.lower()}.py "
            f"to populate live data, or add a bootstrap file."
        ) from None
    envelope = json.loads(raw)
    return envelope.get("data", {})
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@functools.lru_cache(maxsize=128)
def load_source_metadata(locale: str, table: str) -> dict[str, Any]:
    """Return the ``source`` section of a bootstrap envelope.

    Reads ``bootstrap/<locale>/<table>.json`` and returns the value under its
    ``source`` key, or an empty dict when the key is absent. The result is
    memoised per ``(locale, table)``; treat the returned dict as read-only.

    Raises:
        ValueError: If *locale* is not a known locale code.
        ReferenceDataMissing: If the bootstrap file does not exist.
    """
    boot_path = locale_dir(locale, tier="bootstrap") / f"{table}.json"
    # Read first and translate a missing file afterwards (EAFP), rather than
    # checking existence and then racing the read.
    try:
        raw = boot_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        raise ReferenceDataMissing(f"No bootstrap file at {boot_path}") from None
    envelope = json.loads(raw)
    return envelope.get("source", {})
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def available_tables(locale: str) -> list[str]:
    """List the bootstrap table names present for *locale*, sorted.

    A table name is the stem of a ``*.json`` file directly inside the
    locale's bootstrap directory. Returns an empty list when that directory
    does not exist.
    """
    directory = locale_dir(locale, tier="bootstrap")
    if directory.is_dir():
        stems = [entry.stem for entry in directory.glob("*.json")]
        stems.sort()
        return stems
    return []
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def all_locales_with_data() -> dict[str, list[str]]:
    """Map every code in ``LOCALE_CODES`` to its available bootstrap tables.

    Locales without any bootstrap files are still present, mapped to an
    empty list.
    """
    tables_by_locale: dict[str, list[str]] = {}
    for code in LOCALE_CODES:
        tables_by_locale[code] = available_tables(code)
    return tables_by_locale
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Reference data manifest hash (PF-053).
|
|
2
|
+
|
|
3
|
+
Computes a deterministic hash of all bootstrap reference files used by the
|
|
4
|
+
generator. Stamped into ``Person.generation.reference_manifest_hash`` so a
|
|
5
|
+
re-run with the same global_seed but a different reference-data version is
|
|
6
|
+
detectable.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import functools
|
|
12
|
+
import hashlib
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from profilefoundry.data.paths import reference_root
|
|
16
|
+
|
|
17
|
+
REF_ROOT = reference_root()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Subtrees that influence generation output. Audit-only artifacts (leakage
|
|
21
|
+
# bloom, ingestion cache) live elsewhere under data/reference/ and are
|
|
22
|
+
# deliberately excluded so the hash stays stable across leakage ingests.
|
|
23
|
+
_HASHED_SUBTREES = ("bootstrap", "names", "geo")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@functools.lru_cache(maxsize=1)
def reference_manifest_hash() -> str:
    """blake2b digest of every committed reference-data file the generator reads.

    Returns a 16-char hex digest derived from the sorted list of
    (relative_path, file_bytes) tuples under ``data/reference/{bootstrap,
    names, geo}/``. Audit-only data (``leakage/``, ``cache/``, ``derived/``)
    is excluded so re-running the leakage ingest does not change the hash.
    Cached for the lifetime of the process.

    If the reference root does not exist, the digest of empty input is
    returned.
    """
    h = hashlib.blake2b(digest_size=8)  # 8 bytes -> 16 hex characters
    if not REF_ROOT.exists():
        return h.hexdigest()

    paths: list[Path] = []
    for sub in _HASHED_SUBTREES:
        root = REF_ROOT / sub
        if not root.exists():
            continue
        paths.extend(p for p in root.rglob("*") if p.is_file())

    # Sort by the relative path's components, not by the Path objects.
    # Native Path ordering is case-insensitive on Windows, which would feed
    # the files in a different order there and change the digest for
    # identical data. Component-wise ordering matches the previous POSIX
    # order, so existing hashes stay valid.
    for p in sorted(paths, key=lambda q: q.relative_to(REF_ROOT).parts):
        rel = p.relative_to(REF_ROOT).as_posix()
        h.update(rel.encode("utf-8"))
        h.update(b"\x00")  # separates the path from the content
        try:
            h.update(p.read_bytes())
        except OSError:
            # Best effort: an unreadable file still contributes a marker.
            h.update(b"<unreadable>")
        h.update(b"\x01")  # terminates this file's record
    return h.hexdigest()
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Runtime paths for ProfileFoundry reference data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
REFERENCE_ROOT_ENV = "PROFILEFOUNDRY_REFERENCE_ROOT"


def reference_root() -> Path:
    """Return the generation reference-data root for source or installed runs.

    Source checkouts keep reference data at ``<repo>/data/reference`` while
    wheels install the same committed files as ``<install-root>/data/reference``.
    ``PROFILEFOUNDRY_REFERENCE_ROOT`` can point audits at an explicit copy.

    Returns:
        The resolved environment override when it is set and non-empty;
        otherwise the first candidate directory that exists; otherwise the
        last candidate, even though it does not exist.
    """
    env_root = os.environ.get(REFERENCE_ROOT_ENV)
    if env_root:
        return Path(env_root).expanduser().resolve()

    ancestors = Path(__file__).resolve().parents
    candidates = [Path(sys.prefix) / "data" / "reference"]  # installed wheel
    # parents[2] covers a ``pip --target`` install and parents[3] a source
    # checkout. Only index ancestors that exist: a module located close to
    # the filesystem root has fewer than four parents, and indexing past the
    # end raised IndexError.
    for depth in (2, 3):
        if depth < len(ancestors):
            candidates.append(ancestors[depth] / "data" / "reference")

    for candidate in candidates:
        if candidate.exists():
            return candidate
    # Nothing exists on disk: return the last candidate so error messages
    # built from this path still point somewhere meaningful.
    return candidates[-1]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|