@mailwoman/corpus 3.0.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/src/adapters/ban/adapter.d.ts.map +1 -1
- package/out/src/adapters/ban/adapter.js +6 -2
- package/out/src/adapters/ban/adapter.js.map +1 -1
- package/out/src/adapters/ban/street-decompose.d.ts +28 -0
- package/out/src/adapters/ban/street-decompose.d.ts.map +1 -0
- package/out/src/adapters/ban/street-decompose.js +78 -0
- package/out/src/adapters/ban/street-decompose.js.map +1 -0
- package/out/src/adapters/synth-po-box/adapter.d.ts +48 -0
- package/out/src/adapters/synth-po-box/adapter.d.ts.map +1 -0
- package/out/src/adapters/synth-po-box/adapter.js +101 -0
- package/out/src/adapters/synth-po-box/adapter.js.map +1 -0
- package/out/src/adapters/tiger/adapter.d.ts.map +1 -1
- package/out/src/adapters/tiger/adapter.js +9 -3
- package/out/src/adapters/tiger/adapter.js.map +1 -1
- package/out/src/adapters/tiger/street-decompose.d.ts +30 -0
- package/out/src/adapters/tiger/street-decompose.d.ts.map +1 -0
- package/out/src/adapters/tiger/street-decompose.js +99 -0
- package/out/src/adapters/tiger/street-decompose.js.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -1
- package/out/src/adapters/usgov-nad/adapter.js +31 -10
- package/out/src/adapters/usgov-nad/adapter.js.map +1 -1
- package/out/src/adapters/wof-admin-jp/adapter.d.ts +58 -0
- package/out/src/adapters/wof-admin-jp/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-admin-jp/adapter.js +129 -0
- package/out/src/adapters/wof-admin-jp/adapter.js.map +1 -0
- package/out/src/index.d.ts +6 -0
- package/out/src/index.d.ts.map +1 -1
- package/out/src/index.js +6 -0
- package/out/src/index.js.map +1 -1
- package/out/src/synthesize-german.d.ts +75 -0
- package/out/src/synthesize-german.d.ts.map +1 -0
- package/out/src/synthesize-german.js +116 -0
- package/out/src/synthesize-german.js.map +1 -0
- package/out/src/synthesize-house-venue.d.ts +57 -0
- package/out/src/synthesize-house-venue.d.ts.map +1 -0
- package/out/src/synthesize-house-venue.js +147 -0
- package/out/src/synthesize-house-venue.js.map +1 -0
- package/out/src/synthesize-intersection.d.ts +48 -0
- package/out/src/synthesize-intersection.d.ts.map +1 -0
- package/out/src/synthesize-intersection.js +138 -0
- package/out/src/synthesize-intersection.js.map +1 -0
- package/out/src/synthesize-no-street.d.ts +70 -0
- package/out/src/synthesize-no-street.d.ts.map +1 -0
- package/out/src/synthesize-no-street.js +279 -0
- package/out/src/synthesize-no-street.js.map +1 -0
- package/out/src/synthesize-po-box.d.ts +75 -0
- package/out/src/synthesize-po-box.d.ts.map +1 -0
- package/out/src/synthesize-po-box.js +186 -0
- package/out/src/synthesize-po-box.js.map +1 -0
- package/out/src/synthesize-street.d.ts +53 -0
- package/out/src/synthesize-street.d.ts.map +1 -0
- package/out/src/synthesize-street.js +212 -0
- package/out/src/synthesize-street.js.map +1 -0
- package/out/src/synthesize.js +1 -1
- package/out/src/synthesize.js.map +1 -1
- package/package.json +3 -2
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* No-street address synthesizer — the counter-distribution that v0.6.1's synth-street shard is
|
|
7
|
+
* missing. Generates BIO-labelable rows where there is NO street, NO house_number, NO
|
|
8
|
+
* street_prefix, NO street_suffix, NO intersection — only some subset of {venue, locality,
|
|
9
|
+
* region, postcode, country}.
|
|
10
|
+
*
|
|
11
|
+
* Rationale: the [2026-05-28 night-2
|
|
12
|
+
* postmortem](../../docs/articles/evals/2026-05-28-night-2-postmortem.md) and the [layer-1
|
|
13
|
+
* eval](../../docs/articles/evals/2026-05-28-layer-1-morphology-fst.md) showed that synth-street
|
|
14
|
+
* pushed the model into a high-confidence "decompose mode" that leaked into `dependent_locality`.
|
|
15
|
+
* Per DeepSeek's turn-2 recipe, the model needs explicit counter-examples: addresses where the
|
|
16
|
+
* model should NOT emit street labels. This synthesizer is that source.
|
|
17
|
+
*
|
|
18
|
+
* Six row templates, each producing a {raw, components} pair with no street-side tags:
|
|
19
|
+
*
|
|
20
|
+
* 1. **Plain venue + locality + region + postcode** `"Bob's Pizza, Boston, MA 02101"`
|
|
21
|
+
* 2. **Adversarial venue (containing street-typing words)** `"Wall Street Industries, NY 10005"`,
|
|
22
|
+
* `"Park Avenue Dental, Seattle, WA"`, `"Highway 61 Diner, Memphis TN"`. These are the rows
|
|
23
|
+
* that v0.6.1's decompose-mode would mis-tag as street_prefix/suffix; explicit negative
|
|
24
|
+
* training kills that signal.
|
|
25
|
+
* 3. **Locality + region + postcode (minimal)** — `"Boston, MA 02101"`
|
|
26
|
+
* 4. **Locality + region** — `"Boston, MA"`
|
|
27
|
+
* 5. **Postcode-only** — `"02101"`
|
|
28
|
+
* 6. **Country-only** — `"United States"`, `"France"` (rare in real data, but the model has seen these
|
|
29
|
+
* and should not hallucinate streets on them).
|
|
30
|
+
*
|
|
31
|
+
* Output is a `CanonicalRow` with no street-side components. Alignment will produce BIO labels
|
|
32
|
+
* where every token is one of {`B-venue`, `I-venue`, `B-locality`, `I-locality`, `B-region`,
|
|
33
|
+
* `B-postcode`, `B-country`, `I-country`, `O`} — explicitly never any street tag. That IS the
|
|
34
|
+
* counter-example signal the model is missing.
|
|
35
|
+
*
|
|
36
|
+
* This complements (does not replace) the existing US-base-tuple source used by
|
|
37
|
+
* `synthesize-po-box.ts`; the same `NoStreetBaseTuple` shape is consumed.
|
|
38
|
+
*/
|
|
39
|
+
// -------------------------------------------------------------------------------------------------
|
|
40
|
+
// Venue name pools
|
|
41
|
+
// -------------------------------------------------------------------------------------------------
|
|
42
|
+
/**
 * Ordinary business names with no street-typing vocabulary in them. They serve as the
 * easy-mode positive class for venue detection.
 */
const PLAIN_VENUES = [
    // Food and drink
    "Bob's Pizza", "Acme Corporation", "Joe's Diner", "Sunrise Bakery", "Maple Leaf Cafe",
    // Retail, services and offices
    "Riverside Garden Center", "Tech Solutions Inc", "Pacific Industries", "Atlantic Holdings",
    "Stellar Consulting", "Greenfield Partners", "Mountain View Studio",
    // Neighborhood businesses
    "The Daily Grind", "Sunset Bistro", "Harvest Moon Florist", "Iron Forge Brewing",
    "Crescent City Bookstore", "Lighthouse Insurance Group", "Pinecrest Veterinary",
    "Westwood Realty",
];
|
|
68
|
+
/**
 * Venue names that deliberately contain street-typing tokens (Street, Avenue, Boulevard, Drive,
 * Road, Lane, Park, Plaza, ...) while still being venues rather than streets. Rows built from
 * this pool teach the model that such tokens do not automatically imply a street label.
 *
 * **Entries must not begin with `<digits><ordinal>`** (e.g. "5th Avenue Theatre",
 * "7th Street Bistro"). Per the v0.6.2 2026-05-29 step-20K eval, rows starting that way confused
 * house_number recognition: tokens like "5th" (`B-house_number` in real addresses) were labeled
 * `B-venue` because adversarial venues put them in venue position. v0.6.3 leaves those patterns
 * out; the `synth-house-venue` shard separately teaches that house_number and venue coexist.
 */
const ADVERSARIAL_VENUES = [
    "Wall Street Industries", "Highway 61 Diner", "Lane Bryant", "Park Avenue Dental",
    "Broadway Theatre Company", "Madison Square Garden", "Main Street Bakery",
    "Sunset Boulevard Studios", "Ocean Drive Cafe", "Mountain Road Outfitters",
    "Hollywood Boulevard Salon", "East Bay Auto", "West End Pharmacy", "North Shore Insurance",
    "South Park Children's Center", "River Road Animal Hospital", "Hill Street Blues Bar",
    "Court House Square Realty", "Plaza Hotel", "Lincoln Park Zoo", "Central Park Conservancy",
    "Lakeshore Boulevard Apartments", "Memorial Drive Medical Center", "Wabash Avenue Press",
    "State Street Bank", "Market Street Grill", "Beach Boulevard Diner", "Garden Lane Florist",
];
|
|
110
|
+
// Module-load guard (a runtime check, not a compile-time one): no ADVERSARIAL_VENUES entry may
// start with the digit+ordinal pattern that confuses house_number recognition. Adding a
// "5th Avenue Theatre"-style entry makes importing this module throw.
for (const venueName of ADVERSARIAL_VENUES) {
    const startsWithOrdinal = /^\d+(st|nd|rd|th)\b/i.test(venueName);
    if (!startsWithOrdinal) {
        continue;
    }
    throw new Error(`ADVERSARIAL_VENUES entry "${venueName}" starts with digit+ordinal; this pattern confuses ` +
        `house_number recognition (see v0.6.3 eval doc). Use a non-numeric venue name.`);
}
|
|
119
|
+
// Surface forms used by the "country-only" template, keyed by upper-case ISO alpha-2 code.
// Built entry by entry so the Map keeps the same insertion order as the original literal.
const COUNTRY_NAMES = new Map();
COUNTRY_NAMES.set("US", ["United States", "USA", "U.S.A.", "United States of America"]);
COUNTRY_NAMES.set("FR", ["France"]);
COUNTRY_NAMES.set("DE", ["Germany", "Deutschland"]);
COUNTRY_NAMES.set("GB", ["United Kingdom", "UK", "Great Britain"]);
COUNTRY_NAMES.set("CA", ["Canada"]);
COUNTRY_NAMES.set("AU", ["Australia"]);
|
|
127
|
+
// -------------------------------------------------------------------------------------------------
|
|
128
|
+
// Synthesis
|
|
129
|
+
// -------------------------------------------------------------------------------------------------
|
|
130
|
+
/**
 * Choose one element of `arr` using the supplied random source.
 *
 * `random` is called exactly once. `Math.random` returns values in [0, 1), but a caller-supplied
 * source (tests, seeded generators) may return exactly 1; the index is therefore clamped to the
 * last element instead of reading one past the end and returning `undefined`.
 *
 * @param arr Array to choose from.
 * @param random Function returning a number, nominally in [0, 1).
 * @returns The chosen element, or `undefined` when `arr` is empty.
 */
function pick(arr, random) {
    const lastIndex = arr.length - 1;
    const drawn = Math.floor(random() * arr.length);
    // For an empty array lastIndex is -1, so the lookup below still yields undefined.
    return arr[Math.min(drawn, lastIndex)];
}
|
|
133
|
+
// Upper-case ISO alpha-2 country code → locale tag attached to synthesized no-street rows.
const NO_STREET_LOCALE_BY_COUNTRY = new Map([
    ["US", "en-US"],
    ["CA", "en-CA"],
    ["GB", "en-GB"],
    ["AU", "en-AU"],
    ["FR", "fr-FR"],
    ["DE", "de-DE"],
]);
/**
 * Map a country code to the locale tag recorded on a synthesized row.
 *
 * The code is trimmed and upper-cased before lookup, so "fr" and " FR " resolve like "FR". The
 * PO-box synthesizer's `countryToLocale` normalizes its input the same way. Anything
 * unrecognized, including non-string input, falls back to "en-US".
 *
 * @param country Country code, expected to be ISO alpha-2.
 * @returns A locale tag such as "en-US" or "de-DE".
 */
function countryToLocale(country) {
    const code = typeof country === "string" ? country.trim().toUpperCase() : "";
    return NO_STREET_LOCALE_BY_COUNTRY.get(code) ?? "en-US";
}
|
|
151
|
+
/**
 * Generate one no-street counter-example row for a base (locality, region, postcode, country)
 * tuple. The venue templates are the load-bearing counter-distribution against synth-street's
 * decompose-mode pressure; the remaining templates are minimal address fragments.
 *
 * No returned row carries street-side components (street, house_number, unit, ...).
 *
 * @param base Tuple providing `locality`, `region`, `postcode` and `country`. `country` selects
 *   the locale tag and, for the "country-only" template, the pool of country display names.
 * @param opts Optional settings: `random` (random source, default `Math.random`) and
 *   `forceTemplate` (use this template name instead of drawing one by weight).
 * @returns `{ raw, components, locale, template }`, where `raw` is the address text and
 *   `components` holds exactly the parts that appear in it.
 * @throws {Error} When the template name is not one of the six known templates. Previously this
 *   fell out of the switch and returned `undefined`.
 */
export function synthesizeNoStreetRow(base, opts = {}) {
    const random = opts.random ?? Math.random;
    const locale = countryToLocale(base.country);
    const template = opts.forceTemplate ?? pickTemplate(random);
    switch (template) {
        case "venue-plain":
        case "venue-adversarial": {
            // The two venue templates differ only in which name pool they draw from.
            // NOTE(review): the original comment here said an `adversarialVenueRatio` option
            // biases the template picker toward the adversarial pool, but nothing visible in
            // this file reads such an option; pickTemplate uses fixed weights. Confirm against
            // the options type.
            const pool = template === "venue-plain" ? PLAIN_VENUES : ADVERSARIAL_VENUES;
            const venue = pick(pool, random);
            return {
                raw: `${venue}, ${base.locality}, ${base.region} ${base.postcode}`,
                components: {
                    venue,
                    locality: base.locality,
                    region: base.region,
                    postcode: base.postcode,
                },
                locale,
                template,
            };
        }
        case "locality-region-postcode":
            return {
                raw: `${base.locality}, ${base.region} ${base.postcode}`,
                components: {
                    locality: base.locality,
                    region: base.region,
                    postcode: base.postcode,
                },
                locale,
                template,
            };
        case "locality-region":
            return {
                raw: `${base.locality}, ${base.region}`,
                components: {
                    locality: base.locality,
                    region: base.region,
                },
                locale,
                template,
            };
        case "postcode-only":
            return {
                raw: base.postcode,
                components: { postcode: base.postcode },
                locale,
                template,
            };
        case "country-only": {
            // Codes without a display-name pool are emitted verbatim.
            const names = COUNTRY_NAMES.get(base.country) ?? [base.country];
            const country = pick(names, random);
            return {
                raw: country,
                components: { country },
                locale,
                template,
            };
        }
        default:
            // Only reachable through an invalid `forceTemplate`; fail loudly instead of
            // handing `undefined` to callers that expect a row.
            throw new Error(`synthesizeNoStreetRow: unknown template "${String(template)}"`);
    }
}
|
|
240
|
+
/**
 * Draw a template name by weight. The venue-* templates dominate because they are the
 * counter-example shape that matters; the minimal templates are long-tail noise.
 *
 * Calls `random` once and walks a ladder of cumulative upper bounds:
 * 35% venue-adversarial, 25% venue-plain, 20% locality-region-postcode, 12% locality-region,
 * 6% postcode-only, and the remaining 2% country-only.
 */
function pickTemplate(random) {
    const roll = random();
    const ladder = [
        [0.35, "venue-adversarial"], // the load-bearing slice
        [0.6, "venue-plain"],
        [0.8, "locality-region-postcode"],
        [0.92, "locality-region"],
        [0.98, "postcode-only"],
    ];
    for (const [upperBound, templateName] of ladder) {
        if (roll < upperBound) {
            return templateName;
        }
    }
    return "country-only";
}
|
|
258
|
+
/**
 * Component keys that count as "street-side". A no-street row must carry none of them; tests
 * and downstream consumers can use this list together with `hasAnyStreetSideTag` to verify that
 * contract at runtime.
 */
export const STREET_SIDE_TAGS = [
    "street", "street_prefix", "street_prefix_particle", "street_suffix",
    "house_number", "intersection_a", "intersection_b", "unit",
];
/**
 * Report whether `components` has a truthy value under any street-side key. Keys that are
 * missing or hold a falsy value (such as an empty string) do not count.
 */
export function hasAnyStreetSideTag(components) {
    return STREET_SIDE_TAGS.some((tag) => Boolean(components[tag]));
}
|
|
279
|
+
//# sourceMappingURL=synthesize-no-street.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synthesize-no-street.js","sourceRoot":"","sources":["../../src/synthesize-no-street.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAoCH,oGAAoG;AACpG,mBAAmB;AACnB,oGAAoG;AAEpG;;;GAGG;AACH,MAAM,YAAY,GAA0B;IAC3C,aAAa;IACb,kBAAkB;IAClB,aAAa;IACb,gBAAgB;IAChB,iBAAiB;IACjB,yBAAyB;IACzB,oBAAoB;IACpB,oBAAoB;IACpB,mBAAmB;IACnB,oBAAoB;IACpB,qBAAqB;IACrB,sBAAsB;IACtB,iBAAiB;IACjB,eAAe;IACf,sBAAsB;IACtB,oBAAoB;IACpB,yBAAyB;IACzB,4BAA4B;IAC5B,sBAAsB;IACtB,iBAAiB;CACjB,CAAA;AAED;;;;;;;;;;;GAWG;AACH,MAAM,kBAAkB,GAA0B;IACjD,wBAAwB;IACxB,kBAAkB;IAClB,aAAa;IACb,oBAAoB;IACpB,0BAA0B;IAC1B,uBAAuB;IACvB,oBAAoB;IACpB,0BAA0B;IAC1B,kBAAkB;IAClB,0BAA0B;IAC1B,2BAA2B;IAC3B,eAAe;IACf,mBAAmB;IACnB,uBAAuB;IACvB,8BAA8B;IAC9B,4BAA4B;IAC5B,uBAAuB;IACvB,2BAA2B;IAC3B,aAAa;IACb,kBAAkB;IAClB,0BAA0B;IAC1B,gCAAgC;IAChC,+BAA+B;IAC/B,qBAAqB;IACrB,mBAAmB;IACnB,qBAAqB;IACrB,uBAAuB;IACvB,qBAAqB;CACrB,CAAA;AAED,qFAAqF;AACrF,0FAA0F;AAC1F,6DAA6D;AAC7D,KAAK,MAAM,CAAC,IAAI,kBAAkB,EAAE,CAAC;IACpC,IAAI,sBAAsB,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;QACpC,MAAM,IAAI,KAAK,CACd,6BAA6B,CAAC,qDAAqD;YAClF,+EAA+E,CAChF,CAAA;IACF,CAAC;AACF,CAAC;AAED,MAAM,aAAa,GAAG,IAAI,GAAG,CAAgC;IAC5D,CAAC,IAAI,EAAE,CAAC,eAAe,EAAE,KAAK,EAAE,QAAQ,EAAE,0BAA0B,CAAC,CAAC;IACtE,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,SAAS,EAAE,aAAa,CAAC,CAAC;IAClC,CAAC,IAAI,EAAE,CAAC,gBAAgB,EAAE,IAAI,EAAE,eAAe,CAAC,CAAC;IACjD,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,CAAC;IAClB,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,CAAC;CACrB,CAAC,CAAA;AAEF,oGAAoG;AACpG,YAAY;AACZ,oGAAoG;AAEpG,SAAS,IAAI,CAAI,GAAqB,EAAE,MAAoB;IAC3D,OAAO,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,MAAM,CAAC,CAAE,CAAA;AAC/C,CAAC;AAED,SAAS,eAAe,CAAC,OAAe;IACvC,QAAQ,OAAO,EAAE,CAAC;QACjB,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf;YACC,OAAO,OAAO,CAAA;IAChB,CAAC;AACF,CAAC;AAED;;;;GAIG;
AACH,MAAM,UAAU,qBAAqB,CACpC,IAAuB,EACvB,OAA8B,EAAE;IAEhC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAA;IACzC,MAAM,MAAM,GAAG,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAE5C,MAAM,QAAQ,GAAqB,IAAI,CAAC,aAAa,IAAI,YAAY,CAAC,MAAM,CAAC,CAAA;IAE7E,QAAQ,QAAQ,EAAE,CAAC;QAClB,KAAK,aAAa,CAAC,CAAC,CAAC;YACpB,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,EAAE,MAAM,CAAC,CAAA;YACxC,MAAM,GAAG,GAAG,GAAG,KAAK,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAA;YACzE,OAAO;gBACN,GAAG;gBACH,UAAU,EAAE;oBACX,KAAK;oBACL,QAAQ,EAAE,IAAI,CAAC,QAAQ;oBACvB,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;iBACvB;gBACD,MAAM;gBACN,QAAQ;aACR,CAAA;QACF,CAAC;QACD,KAAK,mBAAmB,CAAC,CAAC,CAAC;YAC1B,kFAAkF;YAClF,kFAAkF;YAClF,oFAAoF;YACpF,4DAA4D;YAC5D,MAAM,KAAK,GAAG,IAAI,CAAC,kBAAkB,EAAE,MAAM,CAAC,CAAA;YAC9C,MAAM,GAAG,GAAG,GAAG,KAAK,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAA;YACzE,OAAO;gBACN,GAAG;gBACH,UAAU,EAAE;oBACX,KAAK;oBACL,QAAQ,EAAE,IAAI,CAAC,QAAQ;oBACvB,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;iBACvB;gBACD,MAAM;gBACN,QAAQ;aACR,CAAA;QACF,CAAC;QACD,KAAK,0BAA0B,CAAC,CAAC,CAAC;YACjC,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAA;YAC/D,OAAO;gBACN,GAAG;gBACH,UAAU,EAAE;oBACX,QAAQ,EAAE,IAAI,CAAC,QAAQ;oBACvB,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;iBACvB;gBACD,MAAM;gBACN,QAAQ;aACR,CAAA;QACF,CAAC;QACD,KAAK,iBAAiB,CAAC,CAAC,CAAC;YACxB,MAAM,GAAG,GAAG,GAAG,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,EAAE,CAAA;YAC9C,OAAO;gBACN,GAAG;gBACH,UAAU,EAAE;oBACX,QAAQ,EAAE,IAAI,CAAC,QAAQ;oBACvB,MAAM,EAAE,IAAI,CAAC,MAAM;iBACnB;gBACD,MAAM;gBACN,QAAQ;aACR,CAAA;QACF,CAAC;QACD,KAAK,eAAe,CAAC,CAAC,CAAC;YACtB,OAAO;gBACN,GAAG,EAAE,IAAI,CAAC,QAAQ;gBAClB,UAAU,EAAE,EAAE,QAAQ,EAAE,IAAI,CAAC,QAAQ,EAAE;gBACvC,MAAM;gBACN,QAAQ;aACR,CAAA;QACF,CAAC;QACD,KAAK,cAAc,CAAC,CAAC,CAAC;YACrB,MAAM,KAAK,GAAG,aAAa,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YAC/D,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,EAAE,MAAM,CAAC,CAAA;YACnC,OAAO;gBAC
N,GAAG,EAAE,OAAO;gBACZ,UAAU,EAAE,EAAE,OAAO,EAAE;gBACvB,MAAM;gBACN,QAAQ;aACR,CAAA;QACF,CAAC;IACF,CAAC;AACF,CAAC;AAED;;;GAGG;AACH,SAAS,YAAY,CAAC,MAAoB;IACzC,MAAM,CAAC,GAAG,MAAM,EAAE,CAAA;IAClB,IAAI,CAAC,GAAG,IAAI;QAAE,OAAO,mBAAmB,CAAA,CAAC,+BAA+B;IACxE,IAAI,CAAC,GAAG,GAAG;QAAE,OAAO,aAAa,CAAA,CAAC,MAAM;IACxC,IAAI,CAAC,GAAG,GAAG;QAAE,OAAO,0BAA0B,CAAA,CAAC,MAAM;IACrD,IAAI,CAAC,GAAG,IAAI;QAAE,OAAO,iBAAiB,CAAA,CAAC,MAAM;IAC7C,IAAI,CAAC,GAAG,IAAI;QAAE,OAAO,eAAe,CAAA,CAAC,KAAK;IAC1C,OAAO,cAAc,CAAA,CAAC,KAAK;AAC5B,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAG;IAC/B,QAAQ;IACR,eAAe;IACf,wBAAwB;IACxB,eAAe;IACf,cAAc;IACd,gBAAgB;IAChB,gBAAgB;IAChB,MAAM;CACG,CAAA;AAEV,MAAM,UAAU,mBAAmB,CAAC,UAAsC;IACzE,KAAK,MAAM,CAAC,IAAI,gBAAgB,EAAE,CAAC;QAClC,IAAI,UAAU,CAAC,CAAC,CAAC;YAAE,OAAO,IAAI,CAAA;IAC/B,CAAC;IACD,OAAO,KAAK,CAAA;AACb,CAAC"}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* PO box / PMB / Apartado / Boîte Postale synthesizer.
|
|
7
|
+
*
|
|
8
|
+
* Generates BIO-labeled corpus rows where the delivery line is a PO box (mutually exclusive with
|
|
9
|
+
* street + house_number per USPS Pub 28 / DMM 508). Locale-aware: emits idiomatic forms for
|
|
10
|
+
* en-US, en-CA, en-GB, en-AU, fr-FR, fr-CA, es-ES, es-MX, es-AR.
|
|
11
|
+
*
|
|
12
|
+
* Per-DeepSeek design:
|
|
13
|
+
*
|
|
14
|
+
* - PMB ("Private Mailbox" — at CMRAs like UPS Store) shares the `po_box` tag with USPS PO Box.
|
|
15
|
+
* Disambiguation is a downstream heuristic (presence of a street line).
|
|
16
|
+
* - Whole-phrase span ("PO Box 123") not number-only ("123"). Matches existing golden eval.
|
|
17
|
+
* - 10% of outputs receive number-format noise (commas, dashes, embedded spaces) to harden against
|
|
18
|
+
* real-world OCR/transcription input.
|
|
19
|
+
* - PO boxes drop street/house_number/unit/street_prefix/street_suffix from input components.
|
|
20
|
+
*
|
|
21
|
+
* References:
|
|
22
|
+
*
|
|
23
|
+
* - USPS Pub 28 §28C2.040 — Private Mailbox formatting
|
|
24
|
+
* - USPS DMM 508 §4.1.4 / §4.5.4 — PO Box and street-addressed PO Box
|
|
25
|
+
*/
|
|
26
|
+
import type { CanonicalRow } from "./types.js";
|
|
27
|
+
/**
 * Location tuple a PO box row is built around. `synthesizePoBoxRow` interpolates `locality`,
 * `region` and `postcode` into the raw text as "<locality>, <region> <postcode>" and copies all
 * four fields into the row's components.
 */
export interface PoBoxBaseTuple {
|
|
28
|
+
/** Locality text, emitted verbatim before the comma. */
locality: string;
|
|
29
|
+
/** Region text, emitted verbatim after the comma. */
region: string;
|
|
30
|
+
/** Postcode text, emitted verbatim after the region. */
postcode: string;
|
|
31
|
+
/** Country code or display name; resolved to a locale through `countryToLocale`. */
country: string;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Inject number-format noise into a box number string. Returns the noisy variant or the original
|
|
35
|
+
* (10% probability of noise per the design).
*
* Noise is applied when the first `random()` draw is at most 0.1. A second draw then selects one
* of three rewrites: a thousands comma ("1,234"), a dash before the last two characters
* ("12-34"), or a space between every character ("1 2 3 4"). The comma and dash rewrites return
* inputs shorter than 4 and 3 characters, respectively, unchanged.
*
* @param num Box number as a string.
* @param random Random source; called once when no noise is applied, otherwise twice.
* @returns `num` itself or a noisy rendering of it.
|
|
36
|
+
*/
|
|
37
|
+
export declare function maybeNoisifyBoxNumber(num: string, random: () => number): string;
|
|
38
|
+
/**
|
|
39
|
+
* Compose a PO box phrase like "PO Box 123" or "PMB 200".
|
|
40
|
+
*
|
|
41
|
+
* Returns a single string: `leader`, one space, then `number`. The synthesizer stores that whole
|
|
42
|
+
* phrase as the `po_box` component, so the labeled span covers the leader and the number.
*
* @param leader Leader text such as "PO Box", "PMB" or "BP".
* @param number Box number, possibly already noisified.
* @returns The combined phrase.
|
|
43
|
+
*/
|
|
44
|
+
export declare function composePoBoxPhrase(leader: string, number: string): string;
|
|
45
|
+
/** One synthesized PO box (or PMB) training row as returned by `synthesizePoBoxRow`. */
export interface SynthesizedPoBoxRow {
|
|
46
|
+
/** Full address text, e.g. "<po box phrase>, <locality>, <region> <postcode>". */
raw: string;
|
|
47
|
+
/** Labeled parts; always includes `po_box`, `locality`, `region`, `postcode` and `country`. */
components: CanonicalRow["components"];
|
|
48
|
+
/** Locale resolved from the base tuple's country via `countryToLocale`. */
locale: string;
|
|
49
|
+
/** "po-box" when the box replaces the street line; "pmb-with-street" when a PMB follows a street line. */
template: "po-box" | "pmb-with-street";
|
|
50
|
+
}
|
|
51
|
+
/** Optional knobs for `synthesizePoBoxRow`. */
export interface PoBoxSynthesisOpts {
|
|
52
|
+
/** Random function — pass deterministic seed for tests. Default Math.random. */
|
|
53
|
+
random?: () => number;
|
|
54
|
+
/**
 * Number generator. The default is bucket-weighted, not uniform: 30% in 1..99, 40% in 100..999,
 * 25% in 1000..9999 and 5% in 10000..99999.
 */
|
|
55
|
+
pickNumber?: (random: () => number) => string;
|
|
56
|
+
/** PMB probability when locale supports it (and a street is provided in the base tuple). Default 0. */
|
|
57
|
+
pmbRatio?: number;
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Generate one PO box row for a base (locality, region, postcode, country) tuple. Picks a
|
|
61
|
+
* locale-appropriate leader and number. Optionally generates a PMB variant when the base tuple
|
|
62
|
+
* includes a street.
*
* The PMB variant keeps the street line (with `houseNumber` in front when given) and adds the
* PMB phrase after it. It is only produced when `base.street` is set, the locale has PMB
* leaders, and a random draw falls below `opts.pmbRatio`.
*
* @param base Location tuple, optionally with `street` and `houseNumber` for the PMB variant.
* @param opts Random source, number generator and PMB ratio overrides.
* @returns The synthesized row, or `null` when no leader template exists for the resolved locale.
|
|
63
|
+
*/
|
|
64
|
+
export declare function synthesizePoBoxRow(base: PoBoxBaseTuple & {
|
|
65
|
+
street?: string;
|
|
66
|
+
houseNumber?: string;
|
|
67
|
+
}, opts?: PoBoxSynthesisOpts): SynthesizedPoBoxRow | null;
|
|
68
|
+
/**
|
|
69
|
+
* Map a country code (ISO-3166-1 alpha-2 or alpha-3, or country display name) to the locale code we
|
|
70
|
+
* have a PO box template for.
*
* The input is trimmed and upper-cased before matching. Values that match no known country fall
* back to "en-US".
*
* @param country Country code or display name.
* @returns A locale code such as "en-US", "fr-FR" or "es-MX".
|
|
71
|
+
*/
|
|
72
|
+
export declare function countryToLocale(country: string): string;
|
|
73
|
+
/**
 * All locales we synthesize for. Exposed for tests and for source-weight tuning.
 *
 * @returns The locale code of every entry in the template table, in table order.
 */
|
|
74
|
+
export declare function supportedLocales(): ReadonlyArray<string>;
|
|
75
|
+
//# sourceMappingURL=synthesize-po-box.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synthesize-po-box.d.ts","sourceRoot":"","sources":["../../src/synthesize-po-box.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAE9C,MAAM,WAAW,cAAc;IAC9B,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,MAAM,CAAA;IAChB,OAAO,EAAE,MAAM,CAAA;CACf;AAoDD;;;GAGG;AACH,wBAAgB,qBAAqB,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,MAAM,GAAG,MAAM,CAY/E;AAED;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,MAAM,CAEzE;AAED,MAAM,WAAW,mBAAmB;IACnC,GAAG,EAAE,MAAM,CAAA;IACX,UAAU,EAAE,YAAY,CAAC,YAAY,CAAC,CAAA;IACtC,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,QAAQ,GAAG,iBAAiB,CAAA;CACtC;AAED,MAAM,WAAW,kBAAkB;IAClC,gFAAgF;IAChF,MAAM,CAAC,EAAE,MAAM,MAAM,CAAA;IACrB,uDAAuD;IACvD,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,MAAM,KAAK,MAAM,CAAA;IAC7C,4FAA4F;IAC5F,QAAQ,CAAC,EAAE,MAAM,CAAA;CACjB;AAWD;;;;GAIG;AACH,wBAAgB,kBAAkB,CACjC,IAAI,EAAE,cAAc,GAAG;IAAE,MAAM,CAAC,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,EAChE,IAAI,GAAE,kBAAuB,GAC3B,mBAAmB,GAAG,IAAI,CAkD5B;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAWvD;AAED,qFAAqF;AACrF,wBAAgB,gBAAgB,IAAI,aAAa,CAAC,MAAM,CAAC,CAExD"}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* PO box / PMB / Apartado / Boîte Postale synthesizer.
|
|
7
|
+
*
|
|
8
|
+
* Generates BIO-labeled corpus rows where the delivery line is a PO box (mutually exclusive with
|
|
9
|
+
* street + house_number per USPS Pub 28 / DMM 508). Locale-aware: emits idiomatic forms for
|
|
10
|
+
* en-US, en-CA, en-GB, en-AU, fr-FR, fr-CA, es-ES, es-MX, es-AR.
|
|
11
|
+
*
|
|
12
|
+
* Per-DeepSeek design:
|
|
13
|
+
*
|
|
14
|
+
* - PMB ("Private Mailbox" — at CMRAs like UPS Store) shares the `po_box` tag with USPS PO Box.
|
|
15
|
+
* Disambiguation is a downstream heuristic (presence of a street line).
|
|
16
|
+
* - Whole-phrase span ("PO Box 123") not number-only ("123"). Matches existing golden eval.
|
|
17
|
+
* - 10% of outputs receive number-format noise (commas, dashes, embedded spaces) to harden against
|
|
18
|
+
* real-world OCR/transcription input.
|
|
19
|
+
* - PO boxes drop street/house_number/unit/street_prefix/street_suffix from input components.
|
|
20
|
+
*
|
|
21
|
+
* References:
|
|
22
|
+
*
|
|
23
|
+
* - USPS Pub 28 §28C2.040 — Private Mailbox formatting
|
|
24
|
+
* - USPS DMM 508 §4.1.4 / §4.5.4 — PO Box and street-addressed PO Box
|
|
25
|
+
*/
|
|
26
|
+
// Per-locale PO box vocabulary. `leaders` are the phrases placed before the box number. `pmb`,
// present only for locales that have one, lists the private-mailbox leaders.
const LOCALE_TEMPLATES = [
    { locale: "en-US", leaders: ["PO Box", "P.O. Box", "P.O.Box", "PO BOX", "POB", "Post Office Box", "Box"], pmb: ["PMB", "#"] },
    { locale: "en-CA", leaders: ["PO Box", "P.O. Box", "POB", "Post Office Box"], pmb: ["PMB", "#"] },
    { locale: "en-GB", leaders: ["PO Box", "P.O. Box", "Post Office Box"] },
    { locale: "en-AU", leaders: ["PO Box", "P.O. Box", "Post Office Box", "GPO Box", "Locked Bag"] },
    { locale: "fr-FR", leaders: ["BP", "B.P.", "Boîte Postale", "BP."] },
    { locale: "fr-CA", leaders: ["CP", "C.P.", "Case Postale", "BP", "B.P."] },
    { locale: "es-ES", leaders: ["Apdo.", "Apdo", "Apartado", "Apartado de Correos"] },
    { locale: "es-MX", leaders: ["Apdo.", "Apartado", "Apartado Postal", "AP"] },
    { locale: "es-AR", leaders: ["Casilla", "Casilla de Correo", "CC"] },
];
// Locale code → its template entry (the same object that sits in LOCALE_TEMPLATES).
const LEADERS_BY_LOCALE = new Map();
for (const entry of LOCALE_TEMPLATES) {
    LEADERS_BY_LOCALE.set(entry.locale, entry);
}
|
|
67
|
+
/**
 * Occasionally rewrite a box number with the kind of formatting noise seen in real input.
 *
 * The first `random()` draw decides whether to add noise (only when the draw is at most 0.1,
 * i.e. roughly 10% of calls); otherwise `num` is returned as is. A second draw picks one of
 * three rewrites.
 */
export function maybeNoisifyBoxNumber(num, random) {
    const keepClean = random() > 0.1;
    if (keepClean) {
        return num;
    }
    // Thousand-separator comma (real input: "Box 1,234"); needs at least 4 characters.
    const withThousandsComma = (digits) => {
        if (digits.length < 4) {
            return digits;
        }
        return `${digits.slice(0, -3)},${digits.slice(-3)}`;
    };
    // Embedded dash (real input: "PMB-200"); needs at least 3 characters.
    const withDash = (digits) => {
        if (digits.length < 3) {
            return digits;
        }
        return `${digits.slice(0, -2)}-${digits.slice(-2)}`;
    };
    // Embedded spaces (real input from OCR: "1 2 3 4").
    const withSpaces = (digits) => digits.split("").join(" ");
    const rewrites = [withThousandsComma, withDash, withSpaces];
    const chosen = Math.floor(random() * rewrites.length);
    return rewrites[chosen](num);
}
|
|
85
|
+
/**
 * Build a PO box phrase such as "PO Box 123" or "PMB 200".
 *
 * The result is one string: the leader, a single space, then the number. The synthesizer
 * stores this whole phrase as the `po_box` component.
 */
export function composePoBoxPhrase(leader, number) {
    const separator = " ";
    return "".concat(leader, separator, number);
}
|
|
94
|
+
/**
 * Default box-number generator. The first draw selects a magnitude bucket and the second draw
 * picks a value uniformly inside it: 30% → 1..99, 40% → 100..999, 25% → 1000..9999,
 * 5% → 10000..99999. The number is returned as a decimal string.
 */
function defaultPickNumber(random) {
    const bucket = random();
    let lowest;
    let span;
    if (bucket < 0.3) {
        lowest = 1;
        span = 99;
    }
    else if (bucket < 0.7) {
        lowest = 100;
        span = 900;
    }
    else if (bucket < 0.95) {
        lowest = 1000;
        span = 9000;
    }
    else {
        lowest = 10000;
        span = 90000;
    }
    return String(lowest + Math.floor(random() * span));
}
|
|
105
|
+
/**
 * Build one PO box row from a base (locality, region, postcode, country) tuple.
 *
 * The country selects a locale, and the locale supplies the leader phrases. By default the box
 * phrase replaces the street line entirely. When the base tuple carries a `street`, the locale
 * defines PMB leaders, and a random draw falls below `opts.pmbRatio` (default 0), the row keeps
 * the street line and adds a PMB phrase after it.
 *
 * Returns `null` when no template exists for the resolved locale.
 */
export function synthesizePoBoxRow(base, opts = {}) {
    const rng = opts.random ?? Math.random;
    const drawNumber = opts.pickNumber ?? defaultPickNumber;
    const pmbShare = opts.pmbRatio ?? 0.0;
    const locale = countryToLocale(base.country);
    const localeEntry = LEADERS_BY_LOCALE.get(locale);
    if (!localeEntry) {
        return null;
    }
    // The draw order (number, noise, leader, then PMB) is part of the seeded output; the leader
    // is drawn even when the PMB variant ends up being used.
    const boxNumber = maybeNoisifyBoxNumber(drawNumber(rng), rng);
    const leaderIndex = Math.floor(rng() * localeEntry.leaders.length);
    const standardPhrase = composePoBoxPhrase(localeEntry.leaders[leaderIndex], boxNumber);
    const cityLine = `${base.locality}, ${base.region} ${base.postcode}`;
    const placeComponents = {
        locality: base.locality,
        region: base.region,
        postcode: base.postcode,
        country: base.country,
    };
    // PMB variant: requires both a street and a PMB-supporting locale.
    if (base.street && localeEntry.pmb && rng() < pmbShare) {
        const pmbIndex = Math.floor(rng() * localeEntry.pmb.length);
        const pmbPhrase = composePoBoxPhrase(localeEntry.pmb[pmbIndex], boxNumber);
        const deliveryLine = base.houseNumber ? `${base.houseNumber} ${base.street}` : base.street;
        const houseNumberPart = base.houseNumber ? { house_number: base.houseNumber } : {};
        return {
            raw: `${deliveryLine}, ${pmbPhrase}, ${cityLine}`,
            components: {
                ...houseNumberPart,
                street: base.street,
                po_box: pmbPhrase,
                ...placeComponents,
            },
            locale,
            template: "pmb-with-street",
        };
    }
    // Standard PO box: replaces the street line entirely.
    return {
        raw: `${standardPhrase}, ${cityLine}`,
        components: {
            po_box: standardPhrase,
            ...placeComponents,
        },
        locale,
        template: "po-box",
    };
}
|
|
158
|
+
/**
 * Map a country identifier to the locale code we have a PO box template for.
 *
 * Accepts ISO-3166-1 alpha-2 / alpha-3 codes and a small set of English display names. Matching
 * ignores surrounding whitespace and letter case. Anything unrecognized falls back to "en-US";
 * this includes a missing (`null` / `undefined`) or non-string country, which previously threw a
 * TypeError on `.trim()`.
 *
 * @param country Country code or display name, e.g. "US", "gbr", "Canada".
 * @returns The matching locale code, or "en-US" when no alias matches.
 */
export function countryToLocale(country) {
    // A missing or non-string value cannot match any alias; treat it like any other unknown
    // country instead of crashing the caller.
    if (typeof country !== "string") {
        return "en-US";
    }
    switch (country.trim().toUpperCase()) {
        case "US":
        case "USA":
        case "UNITED STATES":
            return "en-US";
        case "CA":
        case "CAN":
        case "CANADA":
            return "en-CA";
        case "GB":
        case "UK":
        case "GBR":
        case "UNITED KINGDOM":
            return "en-GB";
        case "AU":
        case "AUS":
        case "AUSTRALIA":
            return "en-AU";
        case "FR":
        case "FRA":
        case "FRANCE":
            return "fr-FR";
        case "ES":
        case "ESP":
        case "SPAIN":
            return "es-ES";
        case "MX":
        case "MEX":
        case "MEXICO":
            return "es-MX";
        case "AR":
        case "ARG":
        case "ARGENTINA":
            return "es-AR";
        default:
            // Unknown countries deliberately share the US template.
            return "en-US";
    }
}
|
|
182
|
+
/**
 * List the locale code of every PO box template, in the order the templates are declared.
 * Exposed for tests and for source-weight tuning.
 */
export function supportedLocales() {
    const locales = [];
    for (const template of LOCALE_TEMPLATES) {
        locales.push(template.locale);
    }
    return locales;
}
|
|
186
|
+
//# sourceMappingURL=synthesize-po-box.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synthesize-po-box.js","sourceRoot":"","sources":["../../src/synthesize-po-box.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAkBH,MAAM,gBAAgB,GAAkC;IACvD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,QAAQ,EAAE,KAAK,EAAE,iBAAiB,EAAE,KAAK,CAAC;QACrF,GAAG,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC;KACjB;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,QAAQ,EAAE,UAAU,EAAE,KAAK,EAAE,iBAAiB,CAAC;QACzD,GAAG,EAAE,CAAC,KAAK,EAAE,GAAG,CAAC;KACjB;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,QAAQ,EAAE,UAAU,EAAE,iBAAiB,CAAC;KAClD;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,QAAQ,EAAE,UAAU,EAAE,iBAAiB,EAAE,SAAS,EAAE,YAAY,CAAC;KAC3E;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,eAAe,EAAE,KAAK,CAAC;KAC/C;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,cAAc,EAAE,IAAI,EAAE,MAAM,CAAC;KACrD;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,qBAAqB,CAAC;KAC7D;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,OAAO,EAAE,UAAU,EAAE,iBAAiB,EAAE,IAAI,CAAC;KACvD;IACD;QACC,MAAM,EAAE,OAAO;QACf,OAAO,EAAE,CAAC,SAAS,EAAE,mBAAmB,EAAE,IAAI,CAAC;KAC/C;CACD,CAAA;AAED,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAyB,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAC,CAAA;AAErG;;;GAGG;AACH,MAAM,UAAU,qBAAqB,CAAC,GAAW,EAAE,MAAoB;IACtE,IAAI,MAAM,EAAE,GAAG,GAAG;QAAE,OAAO,GAAG,CAAA;IAC9B,MAAM,QAAQ,GAAiC;QAC9C,qDAAqD;QACrD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,wCAAwC;QACxC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/D,mDAAmD;QACnD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;KAC5B,CAAA;IACD,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,QA
AQ,CAAC,MAAM,CAAC,CAAE,CAAA;IAC3D,OAAO,CAAC,CAAC,GAAG,CAAC,CAAA;AACd,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAC,MAAc,EAAE,MAAc;IAChE,OAAO,GAAG,MAAM,IAAI,MAAM,EAAE,CAAA;AAC7B,CAAC;AAkBD,SAAS,iBAAiB,CAAC,MAAoB;IAC9C,sFAAsF;IACtF,MAAM,CAAC,GAAG,MAAM,EAAE,CAAA;IAClB,IAAI,CAAC,GAAG,GAAG;QAAE,OAAO,MAAM,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC,CAAA,CAAC,OAAO;IACjE,IAAI,CAAC,GAAG,GAAG;QAAE,OAAO,MAAM,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAA,CAAC,UAAU;IACvE,IAAI,CAAC,GAAG,IAAI;QAAE,OAAO,MAAM,CAAC,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAA,CAAC,YAAY;IAC5E,OAAO,MAAM,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAA,CAAC,cAAc;AACnE,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,kBAAkB,CACjC,IAAgE,EAChE,OAA2B,EAAE;IAE7B,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAA;IACzC,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI,iBAAiB,CAAA;IACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,GAAG,CAAA;IAErC,MAAM,MAAM,GAAG,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC5C,MAAM,GAAG,GAAG,iBAAiB,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IACzC,IAAI,CAAC,GAAG;QAAE,OAAO,IAAI,CAAA;IAErB,MAAM,MAAM,GAAG,qBAAqB,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAA;IAChE,MAAM,MAAM,GAAG,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAE,CAAA;IACtE,MAAM,WAAW,GAAG,kBAAkB,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IAEtD,mEAAmE;IACnE,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,IAAI,GAAG,CAAC,GAAG,IAAI,MAAM,EAAE,GAAG,QAAQ,CAAA;IAC7D,IAAI,OAAO,EAAE,CAAC;QACb,MAAM,SAAS,GAAG,GAAG,CAAC,GAAI,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,GAAI,CAAC,MAAM,CAAC,CAAE,CAAA;QACnE,MAAM,SAAS,GAAG,kBAAkB,CAAC,SAAS,EAAE,MAAM,CAAC,CAAA;QACvD,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,MAAO,CAAA;QACzF,MAAM,GAAG,GAAG,GAAG,UAAU,KAAK,SAAS,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAA;QAC5F,OAAO;YACN,GAAG;YACH,UAAU,EAAE;gBACX,GAAG,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE,YAAY,EAAE,IA
AI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBAC/D,MAAM,EAAE,IAAI,CAAC,MAAO;gBACpB,MAAM,EAAE,SAAS;gBACjB,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,MAAM,EAAE,IAAI,CAAC,MAAM;gBACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;gBACvB,OAAO,EAAE,IAAI,CAAC,OAAO;aACrB;YACD,MAAM;YACN,QAAQ,EAAE,iBAAiB;SAC3B,CAAA;IACF,CAAC;IAED,sDAAsD;IACtD,MAAM,GAAG,GAAG,GAAG,WAAW,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAA;IAC/E,OAAO;QACN,GAAG;QACH,UAAU,EAAE;YACX,MAAM,EAAE,WAAW;YACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,OAAO,EAAE,IAAI,CAAC,OAAO;SACrB;QACD,MAAM;QACN,QAAQ,EAAE,QAAQ;KAClB,CAAA;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,eAAe,CAAC,OAAe;IAC9C,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAA;IACtC,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,eAAe;QAAE,OAAO,OAAO,CAAA;IACtE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,QAAQ;QAAE,OAAO,OAAO,CAAA;IAC/D,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,gBAAgB;QAAE,OAAO,OAAO,CAAA;IACrF,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,WAAW;QAAE,OAAO,OAAO,CAAA;IAClE,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,QAAQ;QAAE,OAAO,OAAO,CAAA;IAC/D,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,OAAO;QAAE,OAAO,OAAO,CAAA;IAC9D,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,QAAQ;QAAE,OAAO,OAAO,CAAA;IAC/D,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,WAAW;QAAE,OAAO,OAAO,CAAA;IAClE,OAAO,OAAO,CAAA;AACf,CAAC;AAED,qFAAqF;AACrF,MAAM,UAAU,gBAAgB;IAC/B,OAAO,gBAAgB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAA;AAC7C,CAAC"}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Street-decomposition synthesizer for Stage 3 training. Generates address rows where
|
|
7
|
+
* `street_prefix`, `street`, and `street_suffix` are emitted as separate BIO spans (instead of
|
|
8
|
+
* monolithic `street`). Mirrors the PO box synthesizer pattern.
|
|
9
|
+
*
|
|
10
|
+
* Why this exists: TIGER/NAD/BAN adapter changes (committed earlier tonight) emit decomposed
|
|
11
|
+
* components from raw source data, but the v0.4.0 parquet shards on Modal were built BEFORE those
|
|
12
|
+
* changes. Rebuilding the full corpus requires downloading raw TIGER/NAD/BAN data and re-running
|
|
13
|
+
* adapters end-to-end — out of scope for a single night shift. This synthesizer takes (locality,
|
|
14
|
+
* region, postcode) tuples and produces freshly-decomposed Stage 3 training rows, same shape as
|
|
15
|
+
* the PO box pipeline.
|
|
16
|
+
*
|
|
17
|
+
* Note: uses the SAME decomposition logic as TIGER's `decomposeStreet()` so the synthetic
|
|
18
|
+
* distribution matches what the model would see if/when TIGER shards are rebuilt with the new
|
|
19
|
+
* adapter.
|
|
20
|
+
*/
|
|
21
|
+
import type { CanonicalRow } from "./types.js";
|
|
22
|
+
/** Base tuple accepted as the `base` argument of `synthesizeStreetRow`; every field is a required string. */
export interface StreetBaseTuple {
|
|
23
|
+
/** Locality name. NOTE(review): presumably the `City` part of the `, City, ST ZIP` tail — confirm against the implementation. */
locality: string;
|
|
24
|
+
/** Region. NOTE(review): presumably the `ST` part of the `, City, ST ZIP` tail — confirm against the implementation. */
region: string;
|
|
25
|
+
/** Postcode. NOTE(review): presumably the `ZIP` part of the `, City, ST ZIP` tail — confirm against the implementation. */
postcode: string;
|
|
26
|
+
/** Country identifier; the accepted formats are not visible from this declaration file. */
country: string;
|
|
27
|
+
}
|
|
28
|
+
/** Row shape returned by `synthesizeStreetRow` (when it does not return `null`). */
export interface SynthesizedStreetRow {
|
|
29
|
+
/** NOTE(review): presumably the rendered address text that the components label — confirm against the implementation. */
raw: string;
|
|
30
|
+
/** Labeled components, typed as the `components` member of `CanonicalRow`. */
components: CanonicalRow["components"];
|
|
31
|
+
/** Locale code for the row; the set of possible values is not visible from this declaration file. */
locale: string;
|
|
32
|
+
}
|
|
33
|
+
/** Optional tuning knobs accepted as the `opts` argument of `synthesizeStreetRow`; every member may be omitted. */
export interface StreetSynthesisOpts {
|
|
34
|
+
/** Random source. NOTE(review): presumably a `Math.random`-compatible function returning values in [0, 1) — confirm against the implementation. */
random?: () => number;
|
|
35
|
+
/** Probability of emitting house_number alongside the street. Default 0.85. */
|
|
36
|
+
includeHouseNumberProb?: number;
|
|
37
|
+
/**
|
|
38
|
+
* Probability of emitting the street BARE — no `, City, ST ZIP` tail and no region/locality/
|
|
39
|
+
* postcode components (just `street_prefix`/`street`/`street_suffix` + optional `house_number`).
|
|
40
|
+
* Default 0 (preserves the original full-address behavior exactly, including the RNG sequence).
|
|
41
|
+
* Set >0 to teach the model that a bare `10th Ave` / `Main St` is a STREET, not a locality — the
|
|
42
|
+
* functional-test failure cluster (bare streets mislabeled `locality`), the bare-format analogue of
|
|
43
|
+
* the v0.7.x intersection-bare fix.
|
|
44
|
+
*/
|
|
45
|
+
bareProb?: number;
|
|
46
|
+
}
|
|
47
|
+
/**
|
|
48
|
+
* Synthesize a US street address with decomposed Stage 3 components. The street is built from
|
|
49
|
+
* PREFIX + NAME + SUFFIX, then passed through the same `decomposeStreet()` utility the TIGER
|
|
50
|
+
* adapter uses — guarantees the synthetic distribution matches the canonical decomposition logic.
|
|
51
|
+
*
* @param base Base (locality, region, postcode, country) tuple the row is generated for.
* @param opts Optional synthesis knobs; see `StreetSynthesisOpts`. May be omitted.
* @returns The synthesized row. `null` is part of the declared return type; the conditions under
*   which it is returned are not visible from this declaration file.
*/
|
|
52
|
+
export declare function synthesizeStreetRow(base: StreetBaseTuple, opts?: StreetSynthesisOpts): SynthesizedStreetRow | null;
|
|
53
|
+
//# sourceMappingURL=synthesize-street.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synthesize-street.d.ts","sourceRoot":"","sources":["../../src/synthesize-street.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAGH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAkI9C,MAAM,WAAW,eAAe;IAC/B,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,MAAM,CAAA;IAChB,OAAO,EAAE,MAAM,CAAA;CACf;AAED,MAAM,WAAW,oBAAoB;IACpC,GAAG,EAAE,MAAM,CAAA;IACX,UAAU,EAAE,YAAY,CAAC,YAAY,CAAC,CAAA;IACtC,MAAM,EAAE,MAAM,CAAA;CACd;AAED,MAAM,WAAW,mBAAmB;IACnC,MAAM,CAAC,EAAE,MAAM,MAAM,CAAA;IACrB,+EAA+E;IAC/E,sBAAsB,CAAC,EAAE,MAAM,CAAA;IAC/B;;;;;;;OAOG;IACH,QAAQ,CAAC,EAAE,MAAM,CAAA;CACjB;AAgBD;;;;GAIG;AACH,wBAAgB,mBAAmB,CAClC,IAAI,EAAE,eAAe,EACrB,IAAI,GAAE,mBAAwB,GAC5B,oBAAoB,GAAG,IAAI,CAgD7B"}
|