@mailwoman/corpus 3.0.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/out/src/adapters/ban/adapter.d.ts.map +1 -1
  2. package/out/src/adapters/ban/adapter.js +6 -2
  3. package/out/src/adapters/ban/adapter.js.map +1 -1
  4. package/out/src/adapters/ban/street-decompose.d.ts +28 -0
  5. package/out/src/adapters/ban/street-decompose.d.ts.map +1 -0
  6. package/out/src/adapters/ban/street-decompose.js +78 -0
  7. package/out/src/adapters/ban/street-decompose.js.map +1 -0
  8. package/out/src/adapters/synth-po-box/adapter.d.ts +48 -0
  9. package/out/src/adapters/synth-po-box/adapter.d.ts.map +1 -0
  10. package/out/src/adapters/synth-po-box/adapter.js +101 -0
  11. package/out/src/adapters/synth-po-box/adapter.js.map +1 -0
  12. package/out/src/adapters/tiger/adapter.d.ts.map +1 -1
  13. package/out/src/adapters/tiger/adapter.js +9 -3
  14. package/out/src/adapters/tiger/adapter.js.map +1 -1
  15. package/out/src/adapters/tiger/street-decompose.d.ts +30 -0
  16. package/out/src/adapters/tiger/street-decompose.d.ts.map +1 -0
  17. package/out/src/adapters/tiger/street-decompose.js +99 -0
  18. package/out/src/adapters/tiger/street-decompose.js.map +1 -0
  19. package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -1
  20. package/out/src/adapters/usgov-nad/adapter.js +31 -10
  21. package/out/src/adapters/usgov-nad/adapter.js.map +1 -1
  22. package/out/src/adapters/wof-admin-jp/adapter.d.ts +58 -0
  23. package/out/src/adapters/wof-admin-jp/adapter.d.ts.map +1 -0
  24. package/out/src/adapters/wof-admin-jp/adapter.js +129 -0
  25. package/out/src/adapters/wof-admin-jp/adapter.js.map +1 -0
  26. package/out/src/index.d.ts +6 -0
  27. package/out/src/index.d.ts.map +1 -1
  28. package/out/src/index.js +6 -0
  29. package/out/src/index.js.map +1 -1
  30. package/out/src/synthesize-german.d.ts +75 -0
  31. package/out/src/synthesize-german.d.ts.map +1 -0
  32. package/out/src/synthesize-german.js +116 -0
  33. package/out/src/synthesize-german.js.map +1 -0
  34. package/out/src/synthesize-house-venue.d.ts +57 -0
  35. package/out/src/synthesize-house-venue.d.ts.map +1 -0
  36. package/out/src/synthesize-house-venue.js +147 -0
  37. package/out/src/synthesize-house-venue.js.map +1 -0
  38. package/out/src/synthesize-intersection.d.ts +48 -0
  39. package/out/src/synthesize-intersection.d.ts.map +1 -0
  40. package/out/src/synthesize-intersection.js +138 -0
  41. package/out/src/synthesize-intersection.js.map +1 -0
  42. package/out/src/synthesize-no-street.d.ts +70 -0
  43. package/out/src/synthesize-no-street.d.ts.map +1 -0
  44. package/out/src/synthesize-no-street.js +279 -0
  45. package/out/src/synthesize-no-street.js.map +1 -0
  46. package/out/src/synthesize-po-box.d.ts +75 -0
  47. package/out/src/synthesize-po-box.d.ts.map +1 -0
  48. package/out/src/synthesize-po-box.js +186 -0
  49. package/out/src/synthesize-po-box.js.map +1 -0
  50. package/out/src/synthesize-street.d.ts +53 -0
  51. package/out/src/synthesize-street.d.ts.map +1 -0
  52. package/out/src/synthesize-street.js +212 -0
  53. package/out/src/synthesize-street.js.map +1 -0
  54. package/out/src/synthesize.js +1 -1
  55. package/out/src/synthesize.js.map +1 -1
  56. package/package.json +3 -2
@@ -0,0 +1,147 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * House-number + venue + street co-occurrence synthesizer. The v0.6.3 corrective shard.
7
+ *
8
+ * The v0.6.2 step-20K diagnostic showed that adding synth-no-street counter-distribution regressed
9
+ * house_number recall by ~4-5pp. DeepSeek's turn-8 root-cause:
10
+ *
11
+ * 1. Direct: `5th Avenue Theatre`-style adversarial venues teach the model that tokens like "5th"
12
+ * belong to venues, not house_numbers. (Fixed in `synthesize-no-street.ts` by removing
13
+ * digit+ordinal venue patterns.)
14
+ * 2. Distributional dilution: synth-no-street adds 122K rows where house_number is absent. The model's
15
+ * training distribution shifts toward "house_number is rare," and it under-emits the tag at
16
+ * inference.
17
+ *
18
+ * This synthesizer fixes #2 directly. Each emitted row has ALL of: house_number, street, venue,
19
+ * locality, region, postcode — a counter-example to "house_number is rare." Used as a companion
20
+ * shard to synth-no-street; the v0.6.3 config weights synth-no-street at 0.5 and
21
+ * synth-house-venue at 1.0 to recover the lost house_number signal.
22
+ *
23
+ * Real-world shape: business cards, mailing labels, store directories — `"123 Main St, Sunrise
24
+ * Bakery, Springfield, IL 62701"` is a perfectly ordinary address form.
25
+ *
26
+ * Venue pool: the PLAIN_VENUES list declared locally in this module (plain names only). Adversarial venues
27
+ * are deliberately NOT used here — the point is to teach co-occurrence, not to re-introduce
28
+ * decompose-mode pressure.
29
+ */
30
// ---------------------------------------------------------------------------------------------
// Plain venue names: none of them carries a street-typing token. This shard exists to teach
// house_number + venue coexistence, NOT to re-introduce decompose-mode pressure.
// Adversarial venue names live in `synthesize-no-street.ts`.
//
// Order matters: entries are selected by index, so reordering changes seeded output.
// ---------------------------------------------------------------------------------------------
const PLAIN_VENUES = [
    "Bob's Pizza", "Acme Corporation", "Joe's Diner", "Sunrise Bakery",
    "Maple Leaf Cafe", "Riverside Garden Center", "Tech Solutions Inc", "Pacific Industries",
    "Atlantic Holdings", "Stellar Consulting", "Greenfield Partners", "Mountain View Studio",
    "The Daily Grind", "Sunset Bistro", "Harvest Moon Florist", "Iron Forge Brewing",
    "Crescent City Bookstore", "Lighthouse Insurance Group", "Pinecrest Veterinary", "Westwood Realty",
];
57
// ---------------------------------------------------------------------------------------------
// Street names substituted when a base tuple has no `street` of its own. All are plain
// "<name> <suffix>" forms without typing-token ambiguity.
//
// Order matters: entries are selected by index, so reordering changes seeded output.
// ---------------------------------------------------------------------------------------------
const FALLBACK_STREETS = [
    "Main St", "Oak Ave", "Pine Rd", "Elm Dr", "Cedar Ln",
    "Maple Blvd", "Birch Ct", "Walnut Pl", "Cherry Way", "Spruce St",
    "Park Ave", "Lake Dr", "Hill Rd", "River Ln", "Forest Blvd",
];
78
// ---------------------------------------------------------------------------------------------
// House-number generator
// ---------------------------------------------------------------------------------------------
/**
 * Produce a plain numeric house number in the range 1-9999, as a string.
 *
 * A digit budget (1-4) is drawn first, then a value in `1 .. 10^digits - 1`. Values shorter
 * than the budget can still occur (e.g. "7" under a 4-digit budget). No fractions or ranges
 * are generated — those land in `data/eval/falsehoods/numbers.jsonl` as known edge cases,
 * not training material.
 *
 * @param random Source of uniform numbers in [0, 1); called exactly twice.
 * @returns The house number rendered in base 10.
 */
function randomHouseNumber(random) {
    const digits = Math.floor(random() * 4) + 1;
    // Exclusive upper bound for this digit budget: 10, 100, 1000 or 10000.
    const limit = Math.pow(10, digits);
    // Scale by (limit - 1) so the result stays within 1 .. limit - 1. Scaling by `limit`
    // would let the top of the range spill over to 10 / 100 / 1000 / 10000.
    const n = Math.floor(random() * (limit - 1)) + 1;
    return String(n);
}
89
/**
 * Select one element of `arr` using a single draw from `random`.
 *
 * @param arr Array to choose from.
 * @param random Source of uniform numbers in [0, 1).
 * @returns The element at the scaled, floored index.
 */
function pick(arr, random) {
    const index = Math.floor(random() * arr.length);
    return arr[index];
}
92
/** Country code → locale tag for the countries this synthesizer recognises. */
const LOCALE_BY_COUNTRY = new Map([
    ["US", "en-US"],
    ["CA", "en-CA"],
    ["GB", "en-GB"],
    ["AU", "en-AU"],
    ["FR", "fr-FR"],
    ["DE", "de-DE"],
]);
/**
 * Map a country code to the locale tag stamped on synthesized rows.
 *
 * @param country Country code; matching is exact and case-sensitive.
 * @returns The mapped locale, or "en-US" for anything unrecognised.
 */
function countryToLocale(country) {
    return LOCALE_BY_COUNTRY.get(country) ?? "en-US";
}
110
// ---------------------------------------------------------------------------------------------
// Synthesis
// ---------------------------------------------------------------------------------------------
/**
 * Build one synthetic row that carries a house number, a street and a venue together.
 *
 * Draw order from `random` is fixed (template, venue, fallback street, house number) so that
 * a seeded generator reproduces the same rows; draws are skipped when the corresponding value
 * is supplied (`opts.forceTemplate`, `base.street`, `base.houseNumber`).
 *
 * @param base Tuple providing `country`, `locality`, `region`, `postcode` and optionally
 *   `street` / `houseNumber`.
 * @param opts Optional `random` (defaults to `Math.random`) and `forceTemplate`.
 * @returns `{ raw, components, locale, template }`.
 * @throws {RangeError} If the selected template is not one of the two known layouts.
 */
export function synthesizeHouseVenueRow(base, opts = {}) {
    const random = opts.random ?? Math.random;
    const locale = countryToLocale(base.country);
    const template = opts.forceTemplate ?? (random() < 0.5 ? "venue-after-street" : "venue-before-street");
    const venue = pick(PLAIN_VENUES, random);
    const street = base.street ?? pick(FALLBACK_STREETS, random);
    const houseNumber = base.houseNumber ?? randomHouseNumber(random);
    const streetLine = `${houseNumber} ${street}`;
    const tail = `${base.locality}, ${base.region} ${base.postcode}`;
    let raw;
    switch (template) {
        case "venue-after-street":
            raw = `${streetLine}, ${venue}, ${tail}`;
            break;
        case "venue-before-street":
            raw = `${venue}, ${streetLine}, ${tail}`;
            break;
        default:
            // Without this branch an unrecognised forceTemplate yields a row whose `raw` is
            // undefined; fail loudly rather than hand a corrupt row to the corpus.
            throw new RangeError(`synthesizeHouseVenueRow: unknown template "${String(template)}"`);
    }
    const components = {
        house_number: houseNumber,
        street,
        venue,
        locality: base.locality,
        region: base.region,
        postcode: base.postcode,
    };
    return { raw, components, locale, template };
}
139
/**
 * Contract check: a synthesized row must carry BOTH `house_number` AND `venue` — the
 * co-occurrence signal that synth-no-street's distributional shift cost the model. Used by
 * tests + downstream consumers.
 *
 * Only `undefined` counts as missing; any other value (including `null` or "") counts as present.
 *
 * @param components Component map of a row.
 * @returns `true` when both fields are defined.
 */
export function hasHouseNumberAndVenue(components) {
    if (components.house_number === undefined) {
        return false;
    }
    return components.venue !== undefined;
}
147
+ //# sourceMappingURL=synthesize-house-venue.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"synthesize-house-venue.js","sourceRoot":"","sources":["../../src/synthesize-house-venue.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AA6BH,gGAAgG;AAChG,mFAAmF;AACnF,iFAAiF;AACjF,6DAA6D;AAC7D,gGAAgG;AAEhG,MAAM,YAAY,GAA0B;IAC3C,aAAa;IACb,kBAAkB;IAClB,aAAa;IACb,gBAAgB;IAChB,iBAAiB;IACjB,yBAAyB;IACzB,oBAAoB;IACpB,oBAAoB;IACpB,mBAAmB;IACnB,oBAAoB;IACpB,qBAAqB;IACrB,sBAAsB;IACtB,iBAAiB;IACjB,eAAe;IACf,sBAAsB;IACtB,oBAAoB;IACpB,yBAAyB;IACzB,4BAA4B;IAC5B,sBAAsB;IACtB,iBAAiB;CACjB,CAAA;AAED,gGAAgG;AAChG,yFAAyF;AACzF,kCAAkC;AAClC,gGAAgG;AAEhG,MAAM,gBAAgB,GAA0B;IAC/C,SAAS;IACT,SAAS;IACT,SAAS;IACT,QAAQ;IACR,UAAU;IACV,YAAY;IACZ,UAAU;IACV,WAAW;IACX,YAAY;IACZ,WAAW;IACX,UAAU;IACV,SAAS;IACT,SAAS;IACT,UAAU;IACV,aAAa;CACb,CAAA;AAED,gGAAgG;AAChG,yBAAyB;AACzB,gGAAgG;AAEhG,SAAS,iBAAiB,CAAC,MAAoB;IAC9C,oFAAoF;IACpF,mFAAmF;IACnF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;IAC3C,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,MAAM,CAAC,CAAA;IAChC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,GAAG,CAAC,CAAA;IACxC,OAAO,MAAM,CAAC,CAAC,CAAC,CAAA;AACjB,CAAC;AAED,SAAS,IAAI,CAAI,GAAqB,EAAE,MAAoB;IAC3D,OAAO,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,MAAM,CAAC,CAAE,CAAA;AAC/C,CAAC;AAED,SAAS,eAAe,CAAC,OAAe;IACvC,QAAQ,OAAO,EAAE,CAAC;QACjB,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf,KAAK,IAAI;YACR,OAAO,OAAO,CAAA;QACf;YACC,OAAO,OAAO,CAAA;IAChB,CAAC;AACF,CAAC;AAED,gGAAgG;AAChG,YAAY;AACZ,gGAAgG;AAEhG,MAAM,UAAU,uBAAuB,CACtC,IAAyB,EACzB,OAAgC,EAAE;IAElC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAA;IACzC,MAAM,MAAM,GAAG,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAC5C,MAAM,QAAQ,GAAG,IAAI,CAAC,aAAa,IAAI,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,qBAAqB,CAAC,CAAA;IAEtG,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,EAAE,MAAM,CAAC,CAAA;IACxC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CA
AC,gBAAgB,EAAE,MAAM,CAAC,CAAA;IAC5D,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,iBAAiB,CAAC,MAAM,CAAC,CAAA;IAEjE,MAAM,UAAU,GAA+B;QAC9C,YAAY,EAAE,WAAW;QACzB,MAAM;QACN,KAAK;QACL,QAAQ,EAAE,IAAI,CAAC,QAAQ;QACvB,MAAM,EAAE,IAAI,CAAC,MAAM;QACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;KACvB,CAAA;IAED,IAAI,GAAW,CAAA;IACf,QAAQ,QAAQ,EAAE,CAAC;QAClB,KAAK,oBAAoB;YACxB,GAAG,GAAG,GAAG,WAAW,IAAI,MAAM,KAAK,KAAK,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAA;YAC7F,MAAK;QACN,KAAK,qBAAqB;YACzB,GAAG,GAAG,GAAG,KAAK,KAAK,WAAW,IAAI,MAAM,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAA;YAC7F,MAAK;IACP,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAA;AAC7C,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,UAAsC;IAC5E,OAAO,UAAU,CAAC,YAAY,KAAK,SAAS,IAAI,UAAU,CAAC,KAAK,KAAK,SAAS,CAAA;AAC/E,CAAC"}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Intersection synthesizer — v0.7 coverage fix (night-3, DeepSeek-decided).
7
+ *
8
+ * The 2026-05-29 harness diagnostic found the neural model emits
9
+ * `intersection_a`/`intersection_b` with ~0.0001 probability on canonical
10
+ * intersections ("Broadway & W 42nd St") — it never learned the tags, because
11
+ * the corpus has NO intersection training signal (no generator, and real-data
12
+ * adapters don't emit intersection-formatted rows). Intersections are 65 of
13
+ * the 376 harness assertions (17%), all 0% neural. This generator produces the
14
+ * missing signal as a small targeted supplement shard (synthesis-as-supplement
15
+ * discipline: weight < 0.25, one-and-done).
16
+ *
17
+ * Output is a `CanonicalRow` ({raw, components}); the corpus aligner turns it
18
+ * into BIO labels (B-/I-intersection_a, O on the connector, B-/I-intersection_b).
19
+ * Surface forms of both streets MUST occur verbatim in `raw` so alignment lands.
20
+ *
21
+ * US-idiomatic only (the harness intersection cases are US: "X & Y, City, ST ZIP").
22
+ */
23
+ import type { CanonicalRow } from "./types.js";
24
+ export interface IntersectionBaseTuple {
25
+ locality: string;
26
+ region: string;
27
+ /** ZIP — optional; ~30% of synthetic intersections omit it (idiomatic). */
28
+ postcode?: string;
29
+ country: string;
30
+ }
31
+ export interface SynthesizedIntersectionRow {
32
+ raw: string;
33
+ components: CanonicalRow["components"];
34
+ locale: string;
35
+ }
36
+ export interface IntersectionSynthesisOpts {
37
+ random?: () => number;
38
+ }
39
+ /**
40
+ * Synthesize one US intersection row. Returns null when `base.country` is not "US", and on the rare
41
+ * degenerate case where the two streets collide (so alignment never has two identical surface forms to disambiguate).
42
+ */
43
+ export declare function synthesizeIntersectionRow(base: IntersectionBaseTuple, opts?: IntersectionSynthesisOpts): SynthesizedIntersectionRow | null;
44
+ /** A small built-in US city/region/zip pool for standalone shard generation + tests. */
45
+ export declare const DEFAULT_US_BASES: ReadonlyArray<IntersectionBaseTuple>;
46
+ /** Generate up to `count` intersection rows over the provided bases (round-robin); fewer are returned if synthesis keeps yielding null. */
47
+ export declare function generateIntersectionRows(count: number, bases?: ReadonlyArray<IntersectionBaseTuple>, opts?: IntersectionSynthesisOpts): SynthesizedIntersectionRow[];
48
+ //# sourceMappingURL=synthesize-intersection.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"synthesize-intersection.d.ts","sourceRoot":"","sources":["../../src/synthesize-intersection.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAyC9C,MAAM,WAAW,qBAAqB;IACrC,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,MAAM,CAAA;IACd,2EAA2E;IAC3E,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,OAAO,EAAE,MAAM,CAAA;CACf;AAED,MAAM,WAAW,0BAA0B;IAC1C,GAAG,EAAE,MAAM,CAAA;IACX,UAAU,EAAE,YAAY,CAAC,YAAY,CAAC,CAAA;IACtC,MAAM,EAAE,MAAM,CAAA;CACd;AAED,MAAM,WAAW,yBAAyB;IACzC,MAAM,CAAC,EAAE,MAAM,MAAM,CAAA;CACrB;AAkBD;;;GAGG;AACH,wBAAgB,yBAAyB,CACxC,IAAI,EAAE,qBAAqB,EAC3B,IAAI,GAAE,yBAA8B,GAClC,0BAA0B,GAAG,IAAI,CAoCnC;AAED,wFAAwF;AACxF,eAAO,MAAM,gBAAgB,EAAE,aAAa,CAAC,qBAAqB,CAWjE,CAAA;AAED,gFAAgF;AAChF,wBAAgB,wBAAwB,CACvC,KAAK,EAAE,MAAM,EACb,KAAK,GAAE,aAAa,CAAC,qBAAqB,CAAoB,EAC9D,IAAI,GAAE,yBAA8B,GAClC,0BAA0B,EAAE,CAU9B"}
@@ -0,0 +1,138 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Intersection synthesizer — v0.7 coverage fix (night-3, DeepSeek-decided).
7
+ *
8
+ * The 2026-05-29 harness diagnostic found the neural model emits
9
+ * `intersection_a`/`intersection_b` with ~0.0001 probability on canonical
10
+ * intersections ("Broadway & W 42nd St") — it never learned the tags, because
11
+ * the corpus has NO intersection training signal (no generator, and real-data
12
+ * adapters don't emit intersection-formatted rows). Intersections are 65 of
13
+ * the 376 harness assertions (17%), all 0% neural. This generator produces the
14
+ * missing signal as a small targeted supplement shard (synthesis-as-supplement
15
+ * discipline: weight < 0.25, one-and-done).
16
+ *
17
+ * Output is a `CanonicalRow` ({raw, components}); the corpus aligner turns it
18
+ * into BIO labels (B-/I-intersection_a, O on the connector, B-/I-intersection_b).
19
+ * Surface forms of both streets MUST occur verbatim in `raw` so alignment lands.
20
+ *
21
+ * US-idiomatic only (the harness intersection cases are US: "X & Y, City, ST ZIP").
22
+ */
23
/**
 * Street name cores (no suffix) — proper-noun streets that often appear bare.
 * Order matters: entries are selected by index, so reordering changes seeded output.
 */
const STREET_CORES = [
    "Main", "Oak", "Elm", "Maple", "Pine", "Cedar",
    "Park", "Lake", "Hill", "Washington", "Lincoln", "Jefferson",
    "Madison", "Franklin", "Market", "Broad", "Church", "Mill",
    "Highland", "Sunset", "Union", "Spring",
];
48
/** Bare proper-noun streets that idiomatically take NO suffix. */
const BARE_NAMES = [
    "Broadway",
    "Wall",
    "Bourbon",
    "Esplanade",
    "Riverside",
    "Lakeshore",
];
/** Ordinal street names: 1st through 10th plus a few larger ones. */
const ORDINALS = [
    "1st", "2nd", "3rd", "4th", "5th",
    "6th", "7th", "8th", "9th", "10th",
    "42nd", "23rd", "34th",
];
/** Abbreviated street-type suffixes. */
const SUFFIXES = [
    "St", "Ave", "Blvd", "Rd", "Dr", "Ln",
    "Way", "Pl", "Ct", "Pkwy", "Ter", "Cir",
];
/** Abbreviated compass prefixes. */
const DIRECTIONALS = [
    "N", "S", "E", "W",
    "NE", "NW", "SE", "SW",
];
/**
 * Connectors placed between the two streets. Each form is padded with one space on both
 * sides, which keeps tokens clean for alignment.
 * `@` added in v0.7.2 — the harness uses it ("Main St @ Second Ave") and v0.7.1 had never seen it.
 */
const CONNECTORS = [
    " & ",
    " and ",
    " at ",
    " / ",
    " @ ",
];
56
/**
 * Choose one element of `arr`, consuming exactly one value from `random`.
 *
 * @param arr Array to choose from.
 * @param random Source of uniform numbers in [0, 1).
 * @returns The element at the scaled, floored position.
 */
function pick(arr, random) {
    const position = Math.floor(random() * arr.length);
    return arr[position];
}
59
/**
 * Build a single street surface form, e.g. "W 42nd St", "Broadway", "Main St", "N Oak Ave".
 *
 * Roughly 20% of results are a bare proper-noun street with no suffix. Otherwise the name is
 * an optional directional (~35%), then an ordinal (~45%) or a name core, then a suffix.
 * The sequence of `random` draws is fixed so seeded generators stay reproducible.
 *
 * @param random Source of uniform numbers in [0, 1).
 * @returns The space-joined street name.
 */
function buildStreetName(random) {
    if (random() < 0.2) {
        return pick(BARE_NAMES, random);
    }
    // Draws below must stay in this order: directional gate, directional, ordinal gate, name, suffix.
    const directional = random() < 0.35 ? pick(DIRECTIONALS, random) : null;
    const useOrdinal = random() < 0.45;
    const name = pick(useOrdinal ? ORDINALS : STREET_CORES, random);
    const suffix = pick(SUFFIXES, random);
    const pieces = directional === null ? [name, suffix] : [directional, name, suffix];
    return pieces.join(" ");
}
71
/**
 * Synthesize one US intersection row.
 *
 * Returns null when `base.country` is not "US", and on the rare degenerate case where the two
 * generated streets are equal or one contains the other even after 8 redraws (alignment needs
 * unambiguous spans for both surface forms).
 *
 * @param base Tuple providing `country`, `locality`, `region` and optionally `postcode`.
 * @param opts Optional `random` (defaults to `Math.random`).
 * @returns `{ raw, components, locale: "en-US" }`, or null.
 */
export function synthesizeIntersectionRow(base, opts = {}) {
    const random = opts.random ?? Math.random;
    if (base.country !== "US") {
        return null;
    }
    // Two surface forms clash when they are equal or one is a substring of the other.
    const clashes = (x, y) => x === y || x.includes(y) || y.includes(x);
    const a = buildStreetName(random);
    let b = buildStreetName(random);
    for (let tries = 0; clashes(a, b) && tries < 8; tries++) {
        b = buildStreetName(random);
    }
    if (clashes(a, b)) {
        return null;
    }
    const connector = pick(CONNECTORS, random);
    // "corner of" prefix variant (~20%) — still labels the two streets identically.
    const cornerPrefix = random() < 0.2 ? "corner of " : "";
    const crossing = `${cornerPrefix}${a}${connector}${b}`;
    const components = { intersection_a: a, intersection_b: b };
    // v0.7.2: ~60% BARE (no locality tail). v0.7.1 always appended ", City, ST", so the model
    // learned to read post-intersection text as a locality and fumbled the harness's bare
    // "X & Y" cases (mislabeling the second street as a locality). Match the eval distribution.
    if (random() < 0.6) {
        return { raw: crossing, components, locale: "en-US" };
    }
    // The postcode is appended ~70% of the time when the base supplies one.
    const includePostcode = base.postcode != null && random() < 0.7;
    let tail = `, ${base.locality}, ${base.region}`;
    components.locality = base.locality;
    components.region = base.region;
    if (includePostcode) {
        tail += ` ${base.postcode}`;
        components.postcode = base.postcode;
    }
    return { raw: crossing + tail, components, locale: "en-US" };
}
112
/**
 * A small built-in US city/region/zip pool for standalone shard generation + tests.
 * Written as [locality, region, postcode] triples; every entry gets `country: "US"`.
 * Postcodes are strings so leading zeros (Boston's "02116") are kept.
 */
export const DEFAULT_US_BASES = [
    ["New York", "NY", "10036"],
    ["Chicago", "IL", "60613"],
    ["Los Angeles", "CA", "90012"],
    ["Seattle", "WA", "98109"],
    ["Austin", "TX", "78701"],
    ["Portland", "OR", "97205"],
    ["Denver", "CO", "80202"],
    ["Boston", "MA", "02116"],
    ["Miami", "FL", "33130"],
    ["Atlanta", "GA", "30303"],
].map(([locality, region, postcode]) => ({ locality, region, postcode, country: "US" }));
125
/**
 * Generate up to `count` intersection rows, cycling through `bases` round-robin.
 *
 * At most `count * 4` synthesis attempts are made, so fewer than `count` rows come back when
 * synthesis keeps returning null. An empty `bases` array yields an empty result.
 *
 * The base index advances on every attempt, not on every accepted row. Indexing by the
 * number of accepted rows would retry a base that always yields null until the attempt
 * budget ran out, and the bases after it would never be used.
 *
 * @param count Number of rows wanted.
 * @param bases Base tuples to cycle through; defaults to `DEFAULT_US_BASES`.
 * @param opts Optional `random` (defaults to `Math.random`), shared by every row.
 * @returns The synthesized rows, in generation order.
 */
export function generateIntersectionRows(count, bases = DEFAULT_US_BASES, opts = {}) {
    const out = [];
    // `index % 0` is NaN, which would hand `undefined` to the synthesizer and throw.
    if (bases.length === 0) {
        return out;
    }
    const random = opts.random ?? Math.random;
    const maxAttempts = count * 4;
    for (let attempt = 0; out.length < count && attempt < maxAttempts; attempt++) {
        const base = bases[attempt % bases.length];
        const row = synthesizeIntersectionRow(base, { random });
        if (row) {
            out.push(row);
        }
    }
    return out;
}
138
+ //# sourceMappingURL=synthesize-intersection.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"synthesize-intersection.js","sourceRoot":"","sources":["../../src/synthesize-intersection.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;GAqBG;AAIH,kFAAkF;AAClF,MAAM,YAAY,GAAG;IACpB,MAAM;IACN,KAAK;IACL,KAAK;IACL,OAAO;IACP,MAAM;IACN,OAAO;IACP,MAAM;IACN,MAAM;IACN,MAAM;IACN,YAAY;IACZ,SAAS;IACT,WAAW;IACX,SAAS;IACT,UAAU;IACV,QAAQ;IACR,OAAO;IACP,QAAQ;IACR,MAAM;IACN,UAAU;IACV,QAAQ;IACR,OAAO;IACP,QAAQ;CACC,CAAA;AAEV,kEAAkE;AAClE,MAAM,UAAU,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,WAAW,EAAE,WAAW,CAAU,CAAA;AAElG,MAAM,QAAQ,GAAG,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAU,CAAA;AAEzH,MAAM,QAAQ,GAAG,CAAC,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,CAAU,CAAA;AAE1G,MAAM,YAAY,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAU,CAAA;AAE1E;uGACuG;AACvG,MAAM,UAAU,GAAG,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,CAAU,CAAA;AAoBlE,SAAS,IAAI,CAAI,GAAqB,EAAE,MAAoB;IAC3D,OAAO,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,GAAG,CAAC,MAAM,CAAC,CAAE,CAAA;AAC/C,CAAC;AAED,gGAAgG;AAChG,SAAS,eAAe,CAAC,MAAoB;IAC5C,uFAAuF;IACvF,IAAI,MAAM,EAAE,GAAG,GAAG;QAAE,OAAO,IAAI,CAAC,UAAU,EAAE,MAAM,CAAC,CAAA;IAEnD,MAAM,KAAK,GAAa,EAAE,CAAA;IAC1B,IAAI,MAAM,EAAE,GAAG,IAAI;QAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;IAC3D,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAA;IACjF,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC,CAAA;IAClC,OAAO,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;AACvB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,yBAAyB,CACxC,IAA2B,EAC3B,OAAkC,EAAE;IAEpC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAA;IACzC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IAEtC,MAAM,CAAC,GAAG,eAAe,CAAC,MAAM,CAAC,CAAA;IACjC,IAAI,CAAC,GAAG,eAAe,CAAC,MAAM,CAAC,CAAA;IAC/B,yGAAyG
;IACzG,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,OAAO,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,KAAK,EAAE,GAAG,CAAC;QAAE,CAAC,GAAG,eAAe,CAAC,MAAM,CAAC,CAAA;IAC9F,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC;QAAE,OAAO,IAAI,CAAA;IAE1D,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,EAAE,MAAM,CAAC,CAAA;IAC1C,gFAAgF;IAChF,MAAM,YAAY,GAAG,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,EAAE,CAAA;IAEvD,MAAM,UAAU,GAA+B,EAAE,cAAc,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAA;IAEvF,kGAAkG;IAClG,4FAA4F;IAC5F,8EAA8E;IAC9E,MAAM,IAAI,GAAG,MAAM,EAAE,GAAG,GAAG,CAAA;IAC3B,IAAI,GAAW,CAAA;IACf,IAAI,IAAI,EAAE,CAAC;QACV,GAAG,GAAG,GAAG,YAAY,GAAG,CAAC,GAAG,SAAS,GAAG,CAAC,EAAE,CAAA;IAC5C,CAAC;SAAM,CAAC;QACP,MAAM,eAAe,GAAG,IAAI,CAAC,QAAQ,IAAI,IAAI,IAAI,MAAM,EAAE,GAAG,GAAG,CAAA;QAC/D,MAAM,IAAI,GAAG,eAAe;YAC3B,CAAC,CAAC,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE;YACvD,CAAC,CAAC,KAAK,IAAI,CAAC,QAAQ,KAAK,IAAI,CAAC,MAAM,EAAE,CAAA;QACvC,GAAG,GAAG,GAAG,YAAY,GAAG,CAAC,GAAG,SAAS,GAAG,CAAC,GAAG,IAAI,EAAE,CAAA;QAClD,UAAU,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAA;QACnC,UAAU,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAA;QAC/B,IAAI,eAAe;YAAE,UAAU,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAA;IACzD,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,EAAE,CAAA;AAC5C,CAAC;AAED,wFAAwF;AACxF,MAAM,CAAC,MAAM,gBAAgB,GAAyC;IACrE,EAAE,QAAQ,EAAE,UAAU,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IACxE,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IACvE,EAAE,QAAQ,EAAE,aAAa,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IAC3E,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IACvE,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IACtE,EAAE,QAAQ,EAAE,UAAU,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IACxE,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,E
AAE;IACtE,EAAE,QAAQ,EAAE,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IACtE,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;IACrE,EAAE,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE;CACvE,CAAA;AAED,gFAAgF;AAChF,MAAM,UAAU,wBAAwB,CACvC,KAAa,EACb,QAA8C,gBAAgB,EAC9D,OAAkC,EAAE;IAEpC,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,MAAM,CAAA;IACzC,MAAM,GAAG,GAAiC,EAAE,CAAA;IAC5C,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,OAAO,GAAG,CAAC,MAAM,GAAG,KAAK,IAAI,KAAK,EAAE,GAAG,KAAK,GAAG,CAAC,EAAE,CAAC;QAClD,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAE,CAAA;QAC9C,MAAM,GAAG,GAAG,yBAAyB,CAAC,IAAI,EAAE,EAAE,MAAM,EAAE,CAAC,CAAA;QACvD,IAAI,GAAG;YAAE,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACvB,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
@@ -0,0 +1,70 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * No-street address synthesizer — the counter-distribution that v0.6.1's synth-street shard is
7
+ * missing. Generates BIO-labelable rows where there is NO street, NO house_number, NO
8
+ * street_prefix, NO street_suffix, NO intersection — only some subset of {venue, locality,
9
+ * region, postcode, country}.
10
+ *
11
+ * Rationale: the [2026-05-28 night-2
12
+ * postmortem](../../docs/articles/evals/2026-05-28-night-2-postmortem.md) and the [layer-1
13
+ * eval](../../docs/articles/evals/2026-05-28-layer-1-morphology-fst.md) showed that synth-street
14
+ * pushed the model into a high-confidence "decompose mode" that leaked into `dependent_locality`.
15
+ * Per DeepSeek's turn-2 recipe, the model needs explicit counter-examples: addresses where the
16
+ * model should NOT emit street labels. This synthesizer is that source.
17
+ *
18
+ * Six row templates, each producing a {raw, components} pair with no street-side tags:
19
+ *
20
+ * 1. **Plain venue + locality + region + postcode** `"Bob's Pizza, Boston, MA 02101"`
21
+ * 2. **Adversarial venue (containing street-typing words)** `"Wall Street Industries, NY 10005"`,
22
+ * `"5th Avenue Theater, Seattle, WA"`, `"Highway 61 Diner, Memphis TN"`. These are the rows
23
+ * that v0.6.1's decompose-mode would mis-tag as street_prefix/suffix; explicit negative
24
+ * training kills that signal.
25
+ * 3. **Locality + region + postcode (minimal)** — `"Boston, MA 02101"`
26
+ * 4. **Locality + region** — `"Boston, MA"`
27
+ * 5. **Postcode-only** — `"02101"`
28
+ * 6. **Country-only** — `"United States"`, `"France"` (rare in real data, but the model has seen these
29
+ * and should not hallucinate streets on them).
30
+ *
31
+ * Output is a `CanonicalRow` with no street-side components. Alignment will produce BIO labels
32
+ * where every token is one of {`B-venue`, `I-venue`, `B-locality`, `I-locality`, `B-region`,
33
+ * `B-postcode`, `B-country`, `I-country`, `O`} — explicitly never any street tag. That IS the
34
+ * counter-example signal the model is missing.
35
+ *
36
+ * This complements (does not replace) the existing US-base-tuple source used by
37
+ * `synthesize-po-box.ts`; the same `NoStreetBaseTuple` shape is consumed.
38
+ */
39
+ import type { CanonicalRow } from "./types.js";
40
+ export interface NoStreetBaseTuple {
41
+ locality: string;
42
+ region: string;
43
+ postcode: string;
44
+ country: string;
45
+ }
46
+ export type NoStreetTemplate = "venue-plain" | "venue-adversarial" | "locality-region-postcode" | "locality-region" | "postcode-only" | "country-only";
47
+ export interface NoStreetSynthesisOpts {
48
+ random?: () => number;
49
+ /** Override the template selection entirely (used by tests for deterministic coverage). */
50
+ forceTemplate?: NoStreetTemplate;
51
+ }
52
+ export interface SynthesizedNoStreetRow {
53
+ raw: string;
54
+ components: CanonicalRow["components"];
55
+ locale: string;
56
+ template: NoStreetTemplate;
57
+ }
58
+ /**
59
+ * Generate one no-street counter-example row for a base (locality, region, postcode, country)
60
+ * tuple. Picks a template by weighted random; the venue templates are the load-bearing
61
+ * counter-distribution against synth-street's decompose-mode pressure.
62
+ */
63
+ export declare function synthesizeNoStreetRow(base: NoStreetBaseTuple, opts?: NoStreetSynthesisOpts): SynthesizedNoStreetRow | null;
64
+ /**
65
+ * Convenience: assert at type-level that a synthesized row carries no street-side components. Used
66
+ * by tests + downstream consumers who want to verify the contract at runtime.
67
+ */
68
+ export declare const STREET_SIDE_TAGS: readonly ["street", "street_prefix", "street_prefix_particle", "street_suffix", "house_number", "intersection_a", "intersection_b", "unit"];
69
+ export declare function hasAnyStreetSideTag(components: CanonicalRow["components"]): boolean;
70
+ //# sourceMappingURL=synthesize-no-street.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"synthesize-no-street.d.ts","sourceRoot":"","sources":["../../src/synthesize-no-street.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqCG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,YAAY,CAAA;AAM9C,MAAM,WAAW,iBAAiB;IACjC,QAAQ,EAAE,MAAM,CAAA;IAChB,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,MAAM,CAAA;IAChB,OAAO,EAAE,MAAM,CAAA;CACf;AAED,MAAM,MAAM,gBAAgB,GACzB,aAAa,GACb,mBAAmB,GACnB,0BAA0B,GAC1B,iBAAiB,GACjB,eAAe,GACf,cAAc,CAAA;AAEjB,MAAM,WAAW,qBAAqB;IACrC,MAAM,CAAC,EAAE,MAAM,MAAM,CAAA;IACrB,2FAA2F;IAC3F,aAAa,CAAC,EAAE,gBAAgB,CAAA;CAChC;AAED,MAAM,WAAW,sBAAsB;IACtC,GAAG,EAAE,MAAM,CAAA;IACX,UAAU,EAAE,YAAY,CAAC,YAAY,CAAC,CAAA;IACtC,MAAM,EAAE,MAAM,CAAA;IACd,QAAQ,EAAE,gBAAgB,CAAA;CAC1B;AA4HD;;;;GAIG;AACH,wBAAgB,qBAAqB,CACpC,IAAI,EAAE,iBAAiB,EACvB,IAAI,GAAE,qBAA0B,GAC9B,sBAAsB,GAAG,IAAI,CAqF/B;AAgBD;;;GAGG;AACH,eAAO,MAAM,gBAAgB,6IASnB,CAAA;AAEV,wBAAgB,mBAAmB,CAAC,UAAU,EAAE,YAAY,CAAC,YAAY,CAAC,GAAG,OAAO,CAKnF"}