@mailwoman/corpus 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/src/adapter.d.ts +96 -0
- package/out/src/adapter.d.ts.map +1 -0
- package/out/src/adapter.js +107 -0
- package/out/src/adapter.js.map +1 -0
- package/out/src/adapters/ban/adapter.d.ts +32 -0
- package/out/src/adapters/ban/adapter.d.ts.map +1 -0
- package/out/src/adapters/ban/adapter.js +133 -0
- package/out/src/adapters/ban/adapter.js.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts +61 -0
- package/out/src/adapters/fcc-bdc/adapter.d.ts.map +1 -0
- package/out/src/adapters/fcc-bdc/adapter.js +153 -0
- package/out/src/adapters/fcc-bdc/adapter.js.map +1 -0
- package/out/src/adapters/index.d.ts +42 -0
- package/out/src/adapters/index.d.ts.map +1 -0
- package/out/src/adapters/index.js +76 -0
- package/out/src/adapters/index.js.map +1 -0
- package/out/src/adapters/openaddresses/adapter.d.ts +60 -0
- package/out/src/adapters/openaddresses/adapter.d.ts.map +1 -0
- package/out/src/adapters/openaddresses/adapter.js +174 -0
- package/out/src/adapters/openaddresses/adapter.js.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts +23 -0
- package/out/src/adapters/state-ia-contractors/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ia-contractors/adapter.js +113 -0
- package/out/src/adapters/state-ia-contractors/adapter.js.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts +21 -0
- package/out/src/adapters/state-ny-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-ny-notaries/adapter.js +132 -0
- package/out/src/adapters/state-ny-notaries/adapter.js.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts +22 -0
- package/out/src/adapters/state-tx-notaries/adapter.d.ts.map +1 -0
- package/out/src/adapters/state-tx-notaries/adapter.js +125 -0
- package/out/src/adapters/state-tx-notaries/adapter.js.map +1 -0
- package/out/src/adapters/tiger/adapter.d.ts +45 -0
- package/out/src/adapters/tiger/adapter.d.ts.map +1 -0
- package/out/src/adapters/tiger/adapter.js +179 -0
- package/out/src/adapters/tiger/adapter.js.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts +36 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js +147 -0
- package/out/src/adapters/usgov-hrsa-fqhc/adapter.js.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts +25 -0
- package/out/src/adapters/usgov-imls-pls/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js +118 -0
- package/out/src/adapters/usgov-imls-pls/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts +37 -0
- package/out/src/adapters/usgov-nad/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nad/adapter.js +227 -0
- package/out/src/adapters/usgov-nad/adapter.js.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts +28 -0
- package/out/src/adapters/usgov-nppes/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-nppes/adapter.js +123 -0
- package/out/src/adapters/usgov-nppes/adapter.js.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts +35 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.d.ts.map +1 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js +162 -0
- package/out/src/adapters/usgov-samhsa-treatment-locator/adapter.js.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts +85 -0
- package/out/src/adapters/wof-admin-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-admin-json/adapter.js +241 -0
- package/out/src/adapters/wof-admin-json/adapter.js.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts +63 -0
- package/out/src/adapters/wof-postalcode-json/adapter.d.ts.map +1 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js +178 -0
- package/out/src/adapters/wof-postalcode-json/adapter.js.map +1 -0
- package/out/src/align.d.ts +58 -0
- package/out/src/align.d.ts.map +1 -0
- package/out/src/align.js +139 -0
- package/out/src/align.js.map +1 -0
- package/out/src/build.d.ts +104 -0
- package/out/src/build.d.ts.map +1 -0
- package/out/src/build.js +201 -0
- package/out/src/build.js.map +1 -0
- package/out/src/codex/us-fips-state.d.ts +44 -0
- package/out/src/codex/us-fips-state.d.ts.map +1 -0
- package/out/src/codex/us-fips-state.js +105 -0
- package/out/src/codex/us-fips-state.js.map +1 -0
- package/out/src/codex/us-street-suffix.d.ts +259 -0
- package/out/src/codex/us-street-suffix.d.ts.map +1 -0
- package/out/src/codex/us-street-suffix.js +285 -0
- package/out/src/codex/us-street-suffix.js.map +1 -0
- package/out/src/format.d.ts +79 -0
- package/out/src/format.d.ts.map +1 -0
- package/out/src/format.js +151 -0
- package/out/src/format.js.map +1 -0
- package/out/src/golden.d.ts +50 -0
- package/out/src/golden.d.ts.map +1 -0
- package/out/src/golden.js +104 -0
- package/out/src/golden.js.map +1 -0
- package/out/src/index.d.ts +18 -0
- package/out/src/index.d.ts.map +1 -0
- package/out/src/index.js +18 -0
- package/out/src/index.js.map +1 -0
- package/out/src/parquet-wrapper/index.d.ts +12 -0
- package/out/src/parquet-wrapper/index.d.ts.map +1 -0
- package/out/src/parquet-wrapper/index.js +12 -0
- package/out/src/parquet-wrapper/index.js.map +1 -0
- package/out/src/parquet-wrapper/reader.d.ts +31 -0
- package/out/src/parquet-wrapper/reader.d.ts.map +1 -0
- package/out/src/parquet-wrapper/reader.js +54 -0
- package/out/src/parquet-wrapper/reader.js.map +1 -0
- package/out/src/parquet-wrapper/schema.d.ts +45 -0
- package/out/src/parquet-wrapper/schema.d.ts.map +1 -0
- package/out/src/parquet-wrapper/schema.js +55 -0
- package/out/src/parquet-wrapper/schema.js.map +1 -0
- package/out/src/parquet-wrapper/writer.d.ts +41 -0
- package/out/src/parquet-wrapper/writer.d.ts.map +1 -0
- package/out/src/parquet-wrapper/writer.js +71 -0
- package/out/src/parquet-wrapper/writer.js.map +1 -0
- package/out/src/parquet.d.ts +122 -0
- package/out/src/parquet.d.ts.map +1 -0
- package/out/src/parquet.js +220 -0
- package/out/src/parquet.js.map +1 -0
- package/out/src/runner.d.ts +100 -0
- package/out/src/runner.d.ts.map +1 -0
- package/out/src/runner.js +183 -0
- package/out/src/runner.js.map +1 -0
- package/out/src/split.d.ts +108 -0
- package/out/src/split.d.ts.map +1 -0
- package/out/src/split.js +191 -0
- package/out/src/split.js.map +1 -0
- package/out/src/synthesize.d.ts +146 -0
- package/out/src/synthesize.d.ts.map +1 -0
- package/out/src/synthesize.js +472 -0
- package/out/src/synthesize.js.map +1 -0
- package/out/src/tokenize.d.ts +47 -0
- package/out/src/tokenize.d.ts.map +1 -0
- package/out/src/tokenize.js +49 -0
- package/out/src/tokenize.js.map +1 -0
- package/out/src/types.d.ts +168 -0
- package/out/src/types.d.ts.map +1 -0
- package/out/src/types.js +19 -0
- package/out/src/types.js.map +1 -0
- package/out/src/wof-json.d.ts +105 -0
- package/out/src/wof-json.d.ts.map +1 -0
- package/out/src/wof-json.js +174 -0
- package/out/src/wof-json.js.map +1 -0
- package/package.json +36 -0
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Synthesis / augmentation per Phase 1 task #6.
|
|
7
|
+
*
|
|
8
|
+
* An `Augmentation` is a pure function that takes a `CanonicalRow` and either returns a new
|
|
9
|
+
* `CanonicalRow` (with `raw` AND `components` transformed in lockstep so alignment still
|
|
10
|
+
* succeeds) or `null` when the augmentation doesn't apply to the row's shape.
|
|
11
|
+
*
|
|
12
|
+
* Synthesis runs **before** alignment: augmentations transform raw + components together, and the
|
|
13
|
+
* runner reruns alignment on each augmented row to produce its labels. This keeps the synthesis
|
|
14
|
+
* surface small (no token/label arithmetic) at the cost of a re-run.
|
|
15
|
+
*
|
|
16
|
+
* Every augmented row carries the `synth` marker:
|
|
17
|
+
*
|
|
18
|
+
* - `method`: the augmentation's stable id (e.g. `"case-upper"`, `"accent-strip"`).
|
|
19
|
+
* - `base_source_id`: the source_id of the un-augmented (or upstream-augmented) row, so ancestry is
|
|
20
|
+
* traceable.
|
|
21
|
+
*
|
|
22
|
+
* Phase 1 implements the locale-agnostic + most useful US/FR augmentations. Typo injection and
|
|
23
|
+
* other stochastic augmentations are intentionally deferred — they need a seed-aware API and are
|
|
24
|
+
* most useful at training time, not corpus build time.
|
|
25
|
+
*/
|
|
26
|
+
import { alignRow } from "./align.js";
|
|
27
|
+
import { US_STREET_SUFFIX_PREFERRED_ABBR, matchCase, matchTrailingSuffix } from "./codex/us-street-suffix.js";
|
|
28
|
+
import { whitespaceTokenizer } from "./tokenize.js";
|
|
29
|
+
/**
 * Build an augmented copy of `source`.
 *
 * The copy keeps every field of `source`, swaps in the new `raw` / `components`, appends
 * `+<method>` to `source_id`, and stamps a `synth` marker. `base_source_id` is inherited from an
 * existing `synth` marker when there is one, so chained augmentations keep pointing at the same
 * root id; otherwise it is the row's own `source_id`.
 *
 * @param source - Row being augmented (not mutated).
 * @param method - Stable id of the augmentation.
 * @param newRaw - Replacement `raw` string.
 * @param newComponents - Replacement component map.
 * @returns A new row object.
 */
function withAugmentation(source, method, newRaw, newComponents) {
    const inherited = source.synth?.base_source_id;
    const marker = {
        method,
        base_source_id: inherited ?? source.source_id,
    };
    const chainedId = `${source.source_id}+${method}`;
    return {
        ...source,
        raw: newRaw,
        components: newComponents,
        source_id: chainedId,
        synth: marker,
    };
}
|
|
40
|
+
// ===========================================================================
|
|
41
|
+
// Locale-agnostic augmentations
|
|
42
|
+
// ===========================================================================
|
|
43
|
+
/**
 * Upper-case `raw` and every non-empty component value.
 *
 * @param row - Row to transform; `raw` and the component values are upper-cased with
 *   `toUpperCase()`.
 * @returns The augmented row tagged `"case-upper"`, or `null` when `raw` is already upper-case.
 */
export const caseUpper = (row) => {
    const shouted = row.raw.toUpperCase();
    if (shouted === row.raw) {
        return null;
    }
    const shoutedComponents = {};
    Object.entries(row.components).forEach(([tag, value]) => {
        // Falsy values (missing or empty components) are dropped rather than copied.
        if (value) {
            shoutedComponents[tag] = value.toUpperCase();
        }
    });
    return withAugmentation(row, "case-upper", shouted, shoutedComponents);
};
|
|
55
|
+
/**
 * Lower-case `raw` and every non-empty component value.
 *
 * @param row - Row to transform; `raw` and the component values are lower-cased with
 *   `toLowerCase()`.
 * @returns The augmented row tagged `"case-lower"`, or `null` when `raw` is already lower-case.
 */
export const caseLower = (row) => {
    const lowered = row.raw.toLowerCase();
    if (lowered === row.raw) {
        return null;
    }
    const loweredComponents = {};
    Object.entries(row.components).forEach(([tag, value]) => {
        // Falsy values (missing or empty components) are dropped rather than copied.
        if (value) {
            loweredComponents[tag] = value.toLowerCase();
        }
    });
    return withAugmentation(row, "case-lower", lowered, loweredComponents);
};
|
|
67
|
+
/**
 * Remove every comma from `raw`, then collapse whitespace runs to one space and trim the ends.
 * Components are copied through unchanged.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"drop-commas"`, or `null` when `raw` contains no comma.
 */
export const dropCommas = (row) => {
    if (row.raw.indexOf(",") === -1) {
        return null;
    }
    const withoutCommas = row.raw.split(",").join("");
    const tidied = withoutCommas.replace(/\s+/g, " ").trim();
    const componentsCopy = { ...row.components };
    return withAugmentation(row, "drop-commas", tidied, componentsCopy);
};
|
|
74
|
+
/**
 * Replace every single space with two spaces in `raw` AND in every non-empty component value.
 *
 * The component update is load-bearing for alignment: `alignRow` substring-searches each
 * component's surface form inside `raw`, so widening the spaces in `raw` only would leave
 * single-spaced components unfindable (this was the bug behind v0.1.1's first build attempt —
 * 99.9% of quarantined rows traced back to this augmentation). Widening both keeps the substring
 * contract intact.
 *
 * The replacement text must be exactly two spaces; a single-space replacement would make this
 * augmentation emit rows identical to their input.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"double-space"`, or `null` when `raw` contains no space.
 */
export const doubleSpace = (row) => {
    if (!row.raw.includes(" ")) {
        return null;
    }
    const TWO_SPACES = " ".repeat(2);
    const widen = (text) => text.replace(/ /g, TWO_SPACES);
    const newComponents = {};
    for (const [tag, value] of Object.entries(row.components)) {
        // Falsy values (missing or empty components) are dropped rather than copied.
        if (value) {
            newComponents[tag] = widen(value);
        }
    }
    return withAugmentation(row, "double-space", widen(row.raw), newComponents);
};
|
|
92
|
+
/**
 * Remove accents / diacritics from `raw` and from every non-empty component value, e.g.
 * "Hôtel" → "Hotel", "Île-de-France" → "Ile-de-France".
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"accent-strip"`, or `null` when stripping leaves `raw`
 *   unchanged.
 */
export const accentStrip = (row) => {
    const plainRaw = stripAccents(row.raw);
    if (plainRaw === row.raw) {
        return null;
    }
    const plainComponents = {};
    Object.entries(row.components).forEach(([tag, value]) => {
        // Falsy values (missing or empty components) are dropped rather than copied.
        if (value) {
            plainComponents[tag] = stripAccents(value);
        }
    });
    return withAugmentation(row, "accent-strip", plainRaw, plainComponents);
};
|
|
107
|
+
/**
 * Strip Unicode combining marks from a string: decompose to NFD, then delete every code point
 * in the `\p{M}` (Mark) category. The result stays in decomposed form.
 *
 * @param s - Input string.
 * @returns `s` with combining marks removed.
 */
function stripAccents(s) {
    const decomposed = s.normalize("NFD");
    const combiningMarks = /\p{M}/gu;
    return decomposed.replace(combiningMarks, "");
}
|
|
110
|
+
// ===========================================================================
|
|
111
|
+
// US-specific augmentations
|
|
112
|
+
// ===========================================================================
|
|
113
|
+
/**
 * US state (plus District of Columbia) full name → alpha-2 code, e.g.
 * `STATE_NAME_TO_ABBR["Oregon"] === "OR"`.
 */
const STATE_NAME_TO_ABBR = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
};
/** Inverse lookup derived from the table above: `STATE_ABBR_TO_NAME["OR"] === "Oregon"`. */
const STATE_ABBR_TO_NAME = {};
for (const [stateName, stateAbbr] of Object.entries(STATE_NAME_TO_ABBR)) {
    STATE_ABBR_TO_NAME[stateAbbr] = stateName;
}
|
|
168
|
+
/**
 * US: replace the alpha-2 state code with the full state name in both `raw` and
 * `components.region` ("Portland, OR" → "Portland, Oregon").
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"state-expand"`, or `null` when the row is not US, has no
 *   `region`, the region is not a known alpha-2 code, or the code does not occur in `raw` as a
 *   whole word.
 */
export const stateExpand = (row) => {
    if (row.country !== "US") {
        return null;
    }
    const region = row.components.region;
    if (!region) {
        return null;
    }
    const full = STATE_ABBR_TO_NAME[region];
    // Require a real table entry. A bare truthiness check also accepts members inherited from
    // Object.prototype (e.g. region "toString" resolves to a function), which would then be
    // written into `raw` and `components.region`.
    if (typeof full !== "string") {
        return null;
    }
    // Word boundaries keep "OR" from matching inside "Stop" and the like. `region` is a known
    // table key at this point, so it contains no regex metacharacters.
    const surface = new RegExp(`\\b${region}\\b`, "g");
    const newRaw = row.raw.replace(surface, full);
    // A full name never equals its code, so an unchanged string means the code was not in `raw`.
    if (newRaw === row.raw) {
        return null;
    }
    const newComponents = { ...row.components, region: full };
    return withAugmentation(row, "state-expand", newRaw, newComponents);
};
|
|
187
|
+
/**
 * US: replace the full state name with its alpha-2 code in both `raw` and `components.region`
 * ("Albany, New York" → "Albany, NY").
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"state-abbreviate"`, or `null` when the row is not US, has
 *   no `region`, the region is not a known full state name, or the name does not occur in `raw`
 *   as a whole word.
 */
export const stateAbbreviate = (row) => {
    if (row.country !== "US") {
        return null;
    }
    const region = row.components.region;
    if (!region) {
        return null;
    }
    const abbr = STATE_NAME_TO_ABBR[region];
    // Require a real table entry. A bare truthiness check also accepts members inherited from
    // Object.prototype (e.g. region "constructor" resolves to a function), which would then be
    // written into `components.region`.
    if (typeof abbr !== "string") {
        return null;
    }
    // `region` is a known table key at this point (letters and spaces only), so it is safe to
    // embed in the pattern without escaping.
    const surface = new RegExp(`\\b${region}\\b`, "g");
    const newRaw = row.raw.replace(surface, abbr);
    // A code never equals its full name, so an unchanged string means the name was not in `raw`.
    if (newRaw === row.raw) {
        return null;
    }
    const newComponents = { ...row.components, region: abbr };
    return withAugmentation(row, "state-abbreviate", newRaw, newComponents);
};
|
|
204
|
+
/** Compass directional word → abbreviation, e.g. `DIRECTIONAL_FULL_TO_ABBR["Northwest"] === "NW"`. */
const DIRECTIONAL_FULL_TO_ABBR = {
    // Cardinal directions.
    "North": "N",
    "South": "S",
    "East": "E",
    "West": "W",
    // Intercardinal directions.
    "Northeast": "NE",
    "Northwest": "NW",
    "Southeast": "SE",
    "Southwest": "SW",
};
/** Inverse lookup derived from the table above: `DIRECTIONAL_ABBR_TO_FULL["NW"] === "Northwest"`. */
const DIRECTIONAL_ABBR_TO_FULL = {};
for (const [word, abbr] of Object.entries(DIRECTIONAL_FULL_TO_ABBR)) {
    DIRECTIONAL_ABBR_TO_FULL[abbr] = word;
}
|
|
215
|
+
/**
 * US: expand directional abbreviations (N, S, E, W, NE, NW, SE, SW) found as whole words in the
 * `street`, `street_suffix` and `street_prefix` components (NW → Northwest), and mirror each
 * change into `raw` by replacing whole-word occurrences of the old component value.
 *
 * Matching is case-sensitive: only the upper-case abbreviations are expanded.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"directional-expand"`, or `null` when the row is not US or
 *   none of the three components changed.
 */
export const directionalExpand = (row) => {
    if (row.country !== "US") {
        return null;
    }
    const newComponents = { ...row.components };
    let newRaw = row.raw;
    let changed = false;
    for (const tag of ["street", "street_suffix", "street_prefix"]) {
        const before = newComponents[tag];
        if (!before) {
            continue;
        }
        const after = before.replace(/\b(N|S|E|W|NE|NW|SE|SW)\b/g, (abbr) => DIRECTIONAL_ABBR_TO_FULL[abbr] ?? abbr);
        if (after === before) {
            continue;
        }
        newComponents[tag] = after;
        // Use a function replacer so `after` is inserted verbatim. Passed as a string, sequences
        // such as "$$" or "$&" inside a street name would be interpreted as replacement patterns
        // and `raw` would no longer contain the component's surface form.
        newRaw = newRaw.replace(new RegExp(`\\b${escapeRegex(before)}\\b`, "g"), () => after);
        changed = true;
    }
    if (!changed) {
        return null;
    }
    return withAugmentation(row, "directional-expand", newRaw, newComponents);
};
|
|
238
|
+
/**
 * US: abbreviate directional words (North, South, East, West, Northeast, Northwest, Southeast,
 * Southwest) found as whole words in the `street`, `street_suffix` and `street_prefix`
 * components (Northwest → NW), and mirror each change into `raw` by replacing whole-word
 * occurrences of the old component value.
 *
 * Matching is case-sensitive: only the capitalized spellings are abbreviated.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"directional-abbreviate"`, or `null` when the row is not US
 *   or none of the three components changed.
 */
export const directionalAbbreviate = (row) => {
    if (row.country !== "US") {
        return null;
    }
    const newComponents = { ...row.components };
    let newRaw = row.raw;
    let changed = false;
    for (const tag of ["street", "street_suffix", "street_prefix"]) {
        const before = newComponents[tag];
        if (!before) {
            continue;
        }
        const after = before.replace(/\b(North|South|East|West|Northeast|Northwest|Southeast|Southwest)\b/g, (word) => DIRECTIONAL_FULL_TO_ABBR[word] ?? word);
        if (after === before) {
            continue;
        }
        newComponents[tag] = after;
        // Use a function replacer so `after` is inserted verbatim. Passed as a string, sequences
        // such as "$$" or "$&" inside a street name would be interpreted as replacement patterns
        // and `raw` would no longer contain the component's surface form.
        newRaw = newRaw.replace(new RegExp(`\\b${escapeRegex(before)}\\b`, "g"), () => after);
        changed = true;
    }
    if (!changed) {
        return null;
    }
    return withAugmentation(row, "directional-abbreviate", newRaw, newComponents);
};
|
|
261
|
+
/**
 * US: swap the trailing street-suffix word in `components.street` to its preferred USPS
 * abbreviation, preserving case. `"5th Avenue"` → `"5th Ave"`; `"5TH AVENUE"` → `"5TH AVE"`;
 * `"main street"` → `"main st"`.
 *
 * Only the trailing word is targeted, to avoid mangling streets like "Avenue of the Americas"
 * where the suffix-shaped word is part of the proper name rather than a USPS suffix.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"us-street-suffix-abbreviate"`, or `null` when the row is
 *   not US, has no `street`, no trailing suffix is recognized, the trailing word is already the
 *   preferred abbreviation, or the swap would leave `raw` untouched (alignment requires raw and
 *   components to move in lockstep).
 */
export const streetSuffixAbbreviate = (row) => {
    if (row.country !== "US") {
        return null;
    }
    const street = row.components.street;
    if (!street) {
        return null;
    }
    const match = matchTrailingSuffix(street);
    if (!match) {
        return null;
    }
    const preferred = US_STREET_SUFFIX_PREFERRED_ABBR[match.canonical];
    const target = matchCase(preferred, match.matched);
    if (target === match.matched) {
        return null;
    }
    // Keep everything before the last occurrence of the matched word, then append the new suffix.
    const newStreet = `${street.slice(0, street.lastIndexOf(match.matched))}${target}`;
    if (newStreet === street) {
        return null;
    }
    // Use a function replacer so `newStreet` is inserted verbatim. Passed as a string, sequences
    // such as "$$" or "$&" inside a street name would be interpreted as replacement patterns and
    // `raw` would no longer contain the component's surface form.
    const newRaw = row.raw.replace(new RegExp(`\\b${escapeRegex(street)}\\b`, "g"), () => newStreet);
    if (newRaw === row.raw) {
        return null;
    }
    const newComponents = { ...row.components, street: newStreet };
    return withAugmentation(row, "us-street-suffix-abbreviate", newRaw, newComponents);
};
|
|
293
|
+
/**
 * US: swap the trailing street-suffix word in `components.street` to its full canonical form,
 * preserving case. `"5th Ave"` → `"5th Avenue"`; `"5TH AVE"` → `"5TH AVENUE"`; `"main st"` →
 * `"main street"`.
 *
 * Only the trailing word is targeted, so a suffix-shaped word earlier in the street name is left
 * alone.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"us-street-suffix-expand"`, or `null` when the row is not
 *   US, has no `street`, no trailing suffix is recognized, the trailing word is already the
 *   canonical full form, or the swap would leave `raw` untouched.
 */
export const streetSuffixExpand = (row) => {
    if (row.country !== "US") {
        return null;
    }
    const street = row.components.street;
    if (!street) {
        return null;
    }
    const match = matchTrailingSuffix(street);
    if (!match) {
        return null;
    }
    const target = matchCase(match.canonical, match.matched);
    if (target === match.matched) {
        return null;
    }
    // Keep everything before the last occurrence of the matched word, then append the new suffix.
    const newStreet = `${street.slice(0, street.lastIndexOf(match.matched))}${target}`;
    if (newStreet === street) {
        return null;
    }
    // Use a function replacer so `newStreet` is inserted verbatim. Passed as a string, sequences
    // such as "$$" or "$&" inside a street name would be interpreted as replacement patterns and
    // `raw` would no longer contain the component's surface form.
    const newRaw = row.raw.replace(new RegExp(`\\b${escapeRegex(street)}\\b`, "g"), () => newStreet);
    if (newRaw === row.raw) {
        return null;
    }
    const newComponents = { ...row.components, street: newStreet };
    return withAugmentation(row, "us-street-suffix-expand", newRaw, newComponents);
};
|
|
322
|
+
/**
 * US: drop the dash from a ZIP+4 postcode (`12345-6789` → `123456789`) in `components.postcode`
 * and in the first occurrence of that postcode inside `raw`.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"zip-plus4-dash-drop"`, or `null` when the row is not US,
 *   the postcode is missing or not exactly in `NNNNN-NNNN` form, or `raw` does not contain it.
 */
export const zipPlus4DashDrop = (row) => {
    if (row.country !== "US") {
        return null;
    }
    const { postcode } = row.components;
    if (!postcode) {
        return null;
    }
    if (!/^\d{5}-\d{4}$/.test(postcode)) {
        return null;
    }
    // The pattern above guarantees exactly one dash, so removing the first removes them all.
    const compact = postcode.replace("-", "");
    const rewritten = row.raw.replace(postcode, compact);
    if (rewritten === row.raw) {
        return null;
    }
    const components = { ...row.components, postcode: compact };
    return withAugmentation(row, "zip-plus4-dash-drop", rewritten, components);
};
|
|
335
|
+
// ===========================================================================
|
|
336
|
+
// FR-specific augmentations
|
|
337
|
+
// ===========================================================================
|
|
338
|
+
/**
 * FR: drop the article particle from a street ("Rue de la République" → "Rue République").
 *
 * The particle is taken from `components.street_prefix_particle`. It is removed from `raw`
 * wherever it appears with whitespace on both sides, after which whitespace runs are collapsed
 * and the ends trimmed; the `street_prefix_particle` key is omitted from the output components.
 *
 * @param row - Row to transform.
 * @returns The augmented row tagged `"particle-strip"`, or `null` when the row is not FR, has no
 *   particle component, or the whitespace-delimited particle does not occur in `raw`.
 */
export const particleStrip = (row) => {
    if (row.country !== "FR") {
        return null;
    }
    const { street_prefix_particle: particle, ...remaining } = row.components;
    if (!particle) {
        return null;
    }
    const delimited = new RegExp(`\\s+${escapeRegex(particle)}\\s+`, "g");
    if (row.raw.search(delimited) === -1) {
        return null;
    }
    const stripped = row.raw.replace(delimited, " ");
    const tidied = stripped.replace(/\s+/g, " ").trim();
    return withAugmentation(row, "particle-strip", tidied, remaining);
};
|
|
354
|
+
// ===========================================================================
|
|
355
|
+
// Registry + default policies
|
|
356
|
+
// ===========================================================================
|
|
357
|
+
/**
 * Registry mapping each augmentation's stable id to its implementation. The ids are the same
 * strings the augmentations pass as `method` when they build an augmented row.
 */
export const AUGMENTATIONS = {
    // Locale-agnostic
    "case-upper": caseUpper,
    "case-lower": caseLower,
    "drop-commas": dropCommas,
    "double-space": doubleSpace,
    "accent-strip": accentStrip,

    // US-specific
    "state-expand": stateExpand,
    "state-abbreviate": stateAbbreviate,
    "directional-expand": directionalExpand,
    "directional-abbreviate": directionalAbbreviate,
    "us-street-suffix-abbreviate": streetSuffixAbbreviate,
    "us-street-suffix-expand": streetSuffixExpand,
    "zip-plus4-dash-drop": zipPlus4DashDrop,

    // FR-specific
    "particle-strip": particleStrip,
};
|
|
373
|
+
/**
 * Default augmentation list for a country code.
 *
 * Every country gets the shared base list (case-upper, case-lower, drop-commas, double-space).
 * `"US"` additionally gets the state, directional, street-suffix and ZIP+4 augmentations;
 * `"FR"` additionally gets accent-strip and particle-strip. Any other value gets the base list
 * only.
 *
 * @param country - Country code of the row.
 * @returns A fresh array of augmentation functions, in application order.
 */
export function defaultAugmentationsForCountry(country) {
    const shared = [caseUpper, caseLower, dropCommas, doubleSpace];
    if (country === "US") {
        const usOnly = [
            stateExpand,
            stateAbbreviate,
            directionalExpand,
            directionalAbbreviate,
            streetSuffixAbbreviate,
            streetSuffixExpand,
            zipPlus4DashDrop,
        ];
        return [...shared, ...usOnly];
    }
    if (country === "FR") {
        return [...shared, accentStrip, particleStrip];
    }
    return shared;
}
|
|
394
|
+
/**
 * Lazily apply each augmentation to `row` and yield every truthy result, in list order.
 * Augmentations that return `null` (or any other falsy value) are skipped.
 *
 * Each augmentation receives the original `row`; outputs are not fed into one another. Callers
 * who want nested variants (e.g. accent-strip ∘ state-abbreviate) can call this generator again
 * on a yielded row.
 *
 * @param row - Row to augment.
 * @param augmentations - Iterable of augmentation functions; defaults to
 *   `defaultAugmentationsForCountry(row.country)`.
 * @yields Each non-null augmented row.
 */
export function* synthesizeRow(row, augmentations = defaultAugmentationsForCountry(row.country)) {
    for (const augment of augmentations) {
        const candidate = augment(row);
        if (!candidate) {
            continue;
        }
        yield candidate;
    }
}
|
|
405
|
+
/**
 * Backslash-escape the regex metacharacters `. * + ? ^ $ { } ( ) | [ ] \` in `s` so the result
 * can be embedded in a `RegExp` source and match `s` literally.
 *
 * @param s - Literal text.
 * @returns The escaped text.
 */
function escapeRegex(s) {
    const metacharacters = /[.*+?^${}()|[\]\\]/g;
    return s.replace(metacharacters, (ch) => `\\${ch}`);
}
|
|
408
|
+
/**
 * Compose a venue string and an address row into a single adversarial labeled row.
 *
 * The emitted row's `raw` is `${venue.trim()}${separator}${address.raw}`. Tokens are the venue's
 * token texts followed by the tokens of the address aligned in isolation. Labels are `B-venue`
 * for the first venue token and `I-venue` for the rest, followed by the address's labels. This
 * fixed boundary means place-shaped words inside the venue are always labeled `venue`, even when
 * they share surface forms with the address's locality / region / etc.
 *
 * Components are `{ venue, ...address.components }`. NOTE: because the address components are
 * spread after `venue`, an address that already carries a `venue` component overrides the
 * composed one.
 *
 * `source_id` becomes `<address.source_id>+compose:<options.pattern>`; the `synth` marker
 * records that method and inherits `base_source_id` from the address's own `synth` marker when
 * present.
 *
 * @param venue - Venue text; surrounding whitespace is trimmed.
 * @param address - Address row to append; it is aligned with `alignRow` using the same tokenizer.
 * @param options - Required. `pattern` names the compose pattern; `separator` defaults to `", "`;
 *   `tokenizer` defaults to `whitespaceTokenizer()`.
 * @returns `{ kind: "labeled", row }` on success, otherwise
 *   `{ kind: "quarantined", row: { row: address, reason } }` with `reason`:
 *
 *   - `"venue-empty"` — the venue is empty or whitespace-only;
 *   - `"compose-address-<reason>"` — the address failed alignment in isolation (its own reason is
 *     appended);
 *   - `"venue-no-tokens"` — the tokenizer produced no tokens for the venue.
 */
export function composeAdversarialRow(venue, address, options) {
    const separator = options.separator ?? ", ";
    const tokenizer = options.tokenizer ?? whitespaceTokenizer();
    // Every failure path reports against the original address row.
    const quarantine = (reason) => ({ kind: "quarantined", row: { row: address, reason } });

    const venueTrimmed = venue.trim();
    if (venueTrimmed === "") {
        return quarantine("venue-empty");
    }

    const aligned = alignRow(address, { tokenizer });
    if (aligned.kind !== "labeled") {
        // Prefix the address's own reason so the compose attempt is visible when debugging.
        return quarantine(`compose-address-${aligned.row.reason}`);
    }

    const venueTokens = tokenizer.tokenize(venueTrimmed);
    if (venueTokens.length === 0) {
        return quarantine("venue-no-tokens");
    }

    const venueTexts = [];
    const venueLabels = [];
    venueTokens.forEach((token, position) => {
        venueTexts.push(token.text);
        venueLabels.push(position === 0 ? "B-venue" : "I-venue");
    });

    const method = `compose:${options.pattern}`;
    return {
        kind: "labeled",
        row: {
            raw: `${venueTrimmed}${separator}${address.raw}`,
            components: { venue: venueTrimmed, ...address.components },
            country: address.country,
            locale: address.locale,
            source: address.source,
            source_id: `${address.source_id}+${method}`,
            corpus_version: address.corpus_version,
            license: address.license,
            synth: { method, base_source_id: address.synth?.base_source_id ?? address.source_id },
            tokens: [...venueTexts, ...aligned.row.tokens],
            labels: [...venueLabels, ...aligned.row.labels],
        },
    };
}
|
|
472
|
+
//# sourceMappingURL=synthesize.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"synthesize.js","sourceRoot":"","sources":["../../src/synthesize.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAGH,OAAO,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAA;AACrC,OAAO,EAAE,+BAA+B,EAAE,SAAS,EAAE,mBAAmB,EAAE,MAAM,6BAA6B,CAAA;AAC7G,OAAO,EAAE,mBAAmB,EAAkB,MAAM,eAAe,CAAA;AAWnE,6EAA6E;AAC7E,SAAS,gBAAgB,CACxB,MAAoB,EACpB,MAAc,EACd,MAAc,EACd,aAA4B;IAE5B,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,EAAE,cAAc,IAAI,MAAM,CAAC,SAAS,CAAA;IAC/D,OAAO;QACN,GAAG,MAAM;QACT,GAAG,EAAE,MAAM;QACX,UAAU,EAAE,aAAa;QACzB,SAAS,EAAE,GAAG,MAAM,CAAC,SAAS,IAAI,MAAM,EAAE;QAC1C,KAAK,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,EAAE;KACzC,CAAA;AACF,CAAC;AAED,8EAA8E;AAC9E,gCAAgC;AAChC,8EAA8E;AAE9E,iFAAiF;AACjF,MAAM,CAAC,MAAM,SAAS,GAAiB,CAAC,GAAG,EAAE,EAAE;IAC9C,IAAI,GAAG,CAAC,GAAG,KAAK,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE;QAAE,OAAO,IAAI,CAAA;IAClD,MAAM,KAAK,GAAG,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAA;IACnC,MAAM,YAAY,GAAkB,EAAE,CAAA;IACtC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;QACrD,IAAI,CAAC;YAAE,YAAY,CAAC,CAAiB,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAA;IACzD,CAAC;IACD,OAAO,gBAAgB,CAAC,GAAG,EAAE,YAAY,EAAE,KAAK,EAAE,YAAY,CAAC,CAAA;AAChE,CAAC,CAAA;AAED,iFAAiF;AACjF,MAAM,CAAC,MAAM,SAAS,GAAiB,CAAC,GAAG,EAAE,EAAE;IAC9C,IAAI,GAAG,CAAC,GAAG,KAAK,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE;QAAE,OAAO,IAAI,CAAA;IAClD,MAAM,OAAO,GAAG,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAA;IACrC,MAAM,cAAc,GAAkB,EAAE,CAAA;IACxC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;QACrD,IAAI,CAAC;YAAE,cAAc,CAAC,CAAiB,CAAC,GAAG,CAAC,CAAC,WAAW,EAAE,CAAA;IAC3D,CAAC;IACD,OAAO,gBAAgB,CAAC,GAAG,EAAE,YAAY,EAAE,OAAO,EAAE,cAAc,CAAC,CAAA;AACpE,CAAC,CAAA;AAED,+EAA+E;AAC/E,MAAM,CAAC,MAAM,UAAU,GAAiB,CAAC,GAAG,EAAE,EAAE;IAC/C,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IACvC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACpE,OAAO,gBAAgB,CAAC,GAAG,EAAE,aAAa,EAAE,MAAM,EAAE,EAAE,GAAG,GAAG,CAAC
,UAAU,EAAE,CAAC,CAAA;AAC3E,CAAC,CAAA;AAED;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,WAAW,GAAiB,CAAC,GAAG,EAAE,EAAE;IAChD,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IACnC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;IAC1C,MAAM,aAAa,GAAkB,EAAE,CAAA;IACvC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;QACrD,IAAI,CAAC;YAAE,aAAa,CAAC,CAAiB,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;IAChE,CAAC;IACD,OAAO,gBAAgB,CAAC,GAAG,EAAE,cAAc,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AACpE,CAAC,CAAA;AAED;;;GAGG;AACH,MAAM,CAAC,MAAM,WAAW,GAAiB,CAAC,GAAG,EAAE,EAAE;IAChD,MAAM,QAAQ,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;IACtC,IAAI,QAAQ,KAAK,GAAG,CAAC,GAAG;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,aAAa,GAAkB,EAAE,CAAA;IACvC,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;QACrD,IAAI,CAAC;YAAE,aAAa,CAAC,CAAiB,CAAC,GAAG,YAAY,CAAC,CAAC,CAAC,CAAA;IAC1D,CAAC;IACD,OAAO,gBAAgB,CAAC,GAAG,EAAE,cAAc,EAAE,QAAQ,EAAE,aAAa,CAAC,CAAA;AACtE,CAAC,CAAA;AAED,SAAS,YAAY,CAAC,CAAS;IAC9B,OAAO,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAA;AACjD,CAAC;AAED,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,kFAAkF;AAClF,MAAM,kBAAkB,GAA2B;IAClD,OAAO,EAAE,IAAI;IACb,MAAM,EAAE,IAAI;IACZ,OAAO,EAAE,IAAI;IACb,QAAQ,EAAE,IAAI;IACd,UAAU,EAAE,IAAI;IAChB,QAAQ,EAAE,IAAI;IACd,WAAW,EAAE,IAAI;IACjB,QAAQ,EAAE,IAAI;IACd,OAAO,EAAE,IAAI;IACb,OAAO,EAAE,IAAI;IACb,MAAM,EAAE,IAAI;IACZ,KAAK,EAAE,IAAI;IACX,QAAQ,EAAE,IAAI;IACd,OAAO,EAAE,IAAI;IACb,IAAI,EAAE,IAAI;IACV,MAAM,EAAE,IAAI;IACZ,QAAQ,EAAE,IAAI;IACd,SAAS,EAAE,IAAI;IACf,KAAK,EAAE,IAAI;IACX,QAAQ,EAAE,IAAI;IACd,aAAa,EAAE,IAAI;IACnB,QAAQ,EAAE,IAAI;IACd,SAAS,EAAE,IAAI;IACf,WAAW,EAAE,IAAI;IACjB,QAAQ,EAAE,IAAI;IACd,OAAO,EAAE,IAAI;IACb,QAAQ,EAAE,IAAI;IACd,MAAM,EAAE,IAAI;IACZ,eAAe,EAAE,IAAI;IACrB,YAAY,EAAE,IAAI;IAClB,YAAY,EAAE,IAAI;IAClB,UAAU,EAAE,IAAI;IAChB,gBAAgB,EAAE,IAAI;IACtB,cAAc,EAAE,IAAI;IACpB,IAAI,EAAE,IAAI;IACV,QAAQ,EAAE,IAAI;IACd,MAAM,EAAE,IAAI;IACZ,YAAY,EAAE,IA
AI;IAClB,cAAc,EAAE,IAAI;IACpB,gBAAgB,EAAE,IAAI;IACtB,cAAc,EAAE,IAAI;IACpB,SAAS,EAAE,IAAI;IACf,KAAK,EAAE,IAAI;IACX,IAAI,EAAE,IAAI;IACV,OAAO,EAAE,IAAI;IACb,QAAQ,EAAE,IAAI;IACd,UAAU,EAAE,IAAI;IAChB,eAAe,EAAE,IAAI;IACrB,SAAS,EAAE,IAAI;IACf,OAAO,EAAE,IAAI;IACb,sBAAsB,EAAE,IAAI;CAC5B,CAAA;AAED,MAAM,kBAAkB,GAA2B,MAAM,CAAC,WAAW,CACpE,MAAM,CAAC,OAAO,CAAC,kBAAkB,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAC1D,CAAA;AAED,uEAAuE;AACvE,MAAM,CAAC,MAAM,WAAW,GAAiB,CAAC,GAAG,EAAE,EAAE;IAChD,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,CAAC,MAAM,CAAA;IACpC,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAA;IACxB,MAAM,IAAI,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAA;IACvC,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,wFAAwF;IACxF,wCAAwC;IACxC,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,MAAM,MAAM,KAAK,EAAE,GAAG,CAAC,CAAA;IAC7C,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IAClC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,MAAM,KAAK,EAAE,GAAG,CAAC,EAAE,IAAI,CAAC,CAAA;IACxE,MAAM,aAAa,GAAkB,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;IACxE,OAAO,gBAAgB,CAAC,GAAG,EAAE,cAAc,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AACpE,CAAC,CAAA;AAED,uEAAuE;AACvE,MAAM,CAAC,MAAM,eAAe,GAAiB,CAAC,GAAG,EAAE,EAAE;IACpD,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,CAAC,MAAM,CAAA;IACpC,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAA;IACxB,MAAM,IAAI,GAAG,kBAAkB,CAAC,MAAM,CAAC,CAAA;IACvC,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAA;IACtB,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,MAAM,MAAM,KAAK,EAAE,GAAG,CAAC,CAAA;IAC7C,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IAClC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,MAAM,KAAK,EAAE,GAAG,CAAC,EAAE,IAAI,CAAC,CAAA;IACxE,MAAM,aAAa,GAAkB,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;IACxE,OAAO,gBAAgB,CAAC,GAAG,EAAE,kBAAkB,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AACxE,CAAC,CAAA;AAED,MAAM,wBAAwB,GAA2B;IACxD,KAAK,EAAE,GAAG;IACV,KAAK,EAAE,GA
AG;IACV,IAAI,EAAE,GAAG;IACT,IAAI,EAAE,GAAG;IACT,SAAS,EAAE,IAAI;IACf,SAAS,EAAE,IAAI;IACf,SAAS,EAAE,IAAI;IACf,SAAS,EAAE,IAAI;CACf,CAAA;AACD,MAAM,wBAAwB,GAA2B,MAAM,CAAC,WAAW,CAC1E,MAAM,CAAC,OAAO,CAAC,wBAAwB,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAChE,CAAA;AAED,yFAAyF;AACzF,MAAM,CAAC,MAAM,iBAAiB,GAAiB,CAAC,GAAG,EAAE,EAAE;IACtD,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,WAAW,GAAmB,CAAC,QAAQ,EAAE,eAAe,EAAE,eAAe,CAAC,CAAA;IAChF,IAAI,OAAO,GAAG,KAAK,CAAA;IACnB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG,CAAA;IACpB,MAAM,aAAa,GAAkB,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,CAAA;IAC1D,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,aAAa,CAAC,GAAG,CAAC,CAAA;QAC5B,IAAI,CAAC,CAAC;YAAE,SAAQ;QAChB,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,4BAA4B,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,wBAAwB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;QACjG,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;YACpB,aAAa,CAAC,GAAG,CAAC,GAAG,QAAQ,CAAA;YAC7B,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,WAAW,CAAC,CAAC,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,QAAQ,CAAC,CAAA;YAC7E,OAAO,GAAG,IAAI,CAAA;QACf,CAAC;IACF,CAAC;IACD,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,OAAO,gBAAgB,CAAC,GAAG,EAAE,oBAAoB,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AAC1E,CAAC,CAAA;AAED,yDAAyD;AACzD,MAAM,CAAC,MAAM,qBAAqB,GAAiB,CAAC,GAAG,EAAE,EAAE;IAC1D,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,WAAW,GAAmB,CAAC,QAAQ,EAAE,eAAe,EAAE,eAAe,CAAC,CAAA;IAChF,IAAI,OAAO,GAAG,KAAK,CAAA;IACnB,IAAI,MAAM,GAAG,GAAG,CAAC,GAAG,CAAA;IACpB,MAAM,aAAa,GAAkB,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,CAAA;IAC1D,KAAK,MAAM,GAAG,IAAI,WAAW,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,aAAa,CAAC,GAAG,CAAC,CAAA;QAC5B,IAAI,CAAC,CAAC;YAAE,SAAQ;QAChB,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CACzB,sEAAsE,EACtE,CAAC,CAAC,EAAE,EAAE,CAAC,wBAAwB,CAAC,CAAC,CAAC,IAAI,CAAC,CACvC,CAAA;QACD,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;YACpB,aAAa,CAAC,GAAG,CAAC,GAAG,QAAQ,CAAA;YAC7B,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,WAAW,CAAC,CAAC,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,QAAQ,CAAC,CAAA;YAC7E,OAAO,GAAG,IAAI,CAAA;QA
Cf,CAAC;IACF,CAAC;IACD,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IACzB,OAAO,gBAAgB,CAAC,GAAG,EAAE,wBAAwB,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AAC9E,CAAC,CAAA;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,MAAM,sBAAsB,GAAiB,CAAC,GAAG,EAAE,EAAE;IAC3D,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,CAAC,MAAM,CAAA;IACpC,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAA;IACxB,MAAM,KAAK,GAAG,mBAAmB,CAAC,MAAM,CAAC,CAAA;IACzC,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAA;IAEvB,MAAM,SAAS,GAAG,+BAA+B,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;IAClE,MAAM,MAAM,GAAG,SAAS,CAAC,SAAS,EAAE,KAAK,CAAC,OAAO,CAAC,CAAA;IAClD,IAAI,MAAM,KAAK,KAAK,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IAEzC,MAAM,SAAS,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,GAAG,MAAM,EAAE,CAAA;IAClF,IAAI,SAAS,KAAK,MAAM;QAAE,OAAO,IAAI,CAAA;IAErC,MAAM,aAAa,GAAkB,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,CAAA;IAC7E,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,WAAW,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,SAAS,CAAC,CAAA;IAC1F,IAAI,MAAM,KAAK,GAAG,CAAC,GAAG;QAAE,OAAO,IAAI,CAAA;IACnC,OAAO,gBAAgB,CAAC,GAAG,EAAE,6BAA6B,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AACnF,CAAC,CAAA;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,MAAM,kBAAkB,GAAiB,CAAC,GAAG,EAAE,EAAE;IACvD,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,CAAC,MAAM,CAAA;IACpC,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAA;IACxB,MAAM,KAAK,GAAG,mBAAmB,CAAC,MAAM,CAAC,CAAA;IACzC,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAA;IAEvB,MAAM,MAAM,GAAG,SAAS,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,OAAO,CAAC,CAAA;IACxD,IAAI,MAAM,KAAK,KAAK,CAAC,OAAO;QAAE,OAAO,IAAI,CAAA;IAEzC,MAAM,SAAS,GAAG,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,GAAG,MAAM,EAAE,CAAA;IAClF,IAAI,SAAS,KAAK,MAAM;QAAE,OAAO,IAAI,CAAA;IAErC,MAAM,aAAa,GAAkB,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE,CAAA;IAC7E,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,WAAW,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,SAAS,CAAC,CAAA;IAC1F,IAAI,MAAM,KAAK,GAAG,CAAC,GAAG;QA
AE,OAAO,IAAI,CAAA;IACnC,OAAO,gBAAgB,CAAC,GAAG,EAAE,yBAAyB,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AAC/E,CAAC,CAAA;AAED,gEAAgE;AAChE,MAAM,CAAC,MAAM,gBAAgB,GAAiB,CAAC,GAAG,EAAE,EAAE;IACrD,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,QAAQ,GAAG,GAAG,CAAC,UAAU,CAAC,QAAQ,CAAA;IACxC,IAAI,CAAC,QAAQ,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,QAAQ,CAAC;QAAE,OAAO,IAAI,CAAA;IAC7D,MAAM,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,CAAC,CAAA;IACxC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;IAChD,IAAI,MAAM,KAAK,GAAG,CAAC,GAAG;QAAE,OAAO,IAAI,CAAA;IACnC,OAAO,gBAAgB,CAAC,GAAG,EAAE,qBAAqB,EAAE,MAAM,EAAE,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAA;AACrG,CAAC,CAAA;AAED,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,+FAA+F;AAC/F,MAAM,CAAC,MAAM,aAAa,GAAiB,CAAC,GAAG,EAAE,EAAE;IAClD,IAAI,GAAG,CAAC,OAAO,KAAK,IAAI;QAAE,OAAO,IAAI,CAAA;IACrC,MAAM,QAAQ,GAAG,GAAG,CAAC,UAAU,CAAC,sBAAsB,CAAA;IACtD,IAAI,CAAC,QAAQ;QAAE,OAAO,IAAI,CAAA;IAC1B,MAAM,aAAa,GAAkB,EAAE,GAAG,GAAG,CAAC,UAAU,EAAE,CAAA;IAC1D,OAAO,aAAa,CAAC,sBAAsB,CAAA;IAC3C,+DAA+D;IAC/D,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,OAAO,WAAW,CAAC,QAAQ,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IAC9D,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;QAAE,OAAO,IAAI,CAAA;IAClC,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;IACnE,OAAO,gBAAgB,CAAC,GAAG,EAAE,gBAAgB,EAAE,MAAM,EAAE,aAAa,CAAC,CAAA;AACtE,CAAC,CAAA;AAED,8EAA8E;AAC9E,8BAA8B;AAC9B,8EAA8E;AAE9E,sCAAsC;AACtC,MAAM,CAAC,MAAM,aAAa,GAAiC;IAC1D,YAAY,EAAE,SAAS;IACvB,YAAY,EAAE,SAAS;IACvB,aAAa,EAAE,UAAU;IACzB,cAAc,EAAE,WAAW;IAC3B,cAAc,EAAE,WAAW;IAC3B,cAAc,EAAE,WAAW;IAC3B,kBAAkB,EAAE,eAAe;IACnC,oBAAoB,EAAE,iBAAiB;IACvC,wBAAwB,EAAE,qBAAqB;IAC/C,6BAA6B,EAAE,sBAAsB;IACrD,yBAAyB,EAAE,kBAAkB;IAC7C,qBAAqB,EAAE,gBAAgB;IACvC,gBAAgB,EAAE,aAAa;CAC/B,CAAA;AAED,kGAAkG;AAClG,MAAM,UAAU,8BAA8B,CAAC,OAAe;IAC7D,MAAM,SAAS,GAAG,CAAC,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,WAAW,CAAC,CAAA;IACjE,QAAQ,OAAO,EAAE,CAAC;QACjB,KAAK,IAAI;YACR,OAAO;gBACN,GAAG,SAAS;gBACZ,WA
AW;gBACX,eAAe;gBACf,iBAAiB;gBACjB,qBAAqB;gBACrB,sBAAsB;gBACtB,kBAAkB;gBAClB,gBAAgB;aAChB,CAAA;QACF,KAAK,IAAI;YACR,OAAO,CAAC,GAAG,SAAS,EAAE,WAAW,EAAE,aAAa,CAAC,CAAA;QAClD;YACC,OAAO,SAAS,CAAA;IAClB,CAAC;AACF,CAAC;AAED;;;GAGG;AACH,MAAM,SAAS,CAAC,CAAC,aAAa,CAC7B,GAAiB,EACjB,gBAAyC,8BAA8B,CAAC,GAAG,CAAC,OAAO,CAAC;IAEpF,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;QACjC,MAAM,GAAG,GAAG,GAAG,CAAC,GAAG,CAAC,CAAA;QACpB,IAAI,GAAG;YAAE,MAAM,GAAG,CAAA;IACnB,CAAC;AACF,CAAC;AAED,SAAS,WAAW,CAAC,CAAS;IAC7B,OAAO,CAAC,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAA;AAChD,CAAC;AAiED;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,qBAAqB,CACpC,KAAa,EACb,OAAqB,EACrB,OAAkC;IAElC,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,IAAI,CAAA;IAC3C,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,mBAAmB,EAAE,CAAA;IAE5D,MAAM,YAAY,GAAG,KAAK,CAAC,IAAI,EAAE,CAAA;IACjC,IAAI,CAAC,YAAY,EAAE,CAAC;QACnB,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,GAAG,EAAE,EAAE,GAAG,EAAE,OAAO,EAAE,MAAM,EAAE,aAAa,EAAE,EAAE,CAAA;IAC7E,CAAC;IAED,MAAM,cAAc,GAAG,QAAQ,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,CAAC,CAAA;IACvD,IAAI,cAAc,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;QACvC,kFAAkF;QAClF,kFAAkF;QAClF,+BAA+B;QAC/B,OAAO;YACN,IAAI,EAAE,aAAa;YACnB,GAAG,EAAE,EAAE,GAAG,EAAE,OAAO,EAAE,MAAM,EAAE,mBAAmB,cAAc,CAAC,GAAG,CAAC,MAAM,EAAE,EAAE;SAC7E,CAAA;IACF,CAAC;IAED,MAAM,WAAW,GAAG,SAAS,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAA;IACpD,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC9B,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,GAAG,EAAE,EAAE,GAAG,EAAE,OAAO,EAAE,MAAM,EAAE,iBAAiB,EAAE,EAAE,CAAA;IACjF,CAAC;IAED,MAAM,WAAW,GAAe,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAA;IAE5F,MAAM,MAAM,GAAa,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,GAAG,cAAc,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IAC1F,MAAM,MAAM,GAAe,CAAC,GAAG,WAAW,EAAE,GAAG,cAAc,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IAEzE,MAAM,WAAW,GAAG,GAAG,YAAY,GAAG,SAAS,GAAG,OAAO,CAAC,GAAG,EAAE,CAAA;IAC/D,MAAM,kBAAkB,GAAG;QAC1B,KAAK,EAAE,YAAY;QACnB,GAAG,OAAO,CAAC,UAAU;KACrB,CAAA;IAED,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,E
AAE,cAAc,IAAI,OAAO,CAAC,SAAS,CAAA;IACvE,MAAM,MAAM,GAAG,WAAW,OAAO,CAAC,OAAO,EAAE,CAAA;IAE3C,MAAM,QAAQ,GAAe;QAC5B,GAAG,EAAE,WAAW;QAChB,UAAU,EAAE,kBAAkB;QAC9B,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,MAAM,EAAE,OAAO,CAAC,MAAM;QACtB,SAAS,EAAE,GAAG,OAAO,CAAC,SAAS,IAAI,MAAM,EAAE;QAC3C,cAAc,EAAE,OAAO,CAAC,cAAc;QACtC,OAAO,EAAE,OAAO,CAAC,OAAO;QACxB,KAAK,EAAE,EAAE,MAAM,EAAE,cAAc,EAAE,YAAY,EAAE;QAC/C,MAAM;QACN,MAAM;KACN,CAAA;IAED,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,GAAG,EAAE,QAAQ,EAAE,CAAA;AAC1C,CAAC"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Tokenizer interface for alignment.
|
|
7
|
+
*
|
|
8
|
+
* Two implementations live in the corpus package:
|
|
9
|
+
*
|
|
10
|
+
* 1. `whitespaceTokenizer()` (this file): pure-JS, depends on nothing. Splits a string into maximal
|
|
11
|
+
* runs of letters/digits/marks, dropping whitespace and standalone punctuation. Used as the
|
|
12
|
+
* default for in-container alignment tests and as a fallback when no SentencePiece model is
|
|
13
|
+
* available.
|
|
14
|
+
* 2. `sentencePieceTokenizer(modelPath)` (Phase 1 task #11, deferred): wraps the SentencePiece model
|
|
15
|
+
* trained on the corpus. Same interface, different splits. Locked against the corpus version
|
|
16
|
+
* (`tokenizer-v0.1.0` ships with `corpus-v0.1.0`).
|
|
17
|
+
*
|
|
18
|
+
* The interface is intentionally minimal — only what alignment needs. Each token comes back with
|
|
19
|
+
* its (start, end) character offsets so BIO labels can be assigned by span overlap with component
|
|
20
|
+
* spans, independent of how the tokenizer chose its splits.
|
|
21
|
+
*/
|
|
22
|
+
/** A token with its character span in the source string. */
|
|
23
|
+
export interface TokenSpan {
|
|
24
|
+
/** The token text, possibly normalized (case unchanged here; tokenizers may differ). */
|
|
25
|
+
text: string;
|
|
26
|
+
/** Inclusive start offset (UTF-16 code-unit index) in the source string. */
|
|
27
|
+
start: number;
|
|
28
|
+
/** Exclusive end offset in the source string. When the tokenizer does not normalize token text, `text === source.slice(start, end)`. */
|
|
29
|
+
end: number;
|
|
30
|
+
}
|
|
31
|
+
/** A tokenizer that maps a string to a sequence of `TokenSpan`s. */
|
|
32
|
+
export interface Tokenizer {
|
|
33
|
+
tokenize(text: string): readonly TokenSpan[];
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Whitespace + punctuation tokenizer (pure JS).
|
|
37
|
+
*
|
|
38
|
+
* Tokens are maximal runs of unicode word characters (`\p{L}` letters, `\p{N}` digits, `\p{M}`
|
|
39
|
+
* marks, plus `'`, `-`, `_`). Everything else — whitespace, punctuation, symbols — is treated as a
|
|
40
|
+
* separator and **not** emitted as a token. The resulting spans cover the original string only on
|
|
41
|
+
* token regions; in-between regions belong to no token.
|
|
42
|
+
*
|
|
43
|
+
* This is intentionally lossy at the edges (alignment can still label every meaningful span). A
|
|
44
|
+
* future SentencePiece tokenizer will preserve all bytes via byte-fallback.
|
|
45
|
+
*/
|
|
46
|
+
export declare function whitespaceTokenizer(): Tokenizer;
|
|
47
|
+
//# sourceMappingURL=tokenize.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenize.d.ts","sourceRoot":"","sources":["../../src/tokenize.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,4DAA4D;AAC5D,MAAM,WAAW,SAAS;IACzB,wFAAwF;IACxF,IAAI,EAAE,MAAM,CAAA;IAEZ,4EAA4E;IAC5E,KAAK,EAAE,MAAM,CAAA;IAEb,sFAAsF;IACtF,GAAG,EAAE,MAAM,CAAA;CACX;AAED,oEAAoE;AACpE,MAAM,WAAW,SAAS;IACzB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,SAAS,EAAE,CAAA;CAC5C;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,mBAAmB,IAAI,SAAS,CAe/C"}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Tokenizer interface for alignment.
|
|
7
|
+
*
|
|
8
|
+
* Two implementations live in the corpus package:
|
|
9
|
+
*
|
|
10
|
+
* 1. `whitespaceTokenizer()` (this file): pure-JS, depends on nothing. Splits a string into maximal
|
|
11
|
+
* runs of letters/digits/marks, dropping whitespace and standalone punctuation. Used as the
|
|
12
|
+
* default for in-container alignment tests and as a fallback when no SentencePiece model is
|
|
13
|
+
* available.
|
|
14
|
+
* 2. `sentencePieceTokenizer(modelPath)` (Phase 1 task #11, deferred): wraps the SentencePiece model
|
|
15
|
+
* trained on the corpus. Same interface, different splits. Locked against the corpus version
|
|
16
|
+
* (`tokenizer-v0.1.0` ships with `corpus-v0.1.0`).
|
|
17
|
+
*
|
|
18
|
+
* The interface is intentionally minimal — only what alignment needs. Each token comes back with
|
|
19
|
+
* its (start, end) character offsets so BIO labels can be assigned by span overlap with component
|
|
20
|
+
* spans, independent of how the tokenizer chose its splits.
|
|
21
|
+
*/
|
|
22
|
+
/**
|
|
23
|
+
* Whitespace + punctuation tokenizer (pure JS).
|
|
24
|
+
*
|
|
25
|
+
* Tokens are maximal runs of unicode word characters (`\p{L}` letters, `\p{N}` digits, `\p{M}`
|
|
26
|
+
* marks, plus `'`, `-`, `_`). Everything else — whitespace, punctuation, symbols — is treated as a
|
|
27
|
+
* separator and **not** emitted as a token. The resulting spans cover the original string only on
|
|
28
|
+
* token regions; in-between regions belong to no token.
|
|
29
|
+
*
|
|
30
|
+
* This is intentionally lossy at the edges (alignment can still label every meaningful span). A
|
|
31
|
+
* future SentencePiece tokenizer will preserve all bytes via byte-fallback.
|
|
32
|
+
*/
|
|
33
|
+
export function whitespaceTokenizer() {
    // A token is a maximal run of Unicode letters, digits and combining marks,
    // together with the in-word joiners apostrophe, underscore and hyphen.
    // Anything outside that class (spaces, commas, periods, symbols) only
    // separates tokens and is never emitted.
    const wordRun = /[\p{L}\p{N}\p{M}'_-]+/gu;

    /**
     * Scan `text` left to right and collect every word run with its offsets.
     *
     * @param text - The string to split.
     * @returns One `{ text, start, end }` record per run, in source order;
     *   `start` is inclusive and `end` is exclusive (UTF-16 code-unit indices).
     */
    const tokenize = (text) => {
        const spans = [];
        // The regex is global and shared across calls, so rewind it before each scan.
        wordRun.lastIndex = 0;
        for (let hit = wordRun.exec(text); hit; hit = wordRun.exec(text)) {
            const [word] = hit;
            const start = hit.index;
            spans.push({ text: word, start, end: start + word.length });
        }
        return spans;
    };

    return { tokenize };
}
|
|
49
|
+
//# sourceMappingURL=tokenize.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenize.js","sourceRoot":"","sources":["../../src/tokenize.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAmBH;;;;;;;;;;GAUG;AACH,MAAM,UAAU,mBAAmB;IAClC,4EAA4E;IAC5E,gFAAgF;IAChF,MAAM,OAAO,GAAG,yBAAyB,CAAA;IACzC,OAAO;QACN,QAAQ,CAAC,IAAY;YACpB,MAAM,GAAG,GAAgB,EAAE,CAAA;YAC3B,OAAO,CAAC,SAAS,GAAG,CAAC,CAAA;YACrB,IAAI,CAAyB,CAAA;YAC7B,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;gBACjC,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAA;YACrE,CAAC;YACD,OAAO,GAAG,CAAA;QACX,CAAC;KACD,CAAA;AACF,CAAC"}
|