@mailwoman/query-shape 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ import type { CharacterClass, SpanRange, TokenCharacterClass, TokenClass } from "./types.js";
7
+ /** Codepoint-level character class. */
8
+ export type CodepointClass = TokenCharacterClass | "whitespace" | "connector" | "other";
9
+ /** Classify a single Unicode codepoint. */
10
+ export declare function classifyCodepoint(cp: number): CodepointClass;
11
+ /**
12
+ * Classify a token by walking its codepoints and folding to the dominant class. Mixed alphanumeric
13
+ * (e.g. `"221B"`) returns `"mixed"`; connectors are ignored, so `"10118-1234"` is `"digit"`. Pure-punct tokens return `"punct"`.
14
+ */
15
+ export declare function classifyToken(text: string): TokenCharacterClass;
16
+ /** Fold per-token classes into the whole-input character class. */
17
+ export declare function foldInputClass(tokens: ReadonlyArray<TokenClass>): CharacterClass;
18
+ /**
19
+ * Walk a string and emit token spans (whitespace-and-punctuation-separated). Internal helper —
20
+ * callers receive `TokenClass[]` from `computeQueryShape`.
21
+ */
22
+ export declare function tokenizeForClass(text: string): SpanRange[];
23
+ //# sourceMappingURL=character-class.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"character-class.d.ts","sourceRoot":"","sources":["../character-class.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,mBAAmB,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAE5F,uCAAuC;AACvC,MAAM,MAAM,cAAc,GAAG,mBAAmB,GAAG,YAAY,GAAG,WAAW,GAAG,OAAO,CAAA;AAuFvF,2CAA2C;AAC3C,wBAAgB,iBAAiB,CAAC,EAAE,EAAE,MAAM,GAAG,cAAc,CAY5D;AAED;;;GAGG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,mBAAmB,CA8C/D;AAED,mEAAmE;AACnE,wBAAgB,cAAc,CAAC,MAAM,EAAE,aAAa,CAAC,UAAU,CAAC,GAAG,cAAc,CAyChF;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,EAAE,CAmD1D"}
@@ -0,0 +1,262 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
/**
 * Inclusive `[lo, hi]` codepoint ranges that are reported as `"cjk"`. Besides the Han blocks
 * this deliberately groups kana, Yi, Hangul syllables and the halfwidth/fullwidth forms.
 */
const CJK_RANGES = [
    [0x3040, 0x30ff], // Hiragana + Katakana
    [0x31f0, 0x31ff], // Katakana phonetic extensions
    [0x3400, 0x4dbf], // CJK Unified Ideographs Extension A
    [0x4e00, 0x9fff], // CJK Unified Ideographs
    [0xa000, 0xa4cf], // Yi
    [0xac00, 0xd7af], // Hangul Syllables
    [0xf900, 0xfaff], // CJK Compatibility Ideographs
    [0xff00, 0xffef], // Halfwidth + Fullwidth forms
    [0x20000, 0x2a6df], // CJK Unified Ideographs Extension B
];
/** Inclusive `[lo, hi]` codepoint ranges that are reported as `"cyrillic"`. */
const CYRILLIC_RANGES = [
    [0x0400, 0x04ff], // Cyrillic
    [0x0500, 0x052f], // Cyrillic Supplement
    [0x2de0, 0x2dff], // Cyrillic Extended-A
    [0xa640, 0xa69f], // Cyrillic Extended-B
];
/**
 * Inclusive `[lo, hi]` codepoint ranges that are reported as `"arabic"`.
 *
 * The Presentation Forms-B entry stops at U+FEFC (the last assigned Arabic form). The block
 * nominally runs to U+FEFF, but U+FEFF is the byte-order mark / zero-width no-break space; if it
 * were included, a BOM-prefixed Latin query would gain a stray "arabic" token.
 */
const ARABIC_RANGES = [
    [0x0600, 0x06ff], // Arabic
    [0x0750, 0x077f], // Arabic Supplement
    [0x08a0, 0x08ff], // Arabic Extended-A
    [0xfb50, 0xfdff], // Arabic Presentation Forms-A
    [0xfe70, 0xfefc], // Arabic Presentation Forms-B (excluding U+FEFF, the BOM)
];
30
/**
 * Report whether a codepoint lies inside any of the given ranges.
 *
 * @param cp Codepoint to look up.
 * @param ranges List of `[lo, hi]` pairs; both ends are inclusive.
 * @returns `true` as soon as one pair contains `cp`, otherwise `false`.
 */
function inRange(cp, ranges) {
    return ranges.some(([lower, upper]) => lower <= cp && cp <= upper);
}
37
/**
 * Codepoints reported as `"punct"`. Built from the characters themselves so the table can be
 * read at a glance. `'`, `-` and `_` are not here because they live in `CONNECTOR_CODEPOINTS`;
 * `$` is not listed in either set.
 */
const PUNCT_CODEPOINTS = new Set(Array.from(
// ASCII punctuation and symbols.
"!\"#%&()*+,./:;<=>?@[\\]^`{|}~" +
    // Inverted exclamation / question marks.
    "\u00a1\u00bf" +
    // Curly double quotes, en dash, em dash.
    "\u201c\u201d\u2013\u2014" +
    // CJK comma and CJK period.
    "\u3001\u3002", (ch) => ch.codePointAt(0)));
/**
 * "Connector" codepoints join adjacent tokens instead of separating them: hyphen-minus,
 * apostrophe, underscore and the curly single quotes. They surface in inputs such as
 * "10118-1234", "O'Brien" and "Saint-Denis".
 */
const CONNECTOR_CODEPOINTS = new Set(Array.from("-'_\u2018\u2019", (ch) => ch.codePointAt(0)));
86
/**
 * Classify a single Unicode codepoint.
 *
 * Checks run in a fixed order and the first match wins: ASCII digit, ASCII letter, Latin letters
 * in U+00C0–U+024F / U+1E00–U+1EFF, whitespace, connector, punctuation, then the CJK, Cyrillic
 * and Arabic range tables. Anything left over is `"other"`.
 *
 * Whitespace covers every codepoint with the Unicode `White_Space` property, not just ASCII
 * space/tab/newline and NBSP. In particular U+3000 (ideographic space) separates tokens instead
 * of turning into a token of its own.
 *
 * @param cp Codepoint value, e.g. from `String.prototype.codePointAt`.
 * @returns The codepoint-level class.
 */
export function classifyCodepoint(cp) {
    if (cp >= 0x30 && cp <= 0x39)
        return "digit";
    if ((cp >= 0x41 && cp <= 0x5a) || (cp >= 0x61 && cp <= 0x7a))
        return "alpha";
    // Latin-1 letters with diacritics + Latin Extended-A/B, and Latin Extended Additional.
    if ((cp >= 0x00c0 && cp <= 0x024f) || (cp >= 0x1e00 && cp <= 0x1eff))
        return "alpha";
    // Unicode White_Space: TAB..CR, SPACE, NEL, NBSP, Ogham space mark, EN QUAD..HAIR SPACE,
    // line/paragraph separators, narrow NBSP, medium mathematical space, ideographic space.
    if (cp === 0x20 ||
        (cp >= 0x09 && cp <= 0x0d) ||
        cp === 0x85 ||
        cp === 0xa0 ||
        cp === 0x1680 ||
        (cp >= 0x2000 && cp <= 0x200a) ||
        cp === 0x2028 ||
        cp === 0x2029 ||
        cp === 0x202f ||
        cp === 0x205f ||
        cp === 0x3000)
        return "whitespace";
    if (CONNECTOR_CODEPOINTS.has(cp))
        return "connector";
    if (PUNCT_CODEPOINTS.has(cp))
        return "punct";
    if (inRange(cp, CJK_RANGES))
        return "cjk";
    if (inRange(cp, CYRILLIC_RANGES))
        return "cyrillic";
    if (inRange(cp, ARABIC_RANGES))
        return "arabic";
    return "other";
}
109
/**
 * Classify a token by classifying each of its codepoints and folding the result.
 *
 * Precedence, first match wins: any CJK codepoint gives `"cjk"`, then `"cyrillic"`, then
 * `"arabic"`; digits together with letters give `"mixed"` (e.g. `"221B"`); digits only give
 * `"digit"`; letters only give `"alpha"`; punctuation only gives `"punct"`. Connector,
 * whitespace and "other" codepoints contribute nothing, so `"10118-1234"` is `"digit"`, and a
 * token made only of such codepoints (or an empty string) falls back to `"mixed"`.
 *
 * @param text Token body.
 * @returns The token-level character class.
 */
export function classifyToken(text) {
    // The string iterator walks whole codepoints, so surrogate pairs are classified once.
    const seen = new Set();
    for (const ch of text) {
        seen.add(classifyCodepoint(ch.codePointAt(0)));
    }
    for (const script of ["cjk", "cyrillic", "arabic"]) {
        if (seen.has(script))
            return script;
    }
    const sawDigit = seen.has("digit");
    const sawAlpha = seen.has("alpha");
    if (sawDigit) {
        return sawAlpha ? "mixed" : "digit";
    }
    if (sawAlpha) {
        return "alpha";
    }
    return seen.has("punct") ? "punct" : "mixed";
}
165
/**
 * Fold per-token classes into the whole-input character class.
 *
 * Rules, first match wins:
 * - no tokens gives `"alpha"`;
 * - exactly one of cjk / cyrillic / arabic present and no `"alpha"` token gives that script
 *   (digit and mixed tokens do not count against it);
 * - any other combination involving a non-Latin script gives `"mixed"`;
 * - a `"mixed"` token, or digit and alpha tokens together, give `"alphanumeric"`;
 * - digit tokens only give `"numeric"`, alpha tokens only give `"alpha"`;
 * - anything else (e.g. only `"punct"` tokens) gives `"mixed"`.
 *
 * @param tokens Token records; only their `class` field is read.
 * @returns The input-level character class.
 */
export function foldInputClass(tokens) {
    if (tokens.length === 0) {
        return "alpha";
    }
    const present = new Set();
    for (const token of tokens) {
        present.add(token.class);
    }
    const sawAlpha = present.has("alpha");
    const sawDigit = present.has("digit");
    const scripts = ["cjk", "cyrillic", "arabic"].filter((script) => present.has(script));
    if (scripts.length > 0) {
        // A single non-Latin script stays "pure" only when no Latin-letter token accompanies it.
        return scripts.length === 1 && !sawAlpha ? scripts[0] : "mixed";
    }
    if (present.has("mixed") || (sawDigit && sawAlpha)) {
        return "alphanumeric";
    }
    if (sawDigit) {
        return "numeric";
    }
    return sawAlpha ? "alpha" : "mixed";
}
213
/**
 * Walk a string and emit token spans (whitespace-and-punctuation-separated). Internal helper —
 * callers receive `TokenClass[]` from `computeQueryShape`.
 *
 * Rules:
 * - whitespace and punctuation always separate tokens;
 * - connectors (`-`, `'`, `_`, curly single quotes) are kept only *between* token characters.
 *   Leading and trailing connectors are dropped, so `'Main'` yields `Main` and `abc-` yields
 *   `abc`;
 * - a token also ends at a script transition relative to its first codepoint. Digit and alpha
 *   may mix freely, and a token that starts with an "other" codepoint may continue into digits
 *   or letters; every other class change (e.g. alpha to cjk) starts a new token.
 *
 * @param text Input string.
 * @returns Spans with `start`/`end` as UTF-16 offsets into `text` (end exclusive) and `body`
 *   equal to `text.slice(start, end)`.
 */
export function tokenizeForClass(text) {
    const tokens = [];
    const N = text.length;
    // Hoisted so no closure is allocated per codepoint.
    const isLatinClass = (c) => c === "digit" || c === "alpha";
    let i = 0;
    while (i < N) {
        const cp = text.codePointAt(i);
        const cls = classifyCodepoint(cp);
        // Separators, and connectors with nothing before them, never start a token.
        if (cls === "whitespace" || cls === "punct" || cls === "connector") {
            i += cp > 0xffff ? 2 : 1;
            continue;
        }
        const start = i;
        const startCls = cls;
        let cur = i;
        // `end` trails `cur`: it only advances past non-connector codepoints, so connectors
        // that turn out to be trailing are left outside the span.
        let end = i;
        while (cur < N) {
            const ncp = text.codePointAt(cur);
            const nstep = ncp > 0xffff ? 2 : 1;
            const ncls = classifyCodepoint(ncp);
            if (ncls === "whitespace" || ncls === "punct")
                break;
            if (ncls === "connector") {
                cur += nstep;
                continue;
            }
            // Break tokens across script transitions (digit↔alpha is fine; alpha↔cjk is a boundary).
            const sameFamily = ncls === startCls ||
                (isLatinClass(startCls) && isLatinClass(ncls)) ||
                (startCls === "other" && isLatinClass(ncls));
            if (!sameFamily)
                break;
            cur += nstep;
            end = cur;
        }
        tokens.push({ start, end, body: text.slice(start, end) });
        // Resume after any trailing connectors; they would be skipped as leading ones anyway.
        i = cur;
    }
    return tokens;
}
262
+ //# sourceMappingURL=character-class.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"character-class.js","sourceRoot":"","sources":["../character-class.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAOH,MAAM,UAAU,GAAoC;IACnD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,sBAAsB;IACxC,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,+BAA+B;IACjD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,qCAAqC;IACvD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,yBAAyB;IAC3C,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,KAAK;IACvB,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,mBAAmB;IACrC,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,+BAA+B;IACjD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,8BAA8B;IAChD,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,qCAAqC;CACzD,CAAA;AAED,MAAM,eAAe,GAAoC;IACxD,CAAC,MAAM,EAAE,MAAM,CAAC;IAChB,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,sBAAsB;IACxC,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,sBAAsB;IACxC,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,sBAAsB;CACxC,CAAA;AAED,MAAM,aAAa,GAAoC;IACtD,CAAC,MAAM,EAAE,MAAM,CAAC;IAChB,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,oBAAoB;IACtC,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,oBAAoB;IACtC,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,8BAA8B;IAChD,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,8BAA8B;CAChD,CAAA;AAED,SAAS,OAAO,CAAC,EAAU,EAAE,MAAuC;IACnE,KAAK,MAAM,CAAC,EAAE,EAAE,EAAE,CAAC,IAAI,MAAM,EAAE,CAAC;QAC/B,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE;YAAE,OAAO,IAAI,CAAA;IACtC,CAAC;IACD,OAAO,KAAK,CAAA;AACb,CAAC;AAED,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAAS;IACxC,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,MAAM,EAAE,IAAI;IACZ,MAAM,EAAE,IAAI;IACZ,MAAM,EAAE,IAAI;IACZ,MAAM,EAAE,IAAI;IACZ,MAAM,EAAE,IAAI;IACZ,MAAM,EAAE,IAAI;IACZ,MAAM,EAAE,gBAAgB;IACxB,MAAM,EAAE,iBAAiB;CACzB,CAAC,CAAA;AAEF;;;GAGG;AACH,MAAM,oBAAoB,GAAG,IAAI,GAA
G,CAAS;IAC5C,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,IAAI,EAAE,IAAI;IACV,MAAM,EAAE,IAAI;IACZ,MAAM,EAAE,IAAI;CACZ,CAAC,CAAA;AAEF,2CAA2C;AAC3C,MAAM,UAAU,iBAAiB,CAAC,EAAU;IAC3C,IAAI,EAAE,IAAI,IAAI,IAAI,EAAE,IAAI,IAAI;QAAE,OAAO,OAAO,CAAA;IAC5C,IAAI,CAAC,EAAE,IAAI,IAAI,IAAI,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,EAAE,IAAI,IAAI,IAAI,EAAE,IAAI,IAAI,CAAC;QAAE,OAAO,OAAO,CAAA;IAC5E,uDAAuD;IACvD,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,MAAM,IAAI,EAAE,IAAI,MAAM,CAAC;QAAE,OAAO,OAAO,CAAA;IACpF,IAAI,EAAE,KAAK,IAAI,IAAI,EAAE,KAAK,IAAI,IAAI,EAAE,KAAK,IAAI,IAAI,EAAE,KAAK,IAAI,IAAI,EAAE,KAAK,IAAI;QAAE,OAAO,YAAY,CAAA;IAChG,IAAI,oBAAoB,CAAC,GAAG,CAAC,EAAE,CAAC;QAAE,OAAO,WAAW,CAAA;IACpD,IAAI,gBAAgB,CAAC,GAAG,CAAC,EAAE,CAAC;QAAE,OAAO,OAAO,CAAA;IAC5C,IAAI,OAAO,CAAC,EAAE,EAAE,UAAU,CAAC;QAAE,OAAO,KAAK,CAAA;IACzC,IAAI,OAAO,CAAC,EAAE,EAAE,eAAe,CAAC;QAAE,OAAO,UAAU,CAAA;IACnD,IAAI,OAAO,CAAC,EAAE,EAAE,aAAa,CAAC;QAAE,OAAO,QAAQ,CAAA;IAC/C,OAAO,OAAO,CAAA;AACf,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,aAAa,CAAC,IAAY;IACzC,IAAI,QAAQ,GAAG,KAAK,CAAA;IACpB,IAAI,QAAQ,GAAG,KAAK,CAAA;IACpB,IAAI,MAAM,GAAG,KAAK,CAAA;IAClB,IAAI,WAAW,GAAG,KAAK,CAAA;IACvB,IAAI,SAAS,GAAG,KAAK,CAAA;IACrB,IAAI,QAAQ,GAAG,KAAK,CAAA;IAEpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,GAAI,CAAC;QACnC,MAAM,EAAE,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAE,CAAA;QAC/B,CAAC,IAAI,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACxB,MAAM,GAAG,GAAG,iBAAiB,CAAC,EAAE,CAAC,CAAA;QACjC,QAAQ,GAAG,EAAE,CAAC;YACb,KAAK,OAAO;gBACX,QAAQ,GAAG,IAAI,CAAA;gBACf,MAAK;YACN,KAAK,OAAO;gBACX,QAAQ,GAAG,IAAI,CAAA;gBACf,MAAK;YACN,KAAK,KAAK;gBACT,MAAM,GAAG,IAAI,CAAA;gBACb,MAAK;YACN,KAAK,UAAU;gBACd,WAAW,GAAG,IAAI,CAAA;gBAClB,MAAK;YACN,KAAK,QAAQ;gBACZ,SAAS,GAAG,IAAI,CAAA;gBAChB,MAAK;YACN,KAAK,OAAO;gBACX,QAAQ,GAAG,IAAI,CAAA;gBACf,MAAK;YACN,KAAK,WAAW,CAAC;YACjB,KAAK,YAAY,CAAC;YAClB,KAAK,OAAO;gBACX,MAAK;QACP,CAAC;IACF,CAAC;IAED,IAAI,MAAM;QAAE,OAAO,KAAK,CAAA;IACxB,IAAI,WAAW;QAAE,OAAO,UAAU,CAAA;IAClC,IAAI,SAAS;QAAE,OAAO,QAAQ,CAAA;IAC9B,IAAI,QAAQ,IAAI,
QAAQ;QAAE,OAAO,OAAO,CAAA;IACxC,IAAI,QAAQ;QAAE,OAAO,OAAO,CAAA;IAC5B,IAAI,QAAQ;QAAE,OAAO,OAAO,CAAA;IAC5B,IAAI,QAAQ;QAAE,OAAO,OAAO,CAAA;IAC5B,OAAO,OAAO,CAAA;AACf,CAAC;AAED,mEAAmE;AACnE,MAAM,UAAU,cAAc,CAAC,MAAiC;IAC/D,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,OAAO,CAAA;IAEvC,IAAI,QAAQ,GAAG,KAAK,CAAA;IACpB,IAAI,QAAQ,GAAG,KAAK,CAAA;IACpB,IAAI,MAAM,GAAG,KAAK,CAAA;IAClB,IAAI,WAAW,GAAG,KAAK,CAAA;IACvB,IAAI,SAAS,GAAG,KAAK,CAAA;IACrB,IAAI,QAAQ,GAAG,KAAK,CAAA;IAEpB,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;YACjB,KAAK,KAAK;gBACT,MAAM,GAAG,IAAI,CAAA;gBACb,MAAK;YACN,KAAK,UAAU;gBACd,WAAW,GAAG,IAAI,CAAA;gBAClB,MAAK;YACN,KAAK,QAAQ;gBACZ,SAAS,GAAG,IAAI,CAAA;gBAChB,MAAK;YACN,KAAK,OAAO;gBACX,QAAQ,GAAG,IAAI,CAAA;gBACf,MAAK;YACN,KAAK,OAAO;gBACX,QAAQ,GAAG,IAAI,CAAA;gBACf,MAAK;YACN,KAAK,OAAO;gBACX,QAAQ,GAAG,IAAI,CAAA;gBACf,MAAK;QACP,CAAC;IACF,CAAC;IAED,IAAI,MAAM,IAAI,CAAC,QAAQ,IAAI,CAAC,WAAW,IAAI,CAAC,SAAS;QAAE,OAAO,KAAK,CAAA;IACnE,IAAI,WAAW,IAAI,CAAC,QAAQ,IAAI,CAAC,MAAM,IAAI,CAAC,SAAS;QAAE,OAAO,UAAU,CAAA;IACxE,IAAI,SAAS,IAAI,CAAC,QAAQ,IAAI,CAAC,MAAM,IAAI,CAAC,WAAW;QAAE,OAAO,QAAQ,CAAA;IACtE,IAAI,MAAM,IAAI,WAAW,IAAI,SAAS;QAAE,OAAO,OAAO,CAAA;IACtD,IAAI,QAAQ,IAAI,CAAC,QAAQ,IAAI,QAAQ,CAAC;QAAE,OAAO,cAAc,CAAA;IAC7D,IAAI,QAAQ,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAA;IAC3C,IAAI,QAAQ,IAAI,CAAC,QAAQ;QAAE,OAAO,OAAO,CAAA;IACzC,OAAO,OAAO,CAAA;AACf,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC5C,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,CAAA;IAErB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;QACd,MAAM,EAAE,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAE,CAAA;QAC/B,MAAM,GAAG,GAAG,iBAAiB,CAAC,EAAE,CAAC,CAAA;QAEjC,IAAI,GAAG,KAAK,YAAY,IAAI,GAAG,KAAK,OAAO,EAAE,CAAC;YAC7C,CAAC,IAAI,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;YACxB,SAAQ;QACT,CAAC;QACD,+FAA+F;QAC/F,IAAI,GAAG,KAAK,WAAW,EAAE,CAAC;YACzB,CAAC,IAAI,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;YACxB,SAAQ;QACT,CAAC;QAED,iFAAiF;QACjF,iEAAiE;QACjE,MAAM,KAAK,GAAG,CAAC,CAAA;Q
ACf,MAAM,QAAQ,GAAG,GAAG,CAAA;QACpB,IAAI,GAAG,GAAG,CAAC,CAAA;QACX,OAAO,GAAG,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,GAAG,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAE,CAAA;YAClC,MAAM,KAAK,GAAG,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;YAClC,MAAM,IAAI,GAAG,iBAAiB,CAAC,GAAG,CAAC,CAAA;YACnC,IAAI,IAAI,KAAK,YAAY,IAAI,IAAI,KAAK,OAAO;gBAAE,MAAK;YACpD,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;gBAC1B,GAAG,IAAI,KAAK,CAAA;gBACZ,SAAQ;YACT,CAAC;YACD,yFAAyF;YACzF,MAAM,WAAW,GAAG,CAAC,CAAiB,EAAE,CAAiB,EAAE,EAAE,CAC5D,CAAC,CAAC,KAAK,OAAO,IAAI,CAAC,KAAK,OAAO,CAAC,IAAI,CAAC,CAAC,KAAK,OAAO,IAAI,CAAC,KAAK,OAAO,CAAC,CAAA;YACrE,IACC,IAAI,KAAK,QAAQ;gBACjB,CAAC,WAAW,CAAC,QAAQ,EAAE,IAAI,CAAC;gBAC5B,CAAC,CAAC,QAAQ,KAAK,OAAO,IAAI,CAAC,IAAI,KAAK,OAAO,IAAI,IAAI,KAAK,OAAO,CAAC,CAAC,EAChE,CAAC;gBACF,MAAK;YACN,CAAC;YACD,GAAG,IAAI,KAAK,CAAA;QACb,CAAC;QAED,MAAM,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,CAAC,CAAA;QAC9D,CAAC,GAAG,GAAG,CAAA;IACR,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ import type { ComputeQueryShapeOpts, NormalizedInputLite, QueryShape } from "./types.js";
7
+ /**
8
+ * Compute a `QueryShape` from a string or normalized input. Microseconds-cheap, pure-function.
9
+ *
10
+ * @example const shape = computeQueryShape("350 5th Ave, New York, NY 10118");
11
+ * // shape.knownFormats.find((f) => f.format === "us_zip") is defined; shape.segments.length === 4
12
+ */
13
+ export declare function computeQueryShape(input: string | NormalizedInputLite, opts?: ComputeQueryShapeOpts): QueryShape;
14
+ //# sourceMappingURL=compute.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compute.d.ts","sourceRoot":"","sources":["../compute.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAMH,OAAO,KAAK,EAAE,qBAAqB,EAAE,mBAAmB,EAAE,UAAU,EAAiC,MAAM,YAAY,CAAA;AA0BvH;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,MAAM,GAAG,mBAAmB,EAAE,IAAI,CAAC,EAAE,qBAAqB,GAAG,UAAU,CA0B/G"}
package/out/compute.js ADDED
@@ -0,0 +1,70 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ import { classifyToken, foldInputClass, tokenizeForClass } from "./character-class.js";
7
+ import { detectKnownFormats } from "./known-formats.js";
8
+ import { detectRegionAbbreviations } from "./region-abbreviations.js";
9
+ import { segment } from "./segmentation.js";
10
/**
 * Summarise how ASCII spaces and tabs are used in `text`.
 *
 * Only U+0020 and U+0009 are considered; other whitespace is ignored.
 *
 * @param text Input string.
 * @returns `"mixed"` when a tab and at least one space both occur, `"tab"` for tabs only,
 *   `"double"` when two spaces appear back to back, `"single"` when spaces occur but never
 *   adjacent, and `"none"` when there is neither.
 */
function detectWhitespacePattern(text) {
    const sawTab = text.includes("\t");
    const sawSpace = text.includes(" ");
    if (sawTab) {
        return sawSpace ? "mixed" : "tab";
    }
    if (!sawSpace) {
        return "none";
    }
    // Two adjacent spaces anywhere make the whole input "double".
    return text.includes("  ") ? "double" : "single";
}
40
/**
 * Compute a `QueryShape` from a string or normalized input.
 *
 * For an object input the text is read from `input.normalized`. The locale handed to `segment`
 * is `opts.locale` when set, otherwise `input.appliedLocale` for object inputs, otherwise
 * `undefined`.
 *
 * The returned object and each of its top-level arrays are passed through `Object.freeze`; the
 * freeze is shallow, so the array elements themselves are not frozen here. `totalLength` is
 * `text.length`, i.e. a count of UTF-16 code units.
 *
 * @example
 * const shape = computeQueryShape("350 5th Ave, New York, NY 10118");
 * // shape.knownFormats.find((f) => f.format === "us_zip") is defined
 * // shape.segments.length === 4
 *
 * @param input Raw query string, or an object carrying `normalized` and `appliedLocale`.
 * @param opts Optional settings; only `locale` is read.
 * @returns The frozen shape record.
 */
export function computeQueryShape(input, opts) {
    const text = typeof input === "string" ? input : input.normalized;
    let locale = opts?.locale;
    if (locale === null || locale === undefined) {
        // Fall back to the locale recorded on a normalized input, if there is one.
        locale = typeof input === "string" ? undefined : input.appliedLocale;
    }
    const tokenClasses = [];
    for (const span of tokenizeForClass(text)) {
        tokenClasses.push({
            span,
            class: classifyToken(span.body),
            length: span.end - span.start,
        });
    }
    const segments = segment(text, locale);
    const knownFormats = detectKnownFormats(text, tokenClasses);
    const regionAbbreviations = detectRegionAbbreviations(tokenClasses, segments);
    const shape = {
        characterClass: foldInputClass(tokenClasses),
        tokenClasses: Object.freeze(tokenClasses),
        segments: Object.freeze(segments),
        knownFormats: Object.freeze(knownFormats),
        regionAbbreviations: Object.freeze(regionAbbreviations),
        totalLength: text.length,
        whitespacePattern: detectWhitespacePattern(text),
    };
    return Object.freeze(shape);
}
70
+ //# sourceMappingURL=compute.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compute.js","sourceRoot":"","sources":["../compute.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAA;AACtF,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAA;AACvD,OAAO,EAAE,yBAAyB,EAAE,MAAM,2BAA2B,CAAA;AACrE,OAAO,EAAE,OAAO,EAAE,MAAM,mBAAmB,CAAA;AAG3C,SAAS,uBAAuB,CAAC,IAAY;IAC5C,IAAI,MAAM,GAAG,KAAK,CAAA;IAClB,IAAI,SAAS,GAAG,KAAK,CAAA;IACrB,IAAI,SAAS,GAAG,KAAK,CAAA;IACrB,IAAI,SAAS,GAAG,KAAK,CAAA;IACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;QAClB,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YACjB,MAAM,GAAG,IAAI,CAAA;QACd,CAAC;aAAM,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;YACvB,IAAI,SAAS;gBAAE,SAAS,GAAG,IAAI,CAAA;;gBAC1B,SAAS,GAAG,IAAI,CAAA;YACrB,SAAS,GAAG,IAAI,CAAA;YAChB,SAAQ;QACT,CAAC;QACD,SAAS,GAAG,KAAK,CAAA;IAClB,CAAC;IACD,IAAI,MAAM,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC;QAAE,OAAO,OAAO,CAAA;IACtD,IAAI,MAAM;QAAE,OAAO,KAAK,CAAA;IACxB,IAAI,SAAS;QAAE,OAAO,QAAQ,CAAA;IAC9B,IAAI,SAAS;QAAE,OAAO,QAAQ,CAAA;IAC9B,OAAO,MAAM,CAAA;AACd,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,iBAAiB,CAAC,KAAmC,EAAE,IAA4B;IAClG,MAAM,IAAI,GAAG,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,UAAU,CAAA;IACjE,MAAM,MAAM,GAAG,IAAI,EAAE,MAAM,IAAI,CAAC,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,aAAa,CAAC,CAAA;IAE5F,MAAM,UAAU,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAA;IACzC,MAAM,YAAY,GAAiB,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC5D,IAAI;QACJ,KAAK,EAAE,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC;QAC/B,MAAM,EAAE,IAAI,CAAC,GAAG,GAAG,IAAI,CAAC,KAAK;KAC7B,CAAC,CAAC,CAAA;IAEH,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IACtC,MAAM,YAAY,GAAG,kBAAkB,CAAC,IAAI,EAAE,YAAY,CAAC,CAAA;IAC3D,MAAM,mBAAmB,GAAG,yBAAyB,CAAC,YAAY,EAAE,QAAQ,CAAC,CAAA;IAC7E,MAAM,cAAc,GAAG,cAAc,CAAC,YAAY,CAAC,CAAA;IACnD,MAAM,iBAAiB,GAAG,uBAAuB,CAAC,IAAI,CAAC,CAAA;IAEvD,OAAO,MAAM,CAAC,MAAM,CAAC;QACpB,cAAc;QACd,YAAY,EAAE,MAAM,CAAC,MAAM,CAAC,YAAY,CAAiB;QACzD,QAAQ,EAAE,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAoB;QACpD,YA
AY,EAAE,MAAM,CAAC,MAAM,CAAC,YAAY,CAAwB;QAChE,mBAAmB,EAAE,MAAM,CAAC,MAAM,CAAC,mBAAmB,CAA+B;QACrF,WAAW,EAAE,IAAI,CAAC,MAAM;QACxB,iBAAiB;KACjB,CAAsB,CAAA;AACxB,CAAC"}
package/out/index.d.ts ADDED
@@ -0,0 +1,21 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/query-shape` — pure-function structural priors for the runtime pipeline.
7
+ *
8
+ * Computes a `QueryShape` from an input string: character class, per-token class, punctuation-
9
+ * bounded segments, known-format regex hits, and whitespace pattern. Microseconds-cheap, no ML,
10
+ * no runtime dependencies.
11
+ *
12
+ * See `docs/articles/plan/reference/QUERY_SHAPE.md` for the design rationale and
13
+ * `docs/articles/plan/reference/STAGES.md` for how this fits into the runtime pipeline.
14
+ */
15
+ export { classifyCodepoint, classifyToken, foldInputClass } from "./character-class.js";
16
+ export { computeQueryShape } from "./compute.js";
17
+ export { detectKnownFormats } from "./known-formats.js";
18
+ export { detectRegionAbbreviations } from "./region-abbreviations.js";
19
+ export { segment } from "./segmentation.js";
20
+ export type { CharacterClass, ComputeQueryShapeOpts, KnownFormat, KnownFormatHit, NormalizedInputLite, QueryShape, RegionAbbreviationHit, Segment, SegmentSeparator, SpanRange, TokenCharacterClass, TokenClass, WhitespacePattern, } from "./types.js";
21
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAA;AACvF,OAAO,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAA;AAChD,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAA;AACvD,OAAO,EAAE,yBAAyB,EAAE,MAAM,2BAA2B,CAAA;AACrE,OAAO,EAAE,OAAO,EAAE,MAAM,mBAAmB,CAAA;AAC3C,YAAY,EACX,cAAc,EACd,qBAAqB,EACrB,WAAW,EACX,cAAc,EACd,mBAAmB,EACnB,UAAU,EACV,qBAAqB,EACrB,OAAO,EACP,gBAAgB,EAChB,SAAS,EACT,mBAAmB,EACnB,UAAU,EACV,iBAAiB,GACjB,MAAM,YAAY,CAAA"}
package/out/index.js ADDED
@@ -0,0 +1,20 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/query-shape` — pure-function structural priors for the runtime pipeline.
7
+ *
8
+ * Computes a `QueryShape` from an input string: character class, per-token class, punctuation-
9
+ * bounded segments, known-format regex hits, and whitespace pattern. Microseconds-cheap, no ML,
10
+ * no runtime dependencies.
11
+ *
12
+ * See `docs/articles/plan/reference/QUERY_SHAPE.md` for the design rationale and
13
+ * `docs/articles/plan/reference/STAGES.md` for how this fits into the runtime pipeline.
14
+ */
15
+ export { classifyCodepoint, classifyToken, foldInputClass } from "./character-class.js";
16
+ export { computeQueryShape } from "./compute.js";
17
+ export { detectKnownFormats } from "./known-formats.js";
18
+ export { detectRegionAbbreviations } from "./region-abbreviations.js";
19
+ export { segment } from "./segmentation.js";
20
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,iBAAiB,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAA;AACvF,OAAO,EAAE,iBAAiB,EAAE,MAAM,cAAc,CAAA;AAChD,OAAO,EAAE,kBAAkB,EAAE,MAAM,oBAAoB,CAAA;AACvD,OAAO,EAAE,yBAAyB,EAAE,MAAM,2BAA2B,CAAA;AACrE,OAAO,EAAE,OAAO,EAAE,MAAM,mBAAmB,CAAA"}
@@ -0,0 +1,14 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ import type { KnownFormatHit, TokenClass } from "./types.js";
7
+ /**
8
+ * Detect known-format hits among the tokenized input.
9
+ *
10
+ * Strategy: for each token (or adjacent pair), try every pattern. Multiple format hits on the same
11
+ * span are allowed (US/FR/DE 5-digit ambiguity surfaces all three).
12
+ */
13
+ export declare function detectKnownFormats(text: string, tokens: ReadonlyArray<TokenClass>): KnownFormatHit[];
14
+ //# sourceMappingURL=known-formats.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"known-formats.d.ts","sourceRoot":"","sources":["../known-formats.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAe,cAAc,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAsCzE;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,aAAa,CAAC,UAAU,CAAC,GAAG,cAAc,EAAE,CAqCpG"}
@@ -0,0 +1,114 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
/**
 * Known-format table. Each entry carries the format tag, the regex tested against the token
 * text, how many adjacent tokens the regex spans (1 or 2), and the confidence reported on a hit.
 * Two-token regexes expect the two token bodies joined by a single space.
 *
 * PO Box variants are not listed here; they are matched by separate logic.
 */
const PATTERNS = (() => {
    const single = (format, pattern, confidence) => ({ format, pattern, tokenSpan: 1, confidence });
    const pair = (format, pattern, confidence) => ({ format, pattern, tokenSpan: 2, confidence });
    return [
        // Unambiguous single-token patterns first.
        single("us_zip4", /^\d{5}-\d{4}$/, 0.95),
        single("ca_postcode", /^[A-Z]\d[A-Z]\d[A-Z]\d$/i, 0.95),
        single("jp_postcode", /^\d{3}-\d{4}$/, 0.95),
        // UK postcode is 2 tokens when split on space (e.g. "SW1A 1AA"), 1 token otherwise.
        single("uk_postcode", /^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$/i, 0.9),
        pair("uk_postcode", /^[A-Z]{1,2}\d[A-Z\d]? \d[A-Z]{2}$/i, 0.9),
        pair("ca_postcode", /^[A-Z]\d[A-Z] \d[A-Z]\d$/i, 0.9),
        // Ambiguous 5-digit codes: the same span is tagged as US, FR and DE, each with reduced
        // confidence; the caller disambiguates by locale prior.
        single("us_zip", /^\d{5}$/, 0.6),
        single("fr_postcode", /^\d{5}$/, 0.6),
        single("de_postcode", /^\d{5}$/, 0.6),
    ];
})();
/** Lower-cased token bodies that may introduce a PO Box (English, French and Spanish forms). */
const PO_BOX_LEADERS = new Set([
    "po",
    "p.o.",
    "p.o",
    "box",
    "bp",
    "b.p.",
    "b.p",
    "casilla",
    "apartado",
]);
23
/**
 * Detect known-format hits among the tokenized input.
 *
 * Strategy: every token is tested against every single-token pattern, then every adjacent token
 * pair is tested against every two-token pattern, and finally at most one PO Box hit is
 * appended. Multiple format hits on the same span are allowed (US/FR/DE 5-digit ambiguity
 * surfaces all three).
 *
 * A token pair is only considered when the text between the two tokens is whitespace and
 * nothing else. Pairs split by punctuation (`"SW1A, 1AA"`) or directly adjacent tokens are not
 * matched as if they had been written with a space.
 *
 * @param text The string the token spans index into.
 * @param tokens Token records whose `span.start` / `span.end` are offsets into `text`.
 * @returns Hits in detection order: single-token hits, pair hits, then the PO Box hit if any.
 */
export function detectKnownFormats(text, tokens) {
    const hits = [];
    const whitespaceOnly = /^\s+$/;
    // Single-token patterns.
    for (const tok of tokens) {
        for (const p of PATTERNS) {
            if (p.tokenSpan !== 1)
                continue;
            if (p.pattern.test(tok.span.body)) {
                hits.push({ format: p.format, span: tok.span, confidence: p.confidence });
            }
        }
    }
    // Two-token patterns (the pair must be separated by whitespace only in the source text).
    for (let i = 0; i + 1 < tokens.length; i++) {
        const a = tokens[i];
        const b = tokens[i + 1];
        if (!a || !b)
            continue;
        if (!whitespaceOnly.test(text.slice(a.span.end, b.span.start)))
            continue;
        // Patterns are written against a canonical single-space join.
        const joined = `${a.span.body} ${b.span.body}`;
        for (const p of PATTERNS) {
            if (p.tokenSpan !== 2)
                continue;
            if (p.pattern.test(joined)) {
                hits.push({
                    format: p.format,
                    span: { start: a.span.start, end: b.span.end, body: text.slice(a.span.start, b.span.end) },
                    confidence: p.confidence,
                });
            }
        }
    }
    // PO Box detection — handled separately because the leader can be 1-3 tokens and the number
    // can be alphanumeric.
    const poHit = detectPoBox(text, tokens);
    if (poHit)
        hits.push(poHit);
    return hits;
}
67
/**
 * Find the first PO Box mention: a leader token (see `PO_BOX_LEADERS`, compared lower-cased),
 * optionally followed by further leader tokens, then a token whose class is `"digit"` or
 * `"mixed"`. At most three tokens after the first leader are inspected.
 *
 * @param text The string the token spans index into.
 * @param tokens Token records with `span` and `class`.
 * @returns A `"po_box"` hit (confidence 0.85) spanning from the first leader through the box
 *   number, or `null` when no leader is followed by a number.
 */
function detectPoBox(text, tokens) {
    const count = tokens.length;
    if (count === 0) {
        return null;
    }
    for (let leadIdx = 0; leadIdx < count; leadIdx++) {
        const leader = tokens[leadIdx];
        if (!leader || !PO_BOX_LEADERS.has(leader.span.body.toLowerCase())) {
            continue;
        }
        // Look at no more than the next three tokens for the box number.
        const lastIdx = Math.min(leadIdx + 3, count - 1);
        let numberTok = null;
        for (let k = leadIdx + 1; k <= lastIdx; k++) {
            const candidate = tokens[k];
            if (!candidate) {
                break;
            }
            if (PO_BOX_LEADERS.has(candidate.span.body.toLowerCase())) {
                // Another leader word ("PO" then "Box"): keep scanning.
                continue;
            }
            if (candidate.class === "digit" || candidate.class === "mixed") {
                numberTok = candidate;
            }
            // Either the number was found or an unrelated token ended the run.
            break;
        }
        if (numberTok === null) {
            continue;
        }
        const start = leader.span.start;
        const end = numberTok.span.end;
        return {
            format: "po_box",
            span: { start, end, body: text.slice(start, end) },
            confidence: 0.85,
        };
    }
    return null;
}
114
+ //# sourceMappingURL=known-formats.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"known-formats.js","sourceRoot":"","sources":["../known-formats.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAqBH,MAAM,QAAQ,GAAiC;IAC9C,2CAA2C;IAC3C,EAAE,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE;IAC/E,EAAE,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,0BAA0B,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE;IAC9F,EAAE,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,IAAI,EAAE;IACnF,oFAAoF;IACpF,EAAE,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,mCAAmC,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE;IACtG,EAAE,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,oCAAoC,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE;IACvG,EAAE,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,2BAA2B,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE;IAC9F,+FAA+F;IAC/F,oEAAoE;IACpE,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE;IACvE,EAAE,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE;IAC5E,EAAE,MAAM,EAAE,aAAa,EAAE,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE;IAC5E,yFAAyF;CACzF,CAAA;AAED,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,CAAC,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,UAAU,CAAC,CAAC,CAAA;AAExG;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,MAAiC;IACjF,MAAM,IAAI,GAAqB,EAAE,CAAA;IAEjC,yBAAyB;IACzB,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;QAC1B,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YAC1B,IAAI,CAAC,CAAC,SAAS,KAAK,CAAC;gBAAE,SAAQ;YAC/B,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;gBACnC,IAAI,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,EAAE,UAAU,EAAE,CAAC,CAAC,UAAU,EAAE,CAAC,CAAA;YAC1E,CAAC;QACF,CAAC;IACF,CAAC;IAED,iDAAiD;IACjD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5C,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,CAAA;QACnB,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;QACvB,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;YAAE,SAAQ;QACtB,MAAM,MAAM,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,CAAC,IAAI,CAAC,
IAAI,EAAE,CAAA;QAC9C,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YAC1B,IAAI,CAAC,CAAC,SAAS,KAAK,CAAC;gBAAE,SAAQ;YAC/B,IAAI,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC5B,IAAI,CAAC,IAAI,CAAC;oBACT,MAAM,EAAE,CAAC,CAAC,MAAM;oBAChB,IAAI,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE;oBAC1F,UAAU,EAAE,CAAC,CAAC,UAAU;iBACxB,CAAC,CAAA;YACH,CAAC;QACF,CAAC;IACF,CAAC;IAED,4FAA4F;IAC5F,uBAAuB;IACvB,MAAM,KAAK,GAAG,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IACvC,IAAI,KAAK;QAAE,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IAE3B,OAAO,IAAI,CAAA;AACZ,CAAC;AAED,SAAS,WAAW,CAAC,IAAY,EAAE,MAAiC;IACnE,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAA;IACpC,+DAA+D;IAC/D,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC,CAAA;QACzB,IAAI,CAAC,OAAO;YAAE,SAAQ;QACtB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAA;QAC5C,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC;YAAE,SAAQ;QAEvC,0DAA0D;QAC1D,IAAI,IAAI,GAAG,CAAC,CAAA;QACZ,IAAI,WAAW,GAAG,KAAK,CAAA;QACvB,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAClE,MAAM,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,CAAA;YACpB,IAAI,CAAC,EAAE;gBAAE,MAAK;YACd,MAAM,MAAM,GAAG,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAA;YACzC,IAAI,cAAc,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;gBAChC,IAAI,GAAG,CAAC,CAAA;gBACR,SAAQ;YACT,CAAC;YACD,kDAAkD;YAClD,IAAI,EAAE,CAAC,KAAK,KAAK,OAAO,IAAI,EAAE,CAAC,KAAK,KAAK,OAAO,EAAE,CAAC;gBAClD,IAAI,GAAG,CAAC,CAAA;gBACR,WAAW,GAAG,IAAI,CAAA;gBAClB,MAAK;YACN,CAAC;YACD,MAAK;QACN,CAAC;QACD,IAAI,WAAW,EAAE,CAAC;YACjB,MAAM,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAC,CAAA;YAC1B,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,CAAA;YAC3B,IAAI,CAAC,QAAQ,IAAI,CAAC,MAAM;gBAAE,OAAO,IAAI,CAAA;YACrC,MAAM,KAAK,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAA;YACjC,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAA;YAC3B,OAAO;gBACN,M
AAM,EAAE,QAAQ;gBAChB,IAAI,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE;gBAClD,UAAU,EAAE,IAAI;aAChB,CAAA;QACF,CAAC;IACF,CAAC;IACD,OAAO,IAAI,CAAA;AACZ,CAAC"}
@@ -0,0 +1,17 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Detect unambiguous region abbreviations (e.g., "DC", "NY", "CA") for the locality soft prior.
7
+ * Fires only inside a segment that follows a comma — the canonical "City, ST ZIP" pattern. The detector itself performs no locale check.
8
+ */
9
+ import type { RegionAbbreviationHit, Segment, TokenClass } from "./types.js";
10
+ /**
11
+ * Find region abbreviation hits. A hit is an `"alpha"`-class token whose text is exactly two uppercase ASCII letters and
12
+ * whose span lies inside a segment whose `separator` is `"comma"` — the canonical "City, ST" or "City, ST ZIP" tail pattern.
13
+ *
14
+ * Returns an empty array when fewer than two segments are supplied or when no such token exists. No locale check is performed here.
15
+ */
16
+ export declare function detectRegionAbbreviations(tokens: ReadonlyArray<TokenClass>, segments: ReadonlyArray<Segment>): RegionAbbreviationHit[];
17
+ //# sourceMappingURL=region-abbreviations.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"region-abbreviations.d.ts","sourceRoot":"","sources":["../region-abbreviations.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,qBAAqB,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,YAAY,CAAA;AAI5E;;;;;GAKG;AACH,wBAAgB,yBAAyB,CACxC,MAAM,EAAE,aAAa,CAAC,UAAU,CAAC,EACjC,QAAQ,EAAE,aAAa,CAAC,OAAO,CAAC,GAC9B,qBAAqB,EAAE,CAkBzB"}
@@ -0,0 +1,35 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Detect unambiguous region abbreviations (e.g., "DC", "NY", "CA") for the locality soft prior.
7
+ * Fires only inside a segment that follows a comma — the canonical "City, ST ZIP" pattern. The detector itself performs no locale check.
8
+ */
9
+ const REGION_ABBREV_RE = /^[A-Z]{2}$/;
10
+ /**
11
+ * Find region abbreviation hits. A hit is a 2-letter all-uppercase token that appears after a
12
+ * comma-separated segment boundary — the canonical "City, ST" or "City, ST ZIP" tail pattern.
13
+ *
14
+ * Returns empty array for non-Western locales or inputs without comma segmentation.
15
+ */
16
+ export function detectRegionAbbreviations(tokens, segments) {
17
+ if (segments.length < 2)
18
+ return [];
19
+ const hits = [];
20
+ for (const seg of segments) {
21
+ if (seg.separator !== "comma")
22
+ continue;
23
+ for (const tok of tokens) {
24
+ if (tok.span.start < seg.span.start || tok.span.end > seg.span.end)
25
+ continue;
26
+ if (tok.class !== "alpha")
27
+ continue;
28
+ if (!REGION_ABBREV_RE.test(tok.span.body))
29
+ continue;
30
+ hits.push({ start: tok.span.start, span: tok.span.body });
31
+ }
32
+ }
33
+ return hits;
34
+ }
35
+ //# sourceMappingURL=region-abbreviations.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"region-abbreviations.js","sourceRoot":"","sources":["../region-abbreviations.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,MAAM,gBAAgB,GAAG,YAAY,CAAA;AAErC;;;;;GAKG;AACH,MAAM,UAAU,yBAAyB,CACxC,MAAiC,EACjC,QAAgC;IAEhC,IAAI,QAAQ,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,EAAE,CAAA;IAElC,MAAM,IAAI,GAA4B,EAAE,CAAA;IAExC,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC5B,IAAI,GAAG,CAAC,SAAS,KAAK,OAAO;YAAE,SAAQ;QAEvC,KAAK,MAAM,GAAG,IAAI,MAAM,EAAE,CAAC;YAC1B,IAAI,GAAG,CAAC,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,IAAI,GAAG,CAAC,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,GAAG;gBAAE,SAAQ;YAC5E,IAAI,GAAG,CAAC,KAAK,KAAK,OAAO;gBAAE,SAAQ;YACnC,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC;gBAAE,SAAQ;YAEnD,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAA;QAC1D,CAAC;IACF,CAAC;IAED,OAAO,IAAI,CAAA;AACZ,CAAC"}
@@ -0,0 +1,12 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ import type { Segment } from "./types.js";
7
+ /**
8
+ * Comma / semicolon (reported as `"comma"`) / newline / tab separate segments. Each segment is trimmed of surrounding whitespace and empty segments are dropped.
9
+ * `_locale` is currently unused: locale-aware grammar is reserved for future (JP whitespace, KR honorifics), so the default rules always apply.
10
+ */
11
+ export declare function segment(text: string, _locale?: string): Segment[];
12
+ //# sourceMappingURL=segmentation.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"segmentation.d.ts","sourceRoot":"","sources":["../segmentation.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,KAAK,EAAE,OAAO,EAAoB,MAAM,YAAY,CAAA;AAE3D;;;GAGG;AACH,wBAAgB,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,GAAG,OAAO,EAAE,CA8CjE"}
@@ -0,0 +1,57 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ /**
7
+ * Comma / newline / tab separate segments. Locale-aware grammar reserved for future (JP whitespace,
8
+ * KR honorifics). Default rules apply when no locale-specific override exists.
9
+ */
10
+ export function segment(text, _locale) {
11
+ const segments = [];
12
+ if (text.length === 0)
13
+ return segments;
14
+ let start = 0;
15
+ let lastSeparator = null;
16
+ let index = 0;
17
+ const flush = (end, separator) => {
18
+ // Trim leading + trailing whitespace from each segment but record the original span.
19
+ const raw = text.slice(start, end);
20
+ const leftPad = raw.match(/^\s*/)[0].length;
21
+ const rightPad = raw.match(/\s*$/)[0].length;
22
+ const innerStart = start + leftPad;
23
+ const innerEnd = end - rightPad;
24
+ if (innerEnd > innerStart) {
25
+ segments.push({
26
+ span: { start: innerStart, end: innerEnd, body: text.slice(innerStart, innerEnd) },
27
+ body: text.slice(innerStart, innerEnd),
28
+ index,
29
+ separator: lastSeparator,
30
+ });
31
+ index += 1;
32
+ }
33
+ lastSeparator = separator;
34
+ };
35
+ for (let i = 0; i < text.length; i++) {
36
+ const ch = text[i];
37
+ if (ch === ",") {
38
+ flush(i, "comma");
39
+ start = i + 1;
40
+ }
41
+ else if (ch === "\n") {
42
+ flush(i, "newline");
43
+ start = i + 1;
44
+ }
45
+ else if (ch === "\t") {
46
+ flush(i, "tab");
47
+ start = i + 1;
48
+ }
49
+ else if (ch === ";") {
50
+ flush(i, "comma"); // semicolon treated as comma-equivalent
51
+ start = i + 1;
52
+ }
53
+ }
54
+ flush(text.length, null);
55
+ return segments;
56
+ }
57
+ //# sourceMappingURL=segmentation.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"segmentation.js","sourceRoot":"","sources":["../segmentation.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAIH;;;GAGG;AACH,MAAM,UAAU,OAAO,CAAC,IAAY,EAAE,OAAgB;IACrD,MAAM,QAAQ,GAAc,EAAE,CAAA;IAC9B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,QAAQ,CAAA;IAEtC,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,aAAa,GAAqB,IAAI,CAAA;IAC1C,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,MAAM,KAAK,GAAG,CAAC,GAAW,EAAE,SAA2B,EAAE,EAAE;QAC1D,qFAAqF;QACrF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAA;QAClC,MAAM,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAA;QAC5C,MAAM,QAAQ,GAAG,GAAG,CAAC,KAAK,CAAC,MAAM,CAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAA;QAC7C,MAAM,UAAU,GAAG,KAAK,GAAG,OAAO,CAAA;QAClC,MAAM,QAAQ,GAAG,GAAG,GAAG,QAAQ,CAAA;QAC/B,IAAI,QAAQ,GAAG,UAAU,EAAE,CAAC;YAC3B,QAAQ,CAAC,IAAI,CAAC;gBACb,IAAI,EAAE,EAAE,KAAK,EAAE,UAAU,EAAE,GAAG,EAAE,QAAQ,EAAE,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE;gBAClF,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,UAAU,EAAE,QAAQ,CAAC;gBACtC,KAAK;gBACL,SAAS,EAAE,aAAa;aACxB,CAAC,CAAA;YACF,KAAK,IAAI,CAAC,CAAA;QACX,CAAC;QACD,aAAa,GAAG,SAAS,CAAA;IAC1B,CAAC,CAAA;IAED,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;QAClB,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;YAChB,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA;YACjB,KAAK,GAAG,CAAC,GAAG,CAAC,CAAA;QACd,CAAC;aAAM,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YACxB,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,CAAA;YACnB,KAAK,GAAG,CAAC,GAAG,CAAC,CAAA;QACd,CAAC;aAAM,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YACxB,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAA;YACf,KAAK,GAAG,CAAC,GAAG,CAAC,CAAA;QACd,CAAC;aAAM,IAAI,EAAE,KAAK,GAAG,EAAE,CAAC;YACvB,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAA,CAAC,wCAAwC;YAC1D,KAAK,GAAG,CAAC,GAAG,CAAC,CAAA;QACd,CAAC;IACF,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,CAAA;IAExB,OAAO,QAAQ,CAAA;AAChB,CAAC"}
package/out/types.d.ts ADDED
@@ -0,0 +1,87 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ /**
7
+ * Minimal character-range descriptor used internally. Compatible with `@mailwoman/core`'s `Span`
8
+ * class by shape — consumers holding a `Span` can pass it where `SpanRange` is expected.
9
+ */
10
+ export interface SpanRange {
11
+ start: number; // Offset of the first covered character (inclusive), as built by `segment()`.
12
+ end: number; // Offset one past the last covered character (exclusive), as built by `segment()`.
13
+ body: string; // The covered text; `segment()` sets it to `text.slice(start, end)`.
14
+ }
15
+ /** Per-token character classification. */
16
+ export type TokenCharacterClass = "digit" | "alpha" | "mixed" | "punct" | "cjk" | "cyrillic" | "arabic";
17
+ /** Whole-input character class — folded from `TokenCharacterClass`. */
18
+ export type CharacterClass = "numeric" | "alpha" | "alphanumeric" | "cjk" | "cyrillic" | "arabic" | "mixed";
19
+ /** Known-format identifier. The set is intentionally small + universal. */
20
+ export type KnownFormat = "us_zip" | "us_zip4" | "uk_postcode" | "fr_postcode" | "ca_postcode" | "de_postcode" | "jp_postcode" | "po_box";
21
+ /** Punctuation grammar separator between consecutive segments. `segment()` emits only `"comma"` (for both `,` and `;`), `"newline"`, `"tab"` or `null`; the remaining members are not produced by it. */
22
+ export type SegmentSeparator = "comma" | "newline" | "tab" | "whitespace" | "japanese-style" | null;
23
+ /** Whitespace pattern of the whole input. */
24
+ export type WhitespacePattern = "single" | "double" | "tab" | "mixed" | "none";
25
+ export interface TokenClass { // One token of the input together with its character classification.
26
+ span: SpanRange; // Location and text of the token; `detectRegionAbbreviations` compares it against segment spans.
27
+ class: TokenCharacterClass; // Per-token class; `detectRegionAbbreviations` only considers `"alpha"` tokens.
28
+ length: number; // NOTE(review): unit (UTF-16 code units vs. codepoints) is not visible here — confirm.
29
+ }
30
+ export interface Segment { // One trimmed, non-empty piece of the input as produced by `segment()`.
31
+ span: SpanRange; // Offsets of the trimmed segment within the original text.
32
+ body: string; // Trimmed segment text; `segment()` sets it to the same string as `span.body`.
33
+ /** Position in the segment list, 0-indexed. Empty segments are dropped before numbering. */
34
+ index: number;
35
+ /** The most recent delimiter seen before this segment, or `null` when no delimiter precedes it. Input that starts with a delimiter therefore gives the first segment a non-null value. */
36
+ separator: SegmentSeparator;
37
+ }
38
+ export interface KnownFormatHit {
39
+ format: KnownFormat;
40
+ span: SpanRange;
41
+ /** 0..1. Ambiguous patterns (`fr_postcode`/`de_postcode` overlap with `us_zip`) score lower. */
42
+ confidence: number;
43
+ }
44
+ /**
45
+ * A detected region abbreviation (e.g., "DC", "NY", "CA"). Used by the locality soft prior to bias
46
+ * preceding place-name tokens toward `B-locality`.
47
+ */
48
+ export interface RegionAbbreviationHit {
49
+ /** Character offset into the normalized input. */
50
+ start: number;
51
+ /** The abbreviation text (e.g., "DC", "NY"). */
52
+ span: string;
53
+ }
54
+ /**
55
+ * Structural snapshot of an input string, computed once at the boundary between Stage 1 and Stage 2
56
+ * of the runtime pipeline. Microseconds-cheap. Consumed by stages 2, 2.5, 3 (optional), and 6 as
57
+ * additional context.
58
+ *
59
+ * Bitter-lesson-safe: recognizes universal structural patterns (character class, punctuation,
60
+ * postcode shape) rather than place-specific knowledge.
61
+ */
62
+ export interface QueryShape {
63
+ characterClass: CharacterClass;
64
+ tokenClasses: TokenClass[];
65
+ segments: Segment[];
66
+ knownFormats: KnownFormatHit[];
67
+ /**
68
+ * Region abbreviation hits detected in the input. The locality soft prior uses these to bias
69
+ * preceding place-name tokens toward `B-locality` / `I-locality` during Viterbi decoding.
70
+ */
71
+ regionAbbreviations: RegionAbbreviationHit[];
72
+ totalLength: number;
73
+ whitespacePattern: WhitespacePattern;
74
+ }
75
+ /**
76
+ * Minimal shape that satisfies `computeQueryShape`'s input contract. The full `NormalizedInput`
77
+ * from `@mailwoman/normalize` is structurally compatible — no import required.
78
+ */
79
+ export interface NormalizedInputLite {
80
+ normalized: string;
81
+ appliedLocale?: string;
82
+ }
83
+ export interface ComputeQueryShapeOpts {
84
+ /** Locale hint for segmentation grammar (default: comma-based Western). */
85
+ locale?: string;
86
+ }
87
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;IACX,IAAI,EAAE,MAAM,CAAA;CACZ;AAED,0CAA0C;AAC1C,MAAM,MAAM,mBAAmB,GAAG,OAAO,GAAG,OAAO,GAAG,OAAO,GAAG,OAAO,GAAG,KAAK,GAAG,UAAU,GAAG,QAAQ,CAAA;AAEvG,uEAAuE;AACvE,MAAM,MAAM,cAAc,GAAG,SAAS,GAAG,OAAO,GAAG,cAAc,GAAG,KAAK,GAAG,UAAU,GAAG,QAAQ,GAAG,OAAO,CAAA;AAE3G,2EAA2E;AAC3E,MAAM,MAAM,WAAW,GACpB,QAAQ,GACR,SAAS,GACT,aAAa,GACb,aAAa,GACb,aAAa,GACb,aAAa,GACb,aAAa,GACb,QAAQ,CAAA;AAEX,kEAAkE;AAClE,MAAM,MAAM,gBAAgB,GAAG,OAAO,GAAG,SAAS,GAAG,KAAK,GAAG,YAAY,GAAG,gBAAgB,GAAG,IAAI,CAAA;AAEnG,6CAA6C;AAC7C,MAAM,MAAM,iBAAiB,GAAG,QAAQ,GAAG,QAAQ,GAAG,KAAK,GAAG,OAAO,GAAG,MAAM,CAAA;AAE9E,MAAM,WAAW,UAAU;IAC1B,IAAI,EAAE,SAAS,CAAA;IACf,KAAK,EAAE,mBAAmB,CAAA;IAC1B,MAAM,EAAE,MAAM,CAAA;CACd;AAED,MAAM,WAAW,OAAO;IACvB,IAAI,EAAE,SAAS,CAAA;IACf,IAAI,EAAE,MAAM,CAAA;IACZ,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAA;IACb,iFAAiF;IACjF,SAAS,EAAE,gBAAgB,CAAA;CAC3B;AAED,MAAM,WAAW,cAAc;IAC9B,MAAM,EAAE,WAAW,CAAA;IACnB,IAAI,EAAE,SAAS,CAAA;IACf,gGAAgG;IAChG,UAAU,EAAE,MAAM,CAAA;CAClB;AAED;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACrC,kDAAkD;IAClD,KAAK,EAAE,MAAM,CAAA;IACb,gDAAgD;IAChD,IAAI,EAAE,MAAM,CAAA;CACZ;AAED;;;;;;;GAOG;AACH,MAAM,WAAW,UAAU;IAC1B,cAAc,EAAE,cAAc,CAAA;IAC9B,YAAY,EAAE,UAAU,EAAE,CAAA;IAC1B,QAAQ,EAAE,OAAO,EAAE,CAAA;IACnB,YAAY,EAAE,cAAc,EAAE,CAAA;IAC9B;;;OAGG;IACH,mBAAmB,EAAE,qBAAqB,EAAE,CAAA;IAC5C,WAAW,EAAE,MAAM,CAAA;IACnB,iBAAiB,EAAE,iBAAiB,CAAA;CACpC;AAED;;;GAGG;AACH,MAAM,WAAW,mBAAmB;IACnC,UAAU,EAAE,MAAM,CAAA;IAClB,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,qBAAqB;IACrC,2EAA2E;IAC3E,MAAM,CAAC,EAAE,MAAM,CAAA;CACf"}
package/out/types.js ADDED
@@ -0,0 +1,7 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ export {};
7
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../types.ts"],"names":[],"mappings":"AAAA;;;;GAIG"}
package/package.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "name": "@mailwoman/query-shape",
3
+ "version": "4.0.0",
4
+ "description": "Cheap structural priors for the runtime pipeline: character class, segmentation, known-format detection. Pure functions, no ML.",
5
+ "license": "AGPL-3.0-only",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/sister-software/mailwoman.git",
9
+ "directory": "query-shape"
10
+ },
11
+ "type": "module",
12
+ "exports": {
13
+ "./package.json": "./package.json",
14
+ ".": "./out/index.js"
15
+ },
16
+ "files": [
17
+ "out/**/*.js",
18
+ "out/**/*.js.map",
19
+ "out/**/*.d.ts",
20
+ "out/**/*.d.ts.map"
21
+ ],
22
+ "publishConfig": {
23
+ "access": "public"
24
+ }
25
+ }