@mailwoman/normalize 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Abbreviation expansion — a small bounded dictionary per locale. Initial dict covers en-US street
7
+ * suffixes + directional prefixes. fr-FR + others added as needed.
8
+ *
9
+ * This is the INVERSE of the corpus synthesis pass (which produces `Ave` from `Avenue` for
10
+ * augmentation). Both sides should eventually share dictionaries; for v1 this dict is duplicated
11
+ * intentionally — refactoring sharing is a separate task.
12
+ */
13
+ import type { SpanRange } from "./types.js";
14
/** Outcome of one abbreviation-expansion pass over a string. */
export interface AbbreviationResult {
    /** The input with every recognised abbreviation replaced by its long form. */
    text: string;
    /** One entry per UTF-16 unit of `text`, each an index into the input string. */
    map: number[];
    /** The rewrites that were applied, in the order they were encountered. */
    expansions: {
        /** The matched source token, including a trailing `.` when one followed it. */
        from: string;
        /** The long form taken from the dictionary. */
        to: string;
        /** Where the matched token sits in the input. */
        at: SpanRange;
    }[];
}
23
/**
 * Replace known street-suffix and directional abbreviations with their long forms.
 *
 * The input is scanned left to right. A token is a maximal run of letters, digits, `'`, `_` or
 * `-`; every other character is copied through unchanged. A token that matches a dictionary entry
 * is replaced by that entry's long form, and a `.` directly after a matched token is dropped.
 *
 * Case rules: matching is case-insensitive and the output always uses the dictionary's canonical
 * casing (`St` → `Street`, `st` → `Street`, `ST` → `Street`).
 *
 * Offsets: the k-th character of an expansion maps to the k-th character of the source token,
 * clamped to the token's last character.
 *
 * @param input - Text to scan.
 * @param locale - Locale tag selecting the dictionary. French tags use the fr-FR dictionary;
 *   anything else, including an omitted locale, uses en-US.
 * @returns The rewritten text, its offset map, and the list of expansions applied.
 */
export declare function expandAbbreviations(input: string, locale?: string): AbbreviationResult;
32
+ //# sourceMappingURL=abbreviations.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"abbreviations.d.ts","sourceRoot":"","sources":["../abbreviations.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAgD3C,MAAM,WAAW,kBAAkB;IAClC,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,UAAU,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,SAAS,CAAA;KAAE,CAAC,CAAA;CAC9D;AAED;;;;;;;GAOG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,kBAAkB,CAiDtF"}
@@ -0,0 +1,110 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Abbreviation expansion — a small bounded dictionary per locale. Initial dict covers en-US street
7
+ * suffixes + directional prefixes. fr-FR + others added as needed.
8
+ *
9
+ * This is the INVERSE of the corpus synthesis pass (which produces `Ave` from `Avenue` for
10
+ * augmentation). Both sides should eventually share dictionaries; for v1 this dict is duplicated
11
+ * intentionally — refactoring sharing is a separate task.
12
+ */
13
/** en-US abbreviations: compass directionals followed by street-type suffixes. */
const EN_US_DICT = [
    // Directional prefixes / suffixes
    { from: "N", to: "North" },
    { from: "S", to: "South" },
    { from: "E", to: "East" },
    { from: "W", to: "West" },
    { from: "NE", to: "Northeast" },
    { from: "NW", to: "Northwest" },
    { from: "SE", to: "Southeast" },
    { from: "SW", to: "Southwest" },
    // Street suffixes
    { from: "St", to: "Street" },
    { from: "Ave", to: "Avenue" },
    { from: "Blvd", to: "Boulevard" },
    { from: "Rd", to: "Road" },
    { from: "Dr", to: "Drive" },
    { from: "Ct", to: "Court" },
    { from: "Ln", to: "Lane" },
    { from: "Pl", to: "Place" },
    { from: "Pkwy", to: "Parkway" },
    { from: "Hwy", to: "Highway" },
    { from: "Sq", to: "Square" },
    { from: "Ter", to: "Terrace" },
];
/** fr-FR street-type abbreviations. */
const FR_FR_DICT = [
    { from: "R", to: "Rue" },
    { from: "Bd", to: "Boulevard" },
    { from: "Av", to: "Avenue" },
    { from: "Bvd", to: "Boulevard" },
    { from: "Pl", to: "Place" },
    { from: "Imp", to: "Impasse" },
    { from: "Sq", to: "Square" },
];
/**
 * Characters that belong to a token: letters, digits, apostrophe, underscore and hyphen.
 * Unicode-letter-aware so "République" stays one token instead of fragmenting on 'é'.
 * Deliberately not global/sticky, so `.test()` carries no state between calls.
 */
const TOKEN_CHAR = /[\p{L}\p{N}'_-]/u;
/** Index a dictionary by lower-cased abbreviation so matching is case-insensitive. */
function buildLookup(dict) {
    const lookup = new Map();
    for (const entry of dict)
        lookup.set(entry.from.toLowerCase(), entry.to);
    return lookup;
}
// Built once at module load instead of on every call.
const EN_US_LOOKUP = buildLookup(EN_US_DICT);
const FR_FR_LOOKUP = buildLookup(FR_FR_DICT);
/**
 * Pick the dictionary for a locale tag. Tags starting with "fr" (any case) select the French
 * dictionary; everything else, including a missing locale, falls back to en-US.
 */
function getDictionary(locale) {
    const lc = (locale ?? "en-US").toLowerCase();
    if (lc.startsWith("fr"))
        return FR_FR_DICT;
    return EN_US_DICT;
}
/** Precomputed lookup table matching the dictionary chosen by {@link getDictionary}. */
function getLookup(locale) {
    return getDictionary(locale) === FR_FR_DICT ? FR_FR_LOOKUP : EN_US_LOOKUP;
}
/**
 * Expand known abbreviations.
 *
 * Scans the input left to right. A token is a maximal run of letters, digits, `'`, `_` or `-`;
 * every other character is copied through with an identity offset. A token found in the locale's
 * dictionary is replaced by its canonical long form, and a `.` immediately after it is dropped.
 *
 * Case rules: match case-insensitively. Output form preserves the dictionary's canonical casing
 * (`St` → `Street`, `st` → `Street`, `ST` → `Street`).
 *
 * @param input Text to scan.
 * @param locale Optional locale tag; see {@link getDictionary} for how it is resolved.
 * @returns `text` (the rewritten string), `map` (one input index per output character) and
 *   `expansions` (each rewrite with its source span).
 */
export function expandAbbreviations(input, locale) {
    const lookup = getLookup(locale);
    const out = [];
    const map = [];
    const expansions = [];
    let i = 0;
    while (i < input.length) {
        // Non-token characters (whitespace, punctuation) pass straight through.
        if (!TOKEN_CHAR.test(input[i])) {
            out.push(input[i]);
            map.push(i);
            i += 1;
            continue;
        }
        const start = i;
        while (i < input.length && TOKEN_CHAR.test(input[i]))
            i += 1;
        const token = input.slice(start, i);
        // A token can never contain "." (it is not a token character), so no stripping is needed.
        const expansion = lookup.get(token.toLowerCase());
        if (expansion === undefined) {
            for (let k = 0; k < token.length; k++) {
                out.push(token[k]);
                map.push(start + k);
            }
            continue;
        }
        // `input[i]` is undefined at end of input, so this also covers the bounds check.
        const hasTrailingDot = input[i] === ".";
        // The k-th expansion character points at the k-th token character; once the token runs
        // out, the remaining characters all point at the token's last character.
        for (let k = 0; k < expansion.length; k++) {
            out.push(expansion[k]);
            map.push(start + Math.min(k, token.length - 1));
        }
        expansions.push({
            // `from` records the dot when present; the span in `at` covers the token only.
            from: hasTrailingDot ? `${token}.` : token,
            to: expansion,
            at: { start, end: i, body: token },
        });
        // Drop the period that belonged to the abbreviation (e.g. "St." → "Street").
        if (hasTrailingDot)
            i += 1;
    }
    return { text: out.join(""), map, expansions };
}
110
+ //# sourceMappingURL=abbreviations.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"abbreviations.js","sourceRoot":"","sources":["../abbreviations.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AASH,MAAM,UAAU,GAAqC;IACpD,kCAAkC;IAClC,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,OAAO,EAAE;IAC1B,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,OAAO,EAAE;IAC1B,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,EAAE;IACzB,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,EAAE;IACzB,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,kBAAkB;IAClB,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC5B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC7B,EAAE,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,WAAW,EAAE;IACjC,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE;IAC1B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE;IAC1B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,SAAS,EAAE;IAC/B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE;IAC9B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC5B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE;CAC9B,CAAA;AAED,MAAM,UAAU,GAAqC;IACpD,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE;IACxB,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC5B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,WAAW,EAAE;IAChC,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE;IAC9B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;CAC5B,CAAA;AAED,SAAS,aAAa,CAAC,MAA0B;IAChD,MAAM,EAAE,GAAG,CAAC,MAAM,IAAI,OAAO,CAAC,CAAC,WAAW,EAAE,CAAA;IAC5C,IAAI,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAC1C,OAAO,UAAU,CAAA;AAClB,CAAC;AAQD;;;;;;;GAOG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAa,EAAE,MAAe;IACjE,MAAM,IAAI,GAAG,aAAa,CAAC,MAAM,CAAC,CAAA;IAClC,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAA;IACxC,KAAK,MAAM,KAAK,IAAI,IAAI;QAAE,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IA
AI,CAAC,WAAW,EAAE,EAAE,KAAK,CAAC,EAAE,CAAC,CAAA;IAExE,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,UAAU,GAAuD,EAAE,CAAA;IAEzE,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACzB,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAE,CAAA;QACpB,kFAAkF;QAClF,8DAA8D;QAC9D,MAAM,WAAW,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC7D,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,EAAE,CAAC;YACtB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,CAAC,IAAI,CAAC,CAAA;YACN,SAAQ;QACT,CAAC;QACD,MAAM,KAAK,GAAG,CAAC,CAAA;QACf,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,WAAW,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC;YAAE,CAAC,IAAI,CAAC,CAAA;QACzD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAA;QACnC,MAAM,oBAAoB,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,KAAK,CAAA;QACvF,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;QACxD,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;QACvC,IAAI,CAAC,SAAS,EAAE,CAAC;YAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;gBACnB,GAAG,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAA;YACpB,CAAC;YACD,SAAQ;QACT,CAAC;QACD,gEAAgE;QAChE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,CAAA;YACvB,GAAG,CAAC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAA;QAChD,CAAC;QACD,UAAU,CAAC,IAAI,CAAC;YACf,IAAI,EAAE,oBAAoB;YAC1B,EAAE,EAAE,SAAS;YACb,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE;SAClC,CAAC,CAAA;QACF,4FAA4F;QAC5F,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAC,KAAK,GAAG;YAAE,CAAC,IAAI,CAAC,CAAA;IACjD,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,UAAU,EAAE,CAAA;AAC/C,CAAC"}
package/out/cjk.d.ts ADDED
@@ -0,0 +1,38 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * CJK input normalization (Direction E, #291) — a conservative, character-level pass that runs as
7
+ * part of `normalize()` so the parser sees a stable form of CJK addresses. It does only the
8
+ * transformations that are UNAMBIGUOUS in any context:
9
+ *
10
+ * - **Strip the postal mark 〒 (U+3012).** The JP cheap-probe found 〒 is byte-fallback OOV for the
11
+ * SentencePiece tokenizer — it fragments into raw UTF-8 byte pieces and poisons the parse of
12
+ * the digits right after it (the postcode gets mislabeled as a house number). It's a
13
+ * "postcode follows" marker with no addressing content of its own, so dropping it is safe and
14
+ * fixes the bug.
15
+ * - **Fold full-width ASCII (U+FF01–U+FF5E → U+0021–U+007E).** A full-width `1` is always the digit
16
+ * 1, a full-width `-` always a hyphen — keyboards and copy-paste produce these constantly.
17
+ * Folding them to ASCII makes `１０４－００６１` (full-width digits and hyphen) and `104-0061` the same input.
18
+ * - **Fold the ideographic space (U+3000 → ' ').**
19
+ *
20
+ * It deliberately does NOT convert **kanji numerals** (一二三…): place names carry numeral kanji as
21
+ * ordinary characters (三田 _Mita_, 四谷 _Yotsuya_), so a blind 三→3 would corrupt them.
22
+ * Disambiguating "this 三 is a block number, that one is part of a name" is parsing, not
23
+ * normalization — deferred. Kana→kanji transliteration (ちょうめ→丁目) is dictionary work and likewise
24
+ * deferred.
25
+ *
26
+ * Self-gating: a string with none of these characters returns identity, so Latin input is
27
+ * untouched.
28
+ */
29
/** What `applyCjkNormalization` hands back. */
export interface CjkResult {
    /** The normalized text; the input string itself when nothing was folded or stripped. */
    text: string;
    /** For each UTF-16 unit of `text`, the index of the input unit it came from. */
    map: number[];
    /** How many units were rewritten in place (full-width form → ASCII, U+3000 → ' '). */
    folded: number;
    /** How many units were removed (occurrences of the postal mark 〒). */
    stripped: number;
}
37
/**
 * Apply the unambiguous CJK folds: drop the postal mark 〒 (U+3012), fold full-width ASCII
 * (U+FF01–U+FF5E) to ASCII, and turn the ideographic space (U+3000) into a plain space.
 *
 * @param input - Text to normalize.
 * @returns The folded text, its offset map, and counts of folded and stripped characters.
 */
export declare function applyCjkNormalization(input: string): CjkResult;
38
+ //# sourceMappingURL=cjk.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cjk.d.ts","sourceRoot":"","sources":["../cjk.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAIH,MAAM,WAAW,SAAS;IACzB,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,yFAAyF;IACzF,MAAM,EAAE,MAAM,CAAA;IACd,qDAAqD;IACrD,QAAQ,EAAE,MAAM,CAAA;CAChB;AAQD,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,CAkC9D"}
package/out/cjk.js ADDED
@@ -0,0 +1,68 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * CJK input normalization (Direction E, #291) — a conservative, character-level pass that runs as
7
+ * part of `normalize()` so the parser sees a stable form of CJK addresses. It does only the
8
+ * transformations that are UNAMBIGUOUS in any context:
9
+ *
10
+ * - **Strip the postal mark 〒 (U+3012).** The JP cheap-probe found 〒 is byte-fallback OOV for the
11
+ * SentencePiece tokenizer — it fragments into raw UTF-8 byte pieces and poisons the parse of
12
+ * the digits right after it (the postcode gets mislabeled as a house number). It's a
13
+ * "postcode follows" marker with no addressing content of its own, so dropping it is safe and
14
+ * fixes the bug.
15
+ * - **Fold full-width ASCII (U+FF01–U+FF5E → U+0021–U+007E).** A full-width `1` is always the digit
16
+ * 1, a full-width `-` always a hyphen — keyboards and copy-paste produce these constantly.
17
+ * Folding them to ASCII makes `１０４－００６１` (full-width digits and hyphen) and `104-0061` the same input.
18
+ * - **Fold the ideographic space (U+3000 → ' ').**
19
+ *
20
+ * It deliberately does NOT convert **kanji numerals** (一二三…): place names carry numeral kanji as
21
+ * ordinary characters (三田 _Mita_, 四谷 _Yotsuya_), so a blind 三→3 would corrupt them.
22
+ * Disambiguating "this 三 is a block number, that one is part of a name" is parsing, not
23
+ * normalization — deferred. Kana→kanji transliteration (ちょうめ→丁目) is dictionary work and likewise
24
+ * deferred.
25
+ *
26
+ * Self-gating: a string with none of these characters returns identity, so Latin input is
27
+ * untouched.
28
+ */
29
+ import { identityMap } from "./offset-map.js";
30
const POSTAL_MARK = 0x3012; // 〒
const IDEOGRAPHIC_SPACE = 0x3000;
const FULLWIDTH_START = 0xff01; // full-width !
const FULLWIDTH_END = 0xff5e; // full-width ~
const FULLWIDTH_TO_ASCII = 0xfee0; // subtracting this from U+FFxx yields U+00xx
/**
 * Strip the postal mark and fold full-width ASCII and the ideographic space.
 *
 * Every code point that gets rewritten or removed is a single UTF-16 unit and everything else is
 * copied verbatim, so walking unit by unit is safe even when the input contains surrogate pairs.
 *
 * @param input Text to normalize.
 * @returns `text`, the offset `map` (one input index per output unit), the number of units
 *   `folded` in place and the number `stripped`. When nothing matched, `text` is `input` itself
 *   and `map` is the identity.
 */
export function applyCjkNormalization(input) {
    const units = [];
    const map = [];
    let folded = 0;
    let stripped = 0;
    for (let index = 0; index < input.length; index += 1) {
        const unit = input.charCodeAt(index);
        if (unit === POSTAL_MARK) {
            // Dropped outright: it carries no addressing content, and the later whitespace
            // collapse tidies any gap it leaves.
            stripped += 1;
            continue;
        }
        let replacement = null;
        if (unit >= FULLWIDTH_START && unit <= FULLWIDTH_END) {
            replacement = String.fromCharCode(unit - FULLWIDTH_TO_ASCII);
        }
        else if (unit === IDEOGRAPHIC_SPACE) {
            replacement = " ";
        }
        if (replacement === null) {
            units.push(input[index]);
        }
        else {
            units.push(replacement);
            folded += 1;
        }
        map.push(index);
    }
    // With nothing folded or stripped every unit was copied in order, so `map` already is the
    // identity map and the original string can be returned as-is.
    const untouched = folded === 0 && stripped === 0;
    return { text: untouched ? input : units.join(""), map, folded, stripped };
}
68
+ //# sourceMappingURL=cjk.js.map
package/out/cjk.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cjk.js","sourceRoot":"","sources":["../cjk.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAW7C,MAAM,eAAe,GAAG,MAAM,CAAA,CAAC,IAAI;AACnC,MAAM,aAAa,GAAG,MAAM,CAAA,CAAC,IAAI;AACjC,MAAM,kBAAkB,GAAG,MAAM,CAAA,CAAC,2BAA2B;AAC7D,MAAM,iBAAiB,GAAG,MAAM,CAAA;AAChC,MAAM,WAAW,GAAG,MAAM,CAAA,CAAC,IAAI;AAE/B,MAAM,UAAU,qBAAqB,CAAC,KAAa;IAClD,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,QAAQ,GAAG,CAAC,CAAA;IAChB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IAExB,gGAAgG;IAChG,oFAAoF;IACpF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAA;QAChC,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;YAC1B,QAAQ,IAAI,CAAC,CAAA;YACb,SAAQ,CAAC,yEAAyE;QACnF,CAAC;QACD,IAAI,IAAI,IAAI,eAAe,IAAI,IAAI,IAAI,aAAa,EAAE,CAAC;YACtD,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,GAAG,kBAAkB,CAAC,CAAC,CAAA;YACxD,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,MAAM,IAAI,CAAC,CAAA;YACX,SAAQ;QACT,CAAC;QACD,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;YAChC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YACb,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,MAAM,IAAI,CAAC,CAAA;YACX,SAAQ;QACT,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;QACnB,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACZ,CAAC;IAED,IAAI,MAAM,KAAK,CAAC,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;QACpC,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAA;IAC/E,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAA;AACrD,CAAC"}
@@ -0,0 +1,11 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `normalize(raw, opts)` — the Stage 1 entry point. Composes NFC + punctuation + whitespace
7
+ * (always) with case-fold + abbreviation expansion (opt-in).
8
+ */
9
+ import type { NormalizedInput, NormalizeOpts } from "./types.js";
10
/**
 * Stage 1 entry point: run the normalization passes over `raw` and return the normalized text
 * together with the list of transforms applied and the offset map back to `raw`.
 *
 * @param raw - The untouched input string.
 * @param opts - Optional switches for the opt-in passes and the locale they use.
 * @returns An immutable `NormalizedInput`.
 */
export declare function normalize(raw: string, opts?: NormalizeOpts): NormalizedInput;
11
+ //# sourceMappingURL=compute.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compute.d.ts","sourceRoot":"","sources":["../compute.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAOH,OAAO,KAAK,EAA0B,eAAe,EAAE,aAAa,EAAE,MAAM,YAAY,CAAA;AAGxF,wBAAgB,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,aAAa,GAAG,eAAe,CA2E5E"}
package/out/compute.js ADDED
@@ -0,0 +1,84 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `normalize(raw, opts)` — the Stage 1 entry point. Composes NFC + punctuation + whitespace
7
+ * (always) with case-fold + abbreviation expansion (opt-in).
8
+ */
9
+ import { expandAbbreviations } from "./abbreviations.js";
10
+ import { applyCjkNormalization } from "./cjk.js";
11
+ import { applyNfc } from "./nfc.js";
12
+ import { composeMaps, identityMap } from "./offset-map.js";
13
+ import { applyPunctuation } from "./punctuation.js";
14
+ import { collapseWhitespace } from "./whitespace.js";
15
/**
 * Lower-case `text` for `locale` and keep the offset map at one entry per output unit.
 *
 * Lower-casing usually preserves length, in which case `map` is returned untouched. Some mappings
 * change it (e.g. U+0130 `İ` becomes `i` + U+0307 outside Turkic locales); the map is then rebuilt
 * code point by code point so `map.length` always equals the folded text's length.
 *
 * @param text Text to fold.
 * @param map Offset map for `text` (one raw index per UTF-16 unit).
 * @param locale Locale passed to `toLocaleLowerCase`; may be undefined.
 * @returns The folded text and a map aligned with it.
 */
function caseFoldWithMap(text, map, locale) {
    const folded = text.toLocaleLowerCase(locale);
    if (folded.length === text.length) {
        return { text: folded, map };
    }
    let remapped = [];
    let i = 0;
    while (i < text.length) {
        const width = text.codePointAt(i) > 0xffff ? 2 : 1;
        const piece = text.slice(i, i + width).toLocaleLowerCase(locale);
        for (let k = 0; k < piece.length; k++) {
            // Same-width results stay unit-aligned; grown or shrunk ones all point at the first
            // unit of their source code point.
            remapped.push(map[piece.length === width ? i + k : i]);
        }
        i += width;
    }
    if (remapped.length !== folded.length) {
        // Context-sensitive casing rules can make the whole-string result differ from the
        // per-code-point one. Fall back to a clamped positional map so lengths still agree.
        remapped = [];
        for (let k = 0; k < folded.length; k++) {
            remapped.push(map[Math.min(k, map.length - 1)]);
        }
    }
    return { text: folded, map: remapped };
}
/**
 * Stage 1 entry point. Runs, in order: NFC (unless `opts.skipNfc`), CJK folding, punctuation
 * normalization, whitespace collapsing, then the opt-in abbreviation expansion
 * (`opts.expandAbbreviations`) and case folding (`opts.caseFold`).
 *
 * After every pass that changed the text, the pass's own offset map is composed into the running
 * map, so the final `offsetMap` relates each unit of `normalized` to an index in `raw`.
 *
 * @param raw The untouched input string.
 * @param opts Optional switches; `opts.locale` is forwarded to abbreviation expansion and case
 *   folding and reported back as `appliedLocale`.
 * @returns A frozen object with `raw`, `normalized`, a frozen `transforms` list, `offsetMap` and
 *   `appliedLocale`.
 */
export function normalize(raw, opts) {
    const transforms = [];
    let text = raw;
    let map = identityMap(raw.length);
    // 1. NFC — always recorded when it runs, with `changed` saying whether it did anything.
    if (!opts?.skipNfc) {
        const r = applyNfc(text);
        text = r.text;
        map = composeMaps(map, r.map);
        transforms.push({ kind: "nfc", changed: r.changed });
    }
    // 1.5 CJK normalization — strip the postal mark 〒 and fold full-width ASCII + the ideographic
    // space. Runs after NFC so it sees composed forms, and before punctuation/whitespace so any gap
    // left by 〒 is then collapsed. Recorded only when it changed something.
    {
        const r = applyCjkNormalization(text);
        if (r.folded > 0 || r.stripped > 0) {
            text = r.text;
            map = composeMaps(map, r.map);
            transforms.push({ kind: "normalize_cjk", folded: r.folded, stripped: r.stripped });
        }
    }
    // 2. Punctuation
    {
        const r = applyPunctuation(text);
        if (r.replacements > 0) {
            text = r.text;
            map = composeMaps(map, r.map);
            transforms.push({ kind: "normalize_punctuation", replacements: r.replacements });
        }
    }
    // 3. Whitespace
    {
        const r = collapseWhitespace(text);
        if (r.runs > 0 || r.text.length !== text.length) {
            text = r.text;
            map = composeMaps(map, r.map);
            transforms.push({ kind: "collapse_whitespace", runs: r.runs });
        }
    }
    // 4. Abbreviation expansion (opt-in) — runs BEFORE case-fold so case-folding the canonical
    // expansion form (e.g. "Street") gives a consistent final case. The `at` spans recorded here
    // are the ones expandAbbreviations reports, i.e. offsets into the text as it stood after step 3.
    if (opts?.expandAbbreviations) {
        const r = expandAbbreviations(text, opts.locale);
        if (r.expansions.length > 0) {
            text = r.text;
            map = composeMaps(map, r.map);
            for (const e of r.expansions) {
                transforms.push({ kind: "expand_abbreviation", from: e.from, to: e.to, at: e.at });
            }
        }
    }
    // 5. Case fold (opt-in). Lower-casing can change the string's length, so the map is rebuilt
    // when needed rather than assumed to stay aligned.
    if (opts?.caseFold) {
        const folded = caseFoldWithMap(text, map, opts.locale);
        if (folded.text !== text) {
            text = folded.text;
            map = folded.map;
            transforms.push({ kind: "case_fold", locale: opts.locale ?? "und" });
        }
    }
    return Object.freeze({
        raw,
        normalized: text,
        transforms: Object.freeze(transforms),
        offsetMap: map,
        appliedLocale: opts?.locale,
    });
}
84
+ //# sourceMappingURL=compute.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compute.js","sourceRoot":"","sources":["../compute.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACxD,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAA;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAA;AACnC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AAEpD,MAAM,UAAU,SAAS,CAAC,GAAW,EAAE,IAAoB;IAC1D,MAAM,UAAU,GAA6B,EAAE,CAAA;IAC/C,IAAI,IAAI,GAAG,GAAG,CAAA;IACd,IAAI,GAAG,GAAG,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IAEjC,SAAS;IACT,IAAI,CAAC,IAAI,EAAE,OAAO,EAAE,CAAC;QACpB,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAA;QACxB,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;QACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;QAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,CAAA;IACrD,CAAC;IAED,+FAA+F;IAC/F,+FAA+F;IAC/F,iGAAiG;IACjG,CAAC;QACA,MAAM,CAAC,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAA;QACrC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,CAAC,QAAQ,GAAG,CAAC,EAAE,CAAC;YACpC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,eAAe,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAA;QACnF,CAAC;IACF,CAAC;IAED,iBAAiB;IACjB,CAAC;QACA,MAAM,CAAC,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAA;QAChC,IAAI,CAAC,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YACxB,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,uBAAuB,EAAE,YAAY,EAAE,CAAC,CAAC,YAAY,EAAE,CAAC,CAAA;QACjF,CAAC;IACF,CAAC;IAED,gBAAgB;IAChB,CAAC;QACA,MAAM,CAAC,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QAClC,IAAI,CAAC,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;YACjD,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QAC/D,CAAC;IACF,CAAC;IAED,2FAA2F;IAC3F,gEAAgE;IAChE,IAAI,IAAI,EAA
E,mBAAmB,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;QAChD,IAAI,CAAC,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7B,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;gBAC9B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;YACnF,CAAC;QACF,CAAC;IACF,CAAC;IAED,wBAAwB;IACxB,IAAI,IAAI,EAAE,QAAQ,EAAE,CAAC;QACpB,MAAM,EAAE,GAAG,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9C,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YACjB,IAAI,GAAG,EAAE,CAAA;YACT,sEAAsE;YACtE,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,KAAK,EAAE,CAAC,CAAA;QACrE,CAAC;IACF,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC;QACpB,GAAG;QACH,UAAU,EAAE,IAAI;QAChB,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,UAAU,CAA6B;QACjE,SAAS,EAAE,GAAG;QACd,aAAa,EAAE,IAAI,EAAE,MAAM;KAC3B,CAA2B,CAAA;AAC7B,CAAC"}
package/out/index.d.ts ADDED
@@ -0,0 +1,23 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/normalize` — Stage 1 of the runtime pipeline.
7
+ *
8
+ * Deterministic input preprocessing: NFC, punctuation, whitespace, optional case-fold +
9
+ * abbreviation expansion. Pure functions. Produces a `NormalizedInput` with a load-bearing
10
+ * `offsetMap` so downstream stages can map normalized-string spans back to raw-string character
11
+ * offsets.
12
+ *
13
+ * See `docs/articles/plan/reference/STAGES.md` § Stage 1 for the contract.
14
+ */
15
+ export { expandAbbreviations } from "./abbreviations.js";
16
+ export { applyCjkNormalization, type CjkResult } from "./cjk.js";
17
+ export { normalize } from "./compute.js";
18
+ export { applyNfc } from "./nfc.js";
19
+ export { composeMaps, identityMap } from "./offset-map.js";
20
+ export { applyPunctuation } from "./punctuation.js";
21
+ export type { NormalizationTransform, NormalizeOpts, NormalizedInput, SpanRange } from "./types.js";
22
+ export { collapseWhitespace } from "./whitespace.js";
23
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACxD,OAAO,EAAE,qBAAqB,EAAE,KAAK,SAAS,EAAE,MAAM,UAAU,CAAA;AAChE,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAA;AACxC,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAA;AACnC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,YAAY,EAAE,sBAAsB,EAAE,aAAa,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AACnG,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA"}
package/out/index.js ADDED
@@ -0,0 +1,22 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/normalize` — Stage 1 of the runtime pipeline.
7
+ *
8
+ * Deterministic input preprocessing: NFC, punctuation, whitespace, optional case-fold +
9
+ * abbreviation expansion. Pure functions. Produces a `NormalizedInput` with a load-bearing
10
+ * `offsetMap` so downstream stages can map normalized-string spans back to raw-string character
11
+ * offsets.
12
+ *
13
+ * See `docs/articles/plan/reference/STAGES.md` § Stage 1 for the contract.
14
+ */
15
+ export { expandAbbreviations } from "./abbreviations.js";
16
+ export { applyCjkNormalization } from "./cjk.js";
17
+ export { normalize } from "./compute.js";
18
+ export { applyNfc } from "./nfc.js";
19
+ export { composeMaps, identityMap } from "./offset-map.js";
20
+ export { applyPunctuation } from "./punctuation.js";
21
+ export { collapseWhitespace } from "./whitespace.js";
22
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACxD,OAAO,EAAE,qBAAqB,EAAkB,MAAM,UAAU,CAAA;AAChE,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAA;AACxC,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAA;AACnC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA"}
package/out/nfc.d.ts ADDED
@@ -0,0 +1,21 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Unicode NFC normalization. For inputs already in NFC (the common case) this is a no-op. When the
7
+ * input has combining characters (`e` + `́` → `é`), NFC composes them — the normalized string can
8
+ * be shorter than the raw.
9
+ *
10
+ * Approximation: we walk the input grapheme-by-grapheme (best effort via codepoint stepping) and
11
+ * map each output index to the start of its source sequence. Rare CJK edge cases involving
12
+ * variant selectors may produce off-by-one offsets — acceptable for v1.
13
+ */
14
/** What `applyNfc` hands back. */
export interface NfcResult {
    /** The NFC form of the input; the input string itself when it was already in NFC. */
    text: string;
    /** Offset map: `text[i]` came from `input[map[i]]`. */
    map: number[];
    /** Whether normalization altered the string. */
    changed: boolean;
}
20
/**
 * Normalize `input` to Unicode NFC and report how output positions relate to input positions.
 *
 * @param input - Text to normalize.
 * @returns The NFC text, its offset map, and whether anything changed.
 */
export declare function applyNfc(input: string): NfcResult;
21
+ //# sourceMappingURL=nfc.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"nfc.d.ts","sourceRoot":"","sources":["../nfc.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAIH,MAAM,WAAW,SAAS;IACzB,IAAI,EAAE,MAAM,CAAA;IACZ,2CAA2C;IAC3C,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,OAAO,EAAE,OAAO,CAAA;CAChB;AAED,wBAAgB,QAAQ,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,CAMjD"}
package/out/nfc.js ADDED
@@ -0,0 +1,53 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Unicode NFC normalization. For inputs already in NFC (the common case) this is a no-op. When the
7
+ * input has combining characters (`e` + `́` → `é`), NFC composes them — the normalized string can
8
+ * be shorter than the raw.
9
+ *
10
+ * Approximation: we walk the input grapheme-by-grapheme (best effort via codepoint stepping) and
11
+ * map each output index to the start of its source sequence. Rare CJK edge cases involving
12
+ * variant selectors may produce off-by-one offsets — acceptable for v1.
13
+ */
14
+ import { identityMap } from "./offset-map.js";
15
/**
 * Normalize `input` to Unicode NFC.
 *
 * Input that is already in NFC is returned as-is with an identity map; otherwise the composed
 * string is returned with an offset map produced by `estimateNfcMap`.
 *
 * @param input Text to normalize.
 * @returns `text`, its offset `map`, and `changed` (true when normalization altered the string).
 */
export function applyNfc(input) {
    const composed = input.normalize("NFC");
    const changed = composed !== input;
    const map = changed ? estimateNfcMap(input, composed) : identityMap(input.length);
    return { text: changed ? composed : input, map, changed };
}
22
+ /**
23
+ * Estimate per-output-codepoint offsets. Walks both strings in parallel; emits the next source
24
+ * index for each output position. Imprecise for combining sequences but correct for length-equal
25
+ * NFC outputs (the common length-changing case is when a sequence shortens).
26
+ */
27
+ function estimateNfcMap(input, output) {
28
+ const map = [];
29
+ let inIdx = 0;
30
+ for (let outIdx = 0; outIdx < output.length; outIdx++) {
31
+ map.push(inIdx);
32
+ const outCp = output.codePointAt(outIdx);
33
+ const outStep = outCp > 0xffff ? 2 : 1;
34
+ // Walk the input forward by at least one codepoint; absorb any combining marks (0x0300–0x036f).
35
+ if (inIdx < input.length) {
36
+ const inCp = input.codePointAt(inIdx);
37
+ inIdx += inCp > 0xffff ? 2 : 1;
38
+ while (inIdx < input.length) {
39
+ const nextCp = input.codePointAt(inIdx);
40
+ if (nextCp >= 0x0300 && nextCp <= 0x036f) {
41
+ inIdx += nextCp > 0xffff ? 2 : 1;
42
+ }
43
+ else {
44
+ break;
45
+ }
46
+ }
47
+ }
48
+ if (outStep === 2)
49
+ outIdx += 1;
50
+ }
51
+ return map;
52
+ }
53
+ //# sourceMappingURL=nfc.js.map
package/out/nfc.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"nfc.js","sourceRoot":"","sources":["../nfc.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAS7C,MAAM,UAAU,QAAQ,CAAC,KAAa;IACrC,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;IACzC,IAAI,UAAU,KAAK,KAAK,EAAE,CAAC;QAC1B,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAA;IACvE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,cAAc,CAAC,KAAK,EAAE,UAAU,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAA;AACnF,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc,CAAC,KAAa,EAAE,MAAc;IACpD,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,IAAI,MAAM,GAAG,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE,CAAC;QACvD,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;QACf,MAAM,KAAK,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAE,CAAA;QACzC,MAAM,OAAO,GAAG,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACtC,gGAAgG;QAChG,IAAI,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC1B,MAAM,IAAI,GAAG,KAAK,CAAC,WAAW,CAAC,KAAK,CAAE,CAAA;YACtC,KAAK,IAAI,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;YAC9B,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;gBAC7B,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC,KAAK,CAAE,CAAA;gBACxC,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,EAAE,CAAC;oBAC1C,KAAK,IAAI,MAAM,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;gBACjC,CAAC;qBAAM,CAAC;oBACP,MAAK;gBACN,CAAC;YACF,CAAC;QACF,CAAC;QACD,IAAI,OAAO,KAAK,CAAC;YAAE,MAAM,IAAI,CAAC,CAAA;IAC/B,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Utilities for composing per-transform offset maps into the final `raw → normalized` map.
7
+ */
8
+ /** Identity map for an input of length `n`: `[0, 1, 2, ..., n-1]`. */
9
+ export declare function identityMap(n: number): number[];
10
+ /**
11
+ * Compose `inputMap` (input → raw) with `transformMap` (output → input) to produce `outputMap`
12
+ * (output → raw).
13
+ *
14
+ * @example // raw = "350  5th" (chars 0..7, double space at 3-4) // input = "350  5th" (identity from
15
+ * raw, length 8) // output = "350 5th" (whitespace collapsed, length 7) // inputMap =
16
+ * [0,1,2,3,4,5,6,7] // transformMap = [0,1,2,3,5,6,7] (output[3]=' ' came from input[3];
17
+ * output[4]='5' from input[5]) // composed = [0,1,2,3,5,6,7]
18
+ */
19
+ export declare function composeMaps(inputMap: number[], transformMap: number[]): number[];
20
+ //# sourceMappingURL=offset-map.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"offset-map.d.ts","sourceRoot":"","sources":["../offset-map.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,sEAAsE;AACtE,wBAAgB,WAAW,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAI/C;AAED;;;;;;;;GAQG;AACH,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,EAAE,YAAY,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAOhF"}
@@ -0,0 +1,32 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Utilities for composing per-transform offset maps into the final `raw → normalized` map.
7
+ */
8
/**
 * Build the identity offset map `[0, 1, 2, ..., n-1]` for a string of length `n`.
 *
 * @param n Number of entries to produce.
 * @returns A fresh array in which the element at index `i` is `i`.
 */
export function identityMap(n) {
    const indices = new Array(n);
    let next = 0;
    while (next < n) {
        indices[next] = next;
        next += 1;
    }
    return indices;
}
15
/**
 * Chain two offset maps: `inputMap` (input → raw) and `transformMap` (output → input) combine into
 * a map from output positions straight to raw positions.
 *
 * When `transformMap` points at an index that `inputMap` has no entry for, that index is used
 * unchanged as the raw position.
 *
 * @example
 * // raw          = "350  5th"  (length 8, double space at indexes 3-4)
 * // input        = "350  5th"  (identity from raw)
 * // output       = "350 5th"   (whitespace collapsed, length 7)
 * // inputMap     = [0,1,2,3,4,5,6,7]
 * // transformMap = [0,1,2,3,5,6,7]  (output[3]=' ' came from input[3]; output[4]='5' from input[5])
 * // composed     = [0,1,2,3,5,6,7]
 *
 * @param inputMap For each input char, its index in the raw string.
 * @param transformMap For each output char, its index in the input string.
 * @returns For each output char, its index in the raw string.
 */
export function composeMaps(inputMap, transformMap) {
    return Array.from(transformMap, (inputIdx) => inputMap[inputIdx] ?? inputIdx);
}
32
+ //# sourceMappingURL=offset-map.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"offset-map.js","sourceRoot":"","sources":["../offset-map.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,sEAAsE;AACtE,MAAM,UAAU,WAAW,CAAC,CAAS;IACpC,MAAM,CAAC,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;IAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACpC,OAAO,CAAC,CAAA;AACT,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,WAAW,CAAC,QAAkB,EAAE,YAAsB;IACrE,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,YAAY,CAAC,MAAM,CAAC,CAAA;IAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9C,MAAM,CAAC,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;QAC1B,GAAG,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;IAC1B,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Punctuation normalization — fancy quotes / dashes to ASCII equivalents. Mostly identity-length:
7
+ * each fancy character maps to a single ASCII char, except `…` (U+2026), which expands to `...`.
8
+ */
9
+ export interface PunctuationResult {
10
+ text: string;
11
+ map: number[];
12
+ replacements: number;
13
+ }
14
+ export declare function applyPunctuation(input: string): PunctuationResult;
15
+ //# sourceMappingURL=punctuation.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"punctuation.d.ts","sourceRoot":"","sources":["../punctuation.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAiBH,MAAM,WAAW,iBAAiB;IACjC,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,YAAY,EAAE,MAAM,CAAA;CACpB;AAED,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,MAAM,GAAG,iBAAiB,CA0BjE"}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Punctuation normalization — fancy quotes / dashes to ASCII equivalents. Mostly identity-length:
7
+ * each fancy character maps to a single ASCII char, except `…` (U+2026), which expands to `...`.
8
+ */
9
+ import { identityMap } from "./offset-map.js";
10
/**
 * Typographic character → ASCII replacement table.
 *
 * Keys are written as `\u` escapes so that look-alike characters (notably the non-breaking space
 * versus a plain ASCII space) cannot be confused or silently altered by editors and renderers.
 */
const REPLACEMENTS = new Map([
    ["\u2018", "'"], // ‘ left single quotation mark
    ["\u2019", "'"], // ’ right single quotation mark
    ["\u201c", '"'], // “ left double quotation mark
    ["\u201d", '"'], // ” right double quotation mark
    ["\u2013", "-"], // – en dash
    ["\u2014", "-"], // — em dash
    ["\u2212", "-"], // − minus sign — Japanese IMEs emit this as the block separator (1−2−3)
    ["\u2015", "-"], // ― horizontal bar — another common JP block separator
    ["\u2026", "..."], // … horizontal ellipsis; the only entry that expands (1 char → 3)
    ["\u00a0", " "], // non-breaking space
]);
/**
 * Replace typographic punctuation with its ASCII equivalent.
 *
 * The input is walked one UTF-16 code unit at a time (every key in the table is a single code
 * unit). Each output char maps back to the index of the input char that produced it, so the three
 * dots emitted for an ellipsis all point at the ellipsis itself.
 *
 * @param input Text to normalize.
 * @returns `text` — the rewritten string (the original `input` when nothing matched); `map` — for
 *   each char of `text`, the index in `input` it came from; `replacements` — how many input chars
 *   were substituted.
 */
export function applyPunctuation(input) {
    const out = [];
    const map = [];
    let replacements = 0;
    for (let i = 0; i < input.length; i++) {
        const ch = input[i];
        const sub = REPLACEMENTS.get(ch);
        if (sub === undefined) {
            out.push(ch);
            map.push(i);
            continue;
        }
        replacements += 1;
        // Every char of the substitute points back at the single source char.
        for (const piece of sub) {
            out.push(piece);
            map.push(i);
        }
    }
    if (replacements === 0) {
        // Nothing matched: each char was copied with its own index, so `map` is already the identity.
        return { text: input, map, replacements: 0 };
    }
    return { text: out.join(""), map, replacements };
}
48
+ //# sourceMappingURL=punctuation.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"punctuation.js","sourceRoot":"","sources":["../punctuation.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAE7C,MAAM,YAAY,GAAG,IAAI,GAAG,CAAiB;IAC5C,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,YAAY;IACxB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,YAAY;IACxB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,+EAA+E;IAC3F,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,8DAA8D;IAC1E,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,+BAA+B;IAC7C,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,qBAAqB;CACjC,CAAC,CAAA;AAQF,MAAM,UAAU,gBAAgB,CAAC,KAAa;IAC7C,IAAI,OAAO,GAAG,KAAK,CAAA;IACnB,IAAI,YAAY,GAAG,CAAC,CAAA;IACpB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IAExB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAE,CAAA;QACpB,MAAM,GAAG,GAAG,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;QAChC,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;YACvB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACZ,CAAC;aAAM,CAAC;YACP,OAAO,GAAG,IAAI,CAAA;YACd,YAAY,IAAI,CAAC,CAAA;YACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACrC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC,CAAA;gBACjB,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACZ,CAAC;QACF,CAAC;IACF,CAAC;IAED,IAAI,CAAC,OAAO,EAAE,CAAC;QACd,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAA;IACxE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,YAAY,EAAE,CAAA;AACjD,CAAC"}
package/out/types.d.ts ADDED
@@ -0,0 +1,63 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ export interface SpanRange {
7
+ start: number;
8
+ end: number;
9
+ body: string;
10
+ }
11
+ /** A single normalization step, as recorded on `NormalizedInput.transforms`. */
12
+ export type NormalizationTransform = {
13
+ kind: "nfc";
14
+ changed: boolean;
15
+ } | {
16
+ kind: "case_fold";
17
+ locale: string;
18
+ } | {
19
+ kind: "expand_abbreviation";
20
+ from: string;
21
+ to: string;
22
+ at: SpanRange;
23
+ } | {
24
+ kind: "collapse_whitespace";
25
+ runs: number;
26
+ } | {
27
+ kind: "normalize_punctuation";
28
+ replacements: number;
29
+ } | {
30
+ kind: "normalize_cjk";
31
+ folded: number;
32
+ stripped: number;
33
+ };
34
+ /**
35
+ * Result of running `normalize()` on a raw input string.
36
+ *
37
+ * `offsetMap[i]` is the index in `raw` from which `normalized[i]` came. For multi-character source
38
+ * sequences (NFC composition, whitespace collapse, abbreviation expansion), each output char points
39
+ * to the FIRST source char by convention.
40
+ */
41
+ export interface NormalizedInput {
42
+ /** The input as the caller sent it. */
43
+ raw: string;
44
+ /** Canonical form, all transforms applied. */
45
+ normalized: string;
46
+ /** Ordered record of what was done. */
47
+ transforms: NormalizationTransform[];
48
+ /** `normalized[i]` came from `raw[offsetMap[i]]`. Length === normalized.length. */
49
+ offsetMap: number[];
50
+ /** The locale used for case-folding + abbreviation rules. */
51
+ appliedLocale?: string;
52
+ }
53
+ export interface NormalizeOpts {
54
+ /** Locale hint for case-folding + abbreviation dictionaries. */
55
+ locale?: string;
56
+ /** Apply locale-aware lowercasing. Default: false (preserve case for downstream consumers). */
57
+ caseFold?: boolean;
58
+ /** Expand known abbreviations (`St` → `Street`, `NW` → `Northwest`, etc.). Default: false. */
59
+ expandAbbreviations?: boolean;
60
+ /** Skip Unicode NFC. Only use for debugging — production callers should leave on. */
61
+ skipNfc?: boolean;
62
+ }
63
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;IACX,IAAI,EAAE,MAAM,CAAA;CACZ;AAED,gFAAgF;AAChF,MAAM,MAAM,sBAAsB,GAC/B;IAAE,IAAI,EAAE,KAAK,CAAC;IAAC,OAAO,EAAE,OAAO,CAAA;CAAE,GACjC;IAAE,IAAI,EAAE,WAAW,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,GACrC;IAAE,IAAI,EAAE,qBAAqB,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,SAAS,CAAA;CAAE,GACxE;IAAE,IAAI,EAAE,qBAAqB,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,GAC7C;IAAE,IAAI,EAAE,uBAAuB,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,GACvD;IAAE,IAAI,EAAE,eAAe,CAAC;IAAC,MAAM,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAA;AAE9D;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC/B,uCAAuC;IACvC,GAAG,EAAE,MAAM,CAAA;IAEX,8CAA8C;IAC9C,UAAU,EAAE,MAAM,CAAA;IAElB,uCAAuC;IACvC,UAAU,EAAE,sBAAsB,EAAE,CAAA;IAEpC,mFAAmF;IACnF,SAAS,EAAE,MAAM,EAAE,CAAA;IAEnB,6DAA6D;IAC7D,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,aAAa;IAC7B,gEAAgE;IAChE,MAAM,CAAC,EAAE,MAAM,CAAA;IAEf,+FAA+F;IAC/F,QAAQ,CAAC,EAAE,OAAO,CAAA;IAElB,8FAA8F;IAC9F,mBAAmB,CAAC,EAAE,OAAO,CAAA;IAE7B,qFAAqF;IACrF,OAAO,CAAC,EAAE,OAAO,CAAA;CACjB"}
package/out/types.js ADDED
@@ -0,0 +1,7 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ */
6
+ export {};
7
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../types.ts"],"names":[],"mappings":"AAAA;;;;GAIG"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Whitespace collapse — runs of inline whitespace (spaces and tabs) become a single ASCII space.
7
+ * Newlines (`\n`, `\r`) are preserved as-is (segmentation grammar in QueryShape uses them), except
8
+ * that leading and trailing whitespace of any kind is trimmed.
9
+ */
10
+ export interface WhitespaceResult {
11
+ text: string;
12
+ map: number[];
13
+ runs: number;
14
+ }
15
+ export declare function collapseWhitespace(input: string): WhitespaceResult;
16
+ //# sourceMappingURL=whitespace.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"whitespace.d.ts","sourceRoot":"","sources":["../whitespace.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAOH,MAAM,WAAW,gBAAgB;IAChC,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,IAAI,EAAE,MAAM,CAAA;CACZ;AAED,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,MAAM,GAAG,gBAAgB,CAoDlE"}
@@ -0,0 +1,66 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Whitespace collapse — runs of inline whitespace (spaces and tabs) become a single ASCII space.
7
+ * Newlines (`\n`, `\r`) are preserved as-is (segmentation grammar in QueryShape uses them), except
8
+ * that leading and trailing whitespace of any kind is trimmed.
9
+ */
10
+ import { identityMap } from "./offset-map.js";
11
/** Whitespace that is collapsed within a line: ASCII space and tab. */
const INLINE_SPACE = /[ \t]/;
/** Whitespace that is trimmed from both ends of the result: space, tab, LF, CR. */
const ANY_SPACE = /[ \t\n\r]/;
/**
 * Collapse inline whitespace and trim both ends of `input`.
 *
 * - Every run of spaces/tabs (including a run of one) is emitted as a single ASCII space that maps
 *   to the first char of the run.
 * - `\n` and `\r` are copied through unchanged; a `\r\n` pair stays two chars.
 * - Leading and trailing whitespace, newlines included, is trimmed.
 *
 * NOTE(review): the previous fast path returned the input untouched whenever no run longer than
 * one char was seen, so a lone tab survived in `"a\tb"` but became a space in `"a\tb  c"`. The
 * result is now decided by comparing the produced text with the input, so a tab is always
 * rewritten. Confirm no caller relies on lone tabs being kept.
 *
 * @param input Text to normalize.
 * @returns `text` — the normalized string; `map` — for each char of `text`, the index in `input`
 *   it came from; `runs` — number of inline runs longer than one char that were collapsed (runs
 *   that end up trimmed are still counted).
 */
export function collapseWhitespace(input) {
    const out = [];
    const map = [];
    let runs = 0;
    let i = 0;
    while (i < input.length) {
        const ch = input[i];
        if (!INLINE_SPACE.test(ch)) {
            // Newlines and ordinary chars are copied verbatim.
            out.push(ch);
            map.push(i);
            i += 1;
            continue;
        }
        // Emit one space for the whole run, anchored at the run's first char.
        out.push(" ");
        map.push(i);
        const start = i;
        do {
            i += 1;
        } while (i < input.length && INLINE_SPACE.test(input[i]));
        if (i - start > 1) {
            runs += 1;
        }
    }
    // Trim leading and trailing whitespace (inline and newlines alike).
    let lead = 0;
    while (lead < out.length && ANY_SPACE.test(out[lead])) {
        lead += 1;
    }
    let trail = out.length;
    while (trail > lead && ANY_SPACE.test(out[trail - 1])) {
        trail -= 1;
    }
    const text = out.slice(lead, trail).join("");
    if (text === input) {
        // Nothing was rewritten, dropped, or trimmed, so `map` is already the identity and no run
        // was collapsed.
        return { text: input, map, runs: 0 };
    }
    return { text, map: map.slice(lead, trail), runs };
}
66
+ //# sourceMappingURL=whitespace.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"whitespace.js","sourceRoot":"","sources":["../whitespace.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAE7C,MAAM,YAAY,GAAG,OAAO,CAAA;AAC5B,MAAM,SAAS,GAAG,WAAW,CAAA;AAQ7B,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC/C,IAAI,OAAO,GAAG,KAAK,CAAA;IACnB,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,IAAI,CAAC,GAAG,CAAC,CAAA;IAET,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACzB,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAE,CAAA;QACpB,IAAI,EAAE,KAAK,IAAI,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YAChC,2CAA2C;YAC3C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,CAAC,IAAI,CAAC,CAAA;YACN,SAAQ;QACT,CAAC;QACD,IAAI,YAAY,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;YAC3B,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YACb,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,MAAM,KAAK,GAAG,CAAC,CAAA;YACf,CAAC,IAAI,CAAC,CAAA;YACN,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC;gBAAE,CAAC,IAAI,CAAC,CAAA;YAC/D,IAAI,CAAC,GAAG,KAAK,GAAG,CAAC,EAAE,CAAC;gBACnB,OAAO,GAAG,IAAI,CAAA;gBACd,IAAI,IAAI,CAAC,CAAA;YACV,CAAC;YACD,SAAQ;QACT,CAAC;QACD,yBAAyB;QACzB,IAAI,EAAE,KAAK,IAAI,IAAI,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,2EAA2E;QAC5E,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;QACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACX,CAAC,IAAI,CAAC,CAAA;IACP,CAAC;IAED,wCAAwC;IACxC,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,OAAO,IAAI,GAAG,GAAG,CAAC,MAAM,IAAI,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC;QAAE,IAAI,IAAI,CAAC,CAAA;IACjE,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,CAAA;IACtB,OAAO,KAAK,GAAG,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,CAAE,CAAC;QAAE,KAAK,IAAI,CAAC,CAAA;IAClE,IAAI,IAAI,GAAG,CAAC,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC;QACpC,OAAO,GAAG,IAAI,CAAA;IACf,CAAC;IACD,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,CAAA;IACzC,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,CAAA;IAEzC,IAAI,CAAC,OAAO,IAAI,UAAU,CAAC,MAAM,KAAK,KAAK,CAAC,MAAM,EAAE,CAAC;QACpD,OAAO,EAAE,IAAI,
EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAA;IAChE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,UAAU,EAAE,IAAI,EAAE,CAAA;AAC5D,CAAC"}
package/package.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "name": "@mailwoman/normalize",
3
+ "version": "4.0.0",
4
+ "description": "Stage 1 of the runtime pipeline — deterministic input preprocessing (Unicode NFC, punctuation, whitespace, abbreviation). Pure functions, no ML.",
5
+ "license": "AGPL-3.0-only",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/sister-software/mailwoman.git",
9
+ "directory": "normalize"
10
+ },
11
+ "type": "module",
12
+ "exports": {
13
+ "./package.json": "./package.json",
14
+ ".": "./out/index.js"
15
+ },
16
+ "files": [
17
+ "out/**/*.js",
18
+ "out/**/*.js.map",
19
+ "out/**/*.d.ts",
20
+ "out/**/*.d.ts.map"
21
+ ],
22
+ "publishConfig": {
23
+ "access": "public"
24
+ }
25
+ }