@mailwoman/normalize 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/abbreviations.d.ts +32 -0
- package/out/abbreviations.d.ts.map +1 -0
- package/out/abbreviations.js +110 -0
- package/out/abbreviations.js.map +1 -0
- package/out/cjk.d.ts +38 -0
- package/out/cjk.d.ts.map +1 -0
- package/out/cjk.js +68 -0
- package/out/cjk.js.map +1 -0
- package/out/compute.d.ts +11 -0
- package/out/compute.d.ts.map +1 -0
- package/out/compute.js +84 -0
- package/out/compute.js.map +1 -0
- package/out/index.d.ts +23 -0
- package/out/index.d.ts.map +1 -0
- package/out/index.js +22 -0
- package/out/index.js.map +1 -0
- package/out/nfc.d.ts +21 -0
- package/out/nfc.d.ts.map +1 -0
- package/out/nfc.js +53 -0
- package/out/nfc.js.map +1 -0
- package/out/offset-map.d.ts +20 -0
- package/out/offset-map.d.ts.map +1 -0
- package/out/offset-map.js +32 -0
- package/out/offset-map.js.map +1 -0
- package/out/punctuation.d.ts +15 -0
- package/out/punctuation.d.ts.map +1 -0
- package/out/punctuation.js +48 -0
- package/out/punctuation.js.map +1 -0
- package/out/types.d.ts +63 -0
- package/out/types.d.ts.map +1 -0
- package/out/types.js +7 -0
- package/out/types.js.map +1 -0
- package/out/whitespace.d.ts +16 -0
- package/out/whitespace.d.ts.map +1 -0
- package/out/whitespace.js +66 -0
- package/out/whitespace.js.map +1 -0
- package/package.json +25 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Abbreviation expansion — a small bounded dictionary per locale. Initial dict covers en-US street
|
|
7
|
+
* suffixes + directional prefixes, plus a small fr-FR street-type dict. Other locales are added as needed.
|
|
8
|
+
*
|
|
9
|
+
* This is the INVERSE of the corpus synthesis pass (which produces `Ave` from `Avenue` for
|
|
10
|
+
* augmentation). Both sides should eventually share dictionaries; for v1 this dict is duplicated
|
|
11
|
+
* intentionally — refactoring sharing is a separate task.
|
|
12
|
+
*/
|
|
13
|
+
import type { SpanRange } from "./types.js";
|
|
14
|
+
/** Outcome of an abbreviation-expansion pass over one input string. */
export interface AbbreviationResult {
    /** The input with every recognised abbreviation replaced by its long form. */
    text: string;
    /** One entry per UTF-16 unit of `text`, holding an index into the original input. */
    map: number[];
    /** One record per rewritten token, in the order the tokens appear in the input. */
    expansions: {
        /** The abbreviation as written, including a directly following `.` when there was one. */
        from: string;
        /** The dictionary long form that replaced it. */
        to: string;
        /** Where the abbreviation token sits in the input. */
        at: SpanRange;
    }[];
}
|
|
23
|
+
/**
 * Replace known abbreviations in `input` with their long forms.
 *
 * The input is scanned for tokens — maximal runs of Unicode letters, digits, `'`, `_` and `-` —
 * and each token is looked up case-insensitively in the locale's dictionary. A hit is emitted in
 * the dictionary's own casing (`St`, `st` and `ST` all become `Street`), and a `.` directly after
 * a matched abbreviation is dropped. Everything else is copied through unchanged.
 *
 * Offset map: copied characters map to their own input index. Character `k` of an expansion maps
 * to character `min(k, abbreviationLength - 1)` of the abbreviation it replaced.
 *
 * @param input - Text to scan.
 * @param locale - Locale tag selecting the dictionary. Tags starting with `fr` use the fr-FR
 *   dictionary; anything else, or no tag, uses en-US.
 * @returns The rewritten text, its offset map, and one record per expansion performed.
 */
export declare function expandAbbreviations(input: string, locale?: string): AbbreviationResult;
|
|
32
|
+
//# sourceMappingURL=abbreviations.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"abbreviations.d.ts","sourceRoot":"","sources":["../abbreviations.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAgD3C,MAAM,WAAW,kBAAkB;IAClC,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,UAAU,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,MAAM,CAAC;QAAC,EAAE,EAAE,SAAS,CAAA;KAAE,CAAC,CAAA;CAC9D;AAED;;;;;;;GAOG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,kBAAkB,CAiDtF"}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Abbreviation expansion — a small bounded dictionary per locale. Initial dict covers en-US street
|
|
7
|
+
* suffixes + directional prefixes, plus a small fr-FR street-type dict. Other locales are added as needed.
|
|
8
|
+
*
|
|
9
|
+
* This is the INVERSE of the corpus synthesis pass (which produces `Ave` from `Avenue` for
|
|
10
|
+
* augmentation). Both sides should eventually share dictionaries; for v1 this dict is duplicated
|
|
11
|
+
* intentionally — refactoring sharing is a separate task.
|
|
12
|
+
*/
|
|
13
|
+
/** en-US dictionary: compass directions first, then street-type suffixes. */
const EN_US_DICT = [
    // Directional prefixes / suffixes
    { from: "N", to: "North" },
    { from: "S", to: "South" },
    { from: "E", to: "East" },
    { from: "W", to: "West" },
    { from: "NE", to: "Northeast" },
    { from: "NW", to: "Northwest" },
    { from: "SE", to: "Southeast" },
    { from: "SW", to: "Southwest" },
    // Street suffixes
    { from: "St", to: "Street" },
    { from: "Ave", to: "Avenue" },
    { from: "Blvd", to: "Boulevard" },
    { from: "Rd", to: "Road" },
    { from: "Dr", to: "Drive" },
    { from: "Ct", to: "Court" },
    { from: "Ln", to: "Lane" },
    { from: "Pl", to: "Place" },
    { from: "Pkwy", to: "Parkway" },
    { from: "Hwy", to: "Highway" },
    { from: "Sq", to: "Square" },
    { from: "Ter", to: "Terrace" },
];
/** fr-FR dictionary: street-type abbreviations. */
const FR_FR_DICT = [
    { from: "R", to: "Rue" },
    { from: "Bd", to: "Boulevard" },
    { from: "Av", to: "Avenue" },
    { from: "Bvd", to: "Boulevard" },
    { from: "Pl", to: "Place" },
    { from: "Imp", to: "Impasse" },
    { from: "Sq", to: "Square" },
];
/**
 * Pick the dictionary for a locale tag. Any tag that starts with "fr" (case-insensitively) gets
 * the fr-FR dictionary; every other tag, and a missing tag, gets en-US.
 */
function getDictionary(locale) {
    const lc = (locale ?? "en-US").toLowerCase();
    if (lc.startsWith("fr"))
        return FR_FR_DICT;
    return EN_US_DICT;
}
// One token character: a Unicode letter or number, apostrophe, underscore or hyphen.
// Unicode-letter-aware so "République" stays one token instead of fragmenting on 'é'.
// Hoisted to module scope so the scan loop does not rebuild the regex and closure per character.
const TOKEN_CHAR = /[\p{L}\p{N}'_-]/u;
const isTokenChar = (c) => TOKEN_CHAR.test(c);
// Lower-cased lookup tables, built once per dictionary on first use.
const LOOKUP_CACHE = new Map();
/**
 * Return the case-insensitive lookup table (lower-cased abbreviation → long form) for a locale,
 * building and caching it the first time that locale's dictionary is requested.
 */
function getLookup(locale) {
    const dict = getDictionary(locale);
    let lookup = LOOKUP_CACHE.get(dict);
    if (lookup === undefined) {
        lookup = new Map();
        for (const entry of dict)
            lookup.set(entry.from.toLowerCase(), entry.to);
        LOOKUP_CACHE.set(dict, lookup);
    }
    return lookup;
}
/**
 * Expand known abbreviations. Walks the input token by token — a token is a maximal run of
 * letters, digits, `'`, `_` and `-` — and rewrites matching tokens to their canonical long form.
 * Characters outside tokens and tokens without a dictionary entry are copied through unchanged.
 *
 * Offset map: copied characters map to their own input index. Character `k` of an expansion maps
 * to `start + min(k, token.length - 1)`, i.e. it never points past the abbreviation it replaced.
 *
 * Case rules: match case-insensitively. Output form preserves the dictionary's canonical casing
 * (`St` → `Street`, `st` → `Street`, `ST` → `Street`).
 *
 * A `.` directly after a matched abbreviation is consumed ("St." → "Street"). It is included in
 * the recorded `from` string, but `at.end` and `at.body` cover the token without the period.
 *
 * @param input Text to scan.
 * @param locale Optional locale tag used to select the dictionary.
 * @returns `{ text, map, expansions }` as described above.
 */
export function expandAbbreviations(input, locale) {
    const lookup = getLookup(locale);
    const out = [];
    const map = [];
    const expansions = [];
    let i = 0;
    while (i < input.length) {
        const ch = input[i];
        if (!isTokenChar(ch)) {
            // Separator (whitespace, punctuation, ...): copy through one-to-one.
            out.push(ch);
            map.push(i);
            i += 1;
            continue;
        }
        const start = i;
        while (i < input.length && isTokenChar(input[i]))
            i += 1;
        const token = input.slice(start, i);
        // "." is not a token character, so the token itself never contains one; only peek at
        // the character that follows it.
        const hasTrailingDot = i < input.length && input[i] === ".";
        const expansion = lookup.get(token.toLowerCase());
        if (expansion === undefined) {
            for (let k = 0; k < token.length; k++) {
                out.push(token[k]);
                map.push(start + k);
            }
            continue;
        }
        // Emit the expansion; clamp offsets so every char maps inside the source token.
        for (let k = 0; k < expansion.length; k++) {
            out.push(expansion[k]);
            map.push(start + Math.min(k, token.length - 1));
        }
        expansions.push({
            from: hasTrailingDot ? `${token}.` : token,
            to: expansion,
            at: { start, end: i, body: token },
        });
        // Skip the trailing period if we consumed an abbreviation with one (e.g. "St." → "Street").
        if (hasTrailingDot)
            i += 1;
    }
    return { text: out.join(""), map, expansions };
}
|
|
110
|
+
//# sourceMappingURL=abbreviations.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"abbreviations.js","sourceRoot":"","sources":["../abbreviations.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AASH,MAAM,UAAU,GAAqC;IACpD,kCAAkC;IAClC,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,OAAO,EAAE;IAC1B,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,OAAO,EAAE;IAC1B,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,EAAE;IACzB,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,MAAM,EAAE;IACzB,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,kBAAkB;IAClB,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC5B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC7B,EAAE,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,WAAW,EAAE;IACjC,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE;IAC1B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE;IAC1B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,MAAM,EAAE,EAAE,EAAE,SAAS,EAAE;IAC/B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE;IAC9B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC5B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE;CAC9B,CAAA;AAED,MAAM,UAAU,GAAqC;IACpD,EAAE,IAAI,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,EAAE;IACxB,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,WAAW,EAAE;IAC/B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;IAC5B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,WAAW,EAAE;IAChC,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,OAAO,EAAE;IAC3B,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,SAAS,EAAE;IAC9B,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,QAAQ,EAAE;CAC5B,CAAA;AAED,SAAS,aAAa,CAAC,MAA0B;IAChD,MAAM,EAAE,GAAG,CAAC,MAAM,IAAI,OAAO,CAAC,CAAC,WAAW,EAAE,CAAA;IAC5C,IAAI,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,UAAU,CAAA;IAC1C,OAAO,UAAU,CAAA;AAClB,CAAC;AAQD;;;;;;;GAOG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAa,EAAE,MAAe;IACjE,MAAM,IAAI,GAAG,aAAa,CAAC,MAAM,CAAC,CAAA;IAClC,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAA;IACxC,KAAK,MAAM,KAAK,IAAI,IAAI;QAAE,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI
,CAAC,WAAW,EAAE,EAAE,KAAK,CAAC,EAAE,CAAC,CAAA;IAExE,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,UAAU,GAAuD,EAAE,CAAA;IAEzE,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACzB,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAE,CAAA;QACpB,kFAAkF;QAClF,8DAA8D;QAC9D,MAAM,WAAW,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC7D,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,EAAE,CAAC;YACtB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,CAAC,IAAI,CAAC,CAAA;YACN,SAAQ;QACT,CAAC;QACD,MAAM,KAAK,GAAG,CAAC,CAAA;QACf,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,WAAW,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC;YAAE,CAAC,IAAI,CAAC,CAAA;QACzD,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC,CAAC,CAAA;QACnC,MAAM,oBAAoB,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,KAAK,CAAA;QACvF,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,WAAW,EAAE,CAAA;QACxD,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;QACvC,IAAI,CAAC,SAAS,EAAE,CAAC;YAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;gBACnB,GAAG,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAA;YACpB,CAAC;YACD,SAAQ;QACT,CAAC;QACD,gEAAgE;QAChE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAE,CAAC,CAAA;YACvB,GAAG,CAAC,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAA;QAChD,CAAC;QACD,UAAU,CAAC,IAAI,CAAC;YACf,IAAI,EAAE,oBAAoB;YAC1B,EAAE,EAAE,SAAS;YACb,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE;SAClC,CAAC,CAAA;QACF,4FAA4F;QAC5F,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,KAAK,CAAC,CAAC,CAAC,KAAK,GAAG;YAAE,CAAC,IAAI,CAAC,CAAA;IACjD,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,UAAU,EAAE,CAAA;AAC/C,CAAC"}
|
package/out/cjk.d.ts
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* CJK input normalization (Direction E, #291) — a conservative, character-level pass that runs as
|
|
7
|
+
* part of `normalize()` so the parser sees a stable form of CJK addresses. It does only the
|
|
8
|
+
* transformations that are UNAMBIGUOUS in any context:
|
|
9
|
+
*
|
|
10
|
+
* - **Strip the postal mark 〒 (U+3012).** The JP cheap-probe found 〒 is byte-fallback OOV for the
|
|
11
|
+
* SentencePiece tokenizer — it fragments into raw UTF-8 byte pieces and poisons the parse of
|
|
12
|
+
* the digits right after it (the postcode gets mislabeled as a house number). It's a
|
|
13
|
+
* "postcode follows" marker with no addressing content of its own, so dropping it is safe and
|
|
14
|
+
* fixes the bug.
|
|
15
|
+
* - **Fold full-width ASCII (U+FF01–U+FF5E → U+0021–U+007E).** A full-width `1` is always the digit
|
|
16
|
+
* 1, a full-width `-` always a hyphen — keyboards and copy-paste produce these constantly.
|
|
17
|
+
* Folding them to ASCII makes `１０４－００６１` and `104-0061` the same input.
|
|
18
|
+
* - **Fold the ideographic space (U+3000 → ' ').**
|
|
19
|
+
*
|
|
20
|
+
* It deliberately does NOT convert **kanji numerals** (一二三…): place names carry numeral kanji as
|
|
21
|
+
* ordinary characters (三田 _Mita_, 四谷 _Yotsuya_), so a blind 三→3 would corrupt them.
|
|
22
|
+
* Disambiguating "this 三 is a block number, that one is part of a name" is parsing, not
|
|
23
|
+
* normalization — deferred. Kana→kanji transliteration (ちょうめ→丁目) is dictionary work and likewise
|
|
24
|
+
* deferred.
|
|
25
|
+
*
|
|
26
|
+
* Self-gating: a string with none of these characters returns identity, so Latin input is
|
|
27
|
+
* untouched.
|
|
28
|
+
*/
|
|
29
|
+
/** Outcome of one CJK normalization pass. */
export interface CjkResult {
    /** The text after stripping and folding. */
    text: string;
    /** One entry per UTF-16 unit of `text`, holding the input index that produced it. */
    map: number[];
    /** How many characters were rewritten in place (full-width → ASCII, ideographic space → ' '). */
    folded: number;
    /** How many characters were removed (the postal mark). */
    stripped: number;
}
|
|
37
|
+
/**
 * Run the CJK character-level pass over `input`: drop the postal mark (U+3012), fold full-width
 * ASCII (U+FF01–U+FF5E) to plain ASCII, and fold the ideographic space (U+3000) to `' '`.
 *
 * @param input - Text to normalize.
 * @returns The rewritten text, a map from each output index to the input index it came from,
 *   and the counts of folded and stripped characters. Input containing none of the targeted
 *   characters comes back with both counts at zero and its text unchanged.
 */
export declare function applyCjkNormalization(input: string): CjkResult;
|
|
38
|
+
//# sourceMappingURL=cjk.d.ts.map
|
package/out/cjk.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cjk.d.ts","sourceRoot":"","sources":["../cjk.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAIH,MAAM,WAAW,SAAS;IACzB,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,yFAAyF;IACzF,MAAM,EAAE,MAAM,CAAA;IACd,qDAAqD;IACrD,QAAQ,EAAE,MAAM,CAAA;CAChB;AAQD,wBAAgB,qBAAqB,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,CAkC9D"}
|
package/out/cjk.js
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* CJK input normalization (Direction E, #291) — a conservative, character-level pass that runs as
|
|
7
|
+
* part of `normalize()` so the parser sees a stable form of CJK addresses. It does only the
|
|
8
|
+
* transformations that are UNAMBIGUOUS in any context:
|
|
9
|
+
*
|
|
10
|
+
* - **Strip the postal mark 〒 (U+3012).** The JP cheap-probe found 〒 is byte-fallback OOV for the
|
|
11
|
+
* SentencePiece tokenizer — it fragments into raw UTF-8 byte pieces and poisons the parse of
|
|
12
|
+
* the digits right after it (the postcode gets mislabeled as a house number). It's a
|
|
13
|
+
* "postcode follows" marker with no addressing content of its own, so dropping it is safe and
|
|
14
|
+
* fixes the bug.
|
|
15
|
+
* - **Fold full-width ASCII (U+FF01–U+FF5E → U+0021–U+007E).** A full-width `1` is always the digit
|
|
16
|
+
* 1, a full-width `-` always a hyphen — keyboards and copy-paste produce these constantly.
|
|
17
|
+
* Folding them to ASCII makes `１０４－００６１` and `104-0061` the same input.
|
|
18
|
+
* - **Fold the ideographic space (U+3000 → ' ').**
|
|
19
|
+
*
|
|
20
|
+
* It deliberately does NOT convert **kanji numerals** (一二三…): place names carry numeral kanji as
|
|
21
|
+
* ordinary characters (三田 _Mita_, 四谷 _Yotsuya_), so a blind 三→3 would corrupt them.
|
|
22
|
+
* Disambiguating "this 三 is a block number, that one is part of a name" is parsing, not
|
|
23
|
+
* normalization — deferred. Kana→kanji transliteration (ちょうめ→丁目) is dictionary work and likewise
|
|
24
|
+
* deferred.
|
|
25
|
+
*
|
|
26
|
+
* Self-gating: a string with none of these characters returns identity, so Latin input is
|
|
27
|
+
* untouched.
|
|
28
|
+
*/
|
|
29
|
+
import { identityMap } from "./offset-map.js";
|
|
30
|
+
// Full-width ASCII block: U+FF01 (full-width "!") through U+FF5E (full-width "~").
const FULLWIDTH_START = 0xff01;
const FULLWIDTH_END = 0xff5e;
// Distance from a full-width form down to its ASCII counterpart: U+FF01 − U+0021 = 0xFEE0.
const FULLWIDTH_TO_ASCII = FULLWIDTH_START - 0x21;
// U+3000, the CJK full-width space.
const IDEOGRAPHIC_SPACE = 0x3000;
// U+3012, the Japanese postal mark 〒.
const POSTAL_MARK = 0x3012;
|
|
35
|
+
/**
 * Character-level CJK cleanup: removes the postal mark, rewrites full-width ASCII to plain ASCII,
 * and turns the ideographic space into a regular space. Every other UTF-16 unit is copied as is.
 *
 * @param input Text to normalize.
 * @returns `{ text, map, folded, stripped }`. `map[i]` is the input index that produced
 *   `text[i]`; `folded` counts in-place rewrites and `stripped` counts removed postal marks.
 *   When nothing was rewritten or removed, the original string is returned with the map from
 *   `identityMap(input.length)`.
 */
export function applyCjkNormalization(input) {
    const pieces = [];
    const offsets = [];
    let foldCount = 0;
    let dropCount = 0;
    // Every code point we rewrite lives in the BMP (one UTF-16 unit) and everything else is
    // copied unit by unit, so stepping per unit keeps surrogate pairs intact.
    let idx = 0;
    while (idx < input.length) {
        const here = idx;
        const unit = input.charCodeAt(here);
        idx += 1;
        if (unit === POSTAL_MARK) {
            // Dropped outright: no output character and no map entry. Any gap it leaves is
            // tidied by the later whitespace collapse.
            dropCount += 1;
            continue;
        }
        let emitted;
        if (unit === IDEOGRAPHIC_SPACE) {
            emitted = " ";
            foldCount += 1;
        }
        else if (unit >= FULLWIDTH_START && unit <= FULLWIDTH_END) {
            emitted = String.fromCharCode(unit - FULLWIDTH_TO_ASCII);
            foldCount += 1;
        }
        else {
            emitted = input[here];
        }
        pieces.push(emitted);
        offsets.push(here);
    }
    const untouched = foldCount === 0 && dropCount === 0;
    return untouched
        ? { text: input, map: identityMap(input.length), folded: 0, stripped: 0 }
        : { text: pieces.join(""), map: offsets, folded: foldCount, stripped: dropCount };
}
|
|
68
|
+
//# sourceMappingURL=cjk.js.map
|
package/out/cjk.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cjk.js","sourceRoot":"","sources":["../cjk.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAW7C,MAAM,eAAe,GAAG,MAAM,CAAA,CAAC,IAAI;AACnC,MAAM,aAAa,GAAG,MAAM,CAAA,CAAC,IAAI;AACjC,MAAM,kBAAkB,GAAG,MAAM,CAAA,CAAC,2BAA2B;AAC7D,MAAM,iBAAiB,GAAG,MAAM,CAAA;AAChC,MAAM,WAAW,GAAG,MAAM,CAAA,CAAC,IAAI;AAE/B,MAAM,UAAU,qBAAqB,CAAC,KAAa;IAClD,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,QAAQ,GAAG,CAAC,CAAA;IAChB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IAExB,gGAAgG;IAChG,oFAAoF;IACpF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAA;QAChC,IAAI,IAAI,KAAK,WAAW,EAAE,CAAC;YAC1B,QAAQ,IAAI,CAAC,CAAA;YACb,SAAQ,CAAC,yEAAyE;QACnF,CAAC;QACD,IAAI,IAAI,IAAI,eAAe,IAAI,IAAI,IAAI,aAAa,EAAE,CAAC;YACtD,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC,IAAI,GAAG,kBAAkB,CAAC,CAAC,CAAA;YACxD,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,MAAM,IAAI,CAAC,CAAA;YACX,SAAQ;QACT,CAAC;QACD,IAAI,IAAI,KAAK,iBAAiB,EAAE,CAAC;YAChC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YACb,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,MAAM,IAAI,CAAC,CAAA;YACX,SAAQ;QACT,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC,CAAA;QACnB,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACZ,CAAC;IAED,IAAI,MAAM,KAAK,CAAC,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;QACpC,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,CAAA;IAC/E,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAA;AACrD,CAAC"}
|
package/out/compute.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `normalize(raw, opts)` — the Stage 1 entry point. Composes NFC + punctuation + whitespace
|
|
7
|
+
* (always) with case-fold + abbreviation expansion (opt-in).
|
|
8
|
+
*/
|
|
9
|
+
import type { NormalizedInput, NormalizeOpts } from "./types.js";
|
|
10
|
+
/**
 * Run the Stage 1 normalization pipeline over `raw`.
 *
 * @param raw - The input string exactly as received.
 * @param opts - Optional settings controlling the opt-in and skippable steps; see `NormalizeOpts`.
 * @returns The normalized text together with the transform log and the offset map back into
 *   `raw`; see `NormalizedInput`.
 */
export declare function normalize(raw: string, opts?: NormalizeOpts): NormalizedInput;
|
|
11
|
+
//# sourceMappingURL=compute.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compute.d.ts","sourceRoot":"","sources":["../compute.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAOH,OAAO,KAAK,EAA0B,eAAe,EAAE,aAAa,EAAE,MAAM,YAAY,CAAA;AAGxF,wBAAgB,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,aAAa,GAAG,eAAe,CA2E5E"}
|
package/out/compute.js
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `normalize(raw, opts)` — the Stage 1 entry point. Composes NFC + punctuation + whitespace
|
|
7
|
+
* (always) with case-fold + abbreviation expansion (opt-in).
|
|
8
|
+
*/
|
|
9
|
+
import { expandAbbreviations } from "./abbreviations.js";
|
|
10
|
+
import { applyCjkNormalization } from "./cjk.js";
|
|
11
|
+
import { applyNfc } from "./nfc.js";
|
|
12
|
+
import { composeMaps, identityMap } from "./offset-map.js";
|
|
13
|
+
import { applyPunctuation } from "./punctuation.js";
|
|
14
|
+
import { collapseWhitespace } from "./whitespace.js";
|
|
15
|
+
/**
 * Lower-case `text` for the case-fold step and work out how the result lines up with `text`.
 *
 * `toLocaleLowerCase` is usually length-preserving, but not always: for example U+0130 (`İ`)
 * lower-cases to `i` + U+0307 outside Turkic locales. When the length is unchanged no map is
 * returned and the caller keeps its existing offsets. When it changes, the string is folded one
 * code point at a time so each output unit can be traced to the input unit it came from.
 *
 * @param text String to fold.
 * @param locale Locale tag passed through to `toLocaleLowerCase` (may be undefined).
 * @returns `{ text, map }`; `map` is `undefined` when output and input have the same length.
 */
function caseFoldWithMap(text, locale) {
    const folded = text.toLocaleLowerCase(locale);
    if (folded.length === text.length) {
        return { text: folded, map: undefined };
    }
    const pieces = [];
    const map = [];
    let i = 0;
    while (i < text.length) {
        // Step by code point so a surrogate pair is folded as one character.
        const width = (text.codePointAt(i) ?? 0) > 0xffff ? 2 : 1;
        const piece = text.slice(i, i + width).toLocaleLowerCase(locale);
        pieces.push(piece);
        for (let k = 0; k < piece.length; k++)
            map.push(i + Math.min(k, width - 1));
        i += width;
    }
    const joined = pieces.join("");
    // Prefer the whole-string fold (it applies context rules such as final sigma) whenever it
    // lines up unit-for-unit with the per-code-point fold; otherwise fall back to the text the
    // map was actually built for, so text and map can never disagree in length.
    return { text: joined.length === folded.length ? folded : joined, map };
}
/**
 * Stage 1 entry point. Runs, in order: NFC (unless `opts.skipNfc`), CJK normalization,
 * punctuation normalization, whitespace collapse, then the opt-in abbreviation expansion
 * (`opts.expandAbbreviations`) and case fold (`opts.caseFold`). Each step that changes the text
 * composes its offsets into the running map and appends an entry to `transforms`; the NFC step
 * logs an entry whenever it runs, with `changed` saying whether it altered anything.
 *
 * @param raw The input string as received.
 * @param opts Optional settings: `skipNfc`, `expandAbbreviations`, `caseFold`, `locale`.
 * @returns A frozen object `{ raw, normalized, transforms, offsetMap, appliedLocale }`. The
 *   object and its `transforms` array are frozen; the `offsetMap` array itself is not.
 */
export function normalize(raw, opts) {
    const transforms = [];
    let text = raw;
    let map = identityMap(raw.length);
    // 1. NFC
    if (!opts?.skipNfc) {
        const r = applyNfc(text);
        text = r.text;
        map = composeMaps(map, r.map);
        transforms.push({ kind: "nfc", changed: r.changed });
    }
    // 1.5 CJK normalization — strip the postal mark 〒 (byte-fallback OOV that poisons the postcode
    // parse) and fold full-width ASCII + the ideographic space. Runs after NFC so it sees composed
    // forms, before punctuation/whitespace so any gap left by 〒 is then collapsed. No-op off-script.
    {
        const r = applyCjkNormalization(text);
        if (r.folded > 0 || r.stripped > 0) {
            text = r.text;
            map = composeMaps(map, r.map);
            transforms.push({ kind: "normalize_cjk", folded: r.folded, stripped: r.stripped });
        }
    }
    // 2. Punctuation
    {
        const r = applyPunctuation(text);
        if (r.replacements > 0) {
            text = r.text;
            map = composeMaps(map, r.map);
            transforms.push({ kind: "normalize_punctuation", replacements: r.replacements });
        }
    }
    // 3. Whitespace
    {
        const r = collapseWhitespace(text);
        if (r.runs > 0 || r.text.length !== text.length) {
            text = r.text;
            map = composeMaps(map, r.map);
            transforms.push({ kind: "collapse_whitespace", runs: r.runs });
        }
    }
    // 4. Abbreviation expansion (opt-in) — runs BEFORE case-fold so case-folding the canonical
    // expansion form (e.g. "Street") gives a consistent final case.
    if (opts?.expandAbbreviations) {
        const r = expandAbbreviations(text, opts.locale);
        if (r.expansions.length > 0) {
            text = r.text;
            map = composeMaps(map, r.map);
            for (const e of r.expansions) {
                transforms.push({ kind: "expand_abbreviation", from: e.from, to: e.to, at: e.at });
            }
        }
    }
    // 5. Case fold (opt-in)
    if (opts?.caseFold) {
        const folded = caseFoldWithMap(text, opts.locale);
        if (folded.text !== text) {
            text = folded.text;
            // Same-length folds keep the existing offsets; a length-changing fold supplies its
            // own map, composed exactly like every other step so offsetMap stays aligned.
            if (folded.map !== undefined) {
                map = composeMaps(map, folded.map);
            }
            transforms.push({ kind: "case_fold", locale: opts.locale ?? "und" });
        }
    }
    return Object.freeze({
        raw,
        normalized: text,
        transforms: Object.freeze(transforms),
        offsetMap: map,
        appliedLocale: opts?.locale,
    });
}
|
|
84
|
+
//# sourceMappingURL=compute.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"compute.js","sourceRoot":"","sources":["../compute.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACxD,OAAO,EAAE,qBAAqB,EAAE,MAAM,UAAU,CAAA;AAChD,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAA;AACnC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA;AAEpD,MAAM,UAAU,SAAS,CAAC,GAAW,EAAE,IAAoB;IAC1D,MAAM,UAAU,GAA6B,EAAE,CAAA;IAC/C,IAAI,IAAI,GAAG,GAAG,CAAA;IACd,IAAI,GAAG,GAAG,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;IAEjC,SAAS;IACT,IAAI,CAAC,IAAI,EAAE,OAAO,EAAE,CAAC;QACpB,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAA;QACxB,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;QACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;QAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,CAAA;IACrD,CAAC;IAED,+FAA+F;IAC/F,+FAA+F;IAC/F,iGAAiG;IACjG,CAAC;QACA,MAAM,CAAC,GAAG,qBAAqB,CAAC,IAAI,CAAC,CAAA;QACrC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,CAAC,QAAQ,GAAG,CAAC,EAAE,CAAC;YACpC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,eAAe,EAAE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAA;QACnF,CAAC;IACF,CAAC;IAED,iBAAiB;IACjB,CAAC;QACA,MAAM,CAAC,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAA;QAChC,IAAI,CAAC,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YACxB,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,uBAAuB,EAAE,YAAY,EAAE,CAAC,CAAC,YAAY,EAAE,CAAC,CAAA;QACjF,CAAC;IACF,CAAC;IAED,gBAAgB;IAChB,CAAC;QACA,MAAM,CAAC,GAAG,kBAAkB,CAAC,IAAI,CAAC,CAAA;QAClC,IAAI,CAAC,CAAC,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;YACjD,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;QAC/D,CAAC;IACF,CAAC;IAED,2FAA2F;IAC3F,gEAAgE;IAChE,IAAI,IAAI,EAAE,
mBAAmB,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,mBAAmB,CAAC,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;QAChD,IAAI,CAAC,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7B,IAAI,GAAG,CAAC,CAAC,IAAI,CAAA;YACb,GAAG,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAA;YAC7B,KAAK,MAAM,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;gBAC9B,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,qBAAqB,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;YACnF,CAAC;QACF,CAAC;IACF,CAAC;IAED,wBAAwB;IACxB,IAAI,IAAI,EAAE,QAAQ,EAAE,CAAC;QACpB,MAAM,EAAE,GAAG,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9C,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YACjB,IAAI,GAAG,EAAE,CAAA;YACT,sEAAsE;YACtE,UAAU,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,IAAI,KAAK,EAAE,CAAC,CAAA;QACrE,CAAC;IACF,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC;QACpB,GAAG;QACH,UAAU,EAAE,IAAI;QAChB,UAAU,EAAE,MAAM,CAAC,MAAM,CAAC,UAAU,CAA6B;QACjE,SAAS,EAAE,GAAG;QACd,aAAa,EAAE,IAAI,EAAE,MAAM;KAC3B,CAA2B,CAAA;AAC7B,CAAC"}
|
package/out/index.d.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `@mailwoman/normalize` — Stage 1 of the runtime pipeline.
|
|
7
|
+
*
|
|
8
|
+
* Deterministic input preprocessing: NFC, punctuation, whitespace, optional case-fold +
|
|
9
|
+
* abbreviation expansion. Pure functions. Produces a `NormalizedInput` with a load-bearing
|
|
10
|
+
* `offsetMap` so downstream stages can map normalized-string spans back to raw-string character
|
|
11
|
+
* offsets.
|
|
12
|
+
*
|
|
13
|
+
* See `docs/articles/plan/reference/STAGES.md` § Stage 1 for the contract.
|
|
14
|
+
*/
|
|
15
|
+
export { expandAbbreviations } from "./abbreviations.js";
|
|
16
|
+
export { applyCjkNormalization, type CjkResult } from "./cjk.js";
|
|
17
|
+
export { normalize } from "./compute.js";
|
|
18
|
+
export { applyNfc } from "./nfc.js";
|
|
19
|
+
export { composeMaps, identityMap } from "./offset-map.js";
|
|
20
|
+
export { applyPunctuation } from "./punctuation.js";
|
|
21
|
+
export type { NormalizationTransform, NormalizeOpts, NormalizedInput, SpanRange } from "./types.js";
|
|
22
|
+
export { collapseWhitespace } from "./whitespace.js";
|
|
23
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACxD,OAAO,EAAE,qBAAqB,EAAE,KAAK,SAAS,EAAE,MAAM,UAAU,CAAA;AAChE,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAA;AACxC,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAA;AACnC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,YAAY,EAAE,sBAAsB,EAAE,aAAa,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AACnG,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA"}
|
package/out/index.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* `@mailwoman/normalize` — Stage 1 of the runtime pipeline.
|
|
7
|
+
*
|
|
8
|
+
* Deterministic input preprocessing: NFC, punctuation, whitespace, optional case-fold +
|
|
9
|
+
* abbreviation expansion. Pure functions. Produces a `NormalizedInput` with a load-bearing
|
|
10
|
+
* `offsetMap` so downstream stages can map normalized-string spans back to raw-string character
|
|
11
|
+
* offsets.
|
|
12
|
+
*
|
|
13
|
+
* See `docs/articles/plan/reference/STAGES.md` § Stage 1 for the contract.
|
|
14
|
+
*/
|
|
15
|
+
export { expandAbbreviations } from "./abbreviations.js";
|
|
16
|
+
export { applyCjkNormalization } from "./cjk.js";
|
|
17
|
+
export { normalize } from "./compute.js";
|
|
18
|
+
export { applyNfc } from "./nfc.js";
|
|
19
|
+
export { composeMaps, identityMap } from "./offset-map.js";
|
|
20
|
+
export { applyPunctuation } from "./punctuation.js";
|
|
21
|
+
export { collapseWhitespace } from "./whitespace.js";
|
|
22
|
+
//# sourceMappingURL=index.js.map
|
package/out/index.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAA;AACxD,OAAO,EAAE,qBAAqB,EAAkB,MAAM,UAAU,CAAA;AAChE,OAAO,EAAE,SAAS,EAAE,MAAM,cAAc,CAAA;AACxC,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAA;AACnC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAC1D,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAA"}
|
package/out/nfc.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Unicode NFC normalization. For inputs already in NFC (the common case) this is a no-op. When the
|
|
7
|
+
* input has combining characters (`e` + `́` → `é`), NFC composes them — the normalized string can
|
|
8
|
+
* be shorter than the raw.
|
|
9
|
+
*
|
|
10
|
+
* Approximation: we walk the input grapheme-by-grapheme (best effort via codepoint stepping) and
|
|
11
|
+
* map each output index to the start of its source sequence. Rare CJK edge cases involving
|
|
12
|
+
* variation selectors may produce off-by-one offsets — acceptable for v1.
|
|
13
|
+
*/
|
|
14
|
+
/** What `applyNfc` hands back: the normalized text and a map from it into the input. */
export interface NfcResult {
    /** The text after NFC normalization. */
    text: string;
    /** Offset map: `text[i]` originated at `input[map[i]]`. */
    map: number[];
    /** Whether normalization produced a string that differs from the input. */
    changed: boolean;
}
|
|
20
|
+
/**
 * Normalize `input` to Unicode NFC.
 *
 * @param input Raw string to normalize.
 * @returns The normalized text, a map from output indices back to input indices, and a flag
 *   telling whether the text changed.
 */
export declare function applyNfc(input: string): NfcResult;
|
|
21
|
+
//# sourceMappingURL=nfc.d.ts.map
|
package/out/nfc.d.ts.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"nfc.d.ts","sourceRoot":"","sources":["../nfc.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAIH,MAAM,WAAW,SAAS;IACzB,IAAI,EAAE,MAAM,CAAA;IACZ,2CAA2C;IAC3C,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,OAAO,EAAE,OAAO,CAAA;CAChB;AAED,wBAAgB,QAAQ,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,CAMjD"}
|
package/out/nfc.js
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Unicode NFC normalization. For inputs already in NFC (the common case) this is a no-op. When the
|
|
7
|
+
* input has combining characters (`e` + `́` → `é`), NFC composes them — the normalized string can
|
|
8
|
+
* be shorter than the raw.
|
|
9
|
+
*
|
|
10
|
+
* Approximation: we walk the input grapheme-by-grapheme (best effort via codepoint stepping) and
|
|
11
|
+
* map each output index to the start of its source sequence. Rare CJK edge cases involving
|
|
12
|
+
* variation selectors may produce off-by-one offsets — acceptable for v1.
|
|
13
|
+
*/
|
|
14
|
+
import { identityMap } from "./offset-map.js";
|
|
15
|
+
/**
 * Normalize `input` to Unicode NFC and report where each output position came from.
 *
 * Input that is already in NFC is returned with an identity map and `changed: false`;
 * otherwise the map is estimated by walking the input and the normalized text in parallel.
 */
export function applyNfc(input) {
    const composed = input.normalize("NFC");
    const changed = composed !== input;
    const map = changed ? estimateNfcMap(input, composed) : identityMap(input.length);
    return { text: composed, map, changed };
}
|
|
22
|
+
/**
 * Estimate, for every UTF-16 code unit of `output`, the index in `input` where its source
 * sequence starts.
 *
 * Both strings are walked in parallel, one code point at a time. Each output code point is
 * assumed to come from one input code point plus any combining marks in U+0300–U+036F that
 * directly follow it. This is a heuristic: compositions that involve marks outside that range,
 * or NFC results that reorder or expand, yield approximate offsets.
 *
 * Guarantees:
 * - The result has exactly `output.length` entries. An astral code point (surrogate pair)
 *   contributes two entries sharing one source index, so `map[i]` exists for every `output[i]`.
 * - Every entry is a valid index into `input`; once the input is exhausted (NFC expanded a
 *   character), remaining entries point at the last input code unit.
 *
 * @param input The original string.
 * @param output The NFC-normalized form of `input`.
 * @returns Offsets into `input`, one per UTF-16 code unit of `output`.
 */
function estimateNfcMap(input, output) {
    const map = [];
    const lastInIdx = input.length - 1;
    let inIdx = 0;
    let outIdx = 0;
    while (outIdx < output.length) {
        const outUnits = output.codePointAt(outIdx) > 0xffff ? 2 : 1;
        // One entry per code unit keeps the map index-aligned with `output`.
        const source = Math.min(inIdx, lastInIdx);
        for (let k = 0; k < outUnits; k++) {
            map.push(source);
        }
        outIdx += outUnits;
        if (inIdx >= input.length) {
            continue;
        }
        // Consume one input code point, then absorb the combining marks attached to it.
        inIdx += input.codePointAt(inIdx) > 0xffff ? 2 : 1;
        while (inIdx < input.length) {
            const unit = input.charCodeAt(inIdx);
            if (unit < 0x0300 || unit > 0x036f) {
                break;
            }
            inIdx += 1; // marks in this range are always a single code unit
        }
    }
    return map;
}
|
|
53
|
+
//# sourceMappingURL=nfc.js.map
|
package/out/nfc.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"nfc.js","sourceRoot":"","sources":["../nfc.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAS7C,MAAM,UAAU,QAAQ,CAAC,KAAa;IACrC,MAAM,UAAU,GAAG,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;IACzC,IAAI,UAAU,KAAK,KAAK,EAAE,CAAC;QAC1B,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,CAAA;IACvE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,UAAU,EAAE,GAAG,EAAE,cAAc,CAAC,KAAK,EAAE,UAAU,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,CAAA;AACnF,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc,CAAC,KAAa,EAAE,MAAc;IACpD,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,IAAI,MAAM,GAAG,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE,CAAC;QACvD,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;QACf,MAAM,KAAK,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAE,CAAA;QACzC,MAAM,OAAO,GAAG,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACtC,gGAAgG;QAChG,IAAI,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC1B,MAAM,IAAI,GAAG,KAAK,CAAC,WAAW,CAAC,KAAK,CAAE,CAAA;YACtC,KAAK,IAAI,IAAI,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;YAC9B,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;gBAC7B,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC,KAAK,CAAE,CAAA;gBACxC,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,EAAE,CAAC;oBAC1C,KAAK,IAAI,MAAM,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;gBACjC,CAAC;qBAAM,CAAC;oBACP,MAAK;gBACN,CAAC;YACF,CAAC;QACF,CAAC;QACD,IAAI,OAAO,KAAK,CAAC;YAAE,MAAM,IAAI,CAAC,CAAA;IAC/B,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Utilities for composing per-transform offset maps into the final `raw → normalized` map.
|
|
7
|
+
*/
|
|
8
|
+
/** Identity map for an input of length `n`: `[0, 1, 2, ..., n-1]`. */
|
|
9
|
+
/**
 * @param n Length of the input the map describes.
 * @returns An array `m` of length `n` with `m[i] === i`.
 */
export declare function identityMap(n: number): number[];
|
|
10
|
+
/**
|
|
11
|
+
* Compose `inputMap` (input → raw) with `transformMap` (output → input) to produce `outputMap`
|
|
12
|
+
* (output → raw).
|
|
13
|
+
*
|
|
14
|
+
* @example // raw = "350  5th" (chars 0..7, double space at 3-4) // input = "350  5th" (identity from
|
|
15
|
+
* raw, length 8) // output = "350 5th" (whitespace collapsed, length 7) // inputMap =
|
|
16
|
+
* [0,1,2,3,4,5,6,7] // transformMap = [0,1,2,3,5,6,7] (output[3]=' ' came from input[3];
|
|
17
|
+
* output[4]='5' from input[5]) // composed = [0,1,2,3,5,6,7]
|
|
18
|
+
*/
|
|
19
|
+
/**
 * @param inputMap Map from the transform's input indices to raw indices.
 * @param transformMap Map from the transform's output indices to its input indices.
 * @returns Map from output indices to raw indices, same length as `transformMap`.
 */
export declare function composeMaps(inputMap: number[], transformMap: number[]): number[];
|
|
20
|
+
//# sourceMappingURL=offset-map.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"offset-map.d.ts","sourceRoot":"","sources":["../offset-map.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,sEAAsE;AACtE,wBAAgB,WAAW,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAI/C;AAED;;;;;;;;GAQG;AACH,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,EAAE,YAAY,EAAE,MAAM,EAAE,GAAG,MAAM,EAAE,CAOhF"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Utilities for composing per-transform offset maps into the final `raw → normalized` map.
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Build the identity offset map for an input of length `n`, i.e. `[0, 1, 2, ..., n-1]`.
 */
export function identityMap(n) {
    const indices = new Array(n);
    let k = 0;
    while (k < n) {
        indices[k] = k;
        k += 1;
    }
    return indices;
}
|
|
15
|
+
/**
 * Chain two offset maps: `inputMap` (input → raw) and `transformMap` (output → input) combine
 * into a single output → raw map.
 *
 * An index that `inputMap` has no entry for is passed through unchanged.
 *
 * @example
 * // raw          = "350  5th"  (length 8, double space at 3-4)
 * // input        = "350  5th"  (identity from raw)
 * // output       = "350 5th"   (whitespace collapsed, length 7)
 * // inputMap     = [0,1,2,3,4,5,6,7]
 * // transformMap = [0,1,2,3,5,6,7]  (output[3]=' ' came from input[3]; output[4]='5' from input[5])
 * // composed     = [0,1,2,3,5,6,7]
 */
export function composeMaps(inputMap, transformMap) {
    return Array.from(transformMap, (sourceIdx) => inputMap[sourceIdx] ?? sourceIdx);
}
|
|
32
|
+
//# sourceMappingURL=offset-map.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"offset-map.js","sourceRoot":"","sources":["../offset-map.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,sEAAsE;AACtE,MAAM,UAAU,WAAW,CAAC,CAAS;IACpC,MAAM,CAAC,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;IAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;IACpC,OAAO,CAAC,CAAA;AACT,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,WAAW,CAAC,QAAkB,EAAE,YAAsB;IACrE,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,YAAY,CAAC,MAAM,CAAC,CAAA;IAClD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC9C,MAAM,CAAC,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;QAC1B,GAAG,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;IAC1B,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Punctuation normalization — fancy quotes / dashes to ASCII equivalents. Identity-length: every
|
|
7
|
+
* fancy character is a single codepoint that maps to a single ASCII char — except `…`, which expands to `...`, so the output can be longer than the input.
|
|
8
|
+
*/
|
|
9
|
+
/** What `applyPunctuation` hands back. */
export interface PunctuationResult {
    /** The text with typographic punctuation rewritten to ASCII. */
    text: string;
    /** Offset map: `text[i]` originated at `input[map[i]]`. */
    map: number[];
    /** How many input characters were replaced. */
    replacements: number;
}
|
|
14
|
+
/**
 * Rewrite typographic punctuation in `input` to ASCII equivalents.
 *
 * @param input Text to scan.
 * @returns The rewritten text, a map from output indices back to input indices, and the number
 *   of input characters that were replaced.
 */
export declare function applyPunctuation(input: string): PunctuationResult;
|
|
15
|
+
//# sourceMappingURL=punctuation.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"punctuation.d.ts","sourceRoot":"","sources":["../punctuation.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAiBH,MAAM,WAAW,iBAAiB;IACjC,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,YAAY,EAAE,MAAM,CAAA;CACpB;AAED,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,MAAM,GAAG,iBAAiB,CA0BjE"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Punctuation normalization — fancy quotes / dashes to ASCII equivalents. Identity-length: every
|
|
7
|
+
* fancy character is a single codepoint that maps to a single ASCII char — except `…`, which expands to `...`, so the output can be longer than the input.
|
|
8
|
+
*/
|
|
9
|
+
import { identityMap } from "./offset-map.js";
|
|
10
|
+
/**
 * Typographic characters and their ASCII replacements.
 *
 * Keys are written as `\u` escapes so that look-alike or invisible characters (notably the
 * no-break space) cannot be confused with, or silently converted into, their ASCII twins.
 * Every key is a single UTF-16 code unit, which is what lets `applyPunctuation` look the
 * input up one code unit at a time.
 */
const REPLACEMENTS = new Map([
    ["\u2018", "'"], // ‘ left single quotation mark
    ["\u2019", "'"], // ’ right single quotation mark
    ["\u201c", '"'], // “ left double quotation mark
    ["\u201d", '"'], // ” right double quotation mark
    ["\u2013", "-"], // – en dash
    ["\u2014", "-"], // — em dash
    ["\u2212", "-"], // − minus sign — Japanese IMEs emit this as the block separator (1−2−3)
    ["\u2015", "-"], // ― horizontal bar — another common JP block separator
    ["\u2026", "..."], // … horizontal ellipsis — the only entry that expands (1 char → 3)
    ["\u00a0", " "], // no-break space
]);
/**
 * Rewrite typographic punctuation in `input` to ASCII equivalents.
 *
 * @param input Text to scan.
 * @returns `text` — the rewritten string (the input itself when nothing matched);
 *   `map` — for each output index, the input index it came from (all characters of an
 *   expanded replacement point at the single source character);
 *   `replacements` — number of input characters that were replaced.
 */
export function applyPunctuation(input) {
    const out = [];
    const map = [];
    let replacements = 0;
    for (let i = 0; i < input.length; i++) {
        const ch = input[i];
        const sub = REPLACEMENTS.get(ch);
        if (sub === undefined) {
            out.push(ch);
            map.push(i);
            continue;
        }
        replacements += 1;
        // A replacement can be longer than its source; each emitted char maps back to `i`.
        for (const piece of sub) {
            out.push(piece);
            map.push(i);
        }
    }
    if (replacements === 0) {
        return { text: input, map: identityMap(input.length), replacements: 0 };
    }
    return { text: out.join(""), map, replacements };
}
|
|
48
|
+
//# sourceMappingURL=punctuation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"punctuation.js","sourceRoot":"","sources":["../punctuation.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAE7C,MAAM,YAAY,GAAG,IAAI,GAAG,CAAiB;IAC5C,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,IAAI;IAChB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,YAAY;IACxB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,YAAY;IACxB,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,+EAA+E;IAC3F,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,8DAA8D;IAC1E,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,+BAA+B;IAC7C,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,qBAAqB;CACjC,CAAC,CAAA;AAQF,MAAM,UAAU,gBAAgB,CAAC,KAAa;IAC7C,IAAI,OAAO,GAAG,KAAK,CAAA;IACnB,IAAI,YAAY,GAAG,CAAC,CAAA;IACpB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IAExB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAE,CAAA;QACpB,MAAM,GAAG,GAAG,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;QAChC,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;YACvB,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACZ,CAAC;aAAM,CAAC;YACP,OAAO,GAAG,IAAI,CAAA;YACd,YAAY,IAAI,CAAC,CAAA;YACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACrC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAE,CAAC,CAAA;gBACjB,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACZ,CAAC;QACF,CAAC;IACF,CAAC;IAED,IAAI,CAAC,OAAO,EAAE,CAAC;QACd,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAA;IACxE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,YAAY,EAAE,CAAA;AACjD,CAAC"}
|
package/out/types.d.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * A located piece of text: where it sits and what it says.
 *
 * NOTE(review): whether `end` is inclusive or exclusive, and which string the offsets index
 * into, is not visible from this declaration — confirm against the producers.
 */
export interface SpanRange {
    /** Offset at which the span begins. */
    start: number;
    /** Offset at which the span ends. */
    end: number;
    /** The text of the span. */
    body: string;
}
|
|
11
|
+
/**
 * One entry of `NormalizedInput.transforms`: a record of a single normalization step,
 * discriminated by `kind`.
 */
export type NormalizationTransform =
    | { kind: "nfc"; changed: boolean }
    | { kind: "case_fold"; locale: string }
    | { kind: "expand_abbreviation"; from: string; to: string; at: SpanRange }
    | { kind: "collapse_whitespace"; runs: number }
    | { kind: "normalize_punctuation"; replacements: number }
    | { kind: "normalize_cjk"; folded: number; stripped: number };
|
|
34
|
+
/**
 * What `normalize()` produces for a raw input string.
 *
 * `offsetMap` ties the two strings together: `normalized[i]` came from `raw[offsetMap[i]]`.
 * When several source characters stand behind one output character (NFC composition,
 * whitespace collapse, abbreviation expansion), the entry points to the FIRST source
 * character by convention.
 */
export interface NormalizedInput {
    /** The string exactly as the caller supplied it. */
    raw: string;
    /** The canonical form, with every transform applied. */
    normalized: string;
    /** The steps that were performed, in order. */
    transforms: NormalizationTransform[];
    /** Index into `raw` for each index of `normalized`; same length as `normalized`. */
    offsetMap: number[];
    /** Locale that drove case-folding and abbreviation rules. */
    appliedLocale?: string;
}
|
|
53
|
+
/** Options accepted by `normalize()`. Every field is optional. */
export interface NormalizeOpts {
    /** Locale hint that selects case-folding rules and abbreviation dictionaries. */
    locale?: string;
    /** Lowercase the text in a locale-aware way. Default: false, so downstream consumers see the original case. */
    caseFold?: boolean;
    /** Expand known abbreviations (`St` → `Street`, `NW` → `Northwest`, etc.). Default: false. */
    expandAbbreviations?: boolean;
    /** Bypass Unicode NFC. A debugging aid only — production callers should leave NFC on. */
    skipNfc?: boolean;
}
|
|
63
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;IACX,IAAI,EAAE,MAAM,CAAA;CACZ;AAED,gFAAgF;AAChF,MAAM,MAAM,sBAAsB,GAC/B;IAAE,IAAI,EAAE,KAAK,CAAC;IAAC,OAAO,EAAE,OAAO,CAAA;CAAE,GACjC;IAAE,IAAI,EAAE,WAAW,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,GACrC;IAAE,IAAI,EAAE,qBAAqB,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,MAAM,CAAC;IAAC,EAAE,EAAE,SAAS,CAAA;CAAE,GACxE;IAAE,IAAI,EAAE,qBAAqB,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,GAC7C;IAAE,IAAI,EAAE,uBAAuB,CAAC;IAAC,YAAY,EAAE,MAAM,CAAA;CAAE,GACvD;IAAE,IAAI,EAAE,eAAe,CAAC;IAAC,MAAM,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAA;CAAE,CAAA;AAE9D;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC/B,uCAAuC;IACvC,GAAG,EAAE,MAAM,CAAA;IAEX,8CAA8C;IAC9C,UAAU,EAAE,MAAM,CAAA;IAElB,uCAAuC;IACvC,UAAU,EAAE,sBAAsB,EAAE,CAAA;IAEpC,mFAAmF;IACnF,SAAS,EAAE,MAAM,EAAE,CAAA;IAEnB,6DAA6D;IAC7D,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,aAAa;IAC7B,gEAAgE;IAChE,MAAM,CAAC,EAAE,MAAM,CAAA;IAEf,+FAA+F;IAC/F,QAAQ,CAAC,EAAE,OAAO,CAAA;IAElB,8FAA8F;IAC9F,mBAAmB,CAAC,EAAE,OAAO,CAAA;IAE7B,qFAAqF;IACrF,OAAO,CAAC,EAAE,OAAO,CAAA;CACjB"}
|
package/out/types.js
ADDED
package/out/types.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../types.ts"],"names":[],"mappings":"AAAA;;;;GAIG"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Whitespace collapse — runs of inline whitespace (spaces and tabs) become a single ASCII space; leading and trailing whitespace is trimmed. Newlines are
|
|
7
|
+
* preserved as-is (segmentation grammar in QueryShape uses them); inline runs of spaces
|
|
8
|
+
* collapse.
|
|
9
|
+
*/
|
|
10
|
+
/** What `collapseWhitespace` hands back. */
export interface WhitespaceResult {
    /** The text after collapsing and trimming. */
    text: string;
    /** Offset map: `text[i]` originated at `input[map[i]]`. */
    map: number[];
    /** How many multi-character whitespace runs were collapsed. */
    runs: number;
}
|
|
15
|
+
/**
 * Collapse runs of spaces and tabs into a single ASCII space and trim whitespace from both
 * ends; carriage returns and line feeds inside the text are kept.
 *
 * @param input Text to process.
 * @returns The processed text, a map from output indices back to input indices, and the number
 *   of multi-character runs that were collapsed.
 */
export declare function collapseWhitespace(input: string): WhitespaceResult;
|
|
16
|
+
//# sourceMappingURL=whitespace.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"whitespace.d.ts","sourceRoot":"","sources":["../whitespace.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAOH,MAAM,WAAW,gBAAgB;IAChC,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,EAAE,CAAA;IACb,IAAI,EAAE,MAAM,CAAA;CACZ;AAED,wBAAgB,kBAAkB,CAAC,KAAK,EAAE,MAAM,GAAG,gBAAgB,CAoDlE"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Whitespace collapse — runs of inline whitespace (spaces and tabs) become a single ASCII space; leading and trailing whitespace is trimmed. Newlines are
|
|
7
|
+
* preserved as-is (segmentation grammar in QueryShape uses them); inline runs of spaces
|
|
8
|
+
* collapse.
|
|
9
|
+
*/
|
|
10
|
+
import { identityMap } from "./offset-map.js";
|
|
11
|
+
/** Inline whitespace that collapses into one ASCII space: space and tab. */
const INLINE_SPACE = /[ \t]/;
/** Whitespace trimmed from both ends of the result: inline whitespace plus CR and LF. */
const ANY_SPACE = /[ \t\n\r]/;
/**
 * Collapse inline whitespace and trim the ends.
 *
 * - Every run of spaces/tabs becomes a single ASCII space that maps to the first character
 *   of the run. A lone tab is rewritten to a space as well, so the result never depends on
 *   whether some other part of the string happened to change.
 * - `\n` and `\r` inside the text are copied through untouched (they separate segments).
 * - Leading and trailing whitespace, newlines included, is removed.
 *
 * @param input Text to process.
 * @returns `text` — the processed string (the input itself when nothing changed);
 *   `map` — for each output index, the input index it came from;
 *   `runs` — number of multi-character whitespace runs that were collapsed.
 */
export function collapseWhitespace(input) {
    let changed = false;
    let runs = 0;
    const out = [];
    const map = [];
    let i = 0;
    while (i < input.length) {
        const ch = input[i];
        if (!INLINE_SPACE.test(ch)) {
            // Newlines and ordinary characters are copied as-is.
            out.push(ch);
            map.push(i);
            i += 1;
            continue;
        }
        const start = i;
        do {
            i += 1;
        } while (i < input.length && INLINE_SPACE.test(input[i]));
        out.push(" ");
        map.push(start);
        if (i - start > 1) {
            changed = true;
            runs += 1;
        }
        else if (ch !== " ") {
            // A single tab became a space: the text differs even though no run was collapsed.
            changed = true;
        }
    }
    // Trim leading and trailing whitespace.
    let lead = 0;
    while (lead < out.length && ANY_SPACE.test(out[lead])) {
        lead += 1;
    }
    let trail = out.length;
    while (trail > lead && ANY_SPACE.test(out[trail - 1])) {
        trail -= 1;
    }
    if (lead > 0 || trail < out.length) {
        changed = true;
    }
    if (!changed) {
        return { text: input, map: identityMap(input.length), runs: 0 };
    }
    return { text: out.slice(lead, trail).join(""), map: map.slice(lead, trail), runs };
}
|
|
66
|
+
//# sourceMappingURL=whitespace.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"whitespace.js","sourceRoot":"","sources":["../whitespace.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAA;AAE7C,MAAM,YAAY,GAAG,OAAO,CAAA;AAC5B,MAAM,SAAS,GAAG,WAAW,CAAA;AAQ7B,MAAM,UAAU,kBAAkB,CAAC,KAAa;IAC/C,IAAI,OAAO,GAAG,KAAK,CAAA;IACnB,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,MAAM,GAAG,GAAa,EAAE,CAAA;IACxB,IAAI,CAAC,GAAG,CAAC,CAAA;IAET,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;QACzB,MAAM,EAAE,GAAG,KAAK,CAAC,CAAC,CAAE,CAAA;QACpB,IAAI,EAAE,KAAK,IAAI,IAAI,EAAE,KAAK,IAAI,EAAE,CAAC;YAChC,2CAA2C;YAC3C,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;YACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,CAAC,IAAI,CAAC,CAAA;YACN,SAAQ;QACT,CAAC;QACD,IAAI,YAAY,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;YAC3B,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YACb,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;YACX,MAAM,KAAK,GAAG,CAAC,CAAA;YACf,CAAC,IAAI,CAAC,CAAA;YACN,OAAO,CAAC,GAAG,KAAK,CAAC,MAAM,IAAI,YAAY,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAE,CAAC;gBAAE,CAAC,IAAI,CAAC,CAAA;YAC/D,IAAI,CAAC,GAAG,KAAK,GAAG,CAAC,EAAE,CAAC;gBACnB,OAAO,GAAG,IAAI,CAAA;gBACd,IAAI,IAAI,CAAC,CAAA;YACV,CAAC;YACD,SAAQ;QACT,CAAC;QACD,yBAAyB;QACzB,IAAI,EAAE,KAAK,IAAI,IAAI,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,2EAA2E;QAC5E,CAAC;QACD,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;QACZ,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACX,CAAC,IAAI,CAAC,CAAA;IACP,CAAC;IAED,wCAAwC;IACxC,IAAI,IAAI,GAAG,CAAC,CAAA;IACZ,OAAO,IAAI,GAAG,GAAG,CAAC,MAAM,IAAI,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAE,CAAC;QAAE,IAAI,IAAI,CAAC,CAAA;IACjE,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,CAAA;IACtB,OAAO,KAAK,GAAG,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,CAAE,CAAC;QAAE,KAAK,IAAI,CAAC,CAAA;IAClE,IAAI,IAAI,GAAG,CAAC,IAAI,KAAK,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC;QACpC,OAAO,GAAG,IAAI,CAAA;IACf,CAAC;IACD,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,CAAA;IACzC,MAAM,UAAU,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,CAAA;IAEzC,IAAI,CAAC,OAAO,IAAI,UAAU,CAAC,MAAM,KAAK,KAAK,CAAC,MAAM,EAAE,CAAC;QACpD,OAAO,EAAE,IAAI,EA
AE,KAAK,EAAE,GAAG,EAAE,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAA;IAChE,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,GAAG,EAAE,UAAU,EAAE,IAAI,EAAE,CAAA;AAC5D,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@mailwoman/normalize",
|
|
3
|
+
"version": "4.0.0",
|
|
4
|
+
"description": "Stage 1 of the runtime pipeline — deterministic input preprocessing (Unicode NFC, punctuation, whitespace, abbreviation). Pure functions, no ML.",
|
|
5
|
+
"license": "AGPL-3.0-only",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/sister-software/mailwoman.git",
|
|
9
|
+
"directory": "normalize"
|
|
10
|
+
},
|
|
11
|
+
"type": "module",
|
|
12
|
+
"exports": {
|
|
13
|
+
"./package.json": "./package.json",
|
|
14
|
+
".": "./out/index.js"
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"out/**/*.js",
|
|
18
|
+
"out/**/*.js.map",
|
|
19
|
+
"out/**/*.d.ts",
|
|
20
|
+
"out/**/*.d.ts.map"
|
|
21
|
+
],
|
|
22
|
+
"publishConfig": {
|
|
23
|
+
"access": "public"
|
|
24
|
+
}
|
|
25
|
+
}
|