@lde/text-normalization 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,39 @@
1
+ # @lde/text-normalization
2
+
3
+ Zero-dependency text folding for search index and query normalization.
4
+
5
+ `fold()` produces a diacritic- and case-insensitive form of a string, applied
6
+ **identically at index time and query time** so that a search index never
7
+ diverges from the queries run against it (divergence = silent search misses).
8
+
9
+ ```ts
10
+ import { fold } from '@lde/text-normalization';
11
+
12
+ fold('Møhlmann'); // 'mohlmann'
13
+ fold('Coöperatieve'); // 'cooperatieve'
14
+ fold('Straße'); // 'strasse'
15
+ ```
16
+
17
+ It combines Unicode NFKD decomposition + combining-mark stripping (which folds
18
+ é, ö, å, ç, …) with an explicit transliteration map for letters that do **not**
19
+ decompose under NFKD (ø, æ, œ, ß, ð, þ, ł, đ, …).
20
+
21
+ ## When it’s needed
22
+
23
+ A search engine on its default locale often folds case and diacritics for you –
24
+ Typesense v30 (verified) even folds the non-decomposing `ø`/`æ`/`ß` – so on the
25
+ default locale `fold()` is redundant for _search_. It becomes necessary when:
26
+
27
+ - **Sorting** – engines sort strings by raw code-point order with no collation,
28
+ so a `fold()`-ed companion field is the only way to sort case- and
29
+ diacritic-insensitively.
30
+ - **Stemming** – enabling a language’s stemmer requires a non-default
31
+ `locale`, which switches the tokenizer (Typesense → ICU) to one that
32
+ _preserves_ diacritics; the default folding is lost, and `fold()` restores
33
+ diacritic-insensitive matching.
34
+
35
+ `fold()` is idempotent (`fold(fold(x)) === fold(x)`). Punctuation and word
36
+ boundaries are preserved; tokenization is left to the search engine.
37
+
38
+ Because folded values are stored in the search index, the same `fold()` must be
39
+ used at index time and query time, and any change to its output requires a full rebuild of the index.
package/dist/fold.d.ts ADDED
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Fold a string into a diacritic- and case-insensitive normalized form, applied
3
+ * IDENTICALLY at index time and query time (divergence = silent search misses).
4
+ *
5
+ * Combines lowercasing, Unicode NFKD decomposition, combining-mark stripping and
6
+ * transliteration of non-decomposing letters. Intended to be idempotent:
7
+ * `fold(fold(x)) === fold(x)`. Punctuation and word boundaries are preserved; tokenization is left to the search engine.
8
+ */
9
+ export declare function fold(input: string): string;
10
+ //# sourceMappingURL=fold.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fold.d.ts","sourceRoot":"","sources":["../src/fold.ts"],"names":[],"mappings":"AAiCA;;;;;;;GAOG;AACH,wBAAgB,IAAI,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAM1C"}
package/dist/fold.js ADDED
@@ -0,0 +1,42 @@
1
/**
 * Transliteration map for letters that do NOT decompose under Unicode NFKD
 * normalization, so a plain combining-mark strip cannot fold them. Keys are
 * lowercase; uppercase variants are handled by lowercasing before lookup.
 *
 * The flagship case is ø (#1661: “Møhlmann” must be found by “Mohlmann”).
 * Decomposing letters (é, ö, å, ç, …) are intentionally NOT listed here —
 * NFKD + the combining-mark strip already fold them.
 */
const TRANSLITERATION_MAP = new Map([
    ['ø', 'o'],
    ['æ', 'ae'],
    ['œ', 'oe'],
    ['ß', 'ss'],
    ['ð', 'd'],
    ['þ', 'th'],
    ['ł', 'l'],
    ['đ', 'd'],
    ['ħ', 'h'],
    ['ŋ', 'ng'],
    ['ı', 'i'],
    ['ĸ', 'k'],
]);
// Character class matching exactly the keys of TRANSLITERATION_MAP.
const TRANSLITERATION_PATTERN = new RegExp(`[${[...TRANSLITERATION_MAP.keys()].join('')}]`, 'g');
// Strip all Unicode nonspacing combining marks left behind by NFKD
// decomposition (e.g. the diaeresis in ö, the acute in é).
const COMBINING_MARKS_PATTERN = /\p{Mn}/gu;
/**
 * Fold a string into a diacritic- and case-insensitive normalized form, applied
 * IDENTICALLY at index time and query time (divergence = silent search misses).
 *
 * Steps: NFKD → lowercase → strip combining marks → transliterate
 * non-decomposing letters. Decomposition runs FIRST on purpose:
 *  - precomposed letters such as ǿ (U+01FF) or ǽ (U+01FD) decompose to ø / æ
 *    plus a combining mark, so the transliteration map must see the result of
 *    the decomposition, not the input;
 *  - compatibility characters such as ℃ or ℌ decompose to uppercase ASCII
 *    letters, so lowercasing must also happen after decomposition.
 * Running transliteration/lowercasing before NFKD left 'ø' or 'C' in the
 * output for those inputs and made a second `fold()` change the value again.
 *
 * Punctuation and word boundaries are preserved; tokenization is left to the
 * search engine.
 *
 * @param {string} input Text to normalize.
 * @returns {string} The folded text.
 */
export function fold(input) {
    return input
        .normalize('NFKD')
        .toLowerCase()
        .replace(COMBINING_MARKS_PATTERN, '')
        // The pattern only matches map keys, so the lookup always hits; the
        // fallback merely keeps the callback total.
        .replace(TRANSLITERATION_PATTERN, (letter) => TRANSLITERATION_MAP.get(letter) || letter);
}
@@ -0,0 +1,2 @@
1
+ export { fold } from './fold.js';
2
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
+ export { fold } from './fold.js';
package/package.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "name": "@lde/text-normalization",
3
+ "version": "0.0.0",
4
+ "description": "Zero-dependency text folding (diacritic stripping and transliteration) for search index and query normalization",
5
+ "repository": {
6
+ "url": "git+https://github.com/ldelements/lde.git",
7
+ "directory": "packages/text-normalization"
8
+ },
9
+ "license": "MIT",
10
+ "type": "module",
11
+ "exports": {
12
+ "./package.json": "./package.json",
13
+ ".": {
14
+ "types": "./dist/index.d.ts",
15
+ "import": "./dist/index.js",
16
+ "development": "./src/index.ts",
17
+ "default": "./dist/index.js"
18
+ }
19
+ },
20
+ "main": "./dist/index.js",
21
+ "module": "./dist/index.js",
22
+ "types": "./dist/index.d.ts",
23
+ "files": [
24
+ "dist",
25
+ "!**/*.tsbuildinfo"
26
+ ],
27
+ "dependencies": {
28
+ "tslib": "^2.3.0"
29
+ }
30
+ }