@lde/text-normalization 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -0
- package/dist/fold.d.ts +10 -0
- package/dist/fold.d.ts.map +1 -0
- package/dist/fold.js +42 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1 -0
- package/package.json +30 -0
package/README.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# @lde/text-normalization
|
|
2
|
+
|
|
3
|
+
Zero-dependency text folding for search index and query normalization.
|
|
4
|
+
|
|
5
|
+
`fold()` produces a diacritic- and case-insensitive form of a string, applied
|
|
6
|
+
**identically at index time and query time** so that a search index never
|
|
7
|
+
diverges from the queries run against it (divergence = silent search misses).
|
|
8
|
+
|
|
9
|
+
```ts
|
|
10
|
+
import { fold } from '@lde/text-normalization';
|
|
11
|
+
|
|
12
|
+
fold('Møhlmann'); // 'mohlmann'
|
|
13
|
+
fold('Coöperatieve'); // 'cooperatieve'
|
|
14
|
+
fold('Straße'); // 'strasse'
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
It combines Unicode NFKD decomposition + combining-mark stripping (which folds
|
|
18
|
+
é, ö, å, ç, …) with an explicit transliteration map for letters that do **not**
|
|
19
|
+
decompose under NFKD (ø, æ, œ, ß, ð, þ, ł, đ, …).
|
|
20
|
+
|
|
21
|
+
## When it’s needed
|
|
22
|
+
|
|
23
|
+
A search engine on its default locale often folds case and diacritics for you –
|
|
24
|
+
Typesense v30 (verified) even folds the non-decomposing `ø`/`æ`/`ß` – so on the
|
|
25
|
+
default locale `fold()` is redundant for _search_. It becomes necessary when:
|
|
26
|
+
|
|
27
|
+
- **Sorting** – engines sort strings by raw code-point order with no collation,
|
|
28
|
+
so a `fold()`-ed companion field is the only way to sort case- and
|
|
29
|
+
diacritic-insensitively.
|
|
30
|
+
- **Stemming** – enabling a language’s stemmer requires a non-default
|
|
31
|
+
`locale`, which switches the tokenizer (Typesense → ICU) to one that
|
|
32
|
+
_preserves_ diacritics; the default folding is lost, and `fold()` restores
|
|
33
|
+
diacritic-insensitive matching.
|
|
34
|
+
|
|
35
|
+
`fold()` is idempotent (`fold(fold(x)) === fold(x)`). Punctuation and word
|
|
36
|
+
boundaries are preserved; tokenization is left to the search engine.
|
|
37
|
+
|
|
38
|
+
Because folded values are stored in the search index, the same `fold()` must be
|
|
39
|
+
used at index time and query time, and any change to it requires a full rebuild.
|
package/dist/fold.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Fold a string into a diacritic- and case-insensitive normalized form, to be
 * applied IDENTICALLY at index time and query time (divergence = silent search
 * misses).
 *
 * The fold combines lowercasing, transliteration of letters that do not
 * decompose under NFKD, Unicode NFKD decomposition, and stripping of
 * combining marks. It is intended to be idempotent:
 * `fold(fold(x)) === fold(x)`. No tokenization is performed; that is left to
 * the search engine.
 *
 * @param input - The string to fold.
 * @returns The folded string.
 */
|
|
9
|
+
export declare function fold(input: string): string;
|
|
10
|
+
//# sourceMappingURL=fold.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fold.d.ts","sourceRoot":"","sources":["../src/fold.ts"],"names":[],"mappings":"AAiCA;;;;;;;GAOG;AACH,wBAAgB,IAAI,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAM1C"}
|
package/dist/fold.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
 * Transliteration map for letters that do NOT decompose under Unicode NFKD
 * normalization, so a plain combining-mark strip cannot fold them. Keys are
 * lowercase; uppercase variants are handled by lowercasing before lookup.
 *
 * The flagship case is ø (#1661: “Møhlmann” must be found by “Mohlmann”).
 * Decomposing letters (é, ö, å, ç, …) are intentionally NOT listed here —
 * NFKD + the combining-mark strip already fold them.
 */
const TRANSLITERATION_MAP = {
  ø: 'o',
  æ: 'ae',
  œ: 'oe',
  ß: 'ss',
  ð: 'd',
  þ: 'th',
  ł: 'l',
  đ: 'd',
  ħ: 'h',
  ŋ: 'ng',
  ı: 'i',
  ĸ: 'k',
};

// Character class matching every key of TRANSLITERATION_MAP. All keys are
// single letters, so no regex escaping is needed.
const TRANSLITERATION_PATTERN = new RegExp(`[${Object.keys(TRANSLITERATION_MAP).join('')}]`, 'g');

// Strip all Unicode nonspacing combining marks left behind by NFKD
// decomposition (e.g. the diaeresis in ö, the acute in é).
const COMBINING_MARKS_PATTERN = /\p{Mn}/gu;

/**
 * Fold a string into a diacritic- and case-insensitive normalized form, applied
 * IDENTICALLY at index time and query time (divergence = silent search misses).
 *
 * Steps: NFKD → strip combining marks → lowercase → transliterate
 * non-decomposing letters. Idempotent: `fold(fold(x)) === fold(x)`.
 * Punctuation and word boundaries are preserved; tokenization is left to the
 * search engine.
 *
 * Decomposition deliberately runs FIRST. NFKD can itself produce uppercase
 * letters (™ → "TM") and letters from the transliteration map (ǿ → ø + acute,
 * ǽ → æ + acute). If lowercasing and transliteration ran before NFKD, those
 * characters would slip through unfolded and a second `fold()` would change
 * the result again.
 *
 * NOTE: relative to the previous step order this changes the folded value of
 * such inputs, so indexes holding folded values need a rebuild.
 *
 * @param input The string to fold.
 * @returns The folded string.
 */
export function fold(input) {
  return input
    .normalize('NFKD')
    .replace(COMBINING_MARKS_PATTERN, '')
    .toLowerCase()
    .replace(TRANSLITERATION_PATTERN, (letter) => TRANSLITERATION_MAP[letter]);
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { fold } from './fold.js';
|
package/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@lde/text-normalization",
|
|
3
|
+
"version": "0.0.0",
|
|
4
|
+
"description": "Zero-dependency text folding (diacritic stripping and transliteration) for search index and query normalization",
|
|
5
|
+
"repository": {
|
|
6
|
+
"url": "git+https://github.com/ldelements/lde.git",
|
|
7
|
+
"directory": "packages/text-normalization"
|
|
8
|
+
},
|
|
9
|
+
"license": "MIT",
|
|
10
|
+
"type": "module",
|
|
11
|
+
"exports": {
|
|
12
|
+
"./package.json": "./package.json",
|
|
13
|
+
".": {
|
|
14
|
+
"types": "./dist/index.d.ts",
|
|
15
|
+
"import": "./dist/index.js",
|
|
16
|
+
"development": "./src/index.ts",
|
|
17
|
+
"default": "./dist/index.js"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"main": "./dist/index.js",
|
|
21
|
+
"module": "./dist/index.js",
|
|
22
|
+
"types": "./dist/index.d.ts",
|
|
23
|
+
"files": [
|
|
24
|
+
"dist",
|
|
25
|
+
"!**/*.tsbuildinfo"
|
|
26
|
+
],
|
|
27
|
+
"dependencies": {
|
|
28
|
+
"tslib": "^2.3.0"
|
|
29
|
+
}
|
|
30
|
+
}
|