arabic-text-normalizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,49 @@
1
+ # arabic-text-normalizer
2
+
3
+ Normalize Quranic/Arabic text by removing diacritics, markers, and decorative characters.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install arabic-text-normalizer
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```ts
14
+ import { normalize } from "arabic-text-normalizer";
15
+
16
+ // Full normalization (default)
17
+ normalize("بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ");
18
+ // => "بسم الله الرحمن الرحيم"
19
+
20
+ normalize("۞ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ ﴿٢﴾");
21
+ // => "الحمد لله رب العالمين"
22
+
23
+ // Selective normalization
24
+ normalize("بِسْمِ", { diacritics: false });
25
+ // => "بِسْمِ"
26
+ ```
27
+
28
+ ## Options
29
+
30
+ All options default to `true` for full normalization:
31
+
32
+ | Option | Description |
33
+ |--------|-------------|
34
+ | `diacritics` | Remove harakat/tashkeel (fatha, damma, kasra, shadda, sukun, tanween) and fold alif with madda (آ) to plain alif (ا) |
35
+ | `markers` | Remove Quranic markers (sajdah, rub el hizb, end of ayah) |
36
+ | `verseNumbers` | Remove verse numbers (every Arabic-Indic and Extended Arabic-Indic digit) and ornate brackets (﴿ ﴾) |
37
+ | `tatweel` | Remove tatweel/kashida elongation character |
38
+ | `smallLetters` | Remove small/superscript letters (small alif, waw, ya) |
39
+ | `collapseWhitespace` | Collapse multiple spaces to single, trim |
40
+
41
+ ## What's Preserved
42
+
43
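+ - Hamza forms (أ إ ؤ ئ ء) remain distinct; alif with madda (آ) is the exception and is folded to plain alif (ا) whenever `diacritics` is enabled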
44
+ - Alif maqsura (ى) vs ya (ي) remain distinct
45
+ - Teh marbuta (ة) vs heh (ه) remain distinct
46
+
47
+ ## License
48
+
49
+ MIT
package/dist/index.cjs ADDED
@@ -0,0 +1,69 @@
// CommonJS interop preamble. These helpers look like bundler-generated boilerplate
// (package.json builds with tsup) — NOTE(review): confirm before hand-editing.
1
+ "use strict";
// Short aliases for the reflection primitives used by the helpers below.
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
// __export(target, all): defines every key of `all` on `target` as an enumerable
// getter, so each exported binding is looked up only when it is read.
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
// __copyProps(to, from, except, desc): when `from` is an object or a function,
// mirrors each of its own property names onto `to` as a getter. Keys that `to`
// already owns, and the key equal to `except`, are skipped. Enumerability follows
// the source descriptor and falls back to enumerable when no descriptor is found.
// `desc` is only used as scratch storage for that descriptor. Returns `to`.
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
// __toCommonJS(mod): creates a fresh object carrying `__esModule: true` and mirrors
// the properties of `mod` onto it through __copyProps.
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+
20
+ // src/index.ts
// Export table: `default` and `normalize` are registered as getters, so the
// bindings they return (assigned further down in this file) are resolved on access.
21
+ var index_exports = {};
22
+ __export(index_exports, {
23
+ default: () => index_default,
24
+ normalize: () => normalize
25
+ });
// Publish the export table as this module's CommonJS exports.
26
+ module.exports = __toCommonJS(index_exports);
// Arabic tashkeel/harakat: U+064B-U+065F.
const DIACRITICS = /[\u064B-\u065F]/g;

// Alif with madda above (U+0622); folded to plain alif (U+0627) with the diacritics.
const ALIF_MADDA = /\u0622/g;

// Quranic markers: end of ayah (U+06DD), start of rub el hizb (U+06DE),
// place of sajdah (U+06E9).
const QURANIC_MARKERS = /[\u06DD\u06DE\u06E9]/g;

// Superscript alef (U+0670) plus every remaining small annotation sign in
// U+06D6-U+06ED. Together with QURANIC_MARKERS this covers exactly the former
// combined class [\u0670\u06D6-\u06ED], so output with both options enabled is
// unchanged.
// NOTE(review): the small high pause signs (U+06D6-U+06DC etc.) are grouped here
// because the documented marker list names only the three code points above — confirm.
const SMALL_LETTERS = /[\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]/g;

// Tatweel/kashida: U+0640.
const TATWEEL = /\u0640/g;

// Ornate parentheses: U+FD3E, U+FD3F.
const ORNATE_PARENS = /[\uFD3E\uFD3F]/g;

// Arabic-Indic digits (U+0660-U+0669) and Extended Arabic-Indic digits (U+06F0-U+06F9).
const ARABIC_DIGITS = /[\u0660-\u0669\u06F0-\u06F9]/g;

// One or more whitespace characters.
const MULTI_WHITESPACE = /\s+/g;

// Every pass is enabled unless the caller switches it off.
const DEFAULT_OPTIONS = {
  diacritics: true,
  markers: true,
  verseNumbers: true,
  tatweel: true,
  smallLetters: true,
  collapseWhitespace: true
};

/**
 * Merge caller options over the defaults.
 *
 * Unlike a plain object spread, a key that is present but `undefined` keeps its
 * default instead of silently disabling the pass.
 *
 * @param {object | null | undefined} options - Caller-supplied overrides.
 * @returns {object} A new object with one entry per key of DEFAULT_OPTIONS.
 */
function resolveOptions(options) {
  const resolved = { ...DEFAULT_OPTIONS };
  if (options === null || options === undefined) {
    return resolved;
  }
  for (const key of Object.keys(DEFAULT_OPTIONS)) {
    if (options[key] !== undefined) {
      resolved[key] = options[key];
    }
  }
  return resolved;
}

/**
 * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative
 * characters.
 *
 * Passes run in a fixed order: diacritics (plus alif-madda folding), Quranic
 * markers, small letters, verse numbers, tatweel, whitespace. `markers` and
 * `smallLetters` are applied independently, so disabling one no longer depends
 * on the other also being disabled.
 *
 * @param {string} text - The Arabic text to normalize.
 * @param {object} [options] - Normalization switches; each defaults to `true`.
 * @param {boolean} [options.diacritics] - Remove U+064B-U+065F and fold U+0622 to U+0627.
 * @param {boolean} [options.markers] - Remove U+06DD, U+06DE and U+06E9.
 * @param {boolean} [options.smallLetters] - Remove U+0670 and the other small signs in U+06D6-U+06ED.
 * @param {boolean} [options.verseNumbers] - Remove ornate parentheses and all Arabic-Indic digits.
 * @param {boolean} [options.tatweel] - Remove U+0640.
 * @param {boolean} [options.collapseWhitespace] - Collapse whitespace runs to one space and trim.
 * @returns {string} The normalized text.
 */
function normalize(text, options = {}) {
  const opts = resolveOptions(options);
  let result = text;
  if (opts.diacritics) {
    result = result.replace(DIACRITICS, "");
    result = result.replace(ALIF_MADDA, "\u0627");
  }
  if (opts.markers) {
    result = result.replace(QURANIC_MARKERS, "");
  }
  if (opts.smallLetters) {
    result = result.replace(SMALL_LETTERS, "");
  }
  if (opts.verseNumbers) {
    result = result.replace(ORNATE_PARENS, "");
    result = result.replace(ARABIC_DIGITS, "");
  }
  if (opts.tatweel) {
    result = result.replace(TATWEEL, "");
  }
  if (opts.collapseWhitespace) {
    result = result.replace(MULTI_WHITESPACE, " ").trim();
  }
  return result;
}
64
+ var index_default = normalize;
65
+ // Annotate the CommonJS export names for ESM import in node:
66
+ 0 && (module.exports = {
67
+ normalize
68
+ });
69
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export interface NormalizeOptions {\n /** Remove harakat/tashkeel diacritics (default: true) */\n diacritics?: boolean;\n /** Remove Quranic markers like sajdah, rub el hizb (default: true) */\n markers?: boolean;\n /** Remove verse numbers and their brackets (default: true) */\n verseNumbers?: boolean;\n /** Remove tatweel/kashida elongation (default: true) */\n tatweel?: boolean;\n /** Remove small/superscript letters (default: true) */\n smallLetters?: boolean;\n /** Collapse multiple whitespace to single space (default: true) */\n collapseWhitespace?: boolean;\n}\n\n// Arabic tashkeel/harakat: U+064B-U+065F\nconst DIACRITICS = /[\\u064B-\\u065F]/g;\n\n// Alif with madda above (آ U+0622) -> plain alif (ا U+0627)\nconst ALIF_MADDA = /\\u0622/g;\n\n// Superscript alif and other Quranic annotation marks: U+0670, U+06D6-U+06ED\nconst QURANIC_ANNOTATIONS = /[\\u0670\\u06D6-\\u06ED]/g;\n\n// Small letters (superscript): small high letters used in Quranic text\n// U+06E5 (small waw), U+06E6 (small ya), etc. 
are in QURANIC_ANNOTATIONS range\n\n// End of ayah U+06DD, start of rub el hizb U+06DE, place of sajdah U+06E9\n// These are included in QURANIC_ANNOTATIONS range (U+06D6-U+06ED)\n\n// Tatweel/kashida: U+0640\nconst TATWEEL = /\\u0640/g;\n\n// Ornate parentheses: U+FD3E, U+FD3F\nconst ORNATE_PARENS = /[\\uFD3E\\uFD3F]/g;\n\n// Arabic-Indic digits: U+0660-U+0669\n// Extended Arabic-Indic digits: U+06F0-U+06F9\nconst ARABIC_DIGITS = /[\\u0660-\\u0669\\u06F0-\\u06F9]/g;\n\n// Multiple whitespace\nconst MULTI_WHITESPACE = /\\s+/g;\n\nconst DEFAULT_OPTIONS: Required<NormalizeOptions> = {\n diacritics: true,\n markers: true,\n verseNumbers: true,\n tatweel: true,\n smallLetters: true,\n collapseWhitespace: true,\n};\n\n/**\n * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.\n *\n * @param text - The Arabic text to normalize\n * @param options - Normalization options (all default to true for full normalization)\n * @returns Normalized plain Arabic text\n *\n * @example\n * ```ts\n * import { normalize } from 'arabic-text-normalizer';\n *\n * // Full normalization (default)\n * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');\n * // => 'بسم الله الرحمن الرحيم'\n *\n * // Keep diacritics\n * normalize('بِسْمِ', { diacritics: false });\n * // => 'بِسْمِ'\n * ```\n */\nexport function normalize(text: string, options: NormalizeOptions = {}): string {\n const opts = { ...DEFAULT_OPTIONS, ...options };\n let result = text;\n\n if (opts.diacritics) {\n result = result.replace(DIACRITICS, \"\");\n result = result.replace(ALIF_MADDA, \"\\u0627\"); // آ -> ا\n }\n\n if (opts.markers || opts.smallLetters) {\n // QURANIC_ANNOTATIONS includes both markers and small letters\n result = result.replace(QURANIC_ANNOTATIONS, \"\");\n }\n\n if (opts.verseNumbers) {\n result = result.replace(ORNATE_PARENS, \"\");\n result = result.replace(ARABIC_DIGITS, \"\");\n }\n\n if (opts.tatweel) {\n result = result.replace(TATWEEL, \"\");\n }\n\n if 
(opts.collapseWhitespace) {\n result = result.replace(MULTI_WHITESPACE, \" \").trim();\n }\n\n return result;\n}\n\nexport default normalize;\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAgBA,IAAM,aAAa;AAGnB,IAAM,aAAa;AAGnB,IAAM,sBAAsB;AAS5B,IAAM,UAAU;AAGhB,IAAM,gBAAgB;AAItB,IAAM,gBAAgB;AAGtB,IAAM,mBAAmB;AAEzB,IAAM,kBAA8C;AAAA,EAClD,YAAY;AAAA,EACZ,SAAS;AAAA,EACT,cAAc;AAAA,EACd,SAAS;AAAA,EACT,cAAc;AAAA,EACd,oBAAoB;AACtB;AAsBO,SAAS,UAAU,MAAc,UAA4B,CAAC,GAAW;AAC9E,QAAM,OAAO,EAAE,GAAG,iBAAiB,GAAG,QAAQ;AAC9C,MAAI,SAAS;AAEb,MAAI,KAAK,YAAY;AACnB,aAAS,OAAO,QAAQ,YAAY,EAAE;AACtC,aAAS,OAAO,QAAQ,YAAY,QAAQ;AAAA,EAC9C;AAEA,MAAI,KAAK,WAAW,KAAK,cAAc;AAErC,aAAS,OAAO,QAAQ,qBAAqB,EAAE;AAAA,EACjD;AAEA,MAAI,KAAK,cAAc;AACrB,aAAS,OAAO,QAAQ,eAAe,EAAE;AACzC,aAAS,OAAO,QAAQ,eAAe,EAAE;AAAA,EAC3C;AAEA,MAAI,KAAK,SAAS;AAChB,aAAS,OAAO,QAAQ,SAAS,EAAE;AAAA,EACrC;AAEA,MAAI,KAAK,oBAAoB;AAC3B,aAAS,OAAO,QAAQ,kBAAkB,GAAG,EAAE,KAAK;AAAA,EACtD;AAEA,SAAO;AACT;AAEA,IAAO,gBAAQ;","names":[]}
@@ -0,0 +1,37 @@
1
+ interface NormalizeOptions {
2
+ /** Remove harakat/tashkeel diacritics (U+064B-U+065F) and fold alif with madda (U+0622) to plain alif (U+0627) (default: true) */
3
+ diacritics?: boolean;
4
+ /** Remove Quranic markers like sajdah, rub el hizb (default: true) */
5
+ markers?: boolean;
6
+ /** Remove verse numbers and their brackets: every Arabic-Indic / Extended Arabic-Indic digit and the ornate parentheses U+FD3E/U+FD3F (default: true) */
7
+ verseNumbers?: boolean;
8
+ /** Remove tatweel/kashida elongation, U+0640 (default: true) */
9
+ tatweel?: boolean;
10
+ /** Remove small/superscript letters (default: true) */
11
+ smallLetters?: boolean;
12
+ /** Collapse each run of whitespace to a single space and trim both ends (default: true) */
13
+ collapseWhitespace?: boolean;
14
+ }
15
+ /**
16
+ * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.
+ *
+ * Passes run in a fixed order: diacritics, Quranic annotations (markers and small
+ * letters), verse numbers, tatweel, whitespace. Hamza carriers other than alif with
+ * madda are left untouched.
17
+ *
18
+ * @param text - The Arabic text to normalize
19
+ * @param options - Normalization options (all default to true for full normalization)
20
+ * @returns Normalized plain Arabic text
21
+ *
22
+ * @example
23
+ * ```ts
24
+ * import { normalize } from 'arabic-text-normalizer';
25
+ *
26
+ * // Full normalization (default)
27
+ * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');
28
+ * // => 'بسم الله الرحمن الرحيم'
29
+ *
30
+ * // Keep diacritics
31
+ * normalize('بِسْمِ', { diacritics: false });
32
+ * // => 'بِسْمِ'
33
+ * ```
34
+ */
35
+ declare function normalize(text: string, options?: NormalizeOptions): string;
36
+
37
+ export { type NormalizeOptions, normalize as default, normalize };
@@ -0,0 +1,37 @@
1
+ interface NormalizeOptions {
2
+ /** Remove harakat/tashkeel diacritics (U+064B-U+065F) and fold alif with madda (U+0622) to plain alif (U+0627) (default: true) */
3
+ diacritics?: boolean;
4
+ /** Remove Quranic markers like sajdah, rub el hizb (default: true) */
5
+ markers?: boolean;
6
+ /** Remove verse numbers and their brackets: every Arabic-Indic / Extended Arabic-Indic digit and the ornate parentheses U+FD3E/U+FD3F (default: true) */
7
+ verseNumbers?: boolean;
8
+ /** Remove tatweel/kashida elongation, U+0640 (default: true) */
9
+ tatweel?: boolean;
10
+ /** Remove small/superscript letters (default: true) */
11
+ smallLetters?: boolean;
12
+ /** Collapse each run of whitespace to a single space and trim both ends (default: true) */
13
+ collapseWhitespace?: boolean;
14
+ }
15
+ /**
16
+ * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.
+ *
+ * Passes run in a fixed order: diacritics, Quranic annotations (markers and small
+ * letters), verse numbers, tatweel, whitespace. Hamza carriers other than alif with
+ * madda are left untouched.
17
+ *
18
+ * @param text - The Arabic text to normalize
19
+ * @param options - Normalization options (all default to true for full normalization)
20
+ * @returns Normalized plain Arabic text
21
+ *
22
+ * @example
23
+ * ```ts
24
+ * import { normalize } from 'arabic-text-normalizer';
25
+ *
26
+ * // Full normalization (default)
27
+ * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');
28
+ * // => 'بسم الله الرحمن الرحيم'
29
+ *
30
+ * // Keep diacritics
31
+ * normalize('بِسْمِ', { diacritics: false });
32
+ * // => 'بِسْمِ'
33
+ * ```
34
+ */
35
+ declare function normalize(text: string, options?: NormalizeOptions): string;
36
+
37
+ export { type NormalizeOptions, normalize as default, normalize };
package/dist/index.js ADDED
@@ -0,0 +1,44 @@
// src/index.ts

// Arabic tashkeel/harakat: U+064B-U+065F.
const DIACRITICS = /[\u064B-\u065F]/g;

// Alif with madda above (U+0622); folded to plain alif (U+0627) with the diacritics.
const ALIF_MADDA = /\u0622/g;

// Quranic markers: end of ayah (U+06DD), start of rub el hizb (U+06DE),
// place of sajdah (U+06E9).
const QURANIC_MARKERS = /[\u06DD\u06DE\u06E9]/g;

// Superscript alef (U+0670) plus every remaining small annotation sign in
// U+06D6-U+06ED. Together with QURANIC_MARKERS this covers exactly the former
// combined class [\u0670\u06D6-\u06ED], so output with both options enabled is
// unchanged.
// NOTE(review): the small high pause signs (U+06D6-U+06DC etc.) are grouped here
// because the documented marker list names only the three code points above — confirm.
const SMALL_LETTERS = /[\u0670\u06D6-\u06DC\u06DF-\u06E8\u06EA-\u06ED]/g;

// Tatweel/kashida: U+0640.
const TATWEEL = /\u0640/g;

// Ornate parentheses: U+FD3E, U+FD3F.
const ORNATE_PARENS = /[\uFD3E\uFD3F]/g;

// Arabic-Indic digits (U+0660-U+0669) and Extended Arabic-Indic digits (U+06F0-U+06F9).
const ARABIC_DIGITS = /[\u0660-\u0669\u06F0-\u06F9]/g;

// One or more whitespace characters.
const MULTI_WHITESPACE = /\s+/g;

// Every pass is enabled unless the caller switches it off.
const DEFAULT_OPTIONS = {
  diacritics: true,
  markers: true,
  verseNumbers: true,
  tatweel: true,
  smallLetters: true,
  collapseWhitespace: true
};

/**
 * Merge caller options over the defaults.
 *
 * Unlike a plain object spread, a key that is present but `undefined` keeps its
 * default instead of silently disabling the pass.
 *
 * @param {object | null | undefined} options - Caller-supplied overrides.
 * @returns {object} A new object with one entry per key of DEFAULT_OPTIONS.
 */
function resolveOptions(options) {
  const resolved = { ...DEFAULT_OPTIONS };
  if (options === null || options === undefined) {
    return resolved;
  }
  for (const key of Object.keys(DEFAULT_OPTIONS)) {
    if (options[key] !== undefined) {
      resolved[key] = options[key];
    }
  }
  return resolved;
}

/**
 * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative
 * characters.
 *
 * Passes run in a fixed order: diacritics (plus alif-madda folding), Quranic
 * markers, small letters, verse numbers, tatweel, whitespace. `markers` and
 * `smallLetters` are applied independently, so disabling one no longer depends
 * on the other also being disabled.
 *
 * @param {string} text - The Arabic text to normalize.
 * @param {object} [options] - Normalization switches; each defaults to `true`.
 * @param {boolean} [options.diacritics] - Remove U+064B-U+065F and fold U+0622 to U+0627.
 * @param {boolean} [options.markers] - Remove U+06DD, U+06DE and U+06E9.
 * @param {boolean} [options.smallLetters] - Remove U+0670 and the other small signs in U+06D6-U+06ED.
 * @param {boolean} [options.verseNumbers] - Remove ornate parentheses and all Arabic-Indic digits.
 * @param {boolean} [options.tatweel] - Remove U+0640.
 * @param {boolean} [options.collapseWhitespace] - Collapse whitespace runs to one space and trim.
 * @returns {string} The normalized text.
 */
function normalize(text, options = {}) {
  const opts = resolveOptions(options);
  let result = text;
  if (opts.diacritics) {
    result = result.replace(DIACRITICS, "");
    result = result.replace(ALIF_MADDA, "\u0627");
  }
  if (opts.markers) {
    result = result.replace(QURANIC_MARKERS, "");
  }
  if (opts.smallLetters) {
    result = result.replace(SMALL_LETTERS, "");
  }
  if (opts.verseNumbers) {
    result = result.replace(ORNATE_PARENS, "");
    result = result.replace(ARABIC_DIGITS, "");
  }
  if (opts.tatweel) {
    result = result.replace(TATWEEL, "");
  }
  if (opts.collapseWhitespace) {
    result = result.replace(MULTI_WHITESPACE, " ").trim();
  }
  return result;
}
39
+ var index_default = normalize;
40
+ export {
41
+ index_default as default,
42
+ normalize
43
+ };
44
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export interface NormalizeOptions {\n /** Remove harakat/tashkeel diacritics (default: true) */\n diacritics?: boolean;\n /** Remove Quranic markers like sajdah, rub el hizb (default: true) */\n markers?: boolean;\n /** Remove verse numbers and their brackets (default: true) */\n verseNumbers?: boolean;\n /** Remove tatweel/kashida elongation (default: true) */\n tatweel?: boolean;\n /** Remove small/superscript letters (default: true) */\n smallLetters?: boolean;\n /** Collapse multiple whitespace to single space (default: true) */\n collapseWhitespace?: boolean;\n}\n\n// Arabic tashkeel/harakat: U+064B-U+065F\nconst DIACRITICS = /[\\u064B-\\u065F]/g;\n\n// Alif with madda above (آ U+0622) -> plain alif (ا U+0627)\nconst ALIF_MADDA = /\\u0622/g;\n\n// Superscript alif and other Quranic annotation marks: U+0670, U+06D6-U+06ED\nconst QURANIC_ANNOTATIONS = /[\\u0670\\u06D6-\\u06ED]/g;\n\n// Small letters (superscript): small high letters used in Quranic text\n// U+06E5 (small waw), U+06E6 (small ya), etc. 
are in QURANIC_ANNOTATIONS range\n\n// End of ayah U+06DD, start of rub el hizb U+06DE, place of sajdah U+06E9\n// These are included in QURANIC_ANNOTATIONS range (U+06D6-U+06ED)\n\n// Tatweel/kashida: U+0640\nconst TATWEEL = /\\u0640/g;\n\n// Ornate parentheses: U+FD3E, U+FD3F\nconst ORNATE_PARENS = /[\\uFD3E\\uFD3F]/g;\n\n// Arabic-Indic digits: U+0660-U+0669\n// Extended Arabic-Indic digits: U+06F0-U+06F9\nconst ARABIC_DIGITS = /[\\u0660-\\u0669\\u06F0-\\u06F9]/g;\n\n// Multiple whitespace\nconst MULTI_WHITESPACE = /\\s+/g;\n\nconst DEFAULT_OPTIONS: Required<NormalizeOptions> = {\n diacritics: true,\n markers: true,\n verseNumbers: true,\n tatweel: true,\n smallLetters: true,\n collapseWhitespace: true,\n};\n\n/**\n * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.\n *\n * @param text - The Arabic text to normalize\n * @param options - Normalization options (all default to true for full normalization)\n * @returns Normalized plain Arabic text\n *\n * @example\n * ```ts\n * import { normalize } from 'arabic-text-normalizer';\n *\n * // Full normalization (default)\n * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');\n * // => 'بسم الله الرحمن الرحيم'\n *\n * // Keep diacritics\n * normalize('بِسْمِ', { diacritics: false });\n * // => 'بِسْمِ'\n * ```\n */\nexport function normalize(text: string, options: NormalizeOptions = {}): string {\n const opts = { ...DEFAULT_OPTIONS, ...options };\n let result = text;\n\n if (opts.diacritics) {\n result = result.replace(DIACRITICS, \"\");\n result = result.replace(ALIF_MADDA, \"\\u0627\"); // آ -> ا\n }\n\n if (opts.markers || opts.smallLetters) {\n // QURANIC_ANNOTATIONS includes both markers and small letters\n result = result.replace(QURANIC_ANNOTATIONS, \"\");\n }\n\n if (opts.verseNumbers) {\n result = result.replace(ORNATE_PARENS, \"\");\n result = result.replace(ARABIC_DIGITS, \"\");\n }\n\n if (opts.tatweel) {\n result = result.replace(TATWEEL, \"\");\n }\n\n if 
(opts.collapseWhitespace) {\n result = result.replace(MULTI_WHITESPACE, \" \").trim();\n }\n\n return result;\n}\n\nexport default normalize;\n"],"mappings":";AAgBA,IAAM,aAAa;AAGnB,IAAM,aAAa;AAGnB,IAAM,sBAAsB;AAS5B,IAAM,UAAU;AAGhB,IAAM,gBAAgB;AAItB,IAAM,gBAAgB;AAGtB,IAAM,mBAAmB;AAEzB,IAAM,kBAA8C;AAAA,EAClD,YAAY;AAAA,EACZ,SAAS;AAAA,EACT,cAAc;AAAA,EACd,SAAS;AAAA,EACT,cAAc;AAAA,EACd,oBAAoB;AACtB;AAsBO,SAAS,UAAU,MAAc,UAA4B,CAAC,GAAW;AAC9E,QAAM,OAAO,EAAE,GAAG,iBAAiB,GAAG,QAAQ;AAC9C,MAAI,SAAS;AAEb,MAAI,KAAK,YAAY;AACnB,aAAS,OAAO,QAAQ,YAAY,EAAE;AACtC,aAAS,OAAO,QAAQ,YAAY,QAAQ;AAAA,EAC9C;AAEA,MAAI,KAAK,WAAW,KAAK,cAAc;AAErC,aAAS,OAAO,QAAQ,qBAAqB,EAAE;AAAA,EACjD;AAEA,MAAI,KAAK,cAAc;AACrB,aAAS,OAAO,QAAQ,eAAe,EAAE;AACzC,aAAS,OAAO,QAAQ,eAAe,EAAE;AAAA,EAC3C;AAEA,MAAI,KAAK,SAAS;AAChB,aAAS,OAAO,QAAQ,SAAS,EAAE;AAAA,EACrC;AAEA,MAAI,KAAK,oBAAoB;AAC3B,aAAS,OAAO,QAAQ,kBAAkB,GAAG,EAAE,KAAK;AAAA,EACtD;AAEA,SAAO;AACT;AAEA,IAAO,gBAAQ;","names":[]}
package/package.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "name": "arabic-text-normalizer",
3
+ "version": "1.0.0",
4
+ "description": "Normalize Quranic/Arabic text by removing diacritics, markers, and decorative characters",
5
+ "type": "module",
6
+ "main": "./dist/index.cjs",
7
+ "module": "./dist/index.js",
8
+ "types": "./dist/index.d.ts",
9
+ "exports": {
10
+ ".": {
11
+ "import": {
12
+ "types": "./dist/index.d.ts",
13
+ "default": "./dist/index.js"
14
+ },
15
+ "require": {
16
+ "types": "./dist/index.d.cts",
17
+ "default": "./dist/index.cjs"
18
+ }
19
+ }
20
+ },
21
+ "files": [
22
+ "dist"
23
+ ],
24
+ "scripts": {
25
+ "build": "tsup",
26
+ "test": "vitest run",
27
+ "test:watch": "vitest",
28
+ "prepublishOnly": "npm run build"
29
+ },
30
+ "keywords": [
31
+ "arabic",
32
+ "quran",
33
+ "normalize",
34
+ "diacritics",
35
+ "tashkeel",
36
+ "harakat",
37
+ "text-processing"
38
+ ],
39
+ "author": "",
40
+ "license": "MIT",
41
+ "repository": {
42
+ "type": "git",
43
+ "url": "git+https://github.com/yazinsai/arabic-text-normalizer.git"
44
+ },
45
+ "bugs": {
46
+ "url": "https://github.com/yazinsai/arabic-text-normalizer/issues"
47
+ },
48
+ "homepage": "https://github.com/yazinsai/arabic-text-normalizer#readme",
49
+ "devDependencies": {
50
+ "@types/node": "^25.2.0",
51
+ "tsup": "^8.5.1",
52
+ "typescript": "^5.9.3",
53
+ "vitest": "^4.0.18"
54
+ }
55
+ }