npm - arabic-text-normalizer - Versions diffs - 1.0.0 - Mend

arabic-text-normalizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,49 @@
+# arabic-text-normalizer
+Normalize Quranic/Arabic text by removing diacritics, markers, and decorative characters.
+## Installation
+```bash
+npm install arabic-text-normalizer
+```
+## Usage
+```ts
+import { normalize } from "arabic-text-normalizer";
+// Full normalization (default)
+normalize("بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ");
+// => "بسم الله الرحمن الرحيم"
+normalize("۞ الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ ﴿٢﴾");
+// => "الحمد لله رب العالمين"
+// Selective normalization
+normalize("بِسْمِ", { diacritics: false });
+// => "بِسْمِ"
+```
+## Options
+All options default to `true` for full normalization:
+| Option | Description |
+|--------|-------------|
+| `diacritics` | Remove harakat/tashkeel (fatha, damma, kasra, shadda, sukun, tanween) |
+| `markers` | Remove Quranic markers (sajdah, rub el hizb, end of ayah) |
+| `verseNumbers` | Remove ornate brackets (﴿ ﴾) and all Arabic-Indic / Extended Arabic-Indic digits (the digits verse numbers are written with) |
+| `tatweel` | Remove tatweel/kashida elongation character |
+| `smallLetters` | Remove small/superscript letters (small alif, waw, ya) |
+| `collapseWhitespace` | Collapse each run of whitespace (spaces, tabs, newlines) to a single space and trim both ends |
+## What's Preserved
+- Hamza forms (أ إ ؤ ئ ء) remain distinct; alif madda (آ) is kept only when `diacritics` is `false`, otherwise it is folded to plain alif (ا)
+- Alif maqsura (ى) vs ya (ي) remain distinct
+- Teh marbuta (ة) vs heh (ه) remain distinct
+## License
+MIT

package/dist/index.cjs ADDED Viewed

@@ -0,0 +1,69 @@
+"use strict";
+var __defProp = Object.defineProperty; // local aliases for the Object built-ins used by the helpers below
+var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+var __getOwnPropNames = Object.getOwnPropertyNames;
+var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __export = (target, all) => { // expose every entry of `all` on `target` as an enumerable getter
+  for (var name in all)
+    __defProp(target, name, { get: all[name], enumerable: true });
+};
+var __copyProps = (to, from, except, desc) => { // mirror the own properties of `from` onto `to` as getters that read through to `from`
+  if (from && typeof from === "object" || typeof from === "function") {
+    for (let key of __getOwnPropNames(from))
+      if (!__hasOwnProp.call(to, key) && key !== except) // never overwrite a key `to` already owns; skip the `except` key
+        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); // enumerable when no descriptor is found, otherwise the source's own flag
+  }
+  return to;
+};
+var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // new object tagged `__esModule: true` that carries the properties of `mod`
+// src/index.ts
+var index_exports = {};
+__export(index_exports, { // public names of the module, resolved lazily through getters
+  default: () => index_default,
+  normalize: () => normalize
+});
+module.exports = __toCommonJS(index_exports); // what `require()` of this file returns
// Harakat/tashkeel combining marks: U+064B-U+065F.
var DIACRITICS = /[\u064B-\u065F]/g;
// Alif with madda above (U+0622); folded to plain alif (U+0627) together with the diacritics.
var ALIF_MADDA = /\u0622/g;
// Quranic annotation signs (pause marks, end of ayah U+06DD, rub el hizb U+06DE,
// place of sajdah U+06E9, ...): everything in U+06D6-U+06ED that is not a small letter.
var QURANIC_MARKERS = /[\u06D6-\u06E4\u06E9-\u06ED]/g;
// Small/superscript letters: superscript alef U+0670, small waw U+06E5, small yeh U+06E6,
// small high yeh U+06E7, small high noon U+06E8.
// QURANIC_MARKERS and SMALL_LETTERS together cover exactly U+0670 + U+06D6-U+06ED,
// so the default (both enabled) strips the same characters as before.
var SMALL_LETTERS = /[\u0670\u06E5-\u06E8]/g;
// Tatweel/kashida elongation character.
var TATWEEL = /\u0640/g;
// Ornate parentheses that bracket verse numbers.
var ORNATE_PARENS = /[\uFD3E\uFD3F]/g;
// Arabic-Indic digits (U+0660-U+0669) and Extended Arabic-Indic digits (U+06F0-U+06F9).
var ARABIC_DIGITS = /[\u0660-\u0669\u06F0-\u06F9]/g;
// Any run of whitespace.
var MULTI_WHITESPACE = /\s+/g;
// Every step is enabled unless the caller switches it off.
var DEFAULT_OPTIONS = {
  diacritics: true,
  markers: true,
  verseNumbers: true,
  tatweel: true,
  smallLetters: true,
  collapseWhitespace: true
};
/**
 * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.
 *
 * Each option controls exactly one step; `markers` and `smallLetters` are independent,
 * so disabling one of them keeps its characters even while the other stays enabled.
 *
 * @param text - The Arabic text to normalize (must be a string).
 * @param options - Per-step switches. Keys that are omitted or explicitly `undefined`
 *   fall back to the default (`true`).
 * @returns The normalized text.
 */
function normalize(text, options = {}) {
  // Drop keys whose value is `undefined` so they cannot override a default with a
  // falsy value; a `null` options object is tolerated like an empty one.
  const overrides = Object.fromEntries(
    Object.entries(options == null ? {} : options).filter(([, value]) => value !== undefined)
  );
  const opts = { ...DEFAULT_OPTIONS, ...overrides };
  let result = text;
  if (opts.diacritics) {
    result = result.replace(DIACRITICS, "");
    result = result.replace(ALIF_MADDA, "\u0627");
  }
  if (opts.markers) {
    result = result.replace(QURANIC_MARKERS, "");
  }
  if (opts.smallLetters) {
    result = result.replace(SMALL_LETTERS, "");
  }
  if (opts.verseNumbers) {
    result = result.replace(ORNATE_PARENS, "");
    result = result.replace(ARABIC_DIGITS, "");
  }
  if (opts.tatweel) {
    result = result.replace(TATWEEL, "");
  }
  if (opts.collapseWhitespace) {
    // Runs last so gaps left behind by removed markers/digits are closed as well.
    result = result.replace(MULTI_WHITESPACE, " ").trim();
  }
  return result;
}
var index_default = normalize;
+// Annotate the CommonJS export names for ESM import in node:
+0 && (module.exports = {
+  normalize
+});
+//# sourceMappingURL=index.cjs.map

package/dist/index.cjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export interface NormalizeOptions {\n /** Remove harakat/tashkeel diacritics (default: true) */\n diacritics?: boolean;\n /** Remove Quranic markers like sajdah, rub el hizb (default: true) */\n markers?: boolean;\n /** Remove verse numbers and their brackets (default: true) */\n verseNumbers?: boolean;\n /** Remove tatweel/kashida elongation (default: true) */\n tatweel?: boolean;\n /** Remove small/superscript letters (default: true) */\n smallLetters?: boolean;\n /** Collapse multiple whitespace to single space (default: true) */\n collapseWhitespace?: boolean;\n}\n\n// Arabic tashkeel/harakat: U+064B-U+065F\nconst DIACRITICS = /[\\u064B-\\u065F]/g;\n\n// Alif with madda above (آ U+0622) -> plain alif (ا U+0627)\nconst ALIF_MADDA = /\\u0622/g;\n\n// Superscript alif and other Quranic annotation marks: U+0670, U+06D6-U+06ED\nconst QURANIC_ANNOTATIONS = /[\\u0670\\u06D6-\\u06ED]/g;\n\n// Small letters (superscript): small high letters used in Quranic text\n// U+06E5 (small waw), U+06E6 (small ya), etc. 
are in QURANIC_ANNOTATIONS range\n\n// End of ayah U+06DD, start of rub el hizb U+06DE, place of sajdah U+06E9\n// These are included in QURANIC_ANNOTATIONS range (U+06D6-U+06ED)\n\n// Tatweel/kashida: U+0640\nconst TATWEEL = /\\u0640/g;\n\n// Ornate parentheses: U+FD3E, U+FD3F\nconst ORNATE_PARENS = /[\\uFD3E\\uFD3F]/g;\n\n// Arabic-Indic digits: U+0660-U+0669\n// Extended Arabic-Indic digits: U+06F0-U+06F9\nconst ARABIC_DIGITS = /[\\u0660-\\u0669\\u06F0-\\u06F9]/g;\n\n// Multiple whitespace\nconst MULTI_WHITESPACE = /\\s+/g;\n\nconst DEFAULT_OPTIONS: Required<NormalizeOptions> = {\n diacritics: true,\n markers: true,\n verseNumbers: true,\n tatweel: true,\n smallLetters: true,\n collapseWhitespace: true,\n};\n\n/**\n * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.\n *\n * @param text - The Arabic text to normalize\n * @param options - Normalization options (all default to true for full normalization)\n * @returns Normalized plain Arabic text\n *\n * @example\n * ```ts\n * import { normalize } from 'arabic-text-normalizer';\n *\n * // Full normalization (default)\n * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');\n * // => 'بسم الله الرحمن الرحيم'\n *\n * // Keep diacritics\n * normalize('بِسْمِ', { diacritics: false });\n * // => 'بِسْمِ'\n * ```\n */\nexport function normalize(text: string, options: NormalizeOptions = {}): string {\n const opts = { ...DEFAULT_OPTIONS, ...options };\n let result = text;\n\n if (opts.diacritics) {\n result = result.replace(DIACRITICS, \"\");\n result = result.replace(ALIF_MADDA, \"\\u0627\"); // آ -> ا\n }\n\n if (opts.markers || opts.smallLetters) {\n // QURANIC_ANNOTATIONS includes both markers and small letters\n result = result.replace(QURANIC_ANNOTATIONS, \"\");\n }\n\n if (opts.verseNumbers) {\n result = result.replace(ORNATE_PARENS, \"\");\n result = result.replace(ARABIC_DIGITS, \"\");\n }\n\n if (opts.tatweel) {\n result = result.replace(TATWEEL, \"\");\n }\n\n if 
(opts.collapseWhitespace) {\n result = result.replace(MULTI_WHITESPACE, \" \").trim();\n }\n\n return result;\n}\n\nexport default normalize;\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAgBA,IAAM,aAAa;AAGnB,IAAM,aAAa;AAGnB,IAAM,sBAAsB;AAS5B,IAAM,UAAU;AAGhB,IAAM,gBAAgB;AAItB,IAAM,gBAAgB;AAGtB,IAAM,mBAAmB;AAEzB,IAAM,kBAA8C;AAAA,EAClD,YAAY;AAAA,EACZ,SAAS;AAAA,EACT,cAAc;AAAA,EACd,SAAS;AAAA,EACT,cAAc;AAAA,EACd,oBAAoB;AACtB;AAsBO,SAAS,UAAU,MAAc,UAA4B,CAAC,GAAW;AAC9E,QAAM,OAAO,EAAE,GAAG,iBAAiB,GAAG,QAAQ;AAC9C,MAAI,SAAS;AAEb,MAAI,KAAK,YAAY;AACnB,aAAS,OAAO,QAAQ,YAAY,EAAE;AACtC,aAAS,OAAO,QAAQ,YAAY,QAAQ;AAAA,EAC9C;AAEA,MAAI,KAAK,WAAW,KAAK,cAAc;AAErC,aAAS,OAAO,QAAQ,qBAAqB,EAAE;AAAA,EACjD;AAEA,MAAI,KAAK,cAAc;AACrB,aAAS,OAAO,QAAQ,eAAe,EAAE;AACzC,aAAS,OAAO,QAAQ,eAAe,EAAE;AAAA,EAC3C;AAEA,MAAI,KAAK,SAAS;AAChB,aAAS,OAAO,QAAQ,SAAS,EAAE;AAAA,EACrC;AAEA,MAAI,KAAK,oBAAoB;AAC3B,aAAS,OAAO,QAAQ,kBAAkB,GAAG,EAAE,KAAK;AAAA,EACtD;AAEA,SAAO;AACT;AAEA,IAAO,gBAAQ;","names":[]}

package/dist/index.d.cts ADDED Viewed

@@ -0,0 +1,37 @@
+interface NormalizeOptions {
+    /**
+     * Remove harakat/tashkeel diacritics, U+064B-U+065F (default: true).
+     * Under this flag the implementation also folds alif with madda (U+0622) to plain alif (U+0627).
+     */
+    diacritics?: boolean;
+    /** Remove Quranic markers like sajdah, rub el hizb (default: true) */
+    markers?: boolean;
+    /**
+     * Remove verse numbers and their brackets (default: true).
+     * The implementation strips the ornate parentheses U+FD3E/U+FD3F and every
+     * Arabic-Indic (U+0660-U+0669) and Extended Arabic-Indic (U+06F0-U+06F9) digit.
+     */
+    verseNumbers?: boolean;
+    /** Remove tatweel/kashida elongation, U+0640 (default: true) */
+    tatweel?: boolean;
+    /** Remove small/superscript letters (default: true) */
+    smallLetters?: boolean;
+    /** Collapse each run of whitespace to a single space and trim both ends (default: true) */
+    collapseWhitespace?: boolean;
+}
+/**
+ * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.
+ *
+ * @param text - The Arabic text to normalize
+ * @param options - Normalization options; omitted keys use their default of `true` (full normalization)
+ * @returns Normalized plain Arabic text
+ *
+ * @example
+ * ```ts
+ * import { normalize } from 'arabic-text-normalizer';
+ *
+ * // Full normalization (default)
+ * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');
+ * // => 'بسم الله الرحمن الرحيم'
+ *
+ * // Keep diacritics
+ * normalize('بِسْمِ', { diacritics: false });
+ * // => 'بِسْمِ'
+ * ```
+ */
+declare function normalize(text: string, options?: NormalizeOptions): string;
+export { type NormalizeOptions, normalize as default, normalize };

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,37 @@
+interface NormalizeOptions {
+    /**
+     * Remove harakat/tashkeel diacritics, U+064B-U+065F (default: true).
+     * Under this flag the implementation also folds alif with madda (U+0622) to plain alif (U+0627).
+     */
+    diacritics?: boolean;
+    /** Remove Quranic markers like sajdah, rub el hizb (default: true) */
+    markers?: boolean;
+    /**
+     * Remove verse numbers and their brackets (default: true).
+     * The implementation strips the ornate parentheses U+FD3E/U+FD3F and every
+     * Arabic-Indic (U+0660-U+0669) and Extended Arabic-Indic (U+06F0-U+06F9) digit.
+     */
+    verseNumbers?: boolean;
+    /** Remove tatweel/kashida elongation, U+0640 (default: true) */
+    tatweel?: boolean;
+    /** Remove small/superscript letters (default: true) */
+    smallLetters?: boolean;
+    /** Collapse each run of whitespace to a single space and trim both ends (default: true) */
+    collapseWhitespace?: boolean;
+}
+/**
+ * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.
+ *
+ * @param text - The Arabic text to normalize
+ * @param options - Normalization options; omitted keys use their default of `true` (full normalization)
+ * @returns Normalized plain Arabic text
+ *
+ * @example
+ * ```ts
+ * import { normalize } from 'arabic-text-normalizer';
+ *
+ * // Full normalization (default)
+ * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');
+ * // => 'بسم الله الرحمن الرحيم'
+ *
+ * // Keep diacritics
+ * normalize('بِسْمِ', { diacritics: false });
+ * // => 'بِسْمِ'
+ * ```
+ */
+declare function normalize(text: string, options?: NormalizeOptions): string;
+export { type NormalizeOptions, normalize as default, normalize };

package/dist/index.js ADDED Viewed

@@ -0,0 +1,44 @@
+// src/index.ts
// Harakat/tashkeel combining marks: U+064B-U+065F.
var DIACRITICS = /[\u064B-\u065F]/g;
// Alif with madda above (U+0622); folded to plain alif (U+0627) together with the diacritics.
var ALIF_MADDA = /\u0622/g;
// Quranic annotation signs (pause marks, end of ayah U+06DD, rub el hizb U+06DE,
// place of sajdah U+06E9, ...): everything in U+06D6-U+06ED that is not a small letter.
var QURANIC_MARKERS = /[\u06D6-\u06E4\u06E9-\u06ED]/g;
// Small/superscript letters: superscript alef U+0670, small waw U+06E5, small yeh U+06E6,
// small high yeh U+06E7, small high noon U+06E8.
// QURANIC_MARKERS and SMALL_LETTERS together cover exactly U+0670 + U+06D6-U+06ED,
// so the default (both enabled) strips the same characters as before.
var SMALL_LETTERS = /[\u0670\u06E5-\u06E8]/g;
// Tatweel/kashida elongation character.
var TATWEEL = /\u0640/g;
// Ornate parentheses that bracket verse numbers.
var ORNATE_PARENS = /[\uFD3E\uFD3F]/g;
// Arabic-Indic digits (U+0660-U+0669) and Extended Arabic-Indic digits (U+06F0-U+06F9).
var ARABIC_DIGITS = /[\u0660-\u0669\u06F0-\u06F9]/g;
// Any run of whitespace.
var MULTI_WHITESPACE = /\s+/g;
// Every step is enabled unless the caller switches it off.
var DEFAULT_OPTIONS = {
  diacritics: true,
  markers: true,
  verseNumbers: true,
  tatweel: true,
  smallLetters: true,
  collapseWhitespace: true
};
/**
 * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.
 *
 * Each option controls exactly one step; `markers` and `smallLetters` are independent,
 * so disabling one of them keeps its characters even while the other stays enabled.
 *
 * @param text - The Arabic text to normalize (must be a string).
 * @param options - Per-step switches. Keys that are omitted or explicitly `undefined`
 *   fall back to the default (`true`).
 * @returns The normalized text.
 */
function normalize(text, options = {}) {
  // Drop keys whose value is `undefined` so they cannot override a default with a
  // falsy value; a `null` options object is tolerated like an empty one.
  const overrides = Object.fromEntries(
    Object.entries(options == null ? {} : options).filter(([, value]) => value !== undefined)
  );
  const opts = { ...DEFAULT_OPTIONS, ...overrides };
  let result = text;
  if (opts.diacritics) {
    result = result.replace(DIACRITICS, "");
    result = result.replace(ALIF_MADDA, "\u0627");
  }
  if (opts.markers) {
    result = result.replace(QURANIC_MARKERS, "");
  }
  if (opts.smallLetters) {
    result = result.replace(SMALL_LETTERS, "");
  }
  if (opts.verseNumbers) {
    result = result.replace(ORNATE_PARENS, "");
    result = result.replace(ARABIC_DIGITS, "");
  }
  if (opts.tatweel) {
    result = result.replace(TATWEEL, "");
  }
  if (opts.collapseWhitespace) {
    // Runs last so gaps left behind by removed markers/digits are closed as well.
    result = result.replace(MULTI_WHITESPACE, " ").trim();
  }
  return result;
}
var index_default = normalize;
+export {
+  index_default as default,
+  normalize
+};
+//# sourceMappingURL=index.js.map

package/dist/index.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/index.ts"],"sourcesContent":["export interface NormalizeOptions {\n /** Remove harakat/tashkeel diacritics (default: true) */\n diacritics?: boolean;\n /** Remove Quranic markers like sajdah, rub el hizb (default: true) */\n markers?: boolean;\n /** Remove verse numbers and their brackets (default: true) */\n verseNumbers?: boolean;\n /** Remove tatweel/kashida elongation (default: true) */\n tatweel?: boolean;\n /** Remove small/superscript letters (default: true) */\n smallLetters?: boolean;\n /** Collapse multiple whitespace to single space (default: true) */\n collapseWhitespace?: boolean;\n}\n\n// Arabic tashkeel/harakat: U+064B-U+065F\nconst DIACRITICS = /[\\u064B-\\u065F]/g;\n\n// Alif with madda above (آ U+0622) -> plain alif (ا U+0627)\nconst ALIF_MADDA = /\\u0622/g;\n\n// Superscript alif and other Quranic annotation marks: U+0670, U+06D6-U+06ED\nconst QURANIC_ANNOTATIONS = /[\\u0670\\u06D6-\\u06ED]/g;\n\n// Small letters (superscript): small high letters used in Quranic text\n// U+06E5 (small waw), U+06E6 (small ya), etc. 
are in QURANIC_ANNOTATIONS range\n\n// End of ayah U+06DD, start of rub el hizb U+06DE, place of sajdah U+06E9\n// These are included in QURANIC_ANNOTATIONS range (U+06D6-U+06ED)\n\n// Tatweel/kashida: U+0640\nconst TATWEEL = /\\u0640/g;\n\n// Ornate parentheses: U+FD3E, U+FD3F\nconst ORNATE_PARENS = /[\\uFD3E\\uFD3F]/g;\n\n// Arabic-Indic digits: U+0660-U+0669\n// Extended Arabic-Indic digits: U+06F0-U+06F9\nconst ARABIC_DIGITS = /[\\u0660-\\u0669\\u06F0-\\u06F9]/g;\n\n// Multiple whitespace\nconst MULTI_WHITESPACE = /\\s+/g;\n\nconst DEFAULT_OPTIONS: Required<NormalizeOptions> = {\n diacritics: true,\n markers: true,\n verseNumbers: true,\n tatweel: true,\n smallLetters: true,\n collapseWhitespace: true,\n};\n\n/**\n * Normalize Arabic/Quranic text by removing diacritics, markers, and decorative characters.\n *\n * @param text - The Arabic text to normalize\n * @param options - Normalization options (all default to true for full normalization)\n * @returns Normalized plain Arabic text\n *\n * @example\n * ```ts\n * import { normalize } from 'arabic-text-normalizer';\n *\n * // Full normalization (default)\n * normalize('بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ');\n * // => 'بسم الله الرحمن الرحيم'\n *\n * // Keep diacritics\n * normalize('بِسْمِ', { diacritics: false });\n * // => 'بِسْمِ'\n * ```\n */\nexport function normalize(text: string, options: NormalizeOptions = {}): string {\n const opts = { ...DEFAULT_OPTIONS, ...options };\n let result = text;\n\n if (opts.diacritics) {\n result = result.replace(DIACRITICS, \"\");\n result = result.replace(ALIF_MADDA, \"\\u0627\"); // آ -> ا\n }\n\n if (opts.markers || opts.smallLetters) {\n // QURANIC_ANNOTATIONS includes both markers and small letters\n result = result.replace(QURANIC_ANNOTATIONS, \"\");\n }\n\n if (opts.verseNumbers) {\n result = result.replace(ORNATE_PARENS, \"\");\n result = result.replace(ARABIC_DIGITS, \"\");\n }\n\n if (opts.tatweel) {\n result = result.replace(TATWEEL, \"\");\n }\n\n if 
(opts.collapseWhitespace) {\n result = result.replace(MULTI_WHITESPACE, \" \").trim();\n }\n\n return result;\n}\n\nexport default normalize;\n"],"mappings":";AAgBA,IAAM,aAAa;AAGnB,IAAM,aAAa;AAGnB,IAAM,sBAAsB;AAS5B,IAAM,UAAU;AAGhB,IAAM,gBAAgB;AAItB,IAAM,gBAAgB;AAGtB,IAAM,mBAAmB;AAEzB,IAAM,kBAA8C;AAAA,EAClD,YAAY;AAAA,EACZ,SAAS;AAAA,EACT,cAAc;AAAA,EACd,SAAS;AAAA,EACT,cAAc;AAAA,EACd,oBAAoB;AACtB;AAsBO,SAAS,UAAU,MAAc,UAA4B,CAAC,GAAW;AAC9E,QAAM,OAAO,EAAE,GAAG,iBAAiB,GAAG,QAAQ;AAC9C,MAAI,SAAS;AAEb,MAAI,KAAK,YAAY;AACnB,aAAS,OAAO,QAAQ,YAAY,EAAE;AACtC,aAAS,OAAO,QAAQ,YAAY,QAAQ;AAAA,EAC9C;AAEA,MAAI,KAAK,WAAW,KAAK,cAAc;AAErC,aAAS,OAAO,QAAQ,qBAAqB,EAAE;AAAA,EACjD;AAEA,MAAI,KAAK,cAAc;AACrB,aAAS,OAAO,QAAQ,eAAe,EAAE;AACzC,aAAS,OAAO,QAAQ,eAAe,EAAE;AAAA,EAC3C;AAEA,MAAI,KAAK,SAAS;AAChB,aAAS,OAAO,QAAQ,SAAS,EAAE;AAAA,EACrC;AAEA,MAAI,KAAK,oBAAoB;AAC3B,aAAS,OAAO,QAAQ,kBAAkB,GAAG,EAAE,KAAK;AAAA,EACtD;AAEA,SAAO;AACT;AAEA,IAAO,gBAAQ;","names":[]}

package/package.json ADDED Viewed

@@ -0,0 +1,55 @@
+{
+  "name": "arabic-text-normalizer",
+  "version": "1.0.0",
+  "description": "Normalize Quranic/Arabic text by removing diacritics, markers, and decorative characters",
+  "type": "module",
+  "main": "./dist/index.cjs",
+  "module": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "import": {
+        "types": "./dist/index.d.ts",
+        "default": "./dist/index.js"
+      },
+      "require": {
+        "types": "./dist/index.d.cts",
+        "default": "./dist/index.cjs"
+      }
+    }
+  },
+  "files": [
+    "dist"
+  ],
+  "scripts": {
+    "build": "tsup",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "prepublishOnly": "npm run build"
+  },
+  "keywords": [
+    "arabic",
+    "quran",
+    "normalize",
+    "diacritics",
+    "tashkeel",
+    "harakat",
+    "text-processing"
+  ],
+  "author": "",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/yazinsai/arabic-text-normalizer.git"
+  },
+  "bugs": {
+    "url": "https://github.com/yazinsai/arabic-text-normalizer/issues"
+  },
+  "homepage": "https://github.com/yazinsai/arabic-text-normalizer#readme",
+  "devDependencies": {
+    "@types/node": "^25.2.0",
+    "tsup": "^8.5.1",
+    "typescript": "^5.9.3",
+    "vitest": "^4.0.18"
+  }
+}