npm - shamela - Versions diffs - 1.3.2 → 1.3.4 - Mend

shamela 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/README.md +204 -55
package/dist/content-B60R0uYQ.js +8 -0
package/dist/content-B60R0uYQ.js.map +1 -0
package/dist/content-CwjMtCQl.d.ts +54 -0
package/dist/content.d.ts +2 -0
package/dist/content.js +1 -0
package/dist/index.d.ts +4 -241
package/dist/index.js +7 -11
package/dist/index.js.map +1 -1
package/dist/types-C693UiUs.d.ts +226 -0
package/dist/types.d.ts +2 -0
package/dist/types.js +0 -0
package/package.json +24 -14

package/README.md CHANGED Viewed

@@ -15,21 +15,48 @@
 A universal TypeScript library for accessing and downloading Maktabah Shamela v4 APIs. The package runs in both Node.js and modern browsers, providing ergonomic helpers to interact with the Shamela API, download master and book databases, and retrieve book data programmatically.
+## Features
+- 🚀 **Full data lifecycle** – fetch metadata, download master and book databases, and query the results entirely in-memory.
+- 🔐 **Runtime configuration** – configure API credentials, WASM paths, and custom fetch/logging implementations at runtime.
+- 🧠 **Content tooling** – parse, sanitise, and post-process Arabic book content with utilities tailored for Shamela formatting.
+- 🌐 **Environment aware** – automatically selects optimal sql.js WASM bundles for Node.js, browsers, and bundled runtimes.
+- 🧪 **Well-tested** – comprehensive unit and end-to-end coverage to ensure reliable integrations.
 ## Table of Contents
+- [Features](#features)
 - [Installation](#installation)
 - [Quick Start](#quick-start)
   - [Standard Node.js](#standard-nodejs)
   - [Next.js / Bundled Environments](#nextjs--bundled-environments)
-  - [Browser](#browser)
+  - [Browser (Full API)](#browser-full-api)
+  - [Browser (Content Utilities Only)](#browser-content-utilities-only)
 - [API Reference](#api-reference)
-  - [getMasterMetadata](#getmastermetadata)
-  - [downloadMasterDatabase](#downloadmasterdatabase)
-  - [getBookMetadata](#getbookmetadata)
-  - [downloadBook](#downloadbook)
-  - [getBook](#getbook)
-  - [getMaster](#getmaster)
-  - [getCoverUrl](#getcoverurl)
+  - [Configuration](#configuration)
+    - [configure](#configure)
+    - [resetConfig](#resetconfig)
+    - [getConfig](#getconfig)
+    - [getConfigValue](#getconfigvalue)
+    - [requireConfigValue](#requireconfigvalue)
+  - [Metadata & Downloads](#metadata--downloads)
+    - [getMasterMetadata](#getmastermetadata)
+    - [downloadMasterDatabase](#downloadmasterdatabase)
+    - [getBookMetadata](#getbookmetadata)
+    - [downloadBook](#downloadbook)
+    - [getCoverUrl](#getcoverurl)
+  - [Data Access](#data-access)
+    - [getBook](#getbook)
+    - [getMaster](#getmaster)
+  - [Content Utilities](#content-utilities)
+    - [parseContentRobust](#parsecontentrobust)
+    - [sanitizePageContent](#sanitizepagecontent)
+    - [splitPageBodyFromFooter](#splitpagebodyfromfooter)
+    - [removeArabicNumericPageMarkers](#removearabicnumericpagemarkers)
+    - [removeTagsExceptSpan](#removetagsexceptspan)
+  - [Supporting Utilities](#supporting-utilities)
+    - [buildUrl](#buildurl)
+    - [httpsGet](#httpsget)
 - [Examples](#examples)
 - [Data Structures](#data-structures)
 - [Next.js Demo](#nextjs-demo)
@@ -129,7 +156,7 @@ export async function downloadBookAction(bookId: number) {
 **Important:** Only import `shamela` in server-side code (Server Actions, API Routes, or Server Components). Never import in client components or `layout.tsx`.
-### Browser
+### Browser (Full API)
 In browsers, the library automatically uses a CDN-hosted WASM file:
@@ -146,21 +173,104 @@ configure({
 const book = await getBook(26592);
 ```
+### Browser (Content Utilities Only)
+If you only need the content processing utilities (sanitization, parsing, etc.) without the database functionality, use the lightweight `shamela/content` export:
+```typescript
+import {
+  sanitizePageContent,
+  splitPageBodyFromFooter,
+  removeTagsExceptSpan,
+  parseContentRobust,
+} from 'shamela/content';
+// Process content without loading sql.js (~1.5KB gzipped vs ~900KB)
+const clean = removeTagsExceptSpan(sanitizePageContent(rawContent));
+const [body, footnotes] = splitPageBodyFromFooter(clean);
+```
+This is ideal for:
+- Client-side React/Next.js components
+- Bundled environments where you want to avoid sql.js WASM
+- Processing pre-downloaded book data
+**Available exports from `shamela/content`:**
+- `parseContentRobust` - Parse HTML into structured lines
+- `sanitizePageContent` - Normalize Arabic text
+- `splitPageBodyFromFooter` - Separate body from footnotes
+- `removeArabicNumericPageMarkers` - Remove page markers
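+- `removeTagsExceptSpan` - Strip anchor and hadeeth tags, keeping spans
+- `normalizeHtml` - Convert hadeeth tags into `<span class="hadeeth">` elements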
 ## API Reference
-### getMasterMetadata
+### Configuration
+#### configure
-Fetches metadata for the master database.
+Initialises runtime configuration including API credentials, custom fetch implementations, sql.js WASM location, and logger overrides.
 ```typescript
-getMasterMetadata(version?: number): Promise<GetMasterMetadataResponsePayload>
+configure(options: ConfigureOptions): void
 ```
-- `version` (optional): The version number to check for updates (defaults to 0)
+**Example:**
-**Returns:** Promise resolving to master database metadata including download URL and version
+```typescript
+import { configure } from 'shamela';
-**Example:**
+configure({
+  apiKey: process.env.SHAMELA_API_KEY!,
+  booksEndpoint: process.env.SHAMELA_BOOKS_ENDPOINT!,
+  masterPatchEndpoint: process.env.SHAMELA_MASTER_ENDPOINT!,
+});
+```
+#### resetConfig
+Clears runtime overrides and restores the default silent logger.
+```typescript
+resetConfig(): void
+```
+Use this in tests or long-running processes when you need a clean configuration slate.
+#### getConfig
+Returns the merged configuration snapshot combining runtime overrides with environment variables.
+```typescript
+getConfig(): ShamelaConfig
+```
+#### getConfigValue
+Reads a single configuration value without throwing when it is missing.
+```typescript
+getConfigValue<Key extends ShamelaConfigKey>(key: Key): ShamelaConfig[Key] | undefined
+```
+#### requireConfigValue
+Retrieves a configuration entry and throws an error if the value is not present.
+```typescript
+requireConfigValue(key: Exclude<ShamelaConfigKey, 'fetchImplementation'>): string
+```
+### Metadata & Downloads
+#### getMasterMetadata
+Fetches metadata for the master database, including download URLs for the latest patches.
+```typescript
+getMasterMetadata(version?: number): Promise<GetMasterMetadataResponsePayload>
+```
+- `version` (optional): The version number to check for updates (defaults to 0)
 ```typescript
 const metadata = await getMasterMetadata();
@@ -171,21 +281,17 @@ console.log(metadata.version); // Version number
 const updates = await getMasterMetadata(5);
 ```
-### downloadMasterDatabase
+#### downloadMasterDatabase
-Downloads the master database containing all books, authors, and categories.
+Downloads the master database containing all books, authors, and categories and writes it to disk or a custom writer.
 ```typescript
 downloadMasterDatabase(options: DownloadMasterOptions): Promise<string>
 ```
-- `options.masterMetadata` (optional): Pre-fetched metadata
+- `options.masterMetadata` (optional): Pre-fetched metadata to avoid an extra HTTP call
 - `options.outputFile.path`: Output file path (`.db`, `.sqlite`, or `.json`)
-**Returns:** Promise resolving to the output file path
-**Example:**
 ```typescript
 // Download as SQLite database
 await downloadMasterDatabase({
@@ -198,9 +304,9 @@ await downloadMasterDatabase({
 });
 ```
-### getBookMetadata
+#### getBookMetadata
-Fetches metadata for a specific book.
+Fetches metadata for a specific book, including patch release information.
 ```typescript
 getBookMetadata(id: number, options?: GetBookMetadataOptions): Promise<GetBookMetadataResponsePayload>
@@ -210,32 +316,24 @@ getBookMetadata(id: number, options?: GetBookMetadataOptions): Promise<GetBookMe
 - `options.majorVersion` (optional): Major version to check
 - `options.minorVersion` (optional): Minor version to check
-**Returns:** Promise resolving to book metadata
-**Example:**
 ```typescript
 const metadata = await getBookMetadata(26592);
 console.log(metadata.majorReleaseUrl);
 console.log(metadata.minorReleaseUrl);
 ```
-### downloadBook
+#### downloadBook
-Downloads and processes a book from Shamela.
+Downloads and processes a book from Shamela, writing it to JSON or SQLite on disk.
 ```typescript
 downloadBook(id: number, options: DownloadBookOptions): Promise<string>
 ```
 - `id`: Book identifier
-- `options.bookMetadata` (optional): Pre-fetched metadata
+- `options.bookMetadata` (optional): Pre-fetched metadata to avoid re-fetching
 - `options.outputFile.path`: Output file path (`.db`, `.sqlite`, or `.json`)
-**Returns:** Promise resolving to the output file path
-**Example:**
 ```typescript
 // Download as JSON
 await downloadBook(26592, {
@@ -248,19 +346,28 @@ await downloadBook(26592, {
 });
 ```
-### getBook
+#### getCoverUrl
-Retrieves complete book data as a JavaScript object.
+Generates the URL for a book's cover image using the configured Shamela host.
 ```typescript
-getBook(id: number): Promise<BookData>
+getCoverUrl(bookId: number): string
 ```
-- `id`: Book identifier
+```typescript
+const coverUrl = getCoverUrl(26592);
+// Returns: "https://shamela.ws/covers/26592.jpg"
+```
-**Returns:** Promise resolving to book data with pages and titles
+### Data Access
-**Example:**
+#### getBook
+Retrieves complete book data as a JavaScript object, returning pages and title entries.
+```typescript
+getBook(id: number): Promise<BookData>
+```
 ```typescript
 const book = await getBook(26592);
@@ -269,18 +376,14 @@ console.log(book.titles?.length);
 console.log(book.pages[0].content);
 ```
-### getMaster
+#### getMaster
-Retrieves the entire master dataset as a JavaScript object.
+Retrieves the entire master dataset as a JavaScript object, including version information.
 ```typescript
 getMaster(): Promise<MasterData>
 ```
-**Returns:** Promise resolving to master data with authors, books, categories, and version
-**Example:**
 ```typescript
 const master = await getMaster();
 console.log(master.version);
@@ -289,23 +392,69 @@ console.log(master.authors.length);
 console.log(master.categories.length);
 ```
-### getCoverUrl
+### Content Utilities
+#### parseContentRobust
-Generates the URL for a book's cover image.
+Parses Shamela HTML snippets into structured lines while preserving title hierarchy and Arabic punctuation.
 ```typescript
-getCoverUrl(bookId: number): string
+parseContentRobust(content: string): Line[]
 ```
-- `bookId`: Book identifier
+```typescript
+const lines = parseContentRobust(rawHtml);
+lines.forEach((line) => console.log(line.id, line.text));
+```
-**Returns:** Cover image URL
+#### sanitizePageContent
-**Example:**
+Normalises page content by applying regex-based replacement rules tuned for Shamela sources.
 ```typescript
-const coverUrl = getCoverUrl(26592);
-// Returns: "https://shamela.ws/covers/26592.jpg"
+sanitizePageContent(text: string, rules?: Record<string, string>): string
+```
+#### splitPageBodyFromFooter
+Separates page body content from trailing footnotes using the default Shamela marker.
+```typescript
+splitPageBodyFromFooter(content: string, marker?: string): readonly [string, string]
+```
+#### removeArabicNumericPageMarkers
+Replaces Arabic-Indic numeral markers enclosed in ⦗ ⦘ (commonly used to denote page numbers), together with adjacent spacing, with a single space.
+```typescript
+removeArabicNumericPageMarkers(text: string): string
+```
+#### removeTagsExceptSpan
+Strips anchor and hadeeth tags while preserving nested `<span>` elements.
+```typescript
+removeTagsExceptSpan(content: string): string
+```
+### Supporting Utilities
+#### buildUrl
+Constructs authenticated API URLs with query parameters and optional API key injection.
+```typescript
+buildUrl(endpoint: string, queryParams: Record<string, any>, useAuth?: boolean): URL
+```
+#### httpsGet
+Makes HTTPS GET requests using the configured fetch implementation, automatically parsing JSON responses and returning binary data otherwise.
+```typescript
+httpsGet<T extends Uint8Array | Record<string, any>>(url: string | URL, options?: { fetchImpl?: typeof fetch }): Promise<T>
 ```
 ## Examples

package/dist/content-B60R0uYQ.js ADDED Viewed

@@ -0,0 +1,8 @@
/**
 * Content-processing helpers for Shamela page text (readable form of the bundled chunk).
 *
 * The single-letter export aliases at the bottom are the ones `content.js` imports,
 * so they are kept exactly as generated.
 */

/** The default version number for master metadata. */
const DEFAULT_MASTER_METADATA_VERSION = 0;

/**
 * Default rules to sanitize page content.
 * Each key is compiled as a regular-expression source; each value is its replacement.
 * Rules are applied in insertion order.
 */
const DEFAULT_SANITIZATION_RULES = {
    '<img[^>]*>>': '',
    舄: '',
    '﵀': 'رَحِمَهُ ٱللَّٰهُ',
    '﵁': 'رضي الله عنه',
    '﵂': 'رَضِيَ ٱللَّٰهُ عَنْهَا',
    '﵃': 'رَضِيَ اللَّهُ عَنْهُمْ',
    '﵄': 'رَضِيَ ٱللَّٰهُ عَنْهُمَا',
    '﵅': 'رَضِيَ اللَّهُ عَنْهُنَّ',
    '﵇': 'عَلَيْهِ ٱلسَّلَٰمُ',
    '﵈': 'عَلَيْهِمُ السَّلامُ',
    '﵊': 'عليه الصلاة والسلام',
    '﵌': 'صلى الله عليه وآله وسلم',
    '﵍': 'عَلَيْهِ ٱلسَّلَٰمُ',
    '﵎': 'تبارك وتعالى',
    '﵏': 'رَحِمَهُمُ ٱللَّٰهُ',
    '﷽': '',
    '﷿': 'عَزَّ وَجَلَّ',
};

/** Matches a line made up solely of closing brackets/quotes and (Latin or Arabic) punctuation. */
const PUNCT_ONLY = /^[)\]\u00BB"”'’.,?!:\u061B\u060C\u061F\u06D4\u2026]+$/;

/**
 * Appends punctuation-only lines to the line that precedes them.
 *
 * Note: the preceding line object is mutated in place (its `text` grows).
 *
 * @param lines - Line candidates in document order
 * @returns A new array without the dangling punctuation entries
 */
const mergeDanglingPunctuation = (lines) => {
    const out = [];
    for (const item of lines) {
        const last = out[out.length - 1];
        if (last && PUNCT_ONLY.test(item.text)) {
            last.text += item.text;
        } else {
            out.push(item);
        }
    }
    return out;
};

/**
 * Splits raw text on line breaks (CRLF, CR or LF), trimming each line and dropping empty ones.
 *
 * @param text - Raw text
 * @returns Trimmed, non-empty line strings
 */
const splitIntoLines = (text) =>
    text
        .replace(/\r\n/g, '\n')
        .replace(/\r/g, '\n')
        .split('\n')
        .map((line) => line.trim())
        .filter(Boolean);

/**
 * Wraps each non-empty line of plain text in a line object without an id.
 *
 * @param content - Plain text content
 * @returns Line objects of the form `{ text }`
 */
const processTextContent = (content) => splitIntoLines(content).map((line) => ({ text: line }));

/**
 * Extracts an attribute value from a raw HTML tag string.
 *
 * Supports double-quoted, single-quoted and unquoted values. An unquoted value ends at the
 * first whitespace character or `>`.
 *
 * @param tag - Raw tag source, e.g. `<span id="toc-1">`
 * @param name - Attribute name to look for (matched case-insensitively)
 * @returns The attribute value, or undefined when the attribute is not found
 */
const extractAttribute = (tag, name) => {
    // Fix: the unquoted alternative used to be `[^s>]+` (missing backslash), which cut values at
    // the letter "s" and let them run across whitespace into the following attribute.
    const pattern = new RegExp(`${name}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, 'i');
    const match = tag.match(pattern);
    if (!match) {
        return undefined;
    }
    return match[2] ?? match[3] ?? match[4];
};

/**
 * Breaks an HTML fragment into text, start-tag and end-tag tokens.
 *
 * Only the `id` and `data-type` attributes are captured for start tags.
 *
 * @param html - HTML fragment
 * @returns Tokens in document order
 */
const tokenize = (html) => {
    const tokens = [];
    const tagRegex = /<[^>]+>/g;
    let lastIndex = 0;

    for (let match = tagRegex.exec(html); match; match = tagRegex.exec(html)) {
        // Text between the previous tag and this one
        if (match.index > lastIndex) {
            tokens.push({ type: 'text', value: html.slice(lastIndex, match.index) });
        }

        const raw = match[0];
        const isEnd = /^<\//.test(raw);
        const nameMatch = raw.match(/^<\/?\s*([a-zA-Z0-9:-]+)/);
        const name = nameMatch ? nameMatch[1].toLowerCase() : '';

        if (isEnd) {
            tokens.push({ name, type: 'end' });
        } else {
            const attributes = {
                id: extractAttribute(raw, 'id'),
                'data-type': extractAttribute(raw, 'data-type'),
            };
            tokens.push({ attributes, name, type: 'start' });
        }

        lastIndex = tagRegex.lastIndex;
    }

    // Trailing text after the last tag
    if (lastIndex < html.length) {
        tokens.push({ type: 'text', value: html.slice(lastIndex) });
    }

    return tokens;
};

/**
 * Builds a line object from accumulated text.
 *
 * @param text - Accumulated text for the line
 * @param id - Optional title id
 * @returns `{ id, text }` / `{ text }` with trimmed text, or null when the text is blank
 */
const createLine = (text, id) => {
    const trimmed = text.trim();
    if (!trimmed) {
        return null;
    }
    return id ? { id, text: trimmed } : { text: trimmed };
};

/**
 * Finds the id of the innermost open title span that has an id.
 *
 * @param spanStack - Currently open spans, outermost first
 * @returns The title id, or undefined when no open title span carries one
 */
const getActiveTitleId = (spanStack) => {
    for (let i = spanStack.length - 1; i >= 0; i--) {
        const entry = spanStack[i];
        if (entry.isTitle && entry.id) {
            return entry.id;
        }
    }
    return undefined;
};

/**
 * Feeds a text token into the parser state, emitting a line at every line break.
 *
 * @param raw - Text token value
 * @param state - Parser state (`currentText`, `currentId`, `result`, `spanStack`), mutated in place
 */
const processTextWithLineBreaks = (raw, state) => {
    if (!raw) {
        return;
    }

    const parts = raw.split('\n');

    for (let i = 0; i < parts.length; i++) {
        // Crossing a line break: flush what has been collected so far
        if (i > 0) {
            const line = createLine(state.currentText, state.currentId);
            if (line) {
                state.result.push(line);
            }
            state.currentText = '';

            // Keep the title id only while still inside a title span
            state.currentId = getActiveTitleId(state.spanStack) || undefined;
        }

        if (parts[i]) {
            state.currentText += parts[i];
        }
    }
};

/**
 * Registers an opening `<span>` on the span stack and, for title spans, sets the current id.
 *
 * @param token - Start token with its captured attributes
 * @param state - Parser state (`currentId`, `spanStack`), mutated in place
 */
const handleSpanStart = (token, state) => {
    const isTitle = token.attributes['data-type'] === 'title';

    let id;
    if (isTitle) {
        id = (token.attributes.id ?? '').replace(/^toc-/, '');
    }

    state.spanStack.push({ id, isTitle });

    // The first title span on the current line wins
    if (isTitle && id && !state.currentId) {
        state.currentId = id;
    }
};

/**
 * Parses Shamela HTML content into structured lines while preserving headings.
 *
 * @param content - The raw HTML markup representing a page
 * @returns Line objects (`{ text }` or `{ id, text }`) in document order
 */
const parseContentRobust = (content) => {
    // Normalize line endings first
    content = content.replace(/\r\n/g, '\n').replace(/\r/g, '\n');

    // Fast path when there are no span tags at all
    if (!/<span[^>]*>/i.test(content)) {
        return mergeDanglingPunctuation(processTextContent(content));
    }

    const tokens = tokenize(`<root>${content}</root>`);
    const state = {
        currentId: undefined,
        currentText: '',
        result: [],
        spanStack: [],
    };

    for (const token of tokens) {
        if (token.type === 'text') {
            processTextWithLineBreaks(token.value, state);
        } else if (token.type === 'start' && token.name === 'span') {
            handleSpanStart(token, state);
        } else if (token.type === 'end' && token.name === 'span') {
            // Closing a span does not end the line; trailing text stays on the same line
            state.spanStack.pop();
        }
    }

    // Flush any trailing text
    const finalLine = createLine(state.currentText, state.currentId);
    if (finalLine) {
        state.result.push(finalLine);
    }

    return mergeDanglingPunctuation(state.result).filter((line) => line.text.length > 0);
};

/** Default rules compiled once at module load so the common call path does not rebuild them. */
const DEFAULT_COMPILED_RULES = Object.entries(DEFAULT_SANITIZATION_RULES).map(([pattern, replacement]) => ({
    regex: new RegExp(pattern, 'g'),
    replacement,
}));

/**
 * Compiles sanitisation rules into global regular expressions.
 *
 * @param rules - Map of regex source to replacement string
 * @returns Compiled rules; the precompiled list when `rules` is the default object itself
 */
const getCompiledRules = (rules) => {
    if (rules === DEFAULT_SANITIZATION_RULES) {
        return DEFAULT_COMPILED_RULES;
    }

    const compiled = [];
    for (const pattern in rules) {
        compiled.push({
            regex: new RegExp(pattern, 'g'),
            replacement: rules[pattern],
        });
    }
    return compiled;
};

/**
 * Sanitises page content by applying regex replacement rules in order.
 *
 * @param text - The text to clean
 * @param rules - Optional custom replacements (regex source -> replacement); defaults to the built-in rules
 * @returns The sanitised content
 */
const sanitizePageContent = (text, rules = DEFAULT_SANITIZATION_RULES) => {
    const compiledRules = getCompiledRules(rules);

    let content = text;
    for (let i = 0; i < compiledRules.length; i++) {
        const { regex, replacement } = compiledRules[i];
        content = content.replace(regex, replacement);
    }
    return content;
};

/**
 * Splits a page body from its trailing footnotes at the first occurrence of a marker string.
 *
 * @param content - Combined body and footnote text
 * @param footnoteMarker - Marker indicating the start of footnotes (not included in either part)
 * @returns `[body, footnote]`; the footnote is an empty string when the marker is absent
 */
const splitPageBodyFromFooter = (content, footnoteMarker = '_________') => {
    let footnote = '';
    const indexOfFootnote = content.indexOf(footnoteMarker);

    if (indexOfFootnote >= 0) {
        footnote = content.slice(indexOfFootnote + footnoteMarker.length);
        content = content.slice(0, indexOfFootnote);
    }

    return [content, footnote];
};

/**
 * Replaces Arabic-Indic numeral page markers enclosed in ⦗ ⦘ with a single space.
 * Up to two preceding and one following space / carriage-return characters are consumed too.
 *
 * @param text - Text potentially containing page markers
 * @returns The text with numeric markers replaced by a single space
 */
const removeArabicNumericPageMarkers = (text) =>
    text.replace(/(?: |\r){0,2}⦗[\u0660-\u0669]+⦘(?: |\r)?/g, ' ');

/**
 * Unwraps anchors and removes hadeeth tags; all other markup (including spans) is left as is.
 *
 * @param content - HTML string containing various tags
 * @returns The content without `<a>` wrappers and hadeeth tags
 */
const removeTagsExceptSpan = (content) => {
    // Drop the <a ...> and </a> wrappers but keep the text between them
    content = content.replace(/<a[^>]*>(.*?)<\/a>/gs, '$1');

    // Remove opening (plain or numbered) and closing hadeeth tags
    content = content.replace(/<hadeeth[^>]*>|<\/hadeeth>|<hadeeth-\d+>/gs, '');

    return content;
};

/**
 * Normalizes Shamela HTML for CSS styling:
 * - Converts <hadeeth-N> to <span class="hadeeth">
 * - Converts </hadeeth> or standalone <hadeeth> to </span>
 *
 * @param html - HTML string to normalise
 * @returns The HTML with hadeeth tags rewritten as spans
 */
const normalizeHtml = (html) =>
    html.replace(/<hadeeth-\d+>/gi, '<span class="hadeeth">').replace(/<\s*\/?\s*hadeeth\s*>/gi, '</span>');

export {
    sanitizePageContent as a,
    removeTagsExceptSpan as i,
    parseContentRobust as n,
    splitPageBodyFromFooter as o,
    removeArabicNumericPageMarkers as r,
    DEFAULT_MASTER_METADATA_VERSION as s,
    normalizeHtml as t,
};
+//# sourceMappingURL=content-B60R0uYQ.js.map

package/dist/content-B60R0uYQ.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"content-B60R0uYQ.js","names":["DEFAULT_SANITIZATION_RULES: Record<string, string>","out: Line[]","tokens: Token[]","match: RegExpExecArray | null","attributes: Record<string, string | undefined>","id: string | undefined"],"sources":["../src/utils/constants.ts","../src/content.ts"],"sourcesContent":["/**\n * The default version number for master metadata.\n * @constant {number}\n */\nexport const DEFAULT_MASTER_METADATA_VERSION = 0;\n\n/**\n * Placeholder value used to represent unknown or missing data.\n * @constant {string}\n */\nexport const UNKNOWN_VALUE_PLACEHOLDER = '99999';\n\n/**\n * Default rules to sanitize page content.\n */\nexport const DEFAULT_SANITIZATION_RULES: Record<string, string> = {\n '<img[^>]*>>': '',\n 舄: '',\n '﵀': 'رَحِمَهُ ٱللَّٰهُ',\n '﵁': 'رضي الله عنه',\n '﵂': 'رَضِيَ ٱللَّٰهُ عَنْهَا',\n '﵃': 'رَضِيَ اللَّهُ عَنْهُمْ',\n '﵄': 'رَضِيَ ٱللَّٰهُ عَنْهُمَا',\n '﵅': 'رَضِيَ اللَّهُ عَنْهُنَّ',\n '﵇': 'عَلَيْهِ ٱلسَّلَٰمُ',\n '﵈': 'عَلَيْهِمُ السَّلامُ',\n '﵊': 'عليه الصلاة والسلام',\n '﵌': 'صلى الله عليه وآله وسلم',\n '﵍': 'عَلَيْهِ ٱلسَّلَٰمُ',\n '﵎': 'تبارك وتعالى',\n '﵏': 'رَحِمَهُمُ ٱللَّٰهُ',\n '﷽': '',\n '﷿': 'عَزَّ وَجَلَّ',\n};\n","import { DEFAULT_SANITIZATION_RULES } from './utils/constants';\n\nexport type Line = {\n id?: string;\n text: string;\n};\n\nconst PUNCT_ONLY = /^[)\\]\\u00BB\"”'’.,?!:\\u061B\\u060C\\u061F\\u06D4\\u2026]+$/;\n\n/**\n * Merges punctuation-only lines into the preceding title when appropriate.\n *\n * @param lines - The processed line candidates to normalise\n * @returns A new array where dangling punctuation fragments are appended to titles\n */\nconst mergeDanglingPunctuation = (lines: Line[]): Line[] => {\n const out: Line[] = [];\n for (const item of lines) {\n const last = out[out.length - 1];\n if (last && PUNCT_ONLY.test(item.text)) {\n last.text += item.text;\n } else {\n out.push(item);\n }\n }\n return out;\n};\n\n/**\n * Normalises raw text into discrete line entries.\n 
*\n * @param text - Raw book content potentially containing inconsistent breaks\n * @returns An array of trimmed line strings with empty entries removed\n */\nconst splitIntoLines = (text: string) => {\n const normalized = text.replace(/\\r\\n/g, '\\n').replace(/\\r/g, '\\n');\n\n return normalized\n .split('\\n')\n .map((line) => line.trim())\n .filter(Boolean);\n};\n\n/**\n * Converts plain text content into {@link Line} objects without title metadata.\n *\n * @param content - The text content to split into line structures\n * @returns A {@link Line} array wrapping each detected sentence fragment\n */\nconst processTextContent = (content: string): Line[] => {\n return splitIntoLines(content).map((line) => ({ text: line }));\n};\n\n/**\n * Extracts an attribute value from the provided HTML tag string.\n *\n * @param tag - Raw HTML tag source\n * @param name - Attribute name to locate\n * @returns The attribute value when found; otherwise undefined\n */\nconst extractAttribute = (tag: string, name: string): string | undefined => {\n const pattern = new RegExp(`${name}\\\\s*=\\\\s*(\"([^\"]*)\"|'([^']*)'|([^s>]+))`, 'i');\n const match = tag.match(pattern);\n if (!match) {\n return undefined;\n }\n return match[2] ?? match[3] ?? 
match[4];\n};\n\ntype Token =\n | { type: 'text'; value: string }\n | { type: 'start'; name: string; attributes: Record<string, string | undefined> }\n | { type: 'end'; name: string };\n\n/**\n * Breaks the provided HTML fragment into structural tokens.\n *\n * @param html - HTML fragment containing book content markup\n * @returns A token stream describing text and span boundaries\n */\nconst tokenize = (html: string): Token[] => {\n const tokens: Token[] = [];\n const tagRegex = /<[^>]+>/g;\n let lastIndex = 0;\n let match: RegExpExecArray | null;\n match = tagRegex.exec(html);\n\n while (match) {\n if (match.index > lastIndex) {\n tokens.push({ type: 'text', value: html.slice(lastIndex, match.index) });\n }\n\n const raw = match[0];\n const isEnd = /^<\\//.test(raw);\n const nameMatch = raw.match(/^<\\/?\\s*([a-zA-Z0-9:-]+)/);\n const name = nameMatch ? nameMatch[1].toLowerCase() : '';\n\n if (isEnd) {\n tokens.push({ name, type: 'end' });\n } else {\n const attributes: Record<string, string | undefined> = {};\n attributes.id = extractAttribute(raw, 'id');\n attributes['data-type'] = extractAttribute(raw, 'data-type');\n tokens.push({ attributes, name, type: 'start' });\n }\n\n lastIndex = tagRegex.lastIndex;\n match = tagRegex.exec(html);\n }\n\n if (lastIndex < html.length) {\n tokens.push({ type: 'text', value: html.slice(lastIndex) });\n }\n\n return tokens;\n};\n\n/**\n * Pushes the accumulated text as a new line to the result array.\n */\nconst createLine = (text: string, id?: string): Line | null => {\n const trimmed = text.trim();\n if (!trimmed) {\n return null;\n }\n return id ? 
{ id, text: trimmed } : { text: trimmed };\n};\n\n/**\n * Finds the active title ID from the span stack.\n */\nconst getActiveTitleId = (spanStack: Array<{ isTitle: boolean; id?: string }>): string | undefined => {\n for (let i = spanStack.length - 1; i >= 0; i--) {\n const entry = spanStack[i];\n if (entry.isTitle && entry.id) {\n return entry.id;\n }\n }\n};\n\n/**\n * Processes text content by handling line breaks and maintaining title context.\n */\nconst processTextWithLineBreaks = (\n raw: string,\n state: {\n currentText: string;\n currentId?: string;\n result: Line[];\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n if (!raw) {\n return;\n }\n\n const parts = raw.split('\\n');\n\n for (let i = 0; i < parts.length; i++) {\n // Push previous line when crossing a line break\n if (i > 0) {\n const line = createLine(state.currentText, state.currentId);\n if (line) {\n state.result.push(line);\n }\n state.currentText = '';\n\n // Preserve title ID if still inside a title span\n const activeTitleId = getActiveTitleId(state.spanStack);\n state.currentId = activeTitleId || undefined;\n }\n\n // Append the text part\n if (parts[i]) {\n state.currentText += parts[i];\n }\n }\n};\n\n/**\n * Handles the start of a span tag, updating the stack and current ID.\n */\nconst handleSpanStart = (\n token: { attributes: Record<string, string | undefined> },\n state: {\n currentId?: string;\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n const dataType = token.attributes['data-type'];\n const isTitle = dataType === 'title';\n\n let id: string | undefined;\n if (isTitle) {\n const rawId = token.attributes.id ?? 
'';\n id = rawId.replace(/^toc-/, '');\n }\n\n state.spanStack.push({ id, isTitle });\n\n // First title span on the current physical line wins\n if (isTitle && id && !state.currentId) {\n state.currentId = id;\n }\n};\n\n/**\n * Parses Shamela HTML content into structured lines while preserving headings.\n *\n * @param content - The raw HTML markup representing a page\n * @returns An array of {@link Line} objects containing text and optional IDs\n */\nexport const parseContentRobust = (content: string): Line[] => {\n // Normalize line endings first\n content = content.replace(/\\r\\n/g, '\\n').replace(/\\r/g, '\\n');\n\n // Fast path when there are no span tags at all\n if (!/<span[^>]*>/i.test(content)) {\n return mergeDanglingPunctuation(processTextContent(content));\n }\n\n const tokens = tokenize(`<root>${content}</root>`);\n const state = {\n currentId: undefined as string | undefined,\n currentText: '',\n result: [] as Line[],\n spanStack: [] as Array<{ isTitle: boolean; id?: string }>,\n };\n\n // Process all tokens\n for (const token of tokens) {\n if (token.type === 'text') {\n processTextWithLineBreaks(token.value, state);\n } else if (token.type === 'start' && token.name === 'span') {\n handleSpanStart(token, state);\n } else if (token.type === 'end' && token.name === 'span') {\n // Closing a span does NOT end the line; trailing text stays on the same line\n state.spanStack.pop();\n }\n }\n\n // Flush any trailing text\n const finalLine = createLine(state.currentText, state.currentId);\n if (finalLine) {\n state.result.push(finalLine);\n }\n\n // Merge punctuation-only lines and drop empties\n return mergeDanglingPunctuation(state.result).filter((line) => line.text.length > 0);\n};\n\nconst DEFAULT_COMPILED_RULES = Object.entries(DEFAULT_SANITIZATION_RULES).map(([pattern, replacement]) => ({\n regex: new RegExp(pattern, 'g'),\n replacement,\n}));\n\n/**\n * Compiles sanitisation rules into RegExp objects for reuse.\n *\n * @param rules - Key/value 
replacements used during sanitisation\n * @returns A list of compiled regular expression rules\n */\nconst getCompiledRules = (rules: Record<string, string>) => {\n if (rules === DEFAULT_SANITIZATION_RULES) {\n return DEFAULT_COMPILED_RULES;\n }\n\n const compiled = [];\n for (const pattern in rules) {\n compiled.push({\n regex: new RegExp(pattern, 'g'),\n replacement: rules[pattern],\n });\n }\n return compiled;\n};\n\n/**\n * Sanitises page content by applying regex replacement rules.\n *\n * @param text - The text to clean\n * @param rules - Optional custom replacements, defaults to {@link DEFAULT_SANITIZATION_RULES}\n * @returns The sanitised content\n */\nexport const sanitizePageContent = (\n text: string,\n rules: Record<string, string> = DEFAULT_SANITIZATION_RULES,\n): string => {\n const compiledRules = getCompiledRules(rules);\n\n let content = text;\n for (let i = 0; i < compiledRules.length; i++) {\n const { regex, replacement } = compiledRules[i];\n content = content.replace(regex, replacement);\n }\n return content;\n};\n\n/**\n * Splits a page body from its trailing footnotes using a marker string.\n *\n * @param content - Combined body and footnote text\n * @param footnoteMarker - Marker indicating the start of footnotes\n * @returns A tuple containing the page body followed by the footnote section\n */\nexport const splitPageBodyFromFooter = (content: string, footnoteMarker = '_________') => {\n let footnote = '';\n const indexOfFootnote = content.indexOf(footnoteMarker);\n\n if (indexOfFootnote >= 0) {\n footnote = content.slice(indexOfFootnote + footnoteMarker.length);\n content = content.slice(0, indexOfFootnote);\n }\n\n return [content, footnote] as const;\n};\n\n/**\n * Removes Arabic numeral page markers enclosed in turtle ⦗ ⦘ brackets.\n * Replaces the marker along with up to two preceding whitespace characters\n * (space or carriage return) and up to one following whitespace character\n * with a single space.\n *\n * @param text - Text 
potentially containing page markers\n * @returns The text with numeric markers replaced by a single space\n */\nexport const removeArabicNumericPageMarkers = (text: string) => {\n return text.replace(/(?: |\\r){0,2}⦗[\\u0660-\\u0669]+⦘(?: |\\r)?/g, ' ');\n};\n\n/**\n * Removes anchor and hadeeth tags from the content while preserving spans.\n *\n * @param content - HTML string containing various tags\n * @returns The content with only span tags retained\n */\nexport const removeTagsExceptSpan = (content: string) => {\n // Remove <a> tags and their content, keeping only the text inside\n content = content.replace(/<a[^>]*>(.*?)<\\/a>/gs, '$1');\n\n // Remove <hadeeth> tags (both self-closing, with content, and numbered)\n content = content.replace(/<hadeeth[^>]*>|<\\/hadeeth>|<hadeeth-\\d+>/gs, '');\n\n return content;\n};\n\n/**\n * Normalizes Shamela HTML for CSS styling:\n * - Converts <hadeeth-N> to <span class=\"hadeeth\">\n * - Converts </hadeeth> or standalone <hadeeth> to </span>\n */\nexport const normalizeHtml = (html: string): string => {\n return html.replace(/<hadeeth-\\d+>/gi, '<span class=\"hadeeth\">').replace(/<\\s*\\/?\\s*hadeeth\\s*>/gi, 
'</span>');\n};\n"],"mappings":"AAIA,MAAa,EAAkC,EAWlCA,EAAqD,CAC9D,cAAe,GACf,EAAG,GACH,IAAK,oBACL,IAAK,eACL,IAAK,0BACL,IAAK,0BACL,IAAK,4BACL,IAAK,2BACL,IAAK,sBACL,IAAK,uBACL,IAAK,sBACL,IAAK,0BACL,IAAK,sBACL,IAAK,eACL,IAAK,sBACL,IAAK,GACL,IAAK,gBACR,CC1BK,EAAa,wDAQb,EAA4B,GAA0B,CACxD,IAAMC,EAAc,EAAE,CACtB,IAAK,IAAM,KAAQ,EAAO,CACtB,IAAM,EAAO,EAAI,EAAI,OAAS,GAC1B,GAAQ,EAAW,KAAK,EAAK,KAAK,CAClC,EAAK,MAAQ,EAAK,KAElB,EAAI,KAAK,EAAK,CAGtB,OAAO,GASL,EAAkB,GACD,EAAK,QAAQ,QAAS;EAAK,CAAC,QAAQ,MAAO;EAAK,CAG9D,MAAM;EAAK,CACX,IAAK,GAAS,EAAK,MAAM,CAAC,CAC1B,OAAO,QAAQ,CASlB,EAAsB,GACjB,EAAe,EAAQ,CAAC,IAAK,IAAU,CAAE,KAAM,EAAM,EAAE,CAU5D,GAAoB,EAAa,IAAqC,CACxE,IAAM,EAAc,OAAO,GAAG,EAAK,yCAA0C,IAAI,CAC3E,EAAQ,EAAI,MAAM,EAAQ,CAC3B,KAGL,OAAO,EAAM,IAAM,EAAM,IAAM,EAAM,IAcnC,EAAY,GAA0B,CACxC,IAAMC,EAAkB,EAAE,CACpB,EAAW,WACb,EAAY,EACZC,EAGJ,IAFA,EAAQ,EAAS,KAAK,EAAK,CAEpB,GAAO,CACN,EAAM,MAAQ,GACd,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAW,EAAM,MAAM,CAAE,CAAC,CAG5E,IAAM,EAAM,EAAM,GACZ,EAAQ,OAAO,KAAK,EAAI,CACxB,EAAY,EAAI,MAAM,2BAA2B,CACjD,EAAO,EAAY,EAAU,GAAG,aAAa,CAAG,GAEtD,GAAI,EACA,EAAO,KAAK,CAAE,OAAM,KAAM,MAAO,CAAC,KAC/B,CACH,IAAMC,EAAiD,EAAE,CACzD,EAAW,GAAK,EAAiB,EAAK,KAAK,CAC3C,EAAW,aAAe,EAAiB,EAAK,YAAY,CAC5D,EAAO,KAAK,CAAE,aAAY,OAAM,KAAM,QAAS,CAAC,CAGpD,EAAY,EAAS,UACrB,EAAQ,EAAS,KAAK,EAAK,CAO/B,OAJI,EAAY,EAAK,QACjB,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAU,CAAE,CAAC,CAGxD,GAML,GAAc,EAAc,IAA6B,CAC3D,IAAM,EAAU,EAAK,MAAM,CAI3B,OAHK,EAGE,EAAK,CAAE,KAAI,KAAM,EAAS,CAAG,CAAE,KAAM,EAAS,CAF1C,MAQT,EAAoB,GAA4E,CAClG,IAAK,IAAI,EAAI,EAAU,OAAS,EAAG,GAAK,EAAG,IAAK,CAC5C,IAAM,EAAQ,EAAU,GACxB,GAAI,EAAM,SAAW,EAAM,GACvB,OAAO,EAAM,KAQnB,GACF,EACA,IAMC,CACD,GAAI,CAAC,EACD,OAGJ,IAAM,EAAQ,EAAI,MAAM;EAAK,CAE7B,IAAK,IAAI,EAAI,EAAG,EAAI,EAAM,OAAQ,IAAK,CAEnC,GAAI,EAAI,EAAG,CACP,IAAM,EAAO,EAAW,EAAM,YAAa,EAAM,UAAU,CACvD,GACA,EAAM,OAAO,KAAK,EAAK,CAE3B,EAAM,YAAc,GAIpB,EAAM,UADgB,EAAiB,EAAM,UAAU,EACpB,IAAA,GAInC,EAAM,KACN,EAAM,aAAe,EAAM,MAQjC,GACF,EACA,IAIC,CAED,IAAM,EADW,EAAM,WAAW,eACL,QAEzBC,EACA,IAEA,GADc,EAAM,
WAAW,IAAM,IAC1B,QAAQ,QAAS,GAAG,EAGnC,EAAM,UAAU,KAAK,CAAE,KAAI,UAAS,CAAC,CAGjC,GAAW,GAAM,CAAC,EAAM,YACxB,EAAM,UAAY,IAUb,EAAsB,GAA4B,CAK3D,GAHA,EAAU,EAAQ,QAAQ,QAAS;EAAK,CAAC,QAAQ,MAAO;EAAK,CAGzD,CAAC,eAAe,KAAK,EAAQ,CAC7B,OAAO,EAAyB,EAAmB,EAAQ,CAAC,CAGhE,IAAM,EAAS,EAAS,SAAS,EAAQ,SAAS,CAC5C,EAAQ,CACV,UAAW,IAAA,GACX,YAAa,GACb,OAAQ,EAAE,CACV,UAAW,EAAE,CAChB,CAGD,IAAK,IAAM,KAAS,EACZ,EAAM,OAAS,OACf,EAA0B,EAAM,MAAO,EAAM,CACtC,EAAM,OAAS,SAAW,EAAM,OAAS,OAChD,EAAgB,EAAO,EAAM,CACtB,EAAM,OAAS,OAAS,EAAM,OAAS,QAE9C,EAAM,UAAU,KAAK,CAK7B,IAAM,EAAY,EAAW,EAAM,YAAa,EAAM,UAAU,CAMhE,OALI,GACA,EAAM,OAAO,KAAK,EAAU,CAIzB,EAAyB,EAAM,OAAO,CAAC,OAAQ,GAAS,EAAK,KAAK,OAAS,EAAE,EAGlF,EAAyB,OAAO,QAAQ,EAA2B,CAAC,KAAK,CAAC,EAAS,MAAkB,CACvG,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,cACH,EAAE,CAQG,EAAoB,GAAkC,CACxD,GAAI,IAAU,EACV,OAAO,EAGX,IAAM,EAAW,EAAE,CACnB,IAAK,IAAM,KAAW,EAClB,EAAS,KAAK,CACV,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,YAAa,EAAM,GACtB,CAAC,CAEN,OAAO,GAUE,GACT,EACA,EAAgC,IACvB,CACT,IAAM,EAAgB,EAAiB,EAAM,CAEzC,EAAU,EACd,IAAK,IAAI,EAAI,EAAG,EAAI,EAAc,OAAQ,IAAK,CAC3C,GAAM,CAAE,QAAO,eAAgB,EAAc,GAC7C,EAAU,EAAQ,QAAQ,EAAO,EAAY,CAEjD,OAAO,GAUE,GAA2B,EAAiB,EAAiB,cAAgB,CACtF,IAAI,EAAW,GACT,EAAkB,EAAQ,QAAQ,EAAe,CAOvD,OALI,GAAmB,IACnB,EAAW,EAAQ,MAAM,EAAkB,EAAe,OAAO,CACjE,EAAU,EAAQ,MAAM,EAAG,EAAgB,EAGxC,CAAC,EAAS,EAAS,EAYjB,EAAkC,GACpC,EAAK,QAAQ,4CAA6C,IAAI,CAS5D,EAAwB,IAEjC,EAAU,EAAQ,QAAQ,uBAAwB,KAAK,CAGvD,EAAU,EAAQ,QAAQ,6CAA8C,GAAG,CAEpE,GAQE,EAAiB,GACnB,EAAK,QAAQ,kBAAmB,yBAAyB,CAAC,QAAQ,0BAA2B,UAAU"}

package/dist/content-CwjMtCQl.d.ts ADDED Viewed

@@ -0,0 +1,54 @@
+//#region src/content.d.ts
+type Line = { // One parsed line of page text, as returned by parseContentRobust.
+  id?: string; // Id of the title span the line belongs to (any leading "toc-" removed); absent for non-title lines.
+  text: string; // Trimmed, non-empty text of the line.
+};
+/**
+ * Parses Shamela HTML content into structured lines while preserving headings.
+ *
+ * Line endings (CRLF / CR) are normalised to LF before parsing. Content without any
+ * `<span>` tag is simply split on line breaks. Lines that consist only of closing
+ * punctuation are appended to the line before them, and blank lines are dropped.
+ *
+ * @param content - The raw HTML markup representing a page
+ * @returns An array of {@link Line} objects containing text and optional IDs
+ */
+declare const parseContentRobust: (content: string) => Line[];
+/**
+ * Sanitises page content by applying regex replacement rules.
+ *
+ * Rules are applied one after another in the object's key order. Each key is compiled
+ * as a regular-expression source with the global flag, so regex metacharacters in a key
+ * are interpreted as such rather than matched literally.
+ *
+ * @param text - The text to clean
+ * @param rules - Optional custom replacements (regex source mapped to replacement string), defaults to {@link DEFAULT_SANITIZATION_RULES}
+ * @returns The sanitised content
+ */
+declare const sanitizePageContent: (text: string, rules?: Record<string, string>) => string;
+/**
+ * Splits a page body from its trailing footnotes using a marker string.
+ *
+ * The split happens at the first occurrence of the marker; the marker itself is not
+ * included in either part. When the marker is absent, the whole content is returned as
+ * the body and the footnote part is an empty string.
+ *
+ * @param content - Combined body and footnote text
+ * @param footnoteMarker - Marker indicating the start of footnotes (defaults to `_________`)
+ * @returns A tuple containing the page body followed by the footnote section
+ */
+declare const splitPageBodyFromFooter: (content: string, footnoteMarker?: string) => readonly [string, string];
+/**
+ * Removes Arabic numeral page markers enclosed in turtle ⦗ ⦘ brackets.
+ * Replaces the marker along with up to two preceding whitespace characters
+ * (space or carriage return) and up to one following whitespace character
+ * with a single space.
+ *
+ * Only markers whose content is one or more Arabic-Indic digits (U+0660–U+0669)
+ * are matched; brackets around other text are left untouched.
+ *
+ * @param text - Text potentially containing page markers
+ * @returns The text with numeric markers replaced by a single space
+ */
+declare const removeArabicNumericPageMarkers: (text: string) => string;
+/**
+ * Removes anchor and hadeeth tags from the content while preserving spans.
+ *
+ * Anchors are unwrapped (the text between `<a ...>` and `</a>` is kept); opening,
+ * numbered and closing hadeeth tags are deleted. No other tags are touched.
+ *
+ * @param content - HTML string containing various tags
+ * @returns The content without anchor wrappers and hadeeth tags; spans and any other markup are left in place
+ */
+declare const removeTagsExceptSpan: (content: string) => string;
+/**
+ * Normalizes Shamela HTML for CSS styling:
+ * - Converts <hadeeth-N> to <span class="hadeeth">
+ * - Converts </hadeeth> or standalone <hadeeth> to </span>
+ *
+ * Tag names are matched case-insensitively.
+ *
+ * @param html - HTML string containing Shamela hadeeth tags
+ * @returns The HTML with hadeeth tags rewritten as span tags
+ */
+declare const normalizeHtml: (html: string) => string;
+//#endregion
+export { removeTagsExceptSpan as a, removeArabicNumericPageMarkers as i, normalizeHtml as n, sanitizePageContent as o, parseContentRobust as r, splitPageBodyFromFooter as s, Line as t };
+//# sourceMappingURL=content-CwjMtCQl.d.ts.map

package/dist/content.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import { a as removeTagsExceptSpan, i as removeArabicNumericPageMarkers, n as normalizeHtml, o as sanitizePageContent, r as parseContentRobust, s as splitPageBodyFromFooter, t as Line } from "./content-CwjMtCQl.js";
2	+ export { Line, normalizeHtml, parseContentRobust, removeArabicNumericPageMarkers, removeTagsExceptSpan, sanitizePageContent, splitPageBodyFromFooter };

package/dist/content.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ import{a as e,i as t,n,o as r,r as i,t as a}from"./content-B60R0uYQ.js";export{a as normalizeHtml,n as parseContentRobust,i as removeArabicNumericPageMarkers,t as removeTagsExceptSpan,e as sanitizePageContent,r as splitPageBodyFromFooter};