shamela 1.3.2 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -15,21 +15,48 @@
15
15
 
16
16
  A universal TypeScript library for accessing and downloading Maktabah Shamela v4 APIs. The package runs in both Node.js and modern browsers, providing ergonomic helpers to interact with the Shamela API, download master and book databases, and retrieve book data programmatically.
17
17
 
18
+ ## Features
19
+
20
+ - 🚀 **Full data lifecycle** – fetch metadata, download master and book databases, and query the results entirely in-memory.
21
+ - 🔐 **Runtime configuration** – configure API credentials, WASM paths, and custom fetch/logging implementations at runtime.
22
+ - 🧠 **Content tooling** – parse, sanitise, and post-process Arabic book content with utilities tailored for Shamela formatting.
23
+ - 🌐 **Environment aware** – automatically selects optimal sql.js WASM bundles for Node.js, browsers, and bundled runtimes.
24
+ - 🧪 **Well-tested** – comprehensive unit and end-to-end coverage to ensure reliable integrations.
25
+
18
26
  ## Table of Contents
19
27
 
28
+ - [Features](#features)
20
29
  - [Installation](#installation)
21
30
  - [Quick Start](#quick-start)
22
31
  - [Standard Node.js](#standard-nodejs)
23
32
  - [Next.js / Bundled Environments](#nextjs--bundled-environments)
24
- - [Browser](#browser)
33
+ - [Browser (Full API)](#browser-full-api)
34
+ - [Browser (Content Utilities Only)](#browser-content-utilities-only)
25
35
  - [API Reference](#api-reference)
26
- - [getMasterMetadata](#getmastermetadata)
27
- - [downloadMasterDatabase](#downloadmasterdatabase)
28
- - [getBookMetadata](#getbookmetadata)
29
- - [downloadBook](#downloadbook)
30
- - [getBook](#getbook)
31
- - [getMaster](#getmaster)
32
- - [getCoverUrl](#getcoverurl)
36
+ - [Configuration](#configuration)
37
+ - [configure](#configure)
38
+ - [resetConfig](#resetconfig)
39
+ - [getConfig](#getconfig)
40
+ - [getConfigValue](#getconfigvalue)
41
+ - [requireConfigValue](#requireconfigvalue)
42
+ - [Metadata & Downloads](#metadata--downloads)
43
+ - [getMasterMetadata](#getmastermetadata)
44
+ - [downloadMasterDatabase](#downloadmasterdatabase)
45
+ - [getBookMetadata](#getbookmetadata)
46
+ - [downloadBook](#downloadbook)
47
+ - [getCoverUrl](#getcoverurl)
48
+ - [Data Access](#data-access)
49
+ - [getBook](#getbook)
50
+ - [getMaster](#getmaster)
51
+ - [Content Utilities](#content-utilities)
52
+ - [parseContentRobust](#parsecontentrobust)
53
+ - [sanitizePageContent](#sanitizepagecontent)
54
+ - [splitPageBodyFromFooter](#splitpagebodyfromfooter)
55
+ - [removeArabicNumericPageMarkers](#removearabicnumericpagemarkers)
56
+ - [removeTagsExceptSpan](#removetagsexceptspan)
57
+ - [Supporting Utilities](#supporting-utilities)
58
+ - [buildUrl](#buildurl)
59
+ - [httpsGet](#httpsget)
33
60
  - [Examples](#examples)
34
61
  - [Data Structures](#data-structures)
35
62
  - [Next.js Demo](#nextjs-demo)
@@ -129,7 +156,7 @@ export async function downloadBookAction(bookId: number) {
129
156
 
130
157
  **Important:** Only import `shamela` in server-side code (Server Actions, API Routes, or Server Components). Never import in client components or `layout.tsx`.
131
158
 
132
- ### Browser
159
+ ### Browser (Full API)
133
160
 
134
161
  In browsers, the library automatically uses a CDN-hosted WASM file:
135
162
 
@@ -146,21 +173,104 @@ configure({
146
173
  const book = await getBook(26592);
147
174
  ```
148
175
 
176
+ ### Browser (Content Utilities Only)
177
+
178
+ If you only need the content processing utilities (sanitization, parsing, etc.) without the database functionality, use the lightweight `shamela/content` export:
179
+
180
+ ```typescript
181
+ import {
182
+ sanitizePageContent,
183
+ splitPageBodyFromFooter,
184
+ removeTagsExceptSpan,
185
+ parseContentRobust,
186
+ } from 'shamela/content';
187
+
188
+ // Process content without loading sql.js (~1.5KB gzipped vs ~900KB)
189
+ const clean = removeTagsExceptSpan(sanitizePageContent(rawContent));
190
+ const [body, footnotes] = splitPageBodyFromFooter(clean);
191
+ ```
192
+
193
+ This is ideal for:
194
+ - Client-side React/Next.js components
195
+ - Bundled environments where you want to avoid sql.js WASM
196
+ - Processing pre-downloaded book data
197
+
198
+ **Available exports from `shamela/content`:**
199
+ - `parseContentRobust` - Parse HTML into structured lines
200
+ - `sanitizePageContent` - Normalize Arabic text
201
+ - `splitPageBodyFromFooter` - Separate body from footnotes
202
+ - `removeArabicNumericPageMarkers` - Remove page markers
203
+ - `removeTagsExceptSpan` - Strip HTML except spans
204
+
149
205
  ## API Reference
150
206
 
151
- ### getMasterMetadata
207
+ ### Configuration
208
+
209
+ #### configure
152
210
 
153
- Fetches metadata for the master database.
211
+ Initialises runtime configuration including API credentials, custom fetch implementations, sql.js WASM location, and logger overrides.
154
212
 
155
213
  ```typescript
156
- getMasterMetadata(version?: number): Promise<GetMasterMetadataResponsePayload>
214
+ configure(options: ConfigureOptions): void
157
215
  ```
158
216
 
159
- - `version` (optional): The version number to check for updates (defaults to 0)
217
+ **Example:**
160
218
 
161
- **Returns:** Promise resolving to master database metadata including download URL and version
219
+ ```typescript
220
+ import { configure } from 'shamela';
162
221
 
163
- **Example:**
222
+ configure({
223
+ apiKey: process.env.SHAMELA_API_KEY!,
224
+ booksEndpoint: process.env.SHAMELA_BOOKS_ENDPOINT!,
225
+ masterPatchEndpoint: process.env.SHAMELA_MASTER_ENDPOINT!,
226
+ });
227
+ ```
228
+
229
+ #### resetConfig
230
+
231
+ Clears runtime overrides and restores the default silent logger.
232
+
233
+ ```typescript
234
+ resetConfig(): void
235
+ ```
236
+
237
+ Use this in tests or long-running processes when you need a clean configuration slate.
238
+
239
+ #### getConfig
240
+
241
+ Returns the merged configuration snapshot combining runtime overrides with environment variables.
242
+
243
+ ```typescript
244
+ getConfig(): ShamelaConfig
245
+ ```
246
+
247
+ #### getConfigValue
248
+
249
+ Reads a single configuration value without throwing when it is missing.
250
+
251
+ ```typescript
252
+ getConfigValue<Key extends ShamelaConfigKey>(key: Key): ShamelaConfig[Key] | undefined
253
+ ```
254
+
255
+ #### requireConfigValue
256
+
257
+ Retrieves a configuration entry and throws an error if the value is not present.
258
+
259
+ ```typescript
260
+ requireConfigValue(key: Exclude<ShamelaConfigKey, 'fetchImplementation'>): string
261
+ ```
262
+
263
+ ### Metadata & Downloads
264
+
265
+ #### getMasterMetadata
266
+
267
+ Fetches metadata for the master database, including download URLs for the latest patches.
268
+
269
+ ```typescript
270
+ getMasterMetadata(version?: number): Promise<GetMasterMetadataResponsePayload>
271
+ ```
272
+
273
+ - `version` (optional): The version number to check for updates (defaults to 0)
164
274
 
165
275
  ```typescript
166
276
  const metadata = await getMasterMetadata();
@@ -171,21 +281,17 @@ console.log(metadata.version); // Version number
171
281
  const updates = await getMasterMetadata(5);
172
282
  ```
173
283
 
174
- ### downloadMasterDatabase
284
+ #### downloadMasterDatabase
175
285
 
176
- Downloads the master database containing all books, authors, and categories.
286
+ Downloads the master database containing all books, authors, and categories and writes it to disk or a custom writer.
177
287
 
178
288
  ```typescript
179
289
  downloadMasterDatabase(options: DownloadMasterOptions): Promise<string>
180
290
  ```
181
291
 
182
- - `options.masterMetadata` (optional): Pre-fetched metadata
292
+ - `options.masterMetadata` (optional): Pre-fetched metadata to avoid an extra HTTP call
183
293
  - `options.outputFile.path`: Output file path (`.db`, `.sqlite`, or `.json`)
184
294
 
185
- **Returns:** Promise resolving to the output file path
186
-
187
- **Example:**
188
-
189
295
  ```typescript
190
296
  // Download as SQLite database
191
297
  await downloadMasterDatabase({
@@ -198,9 +304,9 @@ await downloadMasterDatabase({
198
304
  });
199
305
  ```
200
306
 
201
- ### getBookMetadata
307
+ #### getBookMetadata
202
308
 
203
- Fetches metadata for a specific book.
309
+ Fetches metadata for a specific book, including patch release information.
204
310
 
205
311
  ```typescript
206
312
  getBookMetadata(id: number, options?: GetBookMetadataOptions): Promise<GetBookMetadataResponsePayload>
@@ -210,32 +316,24 @@ getBookMetadata(id: number, options?: GetBookMetadataOptions): Promise<GetBookMe
210
316
  - `options.majorVersion` (optional): Major version to check
211
317
  - `options.minorVersion` (optional): Minor version to check
212
318
 
213
- **Returns:** Promise resolving to book metadata
214
-
215
- **Example:**
216
-
217
319
  ```typescript
218
320
  const metadata = await getBookMetadata(26592);
219
321
  console.log(metadata.majorReleaseUrl);
220
322
  console.log(metadata.minorReleaseUrl);
221
323
  ```
222
324
 
223
- ### downloadBook
325
+ #### downloadBook
224
326
 
225
- Downloads and processes a book from Shamela.
327
+ Downloads and processes a book from Shamela, writing it to JSON or SQLite on disk.
226
328
 
227
329
  ```typescript
228
330
  downloadBook(id: number, options: DownloadBookOptions): Promise<string>
229
331
  ```
230
332
 
231
333
  - `id`: Book identifier
232
- - `options.bookMetadata` (optional): Pre-fetched metadata
334
+ - `options.bookMetadata` (optional): Pre-fetched metadata to avoid re-fetching
233
335
  - `options.outputFile.path`: Output file path (`.db`, `.sqlite`, or `.json`)
234
336
 
235
- **Returns:** Promise resolving to the output file path
236
-
237
- **Example:**
238
-
239
337
  ```typescript
240
338
  // Download as JSON
241
339
  await downloadBook(26592, {
@@ -248,19 +346,28 @@ await downloadBook(26592, {
248
346
  });
249
347
  ```
250
348
 
251
- ### getBook
349
+ #### getCoverUrl
252
350
 
253
- Retrieves complete book data as a JavaScript object.
351
+ Generates the URL for a book's cover image using the configured Shamela host.
254
352
 
255
353
  ```typescript
256
- getBook(id: number): Promise<BookData>
354
+ getCoverUrl(bookId: number): string
257
355
  ```
258
356
 
259
- - `id`: Book identifier
357
+ ```typescript
358
+ const coverUrl = getCoverUrl(26592);
359
+ // Returns: "https://shamela.ws/covers/26592.jpg"
360
+ ```
260
361
 
261
- **Returns:** Promise resolving to book data with pages and titles
362
+ ### Data Access
262
363
 
263
- **Example:**
364
+ #### getBook
365
+
366
+ Retrieves complete book data as a JavaScript object, returning pages and title entries.
367
+
368
+ ```typescript
369
+ getBook(id: number): Promise<BookData>
370
+ ```
264
371
 
265
372
  ```typescript
266
373
  const book = await getBook(26592);
@@ -269,18 +376,14 @@ console.log(book.titles?.length);
269
376
  console.log(book.pages[0].content);
270
377
  ```
271
378
 
272
- ### getMaster
379
+ #### getMaster
273
380
 
274
- Retrieves the entire master dataset as a JavaScript object.
381
+ Retrieves the entire master dataset as a JavaScript object, including version information.
275
382
 
276
383
  ```typescript
277
384
  getMaster(): Promise<MasterData>
278
385
  ```
279
386
 
280
- **Returns:** Promise resolving to master data with authors, books, categories, and version
281
-
282
- **Example:**
283
-
284
387
  ```typescript
285
388
  const master = await getMaster();
286
389
  console.log(master.version);
@@ -289,23 +392,69 @@ console.log(master.authors.length);
289
392
  console.log(master.categories.length);
290
393
  ```
291
394
 
292
- ### getCoverUrl
395
+ ### Content Utilities
396
+
397
+ #### parseContentRobust
293
398
 
294
- Generates the URL for a book's cover image.
399
+ Parses Shamela HTML snippets into structured lines while preserving title hierarchy and Arabic punctuation.
295
400
 
296
401
  ```typescript
297
- getCoverUrl(bookId: number): string
402
+ parseContentRobust(content: string): Line[]
298
403
  ```
299
404
 
300
- - `bookId`: Book identifier
405
+ ```typescript
406
+ const lines = parseContentRobust(rawHtml);
407
+ lines.forEach((line) => console.log(line.id, line.text));
408
+ ```
301
409
 
302
- **Returns:** Cover image URL
410
+ #### sanitizePageContent
303
411
 
304
- **Example:**
412
+ Normalises page content by applying regex-based replacement rules tuned for Shamela sources.
305
413
 
306
414
  ```typescript
307
- const coverUrl = getCoverUrl(26592);
308
- // Returns: "https://shamela.ws/covers/26592.jpg"
415
+ sanitizePageContent(text: string, rules?: Record<string, string>): string
416
+ ```
417
+
418
+ #### splitPageBodyFromFooter
419
+
420
+ Separates page body content from trailing footnotes using the default Shamela marker.
421
+
422
+ ```typescript
423
+ splitPageBodyFromFooter(content: string, marker?: string): readonly [string, string]
424
+ ```
425
+
426
+ #### removeArabicNumericPageMarkers
427
+
428
+ Removes Arabic numeral markers enclosed in ⦗ ⦘, commonly used to denote page numbers.
429
+
430
+ ```typescript
431
+ removeArabicNumericPageMarkers(text: string): string
432
+ ```
433
+
434
+ #### removeTagsExceptSpan
435
+
436
+ Strips anchor and hadeeth tags while preserving nested `<span>` elements.
437
+
438
+ ```typescript
439
+ removeTagsExceptSpan(content: string): string
440
+ ```
441
+
442
+ ### Supporting Utilities
443
+
444
+ #### buildUrl
445
+
446
+ Constructs authenticated API URLs with query parameters and optional API key injection.
447
+
448
+ ```typescript
449
+ buildUrl(endpoint: string, queryParams: Record<string, any>, useAuth?: boolean): URL
450
+ ```
451
+
452
+ #### httpsGet
453
+
454
+ Makes HTTPS GET requests using the configured fetch implementation, automatically parsing JSON responses and returning binary data otherwise.
455
+
456
+ ```typescript
457
+ httpsGet<T extends Uint8Array | Record<string, any>>(url: string | URL, options?: { fetchImpl?: typeof fetch }): Promise<T>
309
458
  ```
310
459
 
311
460
  ## Examples
@@ -0,0 +1,8 @@
1
/**
 * Content utilities for Shamela book pages: parsing HTML into lines,
 * sanitising text with regex rules, splitting footnotes, and stripping or
 * normalising Shamela-specific tags.
 *
 * Readable form of the bundled module. Public export aliases
 * (`a`, `i`, `n`, `o`, `r`, `s`, `t`) are unchanged.
 *
 * NOTE(review): the non-ASCII literals below were restored from a listing
 * whose UTF-8 had been mis-decoded, with some bytes dropped. They are written
 * as \u escapes so the exact code points are visible; short vowels (damma /
 * kasra) and the keys U+FD41, U+FD4D, U+FD4F were inferred from context.
 * Confirm against src/utils/constants.ts before publishing.
 */

/** Default version number for master metadata (exported as `s`). */
const DEFAULT_MASTER_METADATA_VERSION = 0;

/**
 * Default sanitisation rules. Each key is compiled as a regular-expression
 * source (global flag) and every match is replaced by the mapped value.
 */
const DEFAULT_SANITIZATION_RULES = {
    '<img[^>]*>>': '',
    '\u8204': '',
    // rahimahu Allah
    '\uFD40': '\u0631\u064E\u062D\u0650\u0645\u064E\u0647\u064F \u0671\u0644\u0644\u064E\u0651\u0670\u0647\u064F',
    // radi Allahu anhu
    '\uFD41': '\u0631\u0636\u064A \u0627\u0644\u0644\u0647 \u0639\u0646\u0647',
    // radi Allahu anha
    '\uFD42': '\u0631\u064E\u0636\u0650\u064A\u064E \u0671\u0644\u0644\u064E\u0651\u0670\u0647\u064F \u0639\u064E\u0646\u0652\u0647\u064E\u0627',
    // radi Allahu anhum
    '\uFD43': '\u0631\u064E\u0636\u0650\u064A\u064E \u0627\u0644\u0644\u064E\u0651\u0647\u064F \u0639\u064E\u0646\u0652\u0647\u064F\u0645\u0652',
    // radi Allahu anhuma
    '\uFD44': '\u0631\u064E\u0636\u0650\u064A\u064E \u0671\u0644\u0644\u064E\u0651\u0670\u0647\u064F \u0639\u064E\u0646\u0652\u0647\u064F\u0645\u064E\u0627',
    // radi Allahu anhunna
    '\uFD45': '\u0631\u064E\u0636\u0650\u064A\u064E \u0627\u0644\u0644\u064E\u0651\u0647\u064F \u0639\u064E\u0646\u0652\u0647\u064F\u0646\u064E\u0651',
    // alayhi as-salam
    '\uFD47': '\u0639\u064E\u0644\u064E\u064A\u0652\u0647\u0650 \u0671\u0644\u0633\u064E\u0651\u0644\u064E\u0670\u0645\u064F',
    // alayhim as-salam
    '\uFD48': '\u0639\u064E\u0644\u064E\u064A\u0652\u0647\u0650\u0645\u064F \u0627\u0644\u0633\u064E\u0651\u0644\u0627\u0645\u064F',
    // alayhi as-salatu was-salam
    '\uFD4A': '\u0639\u0644\u064A\u0647 \u0627\u0644\u0635\u0644\u0627\u0629 \u0648\u0627\u0644\u0633\u0644\u0627\u0645',
    // salla Allahu alayhi wa-alihi wa-sallam
    '\uFD4C': '\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064A\u0647 \u0648\u0622\u0644\u0647 \u0648\u0633\u0644\u0645',
    // same replacement text as U+FD47 in the listing
    '\uFD4D': '\u0639\u064E\u0644\u064E\u064A\u0652\u0647\u0650 \u0671\u0644\u0633\u064E\u0651\u0644\u064E\u0670\u0645\u064F',
    // tabaraka wa-ta'ala
    '\uFD4E': '\u062A\u0628\u0627\u0631\u0643 \u0648\u062A\u0639\u0627\u0644\u0649',
    // rahimahum Allah
    '\uFD4F': '\u0631\u064E\u062D\u0650\u0645\u064E\u0647\u064F\u0645\u064F \u0671\u0644\u0644\u064E\u0651\u0670\u0647\u064F',
    '\uFDFD': '',
    // azza wa-jall
    '\uFDFF': '\u0639\u064E\u0632\u0651\u064E \u0648\u064E\u062C\u064E\u0644\u0651\u064E',
};

/** Matches a line made up solely of closing brackets/quotes and Latin or Arabic punctuation. */
const PUNCT_ONLY = /^[)\]\u00BB"\u201D'\u2019.,?!:\u061B\u060C\u061F\u06D4\u2026]+$/;

/**
 * Appends punctuation-only lines to the text of the line before them.
 * The preceding line object is mutated in place; a leading punctuation-only
 * line (nothing before it) is kept as its own entry.
 *
 * @param lines - Candidate lines in document order
 * @returns A new array without the dangling punctuation entries
 */
const mergeDanglingPunctuation = (lines) => {
    const out = [];
    for (const item of lines) {
        const last = out[out.length - 1];
        if (last && PUNCT_ONLY.test(item.text)) {
            last.text += item.text;
        } else {
            out.push(item);
        }
    }
    return out;
};

/**
 * Normalises CRLF / CR to LF, splits on LF, trims each piece and drops empties.
 *
 * @param text - Raw text
 * @returns Non-empty trimmed lines
 */
const splitIntoLines = (text) =>
    text
        .replace(/\r\n/g, '\n')
        .replace(/\r/g, '\n')
        .split('\n')
        .map((line) => line.trim())
        .filter(Boolean);

/**
 * Wraps each non-empty line of plain text in a `{ text }` object.
 *
 * @param content - Text without span markup
 * @returns Line objects without ids
 */
const processTextContent = (content) => splitIntoLines(content).map((line) => ({ text: line }));

/**
 * Extracts an attribute value from a raw tag string. Supports double-quoted,
 * single-quoted and unquoted values (case-insensitive attribute name).
 *
 * @param tag - Raw tag source, e.g. `<span id=toc-5 data-type="title">`
 * @param name - Attribute name to look for
 * @returns The attribute value, or undefined when the attribute is absent
 */
const extractAttribute = (tag, name) => {
    // The unquoted alternative must stop at whitespace or '>'. The backslash is
    // doubled because this is a template literal; a single-escaped "\s" or a
    // bare "s" would exclude the letter "s" instead of whitespace.
    const pattern = new RegExp(`${name}\\s*=\\s*("([^"]*)"|'([^']*)'|([^\\s>]+))`, 'i');
    const match = tag.match(pattern);
    if (!match) {
        return undefined;
    }
    return match[2] ?? match[3] ?? match[4];
};

/**
 * Breaks an HTML fragment into text / start-tag / end-tag tokens.
 * Only the `id` and `data-type` attributes are captured on start tags.
 *
 * @param html - HTML fragment
 * @returns Tokens in document order
 */
const tokenize = (html) => {
    const tokens = [];
    const tagRegex = /<[^>]+>/g;
    let lastIndex = 0;
    let match = tagRegex.exec(html);

    while (match) {
        // Text between the previous tag and this one
        if (match.index > lastIndex) {
            tokens.push({ type: 'text', value: html.slice(lastIndex, match.index) });
        }

        const raw = match[0];
        const isEnd = /^<\//.test(raw);
        const nameMatch = raw.match(/^<\/?\s*([a-zA-Z0-9:-]+)/);
        const name = nameMatch ? nameMatch[1].toLowerCase() : '';

        if (isEnd) {
            tokens.push({ name, type: 'end' });
        } else {
            const attributes = {
                id: extractAttribute(raw, 'id'),
                'data-type': extractAttribute(raw, 'data-type'),
            };
            tokens.push({ attributes, name, type: 'start' });
        }

        lastIndex = tagRegex.lastIndex;
        match = tagRegex.exec(html);
    }

    // Trailing text after the last tag
    if (lastIndex < html.length) {
        tokens.push({ type: 'text', value: html.slice(lastIndex) });
    }

    return tokens;
};

/**
 * Builds a line object from accumulated text.
 *
 * @param text - Accumulated text (trimmed before use)
 * @param id - Optional title id to attach
 * @returns `{ id, text }` / `{ text }`, or null when the trimmed text is empty
 */
const createLine = (text, id) => {
    const trimmed = text.trim();
    if (!trimmed) {
        return null;
    }
    return id ? { id, text: trimmed } : { text: trimmed };
};

/**
 * Returns the id of the innermost open title span that has an id.
 *
 * @param spanStack - Currently open spans, outermost first
 * @returns The id, or undefined when no such span is open
 */
const getActiveTitleId = (spanStack) => {
    for (let i = spanStack.length - 1; i >= 0; i--) {
        const entry = spanStack[i];
        if (entry.isTitle && entry.id) {
            return entry.id;
        }
    }
    return undefined;
};

/**
 * Feeds a text token into the parser state, flushing a line at every LF.
 *
 * @param raw - Text token value
 * @param state - Mutable parser state (currentText, currentId, result, spanStack)
 */
const processTextWithLineBreaks = (raw, state) => {
    if (!raw) {
        return;
    }

    const parts = raw.split('\n');
    for (let i = 0; i < parts.length; i++) {
        if (i > 0) {
            // Crossing a line break: emit what has been accumulated so far
            const line = createLine(state.currentText, state.currentId);
            if (line) {
                state.result.push(line);
            }
            state.currentText = '';
            // The new line keeps a title id only while still inside a title span
            state.currentId = getActiveTitleId(state.spanStack) || undefined;
        }
        if (parts[i]) {
            state.currentText += parts[i];
        }
    }
};

/**
 * Handles an opening `<span>`: pushes it on the stack and, for a title span,
 * assigns its id (with any leading `toc-` removed) to the current line if the
 * line has no id yet.
 *
 * @param token - Start token carrying `attributes`
 * @param state - Mutable parser state (currentId, spanStack)
 */
const handleSpanStart = (token, state) => {
    const isTitle = token.attributes['data-type'] === 'title';

    let id;
    if (isTitle) {
        id = (token.attributes.id ?? '').replace(/^toc-/, '');
    }

    state.spanStack.push({ id, isTitle });

    // The first title span on the current line wins
    if (isTitle && id && !state.currentId) {
        state.currentId = id;
    }
};

/**
 * Parses Shamela HTML content into lines, attaching title ids to heading lines.
 *
 * @param content - Raw page markup
 * @returns Line objects (`{ text }` or `{ id, text }`) in document order
 */
const parseContentRobust = (content) => {
    content = content.replace(/\r\n/g, '\n').replace(/\r/g, '\n');

    // Fast path: no span tags means no title metadata to track
    if (!/<span[^>]*>/i.test(content)) {
        return mergeDanglingPunctuation(processTextContent(content));
    }

    const tokens = tokenize(`<root>${content}</root>`);
    const state = {
        currentId: undefined,
        currentText: '',
        result: [],
        spanStack: [],
    };

    for (const token of tokens) {
        if (token.type === 'text') {
            processTextWithLineBreaks(token.value, state);
        } else if (token.type === 'start' && token.name === 'span') {
            handleSpanStart(token, state);
        } else if (token.type === 'end' && token.name === 'span') {
            // Closing a span does not end the line; trailing text stays on it
            state.spanStack.pop();
        }
    }

    // Flush whatever is left after the last token
    const finalLine = createLine(state.currentText, state.currentId);
    if (finalLine) {
        state.result.push(finalLine);
    }

    return mergeDanglingPunctuation(state.result).filter((line) => line.text.length > 0);
};

/** Default rules compiled once at module load. */
const DEFAULT_COMPILED_RULES = Object.entries(DEFAULT_SANITIZATION_RULES).map(([pattern, replacement]) => ({
    regex: new RegExp(pattern, 'g'),
    replacement,
}));

/**
 * Compiles a rule map into `{ regex, replacement }` pairs. The default rule
 * object (compared by identity) reuses the precompiled list.
 *
 * @param rules - Pattern-source to replacement map
 * @returns Compiled rules in the map's enumeration order
 */
const getCompiledRules = (rules) => {
    if (rules === DEFAULT_SANITIZATION_RULES) {
        return DEFAULT_COMPILED_RULES;
    }

    const compiled = [];
    for (const pattern in rules) {
        compiled.push({ regex: new RegExp(pattern, 'g'), replacement: rules[pattern] });
    }
    return compiled;
};

/**
 * Sanitises page content by applying each replacement rule in order.
 *
 * @param text - Text to clean
 * @param rules - Optional custom rules; defaults to DEFAULT_SANITIZATION_RULES
 * @returns The sanitised text
 */
const sanitizePageContent = (text, rules = DEFAULT_SANITIZATION_RULES) => {
    let content = text;
    for (const { regex, replacement } of getCompiledRules(rules)) {
        content = content.replace(regex, replacement);
    }
    return content;
};

/**
 * Splits page content at the first occurrence of the footnote marker.
 *
 * @param content - Combined body and footnote text
 * @param footnoteMarker - Marker that starts the footnote section
 * @returns `[body, footnote]`; footnote is '' when the marker is absent
 */
const splitPageBodyFromFooter = (content, footnoteMarker = '_________') => {
    let footnote = '';
    const indexOfFootnote = content.indexOf(footnoteMarker);

    if (indexOfFootnote >= 0) {
        footnote = content.slice(indexOfFootnote + footnoteMarker.length);
        content = content.slice(0, indexOfFootnote);
    }

    return [content, footnote];
};

/**
 * Replaces Arabic-Indic digit runs wrapped in U+2997 / U+2998 brackets with a
 * single space, together with up to two preceding and one following space or
 * carriage-return characters.
 *
 * @param text - Text potentially containing page markers
 * @returns The text with markers replaced by one space each
 */
const removeArabicNumericPageMarkers = (text) =>
    text.replace(/(?: |\r){0,2}\u2997[\u0660-\u0669]+\u2998(?: |\r)?/g, ' ');

/**
 * Unwraps `<a>` elements (keeping their inner text) and deletes hadeeth
 * tags. Any other markup, including spans, is left as it is.
 *
 * @param content - HTML string
 * @returns The content without anchor and hadeeth tags
 */
const removeTagsExceptSpan = (content) => {
    content = content.replace(/<a[^>]*>(.*?)<\/a>/gs, '$1');
    content = content.replace(/<hadeeth[^>]*>|<\/hadeeth>|<hadeeth-\d+>/gs, '');
    return content;
};

/**
 * Rewrites hadeeth markup as spans: `<hadeeth-N>` becomes
 * `<span class="hadeeth">`, and `<hadeeth>` / `</hadeeth>` become `</span>`.
 *
 * @param html - HTML string
 * @returns The rewritten HTML
 */
const normalizeHtml = (html) =>
    html.replace(/<hadeeth-\d+>/gi, '<span class="hadeeth">').replace(/<\s*\/?\s*hadeeth\s*>/gi, '</span>');

export {
    sanitizePageContent as a,
    removeTagsExceptSpan as i,
    parseContentRobust as n,
    splitPageBodyFromFooter as o,
    removeArabicNumericPageMarkers as r,
    DEFAULT_MASTER_METADATA_VERSION as s,
    normalizeHtml as t,
};
8
+ //# sourceMappingURL=content-B60R0uYQ.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content-B60R0uYQ.js","names":["DEFAULT_SANITIZATION_RULES: Record<string, string>","out: Line[]","tokens: Token[]","match: RegExpExecArray | null","attributes: Record<string, string | undefined>","id: string | undefined"],"sources":["../src/utils/constants.ts","../src/content.ts"],"sourcesContent":["/**\n * The default version number for master metadata.\n * @constant {number}\n */\nexport const DEFAULT_MASTER_METADATA_VERSION = 0;\n\n/**\n * Placeholder value used to represent unknown or missing data.\n * @constant {string}\n */\nexport const UNKNOWN_VALUE_PLACEHOLDER = '99999';\n\n/**\n * Default rules to sanitize page content.\n */\nexport const DEFAULT_SANITIZATION_RULES: Record<string, string> = {\n '<img[^>]*>>': '',\n ่ˆ„: '',\n '๏ต€': 'ุฑูŽุญูู…ูŽู‡ู ูฑู„ู„ูŽู‘ูฐู‡ู',\n '๏ต': 'ุฑุถูŠ ุงู„ู„ู‡ ุนู†ู‡',\n '๏ต‚': 'ุฑูŽุถููŠูŽ ูฑู„ู„ูŽู‘ูฐู‡ู ุนูŽู†ู’ู‡ูŽุง',\n '๏ตƒ': 'ุฑูŽุถููŠูŽ ุงู„ู„ูŽู‘ู‡ู ุนูŽู†ู’ู‡ูู…ู’',\n '๏ต„': 'ุฑูŽุถููŠูŽ ูฑู„ู„ูŽู‘ูฐู‡ู ุนูŽู†ู’ู‡ูู…ูŽุง',\n '๏ต…': 'ุฑูŽุถููŠูŽ ุงู„ู„ูŽู‘ู‡ู ุนูŽู†ู’ู‡ูู†ูŽู‘',\n '๏ต‡': 'ุนูŽู„ูŽูŠู’ู‡ู ูฑู„ุณูŽู‘ู„ูŽูฐู…ู',\n '๏ตˆ': 'ุนูŽู„ูŽูŠู’ู‡ูู…ู ุงู„ุณูŽู‘ู„ุงู…ู',\n '๏ตŠ': 'ุนู„ูŠู‡ ุงู„ุตู„ุงุฉ ูˆุงู„ุณู„ุงู…',\n '๏ตŒ': 'ุตู„ู‰ ุงู„ู„ู‡ ุนู„ูŠู‡ ูˆุขู„ู‡ ูˆุณู„ู…',\n '๏ต': 'ุนูŽู„ูŽูŠู’ู‡ู ูฑู„ุณูŽู‘ู„ูŽูฐู…ู',\n '๏ตŽ': 'ุชุจุงุฑูƒ ูˆุชุนุงู„ู‰',\n '๏ต': 'ุฑูŽุญูู…ูŽู‡ูู…ู ูฑู„ู„ูŽู‘ูฐู‡ู',\n '๏ทฝ': '',\n '๏ทฟ': 'ุนูŽุฒู‘ูŽ ูˆูŽุฌูŽู„ู‘ูŽ',\n};\n","import { DEFAULT_SANITIZATION_RULES } from './utils/constants';\n\nexport type Line = {\n id?: string;\n text: string;\n};\n\nconst PUNCT_ONLY = /^[)\\]\\u00BB\"โ€'โ€™.,?!:\\u061B\\u060C\\u061F\\u06D4\\u2026]+$/;\n\n/**\n * Merges punctuation-only lines into the preceding title when appropriate.\n *\n * @param lines - The processed line candidates to normalise\n * @returns A new array where dangling punctuation fragments are appended to titles\n */\nconst mergeDanglingPunctuation = (lines: Line[]): Line[] => 
{\n const out: Line[] = [];\n for (const item of lines) {\n const last = out[out.length - 1];\n if (last && PUNCT_ONLY.test(item.text)) {\n last.text += item.text;\n } else {\n out.push(item);\n }\n }\n return out;\n};\n\n/**\n * Normalises raw text into discrete line entries.\n *\n * @param text - Raw book content potentially containing inconsistent breaks\n * @returns An array of trimmed line strings with empty entries removed\n */\nconst splitIntoLines = (text: string) => {\n const normalized = text.replace(/\\r\\n/g, '\\n').replace(/\\r/g, '\\n');\n\n return normalized\n .split('\\n')\n .map((line) => line.trim())\n .filter(Boolean);\n};\n\n/**\n * Converts plain text content into {@link Line} objects without title metadata.\n *\n * @param content - The text content to split into line structures\n * @returns A {@link Line} array wrapping each detected sentence fragment\n */\nconst processTextContent = (content: string): Line[] => {\n return splitIntoLines(content).map((line) => ({ text: line }));\n};\n\n/**\n * Extracts an attribute value from the provided HTML tag string.\n *\n * @param tag - Raw HTML tag source\n * @param name - Attribute name to locate\n * @returns The attribute value when found; otherwise undefined\n */\nconst extractAttribute = (tag: string, name: string): string | undefined => {\n const pattern = new RegExp(`${name}\\\\s*=\\\\s*(\"([^\"]*)\"|'([^']*)'|([^s>]+))`, 'i');\n const match = tag.match(pattern);\n if (!match) {\n return undefined;\n }\n return match[2] ?? match[3] ?? 
match[4];\n};\n\ntype Token =\n | { type: 'text'; value: string }\n | { type: 'start'; name: string; attributes: Record<string, string | undefined> }\n | { type: 'end'; name: string };\n\n/**\n * Breaks the provided HTML fragment into structural tokens.\n *\n * @param html - HTML fragment containing book content markup\n * @returns A token stream describing text and span boundaries\n */\nconst tokenize = (html: string): Token[] => {\n const tokens: Token[] = [];\n const tagRegex = /<[^>]+>/g;\n let lastIndex = 0;\n let match: RegExpExecArray | null;\n match = tagRegex.exec(html);\n\n while (match) {\n if (match.index > lastIndex) {\n tokens.push({ type: 'text', value: html.slice(lastIndex, match.index) });\n }\n\n const raw = match[0];\n const isEnd = /^<\\//.test(raw);\n const nameMatch = raw.match(/^<\\/?\\s*([a-zA-Z0-9:-]+)/);\n const name = nameMatch ? nameMatch[1].toLowerCase() : '';\n\n if (isEnd) {\n tokens.push({ name, type: 'end' });\n } else {\n const attributes: Record<string, string | undefined> = {};\n attributes.id = extractAttribute(raw, 'id');\n attributes['data-type'] = extractAttribute(raw, 'data-type');\n tokens.push({ attributes, name, type: 'start' });\n }\n\n lastIndex = tagRegex.lastIndex;\n match = tagRegex.exec(html);\n }\n\n if (lastIndex < html.length) {\n tokens.push({ type: 'text', value: html.slice(lastIndex) });\n }\n\n return tokens;\n};\n\n/**\n * Pushes the accumulated text as a new line to the result array.\n */\nconst createLine = (text: string, id?: string): Line | null => {\n const trimmed = text.trim();\n if (!trimmed) {\n return null;\n }\n return id ? 
{ id, text: trimmed } : { text: trimmed };\n};\n\n/**\n * Finds the active title ID from the span stack.\n */\nconst getActiveTitleId = (spanStack: Array<{ isTitle: boolean; id?: string }>): string | undefined => {\n for (let i = spanStack.length - 1; i >= 0; i--) {\n const entry = spanStack[i];\n if (entry.isTitle && entry.id) {\n return entry.id;\n }\n }\n};\n\n/**\n * Processes text content by handling line breaks and maintaining title context.\n */\nconst processTextWithLineBreaks = (\n raw: string,\n state: {\n currentText: string;\n currentId?: string;\n result: Line[];\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n if (!raw) {\n return;\n }\n\n const parts = raw.split('\\n');\n\n for (let i = 0; i < parts.length; i++) {\n // Push previous line when crossing a line break\n if (i > 0) {\n const line = createLine(state.currentText, state.currentId);\n if (line) {\n state.result.push(line);\n }\n state.currentText = '';\n\n // Preserve title ID if still inside a title span\n const activeTitleId = getActiveTitleId(state.spanStack);\n state.currentId = activeTitleId || undefined;\n }\n\n // Append the text part\n if (parts[i]) {\n state.currentText += parts[i];\n }\n }\n};\n\n/**\n * Handles the start of a span tag, updating the stack and current ID.\n */\nconst handleSpanStart = (\n token: { attributes: Record<string, string | undefined> },\n state: {\n currentId?: string;\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n const dataType = token.attributes['data-type'];\n const isTitle = dataType === 'title';\n\n let id: string | undefined;\n if (isTitle) {\n const rawId = token.attributes.id ?? 
'';\n id = rawId.replace(/^toc-/, '');\n }\n\n state.spanStack.push({ id, isTitle });\n\n // First title span on the current physical line wins\n if (isTitle && id && !state.currentId) {\n state.currentId = id;\n }\n};\n\n/**\n * Parses Shamela HTML content into structured lines while preserving headings.\n *\n * @param content - The raw HTML markup representing a page\n * @returns An array of {@link Line} objects containing text and optional IDs\n */\nexport const parseContentRobust = (content: string): Line[] => {\n // Normalize line endings first\n content = content.replace(/\\r\\n/g, '\\n').replace(/\\r/g, '\\n');\n\n // Fast path when there are no span tags at all\n if (!/<span[^>]*>/i.test(content)) {\n return mergeDanglingPunctuation(processTextContent(content));\n }\n\n const tokens = tokenize(`<root>${content}</root>`);\n const state = {\n currentId: undefined as string | undefined,\n currentText: '',\n result: [] as Line[],\n spanStack: [] as Array<{ isTitle: boolean; id?: string }>,\n };\n\n // Process all tokens\n for (const token of tokens) {\n if (token.type === 'text') {\n processTextWithLineBreaks(token.value, state);\n } else if (token.type === 'start' && token.name === 'span') {\n handleSpanStart(token, state);\n } else if (token.type === 'end' && token.name === 'span') {\n // Closing a span does NOT end the line; trailing text stays on the same line\n state.spanStack.pop();\n }\n }\n\n // Flush any trailing text\n const finalLine = createLine(state.currentText, state.currentId);\n if (finalLine) {\n state.result.push(finalLine);\n }\n\n // Merge punctuation-only lines and drop empties\n return mergeDanglingPunctuation(state.result).filter((line) => line.text.length > 0);\n};\n\nconst DEFAULT_COMPILED_RULES = Object.entries(DEFAULT_SANITIZATION_RULES).map(([pattern, replacement]) => ({\n regex: new RegExp(pattern, 'g'),\n replacement,\n}));\n\n/**\n * Compiles sanitisation rules into RegExp objects for reuse.\n *\n * @param rules - Key/value 
replacements used during sanitisation\n * @returns A list of compiled regular expression rules\n */\nconst getCompiledRules = (rules: Record<string, string>) => {\n if (rules === DEFAULT_SANITIZATION_RULES) {\n return DEFAULT_COMPILED_RULES;\n }\n\n const compiled = [];\n for (const pattern in rules) {\n compiled.push({\n regex: new RegExp(pattern, 'g'),\n replacement: rules[pattern],\n });\n }\n return compiled;\n};\n\n/**\n * Sanitises page content by applying regex replacement rules.\n *\n * @param text - The text to clean\n * @param rules - Optional custom replacements, defaults to {@link DEFAULT_SANITIZATION_RULES}\n * @returns The sanitised content\n */\nexport const sanitizePageContent = (\n text: string,\n rules: Record<string, string> = DEFAULT_SANITIZATION_RULES,\n): string => {\n const compiledRules = getCompiledRules(rules);\n\n let content = text;\n for (let i = 0; i < compiledRules.length; i++) {\n const { regex, replacement } = compiledRules[i];\n content = content.replace(regex, replacement);\n }\n return content;\n};\n\n/**\n * Splits a page body from its trailing footnotes using a marker string.\n *\n * @param content - Combined body and footnote text\n * @param footnoteMarker - Marker indicating the start of footnotes\n * @returns A tuple containing the page body followed by the footnote section\n */\nexport const splitPageBodyFromFooter = (content: string, footnoteMarker = '_________') => {\n let footnote = '';\n const indexOfFootnote = content.indexOf(footnoteMarker);\n\n if (indexOfFootnote >= 0) {\n footnote = content.slice(indexOfFootnote + footnoteMarker.length);\n content = content.slice(0, indexOfFootnote);\n }\n\n return [content, footnote] as const;\n};\n\n/**\n * Removes Arabic numeral page markers enclosed in turtle โฆ— โฆ˜ brackets.\n * Replaces the marker along with up to two preceding whitespace characters\n * (space or carriage return) and up to one following whitespace character\n * with a single space.\n *\n * @param text - Text 
potentially containing page markers\n * @returns The text with numeric markers replaced by a single space\n */\nexport const removeArabicNumericPageMarkers = (text: string) => {\n return text.replace(/(?: |\\r){0,2}โฆ—[\\u0660-\\u0669]+โฆ˜(?: |\\r)?/g, ' ');\n};\n\n/**\n * Removes anchor and hadeeth tags from the content while preserving spans.\n *\n * @param content - HTML string containing various tags\n * @returns The content with only span tags retained\n */\nexport const removeTagsExceptSpan = (content: string) => {\n // Remove <a> tags and their content, keeping only the text inside\n content = content.replace(/<a[^>]*>(.*?)<\\/a>/gs, '$1');\n\n // Remove <hadeeth> tags (both self-closing, with content, and numbered)\n content = content.replace(/<hadeeth[^>]*>|<\\/hadeeth>|<hadeeth-\\d+>/gs, '');\n\n return content;\n};\n\n/**\n * Normalizes Shamela HTML for CSS styling:\n * - Converts <hadeeth-N> to <span class=\"hadeeth\">\n * - Converts </hadeeth> or standalone <hadeeth> to </span>\n */\nexport const normalizeHtml = (html: string): string => {\n return html.replace(/<hadeeth-\\d+>/gi, '<span class=\"hadeeth\">').replace(/<\\s*\\/?\\s*hadeeth\\s*>/gi, 
'</span>');\n};\n"],"mappings":"AAIA,MAAa,EAAkC,EAWlCA,EAAqD,CAC9D,cAAe,GACf,EAAG,GACH,IAAK,oBACL,IAAK,eACL,IAAK,0BACL,IAAK,0BACL,IAAK,4BACL,IAAK,2BACL,IAAK,sBACL,IAAK,uBACL,IAAK,sBACL,IAAK,0BACL,IAAK,sBACL,IAAK,eACL,IAAK,sBACL,IAAK,GACL,IAAK,gBACR,CC1BK,EAAa,wDAQb,EAA4B,GAA0B,CACxD,IAAMC,EAAc,EAAE,CACtB,IAAK,IAAM,KAAQ,EAAO,CACtB,IAAM,EAAO,EAAI,EAAI,OAAS,GAC1B,GAAQ,EAAW,KAAK,EAAK,KAAK,CAClC,EAAK,MAAQ,EAAK,KAElB,EAAI,KAAK,EAAK,CAGtB,OAAO,GASL,EAAkB,GACD,EAAK,QAAQ,QAAS;EAAK,CAAC,QAAQ,MAAO;EAAK,CAG9D,MAAM;EAAK,CACX,IAAK,GAAS,EAAK,MAAM,CAAC,CAC1B,OAAO,QAAQ,CASlB,EAAsB,GACjB,EAAe,EAAQ,CAAC,IAAK,IAAU,CAAE,KAAM,EAAM,EAAE,CAU5D,GAAoB,EAAa,IAAqC,CACxE,IAAM,EAAc,OAAO,GAAG,EAAK,yCAA0C,IAAI,CAC3E,EAAQ,EAAI,MAAM,EAAQ,CAC3B,KAGL,OAAO,EAAM,IAAM,EAAM,IAAM,EAAM,IAcnC,EAAY,GAA0B,CACxC,IAAMC,EAAkB,EAAE,CACpB,EAAW,WACb,EAAY,EACZC,EAGJ,IAFA,EAAQ,EAAS,KAAK,EAAK,CAEpB,GAAO,CACN,EAAM,MAAQ,GACd,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAW,EAAM,MAAM,CAAE,CAAC,CAG5E,IAAM,EAAM,EAAM,GACZ,EAAQ,OAAO,KAAK,EAAI,CACxB,EAAY,EAAI,MAAM,2BAA2B,CACjD,EAAO,EAAY,EAAU,GAAG,aAAa,CAAG,GAEtD,GAAI,EACA,EAAO,KAAK,CAAE,OAAM,KAAM,MAAO,CAAC,KAC/B,CACH,IAAMC,EAAiD,EAAE,CACzD,EAAW,GAAK,EAAiB,EAAK,KAAK,CAC3C,EAAW,aAAe,EAAiB,EAAK,YAAY,CAC5D,EAAO,KAAK,CAAE,aAAY,OAAM,KAAM,QAAS,CAAC,CAGpD,EAAY,EAAS,UACrB,EAAQ,EAAS,KAAK,EAAK,CAO/B,OAJI,EAAY,EAAK,QACjB,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAU,CAAE,CAAC,CAGxD,GAML,GAAc,EAAc,IAA6B,CAC3D,IAAM,EAAU,EAAK,MAAM,CAI3B,OAHK,EAGE,EAAK,CAAE,KAAI,KAAM,EAAS,CAAG,CAAE,KAAM,EAAS,CAF1C,MAQT,EAAoB,GAA4E,CAClG,IAAK,IAAI,EAAI,EAAU,OAAS,EAAG,GAAK,EAAG,IAAK,CAC5C,IAAM,EAAQ,EAAU,GACxB,GAAI,EAAM,SAAW,EAAM,GACvB,OAAO,EAAM,KAQnB,GACF,EACA,IAMC,CACD,GAAI,CAAC,EACD,OAGJ,IAAM,EAAQ,EAAI,MAAM;EAAK,CAE7B,IAAK,IAAI,EAAI,EAAG,EAAI,EAAM,OAAQ,IAAK,CAEnC,GAAI,EAAI,EAAG,CACP,IAAM,EAAO,EAAW,EAAM,YAAa,EAAM,UAAU,CACvD,GACA,EAAM,OAAO,KAAK,EAAK,CAE3B,EAAM,YAAc,GAIpB,EAAM,UADgB,EAAiB,EAAM,UAAU,EACpB,IAAA,GAInC,EAAM,KACN,EAAM,aAAe,EAAM,MAQjC,GACF,EACA,IAIC,CAED,IAAM,EADW,EAAM,WAAW,eACL,QAEzBC,EACA,IAEA,GADc,EAAM,
WAAW,IAAM,IAC1B,QAAQ,QAAS,GAAG,EAGnC,EAAM,UAAU,KAAK,CAAE,KAAI,UAAS,CAAC,CAGjC,GAAW,GAAM,CAAC,EAAM,YACxB,EAAM,UAAY,IAUb,EAAsB,GAA4B,CAK3D,GAHA,EAAU,EAAQ,QAAQ,QAAS;EAAK,CAAC,QAAQ,MAAO;EAAK,CAGzD,CAAC,eAAe,KAAK,EAAQ,CAC7B,OAAO,EAAyB,EAAmB,EAAQ,CAAC,CAGhE,IAAM,EAAS,EAAS,SAAS,EAAQ,SAAS,CAC5C,EAAQ,CACV,UAAW,IAAA,GACX,YAAa,GACb,OAAQ,EAAE,CACV,UAAW,EAAE,CAChB,CAGD,IAAK,IAAM,KAAS,EACZ,EAAM,OAAS,OACf,EAA0B,EAAM,MAAO,EAAM,CACtC,EAAM,OAAS,SAAW,EAAM,OAAS,OAChD,EAAgB,EAAO,EAAM,CACtB,EAAM,OAAS,OAAS,EAAM,OAAS,QAE9C,EAAM,UAAU,KAAK,CAK7B,IAAM,EAAY,EAAW,EAAM,YAAa,EAAM,UAAU,CAMhE,OALI,GACA,EAAM,OAAO,KAAK,EAAU,CAIzB,EAAyB,EAAM,OAAO,CAAC,OAAQ,GAAS,EAAK,KAAK,OAAS,EAAE,EAGlF,EAAyB,OAAO,QAAQ,EAA2B,CAAC,KAAK,CAAC,EAAS,MAAkB,CACvG,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,cACH,EAAE,CAQG,EAAoB,GAAkC,CACxD,GAAI,IAAU,EACV,OAAO,EAGX,IAAM,EAAW,EAAE,CACnB,IAAK,IAAM,KAAW,EAClB,EAAS,KAAK,CACV,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,YAAa,EAAM,GACtB,CAAC,CAEN,OAAO,GAUE,GACT,EACA,EAAgC,IACvB,CACT,IAAM,EAAgB,EAAiB,EAAM,CAEzC,EAAU,EACd,IAAK,IAAI,EAAI,EAAG,EAAI,EAAc,OAAQ,IAAK,CAC3C,GAAM,CAAE,QAAO,eAAgB,EAAc,GAC7C,EAAU,EAAQ,QAAQ,EAAO,EAAY,CAEjD,OAAO,GAUE,GAA2B,EAAiB,EAAiB,cAAgB,CACtF,IAAI,EAAW,GACT,EAAkB,EAAQ,QAAQ,EAAe,CAOvD,OALI,GAAmB,IACnB,EAAW,EAAQ,MAAM,EAAkB,EAAe,OAAO,CACjE,EAAU,EAAQ,MAAM,EAAG,EAAgB,EAGxC,CAAC,EAAS,EAAS,EAYjB,EAAkC,GACpC,EAAK,QAAQ,4CAA6C,IAAI,CAS5D,EAAwB,IAEjC,EAAU,EAAQ,QAAQ,uBAAwB,KAAK,CAGvD,EAAU,EAAQ,QAAQ,6CAA8C,GAAG,CAEpE,GAQE,EAAiB,GACnB,EAAK,QAAQ,kBAAmB,yBAAyB,CAAC,QAAQ,0BAA2B,UAAU"}
@@ -0,0 +1,54 @@
1
+ //#region src/content.d.ts
2
+ type Line = {
3
+ id?: string;
4
+ text: string;
5
+ };
6
+ /**
7
+ * Parses Shamela HTML content into structured lines while preserving headings.
8
+ *
9
+ * @param content - The raw HTML markup representing a page
10
+ * @returns An array of {@link Line} objects containing text and optional IDs
11
+ */
12
+ declare const parseContentRobust: (content: string) => Line[];
13
+ /**
14
+ * Sanitises page content by applying regex replacement rules.
15
+ *
16
+ * @param text - The text to clean
17
+ * @param rules - Optional custom replacements, defaults to {@link DEFAULT_SANITIZATION_RULES}
18
+ * @returns The sanitised content
19
+ */
20
+ declare const sanitizePageContent: (text: string, rules?: Record<string, string>) => string;
21
+ /**
22
+ * Splits a page body from its trailing footnotes using a marker string.
23
+ *
24
+ * @param content - Combined body and footnote text
25
+ * @param footnoteMarker - Marker indicating the start of footnotes
26
+ * @returns A tuple containing the page body followed by the footnote section
27
+ */
28
+ declare const splitPageBodyFromFooter: (content: string, footnoteMarker?: string) => readonly [string, string];
29
+ /**
30
+ * Removes Arabic numeral page markers enclosed in turtle ⦗ ⦘ brackets.
31
+ * Replaces the marker along with up to two preceding whitespace characters
32
+ * (space or carriage return) and up to one following whitespace character
33
+ * with a single space.
34
+ *
35
+ * @param text - Text potentially containing page markers
36
+ * @returns The text with numeric markers replaced by a single space
37
+ */
38
+ declare const removeArabicNumericPageMarkers: (text: string) => string;
39
+ /**
40
+ * Removes anchor and hadeeth tags from the content while preserving spans.
41
+ *
42
+ * @param content - HTML string containing various tags
43
+ * @returns The content with only span tags retained
44
+ */
45
+ declare const removeTagsExceptSpan: (content: string) => string;
46
+ /**
47
+ * Normalizes Shamela HTML for CSS styling:
48
+ * - Converts <hadeeth-N> to <span class="hadeeth">
49
+ * - Converts </hadeeth> or standalone <hadeeth> to </span>
50
+ */
51
+ declare const normalizeHtml: (html: string) => string;
52
+ //#endregion
53
+ export { removeTagsExceptSpan as a, removeArabicNumericPageMarkers as i, normalizeHtml as n, sanitizePageContent as o, parseContentRobust as r, splitPageBodyFromFooter as s, Line as t };
54
+ //# sourceMappingURL=content-CwjMtCQl.d.ts.map
@@ -0,0 +1,2 @@
1
+ import { a as removeTagsExceptSpan, i as removeArabicNumericPageMarkers, n as normalizeHtml, o as sanitizePageContent, r as parseContentRobust, s as splitPageBodyFromFooter, t as Line } from "./content-CwjMtCQl.js";
2
+ export { Line, normalizeHtml, parseContentRobust, removeArabicNumericPageMarkers, removeTagsExceptSpan, sanitizePageContent, splitPageBodyFromFooter };
@@ -0,0 +1 @@
1
+ import{a as e,i as t,n,o as r,r as i,t as a}from"./content-B60R0uYQ.js";export{a as normalizeHtml,n as parseContentRobust,i as removeArabicNumericPageMarkers,t as removeTagsExceptSpan,e as sanitizePageContent,r as splitPageBodyFromFooter};