shamela 1.4.1 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -15,6 +15,17 @@
15
15
 
16
16
  A universal TypeScript library for accessing and downloading Maktabah Shamela v4 APIs. The package runs in both Node.js and modern browsers, providing ergonomic helpers to interact with the Shamela API, download master and book databases, and retrieve book data programmatically.
17
17
 
18
+ > [!WARNING]
19
+ > ## API Key Notice
20
+ >
21
+ > **This library requires an API key to access Shamela's APIs.** I cannot provide API keys and am unable to assist with API key requests.
22
+ >
23
+ > **Please do not:**
24
+ > - Open issues asking for API keys
25
+ > - Contact me directly for API access
26
+ >
27
+ > **For API key inquiries, please email:** [mail@shamela.ws](mailto:mail@shamela.ws)
28
+
18
29
  ## Features
19
30
 
20
31
  - 🚀 **Full data lifecycle** – fetch metadata, download master and book databases, and query the results entirely in-memory.
package/dist/content.js CHANGED
@@ -3,7 +3,7 @@ import{DEFAULT_MAPPING_RULES as e,FOOTNOTE_MARKER as t}from"./utils/constants.js
3
3
  `).split(`
4
4
  `).map(e=>e.trim()).filter(Boolean),a=e=>i(e).map(e=>({text:e})),o=(e,t)=>{let n=RegExp(`${t}\\s*=\\s*("([^"]*)"|'([^']*)'|([^s>]+))`,`i`),r=e.match(n);if(r)return r[2]??r[3]??r[4]},s=e=>{let t=[],n=/<[^>]+>/g,r=0,i;for(i=n.exec(e);i;){i.index>r&&t.push({type:`text`,value:e.slice(r,i.index)});let a=i[0],s=/^<\//.test(a),c=a.match(/^<\/?\s*([a-zA-Z0-9:-]+)/),l=c?c[1].toLowerCase():``;if(s)t.push({name:l,type:`end`});else{let e={};e.id=o(a,`id`),e[`data-type`]=o(a,`data-type`),t.push({attributes:e,name:l,type:`start`})}r=n.lastIndex,i=n.exec(e)}return r<e.length&&t.push({type:`text`,value:e.slice(r)}),t},c=(e,t)=>{let n=e.trim();return n?t?{id:t,text:n}:{text:n}:null},l=e=>{for(let t=e.length-1;t>=0;t--){let n=e[t];if(n.isTitle&&n.id)return n.id}},u=(e,t)=>{if(!e)return;let n=e.split(`
5
5
  `);for(let e=0;e<n.length;e++){if(e>0){let e=c(t.currentText,t.currentId);e&&t.result.push(e),t.currentText=``,t.currentId=l(t.spanStack)||void 0}n[e]&&(t.currentText+=n[e])}},d=(e,t)=>{let n=e.attributes[`data-type`]===`title`,r;n&&(r=(e.attributes.id??``).replace(/^toc-/,``)),t.spanStack.push({id:r,isTitle:n}),n&&r&&!t.currentId&&(t.currentId=r)},f=e=>e.includes(`\r`)?e.replace(/\r\n?/g,`
6
- `):e,p=e=>{if(e=f(e),!/<span[^>]*>/i.test(e))return r(a(e));let t=s(`<root>${e}</root>`),n={currentId:``,currentText:``,result:[],spanStack:[]};for(let e of t)e.type===`text`?u(e.value,n):e.type===`start`&&e.name===`span`?d(e,n):e.type===`end`&&e.name===`span`&&n.spanStack.pop();let i=c(n.currentText,n.currentId);return i&&n.result.push(i),r(n.result).filter(e=>e.text.length>0)},m=Object.entries(e).map(([e,t])=>({regex:new RegExp(e,`g`),replacement:t})),h=t=>{if(t===e)return m;let n=[];for(let e in t)n.push({regex:new RegExp(e,`g`),replacement:t[e]});return n},g=(t,n=e)=>{let r=h(n),i=t;for(let e=0;e<r.length;e++){let{regex:t,replacement:n}=r[e];i=i.replace(t,n)}return i},_=(e,n=t)=>{let r=``,i=e.indexOf(n);return i>=0&&(r=e.slice(i+n.length),e=e.slice(0,i)),[e,r]},v=e=>e.replace(/(?: |\r){0,2}⦗[\u0660-\u0669]+⦘(?: |\r)?/g,` `),y=e=>(e=e.replace(/<a[^>]*>(.*?)<\/a>/gs,`$1`),e=e.replace(/<hadeeth[^>]*>|<\/hadeeth>|<hadeeth-\d+>/gs,``),e),b=e=>e.replace(/<hadeeth-\d+>/gi,`<span class="hadeeth">`).replace(/<\s*\/?\s*hadeeth\s*>/gi,`</span>`),x=e=>e.replace(/<[^>]*>/g,``),S=e=>e.replace(/(^|\r)([^\r]*?)<span[^>]*data-type=["']title["'][^>]*>/gi,`$1<span data-type="title">$2`),C=e=>x(e.replace(/<span[^>]*data-type=["']title["'][^>]*>(.*?)<\/span>/gi,`## $1`).replace(/<a[^>]*href=["']inr:\/\/[^"']*["'][^>]*>(.*?)<\/a>/gi,`$1`)),w=(e,t)=>{let{separator:n=` — `,strategy:r}=t;if(!e)return e;let i=/<span\b[^>]*\bdata-type=(["'])title\1[^>]*>[\s\S]*?<\/span>/gi;return e.replace(/(?:<span\b[^>]*\bdata-type=(["'])title\1[^>]*>[\s\S]*?<\/span>\s*){2,}/gi,e=>{let t=e.match(i)??[];if(t.length<2)return e;if(r===`splitLines`)return t.join(`
7
- `);if(r===`merge`){let e=t.map(e=>e.replace(/^<span\b[^>]*>/i,``).replace(/<\/span>$/i,``).trim()).filter(Boolean);return`${t[0].match(/^<span\b[^>]*>/i)?.[0]??`<span data-type="title">`}${e.join(n)}</span>`}return[t[0],...t.slice(1).map(e=>e.replace(/\bdata-type=(["'])title\1/i,`data-type="subtitle"`))].join(`
8
- `)})},T=(e,t)=>(e=w(e,{strategy:`splitLines`,...t}),e=S(e),e=C(e),e);export{T as convertContentToMarkdown,C as htmlToMarkdown,g as mapPageCharacterContent,S as moveContentAfterLineBreakIntoSpan,b as normalizeHtml,f as normalizeLineEndings,w as normalizeTitleSpans,p as parseContentRobust,v as removeArabicNumericPageMarkers,y as removeTagsExceptSpan,_ as splitPageBodyFromFooter,x as stripHtmlTags};
6
+ `):e,p=e=>{if(e=f(e),!/<span[^>]*>/i.test(e))return r(a(e));let t=s(`<root>${e}</root>`),n={currentId:``,currentText:``,result:[],spanStack:[]};for(let e of t)e.type===`text`?u(e.value,n):e.type===`start`&&e.name===`span`?d(e,n):e.type===`end`&&e.name===`span`&&n.spanStack.pop();let i=c(n.currentText,n.currentId);return i&&n.result.push(i),r(n.result).filter(e=>e.text.length>0)},m=Object.entries(e).map(([e,t])=>({regex:new RegExp(e,`g`),replacement:t})),h=t=>{if(t===e)return m;let n=[];for(let e in t)n.push({regex:new RegExp(e,`g`),replacement:t[e]});return n},g=(t,n=e)=>{let r=h(n),i=t;for(let e=0;e<r.length;e++){let{regex:t,replacement:n}=r[e];i=i.replace(t,n)}return i},_=(e,n=t)=>{let r=``,i=e.indexOf(n);return i>=0&&(r=e.slice(i+n.length),e=e.slice(0,i)),[e,r]},v=e=>e.replace(/(?: |\r){0,2}⦗[\u0660-\u0669]+⦘(?: |\r)?/g,` `),y=e=>(e=e.replace(/<a[^>]*>(.*?)<\/a>/gs,`$1`),e=e.replace(/<hadeeth[^>]*>|<\/hadeeth>|<hadeeth-\d+>/gs,``),e),b=e=>e.replace(/<hadeeth-\d+>/gi,`<span class="hadeeth">`).replace(/<\s*\/?\s*hadeeth\s*>/gi,`</span>`),x=e=>e.replace(/<[^>]*>/g,``),S=e=>e.replace(/(^|[\r\n])([^\r\n]*?)<span[^>]*data-type=["']title["'][^>]*>/gi,`$1<span data-type="title">$2`),C=e=>x(e.replace(/<span[^>]*data-type=["']title["'][^>]*>(.*?)<\/span>/gi,`## $1`).replace(/<a[^>]*href=["']inr:\/\/[^"']*["'][^>]*>(.*?)<\/a>/gi,`$1`)),w=(e,t)=>{let{separator:n=` — `,strategy:r}=t;if(!e)return e;let i=/<span\b[^>]*\bdata-type=(['"])title\1[^>]*>[\s\S]*?<\/span>/gi;return e.replace(/((?:<span\b[^>]*\bdata-type=(['"])title\2[^>]*>[\s\S]*?<\/span>\s*){2,})/gi,e=>{let t=e.match(i)??[];if(t.length<2)return e;let a=t[t.length-1],o=e.lastIndexOf(a)+a.length,s=e.slice(o);if(r===`splitLines`)return t.join(`
7
+ `)+s;if(r===`merge`){let e=t.map(e=>e.replace(/^<span\b[^>]*>/i,``).replace(/<\/span>$/i,``).trim()).filter(Boolean);return`${t[0].match(/^<span\b[^>]*>/i)?.[0]??`<span data-type="title">`}${e.join(n)}</span>`}return[t[0],...t.slice(1).map(e=>e.replace(/\bdata-type=(["'])title\1/i,`data-type="subtitle"`))].join(`
8
+ `)})},T=(e,t)=>(e=w(e,{strategy:`splitLines`,...t}),e=S(e),e=C(e),e=f(e),e);export{T as convertContentToMarkdown,C as htmlToMarkdown,g as mapPageCharacterContent,S as moveContentAfterLineBreakIntoSpan,b as normalizeHtml,f as normalizeLineEndings,w as normalizeTitleSpans,p as parseContentRobust,v as removeArabicNumericPageMarkers,y as removeTagsExceptSpan,_ as splitPageBodyFromFooter,x as stripHtmlTags};
9
9
  //# sourceMappingURL=content.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"content.js","names":["out: Line[]","tokens: Token[]","match: RegExpExecArray | null","attributes: Record<string, string | undefined>","id: string | undefined"],"sources":["../src/content.ts"],"sourcesContent":["import type { NormalizeTitleSpanOptions } from './types';\nimport { DEFAULT_MAPPING_RULES, FOOTNOTE_MARKER } from './utils/constants';\n\nexport type Line = {\n id?: string;\n text: string;\n};\n\nconst PUNCT_ONLY = /^[)\\]\\u00BB\"”'’.,?!:\\u061B\\u060C\\u061F\\u06D4\\u2026]+$/;\n\n/**\n * Merges punctuation-only lines into the preceding title when appropriate.\n *\n * @param lines - The processed line candidates to normalise\n * @returns A new array where dangling punctuation fragments are appended to titles\n */\nconst mergeDanglingPunctuation = (lines: Line[]): Line[] => {\n const out: Line[] = [];\n for (const item of lines) {\n const last = out[out.length - 1];\n if (last && PUNCT_ONLY.test(item.text)) {\n last.text += item.text;\n } else {\n out.push(item);\n }\n }\n return out;\n};\n\n/**\n * Normalises raw text into discrete line entries.\n *\n * @param text - Raw book content potentially containing inconsistent breaks\n * @returns An array of trimmed line strings with empty entries removed\n */\nconst splitIntoLines = (text: string) => {\n const normalized = text.replace(/\\r\\n/g, '\\n').replace(/\\r/g, '\\n');\n\n return normalized\n .split('\\n')\n .map((line) => line.trim())\n .filter(Boolean);\n};\n\n/**\n * Converts plain text content into {@link Line} objects without title metadata.\n *\n * @param content - The text content to split into line structures\n * @returns A {@link Line} array wrapping each detected sentence fragment\n */\nconst processTextContent = (content: string): Line[] => {\n return splitIntoLines(content).map((line) => ({ text: line }));\n};\n\n/**\n * Extracts an attribute value from the provided HTML tag string.\n *\n * @param tag - Raw HTML tag source\n * @param name - Attribute name to locate\n * @returns The attribute value when found; otherwise undefined\n */\nconst extractAttribute = (tag: string, name: string): string | undefined => {\n const pattern = new RegExp(`${name}\\\\s*=\\\\s*(\"([^\"]*)\"|'([^']*)'|([^s>]+))`, 'i');\n const match = tag.match(pattern);\n if (!match) {\n return undefined;\n }\n return match[2] ?? match[3] ?? match[4];\n};\n\ntype Token =\n | { type: 'text'; value: string }\n | { type: 'start'; name: string; attributes: Record<string, string | undefined> }\n | { type: 'end'; name: string };\n\n/**\n * Breaks the provided HTML fragment into structural tokens.\n *\n * @param html - HTML fragment containing book content markup\n * @returns A token stream describing text and span boundaries\n */\nconst tokenize = (html: string): Token[] => {\n const tokens: Token[] = [];\n const tagRegex = /<[^>]+>/g;\n let lastIndex = 0;\n let match: RegExpExecArray | null;\n match = tagRegex.exec(html);\n\n while (match) {\n if (match.index > lastIndex) {\n tokens.push({ type: 'text', value: html.slice(lastIndex, match.index) });\n }\n\n const raw = match[0];\n const isEnd = /^<\\//.test(raw);\n const nameMatch = raw.match(/^<\\/?\\s*([a-zA-Z0-9:-]+)/);\n const name = nameMatch ? nameMatch[1].toLowerCase() : '';\n\n if (isEnd) {\n tokens.push({ name, type: 'end' });\n } else {\n const attributes: Record<string, string | undefined> = {};\n attributes.id = extractAttribute(raw, 'id');\n attributes['data-type'] = extractAttribute(raw, 'data-type');\n tokens.push({ attributes, name, type: 'start' });\n }\n\n lastIndex = tagRegex.lastIndex;\n match = tagRegex.exec(html);\n }\n\n if (lastIndex < html.length) {\n tokens.push({ type: 'text', value: html.slice(lastIndex) });\n }\n\n return tokens;\n};\n\n/**\n * Pushes the accumulated text as a new line to the result array.\n */\nconst createLine = (text: string, id?: string): Line | null => {\n const trimmed = text.trim();\n if (!trimmed) {\n return null;\n }\n return id ? { id, text: trimmed } : { text: trimmed };\n};\n\n/**\n * Finds the active title ID from the span stack.\n */\nconst getActiveTitleId = (spanStack: Array<{ isTitle: boolean; id?: string }>): string | undefined => {\n for (let i = spanStack.length - 1; i >= 0; i--) {\n const entry = spanStack[i];\n if (entry.isTitle && entry.id) {\n return entry.id;\n }\n }\n};\n\n/**\n * Processes text content by handling line breaks and maintaining title context.\n */\nconst processTextWithLineBreaks = (\n raw: string,\n state: {\n currentText: string;\n currentId?: string;\n result: Line[];\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n if (!raw) {\n return;\n }\n\n const parts = raw.split('\\n');\n\n for (let i = 0; i < parts.length; i++) {\n // Push previous line when crossing a line break\n if (i > 0) {\n const line = createLine(state.currentText, state.currentId);\n if (line) {\n state.result.push(line);\n }\n state.currentText = '';\n\n // Preserve title ID if still inside a title span\n const activeTitleId = getActiveTitleId(state.spanStack);\n state.currentId = activeTitleId || undefined;\n }\n\n // Append the text part\n if (parts[i]) {\n state.currentText += parts[i];\n }\n }\n};\n\n/**\n * Handles the start of a span tag, updating the stack and current ID.\n */\nconst handleSpanStart = (\n token: { attributes: Record<string, string | undefined> },\n state: {\n currentId?: string;\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n const dataType = token.attributes['data-type'];\n const isTitle = dataType === 'title';\n\n let id: string | undefined;\n if (isTitle) {\n const rawId = token.attributes.id ?? '';\n id = rawId.replace(/^toc-/, '');\n }\n\n state.spanStack.push({ id, isTitle });\n\n // First title span on the current physical line wins\n if (isTitle && id && !state.currentId) {\n state.currentId = id;\n }\n};\n\n/**\n * Normalizes line endings to Unix-style (`\\n`).\n *\n * Converts Windows (`\\r\\n`) and old Mac (`\\r`) line endings to Unix style\n * for consistent pattern matching across platforms.\n *\n * @param content - Raw content with potentially mixed line endings\n * @returns Content with all line endings normalized to `\\n`\n */\nexport const normalizeLineEndings = (content: string) => {\n return content.includes('\\r') ? content.replace(/\\r\\n?/g, '\\n') : content;\n};\n\n/**\n * Parses Shamela HTML content into structured lines while preserving headings.\n *\n * @param content - The raw HTML markup representing a page\n * @returns An array of {@link Line} objects containing text and optional IDs\n */\nexport const parseContentRobust = (content: string): Line[] => {\n // Normalize line endings first\n content = normalizeLineEndings(content);\n\n // Fast path when there are no span tags at all\n if (!/<span[^>]*>/i.test(content)) {\n return mergeDanglingPunctuation(processTextContent(content));\n }\n\n const tokens = tokenize(`<root>${content}</root>`);\n const state = {\n currentId: '',\n currentText: '',\n result: [] as Line[],\n spanStack: [] as Array<{ isTitle: boolean; id?: string }>,\n };\n\n // Process all tokens\n for (const token of tokens) {\n if (token.type === 'text') {\n processTextWithLineBreaks(token.value, state);\n } else if (token.type === 'start' && token.name === 'span') {\n handleSpanStart(token, state);\n } else if (token.type === 'end' && token.name === 'span') {\n // Closing a span does NOT end the line; trailing text stays on the same line\n state.spanStack.pop();\n }\n }\n\n // Flush any trailing text\n const finalLine = createLine(state.currentText, state.currentId);\n if (finalLine) {\n state.result.push(finalLine);\n }\n\n // Merge punctuation-only lines and drop empties\n return mergeDanglingPunctuation(state.result).filter((line) => line.text.length > 0);\n};\n\nconst DEFAULT_COMPILED_RULES = Object.entries(DEFAULT_MAPPING_RULES).map(([pattern, replacement]) => ({\n regex: new RegExp(pattern, 'g'),\n replacement,\n}));\n\n/**\n * Compiles sanitisation rules into RegExp objects for reuse.\n *\n * @param rules - Key/value replacements used during sanitisation\n * @returns A list of compiled regular expression rules\n */\nconst getCompiledRules = (rules: Record<string, string>) => {\n if (rules === DEFAULT_MAPPING_RULES) {\n return DEFAULT_COMPILED_RULES;\n }\n\n const compiled = [];\n for (const pattern in rules) {\n compiled.push({\n regex: new RegExp(pattern, 'g'),\n replacement: rules[pattern],\n });\n }\n return compiled;\n};\n\n/**\n * Sanitises page content by applying regex replacement rules.\n *\n * @param text - The text to clean\n * @param rules - Optional custom replacements, defaults to {@link DEFAULT_MAPPING_RULES}\n * @returns The sanitised content\n */\nexport const mapPageCharacterContent = (\n text: string,\n rules: Record<string, string> = DEFAULT_MAPPING_RULES,\n): string => {\n const compiledRules = getCompiledRules(rules);\n\n let content = text;\n for (let i = 0; i < compiledRules.length; i++) {\n const { regex, replacement } = compiledRules[i];\n content = content.replace(regex, replacement);\n }\n return content;\n};\n\n/**\n * Splits a page body from its trailing footnotes using a marker string.\n *\n * @param content - Combined body and footnote text\n * @param footnoteMarker - Marker indicating the start of footnotes\n * @returns A tuple containing the page body followed by the footnote section\n */\nexport const splitPageBodyFromFooter = (content: string, footnoteMarker = FOOTNOTE_MARKER) => {\n let footnote = '';\n const indexOfFootnote = content.indexOf(footnoteMarker);\n\n if (indexOfFootnote >= 0) {\n footnote = content.slice(indexOfFootnote + footnoteMarker.length);\n content = content.slice(0, indexOfFootnote);\n }\n\n return [content, footnote] as const;\n};\n\n/**\n * Removes Arabic numeral page markers enclosed in turtle ⦗ ⦘ brackets.\n * Replaces the marker along with up to two preceding whitespace characters\n * (space or carriage return) and up to one following whitespace character\n * with a single space.\n *\n * @param text - Text potentially containing page markers\n * @returns The text with numeric markers replaced by a single space\n */\nexport const removeArabicNumericPageMarkers = (text: string) => {\n return text.replace(/(?: |\\r){0,2}⦗[\\u0660-\\u0669]+⦘(?: |\\r)?/g, ' ');\n};\n\n/**\n * Removes anchor and hadeeth tags from the content while preserving spans.\n *\n * @param content - HTML string containing various tags\n * @returns The content with only span tags retained\n */\nexport const removeTagsExceptSpan = (content: string) => {\n // Remove <a> tags and their content, keeping only the text inside\n content = content.replace(/<a[^>]*>(.*?)<\\/a>/gs, '$1');\n\n // Remove <hadeeth> tags (both self-closing, with content, and numbered)\n content = content.replace(/<hadeeth[^>]*>|<\\/hadeeth>|<hadeeth-\\d+>/gs, '');\n\n return content;\n};\n\n/**\n * Normalizes Shamela HTML for CSS styling:\n * - Converts <hadeeth-N> to <span class=\"hadeeth\">\n * - Converts </hadeeth> or standalone <hadeeth> to </span>\n */\nexport const normalizeHtml = (html: string): string => {\n return html.replace(/<hadeeth-\\d+>/gi, '<span class=\"hadeeth\">').replace(/<\\s*\\/?\\s*hadeeth\\s*>/gi, '</span>');\n};\n\n/**\n * Strip all HTML tags from content, keeping only text.\n *\n * @param html - HTML content\n * @returns Plain text content\n */\nexport const stripHtmlTags = (html: string) => {\n return html.replace(/<[^>]*>/g, '');\n};\n\n/**\n * Moves content that appears after a line break but before a title span into the span.\n *\n * This handles cases where text at the start of a line (such as chapter numbers like \"١ -\")\n * should logically be part of the following title but was placed outside the span in the HTML.\n *\n * @example\n * ```typescript\n * // Input: \"\\rباب الأول<span data-type=\"title\">العنوان</span>\"\n * // Output: \"\\r<span data-type=\"title\">باب الأول العنوان</span>\"\n * ```\n *\n * @param html - HTML content with potential pre-title text\n * @returns HTML with pre-title text moved inside title spans\n */\nexport const moveContentAfterLineBreakIntoSpan = (html: string) => {\n return (\n html\n // Move content after line break (or at start) but before title span INTO the span\n .replace(/(^|\\r)([^\\r]*?)<span[^>]*data-type=[\"']title[\"'][^>]*>/gi, '$1<span data-type=\"title\">$2')\n );\n};\n\n/**\n * Convert Shamela HTML to Markdown format for easier pattern matching.\n *\n * Transformations:\n * - `<span data-type=\"title\">text</span>` → `## text`\n * - `<a href=\"inr://...\">text</a>` → `text` (strip narrator links)\n * - All other HTML tags → stripped\n *\n * Note: Content typically already has proper line breaks before title spans,\n * so we don't add extra newlines around the ## header.\n * Line ending normalization is handled by segmentPages.\n *\n * @param html - HTML content from Shamela\n * @returns Markdown-formatted content\n */\nexport const htmlToMarkdown = (html: string) => {\n const converted = html\n // Convert title spans to markdown headers (no extra newlines - content already has them)\n .replace(/<span[^>]*data-type=[\"']title[\"'][^>]*>(.*?)<\\/span>/gi, '## $1')\n // Strip narrator links but keep text\n .replace(/<a[^>]*href=[\"']inr:\\/\\/[^\"']*[\"'][^>]*>(.*?)<\\/a>/gi, '$1');\n\n return stripHtmlTags(converted);\n};\n\n/**\n * Normalizes consecutive Shamela-style title spans.\n *\n * Shamela exports sometimes contain adjacent title spans like:\n * `<span data-type=\"title\">باب الميم</span><span data-type=\"title\">من اسمه محمد</span>`\n *\n * If you naively convert each title span into a markdown heading, you can end up with:\n * `## باب الميم ## من اسمه محمد` (two headings on one line).\n *\n * This helper rewrites the HTML so downstream HTML→Markdown conversion can stay simple and consistent.\n */\nexport const normalizeTitleSpans = (html: string, options: NormalizeTitleSpanOptions): string => {\n const { separator = ' — ', strategy } = options;\n if (!html) {\n return html;\n }\n\n const titleSpanRegex = /<span\\b[^>]*\\bdata-type=([\"'])title\\1[^>]*>[\\s\\S]*?<\\/span>/gi;\n // Two or more title spans with optional whitespace between them\n const titleRunRegex = /(?:<span\\b[^>]*\\bdata-type=([\"'])title\\1[^>]*>[\\s\\S]*?<\\/span>\\s*){2,}/gi;\n\n return html.replace(titleRunRegex, (run) => {\n const spans = run.match(titleSpanRegex) ?? [];\n if (spans.length < 2) {\n return run;\n }\n\n if (strategy === 'splitLines') {\n return spans.join('\\n');\n }\n\n if (strategy === 'merge') {\n const texts = spans\n .map((s) =>\n s\n .replace(/^<span\\b[^>]*>/i, '')\n .replace(/<\\/span>$/i, '')\n .trim(),\n )\n .filter(Boolean);\n\n // Preserve the first span's opening tag (attributes) but replace its inner text.\n const firstOpenTagMatch = spans[0]!.match(/^<span\\b[^>]*>/i);\n const openTag = firstOpenTagMatch?.[0] ?? '<span data-type=\"title\">';\n return `${openTag}${texts.join(separator)}</span>`;\n }\n\n // hierarchy\n const first = spans[0];\n const rest = spans.slice(1).map((s) => s.replace(/\\bdata-type=([\"'])title\\1/i, 'data-type=\"subtitle\"'));\n return [first, ...rest].join('\\n');\n });\n};\n\n/**\n * Converts Shamela HTML content to Markdown format using a standardized pipeline.\n *\n * This is a convenience function that applies the recommended sequence of transformations:\n * 1. Normalizes consecutive title spans (default: splitLines strategy)\n * 2. Moves pre-title text into spans\n * 3. Converts to Markdown format\n *\n * @example\n * ```typescript\n * const html = '<span data-type=\"title\">Chapter</span><span data-type=\"title\">One</span>';\n * const markdown = convertContentToMarkdown(html);\n * // => \"## Chapter\\n## One\"\n * ```\n *\n * @param content - Raw HTML content from Shamela\n * @param options - Optional configuration for title span normalization\n * @returns Markdown-formatted content\n */\nexport const convertContentToMarkdown = (content: string, options?: NormalizeTitleSpanOptions) => {\n content = normalizeTitleSpans(content, { strategy: 'splitLines', ...options });\n content = moveContentAfterLineBreakIntoSpan(content);\n content = htmlToMarkdown(content);\n\n return content;\n};\n"],"mappings":"kFAQA,MAAM,EAAa,wDAQb,EAA4B,GAA0B,CACxD,IAAMA,EAAc,EAAE,CACtB,IAAK,IAAM,KAAQ,EAAO,CACtB,IAAM,EAAO,EAAI,EAAI,OAAS,GAC1B,GAAQ,EAAW,KAAK,EAAK,KAAK,CAClC,EAAK,MAAQ,EAAK,KAElB,EAAI,KAAK,EAAK,CAGtB,OAAO,GASL,EAAkB,GACD,EAAK,QAAQ,QAAS;EAAK,CAAC,QAAQ,MAAO;EAAK,CAG9D,MAAM;EAAK,CACX,IAAK,GAAS,EAAK,MAAM,CAAC,CAC1B,OAAO,QAAQ,CASlB,EAAsB,GACjB,EAAe,EAAQ,CAAC,IAAK,IAAU,CAAE,KAAM,EAAM,EAAE,CAU5D,GAAoB,EAAa,IAAqC,CACxE,IAAM,EAAc,OAAO,GAAG,EAAK,yCAA0C,IAAI,CAC3E,EAAQ,EAAI,MAAM,EAAQ,CAC3B,KAGL,OAAO,EAAM,IAAM,EAAM,IAAM,EAAM,IAcnC,EAAY,GAA0B,CACxC,IAAMC,EAAkB,EAAE,CACpB,EAAW,WACb,EAAY,EACZC,EAGJ,IAFA,EAAQ,EAAS,KAAK,EAAK,CAEpB,GAAO,CACN,EAAM,MAAQ,GACd,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAW,EAAM,MAAM,CAAE,CAAC,CAG5E,IAAM,EAAM,EAAM,GACZ,EAAQ,OAAO,KAAK,EAAI,CACxB,EAAY,EAAI,MAAM,2BAA2B,CACjD,EAAO,EAAY,EAAU,GAAG,aAAa,CAAG,GAEtD,GAAI,EACA,EAAO,KAAK,CAAE,OAAM,KAAM,MAAO,CAAC,KAC/B,CACH,IAAMC,EAAiD,EAAE,CACzD,EAAW,GAAK,EAAiB,EAAK,KAAK,CAC3C,EAAW,aAAe,EAAiB,EAAK,YAAY,CAC5D,EAAO,KAAK,CAAE,aAAY,OAAM,KAAM,QAAS,CAAC,CAGpD,EAAY,EAAS,UACrB,EAAQ,EAAS,KAAK,EAAK,CAO/B,OAJI,EAAY,EAAK,QACjB,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAU,CAAE,CAAC,CAGxD,GAML,GAAc,EAAc,IAA6B,CAC3D,IAAM,EAAU,EAAK,MAAM,CAI3B,OAHK,EAGE,EAAK,CAAE,KAAI,KAAM,EAAS,CAAG,CAAE,KAAM,EAAS,CAF1C,MAQT,EAAoB,GAA4E,CAClG,IAAK,IAAI,EAAI,EAAU,OAAS,EAAG,GAAK,EAAG,IAAK,CAC5C,IAAM,EAAQ,EAAU,GACxB,GAAI,EAAM,SAAW,EAAM,GACvB,OAAO,EAAM,KAQnB,GACF,EACA,IAMC,CACD,GAAI,CAAC,EACD,OAGJ,IAAM,EAAQ,EAAI,MAAM;EAAK,CAE7B,IAAK,IAAI,EAAI,EAAG,EAAI,EAAM,OAAQ,IAAK,CAEnC,GAAI,EAAI,EAAG,CACP,IAAM,EAAO,EAAW,EAAM,YAAa,EAAM,UAAU,CACvD,GACA,EAAM,OAAO,KAAK,EAAK,CAE3B,EAAM,YAAc,GAIpB,EAAM,UADgB,EAAiB,EAAM,UAAU,EACpB,IAAA,GAInC,EAAM,KACN,EAAM,aAAe,EAAM,MAQjC,GACF,EACA,IAIC,CAED,IAAM,EADW,EAAM,WAAW,eACL,QAEzBC,EACA,IAEA,GADc,EAAM,WAAW,IAAM,IAC1B,QAAQ,QAAS,GAAG,EAGnC,EAAM,UAAU,KAAK,CAAE,KAAI,UAAS,CAAC,CAGjC,GAAW,GAAM,CAAC,EAAM,YACxB,EAAM,UAAY,IAab,EAAwB,GAC1B,EAAQ,SAAS,KAAK,CAAG,EAAQ,QAAQ,SAAU;EAAK,CAAG,EASzD,EAAsB,GAA4B,CAK3D,GAHA,EAAU,EAAqB,EAAQ,CAGnC,CAAC,eAAe,KAAK,EAAQ,CAC7B,OAAO,EAAyB,EAAmB,EAAQ,CAAC,CAGhE,IAAM,EAAS,EAAS,SAAS,EAAQ,SAAS,CAC5C,EAAQ,CACV,UAAW,GACX,YAAa,GACb,OAAQ,EAAE,CACV,UAAW,EAAE,CAChB,CAGD,IAAK,IAAM,KAAS,EACZ,EAAM,OAAS,OACf,EAA0B,EAAM,MAAO,EAAM,CACtC,EAAM,OAAS,SAAW,EAAM,OAAS,OAChD,EAAgB,EAAO,EAAM,CACtB,EAAM,OAAS,OAAS,EAAM,OAAS,QAE9C,EAAM,UAAU,KAAK,CAK7B,IAAM,EAAY,EAAW,EAAM,YAAa,EAAM,UAAU,CAMhE,OALI,GACA,EAAM,OAAO,KAAK,EAAU,CAIzB,EAAyB,EAAM,OAAO,CAAC,OAAQ,GAAS,EAAK,KAAK,OAAS,EAAE,EAGlF,EAAyB,OAAO,QAAQ,EAAsB,CAAC,KAAK,CAAC,EAAS,MAAkB,CAClG,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,cACH,EAAE,CAQG,EAAoB,GAAkC,CACxD,GAAI,IAAU,EACV,OAAO,EAGX,IAAM,EAAW,EAAE,CACnB,IAAK,IAAM,KAAW,EAClB,EAAS,KAAK,CACV,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,YAAa,EAAM,GACtB,CAAC,CAEN,OAAO,GAUE,GACT,EACA,EAAgC,IACvB,CACT,IAAM,EAAgB,EAAiB,EAAM,CAEzC,EAAU,EACd,IAAK,IAAI,EAAI,EAAG,EAAI,EAAc,OAAQ,IAAK,CAC3C,GAAM,CAAE,QAAO,eAAgB,EAAc,GAC7C,EAAU,EAAQ,QAAQ,EAAO,EAAY,CAEjD,OAAO,GAUE,GAA2B,EAAiB,EAAiB,IAAoB,CAC1F,IAAI,EAAW,GACT,EAAkB,EAAQ,QAAQ,EAAe,CAOvD,OALI,GAAmB,IACnB,EAAW,EAAQ,MAAM,EAAkB,EAAe,OAAO,CACjE,EAAU,EAAQ,MAAM,EAAG,EAAgB,EAGxC,CAAC,EAAS,EAAS,EAYjB,EAAkC,GACpC,EAAK,QAAQ,4CAA6C,IAAI,CAS5D,EAAwB,IAEjC,EAAU,EAAQ,QAAQ,uBAAwB,KAAK,CAGvD,EAAU,EAAQ,QAAQ,6CAA8C,GAAG,CAEpE,GAQE,EAAiB,GACnB,EAAK,QAAQ,kBAAmB,yBAAyB,CAAC,QAAQ,0BAA2B,UAAU,CASrG,EAAiB,GACnB,EAAK,QAAQ,WAAY,GAAG,CAkB1B,EAAqC,GAE1C,EAEK,QAAQ,2DAA4D,+BAA+B,CAmBnG,EAAkB,GAOpB,EANW,EAEb,QAAQ,yDAA0D,QAAQ,CAE1E,QAAQ,uDAAwD,KAAK,CAE3C,CActB,GAAuB,EAAc,IAA+C,CAC7F,GAAM,CAAE,YAAY,MAAO,YAAa,EACxC,GAAI,CAAC,EACD,OAAO,EAGX,IAAM,EAAiB,gEAIvB,OAAO,EAAK,QAFU,2EAEc,GAAQ,CACxC,IAAM,EAAQ,EAAI,MAAM,EAAe,EAAI,EAAE,CAC7C,GAAI,EAAM,OAAS,EACf,OAAO,EAGX,GAAI,IAAa,aACb,OAAO,EAAM,KAAK;EAAK,CAG3B,GAAI,IAAa,QAAS,CACtB,IAAM,EAAQ,EACT,IAAK,GACF,EACK,QAAQ,kBAAmB,GAAG,CAC9B,QAAQ,aAAc,GAAG,CACzB,MAAM,CACd,CACA,OAAO,QAAQ,CAKpB,MAAO,GAFmB,EAAM,GAAI,MAAM,kBAAkB,GACxB,IAAM,6BACtB,EAAM,KAAK,EAAU,CAAC,SAM9C,MAAO,CAFO,EAAM,GAEL,GADF,EAAM,MAAM,EAAE,CAAC,IAAK,GAAM,EAAE,QAAQ,6BAA8B,uBAAuB,CAAC,CAChF,CAAC,KAAK;EAAK,EACpC,EAsBO,GAA4B,EAAiB,KACtD,EAAU,EAAoB,EAAS,CAAE,SAAU,aAAc,GAAG,EAAS,CAAC,CAC9E,EAAU,EAAkC,EAAQ,CACpD,EAAU,EAAe,EAAQ,CAE1B"}
1
+ {"version":3,"file":"content.js","names":["out: Line[]","tokens: Token[]","match: RegExpExecArray | null","attributes: Record<string, string | undefined>","id: string | undefined"],"sources":["../src/content.ts"],"sourcesContent":["import type { NormalizeTitleSpanOptions } from './types';\nimport { DEFAULT_MAPPING_RULES, FOOTNOTE_MARKER } from './utils/constants';\n\nexport type Line = {\n id?: string;\n text: string;\n};\n\nconst PUNCT_ONLY = /^[)\\]\\u00BB\"”'’.,?!:\\u061B\\u060C\\u061F\\u06D4\\u2026]+$/;\n\n/**\n * Merges punctuation-only lines into the preceding title when appropriate.\n *\n * @param lines - The processed line candidates to normalise\n * @returns A new array where dangling punctuation fragments are appended to titles\n */\nconst mergeDanglingPunctuation = (lines: Line[]): Line[] => {\n const out: Line[] = [];\n for (const item of lines) {\n const last = out[out.length - 1];\n if (last && PUNCT_ONLY.test(item.text)) {\n last.text += item.text;\n } else {\n out.push(item);\n }\n }\n return out;\n};\n\n/**\n * Normalises raw text into discrete line entries.\n *\n * @param text - Raw book content potentially containing inconsistent breaks\n * @returns An array of trimmed line strings with empty entries removed\n */\nconst splitIntoLines = (text: string) => {\n const normalized = text.replace(/\\r\\n/g, '\\n').replace(/\\r/g, '\\n');\n\n return normalized\n .split('\\n')\n .map((line) => line.trim())\n .filter(Boolean);\n};\n\n/**\n * Converts plain text content into {@link Line} objects without title metadata.\n *\n * @param content - The text content to split into line structures\n * @returns A {@link Line} array wrapping each detected sentence fragment\n */\nconst processTextContent = (content: string): Line[] => {\n return splitIntoLines(content).map((line) => ({ text: line }));\n};\n\n/**\n * Extracts an attribute value from the provided HTML tag string.\n *\n * @param tag - Raw HTML tag source\n * @param name - Attribute name to locate\n * @returns The attribute value when found; otherwise undefined\n */\nconst extractAttribute = (tag: string, name: string): string | undefined => {\n const pattern = new RegExp(`${name}\\\\s*=\\\\s*(\"([^\"]*)\"|'([^']*)'|([^s>]+))`, 'i');\n const match = tag.match(pattern);\n if (!match) {\n return undefined;\n }\n return match[2] ?? match[3] ?? match[4];\n};\n\ntype Token =\n | { type: 'text'; value: string }\n | { type: 'start'; name: string; attributes: Record<string, string | undefined> }\n | { type: 'end'; name: string };\n\n/**\n * Breaks the provided HTML fragment into structural tokens.\n *\n * @param html - HTML fragment containing book content markup\n * @returns A token stream describing text and span boundaries\n */\nconst tokenize = (html: string): Token[] => {\n const tokens: Token[] = [];\n const tagRegex = /<[^>]+>/g;\n let lastIndex = 0;\n let match: RegExpExecArray | null;\n match = tagRegex.exec(html);\n\n while (match) {\n if (match.index > lastIndex) {\n tokens.push({ type: 'text', value: html.slice(lastIndex, match.index) });\n }\n\n const raw = match[0];\n const isEnd = /^<\\//.test(raw);\n const nameMatch = raw.match(/^<\\/?\\s*([a-zA-Z0-9:-]+)/);\n const name = nameMatch ? nameMatch[1].toLowerCase() : '';\n\n if (isEnd) {\n tokens.push({ name, type: 'end' });\n } else {\n const attributes: Record<string, string | undefined> = {};\n attributes.id = extractAttribute(raw, 'id');\n attributes['data-type'] = extractAttribute(raw, 'data-type');\n tokens.push({ attributes, name, type: 'start' });\n }\n\n lastIndex = tagRegex.lastIndex;\n match = tagRegex.exec(html);\n }\n\n if (lastIndex < html.length) {\n tokens.push({ type: 'text', value: html.slice(lastIndex) });\n }\n\n return tokens;\n};\n\n/**\n * Pushes the accumulated text as a new line to the result array.\n */\nconst createLine = (text: string, id?: string): Line | null => {\n const trimmed = text.trim();\n if (!trimmed) {\n return null;\n }\n return id ? { id, text: trimmed } : { text: trimmed };\n};\n\n/**\n * Finds the active title ID from the span stack.\n */\nconst getActiveTitleId = (spanStack: Array<{ isTitle: boolean; id?: string }>): string | undefined => {\n for (let i = spanStack.length - 1; i >= 0; i--) {\n const entry = spanStack[i];\n if (entry.isTitle && entry.id) {\n return entry.id;\n }\n }\n};\n\n/**\n * Processes text content by handling line breaks and maintaining title context.\n */\nconst processTextWithLineBreaks = (\n raw: string,\n state: {\n currentText: string;\n currentId?: string;\n result: Line[];\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n if (!raw) {\n return;\n }\n\n const parts = raw.split('\\n');\n\n for (let i = 0; i < parts.length; i++) {\n // Push previous line when crossing a line break\n if (i > 0) {\n const line = createLine(state.currentText, state.currentId);\n if (line) {\n state.result.push(line);\n }\n state.currentText = '';\n\n // Preserve title ID if still inside a title span\n const activeTitleId = getActiveTitleId(state.spanStack);\n state.currentId = activeTitleId || undefined;\n }\n\n // Append the text part\n if (parts[i]) {\n state.currentText += parts[i];\n }\n }\n};\n\n/**\n * Handles the start of a span tag, updating the stack and current ID.\n */\nconst handleSpanStart = (\n token: { attributes: Record<string, string | undefined> },\n state: {\n currentId?: string;\n spanStack: Array<{ isTitle: boolean; id?: string }>;\n },\n) => {\n const dataType = token.attributes['data-type'];\n const isTitle = dataType === 'title';\n\n let id: string | undefined;\n if (isTitle) {\n const rawId = token.attributes.id ?? '';\n id = rawId.replace(/^toc-/, '');\n }\n\n state.spanStack.push({ id, isTitle });\n\n // First title span on the current physical line wins\n if (isTitle && id && !state.currentId) {\n state.currentId = id;\n }\n};\n\n/**\n * Normalizes line endings to Unix-style (`\\n`).\n *\n * Converts Windows (`\\r\\n`) and old Mac (`\\r`) line endings to Unix style\n * for consistent pattern matching across platforms.\n *\n * @param content - Raw content with potentially mixed line endings\n * @returns Content with all line endings normalized to `\\n`\n */\nexport const normalizeLineEndings = (content: string) => {\n return content.includes('\\r') ? content.replace(/\\r\\n?/g, '\\n') : content;\n};\n\n/**\n * Parses Shamela HTML content into structured lines while preserving headings.\n *\n * @param content - The raw HTML markup representing a page\n * @returns An array of {@link Line} objects containing text and optional IDs\n */\nexport const parseContentRobust = (content: string): Line[] => {\n // Normalize line endings first\n content = normalizeLineEndings(content);\n\n // Fast path when there are no span tags at all\n if (!/<span[^>]*>/i.test(content)) {\n return mergeDanglingPunctuation(processTextContent(content));\n }\n\n const tokens = tokenize(`<root>${content}</root>`);\n const state = {\n currentId: '',\n currentText: '',\n result: [] as Line[],\n spanStack: [] as Array<{ isTitle: boolean; id?: string }>,\n };\n\n // Process all tokens\n for (const token of tokens) {\n if (token.type === 'text') {\n processTextWithLineBreaks(token.value, state);\n } else if (token.type === 'start' && token.name === 'span') {\n handleSpanStart(token, state);\n } else if (token.type === 'end' && token.name === 'span') {\n // Closing a span does NOT end the line; trailing text stays on the same line\n state.spanStack.pop();\n }\n }\n\n // Flush any trailing text\n const finalLine = createLine(state.currentText, state.currentId);\n if (finalLine) {\n state.result.push(finalLine);\n }\n\n // Merge punctuation-only lines and drop empties\n return mergeDanglingPunctuation(state.result).filter((line) => line.text.length > 0);\n};\n\nconst DEFAULT_COMPILED_RULES = Object.entries(DEFAULT_MAPPING_RULES).map(([pattern, replacement]) => ({\n regex: new RegExp(pattern, 'g'),\n replacement,\n}));\n\n/**\n * Compiles sanitisation rules into RegExp objects for reuse.\n *\n * @param rules - Key/value replacements used during sanitisation\n * @returns A list of compiled regular expression rules\n */\nconst getCompiledRules = (rules: Record<string, string>) => {\n if (rules === DEFAULT_MAPPING_RULES) {\n return DEFAULT_COMPILED_RULES;\n }\n\n const compiled = [];\n for (const pattern in rules) {\n compiled.push({\n regex: new RegExp(pattern, 'g'),\n replacement: rules[pattern],\n });\n }\n return compiled;\n};\n\n/**\n * Sanitises page content by applying regex replacement rules.\n *\n * @param text - The text to clean\n * @param rules - Optional custom replacements, defaults to {@link DEFAULT_MAPPING_RULES}\n * @returns The sanitised content\n */\nexport const mapPageCharacterContent = (\n text: string,\n rules: Record<string, string> = DEFAULT_MAPPING_RULES,\n): string => {\n const compiledRules = getCompiledRules(rules);\n\n let content = text;\n for (let i = 0; i < compiledRules.length; i++) {\n const { regex, replacement } = compiledRules[i];\n content = content.replace(regex, replacement);\n }\n return content;\n};\n\n/**\n * Splits a page body from its trailing footnotes using a marker string.\n *\n * @param content - Combined body and footnote text\n * @param footnoteMarker - Marker indicating the start of footnotes\n * @returns A tuple containing the page body followed by the footnote section\n */\nexport const splitPageBodyFromFooter = (content: string, footnoteMarker = FOOTNOTE_MARKER) => {\n let footnote = '';\n const indexOfFootnote = content.indexOf(footnoteMarker);\n\n if (indexOfFootnote >= 0) {\n footnote = content.slice(indexOfFootnote + footnoteMarker.length);\n content = content.slice(0, indexOfFootnote);\n }\n\n return [content, footnote] as const;\n};\n\n/**\n * Removes Arabic numeral page markers enclosed in turtle ⦗ ⦘ brackets.\n * Replaces the marker along with up to two preceding whitespace characters\n * (space or carriage return) and up to one following whitespace character\n * with a single space.\n *\n * @param text - Text potentially containing page markers\n * @returns The text with numeric markers replaced by a single space\n */\nexport const removeArabicNumericPageMarkers = (text: string) => {\n return text.replace(/(?: |\\r){0,2}⦗[\\u0660-\\u0669]+⦘(?: |\\r)?/g, ' ');\n};\n\n/**\n * Removes anchor and hadeeth tags from the content while preserving spans.\n *\n * @param content - HTML string containing various tags\n * @returns The content with only span tags retained\n */\nexport const removeTagsExceptSpan = (content: string) => {\n // Remove <a> tags and their content, keeping only the text inside\n content = content.replace(/<a[^>]*>(.*?)<\\/a>/gs, '$1');\n\n // Remove <hadeeth> tags (both self-closing, with content, and numbered)\n content = content.replace(/<hadeeth[^>]*>|<\\/hadeeth>|<hadeeth-\\d+>/gs, '');\n\n return content;\n};\n\n/**\n * Normalizes Shamela HTML for CSS styling:\n * - Converts <hadeeth-N> to <span class=\"hadeeth\">\n * - Converts </hadeeth> or standalone <hadeeth> to </span>\n */\nexport const normalizeHtml = (html: string): string => {\n return html.replace(/<hadeeth-\\d+>/gi, '<span class=\"hadeeth\">').replace(/<\\s*\\/?\\s*hadeeth\\s*>/gi, '</span>');\n};\n\n/**\n * Strip all HTML tags from content, keeping only text.\n *\n * @param html - HTML content\n * @returns Plain text content\n */\nexport const stripHtmlTags = (html: string) => {\n return html.replace(/<[^>]*>/g, '');\n};\n\n/**\n * Moves content that appears after a line break but before a title span into the span.\n *\n * This handles cases where text at the start of a line (such as chapter numbers like \"١ -\")\n * should logically be part of the following title but was placed outside the span in the HTML.\n *\n * @example\n * ```typescript\n * // Input: \"\\rباب الأول<span data-type=\"title\">العنوان</span>\"\n * // Output: \"\\r<span data-type=\"title\">باب الأول العنوان</span>\"\n * ```\n *\n * @param html - HTML content with potential pre-title text\n * @returns HTML with pre-title text moved inside title spans\n */\nexport const moveContentAfterLineBreakIntoSpan = (html: string) => {\n return (\n html\n // Move content after line break (or at start) but before title span INTO the span\n // Exclude both \\r and \\n to avoid capturing content across multiple lines\n .replace(/(^|[\\r\\n])([^\\r\\n]*?)<span[^>]*data-type=[\"']title[\"'][^>]*>/gi, '$1<span data-type=\"title\">$2')\n );\n};\n\n/**\n * Convert Shamela HTML to Markdown format for easier pattern matching.\n *\n * Transformations:\n * - `<span data-type=\"title\">text</span>` → `## text`\n * - `<a href=\"inr://...\">text</a>` → `text` (strip narrator links)\n * - All other HTML tags → stripped\n *\n * Note: Content typically already has proper line breaks before title spans,\n * so we don't add extra newlines around the ## header.\n * Line ending normalization is handled by segmentPages.\n *\n * @param html - HTML content from Shamela\n * @returns Markdown-formatted content\n */\nexport const htmlToMarkdown = (html: string) => {\n const converted = html\n // Convert title spans to markdown headers (no extra newlines - content already has them)\n .replace(/<span[^>]*data-type=[\"']title[\"'][^>]*>(.*?)<\\/span>/gi, '## $1')\n // Strip narrator links but keep text\n .replace(/<a[^>]*href=[\"']inr:\\/\\/[^\"']*[\"'][^>]*>(.*?)<\\/a>/gi, '$1');\n\n return stripHtmlTags(converted);\n};\n\n/**\n * Normalizes consecutive Shamela-style title spans.\n *\n * Shamela exports sometimes contain adjacent title spans like:\n * `<span data-type=\"title\">باب الميم</span><span data-type=\"title\">من اسمه محمد</span>`\n *\n * If you naively convert each title span into a markdown heading, you can end up with:\n * `## باب الميم ## من اسمه محمد` (two headings on one line).\n *\n * This helper rewrites the HTML so downstream HTML→Markdown conversion can stay simple and consistent.\n */\nexport const normalizeTitleSpans = (html: string, options: NormalizeTitleSpanOptions): string => {\n const { separator = ' — ', strategy } = options;\n if (!html) {\n return html;\n }\n\n const titleSpanRegex = /<span\\b[^>]*\\bdata-type=(['\"])title\\1[^>]*>[\\s\\S]*?<\\/span>/gi;\n // Two or more title spans with optional whitespace between them, capturing trailing whitespace\n const titleRunRegex = /((?:<span\\b[^>]*\\bdata-type=(['\"])title\\2[^>]*>[\\s\\S]*?<\\/span>\\s*){2,})/gi;\n\n return html.replace(titleRunRegex, (run) => {\n const spans = run.match(titleSpanRegex) ?? [];\n if (spans.length < 2) {\n return run;\n }\n\n // Capture trailing whitespace after the last span to preserve line breaks\n const lastSpan = spans[spans.length - 1];\n const lastSpanEndIndex = run.lastIndexOf(lastSpan) + lastSpan.length;\n const trailingWhitespace = run.slice(lastSpanEndIndex);\n\n if (strategy === 'splitLines') {\n return spans.join('\\n') + trailingWhitespace;\n }\n\n if (strategy === 'merge') {\n const texts = spans\n .map((s) =>\n s\n .replace(/^<span\\b[^>]*>/i, '')\n .replace(/<\\/span>$/i, '')\n .trim(),\n )\n .filter(Boolean);\n\n // Preserve the first span's opening tag (attributes) but replace its inner text.\n const firstOpenTagMatch = spans[0]!.match(/^<span\\b[^>]*>/i);\n const openTag = firstOpenTagMatch?.[0] ?? '<span data-type=\"title\">';\n return `${openTag}${texts.join(separator)}</span>`;\n }\n\n // hierarchy\n const first = spans[0];\n const rest = spans.slice(1).map((s) => s.replace(/\\bdata-type=([\"'])title\\1/i, 'data-type=\"subtitle\"'));\n return [first, ...rest].join('\\n');\n });\n};\n\n/**\n * Converts Shamela HTML content to Markdown format using a standardized pipeline.\n *\n * This is a convenience function that applies the recommended sequence of transformations:\n * 1. Normalizes consecutive title spans (default: splitLines strategy)\n * 2. Moves pre-title text into spans\n * 3. Converts to Markdown format\n *\n * @example\n * ```typescript\n * const html = '<span data-type=\"title\">Chapter</span><span data-type=\"title\">One</span>';\n * const markdown = convertContentToMarkdown(html);\n * // => \"## Chapter\\n## One\"\n * ```\n *\n * @param content - Raw HTML content from Shamela\n * @param options - Optional configuration for title span normalization\n * @returns Markdown-formatted content\n */\nexport const convertContentToMarkdown = (content: string, options?: NormalizeTitleSpanOptions) => {\n content = normalizeTitleSpans(content, { strategy: 'splitLines', ...options });\n content = moveContentAfterLineBreakIntoSpan(content);\n content = htmlToMarkdown(content);\n content = normalizeLineEndings(content);\n\n return content;\n};\n"],"mappings":"kFAQA,MAAM,EAAa,wDAQb,EAA4B,GAA0B,CACxD,IAAMA,EAAc,EAAE,CACtB,IAAK,IAAM,KAAQ,EAAO,CACtB,IAAM,EAAO,EAAI,EAAI,OAAS,GAC1B,GAAQ,EAAW,KAAK,EAAK,KAAK,CAClC,EAAK,MAAQ,EAAK,KAElB,EAAI,KAAK,EAAK,CAGtB,OAAO,GASL,EAAkB,GACD,EAAK,QAAQ,QAAS;EAAK,CAAC,QAAQ,MAAO;EAAK,CAG9D,MAAM;EAAK,CACX,IAAK,GAAS,EAAK,MAAM,CAAC,CAC1B,OAAO,QAAQ,CASlB,EAAsB,GACjB,EAAe,EAAQ,CAAC,IAAK,IAAU,CAAE,KAAM,EAAM,EAAE,CAU5D,GAAoB,EAAa,IAAqC,CACxE,IAAM,EAAc,OAAO,GAAG,EAAK,yCAA0C,IAAI,CAC3E,EAAQ,EAAI,MAAM,EAAQ,CAC3B,KAGL,OAAO,EAAM,IAAM,EAAM,IAAM,EAAM,IAcnC,EAAY,GAA0B,CACxC,IAAMC,EAAkB,EAAE,CACpB,EAAW,WACb,EAAY,EACZC,EAGJ,IAFA,EAAQ,EAAS,KAAK,EAAK,CAEpB,GAAO,CACN,EAAM,MAAQ,GACd,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAW,EAAM,MAAM,CAAE,CAAC,CAG5E,IAAM,EAAM,EAAM,GACZ,EAAQ,OAAO,KAAK,EAAI,CACxB,EAAY,EAAI,MAAM,2BAA2B,CACjD,EAAO,EAAY,EAAU,GAAG,aAAa,CAAG,GAEtD,GAAI,EACA,EAAO,KAAK,CAAE,OAAM,KAAM,MAAO,CAAC,KAC/B,CACH,IAAMC,EAAiD,EAAE,CACzD,EAAW,GAAK,EAAiB,EAAK,KAAK,CAC3C,EAAW,aAAe,EAAiB,EAAK,YAAY,CAC5D,EAAO,KAAK,CAAE,aAAY,OAAM,KAAM,QAAS,CAAC,CAGpD,EAAY,EAAS,UACrB,EAAQ,EAAS,KAAK,EAAK,CAO/B,OAJI,EAAY,EAAK,QACjB,EAAO,KAAK,CAAE,KAAM,OAAQ,MAAO,EAAK,MAAM,EAAU,CAAE,CAAC,CAGxD,GAML,GAAc,EAAc,IAA6B,CAC3D,IAAM,EAAU,EAAK,MAAM,CAI3B,OAHK,EAGE,EAAK,CAAE,KAAI,KAAM,EAAS,CAAG,CAAE,KAAM,EAAS,CAF1C,MAQT,EAAoB,GAA4E,CAClG,IAAK,IAAI,EAAI,EAAU,OAAS,EAAG,GAAK,EAAG,IAAK,CAC5C,IAAM,EAAQ,EAAU,GACxB,GAAI,EAAM,SAAW,EAAM,GACvB,OAAO,EAAM,KAQnB,GACF,EACA,IAMC,CACD,GAAI,CAAC,EACD,OAGJ,IAAM,EAAQ,EAAI,MAAM;EAAK,CAE7B,IAAK,IAAI,EAAI,EAAG,EAAI,EAAM,OAAQ,IAAK,CAEnC,GAAI,EAAI,EAAG,CACP,IAAM,EAAO,EAAW,EAAM,YAAa,EAAM,UAAU,CACvD,GACA,EAAM,OAAO,KAAK,EAAK,CAE3B,EAAM,YAAc,GAIpB,EAAM,UADgB,EAAiB,EAAM,UAAU,EACpB,IAAA,GAInC,EAAM,KACN,EAAM,aAAe,EAAM,MAQjC,GACF,EACA,IAIC,CAED,IAAM,EADW,EAAM,WAAW,eACL,QAEzBC,EACA,IAEA,GADc,EAAM,WAAW,IAAM,IAC1B,QAAQ,QAAS,GAAG,EAGnC,EAAM,UAAU,KAAK,CAAE,KAAI,UAAS,CAAC,CAGjC,GAAW,GAAM,CAAC,EAAM,YACxB,EAAM,UAAY,IAab,EAAwB,GAC1B,EAAQ,SAAS,KAAK,CAAG,EAAQ,QAAQ,SAAU;EAAK,CAAG,EASzD,EAAsB,GAA4B,CAK3D,GAHA,EAAU,EAAqB,EAAQ,CAGnC,CAAC,eAAe,KAAK,EAAQ,CAC7B,OAAO,EAAyB,EAAmB,EAAQ,CAAC,CAGhE,IAAM,EAAS,EAAS,SAAS,EAAQ,SAAS,CAC5C,EAAQ,CACV,UAAW,GACX,YAAa,GACb,OAAQ,EAAE,CACV,UAAW,EAAE,CAChB,CAGD,IAAK,IAAM,KAAS,EACZ,EAAM,OAAS,OACf,EAA0B,EAAM,MAAO,EAAM,CACtC,EAAM,OAAS,SAAW,EAAM,OAAS,OAChD,EAAgB,EAAO,EAAM,CACtB,EAAM,OAAS,OAAS,EAAM,OAAS,QAE9C,EAAM,UAAU,KAAK,CAK7B,IAAM,EAAY,EAAW,EAAM,YAAa,EAAM,UAAU,CAMhE,OALI,GACA,EAAM,OAAO,KAAK,EAAU,CAIzB,EAAyB,EAAM,OAAO,CAAC,OAAQ,GAAS,EAAK,KAAK,OAAS,EAAE,EAGlF,EAAyB,OAAO,QAAQ,EAAsB,CAAC,KAAK,CAAC,EAAS,MAAkB,CAClG,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,cACH,EAAE,CAQG,EAAoB,GAAkC,CACxD,GAAI,IAAU,EACV,OAAO,EAGX,IAAM,EAAW,EAAE,CACnB,IAAK,IAAM,KAAW,EAClB,EAAS,KAAK,CACV,MAAO,IAAI,OAAO,EAAS,IAAI,CAC/B,YAAa,EAAM,GACtB,CAAC,CAEN,OAAO,GAUE,GACT,EACA,EAAgC,IACvB,CACT,IAAM,EAAgB,EAAiB,EAAM,CAEzC,EAAU,EACd,IAAK,IAAI,EAAI,EAAG,EAAI,EAAc,OAAQ,IAAK,CAC3C,GAAM,CAAE,QAAO,eAAgB,EAAc,GAC7C,EAAU,EAAQ,QAAQ,EAAO,EAAY,CAEjD,OAAO,GAUE,GAA2B,EAAiB,EAAiB,IAAoB,CAC1F,IAAI,EAAW,GACT,EAAkB,EAAQ,QAAQ,EAAe,CAOvD,OALI,GAAmB,IACnB,EAAW,EAAQ,MAAM,EAAkB,EAAe,OAAO,CACjE,EAAU,EAAQ,MAAM,EAAG,EAAgB,EAGxC,CAAC,EAAS,EAAS,EAYjB,EAAkC,GACpC,EAAK,QAAQ,4CAA6C,IAAI,CAS5D,EAAwB,IAEjC,EAAU,EAAQ,QAAQ,uBAAwB,KAAK,CAGvD,EAAU,EAAQ,QAAQ,6CAA8C,GAAG,CAEpE,GAQE,EAAiB,GACnB,EAAK,QAAQ,kBAAmB,yBAAyB,CAAC,QAAQ,0BAA2B,UAAU,CASrG,EAAiB,GACnB,EAAK,QAAQ,WAAY,GAAG,CAkB1B,EAAqC,GAE1C,EAGK,QAAQ,iEAAkE,+BAA+B,CAmBzG,EAAkB,GAOpB,EANW,EAEb,QAAQ,yDAA0D,QAAQ,CAE1E,QAAQ,uDAAwD,KAAK,CAE3C,CActB,GAAuB,EAAc,IAA+C,CAC7F,GAAM,CAAE,YAAY,MAAO,YAAa,EACxC,GAAI,CAAC,EACD,OAAO,EAGX,IAAM,EAAiB,gEAIvB,OAAO,EAAK,QAFU,6EAEc,GAAQ,CACxC,IAAM,EAAQ,EAAI,MAAM,EAAe,EAAI,EAAE,CAC7C,GAAI,EAAM,OAAS,EACf,OAAO,EAIX,IAAM,EAAW,EAAM,EAAM,OAAS,GAChC,EAAmB,EAAI,YAAY,EAAS,CAAG,EAAS,OACxD,EAAqB,EAAI,MAAM,EAAiB,CAEtD,GAAI,IAAa,aACb,OAAO,EAAM,KAAK;EAAK,CAAG,EAG9B,GAAI,IAAa,QAAS,CACtB,IAAM,EAAQ,EACT,IAAK,GACF,EACK,QAAQ,kBAAmB,GAAG,CAC9B,QAAQ,aAAc,GAAG,CACzB,MAAM,CACd,CACA,OAAO,QAAQ,CAKpB,MAAO,GAFmB,EAAM,GAAI,MAAM,kBAAkB,GACxB,IAAM,6BACtB,EAAM,KAAK,EAAU,CAAC,SAM9C,MAAO,CAFO,EAAM,GAEL,GADF,EAAM,MAAM,EAAE,CAAC,IAAK,GAAM,EAAE,QAAQ,6BAA8B,uBAAuB,CAAC,CAChF,CAAC,KAAK;EAAK,EACpC,EAsBO,GAA4B,EAAiB,KACtD,EAAU,EAAoB,EAAS,CAAE,SAAU,aAAc,GAAG,EAAS,CAAC,CAC9E,EAAU,EAAkC,EAAQ,CACpD,EAAU,EAAe,EAAQ,CACjC,EAAU,EAAqB,EAAQ,CAEhC"}
package/package.json CHANGED
@@ -16,11 +16,11 @@
16
16
  "@types/react": "^19.2.7",
17
17
  "@types/react-dom": "^19.2.3",
18
18
  "@types/sql.js": "^1.4.9",
19
- "next": "^16.1.0",
19
+ "next": "^16.1.1",
20
20
  "react": "^19.2.3",
21
21
  "react-dom": "^19.2.3",
22
22
  "semantic-release": "^25.0.2",
23
- "tsdown": "^0.18.1",
23
+ "tsdown": "^0.18.3",
24
24
  "typescript": "^5.9.3"
25
25
  },
26
26
  "engines": {
@@ -79,5 +79,5 @@
79
79
  "source": "src/index.ts",
80
80
  "type": "module",
81
81
  "types": "dist/index.d.ts",
82
- "version": "1.4.1"
82
+ "version": "1.4.2"
83
83
  }