@nanocollective/get-md 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +36 -0
- package/README.md +205 -0
- package/bin/get-md.js +4 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +91 -0
- package/dist/cli.js.map +1 -0
- package/dist/cli.spec.d.ts +2 -0
- package/dist/cli.spec.d.ts.map +1 -0
- package/dist/cli.spec.js +278 -0
- package/dist/cli.spec.js.map +1 -0
- package/dist/config.d.ts +5 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +6 -0
- package/dist/config.js.map +1 -0
- package/dist/extractors/metadata-extractor.d.ts +6 -0
- package/dist/extractors/metadata-extractor.d.ts.map +1 -0
- package/dist/extractors/metadata-extractor.js +131 -0
- package/dist/extractors/metadata-extractor.js.map +1 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +70 -0
- package/dist/index.js.map +1 -0
- package/dist/optimizers/html-cleaner.d.ts +12 -0
- package/dist/optimizers/html-cleaner.d.ts.map +1 -0
- package/dist/optimizers/html-cleaner.js +228 -0
- package/dist/optimizers/html-cleaner.js.map +1 -0
- package/dist/optimizers/llm-formatter.d.ts +8 -0
- package/dist/optimizers/llm-formatter.d.ts.map +1 -0
- package/dist/optimizers/llm-formatter.js +94 -0
- package/dist/optimizers/llm-formatter.js.map +1 -0
- package/dist/optimizers/structure-enhancer.d.ts +8 -0
- package/dist/optimizers/structure-enhancer.d.ts.map +1 -0
- package/dist/optimizers/structure-enhancer.js +92 -0
- package/dist/optimizers/structure-enhancer.js.map +1 -0
- package/dist/parsers/markdown-parser.d.ts +16 -0
- package/dist/parsers/markdown-parser.d.ts.map +1 -0
- package/dist/parsers/markdown-parser.js +369 -0
- package/dist/parsers/markdown-parser.js.map +1 -0
- package/dist/types.d.ts +115 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/url-fetcher.d.ts +10 -0
- package/dist/utils/url-fetcher.d.ts.map +1 -0
- package/dist/utils/url-fetcher.js +54 -0
- package/dist/utils/url-fetcher.js.map +1 -0
- package/dist/utils/validators.d.ts +5 -0
- package/dist/utils/validators.d.ts.map +1 -0
- package/dist/utils/validators.js +23 -0
- package/dist/utils/validators.js.map +1 -0
- package/package.json +104 -0
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
// src/optimizers/structure-enhancer.ts
|
|
2
|
+
import * as cheerio from "cheerio";
|
|
3
|
+
/**
 * Restructure an HTML string so it converts to markdown more cleanly.
 *
 * The markup is parsed once with cheerio and then handed, in order, to three
 * in-place passes:
 *  1. normalizeHeadings        - closes gaps in the heading hierarchy
 *  2. unwrapRedundantElements  - removes needless wrapper elements
 *  3. convertPseudoHeadings    - promotes heading-like divs/spans to headings
 *
 * @param html - HTML markup to restructure.
 * @returns The serialized document after all three passes have run.
 */
export function enhanceStructure(html) {
    const doc = cheerio.load(html);
    // Order matters: each pass mutates the same parsed document.
    const passes = [
        normalizeHeadings,
        unwrapRedundantElements,
        convertPseudoHeadings,
    ];
    for (const pass of passes) {
        pass(doc);
    }
    return doc.html();
}
|
|
19
|
+
/**
 * Close gaps in the heading hierarchy of a parsed document.
 *
 * Headings are visited in document order. Whenever a heading is more than one
 * level deeper than the level assigned to the previous heading, it is replaced
 * by a heading exactly one level deeper that carries the same inner HTML.
 *
 * @param $ - cheerio API bound to the document to modify in place.
 */
function normalizeHeadings($) {
    // Snapshot every heading together with its numeric level (the digit in h1..h6).
    const collected = $("h1, h2, h3, h4, h5, h6")
        .toArray()
        .map((node) => {
            const tag = node.tagName?.toLowerCase() || "";
            return { level: parseInt(tag.substring(1)), $el: $(node) };
        });
    // Level assigned to the most recently visited heading; 0 means "none yet".
    let previous = 0;
    for (const { level, $el } of collected) {
        const skipsLevel = level > previous + 1;
        if (!skipsLevel) {
            previous = level;
            continue;
        }
        const target = previous + 1;
        const inner = $el.html();
        // A heading with no inner HTML is left untouched, but the running
        // level still advances as if it had been rewritten.
        if (inner) {
            const tag = `h${target}`;
            $el.replaceWith($(`<${tag}>${inner}</${tag}>`));
        }
        previous = target;
    }
}
|
|
46
|
+
/**
 * Strip wrapper elements that add nesting without adding meaning.
 *
 * Two passes run over the document:
 *  - a div that is the only child element of a div (or a span that is the only
 *    child element of a span) is replaced by its own inner HTML;
 *  - a paragraph whose single child element is a div, blockquote, pre, ul, ol
 *    or table is replaced by the paragraph's inner HTML.
 *
 * @param $ - cheerio API bound to the document to modify in place.
 */
function unwrapRedundantElements($) {
    const BLOCK_TAGS = ["div", "blockquote", "pre", "ul", "ol", "table"];
    // Swap an element for its inner HTML; elements with no inner HTML are kept.
    const replaceWithOwnMarkup = ($node) => {
        const inner = $node.html();
        if (inner) {
            $node.replaceWith(inner);
        }
    };
    $("div > div:only-child, span > span:only-child").each((_, node) => {
        replaceWithOwnMarkup($(node));
    });
    $("p").each((_, node) => {
        const $paragraph = $(node);
        const kids = $paragraph.children();
        if (kids.length !== 1) {
            return;
        }
        const onlyTag = kids.first().prop("tagName")?.toLowerCase();
        if (!onlyTag || !BLOCK_TAGS.includes(onlyTag)) {
            return;
        }
        replaceWithOwnMarkup($paragraph);
    });
}
|
|
69
|
+
/**
 * Promote divs/spans that merely look like headings to real <h3> elements.
 *
 * An element qualifies when it has no child elements, its trimmed text is
 * between 1 and 99 characters long, and either its class attribute contains
 * "title" or "heading" (case-insensitive) or its inline style contains
 * "font-weight: bold" / "font-weight:bold".
 *
 * The replacement heading is built with .text() rather than by interpolating
 * the text into an HTML string: the value returned by .text() is already
 * entity-decoded, so re-parsing it as markup would turn content such as
 * "a < b" or "&lt;b&gt;" into real tags.
 *
 * @param $ - cheerio API bound to the document to modify in place.
 */
function convertPseudoHeadings($) {
    $("div, span").each((_, el) => {
        const $el = $(el);
        // Only leaf elements are candidates; anything with child elements is skipped.
        if ($el.children().length > 0) {
            return;
        }
        const text = $el.text().trim();
        if (text.length === 0 || text.length >= 100) {
            return;
        }
        const className = ($el.attr("class") || "").toLowerCase();
        const style = $el.attr("style") || "";
        const looksLikeHeading = className.includes("title") ||
            className.includes("heading") ||
            style.includes("font-weight: bold") ||
            style.includes("font-weight:bold");
        if (!looksLikeHeading) {
            return;
        }
        // Convert to h3 by default; .text() stores the value as a text node so
        // it is escaped on serialization instead of being parsed as HTML.
        $el.replaceWith($("<h3></h3>").text(text));
    });
}
|
|
92
|
+
//# sourceMappingURL=structure-enhancer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structure-enhancer.js","sourceRoot":"","sources":["../../src/optimizers/structure-enhancer.ts"],"names":[],"mappings":"AAAA,uCAAuC;AAEvC,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAEnC;;;;;GAKG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,iCAAiC;IACjC,iBAAiB,CAAC,CAAC,CAAC,CAAC;IAErB,wCAAwC;IACxC,uBAAuB,CAAC,CAAC,CAAC,CAAC;IAE3B,+DAA+D;IAC/D,qBAAqB,CAAC,CAAC,CAAC,CAAC;IAEzB,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;AAClB,CAAC;AAED,SAAS,iBAAiB,CAAC,CAAqB;IAC9C,wCAAwC;IACxC,8DAA8D;IAC9D,MAAM,QAAQ,GAAmD,EAAE,CAAC;IAEpE,CAAC,CAAC,wBAAwB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACzC,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,OAAO,GAAG,EAAE,CAAC,OAAO,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC;QAChD,MAAM,KAAK,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7C,QAAQ,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC;IAChC,CAAC,CAAC,CAAC;IAEH,qCAAqC;IACrC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,QAAQ,CAAC,OAAO,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,EAAE;QAClC,IAAI,KAAK,GAAG,SAAS,GAAG,CAAC,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,SAAS,GAAG,CAAC,CAAC;YAC/B,MAAM,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;YAC9B,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;YACxB,IAAI,IAAI,EAAE,CAAC;gBACT,GAAG,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,MAAM,IAAI,IAAI,KAAK,MAAM,GAAG,CAAC,CAAC,CAAC;YACvD,CAAC;YACD,SAAS,GAAG,QAAQ,CAAC;QACvB,CAAC;aAAM,CAAC;YACN,SAAS,GAAG,KAAK,CAAC;QACpB,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,uBAAuB,CAAC,CAAqB;IACpD,yCAAyC;IACzC,CAAC,CAAC,8CAA8C,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAC/D,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QACxB,IAAI,IAAI;YAAE,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;IAClC,CAAC,CAAC,CAAC;IAEH,4DAA4D;IAC5D,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,QAAQ,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC;QAEhC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC1B,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,WAAW,EA
AE,CAAC;YAChE,IACE,OAAO;gBACP,CAAC,KAAK,EAAE,YAAY,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,EACnE,CAAC;gBACD,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;gBACxB,IAAI,IAAI;oBAAE,GAAG,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;YAClC,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,qBAAqB,CAAC,CAAqB;IAClD,kEAAkE;IAClE,CAAC,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAC5B,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAE/B,gDAAgD;QAChD,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,MAAM,GAAG,CAAC;YAAE,OAAO;QAEtC,4DAA4D;QAC5D,MAAM,SAAS,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;QAEtC,IACE,IAAI,CAAC,MAAM,GAAG,CAAC;YACf,IAAI,CAAC,MAAM,GAAG,GAAG;YACjB,CAAC,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,OAAO,CAAC;gBACxC,SAAS,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC;gBAC3C,KAAK,CAAC,QAAQ,CAAC,mBAAmB,CAAC;gBACnC,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAC,EACrC,CAAC;YACD,2BAA2B;YAC3B,MAAM,UAAU,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC;YAC1D,GAAG,CAAC,WAAW,CAAC,UAAU,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { MarkdownOptions, MarkdownResult } from "../types.js";
|
|
2
|
+
/**
 * Type declaration for the HTML-to-markdown parser.
 *
 * `convert` is the only public method; every other member is declared
 * private and is therefore listed without a signature.
 */
export declare class MarkdownParser {
|
|
3
|
+
private turndown;
|
|
4
|
+
constructor();
|
|
5
|
+
/**
 * Convert an HTML string to markdown.
 *
 * @param html - HTML source to convert.
 * @param options - Optional conversion settings.
 * @returns The conversion result as a `MarkdownResult`.
 */
convert(html: string, options?: MarkdownOptions): MarkdownResult;
|
|
6
|
+
private extractMainContent;
|
|
7
|
+
private filterContent;
|
|
8
|
+
private setupLLMRules;
|
|
9
|
+
private convertTableToMarkdown;
|
|
10
|
+
private applyCustomRules;
|
|
11
|
+
private addFrontmatter;
|
|
12
|
+
private calculateMarkdownStats;
|
|
13
|
+
private postProcess;
|
|
14
|
+
private normalizeOptions;
|
|
15
|
+
}
|
|
16
|
+
//# sourceMappingURL=markdown-parser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown-parser.d.ts","sourceRoot":"","sources":["../../src/parsers/markdown-parser.ts"],"names":[],"mappings":"AAWA,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EAIf,MAAM,aAAa,CAAC;AAErB,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAkB;;IAuBlC,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,GAAE,eAAoB,GAAG,cAAc;IA0FpE,OAAO,CAAC,kBAAkB;IA4B1B,OAAO,CAAC,aAAa;IA2BrB,OAAO,CAAC,aAAa;IAsErB,OAAO,CAAC,sBAAsB;IAiE9B,OAAO,CAAC,gBAAgB;IASxB,OAAO,CAAC,cAAc;IAoBtB,OAAO,CAAC,sBAAsB;IAsC9B,OAAO,CAAC,WAAW;IAwDnB,OAAO,CAAC,gBAAgB;CAgBzB"}
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
// src/parsers/markdown-parser.ts
|
|
2
|
+
import TurndownService from "turndown";
|
|
3
|
+
import { gfm } from "turndown-plugin-gfm";
|
|
4
|
+
import { Readability } from "@mozilla/readability";
|
|
5
|
+
import { JSDOM } from "jsdom";
|
|
6
|
+
import * as cheerio from "cheerio";
|
|
7
|
+
import { cleanHTML } from "../optimizers/html-cleaner.js";
|
|
8
|
+
import { formatForLLM } from "../optimizers/llm-formatter.js";
|
|
9
|
+
import { enhanceStructure } from "../optimizers/structure-enhancer.js";
|
|
10
|
+
import { extractMetadata } from "../extractors/metadata-extractor.js";
|
|
11
|
+
/**
 * Converts HTML documents into markdown tuned for LLM consumption.
 *
 * The pipeline implemented by `convert` is: optional Readability extraction,
 * metadata extraction, HTML cleanup, structure enhancement, content filtering,
 * Turndown conversion, LLM formatting, statistics, optional YAML frontmatter,
 * final cleanup and length capping.
 */
export class MarkdownParser {
    /** Shared Turndown converter, used whenever a call supplies no custom rules. */
    turndown;
    constructor() {
        this.turndown = this.createTurndown();
    }
    /**
     * Build a Turndown service with the LLM-friendly settings, the GFM plugin
     * and the built-in custom rules.
     *
     * @returns A freshly configured Turndown service.
     */
    createTurndown() {
        const service = new TurndownService({
            headingStyle: "atx", // Use # style headings
            hr: "---", // Horizontal rule style
            bulletListMarker: "-", // Use - for lists
            codeBlockStyle: "fenced", // Use ``` for code blocks
            fence: "```", // Fence marker
            emDelimiter: "*", // Emphasis delimiter
            strongDelimiter: "**", // Strong delimiter
            linkStyle: "inlined", // Inline links
            linkReferenceStyle: "full", // Full reference links
        });
        // Add GitHub Flavored Markdown support (tables, strikethrough, etc.)
        service.use(gfm);
        // Set up custom rules optimized for LLMs
        this.setupLLMRules(service);
        return service;
    }
    /**
     * Convert an HTML string to markdown.
     *
     * @param html - HTML source to convert.
     * @param options - Conversion settings; missing fields are defaulted by
     *   `normalizeOptions`.
     * @returns An object with the `markdown` text, the collected `metadata`
     *   and conversion `stats`.
     */
    convert(html, options = {}) {
        const startTime = Date.now();
        const opts = this.normalizeOptions(options);
        // Step 1: Extract main content using Readability
        let contentHtml = html;
        let metadata = {};
        let readabilitySuccess = false;
        if (opts.extractContent) {
            try {
                const extracted = this.extractMainContent(html, opts.baseUrl);
                if (extracted) {
                    contentHtml = extracted.content;
                    metadata = extracted.metadata;
                    readabilitySuccess = true;
                }
            }
            catch {
                // Fallback to raw HTML if Readability fails
                console.warn("Readability extraction failed, using raw HTML");
            }
        }
        // Step 2: Additional metadata extraction. Readability values take
        // precedence, but only when they are actually defined.
        // NOTE(review): this runs on the extracted content rather than on the
        // original `html` - confirm extractMetadata does not need the <head>.
        const additionalMeta = extractMetadata(contentHtml, opts.baseUrl);
        metadata = this.mergeMetadata(additionalMeta, metadata);
        // Step 3: Clean HTML (remove scripts, styles, ads, etc.)
        contentHtml = cleanHTML(contentHtml, {
            aggressive: opts.aggressiveCleanup,
            baseUrl: opts.baseUrl,
        });
        // Step 4: Enhance structure (improve heading hierarchy, etc.)
        contentHtml = enhanceStructure(contentHtml);
        // Step 5: Filter content based on options
        contentHtml = this.filterContent(contentHtml, opts);
        // Step 6: Apply custom rules if provided. They are registered on a
        // dedicated converter so they neither accumulate across calls nor
        // affect later calls that pass no custom rules.
        let converter = this.turndown;
        if (opts.customRules && opts.customRules.length > 0) {
            converter = this.createTurndown();
            this.applyCustomRules(opts.customRules, converter);
        }
        // Step 7: Convert to markdown
        let markdown = converter.turndown(contentHtml);
        // Step 8: Apply LLM-specific formatting
        markdown = formatForLLM(markdown);
        // Step 9: Calculate word count and reading time from markdown (before adding frontmatter)
        const { wordCount, readingTime } = this.calculateMarkdownStats(markdown);
        metadata.wordCount = wordCount;
        metadata.readingTime = readingTime;
        // Step 10: Add frontmatter if requested
        if (opts.includeMeta && Object.keys(metadata).length > 0) {
            markdown = this.addFrontmatter(markdown, metadata);
        }
        // Step 11: Final cleanup
        markdown = this.postProcess(markdown);
        // Step 12: Validate length
        if (opts.maxLength && markdown.length > opts.maxLength) {
            markdown =
                markdown.substring(0, opts.maxLength) + "\n\n[Content truncated]";
        }
        const processingTime = Date.now() - startTime;
        // Calculate statistics from the filtered HTML that was converted
        const $ = cheerio.load(contentHtml);
        const imageCount = opts.includeImages ? $("img").length : 0;
        const linkCount = opts.includeLinks ? $("a").length : 0;
        return {
            markdown,
            metadata,
            stats: {
                inputLength: html.length,
                outputLength: markdown.length,
                processingTime,
                readabilitySuccess,
                imageCount,
                linkCount,
            },
        };
    }
    /**
     * Merge two metadata objects, letting defined values in `overrides` win.
     *
     * Unlike a plain object spread, keys whose override value is `undefined`
     * or `null` do not erase the value already present in `base`.
     *
     * @param base - Lower-priority metadata.
     * @param overrides - Higher-priority metadata.
     * @returns A new merged object.
     */
    mergeMetadata(base, overrides) {
        const merged = { ...base };
        for (const [key, value] of Object.entries(overrides ?? {})) {
            if (value !== undefined && value !== null) {
                merged[key] = value;
            }
        }
        return merged;
    }
    /**
     * Run Mozilla Readability over the document.
     *
     * @param html - Full HTML source.
     * @param baseUrl - URL passed to JSDOM as the document URL.
     * @returns `{ content, metadata }`, or `null` when Readability returns no
     *   article.
     */
    extractMainContent(html, baseUrl) {
        const doc = new JSDOM(html, { url: baseUrl });
        const reader = new Readability(doc.window.document, {
            // Increase content threshold to avoid extracting navigation/sidebars
            charThreshold: 500,
        });
        const article = reader.parse();
        if (!article) {
            return null;
        }
        return {
            content: article.content || "",
            metadata: {
                title: article.title || undefined,
                author: article.byline || undefined,
                excerpt: article.excerpt || undefined,
                siteName: article.siteName || undefined,
                // wordCount and readingTime will be calculated from final markdown
            },
        };
    }
    /**
     * Remove images, links and/or tables according to the options.
     *
     * @param html - HTML to filter.
     * @param options - Normalized options; reads `includeImages`,
     *   `includeLinks` and `includeTables`.
     * @returns The serialized, filtered document.
     */
    filterContent(html, options) {
        const $ = cheerio.load(html);
        // Remove images if disabled
        if (!options.includeImages) {
            $("img, picture, figure").remove();
        }
        // Remove links if disabled (keep text content)
        if (!options.includeLinks) {
            $("a").each((_, el) => {
                const $el = $(el);
                // replaceWith parses string arguments as HTML, so the decoded
                // text must be re-escaped to stay plain text.
                $el.replaceWith(this.escapeHtml($el.text()));
            });
        }
        // Remove tables if disabled
        if (!options.includeTables) {
            $("table").remove();
        }
        return $.html();
    }
    /**
     * Escape the characters that would otherwise be parsed as HTML markup.
     *
     * @param text - Plain text.
     * @returns `text` with `&`, `<` and `>` replaced by entities.
     */
    escapeHtml(text) {
        return text
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;");
    }
    /**
     * Register the built-in conversion rules on a Turndown service.
     *
     * @param service - Service to configure; defaults to the shared instance.
     */
    setupLLMRules(service = this.turndown) {
        // Custom rule for better table formatting
        service.addRule("tables", {
            filter: "table",
            replacement: (_content, node) => this.convertTableToMarkdown(node),
        });
        // Custom rule for code blocks with language detection
        service.addRule("codeBlocks", {
            filter: (node) => node.nodeName === "PRE" && Boolean(node.querySelector?.("code")),
            replacement: (_content, node) => {
                const code = node.querySelector?.("code");
                if (!code) {
                    return "";
                }
                // Detect language from class name
                const className = code.className || "";
                const langMatch = className.match(/language-(\w+)|lang-(\w+)/);
                const language = langMatch?.[1] || langMatch?.[2] || "";
                // Drop a single trailing newline so the block does not end
                // with an empty line before the closing fence.
                const codeContent = (code.textContent || "").replace(/\n$/, "");
                return `\n\`\`\`${language}\n${codeContent}\n\`\`\`\n`;
            },
        });
        // Custom rule for better image handling with alt text
        service.addRule("images", {
            filter: "img",
            replacement: (_content, node) => {
                const alt = node.alt || "Image";
                const src = node.src || node.getAttribute?.("data-src") || "";
                const title = node.title || "";
                if (!src) {
                    return "";
                }
                if (title) {
                    return `![${alt}](${src} "${title}")`;
                }
                return `![${alt}](${src})`;
            },
        });
        // Custom rule for blockquotes with better formatting
        service.addRule("blockquotes", {
            filter: "blockquote",
            replacement: (_content, node) => {
                const text = node.textContent || "";
                const lines = text.trim().split("\n");
                return "\n" + lines.map((line) => `> ${line}`).join("\n") + "\n";
            },
        });
        // Remove empty paragraphs and whitespace-only elements
        service.addRule("removeEmpty", {
            filter: (node) => {
                const isContainer = ["P", "DIV", "SPAN"].includes(node.nodeName.toUpperCase());
                const hasNoText = !node.textContent || node.textContent.trim() === "";
                // An element holding only an image (or other embedded media)
                // has empty textContent but is not empty; keep it.
                const hasMedia = Boolean(node.querySelector?.("img, picture, video, audio, iframe, svg, canvas, hr"));
                return isContainer && hasNoText && !hasMedia;
            },
            replacement: () => "",
        });
    }
    /**
     * Render an HTML table element as a GFM pipe table.
     *
     * The header comes from the first `thead` row or, failing that, the first
     * row. Body rows are padded or truncated to the header width. Pipes in
     * cell text are backslash-escaped so they cannot split a cell.
     *
     * @param table - Table element; its `outerHTML` is re-parsed with cheerio.
     * @returns Markdown for the table, or "" when no header cells are found.
     */
    convertTableToMarkdown(table) {
        const $ = cheerio.load(table.outerHTML || "");
        const cellText = ($cell) => $cell.text().trim().replace(/\n/g, " ").replace(/\|/g, "\\|");
        const headers = [];
        const rows = [];
        const alignments = [];
        // Extract headers and alignments
        $("thead tr, tr:first-child")
            .first()
            .find("th, td")
            .each((_, el) => {
            const $el = $(el);
            headers.push(cellText($el));
            // Detect alignment from the align attribute, then the inline style
            const fromAttr = ($el.attr("align") || "").toLowerCase();
            const fromStyle = $el.css("text-align");
            const align = fromAttr ||
                (fromStyle === "center"
                    ? "center"
                    : fromStyle === "right"
                        ? "right"
                        : "left");
            alignments.push(align);
        });
        if (headers.length === 0) {
            return "";
        }
        // Extract rows; row-header <th> cells count as ordinary cells so the
        // remaining columns are not shifted left.
        const rowSelector = $("thead").length > 0 ? "tbody tr" : "tr:not(:first-child)";
        $(rowSelector).each((_, tr) => {
            const row = [];
            $(tr)
                .find("th, td")
                .each((_, cell) => {
                row.push(cellText($(cell)));
            });
            if (row.length > 0) {
                // Ensure row has same number of columns as headers
                while (row.length < headers.length) {
                    row.push("");
                }
                rows.push(row.slice(0, headers.length));
            }
        });
        const toLine = (cells) => "| " + cells.join(" | ") + " |\n";
        const alignRow = alignments.map((align) => {
            if (align === "center") {
                return ":---:";
            }
            if (align === "right") {
                return "---:";
            }
            return "---";
        });
        // Build markdown table: header, alignment row, then data rows
        let markdown = "\n" + toLine(headers) + toLine(alignRow);
        for (const row of rows) {
            markdown += toLine(row);
        }
        return markdown + "\n";
    }
    /**
     * Register caller-supplied rules on a Turndown service.
     *
     * @param rules - Objects with `name`, `filter` and `replacement`.
     * @param service - Service to extend; defaults to the shared instance.
     */
    applyCustomRules(rules, service = this.turndown) {
        for (const rule of rules) {
            service.addRule(rule.name, {
                filter: rule.filter,
                replacement: rule.replacement,
            });
        }
    }
    /**
     * Prepend a YAML frontmatter block built from `metadata`.
     *
     * @param markdown - Markdown body.
     * @param metadata - Key/value pairs; `undefined` and `null` values are
     *   skipped.
     * @returns Frontmatter, a blank line, then `markdown`.
     */
    addFrontmatter(markdown, metadata) {
        const yaml = ["---"];
        for (const [key, value] of Object.entries(metadata)) {
            if (value === undefined || value === null) {
                continue;
            }
            yaml.push(`${key}: ${this.formatYamlValue(value)}`);
        }
        yaml.push("---");
        return yaml.join("\n") + "\n\n" + markdown;
    }
    /**
     * Format one metadata value as a YAML scalar.
     *
     * Strings that could be misread as YAML syntax are emitted as
     * double-quoted scalars via JSON.stringify, which escapes quotes,
     * backslashes and line breaks; this keeps every entry on a single line.
     * Non-string values are stringified as-is.
     *
     * @param value - Metadata value.
     * @returns Text to place after `key: `.
     */
    formatYamlValue(value) {
        if (typeof value !== "string") {
            return String(value);
        }
        const needsQuotes = /[:\n\r]/.test(value) || // mapping separator or line break
            /^[\s\-?\[\]{},#&*!|>'"%@`]/.test(value) || // leading indicator character
            /\s#/.test(value) || // would start a comment
            /\s$/.test(value); // trailing whitespace would be lost
        return needsQuotes ? JSON.stringify(value) : value;
    }
    /**
     * Count words and estimate reading time for a markdown document.
     *
     * Frontmatter, fenced code, inline code, images, link targets and bare
     * URLs are excluded. Images are stripped before links and URLs: the link
     * pattern also matches the `[alt](src)` part of an image, and removing the
     * URL first would leave `![alt]()`, so any other order counts alt text.
     *
     * @param markdown - Markdown text.
     * @returns `{ wordCount, readingTime }`, reading time in minutes at 250
     *   words per minute, rounded up.
     */
    calculateMarkdownStats(markdown) {
        // Remove frontmatter if present
        let contentOnly = markdown;
        if (markdown.startsWith("---")) {
            const frontmatterEnd = markdown.indexOf("---", 3);
            if (frontmatterEnd !== -1) {
                contentOnly = markdown.substring(frontmatterEnd + 3).trim();
            }
        }
        contentOnly = contentOnly
            .replace(/```[\s\S]*?```/g, "") // fenced code blocks
            .replace(/`[^`]+`/g, "") // inline code
            .replace(/!\[[^\]]*\]\([^)]*\)/g, "") // images (before links)
            .replace(/\[([^\]]*)\]\([^)]*\)/g, "$1") // links: keep the text
            .replace(/https?:\/\/[^\s)]+/g, ""); // remaining bare URLs
        const words = contentOnly
            .trim()
            .split(/\s+/)
            .filter((w) => w.length > 0);
        const wordCount = words.length;
        // Calculate reading time (250 words per minute is more realistic)
        const readingTime = Math.ceil(wordCount / 250);
        return { wordCount, readingTime };
    }
    /**
     * Final whitespace cleanup of the generated markdown.
     *
     * - collapses runs of three or more newlines to one blank line;
     * - removes the blank line between consecutive single-line list items;
     * - outside fenced code: ensures a blank line before an opening fence,
     *   after a closing fence, before a heading, and around `---` rules;
     * - leaves frontmatter delimiters untouched (only recognized when the
     *   first line is `---` and a closing `---` line exists);
     * - strips trailing spaces and ends the text with one newline.
     *
     * @param markdown - Markdown text, optionally starting with frontmatter.
     * @returns The cleaned markdown.
     */
    postProcess(markdown) {
        // Remove excessive blank lines (more than 2 consecutive)
        let text = markdown.replace(/\n{3,}/g, "\n\n");
        // Clean up list formatting (remove blank lines within lists). The next
        // marker is only looked ahead at, not consumed, so every item in a run
        // is handled; requiring whitespace after it keeps "---" from matching.
        text = text.replace(/^(-|\d+\.)\s+(.+?)\n\n(?=(?:-|\d+\.)\s)/gm, "$1 $2\n");
        const lines = text.split("\n");
        let frontmatterClose = -1;
        if (lines[0].trim() === "---") {
            frontmatterClose = lines.findIndex((line, index) => index > 0 && line.trim() === "---");
        }
        const isBlank = (line) => line === undefined || line.trim() === "";
        const out = [];
        let inFence = false;
        lines.forEach((line, index) => {
            const previous = out[out.length - 1];
            const next = lines[index + 1];
            // Frontmatter delimiters are copied through verbatim.
            if (frontmatterClose !== -1 &&
                (index === 0 || index === frontmatterClose)) {
                out.push(line);
                return;
            }
            if (line.startsWith("```")) {
                if (!inFence) {
                    // Opening fence: separate it from preceding text.
                    if (!isBlank(previous)) {
                        out.push("");
                    }
                    out.push(line);
                }
                else {
                    // Closing fence: separate it from following text.
                    out.push(line);
                    if (!isBlank(next)) {
                        out.push("");
                    }
                }
                inFence = !inFence;
                return;
            }
            // Code is copied through untouched so "# comment" or "---" lines
            // inside a block are not mistaken for headings or rules.
            if (inFence) {
                out.push(line);
                return;
            }
            if (line.trim() === "---") {
                // Horizontal rule: blank line on both sides, except next to a heading.
                if (!isBlank(previous) && !previous.startsWith("#")) {
                    out.push("");
                }
                out.push(line);
                if (!isBlank(next) && !next.startsWith("#")) {
                    out.push("");
                }
                return;
            }
            // Heading: blank line before it, unless the previous line ends
            // with "-" (e.g. a frontmatter delimiter or rule).
            if (/^#{1,6}(?:\s|$)/.test(line) &&
                !isBlank(previous) &&
                !previous.endsWith("-")) {
                out.push("");
            }
            out.push(line);
        });
        // Remove trailing whitespace from lines
        const cleaned = out.join("\n").replace(/[^\S\n]+$/gm, "");
        return cleaned.trim() + "\n";
    }
    /**
     * Fill in defaults for every option `convert` reads.
     *
     * @param options - Caller-supplied options.
     * @returns Options with defaults applied; `baseUrl` is passed through.
     */
    normalizeOptions(options) {
        return {
            extractContent: options.extractContent ?? true,
            includeMeta: options.includeMeta ?? true,
            customRules: options.customRules ?? [],
            preserveElements: options.preserveElements ?? [],
            maxLength: options.maxLength ?? 1000000,
            baseUrl: options.baseUrl,
            includeImages: options.includeImages ?? true,
            includeLinks: options.includeLinks ?? true,
            includeTables: options.includeTables ?? true,
            aggressiveCleanup: options.aggressiveCleanup ?? true, // Back to true with smarter selectors
        };
    }
}
|
|
369
|
+
//# sourceMappingURL=markdown-parser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown-parser.js","sourceRoot":"","sources":["../../src/parsers/markdown-parser.ts"],"names":[],"mappings":"AAAA,iCAAiC;AAEjC,OAAO,eAAe,MAAM,UAAU,CAAC;AACvC,OAAO,EAAE,GAAG,EAAE,MAAM,qBAAqB,CAAC;AAC1C,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AACnD,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AACnC,OAAO,EAAE,SAAS,EAAE,MAAM,+BAA+B,CAAC;AAC1D,OAAO,EAAE,YAAY,EAAE,MAAM,gCAAgC,CAAC;AAC9D,OAAO,EAAE,gBAAgB,EAAE,MAAM,qCAAqC,CAAC;AACvE,OAAO,EAAE,eAAe,EAAE,MAAM,qCAAqC,CAAC;AAStE,MAAM,OAAO,cAAc;IACjB,QAAQ,CAAkB;IAElC;QACE,iDAAiD;QACjD,IAAI,CAAC,QAAQ,GAAG,IAAI,eAAe,CAAC;YAClC,YAAY,EAAE,KAAK,EAAE,uBAAuB;YAC5C,EAAE,EAAE,KAAK,EAAE,wBAAwB;YACnC,gBAAgB,EAAE,GAAG,EAAE,kBAAkB;YACzC,cAAc,EAAE,QAAQ,EAAE,0BAA0B;YACpD,KAAK,EAAE,KAAK,EAAE,eAAe;YAC7B,WAAW,EAAE,GAAG,EAAE,qBAAqB;YACvC,eAAe,EAAE,IAAI,EAAE,mBAAmB;YAC1C,SAAS,EAAE,SAAS,EAAE,eAAe;YACrC,kBAAkB,EAAE,MAAM,EAAE,uBAAuB;SACpD,CAAC,CAAC;QAEH,qEAAqE;QACrE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAEvB,yCAAyC;QACzC,IAAI,CAAC,aAAa,EAAE,CAAC;IACvB,CAAC;IAED,OAAO,CAAC,IAAY,EAAE,UAA2B,EAAE;QACjD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;QAE5C,iDAAiD;QACjD,IAAI,WAAW,GAAG,IAAI,CAAC;QACvB,IAAI,QAAQ,GAAoB,EAAE,CAAC;QACnC,IAAI,kBAAkB,GAAG,KAAK,CAAC;QAE/B,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACxB,IAAI,CAAC;gBACH,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;gBAC9D,IAAI,SAAS,EAAE,CAAC;oBACd,WAAW,GAAG,SAAS,CAAC,OAAO,CAAC;oBAChC,QAAQ,GAAG,SAAS,CAAC,QAAQ,CAAC;oBAC9B,kBAAkB,GAAG,IAAI,CAAC;gBAC5B,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,4CAA4C;gBAC5C,OAAO,CAAC,IAAI,CAAC,+CAA+C,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;QAED,yCAAyC;QACzC,MAAM,cAAc,GAAG,eAAe,CAAC,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAClE,QAAQ,GAAG,EAAE,GAAG,cAAc,EAAE,GAAG,QAAQ,EAAE,CAAC;QAE9C,yDAAyD;QACzD,WAAW,GAAG,SAAS,CAAC,WAAW,EAAE;YACnC,UAAU,EAAE,IAAI,CAAC,iBAAiB;YAClC,OAAO,EAAE,IAAI,CAAC,OAAO;SACtB,CAAC,CAAC;QAEH,8DAA8D;QAC9D,WAAW,GAAG,gBAAgB,CAAC,WAAW,CAAC,CAAC;QAE5C,0CAA0C;QAC1C,W
AAW,GAAG,IAAI,CAAC,aAAa,CAAC,WAAW,EAAE,IAAI,CAAC,CAAC;QAEpD,yCAAyC;QACzC,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpD,IAAI,CAAC,gBAAgB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC1C,CAAC;QAED,8BAA8B;QAC9B,IAAI,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;QAEnD,wCAAwC;QACxC,QAAQ,GAAG,YAAY,CAAC,QAAQ,CAAC,CAAC;QAElC,0FAA0F;QAC1F,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,IAAI,CAAC,sBAAsB,CAAC,QAAQ,CAAC,CAAC;QACzE,QAAQ,CAAC,SAAS,GAAG,SAAS,CAAC;QAC/B,QAAQ,CAAC,WAAW,GAAG,WAAW,CAAC;QAEnC,wCAAwC;QACxC,IAAI,IAAI,CAAC,WAAW,IAAI,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACzD,QAAQ,GAAG,IAAI,CAAC,cAAc,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;QACrD,CAAC;QAED,yBAAyB;QACzB,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;QAEtC,2BAA2B;QAC3B,IAAI,IAAI,CAAC,SAAS,IAAI,QAAQ,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;YACvD,QAAQ;gBACN,QAAQ,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,GAAG,yBAAyB,CAAC;QACtE,CAAC;QAED,MAAM,cAAc,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;QAE9C,uBAAuB;QACvB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QACpC,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5D,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QAExD,OAAO;YACL,QAAQ;YACR,QAAQ;YACR,KAAK,EAAE;gBACL,WAAW,EAAE,IAAI,CAAC,MAAM;gBACxB,YAAY,EAAE,QAAQ,CAAC,MAAM;gBAC7B,cAAc;gBACd,kBAAkB;gBAClB,UAAU;gBACV,SAAS;aACV;SACF,CAAC;IACJ,CAAC;IAEO,kBAAkB,CACxB,IAAY,EACZ,OAAgB;QAEhB,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,OAAO,EAAE,CAAC,CAAC;QAC9C,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,EAAE;YAClD,qEAAqE;YACrE,aAAa,EAAE,GAAG;SACnB,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;QAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,IAAI,CAAC;QACd,CAAC;QAED,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;YAC9B,QAAQ,EAAE;gBACR,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,SAAS;gBACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;gBACrC,QAAQ,EAAE,OAAO,CAAC,QA
AQ,IAAI,SAAS;gBACvC,mEAAmE;aACpE;SACF,CAAC;IACJ,CAAC;IAEO,aAAa,CACnB,IAAY,EACZ,OAAkC;QAElC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE7B,4BAA4B;QAC5B,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;YAC3B,CAAC,CAAC,sBAAsB,CAAC,CAAC,MAAM,EAAE,CAAC;QACrC,CAAC;QAED,+CAA+C;QAC/C,IAAI,CAAC,OAAO,CAAC,YAAY,EAAE,CAAC;YAC1B,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;gBACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;gBAClB,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;YAC9B,CAAC,CAAC,CAAC;QACL,CAAC;QAED,4BAA4B;QAC5B,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;YAC3B,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC;QACtB,CAAC;QAED,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;IAClB,CAAC;IAEO,aAAa;QACnB,0CAA0C;QAC1C,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE;YAC9B,MAAM,EAAE,OAAO;YACf,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;gBAC9B,OAAO,IAAI,CAAC,sBAAsB,CAAC,IAAoB,CAAC,CAAC;YAC3D,CAAC;SACF,CAAC,CAAC;QAEH,sDAAsD;QACtD,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,YAAY,EAAE;YAClC,MAAM,EAAE,CAAC,IAAkB,EAAE,EAAE;gBAC7B,OAAO,IAAI,CAAC,QAAQ,KAAK,KAAK,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC,MAAM,CAAC,KAAK,IAAI,CAAC;YAC1E,CAAC;YACD,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAkB,EAAE,EAAE;gBAC5C,MAAM,IAAI,GAAG,IAAI,CAAC,aAAa,EAAE,CAAC,MAAM,CAAC,CAAC;gBAC1C,IAAI,CAAC,IAAI;oBAAE,OAAO,EAAE,CAAC;gBAErB,kCAAkC;gBAClC,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,EAAE,CAAC;gBACvC,MAAM,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,2BAA2B,CAAC,CAAC;gBAC/D,MAAM,QAAQ,GAAG,SAAS,EAAE,CAAC,CAAC,CAAC,IAAI,SAAS,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;gBAExD,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;gBAE3C,OAAO,WAAW,QAAQ,KAAK,WAAW,YAAY,CAAC;YACzD,CAAC;SACF,CAAC,CAAC;QAEH,sDAAsD;QACtD,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,QAAQ,EAAE;YAC9B,MAAM,EAAE,KAAK;YACb,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAkB,EAAE,EAAE;gBAC5C,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,IAAI,OAAO,CAAC;gBAChC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;gBAC9D,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;gBAE/B,IAAI,CAAC,GAAG;oBAAE,OAAO,EAAE,CAAC;gBAEpB,IAAI,KAAK,EAAE,CAAC;oBACV,OAAO,KAAK,GAAG,KAAK,GAAG,KAAK,KAAK,IA
AI,CAAC;gBACxC,CAAC;gBACD,OAAO,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC;YAC7B,CAAC;SACF,CAAC,CAAC;QAEH,qDAAqD;QACrD,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE;YACnC,MAAM,EAAE,YAAY;YACpB,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAkB,EAAE,EAAE;gBAC5C,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;gBACpC,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;gBACtC,OAAO,CACL,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAY,EAAE,EAAE,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,IAAI,CAClE,CAAC;YACJ,CAAC;SACF,CAAC,CAAC;QAEH,uDAAuD;QACvD,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE;YACnC,MAAM,EAAE,CAAC,IAAkB,EAAE,EAAE;gBAC7B,OAAO,CACL,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC;oBAC1D,CAAC,CAAC,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CACtD,CAAC;YACJ,CAAC;YACD,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;SACtB,CAAC,CAAC;IACL,CAAC;IAEO,sBAAsB,CAAC,KAAmB;QAChD,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,IAAI,EAAE,CAAC,CAAC;QAC9C,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAe,EAAE,CAAC;QAC5B,MAAM,UAAU,GAAa,EAAE,CAAC;QAEhC,iCAAiC;QACjC,CAAC,CAAC,0BAA0B,CAAC;aAC1B,KAAK,EAAE;aACP,IAAI,CAAC,QAAQ,CAAC;aACd,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YACd,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;YAClB,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;YAEpD,iDAAiD;YACjD,MAAM,KAAK,GACT,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC;gBACjB,CAAC,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,QAAQ;oBACjC,CAAC,CAAC,QAAQ;oBACV,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,OAAO;wBACjC,CAAC,CAAC,OAAO;wBACT,CAAC,CAAC,MAAM,CAAC,CAAC;YAChB,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzB,CAAC,CAAC,CAAC;QAEL,IAAI,OAAO,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEpC,eAAe;QACf,MAAM,WAAW,GACf,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,sBAAsB,CAAC;QAC9D,CAAC,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;YAC5B,MAAM,GAAG,GAAa,EAAE,CAAC;YACzB,CAAC,CAAC,EAAE,CAAC;iBACF,IAAI,CAAC,IAAI,CAAC;iBACV,IAAI,CAAC,CAAC,CAAC,EAAE,EA
AE,EAAE,EAAE;gBACd,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC,CAAC;YACpD,CAAC,CAAC,CAAC;YACL,IAAI,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnB,mDAAmD;gBACnD,OAAO,GAAG,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,EAAE,CAAC;oBACnC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBACf,CAAC;gBACD,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC;YAC1C,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,uBAAuB;QACvB,IAAI,QAAQ,GAAG,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC;QAErD,oBAAoB;QACpB,MAAM,QAAQ,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE;YACxC,IAAI,KAAK,KAAK,QAAQ;gBAAE,OAAO,OAAO,CAAC;YACvC,IAAI,KAAK,KAAK,OAAO;gBAAE,OAAO,MAAM,CAAC;YACrC,OAAO,KAAK,CAAC;QACf,CAAC,CAAC,CAAC;QACH,QAAQ,IAAI,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC;QAEjD,gBAAgB;QAChB,IAAI,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;YACnB,QAAQ,IAAI,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC;QAC9C,CAAC,CAAC,CAAC;QAEH,OAAO,QAAQ,GAAG,IAAI,CAAC;IACzB,CAAC;IAEO,gBAAgB,CAAC,KAAqB;QAC5C,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YACrB,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE;gBAC/B,MAAM,EAAE,IAAI,CAAC,MAA2C;gBACxD,WAAW,EAAE,IAAI,CAAC,WAAkD;aACrE,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC;IAEO,cAAc,CAAC,QAAgB,EAAE,QAAyB;QAChE,yBAAyB;QACzB,MAAM,IAAI,GAAa,CAAC,KAAK,CAAC,CAAC;QAE/B,MAAM,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAoB,EAAE,EAAE;YACnE,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;gBAC1C,iEAAiE;gBACjE,MAAM,SAAS,GACb,OAAO,KAAK,KAAK,QAAQ,IAAI,SAAS,CAAC,IAAI,CAAC,KAAK,CAAC;oBAChD,CAAC,CAAC,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,GAAG;oBACnC,CAAC,CAAE,KAAyB,CAAC;gBACjC,IAAI,CAAC,IAAI,CAAC,GAAG,GAAG,KAAK,MAAM,CAAC,SAAS,CAAC,EAAE,CAAC,CAAC;YAC5C,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAEjB,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,MAAM,GAAG,QAAQ,CAAC;IAC7C,CAAC;IAEO,sBAAsB,CAAC,QAAgB;QAI7C,gCAAgC;QAChC,IAAI,WAAW,GAAG,QAAQ,CAAC;QAC3B,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;
YAC/B,MAAM,cAAc,GAAG,QAAQ,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YAClD,IAAI,cAAc,KAAK,CAAC,CAAC,EAAE,CAAC;gBAC1B,WAAW,GAAG,QAAQ,CAAC,SAAS,CAAC,cAAc,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YAC9D,CAAC;QACH,CAAC;QAED,qEAAqE;QACrE,2BAA2B;QAC3B,WAAW,GAAG,WAAW,CAAC,OAAO,CAAC,iBAAiB,EAAE,EAAE,CAAC,CAAC;QACzD,qBAAqB;QACrB,WAAW,GAAG,WAAW,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC;QAClD,cAAc;QACd,WAAW,GAAG,WAAW,CAAC,OAAO,CAAC,qBAAqB,EAAE,EAAE,CAAC,CAAC;QAC7D,gDAAgD;QAChD,WAAW,GAAG,WAAW,CAAC,OAAO,CAAC,wBAAwB,EAAE,IAAI,CAAC,CAAC;QAClE,sBAAsB;QACtB,WAAW,GAAG,WAAW,CAAC,OAAO,CAAC,yBAAyB,EAAE,EAAE,CAAC,CAAC;QAEjE,cAAc;QACd,MAAM,KAAK,GAAG,WAAW;aACtB,IAAI,EAAE;aACN,KAAK,CAAC,KAAK,CAAC;aACZ,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC/B,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC;QAE/B,kEAAkE;QAClE,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC;QAE/C,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,CAAC;IACpC,CAAC;IAEO,WAAW,CAAC,QAAgB;QAClC,yDAAyD;QACzD,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC;QAE/C,6DAA6D;QAC7D,QAAQ,GAAG,QAAQ,CAAC,OAAO,CACzB,qCAAqC,EACrC,WAAW,CACZ,CAAC;QAEF,kCAAkC;QAClC,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,eAAe,EAAE,WAAW,CAAC,CAAC;QAC1D,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,gBAAgB,EAAE,WAAW,CAAC,CAAC;QAE3D,2FAA2F;QAC3F,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,uBAAuB,EAAE,UAAU,CAAC,CAAC;QAEjE,qFAAqF;QACrF,gFAAgF;QAChF,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QACnC,IAAI,gBAAgB,GAAG,CAAC,CAAC;QAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,IAAI,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,KAAK,EAAE,CAAC;gBAC9B,gBAAgB,EAAE,CAAC;gBACnB,IAAI,gBAAgB,IAAI,CAAC,EAAE,CAAC;oBAC1B,SAAS;gBACX,CAAC;YACH,CAAC;YAED,mEAAmE;YACnE,IACE,gBAAgB,IAAI,CAAC;gBACrB,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,KAAK;gBACzB,CAAC,GAAG,CAAC;gBACL,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,EACpB,CAAC;gBACD,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;oBAChE,KAAK,CAAC,MAAM,CAAC,CA
AC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;oBACvB,CAAC,EAAE,CAAC;gBACN,CAAC;gBACD,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;oBAChE,KAAK,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;oBAC3B,CAAC,EAAE,CAAC;gBACN,CAAC;YACH,CAAC;QACH,CAAC;QAED,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE5B,wCAAwC;QACxC,QAAQ,GAAG,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC;QAE/C,OAAO,QAAQ,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC;IAChC,CAAC;IAEO,gBAAgB,CACtB,OAAwB;QAExB,OAAO;YACL,cAAc,EAAE,OAAO,CAAC,cAAc,IAAI,IAAI;YAC9C,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,IAAI;YACxC,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;YACtC,gBAAgB,EAAE,OAAO,CAAC,gBAAgB,IAAI,EAAE;YAChD,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,OAAO;YACvC,OAAO,EAAE,OAAO,CAAC,OAAO;YACxB,aAAa,EAAE,OAAO,CAAC,aAAa,IAAI,IAAI;YAC5C,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,IAAI;YAC1C,aAAa,EAAE,OAAO,CAAC,aAAa,IAAI,IAAI;YAC5C,iBAAiB,EAAE,OAAO,CAAC,iBAAiB,IAAI,IAAI,EAAE,sCAAsC;SAChE,CAAC;IACjC,CAAC;CACF"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/** Options for HTML to Markdown conversion */
|
|
2
|
+
export interface MarkdownOptions {
|
|
3
|
+
/** Extract only main content using Readability (default: true) */
|
|
4
|
+
extractContent?: boolean;
|
|
5
|
+
/** Include metadata as YAML frontmatter (default: true) */
|
|
6
|
+
includeMeta?: boolean;
|
|
7
|
+
/** Custom Turndown rules */
|
|
8
|
+
customRules?: TurndownRule[];
|
|
9
|
+
/** Preserve specific HTML elements as HTML */
|
|
10
|
+
preserveElements?: string[];
|
|
11
|
+
/** Maximum content length in characters (default: 1000000) */
|
|
12
|
+
maxLength?: number;
|
|
13
|
+
/** Base URL for resolving relative links */
|
|
14
|
+
baseUrl?: string;
|
|
15
|
+
/** Include images (default: true) */
|
|
16
|
+
includeImages?: boolean;
|
|
17
|
+
/** Include links (default: true) */
|
|
18
|
+
includeLinks?: boolean;
|
|
19
|
+
/** Include tables (default: true) */
|
|
20
|
+
includeTables?: boolean;
|
|
21
|
+
/** Aggressive noise removal (default: true) */
|
|
22
|
+
aggressiveCleanup?: boolean;
|
|
23
|
+
/** Force treat input as URL (bypasses auto-detection) */
|
|
24
|
+
isUrl?: boolean;
|
|
25
|
+
/** Request timeout in milliseconds (default: 15000) */
|
|
26
|
+
timeout?: number;
|
|
27
|
+
/** Follow redirects (default: true) */
|
|
28
|
+
followRedirects?: boolean;
|
|
29
|
+
/** Maximum redirects to follow (default: 5) */
|
|
30
|
+
maxRedirects?: number;
|
|
31
|
+
/** Custom headers for URL fetching */
|
|
32
|
+
headers?: Record<string, string>;
|
|
33
|
+
/** User agent string for URL fetching */
|
|
34
|
+
userAgent?: string;
|
|
35
|
+
}
|
|
36
|
+
/** Result from markdown conversion */
|
|
37
|
+
export interface MarkdownResult {
|
|
38
|
+
/** Generated markdown content */
|
|
39
|
+
markdown: string;
|
|
40
|
+
/** Extracted metadata */
|
|
41
|
+
metadata: ContentMetadata;
|
|
42
|
+
/** Conversion statistics */
|
|
43
|
+
stats: ConversionStats;
|
|
44
|
+
}
|
|
45
|
+
/** Extracted content metadata */
|
|
46
|
+
export interface ContentMetadata {
|
|
47
|
+
/** Page title */
|
|
48
|
+
title?: string;
|
|
49
|
+
/** Author/byline */
|
|
50
|
+
author?: string;
|
|
51
|
+
/** Excerpt/description */
|
|
52
|
+
excerpt?: string;
|
|
53
|
+
/** Site name */
|
|
54
|
+
siteName?: string;
|
|
55
|
+
/** Publication date */
|
|
56
|
+
publishedTime?: string;
|
|
57
|
+
/** Language code */
|
|
58
|
+
language?: string;
|
|
59
|
+
/** Canonical URL */
|
|
60
|
+
canonicalUrl?: string;
|
|
61
|
+
/** Reading time estimate (minutes) */
|
|
62
|
+
readingTime?: number;
|
|
63
|
+
/** Word count */
|
|
64
|
+
wordCount?: number;
|
|
65
|
+
}
|
|
66
|
+
/** Conversion statistics */
|
|
67
|
+
export interface ConversionStats {
|
|
68
|
+
/** Input HTML length */
|
|
69
|
+
inputLength: number;
|
|
70
|
+
/** Output markdown length */
|
|
71
|
+
outputLength: number;
|
|
72
|
+
/** Processing time in milliseconds */
|
|
73
|
+
processingTime: number;
|
|
74
|
+
/** Whether Readability extraction succeeded */
|
|
75
|
+
readabilitySuccess: boolean;
|
|
76
|
+
/** Number of images found */
|
|
77
|
+
imageCount: number;
|
|
78
|
+
/** Number of links found */
|
|
79
|
+
linkCount: number;
|
|
80
|
+
}
|
|
81
|
+
/** Options for URL fetching */
|
|
82
|
+
export interface FetchOptions {
|
|
83
|
+
/** Request timeout in milliseconds (default: 15000) */
|
|
84
|
+
timeout?: number;
|
|
85
|
+
/** Follow redirects (default: true) */
|
|
86
|
+
followRedirects?: boolean;
|
|
87
|
+
/** Maximum redirects to follow (default: 5) */
|
|
88
|
+
maxRedirects?: number;
|
|
89
|
+
/** Custom headers */
|
|
90
|
+
headers?: Record<string, string>;
|
|
91
|
+
/** User agent string */
|
|
92
|
+
userAgent?: string;
|
|
93
|
+
}
|
|
94
|
+
/** Custom Turndown rule */
|
|
95
|
+
export interface TurndownRule {
|
|
96
|
+
/** Rule name */
|
|
97
|
+
name: string;
|
|
98
|
+
/** Filter for elements to apply rule to */
|
|
99
|
+
filter: string | string[] | ((node: TurndownNode) => boolean);
|
|
100
|
+
/** Replacement function */
|
|
101
|
+
replacement: (content: string, node: TurndownNode) => string;
|
|
102
|
+
}
|
|
103
|
+
/** Turndown DOM node type */
|
|
104
|
+
export interface TurndownNode {
|
|
105
|
+
nodeName: string;
|
|
106
|
+
textContent?: string | null;
|
|
107
|
+
outerHTML?: string;
|
|
108
|
+
className?: string;
|
|
109
|
+
alt?: string;
|
|
110
|
+
src?: string;
|
|
111
|
+
title?: string;
|
|
112
|
+
querySelector?: (selector: string) => TurndownNode | null;
|
|
113
|
+
getAttribute?: (name: string) => string | null;
|
|
114
|
+
}
|
|
115
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAEA,8CAA8C;AAC9C,MAAM,WAAW,eAAe;IAC9B,kEAAkE;IAClE,cAAc,CAAC,EAAE,OAAO,CAAC;IAEzB,2DAA2D;IAC3D,WAAW,CAAC,EAAE,OAAO,CAAC;IAEtB,4BAA4B;IAC5B,WAAW,CAAC,EAAE,YAAY,EAAE,CAAC;IAE7B,8CAA8C;IAC9C,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE5B,8DAA8D;IAC9D,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,4CAA4C;IAC5C,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,qCAAqC;IACrC,aAAa,CAAC,EAAE,OAAO,CAAC;IAExB,oCAAoC;IACpC,YAAY,CAAC,EAAE,OAAO,CAAC;IAEvB,qCAAqC;IACrC,aAAa,CAAC,EAAE,OAAO,CAAC;IAExB,+CAA+C;IAC/C,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAE5B,yDAAyD;IACzD,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB,uDAAuD;IACvD,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uCAAuC;IACvC,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B,+CAA+C;IAC/C,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,sCAAsC;IACtC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAEjC,yCAAyC;IACzC,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,sCAAsC;AACtC,MAAM,WAAW,cAAc;IAC7B,iCAAiC;IACjC,QAAQ,EAAE,MAAM,CAAC;IAEjB,yBAAyB;IACzB,QAAQ,EAAE,eAAe,CAAC;IAE1B,4BAA4B;IAC5B,KAAK,EAAE,eAAe,CAAC;CACxB;AAED,iCAAiC;AACjC,MAAM,WAAW,eAAe;IAC9B,iBAAiB;IACjB,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf,oBAAoB;IACpB,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB,0BAA0B;IAC1B,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,gBAAgB;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,uBAAuB;IACvB,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,oBAAoB;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,oBAAoB;IACpB,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,sCAAsC;IACtC,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB,iBAAiB;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,4BAA4B;AAC5B,MAAM,WAAW,eAAe;IAC9B,wBAAwB;IACxB,WAAW,EAAE,MAAM,CAAC;IAEpB,6BAA6B;IAC7B,YAAY,EAAE,MAAM,CAAC;IAErB,sCAAsC;IACtC,cAAc,EAAE,MAAM,CAAC;IAEvB,+CAA+C;IAC/C,kBAAkB,EAAE,OAAO,CAAC;IAE5B,6BAA6B;IAC7B,UAAU,EAAE,MAAM,CAAC;IAEnB,4BAA4B;IAC5B,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,+BAA+B;AAC/B,MAAM,WAAW,YAAY;IAC3B,uDAAuD;IACvD,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,uCAAuC;IACvC,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B,+CAA+C;IAC/C,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,qBAAqB;IACrB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAEjC,wBAAwB;IACx
B,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,2BAA2B;AAC3B,MAAM,WAAW,YAAY;IAC3B,gBAAgB;IAChB,IAAI,EAAE,MAAM,CAAC;IAEb,2CAA2C;IAC3C,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,YAAY,KAAK,OAAO,CAAC,CAAC;IAE9D,2BAA2B;IAC3B,WAAW,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,KAAK,MAAM,CAAC;CAC9D;AAED,6BAA6B;AAC7B,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,aAAa,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,KAAK,YAAY,GAAG,IAAI,CAAC;IAC1D,YAAY,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,GAAG,IAAI,CAAC;CAChD"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,eAAe"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { FetchOptions } from "../types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Fetch HTML from a URL with timeout and redirect handling
|
|
4
|
+
*/
|
|
5
|
+
export declare function fetchUrl(url: string, options?: FetchOptions): Promise<string>;
|
|
6
|
+
/**
|
|
7
|
+
* Validate if a string is a valid URL
|
|
8
|
+
*/
|
|
9
|
+
export declare function isValidUrl(urlString: string): boolean;
|
|
10
|
+
//# sourceMappingURL=url-fetcher.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"url-fetcher.d.ts","sourceRoot":"","sources":["../../src/utils/url-fetcher.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,aAAa,CAAC;AAGhD;;GAEG;AACH,wBAAsB,QAAQ,CAC5B,GAAG,EAAE,MAAM,EACX,OAAO,GAAE,YAAiB,GACzB,OAAO,CAAC,MAAM,CAAC,CAwCjB;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAOrD"}
|