@nanocollective/get-md 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +36 -0
- package/README.md +205 -0
- package/bin/get-md.js +4 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +91 -0
- package/dist/cli.js.map +1 -0
- package/dist/cli.spec.d.ts +2 -0
- package/dist/cli.spec.d.ts.map +1 -0
- package/dist/cli.spec.js +278 -0
- package/dist/cli.spec.js.map +1 -0
- package/dist/config.d.ts +5 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +6 -0
- package/dist/config.js.map +1 -0
- package/dist/extractors/metadata-extractor.d.ts +6 -0
- package/dist/extractors/metadata-extractor.d.ts.map +1 -0
- package/dist/extractors/metadata-extractor.js +131 -0
- package/dist/extractors/metadata-extractor.js.map +1 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +70 -0
- package/dist/index.js.map +1 -0
- package/dist/optimizers/html-cleaner.d.ts +12 -0
- package/dist/optimizers/html-cleaner.d.ts.map +1 -0
- package/dist/optimizers/html-cleaner.js +228 -0
- package/dist/optimizers/html-cleaner.js.map +1 -0
- package/dist/optimizers/llm-formatter.d.ts +8 -0
- package/dist/optimizers/llm-formatter.d.ts.map +1 -0
- package/dist/optimizers/llm-formatter.js +94 -0
- package/dist/optimizers/llm-formatter.js.map +1 -0
- package/dist/optimizers/structure-enhancer.d.ts +8 -0
- package/dist/optimizers/structure-enhancer.d.ts.map +1 -0
- package/dist/optimizers/structure-enhancer.js +92 -0
- package/dist/optimizers/structure-enhancer.js.map +1 -0
- package/dist/parsers/markdown-parser.d.ts +16 -0
- package/dist/parsers/markdown-parser.d.ts.map +1 -0
- package/dist/parsers/markdown-parser.js +369 -0
- package/dist/parsers/markdown-parser.js.map +1 -0
- package/dist/types.d.ts +115 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/url-fetcher.d.ts +10 -0
- package/dist/utils/url-fetcher.d.ts.map +1 -0
- package/dist/utils/url-fetcher.js +54 -0
- package/dist/utils/url-fetcher.js.map +1 -0
- package/dist/utils/validators.d.ts +5 -0
- package/dist/utils/validators.d.ts.map +1 -0
- package/dist/utils/validators.js +23 -0
- package/dist/utils/validators.js.map +1 -0
- package/package.json +104 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AAAA,gBAAgB;AAEhB,2CAA2C;AAC3C,MAAM,CAAC,MAAM,kBAAkB,GAC7B,kFAAkF,CAAC;AAErF,oDAAoD;AACpD,MAAM,CAAC,MAAM,qBAAqB,GAAG,KAAK,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metadata-extractor.d.ts","sourceRoot":"","sources":["../../src/extractors/metadata-extractor.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAEnD;;GAEG;AACH,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,eAAe,CAajB"}
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
// src/extractors/metadata-extractor.ts
|
|
2
|
+
import * as cheerio from "cheerio";
|
|
3
|
+
/**
 * Parse an HTML document and collect its descriptive metadata.
 *
 * @param html - HTML markup handed to cheerio for parsing
 * @param baseUrl - Optional URL forwarded to the canonical-URL lookup
 * @returns Object with title, author, excerpt, siteName, publishedTime,
 *          language and canonicalUrl; a field is undefined when its lookup
 *          found nothing
 */
export function extractMetadata(html, baseUrl) {
    const doc = cheerio.load(html);
    // Each lookup is independent; they run in the same order as the keys below.
    const title = extractTitle(doc);
    const author = extractAuthor(doc);
    const excerpt = extractExcerpt(doc);
    const siteName = extractSiteName(doc);
    const publishedTime = extractPublishedTime(doc);
    const language = extractLanguage(doc);
    const canonicalUrl = extractCanonicalUrl(doc, baseUrl);
    // wordCount and readingTime are not produced here (they are calculated
    // from the final markdown).
    return {
        title,
        author,
        excerpt,
        siteName,
        publishedTime,
        language,
        canonicalUrl,
    };
}
|
|
19
|
+
function extractTitle($) {
|
|
20
|
+
// Try Open Graph
|
|
21
|
+
const ogTitle = $('meta[property="og:title"]').attr("content");
|
|
22
|
+
if (ogTitle)
|
|
23
|
+
return ogTitle;
|
|
24
|
+
// Try Twitter
|
|
25
|
+
const twitterTitle = $('meta[name="twitter:title"]').attr("content");
|
|
26
|
+
if (twitterTitle)
|
|
27
|
+
return twitterTitle;
|
|
28
|
+
// Try regular title tag
|
|
29
|
+
const titleTag = $("title").text();
|
|
30
|
+
if (titleTag)
|
|
31
|
+
return titleTag.trim();
|
|
32
|
+
// Try first h1
|
|
33
|
+
const h1 = $("h1").first().text();
|
|
34
|
+
if (h1)
|
|
35
|
+
return h1.trim();
|
|
36
|
+
return undefined;
|
|
37
|
+
}
|
|
38
|
+
function extractAuthor($) {
|
|
39
|
+
// Try meta author tag
|
|
40
|
+
const metaAuthor = $('meta[name="author"]').attr("content");
|
|
41
|
+
if (metaAuthor)
|
|
42
|
+
return metaAuthor;
|
|
43
|
+
// Try article:author
|
|
44
|
+
const articleAuthor = $('meta[property="article:author"]').attr("content");
|
|
45
|
+
if (articleAuthor)
|
|
46
|
+
return articleAuthor;
|
|
47
|
+
// Try rel="author"
|
|
48
|
+
const relAuthor = $('[rel="author"]').text().trim();
|
|
49
|
+
if (relAuthor)
|
|
50
|
+
return relAuthor;
|
|
51
|
+
// Try common class names
|
|
52
|
+
const byline = $(".author, .byline, .author-name").first().text().trim();
|
|
53
|
+
if (byline)
|
|
54
|
+
return byline;
|
|
55
|
+
return undefined;
|
|
56
|
+
}
|
|
57
|
+
function extractExcerpt($) {
|
|
58
|
+
// Try Open Graph
|
|
59
|
+
const ogDesc = $('meta[property="og:description"]').attr("content");
|
|
60
|
+
if (ogDesc)
|
|
61
|
+
return ogDesc;
|
|
62
|
+
// Try meta description
|
|
63
|
+
const metaDesc = $('meta[name="description"]').attr("content");
|
|
64
|
+
if (metaDesc)
|
|
65
|
+
return metaDesc;
|
|
66
|
+
// Try Twitter
|
|
67
|
+
const twitterDesc = $('meta[name="twitter:description"]').attr("content");
|
|
68
|
+
if (twitterDesc)
|
|
69
|
+
return twitterDesc;
|
|
70
|
+
return undefined;
|
|
71
|
+
}
|
|
72
|
+
function extractSiteName($) {
|
|
73
|
+
// Try Open Graph
|
|
74
|
+
const ogSite = $('meta[property="og:site_name"]').attr("content");
|
|
75
|
+
if (ogSite)
|
|
76
|
+
return ogSite;
|
|
77
|
+
// Try application name
|
|
78
|
+
const appName = $('meta[name="application-name"]').attr("content");
|
|
79
|
+
if (appName)
|
|
80
|
+
return appName;
|
|
81
|
+
return undefined;
|
|
82
|
+
}
|
|
83
|
+
function extractPublishedTime($) {
|
|
84
|
+
// Try article:published_time
|
|
85
|
+
const articleTime = $('meta[property="article:published_time"]').attr("content");
|
|
86
|
+
if (articleTime)
|
|
87
|
+
return articleTime;
|
|
88
|
+
// Try time element with datetime
|
|
89
|
+
const timeEl = $("time[datetime]").first().attr("datetime");
|
|
90
|
+
if (timeEl)
|
|
91
|
+
return timeEl;
|
|
92
|
+
// Try datePublished
|
|
93
|
+
const datePublished = $('[itemprop="datePublished"]').first().attr("content");
|
|
94
|
+
if (datePublished)
|
|
95
|
+
return datePublished;
|
|
96
|
+
return undefined;
|
|
97
|
+
}
|
|
98
|
+
function extractLanguage($) {
|
|
99
|
+
// Try html lang attribute
|
|
100
|
+
const htmlLang = $("html").attr("lang");
|
|
101
|
+
if (htmlLang)
|
|
102
|
+
return htmlLang;
|
|
103
|
+
// Try meta content-language
|
|
104
|
+
const metaLang = $('meta[http-equiv="content-language"]').attr("content");
|
|
105
|
+
if (metaLang)
|
|
106
|
+
return metaLang;
|
|
107
|
+
return undefined;
|
|
108
|
+
}
|
|
109
|
+
function extractCanonicalUrl($, baseUrl) {
|
|
110
|
+
// Try link rel="canonical"
|
|
111
|
+
const canonical = $('link[rel="canonical"]').attr("href");
|
|
112
|
+
if (canonical) {
|
|
113
|
+
if (canonical.startsWith("http"))
|
|
114
|
+
return canonical;
|
|
115
|
+
if (baseUrl) {
|
|
116
|
+
try {
|
|
117
|
+
return new URL(canonical, baseUrl).href;
|
|
118
|
+
}
|
|
119
|
+
catch {
|
|
120
|
+
// Ignore invalid URLs
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
return canonical;
|
|
124
|
+
}
|
|
125
|
+
// Try Open Graph URL
|
|
126
|
+
const ogUrl = $('meta[property="og:url"]').attr("content");
|
|
127
|
+
if (ogUrl)
|
|
128
|
+
return ogUrl;
|
|
129
|
+
return baseUrl;
|
|
130
|
+
}
|
|
131
|
+
//# sourceMappingURL=metadata-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"metadata-extractor.js","sourceRoot":"","sources":["../../src/extractors/metadata-extractor.ts"],"names":[],"mappings":"AAAA,uCAAuC;AAEvC,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC;;GAEG;AACH,MAAM,UAAU,eAAe,CAC7B,IAAY,EACZ,OAAgB;IAEhB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,OAAO;QACL,KAAK,EAAE,YAAY,CAAC,CAAC,CAAC;QACtB,MAAM,EAAE,aAAa,CAAC,CAAC,CAAC;QACxB,OAAO,EAAE,cAAc,CAAC,CAAC,CAAC;QAC1B,QAAQ,EAAE,eAAe,CAAC,CAAC,CAAC;QAC5B,aAAa,EAAE,oBAAoB,CAAC,CAAC,CAAC;QACtC,QAAQ,EAAE,eAAe,CAAC,CAAC,CAAC;QAC5B,YAAY,EAAE,mBAAmB,CAAC,CAAC,EAAE,OAAO,CAAC;QAC7C,mEAAmE;KACpE,CAAC;AACJ,CAAC;AAED,SAAS,YAAY,CAAC,CAAqB;IACzC,iBAAiB;IACjB,MAAM,OAAO,GAAG,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC/D,IAAI,OAAO;QAAE,OAAO,OAAO,CAAC;IAE5B,cAAc;IACd,MAAM,YAAY,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACrE,IAAI,YAAY;QAAE,OAAO,YAAY,CAAC;IAEtC,wBAAwB;IACxB,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;IACnC,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;IAErC,eAAe;IACf,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC;IAClC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC;IAEzB,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,aAAa,CAAC,CAAqB;IAC1C,sBAAsB;IACtB,MAAM,UAAU,GAAG,CAAC,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC5D,IAAI,UAAU;QAAE,OAAO,UAAU,CAAC;IAElC,qBAAqB;IACrB,MAAM,aAAa,GAAG,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC3E,IAAI,aAAa;QAAE,OAAO,aAAa,CAAC;IAExC,mBAAmB;IACnB,MAAM,SAAS,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IACpD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC;IAEhC,yBAAyB;IACzB,MAAM,MAAM,GAAG,CAAC,CAAC,gCAAgC,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IACzE,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,cAAc,CAAC,CAAqB;IAC3C,iBAAiB;IACjB,MAAM,MAAM,GAAG,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACpE,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,uBAAuB;IACvB,MAAM,QAAQ,GAAG,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC/D,IAAI,QAAQ;QAAE,OAAO,
QAAQ,CAAC;IAE9B,cAAc;IACd,MAAM,WAAW,GAAG,CAAC,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1E,IAAI,WAAW;QAAE,OAAO,WAAW,CAAC;IAEpC,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,CAAqB;IAC5C,iBAAiB;IACjB,MAAM,MAAM,GAAG,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAClE,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,uBAAuB;IACvB,MAAM,OAAO,GAAG,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACnE,IAAI,OAAO;QAAE,OAAO,OAAO,CAAC;IAE5B,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,oBAAoB,CAAC,CAAqB;IACjD,6BAA6B;IAC7B,MAAM,WAAW,GAAG,CAAC,CAAC,yCAAyC,CAAC,CAAC,IAAI,CACnE,SAAS,CACV,CAAC;IACF,IAAI,WAAW;QAAE,OAAO,WAAW,CAAC;IAEpC,iCAAiC;IACjC,MAAM,MAAM,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC5D,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,oBAAoB;IACpB,MAAM,aAAa,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC9E,IAAI,aAAa;QAAE,OAAO,aAAa,CAAC;IAExC,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,CAAqB;IAC5C,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACxC,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC;IAE9B,4BAA4B;IAC5B,MAAM,QAAQ,GAAG,CAAC,CAAC,qCAAqC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1E,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC;IAE9B,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,mBAAmB,CAC1B,CAAqB,EACrB,OAAgB;IAEhB,2BAA2B;IAC3B,MAAM,SAAS,GAAG,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC1D,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC;YAAE,OAAO,SAAS,CAAC;QACnD,IAAI,OAAO,EAAE,CAAC;YACZ,IAAI,CAAC;gBACH,OAAO,IAAI,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;YAC1C,CAAC;YAAC,MAAM,CAAC;gBACP,sBAAsB;YACxB,CAAC;QACH,CAAC;QACD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,qBAAqB;IACrB,MAAM,KAAK,GAAG,CAAC,CAAC,yBAAyB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC3D,IAAI,KAAK;QAAE,OAAO,KAAK,CAAC;IAExB,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import type { MarkdownOptions, MarkdownResult, ContentMetadata, TurndownRule, ConversionStats } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Convert HTML to clean, LLM-optimized Markdown
|
|
4
|
+
*
|
|
5
|
+
* @param html - Raw HTML string or URL to fetch
|
|
6
|
+
* @param options - Conversion options (including fetch options for URLs)
|
|
7
|
+
* @returns Promise resolving to markdown result
|
|
8
|
+
*
|
|
9
|
+
* @example
|
|
10
|
+
* ```typescript
|
|
11
|
+
* import { convertToMarkdown } from '@nanocollective/get-md';
|
|
12
|
+
*
|
|
13
|
+
* // From HTML string
|
|
14
|
+
* const result = await convertToMarkdown('<h1>Hello</h1><p>World</p>');
|
|
15
|
+
* console.log(result.markdown);
|
|
16
|
+
* // # Hello
|
|
17
|
+
* //
|
|
18
|
+
* // World
|
|
19
|
+
*
|
|
20
|
+
* // From URL
|
|
21
|
+
* const result = await convertToMarkdown('https://example.com');
|
|
22
|
+
* console.log(result.metadata.title);
|
|
23
|
+
*
|
|
24
|
+
* // From URL with custom fetch options
|
|
25
|
+
* const result = await convertToMarkdown('https://example.com', {
|
|
26
|
+
* timeout: 10000,
|
|
27
|
+
* headers: { 'Authorization': 'Bearer token' },
|
|
28
|
+
* llmOptimized: true
|
|
29
|
+
* });
|
|
30
|
+
*
|
|
31
|
+
* // Force URL mode if auto-detection fails
|
|
32
|
+
* const result = await convertToMarkdown('example.com', { isUrl: true });
|
|
33
|
+
* ```
|
|
34
|
+
*/
|
|
35
|
+
// Despite its name, `html` may hold raw markup or a URL: the implementation fetches it first when options.isUrl is truthy or the URL validator accepts the string.
export declare function convertToMarkdown(html: string, options?: MarkdownOptions): Promise<MarkdownResult>;
|
|
36
|
+
/**
|
|
37
|
+
* Validate if HTML contains extractable content
|
|
38
|
+
*
|
|
39
|
+
* @param html - Raw HTML string
|
|
40
|
+
* @returns Whether content can be extracted
|
|
41
|
+
*/
|
|
42
|
+
// Thin wrapper: the implementation forwards to hasContent from ./utils/validators.js.
export declare function hasContent(html: string): boolean;
|
|
43
|
+
export type { MarkdownOptions, MarkdownResult, ContentMetadata, ConversionStats, TurndownRule, };
|
|
44
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,eAAe,EACf,YAAY,EACZ,eAAe,EAChB,MAAM,YAAY,CAAC;AAGpB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,eAAe,GACxB,OAAO,CAAC,cAAc,CAAC,CAyBzB;AAED;;;;;GAKG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAEhD;AAGD,YAAY,EACV,eAAe,EACf,cAAc,EACd,eAAe,EACf,eAAe,EACf,YAAY,GACb,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
// src/index.ts
|
|
2
|
+
import { MarkdownParser } from "./parsers/markdown-parser.js";
|
|
3
|
+
import { fetchUrl, isValidUrl } from "./utils/url-fetcher.js";
|
|
4
|
+
import { hasContent as hasContentUtil } from "./utils/validators.js";
|
|
5
|
+
/**
 * Turn HTML, or the page behind a URL, into clean, LLM-optimized Markdown.
 *
 * The input is treated as a URL when `options.isUrl` is truthy or when
 * `isValidUrl` accepts it. In that case the page is fetched first and the
 * parser's `baseUrl` is `options.baseUrl` if set, otherwise the input itself.
 *
 * @param html - Raw HTML string or URL to fetch
 * @param options - Conversion options (including fetch options for URLs)
 * @returns Promise resolving to markdown result
 *
 * @example
 * ```typescript
 * import { convertToMarkdown } from '@nanocollective/get-md';
 *
 * // From HTML string
 * const result = await convertToMarkdown('<h1>Hello</h1><p>World</p>');
 * console.log(result.markdown);
 * // # Hello
 * //
 * // World
 *
 * // From URL
 * const result = await convertToMarkdown('https://example.com');
 * console.log(result.metadata.title);
 *
 * // From URL with custom fetch options
 * const result = await convertToMarkdown('https://example.com', {
 *   timeout: 10000,
 *   headers: { 'Authorization': 'Bearer token' },
 *   llmOptimized: true
 * });
 *
 * // Force URL mode if auto-detection fails
 * const result = await convertToMarkdown('example.com', { isUrl: true });
 * ```
 */
export async function convertToMarkdown(html, options) {
    const treatAsUrl = options?.isUrl || isValidUrl(html);
    if (!treatAsUrl) {
        // Plain markup: convert it directly with the caller's options.
        return new MarkdownParser().convert(html, options);
    }
    // Forward only the fetch-related options to the fetcher.
    const { timeout, followRedirects, maxRedirects, headers, userAgent } = options ?? {};
    const fetchedHtml = await fetchUrl(html, {
        timeout,
        followRedirects,
        maxRedirects,
        headers,
        userAgent,
    });
    // An explicit baseUrl wins; otherwise the fetched URL is the base.
    const parserOptions = {
        ...options,
        baseUrl: options?.baseUrl || html,
    };
    return new MarkdownParser().convert(fetchedHtml, parserOptions);
}
|
|
61
|
+
/**
 * Report whether the given HTML contains extractable content.
 *
 * Delegates to the validator utility imported as `hasContentUtil`.
 *
 * @param html - Raw HTML string
 * @returns Whether content can be extracted
 */
export function hasContent(html) {
    const extractable = hasContentUtil(html);
    return extractable;
}
|
|
70
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,eAAe;AAEf,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,UAAU,IAAI,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAUrE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,IAAY,EACZ,OAAyB;IAEzB,2DAA2D;IAC3D,IAAI,OAAO,EAAE,KAAK,IAAI,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,wBAAwB;QACxB,MAAM,YAAY,GAAiB;YACjC,OAAO,EAAE,OAAO,EAAE,OAAO;YACzB,eAAe,EAAE,OAAO,EAAE,eAAe;YACzC,YAAY,EAAE,OAAO,EAAE,YAAY;YACnC,OAAO,EAAE,OAAO,EAAE,OAAO;YACzB,SAAS,EAAE,OAAO,EAAE,SAAS;SAC9B,CAAC;QAEF,sBAAsB;QACtB,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;QAEvD,6CAA6C;QAC7C,MAAM,MAAM,GAAG,IAAI,cAAc,EAAE,CAAC;QACpC,OAAO,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE;YACjC,GAAG,OAAO;YACV,OAAO,EAAE,OAAO,EAAE,OAAO,IAAI,IAAI;SAClC,CAAC,CAAC;IACL,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,cAAc,EAAE,CAAC;IACpC,OAAO,MAAM,CAAC,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;AACvC,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/** Options accepted by cleanHTML. Both members are optional. */
interface CleanOptions {
    /** Base URL against which relative image and link URLs are resolved */
    baseUrl?: string;
    /**
     * Strip ads, navigation, social media and similar noise.
     * The implementation skips this step only when the value is exactly false.
     */
    aggressive?: boolean;
}
|
|
7
|
+
/**
|
|
8
|
+
* Aggressively clean HTML to remove noise for LLMs
|
|
9
|
+
*/
|
|
10
|
+
// `options` defaults to {} in the implementation; noise removal is skipped only when options.aggressive === false, and relative URLs are resolved only when options.baseUrl is set.
export declare function cleanHTML(html: string, options?: CleanOptions): string;
|
|
11
|
+
export {};
|
|
12
|
+
//# sourceMappingURL=html-cleaner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-cleaner.d.ts","sourceRoot":"","sources":["../../src/optimizers/html-cleaner.ts"],"names":[],"mappings":"AAIA,UAAU,YAAY;IACpB,iDAAiD;IACjD,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,2CAA2C;IAC3C,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,MAAM,CA+B1E"}
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
// src/optimizers/html-cleaner.ts
|
|
2
|
+
import * as cheerio from "cheerio";
|
|
3
|
+
/**
 * Strip an HTML document down to content that is useful to an LLM.
 *
 * Steps run in a fixed order: drop script/style/noscript, drop noise elements
 * (unless `options.aggressive` is exactly false), drop comment nodes, resolve
 * relative URLs (only when `options.baseUrl` is set), drop non-essential
 * attributes, and finally drop empty elements.
 *
 * @param html - HTML markup to clean
 * @param options - Optional `aggressive` flag and `baseUrl`
 * @returns The cleaned document serialized back to HTML
 */
export function cleanHTML(html, options = {}) {
    const $ = cheerio.load(html);
    // 1. Executable and styling content never carries readable text.
    $("script, style, noscript").remove();
    // 2. Noise removal relies on class/id/role selectors, so it has to run
    //    while those attributes are still present.
    const skipNoiseRemoval = options.aggressive === false;
    if (!skipNoiseRemoval) {
        removeNoiseElements($);
    }
    // 3. Comment nodes.
    const commentNodes = $("*")
        .contents()
        .filter((_, node) => node.type === "comment");
    commentNodes.remove();
    // 4. Relative URLs, while src/href are still untouched.
    if (options.baseUrl) {
        resolveRelativeUrls($, options.baseUrl);
    }
    // 5. Attribute cleanup, after every selector-based step has finished.
    cleanAttributes($);
    // 6. Empty elements go last, once everything else has been cleaned.
    removeEmptyElements($);
    return $.html();
}
|
|
31
|
+
function removeNoiseElements($) {
|
|
32
|
+
// Remove by role attribute
|
|
33
|
+
$([
|
|
34
|
+
'[role="navigation"]',
|
|
35
|
+
'[role="banner"]',
|
|
36
|
+
'[role="complementary"]',
|
|
37
|
+
'[role="contentinfo"]',
|
|
38
|
+
'[role="search"]',
|
|
39
|
+
].join(",")).remove();
|
|
40
|
+
// Remove by common class/id patterns
|
|
41
|
+
// More specific selectors to avoid false positives
|
|
42
|
+
const noiseSelectors = [
|
|
43
|
+
// Navigation elements (but not components that just have 'nav' in the name)
|
|
44
|
+
'nav[role="navigation"]',
|
|
45
|
+
"nav.navbar",
|
|
46
|
+
"nav.nav-menu",
|
|
47
|
+
"div.navbar",
|
|
48
|
+
'div[role="navigation"]',
|
|
49
|
+
"#navigation",
|
|
50
|
+
"#nav",
|
|
51
|
+
"#menu",
|
|
52
|
+
// Headers/Footers - only actual header/footer elements or very specific classes
|
|
53
|
+
'header[role="banner"]',
|
|
54
|
+
'footer[role="contentinfo"]',
|
|
55
|
+
"#header",
|
|
56
|
+
"#footer",
|
|
57
|
+
"div.site-header",
|
|
58
|
+
"div.site-footer",
|
|
59
|
+
"div.page-header",
|
|
60
|
+
"div.page-footer",
|
|
61
|
+
// Sidebars
|
|
62
|
+
"aside",
|
|
63
|
+
"div.sidebar",
|
|
64
|
+
'div[role="complementary"]',
|
|
65
|
+
"#sidebar",
|
|
66
|
+
// Ads
|
|
67
|
+
".ad",
|
|
68
|
+
".ads",
|
|
69
|
+
".advertisement",
|
|
70
|
+
".advert",
|
|
71
|
+
'[id*="ad-"]',
|
|
72
|
+
'[class*="advertisement"]',
|
|
73
|
+
'[class*="-ad-"]',
|
|
74
|
+
'[class*="google-ad"]',
|
|
75
|
+
// Social media
|
|
76
|
+
".social",
|
|
77
|
+
".social-share",
|
|
78
|
+
".share-buttons",
|
|
79
|
+
".social-media",
|
|
80
|
+
// Comments
|
|
81
|
+
".comments",
|
|
82
|
+
"#comments",
|
|
83
|
+
".comment-section",
|
|
84
|
+
// Related/recommendations
|
|
85
|
+
".related",
|
|
86
|
+
".recommendations",
|
|
87
|
+
".suggested",
|
|
88
|
+
// Popups/modals
|
|
89
|
+
".modal",
|
|
90
|
+
".popup",
|
|
91
|
+
".overlay",
|
|
92
|
+
'[role="dialog"]',
|
|
93
|
+
// Cookie notices
|
|
94
|
+
".cookie-notice",
|
|
95
|
+
".cookie-banner",
|
|
96
|
+
"#cookie-consent",
|
|
97
|
+
// Newsletter signups
|
|
98
|
+
".newsletter",
|
|
99
|
+
".subscribe",
|
|
100
|
+
".signup-form",
|
|
101
|
+
];
|
|
102
|
+
$(noiseSelectors.join(",")).remove();
|
|
103
|
+
// Remove elements with common noise text
|
|
104
|
+
// But ONLY if they are small elements (to avoid removing large content blocks
|
|
105
|
+
// that happen to mention these terms)
|
|
106
|
+
$("*")
|
|
107
|
+
.filter((_, el) => {
|
|
108
|
+
const $el = $(el);
|
|
109
|
+
const text = $el.text().toLowerCase();
|
|
110
|
+
const textLength = text.trim().length;
|
|
111
|
+
// Only remove if text is short (< 200 chars) and matches noise patterns
|
|
112
|
+
// This avoids removing entire articles that mention these terms
|
|
113
|
+
if (textLength > 200)
|
|
114
|
+
return false;
|
|
115
|
+
return (text.includes("cookie policy") ||
|
|
116
|
+
text.includes("accept cookies") ||
|
|
117
|
+
text.includes("sign up for") ||
|
|
118
|
+
text.includes("newsletter") ||
|
|
119
|
+
text.includes("follow us"));
|
|
120
|
+
})
|
|
121
|
+
.remove();
|
|
122
|
+
}
|
|
123
|
+
function cleanAttributes($) {
|
|
124
|
+
// Attributes to preserve
|
|
125
|
+
const keepAttributes = new Set([
|
|
126
|
+
"href",
|
|
127
|
+
"src",
|
|
128
|
+
"alt",
|
|
129
|
+
"title",
|
|
130
|
+
"colspan",
|
|
131
|
+
"rowspan", // For tables
|
|
132
|
+
"align", // For table alignment
|
|
133
|
+
]);
|
|
134
|
+
$("*").each((_, el) => {
|
|
135
|
+
const $el = $(el);
|
|
136
|
+
const attrs = $el.attr();
|
|
137
|
+
if (attrs) {
|
|
138
|
+
Object.keys(attrs).forEach((attr) => {
|
|
139
|
+
if (!keepAttributes.has(attr)) {
|
|
140
|
+
$el.removeAttr(attr);
|
|
141
|
+
}
|
|
142
|
+
});
|
|
143
|
+
}
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
function removeEmptyElements($) {
|
|
147
|
+
// Remove elements that have no text and no important children
|
|
148
|
+
const importantTags = new Set(["img", "br", "hr", "input", "iframe"]);
|
|
149
|
+
const contentTags = new Set([
|
|
150
|
+
"p",
|
|
151
|
+
"h1",
|
|
152
|
+
"h2",
|
|
153
|
+
"h3",
|
|
154
|
+
"h4",
|
|
155
|
+
"h5",
|
|
156
|
+
"h6",
|
|
157
|
+
"ul",
|
|
158
|
+
"ol",
|
|
159
|
+
"li",
|
|
160
|
+
"table",
|
|
161
|
+
"blockquote",
|
|
162
|
+
"pre",
|
|
163
|
+
"code",
|
|
164
|
+
]);
|
|
165
|
+
$("*").each((_, el) => {
|
|
166
|
+
const $el = $(el);
|
|
167
|
+
const tagName = el.tagName?.toLowerCase();
|
|
168
|
+
// Skip important tags
|
|
169
|
+
if (tagName && importantTags.has(tagName))
|
|
170
|
+
return;
|
|
171
|
+
// Get the text content
|
|
172
|
+
const text = $el.text().trim();
|
|
173
|
+
const hasImportantChildren = $el.find("img, iframe").length > 0;
|
|
174
|
+
const hasContentTags = $el.find("p, h1, h2, h3, h4, h5, h6, ul, ol, table, blockquote, pre")
|
|
175
|
+
.length > 0;
|
|
176
|
+
// For content-bearing tags (p, h1-h6, etc), remove if they're empty or just junk
|
|
177
|
+
if (tagName && contentTags.has(tagName)) {
|
|
178
|
+
// Remove if completely empty
|
|
179
|
+
if (text.length === 0 && !hasImportantChildren) {
|
|
180
|
+
$el.remove();
|
|
181
|
+
return;
|
|
182
|
+
}
|
|
183
|
+
// Remove if only contains punctuation/whitespace like "|", "-", etc.
|
|
184
|
+
const meaningfulText = text.replace(/[\s|_.:;-]+/g, "");
|
|
185
|
+
if (meaningfulText.length === 0 && !hasImportantChildren) {
|
|
186
|
+
$el.remove();
|
|
187
|
+
return;
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
// For container elements (div, section, etc), remove if no text and no important children
|
|
191
|
+
if (!text && !hasImportantChildren && !hasContentTags) {
|
|
192
|
+
$el.remove();
|
|
193
|
+
}
|
|
194
|
+
});
|
|
195
|
+
}
|
|
196
|
+
function resolveRelativeUrls($, baseUrl) {
|
|
197
|
+
const base = new URL(baseUrl);
|
|
198
|
+
// Resolve image sources
|
|
199
|
+
$("img").each((_, el) => {
|
|
200
|
+
const $el = $(el);
|
|
201
|
+
const src = $el.attr("src");
|
|
202
|
+
if (src && !src.startsWith("http") && !src.startsWith("data:")) {
|
|
203
|
+
try {
|
|
204
|
+
$el.attr("src", new URL(src, base).href);
|
|
205
|
+
}
|
|
206
|
+
catch {
|
|
207
|
+
// Ignore invalid URLs
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
});
|
|
211
|
+
// Resolve link hrefs
|
|
212
|
+
$("a").each((_, el) => {
|
|
213
|
+
const $el = $(el);
|
|
214
|
+
const href = $el.attr("href");
|
|
215
|
+
if (href &&
|
|
216
|
+
!href.startsWith("http") &&
|
|
217
|
+
!href.startsWith("#") &&
|
|
218
|
+
!href.startsWith("mailto:")) {
|
|
219
|
+
try {
|
|
220
|
+
$el.attr("href", new URL(href, base).href);
|
|
221
|
+
}
|
|
222
|
+
catch {
|
|
223
|
+
// Ignore invalid URLs
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
});
|
|
227
|
+
}
|
|
228
|
+
//# sourceMappingURL=html-cleaner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-cleaner.js","sourceRoot":"","sources":["../../src/optimizers/html-cleaner.ts"],"names":[],"mappings":"AAAA,iCAAiC;AAEjC,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AASnC;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,UAAwB,EAAE;IAChE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,kCAAkC;IAClC,CAAC,CAAC,yBAAyB,CAAC,CAAC,MAAM,EAAE,CAAC;IAEtC,6DAA6D;IAC7D,+CAA+C;IAC/C,IAAI,OAAO,CAAC,UAAU,KAAK,KAAK,EAAE,CAAC;QACjC,mBAAmB,CAAC,CAAC,CAAC,CAAC;IACzB,CAAC;IAED,qBAAqB;IACrB,CAAC,CAAC,GAAG,CAAC;SACH,QAAQ,EAAE;SACV,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAE,EAAwB,CAAC,IAAI,KAAK,SAAS,CAAC;SAC/D,MAAM,EAAE,CAAC;IAEZ,0FAA0F;IAC1F,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACpB,mBAAmB,CAAC,CAAC,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;IAC1C,CAAC;IAED,iDAAiD;IACjD,2DAA2D;IAC3D,eAAe,CAAC,CAAC,CAAC,CAAC;IAEnB,2EAA2E;IAC3E,mBAAmB,CAAC,CAAC,CAAC,CAAC;IAEvB,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;AAClB,CAAC;AAED,SAAS,mBAAmB,CAAC,CAAqB;IAChD,2BAA2B;IAC3B,CAAC,CACC;QACE,qBAAqB;QACrB,iBAAiB;QACjB,wBAAwB;QACxB,sBAAsB;QACtB,iBAAiB;KAClB,CAAC,IAAI,CAAC,GAAG,CAAC,CACZ,CAAC,MAAM,EAAE,CAAC;IAEX,qCAAqC;IACrC,mDAAmD;IACnD,MAAM,cAAc,GAAG;QACrB,4EAA4E;QAC5E,wBAAwB;QACxB,YAAY;QACZ,cAAc;QACd,YAAY;QACZ,wBAAwB;QACxB,aAAa;QACb,MAAM;QACN,OAAO;QAEP,gFAAgF;QAChF,uBAAuB;QACvB,4BAA4B;QAC5B,SAAS;QACT,SAAS;QACT,iBAAiB;QACjB,iBAAiB;QACjB,iBAAiB;QACjB,iBAAiB;QAEjB,WAAW;QACX,OAAO;QACP,aAAa;QACb,2BAA2B;QAC3B,UAAU;QAEV,MAAM;QACN,KAAK;QACL,MAAM;QACN,gBAAgB;QAChB,SAAS;QACT,aAAa;QACb,0BAA0B;QAC1B,iBAAiB;QACjB,sBAAsB;QAEtB,eAAe;QACf,SAAS;QACT,eAAe;QACf,gBAAgB;QAChB,eAAe;QAEf,WAAW;QACX,WAAW;QACX,WAAW;QACX,kBAAkB;QAElB,0BAA0B;QAC1B,UAAU;QACV,kBAAkB;QAClB,YAAY;QAEZ,gBAAgB;QAChB,QAAQ;QACR,QAAQ;QACR,UAAU;QACV,iBAAiB;QAEjB,iBAAiB;QACjB,gBAAgB;QAChB,gBAAgB;QAChB,iBAAiB;QAEjB,qBAAqB;QACrB,aAAa;QACb,YAAY;QACZ,cAAc;KACf,CAAC;IAEF,CAAC,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;IAErC,yCAAyC;IACzC,8EAA8E;IAC9E,sCAAsC;IACtC,CAAC,CAAC,GAAG,CAAC;SACH,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAChB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC
;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACtC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC;QAEtC,wEAAwE;QACxE,gEAAgE;QAChE,IAAI,UAAU,GAAG,GAAG;YAAE,OAAO,KAAK,CAAC;QAEnC,OAAO,CACL,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC;YAC9B,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC;YAC/B,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC;YAC5B,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC;YAC3B,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,CAC3B,CAAC;IACJ,CAAC,CAAC;SACD,MAAM,EAAE,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,CAAqB;IAC5C,yBAAyB;IACzB,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;QAC7B,MAAM;QACN,KAAK;QACL,KAAK;QACL,OAAO;QACP,SAAS;QACT,SAAS,EAAE,aAAa;QACxB,OAAO,EAAE,sBAAsB;KAChC,CAAC,CAAC;IAEH,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QAEzB,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;gBAClC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC9B,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;gBACvB,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,mBAAmB,CAAC,CAAqB;IAChD,8DAA8D;IAC9D,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,CAAC,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC;IACtE,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC;QAC1B,GAAG;QACH,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,OAAO;QACP,YAAY;QACZ,KAAK;QACL,MAAM;KACP,CAAC,CAAC;IAEH,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,OAAO,GAAI,EAA2B,CAAC,OAAO,EAAE,WAAW,EAAE,CAAC;QAEpE,sBAAsB;QACtB,IAAI,OAAO,IAAI,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC;YAAE,OAAO;QAElD,uBAAuB;QACvB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAC/B,MAAM,oBAAoB,GAAG,GAAG,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAChE,MAAM,cAAc,GAClB,GAAG,CAAC,IAAI,CAAC,2DAA2D,CAAC;aAClE,MAAM,GAAG,CAAC,CAAC;QAEhB,iFAAiF;QACjF,IAAI,OAAO,IAAI,WAAW,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACxC,6BAA6B;YAC7B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,oB
AAoB,EAAE,CAAC;gBAC/C,GAAG,CAAC,MAAM,EAAE,CAAC;gBACb,OAAO;YACT,CAAC;YAED,qEAAqE;YACrE,MAAM,cAAc,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;YACxD,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,oBAAoB,EAAE,CAAC;gBACzD,GAAG,CAAC,MAAM,EAAE,CAAC;gBACb,OAAO;YACT,CAAC;QACH,CAAC;QAED,0FAA0F;QAC1F,IAAI,CAAC,IAAI,IAAI,CAAC,oBAAoB,IAAI,CAAC,cAAc,EAAE,CAAC;YACtD,GAAG,CAAC,MAAM,EAAE,CAAC;QACf,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,mBAAmB,CAAC,CAAqB,EAAE,OAAe;IACjE,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC;IAE9B,wBAAwB;IACxB,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACtB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC5B,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAC/D,IAAI,CAAC;gBACH,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;YAC3C,CAAC;YAAC,MAAM,CAAC;gBACP,sBAAsB;YACxB,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,qBAAqB;IACrB,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAC9B,IACE,IAAI;YACJ,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;YACxB,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YACrB,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,EAC3B,CAAC;YACD,IAAI,CAAC;gBACH,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;YAC7C,CAAC;YAAC,MAAM,CAAC;gBACP,sBAAsB;YACxB,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Format markdown specifically for LLM consumption
|
|
3
|
+
* - Consistent spacing and structure
|
|
4
|
+
* - Clear section boundaries
|
|
5
|
+
* - Reduced noise and clutter
|
|
6
|
+
*/
|
|
7
|
+
export declare function formatForLLM(markdown: string): string;
|
|
8
|
+
//# sourceMappingURL=llm-formatter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm-formatter.d.ts","sourceRoot":"","sources":["../../src/optimizers/llm-formatter.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAmBrD"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
// src/optimizers/llm-formatter.ts
|
|
2
|
+
/**
 * Prepare a markdown document for consumption by a language model.
 *
 * The text is run through five clean-up passes, each one receiving the
 * output of the previous pass:
 *   1. heading-level normalisation
 *   2. list marker / list indentation normalisation
 *   3. inline emphasis clean-up
 *   4. code-fence spacing
 *   5. reference-style link inlining
 *
 * @param {string} markdown - Markdown text to format.
 * @returns {string} The formatted markdown.
 */
export function formatForLLM(markdown) {
    // Order matters: every pass operates on the result of the one before it.
    const passes = [
        normalizeHeadingLevels,
        normalizeListFormatting,
        cleanInlineFormatting,
        enhanceCodeBlocks,
        optimizeLinkFormatting,
    ];
    return passes.reduce((text, pass) => pass(text), markdown);
}
|
|
22
|
+
/**
 * Clamp the depth of ATX headings (`#` … `######`).
 *
 * A running level starts at 0, is set to the emitted level after every
 * heading, and is reset to 0 by every blank line. Each heading is emitted at
 * `min(its own level, running level + 2)` with exactly one space between the
 * `#` run and the title.
 *
 * Lines inside fenced code blocks (``` or ~~~) are copied through verbatim so
 * that shell comments or markdown samples inside code are never rewritten.
 * An unterminated fence protects the remainder of the document.
 *
 * NOTE(review): the original comment says "don't skip heading levels", yet the
 * clamp allows a jump of two and the blank-line reset caps any heading that
 * follows a blank line at level 2. That behaviour is preserved here because
 * callers may depend on it; confirm the intended rule before tightening it.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with heading depths clamped.
 */
function normalizeHeadingLevels(markdown) {
    const headingPattern = /^(#{1,6})\s+(.+)$/;
    const fencePattern = /^[ \t]*(`{3,}|~{3,})([\s\S]*)$/;
    const result = [];
    let currentLevel = 0;
    // Marker of the fence we are currently inside, or null when outside.
    let openFence = null;
    for (const line of markdown.split("\n")) {
        const fence = fencePattern.exec(line);
        if (fence) {
            const marker = fence[1];
            const rest = fence[2];
            if (openFence === null) {
                // A backtick fence's info string may not contain backticks;
                // otherwise the line is an inline code span, not a fence.
                if (marker[0] !== "`" || !rest.includes("`")) {
                    openFence = marker;
                }
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                rest.trim() === "") {
                openFence = null;
            }
        }
        // Heading detection is suspended while inside a fenced block.
        const heading = openFence === null ? headingPattern.exec(line) : null;
        if (heading) {
            const level = heading[1].length;
            const title = heading[2];
            const normalizedLevel = Math.min(level, currentLevel + 2);
            currentLevel = normalizedLevel;
            result.push("#".repeat(normalizedLevel) + " " + title);
        }
        else {
            result.push(line);
            if (line.trim() === "") {
                currentLevel = 0; // Reset on blank section
            }
        }
    }
    return result.join("\n");
}
|
|
45
|
+
/**
 * Normalise unordered list items: every `*`, `+` or `-` bullet becomes `-`,
 * and nesting is re-indented to two spaces per level.
 *
 * The nesting depth of an item is `floor(leading whitespace characters / 2)`.
 * Trailing whitespace on list lines is trimmed.
 *
 * Left untouched:
 *   - blank lines and any line that is not a bullet item,
 *   - thematic breaks such as `* * *` or `- - -`,
 *   - everything inside fenced code blocks (``` or ~~~), so code such as
 *     ` * doc comment` lines is not mistaken for a list.
 *
 * Work is done strictly line by line so that a pattern can never swallow the
 * blank line that precedes a list.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with consistent list markers and indentation.
 */
function normalizeListFormatting(markdown) {
    const fencePattern = /^[ \t]*(`{3,}|~{3,})([\s\S]*)$/;
    const thematicBreak = /^[ \t]*([-*_])(?:[ \t]*\1){2,}\s*$/;
    // Marker, then either whitespace or the end of the line (an empty item).
    const listItem = /^([ \t]*)[-*+](?:[ \t]+|(?=\s*$))/;
    // Marker of the fence we are currently inside, or null when outside.
    let openFence = null;
    return markdown
        .split("\n")
        .map((line) => {
            const fence = fencePattern.exec(line);
            if (fence) {
                const marker = fence[1];
                const rest = fence[2];
                if (openFence === null) {
                    // Backticks in the info string mean an inline code span.
                    if (marker[0] !== "`" || !rest.includes("`")) {
                        openFence = marker;
                    }
                }
                else if (marker[0] === openFence[0] &&
                    marker.length >= openFence.length &&
                    rest.trim() === "") {
                    openFence = null;
                }
                return line;
            }
            if (openFence !== null || thematicBreak.test(line)) {
                return line;
            }
            const item = listItem.exec(line);
            if (item === null) {
                return line;
            }
            const depth = Math.floor(item[1].length / 2);
            const content = line.slice(item[0].length).trim();
            return "  ".repeat(depth) + "- " + content;
        })
        .join("\n");
}
|
|
63
|
+
/**
 * Tidy inline emphasis markers.
 *
 * 1. Runs of three or more asterisks around text collapse to bold:
 *    `***text***` becomes `**text**`.
 * 2. Padding just inside a matched pair of emphasis markers is removed:
 *    `** text **` becomes `**text**` and `* text *` becomes `*text*`.
 *
 * Step 2 only fires when the opening and closing markers are identical, sit
 * on the same line, and are not glued to a word character or another
 * asterisk on their outer side. Without those guards the pattern could begin
 * at the closing marker of one span and end at the opening marker of the
 * next, turning `**a** and **b**` into `**a**and**b**`.
 *
 * NOTE(review): a space-separated expression such as `2 * 3 * 4` is still
 * indistinguishable from padded emphasis and becomes `2 *3* 4`.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with cleaned-up emphasis markers.
 */
function cleanInlineFormatting(markdown) {
    // ***text*** -> **text**
    const collapsed = markdown.replace(/\*{3,}(.+?)\*{3,}/g, "**$1**");
    // "** text **" -> "**text**" (same marker on both sides, single line only)
    return collapsed.replace(/(?<![\w*])(\*{1,2})[ \t]+([^*\n]+?)[ \t]+\1(?![\w*])/g, "$1$2$1");
}
|
|
72
|
+
/**
 * Make sure backtick code fences sit on their own lines.
 *
 * The text is scanned line by line while tracking whether we are inside a
 * fenced block:
 *   - An opening fence glued to preceding text (`Intro:```js`) is moved to
 *     its own line, separated from that text by a blank line. The info
 *     string (`js`) stays attached to the fence.
 *   - A closing fence glued to the last code line (`code````) is moved to
 *     its own line.
 *   - Text that directly follows a closing fence (` ```after`) is moved to
 *     the next line.
 *
 * Left untouched: fences that are already well-formed (including indented
 * ones), inline spans written with triple backticks (`a ```x``` b`), shorter
 * backtick runs nested inside a longer fence, and backtick runs in the
 * middle of a code line.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with fences isolated on their own lines.
 */
function enhanceCodeBlocks(markdown) {
    const output = [];
    // Length of the currently open fence; 0 while outside a fenced block.
    let fenceLength = 0;
    for (const line of markdown.split("\n")) {
        const run = /`{3,}/.exec(line);
        if (run === null) {
            output.push(line);
            continue;
        }
        const ticks = run[0];
        const before = line.slice(0, run.index);
        const after = line.slice(run.index + ticks.length);
        if (fenceLength === 0) {
            // More backticks after the run: an inline code span, not a fence.
            if (after.includes("`")) {
                output.push(line);
                continue;
            }
            if (before.trim() === "") {
                output.push(line);
            }
            else {
                output.push(before.trimEnd(), "", ticks + after);
            }
            fenceLength = ticks.length;
            continue;
        }
        // Inside a block: only a run at least as long as the opener, located
        // at the start or the end of the line, closes it.
        const closes = ticks.length >= fenceLength &&
            (before.trim() === "" || after.trim() === "");
        if (!closes) {
            output.push(line);
            continue;
        }
        fenceLength = 0;
        if (before.trim() !== "") {
            // Code glued in front of the closing fence.
            output.push(before, ticks);
        }
        else if (after.trim() !== "") {
            // Text glued behind the closing fence.
            output.push(before + ticks, after.trimStart());
        }
        else {
            output.push(line);
        }
    }
    return output.join("\n");
}
|
|
78
|
+
/**
 * Convert reference-style links to inline links.
 *
 * 1. Every link reference definition (`[label]: target`) is collected into a
 *    case-insensitive map and its text is removed (the now-empty line stays).
 *    Footnote definitions (`[^1]: …`) are NOT link definitions and are left
 *    in place. The target must be on the same line as the label.
 * 2. Full (`[text][label]`) and collapsed (`[text][]`) references are
 *    rewritten to `[text](target)`.
 * 3. Shortcut references (`[label]`) are rewritten as well, so removing the
 *    definitions never leaves a dangling reference behind.
 *
 * References without a matching definition are left exactly as written. When
 * a label is defined more than once, the last definition wins.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} Markdown with reference links inlined.
 */
function optimizeLinkFormatting(markdown) {
    const definitions = new Map();
    // `(?!\^)` skips footnotes; `[ \t]*` + `\S` keep the match on one line.
    const body = markdown.replace(/^\[(?!\^)([^\]]+)\]:[ \t]*(\S.*)$/gm, (_match, label, target) => {
        definitions.set(label.toLowerCase(), target.trim());
        return "";
    });
    if (definitions.size === 0) {
        return body;
    }
    const inline = (original, text, label) => {
        const url = definitions.get(label.toLowerCase());
        return url ? `[${text}](${url})` : original;
    };
    // [text][label] and [text][]
    const withFullRefs = body.replace(/\[([^\]]+)\]\[([^\]]*)\]/g, (match, text, ref) => inline(match, text, ref || text));
    // [label] - but not the text of an inline link, not either half of an
    // unresolved [a][b] pair, and not something followed by a colon.
    return withFullRefs.replace(/(?<!\])\[([^\[\]]+)\](?![\[(:])/g, (match, text) => inline(match, text, text));
}
|
|
94
|
+
//# sourceMappingURL=llm-formatter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm-formatter.js","sourceRoot":"","sources":["../../src/optimizers/llm-formatter.ts"],"names":[],"mappings":"AAAA,kCAAkC;AAElC;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,QAAgB;IAC3C,IAAI,SAAS,GAAG,QAAQ,CAAC;IAEzB,yDAAyD;IACzD,SAAS,GAAG,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAE9C,8BAA8B;IAC9B,SAAS,GAAG,uBAAuB,CAAC,SAAS,CAAC,CAAC;IAE/C,4DAA4D;IAC5D,SAAS,GAAG,qBAAqB,CAAC,SAAS,CAAC,CAAC;IAE7C,2CAA2C;IAC3C,SAAS,GAAG,iBAAiB,CAAC,SAAS,CAAC,CAAC;IAEzC,sCAAsC;IACtC,SAAS,GAAG,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAE9C,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,sBAAsB,CAAC,QAAgB;IAC9C,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,YAAY,GAAG,CAAC,CAAC;IAErB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAE9C,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YAC9B,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YAEvB,4BAA4B;YAC5B,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;YAC1D,YAAY,GAAG,eAAe,CAAC;YAE/B,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,eAAe,CAAC,GAAG,GAAG,GAAG,KAAK,CAAC,CAAC;QACzD,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;gBACvB,YAAY,GAAG,CAAC,CAAC,CAAC,yBAAyB;YAC7C,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC3B,CAAC;AAED,SAAS,uBAAuB,CAAC,QAAgB;IAC/C,IAAI,MAAM,GAAG,QAAQ,CAAC;IAEtB,uDAAuD;IACvD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,eAAe,EAAE,IAAI,CAAC,CAAC;IAE/C,wDAAwD;IACxD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEjC,MAAM,GAAG,KAAK;SACX,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACZ,IAAI,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAChC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;YACrD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACrC,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;QAC1E,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC;SACD,IAAI,CAAC,IAAI,CAAC
,CAAC;IAEd,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,IAAI,MAAM,GAAG,QAAQ,CAAC;IAEtB,uEAAuE;IACvE,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,oBAAoB,EAAE,QAAQ,CAAC,CAAC;IAExD,iFAAiF;IACjF,oFAAoF;IACpF,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,mCAAmC,EAAE,QAAQ,CAAC,CAAC;IAEvE,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,iBAAiB,CAAC,QAAgB;IACzC,yDAAyD;IACzD,IAAI,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE,WAAW,CAAC,CAAC;IAC1D,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,aAAa,EAAE,SAAS,CAAC,CAAC;IAElD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,sBAAsB,CAAC,QAAgB;IAC9C,8DAA8D;IAC9D,MAAM,KAAK,GAAwB,IAAI,GAAG,EAAE,CAAC;IAE7C,gCAAgC;IAChC,IAAI,MAAM,GAAG,QAAQ,CAAC,OAAO,CAC3B,0BAA0B,EAC1B,CAAC,MAAM,EAAE,GAAW,EAAE,GAAW,EAAE,EAAE;QACnC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,OAAO,EAAE,CAAC;IACZ,CAAC,CACF,CAAC;IAEF,4CAA4C;IAC5C,MAAM,GAAG,MAAM,CAAC,OAAO,CACrB,2BAA2B,EAC3B,CAAC,KAAK,EAAE,IAAY,EAAE,GAAW,EAAE,EAAE;QACnC,MAAM,MAAM,GAAG,CAAC,GAAG,IAAI,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;QAC3C,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAC9B,OAAO,GAAG,CAAC,CAAC,CAAC,IAAI,IAAI,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;IAC3C,CAAC,CACF,CAAC;IAEF,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Enhance HTML structure for better markdown conversion
|
|
3
|
+
* - Improve heading hierarchy
|
|
4
|
+
* - Clean up nested elements
|
|
5
|
+
* - Normalize structure
|
|
6
|
+
*/
|
|
7
|
+
export declare function enhanceStructure(html: string): string;
|
|
8
|
+
//# sourceMappingURL=structure-enhancer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structure-enhancer.d.ts","sourceRoot":"","sources":["../../src/optimizers/structure-enhancer.ts"],"names":[],"mappings":"AAIA;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAarD"}
|