@nanocollective/get-md 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/LICENSE +36 -0
  2. package/README.md +205 -0
  3. package/bin/get-md.js +4 -0
  4. package/dist/cli.d.ts +3 -0
  5. package/dist/cli.d.ts.map +1 -0
  6. package/dist/cli.js +91 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/cli.spec.d.ts +2 -0
  9. package/dist/cli.spec.d.ts.map +1 -0
  10. package/dist/cli.spec.js +278 -0
  11. package/dist/cli.spec.js.map +1 -0
  12. package/dist/config.d.ts +5 -0
  13. package/dist/config.d.ts.map +1 -0
  14. package/dist/config.js +6 -0
  15. package/dist/config.js.map +1 -0
  16. package/dist/extractors/metadata-extractor.d.ts +6 -0
  17. package/dist/extractors/metadata-extractor.d.ts.map +1 -0
  18. package/dist/extractors/metadata-extractor.js +131 -0
  19. package/dist/extractors/metadata-extractor.js.map +1 -0
  20. package/dist/index.d.ts +44 -0
  21. package/dist/index.d.ts.map +1 -0
  22. package/dist/index.js +70 -0
  23. package/dist/index.js.map +1 -0
  24. package/dist/optimizers/html-cleaner.d.ts +12 -0
  25. package/dist/optimizers/html-cleaner.d.ts.map +1 -0
  26. package/dist/optimizers/html-cleaner.js +228 -0
  27. package/dist/optimizers/html-cleaner.js.map +1 -0
  28. package/dist/optimizers/llm-formatter.d.ts +8 -0
  29. package/dist/optimizers/llm-formatter.d.ts.map +1 -0
  30. package/dist/optimizers/llm-formatter.js +94 -0
  31. package/dist/optimizers/llm-formatter.js.map +1 -0
  32. package/dist/optimizers/structure-enhancer.d.ts +8 -0
  33. package/dist/optimizers/structure-enhancer.d.ts.map +1 -0
  34. package/dist/optimizers/structure-enhancer.js +92 -0
  35. package/dist/optimizers/structure-enhancer.js.map +1 -0
  36. package/dist/parsers/markdown-parser.d.ts +16 -0
  37. package/dist/parsers/markdown-parser.d.ts.map +1 -0
  38. package/dist/parsers/markdown-parser.js +369 -0
  39. package/dist/parsers/markdown-parser.js.map +1 -0
  40. package/dist/types.d.ts +115 -0
  41. package/dist/types.d.ts.map +1 -0
  42. package/dist/types.js +3 -0
  43. package/dist/types.js.map +1 -0
  44. package/dist/utils/url-fetcher.d.ts +10 -0
  45. package/dist/utils/url-fetcher.d.ts.map +1 -0
  46. package/dist/utils/url-fetcher.js +54 -0
  47. package/dist/utils/url-fetcher.js.map +1 -0
  48. package/dist/utils/validators.d.ts +5 -0
  49. package/dist/utils/validators.d.ts.map +1 -0
  50. package/dist/utils/validators.js +23 -0
  51. package/dist/utils/validators.js.map +1 -0
  52. package/package.json +104 -0
@@ -0,0 +1 @@
1
+ {"version":3,"file":"config.js","sourceRoot":"","sources":["../src/config.ts"],"names":[],"mappings":"AAAA,gBAAgB;AAEhB,2CAA2C;AAC3C,MAAM,CAAC,MAAM,kBAAkB,GAC7B,kFAAkF,CAAC;AAErF,oDAAoD;AACpD,MAAM,CAAC,MAAM,qBAAqB,GAAG,KAAK,CAAC"}
@@ -0,0 +1,6 @@
1
+ import type { ContentMetadata } from "../types.js";
2
+ /**
3
+ * Extract metadata from HTML
4
+ */
5
+ export declare function extractMetadata(html: string, baseUrl?: string): ContentMetadata;
6
+ //# sourceMappingURL=metadata-extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metadata-extractor.d.ts","sourceRoot":"","sources":["../../src/extractors/metadata-extractor.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAEnD;;GAEG;AACH,wBAAgB,eAAe,CAC7B,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,MAAM,GACf,eAAe,CAajB"}
@@ -0,0 +1,131 @@
1
+ // src/extractors/metadata-extractor.ts
2
+ import * as cheerio from "cheerio";
3
/**
 * Collect page-level metadata from an HTML document.
 *
 * The markup is parsed once with cheerio and the resulting document is
 * handed to each field-specific extractor in turn.
 *
 * @param html - HTML markup to inspect
 * @param baseUrl - optional URL forwarded to the canonical-URL lookup
 * @returns an object holding title, author, excerpt, siteName,
 *          publishedTime, language and canonicalUrl (each may be undefined)
 */
export function extractMetadata(html, baseUrl) {
    const doc = cheerio.load(html);
    const title = extractTitle(doc);
    const author = extractAuthor(doc);
    const excerpt = extractExcerpt(doc);
    const siteName = extractSiteName(doc);
    const publishedTime = extractPublishedTime(doc);
    const language = extractLanguage(doc);
    const canonicalUrl = extractCanonicalUrl(doc, baseUrl);
    // wordCount and readingTime are not produced here; they are calculated
    // from the final markdown.
    return {
        title,
        author,
        excerpt,
        siteName,
        publishedTime,
        language,
        canonicalUrl,
    };
}
19
+ function extractTitle($) {
20
+ // Try Open Graph
21
+ const ogTitle = $('meta[property="og:title"]').attr("content");
22
+ if (ogTitle)
23
+ return ogTitle;
24
+ // Try Twitter
25
+ const twitterTitle = $('meta[name="twitter:title"]').attr("content");
26
+ if (twitterTitle)
27
+ return twitterTitle;
28
+ // Try regular title tag
29
+ const titleTag = $("title").text();
30
+ if (titleTag)
31
+ return titleTag.trim();
32
+ // Try first h1
33
+ const h1 = $("h1").first().text();
34
+ if (h1)
35
+ return h1.trim();
36
+ return undefined;
37
+ }
38
+ function extractAuthor($) {
39
+ // Try meta author tag
40
+ const metaAuthor = $('meta[name="author"]').attr("content");
41
+ if (metaAuthor)
42
+ return metaAuthor;
43
+ // Try article:author
44
+ const articleAuthor = $('meta[property="article:author"]').attr("content");
45
+ if (articleAuthor)
46
+ return articleAuthor;
47
+ // Try rel="author"
48
+ const relAuthor = $('[rel="author"]').text().trim();
49
+ if (relAuthor)
50
+ return relAuthor;
51
+ // Try common class names
52
+ const byline = $(".author, .byline, .author-name").first().text().trim();
53
+ if (byline)
54
+ return byline;
55
+ return undefined;
56
+ }
57
+ function extractExcerpt($) {
58
+ // Try Open Graph
59
+ const ogDesc = $('meta[property="og:description"]').attr("content");
60
+ if (ogDesc)
61
+ return ogDesc;
62
+ // Try meta description
63
+ const metaDesc = $('meta[name="description"]').attr("content");
64
+ if (metaDesc)
65
+ return metaDesc;
66
+ // Try Twitter
67
+ const twitterDesc = $('meta[name="twitter:description"]').attr("content");
68
+ if (twitterDesc)
69
+ return twitterDesc;
70
+ return undefined;
71
+ }
72
+ function extractSiteName($) {
73
+ // Try Open Graph
74
+ const ogSite = $('meta[property="og:site_name"]').attr("content");
75
+ if (ogSite)
76
+ return ogSite;
77
+ // Try application name
78
+ const appName = $('meta[name="application-name"]').attr("content");
79
+ if (appName)
80
+ return appName;
81
+ return undefined;
82
+ }
83
+ function extractPublishedTime($) {
84
+ // Try article:published_time
85
+ const articleTime = $('meta[property="article:published_time"]').attr("content");
86
+ if (articleTime)
87
+ return articleTime;
88
+ // Try time element with datetime
89
+ const timeEl = $("time[datetime]").first().attr("datetime");
90
+ if (timeEl)
91
+ return timeEl;
92
+ // Try datePublished
93
+ const datePublished = $('[itemprop="datePublished"]').first().attr("content");
94
+ if (datePublished)
95
+ return datePublished;
96
+ return undefined;
97
+ }
98
+ function extractLanguage($) {
99
+ // Try html lang attribute
100
+ const htmlLang = $("html").attr("lang");
101
+ if (htmlLang)
102
+ return htmlLang;
103
+ // Try meta content-language
104
+ const metaLang = $('meta[http-equiv="content-language"]').attr("content");
105
+ if (metaLang)
106
+ return metaLang;
107
+ return undefined;
108
+ }
109
+ function extractCanonicalUrl($, baseUrl) {
110
+ // Try link rel="canonical"
111
+ const canonical = $('link[rel="canonical"]').attr("href");
112
+ if (canonical) {
113
+ if (canonical.startsWith("http"))
114
+ return canonical;
115
+ if (baseUrl) {
116
+ try {
117
+ return new URL(canonical, baseUrl).href;
118
+ }
119
+ catch {
120
+ // Ignore invalid URLs
121
+ }
122
+ }
123
+ return canonical;
124
+ }
125
+ // Try Open Graph URL
126
+ const ogUrl = $('meta[property="og:url"]').attr("content");
127
+ if (ogUrl)
128
+ return ogUrl;
129
+ return baseUrl;
130
+ }
131
+ //# sourceMappingURL=metadata-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metadata-extractor.js","sourceRoot":"","sources":["../../src/extractors/metadata-extractor.ts"],"names":[],"mappings":"AAAA,uCAAuC;AAEvC,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAGnC;;GAEG;AACH,MAAM,UAAU,eAAe,CAC7B,IAAY,EACZ,OAAgB;IAEhB,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,OAAO;QACL,KAAK,EAAE,YAAY,CAAC,CAAC,CAAC;QACtB,MAAM,EAAE,aAAa,CAAC,CAAC,CAAC;QACxB,OAAO,EAAE,cAAc,CAAC,CAAC,CAAC;QAC1B,QAAQ,EAAE,eAAe,CAAC,CAAC,CAAC;QAC5B,aAAa,EAAE,oBAAoB,CAAC,CAAC,CAAC;QACtC,QAAQ,EAAE,eAAe,CAAC,CAAC,CAAC;QAC5B,YAAY,EAAE,mBAAmB,CAAC,CAAC,EAAE,OAAO,CAAC;QAC7C,mEAAmE;KACpE,CAAC;AACJ,CAAC;AAED,SAAS,YAAY,CAAC,CAAqB;IACzC,iBAAiB;IACjB,MAAM,OAAO,GAAG,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC/D,IAAI,OAAO;QAAE,OAAO,OAAO,CAAC;IAE5B,cAAc;IACd,MAAM,YAAY,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACrE,IAAI,YAAY;QAAE,OAAO,YAAY,CAAC;IAEtC,wBAAwB;IACxB,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;IACnC,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC,IAAI,EAAE,CAAC;IAErC,eAAe;IACf,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC;IAClC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC,IAAI,EAAE,CAAC;IAEzB,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,aAAa,CAAC,CAAqB;IAC1C,sBAAsB;IACtB,MAAM,UAAU,GAAG,CAAC,CAAC,qBAAqB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC5D,IAAI,UAAU;QAAE,OAAO,UAAU,CAAC;IAElC,qBAAqB;IACrB,MAAM,aAAa,GAAG,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC3E,IAAI,aAAa;QAAE,OAAO,aAAa,CAAC;IAExC,mBAAmB;IACnB,MAAM,SAAS,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IACpD,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC;IAEhC,yBAAyB;IACzB,MAAM,MAAM,GAAG,CAAC,CAAC,gCAAgC,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IACzE,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,cAAc,CAAC,CAAqB;IAC3C,iBAAiB;IACjB,MAAM,MAAM,GAAG,CAAC,CAAC,iCAAiC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACpE,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,uBAAuB;IACvB,MAAM,QAAQ,GAAG,CAAC,CAAC,0BAA0B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC/D,IAAI,QAAQ;QAAE,OAA
O,QAAQ,CAAC;IAE9B,cAAc;IACd,MAAM,WAAW,GAAG,CAAC,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1E,IAAI,WAAW;QAAE,OAAO,WAAW,CAAC;IAEpC,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,CAAqB;IAC5C,iBAAiB;IACjB,MAAM,MAAM,GAAG,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAClE,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,uBAAuB;IACvB,MAAM,OAAO,GAAG,CAAC,CAAC,+BAA+B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IACnE,IAAI,OAAO;QAAE,OAAO,OAAO,CAAC;IAE5B,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,oBAAoB,CAAC,CAAqB;IACjD,6BAA6B;IAC7B,MAAM,WAAW,GAAG,CAAC,CAAC,yCAAyC,CAAC,CAAC,IAAI,CACnE,SAAS,CACV,CAAC;IACF,IAAI,WAAW;QAAE,OAAO,WAAW,CAAC;IAEpC,iCAAiC;IACjC,MAAM,MAAM,GAAG,CAAC,CAAC,gBAAgB,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAC5D,IAAI,MAAM;QAAE,OAAO,MAAM,CAAC;IAE1B,oBAAoB;IACpB,MAAM,aAAa,GAAG,CAAC,CAAC,4BAA4B,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC9E,IAAI,aAAa;QAAE,OAAO,aAAa,CAAC;IAExC,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,eAAe,CAAC,CAAqB;IAC5C,0BAA0B;IAC1B,MAAM,QAAQ,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACxC,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC;IAE9B,4BAA4B;IAC5B,MAAM,QAAQ,GAAG,CAAC,CAAC,qCAAqC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC1E,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC;IAE9B,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,mBAAmB,CAC1B,CAAqB,EACrB,OAAgB;IAEhB,2BAA2B;IAC3B,MAAM,SAAS,GAAG,CAAC,CAAC,uBAAuB,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC1D,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC;YAAE,OAAO,SAAS,CAAC;QACnD,IAAI,OAAO,EAAE,CAAC;YACZ,IAAI,CAAC;gBACH,OAAO,IAAI,GAAG,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;YAC1C,CAAC;YAAC,MAAM,CAAC;gBACP,sBAAsB;YACxB,CAAC;QACH,CAAC;QACD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,qBAAqB;IACrB,MAAM,KAAK,GAAG,CAAC,CAAC,yBAAyB,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC3D,IAAI,KAAK;QAAE,OAAO,KAAK,CAAC;IAExB,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,44 @@
1
+ import type { MarkdownOptions, MarkdownResult, ContentMetadata, TurndownRule, ConversionStats } from "./types.js";
2
+ /**
3
+ * Convert HTML to clean, LLM-optimized Markdown
4
+ *
5
+ * @param html - Raw HTML string or URL to fetch
6
+ * @param options - Conversion options (including fetch options for URLs)
7
+ * @returns Promise resolving to markdown result
8
+ *
9
+ * @example
10
+ * ```typescript
11
+ * import { convertToMarkdown } from '@nanocollective/get-md';
12
+ *
13
+ * // From HTML string
14
+ * const result = await convertToMarkdown('<h1>Hello</h1><p>World</p>');
15
+ * console.log(result.markdown);
16
+ * // # Hello
17
+ * //
18
+ * // World
19
+ *
20
+ * // From URL
21
+ * const result = await convertToMarkdown('https://example.com');
22
+ * console.log(result.metadata.title);
23
+ *
24
+ * // From URL with custom fetch options
25
+ * const result = await convertToMarkdown('https://example.com', {
26
+ * timeout: 10000,
27
+ * headers: { 'Authorization': 'Bearer token' },
28
+ * llmOptimized: true
29
+ * });
30
+ *
31
+ * // Force URL mode if auto-detection fails
32
+ * const result = await convertToMarkdown('example.com', { isUrl: true });
33
+ * ```
34
+ */
35
+ export declare function convertToMarkdown(html: string, options?: MarkdownOptions): Promise<MarkdownResult>;
36
+ /**
37
+ * Validate if HTML contains extractable content
38
+ *
39
+ * @param html - Raw HTML string
40
+ * @returns Whether content can be extracted
41
+ */
42
+ export declare function hasContent(html: string): boolean;
43
+ export type { MarkdownOptions, MarkdownResult, ContentMetadata, ConversionStats, TurndownRule, };
44
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EACV,eAAe,EACf,cAAc,EACd,eAAe,EACf,YAAY,EACZ,eAAe,EAChB,MAAM,YAAY,CAAC;AAGpB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAsB,iBAAiB,CACrC,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE,eAAe,GACxB,OAAO,CAAC,cAAc,CAAC,CAyBzB;AAED;;;;;GAKG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAEhD;AAGD,YAAY,EACV,eAAe,EACf,cAAc,EACd,eAAe,EACf,eAAe,EACf,YAAY,GACb,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,70 @@
1
+ // src/index.ts
2
+ import { MarkdownParser } from "./parsers/markdown-parser.js";
3
+ import { fetchUrl, isValidUrl } from "./utils/url-fetcher.js";
4
+ import { hasContent as hasContentUtil } from "./utils/validators.js";
5
+ /**
6
+ * Convert HTML to clean, LLM-optimized Markdown
7
+ *
8
+ * @param html - Raw HTML string or URL to fetch
9
+ * @param options - Conversion options (including fetch options for URLs)
10
+ * @returns Promise resolving to markdown result
11
+ *
12
+ * @example
13
+ * ```typescript
14
+ * import { convertToMarkdown } from '@nanocollective/get-md';
15
+ *
16
+ * // From HTML string
17
+ * const result = await convertToMarkdown('<h1>Hello</h1><p>World</p>');
18
+ * console.log(result.markdown);
19
+ * // # Hello
20
+ * //
21
+ * // World
22
+ *
23
+ * // From URL
24
+ * const result = await convertToMarkdown('https://example.com');
25
+ * console.log(result.metadata.title);
26
+ *
27
+ * // From URL with custom fetch options
28
+ * const result = await convertToMarkdown('https://example.com', {
29
+ * timeout: 10000,
30
+ * headers: { 'Authorization': 'Bearer token' },
31
+ * llmOptimized: true
32
+ * });
33
+ *
34
+ * // Force URL mode if auto-detection fails
35
+ * const result = await convertToMarkdown('example.com', { isUrl: true });
36
+ * ```
37
+ */
38
export async function convertToMarkdown(html, options) {
    // The input is handled as a URL when the caller forces it via `isUrl`
    // or when it passes URL validation.
    const treatAsUrl = options?.isUrl || isValidUrl(html);
    if (!treatAsUrl) {
        // Plain HTML input: convert it directly.
        const htmlParser = new MarkdownParser();
        return htmlParser.convert(html, options);
    }
    // Only the fetch-related settings are forwarded to the fetcher.
    const { timeout, followRedirects, maxRedirects, headers, userAgent } = options ?? {};
    const pageHtml = await fetchUrl(html, {
        timeout,
        followRedirects,
        maxRedirects,
        headers,
        userAgent,
    });
    // Convert the downloaded page; unless the caller supplied a base URL,
    // the fetched address itself is used as the base.
    const urlParser = new MarkdownParser();
    return urlParser.convert(pageHtml, {
        ...options,
        baseUrl: options?.baseUrl || html,
    });
}
61
/**
 * Check whether an HTML string contains extractable content.
 *
 * Public wrapper that forwards to the validator utility.
 *
 * @param html - Raw HTML string
 * @returns true when content can be extracted, false otherwise
 */
export function hasContent(html) {
    const extractable = hasContentUtil(html);
    return extractable;
}
70
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,eAAe;AAEf,OAAO,EAAE,cAAc,EAAE,MAAM,8BAA8B,CAAC;AAC9D,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,UAAU,IAAI,cAAc,EAAE,MAAM,uBAAuB,CAAC;AAUrE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,IAAY,EACZ,OAAyB;IAEzB,2DAA2D;IAC3D,IAAI,OAAO,EAAE,KAAK,IAAI,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,wBAAwB;QACxB,MAAM,YAAY,GAAiB;YACjC,OAAO,EAAE,OAAO,EAAE,OAAO;YACzB,eAAe,EAAE,OAAO,EAAE,eAAe;YACzC,YAAY,EAAE,OAAO,EAAE,YAAY;YACnC,OAAO,EAAE,OAAO,EAAE,OAAO;YACzB,SAAS,EAAE,OAAO,EAAE,SAAS;SAC9B,CAAC;QAEF,sBAAsB;QACtB,MAAM,WAAW,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;QAEvD,6CAA6C;QAC7C,MAAM,MAAM,GAAG,IAAI,cAAc,EAAE,CAAC;QACpC,OAAO,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE;YACjC,GAAG,OAAO;YACV,OAAO,EAAE,OAAO,EAAE,OAAO,IAAI,IAAI;SAClC,CAAC,CAAC;IACL,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,cAAc,EAAE,CAAC;IACpC,OAAO,MAAM,CAAC,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;AACvC,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,UAAU,CAAC,IAAY;IACrC,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC;AAC9B,CAAC"}
@@ -0,0 +1,12 @@
1
+ interface CleanOptions {
2
+ /** Remove ads, navigation, social media, etc. */
3
+ aggressive?: boolean;
4
+ /** Base URL for resolving relative URLs */
5
+ baseUrl?: string;
6
+ }
7
+ /**
8
+ * Aggressively clean HTML to remove noise for LLMs
9
+ */
10
+ export declare function cleanHTML(html: string, options?: CleanOptions): string;
11
+ export {};
12
+ //# sourceMappingURL=html-cleaner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-cleaner.d.ts","sourceRoot":"","sources":["../../src/optimizers/html-cleaner.ts"],"names":[],"mappings":"AAIA,UAAU,YAAY;IACpB,iDAAiD;IACjD,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,2CAA2C;IAC3C,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,GAAE,YAAiB,GAAG,MAAM,CA+B1E"}
@@ -0,0 +1,228 @@
1
+ // src/optimizers/html-cleaner.ts
2
+ import * as cheerio from "cheerio";
3
/**
 * Strip an HTML document down to the markup that matters for LLM input.
 *
 * Steps run in a fixed order because later steps rely on earlier ones:
 * noise removal needs class/id attributes, URL resolution needs src/href,
 * and empty-element pruning runs on the fully cleaned tree.
 *
 * @param html - HTML markup to clean
 * @param options - `aggressive` (noise removal runs unless it is exactly
 *                  `false`) and `baseUrl` (enables relative URL resolution)
 * @returns the cleaned document serialised back to HTML
 */
export function cleanHTML(html, options = {}) {
    const doc = cheerio.load(html);
    // Step 1: drop scripts, styles and noscript fallbacks.
    doc("script, style, noscript").remove();
    // Step 2: drop common noise elements while class/id selectors still work.
    const skipNoiseRemoval = options.aggressive === false;
    if (!skipNoiseRemoval) {
        removeNoiseElements(doc);
    }
    // Step 3: drop comment nodes found among the children of any element.
    const isCommentNode = (_index, node) => node.type === "comment";
    doc("*").contents().filter(isCommentNode).remove();
    // Step 4: make src/href absolute before attributes are stripped.
    if (options.baseUrl) {
        resolveRelativeUrls(doc, options.baseUrl);
    }
    // Step 5: keep only the essential attributes.
    cleanAttributes(doc);
    // Step 6: prune elements left empty by the previous steps.
    removeEmptyElements(doc);
    return doc.html();
}
31
+ function removeNoiseElements($) {
32
+ // Remove by role attribute
33
+ $([
34
+ '[role="navigation"]',
35
+ '[role="banner"]',
36
+ '[role="complementary"]',
37
+ '[role="contentinfo"]',
38
+ '[role="search"]',
39
+ ].join(",")).remove();
40
+ // Remove by common class/id patterns
41
+ // More specific selectors to avoid false positives
42
+ const noiseSelectors = [
43
+ // Navigation elements (but not components that just have 'nav' in the name)
44
+ 'nav[role="navigation"]',
45
+ "nav.navbar",
46
+ "nav.nav-menu",
47
+ "div.navbar",
48
+ 'div[role="navigation"]',
49
+ "#navigation",
50
+ "#nav",
51
+ "#menu",
52
+ // Headers/Footers - only actual header/footer elements or very specific classes
53
+ 'header[role="banner"]',
54
+ 'footer[role="contentinfo"]',
55
+ "#header",
56
+ "#footer",
57
+ "div.site-header",
58
+ "div.site-footer",
59
+ "div.page-header",
60
+ "div.page-footer",
61
+ // Sidebars
62
+ "aside",
63
+ "div.sidebar",
64
+ 'div[role="complementary"]',
65
+ "#sidebar",
66
+ // Ads
67
+ ".ad",
68
+ ".ads",
69
+ ".advertisement",
70
+ ".advert",
71
+ '[id*="ad-"]',
72
+ '[class*="advertisement"]',
73
+ '[class*="-ad-"]',
74
+ '[class*="google-ad"]',
75
+ // Social media
76
+ ".social",
77
+ ".social-share",
78
+ ".share-buttons",
79
+ ".social-media",
80
+ // Comments
81
+ ".comments",
82
+ "#comments",
83
+ ".comment-section",
84
+ // Related/recommendations
85
+ ".related",
86
+ ".recommendations",
87
+ ".suggested",
88
+ // Popups/modals
89
+ ".modal",
90
+ ".popup",
91
+ ".overlay",
92
+ '[role="dialog"]',
93
+ // Cookie notices
94
+ ".cookie-notice",
95
+ ".cookie-banner",
96
+ "#cookie-consent",
97
+ // Newsletter signups
98
+ ".newsletter",
99
+ ".subscribe",
100
+ ".signup-form",
101
+ ];
102
+ $(noiseSelectors.join(",")).remove();
103
+ // Remove elements with common noise text
104
+ // But ONLY if they are small elements (to avoid removing large content blocks
105
+ // that happen to mention these terms)
106
+ $("*")
107
+ .filter((_, el) => {
108
+ const $el = $(el);
109
+ const text = $el.text().toLowerCase();
110
+ const textLength = text.trim().length;
111
+ // Only remove if text is short (< 200 chars) and matches noise patterns
112
+ // This avoids removing entire articles that mention these terms
113
+ if (textLength > 200)
114
+ return false;
115
+ return (text.includes("cookie policy") ||
116
+ text.includes("accept cookies") ||
117
+ text.includes("sign up for") ||
118
+ text.includes("newsletter") ||
119
+ text.includes("follow us"));
120
+ })
121
+ .remove();
122
+ }
123
+ function cleanAttributes($) {
124
+ // Attributes to preserve
125
+ const keepAttributes = new Set([
126
+ "href",
127
+ "src",
128
+ "alt",
129
+ "title",
130
+ "colspan",
131
+ "rowspan", // For tables
132
+ "align", // For table alignment
133
+ ]);
134
+ $("*").each((_, el) => {
135
+ const $el = $(el);
136
+ const attrs = $el.attr();
137
+ if (attrs) {
138
+ Object.keys(attrs).forEach((attr) => {
139
+ if (!keepAttributes.has(attr)) {
140
+ $el.removeAttr(attr);
141
+ }
142
+ });
143
+ }
144
+ });
145
+ }
146
+ function removeEmptyElements($) {
147
+ // Remove elements that have no text and no important children
148
+ const importantTags = new Set(["img", "br", "hr", "input", "iframe"]);
149
+ const contentTags = new Set([
150
+ "p",
151
+ "h1",
152
+ "h2",
153
+ "h3",
154
+ "h4",
155
+ "h5",
156
+ "h6",
157
+ "ul",
158
+ "ol",
159
+ "li",
160
+ "table",
161
+ "blockquote",
162
+ "pre",
163
+ "code",
164
+ ]);
165
+ $("*").each((_, el) => {
166
+ const $el = $(el);
167
+ const tagName = el.tagName?.toLowerCase();
168
+ // Skip important tags
169
+ if (tagName && importantTags.has(tagName))
170
+ return;
171
+ // Get the text content
172
+ const text = $el.text().trim();
173
+ const hasImportantChildren = $el.find("img, iframe").length > 0;
174
+ const hasContentTags = $el.find("p, h1, h2, h3, h4, h5, h6, ul, ol, table, blockquote, pre")
175
+ .length > 0;
176
+ // For content-bearing tags (p, h1-h6, etc), remove if they're empty or just junk
177
+ if (tagName && contentTags.has(tagName)) {
178
+ // Remove if completely empty
179
+ if (text.length === 0 && !hasImportantChildren) {
180
+ $el.remove();
181
+ return;
182
+ }
183
+ // Remove if only contains punctuation/whitespace like "|", "-", etc.
184
+ const meaningfulText = text.replace(/[\s|_.:;-]+/g, "");
185
+ if (meaningfulText.length === 0 && !hasImportantChildren) {
186
+ $el.remove();
187
+ return;
188
+ }
189
+ }
190
+ // For container elements (div, section, etc), remove if no text and no important children
191
+ if (!text && !hasImportantChildren && !hasContentTags) {
192
+ $el.remove();
193
+ }
194
+ });
195
+ }
196
+ function resolveRelativeUrls($, baseUrl) {
197
+ const base = new URL(baseUrl);
198
+ // Resolve image sources
199
+ $("img").each((_, el) => {
200
+ const $el = $(el);
201
+ const src = $el.attr("src");
202
+ if (src && !src.startsWith("http") && !src.startsWith("data:")) {
203
+ try {
204
+ $el.attr("src", new URL(src, base).href);
205
+ }
206
+ catch {
207
+ // Ignore invalid URLs
208
+ }
209
+ }
210
+ });
211
+ // Resolve link hrefs
212
+ $("a").each((_, el) => {
213
+ const $el = $(el);
214
+ const href = $el.attr("href");
215
+ if (href &&
216
+ !href.startsWith("http") &&
217
+ !href.startsWith("#") &&
218
+ !href.startsWith("mailto:")) {
219
+ try {
220
+ $el.attr("href", new URL(href, base).href);
221
+ }
222
+ catch {
223
+ // Ignore invalid URLs
224
+ }
225
+ }
226
+ });
227
+ }
228
+ //# sourceMappingURL=html-cleaner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"html-cleaner.js","sourceRoot":"","sources":["../../src/optimizers/html-cleaner.ts"],"names":[],"mappings":"AAAA,iCAAiC;AAEjC,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AASnC;;GAEG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,UAAwB,EAAE;IAChE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,kCAAkC;IAClC,CAAC,CAAC,yBAAyB,CAAC,CAAC,MAAM,EAAE,CAAC;IAEtC,6DAA6D;IAC7D,+CAA+C;IAC/C,IAAI,OAAO,CAAC,UAAU,KAAK,KAAK,EAAE,CAAC;QACjC,mBAAmB,CAAC,CAAC,CAAC,CAAC;IACzB,CAAC;IAED,qBAAqB;IACrB,CAAC,CAAC,GAAG,CAAC;SACH,QAAQ,EAAE;SACV,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAE,EAAwB,CAAC,IAAI,KAAK,SAAS,CAAC;SAC/D,MAAM,EAAE,CAAC;IAEZ,0FAA0F;IAC1F,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;QACpB,mBAAmB,CAAC,CAAC,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;IAC1C,CAAC;IAED,iDAAiD;IACjD,2DAA2D;IAC3D,eAAe,CAAC,CAAC,CAAC,CAAC;IAEnB,2EAA2E;IAC3E,mBAAmB,CAAC,CAAC,CAAC,CAAC;IAEvB,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;AAClB,CAAC;AAED,SAAS,mBAAmB,CAAC,CAAqB;IAChD,2BAA2B;IAC3B,CAAC,CACC;QACE,qBAAqB;QACrB,iBAAiB;QACjB,wBAAwB;QACxB,sBAAsB;QACtB,iBAAiB;KAClB,CAAC,IAAI,CAAC,GAAG,CAAC,CACZ,CAAC,MAAM,EAAE,CAAC;IAEX,qCAAqC;IACrC,mDAAmD;IACnD,MAAM,cAAc,GAAG;QACrB,4EAA4E;QAC5E,wBAAwB;QACxB,YAAY;QACZ,cAAc;QACd,YAAY;QACZ,wBAAwB;QACxB,aAAa;QACb,MAAM;QACN,OAAO;QAEP,gFAAgF;QAChF,uBAAuB;QACvB,4BAA4B;QAC5B,SAAS;QACT,SAAS;QACT,iBAAiB;QACjB,iBAAiB;QACjB,iBAAiB;QACjB,iBAAiB;QAEjB,WAAW;QACX,OAAO;QACP,aAAa;QACb,2BAA2B;QAC3B,UAAU;QAEV,MAAM;QACN,KAAK;QACL,MAAM;QACN,gBAAgB;QAChB,SAAS;QACT,aAAa;QACb,0BAA0B;QAC1B,iBAAiB;QACjB,sBAAsB;QAEtB,eAAe;QACf,SAAS;QACT,eAAe;QACf,gBAAgB;QAChB,eAAe;QAEf,WAAW;QACX,WAAW;QACX,WAAW;QACX,kBAAkB;QAElB,0BAA0B;QAC1B,UAAU;QACV,kBAAkB;QAClB,YAAY;QAEZ,gBAAgB;QAChB,QAAQ;QACR,QAAQ;QACR,UAAU;QACV,iBAAiB;QAEjB,iBAAiB;QACjB,gBAAgB;QAChB,gBAAgB;QAChB,iBAAiB;QAEjB,qBAAqB;QACrB,aAAa;QACb,YAAY;QACZ,cAAc;KACf,CAAC;IAEF,CAAC,CAAC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;IAErC,yCAAyC;IACzC,8EAA8E;IAC9E,sCAAsC;IACtC,CAAC,CAAC,GAAG,CAAC;SACH,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QAChB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CA
AC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACtC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC;QAEtC,wEAAwE;QACxE,gEAAgE;QAChE,IAAI,UAAU,GAAG,GAAG;YAAE,OAAO,KAAK,CAAC;QAEnC,OAAO,CACL,IAAI,CAAC,QAAQ,CAAC,eAAe,CAAC;YAC9B,IAAI,CAAC,QAAQ,CAAC,gBAAgB,CAAC;YAC/B,IAAI,CAAC,QAAQ,CAAC,aAAa,CAAC;YAC5B,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC;YAC3B,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,CAC3B,CAAC;IACJ,CAAC,CAAC;SACD,MAAM,EAAE,CAAC;AACd,CAAC;AAED,SAAS,eAAe,CAAC,CAAqB;IAC5C,yBAAyB;IACzB,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;QAC7B,MAAM;QACN,KAAK;QACL,KAAK;QACL,OAAO;QACP,SAAS;QACT,SAAS,EAAE,aAAa;QACxB,OAAO,EAAE,sBAAsB;KAChC,CAAC,CAAC;IAEH,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;QAEzB,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;gBAClC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC9B,GAAG,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;gBACvB,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,mBAAmB,CAAC,CAAqB;IAChD,8DAA8D;IAC9D,MAAM,aAAa,GAAG,IAAI,GAAG,CAAC,CAAC,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAC;IACtE,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC;QAC1B,GAAG;QACH,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,OAAO;QACP,YAAY;QACZ,KAAK;QACL,MAAM;KACP,CAAC,CAAC;IAEH,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,OAAO,GAAI,EAA2B,CAAC,OAAO,EAAE,WAAW,EAAE,CAAC;QAEpE,sBAAsB;QACtB,IAAI,OAAO,IAAI,aAAa,CAAC,GAAG,CAAC,OAAO,CAAC;YAAE,OAAO;QAElD,uBAAuB;QACvB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;QAC/B,MAAM,oBAAoB,GAAG,GAAG,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAChE,MAAM,cAAc,GAClB,GAAG,CAAC,IAAI,CAAC,2DAA2D,CAAC;aAClE,MAAM,GAAG,CAAC,CAAC;QAEhB,iFAAiF;QACjF,IAAI,OAAO,IAAI,WAAW,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACxC,6BAA6B;YAC7B,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,
oBAAoB,EAAE,CAAC;gBAC/C,GAAG,CAAC,MAAM,EAAE,CAAC;gBACb,OAAO;YACT,CAAC;YAED,qEAAqE;YACrE,MAAM,cAAc,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;YACxD,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,oBAAoB,EAAE,CAAC;gBACzD,GAAG,CAAC,MAAM,EAAE,CAAC;gBACb,OAAO;YACT,CAAC;QACH,CAAC;QAED,0FAA0F;QAC1F,IAAI,CAAC,IAAI,IAAI,CAAC,oBAAoB,IAAI,CAAC,cAAc,EAAE,CAAC;YACtD,GAAG,CAAC,MAAM,EAAE,CAAC;QACf,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC;AAED,SAAS,mBAAmB,CAAC,CAAqB,EAAE,OAAe;IACjE,MAAM,IAAI,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC;IAE9B,wBAAwB;IACxB,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACtB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,GAAG,GAAG,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC5B,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;YAC/D,IAAI,CAAC;gBACH,GAAG,CAAC,IAAI,CAAC,KAAK,EAAE,IAAI,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;YAC3C,CAAC;YAAC,MAAM,CAAC;gBACP,sBAAsB;YACxB,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,qBAAqB;IACrB,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;QACpB,MAAM,GAAG,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;QAClB,MAAM,IAAI,GAAG,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAC9B,IACE,IAAI;YACJ,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC;YACxB,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC;YACrB,CAAC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,EAC3B,CAAC;YACD,IAAI,CAAC;gBACH,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;YAC7C,CAAC;YAAC,MAAM,CAAC;gBACP,sBAAsB;YACxB,CAAC;QACH,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Format markdown specifically for LLM consumption
3
+ * - Consistent spacing and structure
4
+ * - Clear section boundaries
5
+ * - Reduced noise and clutter
6
+ */
7
+ export declare function formatForLLM(markdown: string): string;
8
+ //# sourceMappingURL=llm-formatter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm-formatter.d.ts","sourceRoot":"","sources":["../../src/optimizers/llm-formatter.ts"],"names":[],"mappings":"AAEA;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,CAmBrD"}
@@ -0,0 +1,94 @@
1
+ // src/optimizers/llm-formatter.ts
2
/**
 * Prepare markdown for consumption by a language model.
 *
 * The text is threaded through a fixed sequence of clean-up passes; each pass
 * receives the output of the one before it.
 *
 * @param {string} markdown Markdown text to format.
 * @returns {string} The markdown after all passes have run.
 */
export function formatForLLM(markdown) {
    // Order matters: every pass sees the result of the previous one.
    const passes = [
        normalizeHeadingLevels, // 1. heading levels
        normalizeListFormatting, // 2. list markers and indentation
        cleanInlineFormatting, // 3. emphasis markers
        enhanceCodeBlocks, // 4. code fence placement
        optimizeLinkFormatting, // 5. reference links -> inline links
    ];
    return passes.reduce((text, applyPass) => applyPass(text), markdown);
}
22
/**
 * Rewrite ATX headings (`#` .. `######`) so that no heading is more than one
 * level deeper than the heading that precedes it.
 *
 * - The first heading of the document may be level 1 or 2; deeper first
 *   headings are raised to level 2.
 * - Headings may always move back up to any shallower level.
 * - Whitespace between the `#` run and the title is collapsed to one space.
 * - Lines inside fenced code blocks (``` or ~~~) are never touched, so shell
 *   or Python comments are not mistaken for headings.
 *
 * The heading level is tracked across blank lines; resetting it on every blank
 * line would flatten every heading deeper than level 2.
 *
 * @param {string} markdown Markdown text.
 * @returns {string} Markdown with normalized heading levels.
 */
function normalizeHeadingLevels(markdown) {
    const output = [];
    let previousLevel = 0; // 0 means "no heading seen yet"
    let openFence = ""; // marker of the fenced block we are inside, "" when outside
    for (const line of markdown.split("\n")) {
        const fence = /^ {0,3}(`{3,}|~{3,})/.exec(line);
        if (openFence !== "") {
            // Inside a code block: only look for the matching closing fence.
            if (fence &&
                fence[1][0] === openFence[0] &&
                fence[1].length >= openFence.length &&
                line.trim() === fence[1]) {
                openFence = "";
            }
            output.push(line);
            continue;
        }
        if (fence) {
            openFence = fence[1];
            output.push(line);
            continue;
        }
        const heading = /^(#{1,6})\s+(.+)$/.exec(line);
        if (!heading) {
            output.push(line);
            continue;
        }
        const requestedLevel = heading[1].length;
        const deepestAllowed = previousLevel === 0 ? 2 : previousLevel + 1;
        const level = Math.min(requestedLevel, deepestAllowed);
        previousLevel = level;
        output.push("#".repeat(level) + " " + heading[2]);
    }
    return output.join("\n");
}
45
/**
 * Normalize unordered lists: every bullet uses `-` followed by one space, and
 * nesting is expressed with two spaces per level.
 *
 * The nesting depth of an item is derived from its existing indentation
 * (two columns per level, a tab counting as four columns), so nested `*` and
 * `+` items keep their depth instead of being flattened. The item text is
 * trimmed on both sides.
 *
 * Left untouched:
 * - blank lines and any non-list line (ordered lists included),
 * - thematic breaks such as `* * *`, `- - -` or `---`,
 * - everything inside fenced code blocks (``` or ~~~), e.g. diff output whose
 *   lines start with `+ `.
 *
 * @param {string} markdown Markdown text.
 * @returns {string} Markdown with consistent bullet markers and indentation.
 */
function normalizeListFormatting(markdown) {
    const thematicBreak = /^ {0,3}([-*_])[ \t]*(?:\1[ \t]*){2,}$/;
    const bulletPrefix = /^([ \t]*)[-*+][ \t]+/;
    let openFence = ""; // marker of the fenced block we are inside, "" when outside
    return markdown
        .split("\n")
        .map((line) => {
            const fence = /^ {0,3}(`{3,}|~{3,})/.exec(line);
            if (openFence !== "") {
                if (fence &&
                    fence[1][0] === openFence[0] &&
                    fence[1].length >= openFence.length &&
                    line.trim() === fence[1]) {
                    openFence = "";
                }
                return line;
            }
            if (fence) {
                openFence = fence[1];
                return line;
            }
            if (thematicBreak.test(line)) {
                return line;
            }
            const bullet = bulletPrefix.exec(line);
            if (!bullet) {
                return line;
            }
            const indentColumns = bullet[1].replace(/\t/g, "    ").length;
            const depth = Math.floor(indentColumns / 2);
            const text = line.slice(bullet[0].length).trim();
            const marker = "  ".repeat(depth) + "-";
            // An empty item stays an empty item rather than gaining a stray "-".
            return text === "" ? marker : marker + " " + text;
        })
        .join("\n");
}
63
/**
 * Tidy asterisk-based emphasis.
 *
 * 1. Runs of three or more asterisks around text are reduced to bold
 *    (`***text***` -> `**text**`).
 * 2. Padding inside an emphasis span is removed (`** text **` -> `**text**`).
 *    A span is only recognised when it sits on a single line, both markers
 *    have the same length, the opening marker is at the start of the line or
 *    preceded by whitespace, and the closing marker is not followed by a word
 *    character or another asterisk. Without these guards the closing marker
 *    of one span gets paired with the opening marker of the next one and the
 *    plain text between two spans loses its surrounding spaces.
 *
 * Lines inside fenced code blocks (``` or ~~~) are left untouched.
 *
 * NOTE(review): space-separated single asterisks in prose (`2 * 3 * 4`) still
 * look like a padded span and are tightened; inline code spans are not
 * protected either.
 *
 * @param {string} markdown Markdown text.
 * @returns {string} Markdown with cleaned-up emphasis markers.
 */
function cleanInlineFormatting(markdown) {
    let openFence = ""; // marker of the fenced block we are inside, "" when outside
    return markdown
        .split("\n")
        .map((line) => {
            const fence = /^ {0,3}(`{3,}|~{3,})/.exec(line);
            if (openFence !== "") {
                if (fence &&
                    fence[1][0] === openFence[0] &&
                    fence[1].length >= openFence.length &&
                    line.trim() === fence[1]) {
                    openFence = "";
                }
                return line;
            }
            if (fence) {
                openFence = fence[1];
                return line;
            }
            return line
                .replace(/\*{3,}(.+?)\*{3,}/g, "**$1**")
                .replace(/(?<!\S)(\*{1,2})[ \t]+([^*]+?)[ \t]+\1(?![*\w])/g, "$1$2$1");
        })
        .join("\n");
}
72
/**
 * Make sure backtick code fences sit on their own lines.
 *
 * - An opening fence glued to preceding text is moved to a new line, separated
 *   from that text by a blank line. Its info string (e.g. the `js` in
 *   "```js") stays on the fence line.
 * - A closing fence glued to the last code line is moved to its own line, and
 *   any text following a closing fence is moved to the next line.
 * - A line that opens and closes a backtick run itself (```code```) is an
 *   inline code span and is left alone.
 * - Well-formed fenced blocks are returned unchanged.
 *
 * Opening and closing fences are told apart by tracking whether a block is
 * currently open; a fence only closes a block when it has at least as many
 * backticks as the fence that opened it. Tilde fences are not processed.
 *
 * @param {string} markdown Markdown text.
 * @returns {string} Markdown with fences isolated on their own lines.
 */
function enhanceCodeBlocks(markdown) {
    const output = [];
    let openLength = 0; // backtick count of the open fence, 0 when outside a block
    for (const line of markdown.split("\n")) {
        const found = /^(.*?)(`{3,})(.*)$/s.exec(line);
        if (!found) {
            output.push(line);
            continue;
        }
        const [, before, fence, after] = found;
        if (openLength === 0) {
            if (after.includes("`")) {
                // Backticks later on the same line: inline code span, not a fence.
                output.push(line);
                continue;
            }
            openLength = fence.length;
            if (before.trim() === "") {
                output.push(line);
            }
            else {
                output.push(before.trimEnd(), "", fence + after);
            }
            continue;
        }
        if (fence.length < openLength) {
            // Shorter backtick run inside a longer fence is just code content.
            output.push(line);
            continue;
        }
        openLength = 0;
        const closing = before.trim() === "" ? [before + fence] : [before, fence];
        if (after.trim() === "") {
            closing[closing.length - 1] += after;
        }
        else {
            closing.push(after.trimStart());
        }
        output.push(...closing);
    }
    return output.join("\n");
}
78
/**
 * Convert reference-style links to inline links.
 *
 * Link reference definitions (`[label]: destination "optional title"`) are
 * collected and their text removed (the now-empty line is kept). Every full
 * (`[text][label]`), collapsed (`[text][]`) and shortcut (`[text]`) reference
 * whose label has a definition is then rewritten as `[text](destination)`.
 * Labels are matched case-insensitively; when a label is defined more than
 * once the first definition wins. References without a definition are left
 * as written.
 *
 * A line is only treated as a definition when the label is followed by a
 * single whitespace-free destination and an optional quoted or parenthesised
 * title. Footnote definitions (`[^1]: ...`) and prose such as
 * `[Note]: some sentence` are therefore preserved.
 *
 * @param {string} markdown Markdown text.
 * @returns {string} Markdown with resolvable references inlined.
 */
function optimizeLinkFormatting(markdown) {
    const definitions = new Map();
    const definitionPattern = /^ {0,3}\[([^\]\n^][^\]\n]*)\]:[ \t]*(?:\n[ \t]*)?(\S+(?:[ \t]+(?:"[^"\n]*"|'[^'\n]*'|\([^)\n]*\)))?)[ \t]*$/gm;
    // Pass 1: pull the definitions out of the text.
    const withoutDefinitions = markdown.replace(definitionPattern, (_match, label, target) => {
        const key = label.toLowerCase();
        if (!definitions.has(key)) {
            definitions.set(key, target.trim());
        }
        return "";
    });
    // Pass 2: inline every reference that has a definition. The second
    // alternative matches shortcut references; the lookahead keeps it away
    // from inline links "[t](...)", full references and "[label]:" lines.
    const referencePattern = /\[([^\]]+)\](?:\[([^\]]*)\]|(?![(\[:]))/g;
    return withoutDefinitions.replace(referencePattern, (match, text, ref) => {
        const target = definitions.get((ref || text).toLowerCase());
        return target ? `[${text}](${target})` : match;
    });
}
94
+ //# sourceMappingURL=llm-formatter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm-formatter.js","sourceRoot":"","sources":["../../src/optimizers/llm-formatter.ts"],"names":[],"mappings":"AAAA,kCAAkC;AAElC;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,QAAgB;IAC3C,IAAI,SAAS,GAAG,QAAQ,CAAC;IAEzB,yDAAyD;IACzD,SAAS,GAAG,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAE9C,8BAA8B;IAC9B,SAAS,GAAG,uBAAuB,CAAC,SAAS,CAAC,CAAC;IAE/C,4DAA4D;IAC5D,SAAS,GAAG,qBAAqB,CAAC,SAAS,CAAC,CAAC;IAE7C,2CAA2C;IAC3C,SAAS,GAAG,iBAAiB,CAAC,SAAS,CAAC,CAAC;IAEzC,sCAAsC;IACtC,SAAS,GAAG,sBAAsB,CAAC,SAAS,CAAC,CAAC;IAE9C,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,SAAS,sBAAsB,CAAC,QAAgB;IAC9C,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IACnC,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,YAAY,GAAG,CAAC,CAAC;IAErB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAE9C,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YAC9B,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;YAEvB,4BAA4B;YAC5B,MAAM,eAAe,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,EAAE,YAAY,GAAG,CAAC,CAAC,CAAC;YAC1D,YAAY,GAAG,eAAe,CAAC;YAE/B,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,eAAe,CAAC,GAAG,GAAG,GAAG,KAAK,CAAC,CAAC;QACzD,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClB,IAAI,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,EAAE,CAAC;gBACvB,YAAY,GAAG,CAAC,CAAC,CAAC,yBAAyB;YAC7C,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC3B,CAAC;AAED,SAAS,uBAAuB,CAAC,QAAgB;IAC/C,IAAI,MAAM,GAAG,QAAQ,CAAC;IAEtB,uDAAuD;IACvD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,eAAe,EAAE,IAAI,CAAC,CAAC;IAE/C,wDAAwD;IACxD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEjC,MAAM,GAAG,KAAK;SACX,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACZ,IAAI,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;YAChC,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC;YACrD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACrC,OAAO,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;QAC1E,CAAC;QACD,OAAO,IAAI,CAAC;IACd,CAAC,CAAC;SACD,IAAI,CAAC,IAAI,CA
AC,CAAC;IAEd,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,qBAAqB,CAAC,QAAgB;IAC7C,IAAI,MAAM,GAAG,QAAQ,CAAC;IAEtB,uEAAuE;IACvE,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,oBAAoB,EAAE,QAAQ,CAAC,CAAC;IAExD,iFAAiF;IACjF,oFAAoF;IACpF,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,mCAAmC,EAAE,QAAQ,CAAC,CAAC;IAEvE,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,iBAAiB,CAAC,QAAgB;IACzC,yDAAyD;IACzD,IAAI,MAAM,GAAG,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE,WAAW,CAAC,CAAC;IAC1D,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,aAAa,EAAE,SAAS,CAAC,CAAC;IAElD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,sBAAsB,CAAC,QAAgB;IAC9C,8DAA8D;IAC9D,MAAM,KAAK,GAAwB,IAAI,GAAG,EAAE,CAAC;IAE7C,gCAAgC;IAChC,IAAI,MAAM,GAAG,QAAQ,CAAC,OAAO,CAC3B,0BAA0B,EAC1B,CAAC,MAAM,EAAE,GAAW,EAAE,GAAW,EAAE,EAAE;QACnC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,EAAE,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,OAAO,EAAE,CAAC;IACZ,CAAC,CACF,CAAC;IAEF,4CAA4C;IAC5C,MAAM,GAAG,MAAM,CAAC,OAAO,CACrB,2BAA2B,EAC3B,CAAC,KAAK,EAAE,IAAY,EAAE,GAAW,EAAE,EAAE;QACnC,MAAM,MAAM,GAAG,CAAC,GAAG,IAAI,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC;QAC3C,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;QAC9B,OAAO,GAAG,CAAC,CAAC,CAAC,IAAI,IAAI,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC;IAC3C,CAAC,CACF,CAAC;IAEF,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Enhance HTML structure for better markdown conversion
3
+ * - Improve heading hierarchy
4
+ * - Clean up nested elements
5
+ * - Normalize structure
6
+ */
7
+ export declare function enhanceStructure(html: string): string;
8
+ //# sourceMappingURL=structure-enhancer.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"structure-enhancer.d.ts","sourceRoot":"","sources":["../../src/optimizers/structure-enhancer.ts"],"names":[],"mappings":"AAIA;;;;;GAKG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAarD"}