@peam-ai/parser 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,10 +1,9 @@
1
- import robotsParser from 'robots-parser';
2
-
3
1
  interface StructuredPage {
4
2
  title: string;
5
3
  description: string;
6
4
  content: string;
7
5
  textContent: string;
6
+ markdownContent?: string;
8
7
  contentLength?: number;
9
8
  author?: string;
10
9
  direction?: string;
@@ -45,55 +44,6 @@ interface ParseOptions {
45
44
  extractImages?: boolean;
46
45
  }
47
46
 
48
- /**
49
- * Checks if a path matches any of the given wildcard patterns
50
- * Uses the matcher library which supports:
51
- * - * (matches any characters except /)
52
- * - ** (matches any characters including /)
53
- * - ? (matches single character)
54
- * - ! (negation)
55
- * - [] (character ranges)
56
- */
57
- declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
58
-
59
- type RobotsParser = ReturnType<typeof robotsParser>;
60
- interface RobotsTxtResult {
61
- parser: RobotsParser;
62
- path: string;
63
- }
64
- declare function createRobotsParser(content: string): RobotsParser;
65
- /**
66
- * Loads and parses robots.txt from custom path or standard locations
67
- * Returns the parser and the path where robots.txt was found
68
- */
69
- declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
70
- declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
71
-
72
- type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
73
- interface PathFilterResult {
74
- included: boolean;
75
- reason: PathFilterReason;
76
- }
77
- /**
78
- * Determines if a pathname should be included in the index
79
- * Returns both the decision and the reason for exclusion
80
- */
81
- declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
82
-
83
- /**
84
- * Convert file path to URL pathname
85
- *
86
- * Examples:
87
- * index.html -> /
88
- * about.html -> /about
89
- * about/index.html -> /about/
90
- * blog/post-1.html -> /blog/post-1
91
- * blog/post-1/index.html -> /blog/post-1/
92
- * server/pages/contact.html -> /contact
93
- * server/app/about.html -> /about
94
- */
95
- declare function filePathToPathname(filePath: string): string;
96
-
97
47
  /**
98
48
  * Parse HTML content and convert it to a StructuredPage
99
49
  * @param html - HTML string to parse
@@ -102,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
102
52
  */
103
53
  declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
104
54
 
105
- export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };
55
+ export { type ParseOptions, type StructuredPage, parseHTML };
package/dist/index.d.ts CHANGED
@@ -1,10 +1,9 @@
1
- import robotsParser from 'robots-parser';
2
-
3
1
  interface StructuredPage {
4
2
  title: string;
5
3
  description: string;
6
4
  content: string;
7
5
  textContent: string;
6
+ markdownContent?: string;
8
7
  contentLength?: number;
9
8
  author?: string;
10
9
  direction?: string;
@@ -45,55 +44,6 @@ interface ParseOptions {
45
44
  extractImages?: boolean;
46
45
  }
47
46
 
48
- /**
49
- * Checks if a path matches any of the given wildcard patterns
50
- * Uses the matcher library which supports:
51
- * - * (matches any characters except /)
52
- * - ** (matches any characters including /)
53
- * - ? (matches single character)
54
- * - ! (negation)
55
- * - [] (character ranges)
56
- */
57
- declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
58
-
59
- type RobotsParser = ReturnType<typeof robotsParser>;
60
- interface RobotsTxtResult {
61
- parser: RobotsParser;
62
- path: string;
63
- }
64
- declare function createRobotsParser(content: string): RobotsParser;
65
- /**
66
- * Loads and parses robots.txt from custom path or standard locations
67
- * Returns the parser and the path where robots.txt was found
68
- */
69
- declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
70
- declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
71
-
72
- type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
73
- interface PathFilterResult {
74
- included: boolean;
75
- reason: PathFilterReason;
76
- }
77
- /**
78
- * Determines if a pathname should be included in the index
79
- * Returns both the decision and the reason for exclusion
80
- */
81
- declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
82
-
83
- /**
84
- * Convert file path to URL pathname
85
- *
86
- * Examples:
87
- * index.html -> /
88
- * about.html -> /about
89
- * about/index.html -> /about/
90
- * blog/post-1.html -> /blog/post-1
91
- * blog/post-1/index.html -> /blog/post-1/
92
- * server/pages/contact.html -> /contact
93
- * server/app/about.html -> /about
94
- */
95
- declare function filePathToPathname(filePath: string): string;
96
-
97
47
  /**
98
48
  * Parse HTML content and convert it to a StructuredPage
99
49
  * @param html - HTML string to parse
@@ -102,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
102
52
  */
103
53
  declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
104
54
 
105
- export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };
55
+ export { type ParseOptions, type StructuredPage, parseHTML };
package/dist/index.js CHANGED
@@ -44,133 +44,14 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
44
44
  // src/index.ts
45
45
  var index_exports = {};
46
46
  __export(index_exports, {
47
- createRobotsParser: () => createRobotsParser,
48
- filePathToPathname: () => filePathToPathname,
49
- isPathAllowedByRobots: () => isPathAllowedByRobots,
50
- loadRobotsTxt: () => loadRobotsTxt,
51
- matchesExcludePattern: () => matchesExcludePattern,
52
- parseHTML: () => parseHTML,
53
- shouldIncludePath: () => shouldIncludePath
47
+ parseHTML: () => parseHTML
54
48
  });
55
49
  module.exports = __toCommonJS(index_exports);
56
50
 
57
- // src/utils/excludePatterns.ts
58
- var import_matcher = require("matcher");
59
- function matchesExcludePattern(path, patterns) {
60
- if (!patterns || patterns.length === 0) {
61
- return false;
62
- }
63
- const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
64
- const normalizedPath = normalize(path);
65
- const normalizedPatterns = patterns.map(normalize);
66
- return (0, import_matcher.isMatch)(normalizedPath, normalizedPatterns);
67
- }
68
-
69
- // src/utils/robotsParser.ts
70
- var import_fs = require("fs");
71
- var import_path = require("path");
72
- var import_robots_parser = __toESM(require("robots-parser"));
73
- function createRobotsParser(content) {
74
- return (0, import_robots_parser.default)("https://robots.invalid/robots.txt", content);
75
- }
76
- function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
77
- let robotsContent = null;
78
- let foundPath = null;
79
- if (robotsTxtPath) {
80
- const customPath = (0, import_path.join)(projectDir, robotsTxtPath);
81
- if ((0, import_fs.existsSync)(customPath)) {
82
- robotsContent = (0, import_fs.readFileSync)(customPath, "utf-8");
83
- foundPath = customPath;
84
- }
85
- }
86
- if (!robotsContent) {
87
- for (const searchPath of searchPaths) {
88
- const fullPath = (0, import_path.join)(projectDir, searchPath);
89
- if ((0, import_fs.existsSync)(fullPath)) {
90
- robotsContent = (0, import_fs.readFileSync)(fullPath, "utf-8");
91
- foundPath = fullPath;
92
- break;
93
- }
94
- }
95
- }
96
- if (!robotsContent) {
97
- return null;
98
- }
99
- return {
100
- parser: createRobotsParser(robotsContent),
101
- path: foundPath || ""
102
- };
103
- }
104
- function isPathAllowedByRobots(path, robots) {
105
- if (!robots) {
106
- return true;
107
- }
108
- const normalizedPath = path.startsWith("/") ? path : `/${path}`;
109
- const testUrl = `https://robots.invalid${normalizedPath}`;
110
- const isAllowed = robots.isAllowed(testUrl, "*");
111
- return isAllowed !== false;
112
- }
113
-
114
- // src/utils/pathFilter.ts
115
- function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
116
- if (pathname.includes("[") && pathname.includes("]")) {
117
- return { included: false, reason: "dynamic-route" };
118
- }
119
- if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
120
- return { included: false, reason: "internal-route" };
121
- }
122
- if (pathname.endsWith(".rsc")) {
123
- return { included: false, reason: "rsc-file" };
124
- }
125
- if (pathname.includes(".segments/")) {
126
- return { included: false, reason: "segment-file" };
127
- }
128
- if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
129
- return { included: false, reason: "static-asset" };
130
- }
131
- if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
132
- return { included: false, reason: "robots-txt" };
133
- }
134
- if (matchesExcludePattern(pathname, excludePatterns)) {
135
- return { included: false, reason: "exclude-pattern" };
136
- }
137
- return { included: true, reason: "included" };
138
- }
139
-
140
- // src/utils/pathUtils.ts
141
- var import_path2 = require("path");
142
- var artifactPrefixes = [
143
- "server/pages/",
144
- "server/app/",
145
- "static/chunks/app/",
146
- "static/chunks/pages/",
147
- "static/",
148
- "server/"
149
- ];
150
- function filePathToPathname(filePath) {
151
- let pathname = filePath.split(import_path2.sep).join("/");
152
- for (const prefix of artifactPrefixes) {
153
- if (pathname.startsWith(prefix)) {
154
- pathname = pathname.substring(prefix.length);
155
- break;
156
- }
157
- }
158
- pathname = pathname.replace(/\.html?$/, "");
159
- if (pathname === "index" || pathname === "") {
160
- return "/";
161
- }
162
- if (pathname.endsWith("/index")) {
163
- pathname = pathname.slice(0, -5);
164
- }
165
- if (!pathname.startsWith("/")) {
166
- pathname = "/" + pathname;
167
- }
168
- return pathname;
169
- }
170
-
171
51
  // src/parseHtml.ts
172
52
  var import_logger = require("@peam-ai/logger");
173
53
  var import_jsdom = require("jsdom");
54
+ var import_turndown = __toESM(require("turndown"));
174
55
 
175
56
  // src/parsers/cssSelectorParser.ts
176
57
  var CssSelectorParser = class {
@@ -299,10 +180,9 @@ var ReadabilityParser = class {
299
180
  var log = import_logger.loggers.parser;
300
181
  function parseHTML(html, options = {}) {
301
182
  if (!html || html.trim().length === 0) {
302
- log.warn("Empty or invalid HTML input");
183
+ log.error("Empty or invalid HTML input");
303
184
  return void 0;
304
185
  }
305
- log.debug("Starting parse with options", options);
306
186
  const dom = new import_jsdom.JSDOM(html);
307
187
  const document = dom.window.document;
308
188
  const cssSelectorParser = new CssSelectorParser();
@@ -310,24 +190,30 @@ function parseHTML(html, options = {}) {
310
190
  const readabilityParser = new ReadabilityParser();
311
191
  const readabilityStructuredPage = readabilityParser.parse(document, options);
312
192
  if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
313
- log.warn("Failed to extract content");
193
+ log.error("Failed to extract content");
314
194
  return void 0;
315
195
  }
316
- return __spreadValues(__spreadValues(__spreadValues({}, {
196
+ const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
317
197
  title: "",
318
198
  description: "",
319
199
  content: "",
320
200
  textContent: ""
321
201
  }), cssSelectorStructuredPage), readabilityStructuredPage);
202
+ if (mergedResult.content) {
203
+ try {
204
+ const turndownService = new import_turndown.default({
205
+ headingStyle: "atx",
206
+ codeBlockStyle: "fenced"
207
+ });
208
+ mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
209
+ } catch (error) {
210
+ log.error("Failed to convert content to markdown", error);
211
+ }
212
+ }
213
+ return mergedResult;
322
214
  }
323
215
  // Annotate the CommonJS export names for ESM import in node:
324
216
  0 && (module.exports = {
325
- createRobotsParser,
326
- filePathToPathname,
327
- isPathAllowedByRobots,
328
- loadRobotsTxt,
329
- matchesExcludePattern,
330
- parseHTML,
331
- shouldIncludePath
217
+ parseHTML
332
218
  });
333
219
  //# sourceMappingURL=index.js.map
package/dist/index.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/index.ts","../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n\nexport { matchesExcludePattern } from './utils/excludePatterns';\nexport { shouldIncludePath, type PathFilterReason, type PathFilterResult } from './utils/pathFilter';\nexport { filePathToPathname } from './utils/pathUtils';\nexport {\n createRobotsParser,\n isPathAllowedByRobots,\n loadRobotsTxt,\n type RobotsParser,\n type RobotsTxtResult,\n} from './utils/robotsParser';\n\nexport { parseHTML } from './parseHtml';\n","import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,qBAAwB;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,aAAO,wBAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,gBAAyC;AACzC,kBAAqB;AACrB,2BAAyB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,aAAO,qBAAAA,SAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,iBAAa,kBAAK,YAAY,aAAa;AACjD,YAAI,sBAAW,UAAU,GAAG;AAC1B,0BAAgB,wBAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,eAAW,kBAAK,YAAY,UAAU;AAC5C,cAAI,sBAAW,QAAQ,GAAG;AACxB,4BAAgB,wBAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,IAAAC,eAAoB;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,gBAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,oBAAwB;AACxB,mBAAsB;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAC;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["robotsParser","import_path","_a"]}
1
+ {"version":3,"sources":["../src/index.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { parseHTML } from './parseHtml';\nexport { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,oBAAwB;AACxB,mBAAsB;AACtB,sBAA4B;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a","TurndownService"]}
package/dist/index.mjs CHANGED
@@ -15,123 +15,10 @@ var __spreadValues = (a, b) => {
15
15
  return a;
16
16
  };
17
17
 
18
- // src/utils/excludePatterns.ts
19
- import { isMatch } from "matcher";
20
- function matchesExcludePattern(path, patterns) {
21
- if (!patterns || patterns.length === 0) {
22
- return false;
23
- }
24
- const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
25
- const normalizedPath = normalize(path);
26
- const normalizedPatterns = patterns.map(normalize);
27
- return isMatch(normalizedPath, normalizedPatterns);
28
- }
29
-
30
- // src/utils/robotsParser.ts
31
- import { existsSync, readFileSync } from "fs";
32
- import { join } from "path";
33
- import robotsParser from "robots-parser";
34
- function createRobotsParser(content) {
35
- return robotsParser("https://robots.invalid/robots.txt", content);
36
- }
37
- function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
38
- let robotsContent = null;
39
- let foundPath = null;
40
- if (robotsTxtPath) {
41
- const customPath = join(projectDir, robotsTxtPath);
42
- if (existsSync(customPath)) {
43
- robotsContent = readFileSync(customPath, "utf-8");
44
- foundPath = customPath;
45
- }
46
- }
47
- if (!robotsContent) {
48
- for (const searchPath of searchPaths) {
49
- const fullPath = join(projectDir, searchPath);
50
- if (existsSync(fullPath)) {
51
- robotsContent = readFileSync(fullPath, "utf-8");
52
- foundPath = fullPath;
53
- break;
54
- }
55
- }
56
- }
57
- if (!robotsContent) {
58
- return null;
59
- }
60
- return {
61
- parser: createRobotsParser(robotsContent),
62
- path: foundPath || ""
63
- };
64
- }
65
- function isPathAllowedByRobots(path, robots) {
66
- if (!robots) {
67
- return true;
68
- }
69
- const normalizedPath = path.startsWith("/") ? path : `/${path}`;
70
- const testUrl = `https://robots.invalid${normalizedPath}`;
71
- const isAllowed = robots.isAllowed(testUrl, "*");
72
- return isAllowed !== false;
73
- }
74
-
75
- // src/utils/pathFilter.ts
76
- function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
77
- if (pathname.includes("[") && pathname.includes("]")) {
78
- return { included: false, reason: "dynamic-route" };
79
- }
80
- if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
81
- return { included: false, reason: "internal-route" };
82
- }
83
- if (pathname.endsWith(".rsc")) {
84
- return { included: false, reason: "rsc-file" };
85
- }
86
- if (pathname.includes(".segments/")) {
87
- return { included: false, reason: "segment-file" };
88
- }
89
- if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
90
- return { included: false, reason: "static-asset" };
91
- }
92
- if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
93
- return { included: false, reason: "robots-txt" };
94
- }
95
- if (matchesExcludePattern(pathname, excludePatterns)) {
96
- return { included: false, reason: "exclude-pattern" };
97
- }
98
- return { included: true, reason: "included" };
99
- }
100
-
101
- // src/utils/pathUtils.ts
102
- import { sep } from "path";
103
- var artifactPrefixes = [
104
- "server/pages/",
105
- "server/app/",
106
- "static/chunks/app/",
107
- "static/chunks/pages/",
108
- "static/",
109
- "server/"
110
- ];
111
- function filePathToPathname(filePath) {
112
- let pathname = filePath.split(sep).join("/");
113
- for (const prefix of artifactPrefixes) {
114
- if (pathname.startsWith(prefix)) {
115
- pathname = pathname.substring(prefix.length);
116
- break;
117
- }
118
- }
119
- pathname = pathname.replace(/\.html?$/, "");
120
- if (pathname === "index" || pathname === "") {
121
- return "/";
122
- }
123
- if (pathname.endsWith("/index")) {
124
- pathname = pathname.slice(0, -5);
125
- }
126
- if (!pathname.startsWith("/")) {
127
- pathname = "/" + pathname;
128
- }
129
- return pathname;
130
- }
131
-
132
18
  // src/parseHtml.ts
133
19
  import { loggers } from "@peam-ai/logger";
134
20
  import { JSDOM } from "jsdom";
21
+ import TurndownService from "turndown";
135
22
 
136
23
  // src/parsers/cssSelectorParser.ts
137
24
  var CssSelectorParser = class {
@@ -260,10 +147,9 @@ var ReadabilityParser = class {
260
147
  var log = loggers.parser;
261
148
  function parseHTML(html, options = {}) {
262
149
  if (!html || html.trim().length === 0) {
263
- log.warn("Empty or invalid HTML input");
150
+ log.error("Empty or invalid HTML input");
264
151
  return void 0;
265
152
  }
266
- log.debug("Starting parse with options", options);
267
153
  const dom = new JSDOM(html);
268
154
  const document = dom.window.document;
269
155
  const cssSelectorParser = new CssSelectorParser();
@@ -271,23 +157,29 @@ function parseHTML(html, options = {}) {
271
157
  const readabilityParser = new ReadabilityParser();
272
158
  const readabilityStructuredPage = readabilityParser.parse(document, options);
273
159
  if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
274
- log.warn("Failed to extract content");
160
+ log.error("Failed to extract content");
275
161
  return void 0;
276
162
  }
277
- return __spreadValues(__spreadValues(__spreadValues({}, {
163
+ const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
278
164
  title: "",
279
165
  description: "",
280
166
  content: "",
281
167
  textContent: ""
282
168
  }), cssSelectorStructuredPage), readabilityStructuredPage);
169
+ if (mergedResult.content) {
170
+ try {
171
+ const turndownService = new TurndownService({
172
+ headingStyle: "atx",
173
+ codeBlockStyle: "fenced"
174
+ });
175
+ mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
176
+ } catch (error) {
177
+ log.error("Failed to convert content to markdown", error);
178
+ }
179
+ }
180
+ return mergedResult;
283
181
  }
284
182
  export {
285
- createRobotsParser,
286
- filePathToPathname,
287
- isPathAllowedByRobots,
288
- loadRobotsTxt,
289
- matchesExcludePattern,
290
- parseHTML,
291
- shouldIncludePath
183
+ parseHTML
292
184
  };
293
185
  //# sourceMappingURL=index.mjs.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,SAAO,QAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,SAAS,YAAY,oBAAoB;AACzC,SAAS,YAAY;AACrB,OAAO,kBAAkB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,SAAO,aAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,aAAa,KAAK,YAAY,aAAa;AACjD,QAAI,WAAW,UAAU,GAAG;AAC1B,sBAAgB,aAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,WAAW,KAAK,YAAY,UAAU;AAC5C,UAAI,WAAW,QAAQ,GAAG;AACxB,wBAAgB,aAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,SAAS,WAAW;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,GAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,SAAS,eAAe;AACxB,SAAS,aAAa;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["_a"]}
1
+ {"version":3,"sources":["../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AACxB,SAAS,aAAa;AACtB,OAAO,qBAAqB;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a"]}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@peam-ai/parser",
3
3
  "description": "Content parser for extracting page metadata",
4
- "version": "0.1.3",
4
+ "version": "0.1.5",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",
7
7
  "types": "./dist/index.d.ts",
@@ -29,21 +29,23 @@
29
29
  },
30
30
  "devDependencies": {
31
31
  "@types/jsdom": "^27.0.0",
32
+ "@types/turndown": "^5.0.5",
32
33
  "tsup": "^8.2.4",
33
34
  "typescript": "^5.5.4"
34
35
  },
35
36
  "dependencies": {
36
37
  "@mozilla/readability": "^0.5.0",
37
38
  "jsdom": "^27.3.0",
38
- "matcher": "^6.0.0",
39
- "robots-parser": "^3.0.1",
40
- "@peam-ai/logger": "0.1.3"
39
+ "turndown": "^7.2.2",
40
+ "@peam-ai/logger": "0.1.5"
41
41
  },
42
42
  "scripts": {
43
43
  "build": "tsup",
44
+ "build:watch": "tsup --watch",
44
45
  "clean": "rm -rf dist",
45
- "test:eslint": "eslint \"src/**/*.ts*\"",
46
- "test:prettier": "prettier --check \"src/**/*.ts*\"",
47
- "test:jest": "jest"
46
+ "format": "prettier --write \"src/**/*.ts*\"",
47
+ "test:unit": "vitest run",
48
+ "test:lint": "eslint \"src/**/*.ts*\"",
49
+ "test:format": "prettier --check \"src/**/*.ts*\""
48
50
  }
49
51
  }