npm - @peam-ai/parser - Versions diffs - 0.1.3 → 0.1.5 - Mend

@peam-ai/parser 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.mts CHANGED Viewed

@@ -1,10 +1,9 @@
-import robotsParser from 'robots-parser';
 interface StructuredPage {
     title: string;
     description: string;
     content: string;
     textContent: string;
+    markdownContent?: string;
     contentLength?: number;
     author?: string;
     direction?: string;
@@ -45,55 +44,6 @@ interface ParseOptions {
     extractImages?: boolean;
 }
-/**
- * Checks if a path matches any of the given wildcard patterns
- * Uses the matcher library which supports:
- * - * (matches any characters except /)
- * - ** (matches any characters including /)
- * - ? (matches single character)
- * - ! (negation)
- * - [] (character ranges)
- */
-declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
-type RobotsParser = ReturnType<typeof robotsParser>;
-interface RobotsTxtResult {
-    parser: RobotsParser;
-    path: string;
-}
-declare function createRobotsParser(content: string): RobotsParser;
-/**
- * Loads and parses robots.txt from custom path or standard locations
- * Returns the parser and the path where robots.txt was found
- */
-declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
-declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
-type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
-interface PathFilterResult {
-    included: boolean;
-    reason: PathFilterReason;
-}
-/**
- * Determines if a pathname should be included in the index
- * Returns both the decision and the reason for exclusion
- */
-declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
-/**
- * Convert file path to URL pathname
- *
- * Examples:
- *   index.html -> /
- *   about.html -> /about
- *   about/index.html -> /about/
- *   blog/post-1.html -> /blog/post-1
- *   blog/post-1/index.html -> /blog/post-1/
- *   server/pages/contact.html -> /contact
- *   server/app/about.html -> /about
- */
-declare function filePathToPathname(filePath: string): string;
 /**
  * Parse HTML content and convert it to a StructuredPage
  * @param html - HTML string to parse
@@ -102,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
  */
 declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
-export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };
+export { type ParseOptions, type StructuredPage, parseHTML };

package/dist/index.d.ts CHANGED Viewed

@@ -1,10 +1,9 @@
-import robotsParser from 'robots-parser';
 interface StructuredPage {
     title: string;
     description: string;
     content: string;
     textContent: string;
+    markdownContent?: string;
     contentLength?: number;
     author?: string;
     direction?: string;
@@ -45,55 +44,6 @@ interface ParseOptions {
     extractImages?: boolean;
 }
-/**
- * Checks if a path matches any of the given wildcard patterns
- * Uses the matcher library which supports:
- * - * (matches any characters except /)
- * - ** (matches any characters including /)
- * - ? (matches single character)
- * - ! (negation)
- * - [] (character ranges)
- */
-declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
-type RobotsParser = ReturnType<typeof robotsParser>;
-interface RobotsTxtResult {
-    parser: RobotsParser;
-    path: string;
-}
-declare function createRobotsParser(content: string): RobotsParser;
-/**
- * Loads and parses robots.txt from custom path or standard locations
- * Returns the parser and the path where robots.txt was found
- */
-declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
-declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
-type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
-interface PathFilterResult {
-    included: boolean;
-    reason: PathFilterReason;
-}
-/**
- * Determines if a pathname should be included in the index
- * Returns both the decision and the reason for exclusion
- */
-declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
-/**
- * Convert file path to URL pathname
- *
- * Examples:
- *   index.html -> /
- *   about.html -> /about
- *   about/index.html -> /about/
- *   blog/post-1.html -> /blog/post-1
- *   blog/post-1/index.html -> /blog/post-1/
- *   server/pages/contact.html -> /contact
- *   server/app/about.html -> /about
- */
-declare function filePathToPathname(filePath: string): string;
 /**
  * Parse HTML content and convert it to a StructuredPage
  * @param html - HTML string to parse
@@ -102,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
  */
 declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
-export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };
+export { type ParseOptions, type StructuredPage, parseHTML };

package/dist/index.js CHANGED Viewed

@@ -44,133 +44,14 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 // src/index.ts
 var index_exports = {};
 __export(index_exports, {
-  createRobotsParser: () => createRobotsParser,
-  filePathToPathname: () => filePathToPathname,
-  isPathAllowedByRobots: () => isPathAllowedByRobots,
-  loadRobotsTxt: () => loadRobotsTxt,
-  matchesExcludePattern: () => matchesExcludePattern,
-  parseHTML: () => parseHTML,
-  shouldIncludePath: () => shouldIncludePath
+  parseHTML: () => parseHTML
 });
 module.exports = __toCommonJS(index_exports);
-// src/utils/excludePatterns.ts
-var import_matcher = require("matcher");
-function matchesExcludePattern(path, patterns) {
-  if (!patterns || patterns.length === 0) {
-    return false;
-  }
-  const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
-  const normalizedPath = normalize(path);
-  const normalizedPatterns = patterns.map(normalize);
-  return (0, import_matcher.isMatch)(normalizedPath, normalizedPatterns);
-}
-// src/utils/robotsParser.ts
-var import_fs = require("fs");
-var import_path = require("path");
-var import_robots_parser = __toESM(require("robots-parser"));
-function createRobotsParser(content) {
-  return (0, import_robots_parser.default)("https://robots.invalid/robots.txt", content);
-}
-function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
-  let robotsContent = null;
-  let foundPath = null;
-  if (robotsTxtPath) {
-    const customPath = (0, import_path.join)(projectDir, robotsTxtPath);
-    if ((0, import_fs.existsSync)(customPath)) {
-      robotsContent = (0, import_fs.readFileSync)(customPath, "utf-8");
-      foundPath = customPath;
-    }
-  }
-  if (!robotsContent) {
-    for (const searchPath of searchPaths) {
-      const fullPath = (0, import_path.join)(projectDir, searchPath);
-      if ((0, import_fs.existsSync)(fullPath)) {
-        robotsContent = (0, import_fs.readFileSync)(fullPath, "utf-8");
-        foundPath = fullPath;
-        break;
-      }
-    }
-  }
-  if (!robotsContent) {
-    return null;
-  }
-  return {
-    parser: createRobotsParser(robotsContent),
-    path: foundPath || ""
-  };
-}
-function isPathAllowedByRobots(path, robots) {
-  if (!robots) {
-    return true;
-  }
-  const normalizedPath = path.startsWith("/") ? path : `/${path}`;
-  const testUrl = `https://robots.invalid${normalizedPath}`;
-  const isAllowed = robots.isAllowed(testUrl, "*");
-  return isAllowed !== false;
-}
-// src/utils/pathFilter.ts
-function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
-  if (pathname.includes("[") && pathname.includes("]")) {
-    return { included: false, reason: "dynamic-route" };
-  }
-  if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
-    return { included: false, reason: "internal-route" };
-  }
-  if (pathname.endsWith(".rsc")) {
-    return { included: false, reason: "rsc-file" };
-  }
-  if (pathname.includes(".segments/")) {
-    return { included: false, reason: "segment-file" };
-  }
-  if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
-    return { included: false, reason: "static-asset" };
-  }
-  if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
-    return { included: false, reason: "robots-txt" };
-  }
-  if (matchesExcludePattern(pathname, excludePatterns)) {
-    return { included: false, reason: "exclude-pattern" };
-  }
-  return { included: true, reason: "included" };
-}
-// src/utils/pathUtils.ts
-var import_path2 = require("path");
-var artifactPrefixes = [
-  "server/pages/",
-  "server/app/",
-  "static/chunks/app/",
-  "static/chunks/pages/",
-  "static/",
-  "server/"
-];
-function filePathToPathname(filePath) {
-  let pathname = filePath.split(import_path2.sep).join("/");
-  for (const prefix of artifactPrefixes) {
-    if (pathname.startsWith(prefix)) {
-      pathname = pathname.substring(prefix.length);
-      break;
-    }
-  }
-  pathname = pathname.replace(/\.html?$/, "");
-  if (pathname === "index" || pathname === "") {
-    return "/";
-  }
-  if (pathname.endsWith("/index")) {
-    pathname = pathname.slice(0, -5);
-  }
-  if (!pathname.startsWith("/")) {
-    pathname = "/" + pathname;
-  }
-  return pathname;
-}
 // src/parseHtml.ts
 var import_logger = require("@peam-ai/logger");
 var import_jsdom = require("jsdom");
+var import_turndown = __toESM(require("turndown"));
 // src/parsers/cssSelectorParser.ts
 var CssSelectorParser = class {
@@ -299,10 +180,9 @@ var ReadabilityParser = class {
 var log = import_logger.loggers.parser;
 function parseHTML(html, options = {}) {
   if (!html || html.trim().length === 0) {
-    log.warn("Empty or invalid HTML input");
+    log.error("Empty or invalid HTML input");
     return void 0;
   }
-  log.debug("Starting parse with options", options);
   const dom = new import_jsdom.JSDOM(html);
   const document = dom.window.document;
   const cssSelectorParser = new CssSelectorParser();
@@ -310,24 +190,30 @@ function parseHTML(html, options = {}) {
   const readabilityParser = new ReadabilityParser();
   const readabilityStructuredPage = readabilityParser.parse(document, options);
   if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
-    log.warn("Failed to extract content");
+    log.error("Failed to extract content");
     return void 0;
   }
-  return __spreadValues(__spreadValues(__spreadValues({}, {
+  const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
     title: "",
     description: "",
     content: "",
     textContent: ""
   }), cssSelectorStructuredPage), readabilityStructuredPage);
+  if (mergedResult.content) {
+    try {
+      const turndownService = new import_turndown.default({
+        headingStyle: "atx",
+        codeBlockStyle: "fenced"
+      });
+      mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
+    } catch (error) {
+      log.error("Failed to convert content to markdown", error);
+    }
+  }
+  return mergedResult;
 }
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
-  createRobotsParser,
-  filePathToPathname,
-  isPathAllowedByRobots,
-  loadRobotsTxt,
-  matchesExcludePattern,
-  parseHTML,
-  shouldIncludePath
+  parseHTML
 });
 //# sourceMappingURL=index.js.map

package/dist/index.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/index.ts","../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n\nexport { matchesExcludePattern } from './utils/excludePatterns';\nexport { shouldIncludePath, type PathFilterReason, type PathFilterResult } from './utils/pathFilter';\nexport { filePathToPathname } from './utils/pathUtils';\nexport {\n createRobotsParser,\n isPathAllowedByRobots,\n loadRobotsTxt,\n type RobotsParser,\n type RobotsTxtResult,\n} from './utils/robotsParser';\n\nexport { parseHTML } from './parseHtml';\n","import { isMatch } from 'matcher';\n\n/*\n Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n /\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns \|\| patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/\n Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n /\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult \| null {\n let robotsContent: string \| null = null;\n let foundPath: string \| null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath \|\| '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser \| null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n \| 'included'\n \| 'dynamic-route'\n \| 'internal-route'\n \| 'rsc-file'\n \| 'segment-file'\n \| 'static-asset'\n \| 'robots-txt'\n \| 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/*\n Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n /\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser \| null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') \|\| pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico\|png\|jpg\|jpeg\|svg\|gif\|webp\|txt\|xml\|json\|css\|js\|woff\|woff2\|ttf\|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/\n Convert file path to URL pathname\n \n Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n /\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' \|\| pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/\n Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage \| undefined {\n if (!html \|\| html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title \|\|\n title.trim().length === 0 \|\|\n !textContent \|\|\n textContent.trim().length === 0 \|\|\n !content \|\|\n content.trim().length === 0 \|\|\n !description \|\|\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') \|\| href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') \|\| img.getAttribute('data-src') \|\| img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src \|\| '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt \|\| '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const { keepClasses, disableJSONLD } = options \|\| {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,qBAAwB;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,aAAO,wBAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,gBAAyC;AACzC,kBAAqB;AACrB,2BAAyB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,aAAO,qBAAAA,SAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,iBAAa,kBAAK,YAAY,aAAa;AACjD,YAAI,sBAAW,UAAU,GAAG;AAC1B,0BAAgB,wBAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,eAAW,kBAAK,YAAY,UAAU;AAC5C,cAAI,sBAAW,QAAQ,GAAG;AACxB,4BAAgB,wBAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,IAAAC,eAAoB;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,gBAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,oBAAwB;AACxB,mBAAsB;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAC;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["robotsParser","import_path","_a"]}
1	+ {"version":3,"sources":["../src/index.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { parseHTML } from './parseHtml';\nexport { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/*\n Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage \| undefined {\n if (!html \|\| html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title \|\|\n title.trim().length === 0 \|\|\n !textContent \|\|\n textContent.trim().length === 0 \|\|\n !content \|\|\n content.trim().length === 0 \|\|\n !description \|\|\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') \|\| href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') \|\| img.getAttribute('data-src') \|\| img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src \|\| '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt \|\| '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const { keepClasses, disableJSONLD } = options \|\| {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,oBAAwB;AACxB,mBAAsB;AACtB,sBAA4B;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a","TurndownService"]}

package/dist/index.mjs CHANGED Viewed

@@ -15,123 +15,10 @@ var __spreadValues = (a, b) => {
   return a;
 };
-// src/utils/excludePatterns.ts
-import { isMatch } from "matcher";
-function matchesExcludePattern(path, patterns) {
-  if (!patterns || patterns.length === 0) {
-    return false;
-  }
-  const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
-  const normalizedPath = normalize(path);
-  const normalizedPatterns = patterns.map(normalize);
-  return isMatch(normalizedPath, normalizedPatterns);
-}
-// src/utils/robotsParser.ts
-import { existsSync, readFileSync } from "fs";
-import { join } from "path";
-import robotsParser from "robots-parser";
-function createRobotsParser(content) {
-  return robotsParser("https://robots.invalid/robots.txt", content);
-}
-function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
-  let robotsContent = null;
-  let foundPath = null;
-  if (robotsTxtPath) {
-    const customPath = join(projectDir, robotsTxtPath);
-    if (existsSync(customPath)) {
-      robotsContent = readFileSync(customPath, "utf-8");
-      foundPath = customPath;
-    }
-  }
-  if (!robotsContent) {
-    for (const searchPath of searchPaths) {
-      const fullPath = join(projectDir, searchPath);
-      if (existsSync(fullPath)) {
-        robotsContent = readFileSync(fullPath, "utf-8");
-        foundPath = fullPath;
-        break;
-      }
-    }
-  }
-  if (!robotsContent) {
-    return null;
-  }
-  return {
-    parser: createRobotsParser(robotsContent),
-    path: foundPath || ""
-  };
-}
-function isPathAllowedByRobots(path, robots) {
-  if (!robots) {
-    return true;
-  }
-  const normalizedPath = path.startsWith("/") ? path : `/${path}`;
-  const testUrl = `https://robots.invalid${normalizedPath}`;
-  const isAllowed = robots.isAllowed(testUrl, "*");
-  return isAllowed !== false;
-}
-// src/utils/pathFilter.ts
-function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
-  if (pathname.includes("[") && pathname.includes("]")) {
-    return { included: false, reason: "dynamic-route" };
-  }
-  if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
-    return { included: false, reason: "internal-route" };
-  }
-  if (pathname.endsWith(".rsc")) {
-    return { included: false, reason: "rsc-file" };
-  }
-  if (pathname.includes(".segments/")) {
-    return { included: false, reason: "segment-file" };
-  }
-  if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
-    return { included: false, reason: "static-asset" };
-  }
-  if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
-    return { included: false, reason: "robots-txt" };
-  }
-  if (matchesExcludePattern(pathname, excludePatterns)) {
-    return { included: false, reason: "exclude-pattern" };
-  }
-  return { included: true, reason: "included" };
-}
-// src/utils/pathUtils.ts
-import { sep } from "path";
-var artifactPrefixes = [
-  "server/pages/",
-  "server/app/",
-  "static/chunks/app/",
-  "static/chunks/pages/",
-  "static/",
-  "server/"
-];
-function filePathToPathname(filePath) {
-  let pathname = filePath.split(sep).join("/");
-  for (const prefix of artifactPrefixes) {
-    if (pathname.startsWith(prefix)) {
-      pathname = pathname.substring(prefix.length);
-      break;
-    }
-  }
-  pathname = pathname.replace(/\.html?$/, "");
-  if (pathname === "index" || pathname === "") {
-    return "/";
-  }
-  if (pathname.endsWith("/index")) {
-    pathname = pathname.slice(0, -5);
-  }
-  if (!pathname.startsWith("/")) {
-    pathname = "/" + pathname;
-  }
-  return pathname;
-}
 // src/parseHtml.ts
 import { loggers } from "@peam-ai/logger";
 import { JSDOM } from "jsdom";
+import TurndownService from "turndown";
 // src/parsers/cssSelectorParser.ts
 var CssSelectorParser = class {
@@ -260,10 +147,9 @@ var ReadabilityParser = class {
 var log = loggers.parser;
 function parseHTML(html, options = {}) {
   if (!html || html.trim().length === 0) {
-    log.warn("Empty or invalid HTML input");
+    log.error("Empty or invalid HTML input");
     return void 0;
   }
-  log.debug("Starting parse with options", options);
   const dom = new JSDOM(html);
   const document = dom.window.document;
   const cssSelectorParser = new CssSelectorParser();
@@ -271,23 +157,29 @@ function parseHTML(html, options = {}) {
   const readabilityParser = new ReadabilityParser();
   const readabilityStructuredPage = readabilityParser.parse(document, options);
   if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
-    log.warn("Failed to extract content");
+    log.error("Failed to extract content");
     return void 0;
   }
-  return __spreadValues(__spreadValues(__spreadValues({}, {
+  const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
     title: "",
     description: "",
     content: "",
     textContent: ""
   }), cssSelectorStructuredPage), readabilityStructuredPage);
+  if (mergedResult.content) {
+    try {
+      const turndownService = new TurndownService({
+        headingStyle: "atx",
+        codeBlockStyle: "fenced"
+      });
+      mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
+    } catch (error) {
+      log.error("Failed to convert content to markdown", error);
+    }
+  }
+  return mergedResult;
 }
 export {
-  createRobotsParser,
-  filePathToPathname,
-  isPathAllowedByRobots,
-  loadRobotsTxt,
-  matchesExcludePattern,
-  parseHTML,
-  shouldIncludePath
+  parseHTML
 };
 //# sourceMappingURL=index.mjs.map

package/dist/index.mjs.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { isMatch } from 'matcher';\n\n/*\n Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n /\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns \|\| patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/\n Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n /\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult \| null {\n let robotsContent: string \| null = null;\n let foundPath: string \| null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath \|\| '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser \| null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n \| 'included'\n \| 'dynamic-route'\n \| 'internal-route'\n \| 'rsc-file'\n \| 'segment-file'\n \| 'static-asset'\n \| 'robots-txt'\n \| 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/*\n Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n /\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser \| null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') \|\| pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico\|png\|jpg\|jpeg\|svg\|gif\|webp\|txt\|xml\|json\|css\|js\|woff\|woff2\|ttf\|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/\n Convert file path to URL pathname\n \n Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n /\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' \|\| pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/\n Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage \| undefined {\n if (!html \|\| html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title \|\|\n title.trim().length === 0 \|\|\n !textContent \|\|\n textContent.trim().length === 0 \|\|\n !content \|\|\n content.trim().length === 0 \|\|\n !description \|\|\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') \|\| href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') \|\| img.getAttribute('data-src') \|\| img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src \|\| '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt \|\| '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const { keepClasses, disableJSONLD } = options \|\| {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,SAAO,QAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,SAAS,YAAY,oBAAoB;AACzC,SAAS,YAAY;AACrB,OAAO,kBAAkB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,SAAO,aAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,aAAa,KAAK,YAAY,aAAa;AACjD,QAAI,WAAW,UAAU,GAAG;AAC1B,sBAAgB,aAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,WAAW,KAAK,YAAY,UAAU;AAC5C,UAAI,WAAW,QAAQ,GAAG;AACxB,wBAAgB,aAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,SAAS,WAAW;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,GAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,SAAS,eAAe;AACxB,SAAS,aAAa;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["_a"]}
1	+ {"version":3,"sources":["../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/*\n Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage \| undefined {\n if (!html \|\| html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title \|\|\n title.trim().length === 0 \|\|\n !textContent \|\|\n textContent.trim().length === 0 \|\|\n !content \|\|\n content.trim().length === 0 \|\|\n !description \|\|\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') \|\| href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') \|\| img.getAttribute('data-src') \|\| img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src \|\| '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt \|\| '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage \| undefined {\n const { keepClasses, disableJSONLD } = options \|\| {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AACxB,SAAS,aAAa;AACtB,OAAO,qBAAqB;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a"]}

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@peam-ai/parser",
   "description": "Content parser for extracting page metadata",
-  "version": "0.1.3",
+  "version": "0.1.5",
   "main": "./dist/index.js",
   "module": "./dist/index.mjs",
   "types": "./dist/index.d.ts",
@@ -29,21 +29,23 @@
   },
   "devDependencies": {
     "@types/jsdom": "^27.0.0",
+    "@types/turndown": "^5.0.5",
     "tsup": "^8.2.4",
     "typescript": "^5.5.4"
   },
   "dependencies": {
     "@mozilla/readability": "^0.5.0",
     "jsdom": "^27.3.0",
-    "matcher": "^6.0.0",
-    "robots-parser": "^3.0.1",
-    "@peam-ai/logger": "0.1.3"
+    "turndown": "^7.2.2",
+    "@peam-ai/logger": "0.1.5"
   },
   "scripts": {
     "build": "tsup",
+    "build:watch": "tsup --watch",
     "clean": "rm -rf dist",
-    "test:eslint": "eslint \"src/**/*.ts*\"",
-    "test:prettier": "prettier --check \"src/**/*.ts*\"",
-    "test:jest": "jest"
+    "format": "prettier --write \"src/**/*.ts*\"",
+    "test:unit": "vitest run",
+    "test:lint": "eslint \"src/**/*.ts*\"",
+    "test:format": "prettier --check \"src/**/*.ts*\""
   }
 }