@peam-ai/parser 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +16 -3
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +16 -3
- package/dist/index.mjs.map +1 -1
- package/package.json +7 -5
package/dist/index.d.mts
CHANGED
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
|
@@ -171,6 +171,7 @@ function filePathToPathname(filePath) {
|
|
|
171
171
|
// src/parseHtml.ts
|
|
172
172
|
var import_logger = require("@peam-ai/logger");
|
|
173
173
|
var import_jsdom = require("jsdom");
|
|
174
|
+
var import_turndown = __toESM(require("turndown"));
|
|
174
175
|
|
|
175
176
|
// src/parsers/cssSelectorParser.ts
|
|
176
177
|
var CssSelectorParser = class {
|
|
@@ -299,7 +300,7 @@ var ReadabilityParser = class {
|
|
|
299
300
|
var log = import_logger.loggers.parser;
|
|
300
301
|
function parseHTML(html, options = {}) {
|
|
301
302
|
if (!html || html.trim().length === 0) {
|
|
302
|
-
log.warn("Empty or invalid HTML input");
|
|
303
|
+
log.error("Empty or invalid HTML input");
|
|
303
304
|
return void 0;
|
|
304
305
|
}
|
|
305
306
|
log.debug("Starting parse with options", options);
|
|
@@ -310,15 +311,27 @@ function parseHTML(html, options = {}) {
|
|
|
310
311
|
const readabilityParser = new ReadabilityParser();
|
|
311
312
|
const readabilityStructuredPage = readabilityParser.parse(document, options);
|
|
312
313
|
if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
|
|
313
|
-
log.warn("Failed to extract content");
|
|
314
|
+
log.error("Failed to extract content");
|
|
314
315
|
return void 0;
|
|
315
316
|
}
|
|
316
|
-
return __spreadValues(__spreadValues(__spreadValues({}, {
|
|
317
|
+
const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
|
|
317
318
|
title: "",
|
|
318
319
|
description: "",
|
|
319
320
|
content: "",
|
|
320
321
|
textContent: ""
|
|
321
322
|
}), cssSelectorStructuredPage), readabilityStructuredPage);
|
|
323
|
+
if (mergedResult.content) {
|
|
324
|
+
try {
|
|
325
|
+
const turndownService = new import_turndown.default({
|
|
326
|
+
headingStyle: "atx",
|
|
327
|
+
codeBlockStyle: "fenced"
|
|
328
|
+
});
|
|
329
|
+
mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
|
|
330
|
+
} catch (error) {
|
|
331
|
+
log.error("Failed to convert content to markdown", error);
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
return mergedResult;
|
|
322
335
|
}
|
|
323
336
|
// Annotate the CommonJS export names for ESM import in node:
|
|
324
337
|
0 && (module.exports = {
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n\nexport { matchesExcludePattern } from './utils/excludePatterns';\nexport { shouldIncludePath, type PathFilterReason, type PathFilterResult } from './utils/pathFilter';\nexport { filePathToPathname } from './utils/pathUtils';\nexport {\n createRobotsParser,\n isPathAllowedByRobots,\n loadRobotsTxt,\n type RobotsParser,\n type RobotsTxtResult,\n} from './utils/robotsParser';\n\nexport { parseHTML } from './parseHtml';\n","import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? 
p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? 
path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: 
false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = 
dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = 
document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n 
.filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,qBAAwB;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,aAAO,wBAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,gBAAyC;AACzC,kBAAqB;AACrB,2BAAyB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,aAAO,qBAAAA,SAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,iBAAa,kBAAK,YAAY,aAAa;AACjD,YAAI,sBAAW,UAAU,GAAG;AAC1B,0BAAgB,wBAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,eAAW,kBAAK,YAAY,UAAU;AAC5C,cAAI,sBAAW,QAAQ,GAAG;AACxB,4BAAgB,wBAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,IAAAC,eAAoB;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B
;AAC3D,MAAI,WAAW,SAAS,MAAM,gBAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,oBAAwB;AACxB,mBAAsB;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAC;AAsFkB,c
AAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QA
AM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["robotsParser","import_path","_a"]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n\nexport { matchesExcludePattern } from './utils/excludePatterns';\nexport { shouldIncludePath, type PathFilterReason, type PathFilterResult } from './utils/pathFilter';\nexport { filePathToPathname } from './utils/pathUtils';\nexport {\n createRobotsParser,\n isPathAllowedByRobots,\n loadRobotsTxt,\n type RobotsParser,\n type RobotsTxtResult,\n} from './utils/robotsParser';\n\nexport { parseHTML } from './parseHtml';\n","import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? 
p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? 
path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: 
false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new 
JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n 
description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n 
.map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: 
parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,qBAAwB;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,aAAO,wBAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,gBAAyC;AACzC,kBAAqB;AACrB,2BAAyB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,aAAO,qBAAAA,SAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,iBAAa,kBAAK,YAAY,aAAa;AACjD,YAAI,sBAAW,UAAU,GAAG;AAC1B,0BAAgB,wBAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,eAAW,kBAAK,YAAY,UAAU;AAC5C,cAAI,sBAAW,QAAQ,GAAG;AACxB,4BAAgB,wBAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EA
AE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,IAAAC,eAAoB;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,gBAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,oBAAwB;AACxB,mBAAsB;AACtB,sBAA4B;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,O
AAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAC;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;
;AF1BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["robotsParser","import_path","_a","TurndownService"]}
|
package/dist/index.mjs
CHANGED
|
@@ -132,6 +132,7 @@ function filePathToPathname(filePath) {
|
|
|
132
132
|
// src/parseHtml.ts
|
|
133
133
|
import { loggers } from "@peam-ai/logger";
|
|
134
134
|
import { JSDOM } from "jsdom";
|
|
135
|
+
import TurndownService from "turndown";
|
|
135
136
|
|
|
136
137
|
// src/parsers/cssSelectorParser.ts
|
|
137
138
|
var CssSelectorParser = class {
|
|
@@ -260,7 +261,7 @@ var ReadabilityParser = class {
|
|
|
260
261
|
var log = loggers.parser;
|
|
261
262
|
function parseHTML(html, options = {}) {
|
|
262
263
|
if (!html || html.trim().length === 0) {
|
|
263
|
-
log.
|
|
264
|
+
log.error("Empty or invalid HTML input");
|
|
264
265
|
return void 0;
|
|
265
266
|
}
|
|
266
267
|
log.debug("Starting parse with options", options);
|
|
@@ -271,15 +272,27 @@ function parseHTML(html, options = {}) {
|
|
|
271
272
|
const readabilityParser = new ReadabilityParser();
|
|
272
273
|
const readabilityStructuredPage = readabilityParser.parse(document, options);
|
|
273
274
|
if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
|
|
274
|
-
log.
|
|
275
|
+
log.error("Failed to extract content");
|
|
275
276
|
return void 0;
|
|
276
277
|
}
|
|
277
|
-
|
|
278
|
+
const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
|
|
278
279
|
title: "",
|
|
279
280
|
description: "",
|
|
280
281
|
content: "",
|
|
281
282
|
textContent: ""
|
|
282
283
|
}), cssSelectorStructuredPage), readabilityStructuredPage);
|
|
284
|
+
if (mergedResult.content) {
|
|
285
|
+
try {
|
|
286
|
+
const turndownService = new TurndownService({
|
|
287
|
+
headingStyle: "atx",
|
|
288
|
+
codeBlockStyle: "fenced"
|
|
289
|
+
});
|
|
290
|
+
mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
|
|
291
|
+
} catch (error) {
|
|
292
|
+
log.error("Failed to convert content to markdown", error);
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
return mergedResult;
|
|
283
296
|
}
|
|
284
297
|
export {
|
|
285
298
|
createRobotsParser,
|
package/dist/index.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of 
searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: 
false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML 
content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n 
content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = 
Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n 
contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,SAAO,QAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,SAAS,YAAY,oBAAoB;AACzC,SAAS,YAAY;AACrB,OAAO,kBAAkB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,SAAO,aAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,aAAa,KAAK,YAAY,aAAa;AACjD,QAAI,WAAW,UAAU,GAAG;AAC1B,sBAAgB,aAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,WAAW,KAAK,YAAY,UAAU;AAC5C,UAAI,WAAW,QAAQ,GAAG;AACxB,wBAAgB,aAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EAC
tD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,SAAS,WAAW;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,GAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,SAAS,eAAe;AACxB,SAAS,aAAa;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAA
gB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAA
c,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["_a"]}
|
|
1
|
+
{"version":3,"sources":["../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of 
searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: 
false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log 
= loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const 
content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = 
document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from 
'./parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,SAAO,QAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,SAAS,YAAY,oBAAoB;AACzC,SAAS,YAAY;AACrB,OAAO,kBAAkB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,SAAO,aAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,aAAa,KAAK,YAAY,aAAa;AACjD,QAAI,WAAW,UAAU,GAAG;AAC1B,sBAAgB,aAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,WAAW,KAAK,YAAY,UAAU;AAC5C,UAAI,WAAW,QAAQ,GAAG;AACxB,wBAAgB,aAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAA
gB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,SAAS,WAAW;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,GAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,SAAS,eAAe;AACxB,SAAS,aAAa;AACtB,OAAO,qBAAqB;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cA
AS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA
,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a"]}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@peam-ai/parser",
|
|
3
3
|
"description": "Content parser for extracting page metadata",
|
|
4
|
-
"version": "0.1.
|
|
4
|
+
"version": "0.1.4",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|
|
7
7
|
"types": "./dist/index.d.ts",
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|
|
31
31
|
"@types/jsdom": "^27.0.0",
|
|
32
|
+
"@types/turndown": "^5.0.5",
|
|
32
33
|
"tsup": "^8.2.4",
|
|
33
34
|
"typescript": "^5.5.4"
|
|
34
35
|
},
|
|
@@ -37,13 +38,14 @@
|
|
|
37
38
|
"jsdom": "^27.3.0",
|
|
38
39
|
"matcher": "^6.0.0",
|
|
39
40
|
"robots-parser": "^3.0.1",
|
|
40
|
-
"
|
|
41
|
+
"turndown": "^7.2.2",
|
|
42
|
+
"@peam-ai/logger": "0.1.4"
|
|
41
43
|
},
|
|
42
44
|
"scripts": {
|
|
43
45
|
"build": "tsup",
|
|
44
46
|
"clean": "rm -rf dist",
|
|
45
|
-
"test:
|
|
46
|
-
"test:
|
|
47
|
-
"test:
|
|
47
|
+
"test:unit": "vitest run",
|
|
48
|
+
"test:lint": "eslint \"src/**/*.ts*\"",
|
|
49
|
+
"test:prettier": "prettier --check \"src/**/*.ts*\""
|
|
48
50
|
}
|
|
49
51
|
}
|