@peam-ai/parser 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +1 -52
- package/dist/index.d.ts +1 -52
- package/dist/index.js +2 -129
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1 -122
- package/dist/index.mjs.map +1 -1
- package/package.json +5 -5
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import robotsParser from 'robots-parser';
|
|
2
|
-
|
|
3
1
|
interface StructuredPage {
|
|
4
2
|
title: string;
|
|
5
3
|
description: string;
|
|
@@ -46,55 +44,6 @@ interface ParseOptions {
|
|
|
46
44
|
extractImages?: boolean;
|
|
47
45
|
}
|
|
48
46
|
|
|
49
|
-
/**
|
|
50
|
-
* Checks if a path matches any of the given wildcard patterns
|
|
51
|
-
* Uses the matcher library which supports:
|
|
52
|
-
* - * (matches any characters except /)
|
|
53
|
-
* - ** (matches any characters including /)
|
|
54
|
-
* - ? (matches single character)
|
|
55
|
-
* - ! (negation)
|
|
56
|
-
* - [] (character ranges)
|
|
57
|
-
*/
|
|
58
|
-
declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
|
|
59
|
-
|
|
60
|
-
type RobotsParser = ReturnType<typeof robotsParser>;
|
|
61
|
-
interface RobotsTxtResult {
|
|
62
|
-
parser: RobotsParser;
|
|
63
|
-
path: string;
|
|
64
|
-
}
|
|
65
|
-
declare function createRobotsParser(content: string): RobotsParser;
|
|
66
|
-
/**
|
|
67
|
-
* Loads and parses robots.txt from custom path or standard locations
|
|
68
|
-
* Returns the parser and the path where robots.txt was found
|
|
69
|
-
*/
|
|
70
|
-
declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
|
|
71
|
-
declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
|
|
72
|
-
|
|
73
|
-
type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
|
|
74
|
-
interface PathFilterResult {
|
|
75
|
-
included: boolean;
|
|
76
|
-
reason: PathFilterReason;
|
|
77
|
-
}
|
|
78
|
-
/**
|
|
79
|
-
* Determines if a pathname should be included in the index
|
|
80
|
-
* Returns both the decision and the reason for exclusion
|
|
81
|
-
*/
|
|
82
|
-
declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
|
|
83
|
-
|
|
84
|
-
/**
|
|
85
|
-
* Convert file path to URL pathname
|
|
86
|
-
*
|
|
87
|
-
* Examples:
|
|
88
|
-
* index.html -> /
|
|
89
|
-
* about.html -> /about
|
|
90
|
-
* about/index.html -> /about/
|
|
91
|
-
* blog/post-1.html -> /blog/post-1
|
|
92
|
-
* blog/post-1/index.html -> /blog/post-1/
|
|
93
|
-
* server/pages/contact.html -> /contact
|
|
94
|
-
* server/app/about.html -> /about
|
|
95
|
-
*/
|
|
96
|
-
declare function filePathToPathname(filePath: string): string;
|
|
97
|
-
|
|
98
47
|
/**
|
|
99
48
|
* Parse HTML content and convert it to a StructuredPage
|
|
100
49
|
* @param html - HTML string to parse
|
|
@@ -103,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
|
|
|
103
52
|
*/
|
|
104
53
|
declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
|
|
105
54
|
|
|
106
|
-
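export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };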
|
|
55
|
+
export { type ParseOptions, type StructuredPage, parseHTML };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import robotsParser from 'robots-parser';
|
|
2
|
-
|
|
3
1
|
interface StructuredPage {
|
|
4
2
|
title: string;
|
|
5
3
|
description: string;
|
|
@@ -46,55 +44,6 @@ interface ParseOptions {
|
|
|
46
44
|
extractImages?: boolean;
|
|
47
45
|
}
|
|
48
46
|
|
|
49
|
-
/**
|
|
50
|
-
* Checks if a path matches any of the given wildcard patterns
|
|
51
|
-
* Uses the matcher library which supports:
|
|
52
|
-
* - * (matches any characters except /)
|
|
53
|
-
* - ** (matches any characters including /)
|
|
54
|
-
* - ? (matches single character)
|
|
55
|
-
* - ! (negation)
|
|
56
|
-
* - [] (character ranges)
|
|
57
|
-
*/
|
|
58
|
-
declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
|
|
59
|
-
|
|
60
|
-
type RobotsParser = ReturnType<typeof robotsParser>;
|
|
61
|
-
interface RobotsTxtResult {
|
|
62
|
-
parser: RobotsParser;
|
|
63
|
-
path: string;
|
|
64
|
-
}
|
|
65
|
-
declare function createRobotsParser(content: string): RobotsParser;
|
|
66
|
-
/**
|
|
67
|
-
* Loads and parses robots.txt from custom path or standard locations
|
|
68
|
-
* Returns the parser and the path where robots.txt was found
|
|
69
|
-
*/
|
|
70
|
-
declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
|
|
71
|
-
declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
|
|
72
|
-
|
|
73
|
-
type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
|
|
74
|
-
interface PathFilterResult {
|
|
75
|
-
included: boolean;
|
|
76
|
-
reason: PathFilterReason;
|
|
77
|
-
}
|
|
78
|
-
/**
|
|
79
|
-
* Determines if a pathname should be included in the index
|
|
80
|
-
* Returns both the decision and the reason for exclusion
|
|
81
|
-
*/
|
|
82
|
-
declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
|
|
83
|
-
|
|
84
|
-
/**
|
|
85
|
-
* Convert file path to URL pathname
|
|
86
|
-
*
|
|
87
|
-
* Examples:
|
|
88
|
-
* index.html -> /
|
|
89
|
-
* about.html -> /about
|
|
90
|
-
* about/index.html -> /about/
|
|
91
|
-
* blog/post-1.html -> /blog/post-1
|
|
92
|
-
* blog/post-1/index.html -> /blog/post-1/
|
|
93
|
-
* server/pages/contact.html -> /contact
|
|
94
|
-
* server/app/about.html -> /about
|
|
95
|
-
*/
|
|
96
|
-
declare function filePathToPathname(filePath: string): string;
|
|
97
|
-
|
|
98
47
|
/**
|
|
99
48
|
* Parse HTML content and convert it to a StructuredPage
|
|
100
49
|
* @param html - HTML string to parse
|
|
@@ -103,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
|
|
|
103
52
|
*/
|
|
104
53
|
declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
|
|
105
54
|
|
|
106
|
-
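export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };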
|
|
55
|
+
export { type ParseOptions, type StructuredPage, parseHTML };
|
package/dist/index.js
CHANGED
|
@@ -44,130 +44,10 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
44
44
|
// src/index.ts
|
|
45
45
|
var index_exports = {};
|
|
46
46
|
__export(index_exports, {
|
|
47
|
-
createRobotsParser: () => createRobotsParser,
|
48
|
-
filePathToPathname: () => filePathToPathname,
|
|
49
|
-
isPathAllowedByRobots: () => isPathAllowedByRobots,
|
|
50
|
-
loadRobotsTxt: () => loadRobotsTxt,
|
|
51
|
-
matchesExcludePattern: () => matchesExcludePattern,
|
|
52
|
-
parseHTML: () => parseHTML,
|
|
53
|
-
shouldIncludePath: () => shouldIncludePath
|
|
47
|
+
parseHTML: () => parseHTML
|
|
54
48
|
});
|
|
55
49
|
module.exports = __toCommonJS(index_exports);
|
|
56
50
|
|
|
57
|
-
// src/utils/excludePatterns.ts
|
|
58
|
-
var import_matcher = require("matcher");
|
|
59
|
-
function matchesExcludePattern(path, patterns) {
|
|
60
|
-
if (!patterns || patterns.length === 0) {
|
|
61
|
-
return false;
|
|
62
|
-
}
|
|
63
|
-
const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
|
|
64
|
-
const normalizedPath = normalize(path);
|
|
65
|
-
const normalizedPatterns = patterns.map(normalize);
|
|
66
|
-
return (0, import_matcher.isMatch)(normalizedPath, normalizedPatterns);
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
// src/utils/robotsParser.ts
|
|
70
|
-
var import_fs = require("fs");
|
|
71
|
-
var import_path = require("path");
|
|
72
|
-
var import_robots_parser = __toESM(require("robots-parser"));
|
|
73
|
-
function createRobotsParser(content) {
|
|
74
|
-
return (0, import_robots_parser.default)("https://robots.invalid/robots.txt", content);
|
|
75
|
-
}
|
|
76
|
-
function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
|
|
77
|
-
let robotsContent = null;
|
|
78
|
-
let foundPath = null;
|
|
79
|
-
if (robotsTxtPath) {
|
|
80
|
-
const customPath = (0, import_path.join)(projectDir, robotsTxtPath);
|
|
81
|
-
if ((0, import_fs.existsSync)(customPath)) {
|
|
82
|
-
robotsContent = (0, import_fs.readFileSync)(customPath, "utf-8");
|
|
83
|
-
foundPath = customPath;
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
if (!robotsContent) {
|
|
87
|
-
for (const searchPath of searchPaths) {
|
|
88
|
-
const fullPath = (0, import_path.join)(projectDir, searchPath);
|
|
89
|
-
if ((0, import_fs.existsSync)(fullPath)) {
|
|
90
|
-
robotsContent = (0, import_fs.readFileSync)(fullPath, "utf-8");
|
|
91
|
-
foundPath = fullPath;
|
|
92
|
-
break;
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
if (!robotsContent) {
|
|
97
|
-
return null;
|
|
98
|
-
}
|
|
99
|
-
return {
|
|
100
|
-
parser: createRobotsParser(robotsContent),
|
|
101
|
-
path: foundPath || ""
|
|
102
|
-
};
|
|
103
|
-
}
|
|
104
|
-
function isPathAllowedByRobots(path, robots) {
|
|
105
|
-
if (!robots) {
|
|
106
|
-
return true;
|
|
107
|
-
}
|
|
108
|
-
const normalizedPath = path.startsWith("/") ? path : `/${path}`;
|
|
109
|
-
const testUrl = `https://robots.invalid${normalizedPath}`;
|
|
110
|
-
const isAllowed = robots.isAllowed(testUrl, "*");
|
|
111
|
-
return isAllowed !== false;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
// src/utils/pathFilter.ts
|
|
115
|
-
function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
|
|
116
|
-
if (pathname.includes("[") && pathname.includes("]")) {
|
|
117
|
-
return { included: false, reason: "dynamic-route" };
|
|
118
|
-
}
|
|
119
|
-
if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
|
|
120
|
-
return { included: false, reason: "internal-route" };
|
|
121
|
-
}
|
|
122
|
-
if (pathname.endsWith(".rsc")) {
|
|
123
|
-
return { included: false, reason: "rsc-file" };
|
|
124
|
-
}
|
|
125
|
-
if (pathname.includes(".segments/")) {
|
|
126
|
-
return { included: false, reason: "segment-file" };
|
|
127
|
-
}
|
|
128
|
-
if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
|
|
129
|
-
return { included: false, reason: "static-asset" };
|
|
130
|
-
}
|
|
131
|
-
if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
|
|
132
|
-
return { included: false, reason: "robots-txt" };
|
|
133
|
-
}
|
|
134
|
-
if (matchesExcludePattern(pathname, excludePatterns)) {
|
|
135
|
-
return { included: false, reason: "exclude-pattern" };
|
|
136
|
-
}
|
|
137
|
-
return { included: true, reason: "included" };
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
// src/utils/pathUtils.ts
|
|
141
|
-
var import_path2 = require("path");
|
|
142
|
-
var artifactPrefixes = [
|
|
143
|
-
"server/pages/",
|
|
144
|
-
"server/app/",
|
|
145
|
-
"static/chunks/app/",
|
|
146
|
-
"static/chunks/pages/",
|
|
147
|
-
"static/",
|
|
148
|
-
"server/"
|
|
149
|
-
];
|
|
150
|
-
function filePathToPathname(filePath) {
|
|
151
|
-
let pathname = filePath.split(import_path2.sep).join("/");
|
|
152
|
-
for (const prefix of artifactPrefixes) {
|
|
153
|
-
if (pathname.startsWith(prefix)) {
|
|
154
|
-
pathname = pathname.substring(prefix.length);
|
|
155
|
-
break;
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
pathname = pathname.replace(/\.html?$/, "");
|
|
159
|
-
if (pathname === "index" || pathname === "") {
|
|
160
|
-
return "/";
|
|
161
|
-
}
|
|
162
|
-
if (pathname.endsWith("/index")) {
|
|
163
|
-
pathname = pathname.slice(0, -5);
|
|
164
|
-
}
|
|
165
|
-
if (!pathname.startsWith("/")) {
|
|
166
|
-
pathname = "/" + pathname;
|
|
167
|
-
}
|
|
168
|
-
return pathname;
|
|
169
|
-
}
|
|
170
|
-
|
|
171
51
|
// src/parseHtml.ts
|
|
172
52
|
var import_logger = require("@peam-ai/logger");
|
|
173
53
|
var import_jsdom = require("jsdom");
|
|
@@ -303,7 +183,6 @@ function parseHTML(html, options = {}) {
|
|
|
303
183
|
log.error("Empty or invalid HTML input");
|
|
304
184
|
return void 0;
|
|
305
185
|
}
|
|
306
|
-
log.debug("Starting parse with options", options);
|
|
307
186
|
const dom = new import_jsdom.JSDOM(html);
|
|
308
187
|
const document = dom.window.document;
|
|
309
188
|
const cssSelectorParser = new CssSelectorParser();
|
|
@@ -335,12 +214,6 @@ function parseHTML(html, options = {}) {
|
|
|
335
214
|
}
|
|
336
215
|
// Annotate the CommonJS export names for ESM import in node:
|
|
337
216
|
0 && (module.exports = {
|
|
338
|
-
createRobotsParser,
|
339
|
-
filePathToPathname,
|
|
340
|
-
isPathAllowedByRobots,
|
|
341
|
-
loadRobotsTxt,
|
|
342
|
-
matchesExcludePattern,
|
|
343
|
-
parseHTML,
|
|
344
|
-
shouldIncludePath
|
|
217
|
+
parseHTML
|
|
345
218
|
});
|
|
346
219
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n\nexport { matchesExcludePattern } from './utils/excludePatterns';\nexport { shouldIncludePath, type PathFilterReason, type PathFilterResult } from './utils/pathFilter';\nexport { filePathToPathname } from './utils/pathUtils';\nexport {\n createRobotsParser,\n isPathAllowedByRobots,\n loadRobotsTxt,\n type RobotsParser,\n type RobotsTxtResult,\n} from './utils/robotsParser';\n\nexport { parseHTML } from './parseHtml';\n","import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? 
p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? 
path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: 
false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new 
JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n 
description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n 
.map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: 
parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,qBAAwB;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,aAAO,wBAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,gBAAyC;AACzC,kBAAqB;AACrB,2BAAyB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,aAAO,qBAAAA,SAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,iBAAa,kBAAK,YAAY,aAAa;AACjD,YAAI,sBAAW,UAAU,GAAG;AAC1B,0BAAgB,wBAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,eAAW,kBAAK,YAAY,UAAU;AAC5C,cAAI,sBAAW,QAAQ,GAAG;AACxB,4BAAgB,wBAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EA
AE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,IAAAC,eAAoB;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,gBAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,oBAAwB;AACxB,mBAAsB;AACtB,sBAA4B;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,O
AAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAC;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;
;AF1BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["robotsParser","import_path","_a","TurndownService"]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { parseHTML } from './parseHtml';\nexport { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n 
});\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage 
= ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || 
img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,oBAAwB;AACxB,mBAAsB;AACtB,sBAA4B;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;A
AChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,a
AAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a","TurndownService"]}
|
package/dist/index.mjs
CHANGED
|
@@ -15,120 +15,6 @@ var __spreadValues = (a, b) => {
|
|
|
15
15
|
return a;
|
|
16
16
|
};
|
|
17
17
|
|
|
18
|
-
// src/utils/excludePatterns.ts
|
|
19
|
-
import { isMatch } from "matcher";
|
|
20
|
-
function matchesExcludePattern(path, patterns) {
|
|
21
|
-
if (!patterns || patterns.length === 0) {
|
|
22
|
-
return false;
|
|
23
|
-
}
|
|
24
|
-
const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
|
|
25
|
-
const normalizedPath = normalize(path);
|
|
26
|
-
const normalizedPatterns = patterns.map(normalize);
|
|
27
|
-
return isMatch(normalizedPath, normalizedPatterns);
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
// src/utils/robotsParser.ts
|
|
31
|
-
import { existsSync, readFileSync } from "fs";
|
|
32
|
-
import { join } from "path";
|
|
33
|
-
import robotsParser from "robots-parser";
|
|
34
|
-
function createRobotsParser(content) {
|
|
35
|
-
return robotsParser("https://robots.invalid/robots.txt", content);
|
|
36
|
-
}
|
|
37
|
-
function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
|
|
38
|
-
let robotsContent = null;
|
|
39
|
-
let foundPath = null;
|
|
40
|
-
if (robotsTxtPath) {
|
|
41
|
-
const customPath = join(projectDir, robotsTxtPath);
|
|
42
|
-
if (existsSync(customPath)) {
|
|
43
|
-
robotsContent = readFileSync(customPath, "utf-8");
|
|
44
|
-
foundPath = customPath;
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
if (!robotsContent) {
|
|
48
|
-
for (const searchPath of searchPaths) {
|
|
49
|
-
const fullPath = join(projectDir, searchPath);
|
|
50
|
-
if (existsSync(fullPath)) {
|
|
51
|
-
robotsContent = readFileSync(fullPath, "utf-8");
|
|
52
|
-
foundPath = fullPath;
|
|
53
|
-
break;
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
if (!robotsContent) {
|
|
58
|
-
return null;
|
|
59
|
-
}
|
|
60
|
-
return {
|
|
61
|
-
parser: createRobotsParser(robotsContent),
|
|
62
|
-
path: foundPath || ""
|
|
63
|
-
};
|
|
64
|
-
}
|
|
65
|
-
function isPathAllowedByRobots(path, robots) {
|
|
66
|
-
if (!robots) {
|
|
67
|
-
return true;
|
|
68
|
-
}
|
|
69
|
-
const normalizedPath = path.startsWith("/") ? path : `/${path}`;
|
|
70
|
-
const testUrl = `https://robots.invalid${normalizedPath}`;
|
|
71
|
-
const isAllowed = robots.isAllowed(testUrl, "*");
|
|
72
|
-
return isAllowed !== false;
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
// src/utils/pathFilter.ts
|
|
76
|
-
function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
|
|
77
|
-
if (pathname.includes("[") && pathname.includes("]")) {
|
|
78
|
-
return { included: false, reason: "dynamic-route" };
|
|
79
|
-
}
|
|
80
|
-
if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
|
|
81
|
-
return { included: false, reason: "internal-route" };
|
|
82
|
-
}
|
|
83
|
-
if (pathname.endsWith(".rsc")) {
|
|
84
|
-
return { included: false, reason: "rsc-file" };
|
|
85
|
-
}
|
|
86
|
-
if (pathname.includes(".segments/")) {
|
|
87
|
-
return { included: false, reason: "segment-file" };
|
|
88
|
-
}
|
|
89
|
-
if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
|
|
90
|
-
return { included: false, reason: "static-asset" };
|
|
91
|
-
}
|
|
92
|
-
if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
|
|
93
|
-
return { included: false, reason: "robots-txt" };
|
|
94
|
-
}
|
|
95
|
-
if (matchesExcludePattern(pathname, excludePatterns)) {
|
|
96
|
-
return { included: false, reason: "exclude-pattern" };
|
|
97
|
-
}
|
|
98
|
-
return { included: true, reason: "included" };
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
// src/utils/pathUtils.ts
|
|
102
|
-
import { sep } from "path";
|
|
103
|
-
var artifactPrefixes = [
|
|
104
|
-
"server/pages/",
|
|
105
|
-
"server/app/",
|
|
106
|
-
"static/chunks/app/",
|
|
107
|
-
"static/chunks/pages/",
|
|
108
|
-
"static/",
|
|
109
|
-
"server/"
|
|
110
|
-
];
|
|
111
|
-
function filePathToPathname(filePath) {
|
|
112
|
-
let pathname = filePath.split(sep).join("/");
|
|
113
|
-
for (const prefix of artifactPrefixes) {
|
|
114
|
-
if (pathname.startsWith(prefix)) {
|
|
115
|
-
pathname = pathname.substring(prefix.length);
|
|
116
|
-
break;
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
pathname = pathname.replace(/\.html?$/, "");
|
|
120
|
-
if (pathname === "index" || pathname === "") {
|
|
121
|
-
return "/";
|
|
122
|
-
}
|
|
123
|
-
if (pathname.endsWith("/index")) {
|
|
124
|
-
pathname = pathname.slice(0, -5);
|
|
125
|
-
}
|
|
126
|
-
if (!pathname.startsWith("/")) {
|
|
127
|
-
pathname = "/" + pathname;
|
|
128
|
-
}
|
|
129
|
-
return pathname;
|
|
130
|
-
}
|
|
131
|
-
|
|
132
18
|
// src/parseHtml.ts
|
|
133
19
|
import { loggers } from "@peam-ai/logger";
|
|
134
20
|
import { JSDOM } from "jsdom";
|
|
@@ -264,7 +150,6 @@ function parseHTML(html, options = {}) {
|
|
|
264
150
|
log.error("Empty or invalid HTML input");
|
|
265
151
|
return void 0;
|
|
266
152
|
}
|
|
267
|
-
log.debug("Starting parse with options", options);
|
|
268
153
|
const dom = new JSDOM(html);
|
|
269
154
|
const document = dom.window.document;
|
|
270
155
|
const cssSelectorParser = new CssSelectorParser();
|
|
@@ -295,12 +180,6 @@ function parseHTML(html, options = {}) {
|
|
|
295
180
|
return mergedResult;
|
|
296
181
|
}
|
|
297
182
|
export {
|
|
298
|
-
|
|
299
|
-
filePathToPathname,
|
|
300
|
-
isPathAllowedByRobots,
|
|
301
|
-
loadRobotsTxt,
|
|
302
|
-
matchesExcludePattern,
|
|
303
|
-
parseHTML,
|
|
304
|
-
shouldIncludePath
|
|
183
|
+
parseHTML
|
|
305
184
|
};
|
|
306
185
|
//# sourceMappingURL=index.mjs.map
|
package/dist/index.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of 
searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: 
false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log 
= loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const 
content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = 
document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from 
'./parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,SAAO,QAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,SAAS,YAAY,oBAAoB;AACzC,SAAS,YAAY;AACrB,OAAO,kBAAkB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,SAAO,aAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,aAAa,KAAK,YAAY,aAAa;AACjD,QAAI,WAAW,UAAU,GAAG;AAC1B,sBAAgB,aAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,WAAW,KAAK,YAAY,UAAU;AAC5C,UAAI,WAAW,QAAQ,GAAG;AACxB,wBAAgB,aAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAA
gB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,SAAS,WAAW;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,GAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,SAAS,eAAe;AACxB,SAAS,aAAa;AACtB,OAAO,qBAAqB;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cA
AS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA
,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a"]}
|
|
1
|
+
{"version":3,"sources":["../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n 
}\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = 
ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && 
!img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AACxB,SAAS,aAAa;AACtB,OAAO,qBAAqB;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,W
AAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,
SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a"]}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@peam-ai/parser",
|
|
3
3
|
"description": "Content parser for extracting page metadata",
|
|
4
|
-
"version": "0.1.
|
|
4
|
+
"version": "0.1.5",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|
|
7
7
|
"types": "./dist/index.d.ts",
|
|
@@ -36,16 +36,16 @@
|
|
|
36
36
|
"dependencies": {
|
|
37
37
|
"@mozilla/readability": "^0.5.0",
|
|
38
38
|
"jsdom": "^27.3.0",
|
|
39
|
-
"matcher": "^6.0.0",
|
|
40
|
-
"robots-parser": "^3.0.1",
|
|
41
39
|
"turndown": "^7.2.2",
|
|
42
|
-
"@peam-ai/logger": "0.1.
|
|
40
|
+
"@peam-ai/logger": "0.1.5"
|
|
43
41
|
},
|
|
44
42
|
"scripts": {
|
|
45
43
|
"build": "tsup",
|
|
44
|
+
"build:watch": "tsup --watch",
|
|
46
45
|
"clean": "rm -rf dist",
|
|
46
|
+
"format": "prettier --write \"src/**/*.ts*\"",
|
|
47
47
|
"test:unit": "vitest run",
|
|
48
48
|
"test:lint": "eslint \"src/**/*.ts*\"",
|
|
49
|
-
"test:
|
|
49
|
+
"test:format": "prettier --check \"src/**/*.ts*\""
|
|
50
50
|
}
|
|
51
51
|
}
|