@peam-ai/parser 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +2 -52
- package/dist/index.d.ts +2 -52
- package/dist/index.js +18 -132
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +17 -125
- package/dist/index.mjs.map +1 -1
- package/package.json +9 -7
package/dist/index.d.mts
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
import robotsParser from 'robots-parser';
|
|
2
|
-
|
|
3
1
|
interface StructuredPage {
|
|
4
2
|
title: string;
|
|
5
3
|
description: string;
|
|
6
4
|
content: string;
|
|
7
5
|
textContent: string;
|
|
6
|
+
markdownContent?: string;
|
|
8
7
|
contentLength?: number;
|
|
9
8
|
author?: string;
|
|
10
9
|
direction?: string;
|
|
@@ -45,55 +44,6 @@ interface ParseOptions {
|
|
|
45
44
|
extractImages?: boolean;
|
|
46
45
|
}
|
|
47
46
|
|
|
48
|
-
/**
|
|
49
|
-
* Checks if a path matches any of the given wildcard patterns
|
|
50
|
-
* Uses the matcher library which supports:
|
|
51
|
-
* - * (matches any characters except /)
|
|
52
|
-
* - ** (matches any characters including /)
|
|
53
|
-
* - ? (matches single character)
|
|
54
|
-
* - ! (negation)
|
|
55
|
-
* - [] (character ranges)
|
|
56
|
-
*/
|
|
57
|
-
declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
|
|
58
|
-
|
|
59
|
-
type RobotsParser = ReturnType<typeof robotsParser>;
|
|
60
|
-
interface RobotsTxtResult {
|
|
61
|
-
parser: RobotsParser;
|
|
62
|
-
path: string;
|
|
63
|
-
}
|
|
64
|
-
declare function createRobotsParser(content: string): RobotsParser;
|
|
65
|
-
/**
|
|
66
|
-
* Loads and parses robots.txt from custom path or standard locations
|
|
67
|
-
* Returns the parser and the path where robots.txt was found
|
|
68
|
-
*/
|
|
69
|
-
declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
|
|
70
|
-
declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
|
|
71
|
-
|
|
72
|
-
type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
|
|
73
|
-
interface PathFilterResult {
|
|
74
|
-
included: boolean;
|
|
75
|
-
reason: PathFilterReason;
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Determines if a pathname should be included in the index
|
|
79
|
-
* Returns both the decision and the reason for exclusion
|
|
80
|
-
*/
|
|
81
|
-
declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
|
|
82
|
-
|
|
83
|
-
/**
|
|
84
|
-
* Convert file path to URL pathname
|
|
85
|
-
*
|
|
86
|
-
* Examples:
|
|
87
|
-
* index.html -> /
|
|
88
|
-
* about.html -> /about
|
|
89
|
-
* about/index.html -> /about/
|
|
90
|
-
* blog/post-1.html -> /blog/post-1
|
|
91
|
-
* blog/post-1/index.html -> /blog/post-1/
|
|
92
|
-
* server/pages/contact.html -> /contact
|
|
93
|
-
* server/app/about.html -> /about
|
|
94
|
-
*/
|
|
95
|
-
declare function filePathToPathname(filePath: string): string;
|
|
96
|
-
|
|
97
47
|
/**
|
|
98
48
|
* Parse HTML content and convert it to a StructuredPage
|
|
99
49
|
* @param html - HTML string to parse
|
|
@@ -102,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
|
|
|
102
52
|
*/
|
|
103
53
|
declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
|
|
104
54
|
|
|
105
|
-
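export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };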
|
|
55
|
+
export { type ParseOptions, type StructuredPage, parseHTML };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
import robotsParser from 'robots-parser';
|
|
2
|
-
|
|
3
1
|
interface StructuredPage {
|
|
4
2
|
title: string;
|
|
5
3
|
description: string;
|
|
6
4
|
content: string;
|
|
7
5
|
textContent: string;
|
|
6
|
+
markdownContent?: string;
|
|
8
7
|
contentLength?: number;
|
|
9
8
|
author?: string;
|
|
10
9
|
direction?: string;
|
|
@@ -45,55 +44,6 @@ interface ParseOptions {
|
|
|
45
44
|
extractImages?: boolean;
|
|
46
45
|
}
|
|
47
46
|
|
|
48
|
-
/**
|
|
49
|
-
* Checks if a path matches any of the given wildcard patterns
|
|
50
|
-
* Uses the matcher library which supports:
|
|
51
|
-
* - * (matches any characters except /)
|
|
52
|
-
* - ** (matches any characters including /)
|
|
53
|
-
* - ? (matches single character)
|
|
54
|
-
* - ! (negation)
|
|
55
|
-
* - [] (character ranges)
|
|
56
|
-
*/
|
|
57
|
-
declare function matchesExcludePattern(path: string, patterns: string[]): boolean;
|
|
58
|
-
|
|
59
|
-
type RobotsParser = ReturnType<typeof robotsParser>;
|
|
60
|
-
interface RobotsTxtResult {
|
|
61
|
-
parser: RobotsParser;
|
|
62
|
-
path: string;
|
|
63
|
-
}
|
|
64
|
-
declare function createRobotsParser(content: string): RobotsParser;
|
|
65
|
-
/**
|
|
66
|
-
* Loads and parses robots.txt from custom path or standard locations
|
|
67
|
-
* Returns the parser and the path where robots.txt was found
|
|
68
|
-
*/
|
|
69
|
-
declare function loadRobotsTxt(projectDir: string, searchPaths: string[], robotsTxtPath?: string): RobotsTxtResult | null;
|
|
70
|
-
declare function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean;
|
|
71
|
-
|
|
72
|
-
type PathFilterReason = 'included' | 'dynamic-route' | 'internal-route' | 'rsc-file' | 'segment-file' | 'static-asset' | 'robots-txt' | 'exclude-pattern';
|
|
73
|
-
interface PathFilterResult {
|
|
74
|
-
included: boolean;
|
|
75
|
-
reason: PathFilterReason;
|
|
76
|
-
}
|
|
77
|
-
/**
|
|
78
|
-
* Determines if a pathname should be included in the index
|
|
79
|
-
* Returns both the decision and the reason for exclusion
|
|
80
|
-
*/
|
|
81
|
-
declare function shouldIncludePath(pathname: string, robots: RobotsParser | null, excludePatterns: string[], respectRobotsTxt: boolean): PathFilterResult;
|
|
82
|
-
|
|
83
|
-
/**
|
|
84
|
-
* Convert file path to URL pathname
|
|
85
|
-
*
|
|
86
|
-
* Examples:
|
|
87
|
-
* index.html -> /
|
|
88
|
-
* about.html -> /about
|
|
89
|
-
* about/index.html -> /about/
|
|
90
|
-
* blog/post-1.html -> /blog/post-1
|
|
91
|
-
* blog/post-1/index.html -> /blog/post-1/
|
|
92
|
-
* server/pages/contact.html -> /contact
|
|
93
|
-
* server/app/about.html -> /about
|
|
94
|
-
*/
|
|
95
|
-
declare function filePathToPathname(filePath: string): string;
|
|
96
|
-
|
|
97
47
|
/**
|
|
98
48
|
* Parse HTML content and convert it to a StructuredPage
|
|
99
49
|
* @param html - HTML string to parse
|
|
@@ -102,4 +52,4 @@ declare function filePathToPathname(filePath: string): string;
|
|
|
102
52
|
*/
|
|
103
53
|
declare function parseHTML(html: string, options?: ParseOptions): StructuredPage | undefined;
|
|
104
54
|
|
|
105
|
-
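export { type ParseOptions, type PathFilterReason, type PathFilterResult, type RobotsParser, type RobotsTxtResult, type StructuredPage, createRobotsParser, filePathToPathname, isPathAllowedByRobots, loadRobotsTxt, matchesExcludePattern, parseHTML, shouldIncludePath };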
|
|
55
|
+
export { type ParseOptions, type StructuredPage, parseHTML };
|
package/dist/index.js
CHANGED
|
@@ -44,133 +44,14 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
44
44
|
// src/index.ts
|
|
45
45
|
var index_exports = {};
|
|
46
46
|
__export(index_exports, {
|
|
47
|
-
|
|
48
|
-
filePathToPathname: () => filePathToPathname,
|
|
49
|
-
isPathAllowedByRobots: () => isPathAllowedByRobots,
|
|
50
|
-
loadRobotsTxt: () => loadRobotsTxt,
|
|
51
|
-
matchesExcludePattern: () => matchesExcludePattern,
|
|
52
|
-
parseHTML: () => parseHTML,
|
|
53
|
-
shouldIncludePath: () => shouldIncludePath
|
|
47
|
+
parseHTML: () => parseHTML
|
|
54
48
|
});
|
|
55
49
|
module.exports = __toCommonJS(index_exports);
|
|
56
50
|
|
|
57
|
-
// src/utils/excludePatterns.ts
|
|
58
|
-
var import_matcher = require("matcher");
|
|
59
|
-
function matchesExcludePattern(path, patterns) {
|
|
60
|
-
if (!patterns || patterns.length === 0) {
|
|
61
|
-
return false;
|
|
62
|
-
}
|
|
63
|
-
const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
|
|
64
|
-
const normalizedPath = normalize(path);
|
|
65
|
-
const normalizedPatterns = patterns.map(normalize);
|
|
66
|
-
return (0, import_matcher.isMatch)(normalizedPath, normalizedPatterns);
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
// src/utils/robotsParser.ts
|
|
70
|
-
var import_fs = require("fs");
|
|
71
|
-
var import_path = require("path");
|
|
72
|
-
var import_robots_parser = __toESM(require("robots-parser"));
|
|
73
|
-
function createRobotsParser(content) {
|
|
74
|
-
return (0, import_robots_parser.default)("https://robots.invalid/robots.txt", content);
|
|
75
|
-
}
|
|
76
|
-
function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
|
|
77
|
-
let robotsContent = null;
|
|
78
|
-
let foundPath = null;
|
|
79
|
-
if (robotsTxtPath) {
|
|
80
|
-
const customPath = (0, import_path.join)(projectDir, robotsTxtPath);
|
|
81
|
-
if ((0, import_fs.existsSync)(customPath)) {
|
|
82
|
-
robotsContent = (0, import_fs.readFileSync)(customPath, "utf-8");
|
|
83
|
-
foundPath = customPath;
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
if (!robotsContent) {
|
|
87
|
-
for (const searchPath of searchPaths) {
|
|
88
|
-
const fullPath = (0, import_path.join)(projectDir, searchPath);
|
|
89
|
-
if ((0, import_fs.existsSync)(fullPath)) {
|
|
90
|
-
robotsContent = (0, import_fs.readFileSync)(fullPath, "utf-8");
|
|
91
|
-
foundPath = fullPath;
|
|
92
|
-
break;
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
if (!robotsContent) {
|
|
97
|
-
return null;
|
|
98
|
-
}
|
|
99
|
-
return {
|
|
100
|
-
parser: createRobotsParser(robotsContent),
|
|
101
|
-
path: foundPath || ""
|
|
102
|
-
};
|
|
103
|
-
}
|
|
104
|
-
function isPathAllowedByRobots(path, robots) {
|
|
105
|
-
if (!robots) {
|
|
106
|
-
return true;
|
|
107
|
-
}
|
|
108
|
-
const normalizedPath = path.startsWith("/") ? path : `/${path}`;
|
|
109
|
-
const testUrl = `https://robots.invalid${normalizedPath}`;
|
|
110
|
-
const isAllowed = robots.isAllowed(testUrl, "*");
|
|
111
|
-
return isAllowed !== false;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
// src/utils/pathFilter.ts
|
|
115
|
-
function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
|
|
116
|
-
if (pathname.includes("[") && pathname.includes("]")) {
|
|
117
|
-
return { included: false, reason: "dynamic-route" };
|
|
118
|
-
}
|
|
119
|
-
if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
|
|
120
|
-
return { included: false, reason: "internal-route" };
|
|
121
|
-
}
|
|
122
|
-
if (pathname.endsWith(".rsc")) {
|
|
123
|
-
return { included: false, reason: "rsc-file" };
|
|
124
|
-
}
|
|
125
|
-
if (pathname.includes(".segments/")) {
|
|
126
|
-
return { included: false, reason: "segment-file" };
|
|
127
|
-
}
|
|
128
|
-
if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
|
|
129
|
-
return { included: false, reason: "static-asset" };
|
|
130
|
-
}
|
|
131
|
-
if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
|
|
132
|
-
return { included: false, reason: "robots-txt" };
|
|
133
|
-
}
|
|
134
|
-
if (matchesExcludePattern(pathname, excludePatterns)) {
|
|
135
|
-
return { included: false, reason: "exclude-pattern" };
|
|
136
|
-
}
|
|
137
|
-
return { included: true, reason: "included" };
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
// src/utils/pathUtils.ts
|
|
141
|
-
var import_path2 = require("path");
|
|
142
|
-
var artifactPrefixes = [
|
|
143
|
-
"server/pages/",
|
|
144
|
-
"server/app/",
|
|
145
|
-
"static/chunks/app/",
|
|
146
|
-
"static/chunks/pages/",
|
|
147
|
-
"static/",
|
|
148
|
-
"server/"
|
|
149
|
-
];
|
|
150
|
-
function filePathToPathname(filePath) {
|
|
151
|
-
let pathname = filePath.split(import_path2.sep).join("/");
|
|
152
|
-
for (const prefix of artifactPrefixes) {
|
|
153
|
-
if (pathname.startsWith(prefix)) {
|
|
154
|
-
pathname = pathname.substring(prefix.length);
|
|
155
|
-
break;
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
pathname = pathname.replace(/\.html?$/, "");
|
|
159
|
-
if (pathname === "index" || pathname === "") {
|
|
160
|
-
return "/";
|
|
161
|
-
}
|
|
162
|
-
if (pathname.endsWith("/index")) {
|
|
163
|
-
pathname = pathname.slice(0, -5);
|
|
164
|
-
}
|
|
165
|
-
if (!pathname.startsWith("/")) {
|
|
166
|
-
pathname = "/" + pathname;
|
|
167
|
-
}
|
|
168
|
-
return pathname;
|
|
169
|
-
}
|
|
170
|
-
|
|
171
51
|
// src/parseHtml.ts
|
|
172
52
|
var import_logger = require("@peam-ai/logger");
|
|
173
53
|
var import_jsdom = require("jsdom");
|
|
54
|
+
var import_turndown = __toESM(require("turndown"));
|
|
174
55
|
|
|
175
56
|
// src/parsers/cssSelectorParser.ts
|
|
176
57
|
var CssSelectorParser = class {
|
|
@@ -299,10 +180,9 @@ var ReadabilityParser = class {
|
|
|
299
180
|
var log = import_logger.loggers.parser;
|
|
300
181
|
function parseHTML(html, options = {}) {
|
|
301
182
|
if (!html || html.trim().length === 0) {
|
|
302
|
-
log.
|
|
183
|
+
log.error("Empty or invalid HTML input");
|
|
303
184
|
return void 0;
|
|
304
185
|
}
|
|
305
|
-
log.debug("Starting parse with options", options);
|
|
306
186
|
const dom = new import_jsdom.JSDOM(html);
|
|
307
187
|
const document = dom.window.document;
|
|
308
188
|
const cssSelectorParser = new CssSelectorParser();
|
|
@@ -310,24 +190,30 @@ function parseHTML(html, options = {}) {
|
|
|
310
190
|
const readabilityParser = new ReadabilityParser();
|
|
311
191
|
const readabilityStructuredPage = readabilityParser.parse(document, options);
|
|
312
192
|
if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
|
|
313
|
-
log.
|
|
193
|
+
log.error("Failed to extract content");
|
|
314
194
|
return void 0;
|
|
315
195
|
}
|
|
316
|
-
|
|
196
|
+
const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
|
|
317
197
|
title: "",
|
|
318
198
|
description: "",
|
|
319
199
|
content: "",
|
|
320
200
|
textContent: ""
|
|
321
201
|
}), cssSelectorStructuredPage), readabilityStructuredPage);
|
|
202
|
+
if (mergedResult.content) {
|
|
203
|
+
try {
|
|
204
|
+
const turndownService = new import_turndown.default({
|
|
205
|
+
headingStyle: "atx",
|
|
206
|
+
codeBlockStyle: "fenced"
|
|
207
|
+
});
|
|
208
|
+
mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
|
|
209
|
+
} catch (error) {
|
|
210
|
+
log.error("Failed to convert content to markdown", error);
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
return mergedResult;
|
|
322
214
|
}
|
|
323
215
|
// Annotate the CommonJS export names for ESM import in node:
|
|
324
216
|
0 && (module.exports = {
|
|
325
|
-
|
|
326
|
-
filePathToPathname,
|
|
327
|
-
isPathAllowedByRobots,
|
|
328
|
-
loadRobotsTxt,
|
|
329
|
-
matchesExcludePattern,
|
|
330
|
-
parseHTML,
|
|
331
|
-
shouldIncludePath
|
|
217
|
+
parseHTML
|
|
332
218
|
});
|
|
333
219
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n\nexport { matchesExcludePattern } from './utils/excludePatterns';\nexport { shouldIncludePath, type PathFilterReason, type PathFilterResult } from './utils/pathFilter';\nexport { filePathToPathname } from './utils/pathUtils';\nexport {\n createRobotsParser,\n isPathAllowedByRobots,\n loadRobotsTxt,\n type RobotsParser,\n type RobotsTxtResult,\n} from './utils/robotsParser';\n\nexport { parseHTML } from './parseHtml';\n","import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? 
p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? 
path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: 
false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = 
dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = 
document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n 
.filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,qBAAwB;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,aAAO,wBAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,gBAAyC;AACzC,kBAAqB;AACrB,2BAAyB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,aAAO,qBAAAA,SAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,iBAAa,kBAAK,YAAY,aAAa;AACjD,YAAI,sBAAW,UAAU,GAAG;AAC1B,0BAAgB,wBAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,eAAW,kBAAK,YAAY,UAAU;AAC5C,cAAI,sBAAW,QAAQ,GAAG;AACxB,4BAAgB,wBAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EACtD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,IAAAC,eAAoB;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B
;AAC3D,MAAI,WAAW,SAAS,MAAM,gBAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,oBAAwB;AACxB,mBAAsB;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAC;AAsFkB,c
AAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QA
AM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["robotsParser","import_path","_a"]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["export { parseHTML } from './parseHtml';\nexport { ParseOptions } from './parsers/parser';\nexport { StructuredPage } from './structuredPage';\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n 
});\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n }\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage 
= ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || 
img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,oBAAwB;AACxB,mBAAsB;AACtB,sBAA4B;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;A
AChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,yBAA4B;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,+BAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,sBAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,mBAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,a
AAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a","TurndownService"]}
|
package/dist/index.mjs
CHANGED
|
@@ -15,123 +15,10 @@ var __spreadValues = (a, b) => {
|
|
|
15
15
|
return a;
|
|
16
16
|
};
|
|
17
17
|
|
|
18
|
-
// src/utils/excludePatterns.ts
|
|
19
|
-
import { isMatch } from "matcher";
|
|
20
|
-
function matchesExcludePattern(path, patterns) {
|
|
21
|
-
if (!patterns || patterns.length === 0) {
|
|
22
|
-
return false;
|
|
23
|
-
}
|
|
24
|
-
const normalize = (p) => p.startsWith("/") ? p : `/${p}`;
|
|
25
|
-
const normalizedPath = normalize(path);
|
|
26
|
-
const normalizedPatterns = patterns.map(normalize);
|
|
27
|
-
return isMatch(normalizedPath, normalizedPatterns);
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
// src/utils/robotsParser.ts
|
|
31
|
-
import { existsSync, readFileSync } from "fs";
|
|
32
|
-
import { join } from "path";
|
|
33
|
-
import robotsParser from "robots-parser";
|
|
34
|
-
function createRobotsParser(content) {
|
|
35
|
-
return robotsParser("https://robots.invalid/robots.txt", content);
|
|
36
|
-
}
|
|
37
|
-
function loadRobotsTxt(projectDir, searchPaths, robotsTxtPath) {
|
|
38
|
-
let robotsContent = null;
|
|
39
|
-
let foundPath = null;
|
|
40
|
-
if (robotsTxtPath) {
|
|
41
|
-
const customPath = join(projectDir, robotsTxtPath);
|
|
42
|
-
if (existsSync(customPath)) {
|
|
43
|
-
robotsContent = readFileSync(customPath, "utf-8");
|
|
44
|
-
foundPath = customPath;
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
if (!robotsContent) {
|
|
48
|
-
for (const searchPath of searchPaths) {
|
|
49
|
-
const fullPath = join(projectDir, searchPath);
|
|
50
|
-
if (existsSync(fullPath)) {
|
|
51
|
-
robotsContent = readFileSync(fullPath, "utf-8");
|
|
52
|
-
foundPath = fullPath;
|
|
53
|
-
break;
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
if (!robotsContent) {
|
|
58
|
-
return null;
|
|
59
|
-
}
|
|
60
|
-
return {
|
|
61
|
-
parser: createRobotsParser(robotsContent),
|
|
62
|
-
path: foundPath || ""
|
|
63
|
-
};
|
|
64
|
-
}
|
|
65
|
-
function isPathAllowedByRobots(path, robots) {
|
|
66
|
-
if (!robots) {
|
|
67
|
-
return true;
|
|
68
|
-
}
|
|
69
|
-
const normalizedPath = path.startsWith("/") ? path : `/${path}`;
|
|
70
|
-
const testUrl = `https://robots.invalid${normalizedPath}`;
|
|
71
|
-
const isAllowed = robots.isAllowed(testUrl, "*");
|
|
72
|
-
return isAllowed !== false;
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
// src/utils/pathFilter.ts
|
|
76
|
-
function shouldIncludePath(pathname, robots, excludePatterns, respectRobotsTxt) {
|
|
77
|
-
if (pathname.includes("[") && pathname.includes("]")) {
|
|
78
|
-
return { included: false, reason: "dynamic-route" };
|
|
79
|
-
}
|
|
80
|
-
if (pathname.startsWith("/_not-found") || pathname.startsWith("/_global-error")) {
|
|
81
|
-
return { included: false, reason: "internal-route" };
|
|
82
|
-
}
|
|
83
|
-
if (pathname.endsWith(".rsc")) {
|
|
84
|
-
return { included: false, reason: "rsc-file" };
|
|
85
|
-
}
|
|
86
|
-
if (pathname.includes(".segments/")) {
|
|
87
|
-
return { included: false, reason: "segment-file" };
|
|
88
|
-
}
|
|
89
|
-
if (pathname.match(/\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {
|
|
90
|
-
return { included: false, reason: "static-asset" };
|
|
91
|
-
}
|
|
92
|
-
if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {
|
|
93
|
-
return { included: false, reason: "robots-txt" };
|
|
94
|
-
}
|
|
95
|
-
if (matchesExcludePattern(pathname, excludePatterns)) {
|
|
96
|
-
return { included: false, reason: "exclude-pattern" };
|
|
97
|
-
}
|
|
98
|
-
return { included: true, reason: "included" };
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
// src/utils/pathUtils.ts
|
|
102
|
-
import { sep } from "path";
|
|
103
|
-
var artifactPrefixes = [
|
|
104
|
-
"server/pages/",
|
|
105
|
-
"server/app/",
|
|
106
|
-
"static/chunks/app/",
|
|
107
|
-
"static/chunks/pages/",
|
|
108
|
-
"static/",
|
|
109
|
-
"server/"
|
|
110
|
-
];
|
|
111
|
-
function filePathToPathname(filePath) {
|
|
112
|
-
let pathname = filePath.split(sep).join("/");
|
|
113
|
-
for (const prefix of artifactPrefixes) {
|
|
114
|
-
if (pathname.startsWith(prefix)) {
|
|
115
|
-
pathname = pathname.substring(prefix.length);
|
|
116
|
-
break;
|
|
117
|
-
}
|
|
118
|
-
}
|
|
119
|
-
pathname = pathname.replace(/\.html?$/, "");
|
|
120
|
-
if (pathname === "index" || pathname === "") {
|
|
121
|
-
return "/";
|
|
122
|
-
}
|
|
123
|
-
if (pathname.endsWith("/index")) {
|
|
124
|
-
pathname = pathname.slice(0, -5);
|
|
125
|
-
}
|
|
126
|
-
if (!pathname.startsWith("/")) {
|
|
127
|
-
pathname = "/" + pathname;
|
|
128
|
-
}
|
|
129
|
-
return pathname;
|
|
130
|
-
}
|
|
131
|
-
|
|
132
18
|
// src/parseHtml.ts
|
|
133
19
|
import { loggers } from "@peam-ai/logger";
|
|
134
20
|
import { JSDOM } from "jsdom";
|
|
21
|
+
import TurndownService from "turndown";
|
|
135
22
|
|
|
136
23
|
// src/parsers/cssSelectorParser.ts
|
|
137
24
|
var CssSelectorParser = class {
|
|
@@ -260,10 +147,9 @@ var ReadabilityParser = class {
|
|
|
260
147
|
var log = loggers.parser;
|
|
261
148
|
function parseHTML(html, options = {}) {
|
|
262
149
|
if (!html || html.trim().length === 0) {
|
|
263
|
-
log.
|
|
150
|
+
log.error("Empty or invalid HTML input");
|
|
264
151
|
return void 0;
|
|
265
152
|
}
|
|
266
|
-
log.debug("Starting parse with options", options);
|
|
267
153
|
const dom = new JSDOM(html);
|
|
268
154
|
const document = dom.window.document;
|
|
269
155
|
const cssSelectorParser = new CssSelectorParser();
|
|
@@ -271,23 +157,29 @@ function parseHTML(html, options = {}) {
|
|
|
271
157
|
const readabilityParser = new ReadabilityParser();
|
|
272
158
|
const readabilityStructuredPage = readabilityParser.parse(document, options);
|
|
273
159
|
if (!cssSelectorStructuredPage && !readabilityStructuredPage) {
|
|
274
|
-
log.
|
|
160
|
+
log.error("Failed to extract content");
|
|
275
161
|
return void 0;
|
|
276
162
|
}
|
|
277
|
-
|
|
163
|
+
const mergedResult = __spreadValues(__spreadValues(__spreadValues({}, {
|
|
278
164
|
title: "",
|
|
279
165
|
description: "",
|
|
280
166
|
content: "",
|
|
281
167
|
textContent: ""
|
|
282
168
|
}), cssSelectorStructuredPage), readabilityStructuredPage);
|
|
169
|
+
if (mergedResult.content) {
|
|
170
|
+
try {
|
|
171
|
+
const turndownService = new TurndownService({
|
|
172
|
+
headingStyle: "atx",
|
|
173
|
+
codeBlockStyle: "fenced"
|
|
174
|
+
});
|
|
175
|
+
mergedResult.markdownContent = turndownService.turndown(mergedResult.content);
|
|
176
|
+
} catch (error) {
|
|
177
|
+
log.error("Failed to convert content to markdown", error);
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
return mergedResult;
|
|
283
181
|
}
|
|
284
182
|
export {
|
|
285
|
-
|
|
286
|
-
filePathToPathname,
|
|
287
|
-
isPathAllowedByRobots,
|
|
288
|
-
loadRobotsTxt,
|
|
289
|
-
matchesExcludePattern,
|
|
290
|
-
parseHTML,
|
|
291
|
-
shouldIncludePath
|
|
183
|
+
parseHTML
|
|
292
184
|
};
|
|
293
185
|
//# sourceMappingURL=index.mjs.map
|
package/dist/index.mjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/utils/excludePatterns.ts","../src/utils/robotsParser.ts","../src/utils/pathFilter.ts","../src/utils/pathUtils.ts","../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { isMatch } from 'matcher';\n\n/**\n * Checks if a path matches any of the given wildcard patterns\n * Uses the matcher library which supports:\n * - * (matches any characters except /)\n * - ** (matches any characters including /)\n * - ? (matches single character)\n * - ! (negation)\n * - [] (character ranges)\n */\nexport function matchesExcludePattern(path: string, patterns: string[]): boolean {\n if (!patterns || patterns.length === 0) {\n return false;\n }\n\n const normalize = (p: string) => (p.startsWith('/') ? p : `/${p}`);\n const normalizedPath = normalize(path);\n const normalizedPatterns = patterns.map(normalize);\n\n return isMatch(normalizedPath, normalizedPatterns);\n}\n","import { existsSync, readFileSync } from 'fs';\nimport { join } from 'path';\nimport robotsParser from 'robots-parser';\n\nexport type RobotsParser = ReturnType<typeof robotsParser>;\n\nexport interface RobotsTxtResult {\n parser: RobotsParser;\n path: string;\n}\n\nexport function createRobotsParser(content: string): RobotsParser {\n return robotsParser('https://robots.invalid/robots.txt', content);\n}\n\n/**\n * Loads and parses robots.txt from custom path or standard locations\n * Returns the parser and the path where robots.txt was found\n */\nexport function loadRobotsTxt(\n projectDir: string,\n searchPaths: string[],\n robotsTxtPath?: string\n): RobotsTxtResult | null {\n let robotsContent: string | null = null;\n let foundPath: string | null = null;\n\n if (robotsTxtPath) {\n const customPath = join(projectDir, robotsTxtPath);\n if (existsSync(customPath)) {\n robotsContent = readFileSync(customPath, 'utf-8');\n foundPath = customPath;\n }\n }\n\n if (!robotsContent) {\n for (const searchPath of 
searchPaths) {\n const fullPath = join(projectDir, searchPath);\n if (existsSync(fullPath)) {\n robotsContent = readFileSync(fullPath, 'utf-8');\n foundPath = fullPath;\n break;\n }\n }\n }\n\n if (!robotsContent) {\n return null;\n }\n\n return {\n parser: createRobotsParser(robotsContent),\n path: foundPath || '',\n };\n}\n\nexport function isPathAllowedByRobots(path: string, robots: RobotsParser | null): boolean {\n if (!robots) {\n return true;\n }\n\n const normalizedPath = path.startsWith('/') ? path : `/${path}`;\n const testUrl = `https://robots.invalid${normalizedPath}`;\n const isAllowed = robots.isAllowed(testUrl, '*');\n\n return isAllowed !== false;\n}\n","import { matchesExcludePattern } from './excludePatterns';\nimport type { RobotsParser } from './robotsParser';\nimport { isPathAllowedByRobots } from './robotsParser';\n\nexport type PathFilterReason =\n | 'included'\n | 'dynamic-route'\n | 'internal-route'\n | 'rsc-file'\n | 'segment-file'\n | 'static-asset'\n | 'robots-txt'\n | 'exclude-pattern';\n\nexport interface PathFilterResult {\n included: boolean;\n reason: PathFilterReason;\n}\n\n/**\n * Determines if a pathname should be included in the index\n * Returns both the decision and the reason for exclusion\n */\nexport function shouldIncludePath(\n pathname: string,\n robots: RobotsParser | null,\n excludePatterns: string[],\n respectRobotsTxt: boolean\n): PathFilterResult {\n // Exclude routes with dynamic parameters (e.g., /session/[session_id])\n if (pathname.includes('[') && pathname.includes(']')) {\n return { included: false, reason: 'dynamic-route' };\n }\n\n // Exclude Next.js internal routes\n if (pathname.startsWith('/_not-found') || pathname.startsWith('/_global-error')) {\n return { included: false, reason: 'internal-route' };\n }\n\n // Exclude RSC files\n if (pathname.endsWith('.rsc')) {\n return { included: false, reason: 'rsc-file' };\n }\n\n // Exclude segment files\n if (pathname.includes('.segments/')) {\n return { included: 
false, reason: 'segment-file' };\n }\n\n // Exclude static assets\n if (pathname.match(/\\.(ico|png|jpg|jpeg|svg|gif|webp|txt|xml|json|css|js|woff|woff2|ttf|eot)$/i)) {\n return { included: false, reason: 'static-asset' };\n }\n\n // Check robots.txt rules\n if (respectRobotsTxt && !isPathAllowedByRobots(pathname, robots)) {\n return { included: false, reason: 'robots-txt' };\n }\n\n // Check user-defined exclude patterns\n if (matchesExcludePattern(pathname, excludePatterns)) {\n return { included: false, reason: 'exclude-pattern' };\n }\n\n return { included: true, reason: 'included' };\n}\n","import { sep } from 'path';\n\nconst artifactPrefixes = [\n 'server/pages/',\n 'server/app/',\n 'static/chunks/app/',\n 'static/chunks/pages/',\n 'static/',\n 'server/',\n];\n\n/**\n * Convert file path to URL pathname\n *\n * Examples:\n * index.html -> /\n * about.html -> /about\n * about/index.html -> /about/\n * blog/post-1.html -> /blog/post-1\n * blog/post-1/index.html -> /blog/post-1/\n * server/pages/contact.html -> /contact\n * server/app/about.html -> /about\n */\nexport function filePathToPathname(filePath: string): string {\n let pathname = filePath.split(sep).join('/');\n\n for (const prefix of artifactPrefixes) {\n if (pathname.startsWith(prefix)) {\n pathname = pathname.substring(prefix.length);\n break;\n }\n }\n\n pathname = pathname.replace(/\\.html?$/, '');\n\n if (pathname === 'index' || pathname === '') {\n return '/';\n }\n\n if (pathname.endsWith('/index')) {\n pathname = pathname.slice(0, -5);\n }\n\n if (!pathname.startsWith('/')) {\n pathname = '/' + pathname;\n }\n\n return pathname;\n}\n","import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML 
content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.warn('Empty or invalid HTML input');\n return undefined;\n }\n\n log.debug('Starting parse with options', options);\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.warn('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n return {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n 
content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = 
Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && !img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n 
contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AAWjB,SAAS,sBAAsB,MAAc,UAA6B;AAC/E,MAAI,CAAC,YAAY,SAAS,WAAW,GAAG;AACtC,WAAO;AAAA,EACT;AAEA,QAAM,YAAY,CAAC,MAAe,EAAE,WAAW,GAAG,IAAI,IAAI,IAAI,CAAC;AAC/D,QAAM,iBAAiB,UAAU,IAAI;AACrC,QAAM,qBAAqB,SAAS,IAAI,SAAS;AAEjD,SAAO,QAAQ,gBAAgB,kBAAkB;AACnD;;;ACrBA,SAAS,YAAY,oBAAoB;AACzC,SAAS,YAAY;AACrB,OAAO,kBAAkB;AASlB,SAAS,mBAAmB,SAA+B;AAChE,SAAO,aAAa,qCAAqC,OAAO;AAClE;AAMO,SAAS,cACd,YACA,aACA,eACwB;AACxB,MAAI,gBAA+B;AACnC,MAAI,YAA2B;AAE/B,MAAI,eAAe;AACjB,UAAM,aAAa,KAAK,YAAY,aAAa;AACjD,QAAI,WAAW,UAAU,GAAG;AAC1B,sBAAgB,aAAa,YAAY,OAAO;AAChD,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,eAAW,cAAc,aAAa;AACpC,YAAM,WAAW,KAAK,YAAY,UAAU;AAC5C,UAAI,WAAW,QAAQ,GAAG;AACxB,wBAAgB,aAAa,UAAU,OAAO;AAC9C,oBAAY;AACZ;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,eAAe;AAClB,WAAO;AAAA,EACT;AAEA,SAAO;AAAA,IACL,QAAQ,mBAAmB,aAAa;AAAA,IACxC,MAAM,aAAa;AAAA,EACrB;AACF;AAEO,SAAS,sBAAsB,MAAc,QAAsC;AACxF,MAAI,CAAC,QAAQ;AACX,WAAO;AAAA,EACT;AAEA,QAAM,iBAAiB,KAAK,WAAW,GAAG,IAAI,OAAO,IAAI,IAAI;AAC7D,QAAM,UAAU,yBAAyB,cAAc;AACvD,QAAM,YAAY,OAAO,UAAU,SAAS,GAAG;AAE/C,SAAO,cAAc;AACvB;;;AC3CO,SAAS,kBACd,UACA,QACA,iBACA,kBACkB;AAElB,MAAI,SAAS,SAAS,GAAG,KAAK,SAAS,SAAS,GAAG,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,gBAAgB;AAAA,EACpD;AAGA,MAAI,SAAS,WAAW,aAAa,KAAK,SAAS,WAAW,gBAAgB,GAAG;AAC/E,WAAO,EAAE,UAAU,OAAO,QAAQ,iBAAiB;AAAA,EACrD;AAGA,MAAI,SAAS,SAAS,MAAM,GAAG;AAC7B,WAAO,EAAE,UAAU,OAAO,QAAQ,WAAW;AAAA,EAC/C;AAGA,MAAI,SAAS,SAAS,YAAY,GAAG;AACnC,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,SAAS,MAAM,4EAA4E,GAAG;AAChG,WAAO,EAAE,UAAU,OAAO,QAAQ,eAAe;AAAA,EACnD;AAGA,MAAI,oBAAoB,CAAC,sBAAsB,UAAU,MAAM,GAAG;AAChE,WAAO,EAAE,UAAU,OAAO,QAAQ,aAAa;AAAA,EACjD;AAGA,MAAI,sBAAsB,UAAU,eAAe,GAAG;AACpD,WAAO,EAAE,UAAU,OAAO,QAAQ,kBAAkB;AAAA,EAC
tD;AAEA,SAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAC9C;;;ACjEA,SAAS,WAAW;AAEpB,IAAM,mBAAmB;AAAA,EACvB;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAcO,SAAS,mBAAmB,UAA0B;AAC3D,MAAI,WAAW,SAAS,MAAM,GAAG,EAAE,KAAK,GAAG;AAE3C,aAAW,UAAU,kBAAkB;AACrC,QAAI,SAAS,WAAW,MAAM,GAAG;AAC/B,iBAAW,SAAS,UAAU,OAAO,MAAM;AAC3C;AAAA,IACF;AAAA,EACF;AAEA,aAAW,SAAS,QAAQ,YAAY,EAAE;AAE1C,MAAI,aAAa,WAAW,aAAa,IAAI;AAC3C,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,SAAS,QAAQ,GAAG;AAC/B,eAAW,SAAS,MAAM,GAAG,EAAE;AAAA,EACjC;AAEA,MAAI,CAAC,SAAS,WAAW,GAAG,GAAG;AAC7B,eAAW,MAAM;AAAA,EACnB;AAEA,SAAO;AACT;;;AChDA,SAAS,eAAe;AACxB,SAAS,aAAa;;;ACEf,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAA
gB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF3BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAA
c,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,KAAK,6BAA6B;AACtC,WAAO;AAAA,EACT;AAEA,MAAI,MAAM,+BAA+B,OAAO;AAChD,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,KAAK,2BAA2B;AACpC,WAAO;AAAA,EACT;AAGA,SAAO,iDACF;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAEP;","names":["_a"]}
|
|
1
|
+
{"version":3,"sources":["../src/parseHtml.ts","../src/parsers/cssSelectorParser.ts","../src/parsers/readabilityParser.ts"],"sourcesContent":["import { loggers } from '@peam-ai/logger';\nimport { JSDOM } from 'jsdom';\nimport TurndownService from 'turndown';\nimport { CssSelectorParser } from './parsers/cssSelectorParser';\nimport { ParseOptions } from './parsers/parser';\nimport { ReadabilityParser } from './parsers/readabilityParser';\nimport { StructuredPage } from './structuredPage';\n\nconst log = loggers.parser;\n\n/**\n * Parse HTML content and convert it to a StructuredPage\n * @param html - HTML string to parse\n * @param options - Parsing options\n * @returns StructuredPage or undefined if parsing fails\n */\nexport function parseHTML(html: string, options: ParseOptions = {}): StructuredPage | undefined {\n if (!html || html.trim().length === 0) {\n log.error('Empty or invalid HTML input');\n return undefined;\n }\n\n const dom = new JSDOM(html);\n const document = dom.window.document;\n\n const cssSelectorParser = new CssSelectorParser();\n const cssSelectorStructuredPage = cssSelectorParser.parse(document, options);\n const readabilityParser = new ReadabilityParser();\n const readabilityStructuredPage = readabilityParser.parse(document, options);\n\n if (!cssSelectorStructuredPage && !readabilityStructuredPage) {\n log.error('Failed to extract content');\n return undefined;\n }\n\n // Merge results, prioritizing Readability data\n const mergedResult: StructuredPage = {\n ...{\n title: '',\n description: '',\n content: '',\n textContent: '',\n },\n ...cssSelectorStructuredPage,\n ...readabilityStructuredPage,\n };\n\n // Convert HTML content to markdown\n if (mergedResult.content) {\n try {\n const turndownService = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n });\n mergedResult.markdownContent = turndownService.turndown(mergedResult.content);\n } catch (error) {\n log.error('Failed to convert content to markdown', error);\n 
}\n }\n\n return mergedResult;\n}\n","import { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class CssSelectorParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const title = document.querySelector('title')?.textContent;\n const description = document.querySelector('meta[name=\"description\"]')?.getAttribute('content');\n const content = document.body.innerHTML;\n\n const bodyClone = document.body.cloneNode(true) as HTMLElement;\n bodyClone\n .querySelectorAll('script, style, noscript, iframe, [hidden], nav, header, footer, aside')\n .forEach((el) => el.remove());\n\n const textContent = bodyClone.textContent?.replace(/\\s+/g, ' ').trim();\n\n if (\n !title ||\n title.trim().length === 0 ||\n !textContent ||\n textContent.trim().length === 0 ||\n !content ||\n content.trim().length === 0 ||\n !description ||\n description.trim().length === 0\n ) {\n return undefined;\n }\n\n const page: StructuredPage = {\n title,\n description,\n content,\n textContent,\n contentLength: textContent.length,\n };\n\n const htmlLang = document.documentElement.getAttribute('lang');\n if (htmlLang) {\n page.language = htmlLang;\n }\n\n const dir = document.documentElement.getAttribute('dir');\n if (dir) {\n page.direction = dir;\n }\n\n const ogTitle = document.querySelector('meta[property=\"og:title\"]')?.getAttribute('content');\n if (ogTitle && ogTitle !== page.title) {\n page.ogTitle = ogTitle;\n }\n\n const ogDescription = document.querySelector('meta[property=\"og:description\"]')?.getAttribute('content');\n if (ogDescription) {\n page.ogDescription = ogDescription;\n }\n\n const ogImage = document.querySelector('meta[property=\"og:image\"]')?.getAttribute('content');\n if (ogImage) {\n page.ogImage = ogImage;\n }\n\n const ogSiteName = document.querySelector('meta[property=\"og:site_name\"]')?.getAttribute('content');\n if (ogSiteName) {\n page.siteName = 
ogSiteName;\n }\n\n const author = document.querySelector('meta[name=\"author\"]')?.getAttribute('content');\n if (author) {\n page.author = author;\n }\n\n const keywords = document.querySelector('meta[name=\"keywords\"]')?.getAttribute('content');\n if (keywords) {\n page.keywords = keywords\n .split(',')\n .map((k) => k.trim())\n .filter((k) => k);\n }\n\n const publishedTime = document.querySelector('meta[property=\"article:published_time\"]')?.getAttribute('content');\n if (publishedTime) {\n page.publishedTime = publishedTime;\n }\n\n const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))\n .map((h) => h.textContent?.trim())\n .filter((h): h is string => !!h && h.length > 0);\n\n if (headings.length > 0) {\n page.headings = headings;\n }\n\n // Extract links if enabled\n if (options?.extractLinks !== false) {\n // Extract internal links\n const links = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('/') && !href.startsWith('/_next') && !href.startsWith('/_');\n });\n\n if (links.length > 0) {\n page.internalLinks = Array.from(new Set(links));\n }\n\n // Extract external links\n const externalLinks = Array.from(document.querySelectorAll('a[href]'))\n .map((a) => a.getAttribute('href'))\n .filter((href): href is string => {\n if (!href) return false;\n return href.startsWith('http://') || href.startsWith('https://');\n });\n\n if (externalLinks.length > 0) {\n page.externalLinks = Array.from(new Set(externalLinks));\n }\n }\n\n // Extract images if enabled\n if (options?.extractImages !== false) {\n const images = Array.from(document.querySelectorAll('img[src], img[data-nimg], img[data-src]'))\n .map((img) => {\n const src = img.getAttribute('src') || img.getAttribute('data-src') || img.getAttribute('data-nimg');\n const alt = img.getAttribute('alt');\n return { src, alt };\n })\n .filter((img) => img.src && 
!img.src.startsWith('data:'));\n\n if (images.length > 0) {\n page.images = images.map((img) => img.src || '').filter((src) => src);\n page.imageAlts = images.map((img) => img.alt || '');\n }\n }\n\n return page;\n }\n}\n","import { Readability } from '@mozilla/readability';\nimport { StructuredPage } from '../structuredPage';\nimport { ParseOptions, Parser } from './parser';\n\nexport class ReadabilityParser implements Parser {\n parse(document: Document, options?: ParseOptions): StructuredPage | undefined {\n const { keepClasses, disableJSONLD } = options || {};\n\n const reader = new Readability(document, {\n keepClasses,\n disableJSONLD,\n });\n\n const parsedPage = reader.parse();\n\n if (parsedPage) {\n const page: StructuredPage = {\n title: parsedPage.title,\n description: parsedPage.excerpt,\n content: parsedPage.content,\n textContent: parsedPage.textContent,\n contentLength: parsedPage.length,\n author: parsedPage.byline,\n direction: parsedPage.dir,\n language: parsedPage.lang,\n siteName: parsedPage.siteName,\n publishedTime: parsedPage.publishedTime,\n };\n\n return page;\n }\n\n return undefined;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;AAAA,SAAS,eAAe;AACxB,SAAS,aAAa;AACtB,OAAO,qBAAqB;;;ACCrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAJhF;AAKI,UAAM,SAAQ,cAAS,cAAc,OAAO,MAA9B,mBAAiC;AAC/C,UAAM,eAAc,cAAS,cAAc,0BAA0B,MAAjD,mBAAoD,aAAa;AACrF,UAAM,UAAU,SAAS,KAAK;AAE9B,UAAM,YAAY,SAAS,KAAK,UAAU,IAAI;AAC9C,cACG,iBAAiB,uEAAuE,EACxF,QAAQ,CAAC,OAAO,GAAG,OAAO,CAAC;AAE9B,UAAM,eAAc,eAAU,gBAAV,mBAAuB,QAAQ,QAAQ,KAAK;AAEhE,QACE,CAAC,SACD,MAAM,KAAK,EAAE,WAAW,KACxB,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,KAC9B,CAAC,WACD,QAAQ,KAAK,EAAE,WAAW,KAC1B,CAAC,eACD,YAAY,KAAK,EAAE,WAAW,GAC9B;AACA,aAAO;AAAA,IACT;AAEA,UAAM,OAAuB;AAAA,MAC3B;AAAA,MACA;AAAA,MACA;AAAA,MACA;AAAA,MACA,eAAe,YAAY;AAAA,IAC7B;AAEA,UAAM,WAAW,SAAS,gBAAgB,aAAa,MAAM;AAC7D,QAAI,UAAU;AACZ,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,MAAM,SAAS,gBAAgB,aAAa,KAAK;AACvD,QAAI,KAAK;AACP,WAAK,YAAY;AAAA,IACnB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,WAAW,YAAY,KAAK,OAAO;AACrC,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,iBAAgB,cAAS,cAAc,iCAAiC,MAAxD,mBAA2D,aAAa;AAC9F,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAU,cAAS,cAAc,2BAA2B,MAAlD,mBAAqD,aAAa;AAClF,QAAI,SAAS;AACX,WAAK,UAAU;AAAA,IACjB;AAEA,UAAM,cAAa,cAAS,cAAc,+BAA+B,MAAtD,mBAAyD,aAAa;AACzF,QAAI,YAAY;AACd,WAAK,WAAW;AAAA,IAClB;AAEA,UAAM,UAAS,cAAS,cAAc,qBAAqB,MAA5C,mBAA+C,aAAa;AAC3E,QAAI,QAAQ;AACV,WAAK,SAAS;AAAA,IAChB;AAEA,UAAM,YAAW,cAAS,cAAc,uBAAuB,MAA9C,mBAAiD,aAAa;AAC/E,QAAI,UAAU;AACZ,WAAK,WAAW,SACb,MAAM,GAAG,EACT,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,CAAC;AAAA,IACpB;AAEA,UAAM,iBAAgB,cAAS,cAAc,yCAAyC,MAAhE,mBAAmE,aAAa;AACtG,QAAI,eAAe;AACjB,WAAK,gBAAgB;AAAA,IACvB;AAEA,UAAM,WAAW,MAAM,KAAK,SAAS,iBAAiB,wBAAwB,CAAC,EAC5E,IAAI,CAAC,MAAG;AAtFf,UAAAA;AAsFkB,cAAAA,MAAA,EAAE,gBAAF,gBAAAA,IAAe;AAAA,KAAM,EAChC,OAAO,CAAC,MAAmB,CAAC,CAAC,KAAK,EAAE,SAAS,CAAC;AAEjD,QAAI,SAAS,SAAS,GAAG;AACvB,WAAK,WAAW;AAAA,IAClB;AAGA,SAAI,mCAAS,kBAAiB,OAAO;AAEnC,YAAM,QAAQ,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAC1D,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,W
AAW,GAAG,KAAK,CAAC,KAAK,WAAW,QAAQ,KAAK,CAAC,KAAK,WAAW,IAAI;AAAA,MACpF,CAAC;AAEH,UAAI,MAAM,SAAS,GAAG;AACpB,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,KAAK,CAAC;AAAA,MAChD;AAGA,YAAM,gBAAgB,MAAM,KAAK,SAAS,iBAAiB,SAAS,CAAC,EAClE,IAAI,CAAC,MAAM,EAAE,aAAa,MAAM,CAAC,EACjC,OAAO,CAAC,SAAyB;AAChC,YAAI,CAAC,KAAM,QAAO;AAClB,eAAO,KAAK,WAAW,SAAS,KAAK,KAAK,WAAW,UAAU;AAAA,MACjE,CAAC;AAEH,UAAI,cAAc,SAAS,GAAG;AAC5B,aAAK,gBAAgB,MAAM,KAAK,IAAI,IAAI,aAAa,CAAC;AAAA,MACxD;AAAA,IACF;AAGA,SAAI,mCAAS,mBAAkB,OAAO;AACpC,YAAM,SAAS,MAAM,KAAK,SAAS,iBAAiB,yCAAyC,CAAC,EAC3F,IAAI,CAAC,QAAQ;AACZ,cAAM,MAAM,IAAI,aAAa,KAAK,KAAK,IAAI,aAAa,UAAU,KAAK,IAAI,aAAa,WAAW;AACnG,cAAM,MAAM,IAAI,aAAa,KAAK;AAClC,eAAO,EAAE,KAAK,IAAI;AAAA,MACpB,CAAC,EACA,OAAO,CAAC,QAAQ,IAAI,OAAO,CAAC,IAAI,IAAI,WAAW,OAAO,CAAC;AAE1D,UAAI,OAAO,SAAS,GAAG;AACrB,aAAK,SAAS,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE,EAAE,OAAO,CAAC,QAAQ,GAAG;AACpE,aAAK,YAAY,OAAO,IAAI,CAAC,QAAQ,IAAI,OAAO,EAAE;AAAA,MACpD;AAAA,IACF;AAEA,WAAO;AAAA,EACT;AACF;;;AC1IA,SAAS,mBAAmB;AAIrB,IAAM,oBAAN,MAA0C;AAAA,EAC/C,MAAM,UAAoB,SAAoD;AAC5E,UAAM,EAAE,aAAa,cAAc,IAAI,WAAW,CAAC;AAEnD,UAAM,SAAS,IAAI,YAAY,UAAU;AAAA,MACvC;AAAA,MACA;AAAA,IACF,CAAC;AAED,UAAM,aAAa,OAAO,MAAM;AAEhC,QAAI,YAAY;AACd,YAAM,OAAuB;AAAA,QAC3B,OAAO,WAAW;AAAA,QAClB,aAAa,WAAW;AAAA,QACxB,SAAS,WAAW;AAAA,QACpB,aAAa,WAAW;AAAA,QACxB,eAAe,WAAW;AAAA,QAC1B,QAAQ,WAAW;AAAA,QACnB,WAAW,WAAW;AAAA,QACtB,UAAU,WAAW;AAAA,QACrB,UAAU,WAAW;AAAA,QACrB,eAAe,WAAW;AAAA,MAC5B;AAEA,aAAO;AAAA,IACT;AAEA,WAAO;AAAA,EACT;AACF;;;AF1BA,IAAM,MAAM,QAAQ;AAQb,SAAS,UAAU,MAAc,UAAwB,CAAC,GAA+B;AAC9F,MAAI,CAAC,QAAQ,KAAK,KAAK,EAAE,WAAW,GAAG;AACrC,QAAI,MAAM,6BAA6B;AACvC,WAAO;AAAA,EACT;AAEA,QAAM,MAAM,IAAI,MAAM,IAAI;AAC1B,QAAM,WAAW,IAAI,OAAO;AAE5B,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAC3E,QAAM,oBAAoB,IAAI,kBAAkB;AAChD,QAAM,4BAA4B,kBAAkB,MAAM,UAAU,OAAO;AAE3E,MAAI,CAAC,6BAA6B,CAAC,2BAA2B;AAC5D,QAAI,MAAM,2BAA2B;AACrC,WAAO;AAAA,EACT;AAGA,QAAM,eAA+B,iDAChC;AAAA,IACD,OAAO;AAAA,IACP,aAAa;AAAA,IACb,SAAS;AAAA,IACT,aAAa;AAAA,EACf,IACG,4BACA;AAIL,MAAI,aAAa,
SAAS;AACxB,QAAI;AACF,YAAM,kBAAkB,IAAI,gBAAgB;AAAA,QAC1C,cAAc;AAAA,QACd,gBAAgB;AAAA,MAClB,CAAC;AACD,mBAAa,kBAAkB,gBAAgB,SAAS,aAAa,OAAO;AAAA,IAC9E,SAAS,OAAO;AACd,UAAI,MAAM,yCAAyC,KAAK;AAAA,IAC1D;AAAA,EACF;AAEA,SAAO;AACT;","names":["_a"]}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@peam-ai/parser",
|
|
3
3
|
"description": "Content parser for extracting page metadata",
|
|
4
|
-
"version": "0.1.
|
|
4
|
+
"version": "0.1.5",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|
|
7
7
|
"types": "./dist/index.d.ts",
|
|
@@ -29,21 +29,23 @@
|
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|
|
31
31
|
"@types/jsdom": "^27.0.0",
|
|
32
|
+
"@types/turndown": "^5.0.5",
|
|
32
33
|
"tsup": "^8.2.4",
|
|
33
34
|
"typescript": "^5.5.4"
|
|
34
35
|
},
|
|
35
36
|
"dependencies": {
|
|
36
37
|
"@mozilla/readability": "^0.5.0",
|
|
37
38
|
"jsdom": "^27.3.0",
|
|
38
|
-
"
|
|
39
|
-
"
|
|
40
|
-
"@peam-ai/logger": "0.1.3"
|
|
39
|
+
"turndown": "^7.2.2",
|
|
40
|
+
"@peam-ai/logger": "0.1.5"
|
|
41
41
|
},
|
|
42
42
|
"scripts": {
|
|
43
43
|
"build": "tsup",
|
|
44
|
+
"build:watch": "tsup --watch",
|
|
44
45
|
"clean": "rm -rf dist",
|
|
45
|
-
"
|
|
46
|
-
"test:
|
|
47
|
-
"test:
|
|
46
|
+
"format": "prettier --write \"src/**/*.ts*\"",
|
|
47
|
+
"test:unit": "vitest run",
|
|
48
|
+
"test:lint": "eslint \"src/**/*.ts*\"",
|
|
49
|
+
"test:format": "prettier --check \"src/**/*.ts*\""
|
|
48
50
|
}
|
|
49
51
|
}
|