bluera-knowledge 0.9.32 → 0.9.36
This diff shows the changes between publicly released versions of the package as they appear in one of the supported public registries. It is provided for informational purposes only.
- package/.claude/hooks/post-edit-check.sh +5 -3
- package/.claude/skills/atomic-commits/SKILL.md +3 -1
- package/.husky/pre-commit +3 -2
- package/.prettierrc +9 -0
- package/.versionrc.json +1 -1
- package/CHANGELOG.md +70 -0
- package/CLAUDE.md +6 -0
- package/README.md +25 -13
- package/bun.lock +277 -33
- package/dist/{chunk-L2YVNC63.js → chunk-6FHWC36B.js} +9 -1
- package/dist/chunk-6FHWC36B.js.map +1 -0
- package/dist/{chunk-RST4XGRL.js → chunk-DC7CGSGT.js} +288 -241
- package/dist/chunk-DC7CGSGT.js.map +1 -0
- package/dist/{chunk-6PBP5DVD.js → chunk-WFNPNAAP.js} +3212 -3054
- package/dist/chunk-WFNPNAAP.js.map +1 -0
- package/dist/{chunk-WT2DAEO7.js → chunk-Z2KKVH45.js} +548 -482
- package/dist/chunk-Z2KKVH45.js.map +1 -0
- package/dist/index.js +871 -758
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +3 -3
- package/dist/watch.service-BJV3TI3F.js +7 -0
- package/dist/workers/background-worker-cli.js +97 -71
- package/dist/workers/background-worker-cli.js.map +1 -1
- package/eslint.config.js +43 -1
- package/package.json +18 -11
- package/plugin.json +8 -0
- package/python/requirements.txt +1 -1
- package/src/analysis/ast-parser.test.ts +12 -11
- package/src/analysis/ast-parser.ts +28 -22
- package/src/analysis/code-graph.test.ts +52 -62
- package/src/analysis/code-graph.ts +9 -13
- package/src/analysis/dependency-usage-analyzer.test.ts +91 -271
- package/src/analysis/dependency-usage-analyzer.ts +52 -24
- package/src/analysis/go-ast-parser.test.ts +22 -22
- package/src/analysis/go-ast-parser.ts +18 -25
- package/src/analysis/parser-factory.test.ts +9 -9
- package/src/analysis/parser-factory.ts +3 -3
- package/src/analysis/python-ast-parser.test.ts +27 -27
- package/src/analysis/python-ast-parser.ts +2 -2
- package/src/analysis/repo-url-resolver.test.ts +82 -82
- package/src/analysis/rust-ast-parser.test.ts +19 -19
- package/src/analysis/rust-ast-parser.ts +17 -27
- package/src/analysis/tree-sitter-parser.test.ts +3 -3
- package/src/analysis/tree-sitter-parser.ts +10 -16
- package/src/cli/commands/crawl.test.ts +40 -24
- package/src/cli/commands/crawl.ts +186 -166
- package/src/cli/commands/index-cmd.test.ts +90 -90
- package/src/cli/commands/index-cmd.ts +52 -36
- package/src/cli/commands/mcp.test.ts +6 -6
- package/src/cli/commands/mcp.ts +2 -2
- package/src/cli/commands/plugin-api.test.ts +16 -18
- package/src/cli/commands/plugin-api.ts +9 -6
- package/src/cli/commands/search.test.ts +16 -7
- package/src/cli/commands/search.ts +124 -87
- package/src/cli/commands/serve.test.ts +67 -25
- package/src/cli/commands/serve.ts +18 -3
- package/src/cli/commands/setup.test.ts +176 -101
- package/src/cli/commands/setup.ts +140 -117
- package/src/cli/commands/store.test.ts +82 -53
- package/src/cli/commands/store.ts +56 -37
- package/src/cli/program.ts +2 -2
- package/src/crawl/article-converter.test.ts +4 -1
- package/src/crawl/article-converter.ts +46 -31
- package/src/crawl/bridge.test.ts +240 -132
- package/src/crawl/bridge.ts +87 -30
- package/src/crawl/claude-client.test.ts +124 -56
- package/src/crawl/claude-client.ts +7 -15
- package/src/crawl/intelligent-crawler.test.ts +65 -22
- package/src/crawl/intelligent-crawler.ts +86 -53
- package/src/crawl/markdown-utils.ts +1 -4
- package/src/db/embeddings.ts +4 -6
- package/src/db/lance.test.ts +4 -4
- package/src/db/lance.ts +16 -12
- package/src/index.ts +26 -17
- package/src/logging/index.ts +1 -5
- package/src/logging/logger.ts +3 -5
- package/src/logging/payload.test.ts +1 -1
- package/src/logging/payload.ts +3 -5
- package/src/mcp/commands/index.ts +2 -2
- package/src/mcp/commands/job.commands.ts +12 -18
- package/src/mcp/commands/meta.commands.ts +13 -13
- package/src/mcp/commands/registry.ts +5 -8
- package/src/mcp/commands/store.commands.ts +19 -19
- package/src/mcp/handlers/execute.handler.test.ts +10 -10
- package/src/mcp/handlers/execute.handler.ts +4 -5
- package/src/mcp/handlers/index.ts +10 -14
- package/src/mcp/handlers/job.handler.test.ts +10 -10
- package/src/mcp/handlers/job.handler.ts +22 -25
- package/src/mcp/handlers/search.handler.test.ts +36 -65
- package/src/mcp/handlers/search.handler.ts +135 -104
- package/src/mcp/handlers/store.handler.test.ts +41 -52
- package/src/mcp/handlers/store.handler.ts +108 -88
- package/src/mcp/schemas/index.test.ts +73 -68
- package/src/mcp/schemas/index.ts +18 -12
- package/src/mcp/server.test.ts +1 -1
- package/src/mcp/server.ts +59 -46
- package/src/plugin/commands.test.ts +230 -95
- package/src/plugin/commands.ts +24 -25
- package/src/plugin/dependency-analyzer.test.ts +52 -52
- package/src/plugin/dependency-analyzer.ts +85 -22
- package/src/plugin/git-clone.test.ts +24 -13
- package/src/plugin/git-clone.ts +3 -7
- package/src/server/app.test.ts +109 -109
- package/src/server/app.ts +32 -23
- package/src/server/index.test.ts +64 -66
- package/src/services/chunking.service.test.ts +32 -32
- package/src/services/chunking.service.ts +16 -9
- package/src/services/code-graph.service.test.ts +30 -36
- package/src/services/code-graph.service.ts +24 -10
- package/src/services/code-unit.service.test.ts +55 -11
- package/src/services/code-unit.service.ts +85 -11
- package/src/services/config.service.test.ts +37 -18
- package/src/services/config.service.ts +30 -7
- package/src/services/index.service.test.ts +49 -18
- package/src/services/index.service.ts +98 -48
- package/src/services/index.ts +6 -9
- package/src/services/job.service.test.ts +22 -22
- package/src/services/job.service.ts +18 -18
- package/src/services/project-root.service.test.ts +1 -3
- package/src/services/search.service.test.ts +248 -120
- package/src/services/search.service.ts +286 -156
- package/src/services/services.test.ts +1 -1
- package/src/services/snippet.service.test.ts +14 -6
- package/src/services/snippet.service.ts +7 -5
- package/src/services/store.service.test.ts +68 -29
- package/src/services/store.service.ts +41 -12
- package/src/services/watch.service.test.ts +34 -14
- package/src/services/watch.service.ts +11 -1
- package/src/types/brands.test.ts +3 -1
- package/src/types/index.ts +2 -13
- package/src/types/search.ts +10 -8
- package/src/utils/type-guards.test.ts +20 -15
- package/src/utils/type-guards.ts +1 -1
- package/src/workers/background-worker-cli.ts +28 -30
- package/src/workers/background-worker.test.ts +54 -40
- package/src/workers/background-worker.ts +76 -60
- package/src/workers/pid-file.test.ts +167 -0
- package/src/workers/pid-file.ts +82 -0
- package/src/workers/spawn-worker.test.ts +22 -10
- package/src/workers/spawn-worker.ts +6 -6
- package/tests/analysis/ast-parser.test.ts +3 -3
- package/tests/analysis/code-graph.test.ts +5 -5
- package/tests/fixtures/code-snippets/api/error-handling.ts +4 -15
- package/tests/fixtures/code-snippets/api/rest-controller.ts +3 -9
- package/tests/fixtures/code-snippets/auth/jwt-auth.ts +5 -21
- package/tests/fixtures/code-snippets/auth/oauth-flow.ts +4 -4
- package/tests/fixtures/code-snippets/database/repository-pattern.ts +11 -3
- package/tests/fixtures/corpus/oss-repos/hono/src/adapter/aws-lambda/handler.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/adapter/cloudflare-pages/handler.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/adapter/cloudflare-workers/serve-static.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/client/client.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/client/types.ts +22 -20
- package/tests/fixtures/corpus/oss-repos/hono/src/context.ts +13 -10
- package/tests/fixtures/corpus/oss-repos/hono/src/helper/accepts/accepts.ts +10 -7
- package/tests/fixtures/corpus/oss-repos/hono/src/helper/adapter/index.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/helper/css/index.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/helper/factory/index.ts +16 -16
- package/tests/fixtures/corpus/oss-repos/hono/src/helper/ssg/ssg.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/hono-base.ts +3 -3
- package/tests/fixtures/corpus/oss-repos/hono/src/hono.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/jsx/dom/css.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/jsx/dom/intrinsic-element/components.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/jsx/dom/render.ts +7 -7
- package/tests/fixtures/corpus/oss-repos/hono/src/jsx/hooks/index.ts +3 -3
- package/tests/fixtures/corpus/oss-repos/hono/src/jsx/intrinsic-element/components.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/jsx/utils.ts +6 -6
- package/tests/fixtures/corpus/oss-repos/hono/src/middleware/jsx-renderer/index.ts +3 -3
- package/tests/fixtures/corpus/oss-repos/hono/src/middleware/serve-static/index.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/preset/quick.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/preset/tiny.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/router/pattern-router/router.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/router/reg-exp-router/node.ts +4 -4
- package/tests/fixtures/corpus/oss-repos/hono/src/router/reg-exp-router/router.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/router/trie-router/node.ts +1 -1
- package/tests/fixtures/corpus/oss-repos/hono/src/types.ts +166 -169
- package/tests/fixtures/corpus/oss-repos/hono/src/utils/body.ts +8 -8
- package/tests/fixtures/corpus/oss-repos/hono/src/utils/color.ts +3 -3
- package/tests/fixtures/corpus/oss-repos/hono/src/utils/cookie.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/utils/encode.ts +2 -2
- package/tests/fixtures/corpus/oss-repos/hono/src/utils/types.ts +30 -33
- package/tests/fixtures/corpus/oss-repos/hono/src/validator/validator.ts +2 -2
- package/tests/fixtures/test-server.ts +3 -2
- package/tests/helpers/performance-metrics.ts +8 -25
- package/tests/helpers/search-relevance.ts +14 -69
- package/tests/integration/cli-consistency.test.ts +6 -5
- package/tests/integration/python-bridge.test.ts +13 -3
- package/tests/mcp/server.test.ts +1 -1
- package/tests/services/code-unit.service.test.ts +48 -0
- package/tests/services/job.service.test.ts +124 -0
- package/tests/services/search.progressive-context.test.ts +2 -2
- package/.claude-plugin/plugin.json +0 -13
- package/dist/chunk-6PBP5DVD.js.map +0 -1
- package/dist/chunk-L2YVNC63.js.map +0 -1
- package/dist/chunk-RST4XGRL.js.map +0 -1
- package/dist/chunk-WT2DAEO7.js.map +0 -1
- package/dist/watch.service-YAIKKDCF.js +0 -7
- package/skills/atomic-commits/SKILL.md +0 -77
- package/dist/{watch.service-YAIKKDCF.js.map → watch.service-BJV3TI3F.js.map} +0 -0

package/dist/{chunk-L2YVNC63.js → chunk-6FHWC36B.js}

@@ -2,6 +2,7 @@
 import { watch } from "chokidar";
 var WatchService = class {
   watchers = /* @__PURE__ */ new Map();
+  pendingTimeouts = /* @__PURE__ */ new Map();
   indexService;
   lanceStore;
   constructor(indexService, lanceStore) {
@@ -21,6 +22,7 @@ var WatchService = class {
     const reindexHandler = () => {
       if (timeout) clearTimeout(timeout);
       timeout = setTimeout(() => {
+        this.pendingTimeouts.delete(store.id);
         void (async () => {
           try {
             await this.lanceStore.initialize(store.id);
@@ -31,6 +33,7 @@
           }
         })();
       }, debounceMs);
+      this.pendingTimeouts.set(store.id, timeout);
     };
     watcher.on("all", reindexHandler);
     watcher.on("error", (error) => {
@@ -40,6 +43,11 @@
     return Promise.resolve();
   }
   async unwatch(storeId) {
+    const pendingTimeout = this.pendingTimeouts.get(storeId);
+    if (pendingTimeout) {
+      clearTimeout(pendingTimeout);
+      this.pendingTimeouts.delete(storeId);
+    }
     const watcher = this.watchers.get(storeId);
     if (watcher) {
       await watcher.close();
@@ -56,4 +64,4 @@
 export {
   WatchService
 };
-//# sourceMappingURL=chunk-L2YVNC63.js.map
+//# sourceMappingURL=chunk-6FHWC36B.js.map
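
The change above fixes a timer leak in the bundled watch service: each store's debounced reindex timer is now tracked in `pendingTimeouts` and cleared when the store is unwatched. Below is a condensed TypeScript sketch of that pattern, based on the `watch.service.ts` source embedded in the new source map that follows; the class name, store shape, and reindex callback are simplified placeholders, not the package's actual API.

```typescript
import { watch, type FSWatcher } from "chokidar";

// Sketch of the debounce-with-cleanup pattern introduced in 0.9.36.
// The reindex callback and the storeId/path arguments are illustrative.
class WatcherSketch {
  private readonly watchers = new Map<string, FSWatcher>();
  private readonly pendingTimeouts = new Map<string, NodeJS.Timeout>();

  constructor(private readonly reindex: (storeId: string) => Promise<void>) {}

  watchStore(storeId: string, path: string, debounceMs = 1000): void {
    if (this.watchers.has(storeId)) return; // already watching

    let timeout: NodeJS.Timeout | null = null;
    const watcher = watch(path, { persistent: true, ignoreInitial: true });

    watcher.on("all", () => {
      if (timeout) clearTimeout(timeout);
      timeout = setTimeout(() => {
        this.pendingTimeouts.delete(storeId); // fired, so no longer pending
        void this.reindex(storeId).catch((error) => console.error(error));
      }, debounceMs);
      this.pendingTimeouts.set(storeId, timeout); // track so unwatch() can cancel it
    });

    this.watchers.set(storeId, watcher);
  }

  async unwatch(storeId: string): Promise<void> {
    // New in this release: clear any pending timeout to prevent a timer leak.
    const pending = this.pendingTimeouts.get(storeId);
    if (pending) {
      clearTimeout(pending);
      this.pendingTimeouts.delete(storeId);
    }
    const watcher = this.watchers.get(storeId);
    if (watcher) {
      await watcher.close();
      this.watchers.delete(storeId);
    }
  }
}
```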

package/dist/chunk-6FHWC36B.js.map

@@ -0,0 +1 @@
+{"version":3,"sources":["../src/services/watch.service.ts"],"sourcesContent":["import { watch, type FSWatcher } from 'chokidar';\nimport type { IndexService } from './index.service.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { FileStore, RepoStore } from '../types/store.js';\n\nexport class WatchService {\n private readonly watchers: Map<string, FSWatcher> = new Map();\n private readonly pendingTimeouts: Map<string, NodeJS.Timeout> = new Map();\n private readonly indexService: IndexService;\n private readonly lanceStore: LanceStore;\n\n constructor(indexService: IndexService, lanceStore: LanceStore) {\n this.indexService = indexService;\n this.lanceStore = lanceStore;\n }\n\n async watch(\n store: FileStore | RepoStore,\n debounceMs = 1000,\n onReindex?: () => void\n ): Promise<void> {\n if (this.watchers.has(store.id)) {\n return Promise.resolve(); // Already watching\n }\n\n let timeout: NodeJS.Timeout | null = null;\n\n const watcher = watch(store.path, {\n ignored: /(^|[/\\\\])\\.(git|node_modules|dist|build)/,\n persistent: true,\n ignoreInitial: true,\n });\n\n const reindexHandler = (): void => {\n if (timeout) clearTimeout(timeout);\n timeout = setTimeout(() => {\n this.pendingTimeouts.delete(store.id);\n void (async (): Promise<void> => {\n try {\n await this.lanceStore.initialize(store.id);\n await this.indexService.indexStore(store);\n onReindex?.();\n } catch (error) {\n console.error('Error during reindexing:', error);\n }\n })();\n }, debounceMs);\n this.pendingTimeouts.set(store.id, timeout);\n };\n\n watcher.on('all', reindexHandler);\n\n watcher.on('error', (error) => {\n console.error('Watcher error:', error);\n });\n\n this.watchers.set(store.id, watcher);\n return Promise.resolve();\n }\n\n async unwatch(storeId: string): Promise<void> {\n // Clear any pending timeout to prevent timer leak\n const pendingTimeout = this.pendingTimeouts.get(storeId);\n if (pendingTimeout) {\n clearTimeout(pendingTimeout);\n this.pendingTimeouts.delete(storeId);\n }\n\n const watcher = this.watchers.get(storeId);\n if (watcher) {\n await watcher.close();\n this.watchers.delete(storeId);\n }\n }\n\n async unwatchAll(): Promise<void> {\n for (const [id] of this.watchers) {\n await this.unwatch(id);\n }\n }\n}\n"],"mappings":";AAAA,SAAS,aAA6B;AAK/B,IAAM,eAAN,MAAmB;AAAA,EACP,WAAmC,oBAAI,IAAI;AAAA,EAC3C,kBAA+C,oBAAI,IAAI;AAAA,EACvD;AAAA,EACA;AAAA,EAEjB,YAAY,cAA4B,YAAwB;AAC9D,SAAK,eAAe;AACpB,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,MAAM,MACJ,OACA,aAAa,KACb,WACe;AACf,QAAI,KAAK,SAAS,IAAI,MAAM,EAAE,GAAG;AAC/B,aAAO,QAAQ,QAAQ;AAAA,IACzB;AAEA,QAAI,UAAiC;AAErC,UAAM,UAAU,MAAM,MAAM,MAAM;AAAA,MAChC,SAAS;AAAA,MACT,YAAY;AAAA,MACZ,eAAe;AAAA,IACjB,CAAC;AAED,UAAM,iBAAiB,MAAY;AACjC,UAAI,QAAS,cAAa,OAAO;AACjC,gBAAU,WAAW,MAAM;AACzB,aAAK,gBAAgB,OAAO,MAAM,EAAE;AACpC,cAAM,YAA2B;AAC/B,cAAI;AACF,kBAAM,KAAK,WAAW,WAAW,MAAM,EAAE;AACzC,kBAAM,KAAK,aAAa,WAAW,KAAK;AACxC,wBAAY;AAAA,UACd,SAAS,OAAO;AACd,oBAAQ,MAAM,4BAA4B,KAAK;AAAA,UACjD;AAAA,QACF,GAAG;AAAA,MACL,GAAG,UAAU;AACb,WAAK,gBAAgB,IAAI,MAAM,IAAI,OAAO;AAAA,IAC5C;AAEA,YAAQ,GAAG,OAAO,cAAc;AAEhC,YAAQ,GAAG,SAAS,CAAC,UAAU;AAC7B,cAAQ,MAAM,kBAAkB,KAAK;AAAA,IACvC,CAAC;AAED,SAAK,SAAS,IAAI,MAAM,IAAI,OAAO;AACnC,WAAO,QAAQ,QAAQ;AAAA,EACzB;AAAA,EAEA,MAAM,QAAQ,SAAgC;AAE5C,UAAM,iBAAiB,KAAK,gBAAgB,IAAI,OAAO;AACvD,QAAI,gBAAgB;AAClB,mBAAa,cAAc;AAC3B,WAAK,gBAAgB,OAAO,OAAO;AAAA,IACrC;AAEA,UAAM,UAAU,KAAK,SAAS,IAAI,OAAO;AACzC,QAAI,SAAS;AACX,YAAM,QAAQ,MAAM;AACpB,WAAK,SAAS,OAAO,OAAO;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,MAAM,aAA4B;AAChC,eAAW,CAAC,EAAE,KAAK,KAAK,UAAU;AAChC,YAAM,KAAK,QAAQ,EAAE;AAAA,IACvB;AAAA,EACF;AACF;","names":[]}

package/dist/{chunk-RST4XGRL.js → chunk-DC7CGSGT.js}

@@ -3,12 +3,223 @@ import {
   createLogger,
   summarizePayload,
   truncateForLog
-} from "./chunk-6PBP5DVD.js";
+} from "./chunk-WFNPNAAP.js";

 // src/crawl/intelligent-crawler.ts
 import { EventEmitter } from "events";
 import axios from "axios";

+// src/crawl/article-converter.ts
+import { extractFromHtml } from "@extractus/article-extractor";
+import TurndownService from "turndown";
+import { gfm } from "turndown-plugin-gfm";
+
+// src/crawl/markdown-utils.ts
+import * as cheerio from "cheerio";
+function detectLanguageFromClass(className) {
+  if (className === void 0 || className === "") return "";
+  const patterns = [
+    /language-(\w+)/i,
+    /lang-(\w+)/i,
+    /highlight-(\w+)/i,
+    /hljs\s+(\w+)/i,
+    /^(\w+)$/i
+  ];
+  for (const pattern of patterns) {
+    const match = className.match(pattern);
+    if (match?.[1] !== void 0) {
+      const lang = match[1].toLowerCase();
+      if (!["hljs", "highlight", "code", "pre", "block", "inline"].includes(lang)) {
+        return lang;
+      }
+    }
+  }
+  return "";
+}
+function escapeHtml(text) {
+  return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
+}
+function preprocessHtmlForCodeBlocks(html) {
+  if (!html || typeof html !== "string") return html;
+  const $ = cheerio.load(html);
+  $("table").each((_i, table) => {
+    const $table = $(table);
+    const $codeCell = $table.find("td pre code, td div pre code");
+    if ($codeCell.length > 0) {
+      const $pre = $codeCell.closest("pre");
+      const $code = $codeCell.first();
+      let language = detectLanguageFromClass($code.attr("class"));
+      if (!language) {
+        language = detectLanguageFromClass($pre.attr("class"));
+      }
+      const codeText = $code.text();
+      const cleanPre = `<pre><code class="language-${language}">${escapeHtml(codeText)}</code></pre>`;
+      $table.replaceWith(cleanPre);
+    }
+  });
+  $("pre a, code a").each((_i, anchor) => {
+    const $anchor = $(anchor);
+    if (!$anchor.text().trim()) {
+      $anchor.remove();
+    }
+  });
+  $("pre span, code span").each((_i, span) => {
+    const $span = $(span);
+    $span.replaceWith($span.text());
+  });
+  $("pre").each((_i, pre) => {
+    const $pre = $(pre);
+    if ($pre.find("code").length === 0) {
+      const text = $pre.text();
+      const lang = detectLanguageFromClass($pre.attr("class"));
+      $pre.html(`<code class="language-${lang}">${escapeHtml(text)}</code>`);
+    }
+  });
+  return $.html();
+}
+function cleanupMarkdown(markdown) {
+  if (!markdown) return "";
+  const trimmed = markdown.trim();
+  if (trimmed === "") return "";
+  let result = trimmed;
+  result = result.replace(/^(#{1,6})\s*\n\n+(\S[^\n]*)/gm, "$1 $2");
+  result = result.replace(/(#{1,6})\s{2,}/g, "$1 ");
+  result = result.replace(/\*\s+\[\s*([^\n]+?)\s*\]\(([^)]+)\)/g, "* [$1]($2)");
+  result = result.replace(/([^\n])\n\n+(#\s)/g, "$1\n$2");
+  result = result.replace(/(Some text\.)\n(##\s)/g, "$1\n\n$2");
+  result = result.replace(/(#{1,6}\s[^\n]+)\n([^#\n])/g, "$1\n\n$2");
+  result = result.replace(/(#{1,6}\s[^\n]+)\n(#{1,6}\s)/g, "$1\n\n$2");
+  result = result.replace(/(\* Item 1)\n\n+(\* Item 2)\n\n+(\* Item 3)/g, "$1\n$2\n$3");
+  result = result.replace(/(^\*\s[^\n]+)\n{2,}(^\*\s)/gm, "$1\n$2");
+  result = result.replace(/\n{3,}/g, "\n\n");
+  result = result.replace(/(```[^\n]*)\n\n+/g, "$1\n");
+  result = result.replace(/\n\n+```/g, "\n```");
+  result = result.replace(/\*\s*\n\s*\*/g, "*");
+  result = result.replace(/<\/?table[^>]*>/gi, "");
+  result = result.replace(/<\/?tbody[^>]*>/gi, "");
+  result = result.replace(/<\/?thead[^>]*>/gi, "");
+  result = result.replace(/<\/?tr[^>]*>/gi, "");
+  result = result.replace(/<\/?td[^>]*>/gi, "");
+  result = result.replace(/<\/?th[^>]*>/gi, "");
+  result = result.replace(/<a[^>]*><\/a>/gi, "");
+  result = result.replace(/<\/?span[^>]*>/gi, "");
+  result = result.replace(/<\/?div[^>]*>/gi, "");
+  result = result.replace(/<\/?pre[^>]*>/gi, "");
+  result = result.replace(/<\/?code[^>]*>/gi, "");
+  result = result.replace(/\[\]\([^)]*\)/g, "");
+  result = result.replace(/\[\]\([^)]*#__codelineno-[^)]+\)/g, "");
+  result = result.replace(/\[?\]?\([^)]*#__codelineno-[^)]*\)/g, "");
+  result = result.replace(/&lt;/g, "<");
+  result = result.replace(/&gt;/g, ">");
+  result = result.replace(/&amp;/g, "&");
+  result = result.replace(/\n{3,}/g, "\n\n");
+  result = result.replace(/[ \t]+\n/g, "\n");
+  return result;
+}
+
+// src/crawl/article-converter.ts
+var logger = createLogger("article-converter");
+async function convertHtmlToMarkdown(html, url) {
+  logger.debug({ url, htmlLength: html.length }, "Starting HTML conversion");
+  try {
+    let articleHtml;
+    let title;
+    try {
+      const article = await extractFromHtml(html, url);
+      if (article?.content !== void 0 && article.content !== "") {
+        articleHtml = article.content;
+        title = article.title !== void 0 && article.title !== "" ? article.title : void 0;
+        logger.debug(
+          {
+            url,
+            title,
+            extractedLength: articleHtml.length,
+            usedFullHtml: false
+          },
+          "Article content extracted"
+        );
+      } else {
+        articleHtml = html;
+        logger.debug(
+          { url, usedFullHtml: true },
+          "Article extraction returned empty, using full HTML"
+        );
+      }
+    } catch (extractError) {
+      articleHtml = html;
+      logger.debug(
+        {
+          url,
+          usedFullHtml: true,
+          error: extractError instanceof Error ? extractError.message : String(extractError)
+        },
+        "Article extraction failed, using full HTML"
+      );
+    }
+    const preprocessed = preprocessHtmlForCodeBlocks(articleHtml);
+    const turndownService = new TurndownService({
+      headingStyle: "atx",
+      // Use # style headings
+      codeBlockStyle: "fenced",
+      // Use ``` style code blocks
+      fence: "```",
+      emDelimiter: "*",
+      strongDelimiter: "**",
+      linkStyle: "inlined"
+    });
+    turndownService.use(gfm);
+    turndownService.addRule("headingsWithAnchors", {
+      filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
+      replacement(content, node) {
+        const level = Number(node.nodeName.charAt(1));
+        const hashes = "#".repeat(level);
+        const cleanContent = content.replace(/\[\]\([^)]*\)/g, "").replace(/\s+/g, " ").trim();
+        return cleanContent !== "" ? `
+
+${hashes} ${cleanContent}
+
+` : "";
+      }
+    });
+    const rawMarkdown = turndownService.turndown(preprocessed);
+    const markdown = cleanupMarkdown(rawMarkdown);
+    logger.debug(
+      {
+        url,
+        title,
+        rawMarkdownLength: rawMarkdown.length,
+        finalMarkdownLength: markdown.length
+      },
+      "HTML to markdown conversion complete"
+    );
+    logger.trace(
+      {
+        url,
+        markdownPreview: truncateForLog(markdown, 1e3)
+      },
+      "Markdown content preview"
+    );
+    return {
+      markdown,
+      ...title !== void 0 && { title },
+      success: true
+    };
+  } catch (error) {
+    logger.error(
+      {
+        url,
+        error: error instanceof Error ? error.message : String(error)
+      },
+      "HTML to markdown conversion failed"
+    );
+    return {
+      markdown: "",
+      success: false,
+      error: error instanceof Error ? error.message : String(error)
+    };
+  }
+}
+
 // src/crawl/claude-client.ts
 import { spawn, execSync } from "child_process";
 var CRAWL_STRATEGY_SCHEMA = {
@@ -150,9 +361,7 @@ ${this.truncateMarkdown(markdown, 1e5)}`;
           resolve(stdout.trim());
         } else {
           reject(
-            new Error(
-              `Claude CLI exited with code ${String(code)}${stderr ? `: ${stderr}` : ""}`
-            )
+            new Error(`Claude CLI exited with code ${String(code)}${stderr ? `: ${stderr}` : ""}`)
           );
         }
       });
@@ -171,212 +380,20 @@ ${this.truncateMarkdown(markdown, 1e5)}`;
    */
   truncateHtml(html, maxLength) {
     if (html.length <= maxLength) return html;
-    return html.substring(0, maxLength)
+    return `${html.substring(0, maxLength)}
+
+[... HTML truncated ...]`;
   }
   /**
    * Truncate markdown to a maximum length
    */
   truncateMarkdown(markdown, maxLength) {
     if (markdown.length <= maxLength) return markdown;
-    return markdown.substring(0, maxLength)
-  }
-};
+    return `${markdown.substring(0, maxLength)}

-
-import { extractFromHtml } from "@extractus/article-extractor";
-import TurndownService from "turndown";
-import { gfm } from "turndown-plugin-gfm";
-
-// src/crawl/markdown-utils.ts
-import * as cheerio from "cheerio";
-function detectLanguageFromClass(className) {
-  if (className === void 0 || className === "") return "";
-  const patterns = [
-    /language-(\w+)/i,
-    /lang-(\w+)/i,
-    /highlight-(\w+)/i,
-    /hljs\s+(\w+)/i,
-    /^(\w+)$/i
-  ];
-  for (const pattern of patterns) {
-    const match = className.match(pattern);
-    if (match?.[1] !== void 0) {
-      const lang = match[1].toLowerCase();
-      if (!["hljs", "highlight", "code", "pre", "block", "inline"].includes(lang)) {
-        return lang;
-      }
-    }
+[... content truncated ...]`;
   }
-    return "";
-}
-function escapeHtml(text) {
-  return text.replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">").replace(/"/g, """).replace(/'/g, "'");
-}
-function preprocessHtmlForCodeBlocks(html) {
-  if (!html || typeof html !== "string") return html;
-  const $ = cheerio.load(html);
-  $("table").each((_i, table) => {
-    const $table = $(table);
-    const $codeCell = $table.find("td pre code, td div pre code");
-    if ($codeCell.length > 0) {
-      const $pre = $codeCell.closest("pre");
-      const $code = $codeCell.first();
-      let language = detectLanguageFromClass($code.attr("class"));
-      if (!language) {
-        language = detectLanguageFromClass($pre.attr("class"));
-      }
-      const codeText = $code.text();
-      const cleanPre = `<pre><code class="language-${language}">${escapeHtml(codeText)}</code></pre>`;
-      $table.replaceWith(cleanPre);
-    }
-  });
-  $("pre a, code a").each((_i, anchor) => {
-    const $anchor = $(anchor);
-    if (!$anchor.text().trim()) {
-      $anchor.remove();
-    }
-  });
-  $("pre span, code span").each((_i, span) => {
-    const $span = $(span);
-    $span.replaceWith($span.text());
-  });
-  $("pre").each((_i, pre) => {
-    const $pre = $(pre);
-    if ($pre.find("code").length === 0) {
-      const text = $pre.text();
-      const lang = detectLanguageFromClass($pre.attr("class"));
-      $pre.html(`<code class="language-${lang}">${escapeHtml(text)}</code>`);
-    }
-  });
-  return $.html();
-}
-function cleanupMarkdown(markdown) {
-  if (!markdown) return "";
-  const trimmed = markdown.trim();
-  if (trimmed === "") return "";
-  let result = trimmed;
-  result = result.replace(/^(#{1,6})\s*\n\n+(\S[^\n]*)/gm, "$1 $2");
-  result = result.replace(/(#{1,6})\s{2,}/g, "$1 ");
-  result = result.replace(/\*\s+\[\s*([^\n]+?)\s*\]\(([^)]+)\)/g, "* [$1]($2)");
-  result = result.replace(/([^\n])\n\n+(#\s)/g, "$1\n$2");
-  result = result.replace(/(Some text\.)\n(##\s)/g, "$1\n\n$2");
-  result = result.replace(/(#{1,6}\s[^\n]+)\n([^#\n])/g, "$1\n\n$2");
-  result = result.replace(/(#{1,6}\s[^\n]+)\n(#{1,6}\s)/g, "$1\n\n$2");
-  result = result.replace(
-    /(\* Item 1)\n\n+(\* Item 2)\n\n+(\* Item 3)/g,
-    "$1\n$2\n$3"
-  );
-  result = result.replace(/(^\*\s[^\n]+)\n{2,}(^\*\s)/gm, "$1\n$2");
-  result = result.replace(/\n{3,}/g, "\n\n");
-  result = result.replace(/(```[^\n]*)\n\n+/g, "$1\n");
-  result = result.replace(/\n\n+```/g, "\n```");
-  result = result.replace(/\*\s*\n\s*\*/g, "*");
-  result = result.replace(/<\/?table[^>]*>/gi, "");
-  result = result.replace(/<\/?tbody[^>]*>/gi, "");
-  result = result.replace(/<\/?thead[^>]*>/gi, "");
-  result = result.replace(/<\/?tr[^>]*>/gi, "");
-  result = result.replace(/<\/?td[^>]*>/gi, "");
-  result = result.replace(/<\/?th[^>]*>/gi, "");
-  result = result.replace(/<a[^>]*><\/a>/gi, "");
-  result = result.replace(/<\/?span[^>]*>/gi, "");
-  result = result.replace(/<\/?div[^>]*>/gi, "");
-  result = result.replace(/<\/?pre[^>]*>/gi, "");
-  result = result.replace(/<\/?code[^>]*>/gi, "");
-  result = result.replace(/\[\]\([^)]*\)/g, "");
-  result = result.replace(/\[\]\([^)]*#__codelineno-[^)]+\)/g, "");
-  result = result.replace(/\[?\]?\([^)]*#__codelineno-[^)]*\)/g, "");
-  result = result.replace(/&lt;/g, "<");
-  result = result.replace(/&gt;/g, ">");
-  result = result.replace(/&amp;/g, "&");
-  result = result.replace(/\n{3,}/g, "\n\n");
-  result = result.replace(/[ \t]+\n/g, "\n");
-  return result;
-}
-
-// src/crawl/article-converter.ts
-var logger = createLogger("article-converter");
-async function convertHtmlToMarkdown(html, url) {
-  logger.debug({ url, htmlLength: html.length }, "Starting HTML conversion");
-  try {
-    let articleHtml;
-    let title;
-    try {
-      const article = await extractFromHtml(html, url);
-      if (article !== null && article.content !== void 0 && article.content !== "") {
-        articleHtml = article.content;
-        title = article.title !== void 0 && article.title !== "" ? article.title : void 0;
-        logger.debug({
-          url,
-          title,
-          extractedLength: articleHtml.length,
-          usedFullHtml: false
-        }, "Article content extracted");
-      } else {
-        articleHtml = html;
-        logger.debug({ url, usedFullHtml: true }, "Article extraction returned empty, using full HTML");
-      }
-    } catch (extractError) {
-      articleHtml = html;
-      logger.debug({
-        url,
-        usedFullHtml: true,
-        error: extractError instanceof Error ? extractError.message : String(extractError)
-      }, "Article extraction failed, using full HTML");
-    }
-    const preprocessed = preprocessHtmlForCodeBlocks(articleHtml);
-    const turndownService = new TurndownService({
-      headingStyle: "atx",
-      // Use # style headings
-      codeBlockStyle: "fenced",
-      // Use ``` style code blocks
-      fence: "```",
-      emDelimiter: "*",
-      strongDelimiter: "**",
-      linkStyle: "inlined"
-    });
-    turndownService.use(gfm);
-    turndownService.addRule("headingsWithAnchors", {
-      filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
-      replacement(content, node) {
-        const level = Number(node.nodeName.charAt(1));
-        const hashes = "#".repeat(level);
-        const cleanContent = content.replace(/\[\]\([^)]*\)/g, "").replace(/\s+/g, " ").trim();
-        return cleanContent !== "" ? `
-
-${hashes} ${cleanContent}
-
-` : "";
-      }
-    });
-    const rawMarkdown = turndownService.turndown(preprocessed);
-    const markdown = cleanupMarkdown(rawMarkdown);
-    logger.debug({
-      url,
-      title,
-      rawMarkdownLength: rawMarkdown.length,
-      finalMarkdownLength: markdown.length
-    }, "HTML to markdown conversion complete");
-    logger.trace({
-      url,
-      markdownPreview: truncateForLog(markdown, 1e3)
-    }, "Markdown content preview");
-    return {
-      markdown,
-      ...title !== void 0 && { title },
-      success: true
-    };
-  } catch (error) {
-    logger.error({
-      url,
-      error: error instanceof Error ? error.message : String(error)
-    }, "HTML to markdown conversion failed");
-    return {
-      markdown: "",
-      success: false,
-      error: error instanceof Error ? error.message : String(error)
-    };
-  }
-}
+};

 // src/crawl/intelligent-crawler.ts
 var logger2 = createLogger("crawler");
@@ -396,20 +413,18 @@ var IntelligentCrawler = class extends EventEmitter {
    * Crawl a website with intelligent or simple mode
    */
   async *crawl(seedUrl, options = {}) {
-    const {
-      crawlInstruction,
-      extractInstruction,
-      maxPages = 50,
-      simple = false
-    } = options;
+    const { crawlInstruction, extractInstruction, maxPages = 50, simple = false } = options;
     this.visited.clear();
     this.stopped = false;
-    logger2.info(
-
-
-
-
-
+    logger2.info(
+      {
+        seedUrl,
+        maxPages,
+        mode: simple ? "simple" : crawlInstruction !== void 0 && crawlInstruction !== "" ? "intelligent" : "simple",
+        hasExtractInstruction: extractInstruction !== void 0
+      },
+      "Starting crawl"
+    );
     const startProgress = {
       type: "start",
       pagesVisited: 0,
@@ -418,14 +433,23 @@ var IntelligentCrawler = class extends EventEmitter {
     this.emit("progress", startProgress);
     const useIntelligentMode = !simple && crawlInstruction !== void 0 && crawlInstruction !== "";
     if (useIntelligentMode) {
-      yield* this.crawlIntelligent(
+      yield* this.crawlIntelligent(
+        seedUrl,
+        crawlInstruction,
+        extractInstruction,
+        maxPages,
+        options.useHeadless ?? false
+      );
     } else {
       yield* this.crawlSimple(seedUrl, extractInstruction, maxPages, options.useHeadless ?? false);
     }
-    logger2.info(
-
-
-
+    logger2.info(
+      {
+        seedUrl,
+        pagesVisited: this.visited.size
+      },
+      "Crawl complete"
+    );
     const completeProgress = {
       type: "complete",
       pagesVisited: this.visited.size,
@@ -485,7 +509,12 @@ var IntelligentCrawler = class extends EventEmitter {
       if (this.stopped || pagesVisited >= maxPages) break;
       if (this.visited.has(url)) continue;
       try {
-        const result = await this.crawlSinglePage(
+        const result = await this.crawlSinglePage(
+          url,
+          extractInstruction,
+          pagesVisited,
+          useHeadless
+        );
         pagesVisited++;
         yield result;
       } catch (error) {
@@ -528,7 +557,10 @@ var IntelligentCrawler = class extends EventEmitter {
     if (links.length === 0) {
       logger2.debug({ url: current.url }, "No links found - page may be a leaf node");
     } else {
-      logger2.debug(
+      logger2.debug(
+        { url: current.url, linkCount: links.length },
+        "Links extracted from page"
+      );
     }
     for (const link of links) {
       if (!this.visited.has(link) && this.isSameDomain(seedUrl, link)) {
@@ -577,11 +609,14 @@ var IntelligentCrawler = class extends EventEmitter {
       logger2.error({ url, error: conversion.error }, "HTML to markdown conversion failed");
       throw new Error(`Failed to convert HTML: ${conversion.error ?? "Unknown error"}`);
     }
-    logger2.debug(
-
-
-
-
+    logger2.debug(
+      {
+        url,
+        title: conversion.title,
+        markdownLength: conversion.markdown.length
+      },
+      "Article converted to markdown"
+    );
     let extracted;
     if (extractInstruction !== void 0 && extractInstruction !== "") {
       if (!ClaudeClient.isAvailable()) {
@@ -637,15 +672,21 @@ var IntelligentCrawler = class extends EventEmitter {
     try {
       const result = await this.pythonBridge.fetchHeadless(url);
       const durationMs = Date.now() - startTime;
-      logger2.info(
-
-
-
-
-
+      logger2.info(
+        {
+          url,
+          useHeadless: true,
+          durationMs,
+          ...summarizePayload(result.html, "raw-html", url)
+        },
+        "Raw HTML fetched"
+      );
       return result.html;
     } catch (error) {
-      logger2.warn(
+      logger2.warn(
+        { url, error: error instanceof Error ? error.message : String(error) },
+        "Headless fetch failed, falling back to axios"
+      );
     }
   }
   try {
@@ -656,15 +697,21 @@ var IntelligentCrawler = class extends EventEmitter {
       }
     });
     const durationMs = Date.now() - startTime;
-    logger2.info(
-
-
-
-
-
+    logger2.info(
+      {
+        url,
+        useHeadless: false,
+        durationMs,
+        ...summarizePayload(response.data, "raw-html", url)
+      },
+      "Raw HTML fetched"
+    );
     return response.data;
   } catch (error) {
-    logger2.error(
+    logger2.error(
+      { url, error: error instanceof Error ? error.message : String(error) },
+      "Failed to fetch HTML"
+    );
     throw new Error(
       `Failed to fetch ${url}: ${error instanceof Error ? error.message : String(error)}`
     );
@@ -718,4 +765,4 @@ var IntelligentCrawler = class extends EventEmitter {
 export {
   IntelligentCrawler
 };
-//# sourceMappingURL=chunk-RST4XGRL.js.map
+//# sourceMappingURL=chunk-DC7CGSGT.js.map
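
For orientation, the conversion pipeline hoisted into this chunk (first hunk above) takes raw HTML plus its URL, tries readability-style extraction via `@extractus/article-extractor`, falls back to the full document on failure, converts the result with Turndown, cleans the markdown, and returns an object with `markdown`, an optional `title`, and a `success` flag. The TypeScript sketch below shows a hypothetical caller written against the result shape visible in the diff; the `ConversionResult` name and this standalone usage are assumptions, since in the package the function is consumed internally by `IntelligentCrawler` rather than exported.

```typescript
// Result shape as it appears in the bundled convertHtmlToMarkdown above
// (the interface name is an assumption for illustration).
interface ConversionResult {
  markdown: string;
  title?: string;
  success: boolean;
  error?: string;
}

// Hypothetical wrapper showing how a crawler step might consume the converter.
async function pageToMarkdown(
  convertHtmlToMarkdown: (html: string, url: string) => Promise<ConversionResult>,
  html: string,
  url: string
): Promise<string> {
  const conversion = await convertHtmlToMarkdown(html, url);
  if (!conversion.success) {
    // Mirrors the crawler's own handling: failures are reported, not thrown, by the converter.
    throw new Error(`Failed to convert HTML: ${conversion.error ?? "Unknown error"}`);
  }
  // A title is only present when article extraction found one.
  return conversion.title ? `# ${conversion.title}\n\n${conversion.markdown}` : conversion.markdown;
}
```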