bluera-knowledge 0.9.32 → 0.9.36

This diff shows the changes between publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.
Files changed (198)
  1. package/.claude/hooks/post-edit-check.sh +5 -3
  2. package/.claude/skills/atomic-commits/SKILL.md +3 -1
  3. package/.husky/pre-commit +3 -2
  4. package/.prettierrc +9 -0
  5. package/.versionrc.json +1 -1
  6. package/CHANGELOG.md +70 -0
  7. package/CLAUDE.md +6 -0
  8. package/README.md +25 -13
  9. package/bun.lock +277 -33
  10. package/dist/{chunk-L2YVNC63.js → chunk-6FHWC36B.js} +9 -1
  11. package/dist/chunk-6FHWC36B.js.map +1 -0
  12. package/dist/{chunk-RST4XGRL.js → chunk-DC7CGSGT.js} +288 -241
  13. package/dist/chunk-DC7CGSGT.js.map +1 -0
  14. package/dist/{chunk-6PBP5DVD.js → chunk-WFNPNAAP.js} +3212 -3054
  15. package/dist/chunk-WFNPNAAP.js.map +1 -0
  16. package/dist/{chunk-WT2DAEO7.js → chunk-Z2KKVH45.js} +548 -482
  17. package/dist/chunk-Z2KKVH45.js.map +1 -0
  18. package/dist/index.js +871 -758
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp/server.js +3 -3
  21. package/dist/watch.service-BJV3TI3F.js +7 -0
  22. package/dist/workers/background-worker-cli.js +97 -71
  23. package/dist/workers/background-worker-cli.js.map +1 -1
  24. package/eslint.config.js +43 -1
  25. package/package.json +18 -11
  26. package/plugin.json +8 -0
  27. package/python/requirements.txt +1 -1
  28. package/src/analysis/ast-parser.test.ts +12 -11
  29. package/src/analysis/ast-parser.ts +28 -22
  30. package/src/analysis/code-graph.test.ts +52 -62
  31. package/src/analysis/code-graph.ts +9 -13
  32. package/src/analysis/dependency-usage-analyzer.test.ts +91 -271
  33. package/src/analysis/dependency-usage-analyzer.ts +52 -24
  34. package/src/analysis/go-ast-parser.test.ts +22 -22
  35. package/src/analysis/go-ast-parser.ts +18 -25
  36. package/src/analysis/parser-factory.test.ts +9 -9
  37. package/src/analysis/parser-factory.ts +3 -3
  38. package/src/analysis/python-ast-parser.test.ts +27 -27
  39. package/src/analysis/python-ast-parser.ts +2 -2
  40. package/src/analysis/repo-url-resolver.test.ts +82 -82
  41. package/src/analysis/rust-ast-parser.test.ts +19 -19
  42. package/src/analysis/rust-ast-parser.ts +17 -27
  43. package/src/analysis/tree-sitter-parser.test.ts +3 -3
  44. package/src/analysis/tree-sitter-parser.ts +10 -16
  45. package/src/cli/commands/crawl.test.ts +40 -24
  46. package/src/cli/commands/crawl.ts +186 -166
  47. package/src/cli/commands/index-cmd.test.ts +90 -90
  48. package/src/cli/commands/index-cmd.ts +52 -36
  49. package/src/cli/commands/mcp.test.ts +6 -6
  50. package/src/cli/commands/mcp.ts +2 -2
  51. package/src/cli/commands/plugin-api.test.ts +16 -18
  52. package/src/cli/commands/plugin-api.ts +9 -6
  53. package/src/cli/commands/search.test.ts +16 -7
  54. package/src/cli/commands/search.ts +124 -87
  55. package/src/cli/commands/serve.test.ts +67 -25
  56. package/src/cli/commands/serve.ts +18 -3
  57. package/src/cli/commands/setup.test.ts +176 -101
  58. package/src/cli/commands/setup.ts +140 -117
  59. package/src/cli/commands/store.test.ts +82 -53
  60. package/src/cli/commands/store.ts +56 -37
  61. package/src/cli/program.ts +2 -2
  62. package/src/crawl/article-converter.test.ts +4 -1
  63. package/src/crawl/article-converter.ts +46 -31
  64. package/src/crawl/bridge.test.ts +240 -132
  65. package/src/crawl/bridge.ts +87 -30
  66. package/src/crawl/claude-client.test.ts +124 -56
  67. package/src/crawl/claude-client.ts +7 -15
  68. package/src/crawl/intelligent-crawler.test.ts +65 -22
  69. package/src/crawl/intelligent-crawler.ts +86 -53
  70. package/src/crawl/markdown-utils.ts +1 -4
  71. package/src/db/embeddings.ts +4 -6
  72. package/src/db/lance.test.ts +4 -4
  73. package/src/db/lance.ts +16 -12
  74. package/src/index.ts +26 -17
  75. package/src/logging/index.ts +1 -5
  76. package/src/logging/logger.ts +3 -5
  77. package/src/logging/payload.test.ts +1 -1
  78. package/src/logging/payload.ts +3 -5
  79. package/src/mcp/commands/index.ts +2 -2
  80. package/src/mcp/commands/job.commands.ts +12 -18
  81. package/src/mcp/commands/meta.commands.ts +13 -13
  82. package/src/mcp/commands/registry.ts +5 -8
  83. package/src/mcp/commands/store.commands.ts +19 -19
  84. package/src/mcp/handlers/execute.handler.test.ts +10 -10
  85. package/src/mcp/handlers/execute.handler.ts +4 -5
  86. package/src/mcp/handlers/index.ts +10 -14
  87. package/src/mcp/handlers/job.handler.test.ts +10 -10
  88. package/src/mcp/handlers/job.handler.ts +22 -25
  89. package/src/mcp/handlers/search.handler.test.ts +36 -65
  90. package/src/mcp/handlers/search.handler.ts +135 -104
  91. package/src/mcp/handlers/store.handler.test.ts +41 -52
  92. package/src/mcp/handlers/store.handler.ts +108 -88
  93. package/src/mcp/schemas/index.test.ts +73 -68
  94. package/src/mcp/schemas/index.ts +18 -12
  95. package/src/mcp/server.test.ts +1 -1
  96. package/src/mcp/server.ts +59 -46
  97. package/src/plugin/commands.test.ts +230 -95
  98. package/src/plugin/commands.ts +24 -25
  99. package/src/plugin/dependency-analyzer.test.ts +52 -52
  100. package/src/plugin/dependency-analyzer.ts +85 -22
  101. package/src/plugin/git-clone.test.ts +24 -13
  102. package/src/plugin/git-clone.ts +3 -7
  103. package/src/server/app.test.ts +109 -109
  104. package/src/server/app.ts +32 -23
  105. package/src/server/index.test.ts +64 -66
  106. package/src/services/chunking.service.test.ts +32 -32
  107. package/src/services/chunking.service.ts +16 -9
  108. package/src/services/code-graph.service.test.ts +30 -36
  109. package/src/services/code-graph.service.ts +24 -10
  110. package/src/services/code-unit.service.test.ts +55 -11
  111. package/src/services/code-unit.service.ts +85 -11
  112. package/src/services/config.service.test.ts +37 -18
  113. package/src/services/config.service.ts +30 -7
  114. package/src/services/index.service.test.ts +49 -18
  115. package/src/services/index.service.ts +98 -48
  116. package/src/services/index.ts +6 -9
  117. package/src/services/job.service.test.ts +22 -22
  118. package/src/services/job.service.ts +18 -18
  119. package/src/services/project-root.service.test.ts +1 -3
  120. package/src/services/search.service.test.ts +248 -120
  121. package/src/services/search.service.ts +286 -156
  122. package/src/services/services.test.ts +1 -1
  123. package/src/services/snippet.service.test.ts +14 -6
  124. package/src/services/snippet.service.ts +7 -5
  125. package/src/services/store.service.test.ts +68 -29
  126. package/src/services/store.service.ts +41 -12
  127. package/src/services/watch.service.test.ts +34 -14
  128. package/src/services/watch.service.ts +11 -1
  129. package/src/types/brands.test.ts +3 -1
  130. package/src/types/index.ts +2 -13
  131. package/src/types/search.ts +10 -8
  132. package/src/utils/type-guards.test.ts +20 -15
  133. package/src/utils/type-guards.ts +1 -1
  134. package/src/workers/background-worker-cli.ts +28 -30
  135. package/src/workers/background-worker.test.ts +54 -40
  136. package/src/workers/background-worker.ts +76 -60
  137. package/src/workers/pid-file.test.ts +167 -0
  138. package/src/workers/pid-file.ts +82 -0
  139. package/src/workers/spawn-worker.test.ts +22 -10
  140. package/src/workers/spawn-worker.ts +6 -6
  141. package/tests/analysis/ast-parser.test.ts +3 -3
  142. package/tests/analysis/code-graph.test.ts +5 -5
  143. package/tests/fixtures/code-snippets/api/error-handling.ts +4 -15
  144. package/tests/fixtures/code-snippets/api/rest-controller.ts +3 -9
  145. package/tests/fixtures/code-snippets/auth/jwt-auth.ts +5 -21
  146. package/tests/fixtures/code-snippets/auth/oauth-flow.ts +4 -4
  147. package/tests/fixtures/code-snippets/database/repository-pattern.ts +11 -3
  148. package/tests/fixtures/corpus/oss-repos/hono/src/adapter/aws-lambda/handler.ts +2 -2
  149. package/tests/fixtures/corpus/oss-repos/hono/src/adapter/cloudflare-pages/handler.ts +1 -1
  150. package/tests/fixtures/corpus/oss-repos/hono/src/adapter/cloudflare-workers/serve-static.ts +2 -2
  151. package/tests/fixtures/corpus/oss-repos/hono/src/client/client.ts +2 -2
  152. package/tests/fixtures/corpus/oss-repos/hono/src/client/types.ts +22 -20
  153. package/tests/fixtures/corpus/oss-repos/hono/src/context.ts +13 -10
  154. package/tests/fixtures/corpus/oss-repos/hono/src/helper/accepts/accepts.ts +10 -7
  155. package/tests/fixtures/corpus/oss-repos/hono/src/helper/adapter/index.ts +2 -2
  156. package/tests/fixtures/corpus/oss-repos/hono/src/helper/css/index.ts +1 -1
  157. package/tests/fixtures/corpus/oss-repos/hono/src/helper/factory/index.ts +16 -16
  158. package/tests/fixtures/corpus/oss-repos/hono/src/helper/ssg/ssg.ts +2 -2
  159. package/tests/fixtures/corpus/oss-repos/hono/src/hono-base.ts +3 -3
  160. package/tests/fixtures/corpus/oss-repos/hono/src/hono.ts +1 -1
  161. package/tests/fixtures/corpus/oss-repos/hono/src/jsx/dom/css.ts +2 -2
  162. package/tests/fixtures/corpus/oss-repos/hono/src/jsx/dom/intrinsic-element/components.ts +1 -1
  163. package/tests/fixtures/corpus/oss-repos/hono/src/jsx/dom/render.ts +7 -7
  164. package/tests/fixtures/corpus/oss-repos/hono/src/jsx/hooks/index.ts +3 -3
  165. package/tests/fixtures/corpus/oss-repos/hono/src/jsx/intrinsic-element/components.ts +1 -1
  166. package/tests/fixtures/corpus/oss-repos/hono/src/jsx/utils.ts +6 -6
  167. package/tests/fixtures/corpus/oss-repos/hono/src/middleware/jsx-renderer/index.ts +3 -3
  168. package/tests/fixtures/corpus/oss-repos/hono/src/middleware/serve-static/index.ts +1 -1
  169. package/tests/fixtures/corpus/oss-repos/hono/src/preset/quick.ts +1 -1
  170. package/tests/fixtures/corpus/oss-repos/hono/src/preset/tiny.ts +1 -1
  171. package/tests/fixtures/corpus/oss-repos/hono/src/router/pattern-router/router.ts +2 -2
  172. package/tests/fixtures/corpus/oss-repos/hono/src/router/reg-exp-router/node.ts +4 -4
  173. package/tests/fixtures/corpus/oss-repos/hono/src/router/reg-exp-router/router.ts +1 -1
  174. package/tests/fixtures/corpus/oss-repos/hono/src/router/trie-router/node.ts +1 -1
  175. package/tests/fixtures/corpus/oss-repos/hono/src/types.ts +166 -169
  176. package/tests/fixtures/corpus/oss-repos/hono/src/utils/body.ts +8 -8
  177. package/tests/fixtures/corpus/oss-repos/hono/src/utils/color.ts +3 -3
  178. package/tests/fixtures/corpus/oss-repos/hono/src/utils/cookie.ts +2 -2
  179. package/tests/fixtures/corpus/oss-repos/hono/src/utils/encode.ts +2 -2
  180. package/tests/fixtures/corpus/oss-repos/hono/src/utils/types.ts +30 -33
  181. package/tests/fixtures/corpus/oss-repos/hono/src/validator/validator.ts +2 -2
  182. package/tests/fixtures/test-server.ts +3 -2
  183. package/tests/helpers/performance-metrics.ts +8 -25
  184. package/tests/helpers/search-relevance.ts +14 -69
  185. package/tests/integration/cli-consistency.test.ts +6 -5
  186. package/tests/integration/python-bridge.test.ts +13 -3
  187. package/tests/mcp/server.test.ts +1 -1
  188. package/tests/services/code-unit.service.test.ts +48 -0
  189. package/tests/services/job.service.test.ts +124 -0
  190. package/tests/services/search.progressive-context.test.ts +2 -2
  191. package/.claude-plugin/plugin.json +0 -13
  192. package/dist/chunk-6PBP5DVD.js.map +0 -1
  193. package/dist/chunk-L2YVNC63.js.map +0 -1
  194. package/dist/chunk-RST4XGRL.js.map +0 -1
  195. package/dist/chunk-WT2DAEO7.js.map +0 -1
  196. package/dist/watch.service-YAIKKDCF.js +0 -7
  197. package/skills/atomic-commits/SKILL.md +0 -77
  198. package/dist/{watch.service-YAIKKDCF.js.map → watch.service-BJV3TI3F.js.map} +0 -0
package/dist/{chunk-L2YVNC63.js → chunk-6FHWC36B.js}
@@ -2,6 +2,7 @@
  import { watch } from "chokidar";
  var WatchService = class {
  watchers = /* @__PURE__ */ new Map();
+ pendingTimeouts = /* @__PURE__ */ new Map();
  indexService;
  lanceStore;
  constructor(indexService, lanceStore) {
@@ -21,6 +22,7 @@ var WatchService = class {
  const reindexHandler = () => {
  if (timeout) clearTimeout(timeout);
  timeout = setTimeout(() => {
+ this.pendingTimeouts.delete(store.id);
  void (async () => {
  try {
  await this.lanceStore.initialize(store.id);
@@ -31,6 +33,7 @@ var WatchService = class {
  }
  })();
  }, debounceMs);
+ this.pendingTimeouts.set(store.id, timeout);
  };
  watcher.on("all", reindexHandler);
  watcher.on("error", (error) => {
@@ -40,6 +43,11 @@ var WatchService = class {
  return Promise.resolve();
  }
  async unwatch(storeId) {
+ const pendingTimeout = this.pendingTimeouts.get(storeId);
+ if (pendingTimeout) {
+ clearTimeout(pendingTimeout);
+ this.pendingTimeouts.delete(storeId);
+ }
  const watcher = this.watchers.get(storeId);
  if (watcher) {
  await watcher.close();
@@ -56,4 +64,4 @@ var WatchService = class {
  export {
  WatchService
  };
- //# sourceMappingURL=chunk-L2YVNC63.js.map
+ //# sourceMappingURL=chunk-6FHWC36B.js.map
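Note: the bundled diff above corresponds to src/services/watch.service.ts, whose full TypeScript source is embedded in the source map that follows. Condensed to the relevant parts, the change keeps one pending debounce timer per store so that unwatch() can cancel a scheduled reindex instead of leaking it; a sketch reconstructed from that source map (constructor and the watch()/reindex handler omitted):

```typescript
// Sketch of the changed class, reconstructed from the source map below.
import type { FSWatcher } from "chokidar";

class WatchService {
  private readonly watchers = new Map<string, FSWatcher>();
  // New in this diff: one pending debounce timer per store id.
  private readonly pendingTimeouts = new Map<string, NodeJS.Timeout>();

  async unwatch(storeId: string): Promise<void> {
    // Clear any pending timeout to prevent a timer leak.
    const pendingTimeout = this.pendingTimeouts.get(storeId);
    if (pendingTimeout) {
      clearTimeout(pendingTimeout);
      this.pendingTimeouts.delete(storeId);
    }
    const watcher = this.watchers.get(storeId);
    if (watcher) {
      await watcher.close();
      this.watchers.delete(storeId);
    }
  }
}
```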
package/dist/chunk-6FHWC36B.js.map
@@ -0,0 +1 @@
+ {"version":3,"sources":["../src/services/watch.service.ts"],"sourcesContent":["import { watch, type FSWatcher } from 'chokidar';\nimport type { IndexService } from './index.service.js';\nimport type { LanceStore } from '../db/lance.js';\nimport type { FileStore, RepoStore } from '../types/store.js';\n\nexport class WatchService {\n private readonly watchers: Map<string, FSWatcher> = new Map();\n private readonly pendingTimeouts: Map<string, NodeJS.Timeout> = new Map();\n private readonly indexService: IndexService;\n private readonly lanceStore: LanceStore;\n\n constructor(indexService: IndexService, lanceStore: LanceStore) {\n this.indexService = indexService;\n this.lanceStore = lanceStore;\n }\n\n async watch(\n store: FileStore | RepoStore,\n debounceMs = 1000,\n onReindex?: () => void\n ): Promise<void> {\n if (this.watchers.has(store.id)) {\n return Promise.resolve(); // Already watching\n }\n\n let timeout: NodeJS.Timeout | null = null;\n\n const watcher = watch(store.path, {\n ignored: /(^|[/\\\\])\\.(git|node_modules|dist|build)/,\n persistent: true,\n ignoreInitial: true,\n });\n\n const reindexHandler = (): void => {\n if (timeout) clearTimeout(timeout);\n timeout = setTimeout(() => {\n this.pendingTimeouts.delete(store.id);\n void (async (): Promise<void> => {\n try {\n await this.lanceStore.initialize(store.id);\n await this.indexService.indexStore(store);\n onReindex?.();\n } catch (error) {\n console.error('Error during reindexing:', error);\n }\n })();\n }, debounceMs);\n this.pendingTimeouts.set(store.id, timeout);\n };\n\n watcher.on('all', reindexHandler);\n\n watcher.on('error', (error) => {\n console.error('Watcher error:', error);\n });\n\n this.watchers.set(store.id, watcher);\n return Promise.resolve();\n }\n\n async unwatch(storeId: string): Promise<void> {\n // Clear any pending timeout to prevent timer leak\n const pendingTimeout = this.pendingTimeouts.get(storeId);\n if (pendingTimeout) {\n clearTimeout(pendingTimeout);\n this.pendingTimeouts.delete(storeId);\n }\n\n const watcher = this.watchers.get(storeId);\n if (watcher) {\n await watcher.close();\n this.watchers.delete(storeId);\n }\n }\n\n async unwatchAll(): Promise<void> {\n for (const [id] of this.watchers) {\n await this.unwatch(id);\n }\n 
}\n}\n"],"mappings":";AAAA,SAAS,aAA6B;AAK/B,IAAM,eAAN,MAAmB;AAAA,EACP,WAAmC,oBAAI,IAAI;AAAA,EAC3C,kBAA+C,oBAAI,IAAI;AAAA,EACvD;AAAA,EACA;AAAA,EAEjB,YAAY,cAA4B,YAAwB;AAC9D,SAAK,eAAe;AACpB,SAAK,aAAa;AAAA,EACpB;AAAA,EAEA,MAAM,MACJ,OACA,aAAa,KACb,WACe;AACf,QAAI,KAAK,SAAS,IAAI,MAAM,EAAE,GAAG;AAC/B,aAAO,QAAQ,QAAQ;AAAA,IACzB;AAEA,QAAI,UAAiC;AAErC,UAAM,UAAU,MAAM,MAAM,MAAM;AAAA,MAChC,SAAS;AAAA,MACT,YAAY;AAAA,MACZ,eAAe;AAAA,IACjB,CAAC;AAED,UAAM,iBAAiB,MAAY;AACjC,UAAI,QAAS,cAAa,OAAO;AACjC,gBAAU,WAAW,MAAM;AACzB,aAAK,gBAAgB,OAAO,MAAM,EAAE;AACpC,cAAM,YAA2B;AAC/B,cAAI;AACF,kBAAM,KAAK,WAAW,WAAW,MAAM,EAAE;AACzC,kBAAM,KAAK,aAAa,WAAW,KAAK;AACxC,wBAAY;AAAA,UACd,SAAS,OAAO;AACd,oBAAQ,MAAM,4BAA4B,KAAK;AAAA,UACjD;AAAA,QACF,GAAG;AAAA,MACL,GAAG,UAAU;AACb,WAAK,gBAAgB,IAAI,MAAM,IAAI,OAAO;AAAA,IAC5C;AAEA,YAAQ,GAAG,OAAO,cAAc;AAEhC,YAAQ,GAAG,SAAS,CAAC,UAAU;AAC7B,cAAQ,MAAM,kBAAkB,KAAK;AAAA,IACvC,CAAC;AAED,SAAK,SAAS,IAAI,MAAM,IAAI,OAAO;AACnC,WAAO,QAAQ,QAAQ;AAAA,EACzB;AAAA,EAEA,MAAM,QAAQ,SAAgC;AAE5C,UAAM,iBAAiB,KAAK,gBAAgB,IAAI,OAAO;AACvD,QAAI,gBAAgB;AAClB,mBAAa,cAAc;AAC3B,WAAK,gBAAgB,OAAO,OAAO;AAAA,IACrC;AAEA,UAAM,UAAU,KAAK,SAAS,IAAI,OAAO;AACzC,QAAI,SAAS;AACX,YAAM,QAAQ,MAAM;AACpB,WAAK,SAAS,OAAO,OAAO;AAAA,IAC9B;AAAA,EACF;AAAA,EAEA,MAAM,aAA4B;AAChC,eAAW,CAAC,EAAE,KAAK,KAAK,UAAU;AAChC,YAAM,KAAK,QAAQ,EAAE;AAAA,IACvB;AAAA,EACF;AACF;","names":[]}
package/dist/{chunk-RST4XGRL.js → chunk-DC7CGSGT.js}
@@ -3,12 +3,223 @@ import {
  createLogger,
  summarizePayload,
  truncateForLog
- } from "./chunk-6PBP5DVD.js";
+ } from "./chunk-WFNPNAAP.js";

  // src/crawl/intelligent-crawler.ts
  import { EventEmitter } from "events";
  import axios from "axios";

+ // src/crawl/article-converter.ts
+ import { extractFromHtml } from "@extractus/article-extractor";
+ import TurndownService from "turndown";
+ import { gfm } from "turndown-plugin-gfm";
+
+ // src/crawl/markdown-utils.ts
+ import * as cheerio from "cheerio";
+ function detectLanguageFromClass(className) {
+ if (className === void 0 || className === "") return "";
+ const patterns = [
+ /language-(\w+)/i,
+ /lang-(\w+)/i,
+ /highlight-(\w+)/i,
+ /hljs\s+(\w+)/i,
+ /^(\w+)$/i
+ ];
+ for (const pattern of patterns) {
+ const match = className.match(pattern);
+ if (match?.[1] !== void 0) {
+ const lang = match[1].toLowerCase();
+ if (!["hljs", "highlight", "code", "pre", "block", "inline"].includes(lang)) {
+ return lang;
+ }
+ }
+ }
+ return "";
+ }
+ function escapeHtml(text) {
+ return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;");
+ }
+ function preprocessHtmlForCodeBlocks(html) {
+ if (!html || typeof html !== "string") return html;
+ const $ = cheerio.load(html);
+ $("table").each((_i, table) => {
+ const $table = $(table);
+ const $codeCell = $table.find("td pre code, td div pre code");
+ if ($codeCell.length > 0) {
+ const $pre = $codeCell.closest("pre");
+ const $code = $codeCell.first();
+ let language = detectLanguageFromClass($code.attr("class"));
+ if (!language) {
+ language = detectLanguageFromClass($pre.attr("class"));
+ }
+ const codeText = $code.text();
+ const cleanPre = `<pre><code class="language-${language}">${escapeHtml(codeText)}</code></pre>`;
+ $table.replaceWith(cleanPre);
+ }
+ });
+ $("pre a, code a").each((_i, anchor) => {
+ const $anchor = $(anchor);
+ if (!$anchor.text().trim()) {
+ $anchor.remove();
+ }
+ });
+ $("pre span, code span").each((_i, span) => {
+ const $span = $(span);
+ $span.replaceWith($span.text());
+ });
+ $("pre").each((_i, pre) => {
+ const $pre = $(pre);
+ if ($pre.find("code").length === 0) {
+ const text = $pre.text();
+ const lang = detectLanguageFromClass($pre.attr("class"));
+ $pre.html(`<code class="language-${lang}">${escapeHtml(text)}</code>`);
+ }
+ });
+ return $.html();
+ }
+ function cleanupMarkdown(markdown) {
+ if (!markdown) return "";
+ const trimmed = markdown.trim();
+ if (trimmed === "") return "";
+ let result = trimmed;
+ result = result.replace(/^(#{1,6})\s*\n\n+(\S[^\n]*)/gm, "$1 $2");
+ result = result.replace(/(#{1,6})\s{2,}/g, "$1 ");
+ result = result.replace(/\*\s+\[\s*([^\n]+?)\s*\]\(([^)]+)\)/g, "* [$1]($2)");
+ result = result.replace(/([^\n])\n\n+(#\s)/g, "$1\n$2");
+ result = result.replace(/(Some text\.)\n(##\s)/g, "$1\n\n$2");
+ result = result.replace(/(#{1,6}\s[^\n]+)\n([^#\n])/g, "$1\n\n$2");
+ result = result.replace(/(#{1,6}\s[^\n]+)\n(#{1,6}\s)/g, "$1\n\n$2");
+ result = result.replace(/(\* Item 1)\n\n+(\* Item 2)\n\n+(\* Item 3)/g, "$1\n$2\n$3");
+ result = result.replace(/(^\*\s[^\n]+)\n{2,}(^\*\s)/gm, "$1\n$2");
+ result = result.replace(/\n{3,}/g, "\n\n");
+ result = result.replace(/(```[^\n]*)\n\n+/g, "$1\n");
+ result = result.replace(/\n\n+```/g, "\n```");
+ result = result.replace(/\*\s*\n\s*\*/g, "*");
+ result = result.replace(/<\/?table[^>]*>/gi, "");
+ result = result.replace(/<\/?tbody[^>]*>/gi, "");
+ result = result.replace(/<\/?thead[^>]*>/gi, "");
+ result = result.replace(/<\/?tr[^>]*>/gi, "");
+ result = result.replace(/<\/?td[^>]*>/gi, "");
+ result = result.replace(/<\/?th[^>]*>/gi, "");
+ result = result.replace(/<a[^>]*><\/a>/gi, "");
+ result = result.replace(/<\/?span[^>]*>/gi, "");
+ result = result.replace(/<\/?div[^>]*>/gi, "");
+ result = result.replace(/<\/?pre[^>]*>/gi, "");
+ result = result.replace(/<\/?code[^>]*>/gi, "");
+ result = result.replace(/\[\]\([^)]*\)/g, "");
+ result = result.replace(/\[\]\([^)]*#__codelineno-[^)]+\)/g, "");
+ result = result.replace(/\[?\]?\([^)]*#__codelineno-[^)]*\)/g, "");
+ result = result.replace(/&amp;lt;/g, "&lt;");
+ result = result.replace(/&amp;gt;/g, "&gt;");
+ result = result.replace(/&amp;amp;/g, "&amp;");
+ result = result.replace(/\n{3,}/g, "\n\n");
+ result = result.replace(/[ \t]+\n/g, "\n");
+ return result;
+ }
+
+ // src/crawl/article-converter.ts
+ var logger = createLogger("article-converter");
+ async function convertHtmlToMarkdown(html, url) {
+ logger.debug({ url, htmlLength: html.length }, "Starting HTML conversion");
+ try {
+ let articleHtml;
+ let title;
+ try {
+ const article = await extractFromHtml(html, url);
+ if (article?.content !== void 0 && article.content !== "") {
+ articleHtml = article.content;
+ title = article.title !== void 0 && article.title !== "" ? article.title : void 0;
+ logger.debug(
+ {
+ url,
+ title,
+ extractedLength: articleHtml.length,
+ usedFullHtml: false
+ },
+ "Article content extracted"
+ );
+ } else {
+ articleHtml = html;
+ logger.debug(
+ { url, usedFullHtml: true },
+ "Article extraction returned empty, using full HTML"
+ );
+ }
+ } catch (extractError) {
+ articleHtml = html;
+ logger.debug(
+ {
+ url,
+ usedFullHtml: true,
+ error: extractError instanceof Error ? extractError.message : String(extractError)
+ },
+ "Article extraction failed, using full HTML"
+ );
+ }
+ const preprocessed = preprocessHtmlForCodeBlocks(articleHtml);
+ const turndownService = new TurndownService({
+ headingStyle: "atx",
+ // Use # style headings
+ codeBlockStyle: "fenced",
+ // Use ``` style code blocks
+ fence: "```",
+ emDelimiter: "*",
+ strongDelimiter: "**",
+ linkStyle: "inlined"
+ });
+ turndownService.use(gfm);
+ turndownService.addRule("headingsWithAnchors", {
+ filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
+ replacement(content, node) {
+ const level = Number(node.nodeName.charAt(1));
+ const hashes = "#".repeat(level);
+ const cleanContent = content.replace(/\[\]\([^)]*\)/g, "").replace(/\s+/g, " ").trim();
+ return cleanContent !== "" ? `
+
+ ${hashes} ${cleanContent}
+
+ ` : "";
+ }
+ });
+ const rawMarkdown = turndownService.turndown(preprocessed);
+ const markdown = cleanupMarkdown(rawMarkdown);
+ logger.debug(
+ {
+ url,
+ title,
+ rawMarkdownLength: rawMarkdown.length,
+ finalMarkdownLength: markdown.length
+ },
+ "HTML to markdown conversion complete"
+ );
+ logger.trace(
+ {
+ url,
+ markdownPreview: truncateForLog(markdown, 1e3)
+ },
+ "Markdown content preview"
+ );
+ return {
+ markdown,
+ ...title !== void 0 && { title },
+ success: true
+ };
+ } catch (error) {
+ logger.error(
+ {
+ url,
+ error: error instanceof Error ? error.message : String(error)
+ },
+ "HTML to markdown conversion failed"
+ );
+ return {
+ markdown: "",
+ success: false,
+ error: error instanceof Error ? error.message : String(error)
+ };
+ }
+ }
+
  // src/crawl/claude-client.ts
  import { spawn, execSync } from "child_process";
  var CRAWL_STRATEGY_SCHEMA = {
@@ -150,9 +361,7 @@ ${this.truncateMarkdown(markdown, 1e5)}`;
  resolve(stdout.trim());
  } else {
  reject(
- new Error(
- `Claude CLI exited with code ${String(code)}${stderr ? `: ${stderr}` : ""}`
- )
+ new Error(`Claude CLI exited with code ${String(code)}${stderr ? `: ${stderr}` : ""}`)
  );
  }
  });
@@ -171,212 +380,20 @@ ${this.truncateMarkdown(markdown, 1e5)}`;
  */
  truncateHtml(html, maxLength) {
  if (html.length <= maxLength) return html;
- return html.substring(0, maxLength) + "\n\n[... HTML truncated ...]";
+ return `${html.substring(0, maxLength)}
+
+ [... HTML truncated ...]`;
  }
  /**
  * Truncate markdown to a maximum length
  */
  truncateMarkdown(markdown, maxLength) {
  if (markdown.length <= maxLength) return markdown;
- return markdown.substring(0, maxLength) + "\n\n[... content truncated ...]";
- }
- };
+ return `${markdown.substring(0, maxLength)}

- // src/crawl/article-converter.ts
- import { extractFromHtml } from "@extractus/article-extractor";
- import TurndownService from "turndown";
- import { gfm } from "turndown-plugin-gfm";
-
- // src/crawl/markdown-utils.ts
- import * as cheerio from "cheerio";
- function detectLanguageFromClass(className) {
- if (className === void 0 || className === "") return "";
- const patterns = [
- /language-(\w+)/i,
- /lang-(\w+)/i,
- /highlight-(\w+)/i,
- /hljs\s+(\w+)/i,
- /^(\w+)$/i
- ];
- for (const pattern of patterns) {
- const match = className.match(pattern);
- if (match?.[1] !== void 0) {
- const lang = match[1].toLowerCase();
- if (!["hljs", "highlight", "code", "pre", "block", "inline"].includes(lang)) {
- return lang;
- }
- }
+ [... content truncated ...]`;
  }
- return "";
- }
- function escapeHtml(text) {
- return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#039;");
- }
- function preprocessHtmlForCodeBlocks(html) {
- if (!html || typeof html !== "string") return html;
- const $ = cheerio.load(html);
- $("table").each((_i, table) => {
- const $table = $(table);
- const $codeCell = $table.find("td pre code, td div pre code");
- if ($codeCell.length > 0) {
- const $pre = $codeCell.closest("pre");
- const $code = $codeCell.first();
- let language = detectLanguageFromClass($code.attr("class"));
- if (!language) {
- language = detectLanguageFromClass($pre.attr("class"));
- }
- const codeText = $code.text();
- const cleanPre = `<pre><code class="language-${language}">${escapeHtml(codeText)}</code></pre>`;
- $table.replaceWith(cleanPre);
- }
- });
- $("pre a, code a").each((_i, anchor) => {
- const $anchor = $(anchor);
- if (!$anchor.text().trim()) {
- $anchor.remove();
- }
- });
- $("pre span, code span").each((_i, span) => {
- const $span = $(span);
- $span.replaceWith($span.text());
- });
- $("pre").each((_i, pre) => {
- const $pre = $(pre);
- if ($pre.find("code").length === 0) {
- const text = $pre.text();
- const lang = detectLanguageFromClass($pre.attr("class"));
- $pre.html(`<code class="language-${lang}">${escapeHtml(text)}</code>`);
- }
- });
- return $.html();
- }
- function cleanupMarkdown(markdown) {
- if (!markdown) return "";
- const trimmed = markdown.trim();
- if (trimmed === "") return "";
- let result = trimmed;
- result = result.replace(/^(#{1,6})\s*\n\n+(\S[^\n]*)/gm, "$1 $2");
- result = result.replace(/(#{1,6})\s{2,}/g, "$1 ");
- result = result.replace(/\*\s+\[\s*([^\n]+?)\s*\]\(([^)]+)\)/g, "* [$1]($2)");
- result = result.replace(/([^\n])\n\n+(#\s)/g, "$1\n$2");
- result = result.replace(/(Some text\.)\n(##\s)/g, "$1\n\n$2");
- result = result.replace(/(#{1,6}\s[^\n]+)\n([^#\n])/g, "$1\n\n$2");
- result = result.replace(/(#{1,6}\s[^\n]+)\n(#{1,6}\s)/g, "$1\n\n$2");
- result = result.replace(
- /(\* Item 1)\n\n+(\* Item 2)\n\n+(\* Item 3)/g,
- "$1\n$2\n$3"
- );
- result = result.replace(/(^\*\s[^\n]+)\n{2,}(^\*\s)/gm, "$1\n$2");
- result = result.replace(/\n{3,}/g, "\n\n");
- result = result.replace(/(```[^\n]*)\n\n+/g, "$1\n");
- result = result.replace(/\n\n+```/g, "\n```");
- result = result.replace(/\*\s*\n\s*\*/g, "*");
- result = result.replace(/<\/?table[^>]*>/gi, "");
- result = result.replace(/<\/?tbody[^>]*>/gi, "");
- result = result.replace(/<\/?thead[^>]*>/gi, "");
- result = result.replace(/<\/?tr[^>]*>/gi, "");
- result = result.replace(/<\/?td[^>]*>/gi, "");
- result = result.replace(/<\/?th[^>]*>/gi, "");
- result = result.replace(/<a[^>]*><\/a>/gi, "");
- result = result.replace(/<\/?span[^>]*>/gi, "");
- result = result.replace(/<\/?div[^>]*>/gi, "");
- result = result.replace(/<\/?pre[^>]*>/gi, "");
- result = result.replace(/<\/?code[^>]*>/gi, "");
- result = result.replace(/\[\]\([^)]*\)/g, "");
- result = result.replace(/\[\]\([^)]*#__codelineno-[^)]+\)/g, "");
- result = result.replace(/\[?\]?\([^)]*#__codelineno-[^)]*\)/g, "");
- result = result.replace(/&amp;lt;/g, "&lt;");
- result = result.replace(/&amp;gt;/g, "&gt;");
- result = result.replace(/&amp;amp;/g, "&amp;");
- result = result.replace(/\n{3,}/g, "\n\n");
- result = result.replace(/[ \t]+\n/g, "\n");
- return result;
- }
-
- // src/crawl/article-converter.ts
- var logger = createLogger("article-converter");
- async function convertHtmlToMarkdown(html, url) {
- logger.debug({ url, htmlLength: html.length }, "Starting HTML conversion");
- try {
- let articleHtml;
- let title;
- try {
- const article = await extractFromHtml(html, url);
- if (article !== null && article.content !== void 0 && article.content !== "") {
- articleHtml = article.content;
- title = article.title !== void 0 && article.title !== "" ? article.title : void 0;
- logger.debug({
- url,
- title,
- extractedLength: articleHtml.length,
- usedFullHtml: false
- }, "Article content extracted");
- } else {
- articleHtml = html;
- logger.debug({ url, usedFullHtml: true }, "Article extraction returned empty, using full HTML");
- }
- } catch (extractError) {
- articleHtml = html;
- logger.debug({
- url,
- usedFullHtml: true,
- error: extractError instanceof Error ? extractError.message : String(extractError)
- }, "Article extraction failed, using full HTML");
- }
- const preprocessed = preprocessHtmlForCodeBlocks(articleHtml);
- const turndownService = new TurndownService({
- headingStyle: "atx",
- // Use # style headings
- codeBlockStyle: "fenced",
- // Use ``` style code blocks
- fence: "```",
- emDelimiter: "*",
- strongDelimiter: "**",
- linkStyle: "inlined"
- });
- turndownService.use(gfm);
- turndownService.addRule("headingsWithAnchors", {
- filter: ["h1", "h2", "h3", "h4", "h5", "h6"],
- replacement(content, node) {
- const level = Number(node.nodeName.charAt(1));
- const hashes = "#".repeat(level);
- const cleanContent = content.replace(/\[\]\([^)]*\)/g, "").replace(/\s+/g, " ").trim();
- return cleanContent !== "" ? `
-
- ${hashes} ${cleanContent}
-
- ` : "";
- }
- });
- const rawMarkdown = turndownService.turndown(preprocessed);
- const markdown = cleanupMarkdown(rawMarkdown);
- logger.debug({
- url,
- title,
- rawMarkdownLength: rawMarkdown.length,
- finalMarkdownLength: markdown.length
- }, "HTML to markdown conversion complete");
- logger.trace({
- url,
- markdownPreview: truncateForLog(markdown, 1e3)
- }, "Markdown content preview");
- return {
- markdown,
- ...title !== void 0 && { title },
- success: true
- };
- } catch (error) {
- logger.error({
- url,
- error: error instanceof Error ? error.message : String(error)
- }, "HTML to markdown conversion failed");
- return {
- markdown: "",
- success: false,
- error: error instanceof Error ? error.message : String(error)
- };
- }
- }
+ };

  // src/crawl/intelligent-crawler.ts
  var logger2 = createLogger("crawler");
@@ -396,20 +413,18 @@ var IntelligentCrawler = class extends EventEmitter {
  * Crawl a website with intelligent or simple mode
  */
  async *crawl(seedUrl, options = {}) {
- const {
- crawlInstruction,
- extractInstruction,
- maxPages = 50,
- simple = false
- } = options;
+ const { crawlInstruction, extractInstruction, maxPages = 50, simple = false } = options;
  this.visited.clear();
  this.stopped = false;
- logger2.info({
- seedUrl,
- maxPages,
- mode: simple ? "simple" : crawlInstruction !== void 0 && crawlInstruction !== "" ? "intelligent" : "simple",
- hasExtractInstruction: extractInstruction !== void 0
- }, "Starting crawl");
+ logger2.info(
+ {
+ seedUrl,
+ maxPages,
+ mode: simple ? "simple" : crawlInstruction !== void 0 && crawlInstruction !== "" ? "intelligent" : "simple",
+ hasExtractInstruction: extractInstruction !== void 0
+ },
+ "Starting crawl"
+ );
  const startProgress = {
  type: "start",
  pagesVisited: 0,
@@ -418,14 +433,23 @@ var IntelligentCrawler = class extends EventEmitter {
  this.emit("progress", startProgress);
  const useIntelligentMode = !simple && crawlInstruction !== void 0 && crawlInstruction !== "";
  if (useIntelligentMode) {
- yield* this.crawlIntelligent(seedUrl, crawlInstruction, extractInstruction, maxPages, options.useHeadless ?? false);
+ yield* this.crawlIntelligent(
+ seedUrl,
+ crawlInstruction,
+ extractInstruction,
+ maxPages,
+ options.useHeadless ?? false
+ );
  } else {
  yield* this.crawlSimple(seedUrl, extractInstruction, maxPages, options.useHeadless ?? false);
  }
- logger2.info({
- seedUrl,
- pagesVisited: this.visited.size
- }, "Crawl complete");
+ logger2.info(
+ {
+ seedUrl,
+ pagesVisited: this.visited.size
+ },
+ "Crawl complete"
+ );
  const completeProgress = {
  type: "complete",
  pagesVisited: this.visited.size,
@@ -485,7 +509,12 @@ var IntelligentCrawler = class extends EventEmitter {
  if (this.stopped || pagesVisited >= maxPages) break;
  if (this.visited.has(url)) continue;
  try {
- const result = await this.crawlSinglePage(url, extractInstruction, pagesVisited, useHeadless);
+ const result = await this.crawlSinglePage(
+ url,
+ extractInstruction,
+ pagesVisited,
+ useHeadless
+ );
  pagesVisited++;
  yield result;
  } catch (error) {
@@ -528,7 +557,10 @@ var IntelligentCrawler = class extends EventEmitter {
  if (links.length === 0) {
  logger2.debug({ url: current.url }, "No links found - page may be a leaf node");
  } else {
- logger2.debug({ url: current.url, linkCount: links.length }, "Links extracted from page");
+ logger2.debug(
+ { url: current.url, linkCount: links.length },
+ "Links extracted from page"
+ );
  }
  for (const link of links) {
  if (!this.visited.has(link) && this.isSameDomain(seedUrl, link)) {
@@ -577,11 +609,14 @@ var IntelligentCrawler = class extends EventEmitter {
  logger2.error({ url, error: conversion.error }, "HTML to markdown conversion failed");
  throw new Error(`Failed to convert HTML: ${conversion.error ?? "Unknown error"}`);
  }
- logger2.debug({
- url,
- title: conversion.title,
- markdownLength: conversion.markdown.length
- }, "Article converted to markdown");
+ logger2.debug(
+ {
+ url,
+ title: conversion.title,
+ markdownLength: conversion.markdown.length
+ },
+ "Article converted to markdown"
+ );
  let extracted;
  if (extractInstruction !== void 0 && extractInstruction !== "") {
  if (!ClaudeClient.isAvailable()) {
@@ -637,15 +672,21 @@ var IntelligentCrawler = class extends EventEmitter {
  try {
  const result = await this.pythonBridge.fetchHeadless(url);
  const durationMs = Date.now() - startTime;
- logger2.info({
- url,
- useHeadless: true,
- durationMs,
- ...summarizePayload(result.html, "raw-html", url)
- }, "Raw HTML fetched");
+ logger2.info(
+ {
+ url,
+ useHeadless: true,
+ durationMs,
+ ...summarizePayload(result.html, "raw-html", url)
+ },
+ "Raw HTML fetched"
+ );
  return result.html;
  } catch (error) {
- logger2.warn({ url, error: error instanceof Error ? error.message : String(error) }, "Headless fetch failed, falling back to axios");
+ logger2.warn(
+ { url, error: error instanceof Error ? error.message : String(error) },
+ "Headless fetch failed, falling back to axios"
+ );
  }
  }
  try {
@@ -656,15 +697,21 @@ var IntelligentCrawler = class extends EventEmitter {
  }
  });
  const durationMs = Date.now() - startTime;
- logger2.info({
- url,
- useHeadless: false,
- durationMs,
- ...summarizePayload(response.data, "raw-html", url)
- }, "Raw HTML fetched");
+ logger2.info(
+ {
+ url,
+ useHeadless: false,
+ durationMs,
+ ...summarizePayload(response.data, "raw-html", url)
+ },
+ "Raw HTML fetched"
+ );
  return response.data;
  } catch (error) {
- logger2.error({ url, error: error instanceof Error ? error.message : String(error) }, "Failed to fetch HTML");
+ logger2.error(
+ { url, error: error instanceof Error ? error.message : String(error) },
+ "Failed to fetch HTML"
+ );
  throw new Error(
  `Failed to fetch ${url}: ${error instanceof Error ? error.message : String(error)}`
  );
@@ -718,4 +765,4 @@ var IntelligentCrawler = class extends EventEmitter {
  export {
  IntelligentCrawler
  };
- //# sourceMappingURL=chunk-RST4XGRL.js.map
+ //# sourceMappingURL=chunk-DC7CGSGT.js.map
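For orientation, the large added and removed blocks in the diff above are mostly the article-converter and markdown-utils modules moving to the top of the bundled chunk; apart from reformatted logger calls and a simplified null check, the conversion logic is unchanged. A minimal sketch of the flow those modules implement, using the same libraries, with the cheerio preprocessing, markdown cleanup pass, error handling, and logging omitted:

```typescript
// Simplified sketch of the convertHtmlToMarkdown flow shown in the diff above;
// not the package's actual export surface.
import { extractFromHtml } from "@extractus/article-extractor";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";

async function htmlToMarkdownSketch(html: string, url: string): Promise<string> {
  // Prefer the extracted article body; fall back to the full page on failure.
  const article = await extractFromHtml(html, url).catch(() => null);
  const articleHtml = article?.content ?? html;

  // Same Turndown configuration as the bundled code: ATX headings, fenced code blocks.
  const turndown = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
  });
  turndown.use(gfm);

  return turndown.turndown(articleHtml);
}
```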