webcontext-ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +583 -0
  3. package/dist/browser/manager.d.ts +47 -0
  4. package/dist/browser/manager.d.ts.map +1 -0
  5. package/dist/browser/manager.js +215 -0
  6. package/dist/browser/manager.js.map +1 -0
  7. package/dist/cache/cache.d.ts +22 -0
  8. package/dist/cache/cache.d.ts.map +1 -0
  9. package/dist/cache/cache.js +150 -0
  10. package/dist/cache/cache.js.map +1 -0
  11. package/dist/chunking/chunker.d.ts +26 -0
  12. package/dist/chunking/chunker.d.ts.map +1 -0
  13. package/dist/chunking/chunker.js +208 -0
  14. package/dist/chunking/chunker.js.map +1 -0
  15. package/dist/cli/index.d.ts +3 -0
  16. package/dist/cli/index.d.ts.map +1 -0
  17. package/dist/cli/index.js +406 -0
  18. package/dist/cli/index.js.map +1 -0
  19. package/dist/core/pipeline.d.ts +35 -0
  20. package/dist/core/pipeline.d.ts.map +1 -0
  21. package/dist/core/pipeline.js +476 -0
  22. package/dist/core/pipeline.js.map +1 -0
  23. package/dist/core/stream.d.ts +48 -0
  24. package/dist/core/stream.d.ts.map +1 -0
  25. package/dist/core/stream.js +72 -0
  26. package/dist/core/stream.js.map +1 -0
  27. package/dist/core/types.d.ts +259 -0
  28. package/dist/core/types.d.ts.map +1 -0
  29. package/dist/core/types.js +4 -0
  30. package/dist/core/types.js.map +1 -0
  31. package/dist/export/index.d.ts +3 -0
  32. package/dist/export/index.d.ts.map +1 -0
  33. package/dist/export/index.js +8 -0
  34. package/dist/export/index.js.map +1 -0
  35. package/dist/export/templates.d.ts +25 -0
  36. package/dist/export/templates.d.ts.map +1 -0
  37. package/dist/export/templates.js +76 -0
  38. package/dist/export/templates.js.map +1 -0
  39. package/dist/export/vectordb.d.ts +21 -0
  40. package/dist/export/vectordb.d.ts.map +1 -0
  41. package/dist/export/vectordb.js +101 -0
  42. package/dist/export/vectordb.js.map +1 -0
  43. package/dist/extractors/content.d.ts +23 -0
  44. package/dist/extractors/content.d.ts.map +1 -0
  45. package/dist/extractors/content.js +328 -0
  46. package/dist/extractors/content.js.map +1 -0
  47. package/dist/extractors/github.d.ts +19 -0
  48. package/dist/extractors/github.d.ts.map +1 -0
  49. package/dist/extractors/github.js +150 -0
  50. package/dist/extractors/github.js.map +1 -0
  51. package/dist/extractors/images.d.ts +20 -0
  52. package/dist/extractors/images.d.ts.map +1 -0
  53. package/dist/extractors/images.js +73 -0
  54. package/dist/extractors/images.js.map +1 -0
  55. package/dist/extractors/pdf.d.ts +11 -0
  56. package/dist/extractors/pdf.d.ts.map +1 -0
  57. package/dist/extractors/pdf.js +107 -0
  58. package/dist/extractors/pdf.js.map +1 -0
  59. package/dist/extractors/screenshot.d.ts +21 -0
  60. package/dist/extractors/screenshot.d.ts.map +1 -0
  61. package/dist/extractors/screenshot.js +85 -0
  62. package/dist/extractors/screenshot.js.map +1 -0
  63. package/dist/index.d.ts +70 -0
  64. package/dist/index.d.ts.map +1 -0
  65. package/dist/index.js +206 -0
  66. package/dist/index.js.map +1 -0
  67. package/dist/mcp-server.d.ts +3 -0
  68. package/dist/mcp-server.d.ts.map +1 -0
  69. package/dist/mcp-server.js +108 -0
  70. package/dist/mcp-server.js.map +1 -0
  71. package/dist/sdk/client.d.ts +48 -0
  72. package/dist/sdk/client.d.ts.map +1 -0
  73. package/dist/sdk/client.js +120 -0
  74. package/dist/sdk/client.js.map +1 -0
  75. package/dist/sdk/mcp.d.ts +12 -0
  76. package/dist/sdk/mcp.d.ts.map +1 -0
  77. package/dist/sdk/mcp.js +146 -0
  78. package/dist/sdk/mcp.js.map +1 -0
  79. package/dist/sdk/server.d.ts +5 -0
  80. package/dist/sdk/server.d.ts.map +1 -0
  81. package/dist/sdk/server.js +158 -0
  82. package/dist/sdk/server.js.map +1 -0
  83. package/dist/search/vector.d.ts +26 -0
  84. package/dist/search/vector.d.ts.map +1 -0
  85. package/dist/search/vector.js +142 -0
  86. package/dist/search/vector.js.map +1 -0
  87. package/dist/transformers/markdown.d.ts +21 -0
  88. package/dist/transformers/markdown.d.ts.map +1 -0
  89. package/dist/transformers/markdown.js +242 -0
  90. package/dist/transformers/markdown.js.map +1 -0
  91. package/dist/utils/dedup.d.ts +20 -0
  92. package/dist/utils/dedup.d.ts.map +1 -0
  93. package/dist/utils/dedup.js +61 -0
  94. package/dist/utils/dedup.js.map +1 -0
  95. package/dist/utils/index.d.ts +6 -0
  96. package/dist/utils/index.d.ts.map +1 -0
  97. package/dist/utils/index.js +15 -0
  98. package/dist/utils/index.js.map +1 -0
  99. package/dist/utils/metrics.d.ts +16 -0
  100. package/dist/utils/metrics.d.ts.map +1 -0
  101. package/dist/utils/metrics.js +28 -0
  102. package/dist/utils/metrics.js.map +1 -0
  103. package/dist/utils/scheduler.d.ts +19 -0
  104. package/dist/utils/scheduler.d.ts.map +1 -0
  105. package/dist/utils/scheduler.js +63 -0
  106. package/dist/utils/scheduler.js.map +1 -0
  107. package/dist/utils/sitemap.d.ts +17 -0
  108. package/dist/utils/sitemap.d.ts.map +1 -0
  109. package/dist/utils/sitemap.js +118 -0
  110. package/dist/utils/sitemap.js.map +1 -0
  111. package/dist/utils/validation.d.ts +142 -0
  112. package/dist/utils/validation.d.ts.map +1 -0
  113. package/dist/utils/validation.js +35 -0
  114. package/dist/utils/validation.js.map +1 -0
  115. package/dist/utils/webhook.d.ts +21 -0
  116. package/dist/utils/webhook.d.ts.map +1 -0
  117. package/dist/utils/webhook.js +108 -0
  118. package/dist/utils/webhook.js.map +1 -0
  119. package/package.json +109 -0
@@ -0,0 +1,215 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // tsc-emitted helper: reuse an existing this.__createBinding, else re-export m[k] on o as k2
3
+ if (k2 === undefined) k2 = k; // target key defaults to the source key
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { // no descriptor, accessor on a non-ES module, or a mutable data property
6
+ desc = { enumerable: true, get: function() { return m[k]; } }; // substitute a getter that reads m[k] at access time
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) { // fallback when Object.create is unavailable: copy the current value
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { // tsc-emitted helper: attach v to o as its "default" export
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v }); // enumerable; writable/configurable are left at their defineProperty defaults
15
+ }) : function(o, v) { // fallback when Object.create is unavailable: plain assignment
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || function (mod) { // tsc-emitted helper backing `import * as x` / dynamic import of a CommonJS module
19
+ if (mod && mod.__esModule) return mod; // modules flagged __esModule are returned untouched
20
+ var result = {};
21
+ if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); // bind every own key except "default"
22
+ __setModuleDefault(result, mod); // the original module object itself becomes result.default
23
+ return result;
24
+ };
25
+ var __importDefault = (this && this.__importDefault) || function (mod) { // tsc-emitted helper backing `import x from '...'`
26
+ return (mod && mod.__esModule) ? mod : { "default": mod }; // wrap modules not flagged __esModule so `.default` resolves to the module itself
27
+ };
28
+ Object.defineProperty(exports, "__esModule", { value: true });
29
+ exports.BrowserManager = void 0;
30
+ const robots_parser_1 = __importDefault(require("robots-parser"));
31
/** Retry policy used when a caller supplies none, or supplies only some of the fields. */
const DEFAULT_RETRY = { maxRetries: 3, backoffMs: 1000, backoffMultiplier: 2, retryOn: [429, 500, 502, 503, 504] };
/** Token-bucket settings used when a caller supplies none, or supplies only some of the fields. */
const DEFAULT_RATE_LIMIT = { requestsPerSecond: 2, burstSize: 5 };
/** Resource types aborted during page loads; only the resulting DOM is read. */
const BLOCKED_RESOURCE_TYPES = new Set(['image', 'media', 'font', 'stylesheet']);
/**
 * Browser manager using Playwright for JS-heavy page rendering.
 * Handles rate limiting, retry with backoff, and robots.txt compliance.
 */
class BrowserManager {
    browser = null;
    context = null;
    config;
    rateLimitConfig;
    robotsCache = new Map();
    tokens;
    lastRefill;
    /** In-flight launch shared by concurrent callers so only one browser is started. */
    launchPromise = null;
    /**
     * @param {object} [config] Browser options (headless, userAgent, viewport, proxy); merged over the defaults.
     * @param {object} [rateLimitConfig] Token-bucket settings; missing fields fall back to DEFAULT_RATE_LIMIT.
     */
    constructor(config = {}, rateLimitConfig) {
        this.config = {
            headless: true,
            userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
            viewport: { width: 1280, height: 720 },
            ...config,
        };
        // Fill in each field separately: a partial config used to leave `tokens`
        // as NaN, which silently disabled rate limiting.
        this.rateLimitConfig = {
            ...rateLimitConfig,
            requestsPerSecond: rateLimitConfig?.requestsPerSecond ?? DEFAULT_RATE_LIMIT.requestsPerSecond,
            burstSize: rateLimitConfig?.burstSize ?? DEFAULT_RATE_LIMIT.burstSize,
        };
        this.tokens = this.rateLimitConfig.burstSize;
        this.lastRefill = Date.now();
    }
    /** Credit the bucket for the time elapsed since the last refill, capped at burstSize. */
    refillTokens() {
        const now = Date.now();
        const elapsed = (now - this.lastRefill) / 1000;
        this.tokens = Math.min(this.rateLimitConfig.burstSize, this.tokens + elapsed * this.rateLimitConfig.requestsPerSecond);
        this.lastRefill = now;
    }
    /** Consume one token, sleeping first for the time needed to accrue it if the bucket is empty. */
    async waitForToken() {
        this.refillTokens();
        if (this.tokens >= 1) {
            this.tokens -= 1;
            return;
        }
        const waitMs = ((1 - this.tokens) / this.rateLimitConfig.requestsPerSecond) * 1000;
        await new Promise(resolve => setTimeout(resolve, waitMs));
        this.refillTokens();
        this.tokens -= 1;
    }
    /**
     * Start Chromium and create the shared browser context. No-op when already
     * launched; concurrent callers await the same launch.
     * @throws {Error} When Playwright is not installed or the browser cannot start.
     */
    async launch() {
        if (this.browser)
            return;
        if (!this.launchPromise) {
            this.launchPromise = this.startBrowser().finally(() => {
                this.launchPromise = null;
            });
        }
        await this.launchPromise;
    }
    /** Load Playwright, launch Chromium and publish browser + context together. */
    async startBrowser() {
        let chromium;
        try {
            const pw = await Promise.resolve().then(() => __importStar(require('playwright')));
            chromium = pw.chromium;
        }
        catch (err) {
            throw new Error('Playwright is required for JavaScript rendering but is not installed.\n' +
                'Install it with: npm install playwright && npx playwright install chromium\n' +
                'Or use { javascript: false } to extract without a browser.', { cause: err });
        }
        const browser = await chromium.launch({
            headless: this.config.headless,
            args: this.config.proxy ? [`--proxy-server=${this.config.proxy}`] : [],
        });
        try {
            this.context = await browser.newContext({
                userAgent: this.config.userAgent,
                viewport: this.config.viewport,
            });
        }
        catch (err) {
            // Do not leave a running browser behind without a usable context.
            await browser.close().catch(() => { });
            throw err;
        }
        // Assigned last so `this.browser` being set implies `this.context` is set.
        this.browser = browser;
    }
    /**
     * Check whether robots.txt of the URL's origin allows fetching it.
     * The parsed robots.txt is cached per origin until close().
     * @param {string} url Absolute URL.
     * @returns {Promise<boolean>} true when allowed or when robots.txt is unavailable.
     */
    async checkRobots(url) {
        const origin = new URL(url).origin;
        if (!this.robotsCache.has(origin)) {
            const robotsUrl = `${origin}/robots.txt`;
            let body = '';
            try {
                const response = await fetch(robotsUrl, {
                    headers: { 'User-Agent': this.config.userAgent || 'Mozilla/5.0' },
                    signal: AbortSignal.timeout(5000),
                });
                body = response.ok ? await response.text() : '';
            }
            catch {
                // Deliberate best effort: an unreachable robots.txt is treated as empty (allow all).
                body = '';
            }
            this.robotsCache.set(origin, (0, robots_parser_1.default)(robotsUrl, body));
        }
        return this.robotsCache.get(origin).isAllowed(url, this.config.userAgent || '*') ?? true;
    }
    /**
     * Run `fn`, retrying with exponential backoff whenever it throws.
     * @param {() => Promise<*>} fn Operation to attempt.
     * @param {object} [retryConfig] Retry policy; missing or NaN fields fall back to DEFAULT_RETRY.
     * @returns {Promise<*>} The first successful result of `fn`.
     * @throws The error from the last attempt once retries are exhausted.
     */
    async fetchWithRetry(fn, retryConfig = DEFAULT_RETRY) {
        const numberOr = (value, fallback) => (typeof value === 'number' && !Number.isNaN(value) ? value : fallback);
        // `fn` always runs at least once: a missing or negative maxRetries used to
        // skip the loop entirely and end in `throw null`.
        const maxRetries = Math.max(0, numberOr(retryConfig?.maxRetries, DEFAULT_RETRY.maxRetries));
        const backoffMs = Math.max(0, numberOr(retryConfig?.backoffMs, DEFAULT_RETRY.backoffMs));
        const backoffMultiplier = numberOr(retryConfig?.backoffMultiplier, DEFAULT_RETRY.backoffMultiplier);
        let lastError = null;
        for (let attempt = 0; attempt <= maxRetries; attempt++) {
            try {
                return await fn();
            }
            catch (err) {
                lastError = err;
                if (attempt === maxRetries)
                    break;
                const delay = backoffMs * Math.pow(backoffMultiplier, attempt);
                await new Promise(resolve => setTimeout(resolve, delay));
            }
        }
        throw lastError;
    }
    /**
     * Render a page in the shared browser context and return its HTML.
     * @param {string} url Page to load.
     * @param {object} [options] respectRobots, cookies, headers, timeout, waitForSelector, retryConfig.
     * @returns {Promise<{content: string, status: number}>}
     * @throws {Error} When blocked by robots.txt or when all attempts fail.
     */
    async fetchPage(url, options = {}) {
        if (options.respectRobots !== false) {
            const allowed = await this.checkRobots(url);
            if (!allowed)
                throw new Error(`Blocked by robots.txt: ${url}`);
        }
        await this.waitForToken();
        if (!this.context)
            await this.launch();
        const retryOn = options.retryConfig?.retryOn || DEFAULT_RETRY.retryOn;
        return this.fetchWithRetry(async () => {
            if (options.cookies?.length) {
                await this.context.addCookies(options.cookies.map((c) => ({ ...c, path: c.path || '/' })));
            }
            const page = await this.context.newPage();
            // Everything after newPage() sits inside try/finally so a failure while
            // configuring the page cannot leak it.
            try {
                if (options.headers) {
                    await page.setExtraHTTPHeaders(options.headers);
                }
                // Block unnecessary resources for speed
                await page.route('**/*', (route) => {
                    const type = route.request().resourceType();
                    const handled = BLOCKED_RESOURCE_TYPES.has(type) ? route.abort() : route.continue();
                    // The page may already be closed when a late request is routed; that
                    // rejection is expected and must not surface as an unhandled rejection.
                    handled.catch(() => { });
                });
                const response = await page.goto(url, {
                    waitUntil: 'domcontentloaded',
                    timeout: options.timeout || 30000,
                });
                if (options.waitForSelector) {
                    // A missing selector is not fatal; continue with whatever rendered.
                    await page.waitForSelector(options.waitForSelector, { timeout: 10000 }).catch(() => { });
                }
                // Wait for dynamic content
                await page.waitForTimeout(1000);
                const content = await page.content();
                const status = response?.status() || 200;
                // Throw on retryable status codes to trigger retry
                if (retryOn.includes(status)) {
                    throw new Error(`HTTP ${status}`);
                }
                return { content, status };
            }
            finally {
                await page.close();
            }
        }, options.retryConfig);
    }
    /**
     * Fetch a URL with plain HTTP (no browser) and return the raw body.
     * @param {string} url Resource to fetch.
     * @param {object} [options] respectRobots, headers, retryConfig.
     * @returns {Promise<{body: Buffer, status: number}>}
     * @throws {Error} When blocked by robots.txt or when all attempts fail.
     */
    async fetchStatic(url, options = {}) {
        if (options.respectRobots !== false) {
            const allowed = await this.checkRobots(url);
            if (!allowed)
                throw new Error(`Blocked by robots.txt: ${url}`);
        }
        await this.waitForToken();
        const retryOn = options.retryConfig?.retryOn || DEFAULT_RETRY.retryOn;
        return this.fetchWithRetry(async () => {
            const response = await fetch(url, {
                headers: {
                    'User-Agent': this.config.userAgent || 'WebContext/2.0',
                    ...options.headers,
                },
                signal: AbortSignal.timeout(30000),
            });
            const buffer = Buffer.from(await response.arrayBuffer());
            const status = response.status;
            if (retryOn.includes(status)) {
                throw new Error(`HTTP ${status}`);
            }
            return { body: buffer, status };
        }, options.retryConfig);
    }
    /** Close the context and browser (if any) and drop the cached robots.txt data. */
    async close() {
        const { context, browser } = this;
        this.context = null;
        this.browser = null;
        this.robotsCache.clear();
        try {
            if (context)
                await context.close();
        }
        finally {
            // Runs even when closing the context fails, so the browser process is not leaked.
            if (browser)
                await browser.close();
        }
    }
}
214
+ exports.BrowserManager = BrowserManager;
215
+ //# sourceMappingURL=manager.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"manager.js","sourceRoot":"","sources":["../../src/browser/manager.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,kEAAyC;AAGzC,MAAM,aAAa,GAAgB,EAAE,UAAU,EAAE,CAAC,EAAE,SAAS,EAAE,IAAI,EAAE,iBAAiB,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;AAChI,MAAM,kBAAkB,GAAoB,EAAE,iBAAiB,EAAE,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,CAAC;AAEnF;;;GAGG;AACH,MAAa,cAAc;IACjB,OAAO,GAAQ,IAAI,CAAC;IACpB,OAAO,GAAQ,IAAI,CAAC;IACpB,MAAM,CAAgB;IACtB,eAAe,CAAkB;IACjC,WAAW,GAAqB,IAAI,GAAG,EAAE,CAAC;IAC1C,MAAM,CAAS;IACf,UAAU,CAAS;IAE3B,YAAY,SAAwB,EAAE,EAAE,eAAiC;QACvE,IAAI,CAAC,MAAM,GAAG;YACZ,QAAQ,EAAE,IAAI;YACd,SAAS,EAAE,iHAAiH;YAC5H,QAAQ,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;YACtC,GAAG,MAAM;SACV,CAAC;QACF,IAAI,CAAC,eAAe,GAAG,eAAe,IAAI,kBAAkB,CAAC;QAC7D,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,eAAe,CAAC,SAAS,CAAC;QAC7C,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC/B,CAAC;IAEO,YAAY;QAClB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,MAAM,OAAO,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,IAAI,CAAC;QAC/C,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE,IAAI,CAAC,MAAM,GAAG,OAAO,GAAG,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,CAAC;QACvH,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC;IACxB,CAAC;IAEO,KAAK,CAAC,YAAY;QACxB,IAAI,CAAC,YAAY,EAAE,CAAC;QACpB,IAAI,IAAI,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YACrB,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC;YACjB,OAAO;QACT,CAAC;QACD,MAAM,MAAM,GAAG,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,eAAe,CAAC,iBAAiB,CAAC,GAAG,IAAI,CAAC;QACnF,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,CAAC;QAC1D,IAAI,CAAC,YAAY,EAAE,CAAC;QACpB,IAAI,CAAC,MAAM,IAAI,CAAC,CAAC;IACnB,CAAC;IAED,KAAK,CAAC,MAAM;QACV,IAAI,IAAI,CAAC,OAAO;YAAE,OAAO;QACzB,IAAI,QAAa,CAAC;QAClB,IAAI,CAAC;YACH,MAAM,EAAE,GAAG,wDAAa,YAAY,GAAC,CAAC;YACtC,QAAQ,GAAG,EAAE,CAAC,QAAQ,CAAC;QACzB,CAAC;QAAC,MAAM,CAAC;YACP,MAAM,IAAI,KAAK,CACb,yEAAyE;gBACzE,8EAA8E;gBAC9E,4DAA4D,CAC7D,CAAC;QACJ,CAAC;QACD,IAAI,CAAC,OAAO,GAAG,MAAM,QAAQ,CAAC,MAAM
,CAAC;YACnC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;YAC9B,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,kBAAkB,IAAI,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE;SACvE,CAAC,CAAC;QACH,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC;YAC3C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,QAAQ,EAAE,IAAI,CAAC,MAAM,CAAC,QAAQ;SAC/B,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,GAAW;QAC3B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC;QACnC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YAClC,IAAI,CAAC;gBACH,MAAM,SAAS,GAAG,GAAG,MAAM,aAAa,CAAC;gBACzC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,SAAS,EAAE;oBACtC,OAAO,EAAE,EAAE,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,aAAa,EAAE;oBACjE,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,IAAI,CAAC;iBAClC,CAAC,CAAC;gBACH,MAAM,IAAI,GAAG,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;gBACtD,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,EAAE,IAAA,uBAAY,EAAC,SAAS,EAAE,IAAI,CAAC,CAAC,CAAC;YAC9D,CAAC;YAAC,MAAM,CAAC;gBACP,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,EAAE,IAAA,uBAAY,EAAC,GAAG,MAAM,aAAa,EAAE,EAAE,CAAC,CAAC,CAAC;YACzE,CAAC;QACH,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,MAAM,CAAE,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,GAAG,CAAC,IAAI,IAAI,CAAC;IAC5F,CAAC;IAED,KAAK,CAAC,cAAc,CAAI,EAAoB,EAAE,cAA2B,aAAa;QACpF,IAAI,SAAS,GAAiB,IAAI,CAAC;QACnC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,WAAW,CAAC,UAAU,EAAE,OAAO,EAAE,EAAE,CAAC;YACnE,IAAI,CAAC;gBACH,OAAO,MAAM,EAAE,EAAE,CAAC;YACpB,CAAC;YAAC,OAAO,GAAQ,EAAE,CAAC;gBAClB,SAAS,GAAG,GAAG,CAAC;gBAChB,IAAI,OAAO,KAAK,WAAW,CAAC,UAAU;oBAAE,MAAM;gBAC9C,MAAM,KAAK,GAAG,WAAW,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,WAAW,CAAC,iBAAiB,EAAE,OAAO,CAAC,CAAC;gBACvF,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;QACD,MAAM,SAAS,CAAC;IAClB,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,GAAW,EAAE,UAOzB,EAAE;QACJ,IAAI,OAAO,CAAC,aAAa,KAAK,KAAK,EAAE,CAAC;YACpC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;YAC5C,IAAI,CAAC,OAAO;gBAAE,MAAM,IAAI,KAAK,CAAC,0BAA0
B,GAAG,EAAE,CAAC,CAAC;QACjE,CAAC;QAED,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;QAC1B,IAAI,CAAC,IAAI,CAAC,OAAO;YAAE,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;QAEvC,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,IAAI,EAAE;YACpC,IAAI,OAAO,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC;gBAC5B,MAAM,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;YAClG,CAAC;YAED,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC;YAC1C,IAAI,OAAO,CAAC,OAAO,EAAE,CAAC;gBACpB,MAAM,IAAI,CAAC,mBAAmB,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAClD,CAAC;YAED,wCAAwC;YACxC,MAAM,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,KAAU,EAAE,EAAE;gBACtC,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,EAAE,CAAC,YAAY,EAAE,CAAC;gBAC5C,IAAI,CAAC,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,YAAY,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC5D,KAAK,CAAC,KAAK,EAAE,CAAC;gBAChB,CAAC;qBAAM,CAAC;oBACN,KAAK,CAAC,QAAQ,EAAE,CAAC;gBACnB,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE;oBACpC,SAAS,EAAE,kBAAkB;oBAC7B,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,KAAK;iBAClC,CAAC,CAAC;gBAEH,IAAI,OAAO,CAAC,eAAe,EAAE,CAAC;oBAC5B,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,CAAC,eAAe,EAAE,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;gBAC1F,CAAC;gBAED,2BAA2B;gBAC3B,MAAM,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;gBAEhC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;gBACrC,MAAM,MAAM,GAAG,QAAQ,EAAE,MAAM,EAAE,IAAI,GAAG,CAAC;gBAEzC,mDAAmD;gBACnD,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,OAAO,IAAI,aAAa,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;oBAC7E,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,EAAE,CAAC,CAAC;gBACpC,CAAC;gBAED,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;YAC7B,CAAC;oBAAS,CAAC;gBACT,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;YACrB,CAAC;QACH,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAC1B,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,GAAW,EAAE,UAI3B,EAAE;QACJ,IAAI,OAAO,CAAC,aAAa,KAAK,KAAK,EAAE,CAAC;YACpC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;YAC5C,IAAI,CAAC,OAAO;gBAAE,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;QACjE
,CAAC;QAED,MAAM,IAAI,CAAC,YAAY,EAAE,CAAC;QAE1B,OAAO,IAAI,CAAC,cAAc,CAAC,KAAK,IAAI,EAAE;YACpC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,gBAAgB;oBACvD,GAAG,OAAO,CAAC,OAAO;iBACnB;gBACD,MAAM,EAAE,WAAW,CAAC,OAAO,CAAC,KAAK,CAAC;aACnC,CAAC,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;YACzD,MAAM,MAAM,GAAG,QAAQ,CAAC,MAAM,CAAC;YAE/B,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,OAAO,IAAI,aAAa,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC7E,MAAM,IAAI,KAAK,CAAC,QAAQ,MAAM,EAAE,CAAC,CAAC;YACpC,CAAC;YAED,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC;QAClC,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAC1B,CAAC;IAED,KAAK,CAAC,KAAK;QACT,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;IAC3B,CAAC;CACF;AA1MD,wCA0MC"}
@@ -0,0 +1,22 @@
1
+ import { CacheConfig, ExtractedContent, ContentDiff } from '../core/types';
2
+ /**
3
+ * Dual-layer cache (LRU memory + file-based) with content hashing for diff detection.
4
+ */
5
+ export declare class CrawlCache {
6
+ private memoryCache; // in-memory LRU of extracted content, keyed by URL
7
+ private hashCache; // in-memory LRU of content hashes, keyed by URL
8
+ private config; // cache settings with constructor defaults applied (enabled, ttl, maxSize, directory, contentHashing)
9
+ private cacheDir; // directory holding one JSON file per cached URL; defaults to .webcontext-cache under the working directory
10
+ constructor(config?: Partial<CacheConfig>); // creates cacheDir when caching is enabled and the directory is missing
11
+ private hash; // SHA-256 hex digest of a string
12
+ private urlToKey; // MD5 hex digest of the URL, used as the cache file name
13
+ private readFileEntry; // loads the on-disk entry for a URL; returns null when missing, unreadable or older than ttl
14
+ get(url: string): ExtractedContent | undefined; // memory first, then disk; undefined when disabled, the URL is invalid, or nothing is cached
15
+ set(url: string, content: ExtractedContent): void; // stores in memory and on disk; no-op when disabled or the URL is invalid
16
+ has(url: string): boolean; // true when a memory entry or an unexpired disk entry exists
17
+ invalidate(url: string): void; // removes the URL from both memory caches and deletes its cache file
18
+ clear(): void; // empties both memory caches and deletes every .json file in cacheDir
19
+ getContentHash(url: string): string | undefined; // stored hash from memory, else from the disk entry
20
+ hasChanged(url: string, newContent: string): ContentDiff; // compares hashes; on change, diffs markdown headings in newContent against the cached headings
21
+ }
22
+ //# sourceMappingURL=cache.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.d.ts","sourceRoot":"","sources":["../../src/cache/cache.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAU3E;;GAEG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,WAAW,CAAqC;IACxD,OAAO,CAAC,SAAS,CAA2B;IAC5C,OAAO,CAAC,MAAM,CAAc;IAC5B,OAAO,CAAC,QAAQ,CAAS;gBAEb,MAAM,GAAE,OAAO,CAAC,WAAW,CAAM;IAsB7C,OAAO,CAAC,IAAI;IAIZ,OAAO,CAAC,QAAQ;IAIhB,OAAO,CAAC,aAAa;IAgBrB,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,SAAS;IAgB9C,GAAG,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,GAAG,IAAI;IAajD,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAMzB,UAAU,CAAC,GAAG,EAAE,MAAM,GAAG,IAAI;IAO7B,KAAK,IAAI,IAAI;IAUb,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAO/C,UAAU,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,WAAW;CAsBzD"}
@@ -0,0 +1,150 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.CrawlCache = void 0;
4
+ const crypto_1 = require("crypto");
5
+ const fs_1 = require("fs");
6
+ const path_1 = require("path");
7
+ const lru_cache_1 = require("lru-cache");
8
+ const zod_1 = require("zod");
9
+ const urlSchema = zod_1.z.string().url();
10
/**
 * Dual-layer cache (LRU memory + file-based) with content hashing for diff detection.
 *
 * Each URL is stored in memory and as `<md5(url)>.json` inside the cache
 * directory. The file holds `{ hash, timestamp, data }`, where `timestamp` is
 * the write time in milliseconds and `ttl` is expressed in seconds.
 */
class CrawlCache {
    memoryCache;
    hashCache;
    config;
    cacheDir;
    /**
     * @param {object} [config] Partial cache settings; defaults are enabled=true,
     *   ttl=3600 (seconds), maxSize=500 and contentHashing=true.
     */
    constructor(config = {}) {
        this.config = {
            enabled: config.enabled ?? true,
            ttl: config.ttl ?? 3600,
            maxSize: config.maxSize ?? 500,
            directory: config.directory,
            contentHashing: config.contentHashing ?? true,
        };
        this.cacheDir = this.config.directory || (0, path_1.join)(process.cwd(), '.webcontext-cache');
        if (this.config.enabled) {
            // recursive mkdir succeeds when the directory already exists, so no
            // separate existence check (and no race with it) is needed.
            (0, fs_1.mkdirSync)(this.cacheDir, { recursive: true });
        }
        this.memoryCache = new lru_cache_1.LRUCache({
            max: this.config.maxSize,
            ttl: this.config.ttl * 1000,
        });
        this.hashCache = new lru_cache_1.LRUCache({
            max: this.config.maxSize,
            ttl: this.config.ttl * 1000,
        });
    }
    /** SHA-256 hex digest of `content`. */
    hash(content) {
        return (0, crypto_1.createHash)('sha256').update(content).digest('hex');
    }
    /** MD5 hex digest of `url`; used only to derive a file name, not for security. */
    urlToKey(url) {
        return (0, crypto_1.createHash)('md5').update(url).digest('hex');
    }
    /** Absolute path of the cache file that belongs to `url`. */
    entryPath(url) {
        return (0, path_1.join)(this.cacheDir, this.urlToKey(url) + '.json');
    }
    /**
     * Load the on-disk entry for `url`.
     * @returns {{hash: string, timestamp: number, data: object} | null} null when the
     *   file is missing, unreadable, malformed, or older than the configured ttl.
     */
    readFileEntry(url) {
        const filepath = this.entryPath(url);
        let entry;
        try {
            entry = JSON.parse((0, fs_1.readFileSync)(filepath, 'utf-8'));
        }
        catch {
            // Missing file or unparsable JSON: treat as a cache miss.
            return null;
        }
        // Reject entries without a usable timestamp, hash or payload. A missing
        // timestamp would otherwise give a NaN age that never expires.
        const wellFormed = entry !== null &&
            typeof entry === 'object' &&
            typeof entry.hash === 'string' &&
            Number.isFinite(entry.timestamp) &&
            entry.data != null;
        if (!wellFormed)
            return null;
        const ageSeconds = (Date.now() - entry.timestamp) / 1000;
        if (ageSeconds > this.config.ttl) {
            try {
                (0, fs_1.rmSync)(filepath, { force: true });
            }
            catch {
                // Best effort: an expired file that cannot be removed is still a miss.
            }
            return null;
        }
        return entry;
    }
    /**
     * Look up cached content for `url`, promoting a disk hit into memory.
     * @returns {object | undefined} undefined when disabled, the URL is invalid, or nothing is cached.
     */
    get(url) {
        if (!this.config.enabled)
            return undefined;
        if (!urlSchema.safeParse(url).success)
            return undefined;
        const memResult = this.memoryCache.get(url);
        if (memResult)
            return memResult;
        const fileEntry = this.readFileEntry(url);
        if (!fileEntry)
            return undefined;
        this.memoryCache.set(url, fileEntry.data);
        this.hashCache.set(url, fileEntry.hash);
        return fileEntry.data;
    }
    /** Store `content` for `url` in memory and on disk; no-op when disabled or the URL is invalid. */
    set(url, content) {
        if (!this.config.enabled)
            return;
        if (!urlSchema.safeParse(url).success)
            return;
        const contentHash = this.hash(JSON.stringify(content));
        this.memoryCache.set(url, content);
        this.hashCache.set(url, contentHash);
        const entry = { hash: contentHash, timestamp: Date.now(), data: content };
        (0, fs_1.writeFileSync)(this.entryPath(url), JSON.stringify(entry));
    }
    /** True when a memory entry or an unexpired disk entry exists for `url`. */
    has(url) {
        if (!this.config.enabled)
            return false;
        if (this.memoryCache.has(url))
            return true;
        return this.readFileEntry(url) !== null;
    }
    /** Drop `url` from both memory caches and delete its cache file if present. */
    invalidate(url) {
        this.memoryCache.delete(url);
        this.hashCache.delete(url);
        // force: true makes a missing file a no-op without a separate existence check.
        (0, fs_1.rmSync)(this.entryPath(url), { force: true });
    }
    /** Empty both memory caches and delete every `.json` file in the cache directory. */
    clear() {
        this.memoryCache.clear();
        this.hashCache.clear();
        if (!(0, fs_1.existsSync)(this.cacheDir))
            return;
        for (const file of (0, fs_1.readdirSync)(this.cacheDir)) {
            if (file.endsWith('.json'))
                (0, fs_1.rmSync)((0, path_1.join)(this.cacheDir, file), { force: true });
        }
    }
    /** Stored content hash for `url`, from memory or else from the disk entry. */
    getContentHash(url) {
        const memHash = this.hashCache.get(url);
        if (memHash)
            return memHash;
        return this.readFileEntry(url)?.hash;
    }
    /**
     * Compare `newContent` against what is cached for `url`.
     * @param {string} url Cache key.
     * @param {string} newContent Freshly extracted text; markdown headings are read from it.
     * @returns {{url: string, previousHash: string, currentHash: string, changed: boolean,
     *   addedSections: string[], removedSections: string[]}} `changed` is false when
     *   nothing was cached before.
     */
    hasChanged(url, newContent) {
        // NOTE(review): set() stores the hash of JSON.stringify(content) for the whole
        // content object, while this hashes the raw newContent string. Unless callers
        // pass the same serialization, the two hashes can never match and `changed`
        // is true for every cached URL. Confirm against the caller before relying on it.
        const currentHash = this.hash(newContent);
        const previousHash = this.getContentHash(url) || '';
        const changed = previousHash !== '' && previousHash !== currentHash;
        let addedSections = [];
        let removedSections = [];
        const cached = changed ? this.get(url) : undefined;
        if (cached) {
            // Guard against cached entries written without a headings array.
            const oldHeadings = Array.isArray(cached.headings) ? cached.headings.map(h => h.text) : [];
            const newHeadings = newContent.match(/^#{1,6}\s+(.+)$/gm)?.map(h => h.replace(/^#+\s+/, '')) || [];
            const oldSet = new Set(oldHeadings);
            const newSet = new Set(newHeadings);
            addedSections = newHeadings.filter(h => !oldSet.has(h));
            removedSections = oldHeadings.filter(h => !newSet.has(h));
        }
        return { url, previousHash, currentHash, changed, addedSections, removedSections };
    }
}
149
+ exports.CrawlCache = CrawlCache;
150
+ //# sourceMappingURL=cache.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.js","sourceRoot":"","sources":["../../src/cache/cache.ts"],"names":[],"mappings":";;;AAAA,mCAAoC;AACpC,2BAA6F;AAC7F,+BAA4B;AAC5B,yCAAqC;AACrC,6BAAwB;AAGxB,MAAM,SAAS,GAAG,OAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC;AAQnC;;GAEG;AACH,MAAa,UAAU;IACb,WAAW,CAAqC;IAChD,SAAS,CAA2B;IACpC,MAAM,CAAc;IACpB,QAAQ,CAAS;IAEzB,YAAY,SAA+B,EAAE;QAC3C,IAAI,CAAC,MAAM,GAAG;YACZ,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,IAAI;YAC/B,GAAG,EAAE,MAAM,CAAC,GAAG,IAAI,IAAI;YACvB,OAAO,EAAE,MAAM,CAAC,OAAO,IAAI,GAAG;YAC9B,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,cAAc,EAAE,MAAM,CAAC,cAAc,IAAI,IAAI;SAC9C,CAAC;QACF,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,IAAI,IAAA,WAAI,EAAC,OAAO,CAAC,GAAG,EAAE,EAAE,mBAAmB,CAAC,CAAC;QAClF,IAAI,IAAI,CAAC,MAAM,CAAC,OAAO,IAAI,CAAC,IAAA,eAAU,EAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;YACtD,IAAA,cAAS,EAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAChD,CAAC;QACD,IAAI,CAAC,WAAW,GAAG,IAAI,oBAAQ,CAA2B;YACxD,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO;YACxB,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,IAAI;SAC5B,CAAC,CAAC;QACH,IAAI,CAAC,SAAS,GAAG,IAAI,oBAAQ,CAAiB;YAC5C,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,OAAO;YACxB,GAAG,EAAE,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,IAAI;SAC5B,CAAC,CAAC;IACL,CAAC;IAEO,IAAI,CAAC,OAAe;QAC1B,OAAO,IAAA,mBAAU,EAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IAC5D,CAAC;IAEO,QAAQ,CAAC,GAAW;QAC1B,OAAO,IAAA,mBAAU,EAAC,KAAK,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;IACrD,CAAC;IAEO,aAAa,CAAC,GAAW;QAC/B,MAAM,QAAQ,GAAG,IAAA,WAAI,EAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC;QACnE,IAAI,CAAC,IAAA,eAAU,EAAC,QAAQ,CAAC;YAAE,OAAO,IAAI,CAAC;QACvC,IAAI,CAAC;YACH,MAAM,KAAK,GAAmB,IAAI,CAAC,KAAK,CAAC,IAAA,iBAAY,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAC,CAAC;YAC1E,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC;YAClD,IAAI,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,CAAC;gBAC1B,IAAA,WAAM,EAAC,QAAQ,CAAC,CAAC;gBACjB,OAAO,IAAI,CAAC;YACd,CAAC;YACD,OAAO,KAAK,CAAC;QACf,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd
,CAAC;IACH,CAAC;IAED,GAAG,CAAC,GAAW;QACb,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO;YAAE,OAAO,SAAS,CAAC;QAC3C,IAAI,CAAC;YAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC;YAAC,OAAO,SAAS,CAAC;QAAC,CAAC;QAEzD,MAAM,SAAS,GAAG,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC5C,IAAI,SAAS;YAAE,OAAO,SAAS,CAAC;QAEhC,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;QAC1C,IAAI,SAAS,EAAE,CAAC;YACd,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC;YACxC,OAAO,SAAS,CAAC,IAAI,CAAC;QACxB,CAAC;QACD,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,GAAG,CAAC,GAAW,EAAE,OAAyB;QACxC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO;YAAE,OAAO;QACjC,IAAI,CAAC;YAAC,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC;YAAC,OAAO;QAAC,CAAC;QAE/C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QACnC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,WAAW,CAAC,CAAC;QAErC,MAAM,KAAK,GAAmB,EAAE,IAAI,EAAE,WAAW,EAAE,SAAS,EAAE,IAAI,CAAC,GAAG,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;QAC1F,MAAM,QAAQ,GAAG,IAAA,WAAI,EAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC;QACnE,IAAA,kBAAa,EAAC,QAAQ,EAAE,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;IACjD,CAAC;IAED,GAAG,CAAC,GAAW;QACb,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,OAAO;YAAE,OAAO,KAAK,CAAC;QACvC,IAAI,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,IAAI,CAAC;QAC3C,OAAO,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC;IAC1C,CAAC;IAED,UAAU,CAAC,GAAW;QACpB,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC7B,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC3B,MAAM,QAAQ,GAAG,IAAA,WAAI,EAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC;QACnE,IAAI,IAAA,eAAU,EAAC,QAAQ,CAAC;YAAE,IAAA,WAAM,EAAC,QAAQ,CAAC,CAAC;IAC7C,CAAC;IAED,KAAK;QACH,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,CAAC;QACzB,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;QACvB,IAAI,IAAA,eAAU,EAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC9B,KAAK,MAAM,IAAI,IAAI,IAAA,gBAAW,E
AAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC9C,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC;oBAAE,IAAA,WAAM,EAAC,IAAA,WAAI,EAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;YAChE,CAAC;QACH,CAAC;IACH,CAAC;IAED,cAAc,CAAC,GAAW;QACxB,MAAM,OAAO,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACxC,IAAI,OAAO;YAAE,OAAO,OAAO,CAAC;QAC5B,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC;QAC1C,OAAO,SAAS,EAAE,IAAI,CAAC;IACzB,CAAC;IAED,UAAU,CAAC,GAAW,EAAE,UAAkB;QACxC,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC;QACpD,MAAM,OAAO,GAAG,YAAY,KAAK,EAAE,IAAI,YAAY,KAAK,WAAW,CAAC;QAEpE,IAAI,aAAa,GAAa,EAAE,CAAC;QACjC,IAAI,eAAe,GAAa,EAAE,CAAC;QAEnC,IAAI,OAAO,EAAE,CAAC;YACZ,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAC7B,IAAI,MAAM,EAAE,CAAC;gBACX,MAAM,WAAW,GAAG,MAAM,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;gBACrD,MAAM,WAAW,GAAG,UAAU,CAAC,KAAK,CAAC,mBAAmB,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;gBACnG,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;gBACpC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,CAAC;gBACpC,aAAa,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;gBACxD,eAAe,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC5D,CAAC;QACH,CAAC;QAED,OAAO,EAAE,GAAG,EAAE,YAAY,EAAE,WAAW,EAAE,OAAO,EAAE,aAAa,EAAE,eAAe,EAAE,CAAC;IACrF,CAAC;CACF;AArID,gCAqIC"}
@@ -0,0 +1,26 @@
1
+ import { ContentChunk, ChunkOptions, Heading } from '../core/types';
2
+ /**
3
+ * Token-aware content chunking for RAG/LLM pipelines.
4
+ * Chunk sizes are measured with a tiktoken encoder rather than estimated from character counts.
5
+ */
6
+ export declare class ContentChunker {
7
+ private encoder; // tiktoken encoder used by countTokens(); released by dispose()
8
+ private options; // chunking defaults; chunk() merges per-call options over these
9
+ constructor(options?: ChunkOptions);
10
+ /** Count the tokens in `text` using the tiktoken encoder. */
11
+ countTokens(text: string): number;
12
+ /** Split `markdown` into chunks with the configured strategy; per-call `options` override the instance options. */
13
+ chunk(markdown: string, sourceUrl: string, title: string, headings: Heading[], options?: ChunkOptions): ContentChunk[];
14
+ /** Free the memory held by the tiktoken encoder. */
15
+ dispose(): void;
16
+ private splitSemantic; // packs heading-delimited sections into chunks up to the token budget
17
+ private splitByHeading; // splits before level 1-3 markdown headings
18
+ private splitFixed; // packs lines into chunks up to the token budget
19
+ private splitByParagraph; // packs blank-line-separated paragraphs up to the token budget
20
+ private splitBySections; // splits at level 1-4 headings that are outside ``` fences
21
+ private mergeCodeBlocks; // joins a chunk holding an unclosed ``` fence with the chunk that follows it
22
+ private applyOverlap; // prefixes each chunk with trailing words of the previous chunk
23
+ private getHeadingPath; // heading texts (levels 1-3) found before or inside a chunk
24
+ private detectChunkLanguage; // language tag of the first ``` fence in a chunk, if any
25
+ }
26
+ //# sourceMappingURL=chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../../src/chunking/chunker.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,YAAY,EAAE,YAAY,EAAgC,OAAO,EAAE,MAAM,eAAe,CAAC;AAElG;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAW;IAC1B,OAAO,CAAC,OAAO,CAAyB;gBAE5B,OAAO,GAAE,YAAiB;IAWtC,kCAAkC;IAClC,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAIjC,8CAA8C;IAC9C,KAAK,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,YAAY,EAAE;IAwDtH,mCAAmC;IACnC,OAAO,IAAI,IAAI;IAIf,OAAO,CAAC,aAAa;IAkBrB,OAAO,CAAC,cAAc;IAItB,OAAO,CAAC,UAAU;IAkBlB,OAAO,CAAC,gBAAgB;IAkBxB,OAAO,CAAC,eAAe;IAmBvB,OAAO,CAAC,eAAe;IAiBvB,OAAO,CAAC,YAAY;IAepB,OAAO,CAAC,cAAc;IAatB,OAAO,CAAC,mBAAmB;CAI5B"}
@@ -0,0 +1,208 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.ContentChunker = void 0;
4
+ const tiktoken_1 = require("tiktoken");
5
+ const crypto_1 = require("crypto");
6
/**
 * Token-aware content chunking for RAG/LLM pipelines.
 * Uses actual tiktoken for accurate token counting.
 */
class ContentChunker {
    encoder;
    options;
    /**
     * @param {object} [options] Chunking defaults. Fields left undefined fall back to
     *   maxTokens 1500, overlap 100, strategy 'semantic', preserveCodeBlocks true and
     *   preserveHeadings true.
     */
    constructor(options = {}) {
        this.encoder = (0, tiktoken_1.get_encoding)('cl100k_base');
        this.options = {
            maxTokens: options.maxTokens ?? 1500,
            overlap: options.overlap ?? 100,
            strategy: options.strategy ?? 'semantic',
            preserveCodeBlocks: options.preserveCodeBlocks ?? true,
            preserveHeadings: options.preserveHeadings ?? true,
        };
    }
    /**
     * Count tokens using tiktoken.
     * @param {string} text Text to measure.
     * @returns {number} Number of tokens the encoder produces for `text`.
     */
    countTokens(text) {
        return this.encoder.encode(text).length;
    }
    /**
     * Chunk content using the configured strategy.
     *
     * @param {string} markdown Full markdown document.
     * @param {string} sourceUrl Stored in each chunk's metadata and mixed into its id.
     * @param {string} title Stored in each chunk's metadata.
     * @param {Array<{level: number, text: string}>} headings Headings of the document;
     *   only `level` and `text` are read.
     * @param {object} [options] Per-call overrides of the instance options. Fields that
     *   are undefined are ignored so they cannot erase a configured default.
     * @returns {Array<object>} One `{ id, content, tokens, metadata }` object per chunk.
     */
    chunk(markdown, sourceUrl, title, headings, options) {
        const opts = { ...this.options };
        for (const [key, value] of Object.entries(options ?? {})) {
            // An explicit `undefined` (e.g. an unset CLI flag) must not disable the token budget.
            if (value !== undefined) {
                opts[key] = value;
            }
        }
        let rawChunks;
        switch (opts.strategy) {
            case 'heading':
                rawChunks = this.splitByHeading(markdown);
                break;
            case 'fixed':
                rawChunks = this.splitFixed(markdown, opts.maxTokens);
                break;
            case 'paragraph':
                rawChunks = this.splitByParagraph(markdown, opts.maxTokens);
                break;
            case 'semantic':
            default:
                rawChunks = this.splitSemantic(markdown, opts.maxTokens);
                break;
        }
        // Preserve code blocks by merging split blocks
        if (opts.preserveCodeBlocks) {
            rawChunks = this.mergeCodeBlocks(rawChunks, opts.maxTokens);
        }
        // Enforce max token size. Note that mergeCodeBlocks allows up to 1.5x the budget,
        // so a merged chunk above maxTokens is split again here.
        rawChunks = rawChunks.flatMap((c) => this.countTokens(c) > opts.maxTokens ? this.splitFixed(c, opts.maxTokens) : [c]);
        // Apply overlap
        const finalChunks = opts.overlap > 0 && rawChunks.length > 1
            ? this.applyOverlap(rawChunks, opts.overlap)
            : rawChunks;
        const totalChunks = finalChunks.length;
        return finalChunks.map((text, i) => {
            const id = (0, crypto_1.createHash)('sha256').update(`${sourceUrl}:${i}:${text.slice(0, 64)}`).digest('hex').slice(0, 16);
            // Locate the chunk by its pre-overlap text: the overlap prefix is re-joined with
            // single spaces and usually does not occur verbatim in the document.
            const headingPath = this.getHeadingPath(markdown, rawChunks[i], headings);
            return {
                id,
                content: text,
                tokens: this.countTokens(text),
                metadata: {
                    sourceUrl,
                    title,
                    headingPath,
                    chunkIndex: i,
                    totalChunks,
                    hasCode: /```[\s\S]*?```/.test(text),
                    language: this.detectChunkLanguage(text),
                },
            };
        });
    }
    /** Free tiktoken encoder memory */
    dispose() {
        this.encoder.free();
    }
    /**
     * Pack heading-delimited sections into chunks of at most `maxTokens` tokens.
     * A single section larger than the budget is emitted as-is.
     */
    splitSemantic(content, maxTokens) {
        return this.packUnits(this.splitBySections(content), '\n\n', maxTokens);
    }
    /**
     * Split before every level 1-3 heading. Headings inside ``` fences are ignored so
     * that `# comment` lines in code do not cut a code block apart.
     */
    splitByHeading(content) {
        return this.splitBySections(content, /^#{1,3}(?:\s|$)/);
    }
    /**
     * Pack lines into chunks of at most `maxTokens` tokens. A line that exceeds the
     * budget on its own is split further (at whitespace, then inside words).
     *
     * @param {string} content Text to split.
     * @param {number} maxTokens Token budget per chunk. A non-finite or < 1 budget
     *   cannot be honoured, so the content is returned unsplit.
     * @returns {string[]} Trimmed, non-empty chunks.
     */
    splitFixed(content, maxTokens) {
        if (!Number.isFinite(maxTokens) || maxTokens < 1) {
            const whole = content.trim();
            return whole ? [whole] : [];
        }
        const chunks = [];
        let buffer = '';
        const flush = () => {
            const text = buffer.trim();
            if (text) {
                chunks.push(text);
            }
            buffer = '';
        };
        for (const line of content.split('\n')) {
            const candidate = buffer ? `${buffer}\n${line}` : line;
            if (this.countTokens(candidate) <= maxTokens) {
                buffer = candidate;
                continue;
            }
            const hadBuffer = buffer !== '';
            flush();
            // With an empty buffer the candidate was the line itself, already known to be too big.
            if (hadBuffer && this.countTokens(line) <= maxTokens) {
                buffer = line;
                continue;
            }
            const pieces = this.splitOversizedLine(line, maxTokens);
            // Keep the last piece open so following lines can still be packed with it.
            buffer = pieces.pop() ?? '';
            chunks.push(...pieces);
        }
        flush();
        return chunks;
    }
    /**
     * Pack blank-line-separated paragraphs into chunks of at most `maxTokens` tokens.
     * A single paragraph larger than the budget is emitted as-is.
     */
    splitByParagraph(content, maxTokens) {
        const paragraphs = content.split(/\n\n+/).filter((p) => p.trim());
        return this.packUnits(paragraphs, '\n\n', maxTokens);
    }
    /**
     * Split markdown at heading lines that are outside ``` fences.
     *
     * @param {string} markdown Text to split.
     * @param {RegExp} [headingPattern] Tested against each line; defaults to level 1-4 headings.
     * @returns {string[]} Trimmed, non-empty sections, each starting at a heading
     *   (except possibly the first).
     */
    splitBySections(markdown, headingPattern = /^#{1,4}\s/) {
        const parts = [];
        let current = '';
        let inCodeBlock = false;
        for (const line of markdown.split('\n')) {
            if (line.startsWith('```')) {
                inCodeBlock = !inCodeBlock;
            }
            if (!inCodeBlock && headingPattern.test(line) && current.trim()) {
                parts.push(current.trim());
                current = `${line}\n`;
            }
            else {
                current += `${line}\n`;
            }
        }
        if (current.trim()) {
            parts.push(current.trim());
        }
        return parts;
    }
    /**
     * Join a chunk that contains an odd number of ``` markers (an unclosed fence) with
     * the chunk that follows it, as long as the result stays within 1.5x `maxTokens`.
     */
    mergeCodeBlocks(chunks, maxTokens) {
        const result = [];
        for (let i = 0; i < chunks.length; i++) {
            const fenceCount = (chunks[i].match(/```/g) || []).length;
            if (fenceCount % 2 !== 0 && i + 1 < chunks.length) {
                const merged = `${chunks[i]}\n${chunks[i + 1]}`;
                if (this.countTokens(merged) <= maxTokens * 1.5) {
                    result.push(merged);
                    i++; // the following chunk has been consumed
                    continue;
                }
            }
            result.push(chunks[i]);
        }
        return result;
    }
    /**
     * Prefix every chunk but the first with the trailing words of its predecessor,
     * taking as many words as fit in `overlapTokens` tokens.
     *
     * @returns {string[]} Array of the same length as `chunks`.
     */
    applyOverlap(chunks, overlapTokens) {
        if (chunks.length === 0) {
            return [];
        }
        const result = [chunks[0]];
        for (let i = 1; i < chunks.length; i++) {
            const prevWords = chunks[i - 1].split(/\s+/);
            let overlap = '';
            // Grow the tail one word at a time until it no longer fits.
            for (let j = prevWords.length - 1; j >= 0; j--) {
                const candidate = prevWords.slice(j).join(' ');
                if (this.countTokens(candidate) > overlapTokens) {
                    break;
                }
                overlap = candidate;
            }
            result.push(overlap ? `${overlap}\n${chunks[i]}` : chunks[i]);
        }
        return result;
    }
    /**
     * Build the heading trail (levels 1-3) that applies to a chunk.
     *
     * @param {string} fullContent The whole document.
     * @param {string} chunkText Chunk text; its first 50 characters are used to locate it.
     * @param {Array<{level: number, text: string}>} headings Document headings
     *   (assumed to be in document order - TODO confirm against the extractor).
     * @returns {string[]} Heading texts from outermost to innermost; empty when the
     *   chunk cannot be located in `fullContent`.
     */
    getHeadingPath(fullContent, chunkText, headings) {
        const chunkStart = fullContent.indexOf(chunkText.slice(0, 50));
        if (chunkStart === -1) {
            return [];
        }
        const before = fullContent.slice(0, chunkStart);
        const path = [];
        for (const h of headings) {
            if (!(h.level >= 1 && h.level <= 3)) {
                continue;
            }
            if (before.includes(h.text) || chunkText.includes(h.text)) {
                // A new heading closes every deeper heading of the previous branch.
                path.length = h.level;
                path[h.level - 1] = h.text;
            }
        }
        return path.filter(Boolean);
    }
    /** Return the language tag of the first ``` fence in `content`, or undefined. */
    detectChunkLanguage(content) {
        const match = content.match(/```(\w+)/);
        return match?.[1];
    }
    /**
     * Greedily join `units` with `separator` so each chunk stays within `maxTokens`.
     * A unit that is too large on its own still becomes a chunk by itself.
     */
    packUnits(units, separator, maxTokens) {
        const chunks = [];
        let buffer = '';
        for (const unit of units) {
            const combined = buffer ? buffer + separator + unit : unit;
            if (buffer && this.countTokens(combined) > maxTokens) {
                chunks.push(buffer.trim());
                buffer = unit;
            }
            else {
                buffer = combined;
            }
        }
        if (buffer.trim()) {
            chunks.push(buffer.trim());
        }
        return chunks;
    }
    /**
     * Split one over-long line into pieces of at most `maxTokens` tokens, breaking at
     * whitespace where possible and inside a word only when the word itself is too big.
     */
    splitOversizedLine(line, maxTokens) {
        const pieces = [];
        let current = '';
        const emit = () => {
            const text = current.trim();
            if (text) {
                pieces.push(text);
            }
            current = '';
        };
        // The capturing group keeps the whitespace runs so original spacing is preserved.
        for (const part of line.split(/(\s+)/)) {
            if (!part) {
                continue;
            }
            if (this.countTokens(current + part) <= maxTokens) {
                current += part;
                continue;
            }
            emit();
            if (this.countTokens(part) <= maxTokens) {
                current = part;
                continue;
            }
            for (const fragment of this.bisectByTokens(part, maxTokens)) {
                emit();
                current = fragment;
            }
        }
        emit();
        return pieces;
    }
    /**
     * Halve `text` (by code points, so surrogate pairs stay intact) until every part
     * fits in `maxTokens` tokens or is a single code point.
     */
    bisectByTokens(text, maxTokens) {
        const points = [...text];
        if (points.length <= 1 || this.countTokens(text) <= maxTokens) {
            return [text];
        }
        const mid = Math.ceil(points.length / 2);
        return [
            ...this.bisectByTokens(points.slice(0, mid).join(''), maxTokens),
            ...this.bisectByTokens(points.slice(mid).join(''), maxTokens),
        ];
    }
}
207
+ exports.ContentChunker = ContentChunker;
208
+ //# sourceMappingURL=chunker.js.map