@cosmocoder/mcp-web-docs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +368 -0
  3. package/build/__mocks__/embeddings.d.ts +17 -0
  4. package/build/__mocks__/embeddings.js +66 -0
  5. package/build/__mocks__/embeddings.js.map +1 -0
  6. package/build/config.d.ts +44 -0
  7. package/build/config.js +158 -0
  8. package/build/config.js.map +1 -0
  9. package/build/config.test.d.ts +1 -0
  10. package/build/config.test.js +165 -0
  11. package/build/config.test.js.map +1 -0
  12. package/build/crawler/auth.d.ts +128 -0
  13. package/build/crawler/auth.js +546 -0
  14. package/build/crawler/auth.js.map +1 -0
  15. package/build/crawler/auth.test.d.ts +1 -0
  16. package/build/crawler/auth.test.js +174 -0
  17. package/build/crawler/auth.test.js.map +1 -0
  18. package/build/crawler/base.d.ts +24 -0
  19. package/build/crawler/base.js +149 -0
  20. package/build/crawler/base.js.map +1 -0
  21. package/build/crawler/base.test.d.ts +1 -0
  22. package/build/crawler/base.test.js +234 -0
  23. package/build/crawler/base.test.js.map +1 -0
  24. package/build/crawler/browser-config.d.ts +2 -0
  25. package/build/crawler/browser-config.js +29 -0
  26. package/build/crawler/browser-config.js.map +1 -0
  27. package/build/crawler/browser-config.test.d.ts +1 -0
  28. package/build/crawler/browser-config.test.js +56 -0
  29. package/build/crawler/browser-config.test.js.map +1 -0
  30. package/build/crawler/cheerio.d.ts +11 -0
  31. package/build/crawler/cheerio.js +134 -0
  32. package/build/crawler/cheerio.js.map +1 -0
  33. package/build/crawler/chromium.d.ts +21 -0
  34. package/build/crawler/chromium.js +596 -0
  35. package/build/crawler/chromium.js.map +1 -0
  36. package/build/crawler/content-extractor-types.d.ts +25 -0
  37. package/build/crawler/content-extractor-types.js +2 -0
  38. package/build/crawler/content-extractor-types.js.map +1 -0
  39. package/build/crawler/content-extractors.d.ts +9 -0
  40. package/build/crawler/content-extractors.js +9 -0
  41. package/build/crawler/content-extractors.js.map +1 -0
  42. package/build/crawler/content-utils.d.ts +2 -0
  43. package/build/crawler/content-utils.js +22 -0
  44. package/build/crawler/content-utils.js.map +1 -0
  45. package/build/crawler/content-utils.test.d.ts +1 -0
  46. package/build/crawler/content-utils.test.js +99 -0
  47. package/build/crawler/content-utils.test.js.map +1 -0
  48. package/build/crawler/crawlee-crawler.d.ts +63 -0
  49. package/build/crawler/crawlee-crawler.js +342 -0
  50. package/build/crawler/crawlee-crawler.js.map +1 -0
  51. package/build/crawler/crawlee-crawler.test.d.ts +1 -0
  52. package/build/crawler/crawlee-crawler.test.js +280 -0
  53. package/build/crawler/crawlee-crawler.test.js.map +1 -0
  54. package/build/crawler/default-extractor.d.ts +4 -0
  55. package/build/crawler/default-extractor.js +26 -0
  56. package/build/crawler/default-extractor.js.map +1 -0
  57. package/build/crawler/default-extractor.test.d.ts +1 -0
  58. package/build/crawler/default-extractor.test.js +200 -0
  59. package/build/crawler/default-extractor.test.js.map +1 -0
  60. package/build/crawler/default.d.ts +11 -0
  61. package/build/crawler/default.js +138 -0
  62. package/build/crawler/default.js.map +1 -0
  63. package/build/crawler/docs-crawler.d.ts +26 -0
  64. package/build/crawler/docs-crawler.js +97 -0
  65. package/build/crawler/docs-crawler.js.map +1 -0
  66. package/build/crawler/docs-crawler.test.d.ts +1 -0
  67. package/build/crawler/docs-crawler.test.js +185 -0
  68. package/build/crawler/docs-crawler.test.js.map +1 -0
  69. package/build/crawler/factory.d.ts +6 -0
  70. package/build/crawler/factory.js +83 -0
  71. package/build/crawler/factory.js.map +1 -0
  72. package/build/crawler/github-pages-extractor.d.ts +4 -0
  73. package/build/crawler/github-pages-extractor.js +33 -0
  74. package/build/crawler/github-pages-extractor.js.map +1 -0
  75. package/build/crawler/github-pages-extractor.test.d.ts +1 -0
  76. package/build/crawler/github-pages-extractor.test.js +184 -0
  77. package/build/crawler/github-pages-extractor.test.js.map +1 -0
  78. package/build/crawler/github.d.ts +20 -0
  79. package/build/crawler/github.js +181 -0
  80. package/build/crawler/github.js.map +1 -0
  81. package/build/crawler/github.test.d.ts +1 -0
  82. package/build/crawler/github.test.js +326 -0
  83. package/build/crawler/github.test.js.map +1 -0
  84. package/build/crawler/puppeteer.d.ts +16 -0
  85. package/build/crawler/puppeteer.js +191 -0
  86. package/build/crawler/puppeteer.js.map +1 -0
  87. package/build/crawler/queue-manager.d.ts +43 -0
  88. package/build/crawler/queue-manager.js +169 -0
  89. package/build/crawler/queue-manager.js.map +1 -0
  90. package/build/crawler/queue-manager.test.d.ts +1 -0
  91. package/build/crawler/queue-manager.test.js +509 -0
  92. package/build/crawler/queue-manager.test.js.map +1 -0
  93. package/build/crawler/site-rules.d.ts +11 -0
  94. package/build/crawler/site-rules.js +104 -0
  95. package/build/crawler/site-rules.js.map +1 -0
  96. package/build/crawler/site-rules.test.d.ts +1 -0
  97. package/build/crawler/site-rules.test.js +139 -0
  98. package/build/crawler/site-rules.test.js.map +1 -0
  99. package/build/crawler/storybook-extractor.d.ts +34 -0
  100. package/build/crawler/storybook-extractor.js +767 -0
  101. package/build/crawler/storybook-extractor.js.map +1 -0
  102. package/build/crawler/storybook-extractor.test.d.ts +1 -0
  103. package/build/crawler/storybook-extractor.test.js +491 -0
  104. package/build/crawler/storybook-extractor.test.js.map +1 -0
  105. package/build/embeddings/fastembed.d.ts +25 -0
  106. package/build/embeddings/fastembed.js +188 -0
  107. package/build/embeddings/fastembed.js.map +1 -0
  108. package/build/embeddings/fastembed.test.d.ts +1 -0
  109. package/build/embeddings/fastembed.test.js +307 -0
  110. package/build/embeddings/fastembed.test.js.map +1 -0
  111. package/build/embeddings/openai.d.ts +8 -0
  112. package/build/embeddings/openai.js +56 -0
  113. package/build/embeddings/openai.js.map +1 -0
  114. package/build/embeddings/types.d.ts +4 -0
  115. package/build/embeddings/types.js +2 -0
  116. package/build/embeddings/types.js.map +1 -0
  117. package/build/index.d.ts +2 -0
  118. package/build/index.js +1007 -0
  119. package/build/index.js.map +1 -0
  120. package/build/index.test.d.ts +1 -0
  121. package/build/index.test.js +364 -0
  122. package/build/index.test.js.map +1 -0
  123. package/build/indexing/queue-manager.d.ts +36 -0
  124. package/build/indexing/queue-manager.js +86 -0
  125. package/build/indexing/queue-manager.js.map +1 -0
  126. package/build/indexing/queue-manager.test.d.ts +1 -0
  127. package/build/indexing/queue-manager.test.js +257 -0
  128. package/build/indexing/queue-manager.test.js.map +1 -0
  129. package/build/indexing/status.d.ts +39 -0
  130. package/build/indexing/status.js +207 -0
  131. package/build/indexing/status.js.map +1 -0
  132. package/build/indexing/status.test.d.ts +1 -0
  133. package/build/indexing/status.test.js +246 -0
  134. package/build/indexing/status.test.js.map +1 -0
  135. package/build/processor/content.d.ts +16 -0
  136. package/build/processor/content.js +286 -0
  137. package/build/processor/content.js.map +1 -0
  138. package/build/processor/content.test.d.ts +1 -0
  139. package/build/processor/content.test.js +369 -0
  140. package/build/processor/content.test.js.map +1 -0
  141. package/build/processor/markdown.d.ts +11 -0
  142. package/build/processor/markdown.js +256 -0
  143. package/build/processor/markdown.js.map +1 -0
  144. package/build/processor/markdown.test.d.ts +1 -0
  145. package/build/processor/markdown.test.js +312 -0
  146. package/build/processor/markdown.test.js.map +1 -0
  147. package/build/processor/metadata-parser.d.ts +37 -0
  148. package/build/processor/metadata-parser.js +245 -0
  149. package/build/processor/metadata-parser.js.map +1 -0
  150. package/build/processor/metadata-parser.test.d.ts +1 -0
  151. package/build/processor/metadata-parser.test.js +357 -0
  152. package/build/processor/metadata-parser.test.js.map +1 -0
  153. package/build/processor/processor.d.ts +8 -0
  154. package/build/processor/processor.js +190 -0
  155. package/build/processor/processor.js.map +1 -0
  156. package/build/processor/processor.test.d.ts +1 -0
  157. package/build/processor/processor.test.js +357 -0
  158. package/build/processor/processor.test.js.map +1 -0
  159. package/build/rag/cache.d.ts +10 -0
  160. package/build/rag/cache.js +10 -0
  161. package/build/rag/cache.js.map +1 -0
  162. package/build/rag/code-generator.d.ts +11 -0
  163. package/build/rag/code-generator.js +30 -0
  164. package/build/rag/code-generator.js.map +1 -0
  165. package/build/rag/context-assembler.d.ts +23 -0
  166. package/build/rag/context-assembler.js +113 -0
  167. package/build/rag/context-assembler.js.map +1 -0
  168. package/build/rag/docs-search.d.ts +55 -0
  169. package/build/rag/docs-search.js +380 -0
  170. package/build/rag/docs-search.js.map +1 -0
  171. package/build/rag/pipeline.d.ts +26 -0
  172. package/build/rag/pipeline.js +91 -0
  173. package/build/rag/pipeline.js.map +1 -0
  174. package/build/rag/query-processor.d.ts +14 -0
  175. package/build/rag/query-processor.js +57 -0
  176. package/build/rag/query-processor.js.map +1 -0
  177. package/build/rag/reranker.d.ts +55 -0
  178. package/build/rag/reranker.js +210 -0
  179. package/build/rag/reranker.js.map +1 -0
  180. package/build/rag/response-generator.d.ts +20 -0
  181. package/build/rag/response-generator.js +101 -0
  182. package/build/rag/response-generator.js.map +1 -0
  183. package/build/rag/retriever.d.ts +19 -0
  184. package/build/rag/retriever.js +111 -0
  185. package/build/rag/retriever.js.map +1 -0
  186. package/build/rag/validator.d.ts +22 -0
  187. package/build/rag/validator.js +128 -0
  188. package/build/rag/validator.js.map +1 -0
  189. package/build/rag/version-manager.d.ts +23 -0
  190. package/build/rag/version-manager.js +98 -0
  191. package/build/rag/version-manager.js.map +1 -0
  192. package/build/setupTests.d.ts +4 -0
  193. package/build/setupTests.js +50 -0
  194. package/build/setupTests.js.map +1 -0
  195. package/build/storage/storage.d.ts +38 -0
  196. package/build/storage/storage.js +700 -0
  197. package/build/storage/storage.js.map +1 -0
  198. package/build/storage/storage.test.d.ts +1 -0
  199. package/build/storage/storage.test.js +338 -0
  200. package/build/storage/storage.test.js.map +1 -0
  201. package/build/types/rag.d.ts +27 -0
  202. package/build/types/rag.js +2 -0
  203. package/build/types/rag.js.map +1 -0
  204. package/build/types.d.ts +120 -0
  205. package/build/types.js +2 -0
  206. package/build/types.js.map +1 -0
  207. package/build/util/content-utils.d.ts +31 -0
  208. package/build/util/content-utils.js +120 -0
  209. package/build/util/content-utils.js.map +1 -0
  210. package/build/util/content.d.ts +1 -0
  211. package/build/util/content.js +16 -0
  212. package/build/util/content.js.map +1 -0
  213. package/build/util/docs.d.ts +1 -0
  214. package/build/util/docs.js +26 -0
  215. package/build/util/docs.js.map +1 -0
  216. package/build/util/docs.test.d.ts +1 -0
  217. package/build/util/docs.test.js +49 -0
  218. package/build/util/docs.test.js.map +1 -0
  219. package/build/util/favicon.d.ts +6 -0
  220. package/build/util/favicon.js +88 -0
  221. package/build/util/favicon.js.map +1 -0
  222. package/build/util/favicon.test.d.ts +1 -0
  223. package/build/util/favicon.test.js +140 -0
  224. package/build/util/favicon.test.js.map +1 -0
  225. package/build/util/logger.d.ts +17 -0
  226. package/build/util/logger.js +72 -0
  227. package/build/util/logger.js.map +1 -0
  228. package/build/util/logger.test.d.ts +1 -0
  229. package/build/util/logger.test.js +46 -0
  230. package/build/util/logger.test.js.map +1 -0
  231. package/build/util/security.d.ts +312 -0
  232. package/build/util/security.js +719 -0
  233. package/build/util/security.js.map +1 -0
  234. package/build/util/security.test.d.ts +1 -0
  235. package/build/util/security.test.js +524 -0
  236. package/build/util/security.test.js.map +1 -0
  237. package/build/util/site-detector.d.ts +22 -0
  238. package/build/util/site-detector.js +42 -0
  239. package/build/util/site-detector.js.map +1 -0
  240. package/package.json +112 -0
@@ -0,0 +1,357 @@
1
+ import { extractProps, extractCodeBlocks, determineContentType, parseMetadata } from './metadata-parser.js';
2
+ describe('Metadata Parser', () => {
3
describe('extractProps', () => {
  // Happy path: a four-column table under a "Props" heading yields one prop per body row.
  it('should extract props from a standard markdown table', () => {
    const markdown = `# Button Component

## Props

| Name | Type | Default | Description |
|------|------|---------|-------------|
| variant | string | 'primary' | The button style |
| disabled | boolean | false | Whether button is disabled |
| onClick | function | - | Click handler |
`;
    const rows = extractProps(markdown);
    expect(rows).toHaveLength(3);

    const [variantProp, disabledProp, clickProp] = rows;
    expect(variantProp.name).toBe('variant');
    expect(variantProp.type).toBe('string');
    expect(variantProp.defaultValue).toBe("'primary'");
    expect(disabledProp.name).toBe('disabled');
    expect(disabledProp.type).toBe('boolean');
    expect(clickProp.name).toBe('onClick');
  });

  it('should detect required props from asterisk', () => {
    const markdown = `## Props

| Name | Type | Description |
|------|------|-------------|
| children* | ReactNode | Required content |
| optional | string | Optional value |
`;
    const rows = extractProps(markdown);
    expect(rows).toHaveLength(2);

    const [starred, plain] = rows;
    // Only checks that the starred row produced some name; the exact name is not pinned here.
    expect(starred.name).toBeTruthy();
    expect(plain.required).toBe(false);
  });

  it('should handle table without Props heading', () => {
    const markdown = `# Component API

| Name | Type | Default |
|------|------|---------|
| size | string | 'medium' |
| color | string | 'blue' |
`;
    expect(extractProps(markdown)).toHaveLength(2);
  });

  // Header cells use alternative spellings (Prop / Types / Desc / DefaultValue).
  it('should handle different column names', () => {
    const markdown = `## Props

| Prop | Types | Desc | DefaultValue |
|------|-------|------|--------------|
| label | string | The label text | '' |
`;
    const rows = extractProps(markdown);
    expect(rows).toHaveLength(1);
    expect(rows[0].name).toBe('label');
  });

  it('should deduplicate props', () => {
    const markdown = `## Props

| Name | Type |
|------|------|
| value | string |

## More Props

| Name | Type |
|------|------|
| value | number |
`;
    const rows = extractProps(markdown);
    // The same prop name appears in two tables; only one entry may survive.
    const namedValue = rows.filter((row) => row.name === 'value');
    expect(namedValue).toHaveLength(1);
  });

  it('should handle escaped pipes in tables', () => {
    const markdown = `## Props

| Name | Type | Description |
|------|------|-------------|
| value | string \\| number | Can be string or number |
`;
    const rows = extractProps(markdown);
    expect(rows).toHaveLength(1);
    expect(rows[0].type).toContain('|');
  });

  it('should return empty array for content without props', () => {
    const markdown = `# Overview

This is just an overview without any props table.
`;
    expect(extractProps(markdown)).toEqual([]);
  });

  // No table at all: props are described inline as `name` - text (type: ...).
  it('should extract props from inline patterns as fallback', () => {
    const markdown = `# API

\`value\` - The current value (type: string)
\`onChange\` - Change handler (type: function)
`;
    const rows = extractProps(markdown);
    expect(rows).toHaveLength(2);

    const [valueProp, changeProp] = rows;
    expect(valueProp.name).toBe('value');
    expect(changeProp.name).toBe('onChange');
  });
});
109
describe('extractCodeBlocks', () => {
  it('should extract code blocks with language', () => {
    const markdown = `# Examples

Here's a JavaScript example:

\`\`\`javascript
const x = 1;
console.log(x);
\`\`\`

And Python:

\`\`\`python
x = 1
print(x)
\`\`\`
`;
    const fences = extractCodeBlocks(markdown);
    expect(fences).toHaveLength(2);

    const [jsFence, pyFence] = fences;
    expect(jsFence.language).toBe('javascript');
    expect(jsFence.code).toContain('const x = 1');
    expect(pyFence.language).toBe('python');
    expect(pyFence.code).toContain('x = 1');
  });

  // A fence with no info string is reported with the 'plaintext' language.
  it('should extract code blocks without language', () => {
    const markdown = `# Code

\`\`\`
some code here
\`\`\`
`;
    const fences = extractCodeBlocks(markdown);
    expect(fences).toHaveLength(1);
    expect(fences[0].language).toBe('plaintext');
  });

  it('should include context from preceding text', () => {
    const markdown = `# Installation

Run this command:

\`\`\`bash
npm install package
\`\`\`
`;
    const fences = extractCodeBlocks(markdown);
    expect(fences).toHaveLength(1);
    // Either the heading or the paragraph is acceptable; the test only requires a non-empty context.
    expect(fences[0].context).toBeTruthy();
  });

  it('should handle multiple code blocks', () => {
    const markdown = `
\`\`\`js
code1
\`\`\`

\`\`\`js
code2
\`\`\`

\`\`\`js
code3
\`\`\`
`;
    expect(extractCodeBlocks(markdown)).toHaveLength(3);
  });

  it('should return empty array for content without code blocks', () => {
    const markdown = `# Just Text

No code blocks here.
`;
    expect(extractCodeBlocks(markdown)).toEqual([]);
  });

  // Multi-line JSX with braces, arrows and nested quotes must stay inside one block.
  it('should handle code blocks with complex content', () => {
    const markdown = `## Component Usage

\`\`\`jsx
import { Button } from 'ui';

export function App() {
return (
<Button
variant="primary"
onClick={() => console.log('clicked')}
>
Click me
</Button>
);
}
\`\`\`
`;
    const fences = extractCodeBlocks(markdown);
    expect(fences).toHaveLength(1);

    const { code } = fences[0];
    expect(code).toContain('import { Button }');
    expect(code).toContain('export function App');
  });
});
208
describe('determineContentType', () => {
  // Each scenario is [test title, expected content type, markdown fixture].
  // The tests are registered in the order listed, one `it` per scenario.
  const scenarios = [
    [
      'should identify API content',
      'api',
      `# API Reference

## Parameters

| Name | Type |
|------|------|
| value | string |

## Returns

The processed value.
`,
    ],
    [
      'should identify API content from props table',
      'api',
      `# Button Props

| Name | Type | Description |
|------|------|-------------|
| variant | string | Button style |
`,
    ],
    [
      'should identify example content',
      'example',
      `# Examples

## Basic Example

\`\`\`js
const x = 1;
\`\`\`

## Advanced Example

\`\`\`js
const y = 2;
\`\`\`

## Another Example

\`\`\`js
const z = 3;
\`\`\`
`,
    ],
    [
      'should identify usage content',
      'usage',
      `# Getting Started

## Installation

Run npm install to get started.

## How to Use

Follow these steps...
`,
    ],
    [
      'should default to overview for general content',
      'overview',
      `# About Our Product

This is a great product that does many things.

## Features

- Feature 1
- Feature 2
`,
    ],
  ];

  for (const [title, expectedType, markdown] of scenarios) {
    it(title, () => {
      expect(determineContentType(markdown)).toBe(expectedType);
    });
  }
});
282
describe('parseMetadata', () => {
  // parseMetadata is checked through its three result fields: props, codeBlocks, contentType.
  it('should parse all metadata from content', () => {
    const page = `# Component

## Props

| Name | Type |
|------|------|
| value | string |

## Example

\`\`\`jsx
<Component value="test" />
\`\`\`
`;
    const { props, codeBlocks, contentType } = parseMetadata(page);
    expect(props).toHaveLength(1);
    expect(codeBlocks).toHaveLength(1);
    expect(contentType).toBe('api');
  });

  it('should return empty arrays for minimal content', () => {
    const { props, codeBlocks, contentType } = parseMetadata('Just some plain text.');
    expect(props).toEqual([]);
    expect(codeBlocks).toEqual([]);
    expect(contentType).toBe('overview');
  });

  // A realistic page: a props table with escaped pipes plus three fenced examples.
  it('should handle complex documentation page', () => {
    const page = `# Button Component

A versatile button component for your application.

## Props

| Name | Type | Default | Description |
|------|------|---------|-------------|
| variant | 'primary' \\| 'secondary' | 'primary' | Button style variant |
| size | 'sm' \\| 'md' \\| 'lg' | 'md' | Button size |
| disabled | boolean | false | Disabled state |
| onClick | () => void | - | Click handler |

## Basic Usage

Import and use the button:

\`\`\`tsx
import { Button } from '@ui/components';

function App() {
return <Button variant="primary">Click me</Button>;
}
\`\`\`

## Variants

### Primary Button

\`\`\`tsx
<Button variant="primary">Primary</Button>
\`\`\`

### Secondary Button

\`\`\`tsx
<Button variant="secondary">Secondary</Button>
\`\`\`
`;
    const { props, codeBlocks, contentType } = parseMetadata(page);
    expect(props).toHaveLength(4);
    expect(codeBlocks).toHaveLength(3);
    expect(contentType).toBe('api');
  });
});
356
+ });
357
+ //# sourceMappingURL=metadata-parser.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"metadata-parser.test.js","sourceRoot":"","sources":["../../src/processor/metadata-parser.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,iBAAiB,EAAE,oBAAoB,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAE5G,QAAQ,CAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,QAAQ,CAAC,cAAc,EAAE,GAAG,EAAE;QAC5B,EAAE,CAAC,qDAAqD,EAAE,GAAG,EAAE;YAC7D,MAAM,OAAO,GAAG;;;;;;;;;CASrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YAEpC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACrC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAChD,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;YACvC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QACxC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;YACpD,MAAM,OAAO,GAAG;;;;;;CAMrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YAEpC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7B,4DAA4D;YAC5D,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,UAAU,EAAE,CAAC;YACnC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACxC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2CAA2C,EAAE,GAAG,EAAE;YACnD,MAAM,OAAO,GAAG;;;;;;CAMrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YAEpC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;YAC9C,MAAM,OAAO,GAAG;;;;;CAKrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YAEpC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACtC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0BAA0B,EAAE,GAAG,EAAE;YAClC,MAAM,OAAO,GAAG;;;;;;;;;;;CAWrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YAEpC,yCAAyC;YACzC,MAA
M,UAAU,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,OAAO,CAAC,CAAC;YAC3D,MAAM,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACpC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,OAAO,GAAG;;;;;CAKrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YAEpC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qDAAqD,EAAE,GAAG,EAAE;YAC7D,MAAM,OAAO,GAAG;;;CAGrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YACpC,MAAM,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uDAAuD,EAAE,GAAG,EAAE;YAC/D,MAAM,OAAO,GAAG;;;;CAIrB,CAAC;YAEI,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;YAEpC,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACpC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACzC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;QACjC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,OAAO,GAAG;;;;;;;;;;;;;;;CAerB,CAAC;YAEI,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAE1C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC9B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;YAChD,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,GAAG,EAAE;YACrD,MAAM,OAAO,GAAG;;;;;CAKrB,CAAC;YAEI,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAE1C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC9B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;YACpD,MAAM,OAAO,GAAG;;;;;;;CAOrB,CAAC;YAEI,MAAM,MAAM
,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAE1C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC9B,yEAAyE;YACzE,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,UAAU,EAAE,CAAC;QACzC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;YAC5C,MAAM,OAAO,GAAG;;;;;;;;;;;;CAYrB,CAAC;YAEI,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAC1C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,2DAA2D,EAAE,GAAG,EAAE;YACnE,MAAM,OAAO,GAAG;;;CAGrB,CAAC;YAEI,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAC1C,MAAM,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC7B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;YACxD,MAAM,OAAO,GAAG;;;;;;;;;;;;;;;;CAgBrB,CAAC;YAEI,MAAM,MAAM,GAAG,iBAAiB,CAAC,OAAO,CAAC,CAAC;YAE1C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC9B,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAC;YACtD,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,SAAS,CAAC,qBAAqB,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,sBAAsB,EAAE,GAAG,EAAE;QACpC,EAAE,CAAC,6BAA6B,EAAE,GAAG,EAAE;YACrC,MAAM,UAAU,GAAG;;;;;;;;;;;CAWxB,CAAC;YAEI,MAAM,CAAC,oBAAoB,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACvD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;YACtD,MAAM,YAAY,GAAG;;;;;CAK1B,CAAC;YAEI,MAAM,CAAC,oBAAoB,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACzD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iCAAiC,EAAE,GAAG,EAAE;YACzC,MAAM,cAAc,GAAG;;;;;;;;;;;;;;;;;;;CAmB5B,CAAC;YAEI,MAAM,CAAC,oBAAoB,CAAC,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;YACvC,MAAM,YAAY,GAAG;;;;;;;;;CAS1B,CAAC;YAEI,MAAM,CAAC,oBAAoB,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC3D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;YACxD,MAAM,eAAe,GAAG;;;;;;;;CAQ7B,CAAC;YAEI,MAAM,CAAC,oBAAoB,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QACjE,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,eAAe,EAAE,GAAG,EAAE;QAC7B,EAAE,CAAC,wCAAwC,EAAE,GAA
G,EAAE;YAChD,MAAM,OAAO,GAAG;;;;;;;;;;;;;CAarB,CAAC;YAEI,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;YAExC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACtC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC3C,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;YACxD,MAAM,OAAO,GAAG,uBAAuB,CAAC;YAExC,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;YAExC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YACnC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;YACxC,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,OAAO,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAsCrB,CAAC;YAEI,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;YAExC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACtC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC3C,MAAM,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAC3C,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,8 @@
1
+ import { CrawlResult, DocumentProcessor, ProcessedDocument } from '../types.js';
2
+ import { EmbeddingsProvider } from '../embeddings/types.js';
3
+ export declare class WebDocumentProcessor implements DocumentProcessor { // Ambient declaration only: describes the class shape; the implementation is not part of this declaration.
4
+ private readonly embeddings; // Private member; its type is not exposed here. Presumably holds the constructor's `embeddings` argument — confirm in the implementation.
5
+ private readonly maxChunkSize; // Private member; its type is not exposed here. Presumably holds the constructor's `maxChunkSize` argument — confirm in the implementation.
6
+ constructor(embeddings: EmbeddingsProvider, maxChunkSize?: number); // `maxChunkSize` is optional; the default applied when it is omitted is not visible in this declaration.
7
+ process(crawlResult: CrawlResult): Promise<ProcessedDocument>; // Asynchronous: takes one CrawlResult and resolves to a ProcessedDocument.
8
+ }
@@ -0,0 +1,190 @@
1
+ import { processHtmlContent } from './content.js';
2
+ import { processMarkdownContent, processExtractedContent } from './markdown.js';
3
+ import { isMarkdownPath } from '../config.js';
4
+ import { logger } from '../util/logger.js';
5
+ import { parseMetadata } from './metadata-parser.js';
6
/**
 * Names of extractors whose output is already formatted or plain text
 * (not raw HTML). Matched against `crawlResult.extractorUsed`.
 */
const FORMATTED_CONTENT_EXTRACTORS = [
    'StorybookExtractor',
    'GithubPagesExtractor',
    // Crawlee's default extractor yields plain text rather than HTML.
    'DefaultExtractor',
    // Register further extractors here as they are implemented.
];
13
/**
 * Build a chunk record for `content`, enriched with metadata parsed from it.
 *
 * @param {string} content - Chunk text; also the input handed to `parseMetadata`.
 * @param {{url: *, title: *, path: *}} baseMetadata - Document-level fields copied onto the chunk.
 * @param {number} startLine - First line attributed to the chunk.
 * @param {number} endLine - Last line attributed to the chunk.
 * @param {*} vector - Embedding stored alongside the chunk.
 * @returns {object} Chunk with `content`, line range, `vector`, `url`, `title`, `path` and a
 *   `metadata` object (`type`, `props`, `codeBlocks`).
 */
function createChunkWithMetadata(content, baseMetadata, startLine, endLine, vector) {
    const { contentType, props, codeBlocks } = parseMetadata(content);
    const { url, title, path } = baseMetadata;
    // Empty collections are reported as `undefined` rather than as empty arrays.
    const metadata = {
        type: contentType,
        props: props.length > 0 ? props : undefined,
        codeBlocks: codeBlocks.length > 0 ? codeBlocks : undefined,
    };
    return { content, startLine, endLine, vector, url, title, path, metadata };
}
33
/**
 * Lazily split `content` into embedded chunks of roughly `maxChunkSize` tokens.
 *
 * The text is cut on blank-line boundaries into sections. Consecutive sections are packed
 * together (joined by a blank line) until the estimated token budget would be exceeded.
 * A section that is larger than the budget on its own is cut further into sentence groups.
 * A single sentence that exceeds the budget is still emitted whole.
 *
 * Line numbers are approximate: blank separator lines consumed by the split are not
 * counted, and every sentence group of an oversized section reports that section's
 * starting line.
 *
 * @param {string} content - Text to chunk.
 * @param {number} maxChunkSize - Budget per chunk, in estimated tokens (~4 characters each).
 * @param {{embed: function(string): Promise<*>}} embeddings - Provider used to embed each chunk.
 * @param {{url: *, title: *, path: *}} metadata - Document fields forwarded to `createChunkWithMetadata`.
 * @yields {object} One chunk per call to `createChunkWithMetadata`, in document order.
 */
async function* semanticChunker(content, maxChunkSize, embeddings, metadata) {
    if (content.trim().length === 0) {
        return;
    }
    // Rough approximation: 4 characters per token.
    const estimateTokens = (text) => Math.ceil(text.length / 4);
    // Packing stops a few tokens short of the limit to leave headroom for the estimate.
    const packLimit = maxChunkSize - 5;
    // Embed exactly the text that is stored, so the vector always matches the chunk content.
    const buildChunk = async (text, firstLine, lastLine) => {
        const trimmed = text.trim();
        const vector = await embeddings.embed(trimmed);
        return createChunkWithMetadata(trimmed, metadata, firstLine, lastLine, vector);
    };
    // Split content into semantic sections (paragraphs, lists, code blocks).
    const sections = content.split(/(?:\r?\n){2,}/);
    let currentChunk = '';
    let startLine = 0;
    let currentLine = 0;
    let tokenCount = 0;
    for (const section of sections) {
        const sectionLineCount = section.split('\n').length;
        const sectionText = section.trim();
        if (sectionText.length === 0) {
            currentLine += sectionLineCount;
            continue;
        }
        const sectionTokens = estimateTokens(sectionText);
        if (sectionTokens > maxChunkSize) {
            // Flush whatever was being packed before handling the oversized section.
            if (currentChunk.trim().length > 0) {
                yield await buildChunk(currentChunk, startLine, currentLine - 1);
            }
            // Reset the packing state so the next chunk starts with a clean budget
            // and picks up its own start line.
            currentChunk = '';
            tokenCount = 0;
            // Sentence split that loses nothing: each piece is either text ending in
            // punctuation (or a bare punctuation run), or the unterminated tail of the section.
            const sentences = sectionText.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [sectionText];
            let sentenceChunk = '';
            let sentenceTokens = 0;
            for (const sentence of sentences) {
                const nextTokens = estimateTokens(sentence);
                if (sentenceTokens + nextTokens > packLimit) {
                    if (sentenceChunk.trim().length > 0) {
                        yield await buildChunk(sentenceChunk, currentLine, currentLine + sentenceChunk.split('\n').length - 1);
                    }
                    sentenceChunk = sentence;
                    sentenceTokens = nextTokens;
                }
                else {
                    sentenceChunk += ` ${sentence}`;
                    sentenceTokens += nextTokens;
                }
            }
            // Emit the remaining sentence group.
            if (sentenceChunk.trim().length > 0) {
                yield await buildChunk(sentenceChunk, currentLine, currentLine + sentenceChunk.split('\n').length - 1);
            }
        }
        else if (tokenCount + sectionTokens > packLimit) {
            // Adding this section would overflow: flush and start a new chunk with it.
            if (currentChunk.trim().length > 0) {
                yield await buildChunk(currentChunk, startLine, currentLine - 1);
            }
            currentChunk = sectionText;
            tokenCount = sectionTokens;
            startLine = currentLine;
        }
        else {
            if (currentChunk.length > 0) {
                currentChunk += '\n\n';
            }
            else {
                // First section of a fresh chunk: record where it begins.
                startLine = currentLine;
            }
            currentChunk += sectionText;
            tokenCount += sectionTokens;
        }
        currentLine += sectionLineCount;
    }
    // Emit the final chunk if anything is left.
    if (currentChunk.trim().length > 0) {
        yield await buildChunk(currentChunk, startLine, currentLine - 1);
    }
}
111
/**
 * Converts a crawl result into a processed document: the content is parsed by the
 * processor matching its origin, each resulting component is chunked and embedded via
 * `semanticChunker`, and the chunks are returned with document-level metadata.
 */
export class WebDocumentProcessor {
    embeddings;
    maxChunkSize;
    /**
     * @param {object} embeddings - Embeddings provider forwarded to `semanticChunker`.
     * @param {number} [maxChunkSize=1000] - Chunk budget forwarded to `semanticChunker`.
     */
    constructor(embeddings, maxChunkSize = 1000) {
        this.embeddings = embeddings;
        this.maxChunkSize = maxChunkSize;
    }
    /**
     * Parse, chunk and embed a single crawl result.
     *
     * @param {object} crawlResult - Reads `url`, `path`, `content` and `extractorUsed`.
     * @returns {Promise<{metadata: {url: *, title: *, lastIndexed: Date}, chunks: object[]}>}
     * @throws {Error} When the content processor returns nothing or no chunks are produced;
     *   any other failure is logged and rethrown unchanged.
     */
    async process(crawlResult) {
        const tag = '[WebDocumentProcessor]';
        logger.debug(`${tag} Processing ${crawlResult.url}`);
        logger.debug(`${tag} Content length: ${crawlResult.content.length} bytes`);
        logger.debug(`${tag} Extractor used: ${crawlResult.extractorUsed || 'unknown'}`);
        try {
            // Pick the content processor: pre-formatted extractor output first,
            // then raw markdown files, and raw HTML as the fallback.
            let parse;
            let parserLabel;
            if (crawlResult.extractorUsed && FORMATTED_CONTENT_EXTRACTORS.includes(crawlResult.extractorUsed)) {
                parse = processExtractedContent;
                parserLabel = `extracted content processor for ${crawlResult.extractorUsed}`;
            }
            else if (isMarkdownPath(crawlResult.path)) {
                parse = processMarkdownContent;
                parserLabel = `markdown processor for ${crawlResult.path}`;
            }
            else {
                parse = processHtmlContent;
                parserLabel = `HTML processor for ${crawlResult.path}`;
            }
            logger.debug(`${tag} Using ${parserLabel}`);
            const processedContent = await parse(crawlResult);
            if (!processedContent) {
                logger.error(`${tag} Failed to parse document content for ${crawlResult.url}`);
                throw new Error('Failed to parse document content');
            }
            logger.debug(`${tag} Successfully processed content for ${crawlResult.url}`);
            const { article } = processedContent;
            logger.debug(`${tag} Found ${article.components.length} components`);
            logger.debug(`${tag} Creating chunks for ${article.title}`);
            const metadata = {
                url: article.url,
                title: article.title,
                path: article.path,
            };
            // Each component is chunked on its own, with its title prepended to its body.
            const chunks = [];
            for (const { title, body } of article.components) {
                logger.debug(`${tag} Processing component: ${title}`);
                logger.debug(`${tag} Component body length: ${body.length} bytes`);
                const pieces = semanticChunker(`${title}\n\n${body}`, this.maxChunkSize, this.embeddings, metadata);
                for await (const piece of pieces) {
                    chunks.push(piece);
                }
            }
            logger.debug(`${tag} Created ${chunks.length} chunks`);
            if (chunks.length === 0) {
                logger.warn(`${tag} No valid chunks were created for ${crawlResult.url}`);
                logger.warn(`${tag} Original content length: ${crawlResult.content.length}`);
                logger.warn(`${tag} Processed content length: ${processedContent.content.length}`);
                throw new Error('No valid chunks were created');
            }
            logger.debug(`${tag} Successfully processed ${crawlResult.url}`);
            return {
                metadata: {
                    url: crawlResult.url,
                    title: article.title,
                    lastIndexed: new Date(),
                },
                chunks,
            };
        }
        catch (error) {
            // Log with context, then let the caller decide how to handle the failure.
            logger.error(`${tag} Error processing ${crawlResult.url}:`, error);
            logger.error(`${tag} Error details:`, error instanceof Error ? error.stack : error);
            throw error;
        }
    }
}
190
+ //# sourceMappingURL=processor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"processor.js","sourceRoot":"","sources":["../../src/processor/processor.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;AAClD,OAAO,EAAE,sBAAsB,EAAE,uBAAuB,EAAE,MAAM,eAAe,CAAC;AAChF,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAErD,gFAAgF;AAChF,MAAM,4BAA4B,GAAG;IACnC,oBAAoB;IACpB,sBAAsB;IACtB,kBAAkB,EAAE,2DAA2D;IAC/E,kDAAkD;CACnD,CAAC;AAEF;;GAEG;AACH,SAAS,uBAAuB,CAC9B,OAAe,EACf,YAA0D,EAC1D,SAAiB,EACjB,OAAe,EACf,MAAgB;IAEhB,MAAM,MAAM,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;IAEtC,OAAO;QACL,OAAO;QACP,SAAS;QACT,OAAO;QACP,MAAM;QACN,GAAG,EAAE,YAAY,CAAC,GAAG;QACrB,KAAK,EAAE,YAAY,CAAC,KAAK;QACzB,IAAI,EAAE,YAAY,CAAC,IAAI;QACvB,QAAQ,EAAE;YACR,IAAI,EAAE,MAAM,CAAC,WAAW;YACxB,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS;YACzD,UAAU,EAAE,MAAM,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS;SACzE;KACF,CAAC;AACJ,CAAC;AAED,KAAK,SAAS,CAAC,CAAC,eAAe,CAC7B,OAAe,EACf,YAAoB,EACpB,UAA8B,EAC9B,QAIC;IAED,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAChC,OAAO;IACT,CAAC;IAED,wEAAwE;IACxE,MAAM,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC;IAChD,IAAI,YAAY,GAAG,EAAE,CAAC;IACtB,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,YAAY,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QACzC,MAAM,WAAW,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QAEnC,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC7B,WAAW,IAAI,YAAY,CAAC,MAAM,CAAC;YACnC,SAAS;QACX,CAAC;QAED,gEAAgE;QAChE,MAAM,aAAa,GAAG,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAExD,kDAAkD;QAClD,IAAI,aAAa,GAAG,YAAY,EAAE,CAAC;YACjC,yCAAyC;YACzC,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;gBACpD,MAAM,uBAAuB,CAAC,YAAY,CAAC,IAAI,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,GAAG,CAAC,EAAE,MAAM,CAAC,CAAC;gBACjG,YAAY,GAAG,EAAE,CA
AC;YACpB,CAAC;YAED,mCAAmC;YACnC,MAAM,SAAS,GAAG,WAAW,CAAC,KAAK,CAAC,gBAAgB,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YACvE,IAAI,aAAa,GAAG,EAAE,CAAC;YACvB,IAAI,cAAc,GAAG,CAAC,CAAC;YAEvB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;gBACjC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBAElD,IAAI,cAAc,GAAG,UAAU,GAAG,YAAY,GAAG,CAAC,EAAE,CAAC;oBACnD,IAAI,aAAa,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBACpC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;wBACrD,MAAM,uBAAuB,CAC3B,aAAa,CAAC,IAAI,EAAE,EACpB,QAAQ,EACR,WAAW,EACX,WAAW,GAAG,aAAa,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,EAClD,MAAM,CACP,CAAC;oBACJ,CAAC;oBACD,aAAa,GAAG,QAAQ,CAAC;oBACzB,cAAc,GAAG,UAAU,CAAC;gBAC9B,CAAC;qBAAM,CAAC;oBACN,aAAa,IAAI,GAAG,GAAG,QAAQ,CAAC;oBAChC,cAAc,IAAI,UAAU,CAAC;gBAC/B,CAAC;YACH,CAAC;YAED,iCAAiC;YACjC,IAAI,aAAa,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACpC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;gBACrD,MAAM,uBAAuB,CAC3B,aAAa,CAAC,IAAI,EAAE,EACpB,QAAQ,EACR,WAAW,EACX,WAAW,GAAG,aAAa,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,EAClD,MAAM,CACP,CAAC;YACJ,CAAC;QACH,CAAC;QACD,8EAA8E;aACzE,IAAI,UAAU,GAAG,aAAa,GAAG,YAAY,GAAG,CAAC,EAAE,CAAC;YACvD,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACnC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;gBACpD,MAAM,uBAAuB,CAAC,YAAY,CAAC,IAAI,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,GAAG,CAAC,EAAE,MAAM,CAAC,CAAC;YACnG,CAAC;YACD,YAAY,GAAG,WAAW,CAAC;YAC3B,UAAU,GAAG,aAAa,CAAC;YAC3B,SAAS,GAAG,WAAW,CAAC;QAC1B,CAAC;QACD,yCAAyC;aACpC,CAAC;YACJ,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC5B,YAAY,IAAI,MAAM,CAAC;YACzB,CAAC;YACD,YAAY,IAAI,WAAW,CAAC;YAC5B,UAAU,IAAI,aAAa,CAAC;QAC9B,CAAC;QAED,WAAW,IAAI,YAAY,CAAC,MAAM,CAAC;IACrC,CAAC;IAED,iCAAiC;IACjC,IAAI,YAAY,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC;QACpD,MAAM,uBAAuB,CAAC,YAAY,CAAC,IAAI,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,GAAG,CAAC,EAAE,MAAM,CAAC,CAAC;IACnG,CAAC;AACH,CAAC;AAED,MAAM,OAAO,oBAAoB;IAEZ;IACA;IAFnB,
YACmB,UAA8B,EAC9B,eAAuB,IAAI;QAD3B,eAAU,GAAV,UAAU,CAAoB;QAC9B,iBAAY,GAAZ,YAAY,CAAe;IAC3C,CAAC;IAEJ,KAAK,CAAC,OAAO,CAAC,WAAwB;QACpC,MAAM,CAAC,KAAK,CAAC,qCAAqC,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC;QACrE,MAAM,CAAC,KAAK,CAAC,0CAA0C,WAAW,CAAC,OAAO,CAAC,MAAM,QAAQ,CAAC,CAAC;QAC3F,MAAM,CAAC,KAAK,CAAC,0CAA0C,WAAW,CAAC,aAAa,IAAI,SAAS,EAAE,CAAC,CAAC;QAEjG,IAAI,CAAC;YACH,iDAAiD;YACjD,IAAI,gBAAgB,CAAC;YAErB,sEAAsE;YACtE,MAAM,kBAAkB,GAAG,WAAW,CAAC,aAAa,IAAI,4BAA4B,CAAC,QAAQ,CAAC,WAAW,CAAC,aAAa,CAAC,CAAC;YAEzH,IAAI,kBAAkB,EAAE,CAAC;gBACvB,gEAAgE;gBAChE,MAAM,CAAC,KAAK,CAAC,gEAAgE,WAAW,CAAC,aAAa,EAAE,CAAC,CAAC;gBAC1G,gBAAgB,GAAG,MAAM,uBAAuB,CAAC,WAAW,CAAC,CAAC;YAChE,CAAC;iBAAM,IAAI,cAAc,CAAC,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC5C,oBAAoB;gBACpB,MAAM,CAAC,KAAK,CAAC,uDAAuD,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC;gBACxF,gBAAgB,GAAG,MAAM,sBAAsB,CAAC,WAAW,CAAC,CAAC;YAC/D,CAAC;iBAAM,CAAC;gBACN,2BAA2B;gBAC3B,MAAM,CAAC,KAAK,CAAC,mDAAmD,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC;gBACpF,gBAAgB,GAAG,MAAM,kBAAkB,CAAC,WAAW,CAAC,CAAC;YAC3D,CAAC;YAED,IAAI,CAAC,gBAAgB,EAAE,CAAC;gBACtB,MAAM,CAAC,KAAK,CAAC,+DAA+D,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC/F,MAAM,IAAI,KAAK,CAAC,kCAAkC,CAAC,CAAC;YACtD,CAAC;YAED,MAAM,CAAC,KAAK,CAAC,6DAA6D,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC;YAC7F,MAAM,CAAC,KAAK,CAAC,gCAAgC,gBAAgB,CAAC,OAAO,CAAC,UAAU,CAAC,MAAM,aAAa,CAAC,CAAC;YACtG,MAAM,CAAC,KAAK,CAAC,8CAA8C,gBAAgB,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,CAAC;YAE7F,MAAM,MAAM,GAAoB,EAAE,CAAC;YACnC,IAAI,WAAW,GAAG,CAAC,CAAC;YAEpB,MAAM,QAAQ,GAAG;gBACf,GAAG,EAAE,gBAAgB,CAAC,OAAO,CAAC,GAAG;gBACjC,KAAK,EAAE,gBAAgB,CAAC,OAAO,CAAC,KAAK;gBACrC,IAAI,EAAE,gBAAgB,CAAC,OAAO,CAAC,IAAI;aACpC,CAAC;YAEF,oCAAoC;YACpC,KAAK,MAAM,SAAS,IAAI,gBAAgB,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC;gBAC5D,MAAM,CAAC,KAAK,CAAC,gDAAgD,SAAS,CAAC,KAAK,EAAE,CAAC,CAAC;gBAChF,MAAM,CAAC,KAAK,CAAC,iDAAiD,SAAS,CAAC,IAAI,CAAC,MAAM,QAAQ,CAAC,CAAC;gBAE7F,MAAM,gBAAgB,GAAG,GAAG,SAAS,CAAC,KAAK,OAAO,SAAS,CAAC,IAAI,EAAE,CAAC;gBACnE,IAAI,KAAK,EAAE,MAAM,KAAK,IAAI,eAAe,CAAC,gBAAgB,EAAE,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,EAAE,CAA
C;oBAC1G,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;oBACnB,WAAW,EAAE,CAAC;gBAChB,CAAC;YACH,CAAC;YAED,MAAM,CAAC,KAAK,CAAC,kCAAkC,WAAW,SAAS,CAAC,CAAC;YAErE,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACxB,MAAM,CAAC,IAAI,CAAC,2DAA2D,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC1F,MAAM,CAAC,IAAI,CAAC,mDAAmD,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC7F,MAAM,CAAC,IAAI,CAAC,oDAAoD,gBAAgB,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC;gBACnG,MAAM,IAAI,KAAK,CAAC,8BAA8B,CAAC,CAAC;YAClD,CAAC;YAED,MAAM,CAAC,KAAK,CAAC,iDAAiD,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC;YACjF,OAAO;gBACL,QAAQ,EAAE;oBACR,GAAG,EAAE,WAAW,CAAC,GAAG;oBACpB,KAAK,EAAE,gBAAgB,CAAC,OAAO,CAAC,KAAK;oBACrC,WAAW,EAAE,IAAI,IAAI,EAAE;iBACxB;gBACD,MAAM;aACP,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,CAAC,KAAK,CAAC,2CAA2C,WAAW,CAAC,GAAG,GAAG,EAAE,KAAK,CAAC,CAAC;YACnF,MAAM,CAAC,KAAK,CAAC,uCAAuC,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;YACpG,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;CACF"}
@@ -0,0 +1 @@
1
+ export {};