@cosmocoder/mcp-web-docs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +368 -0
  3. package/build/__mocks__/embeddings.d.ts +17 -0
  4. package/build/__mocks__/embeddings.js +66 -0
  5. package/build/__mocks__/embeddings.js.map +1 -0
  6. package/build/config.d.ts +44 -0
  7. package/build/config.js +158 -0
  8. package/build/config.js.map +1 -0
  9. package/build/config.test.d.ts +1 -0
  10. package/build/config.test.js +165 -0
  11. package/build/config.test.js.map +1 -0
  12. package/build/crawler/auth.d.ts +128 -0
  13. package/build/crawler/auth.js +546 -0
  14. package/build/crawler/auth.js.map +1 -0
  15. package/build/crawler/auth.test.d.ts +1 -0
  16. package/build/crawler/auth.test.js +174 -0
  17. package/build/crawler/auth.test.js.map +1 -0
  18. package/build/crawler/base.d.ts +24 -0
  19. package/build/crawler/base.js +149 -0
  20. package/build/crawler/base.js.map +1 -0
  21. package/build/crawler/base.test.d.ts +1 -0
  22. package/build/crawler/base.test.js +234 -0
  23. package/build/crawler/base.test.js.map +1 -0
  24. package/build/crawler/browser-config.d.ts +2 -0
  25. package/build/crawler/browser-config.js +29 -0
  26. package/build/crawler/browser-config.js.map +1 -0
  27. package/build/crawler/browser-config.test.d.ts +1 -0
  28. package/build/crawler/browser-config.test.js +56 -0
  29. package/build/crawler/browser-config.test.js.map +1 -0
  30. package/build/crawler/cheerio.d.ts +11 -0
  31. package/build/crawler/cheerio.js +134 -0
  32. package/build/crawler/cheerio.js.map +1 -0
  33. package/build/crawler/chromium.d.ts +21 -0
  34. package/build/crawler/chromium.js +596 -0
  35. package/build/crawler/chromium.js.map +1 -0
  36. package/build/crawler/content-extractor-types.d.ts +25 -0
  37. package/build/crawler/content-extractor-types.js +2 -0
  38. package/build/crawler/content-extractor-types.js.map +1 -0
  39. package/build/crawler/content-extractors.d.ts +9 -0
  40. package/build/crawler/content-extractors.js +9 -0
  41. package/build/crawler/content-extractors.js.map +1 -0
  42. package/build/crawler/content-utils.d.ts +2 -0
  43. package/build/crawler/content-utils.js +22 -0
  44. package/build/crawler/content-utils.js.map +1 -0
  45. package/build/crawler/content-utils.test.d.ts +1 -0
  46. package/build/crawler/content-utils.test.js +99 -0
  47. package/build/crawler/content-utils.test.js.map +1 -0
  48. package/build/crawler/crawlee-crawler.d.ts +63 -0
  49. package/build/crawler/crawlee-crawler.js +342 -0
  50. package/build/crawler/crawlee-crawler.js.map +1 -0
  51. package/build/crawler/crawlee-crawler.test.d.ts +1 -0
  52. package/build/crawler/crawlee-crawler.test.js +280 -0
  53. package/build/crawler/crawlee-crawler.test.js.map +1 -0
  54. package/build/crawler/default-extractor.d.ts +4 -0
  55. package/build/crawler/default-extractor.js +26 -0
  56. package/build/crawler/default-extractor.js.map +1 -0
  57. package/build/crawler/default-extractor.test.d.ts +1 -0
  58. package/build/crawler/default-extractor.test.js +200 -0
  59. package/build/crawler/default-extractor.test.js.map +1 -0
  60. package/build/crawler/default.d.ts +11 -0
  61. package/build/crawler/default.js +138 -0
  62. package/build/crawler/default.js.map +1 -0
  63. package/build/crawler/docs-crawler.d.ts +26 -0
  64. package/build/crawler/docs-crawler.js +97 -0
  65. package/build/crawler/docs-crawler.js.map +1 -0
  66. package/build/crawler/docs-crawler.test.d.ts +1 -0
  67. package/build/crawler/docs-crawler.test.js +185 -0
  68. package/build/crawler/docs-crawler.test.js.map +1 -0
  69. package/build/crawler/factory.d.ts +6 -0
  70. package/build/crawler/factory.js +83 -0
  71. package/build/crawler/factory.js.map +1 -0
  72. package/build/crawler/github-pages-extractor.d.ts +4 -0
  73. package/build/crawler/github-pages-extractor.js +33 -0
  74. package/build/crawler/github-pages-extractor.js.map +1 -0
  75. package/build/crawler/github-pages-extractor.test.d.ts +1 -0
  76. package/build/crawler/github-pages-extractor.test.js +184 -0
  77. package/build/crawler/github-pages-extractor.test.js.map +1 -0
  78. package/build/crawler/github.d.ts +20 -0
  79. package/build/crawler/github.js +181 -0
  80. package/build/crawler/github.js.map +1 -0
  81. package/build/crawler/github.test.d.ts +1 -0
  82. package/build/crawler/github.test.js +326 -0
  83. package/build/crawler/github.test.js.map +1 -0
  84. package/build/crawler/puppeteer.d.ts +16 -0
  85. package/build/crawler/puppeteer.js +191 -0
  86. package/build/crawler/puppeteer.js.map +1 -0
  87. package/build/crawler/queue-manager.d.ts +43 -0
  88. package/build/crawler/queue-manager.js +169 -0
  89. package/build/crawler/queue-manager.js.map +1 -0
  90. package/build/crawler/queue-manager.test.d.ts +1 -0
  91. package/build/crawler/queue-manager.test.js +509 -0
  92. package/build/crawler/queue-manager.test.js.map +1 -0
  93. package/build/crawler/site-rules.d.ts +11 -0
  94. package/build/crawler/site-rules.js +104 -0
  95. package/build/crawler/site-rules.js.map +1 -0
  96. package/build/crawler/site-rules.test.d.ts +1 -0
  97. package/build/crawler/site-rules.test.js +139 -0
  98. package/build/crawler/site-rules.test.js.map +1 -0
  99. package/build/crawler/storybook-extractor.d.ts +34 -0
  100. package/build/crawler/storybook-extractor.js +767 -0
  101. package/build/crawler/storybook-extractor.js.map +1 -0
  102. package/build/crawler/storybook-extractor.test.d.ts +1 -0
  103. package/build/crawler/storybook-extractor.test.js +491 -0
  104. package/build/crawler/storybook-extractor.test.js.map +1 -0
  105. package/build/embeddings/fastembed.d.ts +25 -0
  106. package/build/embeddings/fastembed.js +188 -0
  107. package/build/embeddings/fastembed.js.map +1 -0
  108. package/build/embeddings/fastembed.test.d.ts +1 -0
  109. package/build/embeddings/fastembed.test.js +307 -0
  110. package/build/embeddings/fastembed.test.js.map +1 -0
  111. package/build/embeddings/openai.d.ts +8 -0
  112. package/build/embeddings/openai.js +56 -0
  113. package/build/embeddings/openai.js.map +1 -0
  114. package/build/embeddings/types.d.ts +4 -0
  115. package/build/embeddings/types.js +2 -0
  116. package/build/embeddings/types.js.map +1 -0
  117. package/build/index.d.ts +2 -0
  118. package/build/index.js +1007 -0
  119. package/build/index.js.map +1 -0
  120. package/build/index.test.d.ts +1 -0
  121. package/build/index.test.js +364 -0
  122. package/build/index.test.js.map +1 -0
  123. package/build/indexing/queue-manager.d.ts +36 -0
  124. package/build/indexing/queue-manager.js +86 -0
  125. package/build/indexing/queue-manager.js.map +1 -0
  126. package/build/indexing/queue-manager.test.d.ts +1 -0
  127. package/build/indexing/queue-manager.test.js +257 -0
  128. package/build/indexing/queue-manager.test.js.map +1 -0
  129. package/build/indexing/status.d.ts +39 -0
  130. package/build/indexing/status.js +207 -0
  131. package/build/indexing/status.js.map +1 -0
  132. package/build/indexing/status.test.d.ts +1 -0
  133. package/build/indexing/status.test.js +246 -0
  134. package/build/indexing/status.test.js.map +1 -0
  135. package/build/processor/content.d.ts +16 -0
  136. package/build/processor/content.js +286 -0
  137. package/build/processor/content.js.map +1 -0
  138. package/build/processor/content.test.d.ts +1 -0
  139. package/build/processor/content.test.js +369 -0
  140. package/build/processor/content.test.js.map +1 -0
  141. package/build/processor/markdown.d.ts +11 -0
  142. package/build/processor/markdown.js +256 -0
  143. package/build/processor/markdown.js.map +1 -0
  144. package/build/processor/markdown.test.d.ts +1 -0
  145. package/build/processor/markdown.test.js +312 -0
  146. package/build/processor/markdown.test.js.map +1 -0
  147. package/build/processor/metadata-parser.d.ts +37 -0
  148. package/build/processor/metadata-parser.js +245 -0
  149. package/build/processor/metadata-parser.js.map +1 -0
  150. package/build/processor/metadata-parser.test.d.ts +1 -0
  151. package/build/processor/metadata-parser.test.js +357 -0
  152. package/build/processor/metadata-parser.test.js.map +1 -0
  153. package/build/processor/processor.d.ts +8 -0
  154. package/build/processor/processor.js +190 -0
  155. package/build/processor/processor.js.map +1 -0
  156. package/build/processor/processor.test.d.ts +1 -0
  157. package/build/processor/processor.test.js +357 -0
  158. package/build/processor/processor.test.js.map +1 -0
  159. package/build/rag/cache.d.ts +10 -0
  160. package/build/rag/cache.js +10 -0
  161. package/build/rag/cache.js.map +1 -0
  162. package/build/rag/code-generator.d.ts +11 -0
  163. package/build/rag/code-generator.js +30 -0
  164. package/build/rag/code-generator.js.map +1 -0
  165. package/build/rag/context-assembler.d.ts +23 -0
  166. package/build/rag/context-assembler.js +113 -0
  167. package/build/rag/context-assembler.js.map +1 -0
  168. package/build/rag/docs-search.d.ts +55 -0
  169. package/build/rag/docs-search.js +380 -0
  170. package/build/rag/docs-search.js.map +1 -0
  171. package/build/rag/pipeline.d.ts +26 -0
  172. package/build/rag/pipeline.js +91 -0
  173. package/build/rag/pipeline.js.map +1 -0
  174. package/build/rag/query-processor.d.ts +14 -0
  175. package/build/rag/query-processor.js +57 -0
  176. package/build/rag/query-processor.js.map +1 -0
  177. package/build/rag/reranker.d.ts +55 -0
  178. package/build/rag/reranker.js +210 -0
  179. package/build/rag/reranker.js.map +1 -0
  180. package/build/rag/response-generator.d.ts +20 -0
  181. package/build/rag/response-generator.js +101 -0
  182. package/build/rag/response-generator.js.map +1 -0
  183. package/build/rag/retriever.d.ts +19 -0
  184. package/build/rag/retriever.js +111 -0
  185. package/build/rag/retriever.js.map +1 -0
  186. package/build/rag/validator.d.ts +22 -0
  187. package/build/rag/validator.js +128 -0
  188. package/build/rag/validator.js.map +1 -0
  189. package/build/rag/version-manager.d.ts +23 -0
  190. package/build/rag/version-manager.js +98 -0
  191. package/build/rag/version-manager.js.map +1 -0
  192. package/build/setupTests.d.ts +4 -0
  193. package/build/setupTests.js +50 -0
  194. package/build/setupTests.js.map +1 -0
  195. package/build/storage/storage.d.ts +38 -0
  196. package/build/storage/storage.js +700 -0
  197. package/build/storage/storage.js.map +1 -0
  198. package/build/storage/storage.test.d.ts +1 -0
  199. package/build/storage/storage.test.js +338 -0
  200. package/build/storage/storage.test.js.map +1 -0
  201. package/build/types/rag.d.ts +27 -0
  202. package/build/types/rag.js +2 -0
  203. package/build/types/rag.js.map +1 -0
  204. package/build/types.d.ts +120 -0
  205. package/build/types.js +2 -0
  206. package/build/types.js.map +1 -0
  207. package/build/util/content-utils.d.ts +31 -0
  208. package/build/util/content-utils.js +120 -0
  209. package/build/util/content-utils.js.map +1 -0
  210. package/build/util/content.d.ts +1 -0
  211. package/build/util/content.js +16 -0
  212. package/build/util/content.js.map +1 -0
  213. package/build/util/docs.d.ts +1 -0
  214. package/build/util/docs.js +26 -0
  215. package/build/util/docs.js.map +1 -0
  216. package/build/util/docs.test.d.ts +1 -0
  217. package/build/util/docs.test.js +49 -0
  218. package/build/util/docs.test.js.map +1 -0
  219. package/build/util/favicon.d.ts +6 -0
  220. package/build/util/favicon.js +88 -0
  221. package/build/util/favicon.js.map +1 -0
  222. package/build/util/favicon.test.d.ts +1 -0
  223. package/build/util/favicon.test.js +140 -0
  224. package/build/util/favicon.test.js.map +1 -0
  225. package/build/util/logger.d.ts +17 -0
  226. package/build/util/logger.js +72 -0
  227. package/build/util/logger.js.map +1 -0
  228. package/build/util/logger.test.d.ts +1 -0
  229. package/build/util/logger.test.js +46 -0
  230. package/build/util/logger.test.js.map +1 -0
  231. package/build/util/security.d.ts +312 -0
  232. package/build/util/security.js +719 -0
  233. package/build/util/security.js.map +1 -0
  234. package/build/util/security.test.d.ts +1 -0
  235. package/build/util/security.test.js +524 -0
  236. package/build/util/security.test.js.map +1 -0
  237. package/build/util/site-detector.d.ts +22 -0
  238. package/build/util/site-detector.js +42 -0
  239. package/build/util/site-detector.js.map +1 -0
  240. package/package.json +112 -0
@@ -0,0 +1,369 @@
1
+ import { processHtmlContent } from './content.js';
2
+ describe('HTML Content Processor', () => {
3
+ describe('processHtmlContent', () => {
4
+ it('should process simple HTML content', async () => {
5
+ const page = {
6
+ url: 'https://example.com/docs/page',
7
+ path: '/docs/page',
8
+ title: 'Documentation Page',
9
+ content: `
10
+ <!DOCTYPE html>
11
+ <html>
12
+ <head><title>Page Title</title></head>
13
+ <body>
14
+ <main>
15
+ <h1>Welcome to the Documentation</h1>
16
+ <p>This is the introduction paragraph.</p>
17
+ <h2>Getting Started</h2>
18
+ <p>Here's how to get started.</p>
19
+ </main>
20
+ </body>
21
+ </html>
22
+ `,
23
+ };
24
+ const result = await processHtmlContent(page);
25
+ expect(result).toBeDefined();
26
+ expect(result?.article.url).toBe(page.url);
27
+ expect(result?.article.components.length).toBeGreaterThan(0);
28
+ });
29
+ it('should extract content from article tags', async () => {
30
+ const page = {
31
+ url: 'https://example.com/blog/post',
32
+ path: '/blog/post',
33
+ title: 'Blog Post',
34
+ content: `
35
+ <html>
36
+ <body>
37
+ <nav>Navigation items</nav>
38
+ <article>
39
+ <h1>Article Title</h1>
40
+ <p>Article content goes here.</p>
41
+ <h2>Section One</h2>
42
+ <p>More content.</p>
43
+ </article>
44
+ <footer>Footer content</footer>
45
+ </body>
46
+ </html>
47
+ `,
48
+ };
49
+ const result = await processHtmlContent(page);
50
+ expect(result).toBeDefined();
51
+ expect(result?.content).toContain('Article content');
52
+ // Should not include nav/footer content
53
+ expect(result?.content).not.toContain('Navigation items');
54
+ });
55
+ it('should handle documentation-specific selectors', async () => {
56
+ const page = {
57
+ url: 'https://example.com/docs',
58
+ path: '/docs',
59
+ title: 'Docs',
60
+ content: `
61
+ <html>
62
+ <body>
63
+ <div class="sidebar">Sidebar</div>
64
+ <div class="markdown-body">
65
+ <h1>Documentation</h1>
66
+ <p>Main documentation content.</p>
67
+ </div>
68
+ </body>
69
+ </html>
70
+ `,
71
+ };
72
+ const result = await processHtmlContent(page);
73
+ expect(result).toBeDefined();
74
+ expect(result?.content).toContain('Main documentation content');
75
+ });
76
+ it('should preserve code blocks', async () => {
77
+ const page = {
78
+ url: 'https://example.com/code',
79
+ path: '/code',
80
+ title: 'Code',
81
+ content: `
82
+ <html>
83
+ <body>
84
+ <main>
85
+ <h1>Code Examples</h1>
86
+ <pre><code>function example() {
87
+ return "Hello";
88
+ }</code></pre>
89
+ </main>
90
+ </body>
91
+ </html>
92
+ `,
93
+ };
94
+ const result = await processHtmlContent(page);
95
+ expect(result).toBeDefined();
96
+ expect(result?.content).toContain('function example');
97
+ });
98
+ it('should handle lists properly', async () => {
99
+ const page = {
100
+ url: 'https://example.com/list',
101
+ path: '/list',
102
+ title: 'List',
103
+ content: `
104
+ <html>
105
+ <body>
106
+ <main>
107
+ <h1>Features</h1>
108
+ <ul>
109
+ <li>Feature one</li>
110
+ <li>Feature two</li>
111
+ <li>Feature three</li>
112
+ </ul>
113
+ </main>
114
+ </body>
115
+ </html>
116
+ `,
117
+ };
118
+ const result = await processHtmlContent(page);
119
+ expect(result).toBeDefined();
120
+ expect(result?.content).toContain('Feature one');
121
+ expect(result?.content).toContain('Feature two');
122
+ });
123
+ it('should handle tables', async () => {
124
+ const page = {
125
+ url: 'https://example.com/table',
126
+ path: '/table',
127
+ title: 'Table',
128
+ content: `
129
+ <html>
130
+ <body>
131
+ <main>
132
+ <h1>API Reference</h1>
133
+ <table>
134
+ <tr><th>Method</th><th>Description</th></tr>
135
+ <tr><td>GET</td><td>Retrieve data</td></tr>
136
+ <tr><td>POST</td><td>Create data</td></tr>
137
+ </table>
138
+ </main>
139
+ </body>
140
+ </html>
141
+ `,
142
+ };
143
+ const result = await processHtmlContent(page);
144
+ expect(result).toBeDefined();
145
+ expect(result?.content).toContain('GET');
146
+ expect(result?.content).toContain('POST');
147
+ });
148
+ it('should skip script and style tags', async () => {
149
+ const page = {
150
+ url: 'https://example.com/scripts',
151
+ path: '/scripts',
152
+ title: 'Scripts',
153
+ content: `
154
+ <html>
155
+ <head>
156
+ <style>.class { color: red; }</style>
157
+ </head>
158
+ <body>
159
+ <script>alert('hello');</script>
160
+ <main>
161
+ <h1>Content</h1>
162
+ <p>Actual content here.</p>
163
+ </main>
164
+ <script>console.log('end');</script>
165
+ </body>
166
+ </html>
167
+ `,
168
+ };
169
+ const result = await processHtmlContent(page);
170
+ expect(result).toBeDefined();
171
+ expect(result?.content).not.toContain('alert');
172
+ expect(result?.content).not.toContain('color: red');
173
+ expect(result?.content).toContain('Actual content');
174
+ });
175
+ it('should use Readability as fallback', async () => {
176
+ const page = {
177
+ url: 'https://example.com/blog',
178
+ path: '/blog',
179
+ title: 'Blog',
180
+ content: `
181
+ <html>
182
+ <head><title>Blog Post</title></head>
183
+ <body>
184
+ <div>
185
+ <p>This is a paragraph of content.</p>
186
+ <p>Another paragraph with more information.</p>
187
+ <p>Yet another paragraph to provide enough content for Readability.</p>
188
+ <p>Additional paragraph for proper content extraction.</p>
189
+ <p>Final paragraph of meaningful content.</p>
190
+ </div>
191
+ </body>
192
+ </html>
193
+ `,
194
+ };
195
+ const result = await processHtmlContent(page);
196
+ // Should still extract something even without clear main content
197
+ expect(result).toBeDefined();
198
+ });
199
+ it('should return undefined for pages without extractable content', async () => {
200
+ const page = {
201
+ url: 'https://example.com/empty',
202
+ path: '/empty',
203
+ title: 'Empty',
204
+ content: `
205
+ <html>
206
+ <body>
207
+ <nav>Just navigation</nav>
208
+ </body>
209
+ </html>
210
+ `,
211
+ };
212
+ const result = await processHtmlContent(page);
213
+ // May return undefined or minimal content depending on parser
214
+ if (result) {
215
+ expect(result.content.length).toBeLessThan(100);
216
+ }
217
+ });
218
+ it('should handle Storybook-specific classes', async () => {
219
+ const page = {
220
+ url: 'https://storybook.example.com/docs',
221
+ path: '/docs',
222
+ title: 'Storybook',
223
+ content: `
224
+ <html>
225
+ <body>
226
+ <div class="sbdocs-wrapper">
227
+ <div class="sbdocs-content">
228
+ <h1 class="sbdocs-h1">Component Documentation</h1>
229
+ <div class="docblock-description">
230
+ <p>Description of the component.</p>
231
+ </div>
232
+ </div>
233
+ </div>
234
+ </body>
235
+ </html>
236
+ `,
237
+ };
238
+ const result = await processHtmlContent(page);
239
+ expect(result).toBeDefined();
240
+ expect(result?.content).toContain('Component Documentation');
241
+ });
242
+ it('should handle React app root containers', async () => {
243
+ const page = {
244
+ url: 'https://example.com/react-app',
245
+ path: '/react-app',
246
+ title: 'React App',
247
+ content: `
248
+ <html>
249
+ <body>
250
+ <div id="root">
251
+ <div>
252
+ <h1>React App Content</h1>
253
+ <p>This is rendered by React.</p>
254
+ </div>
255
+ </div>
256
+ </body>
257
+ </html>
258
+ `,
259
+ };
260
+ const result = await processHtmlContent(page);
261
+ expect(result).toBeDefined();
262
+ expect(result?.content).toContain('React App Content');
263
+ });
264
+ it('should handle Next.js containers', async () => {
265
+ const page = {
266
+ url: 'https://example.com/nextjs',
267
+ path: '/nextjs',
268
+ title: 'Next.js',
269
+ content: `
270
+ <html>
271
+ <body>
272
+ <div id="__next">
273
+ <main>
274
+ <h1>Next.js Page</h1>
275
+ <p>Content from Next.js.</p>
276
+ </main>
277
+ </div>
278
+ </body>
279
+ </html>
280
+ `,
281
+ };
282
+ const result = await processHtmlContent(page);
283
+ expect(result).toBeDefined();
284
+ expect(result?.content).toContain('Next.js Page');
285
+ });
286
+ it('should handle multiple heading levels for section extraction', async () => {
287
+ const page = {
288
+ url: 'https://example.com/headings',
289
+ path: '/headings',
290
+ title: 'Headings',
291
+ content: `
292
+ <html>
293
+ <body>
294
+ <main>
295
+ <h1>Main Title</h1>
296
+ <p>Intro text.</p>
297
+ <h2>Section 1</h2>
298
+ <p>Section 1 content.</p>
299
+ <h3>Subsection 1.1</h3>
300
+ <p>Subsection content.</p>
301
+ <h2>Section 2</h2>
302
+ <p>Section 2 content.</p>
303
+ <h4>Deep Section</h4>
304
+ <p>Deep content.</p>
305
+ </main>
306
+ </body>
307
+ </html>
308
+ `,
309
+ };
310
+ const result = await processHtmlContent(page);
311
+ expect(result).toBeDefined();
312
+ expect(result?.article.components.length).toBeGreaterThan(3);
313
+ });
314
+ it('should handle malformed HTML gracefully', async () => {
315
+ const page = {
316
+ url: 'https://example.com/malformed',
317
+ path: '/malformed',
318
+ title: 'Malformed',
319
+ content: `
320
+ <html>
321
+ <body>
322
+ <div>
323
+ <p>Unclosed paragraph
324
+ <p>Another unclosed
325
+ <span>Nested <b>incorrectly</span></b>
326
+ <main>
327
+ <h1>Still Works</h1>
328
+ <p>Content here.</p>
329
+ </main>
330
+ </div>
331
+ </body>
332
+ </html>
333
+ `,
334
+ };
335
+ // Should not throw
336
+ const result = await processHtmlContent(page);
337
+ expect(result).toBeDefined();
338
+ });
339
+ it('should clean text and remove extra whitespace', async () => {
340
+ const page = {
341
+ url: 'https://example.com/whitespace',
342
+ path: '/whitespace',
343
+ title: 'Whitespace',
344
+ content: `
345
+ <html>
346
+ <body>
347
+ <main>
348
+ <h1>Title</h1>
349
+ <p>Text with extra spaces.</p>
350
+ <p>
351
+
352
+
353
+ Multiple newlines here.
354
+
355
+
356
+ </p>
357
+ </main>
358
+ </body>
359
+ </html>
360
+ `,
361
+ };
362
+ const result = await processHtmlContent(page);
363
+ expect(result).toBeDefined();
364
+ // Should normalize whitespace
365
+ expect(result?.content).not.toMatch(/\s{3,}/);
366
+ });
367
+ });
368
+ });
369
+ //# sourceMappingURL=content.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content.test.js","sourceRoot":"","sources":["../../src/processor/content.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;AAGlD,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;IACtC,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,+BAA+B;gBACpC,IAAI,EAAE,YAAY;gBAClB,KAAK,EAAE,oBAAoB;gBAC3B,OAAO,EAAE;;;;;;;;;;;;;SAaR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAC3C,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,+BAA+B;gBACpC,IAAI,EAAE,YAAY;gBAClB,KAAK,EAAE,WAAW;gBAClB,OAAO,EAAE;;;;;;;;;;;;;SAaR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,iBAAiB,CAAC,CAAC;YACrD,wCAAwC;YACxC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC;QAC5D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,0BAA0B;gBAC/B,IAAI,EAAE,OAAO;gBACb,KAAK,EAAE,MAAM;gBACb,OAAO,EAAE;;;;;;;;;;SAUR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,4BAA4B,CAAC,CAAC;QAClE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6BAA6B,EAAE,KAAK,IAAI,EAAE;YAC3C,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,0BAA0B;gBAC/B,IAAI,EAAE,OAAO;gBACb,KAAK,EAAE,MAAM;gBACb,OAAO,EAAE;;;;;;;;;;;SAWR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,kBAAkB,CAAC,CAAC;QACxD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8BAA8B,EAAE,KAAK,IAAI,EAAE;YAC5C,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,0BAA0B;gBAC/B,IAAI,EAAE,OAAO;gB
ACb,KAAK,EAAE,MAAM;gBACb,OAAO,EAAE;;;;;;;;;;;;;SAaR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;YACjD,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,aAAa,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,sBAAsB,EAAE,KAAK,IAAI,EAAE;YACpC,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,2BAA2B;gBAChC,IAAI,EAAE,QAAQ;gBACd,KAAK,EAAE,OAAO;gBACd,OAAO,EAAE;;;;;;;;;;;;;SAaR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;YACzC,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,mCAAmC,EAAE,KAAK,IAAI,EAAE;YACjD,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,6BAA6B;gBAClC,IAAI,EAAE,UAAU;gBAChB,KAAK,EAAE,SAAS;gBAChB,OAAO,EAAE;;;;;;;;;;;;;;SAcR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;YAC/C,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACpD,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,gBAAgB,CAAC,CAAC;QACtD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,0BAA0B;gBAC/B,IAAI,EAAE,OAAO;gBACb,KAAK,EAAE,MAAM;gBACb,OAAO,EAAE;;;;;;;;;;;;;SAaR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,iEAAiE;YACjE,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+DAA+D,EAAE,KAAK,IAAI,EAAE;YAC7E,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,2BAA2B;gBAChC,IAAI,EAAE,QAAQ;gBACd,KAAK,EAAE,OAAO;gBACd,OAAO,EAAE;;;;;;SAMR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAC9C,8DAA8D;YAC9D,IAAI,MAAM,EAAE,CAAC;gBACX,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YAClD,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;YACxD,MAAM,IAAI,GAAgB;gBACxB,GAAG
,EAAE,oCAAoC;gBACzC,IAAI,EAAE,OAAO;gBACb,KAAK,EAAE,WAAW;gBAClB,OAAO,EAAE;;;;;;;;;;;;;SAaR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,yBAAyB,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,+BAA+B;gBACpC,IAAI,EAAE,YAAY;gBAClB,KAAK,EAAE,WAAW;gBAClB,OAAO,EAAE;;;;;;;;;;;SAWR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,mBAAmB,CAAC,CAAC;QACzD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kCAAkC,EAAE,KAAK,IAAI,EAAE;YAChD,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,4BAA4B;gBACjC,IAAI,EAAE,SAAS;gBACf,KAAK,EAAE,SAAS;gBAChB,OAAO,EAAE;;;;;;;;;;;SAWR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;QACpD,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;YAC5E,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,8BAA8B;gBACnC,IAAI,EAAE,WAAW;gBACjB,KAAK,EAAE,UAAU;gBACjB,OAAO,EAAE;;;;;;;;;;;;;;;;;SAiBR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAC/D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yCAAyC,EAAE,KAAK,IAAI,EAAE;YACvD,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,+BAA+B;gBACpC,IAAI,EAAE,YAAY;gBAClB,KAAK,EAAE,WAAW;gBAClB,OAAO,EAAE;;;;;;;;;;;;;;SAcR;aACF,CAAC;YAEF,mBAAmB;YACnB,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE;YAC7D,MAAM,IAAI,GAAgB;gBACxB,GAAG,EAAE,gCAAgC;gBACrC,IAAI,EAAE,aAAa;gBACnB,KAAK,EAAE,YAAY;gBACnB,OAAO,EAAE;;;;;;;;;;;;;;;;SAgBR;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,kBAAkB,CAAC,IAAI,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;YAC7B,8BAA8B;YAC9B,MA
AM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,11 @@
1
+ import { CrawlResult } from '../types.js';
2
+ import { ProcessedContent } from './content.js';
3
+ export declare function processMarkdownContent(page: CrawlResult): Promise<ProcessedContent | undefined>;
4
+ /**
5
+ * Process content that was already extracted and formatted by a custom extractor
6
+ * (e.g., StorybookExtractor, GithubPagesExtractor).
7
+ *
8
+ * These extractors output markdown-formatted content, so we don't need to
9
+ * parse HTML - we just need to structure the content into sections.
10
+ */
11
+ export declare function processExtractedContent(page: CrawlResult): Promise<ProcessedContent | undefined>;
@@ -0,0 +1,256 @@
1
+ import { logger } from '../util/logger.js';
2
/**
 * Normalize whitespace in prose while keeping its line/paragraph structure.
 *
 * Previously every whitespace run (newlines included) was collapsed to a
 * single space first, which made the subsequent blank-line normalization a
 * no-op and flattened the whole text onto one line.
 *
 * @param {string} text - Raw text to clean.
 * @returns {string} Text with horizontal whitespace runs collapsed to one
 *   space, spaces around line breaks removed, runs of blank lines reduced to
 *   a single blank line, and leading/trailing whitespace trimmed.
 */
function cleanText(text) {
    return text
        // Collapse horizontal whitespace only (tabs, spaces, CR, ...); keep "\n".
        .replace(/[^\S\n]+/g, ' ')
        // Drop the single space that may now sit on either side of a line break.
        .replace(/ ?\n ?/g, '\n')
        // Any run of blank lines becomes exactly one paragraph break.
        .replace(/\n{3,}/g, '\n\n')
        .trim();
}
8
/**
 * Split a leading `---`-delimited front matter block off a document.
 *
 * Each front matter line is read as `key: value`; only the first colon
 * separates key from value, and one leading and/or trailing quote character
 * is removed from the value. Lines without a colon, or with nothing before
 * the colon, are ignored.
 *
 * @param {string} content - Full document text.
 * @returns {{frontMatter: Object, content: string, endLine: number}} The
 *   parsed key/value pairs, the text following the front matter, and the
 *   number of line breaks the front matter block spanned (0 when the document
 *   has no front matter).
 */
function extractFrontMatter(content) {
    const found = /^---\s*\n([\s\S]*?)\n---\s*\n/.exec(content);
    if (found === null) {
        return { frontMatter: {}, content, endLine: 0 };
    }
    try {
        const [wholeBlock, rawPairs] = found;
        const frontMatter = {};
        // Parse YAML-like "key: value" lines
        for (const entry of rawPairs.split('\n')) {
            const colonAt = entry.indexOf(':');
            // No colon at all, or nothing in front of it: not a key/value line.
            if (colonAt <= 0) {
                continue;
            }
            const rawValue = entry.slice(colonAt + 1).trim();
            // Strip a surrounding quote character if present
            frontMatter[entry.slice(0, colonAt).trim()] = rawValue.replace(/^["']|["']$/g, '');
        }
        const lineBreaks = wholeBlock.match(/\n/g) || [];
        return {
            frontMatter,
            content: content.slice(wholeBlock.length),
            endLine: lineBreaks.length,
        };
    }
    catch (e) {
        logger.debug('[MarkdownProcessor] Error parsing front matter:', e);
        return { frontMatter: {}, content, endLine: 0 };
    }
}
37
+ /**
38
+ * Detect if a line looks like a section header.
39
+ * Handles:
40
+ * - Markdown headers: # Title, ## Title, etc.
41
+ * - Docusaurus-style headers: Title (with zero-width space or other unicode)
42
+ * - Plain text headers: Short lines that end with special characters
43
+ */
44
+ function isLikelyHeader(line, prevLine, nextLine) {
45
+ // Standard markdown header
46
+ const headerMatch = line.match(/^(#{1,6})\s+(.+)$/);
47
+ if (headerMatch) {
48
+ return { isHeader: true, level: headerMatch[1].length, title: headerMatch[2].trim() };
49
+ }
50
+ // Clean the line of zero-width spaces and other unicode markers
51
+ const cleanLine = line.replace(/[\u200B-\u200D\uFEFF\u2060]/g, '').trim();
52
+ // Skip empty lines or very long lines (unlikely to be headers)
53
+ if (!cleanLine || cleanLine.length > 80) {
54
+ return { isHeader: false, level: 0, title: '' };
55
+ }
56
+ // Docusaurus-style header: ends with unicode marker (\\u200B) and is relatively short
57
+ // These are typically section titles like "Hooks", "Example", "Important"
58
+ if (line.includes('\u200B') || line.includes('\u200D') || line.includes('\u2060')) {
59
+ // Check if this looks like a title (short, possibly with capitalization)
60
+ if (cleanLine.length < 50 && cleanLine.length > 0) {
61
+ return { isHeader: true, level: 2, title: cleanLine };
62
+ }
63
+ }
64
+ // Plain text header detection:
65
+ // - Short line (< 50 chars)
66
+ // - Previous line is empty or doesn't exist
67
+ // - Next line is not empty (has content following)
68
+ // - Line contains mostly letters/spaces (not code)
69
+ if (cleanLine.length < 50 &&
70
+ cleanLine.length > 2 &&
71
+ (!prevLine || prevLine.trim() === '') &&
72
+ nextLine &&
73
+ nextLine.trim() !== '' &&
74
+ /^[A-Z][A-Za-z0-9\s\-_()]+$/.test(cleanLine)) {
75
+ return { isHeader: true, level: 2, title: cleanLine };
76
+ }
77
+ return { isHeader: false, level: 0, title: '' };
78
+ }
79
/**
 * Split markdown text into header-delimited sections.
 *
 * Every line that `isLikelyHeader` accepts starts a new section; the lines
 * that follow become that section's content. Text appearing before the first
 * header is collected into a placeholder section titled "Content" (level 1).
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headers,
 * so a shell comment such as `# install deps` inside a fence stays part of
 * the surrounding section instead of splitting it. An unclosed fence runs to
 * the end of the text.
 *
 * @param {string} content - Markdown text to split.
 * @param {number} [startLine=0] - Offset added to every reported line number.
 * @returns {Array<{level: number, title: string, content: string, startLine: number, endLine: number}>}
 *   Sections in document order.
 */
function parseMarkdownSections(content, startLine = 0) {
    const lines = content.split('\n');
    const sections = [];
    let currentSection = null;
    // Marker (e.g. "```" or "~~~~") of the fence we are inside, or null.
    let openFence = null;
    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const prevLine = i > 0 ? lines[i - 1] : '';
        const nextLine = i < lines.length - 1 ? lines[i + 1] : '';
        // Track fenced code blocks so their contents are not parsed as headers.
        let inCode = openFence !== null;
        const fence = /^\s*(`{3,}|~{3,})(.*)$/.exec(line);
        if (fence) {
            const marker = fence[1];
            const rest = fence[2];
            if (openFence === null) {
                // A backtick fence may not carry backticks in its info string;
                // otherwise the line is inline code such as ```x```.
                if (marker[0] === '~' || !rest.includes('`')) {
                    openFence = marker;
                    inCode = true;
                }
            }
            else if (marker[0] === openFence[0] && marker.length >= openFence.length && rest.trim() === '') {
                // Closing fence: same character, at least as long, nothing after it.
                openFence = null;
            }
        }
        const headerInfo = inCode
            ? { isHeader: false, level: 0, title: '' }
            : isLikelyHeader(line, prevLine, nextLine);
        if (headerInfo.isHeader) {
            // Save previous section if exists
            if (currentSection) {
                currentSection.endLine = startLine + i - 1;
                sections.push(currentSection);
            }
            // Start new section
            currentSection = {
                level: headerInfo.level,
                title: headerInfo.title,
                content: '',
                startLine: startLine + i,
                endLine: startLine + i,
            };
        }
        else if (currentSection) {
            // Add line to current section
            if (currentSection.content.length > 0) {
                currentSection.content += '\n';
            }
            currentSection.content += line;
        }
        else {
            // Text before the first header goes into a placeholder "Content"
            // section. No section can have been pushed yet at this point.
            currentSection = {
                level: 1,
                title: 'Content',
                content: line,
                startLine,
                endLine: startLine,
            };
        }
    }
    // Add last section
    if (currentSection) {
        currentSection.endLine = startLine + lines.length - 1;
        sections.push(currentSection);
    }
    return sections;
}
130
/**
 * Clean the prose of a section while leaving fenced code blocks untouched.
 *
 * Code blocks are swapped for placeholders, the remaining text is passed
 * through `cleanText`, and the blocks are then restored verbatim.
 *
 * The placeholder is delimited by NUL characters so that ordinary document
 * text which happens to contain "CODE_BLOCK_<n>" is not mistaken for one
 * (the bare token previously got replaced by a code block or by the string
 * "undefined"). A placeholder-shaped token with no matching block is left
 * as it was.
 *
 * @param {string} content - Section text, possibly containing ``` fences.
 * @returns {string} Cleaned text with the original code blocks restored.
 */
function processCodeBlocks(content) {
    const codeBlocks = [];
    // Preserve code blocks by replacing them with placeholders
    const withPlaceholders = content.replace(/```[\s\S]*?```/g, (block) => {
        codeBlocks.push(block);
        return `\u0000CODE_BLOCK_${codeBlocks.length - 1}\u0000`;
    });
    // Clean the text
    const cleaned = cleanText(withPlaceholders);
    // Restore code blocks
    return cleaned.replace(/\u0000CODE_BLOCK_(\d+)\u0000/g, (placeholder, index) => {
        const block = codeBlocks[Number.parseInt(index, 10)];
        return block === undefined ? placeholder : block;
    });
}
143
/**
 * Turn a crawled markdown page into an article made of titled components.
 *
 * Front matter is stripped first, the rest is split into sections, and each
 * section body is cleaned with its code blocks preserved. Sections whose
 * cleaned body is empty are dropped.
 *
 * @param page - Crawl result; `url`, `path`, `title` and `content` are read.
 * @returns The article plus its flattened text, or `undefined` when no
 *   section has content or when processing throws.
 */
export async function processMarkdownContent(page) {
    try {
        logger.debug(`[MarkdownProcessor] Processing content for ${page.url}`);
        // Extract front matter, then split what remains into sections
        const parsed = extractFrontMatter(page.content);
        const validComponents = [];
        for (const section of parseMarkdownSections(parsed.content, parsed.endLine)) {
            const body = processCodeBlocks(section.content);
            // Keep only sections that still have text after cleaning
            if (body.length > 0) {
                validComponents.push({ title: section.title, body });
            }
        }
        if (validComponents.length === 0) {
            logger.debug(`[MarkdownProcessor] No valid content sections found in ${page.url}`);
            return undefined;
        }
        // Title precedence: front matter, then crawler-supplied, then first section
        const articleTitle = parsed.frontMatter.title || page.title || validComponents[0].title;
        const flattened = validComponents
            .map(({ title, body }) => `${title}\n\n${body}`)
            .join('\n\n');
        return {
            article: {
                url: page.url,
                path: page.path,
                title: articleTitle,
                components: validComponents,
            },
            content: flattened.trim(),
        };
    }
    catch (error) {
        logger.debug('[MarkdownProcessor] Error processing markdown content:', error);
        logger.debug('[MarkdownProcessor] Error details:', error instanceof Error ? error.stack : error);
        return undefined;
    }
}
181
+ /**
182
+ * Process content that was already extracted and formatted by a custom extractor
183
+ * (e.g., StorybookExtractor, GithubPagesExtractor).
184
+ *
185
+ * These extractors output markdown-formatted content, so we don't need to
186
+ * parse HTML - we just need to structure the content into sections.
187
+ */
188
+ export async function processExtractedContent(page) {
189
+ try {
190
+ logger.debug(`[ExtractedContentProcessor] Processing pre-extracted content for ${page.url}`);
191
+ logger.debug(`[ExtractedContentProcessor] Content length: ${page.content.length} bytes`);
192
+ const content = page.content;
193
+ if (!content || content.trim().length === 0) {
194
+ logger.debug(`[ExtractedContentProcessor] No content found in ${page.url}`);
195
+ return undefined;
196
+ }
197
+ // Parse markdown sections - the content is already in markdown format
198
+ const sections = parseMarkdownSections(content, 0);
199
+ logger.debug(`[ExtractedContentProcessor] Found ${sections.length} sections`);
200
+ // Convert sections to components, preserving the markdown content as-is
201
+ const components = sections.map((section) => ({
202
+ title: section.title,
203
+ // Don't over-process - just trim and normalize whitespace
204
+ body: section.content.trim(),
205
+ }));
206
+ // Filter out empty components but keep sections with minimal content
207
+ // (some sections like "## Props" header might have content in the next section)
208
+ const validComponents = components.filter((comp) => comp.body.length > 0 || comp.title.length > 0);
209
+ if (validComponents.length === 0) {
210
+ // If no sections found, treat entire content as one component
211
+ logger.debug(`[ExtractedContentProcessor] No sections found, using entire content`);
212
+ const article = {
213
+ url: page.url,
214
+ path: page.path,
215
+ title: page.title || 'Content',
216
+ components: [
217
+ {
218
+ title: page.title || 'Content',
219
+ body: content.trim(),
220
+ },
221
+ ],
222
+ };
223
+ return {
224
+ article,
225
+ content: content.trim(),
226
+ };
227
+ }
228
+ // Extract title from first H1 if present, otherwise use page title
229
+ let title = page.title;
230
+ const firstH1Section = sections.find((s) => s.level === 1);
231
+ if (firstH1Section) {
232
+ title = firstH1Section.title;
233
+ }
234
+ const article = {
235
+ url: page.url,
236
+ path: page.path,
237
+ title: title || validComponents[0].title,
238
+ components: validComponents,
239
+ };
240
+ logger.debug(`[ExtractedContentProcessor] Created article with ${validComponents.length} components`);
241
+ logger.debug(`[ExtractedContentProcessor] Total content length: ${content.length} bytes`);
242
+ return {
243
+ article,
244
+ content: validComponents
245
+ .map((comp) => (comp.title ? `${comp.title}\n\n${comp.body}` : comp.body))
246
+ .join('\n\n')
247
+ .trim(),
248
+ };
249
+ }
250
+ catch (error) {
251
+ logger.debug('[ExtractedContentProcessor] Error processing extracted content:', error);
252
+ logger.debug('[ExtractedContentProcessor] Error details:', error instanceof Error ? error.stack : error);
253
+ return undefined;
254
+ }
255
+ }
256
+ //# sourceMappingURL=markdown.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"markdown.js","sourceRoot":"","sources":["../../src/processor/markdown.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAE3C,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI;SACR,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,OAAO,CAAC,UAAU,EAAE,MAAM,CAAC;SAC3B,IAAI,EAAE,CAAC;AACZ,CAAC;AAUD,SAAS,kBAAkB,CAAC,OAAe;IAKzC,MAAM,gBAAgB,GAAG,+BAA+B,CAAC;IACzD,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;IAE9C,IAAI,CAAC,KAAK,EAAE,CAAC;QACX,OAAO,EAAE,WAAW,EAAE,EAAE,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;IAClD,CAAC;IAED,IAAI,CAAC;QACH,MAAM,cAAc,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QAChC,MAAM,WAAW,GAA4B,EAAE,CAAC;QAEhD,+BAA+B;QAC/B,cAAc,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YAC1C,MAAM,CAAC,GAAG,EAAE,GAAG,UAAU,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;YAC7C,IAAI,GAAG,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACjC,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC1C,2BAA2B;gBAC3B,WAAW,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,cAAc,EAAE,EAAE,CAAC,CAAC;YAC9D,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,WAAW;YACX,OAAO,EAAE,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YACvC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC;SACzC,CAAC;IACJ,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,CAAC,KAAK,CAAC,iDAAiD,EAAE,CAAC,CAAC,CAAC;QACnE,OAAO,EAAE,WAAW,EAAE,EAAE,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;IAClD,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,SAAS,cAAc,CAAC,IAAY,EAAE,QAAgB,EAAE,QAAgB;IACtE,2BAA2B;IAC3B,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;IACpD,IAAI,WAAW,EAAE,CAAC;QAChB,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,WAAW,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC;IACxF,CAAC;IAED,gEAAgE;IAChE,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,8BAA8B,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAE1E,+DAA+D;IAC/D,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,GAAG,EAAE,EAAE,CAAC;QACxC,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;IAClD,CAAC;IAED,sFAAsF
;IACtF,0EAA0E;IAC1E,IAAI,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;QAClF,yEAAyE;QACzE,IAAI,SAAS,CAAC,MAAM,GAAG,EAAE,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClD,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;QACxD,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,4BAA4B;IAC5B,4CAA4C;IAC5C,mDAAmD;IACnD,mDAAmD;IACnD,IACE,SAAS,CAAC,MAAM,GAAG,EAAE;QACrB,SAAS,CAAC,MAAM,GAAG,CAAC;QACpB,CAAC,CAAC,QAAQ,IAAI,QAAQ,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC;QACrC,QAAQ;QACR,QAAQ,CAAC,IAAI,EAAE,KAAK,EAAE;QACtB,4BAA4B,CAAC,IAAI,CAAC,SAAS,CAAC,EAC5C,CAAC;QACD,OAAO,EAAE,QAAQ,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC;IACxD,CAAC;IAED,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;AAClD,CAAC;AAED,SAAS,qBAAqB,CAAC,OAAe,EAAE,YAAoB,CAAC;IACnE,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAsB,EAAE,CAAC;IACvC,IAAI,cAAc,GAA2B,IAAI,CAAC;IAElD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;QACtB,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3C,MAAM,QAAQ,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE1D,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,CAAC;QAE5D,IAAI,UAAU,CAAC,QAAQ,EAAE,CAAC;YACxB,kCAAkC;YAClC,IAAI,cAAc,EAAE,CAAC;gBACnB,cAAc,CAAC,OAAO,GAAG,SAAS,GAAG,CAAC,GAAG,CAAC,CAAC;gBAC3C,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAChC,CAAC;YAED,oBAAoB;YACpB,cAAc,GAAG;gBACf,KAAK,EAAE,UAAU,CAAC,KAAK;gBACvB,KAAK,EAAE,UAAU,CAAC,KAAK;gBACvB,OAAO,EAAE,EAAE;gBACX,SAAS,EAAE,SAAS,GAAG,CAAC;gBACxB,OAAO,EAAE,SAAS,GAAG,CAAC;aACvB,CAAC;QACJ,CAAC;aAAM,IAAI,cAAc,EAAE,CAAC;YAC1B,8BAA8B;YAC9B,IAAI,cAAc,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACtC,cAAc,CAAC,OAAO,IAAI,IAAI,CAAC;YACjC,CAAC;YACD,cAAc,CAAC,OAAO,IAAI,IAAI,CAAC;QACjC,CAAC;aAAM,CAAC;YACN,kEAAkE;YAClE,IAAI,CAAC,QAAQ
,CAAC,MAAM,EAAE,CAAC;gBACrB,cAAc,GAAG;oBACf,KAAK,EAAE,CAAC;oBACR,KAAK,EAAE,SAAS;oBAChB,OAAO,EAAE,IAAI;oBACb,SAAS;oBACT,OAAO,EAAE,SAAS;iBACnB,CAAC;YACJ,CAAC;QACH,CAAC;IACH,CAAC;IAED,mBAAmB;IACnB,IAAI,cAAc,EAAE,CAAC;QACnB,cAAc,CAAC,OAAO,GAAG,SAAS,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC;QACtD,QAAQ,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;IAChC,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,iBAAiB,CAAC,OAAe;IACxC,2DAA2D;IAC3D,MAAM,UAAU,GAAa,EAAE,CAAC;IAChC,IAAI,gBAAgB,GAAG,OAAO,CAAC,OAAO,CAAC,iBAAiB,EAAE,CAAC,KAAK,EAAE,EAAE;QAClE,UAAU,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACvB,OAAO,cAAc,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;IAC/C,CAAC,CAAC,CAAC;IAEH,iBAAiB;IACjB,gBAAgB,GAAG,SAAS,CAAC,gBAAgB,CAAC,CAAC;IAE/C,sBAAsB;IACtB,gBAAgB,GAAG,gBAAgB,CAAC,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAE5G,OAAO,gBAAgB,CAAC;AAC1B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAAC,IAAiB;IAC5D,IAAI,CAAC;QACH,MAAM,CAAC,KAAK,CAAC,8CAA8C,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QAEvE,uBAAuB;QACvB,MAAM,EAAE,WAAW,EAAE,OAAO,EAAE,WAAW,EAAE,OAAO,EAAE,GAAG,kBAAkB,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAExF,0BAA0B;QAC1B,MAAM,QAAQ,GAAG,qBAAqB,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;QAE7D,mCAAmC;QACnC,MAAM,UAAU,GAAuB,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YAChE,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,IAAI,EAAE,iBAAiB,CAAC,OAAO,CAAC,OAAO,CAAC;SACzC,CAAC,CAAC,CAAC;QAEJ,8BAA8B;QAC9B,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAE1E,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,MAAM,CAAC,KAAK,CAAC,0DAA0D,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;YACnF,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,MAAM,OAAO,GAAY;YACvB,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,KAAK,EAAG,WAAW,CAAC,KAAgB,IAAI,IAAI,CAAC,KAAK,IAAI,eAAe,CAAC,CAAC,CAAC,CAAC,KAAK;YAC9E,UAAU,EAAE,eAAe;SAC5B,CAAC;QAEF,OAAO;YACL,OAAO;YACP,OAAO,EAAE,eAAe;iBACrB,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,GAAG,IAAI,CAAC,KAAK,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;iBAC9C,IAAI,CAAC,MAAM,CAAC;iBACZ,IAAI,EAAE;SACV,CAAC;IACJ,C
AAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,KAAK,CAAC,wDAAwD,EAAE,KAAK,CAAC,CAAC;QAC9E,MAAM,CAAC,KAAK,CAAC,oCAAoC,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QACjG,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAAC,IAAiB;IAC7D,IAAI,CAAC;QACH,MAAM,CAAC,KAAK,CAAC,oEAAoE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QAC7F,MAAM,CAAC,KAAK,CAAC,+CAA+C,IAAI,CAAC,OAAO,CAAC,MAAM,QAAQ,CAAC,CAAC;QAEzF,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;QAE7B,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC5C,MAAM,CAAC,KAAK,CAAC,mDAAmD,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;YAC5E,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,sEAAsE;QACtE,MAAM,QAAQ,GAAG,qBAAqB,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;QAEnD,MAAM,CAAC,KAAK,CAAC,qCAAqC,QAAQ,CAAC,MAAM,WAAW,CAAC,CAAC;QAE9E,wEAAwE;QACxE,MAAM,UAAU,GAAuB,QAAQ,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;YAChE,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,0DAA0D;YAC1D,IAAI,EAAE,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE;SAC7B,CAAC,CAAC,CAAC;QAEJ,qEAAqE;QACrE,gFAAgF;QAChF,MAAM,eAAe,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAEnG,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACjC,8DAA8D;YAC9D,MAAM,CAAC,KAAK,CAAC,qEAAqE,CAAC,CAAC;YACpF,MAAM,OAAO,GAAY;gBACvB,GAAG,EAAE,IAAI,CAAC,GAAG;gBACb,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,SAAS;gBAC9B,UAAU,EAAE;oBACV;wBACE,KAAK,EAAE,IAAI,CAAC,KAAK,IAAI,SAAS;wBAC9B,IAAI,EAAE,OAAO,CAAC,IAAI,EAAE;qBACrB;iBACF;aACF,CAAC;YAEF,OAAO;gBACL,OAAO;gBACP,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;aACxB,CAAC;QACJ,CAAC;QAED,mEAAmE;QACnE,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC;QACvB,MAAM,cAAc,GAAG,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,KAAK,CAAC,CAAC,CAAC;QAC3D,IAAI,cAAc,EAAE,CAAC;YACnB,KAAK,GAAG,cAAc,CAAC,KAAK,CAAC;QAC/B,CAAC;QAED,MAAM,OAAO,GAAY;YACvB,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,IAAI,EAAE,IAAI,CAAC,IAAI;YACf,KAAK,EAAE,KAAK,IAAI,eAAe,CAAC,CAAC,CAAC,CAAC,KAAK;YACxC,UAAU,EAAE,eAAe;SAC5B,CAAC;QAEF,MAAM,CAAC
,KAAK,CAAC,oDAAoD,eAAe,CAAC,MAAM,aAAa,CAAC,CAAC;QACtG,MAAM,CAAC,KAAK,CAAC,qDAAqD,OAAO,CAAC,MAAM,QAAQ,CAAC,CAAC;QAE1F,OAAO;YACL,OAAO;YACP,OAAO,EAAE,eAAe;iBACrB,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;iBACzE,IAAI,CAAC,MAAM,CAAC;iBACZ,IAAI,EAAE;SACV,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,CAAC,KAAK,CAAC,iEAAiE,EAAE,KAAK,CAAC,CAAC;QACvF,MAAM,CAAC,KAAK,CAAC,4CAA4C,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QACzG,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC"}
@@ -0,0 +1 @@
1
+ export {};