@copilotkit/pathfinder 1.1.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -1
- package/README.md +65 -248
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +70 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +8 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +58 -5
- package/dist/config.js.map +1 -1
- package/dist/db/client.d.ts.map +1 -1
- package/dist/db/client.js +3 -1
- package/dist/db/client.js.map +1 -1
- package/dist/db/queries.d.ts +21 -4
- package/dist/db/queries.d.ts.map +1 -1
- package/dist/db/queries.js +101 -45
- package/dist/db/queries.js.map +1 -1
- package/dist/db/schema.d.ts +5 -0
- package/dist/db/schema.d.ts.map +1 -1
- package/dist/db/schema.js +11 -0
- package/dist/db/schema.js.map +1 -1
- package/dist/faq-txt.d.ts +12 -0
- package/dist/faq-txt.d.ts.map +1 -0
- package/dist/faq-txt.js +37 -0
- package/dist/faq-txt.js.map +1 -0
- package/dist/index.js +2 -362
- package/dist/index.js.map +1 -1
- package/dist/indexing/chunking/html.d.ts +7 -0
- package/dist/indexing/chunking/html.d.ts.map +1 -0
- package/dist/indexing/chunking/html.js +356 -0
- package/dist/indexing/chunking/html.js.map +1 -0
- package/dist/indexing/chunking/index.js +5 -0
- package/dist/indexing/chunking/index.js.map +1 -1
- package/dist/indexing/chunking/qa.d.ts +8 -0
- package/dist/indexing/chunking/qa.d.ts.map +1 -0
- package/dist/indexing/chunking/qa.js +22 -0
- package/dist/indexing/chunking/qa.js.map +1 -0
- package/dist/indexing/distiller.d.ts +29 -0
- package/dist/indexing/distiller.d.ts.map +1 -0
- package/dist/indexing/distiller.js +104 -0
- package/dist/indexing/distiller.js.map +1 -0
- package/dist/indexing/orchestrator.d.ts +9 -3
- package/dist/indexing/orchestrator.d.ts.map +1 -1
- package/dist/indexing/orchestrator.js +113 -83
- package/dist/indexing/orchestrator.js.map +1 -1
- package/dist/indexing/pipeline.d.ts +18 -0
- package/dist/indexing/pipeline.d.ts.map +1 -0
- package/dist/indexing/pipeline.js +68 -0
- package/dist/indexing/pipeline.js.map +1 -0
- package/dist/indexing/providers/discord-api.d.ts +79 -0
- package/dist/indexing/providers/discord-api.d.ts.map +1 -0
- package/dist/indexing/providers/discord-api.js +167 -0
- package/dist/indexing/providers/discord-api.js.map +1 -0
- package/dist/indexing/providers/discord.d.ts +25 -0
- package/dist/indexing/providers/discord.d.ts.map +1 -0
- package/dist/indexing/providers/discord.js +282 -0
- package/dist/indexing/providers/discord.js.map +1 -0
- package/dist/indexing/providers/file.d.ts +18 -0
- package/dist/indexing/providers/file.d.ts.map +1 -0
- package/dist/indexing/providers/file.js +262 -0
- package/dist/indexing/providers/file.js.map +1 -0
- package/dist/indexing/providers/index.d.ts +5 -0
- package/dist/indexing/providers/index.d.ts.map +1 -0
- package/dist/indexing/providers/index.js +22 -0
- package/dist/indexing/providers/index.js.map +1 -0
- package/dist/indexing/providers/slack-api.d.ts +62 -0
- package/dist/indexing/providers/slack-api.d.ts.map +1 -0
- package/dist/indexing/providers/slack-api.js +167 -0
- package/dist/indexing/providers/slack-api.js.map +1 -0
- package/dist/indexing/providers/slack.d.ts +21 -0
- package/dist/indexing/providers/slack.d.ts.map +1 -0
- package/dist/indexing/providers/slack.js +192 -0
- package/dist/indexing/providers/slack.js.map +1 -0
- package/dist/indexing/providers/types.d.ts +56 -0
- package/dist/indexing/providers/types.d.ts.map +1 -0
- package/dist/indexing/providers/types.js +3 -0
- package/dist/indexing/providers/types.js.map +1 -0
- package/dist/indexing/url-derivation.d.ts +2 -2
- package/dist/indexing/url-derivation.d.ts.map +1 -1
- package/dist/indexing/url-derivation.js.map +1 -1
- package/dist/indexing/utils.d.ts +19 -0
- package/dist/indexing/utils.d.ts.map +1 -0
- package/dist/indexing/utils.js +63 -0
- package/dist/indexing/utils.js.map +1 -0
- package/dist/ip-limiter.d.ts +11 -0
- package/dist/ip-limiter.d.ts.map +1 -0
- package/dist/ip-limiter.js +40 -0
- package/dist/ip-limiter.js.map +1 -0
- package/dist/llms-txt.d.ts +11 -0
- package/dist/llms-txt.d.ts.map +1 -0
- package/dist/llms-txt.js +43 -0
- package/dist/llms-txt.js.map +1 -0
- package/dist/mcp/server.d.ts +3 -1
- package/dist/mcp/server.d.ts.map +1 -1
- package/dist/mcp/server.js +9 -1
- package/dist/mcp/server.js.map +1 -1
- package/dist/mcp/tools/bash-fs.d.ts.map +1 -1
- package/dist/mcp/tools/bash-fs.js +4 -1
- package/dist/mcp/tools/bash-fs.js.map +1 -1
- package/dist/mcp/tools/bash.d.ts +8 -0
- package/dist/mcp/tools/bash.d.ts.map +1 -1
- package/dist/mcp/tools/bash.js +59 -0
- package/dist/mcp/tools/bash.js.map +1 -1
- package/dist/mcp/tools/knowledge.d.ts +13 -0
- package/dist/mcp/tools/knowledge.d.ts.map +1 -0
- package/dist/mcp/tools/knowledge.js +92 -0
- package/dist/mcp/tools/knowledge.js.map +1 -0
- package/dist/mcp/tools/search.d.ts.map +1 -1
- package/dist/mcp/tools/search.js +11 -3
- package/dist/mcp/tools/search.js.map +1 -1
- package/dist/server.d.ts +6 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +595 -0
- package/dist/server.js.map +1 -0
- package/dist/skill-md.d.ts +3 -0
- package/dist/skill-md.d.ts.map +1 -0
- package/dist/skill-md.js +75 -0
- package/dist/skill-md.js.map +1 -0
- package/dist/types.d.ts +844 -38
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +88 -6
- package/dist/types.js.map +1 -1
- package/dist/validate.d.ts +29 -0
- package/dist/validate.d.ts.map +1 -0
- package/dist/validate.js +192 -0
- package/dist/validate.js.map +1 -0
- package/dist/webhooks/discord.d.ts +13 -0
- package/dist/webhooks/discord.d.ts.map +1 -0
- package/dist/webhooks/discord.js +57 -0
- package/dist/webhooks/discord.js.map +1 -0
- package/dist/webhooks/slack.d.ts +13 -0
- package/dist/webhooks/slack.d.ts.map +1 -0
- package/dist/webhooks/slack.js +106 -0
- package/dist/webhooks/slack.js.map +1 -0
- package/dist/workspace.d.ts +13 -0
- package/dist/workspace.d.ts.map +1 -0
- package/dist/workspace.js +118 -0
- package/dist/workspace.js.map +1 -0
- package/package.json +27 -2
- package/pathfinder-docs.yaml +54 -0
- package/pathfinder.example.yaml +48 -0
- package/.superpowers/brainstorm/47098-1775507869/content/homepage-mockup.html +0 -324
- package/.superpowers/brainstorm/47098-1775507869/state/server-stopped +0 -1
- package/.superpowers/brainstorm/47098-1775507869/state/server.log +0 -13
- package/.superpowers/brainstorm/47098-1775507869/state/server.pid +0 -1
- package/.superpowers/brainstorm/82141-1775511032/content/migration-v2.html +0 -340
- package/.superpowers/brainstorm/82141-1775511032/content/migration.html +0 -340
- package/.superpowers/brainstorm/82141-1775511032/state/server-stopped +0 -1
- package/.superpowers/brainstorm/82141-1775511032/state/server.log +0 -4
- package/.superpowers/brainstorm/82141-1775511032/state/server.pid +0 -1
- package/dist/indexing/source-indexer.d.ts +0 -68
- package/dist/indexing/source-indexer.d.ts.map +0 -1
- package/dist/indexing/source-indexer.js +0 -379
- package/dist/indexing/source-indexer.js.map +0 -1
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
// HTML chunker — extracts semantic text content from HTML documents
|
|
2
|
+
// using cheerio for DOM traversal, splits on heading boundaries.
|
|
3
|
+
import * as cheerio from 'cheerio';
|
|
4
|
+
const DEFAULT_TARGET_TOKENS = 600;
|
|
5
|
+
const DEFAULT_OVERLAP_TOKENS = 50;
|
|
6
|
+
/** Elements to remove entirely before content extraction. */
|
|
7
|
+
const STRIP_SELECTORS = 'script, style, nav, footer, header, svg, noscript';
|
|
8
|
+
/** Selectors to try for the main content container, in priority order. */
|
|
9
|
+
const CONTENT_SELECTORS = ['main', 'article', '[role="main"]', '.content', '#content'];
|
|
10
|
+
/**
 * Flatten the child nodes of a cheerio selection into plain text.
 *
 * Every text node and every recognised element contributes one or more
 * entries, which are finally joined with newlines:
 *   - `pre`       -> fenced code block, inner whitespace left untouched
 *   - `ul` / `ol` -> one "- " / "N. " prefixed entry per direct `li` child
 *   - `table`     -> one entry per row, cell texts joined with " | "
 *   - `img`       -> "[image: <alt>]" when an alt attribute is present
 *   - `h1`-`h6`   -> trimmed heading text (empty headings are skipped)
 *   - anything else (p, div, span, a, ...) -> recursed into
 *
 * @param $ cheerio instance, used to wrap raw nodes
 * @param el cheerio selection whose contents are extracted
 * @returns the joined text, with runs of 3+ newlines collapsed to one blank
 *          line and surrounding whitespace trimmed
 */
function extractText($, el) {
    const HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
    const lines = [];
    el.contents().each((_, node) => {
        if (node.type === 'text') {
            const text = node.data?.trim();
            if (text)
                lines.push(text);
            return;
        }
        if (node.type !== 'tag')
            return;
        const tag = node.tagName?.toLowerCase();
        const child = $(node);
        if (tag === 'pre') {
            // Preserve code blocks with whitespace intact
            lines.push('\n```\n' + child.text() + '\n```\n');
        }
        else if (tag === 'ul') {
            child.children('li').each((_, li) => {
                lines.push('- ' + extractText($, $(li)));
            });
        }
        else if (tag === 'ol') {
            child.children('li').each((i, li) => {
                lines.push(`${i + 1}. ` + extractText($, $(li)));
            });
        }
        else if (tag === 'table') {
            child.find('tr').each((_, tr) => {
                const cells = [];
                $(tr).find('th, td').each((_, cell) => {
                    cells.push($(cell).text().trim());
                });
                if (cells.length > 0)
                    lines.push(cells.join(' | '));
            });
        }
        else if (tag === 'img') {
            const alt = child.attr('alt');
            if (alt)
                lines.push(`[image: ${alt}]`);
        }
        else if (HEADING_TAGS.includes(tag)) {
            // Headings contribute their text only. An empty heading (e.g. one
            // used purely as an anchor) is skipped so it cannot inject a
            // spurious blank line into the output.
            const headingText = child.text().trim();
            if (headingText)
                lines.push(headingText);
        }
        else {
            // Block containers (p, div, blockquote, ...) and inline elements
            // (span, a, ...) are handled the same way: recurse and keep
            // whatever text the subtree yields.
            const text = extractText($, child);
            if (text)
                lines.push(text);
        }
    });
    return lines.join('\n').replace(/\n{3,}/g, '\n\n').trim();
}
|
|
75
|
+
/**
 * Walk a container in document order and cut its text into sections at every
 * h1/h2/h3 element.
 *
 * Each section is `{ heading, level, content }`. Content that appears before
 * the first heading is recorded with `heading: null` and `level: 0`. The
 * heading text itself is not copied into `content`.
 *
 * A heading with no body text of its own (for example an h1 immediately
 * followed by an h2) is still recorded, with empty `content`, so that code
 * deriving a heading hierarchy from the section list does not lose that
 * ancestor. If no section has any content at all, an empty array is returned.
 *
 * @param $ cheerio instance, used to wrap raw nodes
 * @param container cheerio selection to walk
 * @returns sections in document order
 */
function splitOnHeadings($, container) {
    const sections = [];
    let currentHeading = null;
    let currentLevel = 0;
    let currentContent = [];
    function flush() {
        const text = currentContent.join('\n').replace(/\n{3,}/g, '\n\n').trim();
        // Keep heading-only sections: they carry no text but are needed to
        // reconstruct the full heading path of the sections that follow.
        if (text || currentHeading !== null) {
            sections.push({ heading: currentHeading, level: currentLevel, content: text });
        }
        currentContent = [];
    }
    // Walk all descendant nodes in document order, splitting on h1/h2/h3
    function walk(el) {
        el.contents().each((_, node) => {
            if (node.type === 'text') {
                const text = node.data?.trim();
                if (text)
                    currentContent.push(text);
                return;
            }
            if (node.type !== 'tag')
                return;
            const tagNode = node;
            const tag = tagNode.tagName?.toLowerCase();
            if (tag && /^h[123]$/.test(tag)) {
                flush();
                currentHeading = $(tagNode).text().trim();
                // tag is "h1".."h3" here, so the second character is the level digit
                currentLevel = Number(tag[1]);
                return; // Don't recurse into the heading
            }
            // h4-h6 are not section boundaries but should stand out in text
            if (tag && /^h[456]$/.test(tag)) {
                const text = $(tagNode).text().trim();
                if (text)
                    currentContent.push('\n' + text);
                return;
            }
            // Block elements with special formatting — keep in sync with extractText
            if (tag === 'pre') {
                currentContent.push('\n```\n' + $(tagNode).text() + '\n```\n');
                return;
            }
            if (tag === 'ul') {
                $(tagNode).children('li').each((_, li) => {
                    currentContent.push('- ' + extractText($, $(li)));
                });
                return;
            }
            if (tag === 'ol') {
                $(tagNode).children('li').each((i, li) => {
                    currentContent.push(`${i + 1}. ` + extractText($, $(li)));
                });
                return;
            }
            if (tag === 'table') {
                $(tagNode).find('tr').each((_, tr) => {
                    const cells = [];
                    $(tr).find('th, td').each((_, cell) => {
                        cells.push($(cell).text().trim());
                    });
                    if (cells.length > 0)
                        currentContent.push(cells.join(' | '));
                });
                return;
            }
            if (tag === 'img') {
                const alt = $(tagNode).attr('alt');
                if (alt)
                    currentContent.push(`[image: ${alt}]`);
                return;
            }
            // Other block content elements — delegate to extractText
            if (['p', 'blockquote', 'dd', 'figcaption', 'dl', 'figure'].includes(tag)) {
                const text = extractText($, $(tagNode));
                if (text)
                    currentContent.push(text);
                return;
            }
            // Recurse into container elements (section, div, article, etc.)
            walk($(tagNode));
        });
    }
    walk(container);
    flush();
    // When only headings were found there is nothing to chunk per section;
    // report "no sections" so the caller can take its whole-document path.
    return sections.some((section) => section.content) ? sections : [];
}
|
|
162
|
+
/**
 * Compute the heading trail (outermost first) that is in effect at
 * `sections[upToIndex]`.
 *
 * Sections 0..upToIndex are replayed in order. Each section that carries a
 * heading first removes every trail entry whose level is the same or deeper,
 * then appends itself, so the trail always reflects the current nesting.
 *
 * @param sections section list; each entry exposes `heading` and `level`
 * @param upToIndex index of the last section to replay (inclusive)
 * @returns heading texts from outermost to innermost
 */
function buildHeadingPath(sections, upToIndex) {
    const trail = [];
    let idx = 0;
    while (idx <= upToIndex) {
        const { heading, level } = sections[idx];
        idx += 1;
        if (heading) {
            // A new heading closes any open heading at its own level or below.
            while (trail.length > 0 && trail[trail.length - 1].level >= level) {
                trail.pop();
            }
            trail.push({ level, text: heading });
        }
    }
    return trail.map((entry) => entry.text);
}
|
|
180
|
+
/**
 * Break text that is longer than `targetChars` into smaller pieces.
 *
 * Strategy, in order:
 *   1. text that already fits is returned unchanged as a single piece;
 *   2. otherwise split on blank lines, re-merge neighbouring paragraphs up to
 *      the target, and recurse into each merged piece;
 *   3. otherwise split on single newlines and re-merge lines up to the target;
 *   4. a single line with no newline at all is returned as-is, even if it is
 *      over the target (best effort).
 *
 * @param text text to split
 * @param targetChars desired maximum length of each piece, in characters
 * @returns the pieces, in original order
 */
function splitLargeText(text, targetChars) {
    const alreadyFits = text.length <= targetChars;
    if (!alreadyFits) {
        // Paragraph boundaries are the preferred cut points.
        const byParagraph = text.split(/\n\n+/);
        if (byParagraph.length > 1) {
            const pieces = [];
            for (const block of mergeSmallParts(byParagraph, targetChars)) {
                pieces.push(...splitLargeText(block, targetChars));
            }
            return pieces;
        }
        // No blank line available: cut on line boundaries instead.
        const byLine = text.split('\n');
        if (byLine.length > 1) {
            return mergeSmallParts(byLine, targetChars, '\n');
        }
    }
    // Fits already, or is one unbreakable line.
    return [text];
}
|
|
201
|
+
/**
 * Greedily glue neighbouring parts together while the glued result stays
 * within `targetSize`.
 *
 * A part that would push the running buffer past `targetSize` closes the
 * buffer and starts a new one. A single part longer than `targetSize` is kept
 * whole. Empty parts seen while the buffer is empty are dropped.
 *
 * @param parts pieces of text, in order
 * @param targetSize maximum length of a glued piece, in characters
 * @param separator string inserted between glued parts (default: blank line)
 * @returns the glued pieces, in order
 */
function mergeSmallParts(parts, targetSize, separator = '\n\n') {
    const merged = [];
    let buffer = '';
    for (const part of parts) {
        if (!buffer) {
            // Nothing buffered yet: the part simply becomes the buffer.
            buffer = part;
            continue;
        }
        const candidate = buffer + separator + part;
        if (candidate.length > targetSize) {
            merged.push(buffer);
            buffer = part;
        }
        else {
            buffer = candidate;
        }
    }
    if (buffer) {
        merged.push(buffer);
    }
    return merged;
}
|
|
219
|
+
/**
 * Turn heading sections into chunks: merge, size-limit, then overlap.
 *
 * 1. Merge: consecutive sections are glued with a blank line until the
 *    result would exceed `targetChars`. A section that carries its own
 *    heading always starts a new chunk, so every chunk maps to exactly one
 *    heading context (`sectionIndex` is the index of the chunk's first
 *    section).
 * 2. Size-limit: any merged chunk still longer than `targetChars` is broken
 *    up with `splitLargeText`; the pieces inherit the chunk's `sectionIndex`.
 * 3. Overlap: every chunk after the first is prefixed with the tail of the
 *    previous chunk (at most `overlapChars` characters, cut back to the start
 *    of its last line) followed by a blank line.
 *
 * Size-limiting runs BEFORE overlap on purpose. If overlap were added first,
 * a chunk close to the target would go over it, be re-split at the blank
 * line just inserted, and leave the overlap fragment behind as a tiny
 * duplicate chunk filed under the wrong section. As a consequence a returned
 * chunk may exceed `targetChars` by up to `overlapChars` plus the separator.
 *
 * @param sections entries of shape `{ heading, level, content }`
 * @param targetChars target chunk size in characters (before overlap)
 * @param overlapChars maximum overlap in characters; `<= 0` disables overlap
 * @returns chunks of shape `{ content, sectionIndex }`, in document order
 */
function mergeAndOverlap(sections, targetChars, overlapChars) {
    const SEPARATOR = '\n\n';
    // --- 1. merge small sections that share a heading context -------------
    const merged = [];
    let current = '';
    let currentIdx = 0;
    const emitCurrent = () => {
        if (current.trim()) {
            merged.push({ content: current, sectionIndex: currentIdx });
        }
    };
    for (let i = 0; i < sections.length; i++) {
        const section = sections[i];
        const text = section.content;
        const hasCurrent = current.length > 0;
        const sizeExceeded = hasCurrent && (current.length + SEPARATOR.length + text.length) > targetChars;
        const hasNewHeading = hasCurrent && section.heading !== null;
        if (sizeExceeded || hasNewHeading) {
            emitCurrent();
            current = text;
            currentIdx = i;
        }
        else if (hasCurrent) {
            current = current + SEPARATOR + text;
        }
        else {
            current = text;
            currentIdx = i;
        }
    }
    emitCurrent();
    // --- 2. split anything that is still over the target ------------------
    const sized = [];
    for (const chunk of merged) {
        if (chunk.content.length > targetChars) {
            for (const part of splitLargeText(chunk.content, targetChars)) {
                sized.push({ content: part, sectionIndex: chunk.sectionIndex });
            }
        }
        else {
            sized.push(chunk);
        }
    }
    // --- 3. prefix each chunk with the tail of its predecessor ------------
    if (sized.length <= 1 || overlapChars <= 0) {
        return sized;
    }
    return sized.map((chunk, i) => {
        if (i === 0)
            return chunk;
        const tail = sized[i - 1].content.slice(-overlapChars);
        // Start the overlap on a line boundary when the tail contains one;
        // the newline itself is dropped so the chunk does not begin with it.
        const breakPoint = tail.lastIndexOf('\n');
        const overlap = breakPoint >= 0 ? tail.slice(breakPoint + 1) : tail;
        if (!overlap)
            return chunk;
        return {
            content: overlap + SEPARATOR + chunk.content,
            sectionIndex: chunk.sectionIndex,
        };
    });
}
|
|
288
|
+
/**
 * Pick a human-readable title for an HTML document.
 *
 * Sources, in priority order:
 *   1. the first `<title>` element, with a trailing site-name suffix removed
 *      (" — Site", " – Site", " - Site" or " | Site"; only the last such
 *      suffix is removed, so "A - B - Site" becomes "A - B");
 *   2. the text of the first `<h1>`;
 *   3. the last non-empty "/"-separated segment of `filePath`, or `filePath`
 *      itself when it has no such segment.
 *
 * @param $ cheerio instance for the document
 * @param filePath path of the source file, used only as a last resort
 * @returns the title, without surrounding whitespace in cases 1 and 2
 */
function extractTitle($, filePath) {
    const titleTag = $('title').first().text().trim();
    if (titleTag) {
        // The separator must be surrounded by whitespace, so hyphenated words
        // and ranges such as "2010–2020" are left alone.
        const match = titleTag.match(/^(.+)\s+[—–\-|]\s+.+$/);
        // The greedy capture can end in whitespace when the separator is
        // preceded by more than one space; trim it away.
        return match ? match[1].trim() : titleTag;
    }
    const h1 = $('h1').first().text().trim();
    if (h1)
        return h1;
    // Ignore empty segments so a trailing slash does not yield an empty title.
    const segments = filePath.split('/').filter(Boolean);
    return segments.length > 0 ? segments[segments.length - 1] : filePath;
}
|
|
303
|
+
/**
 * Chunk an HTML document into embedding-friendly pieces.
 * Follows the ChunkerFn signature: (content, filePath, config) => ChunkOutput[]
 *
 * Steps: parse with cheerio, remove the elements matched by STRIP_SELECTORS,
 * pick the first container matching CONTENT_SELECTORS (falling back to
 * `<body>`), split it into heading sections, then merge/overlap those into
 * chunks. Sizes are configured in tokens and converted at 4 characters per
 * token.
 *
 * @param content raw HTML
 * @param filePath path of the source file (fallback for the title)
 * @param config chunker config; `config.chunk.target_tokens` and
 *               `config.chunk.overlap_tokens` override the defaults
 * @returns chunks of shape `{ content, title, headingPath, chunkIndex }`;
 *          empty when the input is blank or yields no text
 */
export function chunkHtml(content, filePath, config) {
    const isBlank = !content || !content.trim();
    if (isBlank) {
        return [];
    }
    const $ = cheerio.load(content);
    // Drop everything that is not document content
    $(STRIP_SELECTORS).remove();
    // First selector (in priority order) that matches anything wins;
    // otherwise the whole body is used.
    const matchedSelector = CONTENT_SELECTORS.find((selector) => $(selector).length > 0);
    const container = matchedSelector === undefined ? $('body') : $(matchedSelector).first();
    if (container.length === 0) {
        return [];
    }
    const title = extractTitle($, filePath);
    const CHARS_PER_TOKEN = 4;
    const targetChars = (config.chunk?.target_tokens ?? DEFAULT_TARGET_TOKENS) * CHARS_PER_TOKEN;
    const overlapChars = (config.chunk?.overlap_tokens ?? DEFAULT_OVERLAP_TOKENS) * CHARS_PER_TOKEN;
    const sections = splitOnHeadings($, container);
    if (sections.length > 0) {
        // Normal path: merge small sections, apply overlap, attach heading paths
        const chunks = mergeAndOverlap(sections, targetChars, overlapChars);
        return chunks.map(({ content: chunkText, sectionIndex }, position) => ({
            content: chunkText,
            title,
            headingPath: buildHeadingPath(sections, sectionIndex),
            chunkIndex: position,
        }));
    }
    // The section splitter produced nothing: chunk the container's flat text
    // instead, without heading paths.
    const flatText = extractText($, container).trim();
    if (!flatText) {
        return [];
    }
    return splitLargeText(flatText, targetChars).map((piece, position) => ({
        content: piece,
        title,
        headingPath: [],
        chunkIndex: position,
    }));
}
|
|
356
|
+
//# sourceMappingURL=html.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html.js","sourceRoot":"","sources":["../../../src/indexing/chunking/html.ts"],"names":[],"mappings":"AAAA,oEAAoE;AACpE,iEAAiE;AAEjE,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAKnC,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAClC,MAAM,sBAAsB,GAAG,EAAE,CAAC;AAElC,6DAA6D;AAC7D,MAAM,eAAe,GAAG,mDAAmD,CAAC;AAE5E,0EAA0E;AAC1E,MAAM,iBAAiB,GAAG,CAAC,MAAM,EAAE,SAAS,EAAE,eAAe,EAAE,UAAU,EAAE,UAAU,CAAC,CAAC;AAEvF;;;GAGG;AACH,SAAS,WAAW,CAAC,CAAa,EAAE,EAAoB;IACpD,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,EAAE,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;QAC3B,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;YACvB,MAAM,IAAI,GAAI,IAAa,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;YACzC,IAAI,IAAI;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC3B,OAAO;QACX,CAAC;QAED,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK;YAAE,OAAO;QAChC,MAAM,OAAO,GAAG,IAAe,CAAC;QAChC,MAAM,GAAG,GAAG,OAAO,CAAC,OAAO,EAAE,WAAW,EAAE,CAAC;QAC3C,MAAM,KAAK,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC;QAEzB,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;YAChB,8CAA8C;YAC9C,KAAK,CAAC,IAAI,CAAC,SAAS,GAAG,KAAK,CAAC,IAAI,EAAE,GAAG,SAAS,CAAC,CAAC;QACrD,CAAC;aAAM,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACtB,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;gBAChC,KAAK,CAAC,IAAI,CAAC,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YAC7C,CAAC,CAAC,CAAC;QACP,CAAC;aAAM,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;YACtB,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;gBAChC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YACrD,CAAC,CAAC,CAAC;QACP,CAAC;aAAM,IAAI,GAAG,KAAK,OAAO,EAAE,CAAC;YACzB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;gBAC5B,MAAM,KAAK,GAAa,EAAE,CAAC;gBAC3B,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;oBAClC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;gBACtC,CAAC,CAAC,CAAC;gBACH,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;YACxD,CAAC
,CAAC,CAAC;QACP,CAAC;aAAM,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;YACvB,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAC9B,IAAI,GAAG;gBAAE,KAAK,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAC;QAC3C,CAAC;aAAM,IAAI,CAAC,GAAG,EAAE,KAAK,EAAE,YAAY,EAAE,IAAI,EAAE,SAAS,EAAE,YAAY,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YACjF,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;YACnC,IAAI,IAAI;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;aAAM,IAAI,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5D,iGAAiG;YACjG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;QACpC,CAAC;aAAM,CAAC;YACJ,mDAAmD;YACnD,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;YACnC,IAAI,IAAI;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC/B,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;AAC9D,CAAC;AAYD,SAAS,eAAe,CAAC,CAAa,EAAE,SAA2B;IAC/D,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,IAAI,cAAc,GAAkB,IAAI,CAAC;IACzC,IAAI,YAAY,GAAG,CAAC,CAAC;IACrB,IAAI,cAAc,GAAa,EAAE,CAAC;IAElC,SAAS,KAAK;QACV,MAAM,IAAI,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;QACzE,IAAI,IAAI,EAAE,CAAC;YACP,QAAQ,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,cAAc,EAAE,KAAK,EAAE,YAAY,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACnF,CAAC;QACD,cAAc,GAAG,EAAE,CAAC;IACxB,CAAC;IAED,qEAAqE;IACrE,SAAS,IAAI,CAAC,EAAoB;QAC9B,EAAE,CAAC,QAAQ,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;YAC3B,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM,EAAE,CAAC;gBACvB,MAAM,IAAI,GAAI,IAAa,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;gBACzC,IAAI,IAAI;oBAAE,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACpC,OAAO;YACX,CAAC;YACD,IAAI,IAAI,CAAC,IAAI,KAAK,KAAK;gBAAE,OAAO;YAChC,MAAM,OAAO,GAAG,IAAe,CAAC;YAChC,MAAM,GAAG,GAAG,OAAO,CAAC,OAAO,EAAE,WAAW,EAAE,CAAC;YAE3C,IAAI,GAAG,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC9B,KAAK,EAAE,CAAC;gBACR,cAAc,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBAC1C,YAAY,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;gBAChC
,OAAO,CAAC,iCAAiC;YAC7C,CAAC;YAED,gEAAgE;YAChE,IAAI,GAAG,IAAI,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC9B,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;gBACtC,IAAI,IAAI;oBAAE,cAAc,CAAC,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;gBAC3C,OAAO;YACX,CAAC;YAED,yEAAyE;YACzE,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;gBAChB,cAAc,CAAC,IAAI,CAAC,SAAS,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,GAAG,SAAS,CAAC,CAAC;gBAC/D,OAAO;YACX,CAAC;YACD,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;gBACf,CAAC,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;oBACrC,cAAc,CAAC,IAAI,CAAC,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBACtD,CAAC,CAAC,CAAC;gBACH,OAAO;YACX,CAAC;YACD,IAAI,GAAG,KAAK,IAAI,EAAE,CAAC;gBACf,CAAC,CAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;oBACrC,cAAc,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC9D,CAAC,CAAC,CAAC;gBACH,OAAO;YACX,CAAC;YACD,IAAI,GAAG,KAAK,OAAO,EAAE,CAAC;gBAClB,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE;oBACjC,MAAM,KAAK,GAAa,EAAE,CAAC;oBAC3B,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;wBAClC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC;oBACtC,CAAC,CAAC,CAAC;oBACH,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;wBAAE,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;gBACjE,CAAC,CAAC,CAAC;gBACH,OAAO;YACX,CAAC;YACD,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;gBAChB,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;gBACnC,IAAI,GAAG;oBAAE,cAAc,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAC;gBAChD,OAAO;YACX,CAAC;YAED,yDAAyD;YACzD,IAAI,CAAC,GAAG,EAAE,YAAY,EAAE,IAAI,EAAE,YAAY,EAAE,IAAI,EAAE,QAAQ,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;gBACxE,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;gBACxC,IAAI,IAAI;oBAAE,cAAc,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;gBACpC,OAAO;YACX,CAAC;YAED,gEAAgE;YAChE,IAAI,CAAC,CA
AC,CAAC,OAAO,CAAC,CAAC,CAAC;QACrB,CAAC,CAAC,CAAC;IACP,CAAC;IAED,IAAI,CAAC,SAAS,CAAC,CAAC;IAChB,KAAK,EAAE,CAAC;IACR,OAAO,QAAQ,CAAC;AACpB,CAAC;AAED;;;GAGG;AACH,SAAS,gBAAgB,CAAC,QAAuB,EAAE,SAAiB;IAChE,MAAM,KAAK,GAAsC,EAAE,CAAC;IAEpD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;QAClC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,CAAC,OAAO,CAAC,OAAO;YAAE,SAAS;QAE/B,uCAAuC;QACvC,OAAO,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,KAAK,IAAI,OAAO,CAAC,KAAK,EAAE,CAAC;YACxE,KAAK,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;QACD,KAAK,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,IAAI,EAAE,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC;IAChE,CAAC;IAED,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;AAClC,CAAC;AAED;;;;GAIG;AACH,SAAS,cAAc,CAAC,IAAY,EAAE,WAAmB;IACrD,IAAI,IAAI,CAAC,MAAM,IAAI,WAAW;QAAE,OAAO,CAAC,IAAI,CAAC,CAAC;IAE9C,iCAAiC;IACjC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IACvC,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACxB,OAAO,eAAe,CAAC,UAAU,EAAE,WAAW,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,cAAc,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC,CAAC;IACjG,CAAC;IAED,+BAA+B;IAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACnB,OAAO,eAAe,CAAC,KAAK,EAAE,WAAW,EAAE,IAAI,CAAC,CAAC;IACrD,CAAC;IAED,kCAAkC;IAClC,OAAO,CAAC,IAAI,CAAC,CAAC;AAClB,CAAC;AAED,kEAAkE;AAClE,SAAS,eAAe,CAAC,KAAe,EAAE,UAAkB,EAAE,YAAoB,MAAM;IACpF,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC;QACrC,IAAI,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,UAAU,EAAE,CAAC;YACtE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACrB,OAAO,GAAG,IAAI,CAAC;QACnB,CAAC;aAAM,CAAC;YACJ,OAAO,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,GAAG,GAAG,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;QACpD,CAAC;IACL,CAAC;IACD,IAAI,OAAO;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAClC,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;;;;;;GAOG;AACH,S
AAS,eAAe,CAAC,QAAuB,EAAE,WAAmB,EAAE,YAAoB;IACvF,MAAM,MAAM,GAAgD,EAAE,CAAC;IAC/D,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC;QAC5B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC;QAC7B,MAAM,SAAS,GAAG,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;QAExC,yDAAyD;QACzD,kEAAkE;QAClE,MAAM,YAAY,GAAG,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,WAAW,CAAC;QAChG,MAAM,aAAa,GAAG,OAAO,CAAC,OAAO,KAAK,IAAI,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QAErE,IAAI,YAAY,IAAI,aAAa,EAAE,CAAC;YAChC,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;gBACjB,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;YAChE,CAAC;YACD,OAAO,GAAG,IAAI,CAAC;YACf,UAAU,GAAG,CAAC,CAAC;QACnB,CAAC;aAAM,CAAC;YACJ,IAAI,CAAC,OAAO;gBAAE,UAAU,GAAG,CAAC,CAAC;YAC7B,OAAO,GAAG,OAAO,CAAC,CAAC,CAAC,OAAO,GAAG,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;QAC1D,CAAC;IACL,CAAC;IACD,IAAI,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QACjB,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,YAAY,EAAE,UAAU,EAAE,CAAC,CAAC;IAChE,CAAC;IAED,gBAAgB;IAChB,IAAI,MAAmD,CAAC;IACxD,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,IAAI,YAAY,IAAI,CAAC,EAAE,CAAC;QAC1C,MAAM,GAAG,MAAM,CAAC;IACpB,CAAC;SAAM,CAAC;QACJ,MAAM,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC;YACnC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,CAAC;YAC9C,MAAM,UAAU,GAAG,WAAW,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;YACjD,MAAM,YAAY,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC;YAClF,MAAM,CAAC,IAAI,CAAC;gBACR,OAAO,EAAE,YAAY,GAAG,MAAM,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO;gBAClD,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC,CAAC,YAAY;aACvC,CAAC,CAAC;QACP,CAAC;IACL,CAAC;IAED,uDAAuD;IACvD,MAAM,KAAK,GAAgD,EAAE,CAAC;IAC9D,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QACzB,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,GAAG,WAA
W,EAAE,CAAC;YACrC,MAAM,KAAK,GAAG,cAAc,CAAC,KAAK,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;YACzD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACvB,KAAK,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,KAAK,CAAC,YAAY,EAAE,CAAC,CAAC;YACpE,CAAC;QACL,CAAC;aAAM,CAAC;YACJ,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACtB,CAAC;IACL,CAAC;IACD,OAAO,KAAK,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CAAC,CAAa,EAAE,QAAgB;IACjD,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAClD,IAAI,QAAQ,EAAE,CAAC;QACX,iFAAiF;QACjF,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;QACzD,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC;IACvC,CAAC;IAED,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IACzC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAElB,OAAO,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,IAAI,QAAQ,CAAC;AACjD,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,SAAS,CAAC,OAAe,EAAE,QAAgB,EAAE,MAAoB;IAC7E,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QAC9B,OAAO,EAAE,CAAC;IACd,CAAC;IAED,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAEhC,6BAA6B;IAC7B,CAAC,CAAC,eAAe,CAAC,CAAC,MAAM,EAAE,CAAC;IAE5B,kCAAkC;IAClC,IAAI,SAAS,GAA4B,IAAI,CAAC;IAC9C,KAAK,MAAM,QAAQ,IAAI,iBAAiB,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;QAClC,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnB,SAAS,GAAG,KAAK,CAAC;YAClB,MAAM;QACV,CAAC;IACL,CAAC;IACD,IAAI,CAAC,SAAS,EAAE,CAAC;QACb,SAAS,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC;IAC1B,CAAC;IAED,IAAI,CAAC,SAAS,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACvC,OAAO,EAAE,CAAC;IACd,CAAC;IAED,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IACxC,MAAM,WAAW,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE,aAAa,IAAI,qBAAqB,CAAC,GAAG,CAAC,CAAC;IAC/E,MAAM,YAAY,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE,cAAc,IAAI,sBAAsB,CAAC,GAAG,CAAC,CAAC;IAElF,sCAAsC;IACtC,MAAM,QAAQ,GAAG,eAAe,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;IAE/C,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,2DAA2D;QAC3D,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC;QACvC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE;YA
AE,OAAO,EAAE,CAAC;QAC5B,MAAM,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,IAAI,EAAE,EAAE,WAAW,CAAC,CAAC;QACvD,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;YAC9B,OAAO;YACP,KAAK;YACL,WAAW,EAAE,EAAE;YACf,UAAU,EAAE,CAAC;SAChB,CAAC,CAAC,CAAC;IACR,CAAC;IAED,yCAAyC;IACzC,MAAM,MAAM,GAAG,eAAe,CAAC,QAAQ,EAAE,WAAW,EAAE,YAAY,CAAC,CAAC;IAEpE,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC7B,OAAO,EAAE,KAAK,CAAC,OAAO;QACtB,KAAK;QACL,WAAW,EAAE,gBAAgB,CAAC,QAAQ,EAAE,KAAK,CAAC,YAAY,CAAC;QAC3D,UAAU,EAAE,CAAC;KAChB,CAAC,CAAC,CAAC;AACR,CAAC"}
|
|
@@ -13,7 +13,12 @@ export function getChunker(type) {
|
|
|
13
13
|
import { chunkMarkdown } from './markdown.js';
|
|
14
14
|
import { chunkCode } from './code.js';
|
|
15
15
|
import { chunkRawText } from './raw-text.js';
|
|
16
|
+
import { chunkHtml } from './html.js';
|
|
16
17
|
registerChunker('markdown', chunkMarkdown);
|
|
17
18
|
registerChunker('code', chunkCode);
|
|
18
19
|
registerChunker('raw-text', chunkRawText);
|
|
20
|
+
registerChunker('html', chunkHtml);
|
|
21
|
+
import { chunkQa } from './qa.js';
|
|
22
|
+
registerChunker('slack', chunkQa);
|
|
23
|
+
registerChunker('discord', chunkQa);
|
|
19
24
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/indexing/chunking/index.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAM9D,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAqB,CAAC;AAE9C,MAAM,UAAU,eAAe,CAAC,IAAY,EAAE,EAAa;IACvD,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,IAAY;IACnC,MAAM,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC9B,IAAI,CAAC,EAAE;QAAE,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,iBAAiB,CAAC,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC3G,OAAO,EAAE,CAAC;AACd,CAAC;AAED,+BAA+B;AAC/B,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,OAAO,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/indexing/chunking/index.ts"],"names":[],"mappings":"AAAA,8DAA8D;AAM9D,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAqB,CAAC;AAE9C,MAAM,UAAU,eAAe,CAAC,IAAY,EAAE,EAAa;IACvD,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;AAC3B,CAAC;AAED,MAAM,UAAU,UAAU,CAAC,IAAY;IACnC,MAAM,EAAE,GAAG,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IAC9B,IAAI,CAAC,EAAE;QAAE,MAAM,IAAI,KAAK,CAAC,0BAA0B,IAAI,iBAAiB,CAAC,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC3G,OAAO,EAAE,CAAC;AACd,CAAC;AAED,+BAA+B;AAC/B,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,OAAO,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAC7C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAEtC,eAAe,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;AAC3C,eAAe,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;AACnC,eAAe,CAAC,UAAU,EAAE,YAAY,CAAC,CAAC;AAC1C,eAAe,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;AAEnC,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAElC,eAAe,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;AAClC,eAAe,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { ChunkOutput, SourceConfig } from '../../types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Chunk Q&A content. Each content item from a FAQ-category provider
|
|
4
|
+
* is a single Q&A pair, already sized appropriately.
|
|
5
|
+
* The chunker formats it and returns a single ChunkOutput.
|
|
6
|
+
*/
|
|
7
|
+
export declare function chunkQa(content: string, filePath: string, config: SourceConfig): ChunkOutput[];
|
|
8
|
+
//# sourceMappingURL=qa.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"qa.d.ts","sourceRoot":"","sources":["../../../src/indexing/chunking/qa.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,WAAW,EAAE,YAAY,EAAE,MAAM,gBAAgB,CAAC;AAEhE;;;;GAIG;AACH,wBAAgB,OAAO,CAAC,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,EAAE,YAAY,GAAG,WAAW,EAAE,CAe9F"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
// Q&A chunker — formats distilled Q&A pairs for embedding.
|
|
2
|
+
// Source-agnostic: used by any source that produces Q&A-formatted content.
|
|
3
|
+
/**
 * Chunk Q&A content. Each content item from a FAQ-category provider
 * is a single Q&A pair, already sized appropriately, so the chunker
 * never splits it: it returns at most one ChunkOutput.
 *
 * @param content  Q&A text, expected to look like "Q: ...\n\nA: ...".
 * @param filePath Accepted to satisfy the shared chunker signature; not read here.
 * @param config   Accepted to satisfy the shared chunker signature; not read here.
 * @returns An empty array when `content` is empty or whitespace-only;
 *          otherwise a single-element array whose `content` is the trimmed
 *          text, whose `title` is the text of the first line after the
 *          leading "Q:" (or `undefined` when the text does not start with
 *          "Q:"), and whose `chunkIndex` is 0.
 */
export function chunkQa(content, filePath, config) {
    // Trim once and use the same text for both the title match and the
    // emitted chunk. The regex below is anchored with `^`, so matching
    // against the untrimmed input would lose the title whenever the
    // provider's text carries leading whitespace or blank lines.
    const trimmed = content ? content.trim() : '';
    if (!trimmed) {
        return [];
    }
    // The content is already formatted as "Q: ...\n\nA: ..." by the provider.
    // Extract the question (up to the first newline) for use as title.
    const questionMatch = trimmed.match(/^Q:\s*(.+?)(?:\n|$)/);
    const title = questionMatch ? questionMatch[1].trim() : undefined;
    return [{
            content: trimmed,
            title,
            chunkIndex: 0,
        }];
}
|
|
22
|
+
//# sourceMappingURL=qa.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"qa.js","sourceRoot":"","sources":["../../../src/indexing/chunking/qa.ts"],"names":[],"mappings":"AAAA,2DAA2D;AAC3D,2EAA2E;AAI3E;;;;GAIG;AACH,MAAM,UAAU,OAAO,CAAC,OAAe,EAAE,QAAgB,EAAE,MAAoB;IAC3E,IAAI,CAAC,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;QAC9B,OAAO,EAAE,CAAC;IACd,CAAC;IAED,0EAA0E;IAC1E,yCAAyC;IACzC,MAAM,aAAa,GAAG,OAAO,CAAC,KAAK,CAAC,qBAAqB,CAAC,CAAC;IAC3D,MAAM,KAAK,GAAG,aAAa,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;IAElE,OAAO,CAAC;YACJ,OAAO,EAAE,OAAO,CAAC,IAAI,EAAE;YACvB,KAAK;YACL,UAAU,EAAE,CAAC;SAChB,CAAC,CAAC;AACP,CAAC"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import OpenAI from 'openai';
|
|
2
|
+
export interface ThreadMessage {
|
|
3
|
+
author: string;
|
|
4
|
+
content: string;
|
|
5
|
+
timestamp: string;
|
|
6
|
+
reactions?: Array<{
|
|
7
|
+
name: string;
|
|
8
|
+
count: number;
|
|
9
|
+
}>;
|
|
10
|
+
}
|
|
11
|
+
export interface DistilledPair {
|
|
12
|
+
question: string;
|
|
13
|
+
answer: string;
|
|
14
|
+
confidence: number;
|
|
15
|
+
}
|
|
16
|
+
export interface DistillerResult {
|
|
17
|
+
pairs: DistilledPair[];
|
|
18
|
+
}
|
|
19
|
+
export interface DistillerOptions {
|
|
20
|
+
model?: string;
|
|
21
|
+
maxMessages?: number;
|
|
22
|
+
apiKey?: string;
|
|
23
|
+
client?: OpenAI;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Distill a conversation thread into Q&A pairs using an LLM.
|
|
27
|
+
*/
|
|
28
|
+
export declare function distillThread(messages: ThreadMessage[], options?: DistillerOptions): Promise<DistillerResult>;
|
|
29
|
+
//# sourceMappingURL=distiller.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"distiller.d.ts","sourceRoot":"","sources":["../../src/indexing/distiller.ts"],"names":[],"mappings":"AAGA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAI5B,MAAM,WAAW,aAAa;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;CACtD;AAED,MAAM,WAAW,aAAa;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,eAAe;IAC5B,KAAK,EAAE,aAAa,EAAE,CAAC;CAC1B;AAED,MAAM,WAAW,gBAAgB;IAC7B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACnB;AAuCD;;GAEG;AACH,wBAAsB,aAAa,CAC/B,QAAQ,EAAE,aAAa,EAAE,EACzB,OAAO,CAAC,EAAE,gBAAgB,GAC3B,OAAO,CAAC,eAAe,CAAC,CAyE1B"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
// LLM thread distiller — extracts Q&A pairs from conversation threads.
|
|
2
|
+
// Source-agnostic: takes structured messages, returns structured Q&A pairs.
|
|
3
|
+
import OpenAI from 'openai';
|
|
4
|
+
// ── Constants ────────────────────────────────────────────────────────────────
|
|
5
|
+
const DEFAULT_MODEL = 'gpt-4o-mini';
|
|
6
|
+
const DEFAULT_MAX_MESSAGES = 100;
|
|
7
|
+
const SYSTEM_PROMPT = `You are a Q&A extraction engine. Given a conversation thread, identify distinct question-answer pairs.
|
|
8
|
+
|
|
9
|
+
For each pair:
|
|
10
|
+
1. Extract the core question (rephrase if needed for clarity)
|
|
11
|
+
2. Extract the best answer (synthesize from multiple replies if needed)
|
|
12
|
+
3. Score confidence from 0.0 to 1.0 based on:
|
|
13
|
+
- Answer completeness (does it fully address the question?)
|
|
14
|
+
- Questioner satisfaction signals ("thanks", "that worked", etc.)
|
|
15
|
+
- Community validation (reactions like thumbsup, check marks)
|
|
16
|
+
- Answer specificity (concrete steps vs vague suggestions)
|
|
17
|
+
|
|
18
|
+
Return JSON with this exact structure:
|
|
19
|
+
{
|
|
20
|
+
"pairs": [
|
|
21
|
+
{
|
|
22
|
+
"question": "How do I configure X?",
|
|
23
|
+
"answer": "You can configure X by...",
|
|
24
|
+
"confidence": 0.85
|
|
25
|
+
}
|
|
26
|
+
]
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
Rules:
|
|
30
|
+
- A thread may contain multiple Q&A pairs (follow-up questions)
|
|
31
|
+
- Skip greetings, pleasantries, and off-topic tangents
|
|
32
|
+
- If no clear Q&A exists, return {"pairs": []}
|
|
33
|
+
- Keep answers concise but complete (aim for 1-3 paragraphs)
|
|
34
|
+
- Preserve code blocks, URLs, and technical details from answers
|
|
35
|
+
- Confidence below 0.3 means the answer is likely incomplete or wrong`;
|
|
36
|
+
// ── Distiller ────────────────────────────────────────────────────────────────
|
|
37
|
+
/**
 * Distill a conversation thread into Q&A pairs using an LLM.
 *
 * The first `options.maxMessages` messages (default DEFAULT_MAX_MESSAGES)
 * are rendered as a transcript, one entry per message in the form
 * `[timestamp] author: content`, with a ` [reactions: ...]` suffix when the
 * message has reactions, and entries separated by a blank line. The
 * transcript is sent as the user message of a chat completion that
 * requests a JSON object response.
 *
 * @param messages Thread messages, in the order they should appear in the transcript.
 * @param options  Optional overrides: `model`, `maxMessages`, `apiKey`, and a
 *                 pre-built `client`. When `client` is absent a new OpenAI
 *                 client is constructed with `apiKey`.
 * @returns `{ pairs }` containing only pairs with a non-blank string
 *          `question`, a non-blank string `answer`, and a numeric
 *          `confidence` within [0, 1]. Returns `{ pairs: [] }` when
 *          `messages` is empty, the completion has no content, the content
 *          is not valid JSON, or the parsed value has no `pairs` array.
 * @throws Any error other than a JSON `SyntaxError` raised while calling the
 *         client or processing its response is re-thrown to the caller.
 */
export async function distillThread(messages, options) {
    const model = options?.model ?? DEFAULT_MODEL;
    const maxMessages = options?.maxMessages ?? DEFAULT_MAX_MESSAGES;
    if (messages.length === 0) {
        return { pairs: [] };
    }
    // Truncate to max messages
    const truncated = messages.slice(0, maxMessages);
    // Format as conversation transcript
    const transcript = truncated.map(msg => {
        const reactions = msg.reactions && msg.reactions.length > 0
            ? ` [reactions: ${msg.reactions.map(r => `:${r.name}: x${r.count}`).join(', ')}]`
            : '';
        return `[${msg.timestamp}] ${msg.author}: ${msg.content}${reactions}`;
    }).join('\n\n');
    const client = options?.client ?? new OpenAI({ apiKey: options?.apiKey });
    try {
        const response = await client.chat.completions.create({
            model,
            messages: [
                { role: 'system', content: SYSTEM_PROMPT },
                { role: 'user', content: transcript },
            ],
            response_format: { type: 'json_object' },
            temperature: 0.1,
        });
        const content = response.choices[0]?.message?.content;
        if (!content) {
            console.warn('[distiller] Empty response from LLM');
            return { pairs: [] };
        }
        const parsed = JSON.parse(content);
        // Validate structure. JSON.parse can legally yield null or a
        // primitive (e.g. the text "null"); guard before dereferencing so
        // that case is reported as an invalid structure instead of
        // surfacing as a TypeError that looks like an API failure.
        if (parsed === null || typeof parsed !== 'object' || !Array.isArray(parsed.pairs)) {
            console.warn('[distiller] Invalid response structure — missing pairs array');
            return { pairs: [] };
        }
        // Validate and filter each pair
        const validPairs = [];
        for (const pair of parsed.pairs) {
            // An entry may itself be null or a primitive; treat it as
            // malformed rather than letting property access throw.
            const isObject = pair !== null && typeof pair === 'object';
            if (isObject &&
                typeof pair.question === 'string' && pair.question.trim() &&
                typeof pair.answer === 'string' && pair.answer.trim() &&
                typeof pair.confidence === 'number' &&
                pair.confidence >= 0 && pair.confidence <= 1) {
                validPairs.push({
                    question: pair.question.trim(),
                    answer: pair.answer.trim(),
                    confidence: pair.confidence,
                });
            }
            else {
                console.warn('[distiller] Skipping malformed pair:', JSON.stringify(pair).slice(0, 200));
            }
        }
        return { pairs: validPairs };
    }
    catch (error) {
        if (error instanceof SyntaxError) {
            console.error('[distiller] Failed to parse LLM JSON response:', error.message);
            return { pairs: [] };
        }
        throw error; // Re-throw API errors for caller to handle
    }
}
|
|
104
|
+
//# sourceMappingURL=distiller.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"distiller.js","sourceRoot":"","sources":["../../src/indexing/distiller.ts"],"names":[],"mappings":"AAAA,uEAAuE;AACvE,4EAA4E;AAE5E,OAAO,MAAM,MAAM,QAAQ,CAAC;AA4B5B,gFAAgF;AAEhF,MAAM,aAAa,GAAG,aAAa,CAAC;AACpC,MAAM,oBAAoB,GAAG,GAAG,CAAC;AAEjC,MAAM,aAAa,GAAG;;;;;;;;;;;;;;;;;;;;;;;;;;;;sEA4BgD,CAAC;AAEvE,gFAAgF;AAEhF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAC/B,QAAyB,EACzB,OAA0B;IAE1B,MAAM,KAAK,GAAG,OAAO,EAAE,KAAK,IAAI,aAAa,CAAC;IAC9C,MAAM,WAAW,GAAG,OAAO,EAAE,WAAW,IAAI,oBAAoB,CAAC;IAEjE,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;IACzB,CAAC;IAED,2BAA2B;IAC3B,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC,CAAC;IAEjD,oCAAoC;IACpC,MAAM,UAAU,GAAG,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE;QACnC,MAAM,SAAS,GAAG,GAAG,CAAC,SAAS,IAAI,GAAG,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC;YACvD,CAAC,CAAC,gBAAgB,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,IAAI,MAAM,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG;YACjF,CAAC,CAAC,EAAE,CAAC;QACT,OAAO,IAAI,GAAG,CAAC,SAAS,KAAK,GAAG,CAAC,MAAM,KAAK,GAAG,CAAC,OAAO,GAAG,SAAS,EAAE,CAAC;IAC1E,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAEhB,MAAM,MAAM,GAAG,OAAO,EAAE,MAAM,IAAI,IAAI,MAAM,CAAC,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;IAE1E,IAAI,CAAC;QACD,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC;YAClD,KAAK;YACL,QAAQ,EAAE;gBACN,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,aAAa,EAAE;gBAC1C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE;aACxC;YACD,eAAe,EAAE,EAAE,IAAI,EAAE,aAAa,EAAE;YACxC,WAAW,EAAE,GAAG;SACnB,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;QACtD,IAAI,CAAC,OAAO,EAAE,CAAC;YACX,OAAO,CAAC,IAAI,CAAC,qCAAqC,CAAC,CAAC;YACpD,OAAO,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QACzB,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAEnC,qBAAqB;QACrB,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,IAAI,CAAC,8DAA8D,CAAC,CAAC;YAC7E,OAAO,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QACzB,CAAC;QAED,gCAAgC;QAChC,MAAM,UAAU,GAAoB,EAAE,CAAC
;QACvC,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,KAAK,EAAE,CAAC;YAC9B,IACI,OAAO,IAAI,CAAC,QAAQ,KAAK,QAAQ,IAAI,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE;gBACzD,OAAO,IAAI,CAAC,MAAM,KAAK,QAAQ,IAAI,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE;gBACrD,OAAO,IAAI,CAAC,UAAU,KAAK,QAAQ;gBACnC,IAAI,CAAC,UAAU,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,IAAI,CAAC,EAC9C,CAAC;gBACC,UAAU,CAAC,IAAI,CAAC;oBACZ,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE;oBAC9B,MAAM,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI,EAAE;oBAC1B,UAAU,EAAE,IAAI,CAAC,UAAU;iBAC9B,CAAC,CAAC;YACP,CAAC;iBAAM,CAAC;gBACJ,OAAO,CAAC,IAAI,CAAC,sCAAsC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;YAC7F,CAAC;QACL,CAAC;QAED,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,CAAC;IACjC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,IAAI,KAAK,YAAY,WAAW,EAAE,CAAC;YAC/B,OAAO,CAAC,KAAK,CAAC,gDAAgD,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;YAC/E,OAAO,EAAE,KAAK,EAAE,EAAE,EAAE,CAAC;QACzB,CAAC;QACD,MAAM,KAAK,CAAC,CAAC,2CAA2C;IAC5D,CAAC;AACL,CAAC"}
|
|
@@ -4,6 +4,7 @@ export declare class IndexingOrchestrator {
|
|
|
4
4
|
private processing;
|
|
5
5
|
private activeSources;
|
|
6
6
|
private lastReindexDate;
|
|
7
|
+
onReindexComplete?: (sourceNames: string[]) => void;
|
|
7
8
|
constructor();
|
|
8
9
|
/**
|
|
9
10
|
* Smart startup check: compare DB commit SHAs against remote HEAD.
|
|
@@ -12,10 +13,10 @@ export declare class IndexingOrchestrator {
|
|
|
12
13
|
*/
|
|
13
14
|
checkAndIndex(): Promise<void>;
|
|
14
15
|
/**
|
|
15
|
-
* Get the
|
|
16
|
-
*
|
|
16
|
+
* Get the current state token for a source without acquiring items.
|
|
17
|
+
* Returns null if the source is unavailable.
|
|
17
18
|
*/
|
|
18
|
-
private
|
|
19
|
+
private getSourceStateToken;
|
|
19
20
|
/**
|
|
20
21
|
* Queue a full re-index of all sources. Returns immediately.
|
|
21
22
|
*/
|
|
@@ -24,6 +25,11 @@ export declare class IndexingOrchestrator {
|
|
|
24
25
|
* Queue an incremental re-index for a specific repo. Returns immediately.
|
|
25
26
|
*/
|
|
26
27
|
queueIncrementalReindex(repoUrl: string): void;
|
|
28
|
+
/**
|
|
29
|
+
* Queue a reindex for a single named source. Returns immediately.
|
|
30
|
+
* Used by webhook handlers to trigger reindexing of specific sources.
|
|
31
|
+
*/
|
|
32
|
+
queueSourceReindex(sourceName: string): void;
|
|
27
33
|
/**
|
|
28
34
|
* Returns true if any indexing job is currently running.
|
|
29
35
|
*/
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"orchestrator.d.ts","sourceRoot":"","sources":["../../src/indexing/orchestrator.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"orchestrator.d.ts","sourceRoot":"","sources":["../../src/indexing/orchestrator.ts"],"names":[],"mappings":"AAqCA,qBAAa,oBAAoB;IAC7B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,UAAU,CAAS;IAG3B,OAAO,CAAC,aAAa,CAAqB;IAG1C,OAAO,CAAC,eAAe,CAAuB;IAG9C,iBAAiB,CAAC,EAAE,CAAC,WAAW,EAAE,MAAM,EAAE,KAAK,IAAI,CAAC;;IAMpD;;;;OAIG;IACG,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IA+GpC;;;OAGG;YACW,mBAAmB;IAYjC;;OAEG;IACH,gBAAgB,IAAI,IAAI;IAQxB;;OAEG;IACH,uBAAuB,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI;IAU9C;;;OAGG;IACH,kBAAkB,CAAC,UAAU,EAAE,MAAM,GAAG,IAAI;IAQ5C;;OAEG;IACH,UAAU,IAAI,OAAO;IAIrB;;;OAGG;IACH,mBAAmB,IAAI,IAAI;IA4B3B;;OAEG;IACH,OAAO,CAAC,OAAO;IASf;;OAEG;YACW,KAAK;IAwBnB;;OAEG;YACW,UAAU;IA8DxB;;OAEG;YACW,cAAc;IAqB5B;;OAEG;YACW,qBAAqB;IAiBnC;;OAEG;YACW,oBAAoB;IAsDlC;;OAEG;YACW,cAAc;IAmB5B;;OAEG;YACW,cAAc;CAgB/B"}
|