expxagents 0.25.2 → 0.25.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/dist/cli/src/commands/info.d.ts +2 -1
  2. package/dist/cli/src/commands/login.d.ts +2 -1
  3. package/dist/cli/src/commands/logout.d.ts +2 -1
  4. package/dist/cli/src/commands/outdated.d.ts +2 -1
  5. package/dist/cli/src/commands/publish.d.ts +2 -1
  6. package/dist/cli/src/commands/registry-install.d.ts +2 -1
  7. package/dist/cli/src/commands/search.d.ts +2 -1
  8. package/dist/cli/src/commands/update.d.ts +2 -1
  9. package/dist/cli/src/commands/whoami.d.ts +2 -1
  10. package/dist/cli/src/utils/server-paths.js +17 -5
  11. package/dist/server/scheduler/__tests__/scheduler-service.test.js +1 -1
  12. package/dist/server/scheduler/__tests__/scheduler-service.test.js.map +1 -1
  13. package/dist/server/scheduler/scheduler-service.js +2 -2
  14. package/dist/server/scheduler/scheduler-service.js.map +1 -1
  15. package/node_modules/expxagents-knowledge/dist/config.d.ts +4 -0
  16. package/node_modules/expxagents-knowledge/dist/config.d.ts.map +1 -0
  17. package/node_modules/expxagents-knowledge/dist/config.js +37 -0
  18. package/node_modules/expxagents-knowledge/dist/config.js.map +1 -0
  19. package/node_modules/expxagents-knowledge/dist/db/connection.d.ts +6 -0
  20. package/node_modules/expxagents-knowledge/dist/db/connection.d.ts.map +1 -0
  21. package/node_modules/expxagents-knowledge/dist/db/connection.js +69 -0
  22. package/node_modules/expxagents-knowledge/dist/db/connection.js.map +1 -0
  23. package/node_modules/expxagents-knowledge/dist/db/migrations.d.ts +3 -0
  24. package/node_modules/expxagents-knowledge/dist/db/migrations.d.ts.map +1 -0
  25. package/node_modules/expxagents-knowledge/dist/db/migrations.js +46 -0
  26. package/node_modules/expxagents-knowledge/dist/db/migrations.js.map +1 -0
  27. package/node_modules/expxagents-knowledge/dist/db/schema.d.ts +3 -0
  28. package/node_modules/expxagents-knowledge/dist/db/schema.d.ts.map +1 -0
  29. package/node_modules/expxagents-knowledge/dist/db/schema.js +79 -0
  30. package/node_modules/expxagents-knowledge/dist/db/schema.js.map +1 -0
  31. package/node_modules/expxagents-knowledge/dist/index.d.ts +16 -0
  32. package/node_modules/expxagents-knowledge/dist/index.d.ts.map +1 -0
  33. package/node_modules/expxagents-knowledge/dist/index.js +16 -0
  34. package/node_modules/expxagents-knowledge/dist/index.js.map +1 -0
  35. package/node_modules/expxagents-knowledge/dist/ingest/chunker.d.ts +10 -0
  36. package/node_modules/expxagents-knowledge/dist/ingest/chunker.d.ts.map +1 -0
  37. package/node_modules/expxagents-knowledge/dist/ingest/chunker.js +221 -0
  38. package/node_modules/expxagents-knowledge/dist/ingest/chunker.js.map +1 -0
  39. package/node_modules/expxagents-knowledge/dist/ingest/document-loader.d.ts +4 -0
  40. package/node_modules/expxagents-knowledge/dist/ingest/document-loader.d.ts.map +1 -0
  41. package/node_modules/expxagents-knowledge/dist/ingest/document-loader.js +56 -0
  42. package/node_modules/expxagents-knowledge/dist/ingest/document-loader.js.map +1 -0
  43. package/node_modules/expxagents-knowledge/dist/ingest/embedder.d.ts +4 -0
  44. package/node_modules/expxagents-knowledge/dist/ingest/embedder.d.ts.map +1 -0
  45. package/node_modules/expxagents-knowledge/dist/ingest/embedder.js +25 -0
  46. package/node_modules/expxagents-knowledge/dist/ingest/embedder.js.map +1 -0
  47. package/node_modules/expxagents-knowledge/dist/ingest/entity-extractor.d.ts +21 -0
  48. package/node_modules/expxagents-knowledge/dist/ingest/entity-extractor.d.ts.map +1 -0
  49. package/node_modules/expxagents-knowledge/dist/ingest/entity-extractor.js +54 -0
  50. package/node_modules/expxagents-knowledge/dist/ingest/entity-extractor.js.map +1 -0
  51. package/node_modules/expxagents-knowledge/dist/ingest/extraction-queue.d.ts +16 -0
  52. package/node_modules/expxagents-knowledge/dist/ingest/extraction-queue.d.ts.map +1 -0
  53. package/node_modules/expxagents-knowledge/dist/ingest/extraction-queue.js +49 -0
  54. package/node_modules/expxagents-knowledge/dist/ingest/extraction-queue.js.map +1 -0
  55. package/node_modules/expxagents-knowledge/dist/ingest/pdf-extractor.d.ts +9 -0
  56. package/node_modules/expxagents-knowledge/dist/ingest/pdf-extractor.d.ts.map +1 -0
  57. package/node_modules/expxagents-knowledge/dist/ingest/pdf-extractor.js +116 -0
  58. package/node_modules/expxagents-knowledge/dist/ingest/pdf-extractor.js.map +1 -0
  59. package/node_modules/expxagents-knowledge/dist/ingest/pipeline.d.ts +27 -0
  60. package/node_modules/expxagents-knowledge/dist/ingest/pipeline.d.ts.map +1 -0
  61. package/node_modules/expxagents-knowledge/dist/ingest/pipeline.js +92 -0
  62. package/node_modules/expxagents-knowledge/dist/ingest/pipeline.js.map +1 -0
  63. package/node_modules/expxagents-knowledge/dist/query/graph-traversal.d.ts +41 -0
  64. package/node_modules/expxagents-knowledge/dist/query/graph-traversal.d.ts.map +1 -0
  65. package/node_modules/expxagents-knowledge/dist/query/graph-traversal.js +62 -0
  66. package/node_modules/expxagents-knowledge/dist/query/graph-traversal.js.map +1 -0
  67. package/node_modules/expxagents-knowledge/dist/query/knowledge-query.d.ts +31 -0
  68. package/node_modules/expxagents-knowledge/dist/query/knowledge-query.d.ts.map +1 -0
  69. package/node_modules/expxagents-knowledge/dist/query/knowledge-query.js +106 -0
  70. package/node_modules/expxagents-knowledge/dist/query/knowledge-query.js.map +1 -0
  71. package/node_modules/expxagents-knowledge/dist/query/vector-search.d.ts +26 -0
  72. package/node_modules/expxagents-knowledge/dist/query/vector-search.d.ts.map +1 -0
  73. package/node_modules/expxagents-knowledge/dist/query/vector-search.js +57 -0
  74. package/node_modules/expxagents-knowledge/dist/query/vector-search.js.map +1 -0
  75. package/node_modules/expxagents-knowledge/dist/sources/agent-output.d.ts +10 -0
  76. package/node_modules/expxagents-knowledge/dist/sources/agent-output.d.ts.map +1 -0
  77. package/node_modules/expxagents-knowledge/dist/sources/agent-output.js +29 -0
  78. package/node_modules/expxagents-knowledge/dist/sources/agent-output.js.map +1 -0
  79. package/node_modules/expxagents-knowledge/dist/sources/watcher.d.ts +6 -0
  80. package/node_modules/expxagents-knowledge/dist/sources/watcher.d.ts.map +1 -0
  81. package/node_modules/expxagents-knowledge/dist/sources/watcher.js +42 -0
  82. package/node_modules/expxagents-knowledge/dist/sources/watcher.js.map +1 -0
  83. package/node_modules/expxagents-knowledge/dist/types.d.ts +138 -0
  84. package/node_modules/expxagents-knowledge/dist/types.d.ts.map +1 -0
  85. package/node_modules/expxagents-knowledge/dist/types.js +2 -0
  86. package/node_modules/expxagents-knowledge/dist/types.js.map +1 -0
  87. package/node_modules/expxagents-knowledge/package.json +7 -0
  88. package/package.json +2 -2
@@ -0,0 +1,221 @@
1
/**
 * Rough token estimate for a piece of text.
 *
 * Uses a fixed ratio of four UTF-16 code units per token, rounded up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
export function estimateTokens(text) {
  const charsPerToken = 4;
  const estimate = text.length / charsPerToken;
  return Math.ceil(estimate);
}
4
/**
 * Splits a Markdown document into indexed chunks.
 *
 * The document is first divided by splitByHeadings. A section whose token
 * estimate is within `options.max_tokens` becomes a single chunk; any other
 * section is handed to splitByParagraphs and each returned piece becomes a
 * chunk. Every chunk records the heading path of the section it came from.
 *
 * @param {string} content - Markdown text.
 * @param {object} options - Chunking options; `max_tokens` is read here and the
 *   whole object is forwarded to splitByParagraphs.
 * @returns {Array<object>} Chunks shaped as
 *   `{ content, chunk_index, token_count, metadata: { heading_path } }`;
 *   empty when `content` is blank.
 */
export function chunkMarkdown(content, options) {
  if (!content.trim()) {
    return [];
  }

  const chunks = [];
  // chunk_index always equals the number of chunks emitted so far.
  const emit = (text, tokens, headingPath) => {
    chunks.push({
      content: text,
      chunk_index: chunks.length,
      token_count: tokens,
      metadata: { heading_path: headingPath },
    });
  };

  for (const section of splitByHeadings(content)) {
    const headingPath = section.heading_path;
    const wholeTokens = estimateTokens(section.content);

    if (wholeTokens <= options.max_tokens) {
      emit(section.content, wholeTokens, headingPath);
      continue;
    }

    splitByParagraphs(section.content, options).forEach((piece) => {
      emit(piece, estimateTokens(piece), headingPath);
    });
  }

  return chunks;
}
34
/**
 * Splits Markdown into sections, starting a new section at every ATX heading
 * (`#` through `######`).
 *
 * Each section keeps its own heading line as the first line of `content` and
 * carries a `heading_path` made of the enclosing heading titles joined with
 * ' > '. Text that appears before the first heading gets an empty path.
 *
 * Differences from a naive line scan:
 * - Line endings are normalised (`\r\n` -> `\n`) so headings in CRLF files are
 *   recognised; otherwise the trailing `\r` defeats the heading pattern.
 * - Lines inside fenced code blocks (``` or ~~~) are never treated as
 *   headings, so shell comments such as `# install` stay inside their section.
 * - When heading levels are skipped (e.g. a document that starts with `##`),
 *   the missing levels are omitted from the path instead of producing empty
 *   segments like ' > Setup'.
 *
 * @param {string} content - Markdown text.
 * @returns {Array<{heading_path: string, content: string}>} Non-empty sections
 *   in document order.
 */
function splitByHeadings(content) {
  const sections = [];
  let currentHeadings = [];
  let currentLines = [];
  // Marker (e.g. '```' or '~~~~') of the fence currently open, or null.
  let openFence = null;

  const flushSection = () => {
    const text = currentLines.join('\n').trim();
    if (text) {
      sections.push({
        // filter(Boolean) drops holes left by skipped heading levels.
        heading_path: currentHeadings.filter(Boolean).join(' > '),
        content: text,
      });
    }
    currentLines = [];
  };

  for (const line of content.split(/\r?\n/)) {
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    if (fence) {
      const [, marker, rest] = fence;
      if (openFence === null) {
        // A backtick fence whose info string contains a backtick is inline
        // code, not a fence opener.
        const isInlineCode = marker[0] === '`' && rest.includes('`');
        if (!isInlineCode) {
          openFence = marker;
        }
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        rest.trim() === ''
      ) {
        openFence = null;
      }
      currentLines.push(line);
      continue;
    }

    const headingMatch = openFence === null ? line.match(/^(#{1,6})\s+(.+)$/) : null;
    if (!headingMatch) {
      currentLines.push(line);
      continue;
    }

    if (currentLines.length > 0) {
      flushSection();
    }
    const level = headingMatch[1].length;
    // Drop every heading at this depth or deeper, then record the new title.
    currentHeadings = currentHeadings.slice(0, level - 1);
    currentHeadings[level - 1] = headingMatch[2].trim();
    currentLines.push(line);
  }

  if (currentLines.length > 0) {
    flushSection();
  }
  return sections;
}
74
/**
 * Packs the blocks of one section into chunks of roughly `options.max_tokens`.
 *
 * Blocks come from splitPreservingCodeBlocks and are joined with a blank line.
 * A chunk is closed as soon as adding the next block would exceed the limit
 * (a single block larger than the limit still becomes its own chunk). When
 * `options.overlap` is positive, trailing blocks of the closed chunk are
 * repeated at the start of the next one, up to
 * `floor(max_tokens * overlap)` estimated tokens.
 *
 * @param {string} content - Section text.
 * @param {object} options - Reads `max_tokens` and `overlap`.
 * @returns {string[]} Chunk texts in order.
 */
function splitByParagraphs(content, options) {
  const chunks = [];
  let pending = [];
  let pendingTokens = 0;

  // Chooses which trailing blocks of a just-closed chunk seed the next chunk.
  const selectOverlap = (closed) => {
    const carried = [];
    let carriedTokens = 0;
    if (options.overlap > 0) {
      const budget = Math.floor(options.max_tokens * options.overlap);
      let idx = closed.length - 1;
      while (idx >= 0) {
        const size = estimateTokens(closed[idx]);
        if (carriedTokens + size > budget) {
          break;
        }
        carried.unshift(closed[idx]);
        carriedTokens += size;
        idx -= 1;
      }
    }
    return { carried, carriedTokens };
  };

  for (const block of splitPreservingCodeBlocks(content)) {
    const blockTokens = estimateTokens(block);
    const overflows = pendingTokens + blockTokens > options.max_tokens;

    if (overflows && pending.length > 0) {
      chunks.push(pending.join('\n\n'));
      const seed = selectOverlap(pending);
      pending = seed.carried;
      pendingTokens = seed.carriedTokens;
    }

    pending.push(block);
    pendingTokens += blockTokens;
  }

  if (pending.length > 0) {
    chunks.push(pending.join('\n\n'));
  }
  return chunks;
}
110
+ const PAGE_SEPARATOR_REGEX = /^\[PAGE (\d+)\]$/;
111
/**
 * Splits plain text into indexed chunks, page by page.
 *
 * The text is divided into pages by splitByPages. Within a page, paragraphs
 * (separated by a blank line) are packed into chunks: a chunk is closed when
 * the next paragraph would push it past `options.max_tokens`, and when
 * `options.overlap` is positive the trailing paragraphs of the closed chunk
 * (up to `floor(max_tokens * overlap)` estimated tokens) start the next one.
 * Chunks never span pages; `chunk_index` runs across the whole document.
 *
 * @param {string} content - Plain text, optionally containing page markers.
 * @param {object} options - Reads `max_tokens` and `overlap`.
 * @returns {Array<object>} Chunks shaped as
 *   `{ content, chunk_index, token_count, metadata: { page_number } }`;
 *   empty when `content` is blank.
 */
export function chunkPlainText(content, options) {
  if (!content.trim()) {
    return [];
  }

  const results = [];
  let nextIndex = 0;

  for (const pageSection of splitByPages(content)) {
    const paragraphs = pageSection.content.split(/\n\n/).filter((p) => p.trim());
    const pageChunks = [];
    let pending = [];
    let pendingTokens = 0;

    for (const paragraph of paragraphs) {
      const paragraphTokens = estimateTokens(paragraph);
      const overflows = pendingTokens + paragraphTokens > options.max_tokens;

      if (overflows && pending.length > 0) {
        pageChunks.push(pending.join('\n\n'));

        // Walk backwards over the closed chunk to collect the overlap seed.
        const carried = [];
        let carriedTokens = 0;
        if (options.overlap > 0) {
          const budget = Math.floor(options.max_tokens * options.overlap);
          let idx = pending.length - 1;
          while (idx >= 0) {
            const size = estimateTokens(pending[idx]);
            if (carriedTokens + size > budget) {
              break;
            }
            carried.unshift(pending[idx]);
            carriedTokens += size;
            idx -= 1;
          }
        }
        pending = carried;
        pendingTokens = carriedTokens;
      }

      pending.push(paragraph);
      pendingTokens += paragraphTokens;
    }

    if (pending.length > 0) {
      pageChunks.push(pending.join('\n\n'));
    }

    pageChunks.forEach((text) => {
      results.push({
        content: text,
        chunk_index: nextIndex,
        token_count: estimateTokens(text),
        metadata: { page_number: pageSection.page },
      });
      nextIndex += 1;
    });
  }

  return results;
}
162
/**
 * Splits text into per-page sections using `[PAGE N]` marker lines.
 *
 * A marker line starts page N; the marker itself is not part of any page's
 * content. Text before the first marker is attributed to page 1, so input
 * without markers yields a single page-1 section. Pages whose text is blank
 * are omitted.
 *
 * Marker lines are matched after stripping trailing whitespace, so markers in
 * CRLF text (`[PAGE 2]\r`) are still recognised. Input that consists only of
 * markers yields no sections, rather than a section whose content is the raw
 * marker text.
 *
 * @param {string} content - Text to split.
 * @returns {Array<{page: number, content: string}>} Non-empty page sections in
 *   order of appearance.
 */
function splitByPages(content) {
  const sections = [];
  let currentPage = 1;
  let currentLines = [];

  const flushPage = () => {
    const text = currentLines.join('\n').trim();
    if (text) {
      sections.push({ page: currentPage, content: text });
    }
    currentLines = [];
  };

  for (const line of content.split('\n')) {
    const match = line.trimEnd().match(PAGE_SEPARATOR_REGEX);
    if (match) {
      flushPage();
      // [PAGE N] marks the start of page N content
      currentPage = Number.parseInt(match[1], 10);
    } else {
      currentLines.push(line);
    }
  }
  flushPage();

  return sections;
}
192
/**
 * Splits text on blank lines while keeping fenced code blocks in one piece.
 *
 * A paragraph containing an odd number of ``` sequences toggles "inside a
 * fence". Paragraphs from the opening toggle through the closing toggle are
 * re-joined with a blank line and emitted as one block. Outside a fence,
 * whitespace-only paragraphs are dropped. An unclosed fence is emitted as a
 * single trailing block.
 *
 * @param {string} content - Text to split.
 * @returns {string[]} Blocks in document order.
 */
function splitPreservingCodeBlocks(content) {
  const output = [];
  let fenced = [];
  let insideFence = false;

  content.split(/\n\n/).forEach((paragraph) => {
    const fenceMarkers = paragraph.match(/```/g) || [];
    const togglesFence = fenceMarkers.length % 2 === 1;

    if (insideFence) {
      fenced.push(paragraph);
      if (togglesFence) {
        output.push(fenced.join('\n\n'));
        fenced = [];
        insideFence = false;
      }
      return;
    }

    if (togglesFence) {
      insideFence = true;
      fenced.push(paragraph);
      return;
    }

    if (paragraph.trim()) {
      output.push(paragraph);
    }
  });

  if (fenced.length > 0) {
    output.push(fenced.join('\n\n'));
  }
  return output;
}
221
+ //# sourceMappingURL=chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.js","sourceRoot":"","sources":["../../src/ingest/chunker.ts"],"names":[],"mappings":"AAOA,MAAM,UAAU,cAAc,CAAC,IAAY;IACzC,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;AACpC,CAAC;AAOD,MAAM,UAAU,aAAa,CAAC,OAAe,EAAE,OAAuB;IACpE,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAE/B,MAAM,QAAQ,GAAG,eAAe,CAAC,OAAO,CAAC,CAAC;IAC1C,MAAM,OAAO,GAAkB,EAAE,CAAC;IAClC,IAAI,UAAU,GAAG,CAAC,CAAC;IAEnB,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,aAAa,GAAG,cAAc,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEtD,IAAI,aAAa,IAAI,OAAO,CAAC,UAAU,EAAE,CAAC;YACxC,OAAO,CAAC,IAAI,CAAC;gBACX,OAAO,EAAE,OAAO,CAAC,OAAO;gBACxB,WAAW,EAAE,UAAU,EAAE;gBACzB,WAAW,EAAE,aAAa;gBAC1B,QAAQ,EAAE,EAAE,YAAY,EAAE,OAAO,CAAC,YAAY,EAAE;aACjD,CAAC,CAAC;QACL,CAAC;aAAM,CAAC;YACN,MAAM,SAAS,GAAG,iBAAiB,CAAC,OAAO,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC;YAC9D,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;gBAC5B,OAAO,CAAC,IAAI,CAAC;oBACX,OAAO,EAAE,GAAG;oBACZ,WAAW,EAAE,UAAU,EAAE;oBACzB,WAAW,EAAE,cAAc,CAAC,GAAG,CAAC;oBAChC,QAAQ,EAAE,EAAE,YAAY,EAAE,OAAO,CAAC,YAAY,EAAE;iBACjD,CAAC,CAAC;YACL,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAS,eAAe,CAAC,OAAe;IACtC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAc,EAAE,CAAC;IAC/B,IAAI,eAAe,GAAa,EAAE,CAAC;IACnC,IAAI,YAAY,GAAa,EAAE,CAAC;IAEhC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QACrD,IAAI,YAAY,EAAE,CAAC;YACjB,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC5B,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC5C,IAAI,IAAI,EAAE,CAAC;oBACT,QAAQ,CAAC,IAAI,CAAC;wBACZ,YAAY,EAAE,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE;wBAC/C,OAAO,EAAE,IAAI;qBACd,CAAC,CAAC;gBACL,CAAC;gBACD,YAAY,GAAG,EAAE,CAAC;YACpB,CAAC;YACD,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;YACrC,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;YACrC,eAAe,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;YACtD,eAAe,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,KAAK,CAAC;YACnC,eAAe,GAAG,eAAe,CAAC,KAAK,CAAC
,CAAC,EAAE,KAAK,CAAC,CAAC;YAClD,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;aAAM,CAAC;YACN,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5C,IAAI,IAAI,EAAE,CAAC;YACT,QAAQ,CAAC,IAAI,CAAC;gBACZ,YAAY,EAAE,eAAe,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE;gBAC/C,OAAO,EAAE,IAAI;aACd,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED,SAAS,iBAAiB,CAAC,OAAe,EAAE,OAAuB;IACjE,MAAM,MAAM,GAAG,yBAAyB,CAAC,OAAO,CAAC,CAAC;IAClD,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,OAAO,GAAa,EAAE,CAAC;IAC3B,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,WAAW,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC;QAE1C,IAAI,aAAa,GAAG,WAAW,GAAG,OAAO,CAAC,UAAU,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;YAClC,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC,EAAE,CAAC;gBACxB,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;gBACvE,MAAM,IAAI,GAAa,EAAE,CAAC;gBAC1B,IAAI,UAAU,GAAG,CAAC,CAAC;gBACnB,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;oBAC7C,MAAM,CAAC,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;oBACrC,IAAI,UAAU,GAAG,CAAC,GAAG,aAAa;wBAAE,MAAM;oBAC1C,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;oBACzB,UAAU,IAAI,CAAC,CAAC;gBAClB,CAAC;gBACD,OAAO,GAAG,IAAI,CAAC;gBACf,aAAa,GAAG,UAAU,CAAC;YAC7B,CAAC;iBAAM,CAAC;gBACN,OAAO,GAAG,EAAE,CAAC;gBACb,aAAa,GAAG,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACpB,aAAa,IAAI,WAAW,CAAC;IAC/B,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;IACpC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,MAAM,oBAAoB,GAAG,kBAAkB,CAAC;AAEhD,MAAM,UAAU,cAAc,CAAC,OAAe,EAAE,OAAuB;IACrE,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAE/B,MAAM,YAAY,GAAG,YAAY,CAAC,OAAO,CAAC,CAAC;IAC3C,MAAM,OAAO,GAAkB,EAAE,CAAC;IAClC,IAAI,UAAU,GAAG,CAAC,CAAC;IAEn
B,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;QACnE,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,IAAI,OAAO,GAAa,EAAE,CAAC;QAC3B,IAAI,aAAa,GAAG,CAAC,CAAC;QAEtB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,WAAW,GAAG,cAAc,CAAC,KAAK,CAAC,CAAC;YAE1C,IAAI,aAAa,GAAG,WAAW,GAAG,OAAO,CAAC,UAAU,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;gBAElC,IAAI,OAAO,CAAC,OAAO,GAAG,CAAC,EAAE,CAAC;oBACxB,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;oBACvE,MAAM,IAAI,GAAa,EAAE,CAAC;oBAC1B,IAAI,UAAU,GAAG,CAAC,CAAC;oBACnB,KAAK,IAAI,CAAC,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;wBAC7C,MAAM,CAAC,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;wBACrC,IAAI,UAAU,GAAG,CAAC,GAAG,aAAa;4BAAE,MAAM;wBAC1C,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC;wBACzB,UAAU,IAAI,CAAC,CAAC;oBAClB,CAAC;oBACD,OAAO,GAAG,IAAI,CAAC;oBACf,aAAa,GAAG,UAAU,CAAC;gBAC7B,CAAC;qBAAM,CAAC;oBACN,OAAO,GAAG,EAAE,CAAC;oBACb,aAAa,GAAG,CAAC,CAAC;gBACpB,CAAC;YACH,CAAC;YAED,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACpB,aAAa,IAAI,WAAW,CAAC;QAC/B,CAAC;QAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;QACpC,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,OAAO,CAAC,IAAI,CAAC;gBACX,OAAO,EAAE,KAAK;gBACd,WAAW,EAAE,UAAU,EAAE;gBACzB,WAAW,EAAE,cAAc,CAAC,KAAK,CAAC;gBAClC,QAAQ,EAAE,EAAE,WAAW,EAAE,OAAO,CAAC,IAAI,EAAE;aACxC,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAOD,SAAS,YAAY,CAAC,OAAe;IACnC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAClC,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,IAAI,YAAY,GAAa,EAAE,CAAC;IAChC,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC;QAC/C,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC5B,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC
,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;gBAC5C,IAAI,IAAI,EAAE,CAAC;oBACT,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;gBACtD,CAAC;YACH,CAAC;YACD,6CAA6C;YAC7C,WAAW,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACrC,YAAY,GAAG,EAAE,CAAC;QACpB,CAAC;aAAM,CAAC;YACN,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;IACH,CAAC;IAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5C,IAAI,IAAI,EAAE,CAAC;YACT,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC;QACtD,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,CAAC;AACjE,CAAC;AAED,SAAS,yBAAyB,CAAC,OAAe;IAChD,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;IACpC,IAAI,WAAW,GAAG,KAAK,CAAC;IACxB,IAAI,UAAU,GAAa,EAAE,CAAC;IAE9B,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,UAAU,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAErD,IAAI,WAAW,EAAE,CAAC;YAChB,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACtB,IAAI,UAAU,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;gBACzB,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;gBACrC,UAAU,GAAG,EAAE,CAAC;gBAChB,WAAW,GAAG,KAAK,CAAC;YACtB,CAAC;QACH,CAAC;aAAM,IAAI,UAAU,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAChC,WAAW,GAAG,IAAI,CAAC;YACnB,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxB,CAAC;aAAM,CAAC;YACN,IAAI,IAAI,CAAC,IAAI,EAAE;gBAAE,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IAED,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,4 @@
1
+ import type { ParsedDocument, PdfConfig } from '../types.js';
2
+ export declare function computeHash(content: string | Buffer): string;
3
+ export declare function loadDocument(filePath: string, maxSizeMb?: number, pdfConfig?: PdfConfig): Promise<ParsedDocument>;
4
+ //# sourceMappingURL=document-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"document-loader.d.ts","sourceRoot":"","sources":["../../src/ingest/document-loader.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AAsB7D,wBAAgB,WAAW,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,CAE5D;AAED,wBAAsB,YAAY,CAChC,QAAQ,EAAE,MAAM,EAChB,SAAS,SAAK,EACd,SAAS,CAAC,EAAE,SAAS,GACpB,OAAO,CAAC,cAAc,CAAC,CA2BzB"}
@@ -0,0 +1,56 @@
1
+ import crypto from 'node:crypto';
2
+ import fs from 'node:fs';
3
+ import path from 'node:path';
4
+ import { extractPdf } from './pdf-extractor.js';
5
+ const MIME_MAP = {
6
+ '.md': 'text/markdown',
7
+ '.markdown': 'text/markdown',
8
+ '.txt': 'text/plain',
9
+ '.json': 'application/json',
10
+ '.yaml': 'text/yaml',
11
+ '.yml': 'text/yaml',
12
+ '.pdf': 'application/pdf',
13
+ };
14
+ const SUPPORTED_EXTENSIONS = new Set(Object.keys(MIME_MAP));
15
+ const DEFAULT_PDF_CONFIG = {
16
+ ocr_enabled: true,
17
+ ocr_languages: ['eng'],
18
+ ocr_threshold: 50,
19
+ max_pages: 500,
20
+ };
21
/**
 * Computes the SHA-256 digest of the given content.
 *
 * @param {string|Buffer} content - Data to hash.
 * @returns {string} Lower-case hexadecimal digest.
 */
export function computeHash(content) {
  const hasher = crypto.createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
24
/**
 * Loads a document from disk and returns its title, text and MIME type.
 *
 * The file extension (case-insensitive) must be one of SUPPORTED_EXTENSIONS,
 * and the file must not be larger than `maxSizeMb`. PDFs are delegated to
 * extractPdf and titled with the file name minus its extension; every other
 * type is read as UTF-8 text and titled by extractTitle.
 *
 * @param {string} filePath - Path of the file to load.
 * @param {number} [maxSizeMb=10] - Maximum accepted file size in MiB.
 * @param {object} [pdfConfig] - PDF extraction settings; DEFAULT_PDF_CONFIG is
 *   used when omitted.
 * @returns {Promise<object>} `{ title, content, mime_type }`, plus `metadata`
 *   for PDFs.
 * @throws {Error} When the extension is unsupported or the file is too large;
 *   file-system errors from stat/read propagate unchanged.
 */
export async function loadDocument(filePath, maxSizeMb = 10, pdfConfig) {
  const rawExt = path.extname(filePath);
  const ext = rawExt.toLowerCase();
  if (!SUPPORTED_EXTENSIONS.has(ext)) {
    throw new Error(`Unsupported file type: ${ext}`);
  }

  const stat = fs.statSync(filePath);
  const sizeMb = stat.size / (1024 * 1024);
  if (sizeMb > maxSizeMb) {
    throw new Error(`File too large: ${sizeMb.toFixed(1)}MB exceeds limit of ${maxSizeMb}MB`);
  }

  if (ext === '.pdf') {
    const result = await extractPdf(filePath, pdfConfig ?? DEFAULT_PDF_CONFIG);
    return {
      // Strip the extension as actually written: basename's suffix match is
      // case-sensitive, so a literal '.pdf' would leave "Report.PDF" intact.
      title: path.basename(filePath, rawExt),
      content: result.content,
      mime_type: 'application/pdf',
      metadata: result.metadata,
    };
  }

  // Drop a leading UTF-8 byte-order mark; left in place it precedes the first
  // character and prevents a first-line '# Heading' from being recognised.
  const content = fs.readFileSync(filePath, 'utf-8').replace(/^\uFEFF/, '');
  const mime_type = MIME_MAP[ext];
  const title = extractTitle(content, filePath, mime_type);
  return { title, content, mime_type };
}
48
/**
 * Derives a display title for a document.
 *
 * For Markdown, the text of the first ATX heading (`#` to `######`) is used.
 * The heading marker must be followed by spaces or tabs on the same line, so
 * an empty `#` line cannot borrow the next paragraph as its title, and a
 * heading with no visible text is ignored. In every other case the file name
 * (including its extension) is returned.
 *
 * @param {string} content - Document text.
 * @param {string} filePath - Path the document was loaded from.
 * @param {string} mimeType - MIME type of the document.
 * @returns {string} Heading text or file name.
 */
function extractTitle(content, filePath, mimeType) {
  if (mimeType === 'text/markdown') {
    const match = content.match(/^#{1,6}[ \t]+(.+)$/m);
    const heading = match?.[1].trim();
    if (heading) {
      return heading;
    }
  }
  return path.basename(filePath);
}
56
+ //# sourceMappingURL=document-loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"document-loader.js","sourceRoot":"","sources":["../../src/ingest/document-loader.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,aAAa,CAAC;AACjC,OAAO,EAAE,MAAM,SAAS,CAAC;AACzB,OAAO,IAAI,MAAM,WAAW,CAAC;AAE7B,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAC;AAEhD,MAAM,QAAQ,GAA2B;IACvC,KAAK,EAAE,eAAe;IACtB,WAAW,EAAE,eAAe;IAC5B,MAAM,EAAE,YAAY;IACpB,OAAO,EAAE,kBAAkB;IAC3B,OAAO,EAAE,WAAW;IACpB,MAAM,EAAE,WAAW;IACnB,MAAM,EAAE,iBAAiB;CAC1B,CAAC;AAEF,MAAM,oBAAoB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;AAE5D,MAAM,kBAAkB,GAAc;IACpC,WAAW,EAAE,IAAI;IACjB,aAAa,EAAE,CAAC,KAAK,CAAC;IACtB,aAAa,EAAE,EAAE;IACjB,SAAS,EAAE,GAAG;CACf,CAAC;AAEF,MAAM,UAAU,WAAW,CAAC,OAAwB;IAClD,OAAO,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AACnE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,YAAY,CAChC,QAAgB,EAChB,SAAS,GAAG,EAAE,EACd,SAAqB;IAErB,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAC;IACjD,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;QACnC,MAAM,IAAI,KAAK,CAAC,0BAA0B,GAAG,EAAE,CAAC,CAAC;IACnD,CAAC;IAED,MAAM,IAAI,GAAG,EAAE,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IACnC,MAAM,MAAM,GAAG,IAAI,CAAC,IAAI,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;IACzC,IAAI,MAAM,GAAG,SAAS,EAAE,CAAC;QACvB,MAAM,IAAI,KAAK,CAAC,mBAAmB,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,uBAAuB,SAAS,IAAI,CAAC,CAAC;IAC5F,CAAC;IAED,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;QACnB,MAAM,MAAM,GAAG,MAAM,UAAU,CAAC,QAAQ,EAAE,SAAS,IAAI,kBAAkB,CAAC,CAAC;QAC3E,OAAO;YACL,KAAK,EAAE,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;YACtC,OAAO,EAAE,MAAM,CAAC,OAAO;YACvB,SAAS,EAAE,iBAAiB;YAC5B,QAAQ,EAAE,MAAM,CAAC,QAA8C;SAChE,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IACnD,MAAM,SAAS,GAAG,QAAQ,CAAC,GAAG,CAAE,CAAC;IACjC,MAAM,KAAK,GAAG,YAAY,CAAC,OAAO,EAAE,QAAQ,EAAE,SAAS,CAAC,CAAC;IAEzD,OAAO,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC;AACvC,CAAC;AAED,SAAS,YAAY,CAAC,OAAe,EAAE,QAAgB,EAAE,QAAgB;IACvE,IAAI,QAAQ,KAAK,eAAe,EAAE,CAAC;QACjC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAC;QAChD,IAAI,KAAK;
YAAE,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACpC,CAAC;IACD,OAAO,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AACjC,CAAC"}
@@ -0,0 +1,4 @@
1
+ import type { Embedder } from '../types.js';
2
+ export declare function createMockEmbedder(dims: number): Embedder;
3
+ export declare function createEmbedder(provider: string): Embedder;
4
+ //# sourceMappingURL=embedder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedder.d.ts","sourceRoot":"","sources":["../../src/ingest/embedder.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAE5C,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,QAAQ,CAiBzD;AAGD,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,QAAQ,CAGzD"}
@@ -0,0 +1,25 @@
1
+ import crypto from 'node:crypto';
2
/**
 * Creates a deterministic embedder for tests.
 *
 * Each text is hashed with SHA-256; component `i` of the vector is digest byte
 * `i mod digestLength` mapped from [0, 255] to [-1, 1]. The vector is then
 * scaled to unit length. The same text always yields the same vector.
 *
 * @param {number} dims - Number of components per vector.
 * @returns {{dimensions: number, embed: function(string[]): Promise<number[][]>}}
 *   Embedder exposing `dimensions` and an async `embed(texts)`.
 */
export function createMockEmbedder(dims) {
  const toUnitVector = (text) => {
    // Deterministic hash-based vector for testing
    const digest = crypto.createHash('sha256').update(text).digest();
    const components = [];
    let squaredSum = 0;
    for (let idx = 0; idx < dims; idx += 1) {
      const value = (digest[idx % digest.length] / 255) * 2 - 1;
      components.push(value);
      squaredSum += value * value;
    }
    // Normalize
    const norm = Math.sqrt(squaredSum);
    return components.map((value) => value / norm);
  };

  return {
    dimensions: dims,
    async embed(texts) {
      return texts.map((text) => toUnitVector(text));
    },
  };
}
20
+ // Placeholder for future real implementations
21
/**
 * Returns the embedder for the given provider.
 *
 * The `provider` argument is currently ignored: a 384-dimension mock embedder
 * is always returned.
 *
 * @param {string} provider - Requested provider name (unused for now).
 * @returns {object} Embedder created by createMockEmbedder.
 */
export function createEmbedder(provider) {
  // For now, always use mock. Real providers added in follow-up.
  const mockDimensions = 384;
  return createMockEmbedder(mockDimensions);
}
25
+ //# sourceMappingURL=embedder.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedder.js","sourceRoot":"","sources":["../../src/ingest/embedder.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,aAAa,CAAC;AAGjC,MAAM,UAAU,kBAAkB,CAAC,IAAY;IAC7C,OAAO;QACL,UAAU,EAAE,IAAI;QAChB,KAAK,CAAC,KAAK,CAAC,KAAe;YACzB,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;gBACxB,8CAA8C;gBAC9C,MAAM,IAAI,GAAG,MAAM,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,CAAC;gBAC/D,MAAM,MAAM,GAAa,EAAE,CAAC;gBAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,EAAE,CAAC,EAAE,EAAE,CAAC;oBAC9B,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;gBACrD,CAAC;gBACD,YAAY;gBACZ,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC9D,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC;YACrC,CAAC,CAAC,CAAC;QACL,CAAC;KACF,CAAC;AACJ,CAAC;AAED,8CAA8C;AAC9C,MAAM,UAAU,cAAc,CAAC,QAAgB;IAC7C,+DAA+D;IAC/D,OAAO,kBAAkB,CAAC,GAAG,CAAC,CAAC;AACjC,CAAC"}
@@ -0,0 +1,21 @@
1
+ interface RawEntity {
2
+ name: string;
3
+ type: string;
4
+ description?: string;
5
+ }
6
+ interface RawRelation {
7
+ source: string;
8
+ target: string;
9
+ type: string;
10
+ description?: string;
11
+ }
12
+ interface ExtractionResult {
13
+ entities: RawEntity[];
14
+ relations: RawRelation[];
15
+ }
16
+ export declare function parseExtractionResponse(response: string): ExtractionResult;
17
+ export declare function validateEntities(entities: RawEntity[]): RawEntity[];
18
+ export declare function validateRelations(relations: RawRelation[]): RawRelation[];
19
+ export declare function buildExtractionPrompt(content: string): string;
20
+ export {};
21
+ //# sourceMappingURL=entity-extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"entity-extractor.d.ts","sourceRoot":"","sources":["../../src/ingest/entity-extractor.ts"],"names":[],"mappings":"AAwBA,UAAU,SAAS;IACjB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,UAAU,WAAW;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,UAAU,gBAAgB;IACxB,QAAQ,EAAE,SAAS,EAAE,CAAC;IACtB,SAAS,EAAE,WAAW,EAAE,CAAC;CAC1B;AAED,wBAAgB,uBAAuB,CAAC,QAAQ,EAAE,MAAM,GAAG,gBAAgB,CAW1E;AAED,wBAAgB,gBAAgB,CAAC,QAAQ,EAAE,SAAS,EAAE,GAAG,SAAS,EAAE,CAEnE;AAED,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,WAAW,EAAE,GAAG,WAAW,EAAE,CAIzE;AAED,wBAAgB,qBAAqB,CAAC,OAAO,EAAE,MAAM,GAAG,MAAM,CAgB7D"}
@@ -0,0 +1,54 @@
1
+ import { z } from 'zod';
2
+ const VALID_ENTITY_TYPES = ['technology', 'person', 'decision', 'concept', 'squad', 'agent', 'learning'];
3
+ const VALID_RELATION_TYPES = ['produced', 'depends_on', 'learned_from', 'contradicts', 'decided', 'uses', 'replaced'];
4
+ const RawEntitySchema = z.object({
5
+ name: z.string(),
6
+ type: z.string(),
7
+ description: z.string().optional(),
8
+ });
9
+ const RawRelationSchema = z.object({
10
+ source: z.string(),
11
+ target: z.string(),
12
+ type: z.string(),
13
+ description: z.string().optional(),
14
+ });
15
+ const ExtractionResponseSchema = z.object({
16
+ entities: z.array(RawEntitySchema).default([]),
17
+ relations: z.array(RawRelationSchema).default([]),
18
+ });
19
/**
 * Parses a model response into validated entities and relations.
 *
 * If the response contains a fenced code block (optionally tagged `json`), the
 * contents of the first such block are parsed; otherwise the whole response
 * is. The parsed value is checked against ExtractionResponseSchema. Any JSON
 * or schema failure results in empty lists rather than an exception.
 *
 * @param {string} response - Raw model output.
 * @returns {{entities: Array<object>, relations: Array<object>}} Parsed data,
 *   or empty arrays when the response cannot be parsed or validated.
 */
export function parseExtractionResponse(response) {
  const fencedBlock = response.match(/```(?:json)?\s*\n?([\s\S]*?)\n?```/);
  let payload;
  if (fencedBlock) {
    payload = fencedBlock[1].trim();
  } else {
    payload = response.trim();
  }

  try {
    return ExtractionResponseSchema.parse(JSON.parse(payload));
  } catch {
    // Best-effort: unparseable output is treated as "nothing extracted".
    return { entities: [], relations: [] };
  }
}
31
/**
 * Keeps only entities with a recognised type and a non-blank name.
 *
 * @param {Array<{name: string, type: string, description?: string}>} entities
 *   Candidate entities.
 * @returns {Array<object>} Entities whose `type` is in VALID_ENTITY_TYPES and
 *   whose `name` contains non-whitespace characters.
 */
export function validateEntities(entities) {
  const isAcceptable = (entity) => {
    if (!VALID_ENTITY_TYPES.includes(entity.type)) {
      return false;
    }
    return Boolean(entity.name?.trim());
  };
  return entities.filter((entity) => isAcceptable(entity));
}
34
/**
 * Keeps only relations with a recognised type and non-blank endpoints.
 *
 * @param {Array<{source: string, target: string, type: string, description?: string}>} relations
 *   Candidate relations.
 * @returns {Array<object>} Relations whose `type` is in VALID_RELATION_TYPES
 *   and whose `source` and `target` both contain non-whitespace characters.
 */
export function validateRelations(relations) {
  const isAcceptable = (relation) => {
    if (!VALID_RELATION_TYPES.includes(relation.type)) {
      return false;
    }
    if (!relation.source?.trim()) {
      return false;
    }
    return Boolean(relation.target?.trim());
  };
  return relations.filter((relation) => isAcceptable(relation));
}
37
// Builds the entity/relation extraction prompt as a single template string:
// fixed instructions, the allowed entity and relation types (each list joined
// with ', '), a JSON response template, and finally the caller's text placed
// after the "TEXT:" line. `content` is interpolated as-is, without escaping.
// NOTE(review): this rendering has lost leading whitespace, so the exact
// indentation inside the template literal should be confirmed against the
// published file before editing the prompt text.
+ export function buildExtractionPrompt(content) {
38
+ return `Given the text below, extract:
39
+ 1. Entities (name, type, description)
40
+ 2. Relations between entities (source, target, type, description)
41
+
42
+ Valid entity types: ${VALID_ENTITY_TYPES.join(', ')}
43
+ Valid relation types: ${VALID_RELATION_TYPES.join(', ')}
44
+
45
+ Respond ONLY with JSON in this format:
46
+ {
47
+ "entities": [{"name": "...", "type": "...", "description": "..."}],
48
+ "relations": [{"source": "...", "target": "...", "type": "...", "description": "..."}]
49
+ }
50
+
51
+ TEXT:
52
+ ${content}`;
53
+ }
54
+ //# sourceMappingURL=entity-extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"entity-extractor.js","sourceRoot":"","sources":["../../src/ingest/entity-extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,MAAM,kBAAkB,GAAiB,CAAC,YAAY,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,UAAU,CAAC,CAAC;AACvH,MAAM,oBAAoB,GAAmB,CAAC,UAAU,EAAE,YAAY,EAAE,cAAc,EAAE,aAAa,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC;AAEtI,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CACnC,CAAC,CAAC;AAEH,MAAM,iBAAiB,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CACnC,CAAC,CAAC;AAEH,MAAM,wBAAwB,GAAG,CAAC,CAAC,MAAM,CAAC;IACxC,QAAQ,EAAE,CAAC,CAAC,KAAK,CAAC,eAAe,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC;IAC9C,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,iBAAiB,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC;CAClD,CAAC,CAAC;AAoBH,MAAM,UAAU,uBAAuB,CAAC,QAAgB;IACtD,MAAM,cAAc,GAAG,QAAQ,CAAC,KAAK,CAAC,oCAAoC,CAAC,CAAC;IAC5E,MAAM,OAAO,GAAG,cAAc,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;IAE5E,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,wBAAwB,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACzD,OAAO,SAAS,CAAC;IACnB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC;IACzC,CAAC;AACH,CAAC;AAED,MAAM,UAAU,gBAAgB,CAAC,QAAqB;IACpD,OAAO,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,kBAAkB,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAkB,CAAC,IAAI,CAAC,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;AACrG,CAAC;AAED,MAAM,UAAU,iBAAiB,CAAC,SAAwB;IACxD,OAAO,SAAS,CAAC,MAAM,CACrB,CAAC,CAAC,EAAE,EAAE,CAAC,oBAAoB,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAoB,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,MAAM,EAAE,IAAI,EAAE,CACrG,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,qBAAqB,CAAC,OAAe;IACnD,OAAO;;;;sBAIa,kBAAkB,CAAC,IAAI,CAAC,IAAI,CAAC;wBAC3B,oBAAoB,CAAC,IAAI,CAAC,IAAI,CAAC;;;;;;;;;EASrD,OAAO,
EAAE,CAAC;AACZ,CAAC"}
@@ -0,0 +1,16 @@
1
+ import type Database from 'better-sqlite3';
2
/**
 * Queue of extraction jobs kept in the `extraction_jobs` table of the given
 * database. Job status moves between `pending`, `running`, `complete` and
 * `failed`.
 */
+ export declare class ExtractionQueue {
3
+ private db;
4
/** Stores the database handle and resets jobs left in `running` back to `pending`. */
+ constructor(db: Database.Database);
5
/** Inserts a `pending` job with zero attempts for the document (`INSERT OR IGNORE`). */
+ enqueue(documentId: string): void;
6
/**
 * Marks the oldest `pending` job (by `created_at`) as `running` and returns it,
 * or returns `null` when no job is pending.
 */
+ dequeue(): {
7
+ id: string;
8
+ document_id: string;
9
+ attempts: number;
10
+ } | null;
11
/** Marks the job `complete` and sets its document's `extraction_status` to `complete`. */
+ markComplete(jobId: string): void;
12
/**
 * Records a failure: the job returns to `pending` with its attempt counter
 * incremented until the counter reaches the retry limit, after which the job
 * and its document are marked `failed`. Does nothing for an unknown job id.
 */
+ markFailed(jobId: string, error: string): void;
13
+ private cleanupStaleJobs;
14
/** Number of jobs currently in `pending` status. */
+ pendingCount(): number;
15
+ }
16
+ //# sourceMappingURL=extraction-queue.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extraction-queue.d.ts","sourceRoot":"","sources":["../../src/ingest/extraction-queue.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,QAAQ,MAAM,gBAAgB,CAAC;AAK3C,qBAAa,eAAe;IACd,OAAO,CAAC,EAAE;gBAAF,EAAE,EAAE,QAAQ,CAAC,QAAQ;IAIzC,OAAO,CAAC,UAAU,EAAE,MAAM,GAAG,IAAI;IAOjC,OAAO,IAAI;QAAE,EAAE,EAAE,MAAM,CAAC;QAAC,WAAW,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,GAAG,IAAI;IAWvE,YAAY,CAAC,KAAK,EAAE,MAAM,GAAG,IAAI;IASjC,UAAU,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI;IAa9C,OAAO,CAAC,gBAAgB;IAIxB,YAAY,IAAI,MAAM;CAGvB"}
@@ -0,0 +1,49 @@
1
+ import { nanoid } from 'nanoid';
2
/** Retries granted after the first failed attempt; a job whose counter has reached this value fails for good. */
const MAX_RETRIES = 2;

/**
 * Queue of extraction jobs stored in the `extraction_jobs` table.
 *
 * A job is `pending` until `dequeue` hands it out (`running`), and ends as
 * `complete` or `failed`. Finishing a job also updates `extraction_status` on
 * the matching `documents` row. Updates that touch both tables run inside a
 * single transaction so the two tables cannot end up disagreeing if one of
 * the statements fails.
 */
export class ExtractionQueue {
    /** Database handle; expected to expose `prepare()` and `transaction()` (better-sqlite3). */
    db;

    /**
     * @param {object} db - Open database handle.
     */
    constructor(db) {
        this.db = db;
        // Jobs still marked `running` at construction time are put back in line.
        this.cleanupStaleJobs();
    }

    /**
     * Add a pending job for a document. The insert uses `INSERT OR IGNORE`,
     * so a row that violates a table constraint is skipped silently.
     *
     * @param {string} documentId - Id of the document to extract from.
     */
    enqueue(documentId) {
        const now = new Date().toISOString();
        this.db
            .prepare('INSERT OR IGNORE INTO extraction_jobs (id, document_id, status, attempts, created_at, updated_at) VALUES (?, ?, ?, 0, ?, ?)')
            .run(nanoid(), documentId, 'pending', now, now);
    }

    /**
     * Take the oldest pending job and mark it as running.
     *
     * @returns {{ id: string, document_id: string, attempts: number } | null}
     *   The claimed job, or `null` when nothing is pending.
     */
    dequeue() {
        const job = this.db
            .prepare("SELECT id, document_id, attempts FROM extraction_jobs WHERE status = 'pending' ORDER BY created_at ASC LIMIT 1")
            .get();
        if (!job) {
            return null;
        }
        this.db
            .prepare("UPDATE extraction_jobs SET status = 'running', updated_at = ? WHERE id = ?")
            .run(new Date().toISOString(), job.id);
        return job;
    }

    /**
     * Mark a job as complete and propagate the status to its document.
     *
     * @param {string} jobId - Id of the finished job.
     */
    markComplete(jobId) {
        const now = new Date().toISOString();
        const complete = this.db.transaction(() => {
            this.db
                .prepare("UPDATE extraction_jobs SET status = 'complete', updated_at = ? WHERE id = ?")
                .run(now, jobId);
            const job = this.db
                .prepare('SELECT document_id FROM extraction_jobs WHERE id = ?')
                .get(jobId);
            if (job) {
                this.db
                    .prepare("UPDATE documents SET extraction_status = 'complete', updated_at = ? WHERE id = ?")
                    .run(now, job.document_id);
            }
        });
        complete();
    }

    /**
     * Record a failed attempt. The job is re-queued with its attempt counter
     * incremented until the counter has reached `MAX_RETRIES`; after that the
     * job and its document are both marked `failed`. Unknown job ids are ignored.
     *
     * @param {string} jobId - Id of the job that failed.
     * @param {string} error - Error text stored on the job row.
     */
    markFailed(jobId, error) {
        const now = new Date().toISOString();
        const fail = this.db.transaction(() => {
            const job = this.db
                .prepare('SELECT document_id, attempts FROM extraction_jobs WHERE id = ?')
                .get(jobId);
            if (!job) {
                return;
            }
            if (job.attempts >= MAX_RETRIES) {
                this.db
                    .prepare("UPDATE extraction_jobs SET status = 'failed', error = ?, updated_at = ? WHERE id = ?")
                    .run(error, now, jobId);
                this.db
                    .prepare("UPDATE documents SET extraction_status = 'failed', updated_at = ? WHERE id = ?")
                    .run(now, job.document_id);
                return;
            }
            this.db
                .prepare("UPDATE extraction_jobs SET status = 'pending', attempts = attempts + 1, error = ?, updated_at = ? WHERE id = ?")
                .run(error, now, jobId);
        });
        fail();
    }

    /**
     * Put every `running` job back to `pending`. The attempt counter is left
     * untouched; `updated_at` is stamped like every other status change.
     */
    cleanupStaleJobs() {
        this.db
            .prepare("UPDATE extraction_jobs SET status = 'pending', updated_at = ? WHERE status = 'running'")
            .run(new Date().toISOString());
    }

    /**
     * @returns {number} How many jobs are currently pending.
     */
    pendingCount() {
        return this.db
            .prepare("SELECT COUNT(*) as count FROM extraction_jobs WHERE status = 'pending'")
            .get().count;
    }
}
49
+ //# sourceMappingURL=extraction-queue.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extraction-queue.js","sourceRoot":"","sources":["../../src/ingest/extraction-queue.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,EAAE,MAAM,QAAQ,CAAC;AAEhC,MAAM,WAAW,GAAG,CAAC,CAAC;AAEtB,MAAM,OAAO,eAAe;IACN;IAApB,YAAoB,EAAqB;QAArB,OAAE,GAAF,EAAE,CAAmB;QACvC,IAAI,CAAC,gBAAgB,EAAE,CAAC;IAC1B,CAAC;IAED,OAAO,CAAC,UAAkB;QACxB,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,EAAE,CAAC,OAAO,CACb,6HAA6H,CAC9H,CAAC,GAAG,CAAC,MAAM,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC;IACnD,CAAC;IAED,OAAO;QACL,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CACzB,gHAAgH,CACjH,CAAC,GAAG,EAAuE,CAAC;QAC7E,IAAI,CAAC,GAAG;YAAE,OAAO,IAAI,CAAC;QAEtB,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,4EAA4E,CAAC;aAC1F,GAAG,CAAC,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;QACzC,OAAO,GAAG,CAAC;IACb,CAAC;IAED,YAAY,CAAC,KAAa;QACxB,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,6EAA6E,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAC/G,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,sDAAsD,CAAC,CAAC,GAAG,CAAC,KAAK,CAAwC,CAAC;QACtI,IAAI,GAAG,EAAE,CAAC;YACR,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,kFAAkF,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,WAAW,CAAC,CAAC;QAChI,CAAC;IACH,CAAC;IAED,UAAU,CAAC,KAAa,EAAE,KAAa;QACrC,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,gEAAgE,CAAC,CAAC,GAAG,CAAC,KAAK,CAA0D,CAAC;QAClK,IAAI,CAAC,GAAG;YAAE,OAAO;QAEjB,IAAI,GAAG,CAAC,QAAQ,IAAI,WAAW,EAAE,CAAC;YAChC,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,sFAAsF,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,GAAG,EAAE,KAAK,CAAC,CAAC;YAC/H,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,gFAAgF,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,WAAW,CAAC,CAAC;QAC9H,CAAC;aAAM,CAAC;YACN,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,gHAAgH,CAAC,CAAC,GAAG,CAAC,KAAK,EAAE,GAAG,EAAE,KAAK,CAAC,CAAC;QAC3J,CAAC;IACH,CAAC;IAEO,gBAAgB;QACtB,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,wEAAwE,CAAC,CAAC,GAAG,EAAE,CAAC;IAClG,CAAC;IAED,YAAY;QACV,OAAQ,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,wEAAwE,CAAC,CAAC,GAAG,EAAwB,CAAC,KAAK,CAAC
;IACtI,CAAC;CACF"}
@@ -0,0 +1,9 @@
1
+ import type { PdfConfig, PdfMetadata } from '../types.js';
2
/** Result of `extractPdf`: the extracted text plus metadata about how it was obtained. */
+ interface PdfExtractionResult {
3
+ content: string;
4
+ metadata: PdfMetadata;
5
+ }
6
/**
 * Extracts text from the PDF at `filePath`, processing at most
 * `config.max_pages` pages. When OCR is enabled, pages whose native text is
 * shorter than `config.ocr_threshold` are run through OCR. Rejects when the
 * file cannot be parsed, is password-protected, or has no pages.
 */
+ export declare function extractPdf(filePath: string, config: PdfConfig): Promise<PdfExtractionResult>;
7
/** Terminates the shared OCR worker if one exists; termination errors are ignored. */
+ export declare function terminateOcrWorker(): Promise<void>;
8
+ export {};
9
+ //# sourceMappingURL=pdf-extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pdf-extractor.d.ts","sourceRoot":"","sources":["../../src/ingest/pdf-extractor.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE1D,UAAU,mBAAmB;IAC3B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,WAAW,CAAC;CACvB;AAID,wBAAsB,UAAU,CAC9B,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,SAAS,GAChB,OAAO,CAAC,mBAAmB,CAAC,CA6E9B;AAqCD,wBAAsB,kBAAkB,IAAI,OAAO,CAAC,IAAI,CAAC,CASxD"}
@@ -0,0 +1,116 @@
1
+ import fs from 'node:fs';
2
// Shared OCR worker: created on first use by ocrPage() and reset to null by
// terminateOcrWorker().
+ let ocrWorker = null;
3
/**
 * Extract the text of a PDF page by page, using OCR for pages with little
 * native text when that is enabled.
 *
 * Pages are processed in order, up to `config.max_pages`. A page whose native
 * text is at least `config.ocr_threshold` characters long is used as-is;
 * otherwise, when `config.ocr_enabled` is set, the page is passed to OCR and
 * the OCR text is used if it is non-empty. OCR failures never abort the
 * extraction: the native text of that page is kept instead. From the second
 * page on, each page's text is prefixed with a `[PAGE n]` marker.
 *
 * @param {string} filePath - Path of the PDF file to read.
 * @param {object} config - PDF settings: `max_pages`, `ocr_threshold`,
 *   `ocr_enabled`, `ocr_languages`.
 * @returns {Promise<{content: string, metadata: object}>} Extracted text and
 *   metadata (`page_count`, `ocr_pages`, `languages`, `extraction_method` of
 *   `'native' | 'ocr' | 'mixed'`, and `truncated: true` when pages were skipped).
 * @throws {Error} When the file cannot be read, the PDF cannot be parsed, is
 *   password-protected, or contains no pages.
 */
export async function extractPdf(filePath, config) {
    // Asynchronous read: this function is async, so it should not block the event loop.
    const data = new Uint8Array(await fs.promises.readFile(filePath));
    const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
    let doc;
    try {
        doc = await pdfjsLib.getDocument({ data, useSystemFonts: true }).promise;
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        const needsPassword = err?.name === 'PasswordException'
            || message.includes('password')
            || message.includes('encrypted');
        if (needsPassword) {
            throw new Error(`Password-protected PDF: ${filePath}`, { cause: err });
        }
        throw new Error(`Failed to parse PDF ${filePath}: ${message}`, { cause: err });
    }
    try {
        const pageCount = doc.numPages;
        if (pageCount === 0) {
            throw new Error(`Zero-page PDF: ${filePath}`);
        }
        // A missing or non-numeric limit would turn Math.min into NaN and skip
        // every page without any error, so fall back to "all pages".
        const pageLimit = Number.isFinite(config.max_pages) ? config.max_pages : pageCount;
        const pagesToProcess = Math.min(pageCount, pageLimit);
        const truncated = pagesToProcess < pageCount;
        const pageTexts = [];
        const ocrPages = [];
        for (let i = 1; i <= pagesToProcess; i++) {
            const page = await doc.getPage(i);
            try {
                const textContent = await page.getTextContent();
                const text = textContent.items
                    .map((item) => ('str' in item ? item.str : ''))
                    .join(' ')
                    .trim();
                if (text.length >= config.ocr_threshold || !config.ocr_enabled) {
                    pageTexts.push(text);
                    continue;
                }
                let ocrText = '';
                try {
                    ocrText = await ocrPage(page, config);
                }
                catch (err) {
                    // Best-effort OCR: keep the native text, but leave a trace of the failure.
                    const reason = err instanceof Error ? err.message : String(err);
                    console.warn(`[pdf-extractor] OCR failed on page ${i} of ${filePath}: ${reason}`);
                }
                pageTexts.push(ocrText || text);
                if (ocrText) {
                    ocrPages.push(i);
                }
            }
            finally {
                // Release per-page resources even when text extraction throws.
                page.cleanup();
            }
        }
        // The first page carries no marker; an empty list joins to ''.
        const content = pageTexts
            .map((text, i) => (i > 0 ? `[PAGE ${i + 1}]\n\n${text}` : text))
            .join('\n\n');
        let extraction_method = 'mixed';
        if (ocrPages.length === 0) {
            extraction_method = 'native';
        }
        else if (ocrPages.length === pagesToProcess) {
            extraction_method = 'ocr';
        }
        return {
            content,
            metadata: {
                page_count: pageCount,
                ocr_pages: ocrPages,
                languages: ocrPages.length > 0 ? config.ocr_languages : [],
                extraction_method,
                ...(truncated ? { truncated: true } : {}),
            },
        };
    }
    finally {
        // Free the parsed document; a failure here must not mask the real outcome.
        try {
            await doc.destroy();
        }
        catch {
            // Ignore teardown errors.
        }
    }
}
76
/**
 * Render one PDF page to a PNG image and run OCR on it.
 *
 * The OCR engine (`tesseract.js`) and the renderer (`canvas`) are optional
 * dependencies loaded on demand. When either cannot be resolved the function
 * logs a warning and returns an empty string; any other error is rethrown.
 * The OCR worker is created once, with the languages of the first call, and
 * reused afterwards.
 *
 * @param {object} page - pdf.js page (uses `getViewport` and `render`).
 * @param {object} config - PDF settings; `ocr_languages` is joined with `+`.
 * @returns {Promise<string>} Trimmed recognised text, or `''` when the OCR
 *   dependencies are unavailable.
 */
async function ocrPage(page, config) {
    try {
        const Tesseract = await import('tesseract.js');
        if (!ocrWorker) {
            ocrWorker = await Tesseract.createWorker(config.ocr_languages.join('+'));
        }
        const worker = ocrWorker;
        const viewport = page.getViewport({ scale: 2.0 });
        // Dynamic import for optional native dependency — canvas is only needed for OCR
        const { createCanvas } = await import(/* webpackIgnore: true */ 'canvas');
        const canvas = createCanvas(viewport.width, viewport.height);
        const context = canvas.getContext('2d');
        await page.render({
            canvasContext: context,
            viewport,
        }).promise;
        const imageBuffer = canvas.toBuffer('image/png');
        const result = await worker.recognize(imageBuffer);
        return result.data.text.trim();
    }
    catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        const code = err?.code;
        // A failed ESM import() of a bare package reports "Cannot find package ..."
        // with code ERR_MODULE_NOT_FOUND, so checking only the CommonJS message
        // text misses it; check the error code as well.
        const missingDependency = code === 'ERR_MODULE_NOT_FOUND'
            || code === 'MODULE_NOT_FOUND'
            || message.includes('Cannot find module')
            || message.includes('Cannot find package')
            || message.includes('MODULE_NOT_FOUND');
        if (missingDependency) {
            console.warn('[pdf-extractor] OCR dependencies not available — skipping OCR');
            return '';
        }
        throw err;
    }
}
105
/**
 * Shut down the shared OCR worker, if one was created, and forget it.
 * Errors raised while terminating are ignored. Does nothing when no worker
 * exists.
 *
 * @returns {Promise<void>}
 */
export async function terminateOcrWorker() {
    if (!ocrWorker) {
        return;
    }
    try {
        await ocrWorker.terminate();
    }
    catch {
        // Ignore termination errors
    }
    finally {
        ocrWorker = null;
    }
}
116
+ //# sourceMappingURL=pdf-extractor.js.map