@soleri/core 2.4.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (300) hide show
  1. package/dist/brain/brain.d.ts +7 -0
  2. package/dist/brain/brain.d.ts.map +1 -1
  3. package/dist/brain/brain.js +56 -9
  4. package/dist/brain/brain.js.map +1 -1
  5. package/dist/brain/types.d.ts +2 -2
  6. package/dist/brain/types.d.ts.map +1 -1
  7. package/dist/cognee/client.d.ts +3 -0
  8. package/dist/cognee/client.d.ts.map +1 -1
  9. package/dist/cognee/client.js +17 -0
  10. package/dist/cognee/client.js.map +1 -1
  11. package/dist/cognee/sync-manager.d.ts +94 -0
  12. package/dist/cognee/sync-manager.d.ts.map +1 -0
  13. package/dist/cognee/sync-manager.js +293 -0
  14. package/dist/cognee/sync-manager.js.map +1 -0
  15. package/dist/curator/curator.d.ts +8 -1
  16. package/dist/curator/curator.d.ts.map +1 -1
  17. package/dist/curator/curator.js +64 -1
  18. package/dist/curator/curator.js.map +1 -1
  19. package/dist/errors/classify.d.ts +13 -0
  20. package/dist/errors/classify.d.ts.map +1 -0
  21. package/dist/errors/classify.js +97 -0
  22. package/dist/errors/classify.js.map +1 -0
  23. package/dist/errors/index.d.ts +6 -0
  24. package/dist/errors/index.d.ts.map +1 -0
  25. package/dist/errors/index.js +4 -0
  26. package/dist/errors/index.js.map +1 -0
  27. package/dist/errors/retry.d.ts +40 -0
  28. package/dist/errors/retry.d.ts.map +1 -0
  29. package/dist/errors/retry.js +97 -0
  30. package/dist/errors/retry.js.map +1 -0
  31. package/dist/errors/types.d.ts +48 -0
  32. package/dist/errors/types.d.ts.map +1 -0
  33. package/dist/errors/types.js +59 -0
  34. package/dist/errors/types.js.map +1 -0
  35. package/dist/index.d.ts +25 -5
  36. package/dist/index.d.ts.map +1 -1
  37. package/dist/index.js +21 -3
  38. package/dist/index.js.map +1 -1
  39. package/dist/intake/content-classifier.d.ts +14 -0
  40. package/dist/intake/content-classifier.d.ts.map +1 -0
  41. package/dist/intake/content-classifier.js +125 -0
  42. package/dist/intake/content-classifier.js.map +1 -0
  43. package/dist/intake/dedup-gate.d.ts +17 -0
  44. package/dist/intake/dedup-gate.d.ts.map +1 -0
  45. package/dist/intake/dedup-gate.js +66 -0
  46. package/dist/intake/dedup-gate.js.map +1 -0
  47. package/dist/intake/intake-pipeline.d.ts +63 -0
  48. package/dist/intake/intake-pipeline.d.ts.map +1 -0
  49. package/dist/intake/intake-pipeline.js +373 -0
  50. package/dist/intake/intake-pipeline.js.map +1 -0
  51. package/dist/intake/types.d.ts +65 -0
  52. package/dist/intake/types.d.ts.map +1 -0
  53. package/dist/intake/types.js +3 -0
  54. package/dist/intake/types.js.map +1 -0
  55. package/dist/intelligence/loader.js +1 -1
  56. package/dist/intelligence/loader.js.map +1 -1
  57. package/dist/intelligence/types.d.ts +3 -1
  58. package/dist/intelligence/types.d.ts.map +1 -1
  59. package/dist/loop/loop-manager.d.ts +58 -7
  60. package/dist/loop/loop-manager.d.ts.map +1 -1
  61. package/dist/loop/loop-manager.js +280 -6
  62. package/dist/loop/loop-manager.js.map +1 -1
  63. package/dist/loop/types.d.ts +69 -1
  64. package/dist/loop/types.d.ts.map +1 -1
  65. package/dist/loop/types.js +4 -1
  66. package/dist/loop/types.js.map +1 -1
  67. package/dist/persistence/index.d.ts +3 -0
  68. package/dist/persistence/index.d.ts.map +1 -0
  69. package/dist/persistence/index.js +2 -0
  70. package/dist/persistence/index.js.map +1 -0
  71. package/dist/persistence/sqlite-provider.d.ts +25 -0
  72. package/dist/persistence/sqlite-provider.d.ts.map +1 -0
  73. package/dist/persistence/sqlite-provider.js +59 -0
  74. package/dist/persistence/sqlite-provider.js.map +1 -0
  75. package/dist/persistence/types.d.ts +36 -0
  76. package/dist/persistence/types.d.ts.map +1 -0
  77. package/dist/persistence/types.js +8 -0
  78. package/dist/persistence/types.js.map +1 -0
  79. package/dist/planning/gap-analysis.d.ts +47 -4
  80. package/dist/planning/gap-analysis.d.ts.map +1 -1
  81. package/dist/planning/gap-analysis.js +190 -13
  82. package/dist/planning/gap-analysis.js.map +1 -1
  83. package/dist/planning/gap-types.d.ts +1 -1
  84. package/dist/planning/gap-types.d.ts.map +1 -1
  85. package/dist/planning/gap-types.js.map +1 -1
  86. package/dist/planning/planner.d.ts +277 -9
  87. package/dist/planning/planner.d.ts.map +1 -1
  88. package/dist/planning/planner.js +611 -46
  89. package/dist/planning/planner.js.map +1 -1
  90. package/dist/playbooks/generic/brainstorming.d.ts +9 -0
  91. package/dist/playbooks/generic/brainstorming.d.ts.map +1 -0
  92. package/dist/playbooks/generic/brainstorming.js +105 -0
  93. package/dist/playbooks/generic/brainstorming.js.map +1 -0
  94. package/dist/playbooks/generic/code-review.d.ts +11 -0
  95. package/dist/playbooks/generic/code-review.d.ts.map +1 -0
  96. package/dist/playbooks/generic/code-review.js +176 -0
  97. package/dist/playbooks/generic/code-review.js.map +1 -0
  98. package/dist/playbooks/generic/subagent-execution.d.ts +9 -0
  99. package/dist/playbooks/generic/subagent-execution.d.ts.map +1 -0
  100. package/dist/playbooks/generic/subagent-execution.js +68 -0
  101. package/dist/playbooks/generic/subagent-execution.js.map +1 -0
  102. package/dist/playbooks/generic/systematic-debugging.d.ts +9 -0
  103. package/dist/playbooks/generic/systematic-debugging.d.ts.map +1 -0
  104. package/dist/playbooks/generic/systematic-debugging.js +87 -0
  105. package/dist/playbooks/generic/systematic-debugging.js.map +1 -0
  106. package/dist/playbooks/generic/tdd.d.ts +9 -0
  107. package/dist/playbooks/generic/tdd.d.ts.map +1 -0
  108. package/dist/playbooks/generic/tdd.js +70 -0
  109. package/dist/playbooks/generic/tdd.js.map +1 -0
  110. package/dist/playbooks/generic/verification.d.ts +9 -0
  111. package/dist/playbooks/generic/verification.d.ts.map +1 -0
  112. package/dist/playbooks/generic/verification.js +74 -0
  113. package/dist/playbooks/generic/verification.js.map +1 -0
  114. package/dist/playbooks/index.d.ts +4 -0
  115. package/dist/playbooks/index.d.ts.map +1 -0
  116. package/dist/playbooks/index.js +5 -0
  117. package/dist/playbooks/index.js.map +1 -0
  118. package/dist/playbooks/playbook-registry.d.ts +42 -0
  119. package/dist/playbooks/playbook-registry.d.ts.map +1 -0
  120. package/dist/playbooks/playbook-registry.js +227 -0
  121. package/dist/playbooks/playbook-registry.js.map +1 -0
  122. package/dist/playbooks/playbook-seeder.d.ts +47 -0
  123. package/dist/playbooks/playbook-seeder.d.ts.map +1 -0
  124. package/dist/playbooks/playbook-seeder.js +104 -0
  125. package/dist/playbooks/playbook-seeder.js.map +1 -0
  126. package/dist/playbooks/playbook-types.d.ts +132 -0
  127. package/dist/playbooks/playbook-types.d.ts.map +1 -0
  128. package/dist/playbooks/playbook-types.js +12 -0
  129. package/dist/playbooks/playbook-types.js.map +1 -0
  130. package/dist/project/project-registry.d.ts.map +1 -1
  131. package/dist/project/project-registry.js +9 -11
  132. package/dist/project/project-registry.js.map +1 -1
  133. package/dist/prompts/index.d.ts +4 -0
  134. package/dist/prompts/index.d.ts.map +1 -0
  135. package/dist/prompts/index.js +3 -0
  136. package/dist/prompts/index.js.map +1 -0
  137. package/dist/prompts/parser.d.ts +17 -0
  138. package/dist/prompts/parser.d.ts.map +1 -0
  139. package/dist/prompts/parser.js +47 -0
  140. package/dist/prompts/parser.js.map +1 -0
  141. package/dist/prompts/template-manager.d.ts +25 -0
  142. package/dist/prompts/template-manager.d.ts.map +1 -0
  143. package/dist/prompts/template-manager.js +71 -0
  144. package/dist/prompts/template-manager.js.map +1 -0
  145. package/dist/prompts/types.d.ts +26 -0
  146. package/dist/prompts/types.d.ts.map +1 -0
  147. package/dist/prompts/types.js +5 -0
  148. package/dist/prompts/types.js.map +1 -0
  149. package/dist/runtime/admin-extra-ops.d.ts +5 -3
  150. package/dist/runtime/admin-extra-ops.d.ts.map +1 -1
  151. package/dist/runtime/admin-extra-ops.js +322 -11
  152. package/dist/runtime/admin-extra-ops.js.map +1 -1
  153. package/dist/runtime/admin-ops.d.ts.map +1 -1
  154. package/dist/runtime/admin-ops.js +10 -3
  155. package/dist/runtime/admin-ops.js.map +1 -1
  156. package/dist/runtime/capture-ops.d.ts.map +1 -1
  157. package/dist/runtime/capture-ops.js +20 -2
  158. package/dist/runtime/capture-ops.js.map +1 -1
  159. package/dist/runtime/cognee-sync-ops.d.ts +12 -0
  160. package/dist/runtime/cognee-sync-ops.d.ts.map +1 -0
  161. package/dist/runtime/cognee-sync-ops.js +55 -0
  162. package/dist/runtime/cognee-sync-ops.js.map +1 -0
  163. package/dist/runtime/core-ops.d.ts +8 -6
  164. package/dist/runtime/core-ops.d.ts.map +1 -1
  165. package/dist/runtime/core-ops.js +226 -9
  166. package/dist/runtime/core-ops.js.map +1 -1
  167. package/dist/runtime/curator-extra-ops.d.ts +2 -2
  168. package/dist/runtime/curator-extra-ops.d.ts.map +1 -1
  169. package/dist/runtime/curator-extra-ops.js +15 -3
  170. package/dist/runtime/curator-extra-ops.js.map +1 -1
  171. package/dist/runtime/domain-ops.js +2 -2
  172. package/dist/runtime/domain-ops.js.map +1 -1
  173. package/dist/runtime/grading-ops.d.ts.map +1 -1
  174. package/dist/runtime/grading-ops.js.map +1 -1
  175. package/dist/runtime/intake-ops.d.ts +14 -0
  176. package/dist/runtime/intake-ops.d.ts.map +1 -0
  177. package/dist/runtime/intake-ops.js +110 -0
  178. package/dist/runtime/intake-ops.js.map +1 -0
  179. package/dist/runtime/loop-ops.d.ts +5 -4
  180. package/dist/runtime/loop-ops.d.ts.map +1 -1
  181. package/dist/runtime/loop-ops.js +84 -12
  182. package/dist/runtime/loop-ops.js.map +1 -1
  183. package/dist/runtime/memory-cross-project-ops.d.ts.map +1 -1
  184. package/dist/runtime/memory-cross-project-ops.js.map +1 -1
  185. package/dist/runtime/memory-extra-ops.js +5 -5
  186. package/dist/runtime/memory-extra-ops.js.map +1 -1
  187. package/dist/runtime/orchestrate-ops.d.ts.map +1 -1
  188. package/dist/runtime/orchestrate-ops.js +8 -2
  189. package/dist/runtime/orchestrate-ops.js.map +1 -1
  190. package/dist/runtime/planning-extra-ops.d.ts +13 -5
  191. package/dist/runtime/planning-extra-ops.d.ts.map +1 -1
  192. package/dist/runtime/planning-extra-ops.js +381 -18
  193. package/dist/runtime/planning-extra-ops.js.map +1 -1
  194. package/dist/runtime/playbook-ops.d.ts +14 -0
  195. package/dist/runtime/playbook-ops.d.ts.map +1 -0
  196. package/dist/runtime/playbook-ops.js +141 -0
  197. package/dist/runtime/playbook-ops.js.map +1 -0
  198. package/dist/runtime/project-ops.d.ts.map +1 -1
  199. package/dist/runtime/project-ops.js +7 -2
  200. package/dist/runtime/project-ops.js.map +1 -1
  201. package/dist/runtime/runtime.d.ts.map +1 -1
  202. package/dist/runtime/runtime.js +27 -8
  203. package/dist/runtime/runtime.js.map +1 -1
  204. package/dist/runtime/types.d.ts +8 -0
  205. package/dist/runtime/types.d.ts.map +1 -1
  206. package/dist/runtime/vault-extra-ops.d.ts +3 -2
  207. package/dist/runtime/vault-extra-ops.d.ts.map +1 -1
  208. package/dist/runtime/vault-extra-ops.js +345 -4
  209. package/dist/runtime/vault-extra-ops.js.map +1 -1
  210. package/dist/vault/playbook.d.ts +34 -0
  211. package/dist/vault/playbook.d.ts.map +1 -0
  212. package/dist/vault/playbook.js +60 -0
  213. package/dist/vault/playbook.js.map +1 -0
  214. package/dist/vault/vault.d.ts +31 -32
  215. package/dist/vault/vault.d.ts.map +1 -1
  216. package/dist/vault/vault.js +201 -181
  217. package/dist/vault/vault.js.map +1 -1
  218. package/package.json +7 -3
  219. package/src/__tests__/admin-extra-ops.test.ts +62 -15
  220. package/src/__tests__/admin-ops.test.ts +2 -2
  221. package/src/__tests__/brain.test.ts +3 -3
  222. package/src/__tests__/cognee-integration.test.ts +80 -0
  223. package/src/__tests__/cognee-sync-manager.test.ts +103 -0
  224. package/src/__tests__/core-ops.test.ts +30 -4
  225. package/src/__tests__/curator-extra-ops.test.ts +24 -2
  226. package/src/__tests__/errors.test.ts +388 -0
  227. package/src/__tests__/grading-ops.test.ts +28 -7
  228. package/src/__tests__/intake-pipeline.test.ts +162 -0
  229. package/src/__tests__/loop-ops.test.ts +74 -3
  230. package/src/__tests__/memory-cross-project-ops.test.ts +3 -1
  231. package/src/__tests__/orchestrate-ops.test.ts +8 -3
  232. package/src/__tests__/persistence.test.ts +225 -0
  233. package/src/__tests__/planner.test.ts +99 -21
  234. package/src/__tests__/planning-extra-ops.test.ts +168 -10
  235. package/src/__tests__/playbook-registry.test.ts +326 -0
  236. package/src/__tests__/playbook-seeder.test.ts +163 -0
  237. package/src/__tests__/playbook.test.ts +389 -0
  238. package/src/__tests__/project-ops.test.ts +18 -4
  239. package/src/__tests__/template-manager.test.ts +222 -0
  240. package/src/__tests__/vault-extra-ops.test.ts +82 -7
  241. package/src/brain/brain.ts +71 -9
  242. package/src/brain/types.ts +2 -2
  243. package/src/cognee/client.ts +18 -0
  244. package/src/cognee/sync-manager.ts +389 -0
  245. package/src/curator/curator.ts +88 -7
  246. package/src/errors/classify.ts +102 -0
  247. package/src/errors/index.ts +5 -0
  248. package/src/errors/retry.ts +132 -0
  249. package/src/errors/types.ts +81 -0
  250. package/src/index.ts +114 -3
  251. package/src/intake/content-classifier.ts +146 -0
  252. package/src/intake/dedup-gate.ts +92 -0
  253. package/src/intake/intake-pipeline.ts +503 -0
  254. package/src/intake/types.ts +69 -0
  255. package/src/intelligence/loader.ts +1 -1
  256. package/src/intelligence/types.ts +3 -1
  257. package/src/loop/loop-manager.ts +325 -7
  258. package/src/loop/types.ts +72 -1
  259. package/src/persistence/index.ts +7 -0
  260. package/src/persistence/sqlite-provider.ts +62 -0
  261. package/src/persistence/types.ts +44 -0
  262. package/src/planning/gap-analysis.ts +286 -17
  263. package/src/planning/gap-types.ts +4 -1
  264. package/src/planning/planner.ts +828 -55
  265. package/src/playbooks/generic/brainstorming.ts +110 -0
  266. package/src/playbooks/generic/code-review.ts +181 -0
  267. package/src/playbooks/generic/subagent-execution.ts +74 -0
  268. package/src/playbooks/generic/systematic-debugging.ts +92 -0
  269. package/src/playbooks/generic/tdd.ts +75 -0
  270. package/src/playbooks/generic/verification.ts +79 -0
  271. package/src/playbooks/index.ts +27 -0
  272. package/src/playbooks/playbook-registry.ts +284 -0
  273. package/src/playbooks/playbook-seeder.ts +119 -0
  274. package/src/playbooks/playbook-types.ts +162 -0
  275. package/src/project/project-registry.ts +29 -17
  276. package/src/prompts/index.ts +3 -0
  277. package/src/prompts/parser.ts +59 -0
  278. package/src/prompts/template-manager.ts +77 -0
  279. package/src/prompts/types.ts +28 -0
  280. package/src/runtime/admin-extra-ops.ts +358 -13
  281. package/src/runtime/admin-ops.ts +17 -6
  282. package/src/runtime/capture-ops.ts +25 -6
  283. package/src/runtime/cognee-sync-ops.ts +63 -0
  284. package/src/runtime/core-ops.ts +258 -8
  285. package/src/runtime/curator-extra-ops.ts +17 -3
  286. package/src/runtime/domain-ops.ts +2 -2
  287. package/src/runtime/grading-ops.ts +11 -2
  288. package/src/runtime/intake-ops.ts +126 -0
  289. package/src/runtime/loop-ops.ts +96 -13
  290. package/src/runtime/memory-cross-project-ops.ts +1 -2
  291. package/src/runtime/memory-extra-ops.ts +5 -5
  292. package/src/runtime/orchestrate-ops.ts +8 -2
  293. package/src/runtime/planning-extra-ops.ts +414 -23
  294. package/src/runtime/playbook-ops.ts +169 -0
  295. package/src/runtime/project-ops.ts +9 -3
  296. package/src/runtime/runtime.ts +35 -9
  297. package/src/runtime/types.ts +8 -0
  298. package/src/runtime/vault-extra-ops.ts +385 -4
  299. package/src/vault/playbook.ts +87 -0
  300. package/src/vault/vault.ts +301 -235
@@ -0,0 +1,125 @@
1
+ // ─── Content Classifier — LLM-based knowledge extraction ────────────────────
2
+ //
3
+ // Takes a text chunk and uses an LLM to classify it into structured
4
+ // knowledge items. Graceful degradation: returns [] on any error.
5
// =============================================================================
// CONSTANTS
// =============================================================================

// Knowledge-item types the classifier accepts. Exported so other modules can
// validate against the same list; sanitizeItem drops items of any other type.
export const VALID_TYPES = [
    'pattern',
    'anti-pattern',
    'principle',
    'concept',
    'reference',
    'workflow',
    'idea',
    'roadmap',
];
// Severities the classifier accepts; sanitizeItem falls back to 'suggestion'
// for anything else.
const VALID_SEVERITIES = ['critical', 'warning', 'suggestion'];
// System prompt sent to the LLM. VALID_TYPES is interpolated via
// JSON.stringify so the prompt and the sanitizer can never drift apart.
export const CLASSIFICATION_PROMPT = `You are a knowledge extraction engine. Your job is to analyze a text chunk and extract structured knowledge items from it.

For each distinct piece of knowledge you identify, produce an object with these fields:
- type: one of ${JSON.stringify(VALID_TYPES)}
- title: concise title, max 80 characters
- description: 2-3 sentence summary of the knowledge
- tags: 3-5 lowercase single-word or hyphenated tags
- severity: one of "critical", "warning", "suggestion"

Rules:
- Extract ALL meaningful knowledge items from the text.
- Each item must be self-contained and independently useful.
- Use "critical" for must-know items, "warning" for important gotchas, "suggestion" for nice-to-know.
- Tags should be specific and useful for search.
- Respond with a pure JSON array of objects. No markdown fences, no explanation, no wrapping.
- If the text contains no extractable knowledge, respond with an empty array: []`;
35
+ // =============================================================================
36
+ // CLASSIFIER
37
+ // =============================================================================
38
/**
 * Classify a text chunk into structured knowledge items using an LLM.
 *
 * @param llm - LLMClient instance
 * @param chunkText - The text to classify
 * @param citation - Source citation (e.g. "book.pdf, pages 12-15")
 * @returns Classified items, or [] on any error
 */
export async function classifyChunk(llm, chunkText, citation) {
    try {
        const response = await llm.complete({
            provider: 'openai',
            model: 'gpt-4o-mini',
            systemPrompt: CLASSIFICATION_PROMPT,
            userPrompt: chunkText,
            maxTokens: 4096,
            temperature: 0.3,
            caller: 'intake',
            task: 'classify',
        });
        const parsed = parseJsonResponse(response.text);
        if (!Array.isArray(parsed)) {
            return [];
        }
        // Keep only items that survive validation/sanitization.
        const items = [];
        for (const candidate of parsed) {
            const sanitized = sanitizeItem(candidate, citation);
            if (sanitized !== null) {
                items.push(sanitized);
            }
        }
        return items;
    }
    catch {
        // Graceful degradation — never throw
        return [];
    }
}
70
+ // =============================================================================
71
+ // HELPERS
72
+ // =============================================================================
73
/**
 * Parse a JSON response, handling potential markdown fences the LLM
 * might include despite instructions.
 */
function parseJsonResponse(text) {
    const stripped = text.trim();
    // Defensive: unwrap a ``` / ```json fence the model may emit anyway.
    const fence = stripped.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/);
    if (fence !== null) {
        return JSON.parse(fence[1]);
    }
    return JSON.parse(stripped);
}
84
/**
 * Validate and sanitize a single classified item.
 * Returns null if the item is not salvageable.
 */
function sanitizeItem(raw, citation) {
    if (!raw || typeof raw !== 'object') {
        return null;
    }
    const candidate = raw;
    // Type — must be a valid KnowledgeType
    const type = typeof candidate.type === 'string' ? candidate.type : '';
    if (!VALID_TYPES.includes(type)) {
        return null;
    }
    // Title — required, truncate to 80 chars
    let title = '';
    if (typeof candidate.title === 'string') {
        title = candidate.title.slice(0, 80).trim();
    }
    if (title === '') {
        return null;
    }
    // Description — required
    let description = '';
    if (typeof candidate.description === 'string') {
        description = candidate.description.trim();
    }
    if (description === '') {
        return null;
    }
    // Tags — strings only, lowercased, non-empty after trim, capped at 5
    let tags = [];
    if (Array.isArray(candidate.tags)) {
        for (const tag of candidate.tags) {
            if (typeof tag === 'string') {
                const cleaned = tag.toLowerCase().trim();
                if (cleaned.length > 0) {
                    tags.push(cleaned);
                }
            }
        }
        tags = tags.slice(0, 5);
    }
    // Severity — default to 'suggestion' if invalid
    const severity = VALID_SEVERITIES.includes(candidate.severity)
        ? candidate.severity
        : 'suggestion';
    return { type, title, description, tags, severity, citation };
}
125
+ //# sourceMappingURL=content-classifier.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"content-classifier.js","sourceRoot":"","sources":["../../src/intake/content-classifier.ts"],"names":[],"mappings":"AAAA,+EAA+E;AAC/E,EAAE;AACF,oEAAoE;AACpE,kEAAkE;AAKlE,gFAAgF;AAChF,YAAY;AACZ,gFAAgF;AAEhF,MAAM,CAAC,MAAM,WAAW,GAAoB;IAC1C,SAAS;IACT,cAAc;IACd,WAAW;IACX,SAAS;IACT,WAAW;IACX,UAAU;IACV,MAAM;IACN,SAAS;CACV,CAAC;AAEF,MAAM,gBAAgB,GAAG,CAAC,UAAU,EAAE,SAAS,EAAE,YAAY,CAAU,CAAC;AAGxE,MAAM,CAAC,MAAM,qBAAqB,GAAG;;;iBAGpB,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC;;;;;;;;;;;;iFAYqC,CAAC;AAElF,gFAAgF;AAChF,aAAa;AACb,gFAAgF;AAEhF;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAc,EACd,SAAiB,EACjB,QAAgB;IAEhB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,QAAQ,CAAC;YAChC,QAAQ,EAAE,QAAQ;YAClB,KAAK,EAAE,aAAa;YACpB,YAAY,EAAE,qBAAqB;YACnC,UAAU,EAAE,SAAS;YACrB,SAAS,EAAE,IAAI;YACf,WAAW,EAAE,GAAG;YAChB,MAAM,EAAE,QAAQ;YAChB,IAAI,EAAE,UAAU;SACjB,CAAC,CAAC;QAEH,MAAM,GAAG,GAAG,iBAAiB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAC3C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;YAAE,OAAO,EAAE,CAAC;QAEnC,OAAO,GAAG;aACP,GAAG,CAAC,CAAC,IAAa,EAAE,EAAE,CAAC,YAAY,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;aACpD,MAAM,CAAC,CAAC,IAAI,EAA0B,EAAE,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;IAC7D,CAAC;IAAC,MAAM,CAAC;QACP,qCAAqC;QACrC,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,gFAAgF;AAChF,UAAU;AACV,gFAAgF;AAEhF;;;GAGG;AACH,SAAS,iBAAiB,CAAC,IAAY;IACrC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE5B,+CAA+C;IAC/C,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;IAC5E,MAAM,OAAO,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;IAErD,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED;;;GAGG;AACH,SAAS,YAAY,CAAC,GAAY,EAAE,QAAgB;IAClD,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAC;IAEjD,MAAM,GAAG,GAAG,GAA8B,CAAC;IAE3C,uCAAuC;IACvC,MAAM,IAAI,GAAG,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;IAC1D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,IAAqB,CAAC;QAAE,OAAO,IAAI,CAAC;IAE9D,yCAAyC;IACzC,MAAM,KAAK,GAAG,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC
,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,yBAAyB;IACzB,MAAM,WAAW,GAAG,OAAO,GAAG,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACtF,IAAI,CAAC,WAAW;QAAE,OAAO,IAAI,CAAC;IAE9B,4CAA4C;IAC5C,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC;QAClC,CAAC,CAAC,GAAG,CAAC,IAAI;aACL,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;aACjD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;aAClC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;QAChB,CAAC,CAAC,EAAE,CAAC;IAEP,gDAAgD;IAChD,MAAM,QAAQ,GAAa,gBAAgB,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAoB,CAAC;QAC5E,CAAC,CAAE,GAAG,CAAC,QAAqB;QAC5B,CAAC,CAAC,YAAY,CAAC;IAEjB,OAAO;QACL,IAAI,EAAE,IAAqB;QAC3B,KAAK;QACL,WAAW;QACX,IAAI;QACJ,QAAQ;QACR,QAAQ;KACT,CAAC;AACJ,CAAC"}
@@ -0,0 +1,17 @@
1
import type { Vault } from '../vault/vault.js';
import type { ClassifiedItem } from './types.js';
/** Cosine-similarity score at or above which an item is flagged as a duplicate. */
export declare const DEDUP_THRESHOLD = 0.85;
/** Outcome of checking one classified item against the existing vault entries. */
export interface DedupResult {
    /** The item that was checked (passed through unchanged). */
    item: ClassifiedItem;
    /** True when the best similarity reached DEDUP_THRESHOLD. */
    isDuplicate: boolean;
    /** ID of the closest existing entry; set only when isDuplicate is true. */
    bestMatchId?: string;
    /** Highest cosine similarity found against existing entries (0 for an empty vault). */
    similarity: number;
}
/**
 * Check new items against existing vault entries for duplicates using TF-IDF cosine similarity.
 *
 * Builds a shared IDF vocabulary from all texts (existing + new), computes TF-IDF vectors,
 * and marks items as duplicates when cosine similarity >= DEDUP_THRESHOLD.
 */
export declare function dedupItems(items: ClassifiedItem[], vault: Vault): DedupResult[];
//# sourceMappingURL=dedup-gate.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedup-gate.d.ts","sourceRoot":"","sources":["../../src/intake/dedup-gate.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAEjD,eAAO,MAAM,eAAe,OAAO,CAAC;AAEpC,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,cAAc,CAAC;IACrB,WAAW,EAAE,OAAO,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;;;;GAKG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,cAAc,EAAE,EAAE,KAAK,EAAE,KAAK,GAAG,WAAW,EAAE,CA+D/E"}
@@ -0,0 +1,66 @@
1
+ // ─── Dedup Gate ───────────────────────────────────────────────────
2
+ // TF-IDF cosine similarity check against existing vault entries.
3
+ // Pure function: no side effects, no I/O beyond reading vault.
4
+ import { tokenize, calculateTfIdf, cosineSimilarity, } from '../text/similarity.js';
5
// Cosine-similarity score at or above which a new item is considered a
// duplicate of an existing vault entry.
export const DEDUP_THRESHOLD = 0.85;
6
/**
 * Check new items against existing vault entries for duplicates using TF-IDF cosine similarity.
 *
 * Builds a shared IDF vocabulary from all texts (existing + new), computes TF-IDF vectors,
 * and marks items as duplicates when cosine similarity >= DEDUP_THRESHOLD.
 */
export function dedupItems(items, vault) {
    const existingEntries = vault.exportAll().entries;
    // Fast path: nothing in vault — everything is new
    if (existingEntries.length === 0) {
        return items.map((item) => ({ item, isDuplicate: false, similarity: 0 }));
    }
    // ── Build texts for vocabulary ──────────────────────────────────
    const textOf = (x) => `${x.title} ${x.description}`;
    const existingTexts = existingEntries.map(textOf);
    const newTexts = items.map(textOf);
    const corpus = [...existingTexts, ...newTexts];
    // ── Count document frequency per term ───────────────────────────
    const docFreq = new Map();
    for (const doc of corpus) {
        for (const term of new Set(tokenize(doc))) {
            docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
        }
    }
    // ── Build smoothed-IDF vocabulary ───────────────────────────────
    const totalDocs = corpus.length;
    const vocabulary = new Map();
    docFreq.forEach((df, term) => {
        vocabulary.set(term, Math.log((totalDocs + 1) / (df + 1)) + 1);
    });
    // ── Compute TF-IDF vectors for existing entries ─────────────────
    const existingVectors = existingEntries.map((entry, i) => ({
        id: entry.id,
        vec: calculateTfIdf(tokenize(existingTexts[i]), vocabulary),
    }));
    // ── Score each new item against all existing entries ────────────
    return items.map((item, i) => {
        const itemVec = calculateTfIdf(tokenize(newTexts[i]), vocabulary);
        let best = 0;
        let bestId;
        for (const candidate of existingVectors) {
            const score = cosineSimilarity(itemVec, candidate.vec);
            if (score > best) {
                best = score;
                bestId = candidate.id;
            }
        }
        const isDuplicate = best >= DEDUP_THRESHOLD;
        return {
            item,
            isDuplicate,
            bestMatchId: isDuplicate ? bestId : undefined,
            similarity: best,
        };
    });
}
//# sourceMappingURL=dedup-gate.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dedup-gate.js","sourceRoot":"","sources":["../../src/intake/dedup-gate.ts"],"names":[],"mappings":"AAAA,qEAAqE;AACrE,iEAAiE;AACjE,+DAA+D;AAE/D,OAAO,EACL,QAAQ,EACR,cAAc,EACd,gBAAgB,GAEjB,MAAM,uBAAuB,CAAC;AAI/B,MAAM,CAAC,MAAM,eAAe,GAAG,IAAI,CAAC;AASpC;;;;;GAKG;AACH,MAAM,UAAU,UAAU,CAAC,KAAuB,EAAE,KAAY;IAC9D,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC,OAAO,CAAC;IAE3C,kDAAkD;IAClD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YAC1B,IAAI;YACJ,WAAW,EAAE,KAAK;YAClB,UAAU,EAAE,CAAC;SACd,CAAC,CAAC,CAAC;IACN,CAAC;IAED,mEAAmE;IACnE,MAAM,aAAa,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;IACjE,MAAM,QAAQ,GAAG,CAAC,GAAG,aAAa,EAAE,GAAG,QAAQ,CAAC,CAAC;IACjD,MAAM,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC;IAElC,mEAAmE;IACnE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC1C,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;QACtC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;IAED,mEAAmE;IACnE,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC7C,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,OAAO,EAAE,CAAC;QACjC,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACjE,CAAC;IAED,mEAAmE;IACnE,MAAM,eAAe,GAA6C,QAAQ,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;QAC9F,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,GAAG,EAAE,cAAc,CAAC,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC;KAC9D,CAAC,CAAC,CAAC;IAEJ,mEAAmE;IACnE,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE;QAC7B,MAAM,OAAO,GAAG,cAAc,CAAC,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;QAEpE,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,IAAI,WAA+B,CAAC;QAEpC,KAAK,MAAM,EAAE,EAAE,EAAE,GAAG,EA
AE,IAAI,eAAe,EAAE,CAAC;YAC1C,MAAM,GAAG,GAAG,gBAAgB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC3C,IAAI,GAAG,GAAG,cAAc,EAAE,CAAC;gBACzB,cAAc,GAAG,GAAG,CAAC;gBACrB,WAAW,GAAG,EAAE,CAAC;YACnB,CAAC;QACH,CAAC;QAED,MAAM,WAAW,GAAG,cAAc,IAAI,eAAe,CAAC;QAEtD,OAAO;YACL,IAAI;YACJ,WAAW;YACX,WAAW,EAAE,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;YAClD,UAAU,EAAE,cAAc;SAC3B,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,63 @@
1
import type { PersistenceProvider } from '../persistence/types.js';
import type { Vault } from '../vault/vault.js';
import type { LLMClient } from '../llm/llm-client.js';
import type { IntakeConfig, IntakeChunk, IntakeJobRecord, IntakePreviewResult } from './types.js';
/**
 * Split concatenated PDF text into per-page segments.
 *
 * Strategy: split on form-feed characters first (common in pdf-parse output).
 * If that yields fewer segments than expected, fall back to equal-length splits.
 */
export declare function splitIntoPages(text: string, numPages: number): string[];
/**
 * Resumable pipeline that ingests PDF books into the vault in six stages:
 * parse/hash/chunk (1), then per-chunk extract/classify/dedup/store (2-5),
 * then finalize (6). Job and chunk state is persisted via the provider so
 * processing can resume across runs.
 */
export declare class IntakePipeline {
    private provider;
    private vault;
    private llm;
    constructor(provider: PersistenceProvider, vault: Vault, llm: LLMClient);
    private initSchema;
    /**
     * Stage 1: parse a PDF, compute its file hash, create fixed-size page chunks,
     * and persist the job + chunk records to the database.
     */
    ingestBook(config: IntakeConfig): Promise<IntakeJobRecord>;
    /**
     * Process up to `count` pending chunks for a job.
     *
     * For each chunk (stage numbers continue from ingestBook, which is stage 1):
     * 2. Extract page text from PDF
     * 3. Classify via LLM
     * 4. Dedup against vault
     * 5. Store unique items
     *
     * When all chunks are done, finalizes the job (stage 6).
     */
    processChunks(jobId: string, count?: number): Promise<{
        processed: number;
        itemsStored: number;
        itemsDeduped: number;
        remaining: number;
    }>;
    /**
     * Parse a page range from a PDF and classify it without storing.
     * Useful for previewing what the pipeline would extract.
     */
    preview(config: IntakeConfig, pageStart: number, pageEnd: number): Promise<IntakePreviewResult>;
    /**
     * Get a job record by ID, or null when no such job exists.
     */
    getJob(jobId: string): IntakeJobRecord | null;
    /**
     * List all intake jobs.
     */
    listJobs(): IntakeJobRecord[];
    /**
     * Get all chunks for a job.
     */
    getChunks(jobId: string): IntakeChunk[];
    /**
     * Sum stats from all chunks and mark the job as completed.
     */
    private finalizeJob;
    private countPendingChunks;
}
//# sourceMappingURL=intake-pipeline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"intake-pipeline.d.ts","sourceRoot":"","sources":["../../src/intake/intake-pipeline.ts"],"names":[],"mappings":"AAcA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,yBAAyB,CAAC;AACnE,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAEtD,OAAO,KAAK,EACV,YAAY,EACZ,WAAW,EACX,eAAe,EACf,mBAAmB,EAGpB,MAAM,YAAY,CAAC;AAwBpB;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,CAoBvE;AAMD,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAsB;IACtC,OAAO,CAAC,KAAK,CAAQ;IACrB,OAAO,CAAC,GAAG,CAAY;gBAEX,QAAQ,EAAE,mBAAmB,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,SAAS;IASvE,OAAO,CAAC,UAAU;IAiClB;;;OAGG;IACG,UAAU,CAAC,MAAM,EAAE,YAAY,GAAG,OAAO,CAAC,eAAe,CAAC;IAgEhE;;;;;;;;;;OAUG;IACG,aAAa,CACjB,KAAK,EAAE,MAAM,EACb,KAAK,GAAE,MAAU,GAChB,OAAO,CAAC;QACT,SAAS,EAAE,MAAM,CAAC;QAClB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IAwHF;;;OAGG;IACG,OAAO,CACX,MAAM,EAAE,YAAY,EACpB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,mBAAmB,CAAC;IAqB/B;;OAEG;IACH,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI;IAQ7C;;OAEG;IACH,QAAQ,IAAI,eAAe,EAAE;IAO7B;;OAEG;IACH,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,WAAW,EAAE;IAUvC;;OAEG;IACH,OAAO,CAAC,WAAW;IAiCnB,OAAO,CAAC,kBAAkB;CAO3B"}
@@ -0,0 +1,373 @@
1
+ // ─── Intake Pipeline ──────────────────────────────────────────────
2
+ //
3
+ // 6-stage pipeline for ingesting PDF books into the vault:
4
+ // 1. Parse PDF + compute hash + create chunks → job record
5
+ // 2. Extract page text for each chunk
6
+ // 3. Classify chunk text via LLM
7
+ // 4. Dedup classified items against vault
8
+ // 5. Store unique items in vault
9
+ // 6. Finalize job with aggregate stats
10
+ //
11
+ // SQLite-backed job tracking for resumable processing.
12
+ import { createHash, randomUUID } from 'node:crypto';
13
+ import { readFileSync, statSync } from 'node:fs';
14
+ import { classifyChunk } from './content-classifier.js';
15
+ import { dedupItems } from './dedup-gate.js';
16
+ // =============================================================================
17
+ // CONSTANTS
18
+ // =============================================================================
19
// Pages per chunk when IntakeConfig.chunkPageSize is not provided.
const DEFAULT_CHUNK_SIZE = 10;
/**
 * Map KnowledgeType → IntelligenceEntry.type.
 * Only 'pattern' and 'anti-pattern' map directly; everything else becomes 'rule'.
 */
function mapKnowledgeType(kt) {
    switch (kt) {
        case 'pattern':
        case 'anti-pattern':
            // These two knowledge types have a 1:1 entry type.
            return kt;
        default:
            // All other knowledge types collapse into the generic 'rule'.
            return 'rule';
    }
}
31
+ // =============================================================================
32
+ // HELPERS
33
+ // =============================================================================
34
/**
 * Split concatenated PDF text into per-page segments.
 *
 * Strategy: split on form-feed characters first (common in pdf-parse output).
 * If that yields fewer segments than expected, fall back to equal-length splits.
 * The result always has exactly `numPages` entries (extra form-feed segments
 * are dropped; missing ones are padded with empty strings), except that a
 * non-positive `numPages` returns the whole text as a single segment.
 */
export function splitIntoPages(text, numPages) {
    if (numPages <= 0) {
        return [text];
    }
    // Preferred path: form feeds mark page boundaries.
    const byFormFeed = text.split('\f');
    if (byFormFeed.length >= numPages) {
        return byFormFeed.slice(0, numPages);
    }
    // Fallback: carve the text into numPages equal-length segments.
    const segmentLength = Math.ceil(text.length / numPages);
    const segments = [];
    for (let offset = 0; offset < text.length; offset += segmentLength) {
        segments.push(text.slice(offset, offset + segmentLength));
    }
    // Guarantee the promised length even for short or empty input.
    while (segments.length < numPages) {
        segments.push('');
    }
    return segments;
}
60
+ // =============================================================================
61
+ // PIPELINE
62
+ // =============================================================================
63
/**
 * SQLite-backed, resumable pipeline that ingests PDF books into the vault.
 *
 * Stage 1 (ingestBook) records the job and its fixed-size page chunks;
 * stages 2-5 (processChunks) extract, classify, dedup, and store each chunk;
 * stage 6 (finalizeJob) aggregates stats once no pending chunks remain.
 * All timestamps are stored as epoch seconds (INTEGER columns).
 */
export class IntakePipeline {
    provider; // persistence layer for the intake_jobs / intake_chunks tables
    vault; // destination store for classified entries
    llm; // LLM client handed to classifyChunk
    /**
     * @param provider - SQL persistence provider (execSql/run/get/all/transaction).
     * @param vault - Vault receiving IntelligenceEntry items via `add`.
     * @param llm - LLM client used for classification.
     */
    constructor(provider, vault, llm) {
        this.provider = provider;
        this.vault = vault;
        this.llm = llm;
        // Tables are created eagerly so every later method can assume they exist.
        this.initSchema();
    }
    // ─── Schema ──────────────────────────────────────────────────────
    /** Create the job/chunk tracking tables if they do not already exist. */
    initSchema() {
        this.provider.execSql(`
      CREATE TABLE IF NOT EXISTS intake_jobs (
        id TEXT PRIMARY KEY,
        status TEXT NOT NULL,
        config TEXT NOT NULL,
        pdf_meta TEXT,
        toc TEXT,
        stats TEXT,
        created_at INTEGER,
        updated_at INTEGER,
        completed_at INTEGER
      );

      CREATE TABLE IF NOT EXISTS intake_chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        job_id TEXT NOT NULL REFERENCES intake_jobs(id),
        chunk_index INTEGER,
        title TEXT,
        page_start INTEGER,
        page_end INTEGER,
        status TEXT DEFAULT 'pending',
        items_extracted INTEGER DEFAULT 0,
        items_stored INTEGER DEFAULT 0,
        items_deduped INTEGER DEFAULT 0,
        error TEXT,
        processed_at INTEGER
      );
    `);
    }
    // ─── Stage 1: Ingest Book ────────────────────────────────────────
    /**
     * Parse a PDF, compute its file hash, create fixed-size page chunks,
     * and persist the job + chunk records to the database.
     *
     * @param config - Intake settings; `chunkPageSize` defaults to DEFAULT_CHUNK_SIZE.
     * @returns The freshly inserted job record (non-null assumed: the row was
     *   just written inside the transaction).
     */
    async ingestBook(config) {
        const jobId = randomUUID();
        const now = Math.floor(Date.now() / 1000); // epoch seconds
        const chunkPageSize = config.chunkPageSize ?? DEFAULT_CHUNK_SIZE;
        // Read file once; hash the same buffer we parse.
        const fileBuffer = readFileSync(config.pdfPath);
        const fileSize = statSync(config.pdfPath).size;
        const fileHash = createHash('sha256').update(fileBuffer).digest('hex');
        // Dynamic import of pdf-parse — loaded lazily so the dependency is only
        // paid for when a PDF is actually ingested.
        const pdfParse = (await import('pdf-parse')).default;
        const pdfData = await pdfParse(fileBuffer);
        const totalPages = pdfData.numpages;
        const pdfMeta = { totalPages, fileHash, fileSize };
        // Create chunk definitions (fixed N-page windows)
        const numChunks = Math.ceil(totalPages / chunkPageSize);
        // Job row and all chunk rows are written atomically.
        this.provider.transaction(() => {
            // Insert job
            this.provider.run(`INSERT INTO intake_jobs (id, status, config, pdf_meta, toc, stats, created_at, updated_at, completed_at)
         VALUES (@id, @status, @config, @pdfMeta, @toc, @stats, @createdAt, @updatedAt, @completedAt)`, {
                id: jobId,
                status: 'initialized',
                config: JSON.stringify(config),
                pdfMeta: JSON.stringify(pdfMeta),
                toc: null,
                stats: null,
                createdAt: now,
                updatedAt: now,
                completedAt: null,
            });
            // Insert chunk records (page ranges are 1-indexed and inclusive)
            for (let i = 0; i < numChunks; i++) {
                const pageStart = i * chunkPageSize + 1;
                const pageEnd = Math.min((i + 1) * chunkPageSize, totalPages);
                const chunkTitle = `${config.title} — pages ${pageStart}-${pageEnd}`;
                this.provider.run(`INSERT INTO intake_chunks (job_id, chunk_index, title, page_start, page_end, status)
           VALUES (@jobId, @chunkIndex, @title, @pageStart, @pageEnd, @status)`, {
                    jobId,
                    chunkIndex: i,
                    title: chunkTitle,
                    pageStart,
                    pageEnd,
                    status: 'pending',
                });
            }
        });
        return this.getJob(jobId);
    }
    // ─── Stages 2-5: Process Chunks ──────────────────────────────────
    /**
     * Process up to `count` pending chunks for a job.
     *
     * For each chunk:
     * 2. Extract page text from PDF
     * 3. Classify via LLM
     * 4. Dedup against vault
     * 5. Store unique items
     *
     * A chunk that throws is marked 'failed' (with its error message) and
     * processing continues with the remaining chunks. When all chunks are
     * done, finalizes the job (stage 6).
     */
    async processChunks(jobId, count = 5) {
        // Get pending chunks, lowest chunk_index first, capped at `count`.
        const pendingChunks = this.provider.all(`SELECT * FROM intake_chunks WHERE job_id = @jobId AND status = 'pending' ORDER BY chunk_index ASC LIMIT @limit`, { jobId, limit: count });
        if (pendingChunks.length === 0) {
            const remaining = this.countPendingChunks(jobId);
            return { processed: 0, itemsStored: 0, itemsDeduped: 0, remaining };
        }
        // Mark job as processing
        this.provider.run(`UPDATE intake_jobs SET status = 'processing', updated_at = @now WHERE id = @id`, { id: jobId, now: Math.floor(Date.now() / 1000) });
        // Re-read config and parse PDF — the PDF is re-parsed on every call so
        // the method is resumable across process restarts.
        const job = this.getJob(jobId);
        if (!job) {
            return { processed: 0, itemsStored: 0, itemsDeduped: 0, remaining: 0 };
        }
        const fileBuffer = readFileSync(job.config.pdfPath);
        const pdfParse = (await import('pdf-parse')).default;
        const pdfData = await pdfParse(fileBuffer);
        // Prefer the page count recorded at ingest time; fall back to the fresh parse.
        const totalPages = job.pdfMeta?.totalPages ?? pdfData.numpages;
        const pages = splitIntoPages(pdfData.text, totalPages);
        let totalStored = 0;
        let totalDeduped = 0;
        let processed = 0;
        for (const chunkRow of pendingChunks) {
            const chunkId = chunkRow.id;
            const chunkIndex = chunkRow.chunk_index;
            const pageStart = chunkRow.page_start;
            const pageEnd = chunkRow.page_end;
            try {
                // Mark chunk processing
                this.provider.run(`UPDATE intake_chunks SET status = 'processing' WHERE id = @id`, {
                    id: chunkId,
                });
                // Stage 2: Extract page text (1-indexed → 0-indexed)
                const chunkText = pages.slice(pageStart - 1, pageEnd).join('\n\n');
                const citation = `${job.config.title}, pages ${pageStart}-${pageEnd}`;
                // Stage 3: Classify
                const classifiedItems = await classifyChunk(this.llm, chunkText, citation);
                // Stage 4: Dedup
                const dedupResults = dedupItems(classifiedItems, this.vault);
                const uniqueItems = dedupResults.filter((r) => !r.isDuplicate);
                const dupCount = dedupResults.filter((r) => r.isDuplicate).length;
                // Stage 5: Store unique items in vault
                let storedCount = 0;
                for (let itemIdx = 0; itemIdx < uniqueItems.length; itemIdx++) {
                    const result = uniqueItems[itemIdx];
                    const entry = classifiedItemToEntry(result.item, job.config.domain, jobId, chunkIndex, itemIdx, job.config.tags);
                    this.vault.add(entry);
                    storedCount++;
                }
                // Update chunk record with per-chunk stats
                const now = Math.floor(Date.now() / 1000);
                this.provider.run(`UPDATE intake_chunks
             SET status = 'completed', items_extracted = @extracted, items_stored = @stored, items_deduped = @deduped, processed_at = @now
           WHERE id = @id`, {
                    id: chunkId,
                    extracted: classifiedItems.length,
                    stored: storedCount,
                    deduped: dupCount,
                    now,
                });
                totalStored += storedCount;
                totalDeduped += dupCount;
                processed++;
            }
            catch (err) {
                // Graceful degradation: mark chunk as failed, continue with others.
                // NOTE: a failed chunk still counts toward `processed`.
                const errorMsg = err instanceof Error ? err.message : String(err);
                this.provider.run(`UPDATE intake_chunks SET status = 'failed', error = @error, processed_at = @now WHERE id = @id`, { id: chunkId, error: errorMsg, now: Math.floor(Date.now() / 1000) });
                processed++;
            }
        }
        // Update job timestamp
        this.provider.run(`UPDATE intake_jobs SET updated_at = @now WHERE id = @id`, {
            id: jobId,
            now: Math.floor(Date.now() / 1000),
        });
        // Check remaining; zero pending means the batch completed the job.
        const remaining = this.countPendingChunks(jobId);
        if (remaining === 0) {
            this.finalizeJob(jobId);
        }
        return { processed, itemsStored: totalStored, itemsDeduped: totalDeduped, remaining };
    }
    // ─── Preview ─────────────────────────────────────────────────────
    /**
     * Parse a page range from a PDF and classify it without storing.
     * Useful for previewing what the pipeline would extract.
     *
     * @param config - Intake settings (pdfPath and title are used here).
     * @param pageStart - First page, 1-indexed inclusive.
     * @param pageEnd - Last page, inclusive.
     */
    async preview(config, pageStart, pageEnd) {
        const fileBuffer = readFileSync(config.pdfPath);
        const pdfParse = (await import('pdf-parse')).default;
        const pdfData = await pdfParse(fileBuffer);
        const totalPages = pdfData.numpages;
        const pages = splitIntoPages(pdfData.text, totalPages);
        const chunkText = pages.slice(pageStart - 1, pageEnd).join('\n\n');
        const citation = `${config.title}, pages ${pageStart}-${pageEnd}`;
        const items = await classifyChunk(this.llm, chunkText, citation);
        return {
            items,
            chunkText,
            pageRange: { start: pageStart, end: pageEnd },
        };
    }
    // ─── Queries ─────────────────────────────────────────────────────
    /**
     * Get a job record by ID. Returns null when the ID is unknown.
     */
    getJob(jobId) {
        const row = this.provider.get('SELECT * FROM intake_jobs WHERE id = @id', { id: jobId });
        return row ? rowToJobRecord(row) : null;
    }
    /**
     * List all intake jobs, most recently created first.
     */
    listJobs() {
        const rows = this.provider.all('SELECT * FROM intake_jobs ORDER BY created_at DESC');
        return rows.map(rowToJobRecord);
    }
    /**
     * Get all chunks for a job, ordered by chunk index.
     */
    getChunks(jobId) {
        const rows = this.provider.all('SELECT * FROM intake_chunks WHERE job_id = @jobId ORDER BY chunk_index ASC', { jobId });
        return rows.map(rowToChunk);
    }
    // ─── Stage 6: Finalize ──────────────────────────────────────────
    /**
     * Sum stats from all chunks and mark the job as completed.
     * `itemsFailed` counts failed CHUNKS, not individual items.
     */
    finalizeJob(jobId) {
        const chunks = this.provider.all('SELECT * FROM intake_chunks WHERE job_id = @jobId', { jobId });
        let itemsExtracted = 0;
        let itemsStored = 0;
        let itemsDeduped = 0;
        let itemsFailed = 0;
        for (const chunk of chunks) {
            const status = chunk.status;
            if (status === 'completed') {
                itemsExtracted += chunk.items_extracted ?? 0;
                itemsStored += chunk.items_stored ?? 0;
                itemsDeduped += chunk.items_deduped ?? 0;
            }
            else if (status === 'failed') {
                itemsFailed++;
            }
        }
        const stats = { itemsExtracted, itemsStored, itemsDeduped, itemsFailed };
        const now = Math.floor(Date.now() / 1000);
        this.provider.run(`UPDATE intake_jobs SET status = 'completed', stats = @stats, updated_at = @now, completed_at = @now WHERE id = @id`, { id: jobId, stats: JSON.stringify(stats), now });
    }
    // ─── Private helpers ─────────────────────────────────────────────
    /** Number of chunks for `jobId` still in 'pending' status. */
    countPendingChunks(jobId) {
        const result = this.provider.get(`SELECT COUNT(*) as count FROM intake_chunks WHERE job_id = @jobId AND status = 'pending'`, { jobId });
        return result?.count ?? 0;
    }
}
324
+ // =============================================================================
325
+ // ROW MAPPERS
326
+ // =============================================================================
327
/**
 * Map a raw intake_jobs row (snake_case columns, JSON stored as TEXT)
 * to an IntakeJobRecord with camelCase keys and parsed JSON fields.
 */
function rowToJobRecord(row) {
    // Nullable JSON columns: parse when present, otherwise null.
    const parseOrNull = (raw) => (raw ? JSON.parse(raw) : null);
    return {
        id: row.id,
        status: row.status,
        config: JSON.parse(row.config), // NOT NULL column — always present
        pdfMeta: parseOrNull(row.pdf_meta),
        toc: parseOrNull(row.toc),
        stats: parseOrNull(row.stats),
        createdAt: row.created_at,
        updatedAt: row.updated_at,
        completedAt: row.completed_at ?? null,
    };
}
340
/**
 * Map a raw intake_chunks row (snake_case columns) to an IntakeChunk
 * with camelCase keys; counters default to 0, nullable fields to null.
 */
function rowToChunk(row) {
    const orZero = (value) => value ?? 0;
    return {
        id: row.id,
        jobId: row.job_id,
        chunkIndex: row.chunk_index,
        title: row.title ?? null,
        pageStart: row.page_start,
        pageEnd: row.page_end,
        status: row.status,
        itemsExtracted: orZero(row.items_extracted),
        itemsStored: orZero(row.items_stored),
        itemsDeduped: orZero(row.items_deduped),
        error: row.error ?? null,
        processedAt: row.processed_at ?? null,
    };
}
356
/**
 * Convert a ClassifiedItem to an IntelligenceEntry for vault storage.
 * The entry ID is built purely from the job ID, chunk index, and item
 * index, so it encodes where in the pipeline the item came from.
 */
function classifiedItemToEntry(item, domain, jobId, chunkIndex, itemIndex, extraTags) {
    // Item tags first, then any job-level extra tags (absent → none).
    const combinedTags = [...item.tags, ...(extraTags ?? [])];
    return {
        id: `intake-${jobId}-${chunkIndex}-${itemIndex}`,
        type: mapKnowledgeType(item.type),
        domain,
        title: item.title,
        severity: item.severity,
        description: item.description,
        context: item.citation,
        tags: combinedTags,
    };
}
373
+ //# sourceMappingURL=intake-pipeline.js.map