@soleri/core 2.4.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/brain/brain.d.ts +7 -0
- package/dist/brain/brain.d.ts.map +1 -1
- package/dist/brain/brain.js +56 -9
- package/dist/brain/brain.js.map +1 -1
- package/dist/brain/types.d.ts +2 -2
- package/dist/brain/types.d.ts.map +1 -1
- package/dist/cognee/client.d.ts +3 -0
- package/dist/cognee/client.d.ts.map +1 -1
- package/dist/cognee/client.js +17 -0
- package/dist/cognee/client.js.map +1 -1
- package/dist/cognee/sync-manager.d.ts +94 -0
- package/dist/cognee/sync-manager.d.ts.map +1 -0
- package/dist/cognee/sync-manager.js +293 -0
- package/dist/cognee/sync-manager.js.map +1 -0
- package/dist/curator/curator.d.ts +8 -1
- package/dist/curator/curator.d.ts.map +1 -1
- package/dist/curator/curator.js +64 -1
- package/dist/curator/curator.js.map +1 -1
- package/dist/errors/classify.d.ts +13 -0
- package/dist/errors/classify.d.ts.map +1 -0
- package/dist/errors/classify.js +97 -0
- package/dist/errors/classify.js.map +1 -0
- package/dist/errors/index.d.ts +6 -0
- package/dist/errors/index.d.ts.map +1 -0
- package/dist/errors/index.js +4 -0
- package/dist/errors/index.js.map +1 -0
- package/dist/errors/retry.d.ts +40 -0
- package/dist/errors/retry.d.ts.map +1 -0
- package/dist/errors/retry.js +97 -0
- package/dist/errors/retry.js.map +1 -0
- package/dist/errors/types.d.ts +48 -0
- package/dist/errors/types.d.ts.map +1 -0
- package/dist/errors/types.js +59 -0
- package/dist/errors/types.js.map +1 -0
- package/dist/index.d.ts +25 -5
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +21 -3
- package/dist/index.js.map +1 -1
- package/dist/intake/content-classifier.d.ts +14 -0
- package/dist/intake/content-classifier.d.ts.map +1 -0
- package/dist/intake/content-classifier.js +125 -0
- package/dist/intake/content-classifier.js.map +1 -0
- package/dist/intake/dedup-gate.d.ts +17 -0
- package/dist/intake/dedup-gate.d.ts.map +1 -0
- package/dist/intake/dedup-gate.js +66 -0
- package/dist/intake/dedup-gate.js.map +1 -0
- package/dist/intake/intake-pipeline.d.ts +63 -0
- package/dist/intake/intake-pipeline.d.ts.map +1 -0
- package/dist/intake/intake-pipeline.js +373 -0
- package/dist/intake/intake-pipeline.js.map +1 -0
- package/dist/intake/types.d.ts +65 -0
- package/dist/intake/types.d.ts.map +1 -0
- package/dist/intake/types.js +3 -0
- package/dist/intake/types.js.map +1 -0
- package/dist/intelligence/loader.js +1 -1
- package/dist/intelligence/loader.js.map +1 -1
- package/dist/intelligence/types.d.ts +3 -1
- package/dist/intelligence/types.d.ts.map +1 -1
- package/dist/loop/loop-manager.d.ts +58 -7
- package/dist/loop/loop-manager.d.ts.map +1 -1
- package/dist/loop/loop-manager.js +280 -6
- package/dist/loop/loop-manager.js.map +1 -1
- package/dist/loop/types.d.ts +69 -1
- package/dist/loop/types.d.ts.map +1 -1
- package/dist/loop/types.js +4 -1
- package/dist/loop/types.js.map +1 -1
- package/dist/persistence/index.d.ts +3 -0
- package/dist/persistence/index.d.ts.map +1 -0
- package/dist/persistence/index.js +2 -0
- package/dist/persistence/index.js.map +1 -0
- package/dist/persistence/sqlite-provider.d.ts +25 -0
- package/dist/persistence/sqlite-provider.d.ts.map +1 -0
- package/dist/persistence/sqlite-provider.js +59 -0
- package/dist/persistence/sqlite-provider.js.map +1 -0
- package/dist/persistence/types.d.ts +36 -0
- package/dist/persistence/types.d.ts.map +1 -0
- package/dist/persistence/types.js +8 -0
- package/dist/persistence/types.js.map +1 -0
- package/dist/planning/gap-analysis.d.ts +47 -4
- package/dist/planning/gap-analysis.d.ts.map +1 -1
- package/dist/planning/gap-analysis.js +190 -13
- package/dist/planning/gap-analysis.js.map +1 -1
- package/dist/planning/gap-types.d.ts +1 -1
- package/dist/planning/gap-types.d.ts.map +1 -1
- package/dist/planning/gap-types.js.map +1 -1
- package/dist/planning/planner.d.ts +277 -9
- package/dist/planning/planner.d.ts.map +1 -1
- package/dist/planning/planner.js +611 -46
- package/dist/planning/planner.js.map +1 -1
- package/dist/playbooks/generic/brainstorming.d.ts +9 -0
- package/dist/playbooks/generic/brainstorming.d.ts.map +1 -0
- package/dist/playbooks/generic/brainstorming.js +105 -0
- package/dist/playbooks/generic/brainstorming.js.map +1 -0
- package/dist/playbooks/generic/code-review.d.ts +11 -0
- package/dist/playbooks/generic/code-review.d.ts.map +1 -0
- package/dist/playbooks/generic/code-review.js +176 -0
- package/dist/playbooks/generic/code-review.js.map +1 -0
- package/dist/playbooks/generic/subagent-execution.d.ts +9 -0
- package/dist/playbooks/generic/subagent-execution.d.ts.map +1 -0
- package/dist/playbooks/generic/subagent-execution.js +68 -0
- package/dist/playbooks/generic/subagent-execution.js.map +1 -0
- package/dist/playbooks/generic/systematic-debugging.d.ts +9 -0
- package/dist/playbooks/generic/systematic-debugging.d.ts.map +1 -0
- package/dist/playbooks/generic/systematic-debugging.js +87 -0
- package/dist/playbooks/generic/systematic-debugging.js.map +1 -0
- package/dist/playbooks/generic/tdd.d.ts +9 -0
- package/dist/playbooks/generic/tdd.d.ts.map +1 -0
- package/dist/playbooks/generic/tdd.js +70 -0
- package/dist/playbooks/generic/tdd.js.map +1 -0
- package/dist/playbooks/generic/verification.d.ts +9 -0
- package/dist/playbooks/generic/verification.d.ts.map +1 -0
- package/dist/playbooks/generic/verification.js +74 -0
- package/dist/playbooks/generic/verification.js.map +1 -0
- package/dist/playbooks/index.d.ts +4 -0
- package/dist/playbooks/index.d.ts.map +1 -0
- package/dist/playbooks/index.js +5 -0
- package/dist/playbooks/index.js.map +1 -0
- package/dist/playbooks/playbook-registry.d.ts +42 -0
- package/dist/playbooks/playbook-registry.d.ts.map +1 -0
- package/dist/playbooks/playbook-registry.js +227 -0
- package/dist/playbooks/playbook-registry.js.map +1 -0
- package/dist/playbooks/playbook-seeder.d.ts +47 -0
- package/dist/playbooks/playbook-seeder.d.ts.map +1 -0
- package/dist/playbooks/playbook-seeder.js +104 -0
- package/dist/playbooks/playbook-seeder.js.map +1 -0
- package/dist/playbooks/playbook-types.d.ts +132 -0
- package/dist/playbooks/playbook-types.d.ts.map +1 -0
- package/dist/playbooks/playbook-types.js +12 -0
- package/dist/playbooks/playbook-types.js.map +1 -0
- package/dist/project/project-registry.d.ts.map +1 -1
- package/dist/project/project-registry.js +9 -11
- package/dist/project/project-registry.js.map +1 -1
- package/dist/prompts/index.d.ts +4 -0
- package/dist/prompts/index.d.ts.map +1 -0
- package/dist/prompts/index.js +3 -0
- package/dist/prompts/index.js.map +1 -0
- package/dist/prompts/parser.d.ts +17 -0
- package/dist/prompts/parser.d.ts.map +1 -0
- package/dist/prompts/parser.js +47 -0
- package/dist/prompts/parser.js.map +1 -0
- package/dist/prompts/template-manager.d.ts +25 -0
- package/dist/prompts/template-manager.d.ts.map +1 -0
- package/dist/prompts/template-manager.js +71 -0
- package/dist/prompts/template-manager.js.map +1 -0
- package/dist/prompts/types.d.ts +26 -0
- package/dist/prompts/types.d.ts.map +1 -0
- package/dist/prompts/types.js +5 -0
- package/dist/prompts/types.js.map +1 -0
- package/dist/runtime/admin-extra-ops.d.ts +5 -3
- package/dist/runtime/admin-extra-ops.d.ts.map +1 -1
- package/dist/runtime/admin-extra-ops.js +322 -11
- package/dist/runtime/admin-extra-ops.js.map +1 -1
- package/dist/runtime/admin-ops.d.ts.map +1 -1
- package/dist/runtime/admin-ops.js +10 -3
- package/dist/runtime/admin-ops.js.map +1 -1
- package/dist/runtime/capture-ops.d.ts.map +1 -1
- package/dist/runtime/capture-ops.js +20 -2
- package/dist/runtime/capture-ops.js.map +1 -1
- package/dist/runtime/cognee-sync-ops.d.ts +12 -0
- package/dist/runtime/cognee-sync-ops.d.ts.map +1 -0
- package/dist/runtime/cognee-sync-ops.js +55 -0
- package/dist/runtime/cognee-sync-ops.js.map +1 -0
- package/dist/runtime/core-ops.d.ts +8 -6
- package/dist/runtime/core-ops.d.ts.map +1 -1
- package/dist/runtime/core-ops.js +226 -9
- package/dist/runtime/core-ops.js.map +1 -1
- package/dist/runtime/curator-extra-ops.d.ts +2 -2
- package/dist/runtime/curator-extra-ops.d.ts.map +1 -1
- package/dist/runtime/curator-extra-ops.js +15 -3
- package/dist/runtime/curator-extra-ops.js.map +1 -1
- package/dist/runtime/domain-ops.js +2 -2
- package/dist/runtime/domain-ops.js.map +1 -1
- package/dist/runtime/grading-ops.d.ts.map +1 -1
- package/dist/runtime/grading-ops.js.map +1 -1
- package/dist/runtime/intake-ops.d.ts +14 -0
- package/dist/runtime/intake-ops.d.ts.map +1 -0
- package/dist/runtime/intake-ops.js +110 -0
- package/dist/runtime/intake-ops.js.map +1 -0
- package/dist/runtime/loop-ops.d.ts +5 -4
- package/dist/runtime/loop-ops.d.ts.map +1 -1
- package/dist/runtime/loop-ops.js +84 -12
- package/dist/runtime/loop-ops.js.map +1 -1
- package/dist/runtime/memory-cross-project-ops.d.ts.map +1 -1
- package/dist/runtime/memory-cross-project-ops.js.map +1 -1
- package/dist/runtime/memory-extra-ops.js +5 -5
- package/dist/runtime/memory-extra-ops.js.map +1 -1
- package/dist/runtime/orchestrate-ops.d.ts.map +1 -1
- package/dist/runtime/orchestrate-ops.js +8 -2
- package/dist/runtime/orchestrate-ops.js.map +1 -1
- package/dist/runtime/planning-extra-ops.d.ts +13 -5
- package/dist/runtime/planning-extra-ops.d.ts.map +1 -1
- package/dist/runtime/planning-extra-ops.js +381 -18
- package/dist/runtime/planning-extra-ops.js.map +1 -1
- package/dist/runtime/playbook-ops.d.ts +14 -0
- package/dist/runtime/playbook-ops.d.ts.map +1 -0
- package/dist/runtime/playbook-ops.js +141 -0
- package/dist/runtime/playbook-ops.js.map +1 -0
- package/dist/runtime/project-ops.d.ts.map +1 -1
- package/dist/runtime/project-ops.js +7 -2
- package/dist/runtime/project-ops.js.map +1 -1
- package/dist/runtime/runtime.d.ts.map +1 -1
- package/dist/runtime/runtime.js +27 -8
- package/dist/runtime/runtime.js.map +1 -1
- package/dist/runtime/types.d.ts +8 -0
- package/dist/runtime/types.d.ts.map +1 -1
- package/dist/runtime/vault-extra-ops.d.ts +3 -2
- package/dist/runtime/vault-extra-ops.d.ts.map +1 -1
- package/dist/runtime/vault-extra-ops.js +345 -4
- package/dist/runtime/vault-extra-ops.js.map +1 -1
- package/dist/vault/playbook.d.ts +34 -0
- package/dist/vault/playbook.d.ts.map +1 -0
- package/dist/vault/playbook.js +60 -0
- package/dist/vault/playbook.js.map +1 -0
- package/dist/vault/vault.d.ts +31 -32
- package/dist/vault/vault.d.ts.map +1 -1
- package/dist/vault/vault.js +201 -181
- package/dist/vault/vault.js.map +1 -1
- package/package.json +7 -3
- package/src/__tests__/admin-extra-ops.test.ts +62 -15
- package/src/__tests__/admin-ops.test.ts +2 -2
- package/src/__tests__/brain.test.ts +3 -3
- package/src/__tests__/cognee-integration.test.ts +80 -0
- package/src/__tests__/cognee-sync-manager.test.ts +103 -0
- package/src/__tests__/core-ops.test.ts +30 -4
- package/src/__tests__/curator-extra-ops.test.ts +24 -2
- package/src/__tests__/errors.test.ts +388 -0
- package/src/__tests__/grading-ops.test.ts +28 -7
- package/src/__tests__/intake-pipeline.test.ts +162 -0
- package/src/__tests__/loop-ops.test.ts +74 -3
- package/src/__tests__/memory-cross-project-ops.test.ts +3 -1
- package/src/__tests__/orchestrate-ops.test.ts +8 -3
- package/src/__tests__/persistence.test.ts +225 -0
- package/src/__tests__/planner.test.ts +99 -21
- package/src/__tests__/planning-extra-ops.test.ts +168 -10
- package/src/__tests__/playbook-registry.test.ts +326 -0
- package/src/__tests__/playbook-seeder.test.ts +163 -0
- package/src/__tests__/playbook.test.ts +389 -0
- package/src/__tests__/project-ops.test.ts +18 -4
- package/src/__tests__/template-manager.test.ts +222 -0
- package/src/__tests__/vault-extra-ops.test.ts +82 -7
- package/src/brain/brain.ts +71 -9
- package/src/brain/types.ts +2 -2
- package/src/cognee/client.ts +18 -0
- package/src/cognee/sync-manager.ts +389 -0
- package/src/curator/curator.ts +88 -7
- package/src/errors/classify.ts +102 -0
- package/src/errors/index.ts +5 -0
- package/src/errors/retry.ts +132 -0
- package/src/errors/types.ts +81 -0
- package/src/index.ts +114 -3
- package/src/intake/content-classifier.ts +146 -0
- package/src/intake/dedup-gate.ts +92 -0
- package/src/intake/intake-pipeline.ts +503 -0
- package/src/intake/types.ts +69 -0
- package/src/intelligence/loader.ts +1 -1
- package/src/intelligence/types.ts +3 -1
- package/src/loop/loop-manager.ts +325 -7
- package/src/loop/types.ts +72 -1
- package/src/persistence/index.ts +7 -0
- package/src/persistence/sqlite-provider.ts +62 -0
- package/src/persistence/types.ts +44 -0
- package/src/planning/gap-analysis.ts +286 -17
- package/src/planning/gap-types.ts +4 -1
- package/src/planning/planner.ts +828 -55
- package/src/playbooks/generic/brainstorming.ts +110 -0
- package/src/playbooks/generic/code-review.ts +181 -0
- package/src/playbooks/generic/subagent-execution.ts +74 -0
- package/src/playbooks/generic/systematic-debugging.ts +92 -0
- package/src/playbooks/generic/tdd.ts +75 -0
- package/src/playbooks/generic/verification.ts +79 -0
- package/src/playbooks/index.ts +27 -0
- package/src/playbooks/playbook-registry.ts +284 -0
- package/src/playbooks/playbook-seeder.ts +119 -0
- package/src/playbooks/playbook-types.ts +162 -0
- package/src/project/project-registry.ts +29 -17
- package/src/prompts/index.ts +3 -0
- package/src/prompts/parser.ts +59 -0
- package/src/prompts/template-manager.ts +77 -0
- package/src/prompts/types.ts +28 -0
- package/src/runtime/admin-extra-ops.ts +358 -13
- package/src/runtime/admin-ops.ts +17 -6
- package/src/runtime/capture-ops.ts +25 -6
- package/src/runtime/cognee-sync-ops.ts +63 -0
- package/src/runtime/core-ops.ts +258 -8
- package/src/runtime/curator-extra-ops.ts +17 -3
- package/src/runtime/domain-ops.ts +2 -2
- package/src/runtime/grading-ops.ts +11 -2
- package/src/runtime/intake-ops.ts +126 -0
- package/src/runtime/loop-ops.ts +96 -13
- package/src/runtime/memory-cross-project-ops.ts +1 -2
- package/src/runtime/memory-extra-ops.ts +5 -5
- package/src/runtime/orchestrate-ops.ts +8 -2
- package/src/runtime/planning-extra-ops.ts +414 -23
- package/src/runtime/playbook-ops.ts +169 -0
- package/src/runtime/project-ops.ts +9 -3
- package/src/runtime/runtime.ts +35 -9
- package/src/runtime/types.ts +8 -0
- package/src/runtime/vault-extra-ops.ts +385 -4
- package/src/vault/playbook.ts +87 -0
- package/src/vault/vault.ts +301 -235
package/dist/intake/content-classifier.js
@@ -0,0 +1,125 @@
// ─── Content Classifier — LLM-based knowledge extraction ────────────────────
//
// Takes a text chunk and uses an LLM to classify it into structured
// knowledge items. Graceful degradation: returns [] on any error.
// =============================================================================
// CONSTANTS
// =============================================================================
export const VALID_TYPES = [
    'pattern',
    'anti-pattern',
    'principle',
    'concept',
    'reference',
    'workflow',
    'idea',
    'roadmap',
];
const VALID_SEVERITIES = ['critical', 'warning', 'suggestion'];
export const CLASSIFICATION_PROMPT = `You are a knowledge extraction engine. Your job is to analyze a text chunk and extract structured knowledge items from it.

For each distinct piece of knowledge you identify, produce an object with these fields:
- type: one of ${JSON.stringify(VALID_TYPES)}
- title: concise title, max 80 characters
- description: 2-3 sentence summary of the knowledge
- tags: 3-5 lowercase single-word or hyphenated tags
- severity: one of "critical", "warning", "suggestion"

Rules:
- Extract ALL meaningful knowledge items from the text.
- Each item must be self-contained and independently useful.
- Use "critical" for must-know items, "warning" for important gotchas, "suggestion" for nice-to-know.
- Tags should be specific and useful for search.
- Respond with a pure JSON array of objects. No markdown fences, no explanation, no wrapping.
- If the text contains no extractable knowledge, respond with an empty array: []`;
// =============================================================================
// CLASSIFIER
// =============================================================================
/**
 * Classify a text chunk into structured knowledge items using an LLM.
 *
 * @param llm - LLMClient instance
 * @param chunkText - The text to classify
 * @param citation - Source citation (e.g. "book.pdf, pages 12-15")
 * @returns Classified items, or [] on any error
 */
export async function classifyChunk(llm, chunkText, citation) {
    try {
        const result = await llm.complete({
            provider: 'openai',
            model: 'gpt-4o-mini',
            systemPrompt: CLASSIFICATION_PROMPT,
            userPrompt: chunkText,
            maxTokens: 4096,
            temperature: 0.3,
            caller: 'intake',
            task: 'classify',
        });
        const raw = parseJsonResponse(result.text);
        if (!Array.isArray(raw))
            return [];
        return raw
            .map((item) => sanitizeItem(item, citation))
            .filter((item) => item !== null);
    }
    catch {
        // Graceful degradation — never throw
        return [];
    }
}
// =============================================================================
// HELPERS
// =============================================================================
/**
 * Parse a JSON response, handling potential markdown fences the LLM
 * might include despite instructions.
 */
function parseJsonResponse(text) {
    const trimmed = text.trim();
    // Strip markdown fences if present (defensive)
    const fenceMatch = trimmed.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?\s*```$/);
    const jsonStr = fenceMatch ? fenceMatch[1] : trimmed;
    return JSON.parse(jsonStr);
}
/**
 * Validate and sanitize a single classified item.
 * Returns null if the item is not salvageable.
 */
function sanitizeItem(raw, citation) {
    if (!raw || typeof raw !== 'object')
        return null;
    const obj = raw;
    // Type — must be a valid KnowledgeType
    const type = typeof obj.type === 'string' ? obj.type : '';
    if (!VALID_TYPES.includes(type))
        return null;
    // Title — required, truncate to 80 chars
    const title = typeof obj.title === 'string' ? obj.title.slice(0, 80).trim() : '';
    if (!title)
        return null;
    // Description — required
    const description = typeof obj.description === 'string' ? obj.description.trim() : '';
    if (!description)
        return null;
    // Tags — must be array of strings, cap at 5
    const tags = Array.isArray(obj.tags)
        ? obj.tags
            .filter((t) => typeof t === 'string')
            .map((t) => t.toLowerCase().trim())
            .filter((t) => t.length > 0)
            .slice(0, 5)
        : [];
    // Severity — default to 'suggestion' if invalid
    const severity = VALID_SEVERITIES.includes(obj.severity)
        ? obj.severity
        : 'suggestion';
    return {
        type: type,
        title,
        description,
        tags,
        severity,
        citation,
    };
}
//# sourceMappingURL=content-classifier.js.map
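A note on the contract above: classifyChunk swallows every failure mode (LLM error, non-JSON output, non-array JSON) and resolves to [], so callers never need their own try/catch. Below is a minimal TypeScript sketch of driving it; the stub client is hypothetical and models only what this diff shows, an injected object whose complete() resolves to { text }:

// Hypothetical stub standing in for the package's LLMClient; only the single
// call made by classifyChunk is modeled (complete() resolving to { text }).
const stubLlm = {
  async complete(_req: unknown): Promise<{ text: string }> {
    // Pretend the model returned one well-formed item.
    return {
      text: '[{"type":"pattern","title":"Prefer small chunks","description":"Smaller chunks classify more reliably.","tags":["chunking","intake"],"severity":"suggestion"}]',
    };
  },
};

const items = await classifyChunk(stubLlm as any, 'chapter text...', 'book.pdf, pages 12-15');
// items[0].citation === 'book.pdf, pages 12-15' (attached by sanitizeItem);
// malformed entries are dropped, and every error path yields [] instead of throwing.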
package/dist/intake/content-classifier.js.map
@@ -0,0 +1 @@
{"version":3,"file":"content-classifier.js","sourceRoot":"","sources":["../../src/intake/content-classifier.ts"],"names":[],"mappings":"AAAA,+EAA+E;AAC/E,EAAE;AACF,oEAAoE;AACpE,kEAAkE;AAKlE,gFAAgF;AAChF,YAAY;AACZ,gFAAgF;AAEhF,MAAM,CAAC,MAAM,WAAW,GAAoB;IAC1C,SAAS;IACT,cAAc;IACd,WAAW;IACX,SAAS;IACT,WAAW;IACX,UAAU;IACV,MAAM;IACN,SAAS;CACV,CAAC;AAEF,MAAM,gBAAgB,GAAG,CAAC,UAAU,EAAE,SAAS,EAAE,YAAY,CAAU,CAAC;AAGxE,MAAM,CAAC,MAAM,qBAAqB,GAAG;;;iBAGpB,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC;;;;;;;;;;;;iFAYqC,CAAC;AAElF,gFAAgF;AAChF,aAAa;AACb,gFAAgF;AAEhF;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,GAAc,EACd,SAAiB,EACjB,QAAgB;IAEhB,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,MAAM,GAAG,CAAC,QAAQ,CAAC;YAChC,QAAQ,EAAE,QAAQ;YAClB,KAAK,EAAE,aAAa;YACpB,YAAY,EAAE,qBAAqB;YACnC,UAAU,EAAE,SAAS;YACrB,SAAS,EAAE,IAAI;YACf,WAAW,EAAE,GAAG;YAChB,MAAM,EAAE,QAAQ;YAChB,IAAI,EAAE,UAAU;SACjB,CAAC,CAAC;QAEH,MAAM,GAAG,GAAG,iBAAiB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAC3C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;YAAE,OAAO,EAAE,CAAC;QAEnC,OAAO,GAAG;aACP,GAAG,CAAC,CAAC,IAAa,EAAE,EAAE,CAAC,YAAY,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAC;aACpD,MAAM,CAAC,CAAC,IAAI,EAA0B,EAAE,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;IAC7D,CAAC;IAAC,MAAM,CAAC;QACP,qCAAqC;QACrC,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC;AAED,gFAAgF;AAChF,UAAU;AACV,gFAAgF;AAEhF;;;GAGG;AACH,SAAS,iBAAiB,CAAC,IAAY;IACrC,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;IAE5B,+CAA+C;IAC/C,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;IAC5E,MAAM,OAAO,GAAG,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;IAErD,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;AAC7B,CAAC;AAED;;;GAGG;AACH,SAAS,YAAY,CAAC,GAAY,EAAE,QAAgB;IAClD,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAC;IAEjD,MAAM,GAAG,GAAG,GAA8B,CAAC;IAE3C,uCAAuC;IACvC,MAAM,IAAI,GAAG,OAAO,GAAG,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;IAC1D,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,IAAqB,CAAC;QAAE,OAAO,IAAI,CAAC;IAE9D,yCAAyC;IACzC,MAAM,KAAK,GAAG,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACjF,IAAI,CAAC,KAAK;QAAE,OAAO,IAAI,CAAC;IAExB,yBAAyB;IACzB,MAAM,WAAW,GAAG,OAAO,GAAG,CAAC,WAAW,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IACtF,IAAI,CAAC,WAAW;QAAE,OAAO,IAAI,CAAC;IAE9B,4CAA4C;IAC5C,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC;QAClC,CAAC,CAAC,GAAG,CAAC,IAAI;aACL,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC;aACjD,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;aAClC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;QAChB,CAAC,CAAC,EAAE,CAAC;IAEP,gDAAgD;IAChD,MAAM,QAAQ,GAAa,gBAAgB,CAAC,QAAQ,CAAC,GAAG,CAAC,QAAoB,CAAC;QAC5E,CAAC,CAAE,GAAG,CAAC,QAAqB;QAC5B,CAAC,CAAC,YAAY,CAAC;IAEjB,OAAO;QACL,IAAI,EAAE,IAAqB;QAC3B,KAAK;QACL,WAAW;QACX,IAAI;QACJ,QAAQ;QACR,QAAQ;KACT,CAAC;AACJ,CAAC"}
package/dist/intake/dedup-gate.d.ts
@@ -0,0 +1,17 @@
import type { Vault } from '../vault/vault.js';
import type { ClassifiedItem } from './types.js';
export declare const DEDUP_THRESHOLD = 0.85;
export interface DedupResult {
    item: ClassifiedItem;
    isDuplicate: boolean;
    bestMatchId?: string;
    similarity: number;
}
/**
 * Check new items against existing vault entries for duplicates using TF-IDF cosine similarity.
 *
 * Builds a shared IDF vocabulary from all texts (existing + new), computes TF-IDF vectors,
 * and marks items as duplicates when cosine similarity >= DEDUP_THRESHOLD.
 */
export declare function dedupItems(items: ClassifiedItem[], vault: Vault): DedupResult[];
//# sourceMappingURL=dedup-gate.d.ts.map
package/dist/intake/dedup-gate.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"dedup-gate.d.ts","sourceRoot":"","sources":["../../src/intake/dedup-gate.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAEjD,eAAO,MAAM,eAAe,OAAO,CAAC;AAEpC,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,cAAc,CAAC;IACrB,WAAW,EAAE,OAAO,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;;;;GAKG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,cAAc,EAAE,EAAE,KAAK,EAAE,KAAK,GAAG,WAAW,EAAE,CA+D/E"}
package/dist/intake/dedup-gate.js
@@ -0,0 +1,66 @@
// ─── Dedup Gate ───────────────────────────────────────────────────
// TF-IDF cosine similarity check against existing vault entries.
// Pure function: no side effects, no I/O beyond reading vault.
import { tokenize, calculateTfIdf, cosineSimilarity, } from '../text/similarity.js';
export const DEDUP_THRESHOLD = 0.85;
/**
 * Check new items against existing vault entries for duplicates using TF-IDF cosine similarity.
 *
 * Builds a shared IDF vocabulary from all texts (existing + new), computes TF-IDF vectors,
 * and marks items as duplicates when cosine similarity >= DEDUP_THRESHOLD.
 */
export function dedupItems(items, vault) {
    const existing = vault.exportAll().entries;
    // Fast path: nothing in vault — everything is new
    if (existing.length === 0) {
        return items.map((item) => ({
            item,
            isDuplicate: false,
            similarity: 0,
        }));
    }
    // ── Build texts for vocabulary ──────────────────────────────────
    const existingTexts = existing.map((e) => `${e.title} ${e.description}`);
    const newTexts = items.map((i) => `${i.title} ${i.description}`);
    const allTexts = [...existingTexts, ...newTexts];
    const totalDocs = allTexts.length;
    // ── Count document frequency per term ───────────────────────────
    const docFreq = new Map();
    for (const text of allTexts) {
        const terms = new Set(tokenize(text));
        for (const term of terms) {
            docFreq.set(term, (docFreq.get(term) ?? 0) + 1);
        }
    }
    // ── Build IDF vocabulary ────────────────────────────────────────
    const vocabulary = new Map();
    for (const [term, df] of docFreq) {
        vocabulary.set(term, Math.log((totalDocs + 1) / (df + 1)) + 1);
    }
    // ── Compute TF-IDF vectors for existing entries ─────────────────
    const existingVectors = existing.map((entry, idx) => ({
        id: entry.id,
        vec: calculateTfIdf(tokenize(existingTexts[idx]), vocabulary),
    }));
    // ── Score each new item against all existing entries ────────────
    return items.map((item, idx) => {
        const itemVec = calculateTfIdf(tokenize(newTexts[idx]), vocabulary);
        let bestSimilarity = 0;
        let bestMatchId;
        for (const { id, vec } of existingVectors) {
            const sim = cosineSimilarity(itemVec, vec);
            if (sim > bestSimilarity) {
                bestSimilarity = sim;
                bestMatchId = id;
            }
        }
        const isDuplicate = bestSimilarity >= DEDUP_THRESHOLD;
        return {
            item,
            isDuplicate,
            bestMatchId: isDuplicate ? bestMatchId : undefined,
            similarity: bestSimilarity,
        };
    });
}
//# sourceMappingURL=dedup-gate.js.map
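The tokenize, calculateTfIdf, and cosineSimilarity helpers live in ../text/similarity.js, which this diff does not include. For reference, here is a self-contained TypeScript sketch of the same check: the smoothed IDF, ln((N+1)/(df+1)) + 1, matches the loop above exactly, while the whitespace tokenizer is an assumption about what tokenize does:

// Term-frequency vector weighted by a shared IDF table.
function tfidf(tokens: string[], idf: Map<string, number>): Map<string, number> {
  const vec = new Map<string, number>();
  for (const t of tokens) vec.set(t, (vec.get(t) ?? 0) + 1);      // raw term frequency
  for (const [t, tf] of vec) vec.set(t, tf * (idf.get(t) ?? 0));  // weight by IDF
  return vec;
}

// Cosine similarity over sparse vectors.
function cosine(a: Map<string, number>, b: Map<string, number>): number {
  let dot = 0, na = 0, nb = 0;
  for (const [t, w] of a) { na += w * w; dot += w * (b.get(t) ?? 0); }
  for (const w of b.values()) nb += w * w;
  return na && nb ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
}

const docs = ['retry with backoff on rate limits', 'retry with exponential backoff on rate limits'];
const tokenized = docs.map((d) => d.toLowerCase().split(/\s+/)); // assumed tokenizer
const df = new Map<string, number>();
for (const toks of tokenized) for (const t of new Set(toks)) df.set(t, (df.get(t) ?? 0) + 1);
// Smoothed IDF over all docs, as in dedupItems: ln((N+1)/(df+1)) + 1.
const idf = new Map([...df].map(([t, n]) => [t, Math.log((docs.length + 1) / (n + 1)) + 1] as const));
const sim = cosine(tfidf(tokenized[0], idf), tfidf(tokenized[1], idf));
// A pair scoring >= 0.85 (DEDUP_THRESHOLD) would be flagged as a duplicate.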
package/dist/intake/dedup-gate.js.map
@@ -0,0 +1 @@
{"version":3,"file":"dedup-gate.js","sourceRoot":"","sources":["../../src/intake/dedup-gate.ts"],"names":[],"mappings":"AAAA,qEAAqE;AACrE,iEAAiE;AACjE,+DAA+D;AAE/D,OAAO,EACL,QAAQ,EACR,cAAc,EACd,gBAAgB,GAEjB,MAAM,uBAAuB,CAAC;AAI/B,MAAM,CAAC,MAAM,eAAe,GAAG,IAAI,CAAC;AASpC;;;;;GAKG;AACH,MAAM,UAAU,UAAU,CAAC,KAAuB,EAAE,KAAY;IAC9D,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,EAAE,CAAC,OAAO,CAAC;IAE3C,kDAAkD;IAClD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1B,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YAC1B,IAAI;YACJ,WAAW,EAAE,KAAK;YAClB,UAAU,EAAE,CAAC;SACd,CAAC,CAAC,CAAC;IACN,CAAC;IAED,mEAAmE;IACnE,MAAM,aAAa,GAAG,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;IACzE,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;IACjE,MAAM,QAAQ,GAAG,CAAC,GAAG,aAAa,EAAE,GAAG,QAAQ,CAAC,CAAC;IACjD,MAAM,SAAS,GAAG,QAAQ,CAAC,MAAM,CAAC;IAElC,mEAAmE;IACnE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC1C,KAAK,MAAM,IAAI,IAAI,QAAQ,EAAE,CAAC;QAC5B,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;QACtC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAClD,CAAC;IACH,CAAC;IAED,mEAAmE;IACnE,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAC;IAC7C,KAAK,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,OAAO,EAAE,CAAC;QACjC,UAAU,CAAC,GAAG,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACjE,CAAC;IAED,mEAAmE;IACnE,MAAM,eAAe,GAA6C,QAAQ,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC;QAC9F,EAAE,EAAE,KAAK,CAAC,EAAE;QACZ,GAAG,EAAE,cAAc,CAAC,QAAQ,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC;KAC9D,CAAC,CAAC,CAAC;IAEJ,mEAAmE;IACnE,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,GAAG,EAAE,EAAE;QAC7B,MAAM,OAAO,GAAG,cAAc,CAAC,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;QAEpE,IAAI,cAAc,GAAG,CAAC,CAAC;QACvB,IAAI,WAA+B,CAAC;QAEpC,KAAK,MAAM,EAAE,EAAE,EAAE,GAAG,EAAE,IAAI,eAAe,EAAE,CAAC;YAC1C,MAAM,GAAG,GAAG,gBAAgB,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;YAC3C,IAAI,GAAG,GAAG,cAAc,EAAE,CAAC;gBACzB,cAAc,GAAG,GAAG,CAAC;gBACrB,WAAW,GAAG,EAAE,CAAC;YACnB,CAAC;QACH,CAAC;QAED,MAAM,WAAW,GAAG,cAAc,IAAI,eAAe,CAAC;QAEtD,OAAO;YACL,IAAI;YACJ,WAAW;YACX,WAAW,EAAE,WAAW,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,SAAS;YAClD,UAAU,EAAE,cAAc;SAC3B,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC"}
package/dist/intake/intake-pipeline.d.ts
@@ -0,0 +1,63 @@
import type { PersistenceProvider } from '../persistence/types.js';
import type { Vault } from '../vault/vault.js';
import type { LLMClient } from '../llm/llm-client.js';
import type { IntakeConfig, IntakeChunk, IntakeJobRecord, IntakePreviewResult } from './types.js';
/**
 * Split concatenated PDF text into per-page segments.
 *
 * Strategy: split on form-feed characters first (common in pdf-parse output).
 * If that yields fewer segments than expected, fall back to equal-length splits.
 */
export declare function splitIntoPages(text: string, numPages: number): string[];
export declare class IntakePipeline {
    private provider;
    private vault;
    private llm;
    constructor(provider: PersistenceProvider, vault: Vault, llm: LLMClient);
    private initSchema;
    /**
     * Parse a PDF, compute its file hash, create fixed-size page chunks,
     * and persist the job + chunk records to the database.
     */
    ingestBook(config: IntakeConfig): Promise<IntakeJobRecord>;
    /**
     * Process up to `count` pending chunks for a job.
     *
     * For each chunk:
     * 2. Extract page text from PDF
     * 3. Classify via LLM
     * 4. Dedup against vault
     * 5. Store unique items
     *
     * When all chunks are done, finalizes the job (stage 6).
     */
    processChunks(jobId: string, count?: number): Promise<{
        processed: number;
        itemsStored: number;
        itemsDeduped: number;
        remaining: number;
    }>;
    /**
     * Parse a page range from a PDF and classify it without storing.
     * Useful for previewing what the pipeline would extract.
     */
    preview(config: IntakeConfig, pageStart: number, pageEnd: number): Promise<IntakePreviewResult>;
    /**
     * Get a job record by ID.
     */
    getJob(jobId: string): IntakeJobRecord | null;
    /**
     * List all intake jobs.
     */
    listJobs(): IntakeJobRecord[];
    /**
     * Get all chunks for a job.
     */
    getChunks(jobId: string): IntakeChunk[];
    /**
     * Sum stats from all chunks and mark the job as completed.
     */
    private finalizeJob;
    private countPendingChunks;
}
//# sourceMappingURL=intake-pipeline.d.ts.map
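Read together, these declarations describe a resumable batch workflow: ingest once, then call processChunks until remaining reaches 0, at which point the job finalizes itself. A driver sketch under assumptions: the provider, vault, and llm instances come from elsewhere in the package, and the IntakeConfig field names are inferred from their use in intake-pipeline.js below:

const pipeline = new IntakePipeline(provider, vault, llm); // instances assumed

// Stage 1: parse + hash the PDF and enqueue fixed-size page chunks.
const job = await pipeline.ingestBook({
  pdfPath: './book.pdf',  // field names inferred from intake-pipeline.js
  title: 'Example Book',
  domain: 'engineering',
  chunkPageSize: 10,      // optional; defaults to 10 pages per chunk
} as IntakeConfig);

// Stages 2-5 in batches of 5; safe to stop and resume, since chunk
// status lives in SQLite rather than in memory.
let remaining = 1;
while (remaining > 0) {
  ({ remaining } = await pipeline.processChunks(job.id, 5));
}

// Stage 6 ran automatically once remaining hit 0; stats sit on the job record.
console.log(pipeline.getJob(job.id)?.stats);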
package/dist/intake/intake-pipeline.d.ts.map
@@ -0,0 +1 @@
{"version":3,"file":"intake-pipeline.d.ts","sourceRoot":"","sources":["../../src/intake/intake-pipeline.ts"],"names":[],"mappings":"AAcA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,yBAAyB,CAAC;AACnE,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAC;AAC/C,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAEtD,OAAO,KAAK,EACV,YAAY,EACZ,WAAW,EACX,eAAe,EACf,mBAAmB,EAGpB,MAAM,YAAY,CAAC;AAwBpB;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,EAAE,CAoBvE;AAMD,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAsB;IACtC,OAAO,CAAC,KAAK,CAAQ;IACrB,OAAO,CAAC,GAAG,CAAY;gBAEX,QAAQ,EAAE,mBAAmB,EAAE,KAAK,EAAE,KAAK,EAAE,GAAG,EAAE,SAAS;IASvE,OAAO,CAAC,UAAU;IAiClB;;;OAGG;IACG,UAAU,CAAC,MAAM,EAAE,YAAY,GAAG,OAAO,CAAC,eAAe,CAAC;IAgEhE;;;;;;;;;;OAUG;IACG,aAAa,CACjB,KAAK,EAAE,MAAM,EACb,KAAK,GAAE,MAAU,GAChB,OAAO,CAAC;QACT,SAAS,EAAE,MAAM,CAAC;QAClB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;QACrB,SAAS,EAAE,MAAM,CAAC;KACnB,CAAC;IAwHF;;;OAGG;IACG,OAAO,CACX,MAAM,EAAE,YAAY,EACpB,SAAS,EAAE,MAAM,EACjB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,mBAAmB,CAAC;IAqB/B;;OAEG;IACH,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,eAAe,GAAG,IAAI;IAQ7C;;OAEG;IACH,QAAQ,IAAI,eAAe,EAAE;IAO7B;;OAEG;IACH,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,WAAW,EAAE;IAUvC;;OAEG;IACH,OAAO,CAAC,WAAW;IAiCnB,OAAO,CAAC,kBAAkB;CAO3B"}
package/dist/intake/intake-pipeline.js
@@ -0,0 +1,373 @@
// ─── Intake Pipeline ──────────────────────────────────────────────
//
// 6-stage pipeline for ingesting PDF books into the vault:
// 1. Parse PDF + compute hash + create chunks → job record
// 2. Extract page text for each chunk
// 3. Classify chunk text via LLM
// 4. Dedup classified items against vault
// 5. Store unique items in vault
// 6. Finalize job with aggregate stats
//
// SQLite-backed job tracking for resumable processing.
import { createHash, randomUUID } from 'node:crypto';
import { readFileSync, statSync } from 'node:fs';
import { classifyChunk } from './content-classifier.js';
import { dedupItems } from './dedup-gate.js';
// =============================================================================
// CONSTANTS
// =============================================================================
const DEFAULT_CHUNK_SIZE = 10;
/**
 * Map KnowledgeType → IntelligenceEntry.type.
 * Only 'pattern' and 'anti-pattern' map directly; everything else becomes 'rule'.
 */
function mapKnowledgeType(kt) {
    if (kt === 'pattern')
        return 'pattern';
    if (kt === 'anti-pattern')
        return 'anti-pattern';
    return 'rule';
}
// =============================================================================
// HELPERS
// =============================================================================
/**
 * Split concatenated PDF text into per-page segments.
 *
 * Strategy: split on form-feed characters first (common in pdf-parse output).
 * If that yields fewer segments than expected, fall back to equal-length splits.
 */
export function splitIntoPages(text, numPages) {
    if (numPages <= 0)
        return [text];
    // Try form-feed split first
    const ffPages = text.split('\f');
    if (ffPages.length >= numPages) {
        return ffPages.slice(0, numPages);
    }
    // Fallback: equal-length chunks
    const chunkSize = Math.ceil(text.length / numPages);
    const pages = [];
    for (let i = 0; i < text.length; i += chunkSize) {
        pages.push(text.slice(i, i + chunkSize));
    }
    // Pad with empty strings if we somehow got fewer
    while (pages.length < numPages) {
        pages.push('');
    }
    return pages;
}
// =============================================================================
// PIPELINE
// =============================================================================
export class IntakePipeline {
    provider;
    vault;
    llm;
    constructor(provider, vault, llm) {
        this.provider = provider;
        this.vault = vault;
        this.llm = llm;
        this.initSchema();
    }
    // ─── Schema ──────────────────────────────────────────────────────
    initSchema() {
        this.provider.execSql(`
      CREATE TABLE IF NOT EXISTS intake_jobs (
        id TEXT PRIMARY KEY,
        status TEXT NOT NULL,
        config TEXT NOT NULL,
        pdf_meta TEXT,
        toc TEXT,
        stats TEXT,
        created_at INTEGER,
        updated_at INTEGER,
        completed_at INTEGER
      );

      CREATE TABLE IF NOT EXISTS intake_chunks (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        job_id TEXT NOT NULL REFERENCES intake_jobs(id),
        chunk_index INTEGER,
        title TEXT,
        page_start INTEGER,
        page_end INTEGER,
        status TEXT DEFAULT 'pending',
        items_extracted INTEGER DEFAULT 0,
        items_stored INTEGER DEFAULT 0,
        items_deduped INTEGER DEFAULT 0,
        error TEXT,
        processed_at INTEGER
      );
    `);
    }
    // ─── Stage 1: Ingest Book ────────────────────────────────────────
    /**
     * Parse a PDF, compute its file hash, create fixed-size page chunks,
     * and persist the job + chunk records to the database.
     */
    async ingestBook(config) {
        const jobId = randomUUID();
        const now = Math.floor(Date.now() / 1000);
        const chunkPageSize = config.chunkPageSize ?? DEFAULT_CHUNK_SIZE;
        // Read file
        const fileBuffer = readFileSync(config.pdfPath);
        const fileSize = statSync(config.pdfPath).size;
        const fileHash = createHash('sha256').update(fileBuffer).digest('hex');
        // Dynamic import of pdf-parse
        const pdfParse = (await import('pdf-parse')).default;
        const pdfData = await pdfParse(fileBuffer);
        const totalPages = pdfData.numpages;
        const pdfMeta = { totalPages, fileHash, fileSize };
        // Create chunk definitions (fixed N-page windows)
        const numChunks = Math.ceil(totalPages / chunkPageSize);
        this.provider.transaction(() => {
            // Insert job
            this.provider.run(`INSERT INTO intake_jobs (id, status, config, pdf_meta, toc, stats, created_at, updated_at, completed_at)
         VALUES (@id, @status, @config, @pdfMeta, @toc, @stats, @createdAt, @updatedAt, @completedAt)`, {
                id: jobId,
                status: 'initialized',
                config: JSON.stringify(config),
                pdfMeta: JSON.stringify(pdfMeta),
                toc: null,
                stats: null,
                createdAt: now,
                updatedAt: now,
                completedAt: null,
            });
            // Insert chunk records
            for (let i = 0; i < numChunks; i++) {
                const pageStart = i * chunkPageSize + 1;
                const pageEnd = Math.min((i + 1) * chunkPageSize, totalPages);
                const chunkTitle = `${config.title} — pages ${pageStart}-${pageEnd}`;
                this.provider.run(`INSERT INTO intake_chunks (job_id, chunk_index, title, page_start, page_end, status)
           VALUES (@jobId, @chunkIndex, @title, @pageStart, @pageEnd, @status)`, {
                    jobId,
                    chunkIndex: i,
                    title: chunkTitle,
                    pageStart,
                    pageEnd,
                    status: 'pending',
                });
            }
        });
        return this.getJob(jobId);
    }
    // ─── Stages 2-5: Process Chunks ──────────────────────────────────
    /**
     * Process up to `count` pending chunks for a job.
     *
     * For each chunk:
     * 2. Extract page text from PDF
     * 3. Classify via LLM
     * 4. Dedup against vault
     * 5. Store unique items
     *
     * When all chunks are done, finalizes the job (stage 6).
     */
    async processChunks(jobId, count = 5) {
        // Get pending chunks
        const pendingChunks = this.provider.all(`SELECT * FROM intake_chunks WHERE job_id = @jobId AND status = 'pending' ORDER BY chunk_index ASC LIMIT @limit`, { jobId, limit: count });
        if (pendingChunks.length === 0) {
            const remaining = this.countPendingChunks(jobId);
            return { processed: 0, itemsStored: 0, itemsDeduped: 0, remaining };
        }
        // Mark job as processing
        this.provider.run(`UPDATE intake_jobs SET status = 'processing', updated_at = @now WHERE id = @id`, { id: jobId, now: Math.floor(Date.now() / 1000) });
        // Re-read config and parse PDF
        const job = this.getJob(jobId);
        if (!job) {
            return { processed: 0, itemsStored: 0, itemsDeduped: 0, remaining: 0 };
        }
        const fileBuffer = readFileSync(job.config.pdfPath);
        const pdfParse = (await import('pdf-parse')).default;
        const pdfData = await pdfParse(fileBuffer);
        const totalPages = job.pdfMeta?.totalPages ?? pdfData.numpages;
        const pages = splitIntoPages(pdfData.text, totalPages);
        let totalStored = 0;
        let totalDeduped = 0;
        let processed = 0;
        for (const chunkRow of pendingChunks) {
            const chunkId = chunkRow.id;
            const chunkIndex = chunkRow.chunk_index;
            const pageStart = chunkRow.page_start;
            const pageEnd = chunkRow.page_end;
            try {
                // Mark chunk processing
                this.provider.run(`UPDATE intake_chunks SET status = 'processing' WHERE id = @id`, {
                    id: chunkId,
                });
                // Stage 2: Extract page text (1-indexed → 0-indexed)
                const chunkText = pages.slice(pageStart - 1, pageEnd).join('\n\n');
                const citation = `${job.config.title}, pages ${pageStart}-${pageEnd}`;
                // Stage 3: Classify
                const classifiedItems = await classifyChunk(this.llm, chunkText, citation);
                // Stage 4: Dedup
                const dedupResults = dedupItems(classifiedItems, this.vault);
                const uniqueItems = dedupResults.filter((r) => !r.isDuplicate);
                const dupCount = dedupResults.filter((r) => r.isDuplicate).length;
                // Stage 5: Store unique items in vault
                let storedCount = 0;
                for (let itemIdx = 0; itemIdx < uniqueItems.length; itemIdx++) {
                    const result = uniqueItems[itemIdx];
                    const entry = classifiedItemToEntry(result.item, job.config.domain, jobId, chunkIndex, itemIdx, job.config.tags);
                    this.vault.add(entry);
                    storedCount++;
                }
                // Update chunk record
                const now = Math.floor(Date.now() / 1000);
                this.provider.run(`UPDATE intake_chunks
           SET status = 'completed', items_extracted = @extracted, items_stored = @stored, items_deduped = @deduped, processed_at = @now
           WHERE id = @id`, {
                    id: chunkId,
                    extracted: classifiedItems.length,
                    stored: storedCount,
                    deduped: dupCount,
                    now,
                });
                totalStored += storedCount;
                totalDeduped += dupCount;
                processed++;
            }
            catch (err) {
                // Graceful degradation: mark chunk as failed, continue with others
                const errorMsg = err instanceof Error ? err.message : String(err);
                this.provider.run(`UPDATE intake_chunks SET status = 'failed', error = @error, processed_at = @now WHERE id = @id`, { id: chunkId, error: errorMsg, now: Math.floor(Date.now() / 1000) });
                processed++;
            }
        }
        // Update job timestamp
        this.provider.run(`UPDATE intake_jobs SET updated_at = @now WHERE id = @id`, {
            id: jobId,
            now: Math.floor(Date.now() / 1000),
        });
        // Check remaining
        const remaining = this.countPendingChunks(jobId);
        if (remaining === 0) {
            this.finalizeJob(jobId);
        }
        return { processed, itemsStored: totalStored, itemsDeduped: totalDeduped, remaining };
    }
    // ─── Preview ─────────────────────────────────────────────────────
    /**
     * Parse a page range from a PDF and classify it without storing.
     * Useful for previewing what the pipeline would extract.
     */
    async preview(config, pageStart, pageEnd) {
        const fileBuffer = readFileSync(config.pdfPath);
        const pdfParse = (await import('pdf-parse')).default;
        const pdfData = await pdfParse(fileBuffer);
        const totalPages = pdfData.numpages;
        const pages = splitIntoPages(pdfData.text, totalPages);
        const chunkText = pages.slice(pageStart - 1, pageEnd).join('\n\n');
        const citation = `${config.title}, pages ${pageStart}-${pageEnd}`;
        const items = await classifyChunk(this.llm, chunkText, citation);
        return {
            items,
            chunkText,
            pageRange: { start: pageStart, end: pageEnd },
        };
    }
    // ─── Queries ─────────────────────────────────────────────────────
    /**
     * Get a job record by ID.
     */
    getJob(jobId) {
        const row = this.provider.get('SELECT * FROM intake_jobs WHERE id = @id', { id: jobId });
        return row ? rowToJobRecord(row) : null;
    }
    /**
     * List all intake jobs.
     */
    listJobs() {
        const rows = this.provider.all('SELECT * FROM intake_jobs ORDER BY created_at DESC');
        return rows.map(rowToJobRecord);
    }
    /**
     * Get all chunks for a job.
     */
    getChunks(jobId) {
        const rows = this.provider.all('SELECT * FROM intake_chunks WHERE job_id = @jobId ORDER BY chunk_index ASC', { jobId });
        return rows.map(rowToChunk);
    }
    // ─── Stage 6: Finalize ──────────────────────────────────────────
    /**
     * Sum stats from all chunks and mark the job as completed.
     */
    finalizeJob(jobId) {
        const chunks = this.provider.all('SELECT * FROM intake_chunks WHERE job_id = @jobId', { jobId });
        let itemsExtracted = 0;
        let itemsStored = 0;
        let itemsDeduped = 0;
        let itemsFailed = 0;
        for (const chunk of chunks) {
            const status = chunk.status;
            if (status === 'completed') {
                itemsExtracted += chunk.items_extracted ?? 0;
                itemsStored += chunk.items_stored ?? 0;
                itemsDeduped += chunk.items_deduped ?? 0;
            }
            else if (status === 'failed') {
                itemsFailed++;
            }
        }
        const stats = { itemsExtracted, itemsStored, itemsDeduped, itemsFailed };
        const now = Math.floor(Date.now() / 1000);
        this.provider.run(`UPDATE intake_jobs SET status = 'completed', stats = @stats, updated_at = @now, completed_at = @now WHERE id = @id`, { id: jobId, stats: JSON.stringify(stats), now });
    }
    // ─── Private helpers ─────────────────────────────────────────────
    countPendingChunks(jobId) {
        const result = this.provider.get(`SELECT COUNT(*) as count FROM intake_chunks WHERE job_id = @jobId AND status = 'pending'`, { jobId });
        return result?.count ?? 0;
    }
}
// =============================================================================
// ROW MAPPERS
// =============================================================================
function rowToJobRecord(row) {
    return {
        id: row.id,
        status: row.status,
        config: JSON.parse(row.config),
        pdfMeta: row.pdf_meta ? JSON.parse(row.pdf_meta) : null,
        toc: row.toc ? JSON.parse(row.toc) : null,
        stats: row.stats ? JSON.parse(row.stats) : null,
        createdAt: row.created_at,
        updatedAt: row.updated_at,
        completedAt: row.completed_at ?? null,
    };
}
function rowToChunk(row) {
    return {
        id: row.id,
        jobId: row.job_id,
        chunkIndex: row.chunk_index,
        title: row.title ?? null,
        pageStart: row.page_start,
        pageEnd: row.page_end,
        status: row.status,
        itemsExtracted: row.items_extracted ?? 0,
        itemsStored: row.items_stored ?? 0,
        itemsDeduped: row.items_deduped ?? 0,
        error: row.error ?? null,
        processedAt: row.processed_at ?? null,
    };
}
/**
 * Convert a ClassifiedItem to an IntelligenceEntry for vault storage.
 */
function classifiedItemToEntry(item, domain, jobId, chunkIndex, itemIndex, extraTags) {
    const entryType = mapKnowledgeType(item.type);
    const tags = [...item.tags, ...(extraTags ?? [])];
    return {
        id: `intake-${jobId}-${chunkIndex}-${itemIndex}`,
        type: entryType,
        domain,
        title: item.title,
        severity: item.severity,
        description: item.description,
        context: item.citation,
        tags,
    };
}
//# sourceMappingURL=intake-pipeline.js.map
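A closing note on the persistence contract: IntakePipeline touches its provider through exactly five methods. The shipped PersistenceProvider type lives in package/dist/persistence/types.d.ts (also new in this release) but is not shown in this diff, so the TypeScript interface below is only a reading of the calls made above; the @name placeholder style is consistent with better-sqlite3 named parameters, which the new sqlite-provider module suggests:

// Inferred from IntakePipeline's usage; the real declaration may differ.
interface PersistenceProviderSketch {
  execSql(sql: string): void;                                           // multi-statement DDL (initSchema)
  run(sql: string, params?: Record<string, unknown>): unknown;          // INSERT/UPDATE with @named params
  get<T>(sql: string, params?: Record<string, unknown>): T | undefined; // single-row lookup (getJob)
  all<T>(sql: string, params?: Record<string, unknown>): T[];           // ordered row lists (listJobs, getChunks)
  transaction(fn: () => void): void;                                    // atomic job + chunk inserts (ingestBook)
}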