@pella-labs/pinakes 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +208 -0
- package/dist/cli/audit.d.ts +30 -0
- package/dist/cli/audit.d.ts.map +1 -0
- package/dist/cli/audit.js +49 -0
- package/dist/cli/audit.js.map +1 -0
- package/dist/cli/export.d.ts +32 -0
- package/dist/cli/export.d.ts.map +1 -0
- package/dist/cli/export.js +73 -0
- package/dist/cli/export.js.map +1 -0
- package/dist/cli/import.d.ts +24 -0
- package/dist/cli/import.d.ts.map +1 -0
- package/dist/cli/import.js +96 -0
- package/dist/cli/import.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +172 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/purge.d.ts +23 -0
- package/dist/cli/purge.d.ts.map +1 -0
- package/dist/cli/purge.js +57 -0
- package/dist/cli/purge.js.map +1 -0
- package/dist/cli/rebuild.d.ts +54 -0
- package/dist/cli/rebuild.d.ts.map +1 -0
- package/dist/cli/rebuild.js +113 -0
- package/dist/cli/rebuild.js.map +1 -0
- package/dist/cli/serve.d.ts +49 -0
- package/dist/cli/serve.d.ts.map +1 -0
- package/dist/cli/serve.js +296 -0
- package/dist/cli/serve.js.map +1 -0
- package/dist/cli/status.d.ts +39 -0
- package/dist/cli/status.d.ts.map +1 -0
- package/dist/cli/status.js +108 -0
- package/dist/cli/status.js.map +1 -0
- package/dist/db/client.d.ts +109 -0
- package/dist/db/client.d.ts.map +1 -0
- package/dist/db/client.js +175 -0
- package/dist/db/client.js.map +1 -0
- package/dist/db/repository.d.ts +82 -0
- package/dist/db/repository.d.ts.map +1 -0
- package/dist/db/repository.js +173 -0
- package/dist/db/repository.js.map +1 -0
- package/dist/db/schema.d.ts +990 -0
- package/dist/db/schema.d.ts.map +1 -0
- package/dist/db/schema.js +259 -0
- package/dist/db/schema.js.map +1 -0
- package/dist/db/types.d.ts +28 -0
- package/dist/db/types.d.ts.map +1 -0
- package/dist/db/types.js +11 -0
- package/dist/db/types.js.map +1 -0
- package/dist/gaps/detector.d.ts +67 -0
- package/dist/gaps/detector.d.ts.map +1 -0
- package/dist/gaps/detector.js +160 -0
- package/dist/gaps/detector.js.map +1 -0
- package/dist/gate/budget.d.ts +90 -0
- package/dist/gate/budget.d.ts.map +1 -0
- package/dist/gate/budget.js +145 -0
- package/dist/gate/budget.js.map +1 -0
- package/dist/ingest/chokidar.d.ts +33 -0
- package/dist/ingest/chokidar.d.ts.map +1 -0
- package/dist/ingest/chokidar.js +152 -0
- package/dist/ingest/chokidar.js.map +1 -0
- package/dist/ingest/ingester.d.ts +117 -0
- package/dist/ingest/ingester.d.ts.map +1 -0
- package/dist/ingest/ingester.js +312 -0
- package/dist/ingest/ingester.js.map +1 -0
- package/dist/ingest/manifest.d.ts +87 -0
- package/dist/ingest/manifest.d.ts.map +1 -0
- package/dist/ingest/manifest.js +223 -0
- package/dist/ingest/manifest.js.map +1 -0
- package/dist/ingest/memory-store.d.ts +55 -0
- package/dist/ingest/memory-store.d.ts.map +1 -0
- package/dist/ingest/memory-store.js +94 -0
- package/dist/ingest/memory-store.js.map +1 -0
- package/dist/ingest/parse/chunk.d.ts +15 -0
- package/dist/ingest/parse/chunk.d.ts.map +1 -0
- package/dist/ingest/parse/chunk.js +88 -0
- package/dist/ingest/parse/chunk.js.map +1 -0
- package/dist/ingest/parse/markdown.d.ts +64 -0
- package/dist/ingest/parse/markdown.d.ts.map +1 -0
- package/dist/ingest/parse/markdown.js +152 -0
- package/dist/ingest/parse/markdown.js.map +1 -0
- package/dist/ingest/queue.d.ts +21 -0
- package/dist/ingest/queue.d.ts.map +1 -0
- package/dist/ingest/queue.js +24 -0
- package/dist/ingest/queue.js.map +1 -0
- package/dist/ingest/source.d.ts +42 -0
- package/dist/ingest/source.d.ts.map +1 -0
- package/dist/ingest/source.js +19 -0
- package/dist/ingest/source.js.map +1 -0
- package/dist/mcp/envelope.d.ts +73 -0
- package/dist/mcp/envelope.d.ts.map +1 -0
- package/dist/mcp/envelope.js +46 -0
- package/dist/mcp/envelope.js.map +1 -0
- package/dist/mcp/tools/execute.d.ts +55 -0
- package/dist/mcp/tools/execute.d.ts.map +1 -0
- package/dist/mcp/tools/execute.js +232 -0
- package/dist/mcp/tools/execute.js.map +1 -0
- package/dist/mcp/tools/search.d.ts +53 -0
- package/dist/mcp/tools/search.d.ts.map +1 -0
- package/dist/mcp/tools/search.js +114 -0
- package/dist/mcp/tools/search.js.map +1 -0
- package/dist/observability/audit.d.ts +25 -0
- package/dist/observability/audit.d.ts.map +1 -0
- package/dist/observability/audit.js +38 -0
- package/dist/observability/audit.js.map +1 -0
- package/dist/observability/logger.d.ts +4 -0
- package/dist/observability/logger.d.ts.map +1 -0
- package/dist/observability/logger.js +56 -0
- package/dist/observability/logger.js.map +1 -0
- package/dist/observability/metrics.d.ts +38 -0
- package/dist/observability/metrics.d.ts.map +1 -0
- package/dist/observability/metrics.js +64 -0
- package/dist/observability/metrics.js.map +1 -0
- package/dist/retrieval/embedder.d.ts +130 -0
- package/dist/retrieval/embedder.d.ts.map +1 -0
- package/dist/retrieval/embedder.js +278 -0
- package/dist/retrieval/embedder.js.map +1 -0
- package/dist/retrieval/fts.d.ts +42 -0
- package/dist/retrieval/fts.d.ts.map +1 -0
- package/dist/retrieval/fts.js +46 -0
- package/dist/retrieval/fts.js.map +1 -0
- package/dist/retrieval/hybrid.d.ts +43 -0
- package/dist/retrieval/hybrid.d.ts.map +1 -0
- package/dist/retrieval/hybrid.js +120 -0
- package/dist/retrieval/hybrid.js.map +1 -0
- package/dist/retrieval/vec.d.ts +39 -0
- package/dist/retrieval/vec.d.ts.map +1 -0
- package/dist/retrieval/vec.js +50 -0
- package/dist/retrieval/vec.js.map +1 -0
- package/dist/sandbox/bindings/budget.d.ts +10 -0
- package/dist/sandbox/bindings/budget.d.ts.map +1 -0
- package/dist/sandbox/bindings/budget.js +44 -0
- package/dist/sandbox/bindings/budget.js.map +1 -0
- package/dist/sandbox/bindings/install.d.ts +23 -0
- package/dist/sandbox/bindings/install.d.ts.map +1 -0
- package/dist/sandbox/bindings/install.js +15 -0
- package/dist/sandbox/bindings/install.js.map +1 -0
- package/dist/sandbox/bindings/kg.d.ts +29 -0
- package/dist/sandbox/bindings/kg.d.ts.map +1 -0
- package/dist/sandbox/bindings/kg.js +323 -0
- package/dist/sandbox/bindings/kg.js.map +1 -0
- package/dist/sandbox/bindings/logger.d.ts +11 -0
- package/dist/sandbox/bindings/logger.d.ts.map +1 -0
- package/dist/sandbox/bindings/logger.js +33 -0
- package/dist/sandbox/bindings/logger.js.map +1 -0
- package/dist/sandbox/bindings/write.d.ts +34 -0
- package/dist/sandbox/bindings/write.d.ts.map +1 -0
- package/dist/sandbox/bindings/write.js +195 -0
- package/dist/sandbox/bindings/write.js.map +1 -0
- package/dist/sandbox/executor.d.ts +68 -0
- package/dist/sandbox/executor.d.ts.map +1 -0
- package/dist/sandbox/executor.js +280 -0
- package/dist/sandbox/executor.js.map +1 -0
- package/dist/sandbox/helpers.d.ts +26 -0
- package/dist/sandbox/helpers.d.ts.map +1 -0
- package/dist/sandbox/helpers.js +131 -0
- package/dist/sandbox/helpers.js.map +1 -0
- package/dist/sandbox/pool.d.ts +63 -0
- package/dist/sandbox/pool.d.ts.map +1 -0
- package/dist/sandbox/pool.js +98 -0
- package/dist/sandbox/pool.js.map +1 -0
- package/dist/sandbox/vendored-codemode.d.ts +99 -0
- package/dist/sandbox/vendored-codemode.d.ts.map +1 -0
- package/dist/sandbox/vendored-codemode.js +471 -0
- package/dist/sandbox/vendored-codemode.js.map +1 -0
- package/dist/server.d.ts +3 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +74 -0
- package/dist/server.js.map +1 -0
- package/dist/spike.d.ts +15 -0
- package/dist/spike.d.ts.map +1 -0
- package/dist/spike.js +90 -0
- package/dist/spike.js.map +1 -0
- package/package.json +60 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"budget.d.ts","sourceRoot":"","sources":["../../src/gate/budget.ts"],"names":[],"mappings":"AAEA;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAC3C,eAAO,MAAM,aAAa,MAAM,CAAC;AA4BjC;;;;;;;;;;;GAWG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAKhD;AAED;;;;GAIG;AACH,wBAAgB,qBAAqB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAG/D;AAED;;;;;GAKG;AACH,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,IAAI,CAAC;IAChB,EAAE,EAAE,MAAM,CAAC;IACX,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,SAAS,CAAC,CAAC;IAC1B,IAAI,EAAE,KAAK,CAAC,CAAC,GAAG,gBAAgB,CAAC,CAAC;IAClC,SAAS,EAAE,OAAO,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,cAAc,EAAE,MAAM,CAAC;CACxB;AAED;;;;;;;;;;;;;;;;;;GAkBG;AACH,wBAAgB,UAAU,CAAC,CAAC,EAC1B,KAAK,EAAE,CAAC,EAAE,EACV,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,MAAM,EAC9B,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,MAAM,EACzB,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,MAAM,GACzB,SAAS,CAAC,CAAC,CAAC,CA4Cd;AAED;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,YAAY,EAAE,MAAM,GAAG,MAAM,CAEhE"}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { getEncoding } from 'js-tiktoken';
|
|
2
|
+
/**
|
|
3
|
+
* Token-counting budget gate.
|
|
4
|
+
*
|
|
5
|
+
* Implements CLAUDE.md §API Rules #6 budget math:
|
|
6
|
+
*
|
|
7
|
+
* envelope_reserve = 500 // tokens set aside for meta/logs/stale_files
|
|
8
|
+
* safety_margin = 0.9 // js-tiktoken is an estimator, not an oracle
|
|
9
|
+
* available = floor((max_tokens - envelope_reserve) * safety_margin)
|
|
10
|
+
*
|
|
11
|
+
* At the default `max_tokens=5000` the available budget for result bodies is:
|
|
12
|
+
* floor((5000 - 500) * 0.9) = 4050 tokens
|
|
13
|
+
*
|
|
14
|
+
* Truncation is greedy by rank: keep the highest-ranked item whole if it fits;
|
|
15
|
+
* otherwise emit a `too_large` sentinel so the caller can re-query with a
|
|
16
|
+
* higher `max_tokens` or fetch the node directly by id.
|
|
17
|
+
*
|
|
18
|
+
* The sentinel pattern is Loop 6.5 A3 / presearch.md D22. A single oversize
|
|
19
|
+
* item must NOT blackhole the whole response — we report its id + uri and
|
|
20
|
+
* let the LLM decide what to do next.
|
|
21
|
+
*
|
|
22
|
+
* Token counting uses the `p50k_base` encoder — close enough to Claude's
|
|
23
|
+
* tokenization for budgeting purposes, and the 10% safety margin absorbs the
|
|
24
|
+
* estimation error between tokenizers.
|
|
25
|
+
*/
|
|
26
|
+
export const ENVELOPE_RESERVE_TOKENS = 500;
|
|
27
|
+
export const SAFETY_MARGIN = 0.9;
|
|
28
|
+
/**
|
|
29
|
+
* Length threshold above which we skip the real tokenizer and use a
|
|
30
|
+
* conservative character-based estimate instead.
|
|
31
|
+
*
|
|
32
|
+
* js-tiktoken's BPE merge loop is O(n²) on long runs — a 60K-char string
|
|
33
|
+
* takes ~200 seconds to tokenize on current hardware (measured), which is
|
|
34
|
+
* a DoS vector on the budget gate. For any string longer than this
|
|
35
|
+
* threshold we estimate `ceil(length / CHARS_PER_TOKEN_LOWER)`, which
|
|
36
|
+
* always over-counts (since real English is 4+ chars/token), keeping us
|
|
37
|
+
* safely conservative with respect to the budget.
|
|
38
|
+
*
|
|
39
|
+
* Why 8000: at that size tiktoken takes ~2.5s which is already too slow
|
|
40
|
+
* for a request path. Anything under the threshold tokenizes in <50ms,
|
|
41
|
+
* which is acceptable.
|
|
42
|
+
*/
|
|
43
|
+
const EXACT_TOKENIZE_MAX_CHARS = 8_000;
|
|
44
|
+
/**
|
|
45
|
+
* Pessimistic chars-per-token ratio for the estimation path. Real English
|
|
46
|
+
* text runs at 4+ chars per token; we use 3.0 to over-count on purpose so
|
|
47
|
+
* the budget gate stays safe even on token-dense content (code, URLs).
|
|
48
|
+
*/
|
|
49
|
+
const CHARS_PER_TOKEN_LOWER = 3.0;
|
|
50
|
+
const encoder = getEncoding('p50k_base');
|
|
51
|
+
/**
 * Count (or conservatively estimate) the number of tokens in a string.
 *
 * Long strings (more than `EXACT_TOKENIZE_MAX_CHARS` UTF-16 code units) skip
 * the real tokenizer, which is too slow on large inputs, and return an
 * estimate instead:
 *   - ASCII code units are charged at `CHARS_PER_TOKEN_LOWER` chars/token;
 *   - every non-ASCII UTF-8 byte is charged as one full token. A byte-level
 *     BPE token always covers at least one byte, so this is a ceiling for
 *     that part of the text. Without it, CJK- or emoji-heavy content (often
 *     more than one token per character) would be under-counted by the
 *     chars/3 rule and slip past the budget gate.
 * For pure-ASCII input the estimate equals
 * `ceil(text.length / CHARS_PER_TOKEN_LOWER)`.
 *
 * Short strings are encoded with the shared module-level `encoder` for an
 * exact count. Both special-token lists are passed empty so that literal
 * text such as `<|endoftext|>` inside a document is encoded as ordinary
 * text instead of making the encoder throw.
 *
 * @param {string} text String to measure.
 * @returns {number} Exact token count for short strings, or a conservative
 *   over-estimate for long ones.
 */
export function countTokens(text) {
  if (text.length > EXACT_TOKENIZE_MAX_CHARS) {
    // Split the input into ASCII code units and "everything else".
    let asciiUnits = 0;
    for (let i = 0; i < text.length; i += 1) {
      if (text.charCodeAt(i) < 0x80) {
        asciiUnits += 1;
      }
    }
    // Each ASCII code unit is exactly one UTF-8 byte, so the remainder of the
    // byte length belongs to non-ASCII characters.
    const nonAsciiBytes = Buffer.byteLength(text, 'utf8') - asciiUnits;
    return Math.ceil(asciiUnits / CHARS_PER_TOKEN_LOWER) + nonAsciiBytes;
  }
  // allowedSpecial = [], disallowedSpecial = []: never throw on special-token
  // look-alikes in user content; just count them as normal text.
  return encoder.encode(text, [], []).length;
}
|
|
69
|
+
/**
 * Convert the user-facing `max_tokens` budget into the internal budget for
 * result bodies: subtract `ENVELOPE_RESERVE_TOKENS`, scale by `SAFETY_MARGIN`,
 * and round down.
 *
 * The result is clamped so it is never negative and never NaN. A missing or
 * non-numeric `maxTokens` yields 0 (fail closed) rather than NaN, because a
 * NaN budget makes every `cost > budget` comparison false and would silently
 * disable truncation in the fit loop.
 *
 * @param {number} maxTokens User-facing token budget from the tool call.
 * @returns {number} Non-negative internal budget for result bodies.
 */
export function computeInternalBudget(maxTokens) {
  const raw = Math.floor((maxTokens - ENVELOPE_RESERVE_TOKENS) * SAFETY_MARGIN);
  // Math.max(0, NaN) is NaN, so NaN has to be handled explicitly.
  if (Number.isNaN(raw)) {
    return 0;
  }
  return Math.max(0, raw);
}
|
|
78
|
+
/**
 * Fit pre-ranked results into the internal token budget, greedily and in the
 * order given (the caller ranks; this function never reorders).
 *
 * Each item is serialized and measured. Two outcomes are possible per item:
 *
 *  - The item alone is larger than the whole budget: it can never fit, so a
 *    small `too_large` stand-in ({ too_large, id, source_uri, tokens }) is
 *    emitted in its place. The stand-in's own serialized size is charged
 *    against the budget, and iteration continues so later, smaller items
 *    still get a chance.
 *  - Otherwise the item is kept whole if it fits in what remains.
 *
 * As soon as the next entry (item or stand-in) would overflow the remaining
 * budget, iteration stops. `truncated` is true whenever anything was dropped
 * or replaced by a stand-in.
 *
 * @param items      Results, pre-ranked (highest rank first).
 * @param maxTokens  User-facing `max_tokens` budget from the tool call.
 * @param serialize  Turns one item into the text whose tokens are counted
 *                   (usually `JSON.stringify`; the caller may add framing).
 * @param idOf       Reads the item's id for the stand-in.
 * @param uriOf      Reads the item's source uri for the stand-in.
 * @returns `{ kept, truncated, tokensUsed, tokensBudgeted }`.
 */
export function fitResults(items, maxTokens, serialize, idOf, uriOf) {
  const tokensBudgeted = computeInternalBudget(maxTokens);
  const kept = [];
  let tokensUsed = 0;
  let truncated = false;

  for (const candidate of items) {
    const bodyTokens = countTokens(serialize(candidate));
    const oversize = bodyTokens > tokensBudgeted;

    // What actually goes into the response: the item itself, or the
    // stand-in describing an item that is too big to ever fit.
    const entry = oversize
      ? {
          too_large: true,
          id: idOf(candidate),
          source_uri: uriOf(candidate),
          tokens: bodyTokens,
        }
      : candidate;
    const entryTokens = oversize
      ? countTokens(JSON.stringify(entry))
      : bodyTokens;

    if (tokensUsed + entryTokens > tokensBudgeted) {
      truncated = true;
      break;
    }

    kept.push(entry);
    tokensUsed += entryTokens;
    if (oversize) {
      // The real body was dropped even though the stand-in landed.
      truncated = true;
    }
  }

  return { kept, truncated, tokensUsed, tokensBudgeted };
}
|
|
137
|
+
/**
 * Measure a response envelope that has already been serialized to JSON.
 * No fitting or truncation happens here; the string is simply handed to
 * `countTokens`. Tool handlers use the result for `meta.tokens_used` once
 * the envelope is built.
 *
 * @param {string} envelopeJson Serialized envelope text.
 * @returns {number} Token count as reported by `countTokens`.
 */
export function countEnvelopeTokens(envelopeJson) {
  const envelopeTokens = countTokens(envelopeJson);
  return envelopeTokens;
}
|
|
145
|
+
//# sourceMappingURL=budget.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"budget.js","sourceRoot":"","sources":["../../src/gate/budget.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAiB,MAAM,aAAa,CAAC;AAEzD;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AAEH,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAC3C,MAAM,CAAC,MAAM,aAAa,GAAG,GAAG,CAAC;AAEjC;;;;;;;;;;;;;;GAcG;AACH,MAAM,wBAAwB,GAAG,KAAK,CAAC;AAEvC;;;;GAIG;AACH,MAAM,qBAAqB,GAAG,GAAG,CAAC;AAElC,MAAM,OAAO,GAAa,WAAW,CAAC,WAAW,CAAC,CAAC;AAEnD;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,IAAI,IAAI,CAAC,MAAM,GAAG,wBAAwB,EAAE,CAAC;QAC3C,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,qBAAqB,CAAC,CAAC;IACxD,CAAC;IACD,OAAO,OAAO,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;AACrC,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CAAC,SAAiB;IACrD,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,SAAS,GAAG,uBAAuB,CAAC,GAAG,aAAa,CAAC,CAAC;IAC9E,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;AAC1B,CAAC;AAsBD;;;;;;;;;;;;;;;;;;GAkBG;AACH,MAAM,UAAU,UAAU,CACxB,KAAU,EACV,SAAiB,EACjB,SAA8B,EAC9B,IAAyB,EACzB,KAA0B;IAE1B,MAAM,MAAM,GAAG,qBAAqB,CAAC,SAAS,CAAC,CAAC;IAChD,MAAM,IAAI,GAAgC,EAAE,CAAC;IAC7C,IAAI,IAAI,GAAG,CAAC,CAAC;IACb,IAAI,SAAS,GAAG,KAAK,CAAC;IAEtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,UAAU,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,IAAI,GAAG,WAAW,CAAC,UAAU,CAAC,CAAC;QAErC,IAAI,IAAI,GAAG,MAAM,EAAE,CAAC;YAClB,qEAAqE;YACrE,MAAM,QAAQ,GAAqB;gBACjC,SAAS,EAAE,IAAI;gBACf,EAAE,EAAE,IAAI,CAAC,IAAI,CAAC;gBACd,UAAU,EAAE,KAAK,CAAC,IAAI,CAAC;gBACvB,MAAM,EAAE,IAAI;aACb,CAAC;YACF,MAAM,YAAY,GAAG,WAAW,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC;YAC3D,IAAI,IAAI,GAAG,YAAY,GAAG,MAAM,EAAE,CAAC;gBACjC,SAAS,GAAG,IAAI,CAAC;gBACjB,MAAM;YACR,CAAC;YACD,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACpB,IAAI,IAAI,YAAY,CAAC;YACrB,SAAS,GAAG,IAAI,CAAC,CAAC,6BAA6B;YAC/C,SAAS;QACX,CAAC;QAED,IAAI,IAAI,GAAG,IAAI,GAAG,MAAM,EAAE,CAAC;YACzB,SAAS,GAAG,IAAI,CAAC;YACjB,MAAM;QACR,CAAC;QAED,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChB,IAAI,IAAI,IAAI,CAAC;IACf,CAAC;IAED,OAAO;QACL,IAAI;QACJ,SAAS;QACT,UAAU,EAAE,IAAI;QAChB,cAAc,EAAE,MAAM;KACvB,CAAC;AACJ,CAAC;AAED;;;;GAI
G;AACH,MAAM,UAAU,mBAAmB,CAAC,YAAoB;IACtD,OAAO,WAAW,CAAC,YAAY,CAAC,CAAC;AACnC,CAAC"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { IngestEvent, IngestSource, Scope } from './source.js';
|
|
2
|
+
/** Construction options for `ChokidarWatcher`. */
export interface ChokidarWatcherOptions {
    /** Wiki root directory to watch, given as an absolute path. */
    rootDir: string;
    /** Scope stamped onto every event this watcher emits. */
    scope: Scope;
    /**
     * Optional debounce window in milliseconds. Overrides the built-in
     * default; tests set this when running on fake timers.
     */
    debounceMs?: number;
}
|
|
10
|
+
/** File-system backed `IngestSource` that reports markdown file changes. */
export declare class ChokidarWatcher implements IngestSource {
    private readonly options;
    private watcher;
    private pending;
    private readonly debounceMs;
    private onEvent;
    constructor(options: ChokidarWatcherOptions);
    /**
     * Begin watching and deliver debounced events to `onEvent`. The returned
     * promise settles once the watcher reports that its initial scan is done.
     * Rejects if the watcher has already been started.
     */
    start(onEvent: (ev: IngestEvent) => Promise<void>): Promise<void>;
    /** Cancel all pending debounce timers and close the underlying watcher. */
    stop(): Promise<void>;
    /**
     * For tests only. Delivers every queued event right away instead of
     * waiting for the debounce timer, so chokidar.test.ts does not need to
     * sleep for the real debounce window.
     */
    __flushForTests(): Promise<void>;
    /**
     * For tests only. Reports how many events are queued at the moment;
     * used to assert the drop-oldest behaviour.
     */
    __pendingCountForTests(): number;
    private queueEvent;
    private dispatch;
}
|
|
33
|
+
//# sourceMappingURL=chokidar.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chokidar.d.ts","sourceRoot":"","sources":["../../src/ingest/chokidar.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,WAAW,EAAmB,YAAY,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAgCrF,MAAM,WAAW,sBAAsB;IACrC,+CAA+C;IAC/C,OAAO,EAAE,MAAM,CAAC;IAChB,qCAAqC;IACrC,KAAK,EAAE,KAAK,CAAC;IACb,iEAAiE;IACjE,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAeD,qBAAa,eAAgB,YAAW,YAAY;IAMtC,OAAO,CAAC,QAAQ,CAAC,OAAO;IALpC,OAAO,CAAC,OAAO,CAA0B;IACzC,OAAO,CAAC,OAAO,CAAwC;IACvD,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,OAAO,CAAqD;gBAEvC,OAAO,EAAE,sBAAsB;IAItD,KAAK,CAAC,OAAO,EAAE,CAAC,EAAE,EAAE,WAAW,KAAK,OAAO,CAAC,IAAI,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IA4CjE,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAc3B;;;;OAIG;IACG,eAAe,IAAI,OAAO,CAAC,IAAI,CAAC;IAStC;;;OAGG;IACH,sBAAsB,IAAI,MAAM;IAIhC,OAAO,CAAC,UAAU;YA8BJ,QAAQ;CAQvB"}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { resolve } from 'node:path';
|
|
2
|
+
import chokidar from 'chokidar';
|
|
3
|
+
import { logger } from '../observability/logger.js';
|
|
4
|
+
/**
|
|
5
|
+
* `ChokidarWatcher` — file-system implementation of `IngestSource` for KG-MCP Phase 2.
|
|
6
|
+
*
|
|
7
|
+
* Watches a wiki directory recursively for `*.md` changes, with the two
|
|
8
|
+
* critical adaptations for Pharos's wiki-updater (CLAUDE.md §Database Rules #4,
|
|
9
|
+
* §Architecture #3, Loop 6.5 A4):
|
|
10
|
+
*
|
|
11
|
+
* **2-second debounce per path** (NOT chokidar's default 50ms). The
|
|
12
|
+
* wiki-updater writes via atomic rename, which fires chokidar events in
|
|
13
|
+
* rapid bursts (`add` immediately followed by `change` from a follow-up
|
|
14
|
+
* write to the same file). A 50ms debounce coalesces some of these but
|
|
15
|
+
* misses the longer-tail bursts where the wiki-updater appends to log.md
|
|
16
|
+
* a few hundred milliseconds after the initial atomic rename. 2 seconds
|
|
17
|
+
* is the empirically-derived sweet spot.
|
|
18
|
+
*
|
|
19
|
+
* **Bounded queue with drop-oldest per source path**. If 3 events arrive
|
|
20
|
+
* for the same file before the 2s timer fires, only the latest event is
|
|
21
|
+
* dispatched. Earlier events are silently dropped. This is correct
|
|
22
|
+
* because each event represents the *current* state of the file — older
|
|
23
|
+
* events are already obsolete by the time we'd ingest them.
|
|
24
|
+
*
|
|
25
|
+
* **Initial scan**: chokidar emits `add` events for every existing file
|
|
26
|
+
* during its first pass (when `ignoreInitial: false`). These are dispatched
|
|
27
|
+
* just like change events, so a fresh `serve` start will trigger the
|
|
28
|
+
* ingester for every wiki file. The ingester's manifest fast-path
|
|
29
|
+
* (source_sha unchanged → noop) makes this cheap on warm starts.
|
|
30
|
+
*/
|
|
31
|
+
const DEBOUNCE_MS = 2_000;
|
|
32
|
+
/**
 * Watches a directory tree with chokidar and forwards markdown file events
 * (`file:added`, `file:changed`, `file:removed`) to a single handler, after a
 * per-path debounce. While a path has an event waiting, newer events for the
 * same path overwrite it without restarting the timer, so at most one event
 * per path is ever queued.
 */
export class ChokidarWatcher {
  options;
  watcher = null;
  pending = new Map();
  debounceMs;
  onEvent = null;

  /**
   * @param options `{ rootDir, scope, debounceMs? }`; `debounceMs` falls back
   *   to the module default when null or undefined.
   */
  constructor(options) {
    this.options = options;
    this.debounceMs = options.debounceMs ?? DEBOUNCE_MS;
  }

  /**
   * Start watching `options.rootDir` and route events to `onEvent`.
   * Resolves once chokidar emits `ready`, i.e. after the initial scan has
   * queued its `add` events.
   *
   * @param onEvent Async handler invoked with each debounced event.
   * @throws {Error} If the watcher is already running.
   */
  async start(onEvent) {
    if (this.watcher) {
      throw new Error('ChokidarWatcher.start called twice');
    }
    this.onEvent = onEvent;

    const root = resolve(this.options.rootDir);

    // chokidar 4.x has no glob support and its `ignored` option is also
    // matched against the root directory, so the whole tree is watched and
    // non-markdown paths are filtered in the handlers below. Our own
    // debounce replaces `awaitWriteFinish`.
    const watcher = chokidar.watch(root, {
      ignoreInitial: false,
      awaitWriteFinish: false,
    });
    this.watcher = watcher;

    // chokidar event name -> ingest event kind, registered in this order.
    const routes = [
      ['add', 'file:added'],
      ['change', 'file:changed'],
      ['unlink', 'file:removed'],
    ];
    for (const [fsEvent, kind] of routes) {
      watcher.on(fsEvent, (path) => {
        if (path.toLowerCase().endsWith('.md')) {
          this.queueEvent(kind, path);
        }
      });
    }

    watcher.on('error', (err) => {
      logger.error({ err, root }, 'chokidar error');
    });

    // Hold the caller until the initial scan is complete.
    await new Promise((resolveWait) => {
      watcher.once('ready', () => resolveWait());
    });
  }

  /**
   * Stop watching: cancel every pending debounce timer, close the chokidar
   * watcher if one exists, and drop the event handler.
   */
  async stop() {
    for (const entry of this.pending.values()) {
      clearTimeout(entry.timer);
    }
    this.pending.clear();

    if (this.watcher) {
      await this.watcher.close();
      this.watcher = null;
    }
    this.onEvent = null;
  }

  /**
   * For tests only: deliver everything that is currently queued, one event
   * at a time, without waiting for the debounce timers.
   */
  async __flushForTests() {
    const snapshot = [...this.pending.values()];
    this.pending.clear();
    for (const entry of snapshot) {
      clearTimeout(entry.timer);
      await this.dispatch(entry.event);
    }
  }

  /**
   * For tests only: how many paths currently have an event waiting.
   */
  __pendingCountForTests() {
    return this.pending.size;
  }

  /**
   * Record an event for `path`. The first event for a path arms a timer of
   * `debounceMs`; later events for the same path only replace the stored
   * event, leaving the timer untouched.
   *
   * @param kind Event kind to emit.
   * @param path Path as reported by chokidar; resolved to absolute here.
   */
  queueEvent(kind, path) {
    const absolutePath = resolve(path);
    const latest = { kind, path: absolutePath, scope: this.options.scope };

    const queued = this.pending.get(absolutePath);
    if (queued) {
      // Newest event wins; the running timer keeps its original deadline.
      queued.event = latest;
      return;
    }

    const timer = setTimeout(() => {
      const due = this.pending.get(absolutePath);
      this.pending.delete(absolutePath);
      if (due) {
        // Deliberately not awaited: dispatch() logs its own failures.
        void this.dispatch(due.event);
      }
    }, this.debounceMs);

    this.pending.set(absolutePath, { event: latest, timer });
  }

  /**
   * Hand one event to the registered handler. Does nothing when no handler
   * is set; handler failures are logged and not re-thrown.
   *
   * @param event Event to deliver.
   */
  async dispatch(event) {
    if (!this.onEvent) {
      return;
    }
    try {
      await this.onEvent(event);
    } catch (err) {
      logger.error({ err, event }, 'ingest event handler failed');
    }
  }
}
|
|
152
|
+
//# sourceMappingURL=chokidar.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chokidar.js","sourceRoot":"","sources":["../../src/ingest/chokidar.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAEpC,OAAO,QAA4B,MAAM,UAAU,CAAC;AAEpD,OAAO,EAAE,MAAM,EAAE,MAAM,4BAA4B,CAAC;AAGpD;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,MAAM,WAAW,GAAG,KAAK,CAAC;AAwB1B,MAAM,OAAO,eAAe;IAMG;IALrB,OAAO,GAAqB,IAAI,CAAC;IACjC,OAAO,GAA8B,IAAI,GAAG,EAAE,CAAC;IACtC,UAAU,CAAS;IAC5B,OAAO,GAAgD,IAAI,CAAC;IAEpE,YAA6B,OAA+B;QAA/B,YAAO,GAAP,OAAO,CAAwB;QAC1D,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,WAAW,CAAC;IACtD,CAAC;IAED,KAAK,CAAC,KAAK,CAAC,OAA2C;QACrD,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;QACxD,CAAC;QACD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QAEvB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAE3C,oEAAoE;QACpE,qEAAqE;QACrE,yEAAyE;QACzE,wEAAwE;QACxE,0EAA0E;QAC1E,mEAAmE;QACnE,IAAI,CAAC,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,EAAE;YAClC,aAAa,EAAE,KAAK;YACpB,sEAAsE;YACtE,+CAA+C;YAC/C,gBAAgB,EAAE,KAAK;SACxB,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,CAAC,IAAY,EAAW,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;QAEjF,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,EAAE;YAC9B,IAAI,UAAU,CAAC,IAAI,CAAC;gBAAE,IAAI,CAAC,UAAU,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;QAC5D,CAAC,CAAC,CAAC;QACH,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,EAAE;YACjC,IAAI,UAAU,CAAC,IAAI,CAAC;gBAAE,IAAI,CAAC,UAAU,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;QAC9D,CAAC,CAAC,CAAC;QACH,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,QAAQ,EAAE,CAAC,IAAI,EAAE,EAAE;YACjC,IAAI,UAAU,CAAC,IAAI,CAAC;gBAAE,IAAI,CAAC,UAAU,CAAC,cAAc,EAAE,IAAI,CAAC,CAAC;QAC9D,CAAC,CAAC,CAAC;QACH,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE;YAC/B,MAAM,CAAC,KAAK,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,gBAAgB,CAAC,CAAC;QAChD,CAAC,CAAC,CAAC;QAEH,yEAAyE;QACzE,wEAAwE;QACxE,6BAA6B;QAC7B,MAAM,IAAI,OAAO,CAAO,CAAC,WAAW,EAAE,EAAE;YACtC,IAAI,CAAC,OAAQ,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC;IACL,CAAC;IAED,KAAK,CAAC,IAAI;QACR,mEAAmE;QACnE,KAAK,MAAM,OAAO,IAAI,I
AAI,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC;YAC5C,YAAY,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QAC9B,CAAC;QACD,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;QAErB,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACjB,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;IACtB,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,eAAe;QACnB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAC;QACnD,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;QACrB,KAAK,MAAM,CAAC,EAAE,OAAO,CAAC,IAAI,OAAO,EAAE,CAAC;YAClC,YAAY,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;YAC5B,MAAM,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IAED;;;OAGG;IACH,sBAAsB;QACpB,OAAO,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;IAC3B,CAAC;IAEO,UAAU,CAAC,IAAqB,EAAE,IAAY;QACpD,MAAM,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;QAC1B,MAAM,KAAK,GAAgB,EAAE,IAAI,EAAE,IAAI,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;QAE1E,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QACvC,IAAI,QAAQ,EAAE,CAAC;YACb,qEAAqE;YACrE,oEAAoE;YACpE,oEAAoE;YACpE,oBAAoB;YACpB,QAAQ,CAAC,KAAK,GAAG,KAAK,CAAC;YACvB,OAAO;QACT,CAAC;QAED,4DAA4D;QAC5D,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;YAC5B,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YACtC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACzB,IAAI,OAAO,EAAE,CAAC;gBACZ,iEAAiE;gBACjE,gEAAgE;gBAChE,gEAAgE;gBAChE,2CAA2C;gBAC3C,KAAK,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;YACpC,CAAC;QACH,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC;QAEpB,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;IAC1C,CAAC;IAEO,KAAK,CAAC,QAAQ,CAAC,KAAkB;QACvC,IAAI,CAAC,IAAI,CAAC,OAAO;YAAE,OAAO;QAC1B,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QAC5B,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,CAAC,KAAK,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE,6BAA6B,CAAC,CAAC;QAC9D,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import type { DbBundle } from '../db/client.js';
|
|
2
|
+
import type { Embedder } from '../retrieval/embedder.js';
|
|
3
|
+
import { type Manifest, fileSha, manifestPathFor } from './manifest.js';
|
|
4
|
+
import { type Chunk as ChunkText } from './parse/chunk.js';
|
|
5
|
+
/**
|
|
6
|
+
* IngesterService — the load-bearing single-flight writer for KG-MCP Phase 2.
|
|
7
|
+
*
|
|
8
|
+
* **What it does**: given an absolute path to a markdown file, parse it into
|
|
9
|
+
* sections, chunk each section, embed only the chunks whose sha changed, and
|
|
10
|
+
* upsert the result into SQLite under one transaction. Update the consistency
|
|
11
|
+
* manifest, append a row to `kg_log`. Errors trigger a clean rollback and an
|
|
12
|
+
* error log entry, and only surface via the caller's await — they're caught,
|
|
13
|
+
* logged, and re-thrown so the rebuild CLI can decide whether to abort or
|
|
14
|
+
* continue.
|
|
15
|
+
*
|
|
16
|
+
* **Key invariants** (from CLAUDE.md §Database Rules):
|
|
17
|
+
*
|
|
18
|
+
* - **Single writer per source_uri**: a module-level `Map<string, Promise>`
|
|
19
|
+
* gates concurrent calls to `ingestFile(samePath)`. The second caller waits
|
|
20
|
+
* on the first's in-progress promise and shares its result (no double-ingest). This is the
|
|
21
|
+
* single-flight pattern; CLAUDE.md §Architecture #3 / §Database Rules #5.
|
|
22
|
+
*
|
|
23
|
+
* - **Per-chunk skip-unchanged**: on a re-ingest of an existing file, we
|
|
24
|
+
* load the existing `kg_chunks` rows for that file's nodes, build a
|
|
25
|
+
* `chunk_sha → embedding` map of already-embedded chunks, and only call `embedder.embed()`
|
|
26
|
+
* for chunks whose sha is NOT in the map. This is THE optimization that
|
|
27
|
+
* makes Pharos's whole-file-rewrite-per-turn pattern viable — without it,
|
|
28
|
+
* 60 chunks × 50ms embed = 3s of blocking work per turn (Loop 6.5 A4).
|
|
29
|
+
*
|
|
30
|
+
* - **Transaction per file**: all DML for one file lives inside one
|
|
31
|
+
* `BEGIN ... COMMIT` block. Partial failures roll back cleanly so the DB
|
|
32
|
+
* never observes a half-ingested file. The FTS5 triggers fire inside the
|
|
33
|
+
* transaction; sqlite-vec inserts also happen inside.
|
|
34
|
+
*
|
|
35
|
+
* - **Idempotent upserts**: node ids and chunk ids are deterministic
|
|
36
|
+
* `sha1()` of stable inputs, so re-ingesting the same content produces
|
|
37
|
+
* identical row ids. We use `INSERT OR REPLACE` so a re-ingest with
|
|
38
|
+
* different content cleanly overwrites.
|
|
39
|
+
*
|
|
40
|
+
* - **Manifest is the tiebreaker on crash recovery**: written AFTER the
|
|
41
|
+
* transaction commits. If we crash between COMMIT and writeManifest, the
|
|
42
|
+
* next startup's consistency check will detect the divergence (DB has
|
|
43
|
+
* the new chunks but manifest still has old chunk_shas) and re-ingest.
|
|
44
|
+
* The work is duplicated but not wrong — that's the right safety/perf
|
|
45
|
+
* tradeoff for pre-v1 sqlite-vec.
|
|
46
|
+
*/
|
|
47
|
+
export type Scope = 'project' | 'personal';
|
|
48
|
+
export interface IngestResult {
|
|
49
|
+
/** Number of new chunks written (excludes chunks reused via per-chunk skip) */
|
|
50
|
+
chunks_added: number;
|
|
51
|
+
/** Number of chunks reused via per-chunk skip (no embedder call) */
|
|
52
|
+
chunks_skipped: number;
|
|
53
|
+
/** Number of embedder.embed() calls actually made */
|
|
54
|
+
embedder_calls: number;
|
|
55
|
+
/** Number of nodes (sections) written for this file */
|
|
56
|
+
nodes_written: number;
|
|
57
|
+
/** True if the file was unchanged and we exited early via the manifest fast path */
|
|
58
|
+
noop: boolean;
|
|
59
|
+
}
|
|
60
|
+
export interface IngesterOptions {
|
|
61
|
+
/** Optional override for the manifest file path (default: derived from wikiRoot) */
|
|
62
|
+
manifestPath?: string;
|
|
63
|
+
}
|
|
64
|
+
/** Test-only: clear the single-flight map. Do not call from production code. */
|
|
65
|
+
export declare function __resetSingleFlightForTests(): void;
|
|
66
|
+
export declare class IngesterService {
|
|
67
|
+
private readonly bundle;
|
|
68
|
+
private readonly embedder;
|
|
69
|
+
private readonly scope;
|
|
70
|
+
private manifest;
|
|
71
|
+
private readonly manifestPath;
|
|
72
|
+
constructor(bundle: DbBundle, embedder: Embedder, scope: Scope, wikiRoot: string, options?: IngesterOptions);
|
|
73
|
+
/**
|
|
74
|
+
* Ingest a single markdown file. Single-flight: concurrent calls for the
|
|
75
|
+
* same file return the same in-progress promise.
|
|
76
|
+
*
|
|
77
|
+
* Returns the per-chunk metrics for this run (added/skipped/embedder calls)
|
|
78
|
+
* so callers (rebuild CLI, chokidar handler) can log a summary.
|
|
79
|
+
*/
|
|
80
|
+
ingestFile(absPath: string): Promise<IngestResult>;
|
|
81
|
+
/**
|
|
82
|
+
* Remove a file from the index (for `file:removed` chokidar events).
|
|
83
|
+
* Deletes the file's nodes (cascades to chunks/edges/vec rows via FK)
|
|
84
|
+
* and updates the manifest. Single-statement, no transaction needed.
|
|
85
|
+
*/
|
|
86
|
+
removeFile(absPath: string): void;
|
|
87
|
+
/**
|
|
88
|
+
* The current in-memory manifest. Exposed for tests + the startup
|
|
89
|
+
* consistency check in `cli/serve.ts`.
|
|
90
|
+
*/
|
|
91
|
+
getManifest(): Manifest;
|
|
92
|
+
/**
|
|
93
|
+
* Reload the manifest from disk. Used at startup before walking the
|
|
94
|
+
* consistency check, and by tests that mutate the manifest externally.
|
|
95
|
+
*/
|
|
96
|
+
reloadManifest(): void;
|
|
97
|
+
private ingestFileImpl;
|
|
98
|
+
/**
|
|
99
|
+
* Load `chunk_sha → embedding` for every existing chunk of a file. Used by
|
|
100
|
+
* the per-chunk skip optimization: if a new chunk's sha matches one in
|
|
101
|
+
* this map, we reuse the embedding instead of calling the embedder.
|
|
102
|
+
*
|
|
103
|
+
* Returns an empty map if the file isn't in the DB yet (first ingest).
|
|
104
|
+
*/
|
|
105
|
+
private loadExistingEmbeddings;
|
|
106
|
+
private runInTransaction;
|
|
107
|
+
private appendLog;
|
|
108
|
+
}
|
|
109
|
+
/**
|
|
110
|
+
* Return a fresh, empty manifest (for resetting the in-memory one). Used by `kg rebuild` when the
|
|
111
|
+
* caller passes `--clean` to force a full re-ingest from scratch.
|
|
112
|
+
*/
|
|
113
|
+
export declare function freshManifest(): Manifest;
|
|
114
|
+
export type { ChunkText };
|
|
115
|
+
export { manifestPathFor };
|
|
116
|
+
export { fileSha };
|
|
117
|
+
//# sourceMappingURL=ingester.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ingester.d.ts","sourceRoot":"","sources":["../../src/ingest/ingester.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,iBAAiB,CAAC;AAChD,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,0BAA0B,CAAC;AAGzD,OAAO,EACL,KAAK,QAAQ,EAGb,OAAO,EACP,eAAe,EAKhB,MAAM,eAAe,CAAC;AAGvB,OAAO,EAAgB,KAAK,KAAK,IAAI,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAEzE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AAMH,MAAM,MAAM,KAAK,GAAG,SAAS,GAAG,UAAU,CAAC;AAE3C,MAAM,WAAW,YAAY;IAC3B,+CAA+C;IAC/C,YAAY,EAAE,MAAM,CAAC;IACrB,oEAAoE;IACpE,cAAc,EAAE,MAAM,CAAC;IACvB,qDAAqD;IACrD,cAAc,EAAE,MAAM,CAAC;IACvB,uDAAuD;IACvD,aAAa,EAAE,MAAM,CAAC;IACtB,oFAAoF;IACpF,IAAI,EAAE,OAAO,CAAC;CACf;AAED,MAAM,WAAW,eAAe;IAC9B,oFAAoF;IACpF,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAoBD,gFAAgF;AAChF,wBAAgB,2BAA2B,IAAI,IAAI,CAElD;AAMD,qBAAa,eAAe;IAKxB,OAAO,CAAC,QAAQ,CAAC,MAAM;IACvB,OAAO,CAAC,QAAQ,CAAC,QAAQ;IACzB,OAAO,CAAC,QAAQ,CAAC,KAAK;IANxB,OAAO,CAAC,QAAQ,CAAW;IAC3B,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAS;gBAGnB,MAAM,EAAE,QAAQ,EAChB,QAAQ,EAAE,QAAQ,EAClB,KAAK,EAAE,KAAK,EAC7B,QAAQ,EAAE,MAAM,EAChB,OAAO,GAAE,eAAoB;IAM/B;;;;;;OAMG;IACG,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,CAAC;IAsBxD;;;;OAIG;IACH,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI;IAcjC;;;OAGG;IACH,WAAW,IAAI,QAAQ;IAIvB;;;OAGG;IACH,cAAc,IAAI,IAAI;YAQR,cAAc;IAiM5B;;;;;;OAMG;IACH,OAAO,CAAC,sBAAsB;IA4B9B,OAAO,CAAC,gBAAgB;IAKxB,OAAO,CAAC,SAAS;CAYlB;AAsCD;;;GAGG;AACH,wBAAgB,aAAa,IAAI,QAAQ,CAExC;AAGD,YAAY,EAAE,SAAS,EAAE,CAAC;AAC1B,OAAO,EAAE,eAAe,EAAE,CAAC;AAC3B,OAAO,EAAE,OAAO,EAAE,CAAC"}
|