membot 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +3 -0
- package/.cursor/rules/membot.mdc +3 -0
- package/README.md +5 -0
- package/package.json +1 -1
- package/scripts/build-test-docx.ts +84 -0
- package/src/cli.ts +11 -0
- package/src/config/schemas.ts +20 -0
- package/src/constants.ts +15 -0
- package/src/context.ts +24 -0
- package/src/ingest/converter/docx.ts +47 -5
- package/src/ingest/converter/html.ts +10 -3
- package/src/ingest/converter/image.ts +40 -3
- package/src/ingest/converter/images-inline.ts +132 -0
- package/src/ingest/converter/index.ts +4 -3
- package/src/ingest/embed-worker.ts +74 -0
- package/src/ingest/embedder-pool.ts +391 -0
- package/src/ingest/embedder.ts +40 -2
- package/src/ingest/ingest.ts +1 -1
- package/src/operations/add.ts +94 -86
- package/src/operations/index.ts +2 -0
- package/src/operations/refresh.ts +28 -20
- package/src/operations/stats.ts +342 -0
- package/src/operations/write.ts +48 -40
- package/src/refresh/runner.ts +1 -1
- package/src/refresh/scheduler.ts +22 -13
package/src/operations/add.ts
CHANGED

```diff
@@ -1,4 +1,6 @@
 import { z } from "zod";
+import { resolveEmbeddingWorkers } from "../context.ts";
+import { withEmbedderPool } from "../ingest/embedder-pool.ts";
 import {
   countResolvedEntries,
   type IngestCallbacks,
@@ -105,101 +107,107 @@ Pass `logical_path` to override. For a multi-source / directory / glob walk it
     return `${lines.join("\n")}\n${parts.join(", ")}`;
   },
   handler: async (input, ctx) => {
-    … (2 removed lines not captured in this diff view)
+    // Spin up an ephemeral embedder pool for the whole `add` command —
+    // `withEmbedderPool` handles the workers=1 short-circuit and disposes
+    // the children when the closure returns (see embedder-pool.ts).
+    const workers = resolveEmbeddingWorkers(ctx.config.embedding.workers);
+    return withEmbedderPool(workers, ctx.config.embedding_model, async () => {
+      const { sources, ...rest } = input;
+      const followSymlinks = rest.follow_symlinks ?? true;
 
-    … (15 removed lines not captured in this diff view)
+      // Phase 1: resolve every source upfront so the shared progress bar
+      // knows its total. A resolve failure (bad path, glob with no base) is
+      // captured per-source so one bad arg doesn't abort the whole batch.
+      type ResolveOutcome = { source: string; resolved: ResolvedSource } | { source: string; error: Error };
+      const outcomes: ResolveOutcome[] = [];
+      for (const source of sources) {
+        try {
+          const resolved = await resolveSource(source, {
+            include: rest.include,
+            exclude: rest.exclude,
+            followSymlinks,
+          });
+          outcomes.push({ source, resolved });
+        } catch (err) {
+          outcomes.push({ source, error: err instanceof Error ? err : new Error(String(err)) });
+        }
       }
-    }
 
-    … (1 removed line not captured in this diff view)
+      const total = outcomes.reduce((n, o) => ("error" in o ? n + 1 : n + countResolvedEntries(o.resolved)), 0);
 
-    … (7 removed lines not captured in this diff view)
+      const aggregated: IngestResult = {
+        ingested: [],
+        total: 0,
+        ok: 0,
+        unchanged: 0,
+        failed: 0,
+      };
 
-    … (6 removed lines not captured in this diff view)
+      ctx.progress.start(total, "ingest");
+      const callbacks: IngestCallbacks = {
+        onEntryStart: (label) => ctx.progress.tick(label),
+        onEntryComplete: (entry) => ctx.progress.entry(formatEntryLine(entry)),
+        onEntryProgress: (_label, sublabel) => ctx.progress.update(sublabel),
+      };
 
-    … (20 removed lines not captured in this diff view)
+      for (const outcome of outcomes) {
+        if ("error" in outcome) {
+          const failed: IngestEntryResult = {
+            source_path: outcome.source,
+            logical_path: outcome.source,
+            version_id: null,
+            status: "failed",
+            error: outcome.error.message,
+            mime_type: null,
+            size_bytes: 0,
+            fetcher: "local",
+            source_sha256: "",
+          };
+          callbacks.onEntryStart?.(outcome.source);
+          callbacks.onEntryComplete?.(failed);
+          aggregated.ingested.push(failed);
+          aggregated.total += 1;
+          aggregated.failed += 1;
+          continue;
+        }
 
-    … (30 removed lines not captured in this diff view)
+        try {
+          const r = await ingestResolved(outcome.resolved, { ...rest, source: outcome.source }, ctx, callbacks);
+          aggregated.ingested.push(...r.ingested);
+          aggregated.total += r.total;
+          aggregated.ok += r.ok;
+          aggregated.unchanged += r.unchanged;
+          aggregated.failed += r.failed;
+        } catch (err) {
+          const message = err instanceof Error ? err.message : String(err);
+          const failed: IngestEntryResult = {
+            source_path: outcome.source,
+            logical_path: outcome.source,
+            version_id: null,
+            status: "failed",
+            error: message,
+            mime_type: null,
+            size_bytes: 0,
+            fetcher: "local",
+            source_sha256: "",
+          };
+          callbacks.onEntryStart?.(outcome.source);
+          callbacks.onEntryComplete?.(failed);
+          aggregated.ingested.push(failed);
+          aggregated.total += 1;
+          aggregated.failed += 1;
+        } finally {
+          // Release the DB lock between sources so other consumers (a
+          // concurrent CLI call, the daemon, or a separate MCP server)
+          // can wedge in. The next source's first DB call reopens.
+          await ctx.db.release();
+        }
       }
-    }
 
-    … (3 removed lines not captured in this diff view)
+      const summary = formatSummary(aggregated);
+      ctx.progress.done(summary);
+      return aggregated;
+    });
   },
 });
 
```
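Both rewritten handlers lean on `withEmbedderPool(workers, model, fn)` from the new `src/ingest/embedder-pool.ts` (+391 lines), which this diff view doesn't expand. The sketch below only illustrates the contract the `add` handler appears to rely on (pool up for the duration of one command, inline short-circuit at one worker, disposal even on error); `EmbedderPool` and `startWorkers` are illustrative names, not the package's actual internals.

```typescript
// Sketch only: the real pool logic lives in src/ingest/embedder-pool.ts and
// is not shown in this diff. `startWorkers` is a hypothetical stand-in for
// spawning embed-worker.ts subprocesses.
interface EmbedderPool {
  dispose(): Promise<void>;
}

async function startWorkers(workers: number, model: string): Promise<EmbedderPool> {
  const procs = Array.from({ length: workers }, () => ({ kill() {} }));
  console.log(`spawned ${procs.length} embed workers for ${model}`);
  return { dispose: async () => procs.forEach((p) => p.kill()) };
}

async function withEmbedderPool<T>(workers: number, model: string, fn: () => Promise<T>): Promise<T> {
  // The workers=1 short-circuit mentioned in the handler comment:
  // nothing to parallelize, so run the closure inline with no subprocesses.
  if (workers <= 1) return fn();
  const pool = await startWorkers(workers, model);
  try {
    return await fn();
  } finally {
    // Dispose even when fn throws, so `membot add` never leaks children.
    await pool.dispose();
  }
}
```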
package/src/operations/index.ts
CHANGED

```diff
@@ -8,6 +8,7 @@ import { readOperation } from "./read.ts";
 import { refreshOperation } from "./refresh.ts";
 import { removeOperation } from "./remove.ts";
 import { searchOperation } from "./search.ts";
+import { statsOperation } from "./stats.ts";
 import { treeOperation } from "./tree.ts";
 import type { Operation } from "./types.ts";
 import { versionsOperation } from "./versions.ts";
@@ -28,6 +29,7 @@ export const OPERATIONS: Operation<any, any>[] = [
   readOperation,
   searchOperation,
   infoOperation,
+  statsOperation,
   versionsOperation,
   diffOperation,
   writeOperation,
```
package/src/operations/refresh.ts
CHANGED

```diff
@@ -1,5 +1,7 @@
 import { z } from "zod";
+import { resolveEmbeddingWorkers } from "../context.ts";
 import { listDueRefreshes } from "../db/files.ts";
+import { withEmbedderPool } from "../ingest/embedder-pool.ts";
 import { colors } from "../output/formatter.ts";
 import { refreshOne } from "../refresh/runner.ts";
 import { defineOperation } from "./types.ts";
@@ -47,26 +49,32 @@ export const refreshOperation = defineOperation({
     return `${lines.join("\n")}\n${parts.join(", ")}`;
   },
   handler: async (input, ctx) => {
-    const … (17 removed lines; old handler body not captured in this diff view)
+    // Per-command embedder pool: workers come up at the start of the
+    // refresh sweep and are killed before we return, so a manual
+    // `membot refresh` doesn't leave subprocesses around.
+    const workers = resolveEmbeddingWorkers(ctx.config.embedding.workers);
+    return withEmbedderPool(workers, ctx.config.embedding_model, async () => {
+      const targets = input.logical_path
+        ? [input.logical_path]
+        : (await listDueRefreshes(ctx.db)).map((r) => r.logical_path);
+      const out: Array<{
+        logical_path: string;
+        status: "ok" | "unchanged" | "failed";
+        new_version_id?: string;
+        error?: string;
+      }> = [];
+      ctx.progress.start(targets.length, "refresh");
+      for (const path of targets) {
+        ctx.progress.tick(path);
+        try {
+          const r = await refreshOne(ctx, path, input.force, (sublabel) => ctx.progress.update(sublabel));
+          out.push(r);
+        } catch (err) {
+          out.push({ logical_path: path, status: "failed", error: err instanceof Error ? err.message : String(err) });
+        }
       }
-    … (3 removed lines not captured in this diff view)
+      ctx.progress.done(`refresh: ${out.filter((r) => r.status === "ok").length}/${out.length} updated`);
+      return { processed: out, count: out.length };
+    });
   },
 });
```
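Both call sites normalize the configured worker count through `resolveEmbeddingWorkers` from `src/context.ts` (+24 lines, also not expanded in this view). Something in this shape would match how it's invoked; this is purely an assumption about a file the diff doesn't show.

```typescript
import { cpus } from "node:os";

// Hypothetical reconstruction: the actual helper lives in src/context.ts and
// is not part of the visible hunks. It only illustrates the normalization a
// `config.embedding.workers` setting typically needs before spawning a pool.
function resolveEmbeddingWorkers(configured: number | "auto" | undefined): number {
  if (configured === undefined || configured === "auto") {
    return Math.max(1, cpus().length - 1); // leave a core for the main process
  }
  return Math.max(1, Math.floor(configured));
}
```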
package/src/operations/stats.ts
ADDED

```diff
@@ -0,0 +1,342 @@
+import { z } from "zod";
+import type { DbConnection, SqlParam } from "../db/connection.ts";
+import { listDueRefreshes } from "../db/files.ts";
+import { colors } from "../output/formatter.ts";
+import { defineOperation } from "./types.ts";
+
+export const statsOperation = defineOperation({
+  name: "membot_stats",
+  cliName: "stats",
+  description: `Summarize the local membot index: file/version/chunk/blob counts, total content and on-disk size, refresh health, and breakdowns by source_type, downloader, and mime_type. Optional prefix narrows aggregates to a subtree (same semantics as 'membot tree <prefix>'). Read-only. Use this before membot_prune to gauge how much there is to drop, or as a first call to confirm the index has anything in it.`,
+  inputSchema: z.object({
+    prefix: z
+      .string()
+      .optional()
+      .describe(
+        "Restrict aggregates to logical paths starting with this prefix (e.g. 'docs/api/'). Omit to summarize the whole index.",
+      ),
+  }),
+  outputSchema: z.object({
+    prefix: z.string().nullable(),
+    db_path: z.string(),
+    db_size_bytes: z.number(),
+    files: z.object({
+      current: z.number(),
+      tombstoned_paths: z.number(),
+      total_versions: z.number(),
+      distinct_paths: z.number(),
+      by_source_type: z.record(z.string(), z.number()),
+      by_downloader: z.record(z.string(), z.number()),
+      by_mime_type: z.record(z.string(), z.number()),
+    }),
+    content: z.object({
+      total_bytes: z.number(),
+      total_versions_bytes: z.number(),
+    }),
+    chunks: z.object({
+      current: z.number(),
+      total: z.number(),
+    }),
+    blobs: z.object({
+      count: z.number(),
+      total_bytes: z.number(),
+    }),
+    refresh: z.object({
+      scheduled: z.number(),
+      due_now: z.number(),
+      last_status: z.record(z.string(), z.number()),
+    }),
+  }),
+  cli: { positional: ["prefix"] },
+  console_formatter: (result) => {
+    const lines: string[] = [];
+    const heading = (s: string) => colors.bold(s);
+    // Always leave at least 2 spaces between key and value, even when the
+    // key is wider than the target column (long mime types, long keys).
+    const kv = (k: string, v: string, indent = 0) => {
+      const target = Math.max(22 - indent, k.length + 2);
+      return `${" ".repeat(indent)}${colors.dim(k.padEnd(target))}${v}`;
+    };
+    const orNone = (record: Record<string, number>): string[] => {
+      const keys = Object.keys(record);
+      if (keys.length === 0) return [` ${colors.dim("(none)")}`];
+      return keys.map((k) => kv(k, String(record[k]), 4));
+    };
+    const header = result.prefix
+      ? `${heading("membot index summary")} ${colors.dim(`[prefix=${result.prefix}]`)}`
+      : heading("membot index summary");
+    lines.push(header);
+    lines.push(kv("db_path", result.db_path));
+    lines.push(kv("db_size_bytes", formatBytes(result.db_size_bytes)));
+
+    lines.push("");
+    lines.push(heading("files"));
+    lines.push(kv("current", String(result.files.current), 2));
+    lines.push(kv("tombstoned_paths", String(result.files.tombstoned_paths), 2));
+    lines.push(kv("total_versions", String(result.files.total_versions), 2));
+    lines.push(kv("distinct_paths", String(result.files.distinct_paths), 2));
+    lines.push(kv("by_source_type", "", 2));
+    lines.push(...orNone(result.files.by_source_type));
+    lines.push(kv("by_downloader", "", 2));
+    lines.push(...orNone(result.files.by_downloader));
+    lines.push(kv("by_mime_type", "", 2));
+    lines.push(...orNone(result.files.by_mime_type));
+
+    lines.push("");
+    lines.push(heading("content"));
+    lines.push(kv("total_bytes", formatBytes(result.content.total_bytes), 2));
+    lines.push(kv("total_versions_bytes", formatBytes(result.content.total_versions_bytes), 2));
+
+    lines.push("");
+    lines.push(heading("chunks"));
+    lines.push(kv("current", String(result.chunks.current), 2));
+    lines.push(kv("total", String(result.chunks.total), 2));
+
+    lines.push("");
+    lines.push(heading("blobs"));
+    lines.push(kv("count", String(result.blobs.count), 2));
+    lines.push(kv("total_bytes", formatBytes(result.blobs.total_bytes), 2));
+
+    lines.push("");
+    lines.push(heading("refresh"));
+    lines.push(kv("scheduled", String(result.refresh.scheduled), 2));
+    lines.push(kv("due_now", String(result.refresh.due_now), 2));
+    lines.push(kv("last_status", "", 2));
+    lines.push(...orNone(result.refresh.last_status));
+
+    return lines.join("\n");
+  },
+  handler: async (input, ctx) => {
+    const prefix = input.prefix ?? null;
+    const dbSize = await dbFileSize(ctx.db.path);
+
+    const files = await collectFileStats(ctx.db, prefix);
+    const content = await collectContentStats(ctx.db, prefix);
+    const chunks = await collectChunkStats(ctx.db, prefix);
+    const blobs = await collectBlobStats(ctx.db, prefix);
+    const refresh = await collectRefreshStats(ctx.db, prefix);
+
+    return {
+      prefix,
+      db_path: ctx.db.path,
+      db_size_bytes: dbSize,
+      files,
+      content,
+      chunks,
+      blobs,
+      refresh,
+    };
+  },
+});
+
+/** Stat the DuckDB file. Returns 0 if the file isn't on disk yet (in-memory or freshly opened). */
+async function dbFileSize(path: string): Promise<number> {
+  try {
+    const f = Bun.file(path);
+    const exists = await f.exists();
+    return exists ? f.size : 0;
+  } catch {
+    return 0;
+  }
+}
+
+/** Build a `logical_path LIKE ?1` clause + params, or empty when prefix is null. */
+function prefixFilter(prefix: string | null): { clause: string; params: SqlParam[] } {
+  if (!prefix) return { clause: "", params: [] };
+  return { clause: "logical_path LIKE ?1", params: [`${prefix}%`] };
+}
+
+/** Combine an existing WHERE fragment with an optional prefix filter. */
+function and(base: string, extra: string): string {
+  if (!base) return extra;
+  if (!extra) return base;
+  return `${base} AND ${extra}`;
+}
+
+interface FileStats {
+  current: number;
+  tombstoned_paths: number;
+  total_versions: number;
+  distinct_paths: number;
+  by_source_type: Record<string, number>;
+  by_downloader: Record<string, number>;
+  by_mime_type: Record<string, number>;
+}
+
+async function collectFileStats(db: DbConnection, prefix: string | null): Promise<FileStats> {
+  const pf = prefixFilter(prefix);
+  const where = pf.clause ? `WHERE ${pf.clause}` : "";
+
+  const current = await scalar(db, `SELECT COUNT(*) AS n FROM current_files ${where}`, ...pf.params);
+  const totalVersions = await scalar(db, `SELECT COUNT(*) AS n FROM files ${where}`, ...pf.params);
+  const distinctPaths = await scalar(db, `SELECT COUNT(DISTINCT logical_path) AS n FROM files ${where}`, ...pf.params);
+  // Tombstoned path = a logical_path whose latest (max version_id) row is a tombstone.
+  // current_files already excludes those, so we join "latest per path" against files
+  // and count rows where tombstone = TRUE.
+  const tombstonedPaths = await scalar(
+    db,
+    `SELECT COUNT(*) AS n
+     FROM files f
+     JOIN (
+       SELECT logical_path, MAX(version_id) AS v FROM files ${where} GROUP BY logical_path
+     ) m ON f.logical_path = m.logical_path AND f.version_id = m.v
+     WHERE f.tombstone = TRUE`,
+    ...pf.params,
+  );
+
+  const by_source_type = await groupCount(db, "source_type", "current_files", pf);
+  const by_downloader = await groupCount(db, "downloader", "current_files", pf, { skipNull: true });
+  const by_mime_type = await groupCount(db, "mime_type", "current_files", pf, { topN: 10, skipNull: true });
+
+  return {
+    current,
+    tombstoned_paths: tombstonedPaths,
+    total_versions: totalVersions,
+    distinct_paths: distinctPaths,
+    by_source_type,
+    by_downloader,
+    by_mime_type,
+  };
+}
+
+async function collectContentStats(
+  db: DbConnection,
+  prefix: string | null,
+): Promise<{ total_bytes: number; total_versions_bytes: number }> {
+  const pf = prefixFilter(prefix);
+  const where = pf.clause ? `WHERE ${pf.clause}` : "";
+  const total_bytes = await scalar(
+    db,
+    `SELECT COALESCE(SUM(size_bytes), 0) AS n FROM current_files ${where}`,
+    ...pf.params,
+  );
+  const total_versions_bytes = await scalar(
+    db,
+    `SELECT COALESCE(SUM(size_bytes), 0) AS n FROM files ${where}`,
+    ...pf.params,
+  );
+  return { total_bytes, total_versions_bytes };
+}
+
+async function collectChunkStats(db: DbConnection, prefix: string | null): Promise<{ current: number; total: number }> {
+  if (!prefix) {
+    const current = await scalar(db, `SELECT COUNT(*) AS n FROM current_chunks`);
+    const total = await scalar(db, `SELECT COUNT(*) AS n FROM chunks`);
+    return { current, total };
+  }
+  const pf = prefixFilter(prefix);
+  const current = await scalar(db, `SELECT COUNT(*) AS n FROM current_chunks WHERE ${pf.clause}`, ...pf.params);
+  const total = await scalar(db, `SELECT COUNT(*) AS n FROM chunks WHERE ${pf.clause}`, ...pf.params);
+  return { current, total };
+}
+
+async function collectBlobStats(
+  db: DbConnection,
+  prefix: string | null,
+): Promise<{ count: number; total_bytes: number }> {
+  if (!prefix) {
+    const row = await db.queryGet<{ count: number | bigint; total: number | bigint | null }>(
+      `SELECT COUNT(*) AS count, COALESCE(SUM(size_bytes), 0) AS total FROM blobs`,
+    );
+    return { count: Number(row?.count ?? 0), total_bytes: Number(row?.total ?? 0) };
+  }
+  const pf = prefixFilter(prefix);
+  const row = await db.queryGet<{ count: number | bigint; total: number | bigint | null }>(
+    `SELECT COUNT(*) AS count, COALESCE(SUM(size_bytes), 0) AS total
+     FROM blobs
+     WHERE sha256 IN (
+       SELECT blob_sha256 FROM current_files
+       WHERE ${pf.clause} AND blob_sha256 IS NOT NULL
+     )`,
+    ...pf.params,
+  );
+  return { count: Number(row?.count ?? 0), total_bytes: Number(row?.total ?? 0) };
+}
+
+async function collectRefreshStats(
+  db: DbConnection,
+  prefix: string | null,
+): Promise<{ scheduled: number; due_now: number; last_status: Record<string, number> }> {
+  const pf = prefixFilter(prefix);
+  const scheduledWhere = and(pf.clause, "refresh_frequency_sec IS NOT NULL");
+  const scheduled = await scalar(db, `SELECT COUNT(*) AS n FROM current_files WHERE ${scheduledWhere}`, ...pf.params);
+
+  const due = await listDueRefreshes(db);
+  const due_now = prefix ? due.filter((r) => r.logical_path.startsWith(prefix)).length : due.length;
+
+  const statusRows = await db.queryAll<{ k: string | null; n: number | bigint }>(
+    `SELECT last_refresh_status AS k, COUNT(*) AS n
+     FROM current_files
+     WHERE last_refresh_status IS NOT NULL${pf.clause ? ` AND ${pf.clause}` : ""}
+     GROUP BY last_refresh_status
+     ORDER BY n DESC`,
+    ...pf.params,
+  );
+  const last_status: Record<string, number> = {};
+  for (const r of statusRows) {
+    if (r.k !== null) last_status[r.k] = Number(r.n);
+  }
+
+  return { scheduled, due_now, last_status };
+}
+
+/** Run a query whose first row has a single numeric column `n`, returning that number (0 when null). */
+async function scalar(db: DbConnection, sql: string, ...params: SqlParam[]): Promise<number> {
+  const row = await db.queryGet<{ n: number | bigint | null }>(sql, ...params);
+  return Number(row?.n ?? 0);
+}
+
+interface GroupOptions {
+  skipNull?: boolean;
+  topN?: number;
+}
+
+/**
+ * GROUP BY a column on a current_files-shaped table, optionally dropping NULLs
+ * and rolling overflow into an "(other)" bucket when topN is set.
+ */
+async function groupCount(
+  db: DbConnection,
+  column: string,
+  table: string,
+  pf: { clause: string; params: SqlParam[] },
+  opts: GroupOptions = {},
+): Promise<Record<string, number>> {
+  const filters: string[] = [];
+  if (pf.clause) filters.push(pf.clause);
+  if (opts.skipNull) filters.push(`${column} IS NOT NULL`);
+  const where = filters.length ? `WHERE ${filters.join(" AND ")}` : "";
+  const rows = await db.queryAll<{ k: string | null; n: number | bigint }>(
+    `SELECT ${column} AS k, COUNT(*) AS n FROM ${table} ${where} GROUP BY ${column} ORDER BY n DESC`,
+    ...pf.params,
+  );
+  const out: Record<string, number> = {};
+  if (opts.topN && rows.length > opts.topN) {
+    let other = 0;
+    for (let i = 0; i < rows.length; i++) {
+      const r = rows[i]!;
+      const key = r.k ?? "(null)";
+      if (i < opts.topN) out[key] = Number(r.n);
+      else other += Number(r.n);
+    }
+    if (other > 0) out["(other)"] = other;
+    return out;
+  }
+  for (const r of rows) {
+    out[r.k ?? "(null)"] = Number(r.n);
+  }
+  return out;
+}
+
+/** Format a byte count in human units. 1024 boundary, 1-decimal precision past KB. */
+function formatBytes(bytes: number): string {
+  if (bytes < 1024) return `${bytes} B`;
+  const units = ["KB", "MB", "GB", "TB"];
+  let i = -1;
+  let n = bytes;
+  while (n >= 1024 && i < units.length - 1) {
+    n /= 1024;
+    i++;
+  }
+  return `${n.toFixed(n >= 100 ? 0 : 1)} ${units[i]}`;
+}
```
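Two details of the new stats output worth pinning down. First, `formatBytes` above switches precision at 100 units, so for example:

```typescript
// Sample outputs of the formatBytes helper shown in the stats.ts diff above.
formatBytes(512);             // "512 B"  (below the 1024 boundary, no unit math)
formatBytes(1536);            // "1.5 KB" (one decimal for values under 100)
formatBytes(100 * 1024 ** 2); // "100 MB" (values >= 100 drop the decimal)
formatBytes(5 * 1024 ** 4);   // "5.0 TB" (TB is the largest unit in the table)
```

Second, `by_mime_type` is the only breakdown built with `topN: 10`, so an index with many content types reports its ten most common mime types plus an aggregate `(other)` bucket. `by_downloader` and `by_mime_type` both pass `skipNull`, while `by_source_type` keeps NULL keys and renders them as `(null)`.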