pdf-brain 1.3.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -2
- package/package.json +2 -1
- package/scripts/install.sh +1 -1
- package/src/agent/hints.ts +426 -3
- package/src/agent/manifest.ts +24 -4
- package/src/agent/protocol.ts +52 -0
- package/src/chunking.ts +130 -0
- package/src/cli.contract.test.ts +239 -0
- package/src/cli.ts +2573 -840
- package/src/index.ts +259 -6
- package/src/logger.ts +53 -0
- package/src/services/AutoTagger.ts +26 -38
- package/src/services/ClusterSummarizer.ts +3 -3
- package/src/services/Clustering.test.ts +20 -5
- package/src/services/Clustering.ts +48 -11
- package/src/services/Database.ts +27 -0
- package/src/services/EmbeddingProvider.ts +77 -7
- package/src/services/Gateway.ts +8 -7
- package/src/services/LibSQLDatabase.test.ts +139 -0
- package/src/services/LibSQLDatabase.ts +228 -15
- package/src/services/Migration.ts +1 -1
- package/src/services/Ollama.ts +22 -7
- package/src/services/PDFExtractor.test.ts +40 -1
- package/src/services/PDFExtractor.ts +37 -6
- package/src/types.test.ts +22 -0
- package/src/types.ts +82 -2
- package/src/updater.ts +8 -3
package/src/types.test.ts
CHANGED
|
@@ -28,12 +28,16 @@ describe("Unified Search Types", () => {
|
|
|
28
28
|
describe("DocumentSearchResult", () => {
|
|
29
29
|
test("should create valid document search result", () => {
|
|
30
30
|
const result = new DocumentSearchResult({
|
|
31
|
+
chunkId: "doc-123-0",
|
|
31
32
|
docId: "doc-123",
|
|
32
33
|
title: "Test Document",
|
|
33
34
|
page: 1,
|
|
34
35
|
chunkIndex: 0,
|
|
35
36
|
content: "Test content",
|
|
36
37
|
score: 0.95,
|
|
38
|
+
rawScore: 0.95,
|
|
39
|
+
scoreType: "cosine_similarity",
|
|
40
|
+
vectorScore: 0.95,
|
|
37
41
|
matchType: "vector",
|
|
38
42
|
entityType: "document",
|
|
39
43
|
});
|
|
@@ -45,12 +49,16 @@ describe("Unified Search Types", () => {
|
|
|
45
49
|
|
|
46
50
|
test("should support optional expanded content", () => {
|
|
47
51
|
const result = new DocumentSearchResult({
|
|
52
|
+
chunkId: "doc-123-0",
|
|
48
53
|
docId: "doc-123",
|
|
49
54
|
title: "Test Document",
|
|
50
55
|
page: 1,
|
|
51
56
|
chunkIndex: 0,
|
|
52
57
|
content: "Test content",
|
|
53
58
|
score: 0.95,
|
|
59
|
+
rawScore: 0.95,
|
|
60
|
+
scoreType: "cosine_similarity",
|
|
61
|
+
vectorScore: 0.95,
|
|
54
62
|
matchType: "vector",
|
|
55
63
|
entityType: "document",
|
|
56
64
|
expandedContent: "Expanded test content",
|
|
@@ -69,6 +77,8 @@ describe("Unified Search Types", () => {
|
|
|
69
77
|
prefLabel: "Machine Learning",
|
|
70
78
|
definition: "A subset of artificial intelligence...",
|
|
71
79
|
score: 0.88,
|
|
80
|
+
rawScore: 0.88,
|
|
81
|
+
scoreType: "cosine_similarity",
|
|
72
82
|
entityType: "concept",
|
|
73
83
|
});
|
|
74
84
|
|
|
@@ -82,12 +92,16 @@ describe("Unified Search Types", () => {
|
|
|
82
92
|
describe("UnifiedSearchResult", () => {
|
|
83
93
|
test("should accept DocumentSearchResult", () => {
|
|
84
94
|
const docResult: UnifiedSearchResult = new DocumentSearchResult({
|
|
95
|
+
chunkId: "doc-123-0",
|
|
85
96
|
docId: "doc-123",
|
|
86
97
|
title: "Test Document",
|
|
87
98
|
page: 1,
|
|
88
99
|
chunkIndex: 0,
|
|
89
100
|
content: "Test content",
|
|
90
101
|
score: 0.95,
|
|
102
|
+
rawScore: 0.95,
|
|
103
|
+
scoreType: "cosine_similarity",
|
|
104
|
+
vectorScore: 0.95,
|
|
91
105
|
matchType: "vector",
|
|
92
106
|
entityType: "document",
|
|
93
107
|
});
|
|
@@ -101,6 +115,8 @@ describe("Unified Search Types", () => {
|
|
|
101
115
|
prefLabel: "Machine Learning",
|
|
102
116
|
definition: "A subset of artificial intelligence...",
|
|
103
117
|
score: 0.88,
|
|
118
|
+
rawScore: 0.88,
|
|
119
|
+
scoreType: "cosine_similarity",
|
|
104
120
|
entityType: "concept",
|
|
105
121
|
});
|
|
106
122
|
|
|
@@ -110,12 +126,16 @@ describe("Unified Search Types", () => {
|
|
|
110
126
|
test("should discriminate by entityType", () => {
|
|
111
127
|
const results: UnifiedSearchResult[] = [
|
|
112
128
|
new DocumentSearchResult({
|
|
129
|
+
chunkId: "doc-123-0",
|
|
113
130
|
docId: "doc-123",
|
|
114
131
|
title: "Test Document",
|
|
115
132
|
page: 1,
|
|
116
133
|
chunkIndex: 0,
|
|
117
134
|
content: "Test content",
|
|
118
135
|
score: 0.95,
|
|
136
|
+
rawScore: 0.95,
|
|
137
|
+
scoreType: "cosine_similarity",
|
|
138
|
+
vectorScore: 0.95,
|
|
119
139
|
matchType: "vector",
|
|
120
140
|
entityType: "document",
|
|
121
141
|
}),
|
|
@@ -124,6 +144,8 @@ describe("Unified Search Types", () => {
|
|
|
124
144
|
prefLabel: "Machine Learning",
|
|
125
145
|
definition: "A subset of artificial intelligence...",
|
|
126
146
|
score: 0.88,
|
|
147
|
+
rawScore: 0.88,
|
|
148
|
+
scoreType: "cosine_similarity",
|
|
127
149
|
entityType: "concept",
|
|
128
150
|
}),
|
|
129
151
|
];
|
package/src/types.ts
CHANGED
|
@@ -52,12 +52,22 @@ export type EntityType = "document" | "concept";
|
|
|
52
52
|
* @deprecated Use DocumentSearchResult for unified search. Kept for backwards compatibility.
|
|
53
53
|
*/
|
|
54
54
|
export class SearchResult extends Schema.Class<SearchResult>("SearchResult")({
|
|
55
|
+
chunkId: Schema.String,
|
|
55
56
|
docId: Schema.String,
|
|
56
57
|
title: Schema.String,
|
|
57
58
|
page: Schema.Number,
|
|
58
59
|
chunkIndex: Schema.Number,
|
|
59
60
|
content: Schema.String,
|
|
61
|
+
/** Normalized score in 0..1 for ranking across match types */
|
|
60
62
|
score: Schema.Number,
|
|
63
|
+
/** Raw score from the underlying engine (e.g. cosine similarity, FTS rank) */
|
|
64
|
+
rawScore: Schema.Number,
|
|
65
|
+
/** What rawScore represents (do not assume one score meaning across engines) */
|
|
66
|
+
scoreType: Schema.Literal("cosine_similarity", "fts_rank", "hybrid"),
|
|
67
|
+
/** Optional component score for vector results */
|
|
68
|
+
vectorScore: Schema.optional(Schema.Number),
|
|
69
|
+
/** Optional component score for FTS results (raw FTS rank; often negative, more negative = better) */
|
|
70
|
+
ftsRank: Schema.optional(Schema.Number),
|
|
61
71
|
matchType: Schema.Literal("vector", "fts", "hybrid"),
|
|
62
72
|
/** Expanded context around the match (only populated when expandChars > 0) */
|
|
63
73
|
expandedContent: Schema.optional(Schema.String),
|
|
@@ -65,7 +75,47 @@ export class SearchResult extends Schema.Class<SearchResult>("SearchResult")({
|
|
|
65
75
|
expandedRange: Schema.optional(
|
|
66
76
|
Schema.Struct({ start: Schema.Number, end: Schema.Number })
|
|
67
77
|
),
|
|
68
|
-
}) {
|
|
78
|
+
}) {
|
|
79
|
+
/**
|
|
80
|
+
* Backwards-compatible constructor:
|
|
81
|
+
* Older callers used `SearchResult` without chunkId/rawScore/scoreType.
|
|
82
|
+
*
|
|
83
|
+
* This type is deprecated in favor of `DocumentSearchResult`, but we keep
|
|
84
|
+
* legacy input working so downstream code doesn't explode.
|
|
85
|
+
*/
|
|
86
|
+
constructor(props: any) {
|
|
87
|
+
const docId = props?.docId;
|
|
88
|
+
const page = props?.page;
|
|
89
|
+
const chunkIndex = props?.chunkIndex;
|
|
90
|
+
|
|
91
|
+
const matchType: "vector" | "fts" | "hybrid" = props?.matchType;
|
|
92
|
+
const score: number = props?.score;
|
|
93
|
+
|
|
94
|
+
const chunkId =
|
|
95
|
+
props?.chunkId ?? `legacy:${String(docId)}:${String(page)}:${String(chunkIndex)}`;
|
|
96
|
+
|
|
97
|
+
const rawScore = props?.rawScore ?? score;
|
|
98
|
+
|
|
99
|
+
const scoreType =
|
|
100
|
+
props?.scoreType ??
|
|
101
|
+
(matchType === "fts"
|
|
102
|
+
? "fts_rank"
|
|
103
|
+
: matchType === "hybrid"
|
|
104
|
+
? "hybrid"
|
|
105
|
+
: "cosine_similarity");
|
|
106
|
+
|
|
107
|
+
const vectorScore =
|
|
108
|
+
props?.vectorScore ?? (matchType === "vector" ? score : undefined);
|
|
109
|
+
|
|
110
|
+
super({
|
|
111
|
+
...props,
|
|
112
|
+
chunkId,
|
|
113
|
+
rawScore,
|
|
114
|
+
scoreType,
|
|
115
|
+
vectorScore,
|
|
116
|
+
});
|
|
117
|
+
}
|
|
118
|
+
}
|
|
69
119
|
|
|
70
120
|
/**
|
|
71
121
|
* Document search result with entity type discriminator
|
|
@@ -73,12 +123,22 @@ export class SearchResult extends Schema.Class<SearchResult>("SearchResult")({
|
|
|
73
123
|
export class DocumentSearchResult extends Schema.Class<DocumentSearchResult>(
|
|
74
124
|
"DocumentSearchResult"
|
|
75
125
|
)({
|
|
126
|
+
chunkId: Schema.String,
|
|
76
127
|
docId: Schema.String,
|
|
77
128
|
title: Schema.String,
|
|
78
129
|
page: Schema.Number,
|
|
79
130
|
chunkIndex: Schema.Number,
|
|
80
131
|
content: Schema.String,
|
|
132
|
+
/** Normalized score in 0..1 for ranking across match types */
|
|
81
133
|
score: Schema.Number,
|
|
134
|
+
/** Raw score from the underlying engine (e.g. cosine similarity, FTS rank) */
|
|
135
|
+
rawScore: Schema.Number,
|
|
136
|
+
/** What rawScore represents (do not assume one score meaning across engines) */
|
|
137
|
+
scoreType: Schema.Literal("cosine_similarity", "fts_rank", "hybrid"),
|
|
138
|
+
/** Optional component score for vector results */
|
|
139
|
+
vectorScore: Schema.optional(Schema.Number),
|
|
140
|
+
/** Optional component score for FTS results (raw FTS rank; often negative, more negative = better) */
|
|
141
|
+
ftsRank: Schema.optional(Schema.Number),
|
|
82
142
|
matchType: Schema.Literal("vector", "fts", "hybrid"),
|
|
83
143
|
entityType: Schema.Literal("document"),
|
|
84
144
|
/** Expanded context around the match (only populated when expandChars > 0) */
|
|
@@ -98,7 +158,11 @@ export class ConceptSearchResult extends Schema.Class<ConceptSearchResult>(
|
|
|
98
158
|
conceptId: Schema.String,
|
|
99
159
|
prefLabel: Schema.String,
|
|
100
160
|
definition: Schema.String,
|
|
161
|
+
/** Normalized score in 0..1 */
|
|
101
162
|
score: Schema.Number,
|
|
163
|
+
/** Raw score from the underlying engine (cosine similarity) */
|
|
164
|
+
rawScore: Schema.Number,
|
|
165
|
+
scoreType: Schema.Literal("cosine_similarity"),
|
|
102
166
|
entityType: Schema.Literal("concept"),
|
|
103
167
|
}) {}
|
|
104
168
|
|
|
@@ -166,6 +230,9 @@ export class Config extends Schema.Class<Config>("Config")({
|
|
|
166
230
|
host: Schema.String,
|
|
167
231
|
autoInstall: Schema.Boolean,
|
|
168
232
|
}),
|
|
233
|
+
gateway: Schema.optionalWith(Schema.Struct({
|
|
234
|
+
apiKey: Schema.optional(Schema.String),
|
|
235
|
+
}), { default: () => ({}) }),
|
|
169
236
|
}) {
|
|
170
237
|
/**
|
|
171
238
|
* Default configuration: Ollama for all providers
|
|
@@ -187,7 +254,15 @@ export class Config extends Schema.Class<Config>("Config")({
|
|
|
187
254
|
host: "http://localhost:11434",
|
|
188
255
|
autoInstall: true,
|
|
189
256
|
},
|
|
257
|
+
gateway: {},
|
|
190
258
|
});
|
|
259
|
+
|
|
260
|
+
/**
|
|
261
|
+
* Resolve the gateway API key: config takes precedence over env var.
|
|
262
|
+
*/
|
|
263
|
+
get gatewayApiKey(): string | undefined {
|
|
264
|
+
return this.gateway.apiKey ?? process.env.AI_GATEWAY_API_KEY;
|
|
265
|
+
}
|
|
191
266
|
}
|
|
192
267
|
|
|
193
268
|
// ============================================================================
|
|
@@ -224,7 +299,7 @@ export function loadConfig(): Config {
|
|
|
224
299
|
|
|
225
300
|
/**
|
|
226
301
|
* Save config to $PDF_LIBRARY_PATH/config.json.
|
|
227
|
-
* API keys
|
|
302
|
+
* API keys can be stored in config or read from env var AI_GATEWAY_API_KEY.
|
|
228
303
|
*/
|
|
229
304
|
export function saveConfig(config: Config): void {
|
|
230
305
|
const libraryPath =
|
|
@@ -268,6 +343,11 @@ export class AddOptions extends Schema.Class<AddOptions>("AddOptions")({
|
|
|
268
343
|
metadata: Schema.optional(
|
|
269
344
|
Schema.Record({ key: Schema.String, value: Schema.Unknown })
|
|
270
345
|
),
|
|
346
|
+
/**
|
|
347
|
+
* Internal/advanced: preserve original `addedAt` on re-add/rechunk workflows.
|
|
348
|
+
* CLI does not expose this directly.
|
|
349
|
+
*/
|
|
350
|
+
addedAt: Schema.optional(Schema.Date),
|
|
271
351
|
}) {}
|
|
272
352
|
|
|
273
353
|
// ============================================================================
|
package/src/updater.ts
CHANGED
|
@@ -9,8 +9,9 @@
|
|
|
9
9
|
|
|
10
10
|
import { existsSync, mkdirSync, readFileSync, writeFileSync, renameSync, unlinkSync, chmodSync } from "fs";
|
|
11
11
|
import { join } from "path";
|
|
12
|
+
import { logInfo } from "./logger.js";
|
|
12
13
|
|
|
13
|
-
const REPO = "joelhooks/pdf-
|
|
14
|
+
const REPO = "joelhooks/pdf-brain";
|
|
14
15
|
const STATE_DIR = join(process.env.HOME || "~", ".pdf-brain");
|
|
15
16
|
const STATE_FILE = join(STATE_DIR, "update-check.json");
|
|
16
17
|
const CHECK_INTERVAL_MS = 24 * 60 * 60 * 1000; // 1 day
|
|
@@ -112,6 +113,10 @@ async function downloadAndReplace(version: string): Promise<boolean> {
|
|
|
112
113
|
* Current invocation keeps running the old code — new version takes effect next run.
|
|
113
114
|
*/
|
|
114
115
|
export function backgroundUpdateCheck(currentVersion: string): void {
|
|
116
|
+
// Agent-first default: disable background updates unless explicitly enabled.
|
|
117
|
+
// Background network calls + non-deterministic stderr output are a footgun for tool callers.
|
|
118
|
+
if (process.env.PDF_BRAIN_BACKGROUND_UPDATE !== "1") return;
|
|
119
|
+
|
|
115
120
|
// Don't auto-update in dev mode
|
|
116
121
|
if (currentVersion.includes("compiled") || currentVersion === "0.0.0") return;
|
|
117
122
|
|
|
@@ -135,8 +140,8 @@ export function backgroundUpdateCheck(currentVersion: string): void {
|
|
|
135
140
|
if (ok) {
|
|
136
141
|
newState.lastAutoUpdate = now;
|
|
137
142
|
newState.latestVersion = latest;
|
|
138
|
-
// Brief note so they know why behavior might change
|
|
139
|
-
|
|
143
|
+
// Brief note so they know why behavior might change (stderr only).
|
|
144
|
+
logInfo(`Updated pdf-brain v${currentVersion} -> v${latest}`);
|
|
140
145
|
}
|
|
141
146
|
}
|
|
142
147
|
|