@neuralsea/workspace-indexer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +356 -0
- package/dist/chunk-QPQCSCBN.js +2374 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +91 -0
- package/dist/index.d.ts +505 -0
- package/dist/index.js +22 -0
- package/package.json +45 -0
package/dist/cli.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
HashEmbeddingsProvider,
|
|
4
|
+
OllamaEmbeddingsProvider,
|
|
5
|
+
OpenAIEmbeddingsProvider,
|
|
6
|
+
WorkspaceIndexer,
|
|
7
|
+
loadConfigFile
|
|
8
|
+
} from "./chunk-QPQCSCBN.js";
|
|
9
|
+
|
|
10
|
+
// src/cli.ts
|
|
11
|
+
import yargs from "yargs";
|
|
12
|
+
import { hideBin } from "yargs/helpers";
|
|
13
|
+
import fs from "fs";
|
|
14
|
+
/**
 * Pick the embeddings provider requested on the command line.
 *
 * - `"ollama"`: OllamaEmbeddingsProvider built from `argv.model` / `argv.baseUrl`.
 * - `"openai"`: OpenAIEmbeddingsProvider; the key comes from `argv.apiKey`, falling
 *   back to the `OPENAI_API_KEY` environment variable.
 * - anything else: HashEmbeddingsProvider built from `argv.dim`.
 *
 * @param {object} argv - Parsed CLI arguments (`provider`, `model`, `baseUrl`, `apiKey`, `dim`).
 * @returns {object} The constructed embeddings provider instance.
 * @throws {Error} When the OpenAI provider is selected and no API key is available.
 */
function makeEmbedder(argv) {
  switch (argv.provider) {
    case "ollama":
      return new OllamaEmbeddingsProvider({ model: argv.model, baseUrl: argv.baseUrl });

    case "openai": {
      // An empty --apiKey is treated the same as an absent one and falls back to the env var.
      const key = argv.apiKey || process.env.OPENAI_API_KEY;
      if (key) {
        return new OpenAIEmbeddingsProvider({ apiKey: key, model: argv.model, baseUrl: argv.baseUrl });
      }
      throw new Error("Missing --apiKey (or OPENAI_API_KEY) for OpenAI");
    }

    default:
      return new HashEmbeddingsProvider(argv.dim);
  }
}
|
|
26
|
+
/**
 * Resolve the indexer configuration from the `--config` CLI option.
 *
 * @param {object} argv - Parsed CLI arguments; only `argv.config` is read.
 * @returns {object} An empty object when no config path was given, otherwise
 *   whatever `loadConfigFile` returns for that path.
 * @throws {Error} When a config path was given but no file exists there.
 */
function loadConfig(argv) {
  const { config: configPath } = argv;
  if (configPath) {
    if (fs.existsSync(configPath)) {
      return loadConfigFile(configPath);
    }
    throw new Error(`Config file not found: ${configPath}`);
  }
  return {};
}
|
|
32
|
+
/**
 * Construct a WorkspaceIndexer for the workspace named by the parsed CLI arguments.
 *
 * The config file is loaded before the embedder is created, matching the order
 * every command used previously.
 *
 * @param {object} argv - Parsed yargs arguments (`root`, `config`, `provider`, ...).
 * @returns {WorkspaceIndexer} A new workspace indexer; nothing has been indexed yet.
 */
function createIndexer(argv) {
  const cfg = loadConfig(argv);
  const embedder = makeEmbedder(argv);
  return new WorkspaceIndexer(argv.root, embedder, cfg);
}

/**
 * Run a one-shot command against an indexed workspace and always release it.
 *
 * The indexer is closed in a `finally` block so that a failure while indexing
 * or while running `task` no longer leaves it open.
 *
 * @param {object} argv - Parsed yargs arguments.
 * @param {(ix: WorkspaceIndexer) => Promise<unknown>} [task] - Work to perform after
 *   `indexAll()` has completed; defaults to doing nothing.
 * @returns {Promise<unknown>} Whatever `task` resolves to.
 */
async function withIndexer(argv, task = async () => {}) {
  const ix = createIndexer(argv);
  try {
    await ix.indexAll();
    return await task(ix);
  } finally {
    await ix.closeAsync();
  }
}

// CLI entry point: global embedding options followed by the four sub-commands
// (index, watch, query, retrieve).
await yargs(hideBin(process.argv))
  .scriptName("petri-index")
  .option("config", { type: "string", describe: "Path to a JSON config file" })
  .option("provider", { type: "string", default: "ollama", choices: ["ollama", "openai", "hash"] })
  .option("model", { type: "string", default: "nomic-embed-text" })
  .option("baseUrl", { type: "string", describe: "Base URL for Ollama/OpenAI compatible endpoints" })
  .option("apiKey", { type: "string", describe: "API key for OpenAI (or set OPENAI_API_KEY)" })
  .option("dim", { type: "number", default: 384, describe: "Dimension for hash embedder" })
  .command(
    "index [root]",
    "Index all Git repos under a workspace root",
    (y) => y.positional("root", { type: "string", default: process.cwd() }),
    async (argv) => {
      await withIndexer(argv);
      // Printed only after the indexer has been closed, as before.
      console.log("Index complete.");
    }
  )
  .command(
    "watch [root]",
    "Watch and keep indexes up to date",
    (y) => y.positional("root", { type: "string", default: process.cwd() }),
    async (argv) => {
      const ix = createIndexer(argv);
      try {
        await ix.indexAll();
        await ix.watch();
      } catch (err) {
        // Setup failed: release the indexer before surfacing the error.
        // On success it is intentionally left open so watching can continue.
        await ix.closeAsync();
        throw err;
      }
      console.log("Watching. Press Ctrl+C to exit.");
    }
  )
  .command(
    "query <q> [root]",
    "Quick semantic search (profile: search) and print top hits",
    (y) =>
      y
        .positional("q", { type: "string", demandOption: true })
        .positional("root", { type: "string", default: process.cwd() })
        .option("k", { type: "number", default: 10 }),
    async (argv) => {
      await withIndexer(argv, async (ix) => {
        const bundle = await ix.retrieve(argv.q, { profile: "search", profileOverrides: { k: argv.k } });
        for (const h of bundle.hits) {
          console.log(
            `\n[${h.score.toFixed(4)}] ${h.chunk.repoRoot} :: ${h.chunk.path}:${h.chunk.startLine}-${h.chunk.endLine} (${h.chunk.kind})`
          );
          console.log(` ${h.chunk.preview}`);
        }
      });
    }
  )
  .command(
    "retrieve <q> [root]",
    "Run full retrieval with a chosen profile and output JSON context bundle",
    (y) =>
      y
        .positional("q", { type: "string", demandOption: true })
        .positional("root", { type: "string", default: process.cwd() })
        .option("profile", { type: "string", default: "search", choices: ["search", "refactor", "review", "architecture", "rca", "custom"] })
        .option("k", { type: "number", describe: "Override k in the profile" })
        .option("changedOnly", { type: "boolean", default: false, describe: "Restrict to files changed compared to baseRef" })
        .option("baseRef", { type: "string", default: "HEAD~1", describe: "Git ref to diff against when changedOnly is set" }),
    async (argv) => {
      await withIndexer(argv, async (ix) => {
        const profile = argv.profile;
        const bundle = await ix.retrieve(argv.q, {
          profile,
          // A missing (or zero) --k leaves the profile's own k untouched.
          profileOverrides: argv.k ? { k: argv.k } : void 0,
          scope: argv.changedOnly ? { changedOnly: true, baseRef: argv.baseRef } : void 0
        });
        process.stdout.write(JSON.stringify(bundle, null, 2) + "\n");
      });
    }
  )
  .demandCommand()
  .strict()
  .help()
  .parseAsync();
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
/** Identifier of a repository; a plain string alias (used as the type of `ChunkRecord.repoId`). */
type RepoId = string;
|
|
2
|
+
/**
 * Names of the available retrieval profiles.
 * The CLI's `retrieve --profile` option accepts exactly this set of values.
 */
type RetrievalProfileName = "search" | "refactor" | "review" | "architecture" | "rca" | "custom";
|
|
3
|
+
interface RetrievalWeights {
|
|
4
|
+
/** Weight for semantic vector similarity (cosine). */
|
|
5
|
+
vector: number;
|
|
6
|
+
/** Weight for lexical/BM25-style match (SQLite FTS). */
|
|
7
|
+
lexical: number;
|
|
8
|
+
/** Weight for file recency (mtime). */
|
|
9
|
+
recency: number;
|
|
10
|
+
}
|
|
11
|
+
interface RetrievalProfile {
|
|
12
|
+
name: RetrievalProfileName;
|
|
13
|
+
/** Number of primary hits to return (before expansion). */
|
|
14
|
+
k: number;
|
|
15
|
+
/** Hybrid scoring weights. */
|
|
16
|
+
weights: RetrievalWeights;
|
|
17
|
+
/** Context expansion knobs. */
|
|
18
|
+
expand: RelatedContextOptions & {
|
|
19
|
+
/**
|
|
20
|
+
* If true, include a lightweight file-level synopsis chunk (if available),
|
|
21
|
+
* useful for architecture-style queries.
|
|
22
|
+
*/
|
|
23
|
+
includeFileSynopsis?: boolean;
|
|
24
|
+
};
|
|
25
|
+
/** Candidate selection knobs. */
|
|
26
|
+
candidates?: {
|
|
27
|
+
/** How many candidates to request from vector search before merging. */
|
|
28
|
+
vectorK?: number;
|
|
29
|
+
/** How many candidates to request from lexical search before merging. */
|
|
30
|
+
lexicalK?: number;
|
|
31
|
+
/** Optional hard cap on total candidates before reranking/merging. */
|
|
32
|
+
maxMergedCandidates?: number;
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
/** The default profiles can be overridden or extended by the consumer. */
|
|
36
|
+
interface ProfilesConfig {
|
|
37
|
+
profiles?: Partial<Record<RetrievalProfileName, Partial<RetrievalProfile>>>;
|
|
38
|
+
}
|
|
39
|
+
/** Similarity metric for vector search. */
|
|
40
|
+
type VectorMetric = "cosine" | "ip" | "l2";
|
|
41
|
+
/** Built-in vector backends. */
|
|
42
|
+
type VectorProviderKind = "auto" | "bruteforce" | "hnswlib" | "qdrant" | "faiss" | "custom";
|
|
43
|
+
interface HnswlibVectorConfig {
|
|
44
|
+
/** Max elements for the HNSW index (will auto-resize if needed). */
|
|
45
|
+
maxElements?: number;
|
|
46
|
+
/** HNSW M parameter (graph connectivity). */
|
|
47
|
+
m?: number;
|
|
48
|
+
/** HNSW efConstruction parameter. */
|
|
49
|
+
efConstruction?: number;
|
|
50
|
+
/** HNSW efSearch parameter (quality/speed at query time). */
|
|
51
|
+
efSearch?: number;
|
|
52
|
+
/** Persist the ANN index to disk under cacheDir (default true). */
|
|
53
|
+
persist?: boolean;
|
|
54
|
+
/** Debounce in ms before writing a dirty index to disk (default 2000). */
|
|
55
|
+
persistDebounceMs?: number;
|
|
56
|
+
}
|
|
57
|
+
interface QdrantVectorConfig {
|
|
58
|
+
/** Prefer `url` if set, otherwise `host`+`port`. */
|
|
59
|
+
url?: string;
|
|
60
|
+
host?: string;
|
|
61
|
+
port?: number;
|
|
62
|
+
/** Optional API key (for secured Qdrant instances). */
|
|
63
|
+
apiKey?: string;
|
|
64
|
+
/** Collection naming prefix (default "petri"). */
|
|
65
|
+
collectionPrefix?: string;
|
|
66
|
+
/**
|
|
67
|
+
* "commit" (default): a separate collection per repo+commit.
|
|
68
|
+
* "repo": a single collection per repo; commit is stored in payload.
|
|
69
|
+
*/
|
|
70
|
+
collectionMode?: "commit" | "repo";
|
|
71
|
+
/** If true, drop+recreate the collection on rebuild (default true). */
|
|
72
|
+
recreateOnRebuild?: boolean;
|
|
73
|
+
}
|
|
74
|
+
interface FaissVectorConfig {
|
|
75
|
+
/** Factory descriptor (default "HNSW,Flat"). */
|
|
76
|
+
descriptor?: string;
|
|
77
|
+
/** Persist the FAISS index to disk under cacheDir (default true). */
|
|
78
|
+
persist?: boolean;
|
|
79
|
+
/** Debounce in ms before writing a dirty index to disk (default 2000). */
|
|
80
|
+
persistDebounceMs?: number;
|
|
81
|
+
/** Rebuild strategy: "lazy" (default) rebuild on demand; "eager" rebuild on each write. */
|
|
82
|
+
rebuildStrategy?: "lazy" | "eager";
|
|
83
|
+
}
|
|
84
|
+
interface CustomVectorConfig {
|
|
85
|
+
/** ESM module path to import (relative to process.cwd() or absolute). */
|
|
86
|
+
module: string;
|
|
87
|
+
/** Named export to use (default: "default"). */
|
|
88
|
+
export?: string;
|
|
89
|
+
/** Arbitrary JSON options passed to the provider factory (optional). */
|
|
90
|
+
options?: Record<string, unknown>;
|
|
91
|
+
}
|
|
92
|
+
interface VectorConfig {
|
|
93
|
+
/** Which vector backend to use. Default: "bruteforce". */
|
|
94
|
+
provider?: VectorProviderKind;
|
|
95
|
+
/** Vector metric. Default: "cosine". */
|
|
96
|
+
metric?: VectorMetric;
|
|
97
|
+
/** Provider-specific options. */
|
|
98
|
+
hnswlib?: HnswlibVectorConfig;
|
|
99
|
+
qdrant?: QdrantVectorConfig;
|
|
100
|
+
faiss?: FaissVectorConfig;
|
|
101
|
+
custom?: CustomVectorConfig;
|
|
102
|
+
}
|
|
103
|
+
interface IndexerConfig extends ProfilesConfig {
|
|
104
|
+
cacheDir?: string;
|
|
105
|
+
/** Vector index backend (ANN) configuration. */
|
|
106
|
+
vector?: VectorConfig;
|
|
107
|
+
/** Extensions to index (lowercase, include the dot). */
|
|
108
|
+
includeExtensions?: string[];
|
|
109
|
+
/** Safety limit to avoid indexing huge binaries by accident. */
|
|
110
|
+
maxFileBytes?: number;
|
|
111
|
+
/** Chunking controls. */
|
|
112
|
+
chunk?: {
|
|
113
|
+
maxChars?: number;
|
|
114
|
+
maxLines?: number;
|
|
115
|
+
overlapLines?: number;
|
|
116
|
+
};
|
|
117
|
+
/** Embedding batch controls. */
|
|
118
|
+
embed?: {
|
|
119
|
+
batchSize?: number;
|
|
120
|
+
concurrency?: number;
|
|
121
|
+
};
|
|
122
|
+
/** Watcher controls. */
|
|
123
|
+
watch?: {
|
|
124
|
+
debounceMs?: number;
|
|
125
|
+
};
|
|
126
|
+
/** Extra ignore files (in addition to git’s excludes). */
|
|
127
|
+
ignoreFiles?: string[];
|
|
128
|
+
/**
|
|
129
|
+
* Secret hygiene:
|
|
130
|
+
* - skip obvious secret-y files by path substring
|
|
131
|
+
* - redact patterns prior to storage + embedding
|
|
132
|
+
*/
|
|
133
|
+
redact?: {
|
|
134
|
+
enabled?: boolean;
|
|
135
|
+
/** If path includes any of these substrings, file is skipped. */
|
|
136
|
+
skipPathSubstrings?: string[];
|
|
137
|
+
/** Simple redactions applied before embedding/storage (keeps structure). */
|
|
138
|
+
patterns?: Array<{
|
|
139
|
+
name: string;
|
|
140
|
+
regex: RegExp;
|
|
141
|
+
replaceWith: string;
|
|
142
|
+
}>;
|
|
143
|
+
};
|
|
144
|
+
/** Storage / index tuning. */
|
|
145
|
+
storage?: {
|
|
146
|
+
/** If true, keep full chunk text in SQLite (default true). */
|
|
147
|
+
storeText?: boolean;
|
|
148
|
+
/**
|
|
149
|
+
* Lexical index mode:
|
|
150
|
+
* - "full": store the (redacted) chunk text in FTS (best retrieval, more storage)
|
|
151
|
+
* - "tokens": store only extracted identifiers/tokens (less sensitive, still good for code search)
|
|
152
|
+
* - "off": disable FTS indexing entirely (vector-only retrieval)
|
|
153
|
+
*/
|
|
154
|
+
ftsMode?: "full" | "tokens" | "off";
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
interface ChunkRecord {
|
|
158
|
+
id: string;
|
|
159
|
+
repoId: RepoId;
|
|
160
|
+
repoRoot: string;
|
|
161
|
+
path: string;
|
|
162
|
+
language: string;
|
|
163
|
+
startLine: number;
|
|
164
|
+
endLine: number;
|
|
165
|
+
contentHash: string;
|
|
166
|
+
text: string;
|
|
167
|
+
tokens: number;
|
|
168
|
+
fileMtimeMs: number;
|
|
169
|
+
kind?: "chunk" | "synopsis";
|
|
170
|
+
}
|
|
171
|
+
interface SearchOptions {
|
|
172
|
+
k?: number;
|
|
173
|
+
repoRoots?: string[];
|
|
174
|
+
language?: string;
|
|
175
|
+
pathPrefix?: string;
|
|
176
|
+
}
|
|
177
|
+
interface RetrievalScope {
|
|
178
|
+
/** Restrict results to files that are changed compared to baseRef. */
|
|
179
|
+
changedOnly?: boolean;
|
|
180
|
+
/** e.g. "origin/main", "main", "HEAD~1". Only used if changedOnly is true. */
|
|
181
|
+
baseRef?: string;
|
|
182
|
+
/** Restrict to these file paths (POSIX rel paths within repo). */
|
|
183
|
+
includePaths?: string[];
|
|
184
|
+
}
|
|
185
|
+
interface RetrieveOptions {
|
|
186
|
+
/** Choose a profile; can be overridden with explicit profile settings. */
|
|
187
|
+
profile?: RetrievalProfileName;
|
|
188
|
+
/** Optional ad-hoc overrides, merged into the chosen profile. */
|
|
189
|
+
profileOverrides?: Partial<RetrievalProfile>;
|
|
190
|
+
/** Optional scope restrictions. */
|
|
191
|
+
scope?: RetrievalScope;
|
|
192
|
+
/** Filters (applied after retrieval). */
|
|
193
|
+
filters?: {
|
|
194
|
+
repoRoots?: string[];
|
|
195
|
+
language?: string;
|
|
196
|
+
pathPrefix?: string;
|
|
197
|
+
};
|
|
198
|
+
}
|
|
199
|
+
interface SearchHit {
|
|
200
|
+
score: number;
|
|
201
|
+
scoreBreakdown?: {
|
|
202
|
+
vector?: number;
|
|
203
|
+
lexical?: number;
|
|
204
|
+
recency?: number;
|
|
205
|
+
};
|
|
206
|
+
chunk: Omit<ChunkRecord, "text"> & {
|
|
207
|
+
preview: string;
|
|
208
|
+
};
|
|
209
|
+
}
|
|
210
|
+
/**
 * Options controlling how much surrounding context is gathered for a hit.
 * Forms the base of `RetrievalProfile.expand`.
 *
 * - `adjacentChunks`: presumably how many neighbouring chunks of the same file to
 *   include on each side — TODO confirm against the implementation.
 * - `followImports`: presumably how many imports (or import hops) to follow — TODO
 *   confirm; the unit is not visible from this declaration.
 */
interface RelatedContextOptions {
|
|
211
|
+
adjacentChunks?: number;
|
|
212
|
+
followImports?: number;
|
|
213
|
+
}
|
|
214
|
+
interface ContextBundle {
|
|
215
|
+
/** Ordered from most to least relevant. */
|
|
216
|
+
hits: SearchHit[];
|
|
217
|
+
/** Expanded supporting context blocks (deduplicated). */
|
|
218
|
+
context: Array<{
|
|
219
|
+
repoRoot: string;
|
|
220
|
+
path: string;
|
|
221
|
+
startLine: number;
|
|
222
|
+
endLine: number;
|
|
223
|
+
text: string;
|
|
224
|
+
reason: string;
|
|
225
|
+
}>;
|
|
226
|
+
/** Useful metadata for your agent’s planner/logs. */
|
|
227
|
+
stats: {
|
|
228
|
+
profile: RetrievalProfileName;
|
|
229
|
+
reposSearched: number;
|
|
230
|
+
candidates: {
|
|
231
|
+
vector: number;
|
|
232
|
+
lexical: number;
|
|
233
|
+
merged: number;
|
|
234
|
+
returned: number;
|
|
235
|
+
};
|
|
236
|
+
};
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
interface EmbeddingsProvider {
|
|
240
|
+
/** Unique identifier including model, used for caching. */
|
|
241
|
+
readonly id: string;
|
|
242
|
+
/** Dimension may be unknown until first call. */
|
|
243
|
+
readonly dimension: number | null;
|
|
244
|
+
embed(texts: string[]): Promise<Float32Array[]>;
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
declare class OpenAIEmbeddingsProvider implements EmbeddingsProvider {
|
|
248
|
+
private readonly opts;
|
|
249
|
+
readonly id: string;
|
|
250
|
+
dimension: number | null;
|
|
251
|
+
constructor(opts: {
|
|
252
|
+
apiKey: string;
|
|
253
|
+
model: string;
|
|
254
|
+
baseUrl?: string;
|
|
255
|
+
});
|
|
256
|
+
embed(texts: string[]): Promise<Float32Array[]>;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
declare class OllamaEmbeddingsProvider implements EmbeddingsProvider {
|
|
260
|
+
readonly id: string;
|
|
261
|
+
dimension: number | null;
|
|
262
|
+
private readonly baseUrl;
|
|
263
|
+
private readonly model;
|
|
264
|
+
private readonly concurrency;
|
|
265
|
+
constructor(opts: {
|
|
266
|
+
model: string;
|
|
267
|
+
baseUrl?: string;
|
|
268
|
+
concurrency?: number;
|
|
269
|
+
});
|
|
270
|
+
private tryBatchEndpoint;
|
|
271
|
+
private embedOne;
|
|
272
|
+
embed(texts: string[]): Promise<Float32Array[]>;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
/**
|
|
276
|
+
* Offline deterministic embedding. Not truly semantic, but:
|
|
277
|
+
* - fully offline
|
|
278
|
+
* - deterministic
|
|
279
|
+
* - useful for wiring everything up + tests
|
|
280
|
+
*/
|
|
281
|
+
declare class HashEmbeddingsProvider implements EmbeddingsProvider {
|
|
282
|
+
readonly id: string;
|
|
283
|
+
readonly dimension: number;
|
|
284
|
+
constructor(dimension?: number);
|
|
285
|
+
embed(texts: string[]): Promise<Float32Array[]>;
|
|
286
|
+
private embedOne;
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
/** Default retrieval profiles, with one complete `RetrievalProfile` for every `RetrievalProfileName`. */
declare const DEFAULT_PROFILES: Record<RetrievalProfileName, RetrievalProfile>;
|
|
290
|
+
/**
 * Produce a complete retrieval profile from `base` with the optional `patch` applied.
 *
 * NOTE(review): the name suggests nested objects (`weights`, `expand`, `candidates`)
 * are merged field-by-field rather than replaced wholesale — confirm against the
 * implementation, which is not part of this declaration file.
 *
 * @param base - The profile to start from.
 * @param patch - Optional partial profile to apply on top of `base`.
 * @returns The resulting profile.
 */
declare function deepMergeProfile(base: RetrievalProfile, patch?: Partial<RetrievalProfile>): RetrievalProfile;
|
|
291
|
+
|
|
292
|
+
interface VectorPoint {
|
|
293
|
+
id: string;
|
|
294
|
+
vector: Float32Array;
|
|
295
|
+
/** Optional payload for vector databases that support it (e.g. Qdrant). */
|
|
296
|
+
payload?: Record<string, unknown>;
|
|
297
|
+
}
|
|
298
|
+
interface VectorSearchHit {
|
|
299
|
+
id: string;
|
|
300
|
+
/** Similarity score (higher is better). For cosine/IP this is usually in [-1, 1]. */
|
|
301
|
+
score: number;
|
|
302
|
+
}
|
|
303
|
+
interface VectorIndexInit {
|
|
304
|
+
repoId: string;
|
|
305
|
+
repoRoot: string;
|
|
306
|
+
commit: string;
|
|
307
|
+
branch: string;
|
|
308
|
+
cacheDir: string;
|
|
309
|
+
dimension: number;
|
|
310
|
+
metric: VectorMetric;
|
|
311
|
+
}
|
|
312
|
+
/**
|
|
313
|
+
* A pluggable dense-vector index.
|
|
314
|
+
*
|
|
315
|
+
* All implementations MUST return a score where larger means more similar.
|
|
316
|
+
* This package primarily expects cosine similarity.
|
|
317
|
+
*/
|
|
318
|
+
interface VectorIndex {
|
|
319
|
+
readonly kind: string;
|
|
320
|
+
readonly metric: VectorMetric;
|
|
321
|
+
readonly dimension: number;
|
|
322
|
+
init(init: VectorIndexInit): Promise<void>;
|
|
323
|
+
/** Full rebuild from a set of points. */
|
|
324
|
+
rebuild(points: VectorPoint[]): Promise<void>;
|
|
325
|
+
/** Insert or update points (incremental). */
|
|
326
|
+
upsert(points: VectorPoint[]): Promise<void>;
|
|
327
|
+
/** Remove points by id. */
|
|
328
|
+
remove(ids: string[]): Promise<void>;
|
|
329
|
+
/** Search for the k nearest neighbours. */
|
|
330
|
+
search(query: Float32Array, k: number): Promise<VectorSearchHit[]>;
|
|
331
|
+
/** Number of points currently indexed (best-effort for remote stores). */
|
|
332
|
+
count(): Promise<number>;
|
|
333
|
+
/** Optional: flush pending writes to disk/network. */
|
|
334
|
+
flush(): Promise<void>;
|
|
335
|
+
/** Close resources. */
|
|
336
|
+
close(): Promise<void>;
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
/**
|
|
340
|
+
* Create a vector index from config.
|
|
341
|
+
*
|
|
342
|
+
* Provider selection:
|
|
343
|
+
* - explicit provider -> that backend
|
|
344
|
+
* - auto -> qdrant (if configured), else hnswlib (if dependency present), else bruteforce
|
|
345
|
+
*/
|
|
346
|
+
declare function createVectorIndex(vector?: VectorConfig): Promise<VectorIndex>;
|
|
347
|
+
|
|
348
|
+
interface StoredChunkRow {
|
|
349
|
+
id: string;
|
|
350
|
+
path: string;
|
|
351
|
+
language: string;
|
|
352
|
+
kind: string;
|
|
353
|
+
start_line: number;
|
|
354
|
+
end_line: number;
|
|
355
|
+
content_hash: string;
|
|
356
|
+
tokens: number;
|
|
357
|
+
file_mtime: number;
|
|
358
|
+
text: string;
|
|
359
|
+
embedding: Buffer;
|
|
360
|
+
}
|
|
361
|
+
declare class RepoStore {
|
|
362
|
+
private db;
|
|
363
|
+
constructor(dbPath: string);
|
|
364
|
+
/** Monotonically increases whenever the chunk-store is mutated. */
|
|
365
|
+
getStoreVersion(): number;
|
|
366
|
+
/** Internal: bump store version (call inside the same transaction that mutates chunks). */
|
|
367
|
+
private bumpStoreVersion;
|
|
368
|
+
/** Vector index sync marker (per backend kind). */
|
|
369
|
+
getVectorIndexVersion(kind: string): number;
|
|
370
|
+
setVectorIndexVersion(kind: string, storeVersion: number): void;
|
|
371
|
+
setMeta(k: string, v: string): void;
|
|
372
|
+
getMeta(k: string): string | null;
|
|
373
|
+
getFileHash(posixPath: string): string | null;
|
|
374
|
+
getFileMtime(posixPath: string): number | null;
|
|
375
|
+
upsertFile(posixPath: string, hash: string, mtime: number, language: string, size: number): void;
|
|
376
|
+
deleteFile(posixPath: string): void;
|
|
377
|
+
replaceChunksForFile(posixPath: string, rows: Array<{
|
|
378
|
+
id: string;
|
|
379
|
+
language: string;
|
|
380
|
+
kind: "chunk" | "synopsis";
|
|
381
|
+
startLine: number;
|
|
382
|
+
endLine: number;
|
|
383
|
+
contentHash: string;
|
|
384
|
+
text: string;
|
|
385
|
+
ftsText: string;
|
|
386
|
+
tokens: number;
|
|
387
|
+
fileMtime: number;
|
|
388
|
+
embedding: Float32Array;
|
|
389
|
+
}>): void;
|
|
390
|
+
setEdges(fromPath: string, kind: string, values: string[]): void;
|
|
391
|
+
listEdges(fromPath: string, kind: string): string[];
|
|
392
|
+
listAllFiles(): string[];
|
|
393
|
+
countChunks(): number;
|
|
394
|
+
/**
|
|
395
|
+
* Returns the embedding dimension if any chunks exist, otherwise null.
|
|
396
|
+
* Efficient (doesn't load all embeddings).
|
|
397
|
+
*/
|
|
398
|
+
getAnyEmbeddingDimension(): number | null;
|
|
399
|
+
loadAllChunkEmbeddings(): Array<{
|
|
400
|
+
id: string;
|
|
401
|
+
embedding: Float32Array;
|
|
402
|
+
}>;
|
|
403
|
+
getChunkById(id: string): StoredChunkRow | null;
|
|
404
|
+
listChunksForFile(posixPath: string, kind?: "chunk" | "synopsis"): Array<{
|
|
405
|
+
id: string;
|
|
406
|
+
start_line: number;
|
|
407
|
+
end_line: number;
|
|
408
|
+
kind: string;
|
|
409
|
+
}>;
|
|
410
|
+
/**
|
|
411
|
+
* Best-effort lexical search using SQLite FTS5.
|
|
412
|
+
* Returns ids with bm25 values (lower is better).
|
|
413
|
+
*/
|
|
414
|
+
searchFts(ftq: string, limit: number, includePaths?: string[]): Array<{
|
|
415
|
+
id: string;
|
|
416
|
+
bm25: number;
|
|
417
|
+
}>;
|
|
418
|
+
close(): void;
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
declare class RepoIndexer {
|
|
422
|
+
private readonly embedder;
|
|
423
|
+
readonly repoRoot: string;
|
|
424
|
+
readonly repoId: string;
|
|
425
|
+
private config;
|
|
426
|
+
private store;
|
|
427
|
+
private vec;
|
|
428
|
+
private vecFlushTimer;
|
|
429
|
+
private vecFlushInFlight;
|
|
430
|
+
private embeddingCache;
|
|
431
|
+
private currentCommit;
|
|
432
|
+
private currentBranch;
|
|
433
|
+
private watcher;
|
|
434
|
+
private fileIgnore;
|
|
435
|
+
private serial;
|
|
436
|
+
private chunkCache;
|
|
437
|
+
constructor(repoRoot: string, embedder: EmbeddingsProvider, config?: IndexerConfig);
|
|
438
|
+
getCommit(): string | null;
|
|
439
|
+
getBranch(): string | null;
|
|
440
|
+
getStore(): RepoStore | null;
|
|
441
|
+
private dbPathForCommit;
|
|
442
|
+
private shouldIndexPath;
|
|
443
|
+
private applyRedactions;
|
|
444
|
+
private vectorMetric;
|
|
445
|
+
private vectorFlushDebounceMs;
|
|
446
|
+
private scheduleVectorFlush;
|
|
447
|
+
private flushVectorNow;
|
|
448
|
+
private ensureVectorIndex;
|
|
449
|
+
openForCurrentHead(): Promise<void>;
|
|
450
|
+
indexAll(): Promise<void>;
|
|
451
|
+
indexFile(posixRelPath: string): Promise<void>;
|
|
452
|
+
private deleteFileInner;
|
|
453
|
+
deleteFile(posixRelPath: string): Promise<void>;
|
|
454
|
+
vectorCandidates(queryEmbedding: Float32Array, k: number, includePaths?: string[]): Promise<Array<{
|
|
455
|
+
id: string;
|
|
456
|
+
score: number;
|
|
457
|
+
}>>;
|
|
458
|
+
lexicalCandidates(queryText: string, k: number, includePaths?: string[]): Promise<Array<{
|
|
459
|
+
id: string;
|
|
460
|
+
score: number;
|
|
461
|
+
}>>;
|
|
462
|
+
private getChunkRowCached;
|
|
463
|
+
private readChunkTextFallback;
|
|
464
|
+
getChunkRecord(id: string): ChunkRecord | null;
|
|
465
|
+
getChunkMeta(id: string): Omit<ChunkRecord, "text"> | null;
|
|
466
|
+
getChunkText(id: string): string;
|
|
467
|
+
getChunkPreview(id: string): string;
|
|
468
|
+
/**
|
|
469
|
+
* Expand context around a hit:
|
|
470
|
+
* - adjacency (previous/next chunks in file)
|
|
471
|
+
* - follow relative imports to include imported file synopses/headers
|
|
472
|
+
*/
|
|
473
|
+
expandContext(chunkId: string, opts: {
|
|
474
|
+
adjacentChunks: number;
|
|
475
|
+
followImports: number;
|
|
476
|
+
includeFileSynopsis: boolean;
|
|
477
|
+
}): Promise<Array<{
|
|
478
|
+
id: string;
|
|
479
|
+
reason: string;
|
|
480
|
+
}>>;
|
|
481
|
+
watch(): Promise<void>;
|
|
482
|
+
closeAsync(): Promise<void>;
|
|
483
|
+
close(): void;
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
/**
 * Indexer for a whole workspace root; `getRepoIndexers()` exposes the per-repository
 * `RepoIndexer` instances it manages.
 *
 * Typical one-shot use, as done by the bundled CLI: construct with a workspace root,
 * an embeddings provider and an optional config; call `indexAll()`; then `retrieve()`
 * for a `ContextBundle` (or `search()` for plain hits); finally `closeAsync()`.
 * The CLI's `watch` command calls `watch()` after `indexAll()` and leaves the
 * indexer open.
 *
 * NOTE(review): whether `open()` must be called explicitly before `indexAll()` is not
 * visible here — the CLI never calls it directly.
 */
declare class WorkspaceIndexer {
|
|
487
|
+
private readonly workspaceRoot;
|
|
488
|
+
private readonly embedder;
|
|
489
|
+
private repos;
|
|
490
|
+
private config;
|
|
491
|
+
constructor(workspaceRoot: string, embedder: EmbeddingsProvider, config?: IndexerConfig);
|
|
492
|
+
open(): Promise<void>;
|
|
493
|
+
indexAll(): Promise<void>;
|
|
494
|
+
watch(): Promise<void>;
|
|
495
|
+
getRepoIndexers(): RepoIndexer[];
|
|
496
|
+
private resolveProfile;
|
|
497
|
+
retrieve(query: string, opts?: RetrieveOptions): Promise<ContextBundle>;
|
|
498
|
+
search(query: string, k?: number): Promise<SearchHit[]>;
|
|
499
|
+
closeAsync(): Promise<void>;
|
|
500
|
+
close(): void;
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
/**
 * Load an `IndexerConfig` from the file at `filePath`.
 *
 * The bundled CLI passes its `--config` value here, which it describes as a path
 * to a JSON config file.
 * NOTE(review): behaviour for unreadable or malformed files is not visible from
 * this declaration — confirm before relying on specific error types.
 *
 * @param filePath - Path of the config file to read.
 * @returns The loaded configuration.
 */
declare function loadConfigFile(filePath: string): IndexerConfig;
|
|
504
|
+
|
|
505
|
+
export { type ChunkRecord, type ContextBundle, type CustomVectorConfig, DEFAULT_PROFILES, type EmbeddingsProvider, type FaissVectorConfig, HashEmbeddingsProvider, type HnswlibVectorConfig, type IndexerConfig, OllamaEmbeddingsProvider, OpenAIEmbeddingsProvider, type ProfilesConfig, type QdrantVectorConfig, type RelatedContextOptions, type RepoId, RepoIndexer, type RetrievalProfile, type RetrievalProfileName, type RetrievalScope, type RetrievalWeights, type RetrieveOptions, type SearchHit, type SearchOptions, type VectorConfig, type VectorIndex, type VectorIndexInit, type VectorMetric, type VectorPoint, type VectorProviderKind, type VectorSearchHit, WorkspaceIndexer, createVectorIndex, deepMergeProfile, loadConfigFile };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DEFAULT_PROFILES,
|
|
3
|
+
HashEmbeddingsProvider,
|
|
4
|
+
OllamaEmbeddingsProvider,
|
|
5
|
+
OpenAIEmbeddingsProvider,
|
|
6
|
+
RepoIndexer,
|
|
7
|
+
WorkspaceIndexer,
|
|
8
|
+
createVectorIndex,
|
|
9
|
+
deepMergeProfile,
|
|
10
|
+
loadConfigFile
|
|
11
|
+
} from "./chunk-QPQCSCBN.js";
|
|
12
|
+
export {
|
|
13
|
+
DEFAULT_PROFILES,
|
|
14
|
+
HashEmbeddingsProvider,
|
|
15
|
+
OllamaEmbeddingsProvider,
|
|
16
|
+
OpenAIEmbeddingsProvider,
|
|
17
|
+
RepoIndexer,
|
|
18
|
+
WorkspaceIndexer,
|
|
19
|
+
createVectorIndex,
|
|
20
|
+
deepMergeProfile,
|
|
21
|
+
loadConfigFile
|
|
22
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@neuralsea/workspace-indexer",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Local-first multi-repo workspace indexer (semantic embeddings + git-aware incremental updates + hybrid retrieval profiles) for AI agents.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"types": "./dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
14
|
+
"bin": {
|
|
15
|
+
"petri-index": "./dist/cli.js"
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"README.md",
|
|
20
|
+
"LICENSE"
|
|
21
|
+
],
|
|
22
|
+
"scripts": {
|
|
23
|
+
"build": "tsup src/index.ts src/cli.ts --format esm --dts --clean",
|
|
24
|
+
"dev": "tsup src/index.ts src/cli.ts --format esm --dts --watch",
|
|
25
|
+
"lint": "node -e \"console.log('Add eslint if desired')\"",
|
|
26
|
+
"test": "node --test dist/test.js || echo \"(optional)\""
|
|
27
|
+
},
|
|
28
|
+
"dependencies": {
|
|
29
|
+
"better-sqlite3": "^12.5.0",
|
|
30
|
+
"chokidar": "^5.0.0",
|
|
31
|
+
"ignore": "^7.0.5",
|
|
32
|
+
"p-limit": "^7.2.0",
|
|
33
|
+
"yargs": "^18.0.0"
|
|
34
|
+
},
|
|
35
|
+
"devDependencies": {
|
|
36
|
+
"@types/better-sqlite3": "^7.6.13",
|
|
37
|
+
"@types/node": "^25.0.3",
|
|
38
|
+
"@types/yargs": "^17.0.35",
|
|
39
|
+
"tsup": "^8.5.1",
|
|
40
|
+
"typescript": "^5.9.3"
|
|
41
|
+
},
|
|
42
|
+
"engines": {
|
|
43
|
+
"node": ">=18"
|
|
44
|
+
}
|
|
45
|
+
}
|