baselineos 0.2.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +17 -0
- package/README.md +198 -0
- package/dist/__evals__/runner.d.ts +2 -0
- package/dist/__evals__/runner.js +14687 -0
- package/dist/__evals__/runner.js.map +1 -0
- package/dist/api/server.d.ts +21 -0
- package/dist/api/server.js +1007 -0
- package/dist/api/server.js.map +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +8427 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/core/agent-bus.d.ts +110 -0
- package/dist/core/agent-bus.js +242 -0
- package/dist/core/agent-bus.js.map +1 -0
- package/dist/core/cache.d.ts +66 -0
- package/dist/core/cache.js +160 -0
- package/dist/core/cache.js.map +1 -0
- package/dist/core/config.d.ts +1002 -0
- package/dist/core/config.js +429 -0
- package/dist/core/config.js.map +1 -0
- package/dist/core/indexer.d.ts +152 -0
- package/dist/core/indexer.js +481 -0
- package/dist/core/indexer.js.map +1 -0
- package/dist/core/llm-tracer.d.ts +2 -0
- package/dist/core/llm-tracer.js +241 -0
- package/dist/core/llm-tracer.js.map +1 -0
- package/dist/core/memory.d.ts +86 -0
- package/dist/core/memory.js +346 -0
- package/dist/core/memory.js.map +1 -0
- package/dist/core/opa-client.d.ts +51 -0
- package/dist/core/opa-client.js +157 -0
- package/dist/core/opa-client.js.map +1 -0
- package/dist/core/opa-policy-gate.d.ts +133 -0
- package/dist/core/opa-policy-gate.js +454 -0
- package/dist/core/opa-policy-gate.js.map +1 -0
- package/dist/core/orchestrator.d.ts +14 -0
- package/dist/core/orchestrator.js +1297 -0
- package/dist/core/orchestrator.js.map +1 -0
- package/dist/core/pii-detector.d.ts +82 -0
- package/dist/core/pii-detector.js +126 -0
- package/dist/core/pii-detector.js.map +1 -0
- package/dist/core/rag-engine.d.ts +121 -0
- package/dist/core/rag-engine.js +504 -0
- package/dist/core/rag-engine.js.map +1 -0
- package/dist/core/task-queue.d.ts +69 -0
- package/dist/core/task-queue.js +124 -0
- package/dist/core/task-queue.js.map +1 -0
- package/dist/core/telemetry.d.ts +56 -0
- package/dist/core/telemetry.js +94 -0
- package/dist/core/telemetry.js.map +1 -0
- package/dist/core/types.d.ts +328 -0
- package/dist/core/types.js +24 -0
- package/dist/core/types.js.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +12444 -0
- package/dist/index.js.map +1 -0
- package/dist/llm-tracer-CIIujuO-.d.ts +493 -0
- package/dist/mcp/server.d.ts +2651 -0
- package/dist/mcp/server.js +676 -0
- package/dist/mcp/server.js.map +1 -0
- package/dist/orchestrator-DF89k_AK.d.ts +506 -0
- package/package.json +157 -0
- package/templates/README.md +7 -0
- package/templates/baseline.config.ts +207 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { KnowledgeIndexer } from './indexer.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* BaselineOS RAG Engine — Hybrid Retrieval (BM25 + Vector)
|
|
5
|
+
*
|
|
6
|
+
* Combines keyword search (MiniSearch/BM25) with optional vector search
|
|
7
|
+
* (ChromaDB + OpenAI embeddings) for high-quality context retrieval.
|
|
8
|
+
*
|
|
9
|
+
* 6 Collections:
|
|
10
|
+
* codebase — TypeScript/Python source files
|
|
11
|
+
* protocols — Protocol specs, READMEs, architecture docs
|
|
12
|
+
* decisions — ADRs, design decisions
|
|
13
|
+
* errors — Error patterns, debugging context
|
|
14
|
+
* compliance — Regulatory frameworks, audit evidence
|
|
15
|
+
* domain — GTCX domain knowledge, CLAUDE.md files
|
|
16
|
+
*
|
|
17
|
+
* @license Apache-2.0
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
 * Name of one of the six knowledge collections a chunk can be filed under.
 */
type CollectionName =
    | 'codebase'
    | 'protocols'
    | 'decisions'
    | 'errors'
    | 'compliance'
    | 'domain';
|
|
21
|
+
/**
 * Construction options for the RAG engine. Only `projectRoot` is required;
 * every other member is optional.
 */
interface RAGConfig {
    /** Directory that knowledge paths are resolved against. */
    projectRoot: string;

    /** OpenAI API key for embeddings. Optional: BM25 search works without it. */
    openaiApiKey?: string;

    /** ChromaDB host name. Optional: BM25 search works without it. */
    chromaHost?: string;

    /** ChromaDB port. */
    chromaPort?: number;

    /** Upper bound on the number of chunks kept per collection. */
    maxChunksPerCollection?: number;

    /** Size of one chunk, measured in characters. */
    chunkSize?: number;

    /** Number of characters shared between neighbouring chunks. */
    chunkOverlap?: number;

    /** Maximum number of BM25 results. */
    bm25Limit?: number;

    /** Maximum number of vector results. */
    vectorLimit?: number;

    /** Lowest relevance score (0-1) a result may have. */
    minRelevance?: number;
}
|
|
41
|
+
/**
 * One retrievable slice of an ingested file, together with the bookkeeping
 * needed to attribute and de-duplicate it.
 */
interface RAGChunk {
    /** Identifier of the chunk. */
    id: string;

    /** Text of the chunk. */
    content: string;

    /** Collection the chunk belongs to. */
    collection: CollectionName;

    /** File the chunk was taken from. */
    source: string;

    /** Repository the source file is attributed to. */
    repo: string;

    /** Section within the file, when one could be determined. */
    section?: string;

    /** Content hash used for de-duplication. */
    hash: string;

    /** Rough token count (content.length / 4). */
    tokenEstimate: number;

    /** Free-form string metadata attached to the chunk. */
    metadata: Record<string, string>;
}
|
|
55
|
+
/**
 * Outcome of a search: the ranked chunks plus counters describing how the
 * result was produced.
 */
interface RAGResult {
    /** Ranked hits, each tagged with the retrieval path that produced it. */
    chunks: {
        chunk: RAGChunk;
        score: number;
        matchType: 'bm25' | 'vector' | 'hybrid';
    }[];

    /** The query string that was searched for. */
    query: string;

    /** Number of chunks held by the engine at search time. */
    totalChunks: number;

    /** Number of hits contributed by BM25 search. */
    bm25Hits: number;

    /** Number of hits contributed by vector search. */
    vectorHits: number;

    /** Elapsed time of the search (computed from `Date.now()` differences). */
    duration: number;
}
|
|
67
|
+
/**
 * Summary returned after an ingestion run.
 */
interface IngestionResult {
    /** Number of files that were resolved for ingestion. */
    totalFiles: number;

    /** Number of chunks added during this run. */
    totalChunks: number;

    /** Chunks added during this run, broken down by collection. */
    collections: Record<CollectionName, number>;

    /** Elapsed time of the run (computed from `Date.now()` differences). */
    duration: number;

    /** Human-readable messages for problems hit along the way. */
    errors: string[];
}
|
|
74
|
+
/**
 * Hybrid retrieval engine: keyword (BM25) search with an optional vector
 * store, over chunks ingested from files below the configured project root.
 */
declare class RAGEngine {
    private config;
    private chunks;
    private searchIndex;
    private initialized;
    private chromaCollections;
    private chromaAvailable;

    constructor(config: RAGConfig);

    /**
     * Reads, classifies and chunks the files matched by `knowledgePaths`
     * (a built-in pattern list is used when omitted), then rebuilds the
     * search index.
     */
    ingest(knowledgePaths?: string[]): Promise<IngestionResult>;

    /**
     * Runs a search for `query`, optionally restricted to some collections
     * and/or one repo.
     */
    search(
        query: string,
        options?: {
            collections?: CollectionName[];
            repo?: string;
            limit?: number;
            minScore?: number;
        },
    ): Promise<RAGResult>;

    /**
     * Primary retrieval method: searches for `query` and concatenates the
     * best chunks into one context string without exceeding `maxTokens`
     * (measured with each chunk's token estimate).
     *
     * NOTE(review): the implementation shipped alongside this declaration
     * does not read the `indexer` argument; confirm whether indexer-derived
     * context is still planned before relying on it.
     */
    getContext(
        query: string,
        indexer?: KnowledgeIndexer,
        options?: {
            collections?: CollectionName[];
            repo?: string;
            maxTokens?: number;
        },
    ): Promise<{
        content: string;
        sources: string[];
        tokenEstimate: number;
    }>;

    private initChroma;
    private vectorSearch;
    private bm25Search;
    private rebuildSearchIndex;
    private resolveFiles;
    private walkGlob;
    private walkDir;
    private matchesFilePattern;
    private extractRepo;
    private extractSection;

    /** Snapshot of chunk counts and engine readiness flags. */
    getStats(): {
        totalChunks: number;
        collections: Record<CollectionName, number>;
        initialized: boolean;
        vectorStoreAvailable: boolean;
    };
}
|
|
120
|
+
|
|
121
|
+
export { type CollectionName, type IngestionResult, type RAGChunk, type RAGConfig, RAGEngine, type RAGResult };
|
|
@@ -0,0 +1,504 @@
|
|
|
1
|
+
import { readFileSync, existsSync, readdirSync, statSync } from 'fs';
|
|
2
|
+
import { relative, extname, basename, join } from 'path';
|
|
3
|
+
import { createHash } from 'crypto';
|
|
4
|
+
import MiniSearch from 'minisearch';
|
|
5
|
+
|
|
6
|
+
// src/core/rag-engine.ts
|
|
7
|
+
/**
 * Ordered classification rules. A file is assigned to the FIRST rule whose
 * extension list contains the file's extension and one of whose path
 * patterns matches the path, so more specific collections are listed before
 * the broad "codebase" and "domain" rules.
 *
 * All path patterns assume forward-slash separators.
 */
var COLLECTION_RULES = [
  {
    collection: "decisions",
    pathPatterns: [/decisions\//, /ADR-/, /adr-/],
    extensions: [".md"]
  },
  {
    collection: "protocols",
    pathPatterns: [/3-protocols\//, /SPEC\.md$/, /protocol/i],
    extensions: [".md"]
  },
  {
    collection: "compliance",
    pathPatterns: [/compliance/, /regulatory/, /fatf/i, /esg/i, /10-compliance/],
    extensions: [".md", ".json"]
  },
  {
    collection: "errors",
    pathPatterns: [/error/, /debug/, /troubleshoot/],
    extensions: [".md", ".ts", ".json"]
  },
  {
    collection: "codebase",
    pathPatterns: [/src\//, /lib\//, /packages\//],
    extensions: [".ts", ".tsx", ".js", ".py"]
  },
  {
    collection: "domain",
    pathPatterns: [/CLAUDE\.md$/, /_cannon\//, /_sop\//, /README\.md$/],
    extensions: [".md"]
  }
];

/**
 * Picks the collection a file belongs to.
 *
 * Backslashes are converted to forward slashes before matching, because the
 * rule patterns are written with "/" and would otherwise never match the
 * directory rules for Windows-style relative paths.
 *
 * @param filePath Path of the file (typically relative to the project root).
 * @returns The collection of the first matching rule; otherwise "codebase"
 *   for source-code extensions and "domain" for everything else.
 */
function classifyFile(filePath) {
  const normalizedPath = filePath.replace(/\\/g, "/");
  // The extension does not depend on the rule, so compute it once.
  const ext = extname(normalizedPath);
  for (const rule of COLLECTION_RULES) {
    if (!rule.extensions.includes(ext)) continue;
    if (rule.pathPatterns.some((pattern) => pattern.test(normalizedPath))) {
      return rule.collection;
    }
  }
  return [".ts", ".tsx", ".js", ".py"].includes(ext) ? "codebase" : "domain";
}
|
|
51
|
+
/**
 * Splits text into chunks of roughly `chunkSize` characters.
 *
 * When the text contains more than one part after splitting on level-2
 * markdown headings ("## "), whole heading sections are packed together
 * until adding the next one would exceed `chunkSize`; each new chunk is then
 * seeded with the last `overlap` characters of the previous one. A single
 * section longer than `chunkSize` is kept intact, not split further.
 * Otherwise a fixed-size sliding window is used.
 *
 * Degenerate arguments are clamped instead of trusted:
 *  - `chunkSize` below 1 (or NaN) is treated as 1;
 *  - `overlap` is limited to the range [0, chunkSize - 1]. An overlap that is
 *    not smaller than the chunk size would make the window step non-positive
 *    and loop forever, and an overlap of 0 must carry nothing over
 *    (`slice(-0)` would return the whole string).
 *
 * @param content Text to split.
 * @param chunkSize Target chunk size in characters.
 * @param overlap Characters shared between consecutive chunks.
 * @returns Trimmed, non-empty chunks; `[content.trim()]` if none were produced.
 */
function chunkContent(content, chunkSize, overlap) {
  const size = chunkSize >= 1 ? chunkSize : 1;
  const carry = overlap > 0 ? Math.min(overlap, size - 1) : 0;
  const chunks = [];
  const sections = content.split(/(?=^##\s)/m);
  if (sections.length > 1) {
    let current = "";
    for (const section of sections) {
      if (current.length > 0 && current.length + section.length > size) {
        chunks.push(current.trim());
        // Guard the zero case explicitly: "abc".slice(-0) === "abc".
        const tail = carry > 0 ? current.slice(-carry) : "";
        current = tail + section;
      } else {
        current += section;
      }
    }
    if (current.trim()) chunks.push(current.trim());
  } else {
    // carry < size, so the step is always at least 1 and the loop terminates.
    const step = size - carry;
    for (let i = 0; i < content.length; i += step) {
      const piece = content.slice(i, i + size).trim();
      if (piece) chunks.push(piece);
    }
  }
  return chunks.length > 0 ? chunks : [content.trim()];
}
|
|
73
|
+
/**
 * Returns `p` with forward slashes on Windows, where Node's path helpers
 * produce backslashes; on other platforms the path is returned untouched.
 */
function toPosixPath(p) {
  return process.platform === "win32" ? p.replace(/\\/g, "/") : p;
}

/**
 * Compiles a glob fragment into a regular expression.
 * Supported syntax: a double star followed by "/" (any number of directories),
 * a bare double star (anything) and a single star (anything except "/").
 *
 * @param glob Glob text using "/" separators.
 * @param wholeString When true the glob must match the entire input;
 *   otherwise it must match a trailing run of complete path segments.
 */
function globToRegExp(glob, wholeString) {
  let body = "";
  for (let i = 0; i < glob.length; i++) {
    const ch = glob[i];
    if (ch !== "*") {
      body += ch.replace(/[.+?^${}()|[\]\\]/g, "\\$&");
    } else if (glob[i + 1] !== "*") {
      body += "[^/]*";
    } else if (glob[i + 2] === "/") {
      body += "(?:[^/]+/)*";
      i += 2;
    } else {
      body += ".*";
      i += 1;
    }
  }
  return new RegExp(`${wholeString ? "^" : "(?:^|/)"}${body}$`);
}

/**
 * Hybrid retrieval engine: BM25 keyword search (MiniSearch) merged with an
 * optional ChromaDB vector search via reciprocal-rank fusion.
 */
var RAGEngine = class {
  config;
  chunks = /* @__PURE__ */ new Map();
  searchIndex;
  initialized = false;
  chromaCollections = /* @__PURE__ */ new Map();
  chromaAvailable = false;

  /**
   * @param config Engine options; every optional member is replaced by a
   *   default here so the rest of the class can read `this.config` directly.
   */
  constructor(config) {
    this.config = {
      projectRoot: config.projectRoot,
      openaiApiKey: config.openaiApiKey ?? process.env.OPENAI_API_KEY ?? "",
      chromaHost: config.chromaHost ?? "localhost",
      chromaPort: config.chromaPort ?? 8e3,
      maxChunksPerCollection: config.maxChunksPerCollection ?? 1e3,
      chunkSize: config.chunkSize ?? 1500,
      chunkOverlap: config.chunkOverlap ?? 200,
      bm25Limit: config.bm25Limit ?? 10,
      vectorLimit: config.vectorLimit ?? 5,
      minRelevance: config.minRelevance ?? 0.3
    };
    // Single source of truth for the index options; `chunks` is still empty
    // here, so this just creates an empty index.
    this.rebuildSearchIndex();
  }

  // ─── Ingestion ────────────────────────────────────────────────────────────
  /**
   * Reads every file matched by `knowledgePaths`, classifies and chunks it,
   * stores new chunks (de-duplicated by collection + content hash), rebuilds
   * the keyword index and then tries to sync the vector store.
   *
   * Per-file failures are collected in the returned `errors` list instead of
   * aborting the run.
   *
   * @param knowledgePaths Glob-like patterns relative to the project root;
   *   a built-in list is used when omitted.
   * @returns Counters for this run (chunks already known are not counted).
   */
  async ingest(knowledgePaths) {
    const start = Date.now();
    const errors = [];
    const collectionCounts = {
      codebase: 0,
      protocols: 0,
      decisions: 0,
      errors: 0,
      compliance: 0,
      domain: 0
    };
    const paths = knowledgePaths ?? [
      "**/_sop/.gtcx/decisions/*.md",
      "**/CLAUDE.md",
      "**/_cannon/*.md",
      "3-protocols/**/SPEC.md",
      "3-protocols/**/README.md",
      "6-platforms/**/SPEC.md",
      "6-platforms/**/README.md",
      "5-intelligence/**/README.md",
      "2-core/packages/*/src/**/*.ts",
      "compliance-os/**/*.md",
      "ai-1-agile/_sop/**/*.md"
    ];
    const files = this.resolveFiles(paths);
    const maxPerCollection = this.config.maxChunksPerCollection;
    let totalChunks = 0;
    for (const filePath of files) {
      try {
        const content = readFileSync(filePath, "utf-8");
        if (!content.trim()) continue;
        // Classification rules and repo extraction both expect "/" separators.
        const relPath = toPosixPath(relative(this.config.projectRoot, filePath));
        const collection = classifyFile(relPath);
        const repo = this.extractRepo(relPath);
        if (collectionCounts[collection] >= maxPerCollection) continue;
        const fileChunks = chunkContent(
          content,
          this.config.chunkSize,
          this.config.chunkOverlap
        );
        for (let i = 0; i < fileChunks.length; i++) {
          // Re-check per chunk so one large file cannot overshoot the cap.
          if (collectionCounts[collection] >= maxPerCollection) break;
          const text = fileChunks[i];
          const hash = createHash("sha256").update(text).digest("hex").slice(0, 16);
          const id = `${collection}:${hash}`;
          if (this.chunks.has(id)) continue;
          this.chunks.set(id, {
            id,
            content: text,
            collection,
            source: relPath,
            repo,
            section: this.extractSection(text),
            hash,
            tokenEstimate: Math.ceil(text.length / 4),
            metadata: {
              file: basename(filePath),
              ext: extname(filePath),
              chunkIndex: String(i),
              totalChunks: String(fileChunks.length)
            }
          });
          collectionCounts[collection]++;
          totalChunks++;
        }
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        errors.push(`${filePath}: ${msg}`);
      }
    }
    this.rebuildSearchIndex();
    await this.initChroma(errors);
    this.initialized = true;
    return {
      totalFiles: files.length,
      totalChunks,
      collections: collectionCounts,
      duration: Date.now() - start,
      errors
    };
  }

  // ─── Hybrid Search ────────────────────────────────────────────────────────
  /**
   * Runs BM25 and vector search and merges both rankings with
   * reciprocal-rank fusion (score = 1 / (60 + rank index), summed when a
   * chunk appears in both lists).
   *
   * `minScore` is applied to the normalized BM25 scores only; vector hits are
   * not filtered by it.
   *
   * @param query Search text.
   * @param options Optional collection/repo restriction, result `limit` and
   *   `minScore` override.
   */
  async search(query, options) {
    const start = Date.now();
    const limit = options?.limit ?? this.config.bm25Limit;
    const minScore = options?.minScore ?? this.config.minRelevance;
    const bm25Results = this.bm25Search(query, {
      collections: options?.collections,
      repo: options?.repo,
      limit
    });
    const vectorResults = await this.vectorSearch(query, {
      collections: options?.collections,
      repo: options?.repo,
      limit: this.config.vectorLimit
    });
    const merged = /* @__PURE__ */ new Map();
    for (let i = 0; i < bm25Results.length; i++) {
      const r = bm25Results[i];
      if (r.score < minScore) continue;
      const rrfScore = 1 / (60 + i);
      merged.set(r.chunk.id, { chunk: r.chunk, score: rrfScore, matchType: "bm25" });
    }
    for (let i = 0; i < vectorResults.length; i++) {
      const r = vectorResults[i];
      const rrfScore = 1 / (60 + i);
      const existing = merged.get(r.chunk.id);
      if (existing) {
        existing.score += rrfScore;
        existing.matchType = "hybrid";
      } else {
        merged.set(r.chunk.id, { chunk: r.chunk, score: rrfScore, matchType: "vector" });
      }
    }
    const sorted = Array.from(merged.values()).sort((a, b) => b.score - a.score);
    return {
      chunks: sorted.slice(0, limit),
      query,
      totalChunks: this.chunks.size,
      bm25Hits: bm25Results.length,
      vectorHits: vectorResults.length,
      duration: Date.now() - start
    };
  }

  /**
   * Primary retrieval method: searches for `query` (up to 15 hits) and
   * concatenates the ranked chunks, each preceded by a source comment, until
   * the next chunk would exceed `maxTokens` (default 4000, measured with the
   * chunks' token estimates).
   *
   * NOTE(review): `indexer` is accepted for API compatibility but is not read
   * anywhere in this method; no indexer-derived context is added.
   *
   * @returns The assembled text, the distinct source files used and the
   *   summed token estimate.
   */
  async getContext(query, indexer, options) {
    const maxTokens = options?.maxTokens ?? 4e3;
    const ragResult = await this.search(query, {
      collections: options?.collections,
      repo: options?.repo,
      limit: 15
    });
    const parts = [];
    const sources = [];
    let tokenCount = 0;
    for (const { chunk, score } of ragResult.chunks) {
      if (tokenCount + chunk.tokenEstimate > maxTokens) break;
      parts.push(`<!-- source: ${chunk.source} (${chunk.collection}, score: ${score.toFixed(2)}) -->`);
      parts.push(chunk.content);
      parts.push("");
      if (!sources.includes(chunk.source)) {
        sources.push(chunk.source);
      }
      tokenCount += chunk.tokenEstimate;
    }
    return {
      content: parts.join("\n"),
      sources,
      tokenEstimate: tokenCount
    };
  }

  // ─── ChromaDB Initialization ─────────────────────────────────────────────
  /**
   * Best-effort vector-store setup. Does nothing without an API key or when
   * the optional `chromadb` module cannot be imported. Any later failure is
   * appended to `errors` and the engine stays in BM25-only mode.
   *
   * NOTE(review): the API key only gates this method; no embedding function
   * is configured from it here, so the client's own default is what embeds
   * documents and queries. Confirm this is intended.
   *
   * @param errors Mutable list that receives a message if setup fails.
   */
  async initChroma(errors) {
    if (!this.config.openaiApiKey) return;
    let chromadb;
    try {
      chromadb = await import('chromadb');
    } catch {
      // Optional dependency is not installed: stay in BM25-only mode.
      return;
    }
    try {
      const ChromaClientCtor = chromadb.ChromaClient;
      const client = new ChromaClientCtor({
        path: `http://${this.config.chromaHost}:${this.config.chromaPort}`
      });
      const collectionNames = [
        "codebase",
        "protocols",
        "decisions",
        "errors",
        "compliance",
        "domain"
      ];
      for (const name of collectionNames) {
        const collection = await client.getOrCreateCollection({
          name: `baseline-${name}`,
          metadata: { source: "baselineos-rag", collection: name }
        });
        this.chromaCollections.set(name, collection);
      }
      for (const name of collectionNames) {
        const collection = this.chromaCollections.get(name);
        if (!collection) continue;
        const collectionChunks = Array.from(this.chunks.values()).filter(
          (c) => c.collection === name
        );
        if (collectionChunks.length === 0) continue;
        // Cheap staleness check: only sync when the store holds fewer
        // documents than we have chunks.
        // NOTE(review): when it does sync, chunks that are already stored are
        // sent again with the same ids; verify how the installed chromadb
        // version treats duplicate ids in `add`.
        const existingCount = await collection.count();
        if (existingCount >= collectionChunks.length) continue;
        const batchSize = 100;
        for (let i = 0; i < collectionChunks.length; i += batchSize) {
          const batch = collectionChunks.slice(i, i + batchSize);
          await collection.add({
            ids: batch.map((c) => c.id),
            documents: batch.map((c) => c.content),
            metadatas: batch.map((c) => ({
              source: c.source,
              repo: c.repo,
              section: c.section ?? "",
              hash: c.hash,
              collection: c.collection
            }))
          });
        }
      }
      this.chromaAvailable = true;
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      errors.push(`ChromaDB init failed (BM25-only mode): ${msg}`);
    }
  }

  // ─── Vector Search ──────────────────────────────────────────────────────
  /**
   * Queries each target Chroma collection and returns the best hits overall,
   * scored as 1 / (1 + distance). Returns an empty list when the vector
   * store is unavailable. A failing collection is reported on stderr and
   * skipped so the remaining collections still contribute.
   */
  async vectorSearch(query, options) {
    if (!this.chromaAvailable || !this.config.openaiApiKey) return [];
    const limit = options?.limit ?? this.config.vectorLimit;
    const targetCollections = options?.collections ?? [
      "codebase",
      "protocols",
      "decisions",
      "errors",
      "compliance",
      "domain"
    ];
    const where = options?.repo ? { repo: options.repo } : undefined;
    const results = [];
    for (const collName of targetCollections) {
      const collection = this.chromaCollections.get(collName);
      if (!collection) continue;
      try {
        const queryResult = await collection.query({
          queryTexts: [query],
          nResults: limit,
          where,
          include: ["documents", "metadatas", "distances"]
        });
        const ids = queryResult.ids?.[0] ?? [];
        const distances = queryResult.distances?.[0] ?? [];
        for (let i = 0; i < ids.length; i++) {
          // Ids the store knows but this process has not ingested are ignored.
          const chunk = this.chunks.get(ids[i]);
          if (!chunk) continue;
          const distance = distances[i] ?? 1;
          results.push({ chunk, score: 1 / (1 + distance), matchType: "vector" });
        }
      } catch (err) {
        // Thrown values are not guaranteed to be Error instances.
        const msg = err instanceof Error ? err.message : String(err);
        process.stderr.write(`[baseline:rag] Vector query failed: ${msg}\n`);
      }
    }
    return results.sort((a, b) => b.score - a.score).slice(0, limit);
  }

  // ─── BM25 Search ─────────────────────────────────────────────────────────
  /**
   * Keyword search over the MiniSearch index.
   *
   * Collection and repo filters are applied BEFORE truncating to `limit`, so
   * matching hits that rank below many hits from other collections are not
   * lost. Scores are normalized to 0-1 against the best remaining hit.
   *
   * @returns Up to `limit` hits; empty before the first ingest.
   */
  bm25Search(query, options) {
    if (!this.initialized || this.chunks.size === 0) return [];
    const limit = options?.limit ?? this.config.bm25Limit;
    const collections = options?.collections;
    const repo = options?.repo;
    let results = this.searchIndex.search(query);
    if (collections?.length) {
      results = results.filter((r) => collections.includes(r.collection));
    }
    if (repo) {
      results = results.filter((r) => r.repo === repo);
    }
    // `|| 1` also covers a zero top score, which would otherwise yield NaN.
    const topScore = results[0]?.score || 1;
    return results.slice(0, limit).map((r) => ({
      chunk: this.chunks.get(r.id),
      score: r.score / topScore,
      matchType: "bm25"
    })).filter((r) => r.chunk != null);
  }

  // ─── Index Management ─────────────────────────────────────────────────────
  /** Replaces the keyword index with a fresh one built from all chunks. */
  rebuildSearchIndex() {
    this.searchIndex = new MiniSearch({
      fields: ["content", "source", "section", "repo"],
      storeFields: ["id", "collection", "source", "repo", "section"],
      searchOptions: {
        boost: { content: 2, section: 1.5, source: 1 },
        fuzzy: 0.2,
        prefix: true
      }
    });
    const docs = Array.from(this.chunks.values()).map((chunk) => ({
      id: chunk.id,
      content: chunk.content,
      source: chunk.source,
      section: chunk.section ?? "",
      repo: chunk.repo,
      collection: chunk.collection
    }));
    this.searchIndex.addAll(docs);
  }

  // ─── File Resolution ──────────────────────────────────────────────────────
  /**
   * Expands patterns into a de-duplicated list of absolute paths. Patterns
   * containing a double star are walked recursively; anything else is
   * treated as a literal path below the project root and kept if it exists.
   */
  resolveFiles(patterns) {
    const files = /* @__PURE__ */ new Set();
    for (const pattern of patterns) {
      if (pattern.includes("**")) {
        this.walkGlob(this.config.projectRoot, pattern, files);
      } else {
        const fullPath = join(this.config.projectRoot, pattern);
        if (existsSync(fullPath)) {
          files.add(fullPath);
        }
      }
    }
    return Array.from(files);
  }

  /**
   * Resolves one recursive pattern. The text before the first double-star
   * separator selects the start directories; the remainder is the pattern
   * matched during the walk. Single-star wildcards inside the prefix are
   * expanded against the directory listing instead of being joined
   * literally, which could never exist on disk.
   */
  walkGlob(root, pattern, results) {
    const parts = pattern.split("**/");
    const prefix = parts[0] ?? "";
    const suffix = parts.slice(1).join("**/");
    let startDirs = [root];
    for (const segment of prefix.split("/").filter(Boolean)) {
      const next = [];
      for (const dir of startDirs) {
        if (!segment.includes("*")) {
          next.push(join(dir, segment));
          continue;
        }
        let entries;
        try {
          entries = readdirSync(dir);
        } catch {
          // Missing or unreadable directory: nothing to expand here.
          continue;
        }
        for (const entry of entries) {
          // Wildcards follow the same ignore rules as the recursive walk.
          if (entry.startsWith(".") || entry === "node_modules" || entry === "dist" || entry === "coverage") {
            continue;
          }
          if (this.matchesFilePattern(entry, entry, segment)) {
            next.push(join(dir, entry));
          }
        }
      }
      startDirs = next;
    }
    for (const dir of startDirs) {
      if (existsSync(dir)) this.walkDir(dir, suffix, results);
    }
  }

  /**
   * Recursively collects files below `dir` that match `filePattern`.
   * Hidden entries and node_modules/dist/coverage are skipped, except hidden
   * names that the pattern spells out explicitly (e.g. ".gtcx"), which would
   * otherwise be unreachable. Unreadable entries are skipped silently: the
   * walk is best-effort.
   */
  walkDir(dir, filePattern, results) {
    const namedHidden = new Set(
      filePattern.split("/").filter((seg) => seg.startsWith(".") && !seg.includes("*"))
    );
    const visit = (current) => {
      let entries;
      try {
        entries = readdirSync(current);
      } catch {
        return;
      }
      for (const entry of entries) {
        if (entry === "node_modules" || entry === "dist" || entry === "coverage") continue;
        if (entry.startsWith(".") && !namedHidden.has(entry)) continue;
        const fullPath = join(current, entry);
        try {
          const stat = statSync(fullPath);
          if (stat.isDirectory()) {
            visit(fullPath);
          } else if (stat.isFile() && this.matchesFilePattern(entry, fullPath, filePattern)) {
            results.add(fullPath);
          }
        } catch {
          // e.g. broken symlink or permission error: ignore this entry.
        }
      }
    };
    visit(dir);
  }

  /**
   * Tests a file against the part of a pattern that follows the recursive
   * wildcard.
   *
   * - empty pattern: everything matches;
   * - pattern without "/": matched against the file name only;
   * - pattern with "/": matched against the trailing path segments, with
   *   single stars confined to one segment.
   */
  matchesFilePattern(filename, fullPath, pattern) {
    if (!pattern) return true;
    if (!pattern.includes("/")) {
      if (!pattern.includes("*")) return filename === pattern;
      // Fast path for the common "*.ext" form.
      if (pattern.lastIndexOf("*") === 0) return filename.endsWith(pattern.slice(1));
      return globToRegExp(pattern, true).test(filename);
    }
    return globToRegExp(pattern, false).test(toPosixPath(fullPath));
  }

  // ─── Helpers ──────────────────────────────────────────────────────────────
  /** Returns the first "/"-separated segment of a relative path. */
  extractRepo(relPath) {
    const parts = relPath.split("/");
    return parts[0] ?? "unknown";
  }

  /** Returns the text of the first markdown heading in `content`, if any. */
  extractSection(content) {
    const match = content.match(/^#+\s+(.+)/m);
    return match?.[1]?.trim();
  }

  // ─── Stats ────────────────────────────────────────────────────────────────
  /** Snapshot of chunk counts per collection and engine readiness flags. */
  getStats() {
    const collections = {
      codebase: 0,
      protocols: 0,
      decisions: 0,
      errors: 0,
      compliance: 0,
      domain: 0
    };
    for (const chunk of this.chunks.values()) {
      collections[chunk.collection]++;
    }
    return {
      totalChunks: this.chunks.size,
      collections,
      initialized: this.initialized,
      vectorStoreAvailable: this.chromaAvailable
    };
  }
};
|
|
485
|
+
/**
|
|
486
|
+
* BaselineOS RAG Engine — Hybrid Retrieval (BM25 + Vector)
|
|
487
|
+
*
|
|
488
|
+
* Combines keyword search (MiniSearch/BM25) with optional vector search
|
|
489
|
+
* (ChromaDB + OpenAI embeddings) for high-quality context retrieval.
|
|
490
|
+
*
|
|
491
|
+
* 6 Collections:
|
|
492
|
+
* codebase — TypeScript/Python source files
|
|
493
|
+
* protocols — Protocol specs, READMEs, architecture docs
|
|
494
|
+
* decisions — ADRs, design decisions
|
|
495
|
+
* errors — Error patterns, debugging context
|
|
496
|
+
* compliance — Regulatory frameworks, audit evidence
|
|
497
|
+
* domain — GTCX domain knowledge, CLAUDE.md files
|
|
498
|
+
*
|
|
499
|
+
* @license Apache-2.0
|
|
500
|
+
*/
|
|
501
|
+
|
|
502
|
+
export { RAGEngine };
|
|
503
|
+
//# sourceMappingURL=rag-engine.js.map
|
|
504
|
+
//# sourceMappingURL=rag-engine.js.map
|