@o-lang/semantic-doc-search 1.0.40 → 1.0.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -12
- package/package.json +37 -38
- package/src/embeddings/local.js +12 -2
- package/src/embeddings/local.js.bak +153 -0
- package/src/index.js +1 -1
- package/src/resolver.js +59 -179
- package/src/services/docQA.js +56 -19
- package/src/utils/formatResults.js +9 -10
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/config.json +0 -25
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx +0 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer.json +0 -30686
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer_config.json +0 -15
- package/.env.example +0 -0
- package/bin/cli.js +0 -58
- package/docs/sample1.txt +0 -1
- package/docs/vacation policy +0 -5
- package/embeddings.json +0 -3
- package/test-doc-search-batch.js +0 -36
- package/test-doc-search.js +0 -40
- package/test-embed.js +0 -10
- package/test-single-doc.js +0 -32
package/README.md
CHANGED
|
@@ -1,24 +1,29 @@
|
|
|
1
|
-
# @
|
|
1
|
+
# @o-lang/semantic-doc-search
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Semantic document retrieval engine for O-Lang workflows.
|
|
4
|
+
|
|
5
|
+
This package provides vector-based document search (RAG retrieval layer) that integrates with O-Lang kernel workflows. It handles document ingestion, chunking, embedding, and similarity search, returning LLM-ready context outputs.
|
|
4
6
|
|
|
5
7
|
---
|
|
6
8
|
|
|
7
9
|
## Features
|
|
8
10
|
|
|
9
|
-
-
|
|
10
|
-
-
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
17
|
-
-
|
|
11
|
+
- Semantic vector search using embeddings
|
|
12
|
+
- Document ingestion from local filesystem (`.txt`, `.md`)
|
|
13
|
+
- Automatic text chunking for large documents
|
|
14
|
+
- Pluggable embedding providers (local, OpenAI, Groq, etc.)
|
|
15
|
+
- Multiple vector database support:
|
|
16
|
+
- In-memory store
|
|
17
|
+
- Redis (adapter)
|
|
18
|
+
- PostgreSQL / pgvector (adapter)
|
|
19
|
+
- Pinecone (adapter)
|
|
20
|
+
- Embedding cache support (`embeddings.json`)
|
|
21
|
+
- Normalized LLM-ready output format (`text + matches`)
|
|
22
|
+
- Designed for O-Lang `.ol` workflow integration
|
|
18
23
|
|
|
19
24
|
---
|
|
20
25
|
|
|
21
26
|
## Installation
|
|
22
27
|
|
|
23
28
|
```bash
|
|
24
|
-
npm install @
|
|
29
|
+
npm install @o-lang/semantic-doc-search
|
package/package.json
CHANGED
|
@@ -1,38 +1,37 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "@o-lang/semantic-doc-search",
|
|
3
|
-
"version": "1.0.
|
|
4
|
-
"description": "O-
|
|
5
|
-
"main": "src/index.js",
|
|
6
|
-
"
|
|
7
|
-
|
|
8
|
-
"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
"
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
"
|
|
18
|
-
"
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
"
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"name": "@o-lang/semantic-doc-search",
|
|
3
|
+
"version": "1.0.42",
|
|
4
|
+
"description": "O-Lang semantic document search resolver with vector embeddings",
|
|
5
|
+
"main": "src/index.js",
|
|
6
|
+
"exports": {
|
|
7
|
+
".": "./src/index.js",
|
|
8
|
+
"./resolver": "./src/resolver.js",
|
|
9
|
+
"./embeddings/local": "./src/embeddings/local.js"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"src/",
|
|
13
|
+
"package.json",
|
|
14
|
+
"README.md"
|
|
15
|
+
],
|
|
16
|
+
"scripts": {
|
|
17
|
+
"test": "node src/test-doc-search.js",
|
|
18
|
+
"start": "node src/index.js"
|
|
19
|
+
},
|
|
20
|
+
"keywords": [
|
|
21
|
+
"o-lang",
|
|
22
|
+
"resolver",
|
|
23
|
+
"semantic-search",
|
|
24
|
+
"rag",
|
|
25
|
+
"embeddings"
|
|
26
|
+
],
|
|
27
|
+
"author": "O-Lang Team <info@olang.cloud>",
|
|
28
|
+
"license": "MIT",
|
|
29
|
+
"dependencies": {
|
|
30
|
+
"@xenova/transformers": "^2.14.0",
|
|
31
|
+
"axios": "^1.6.0",
|
|
32
|
+
"dotenv": "^16.6.1"
|
|
33
|
+
},
|
|
34
|
+
"engines": {
|
|
35
|
+
"node": ">=18.0.0"
|
|
36
|
+
}
|
|
37
|
+
}
|
package/src/embeddings/local.js
CHANGED
|
@@ -149,5 +149,15 @@ class LocalEmbedding {
|
|
|
149
149
|
}
|
|
150
150
|
}
|
|
151
151
|
|
|
152
|
-
|
|
153
|
-
|
|
152
|
+
// ✅ EXPORT AS FACTORY FUNCTION (what resolver expects)
|
|
153
|
+
// Usage: const embed = await embedder({ dimension: 384 })
|
|
154
|
+
// Returns: async (text) => vector
|
|
155
|
+
const embedderInstance = new LocalEmbedding();
|
|
156
|
+
|
|
157
|
+
module.exports = async ({ dimension = 384 } = {}) => {
|
|
158
|
+
if (dimension && typeof dimension === 'number') {
|
|
159
|
+
embedderInstance.dim = dimension;
|
|
160
|
+
}
|
|
161
|
+
// Return bound embed method that resolver can call: await embed(text)
|
|
162
|
+
return embedderInstance.embed.bind(embedderInstance);
|
|
163
|
+
};
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LocalEmbedding
|
|
3
|
+
* ----------------
|
|
4
|
+
* Real semantic embeddings using all-MiniLM-L6-v2
|
|
5
|
+
* - Singleton model load
|
|
6
|
+
* - No silent failures
|
|
7
|
+
* - No zero vectors
|
|
8
|
+
* - Deterministic behavior
|
|
9
|
+
* - DEFENSIVE against method detaching & invalid vectors
|
|
10
|
+
* - WINDOWS-SAFE (disables SIMD, threads, proxy)
|
|
11
|
+
* - TENSOR-SAFE (handles Float32Array, Array, and all ONNX tensor types)
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
class LocalEmbedding {
  /**
   * Creates an embedder with the expected vector length preset to 384 and
   * no model loaded yet. All public methods are bound to the instance so
   * they keep working when passed around detached (e.g. `const { embed } = x`).
   */
  constructor() {
    this.dim = 384;
    this.model = null;
    this.loading = null;

    this.loadModel = this.loadModel.bind(this);
    this.embed = this.embed.bind(this);
    this.embedBatch = this.embedBatch.bind(this);
    this.getDimension = this.getDimension.bind(this);
  }

  /* ---------------- INTERNAL ---------------- */

  /**
   * Pulls a plain number array out of whatever the pipeline returned.
   *
   * Accepted shapes: an object with a `data` field, an array whose first
   * element has a `data` field, or a raw array. A `data` field must be a
   * typed-array view or a plain array to be used.
   *
   * @param {*} output - Value returned by the feature-extraction pipeline.
   * @returns {Array|null} The extracted vector, or null when the shape is
   *   not recognised.
   */
  static extractVector(output) {
    const fromData = (data) =>
      ArrayBuffer.isView(data) || Array.isArray(data) ? Array.from(data) : null;

    if (output && output.data !== undefined) {
      return fromData(output.data);
    }
    if (Array.isArray(output)) {
      return output[0]?.data !== undefined ? fromData(output[0].data) : output;
    }
    return null;
  }

  /**
   * Loads the feature-extraction pipeline once and memoizes it.
   *
   * Concurrent callers share the same in-flight promise. If loading fails,
   * the in-flight promise is discarded so that the next call retries instead
   * of re-throwing the cached failure forever.
   *
   * @returns {Promise<Function>} The loaded pipeline.
   * @throws Whatever the dynamic import or `pipeline()` call rejects with.
   */
  async loadModel() {
    if (this.model) return this.model;

    if (!this.loading) {
      this.loading = (async () => {
        const { env, pipeline } = await import("@xenova/transformers");

        // The environment must be configured BEFORE the pipeline is created.
        env.backends.onnx.wasm.simd = false; // Avoids AVX/SIMD crashes on older CPUs
        env.backends.onnx.wasm.threads = false; // Prevents threading issues in Node
        env.backends.onnx.wasm.proxy = false; // Avoids proxy complications
        env.allowLocalModels = true;
        env.backends.onnx.warmup = false;
        env.cacheDir = "./.cache/embeddings"; // Explicit cache path, relative to the working directory

        console.log("🔄 Loading local embedding model (first run only)...");
        console.log("⚙️ Using WASM (SIMD disabled) for cross-platform compatibility");

        const model = await pipeline(
          "feature-extraction",
          "Xenova/all-MiniLM-L6-v2",
          {
            revision: "main",
            cache_dir: "./.cache/embeddings",
          }
        );

        console.log("✅ Local embedding model ready");
        return model;
      })();
    }

    try {
      this.model = await this.loading;
    } catch (err) {
      // Drop the rejected promise so a later call can attempt the load again.
      this.loading = null;
      throw err;
    }
    return this.model;
  }

  /* ---------------- PUBLIC API ---------------- */

  /**
   * Embeds a single string.
   *
   * The model is invoked with mean pooling and normalization enabled, and the
   * result is validated against the configured dimension.
   *
   * @param {string} text - Non-empty (after trimming) text to embed.
   * @returns {Promise<number[]>} Plain array of length `this.dim`.
   * @throws {Error} If `text` is not a non-empty string, if the model fails,
   *   or if the produced vector does not have `this.dim` entries.
   */
  async embed(text) {
    if (typeof text !== "string" || !text.trim()) {
      throw new Error("Embedding input must be a non-empty string");
    }

    const model = await this.loadModel();

    try {
      const output = await model(text, {
        pooling: "mean",
        normalize: true,
      });

      const vector = LocalEmbedding.extractVector(output);

      if (!Array.isArray(vector) || vector.length !== this.dim) {
        console.error("❌ Invalid embedding vector length:", vector?.length);
        console.error("❌ First few values:", vector?.slice?.(0, 5));
        throw new Error(`Invalid embedding dimension: ${vector?.length ?? 0} (expected ${this.dim})`);
      }

      return vector;
    } catch (err) {
      console.error(
        `❌ Embedding failed for text: "${text.slice(0, 60)}..."`,
        err.message
      );
      throw err;
    }
  }

  /**
   * Embeds each string in order, one at a time.
   *
   * @param {string[]} [texts=[]] - Texts to embed.
   * @returns {Promise<number[][]>} One vector per input, in input order.
   * @throws {Error} If `texts` is not an array, or if any single embed fails.
   */
  async embedBatch(texts = []) {
    if (!Array.isArray(texts)) {
      throw new Error("embedBatch expects an array of strings");
    }

    const results = [];
    for (const text of texts) {
      results.push(await this.embed(text));
    }
    return results;
  }

  /**
   * @returns {number} The vector length that `embed` validates against.
   */
  getDimension() {
    return this.dim;
  }
}
|
|
151
|
+
|
|
152
|
+
const embedder = new LocalEmbedding();
|
|
153
|
+
module.exports = embedder;
|
package/src/index.js
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
// index.js (6 lines)
|
|
2
1
|
const semanticResolver = require("./resolver");
|
|
3
2
|
|
|
4
3
|
async function docSearchResolver(action, context) {
|
|
@@ -6,4 +5,5 @@ async function docSearchResolver(action, context) {
|
|
|
6
5
|
}
|
|
7
6
|
|
|
8
7
|
docSearchResolver.resolverName = "doc-search";
|
|
8
|
+
// Read the version from package.json so it cannot drift from the published release.
docSearchResolver.version = require("../package.json").version;
|
|
9
9
|
module.exports = docSearchResolver;
|
package/src/resolver.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
const VectorRouter = require("./adapters/vectorRouter");
|
|
2
|
-
const embedder = require("./embeddings/local");
|
|
2
|
+
const embedder = require("./embeddings/local");
|
|
3
3
|
const { extractQuery } = require("./utils/extractQuery");
|
|
4
4
|
const { formatResults } = require("./utils/formatResults");
|
|
5
5
|
const fs = require("fs");
|
|
@@ -8,7 +8,9 @@ const crypto = require("crypto");
|
|
|
8
8
|
|
|
9
9
|
const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
|
|
10
10
|
|
|
11
|
-
//
|
|
11
|
+
// ─────────────────────────────────────────────
|
|
12
|
+
// Helpers (UNCHANGED)
|
|
13
|
+
// ─────────────────────────────────────────────
|
|
12
14
|
function loadCache() {
|
|
13
15
|
try {
|
|
14
16
|
if (fs.existsSync(CACHE_PATH)) {
|
|
@@ -24,226 +26,104 @@ function saveCache(cache) {
|
|
|
24
26
|
} catch {}
|
|
25
27
|
}
|
|
26
28
|
|
|
27
|
-
/**
|
|
28
|
-
* Clean text for embedding (defensive)
|
|
29
|
-
*/
|
|
30
29
|
/**
 * Normalizes text before it is embedded or hashed.
 *
 * Surrounding whitespace is removed first, then at most one wrapping quote
 * character (`"` or `'`) at each end, then any whitespace that was inside
 * the quotes. Trimming before stripping matters: otherwise a quote that is
 * next to leading/trailing whitespace is not at the string boundary and
 * survives.
 *
 * @param {*} text - Candidate text.
 * @returns {string} The cleaned text, or "" when `text` is not a string.
 */
function sanitizeTextForEmbedding(text) {
  if (typeof text !== "string") return "";
  return text.trim().replace(/^["']|["']$/g, "").trim();
}
|
|
35
33
|
|
|
36
|
-
/**
|
|
37
|
-
* Load documents from doc_root if provided
|
|
38
|
-
*/
|
|
39
|
-
function loadDocumentsFromContext(context) {
|
|
40
|
-
if (context.documents && Array.isArray(context.documents)) {
|
|
41
|
-
return context.documents;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
if (context.doc_root) {
|
|
45
|
-
const baseDir = path.resolve(process.cwd(), context.doc_root);
|
|
46
|
-
if (fs.existsSync(baseDir)) {
|
|
47
|
-
const files = fs.readdirSync(baseDir).filter(f => f.endsWith('.txt') || f.endsWith('.md'));
|
|
48
|
-
const docs = files.map(file => {
|
|
49
|
-
try {
|
|
50
|
-
const content = fs.readFileSync(path.join(baseDir, file), 'utf8');
|
|
51
|
-
return { id: file, content, source: `file:${file}` };
|
|
52
|
-
} catch (err) {
|
|
53
|
-
console.warn(`⚠️ Failed to read ${file}:`, err.message);
|
|
54
|
-
return null;
|
|
55
|
-
}
|
|
56
|
-
}).filter(Boolean);
|
|
57
|
-
|
|
58
|
-
console.log(`📄 Loaded ${docs.length} document(s) from ${baseDir}`);
|
|
59
|
-
docs.forEach(d => console.log(` - ${d.id} (${d.content?.length || 0} chars)`));
|
|
60
|
-
return docs;
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
return [];
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
/**
|
|
68
|
-
* Convert any array-like (Float32Array, etc.) to plain JS array
|
|
69
|
-
*/
|
|
70
|
-
function toPlainArray(input) {
|
|
71
|
-
if (!input) return null;
|
|
72
|
-
if (Array.isArray(input)) return input;
|
|
73
|
-
if (ArrayBuffer.isView(input)) return Array.from(input);
|
|
74
|
-
return null;
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
/**
|
|
78
|
-
* Hash text for caching
|
|
79
|
-
*/
|
|
80
34
|
function hashText(str) {
|
|
81
35
|
return crypto.createHash("sha256").update(str).digest("hex");
|
|
82
36
|
}
|
|
83
37
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
38
|
+
// ─────────────────────────────────────────────
|
|
39
|
+
// 🔥 MAIN RESOLVER
|
|
40
|
+
// ─────────────────────────────────────────────
|
|
87
41
|
/**
 * Reads every regular file directly inside the document root, chunks it,
 * embeds each chunk and upserts it into the vector store.
 *
 * The on-disk cache of already-ingested chunk hashes is only used when
 * `context.POSTGRES_URL` or `context.REDIS_URL` is set. It is saved even if
 * ingestion fails part-way, so chunks that were upserted before the failure
 * are not embedded again on the next run.
 *
 * NOTE(review): files are not filtered by extension here, although the README
 * describes ingestion of `.txt` / `.md` — confirm whether a filter is wanted.
 *
 * @param {object} vectorStore - Store exposing `upsert({ id, vector, content, source })`.
 * @param {Function} embed - `async (text) => vector`.
 * @param {object} context - Resolver context (`doc_root`, `POSTGRES_URL`, `REDIS_URL`).
 * @returns {Promise<{inserted: number, doc_root: string}>}
 */
async function ingestDocRoot(vectorStore, embed, context) {
  const docRoot = context.doc_root || "./docs";
  const useCache = !!context.POSTGRES_URL || !!context.REDIS_URL;
  const cache = useCache ? loadCache() : {};
  let inserted = 0;

  try {
    if (fs.existsSync(docRoot)) {
      const { chunkText } = require("./utils/chunker");

      for (const file of fs.readdirSync(docRoot)) {
        const fullPath = path.join(docRoot, file);
        if (!fs.statSync(fullPath).isFile()) continue;

        const content = fs.readFileSync(fullPath, "utf8");
        if (!content) continue;

        const chunks = chunkText(content, 500, 50);

        for (let i = 0; i < chunks.length; i++) {
          const text = sanitizeTextForEmbedding(chunks[i]);
          if (!text) continue;

          const hash = hashText(text);
          if (useCache && cache[hash]) continue;

          const vector = Array.from(await embed(text));

          await vectorStore.upsert({
            id: `${file}:${i}`,
            vector,
            content: text,
            source: `file:${file}`,
          });

          if (useCache) cache[hash] = true;
          inserted++;
        }
      }
    }
  } finally {
    if (useCache) saveCache(cache);
  }

  return { inserted, doc_root: docRoot };
}

/**
 * Embeds the query extracted from the action and returns the formatted
 * top matches from the vector store.
 *
 * @param {object} vectorStore - Store exposing `query(vector, { topK })`.
 * @param {Function} embed - `async (text) => vector`.
 * @param {string} action - Raw action string the query is extracted from.
 * @param {object} context - Resolver context (`topK`, default 5).
 * @returns {Promise<object>} `{ text: "", matches: [] }` for an empty query,
 *   otherwise whatever `formatResults` returns.
 */
async function searchVectors(vectorStore, embed, action, context) {
  const query = sanitizeTextForEmbedding(extractQuery(action));
  if (!query) return { text: "", matches: [] };

  const queryVector = Array.from(await embed(query));
  const results = await vectorStore.query(queryVector, {
    topK: context.topK || 5,
  });

  return formatResults(results, query);
}

/**
 * O-Lang resolver entry point for vector operations.
 *
 * Handles actions containing `vector.insert` (ingest the document root) or
 * `vector.search` (semantic query); `vector.insert` wins when both appear.
 * Any other action — or a non-string action — is ignored and resolves to
 * `undefined` without touching the vector backend.
 *
 * The vector store is created only for handled actions and is always closed
 * afterwards (when it exposes `close`), including on early returns and errors.
 *
 * @param {string} action - Workflow action text.
 * @param {object} [context={}] - Resolver context.
 * @returns {Promise<object|undefined>}
 */
async function resolver(action, context = {}) {
  if (typeof action !== "string") return;

  const wantsInsert = action.includes("vector.insert");
  const wantsSearch = action.includes("vector.search");
  if (!wantsInsert && !wantsSearch) return;

  const vectorStore = VectorRouter.create(context);
  try {
    const embed = await embedder({ dimension: 384 });

    return wantsInsert
      ? await ingestDocRoot(vectorStore, embed, context)
      : await searchVectors(vectorStore, embed, action, context);
  } finally {
    if (vectorStore.close) await vectorStore.close();
  }
}
|
|
248
125
|
|
|
126
|
+
resolver.resolverName = "vector";
|
|
127
|
+
resolver.version = "1.0.0";
|
|
128
|
+
|
|
249
129
|
module.exports = resolver;
|