@o-lang/semantic-doc-search 1.0.41 → 1.0.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +40 -38
- package/src/embeddings/local.js +12 -2
- package/src/embeddings/local.js.bak +153 -0
- package/src/index.js +1 -1
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/config.json +0 -25
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/onnx/model_quantized.onnx +0 -0
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer.json +0 -30686
- package/.cache/embeddings/Xenova/all-MiniLM-L6-v2/tokenizer_config.json +0 -15
- package/.env.example +0 -0
- package/bin/cli.js +0 -58
- package/docs/sample1.txt +0 -1
- package/docs/vacation policy +0 -5
- package/embeddings.json +0 -3
- package/test-doc-search-batch.js +0 -36
- package/test-doc-search.js +0 -40
- package/test-embed.js +0 -10
- package/test-single-doc.js +0 -32
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"clean_up_tokenization_spaces": true,
|
|
3
|
-
"cls_token": "[CLS]",
|
|
4
|
-
"do_basic_tokenize": true,
|
|
5
|
-
"do_lower_case": true,
|
|
6
|
-
"mask_token": "[MASK]",
|
|
7
|
-
"model_max_length": 512,
|
|
8
|
-
"never_split": null,
|
|
9
|
-
"pad_token": "[PAD]",
|
|
10
|
-
"sep_token": "[SEP]",
|
|
11
|
-
"strip_accents": null,
|
|
12
|
-
"tokenize_chinese_chars": true,
|
|
13
|
-
"tokenizer_class": "BertTokenizer",
|
|
14
|
-
"unk_token": "[UNK]"
|
|
15
|
-
}
|
package/.env.example
DELETED
|
File without changes
|
package/bin/cli.js
DELETED
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
const yargs = require("yargs");
|
|
3
|
-
const { hideBin } = require("yargs/helpers");
|
|
4
|
-
const resolver = require("../src/index.js");
|
|
5
|
-
|
|
6
|
-
const argv = yargs(hideBin(process.argv))
|
|
7
|
-
.usage("Usage: $0 <query> [options]")
|
|
8
|
-
.option("provider", {
|
|
9
|
-
type: "string",
|
|
10
|
-
describe: "LLM provider: local | openai | groq | anthropic",
|
|
11
|
-
default: "local",
|
|
12
|
-
})
|
|
13
|
-
.option("openai-key", { type: "string", describe: "OpenAI API key" })
|
|
14
|
-
.option("groq-key", { type: "string", describe: "Groq API key" })
|
|
15
|
-
.option("anthropic-key", { type: "string", describe: "Anthropic API key" })
|
|
16
|
-
.option("model", { type: "string", describe: "LLM model to use" })
|
|
17
|
-
.option("doc-root", { type: "string", describe: "Directory of documents" })
|
|
18
|
-
.option("stream", { type: "boolean", describe: "Stream output if supported", default: false })
|
|
19
|
-
.option("vector-backend", {
|
|
20
|
-
type: "string",
|
|
21
|
-
describe: "Vector backend to use: pgvector | memory | pinecone | redis",
|
|
22
|
-
default: "pgvector"
|
|
23
|
-
})
|
|
24
|
-
.demandCommand(1, "Please provide a query")
|
|
25
|
-
.help()
|
|
26
|
-
.argv;
|
|
27
|
-
|
|
28
|
-
// Build context for resolver
|
|
29
|
-
const context = {
|
|
30
|
-
query: argv._.join(" "),
|
|
31
|
-
doc_root: argv.docRoot,
|
|
32
|
-
stream: argv.stream,
|
|
33
|
-
vectorBackend: argv["vector-backend"], // NEW
|
|
34
|
-
options: {
|
|
35
|
-
provider: argv.provider,
|
|
36
|
-
openaiApiKey: argv["openai-key"] || process.env.OPENAI_API_KEY,
|
|
37
|
-
groqApiKey: argv["groq-key"] || process.env.GROQ_API_KEY,
|
|
38
|
-
anthropicApiKey: argv["anthropic-key"] || process.env.ANTHROPIC_API_KEY,
|
|
39
|
-
model: argv.model,
|
|
40
|
-
},
|
|
41
|
-
onToken: token => {
|
|
42
|
-
if (argv.stream) process.stdout.write(token);
|
|
43
|
-
},
|
|
44
|
-
};
|
|
45
|
-
|
|
46
|
-
(async () => {
|
|
47
|
-
try {
|
|
48
|
-
// Pass vectorBackend in the config
|
|
49
|
-
const result = await resolver("search", context);
|
|
50
|
-
if (!argv.stream) {
|
|
51
|
-
console.log("\n\n✅ Result:\n");
|
|
52
|
-
console.log(result.text, "\n");
|
|
53
|
-
console.log("Meta:", result.meta);
|
|
54
|
-
}
|
|
55
|
-
} catch (err) {
|
|
56
|
-
console.error("\n❌ Error running search:", err);
|
|
57
|
-
}
|
|
58
|
-
})();
|
package/docs/sample1.txt
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
Semantic search is a technique that improves search results by understanding the meaning of words in a query rather than relying solely on keyword matching.
|
package/docs/vacation policy
DELETED
package/embeddings.json
DELETED
package/test-doc-search-batch.js
DELETED
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
// test-doc-search-batch.js
|
|
2
|
-
const docSearchResolver = require("./src/index.js");
|
|
3
|
-
|
|
4
|
-
(async () => {
|
|
5
|
-
try {
|
|
6
|
-
const context = {
|
|
7
|
-
doc_root: "./docs", // folder with .txt or .md files
|
|
8
|
-
vectorBackend: "memory", // can also switch to "pgvector" if configured
|
|
9
|
-
};
|
|
10
|
-
|
|
11
|
-
const queries = [
|
|
12
|
-
"Semantic search",
|
|
13
|
-
"Vacation policy",
|
|
14
|
-
"Employee onboarding",
|
|
15
|
-
"Leave requests",
|
|
16
|
-
"HR compliance"
|
|
17
|
-
];
|
|
18
|
-
|
|
19
|
-
console.log("🔎 Running batch doc-search...");
|
|
20
|
-
|
|
21
|
-
for (const query of queries) {
|
|
22
|
-
const action = `Ask doc-search "${query}"`;
|
|
23
|
-
const result = await docSearchResolver(action, context);
|
|
24
|
-
|
|
25
|
-
console.log("\n====================================");
|
|
26
|
-
console.log(`Query: "${query}"`);
|
|
27
|
-
console.log("Text:\n", result.text || "(No matches found)");
|
|
28
|
-
console.log("Meta:", result.meta);
|
|
29
|
-
console.log("====================================");
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
console.log("\n✅ Batch search complete!");
|
|
33
|
-
} catch (err) {
|
|
34
|
-
console.error("❌ Batch doc-search test failed:", err);
|
|
35
|
-
}
|
|
36
|
-
})();
|
package/test-doc-search.js
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
// test-doc-search.js
|
|
2
|
-
const resolver = require('./src/resolver');
|
|
3
|
-
|
|
4
|
-
async function testDocSearch() {
|
|
5
|
-
console.log('🧪 Testing doc-search resolver...');
|
|
6
|
-
|
|
7
|
-
const fs = require('fs');
|
|
8
|
-
const path = require('path');
|
|
9
|
-
const testDir = './test-docs';
|
|
10
|
-
if (!fs.existsSync(testDir)) fs.mkdirSync(testDir);
|
|
11
|
-
fs.writeFileSync(path.join(testDir, 'policy.md'),
|
|
12
|
-
'# Vacation Policy\nFull-time employees get 20 days PTO.\nNew hires accrue immediately.'
|
|
13
|
-
);
|
|
14
|
-
|
|
15
|
-
try {
|
|
16
|
-
const result = await resolver('Ask doc-search "vacation policy"', {
|
|
17
|
-
doc_root: './test-docs',
|
|
18
|
-
topK: 3,
|
|
19
|
-
minScore: 0
|
|
20
|
-
});
|
|
21
|
-
|
|
22
|
-
console.log('✅ SUCCESS!');
|
|
23
|
-
|
|
24
|
-
if (result?.matches?.length > 0) {
|
|
25
|
-
const fullText = result.matches.map(m => m.content).join('\n\n');
|
|
26
|
-
console.log('📄 Retrieved text length:', fullText.length);
|
|
27
|
-
console.log('🔍 First 100 chars:', fullText.substring(0, 100));
|
|
28
|
-
} else {
|
|
29
|
-
console.log('⚠️ No matches found (check minScore or embedding quality)');
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
fs.rmSync(testDir, { recursive: true, force: true });
|
|
33
|
-
} catch (err) {
|
|
34
|
-
console.error('❌ FAILED:', err);
|
|
35
|
-
if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true });
|
|
36
|
-
process.exit(1);
|
|
37
|
-
}
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
testDocSearch();
|
package/test-embed.js
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
// test-embed.js
|
|
2
|
-
const embedder = require("./src/embeddings/local");
|
|
3
|
-
|
|
4
|
-
async function test() {
|
|
5
|
-
console.log("Model dimension:", embedder.getDimension());
|
|
6
|
-
const vector = await embedder.embed("hello world");
|
|
7
|
-
console.log("Embedding result:", vector?.length, vector);
|
|
8
|
-
}
|
|
9
|
-
|
|
10
|
-
test().catch(console.error);
|
package/test-single-doc.js
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
// test-single-doc.js
|
|
2
|
-
const path = require("path");
|
|
3
|
-
const { LocalEmbedding } = require("./embeddings/local.js");
|
|
4
|
-
const { chunkText } = require("./utils/chunker.js");
|
|
5
|
-
const VectorRouter = require("./adapters/vectorRouter");
|
|
6
|
-
|
|
7
|
-
(async () => {
|
|
8
|
-
const embedder = new LocalEmbedding();
|
|
9
|
-
const docPath = path.join(process.cwd(), "docs", "sample1.txt");
|
|
10
|
-
const fs = require("fs");
|
|
11
|
-
const content = fs.readFileSync(docPath, "utf8");
|
|
12
|
-
|
|
13
|
-
const chunks = chunkText(content, 500);
|
|
14
|
-
console.log(`Document split into ${chunks.length} chunk(s)`);
|
|
15
|
-
|
|
16
|
-
const vectorStore = VectorRouter.create({ backend: "memory", dimension: embedder.getDimension() });
|
|
17
|
-
|
|
18
|
-
for (let i = 0; i < chunks.length; i++) {
|
|
19
|
-
const vector = await embedder.embed(chunks[i]);
|
|
20
|
-
console.log(`Chunk ${i} embedding first 5 dims:`, vector.slice(0, 5));
|
|
21
|
-
|
|
22
|
-
await vectorStore.upsert({ id: `sample1:${i}`, vector, content: chunks[i], source: "file:sample1.txt" });
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
const query = "Semantic search";
|
|
26
|
-
const queryVector = await embedder.embed(query);
|
|
27
|
-
|
|
28
|
-
const results = await vectorStore.query(queryVector, { topK: 5 });
|
|
29
|
-
results.forEach((r, idx) => {
|
|
30
|
-
console.log(`Result ${idx}: score=${r.score.toFixed(3)} content=${r.content.substring(0, 50)}...`);
|
|
31
|
-
});
|
|
32
|
-
})();
|