@o-lang/semantic-doc-search 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
+ /**
+  * In-Memory Vector Store Adapter
+  * -----------------------------------
+  * Stores embeddings in RAM.
+  * Useful for local development and testing.
+  */
+
+ const cosineSimilarity = (a, b) => {
+   let dot = 0,
+     magA = 0,
+     magB = 0;
+
+   for (let i = 0; i < a.length; i++) {
+     dot += a[i] * b[i];
+     magA += a[i] * a[i];
+     magB += b[i] * b[i];
+   }
+
+   if (magA === 0 || magB === 0) return 0;
+   return dot / (Math.sqrt(magA) * Math.sqrt(magB));
+ };
+
+ module.exports = {
+   _store: {},
+
+   async init() {
+     this._store = {}; // reset
+     return true;
+   },
+
+   async upsert(id, vector, metadata) {
+     this._store[id] = {
+       id,
+       vector,
+       metadata,
+     };
+   },
+
+   async search(queryVector, limit = 5) {
+     const scored = [];
+
+     for (const key in this._store) {
+       const entry = this._store[key];
+       const score = cosineSimilarity(queryVector, entry.vector);
+
+       scored.push({
+         id: entry.id,
+         score,
+         text: entry.metadata.text,
+         source: entry.metadata.source,
+       });
+     }
+
+     return scored
+       .sort((a, b) => b.score - a.score)
+       .slice(0, limit);
+   },
+ };
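All three vector-store adapters in this package expose the same async init/upsert/search contract, so they can be swapped behind the doc-search pipeline. A minimal sketch of driving the in-memory adapter above (the require path is an assumption about the package layout):

const memoryStore = require("./vectorStores/memory.js"); // assumed path

(async () => {
  await memoryStore.init();
  // Toy 3-dimensional vectors; real vectors come from one of the embedding classes below.
  await memoryStore.upsert("doc-1", [0.1, 0.2, 0.3], { text: "Hello world", source: "intro.md" });
  const hits = await memoryStore.search([0.1, 0.2, 0.25], 3);
  console.log(hits); // [{ id, score, text, source }, ...] sorted by similarity
})();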
@@ -0,0 +1,80 @@
+ /**
+  * Pinecone Vector Store Adapter
+  *
+  * Requirements:
+  *   npm install @pinecone-database/pinecone
+  *
+  * Env:
+  *   PINECONE_API_KEY=your_key
+  *   PINECONE_INDEX=olang-docs
+  */
+
+ const { Pinecone } = require("@pinecone-database/pinecone");
+
+ module.exports = {
+   client: null,
+   index: null,
+
+   async init() {
+     const apiKey = process.env.PINECONE_API_KEY;
+     const indexName = process.env.PINECONE_INDEX || "olang-docs";
+
+     if (!apiKey) throw new Error("Missing PINECONE_API_KEY");
+
+     this.client = new Pinecone({ apiKey });
+
+     try {
+       // Create index if missing
+       const existingIndexes = await this.client.listIndexes();
+       if (!existingIndexes.indexes?.some((i) => i.name === indexName)) {
+         console.log(`Creating Pinecone index: ${indexName} ...`);
+
+         await this.client.createIndex({
+           name: indexName,
+           dimension: 1536, // matches OpenAI text-embedding-3-small
+           metric: "cosine",
+           // Recent SDK versions require a deployment spec; cloud/region here are assumptions, adjust as needed.
+           spec: { serverless: { cloud: "aws", region: "us-east-1" } },
+         });
+
+         console.log("Index created. It may take a minute to be ready.");
+       }
+     } catch (err) {
+       console.error("Error checking/creating Pinecone index:", err);
+     }
+
+     this.index = this.client.index(indexName);
+     console.log("Pinecone adapter initialized ✓");
+   },
+
+   /**
+    * Upsert a vector + metadata
+    */
+   async upsert(id, vector, metadata) {
+     await this.index.upsert([
+       {
+         id: id.toString(),
+         values: vector,
+         metadata,
+       },
+     ]);
+   },
+
+   /**
+    * Search for nearest vectors
+    */
+   async search(queryVector, limit = 5) {
+     const res = await this.index.query({
+       vector: queryVector,
+       topK: limit,
+       includeMetadata: true,
+     });
+
+     return res.matches.map((m) => ({
+       id: m.id,
+       score: m.score,
+       text: m.metadata?.text,
+       source: m.metadata?.source,
+     }));
+   },
+ };
@@ -0,0 +1,104 @@
+ /**
+  * Redis Vector Store Adapter
+  * Uses RediSearch vector similarity (HNSW or FLAT)
+  *
+  * Requirements:
+  *   - Redis Stack installed
+  *   - npm install redis
+  *
+  * Env:
+  *   - REDIS_URL=redis://localhost:6379
+  */
+
+ const { createClient } = require("redis");
+
+ const INDEX_NAME = "olang_docs_idx";
+ const VECTOR_DIM = 1536; // default for text-embedding-3-small (OpenAI)
+ const VECTOR_TYPE = "FLOAT32";
+
+ module.exports = {
+   client: null,
+
+   async init() {
+     this.client = createClient({
+       url: process.env.REDIS_URL || "redis://localhost:6379",
+     });
+
+     this.client.on("error", (err) =>
+       console.error("Redis connection error:", err)
+     );
+
+     await this.client.connect();
+
+     // Try creating index (ignore errors if exists)
+     try {
+       await this.client.ft.create(
+         INDEX_NAME,
+         {
+           vector: {
+             type: "VECTOR",
+             ALGORITHM: "HNSW",
+             DIM: VECTOR_DIM,
+             DISTANCE_METRIC: "COSINE",
+             TYPE: VECTOR_TYPE,
+           },
+           text: {
+             type: "TEXT",
+           },
+           source: {
+             type: "TEXT",
+           },
+         },
+         {
+           ON: "HASH",
+           PREFIX: "doc:",
+         }
+       );
+
+       console.log("Redis vector index created.");
+     } catch (err) {
+       if (err.message.includes("Index already exists")) {
+         console.log("Redis vector index already exists ✓");
+       } else {
+         console.error("Redis index creation error:", err);
+       }
+     }
+   },
+
+   async upsert(id, vector, metadata) {
+     const key = `doc:${id}`;
+
+     await this.client.hSet(key, {
+       vector: Buffer.from(new Float32Array(vector).buffer),
+       text: metadata.text,
+       source: metadata.source,
+     });
+   },
+
+   async search(queryVector, limit = 5) {
+     const queryBuffer = Buffer.from(new Float32Array(queryVector).buffer);
+
+     const results = await this.client.ft.search(
+       INDEX_NAME,
+       `*=>[KNN ${limit} @vector $BLOB AS score]`,
+       {
+         PARAMS: {
+           BLOB: queryBuffer,
+         },
+         SORTBY: {
+           BY: "score",
+           DIRECTION: "ASC",
+         },
+         RETURN: ["text", "source", "score"],
+         DIALECT: 2, // query dialect 2 is required for parameterized KNN queries
+       }
+     );
+
+     return results.documents.map((doc) => ({
+       id: doc.id,
+       score: 1 - Number(doc.value.score), // convert cosine distance -> similarity
+       text: doc.value.text,
+       source: doc.value.source,
+     }));
+   },
+ };
@@ -0,0 +1,50 @@
+ // src/embeddings/anthropic.js
+
+ import Anthropic from "@anthropic-ai/sdk";
+
+ /**
+  * AnthropicEmbedding
+  * Generates vector embeddings via an Anthropic embeddings endpoint.
+  *
+  * Note: at the time of writing, the official Anthropic SDK does not expose an
+  * `embeddings.create` method and "claude-embed-v1" is not a published model;
+  * Anthropic's docs point users to third-party providers for text embeddings,
+  * so treat this adapter as a placeholder until such an endpoint exists.
+  */
+ export class AnthropicEmbedding {
+   constructor(apiKey = process.env.ANTHROPIC_API_KEY) {
+     if (!apiKey) throw new Error("Missing ANTHROPIC_API_KEY");
+     this.client = new Anthropic({ apiKey });
+     this.model = process.env.ANTHROPIC_EMBED_MODEL || "claude-embed-v1";
+   }
+
+   /**
+    * Generate embedding for a single text string
+    */
+   async embed(text) {
+     if (!text || !text.trim()) return [];
+
+     const res = await this.client.embeddings.create({
+       model: this.model,
+       input: text,
+     });
+
+     return res.data[0].embedding;
+   }
+
+   /**
+    * Batch embedding for multiple texts
+    */
+   async embedBatch(textArray = []) {
+     if (!Array.isArray(textArray)) {
+       throw new Error("embedBatch expects an array");
+     }
+
+     const res = await this.client.embeddings.create({
+       model: this.model,
+       input: textArray,
+     });
+
+     return res.data.map(item => item.embedding);
+   }
+ }
@@ -0,0 +1,50 @@
+ // src/embeddings/groq.js
+
+ import Groq from "groq-sdk";
+
+ /**
+  * GroqEmbedding
+  * Generates vector embeddings through the Groq SDK.
+  *
+  * Default model: nomic-embed-text
+  * Note: embedding model availability on Groq is limited; verify that the
+  * configured model is actually served for your account, or override it via
+  * GROQ_EMBED_MODEL.
+  */
+ export class GroqEmbedding {
+   constructor(apiKey = process.env.GROQ_API_KEY) {
+     if (!apiKey) throw new Error("Missing GROQ_API_KEY");
+     this.client = new Groq({ apiKey });
+     this.model = process.env.GROQ_EMBED_MODEL || "nomic-embed-text";
+   }
+
+   /**
+    * Generate embedding for a single text string
+    */
+   async embed(text) {
+     if (!text || !text.trim()) return [];
+
+     const res = await this.client.embeddings.create({
+       model: this.model,
+       input: text,
+     });
+
+     return res.data[0].embedding;
+   }
+
+   /**
+    * Batch embedding for multiple texts
+    */
+   async embedBatch(textArray = []) {
+     if (!Array.isArray(textArray)) {
+       throw new Error("embedBatch expects an array");
+     }
+
+     const res = await this.client.embeddings.create({
+       model: this.model,
+       input: textArray,
+     });
+
+     return res.data.map(item => item.embedding);
+   }
+ }
@@ -0,0 +1,52 @@
+ import crypto from "crypto";
+
+ /**
+  * LocalEmbedding
+  * Generates deterministic "fake" embeddings for offline testing or fallback.
+  * Each string produces a consistent vector derived from its SHA-256 hash.
+  * Note: not semantic, just a placeholder for testing.
+  */
+ export class LocalEmbedding {
+   constructor(dim = 512) {
+     this.dim = dim;
+   }
+
+   /**
+    * Convert text → deterministic pseudo-vector
+    */
+   embed(text) {
+     if (!text || !text.trim()) return new Array(this.dim).fill(0);
+
+     const hash = crypto.createHash("sha256").update(text).digest();
+     const vector = [];
+
+     for (let i = 0; i < this.dim; i++) {
+       vector.push(hash[i % hash.length] / 255); // normalize to 0–1
+     }
+
+     return vector;
+   }
+
+   /**
+    * Batch embedding for multiple strings
+    */
+   embedBatch(textArray = []) {
+     if (!Array.isArray(textArray)) throw new Error("embedBatch expects an array");
+     return textArray.map(text => this.embed(text));
+   }
+ }
+
+ /**
+  * Convenience function for index.js
+  * Wraps local embedding generation in a simple retry loop (a placeholder, but keeps the async API compatible).
+  */
+ export async function createEmbeddingWithRetry(text, options = {}, retries = 1) {
+   const embedder = new LocalEmbedding();
+   for (let attempt = 1; attempt <= retries; attempt++) {
+     try {
+       return embedder.embed(text);
+     } catch (err) {
+       if (attempt === retries) throw err;
+     }
+   }
+ }
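A small sketch of what the local embedder guarantees: identical inputs always map to identical vectors (so cached scores are reproducible), while similarity between different strings is arbitrary rather than semantic. The import path is an assumption:

import { LocalEmbedding } from "./local.js"; // assumed relative path

const embedder = new LocalEmbedding(8); // tiny dimension, purely for illustration

const a = embedder.embed("How do I configure doc_root?");
const b = embedder.embed("How do I configure doc_root?");
const c = embedder.embed("Completely unrelated text");

console.log(a.length);                                // 8
console.log(JSON.stringify(a) === JSON.stringify(b)); // true: deterministic
console.log(JSON.stringify(a) === JSON.stringify(c)); // false: different hash, different vector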
@@ -0,0 +1,47 @@
+ // src/embeddings/openai.js
+
+ import OpenAI from "openai";
+
+ /**
+  * OpenAIEmbedding
+  * Generates vector embeddings using OpenAI models.
+  *
+  * Default model: text-embedding-3-large (OpenAI's most capable embedding model; use text-embedding-3-small for lower cost and 1536-dim vectors).
+  */
+ export class OpenAIEmbedding {
+   constructor(apiKey = process.env.OPENAI_API_KEY) {
+     if (!apiKey) throw new Error("Missing OPENAI_API_KEY");
+     this.client = new OpenAI({ apiKey });
+     this.model = process.env.OPENAI_EMBED_MODEL || "text-embedding-3-large";
+   }
+
+   /**
+    * Generate embedding for a single string
+    */
+   async embed(text) {
+     if (!text || !text.trim()) return [];
+
+     const res = await this.client.embeddings.create({
+       model: this.model,
+       input: text,
+     });
+
+     return res.data[0].embedding; // return the vector
+   }
+
+   /**
+    * Batch embedding for lists of texts
+    */
+   async embedBatch(textArray = []) {
+     if (!Array.isArray(textArray)) {
+       throw new Error("embedBatch expects an array");
+     }
+
+     const res = await this.client.embeddings.create({
+       model: this.model,
+       input: textArray,
+     });
+
+     return res.data.map(item => item.embedding);
+   }
+ }
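Note that the Pinecone and Redis adapters earlier in this diff declare 1536-dimensional indexes, which matches text-embedding-3-small but not text-embedding-3-large (3072 dimensions), so the chosen model and the index dimension must agree. A minimal end-to-end sketch pairing this embedder with the in-memory adapter (import paths are assumptions; Node ESM can import the CommonJS adapter directly):

import { OpenAIEmbedding } from "./openai.js";         // assumed path
import memoryStore from "../vectorStores/memory.js";   // assumed path; default export is module.exports

const embedder = new OpenAIEmbedding();
await memoryStore.init();

const chunks = ["O-Lang resolvers map actions to handlers.", "doc-search answers questions from ./docs."];
const vectors = await embedder.embedBatch(chunks);
for (let i = 0; i < chunks.length; i++) {
  await memoryStore.upsert(`chunk-${i}`, vectors[i], { text: chunks[i], source: "README.md" });
}

const [queryVector] = await embedder.embedBatch(["What does doc-search do?"]);
console.log(await memoryStore.search(queryVector, 2)); // top-2 chunks with similarity scores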
package/src/index.js ADDED
@@ -0,0 +1,187 @@
+ // doc-search.js
+ import fs from "fs";
+ import path from "path";
+ import { createLLM } from "./llm/router.js";
+ import { LocalEmbedding } from "./embeddings/local.js";
+ import { chunkText } from "./utils/chunker.js";
+ import { extractKeywords } from "./utils/extractText.js";
+ import { cosine } from "./utils/similarity.js";
+ import { highlightMatches } from "./utils/highlight.js";
+
+ const CACHE_PATH = path.join(process.cwd(), "embeddings.json");
+
+ function safeResolve(base, userPath) {
+   const basePath = path.resolve(base);
+   const resolved = path.resolve(base, userPath);
+   // Require the resolved path to be the base itself or inside it; a bare
+   // startsWith check would also accept sibling directories like "<base>-evil".
+   if (resolved !== basePath && !resolved.startsWith(basePath + path.sep)) {
+     throw new Error("Path traversal detected");
+   }
+   return resolved;
+ }
+
+ function loadCache() {
+   try {
+     if (fs.existsSync(CACHE_PATH)) {
+       return JSON.parse(fs.readFileSync(CACHE_PATH, "utf8")) || {};
+     }
+   } catch {}
+   return {};
+ }
+
+ function saveCache(cache) {
+   try {
+     fs.writeFileSync(CACHE_PATH, JSON.stringify(cache, null, 2));
+   } catch {}
+ }
+
+ async function performDocQA(query, context = {}) {
+   const { doc_root, stream = false } = context;
+   const options = context.options || {};
+   const CHUNK_SIZE = options.chunkSize || 1200;
+   const OVERLAP = Math.floor(CHUNK_SIZE * 0.2);
+   const SEMANTIC_WEIGHT = options.semanticWeight ?? 0.75;
+   const MIN_SCORE = options.minScore ?? 0.18;
+   const model = options.model || "default";
+
+   if (!query || typeof query !== "string") {
+     return { text: "Missing required input: query" };
+   }
+
+   const baseDir = doc_root
+     ? safeResolve(process.cwd(), doc_root)
+     : path.join(process.cwd(), "docs");
+
+   if (!fs.existsSync(baseDir)) {
+     return { text: `Document directory not found: ${baseDir}` };
+   }
+
+   const files = fs.readdirSync(baseDir).filter(f => f.endsWith(".txt") || f.endsWith(".md"));
+   if (!files.length) {
+     return { text: "No documents available." };
+   }
+
+   // Shortcut: a query that exactly matches a filename returns the whole file.
+   const qLower = query.toLowerCase().trim();
+   const exact = files.find(f => path.basename(f, path.extname(f)).toLowerCase() === qLower);
+   if (exact) {
+     return {
+       text: fs.readFileSync(path.join(baseDir, exact), "utf8"),
+       meta: { file: exact, method: "exact-filename" }
+     };
+   }
+
+   const cache = loadCache();
+   const docs = [];
+   const localEmbedder = new LocalEmbedding();
+
+   for (const file of files) {
+     const raw = fs.readFileSync(path.join(baseDir, file), "utf8");
+     const chunks = chunkText(raw, CHUNK_SIZE, OVERLAP);
+     const chunkObjs = [];
+
+     for (let i = 0; i < chunks.length; i++) {
+       const key = `${file}::chunk::${i}`;
+       let emb = cache[key];
+       if (!emb) {
+         try {
+           emb = localEmbedder.embed(chunks[i]);
+           cache[key] = emb;
+           saveCache(cache);
+         } catch {
+           emb = null;
+         }
+       }
+       chunkObjs.push({ index: i, text: chunks[i], emb });
+     }
+     docs.push({ file, raw, chunks: chunkObjs });
+   }
+
+   let queryEmb = null;
+   try {
+     queryEmb = localEmbedder.embed(query);
+   } catch {}
+
+   const keywords = extractKeywords(query);
+
+   // Hybrid scoring: blend semantic (cosine) and lexical (keyword hit rate) scores per chunk.
+   const fileScores = docs.map(doc => {
+     let bestChunk = null;
+     let bestHybrid = -Infinity;
+
+     for (const ch of doc.chunks) {
+       const semScore = queryEmb && ch.emb ? cosine(queryEmb, ch.emb) : 0;
+       const lexScore = keywords.length
+         ? keywords.reduce((acc, k) => acc + (ch.text.toLowerCase().includes(k) ? 1 : 0), 0) / keywords.length
+         : 0;
+       const hybrid = SEMANTIC_WEIGHT * semScore + (1 - SEMANTIC_WEIGHT) * lexScore;
+
+       if (hybrid > bestHybrid) {
+         bestHybrid = hybrid;
+         bestChunk = { ...ch, semScore, lexScore, hybrid };
+       }
+     }
+     return { file: doc.file, score: bestHybrid, bestChunk };
+   });
+
+   fileScores.sort((a, b) => b.score - a.score);
+   const best = fileScores[0];
+
+   // Fallback: if nothing clears the threshold, return a snippet around the first keyword hit.
+   if (!best || best.score < MIN_SCORE) {
+     for (const file of files) {
+       const text = fs.readFileSync(path.join(baseDir, file), "utf8").toLowerCase();
+       if (keywords.some(k => text.includes(k))) {
+         const snippetIndex = text.indexOf(keywords.find(k => text.includes(k)));
+         const start = Math.max(0, snippetIndex - 200);
+         const snippet = text.slice(start, Math.min(text.length, snippetIndex + 400));
+         return { text: snippet, meta: { file, method: "lexical-fallback" } };
+       }
+     }
+     return { text: `No document found matching: "${query}"` };
+   }
+
+   const snippet = highlightMatches(best.bestChunk.text, keywords);
+
+   if (options.provider && options.provider !== "local") {
+     const llm = createLLM({
+       provider: options.provider,
+       openaiApiKey: options.openaiApiKey,
+       groqApiKey: options.groqApiKey,
+       anthropicApiKey: options.anthropicApiKey,
+     });
+
+     if (stream && typeof context.onToken === "function") {
+       await llm.stream({ prompt: snippet, model, onToken: context.onToken });
+       return {
+         text: snippet,
+         meta: { file: best.file, chunkIndex: best.bestChunk.index, method: "hybrid-semantic-stream" }
+       };
+     } else {
+       const resp = await llm.generate({ prompt: snippet, model });
+       return {
+         text: resp.text,
+         meta: { file: best.file, chunkIndex: best.bestChunk.index, method: "hybrid-semantic" }
+       };
+     }
+   }
+
+   return {
+     text: snippet,
+     meta: { file: best.file, chunkIndex: best.bestChunk.index, method: "hybrid-semantic" }
+   };
+ }
+
+ // ✅ O-Lang Resolver Interface
+ export default async function docSearchResolver(action, context) {
+   if (action.startsWith('Ask doc-search ')) {
+     const match = action.match(/"(.*)"|'(.*)'/);
+     const query = match ? (match[1] || match[2]) : action.replace(/^Ask doc-search\s+/, '').trim();
+     return await performDocQA(query, context);
+   }
+   return undefined;
+ }
+
+ // ✅ Resolver name exposed to O-Lang scripts: 'doc-search'
+ docSearchResolver.resolverName = 'doc-search';
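A sketch of invoking the resolver directly, assuming the package's main entry exposes src/index.js and a ./docs directory with .md or .txt files exists in the working directory; the context fields mirror what performDocQA reads:

import docSearchResolver from "@o-lang/semantic-doc-search";

const result = await docSearchResolver('Ask doc-search "How do I configure chunk size?"', {
  doc_root: "docs",
  options: {
    chunkSize: 800,     // overrides the 1200-character default
    minScore: 0.2,
    provider: "local",  // skip the LLM step and return the highlighted snippet as-is
  },
});

console.log(result.text);
console.log(result.meta); // { file, chunkIndex, method } for non-fallback answers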
@@ -0,0 +1,36 @@
1
+ // src/llm/anthropic.js
2
+ import Anthropic from "@anthropic-ai/sdk";
3
+
4
+ /**
5
+ * Anthropic LLM Provider
6
+ */
7
+ export default class AnthropicProvider {
8
+ constructor({ apiKey }) {
9
+ if (!apiKey) console.warn("⚠️ WARNING: ANTHROPIC_API_KEY missing");
10
+ this.client = new Anthropic({ apiKey });
11
+ }
12
+
13
+ async generate({ model = "claude-v1", prompt }) {
14
+ const resp = await this.client.completions.create({
15
+ model,
16
+ prompt,
17
+ max_tokens_to_sample: 400,
18
+ temperature: 0.4,
19
+ });
20
+ return { text: resp.completion, raw: resp };
21
+ }
22
+
23
+ async stream({ model = "claude-v1", prompt, onToken }) {
24
+ const resp = await this.client.completions.stream({
25
+ model,
26
+ prompt,
27
+ max_tokens_to_sample: 400,
28
+ temperature: 0.4,
29
+ });
30
+
31
+ for await (const event of resp) {
32
+ if (event.type === "completion.delta" && event.delta) onToken(event.delta);
33
+ }
34
+ return { done: true };
35
+ }
36
+ }
@@ -0,0 +1,52 @@
+ // src/llm/groq.js
+ // Groq LLM Provider (groq-sdk@0.5.0)
+ // Multi-tenant safe, streaming-ready
+
+ import Groq from "groq-sdk";
+
+ export default class GroqProvider {
+   /**
+    * @param {Object} options
+    * @param {string} options.apiKey - User-provided Groq API key
+    */
+   constructor({ apiKey }) {
+     if (!apiKey) throw new Error("GROQ API key is required");
+     this.client = new Groq({ apiKey });
+     this.model = "llama-3.1-8b-instant"; // default model for generation
+   }
+
+   /**
+    * Generate completion from prompt
+    * @param {Object} param0
+    * @param {string} param0.prompt
+    * @returns {Promise<{text: string, raw: any}>}
+    */
+   async generate({ prompt }) {
+     const resp = await this.client.chat.completions.create({
+       messages: [{ role: "user", content: prompt }],
+       model: this.model,
+       temperature: 0.3,
+       max_tokens: 250,
+       top_p: 0.9
+     });
+
+     const text = resp.choices[0]?.message?.content?.trim() || "";
+     return { text, raw: resp };
+   }
+
+   /**
+    * Stream completion (simulated streaming)
+    * @param {Object} param0
+    * @param {string} param0.prompt
+    * @param {function} param0.onToken - callback for each chunk of text
+    * @returns {Promise<{done: boolean}>}
+    */
+   async stream({ prompt, onToken }) {
+     const { text } = await this.generate({ prompt });
+     if (onToken && typeof onToken === "function") {
+       // naive streaming: split by sentence or word
+       text.split(/(\s+)/).forEach(token => onToken(token));
+     }
+     return { done: true };
+   }
+ }
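A short sketch of using the Groq provider on its own (GROQ_API_KEY is assumed to be set; the import path is an assumption). Note that stream() replays the already-generated text token by token rather than streaming from the API:

import GroqProvider from "./groq.js"; // assumed relative path

const llm = new GroqProvider({ apiKey: process.env.GROQ_API_KEY });

const { text } = await llm.generate({ prompt: "Summarize what a vector store does in one sentence." });
console.log(text);

await llm.stream({
  prompt: "List two uses of text embeddings.",
  onToken: (token) => process.stdout.write(token),
});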