@o-lang/semantic-doc-search 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ // src/llm/openai.js
2
+ import OpenAI from "openai";
3
+
4
/**
 * OpenAI LLM Provider
 *
 * Thin wrapper around the OpenAI Chat Completions endpoint. Every call sends
 * the prompt as a single user message with a fixed temperature of 0.4; any
 * chat model id can be supplied through `model`.
 */
export default class OpenAIProvider {
  /**
   * @param {{ apiKey: string }} options - `apiKey` is handed to the OpenAI client as-is.
   */
  constructor({ apiKey }) {
    // Only warn: the client is still constructed so the failure mode is left to the SDK.
    if (!apiKey) console.warn("⚠️ WARNING: OPENAI_API_KEY missing");
    this.client = new OpenAI({ apiKey });
  }

  /**
   * Request a complete (non-streamed) answer.
   *
   * @param {Object} params
   * @param {string} [params.model="gpt-4.1-mini"] - chat model id
   * @param {string} params.prompt - sent as the only user message
   * @param {number} [params.maxTokens=400] - forwarded as `max_tokens`
   * @returns {Promise<{ text: string, raw: Object }>} `text` is the first choice's
   *   message content, or "" when the response carries no usable choice;
   *   `raw` is the untouched API response.
   */
  async generate({ model = "gpt-4.1-mini", prompt, maxTokens = 400 }) {
    const resp = await this.client.chat.completions.create({
      model,
      messages: [{ role: "user", content: prompt }],
      max_tokens: maxTokens,
      temperature: 0.4,
    });

    return {
      // A response without choices (or with null content) yields "" instead of a TypeError.
      text: resp?.choices?.[0]?.message?.content ?? "",
      raw: resp,
    };
  }

  /**
   * Stream an answer, invoking `onToken` for every non-empty content delta.
   *
   * @param {Object} params
   * @param {string} [params.model="gpt-4.1-mini"] - chat model id
   * @param {string} params.prompt - sent as the only user message
   * @param {(token: string) => void} params.onToken - called once per content delta
   * @param {number} [params.maxTokens] - optional cap forwarded as `max_tokens`;
   *   omitted from the request when not provided
   * @returns {Promise<{ done: true }>} resolves once the stream is exhausted
   * @throws {TypeError} when `onToken` is not a function
   */
  async stream({ model = "gpt-4.1-mini", prompt, onToken, maxTokens }) {
    // Fail before the request is made rather than on the first received token.
    if (typeof onToken !== "function") {
      throw new TypeError("stream() requires an onToken callback");
    }

    const request = {
      model,
      stream: true,
      messages: [{ role: "user", content: prompt }],
      temperature: 0.4,
    };
    if (maxTokens != null) request.max_tokens = maxTokens;

    const stream = await this.client.chat.completions.create(request);

    for await (const chunk of stream) {
      // Chunks without a content delta are skipped.
      const token = chunk.choices?.[0]?.delta?.content;
      if (token) onToken(token);
    }

    return { done: true };
  }
}
@@ -0,0 +1,22 @@
1
+ import OpenAIProvider from "./openai.js";
2
+ import GroqProvider from "./groq.js";
3
+ import AnthropicProvider from "./anthropic.js";
4
+
5
/**
 * Build the LLM provider selected by `provider` (matched case-insensitively).
 *
 * @param {Object} [config]
 * @param {string} [config.provider] - "openai", "groq" or "anthropic"
 * @param {string} [config.openaiApiKey]
 * @param {string} [config.groqApiKey]
 * @param {string} [config.anthropicApiKey]
 * @returns {Object} a new provider instance for the chosen backend
 * @throws {Error} when the provider is unknown or its API key is missing
 */
export function createLLM({ provider, openaiApiKey, groqApiKey, anthropicApiKey } = {}) {
  const requested = (provider || "").toLowerCase();

  // One row per backend: its key, the error raised when the key is absent,
  // and a lazy constructor so nothing is instantiated before validation.
  const backends = new Map([
    [
      "openai",
      {
        apiKey: openaiApiKey,
        missingKeyMessage: "Missing OpenAI API key",
        build: (apiKey) => new OpenAIProvider({ apiKey }),
      },
    ],
    [
      "groq",
      {
        apiKey: groqApiKey,
        missingKeyMessage: "Missing Groq API key",
        build: (apiKey) => new GroqProvider({ apiKey }),
      },
    ],
    [
      "anthropic",
      {
        apiKey: anthropicApiKey,
        missingKeyMessage: "Missing Anthropic API key",
        build: (apiKey) => new AnthropicProvider({ apiKey }),
      },
    ],
  ]);

  const backend = backends.get(requested);
  if (backend === undefined) {
    throw new Error(`Unsupported LLM provider: ${provider}`);
  }
  if (!backend.apiKey) {
    throw new Error(backend.missingKeyMessage);
  }
  return backend.build(backend.apiKey);
}
@@ -0,0 +1,39 @@
1
+ // src/rerank/cohere.js
2
+
3
+ import Cohere from "cohere-ai";
4
+
5
/**
 * CohereReranker
 * Uses the Cohere Rerank API to reorder candidate documents/snippets.
 */
export class CohereReranker {
  /**
   * @param {string} [apiKey=process.env.COHERE_API_KEY] - Cohere API key
   * @throws {Error} when no key is available
   */
  constructor(apiKey = process.env.COHERE_API_KEY) {
    if (!apiKey) throw new Error("Missing COHERE_API_KEY");
    Cohere.init(apiKey);
  }

  /**
   * Rerank an array of text candidates given a query.
   *
   * @param {string} query - user query
   * @param {string[]} candidates - array of candidate snippets
   * @param {string} [model="rerank-english-v2.0"] - rerank model id
   * @returns {Promise<Array<{ text: string, score: number, rank: number }>>}
   *   candidates in the order returned by the API, with a 1-based `rank`;
   *   an empty array when the query or candidate list is empty
   * @throws {Error} when the response has no recognisable ranking payload
   */
  async rerank(query, candidates = [], model = "rerank-english-v2.0") {
    if (!query || !Array.isArray(candidates) || candidates.length === 0) return [];

    const response = await Cohere.rerank({
      model,
      query,
      documents: candidates,
    });

    // NOTE(review): the init()-style SDK appears to wrap the payload as
    // { statusCode, body }, while newer clients return the payload directly.
    // Both are accepted here; confirm against the installed cohere-ai version.
    const payload = response?.body ?? response;

    // Shape documented for the Rerank endpoint: results[{ index, relevance_score }].
    if (Array.isArray(payload?.results)) {
      return payload.results.map((result, i) => ({
        text: candidates[result.index],
        score: result.relevance_score ?? result.relevanceScore,
        rank: i + 1,
      }));
    }

    // Previously assumed shape: `ranking` holds candidate indices and `scores`
    // is indexed by candidate. Kept so an existing integration keeps working.
    if (Array.isArray(payload?.ranking)) {
      return payload.ranking.map((idx, i) => ({
        text: candidates[idx],
        score: payload.scores?.[idx],
        rank: i + 1,
      }));
    }

    const detail = payload?.message ? `: ${payload.message}` : "";
    throw new Error(`Unexpected Cohere rerank response${detail}`);
  }
}
@@ -0,0 +1,50 @@
1
+ // src/rerank/groqRerank.js
2
+
3
+ import Groq from "groq-sdk";
4
+
5
/**
 * GroqReranker
 * Uses Groq-hosted chat models to rerank candidate documents/snippets for a query.
 */
export class GroqReranker {
  /**
   * @param {string} [apiKey=process.env.GROQ_API_KEY] - Groq API key
   * @throws {Error} when no key is available
   */
  constructor(apiKey = process.env.GROQ_API_KEY) {
    if (!apiKey) throw new Error("Missing GROQ_API_KEY");
    this.client = new Groq({ apiKey });
  }

  /**
   * Rerank an array of candidates.
   *
   * The model is asked for a JSON array of { text, score }. Its reply is parsed
   * leniently (code fences or surrounding prose are tolerated), scores are
   * clamped to [0, 1] and the result is sorted by score, highest first. When
   * the reply cannot be parsed, every candidate is returned with score 0.5.
   *
   * @param {string} query - user query
   * @param {string[]} candidates - candidate snippets
   * @param {string} [model="llama3-8b-8192"] - chat model used for ranking
   * @returns {Promise<Array<{ text: string, score: number }>>}
   */
  async rerank(query, candidates = [], model = "llama3-8b-8192") {
    if (!query || !Array.isArray(candidates) || candidates.length === 0) return [];

    const numbered = candidates.map((c, i) => `${i + 1}. ${c}`).join("\n");
    const prompt = [
      "You are an expert AI assistant for document search.",
      `Query: ${query}`,
      "Candidates:",
      numbered,
      "",
      "Rank the candidates from most relevant to least relevant, and assign a relevance score between 0 and 1 for each.",
      'Return only a JSON array, with no extra text: [{"text": "...", "score": 0.95}, ...]',
    ].join("\n");

    const response = await this.client.chat.completions.create({
      model,
      messages: [{ role: "user", content: prompt }],
      temperature: 0,
    });

    try {
      const content = response?.choices?.[0]?.message?.content || "[]";
      return normalizeRanking(parseRankingArray(content));
    } catch (err) {
      console.error("GroqReranker parse error:", err);
      // fallback: return candidates with uniform score
      return candidates.map((c) => ({ text: c, score: 0.5 }));
    }
  }
}

/**
 * Pull a JSON array out of a model reply. Tries the whole reply first, then
 * the span between the first "[" and the last "]" to survive code fences or prose.
 */
function parseRankingArray(content) {
  const trimmed = content.trim();

  let parsed;
  try {
    parsed = JSON.parse(trimmed);
  } catch {
    parsed = undefined;
  }

  if (!Array.isArray(parsed)) {
    const open = trimmed.indexOf("[");
    const close = trimmed.lastIndexOf("]");
    if (open === -1 || close <= open) {
      throw new SyntaxError("No JSON array found in model output");
    }
    parsed = JSON.parse(trimmed.slice(open, close + 1));
  }

  if (!Array.isArray(parsed)) {
    throw new SyntaxError("Model output is not a JSON array");
  }
  return parsed;
}

/**
 * Keep entries that carry a string `text`, coerce/clamp `score` into [0, 1]
 * and sort by score, highest first.
 */
function normalizeRanking(items) {
  const cleaned = [];
  for (const item of items) {
    if (item === null || typeof item !== "object" || typeof item.text !== "string") continue;
    const numeric = Number(item.score);
    const score = Number.isFinite(numeric) ? Math.min(1, Math.max(0, numeric)) : 0;
    cleaned.push({ ...item, score });
  }

  // A non-empty reply with nothing usable is treated as a parse failure.
  if (items.length > 0 && cleaned.length === 0) {
    throw new SyntaxError("Model output contains no usable ranking entries");
  }

  return cleaned.sort((a, b) => b.score - a.score);
}
@@ -0,0 +1,43 @@
1
+ // src/rerank/local.js
2
+
3
+ import { cosine } from "../utils/similarity.js";
4
+
5
/**
 * LocalReranker
 * Simple fallback reranker using cosine similarity when embeddings are
 * supplied, and keyword overlap otherwise.
 */
export class LocalReranker {
  constructor() {}

  /**
   * Rerank candidates given a query.
   *
   * Per candidate: when `options.queryEmb` and `options.embeddings[text]` are
   * both present the score is their cosine similarity; otherwise it is the
   * fraction of query words that also occur in the candidate.
   *
   * @param {string} query
   * @param {string[]} candidates
   * @param {Object} [options]
   * @param {number[]} [options.queryEmb] - embedding of the query
   * @param {Object<string, number[]>} [options.embeddings] - embeddings keyed by candidate text
   * @returns {Promise<Array<{ text: string, score: number }>>} sorted by score, descending
   */
  async rerank(query, candidates = [], options = {}) {
    if (!query || !Array.isArray(candidates) || candidates.length === 0) return [];

    const tokenize = (value) => String(value).toLowerCase().split(/\W+/).filter(Boolean);

    const queryVec = options?.queryEmb || null;
    const embeddings = options?.embeddings || null;
    // The query does not change per candidate, so tokenise it once.
    const queryWords = tokenize(query);

    const results = candidates.map((text) => {
      const candidateVec = queryVec && embeddings ? embeddings[text] : undefined;
      if (candidateVec) {
        return { text, score: cosine(queryVec, candidateVec) };
      }

      // A query with no word characters (e.g. "???") would give 0/0 = NaN
      // and make the sort order undefined, so score it as no overlap.
      if (queryWords.length === 0) {
        return { text, score: 0 };
      }

      // fallback: simple keyword overlap
      const textWords = new Set(tokenize(text));
      const matchCount = queryWords.filter((word) => textWords.has(word)).length;
      return { text, score: matchCount / queryWords.length };
    });

    // sort descending
    results.sort((a, b) => b.score - a.score);
    return results;
  }
}
@@ -0,0 +1,35 @@
1
+ // src/server/streamingHandler.js
2
+
3
+ import express from "express";
4
+ import { initSSE, sendSSE } from "../utils/sseStream.js";
5
+ import { llmRouter } from "../llm/router.js";
6
+
7
const router = express.Router();

/**
 * POST /stream
 * Body: { query: string, provider: "openai"|"groq"|"anthropic", options: {} }
 *
 * Streams LLM tokens to the client as Server-Sent Events. Each token is sent
 * as `{ token }`, followed by a final `{ done: true }` message.
 *
 * Errors raised before the SSE headers are written are returned as a JSON
 * 400/500 response. Errors raised afterwards are reported in-band as an
 * `{ error }` SSE message, because the status line can no longer change.
 * The response is always ended once streaming has started.
 */
router.post("/stream", async (req, res) => {
  try {
    // A missing body (e.g. no JSON body parser) is treated as a missing query.
    const { query, provider, options } = req.body ?? {};
    if (!query) return res.status(400).json({ error: "Missing query" });

    // Initialize SSE
    initSSE(res);

    // LLM router handles streaming token-by-token
    await llmRouter.streamQuery(query, provider, options, (token) => {
      sendSSE(res, { token });
    });

    // End stream
    sendSSE(res, { done: true }, true);
  } catch (err) {
    console.error("Streaming error:", err);

    if (!res.headersSent) {
      res.status(500).json({ error: err.message });
    } else if (!res.writableEnded) {
      // Headers already went out as text/event-stream; calling status()/json()
      // here would throw, so surface the failure as a final SSE message instead.
      sendSSE(res, { error: err.message }, true);
    }
  } finally {
    // Close the connection once streaming has started, unless something
    // above already ended the response.
    if (res.headersSent && !res.writableEnded) res.end();
  }
});

export { router as streamingHandler };
@@ -0,0 +1,32 @@
1
+ // src/templates/prompt_templates.js
2
+
3
/**
 * Prebuilt prompt templates for LLMs.
 *
 * Each entry is a function that takes a single object argument and returns
 * the finished prompt with leading/trailing whitespace trimmed.
 */
export const promptTemplates = {
  /** @param {{ text: string }} args */
  summarize: ({ text }) => `
Summarize the following text in a concise paragraph:

${text}
`.trim(),

  /** @param {{ question: string, context: string }} args */
  shortAnswer: ({ question, context }) => `
Answer the following question based on the provided context. If the answer is not in the context, say "Not found":

Question: ${question}
Context: ${context}
`.trim(),

  /** @param {{ text: string }} args */
  bulletPoints: ({ text }) => `
Convert the following text into a set of clear bullet points:

${text}
`.trim(),

  /**
   * @param {{ text: string, sources?: string[] | string }} args - `sources` may be
   *   omitted or null (treated as an empty list) or a single string (one source).
   */
  citeSources: ({ text, sources }) => {
    // Normalise so a missing or scalar `sources` does not throw on join().
    let sourceList = [];
    if (sources != null) {
      sourceList = Array.isArray(sources) ? sources : [sources];
    }

    return `
Summarize the text below and cite sources from the provided list:

Text: ${text}
Sources: ${sourceList.join(", ")}
`.trim();
  },
};
@@ -0,0 +1,27 @@
1
+ // src/utils/chunker.js
2
+
3
/**
 * chunkText
 * Split text into overlapping chunks.
 *
 * Consecutive chunks start `chunkSize - overlap` characters apart. Chunking
 * stops as soon as a chunk reaches the end of the text, so no trailing chunk
 * is emitted that is already fully contained in the previous one.
 *
 * @param {string} text - the full text
 * @param {number} chunkSize - number of characters per chunk (must be > 0)
 * @param {number} overlap - number of characters shared between consecutive
 *   chunks (must satisfy 0 <= overlap < chunkSize)
 * @returns {string[]} array of text chunks; empty for empty/whitespace-only text
 * @throws {RangeError} when chunkSize or overlap is out of range
 */
export function chunkText(text, chunkSize = 1000, overlap = 200) {
  if (!text || !text.trim()) return [];

  // Written as negated comparisons so NaN is rejected as well.
  if (!(chunkSize > 0)) {
    throw new RangeError(`chunkSize must be greater than 0, got ${chunkSize}`);
  }
  // overlap >= chunkSize would make the step zero or negative and loop forever.
  if (!(overlap >= 0 && overlap < chunkSize)) {
    throw new RangeError(`overlap must satisfy 0 <= overlap < chunkSize, got ${overlap}`);
  }

  const step = chunkSize - overlap;
  const chunks = [];

  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push(text.slice(start, end));

    // This chunk already covers the tail; a further chunk would only repeat it.
    if (end === text.length) break;
  }

  return chunks;
}
@@ -0,0 +1,59 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
+ import { JSDOM } from "jsdom";
4
+ import pdfParse from "pdf-parse";
5
+ import { readFile } from "fs/promises";
6
+
7
/**
 * extractTextFromFile
 * Extracts plain text from supported file types, chosen by the
 * (case-insensitive) file extension:
 * - .txt / .md   : file content as UTF-8
 * - .html / .htm : text content of the parsed document body
 * - .pdf         : text reported by pdf-parse
 * - .docx        : word/document.xml with every XML tag replaced by a space (minimal)
 *
 * @param {string} filePath - path of the file to read
 * @returns {Promise<string>} extracted text
 * @throws {Error} `Unsupported file type: <ext>` for any other extension;
 *   read/parse errors are propagated unchanged
 */
export async function extractTextFromFile(filePath) {
  const ext = path.extname(filePath).toLowerCase();

  switch (ext) {
    case ".txt":
    case ".md":
      // Non-blocking read: this function is async, so avoid stalling the event loop.
      return await readFile(filePath, "utf8");

    case ".html":
    case ".htm": {
      const html = await readFile(filePath, "utf8");
      const dom = new JSDOM(html);
      return dom.window.document.body.textContent || "";
    }

    case ".pdf": {
      const buffer = await readFile(filePath);
      const data = await pdfParse(buffer);
      return data.text || "";
    }

    case ".docx": {
      // Minimal extraction using ZIP (can be improved)
      const { default: StreamZip } = await import("node-stream-zip");
      const zip = new StreamZip.async({ file: filePath });
      try {
        const content = await zip.entryData("word/document.xml");
        return content.toString().replace(/<[^>]+>/g, " ");
      } finally {
        // Close the archive even when the entry is missing or unreadable.
        await zip.close();
      }
    }

    default:
      throw new Error(`Unsupported file type: ${ext}`);
  }
}
46
+
47
/**
 * extractKeywords
 * Basic keyword extraction for lexical matching.
 *
 * Lowercases the input and splits it on every run of characters that are not
 * letters, combining marks or digits. Matching is Unicode-aware, so accented
 * and non-Latin words are kept intact; underscores and punctuation act as
 * separators.
 *
 * @param {string} [text=""] - input text
 * @returns {string[]} lowercase words in order of appearance (duplicates kept)
 */
export function extractKeywords(text = "") {
  if (!text) return [];
  return String(text)
    .toLowerCase()
    // \W is ASCII-only and would cut "café" down to "caf"; use Unicode classes instead.
    .split(/[^\p{L}\p{M}\p{N}]+/u)
    .filter(Boolean);
}
@@ -0,0 +1,39 @@
1
+ // src/utils/fileLoader.js
2
+
3
+ import fs from "fs";
4
+ import path from "path";
5
+ import { extractTextFromFile } from "./extractText.js";
6
+
7
/**
 * loadDocuments
 * Recursively loads all supported files from a directory
 * and extracts text content.
 *
 * Files whose extraction fails are skipped with a console warning.
 * Extensions are compared case-insensitively on both sides.
 *
 * @param {string} dirPath - root directory
 * @param {string[]} exts - array of supported file extensions (with leading dot)
 * @returns {Promise<Array<{ filePath: string, text: string }>>} one entry per
 *   extracted file; empty when `dirPath` does not exist or is not a directory
 */
export async function loadDocuments(dirPath, exts = [".txt", ".md", ".pdf", ".html", ".docx"]) {
  // A path that exists but is a regular file would make readdirSync throw ENOTDIR.
  if (!fs.existsSync(dirPath) || !fs.statSync(dirPath).isDirectory()) return [];

  // File extensions are lowercased below, so lowercase the allow-list too;
  // otherwise an entry such as ".PDF" could never match.
  const allowed = new Set(exts.map((ext) => String(ext).toLowerCase()));

  const entries = fs.readdirSync(dirPath, { withFileTypes: true });
  const docs = [];

  for (const entry of entries) {
    const fullPath = path.join(dirPath, entry.name);

    if (entry.isDirectory()) {
      const subDocs = await loadDocuments(fullPath, exts);
      // Push one by one: spreading a very large array into push() can overflow the stack.
      for (const doc of subDocs) docs.push(doc);
      continue;
    }

    if (!allowed.has(path.extname(entry.name).toLowerCase())) continue;

    try {
      const text = await extractTextFromFile(fullPath);
      docs.push({ filePath: fullPath, text });
    } catch (err) {
      console.warn(`Failed to extract text from ${fullPath}:`, err.message);
    }
  }

  return docs;
}
@@ -0,0 +1,24 @@
1
+ // src/utils/highlight.js
2
+
3
/**
 * highlightMatches
 * Wraps all whole-word occurrences of the keywords in <mark> tags.
 *
 * The text is HTML-escaped segment by segment, so the only markup in the
 * result is the inserted <mark> elements. Matching is case-insensitive;
 * a keyword matches only when it is not directly adjacent to a word
 * character, and longer keywords take precedence over shorter ones.
 *
 * @param {string} text - original (unescaped) text
 * @param {string[]} keywords - keywords to highlight; empty or non-string
 *   entries are ignored
 * @returns {string} HTML-safe text with <mark> highlights; a falsy `text`
 *   is returned unchanged
 */
export function highlightMatches(text, keywords = []) {
  if (!text) return text;

  const escapeHtml = (value) =>
    value
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&#39;");

  const source = String(text);

  // Drop unusable keywords: an empty alternative would match at every word
  // boundary and scatter empty <mark></mark> pairs through the text.
  const usable = (Array.isArray(keywords) ? keywords : [])
    .filter((k) => typeof k === "string")
    .map((k) => k.trim())
    .filter((k) => k.length > 0);

  if (usable.length === 0) return escapeHtml(source);

  // Longest first, so "new york" wins over "new" at the same position.
  const alternatives = [...new Set(usable)]
    .sort((a, b) => b.length - a.length)
    // escape special regex chars in keywords
    .map((k) => k.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));

  // Lookarounds instead of \b so keywords that start or end with a
  // non-word character (e.g. "c++") can still match as whole words.
  const pattern = new RegExp(`(?<!\\w)(?:${alternatives.join("|")})(?!\\w)`, "gi");

  // Escape the gaps and the matches separately so the regex runs on the raw
  // text and never matches inside an HTML entity such as "&amp;".
  let highlighted = "";
  let cursor = 0;
  for (const match of source.matchAll(pattern)) {
    highlighted += escapeHtml(source.slice(cursor, match.index));
    highlighted += `<mark>${escapeHtml(match[0])}</mark>`;
    cursor = match.index + match[0].length;
  }
  highlighted += escapeHtml(source.slice(cursor));

  return highlighted;
}
@@ -0,0 +1,42 @@
1
+ // src/utils/similarity.js
2
+
3
/**
 * cosine
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param {number[]} a - first vector
 * @param {number[]} b - second vector
 * @returns {number} similarity score between -1 and 1; 0 when either vector
 *   is missing, the lengths differ, or either vector has zero magnitude
 */
export function cosine(a, b) {
  const comparable = Boolean(a) && Boolean(b) && a.length === b.length;
  if (!comparable) return 0;

  let dotProduct = 0;
  let squaredNormA = 0;
  let squaredNormB = 0;

  let idx = 0;
  while (idx < a.length) {
    const x = a[idx];
    const y = b[idx];
    dotProduct += x * y;
    squaredNormA += x * x;
    squaredNormB += y * y;
    idx += 1;
  }

  // A zero-length vector has no direction; avoid dividing by zero.
  const hasZeroVector = squaredNormA === 0 || squaredNormB === 0;
  if (hasZeroVector) return 0;

  return dotProduct / (Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB));
}
25
+
26
/**
 * euclidean
 * Straight-line (L2) distance between two equal-length numeric vectors.
 *
 * @param {number[]} a - first vector
 * @param {number[]} b - second vector
 * @returns {number} Euclidean distance; Infinity when either vector is
 *   missing or the lengths differ
 */
export function euclidean(a, b) {
  const comparable = Boolean(a) && Boolean(b) && a.length === b.length;
  if (!comparable) return Infinity;

  let squaredDistance = 0;
  let idx = 0;
  while (idx < a.length) {
    const delta = a[idx] - b[idx];
    squaredDistance += delta ** 2;
    idx += 1;
  }

  return Math.sqrt(squaredDistance);
}
@@ -0,0 +1,29 @@
1
+ // src/utils/sseStream.js
2
+
3
/**
 * sendSSE
 * Send one message over Server-Sent Events (SSE).
 *
 * The payload is JSON-serialised and written as a single `data:` frame.
 * When `end` is true an additional `end` event carrying `[DONE]` is written
 * and the response is closed, so nothing may be written afterwards.
 *
 * @param {import('http').ServerResponse} res
 * @param {*} data - any JSON-serialisable value
 * @param {boolean} [end=false] - whether this is the final message
 */
export function sendSSE(res, data, end = false) {
  res.write(`data: ${JSON.stringify(data)}\n\n`);

  if (end) {
    res.write("event: end\ndata: [DONE]\n\n");
    // Without this the connection stays open after the final message.
    res.end();
  }
}
15
+
16
/**
 * initSSE
 * Start a Server-Sent Events response: writes a 200 status with the
 * event-stream headers, followed by a single newline.
 *
 * @param {import('http').ServerResponse} res
 */
export function initSSE(res) {
  const sseHeaders = {
    "Content-Type": "text/event-stream",
    "Cache-Control": "no-cache",
    Connection: "keep-alive",
  };

  res.writeHead(200, sseHeaders);
  res.write("\n");
}