npm - @o-lang/semantic-doc-search - Versions diffs - 1.0.0 - Mend

@o-lang/semantic-doc-search 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/.env.example +0 -0
package/README.md +24 -0
package/bin/cli.js +91 -0
package/docs/sample1.txt +1 -0
package/embeddings.json +516 -0
package/package.json +35 -0
package/src/adapters/inMemoryAdapter.js +58 -0
package/src/adapters/pineconeAdapter.js +78 -0
package/src/adapters/redisAdapter.js +103 -0
package/src/embeddings/anthropic.js +45 -0
package/src/embeddings/groq.js +47 -0
package/src/embeddings/local.js +52 -0
package/src/embeddings/openai.js +47 -0
package/src/index.js +181 -0
package/src/llm/anthropic.js +36 -0
package/src/llm/groq.js +52 -0
package/src/llm/openai.js +43 -0
package/src/llm/router.js +22 -0
package/src/rerank/cohere.js +39 -0
package/src/rerank/groqRerank.js +50 -0
package/src/rerank/local.js +43 -0
package/src/server/streamingHandler.js +35 -0
package/src/templates/prompt_templates.js +32 -0
package/src/utils/chunker.js +27 -0
package/src/utils/extractText.js +59 -0
package/src/utils/fileLoader.js +39 -0
package/src/utils/highlight.js +24 -0
package/src/utils/similarity.js +42 -0
package/src/utils/sseStream.js +29 -0

package/src/llm/openai.js ADDED Viewed

@@ -0,0 +1,43 @@
+// src/llm/openai.js
+import OpenAI from "openai";
+/**
+ * OpenAI LLM Provider
+ * Supports GPT-4, GPT-4o-mini, GPT-4o, and any Chat model
+ */
+export default class OpenAIProvider {
+  constructor({ apiKey }) {
+    if (!apiKey) console.warn("⚠️ WARNING: OPENAI_API_KEY missing");
+    this.client = new OpenAI({ apiKey });
+  }
+  async generate({ model = "gpt-4.1-mini", prompt, maxTokens = 400 }) {
+    const resp = await this.client.chat.completions.create({
+      model,
+      messages: [{ role: "user", content: prompt }],
+      max_tokens: maxTokens,
+      temperature: 0.4,
+    });
+    return {
+      text: resp.choices[0].message.content,
+      raw: resp,
+    };
+  }
+  async stream({ model = "gpt-4.1-mini", prompt, onToken }) {
+    const stream = await this.client.chat.completions.create({
+      model,
+      stream: true,
+      messages: [{ role: "user", content: prompt }],
+      temperature: 0.4,
+    });
+    for await (const chunk of stream) {
+      const token = chunk.choices?.[0]?.delta?.content;
+      if (token) onToken(token);
+    }
+    return { done: true };
+  }
+}

package/src/llm/router.js ADDED Viewed

@@ -0,0 +1,22 @@
+import OpenAIProvider from "./openai.js";
+import GroqProvider from "./groq.js";
+import AnthropicProvider from "./anthropic.js";
+export function createLLM({ provider, openaiApiKey, groqApiKey, anthropicApiKey } = {}) {
+  switch ((provider || "").toLowerCase()) {
+    case "openai":
+      if (!openaiApiKey) throw new Error("Missing OpenAI API key");
+      return new OpenAIProvider({ apiKey: openaiApiKey });
+    case "groq":
+      if (!groqApiKey) throw new Error("Missing Groq API key");
+      return new GroqProvider({ apiKey: groqApiKey });
+    case "anthropic":
+      if (!anthropicApiKey) throw new Error("Missing Anthropic API key");
+      return new AnthropicProvider({ apiKey: anthropicApiKey });
+    default:
+      throw new Error(`Unsupported LLM provider: ${provider}`);
+  }
+}

package/src/rerank/cohere.js ADDED Viewed

@@ -0,0 +1,39 @@
+// src/rerank/cohere.js
+import Cohere from "cohere-ai";
+/**
+ * CohereReranker
+ * Uses Cohere Rerank API to reorder candidate documents/snippets
+ */
+export class CohereReranker {
+  constructor(apiKey = process.env.COHERE_API_KEY) {
+    if (!apiKey) throw new Error("Missing COHERE_API_KEY");
+    Cohere.init(apiKey);
+  }
+  /**
+   * Rerank an array of text candidates given a query
+   * @param {string} query - user query
+   * @param {string[]} candidates - array of candidate snippets
+   * @returns array of { text, score } sorted by relevance descending
+   */
+  async rerank(query, candidates = []) {
+    if (!query || !Array.isArray(candidates) || candidates.length === 0) return [];
+    const response = await Cohere.rerank({
+      model: "rerank-english-v2.0",
+      query,
+      documents: candidates,
+    });
+    // response.ranking is an array of indices
+    const ranked = response.ranking.map((idx, i) => ({
+      text: candidates[idx],
+      score: response.scores[idx],
+      rank: i + 1,
+    }));
+    return ranked;
+  }
+}

package/src/rerank/groqRerank.js ADDED Viewed

@@ -0,0 +1,50 @@
+// src/rerank/groqRerank.js
+import Groq from "groq-sdk";
+/**
+ * GroqReranker
+ * Uses Groq LLMs to rerank candidate documents/snippets given a query
+ */
+export class GroqReranker {
+  constructor(apiKey = process.env.GROQ_API_KEY) {
+    if (!apiKey) throw new Error("Missing GROQ_API_KEY");
+    this.client = new Groq({ apiKey });
+  }
+  /**
+   * Rerank an array of candidates
+   * @param {string} query - user query
+   * @param {string[]} candidates - candidate snippets
+   * @param {string} model - optional rerank model
+   * @returns array of { text, score } sorted by relevance descending
+   */
+  async rerank(query, candidates = [], model = "llama3-8b-8192") {
+    if (!query || !Array.isArray(candidates) || candidates.length === 0) return [];
+    const prompt = `
+You are an expert AI assistant for document search.
+Query: ${query}
+Candidates: ${candidates.map((c, i) => `${i + 1}. ${c}`).join("\n")}
+Rank the candidates from most relevant to least relevant, and assign a relevance score between 0 and 1 for each.
+Return JSON array: [{"text": "...", "score": 0.95}, ...]
+    `.trim();
+    const response = await this.client.chat.completions.create({
+      model,
+      messages: [{ role: "user", content: prompt }],
+      temperature: 0,
+    });
+    try {
+      const content = response.choices?.[0]?.message?.content || "[]";
+      const ranked = JSON.parse(content);
+      return ranked;
+    } catch (err) {
+      console.error("GroqReranker parse error:", err);
+      // fallback: return candidates with uniform score
+      return candidates.map(c => ({ text: c, score: 0.5 }));
+    }
+  }
+}

package/src/rerank/local.js ADDED Viewed

@@ -0,0 +1,43 @@
+// src/rerank/local.js
+import { cosine } from "../utils/similarity.js";
+/**
+ * LocalReranker
+ * Simple fallback reranker using keyword overlap + cosine similarity
+ */
+export class LocalReranker {
+  constructor() {}
+  /**
+   * Rerank candidates given a query
+   * @param {string} query
+   * @param {string[]} candidates
+   * @param {Object} options - optional embeddings
+   * @returns array of { text, score } sorted descending
+   */
+  async rerank(query, candidates = [], options = {}) {
+    if (!query || !Array.isArray(candidates) || candidates.length === 0) return [];
+    const queryVec = options.queryEmb || null;
+    const results = candidates.map((text) => {
+      let score = 0;
+      if (queryVec && options.embeddings && options.embeddings[text]) {
+        score = cosine(queryVec, options.embeddings[text]);
+      } else {
+        // fallback: simple keyword overlap
+        const queryWords = query.toLowerCase().split(/\W+/).filter(Boolean);
+        const textWords = text.toLowerCase().split(/\W+/).filter(Boolean);
+        const matches = queryWords.filter(w => textWords.includes(w));
+        score = matches.length / queryWords.length;
+      }
+      return { text, score };
+    });
+    // sort descending
+    results.sort((a, b) => b.score - a.score);
+    return results;
+  }
+}

package/src/server/streamingHandler.js ADDED Viewed

@@ -0,0 +1,35 @@
+// src/server/streamingHandler.js
+import express from "express";
+import { initSSE, sendSSE } from "../utils/sseStream.js";
+import { llmRouter } from "../llm/router.js";
+const router = express.Router();
+/**
+ * POST /stream
+ * Body: { query: string, provider: "openai"|"groq"|"anthropic", options: {} }
+ */
+router.post("/stream", async (req, res) => {
+  try {
+    const { query, provider, options } = req.body;
+    if (!query) return res.status(400).json({ error: "Missing query" });
+    // Initialize SSE
+    initSSE(res);
+    // LLM router handles streaming token-by-token
+    await llmRouter.streamQuery(query, provider, options, (token) => {
+      sendSSE(res, { token });
+    });
+    // End stream
+    sendSSE(res, { done: true }, true);
+  } catch (err) {
+    console.error("Streaming error:", err);
+    res.status(500).json({ error: err.message });
+  }
+});
+export { router as streamingHandler };

package/src/templates/prompt_templates.js ADDED Viewed

@@ -0,0 +1,32 @@
+// src/templates/prompt_templates.js
+/**
+ * Prebuilt prompt templates for LLMs
+ */
+export const promptTemplates = {
+  summarize: ({ text }) => `
+Summarize the following text in a concise paragraph:
+${text}
+`.trim(),
+  shortAnswer: ({ question, context }) => `
+Answer the following question based on the provided context. If the answer is not in the context, say "Not found":
+Question: ${question}
+Context: ${context}
+`.trim(),
+  bulletPoints: ({ text }) => `
+Convert the following text into a set of clear bullet points:
+${text}
+`.trim(),
+  citeSources: ({ text, sources }) => `
+Summarize the text below and cite sources from the provided list:
+Text: ${text}
+Sources: ${sources.join(", ")}
+`.trim(),
+};

package/src/utils/chunker.js ADDED Viewed

@@ -0,0 +1,27 @@
+// src/utils/chunker.js
+/**
+ * chunkText
+ * Split text into overlapping chunks
+ *
+ * @param {string} text - the full text
+ * @param {number} chunkSize - number of characters per chunk
+ * @param {number} overlap - number of characters overlap between chunks
+ * @returns string[] - array of text chunks
+ */
+export function chunkText(text, chunkSize = 1000, overlap = 200) {
+  if (!text || !text.trim()) return [];
+  const chunks = [];
+  let start = 0;
+  while (start < text.length) {
+    const end = Math.min(start + chunkSize, text.length);
+    const chunk = text.slice(start, end);
+    chunks.push(chunk);
+    start += chunkSize - overlap; // move forward with overlap
+  }
+  return chunks;
+}

package/src/utils/extractText.js ADDED Viewed

@@ -0,0 +1,59 @@
+import fs from "fs";
+import path from "path";
+import { JSDOM } from "jsdom";
+import pdfParse from "pdf-parse";
+import { readFile } from "fs/promises";
+/**
+ * extractTextFromFile
+ * Extracts plain text from supported file types:
+ * - .txt
+ * - .md
+ * - .html/.htm
+ * - .pdf
+ * - .docx (minimal)
+ */
+export async function extractTextFromFile(filePath) {
+  const ext = path.extname(filePath).toLowerCase();
+  if (ext === ".txt" || ext === ".md") {
+    return fs.readFileSync(filePath, "utf8");
+  }
+  if (ext === ".html" || ext === ".htm") {
+    const html = fs.readFileSync(filePath, "utf8");
+    const dom = new JSDOM(html);
+    return dom.window.document.body.textContent || "";
+  }
+  if (ext === ".pdf") {
+    const buffer = fs.readFileSync(filePath);
+    const data = await pdfParse(buffer);
+    return data.text || "";
+  }
+  if (ext === ".docx") {
+    // Minimal extraction using ZIP (can be improved)
+    const { default: StreamZip } = await import("node-stream-zip");
+    const zip = new StreamZip.async({ file: filePath });
+    const content = await zip.entryData("word/document.xml");
+    await zip.close();
+    return content.toString().replace(/<[^>]+>/g, " ");
+  }
+  throw new Error(`Unsupported file type: ${ext}`);
+}
+/**
+ * extractKeywords
+ * Basic keyword extraction for lexical matching
+ * Returns array of lowercase words, stripped of punctuation
+ */
+export function extractKeywords(text = "") {
+  if (!text) return [];
+  return text
+    .toLowerCase()
+    .replace(/[\W_]+/g, " ") // remove non-alphanumerics
+    .split(/\s+/)
+    .filter(Boolean);
+}

package/src/utils/fileLoader.js ADDED Viewed

@@ -0,0 +1,39 @@
+// src/utils/fileLoader.js
+import fs from "fs";
+import path from "path";
+import { extractTextFromFile } from "./extractText.js";
+/**
+ * loadDocuments
+ * Recursively loads all supported files from a directory
+ * and extracts text content.
+ *
+ * @param {string} dirPath - root directory
+ * @param {string[]} exts - array of supported file extensions
+ * @returns {Promise<Array<{ filePath: string, text: string }>>}
+ */
+export async function loadDocuments(dirPath, exts = [".txt", ".md", ".pdf", ".html", ".docx"]) {
+  if (!fs.existsSync(dirPath)) return [];
+  const files = fs.readdirSync(dirPath, { withFileTypes: true });
+  const docs = [];
+  for (const file of files) {
+    const fullPath = path.join(dirPath, file.name);
+    if (file.isDirectory()) {
+      const subDocs = await loadDocuments(fullPath, exts);
+      docs.push(...subDocs);
+    } else if (exts.includes(path.extname(file.name).toLowerCase())) {
+      try {
+        const text = await extractTextFromFile(fullPath);
+        docs.push({ filePath: fullPath, text });
+      } catch (err) {
+        console.warn(`Failed to extract text from ${fullPath}:`, err.message);
+      }
+    }
+  }
+  return docs;
+}

package/src/utils/highlight.js ADDED Viewed

@@ -0,0 +1,24 @@
+// src/utils/highlight.js
+/**
+ * highlightMatches
+ * Wraps all occurrences of keywords in <mark> tags
+ *
+ * @param {string} text - original text
+ * @param {string[]} keywords - array of keywords to highlight
+ * @returns string - HTML-safe text with <mark> highlights
+ */
+export function highlightMatches(text, keywords = []) {
+  if (!text || !keywords || keywords.length === 0) return text;
+  let highlighted = text;
+  // escape special regex chars in keywords
+  const escapedKeywords = keywords.map(k => k.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
+  const pattern = new RegExp(`\\b(${escapedKeywords.join("|")})\\b`, "gi");
+  highlighted = highlighted.replace(pattern, "<mark>$1</mark>");
+  return highlighted;
+}

package/src/utils/similarity.js ADDED Viewed

@@ -0,0 +1,42 @@
+// src/utils/similarity.js
+/**
+ * cosine
+ * Compute cosine similarity between two vectors
+ * @param {number[]} a
+ * @param {number[]} b
+ * @returns {number} similarity score between -1 and 1
+ */
+export function cosine(a, b) {
+  if (!a || !b || a.length !== b.length) return 0;
+  let dot = 0, magA = 0, magB = 0;
+  for (let i = 0; i < a.length; i++) {
+    dot += a[i] * b[i];
+    magA += a[i] * a[i];
+    magB += b[i] * b[i];
+  }
+  if (magA === 0 || magB === 0) return 0;
+  return dot / (Math.sqrt(magA) * Math.sqrt(magB));
+}
+/**
+ * euclidean
+ * Compute Euclidean distance between two vectors
+ * @param {number[]} a
+ * @param {number[]} b
+ * @returns {number} Euclidean distance
+ */
+export function euclidean(a, b) {
+  if (!a || !b || a.length !== b.length) return Infinity;
+  let sum = 0;
+  for (let i = 0; i < a.length; i++) {
+    sum += (a[i] - b[i]) ** 2;
+  }
+  return Math.sqrt(sum);
+}

package/src/utils/sseStream.js ADDED Viewed

@@ -0,0 +1,29 @@
+// src/utils/sseStream.js
+/**
+ * sendSSE
+ * Send streaming data over Server-Sent Events (SSE)
+ *
+ * @param {import('http').ServerResponse} res
+ * @param {string} data
+ * @param {boolean} [end=false] - whether this is the final message
+ */
+export function sendSSE(res, data, end = false) {
+  res.write(`data: ${JSON.stringify(data)}\n\n`);
+  if (end) res.write("event: end\ndata: [DONE]\n\n");
+}
+/**
+ * initSSE
+ * Initialize SSE response headers
+ *
+ * @param {import('http').ServerResponse} res
+ */
+export function initSSE(res) {
+  res.writeHead(200, {
+    "Content-Type": "text/event-stream",
+    "Cache-Control": "no-cache",
+    Connection: "keep-alive",
+  });
+  res.write("\n");
+}