npm - llm-kb - Versions diffs - 0.0.1 → 0.2.0 - Mend

llm-kb 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/bin/chunk-MYQ36JJB.js ADDED Viewed

@@ -0,0 +1,118 @@
+// src/indexer.ts
+import {
+  createAgentSession,
+  createBashTool,
+  createReadTool,
+  createWriteTool,
+  DefaultResourceLoader,
+  SessionManager,
+  SettingsManager
+} from "@mariozechner/pi-coding-agent";
+import { existsSync } from "fs";
+import { readdir, readFile } from "fs/promises";
+import { join, dirname } from "path";
+import { fileURLToPath } from "url";
+var __filename = fileURLToPath(import.meta.url);
+var __dirname = dirname(__filename);
+function getNodeModulesPath() {
+  let dir = __dirname;
+  for (let i = 0; i < 5; i++) {
+    const candidate = join(dir, "node_modules");
+    try {
+      return candidate;
+    } catch {
+      dir = dirname(dir);
+    }
+  }
+  return join(process.cwd(), "node_modules");
+}
+function buildAgentsContent(sourcesDir, files) {
+  const sourceList = files.filter((f) => f.endsWith(".md")).map((f) => `  - ${f}`).join("\n");
+  return `# llm-kb Knowledge Base
+## How to access documents
+### PDFs (pre-parsed)
+PDFs have been parsed to markdown with bounding boxes.
+Read the markdown versions in \`.llm-kb/wiki/sources/\` instead of the raw PDFs.
+Available parsed sources:
+${sourceList}
+### Other file types (Excel, Word, PowerPoint, CSV, images)
+You have bash and read tools. These libraries are pre-installed and available:
+- **exceljs** \u2014 for .xlsx/.xls files
+- **mammoth** \u2014 for .docx files
+- **officeparser** \u2014 for .pptx files
+- **csv-parse** \u2014 built into Node.js, use fs + split for .csv
+Write a quick Node.js script to extract content when needed.
+## Index file
+Write the index to \`.llm-kb/wiki/index.md\`.
+The index should be a markdown file with:
+1. A title and last-updated timestamp
+2. A summary table with columns: Source, Type, Pages/Size, Summary, Key Topics
+3. Each source gets a one-line summary (read the first ~500 chars of each file to generate it)
+4. Total word count across all sources
+`;
+}
+async function buildIndex(folder, sourcesDir, onOutput) {
+  const files = await readdir(sourcesDir);
+  const mdFiles = files.filter((f) => f.endsWith(".md"));
+  if (mdFiles.length === 0) {
+    throw new Error("No source files found to index");
+  }
+  const agentsContent = buildAgentsContent(sourcesDir, files);
+  const nodeModulesPath = getNodeModulesPath();
+  process.env.NODE_PATH = nodeModulesPath;
+  const loader = new DefaultResourceLoader({
+    cwd: folder,
+    agentsFilesOverride: (current) => ({
+      agentsFiles: [
+        ...current.agentsFiles,
+        { path: ".llm-kb/AGENTS.md", content: agentsContent }
+      ]
+    })
+  });
+  await loader.reload();
+  const { session } = await createAgentSession({
+    cwd: folder,
+    resourceLoader: loader,
+    tools: [
+      createReadTool(folder),
+      createBashTool(folder),
+      createWriteTool(folder)
+    ],
+    sessionManager: SessionManager.inMemory(),
+    settingsManager: SettingsManager.inMemory({
+      compaction: { enabled: false }
+    })
+  });
+  if (onOutput) {
+    session.subscribe((event) => {
+      if (event.type === "message_update" && event.assistantMessageEvent.type === "text_delta") {
+        onOutput(event.assistantMessageEvent.delta);
+      }
+    });
+  }
+  const prompt = `Read each file in .llm-kb/wiki/sources/ (one at a time, just the first 500 characters of each).
+Then write .llm-kb/wiki/index.md with a summary table of all sources.
+Include: Source filename, Type (PDF/Excel/Word/etc), Pages (from the JSON if available), a one-line summary, and key topics.
+Add a total word count estimate at the bottom.`;
+  await session.prompt(prompt);
+  const indexPath = join(sourcesDir, "..", "index.md");
+  try {
+    const content = await readFile(indexPath, "utf-8");
+    session.dispose();
+    return content;
+  } catch {
+    session.dispose();
+    throw new Error("Agent did not create index.md");
+  }
+}
+export {
+  buildIndex
+};

package/bin/cli.js ADDED Viewed

@@ -0,0 +1,409 @@
+#!/usr/bin/env node
+import {
+  buildIndex
+} from "./chunk-MYQ36JJB.js";
+// src/cli.ts
+import { Command } from "commander";
+// src/scan.ts
+import { readdir } from "fs/promises";
+import { resolve, extname, relative } from "path";
+var SUPPORTED_EXTENSIONS = /* @__PURE__ */ new Set([
+  ".pdf",
+  ".xlsx",
+  ".xls",
+  ".docx",
+  ".pptx",
+  ".jpg",
+  ".jpeg",
+  ".png",
+  ".txt",
+  ".md",
+  ".csv"
+]);
+async function scan(folder) {
+  const root = resolve(folder);
+  const entries = await readdir(root, { recursive: true, withFileTypes: true });
+  const files = [];
+  for (const entry of entries) {
+    if (!entry.isFile()) continue;
+    const fullPath = resolve(entry.parentPath, entry.name);
+    const rel = relative(root, fullPath);
+    if (rel.startsWith(".llm-kb")) continue;
+    const ext = extname(entry.name).toLowerCase();
+    if (!SUPPORTED_EXTENSIONS.has(ext)) continue;
+    files.push({ name: entry.name, path: rel, ext });
+  }
+  return files;
+}
+function summarize(files) {
+  const counts = /* @__PURE__ */ new Map();
+  for (const f of files) {
+    counts.set(f.ext, (counts.get(f.ext) || 0) + 1);
+  }
+  const parts = Array.from(counts.entries()).sort((a, b) => b[1] - a[1]).map(([ext, count]) => `${count} ${ext.toUpperCase().slice(1)}`);
+  return parts.join(", ");
+}
+// src/pdf.ts
+import { LiteParse } from "@llamaindex/liteparse";
+import { writeFile, mkdir, stat } from "fs/promises";
+import { join, basename } from "path";
+import { cpus } from "os";
+async function isUpToDate(pdfPath, mdPath, jsonPath) {
+  try {
+    const [pdfStat, mdStat, jsonStat] = await Promise.all([
+      stat(pdfPath),
+      stat(mdPath),
+      stat(jsonPath)
+    ]);
+    return pdfStat.mtimeMs <= mdStat.mtimeMs && pdfStat.mtimeMs <= jsonStat.mtimeMs;
+  } catch {
+    return false;
+  }
+}
+function suppressStderr() {
+  const originalWrite = process.stderr.write.bind(process.stderr);
+  process.stderr.write = (() => true);
+  return () => {
+    process.stderr.write = originalWrite;
+  };
+}
+async function parsePDF(pdfPath, outputDir) {
+  const name = basename(pdfPath, ".pdf");
+  await mkdir(outputDir, { recursive: true });
+  const mdPath = join(outputDir, `${name}.md`);
+  const jsonPath = join(outputDir, `${name}.json`);
+  if (await isUpToDate(pdfPath, mdPath, jsonPath)) {
+    return { name, mdPath, jsonPath, totalPages: 0, textLength: 0, skipped: true };
+  }
+  const ocrServerUrl = process.env.OCR_SERVER_URL;
+  const ocrEnabled = ocrServerUrl ? true : process.env.OCR_ENABLED === "true";
+  const parser = new LiteParse({
+    ocrEnabled,
+    outputFormat: "json",
+    numWorkers: cpus().length,
+    ...ocrServerUrl ? { ocrServerUrl } : {}
+  });
+  const restore = suppressStderr();
+  let result;
+  try {
+    result = await parser.parse(pdfPath, true);
+  } finally {
+    restore();
+  }
+  const markdown = result.pages.map((p) => `# Page ${p.pageNum}
+${p.text}`).join("\n\n---\n\n");
+  const bboxData = {
+    source: basename(pdfPath),
+    totalPages: result.pages.length,
+    pages: result.pages.map((p) => ({
+      page: p.pageNum,
+      width: p.width,
+      height: p.height,
+      textItems: p.textItems.map((item) => ({
+        text: (item.str ?? item.text ?? "").trim(),
+        x: Math.round(item.x * 100) / 100,
+        y: Math.round(item.y * 100) / 100,
+        width: Math.round((item.width ?? item.w ?? 0) * 100) / 100,
+        height: Math.round((item.height ?? item.h ?? 0) * 100) / 100,
+        fontName: item.fontName,
+        fontSize: item.fontSize ? Math.round(item.fontSize * 100) / 100 : void 0
+      }))
+    }))
+  };
+  await writeFile(mdPath, markdown);
+  await writeFile(jsonPath, JSON.stringify(bboxData, null, 2));
+  return {
+    name,
+    mdPath,
+    jsonPath,
+    totalPages: result.pages.length,
+    textLength: markdown.length,
+    skipped: false
+  };
+}
+// src/watcher.ts
+import { watch } from "chokidar";
+import { extname as extname2, basename as basename2 } from "path";
+import chalk from "chalk";
+/**
+ * Watch `folder` for added or changed PDFs, parse them in debounced batches,
+ * and rebuild the index after each batch.
+ *
+ * @param {{folder: string, sourcesDir: string, debounceMs?: number}} options
+ *   `folder` is the directory to watch, `sourcesDir` receives the parsed
+ *   output, and `debounceMs` (default 2000) is the quiet period before a
+ *   batch is processed.
+ * @returns The chokidar watcher instance returned by `watch()`.
+ */
+function startWatcher({ folder, sourcesDir, debounceMs = 2e3 }) {
+  // Paths queued since the last batch, and the pending debounce timer.
+  let pendingFiles = [];
+  let debounceTimer = null;
+  // Parses every queued file, then re-indexes. Per-file and indexing errors
+  // are printed and do not stop the batch.
+  // NOTE(review): nothing prevents a new batch from starting while a previous
+  // one is still awaiting parsePDF/buildIndex — confirm overlap is acceptable.
+  async function processBatch() {
+    // Snapshot and clear the queue so files arriving meanwhile go to the next batch.
+    const files = [...pendingFiles];
+    pendingFiles = [];
+    if (files.length === 0) return;
+    console.log();
+    for (const filePath of files) {
+      const name = basename2(filePath);
+      process.stdout.write(`  Parsing ${name}...`);
+      try {
+        const result = await parsePDF(filePath, sourcesDir);
+        if (result.skipped) {
+          console.log(chalk.dim(` skipped (up to date)`));
+        } else {
+          console.log(chalk.green(` \u2713 ${result.totalPages} pages`));
+        }
+      } catch (err) {
+        console.log(chalk.red(` \u2717 ${err.message}`));
+      }
+    }
+    process.stdout.write(`  Re-indexing...`);
+    try {
+      await buildIndex(folder, sourcesDir);
+      console.log(chalk.green(` \u2713 index.md updated`));
+    } catch (err) {
+      console.log(chalk.red(` \u2717 ${err.message}`));
+    }
+  }
+  // Queue a path once and restart the debounce timer.
+  function queueFile(filePath) {
+    if (!pendingFiles.includes(filePath)) {
+      pendingFiles.push(filePath);
+    }
+    if (debounceTimer) clearTimeout(debounceTimer);
+    debounceTimer = setTimeout(processBatch, debounceMs);
+  }
+  const watcher = watch(folder, {
+    // Only react to changes that happen after startup.
+    ignoreInitial: true,
+    ignored: [
+      "**/node_modules/**",
+      "**/.llm-kb/**",
+      "**/.git/**"
+    ],
+    depth: 10
+  });
+  // Only PDFs (extension compared case-insensitively) are queued.
+  watcher.on("add", (filePath) => {
+    const ext = extname2(filePath).toLowerCase();
+    if (ext === ".pdf") {
+      console.log(chalk.dim(`
+  New file: ${basename2(filePath)}`));
+      queueFile(filePath);
+    }
+  });
+  watcher.on("change", (filePath) => {
+    const ext = extname2(filePath).toLowerCase();
+    if (ext === ".pdf") {
+      console.log(chalk.dim(`
+  Changed: ${basename2(filePath)}`));
+      queueFile(filePath);
+    }
+  });
+  return watcher;
+}
+// src/query.ts
+import {
+  createAgentSession,
+  createBashTool,
+  createReadTool,
+  createWriteTool,
+  DefaultResourceLoader,
+  SessionManager,
+  SettingsManager
+} from "@mariozechner/pi-coding-agent";
+import { readdir as readdir2, mkdir as mkdir2 } from "fs/promises";
+import { join as join3, dirname } from "path";
+import { fileURLToPath } from "url";
+var __dirname = dirname(fileURLToPath(import.meta.url));
+function getNodeModulesPath() {
+  let dir = __dirname;
+  for (let i = 0; i < 5; i++) {
+    const candidate = join3(dir, "node_modules");
+    try {
+      return candidate;
+    } catch {
+      dir = dirname(dir);
+    }
+  }
+  return join3(process.cwd(), "node_modules");
+}
+function buildQueryAgents(sourceFiles, save) {
+  const sourceList = sourceFiles.map((f) => `  - ${f}`).join("\n");
+  let content = `# llm-kb Knowledge Base \u2014 Query Mode
+## How to answer questions
+1. FIRST read .llm-kb/wiki/index.md to understand all available sources
+2. Based on the question, select the most relevant source files (usually 2-5)
+3. Read those source files in full from .llm-kb/wiki/sources/
+4. Answer with inline citations: (filename, page number)
+5. If the answer requires cross-referencing multiple files, read additional ones
+6. If you can't find the answer, say so \u2014 don't hallucinate
+## Available parsed sources
+${sourceList}
+## Non-PDF files
+If the user's folder has Excel, Word, or PowerPoint files, these libraries are available:
+- **exceljs** \u2014 for .xlsx/.xls files
+- **mammoth** \u2014 for .docx files
+- **officeparser** \u2014 for .pptx files
+Write a quick Node.js script via bash to read them.
+## Rules
+- Always cite sources with filename and page number
+- Read the FULL source file, not just the beginning
+- Prefer primary sources over previous analyses
+`;
+  if (save) {
+    content += `
+## Research Mode
+Save your analysis to .llm-kb/wiki/outputs/ with a descriptive filename (e.g., comparison-analysis.md).
+Include the question at the top and all citations.
+`;
+  }
+  return content;
+}
+async function query(folder, question, options) {
+  const sourcesDir = join3(folder, ".llm-kb", "wiki", "sources");
+  const files = await readdir2(sourcesDir);
+  const mdFiles = files.filter((f) => f.endsWith(".md"));
+  if (mdFiles.length === 0) {
+    throw new Error("No sources found. Run 'llm-kb run' first to parse documents.");
+  }
+  if (options.save) {
+    await mkdir2(join3(folder, ".llm-kb", "wiki", "outputs"), { recursive: true });
+  }
+  process.env.NODE_PATH = getNodeModulesPath();
+  const agentsContent = buildQueryAgents(mdFiles, !!options.save);
+  const loader = new DefaultResourceLoader({
+    cwd: folder,
+    agentsFilesOverride: (current) => ({
+      agentsFiles: [
+        ...current.agentsFiles,
+        { path: ".llm-kb/AGENTS.md", content: agentsContent }
+      ]
+    })
+  });
+  await loader.reload();
+  const tools = [createReadTool(folder)];
+  if (options.save) {
+    tools.push(createBashTool(folder), createWriteTool(folder));
+  }
+  const { session } = await createAgentSession({
+    cwd: folder,
+    resourceLoader: loader,
+    tools,
+    sessionManager: SessionManager.inMemory(),
+    settingsManager: SettingsManager.inMemory({
+      compaction: { enabled: false }
+    })
+  });
+  session.subscribe((event) => {
+    if (event.type === "message_update" && event.assistantMessageEvent.type === "text_delta") {
+      process.stdout.write(event.assistantMessageEvent.delta);
+    }
+  });
+  await session.prompt(question);
+  console.log();
+  session.dispose();
+  if (options.save) {
+    const { buildIndex: buildIndex2 } = await import("./indexer-LSYSZXZX.js");
+    await buildIndex2(folder, sourcesDir);
+  }
+}
+// src/resolve-kb.ts
+import { existsSync } from "fs";
+import { resolve as resolve2, join as join4, dirname as dirname2 } from "path";
+function resolveKnowledgeBase(startDir) {
+  let dir = resolve2(startDir);
+  while (true) {
+    if (existsSync(join4(dir, ".llm-kb"))) {
+      return dir;
+    }
+    const parent = dirname2(dir);
+    if (parent === dir) return null;
+    dir = parent;
+  }
+}
+// src/cli.ts
+import { existsSync as existsSync2 } from "fs";
+import { mkdir as mkdir3 } from "fs/promises";
+import { resolve as resolve3, join as join5 } from "path";
+import chalk2 from "chalk";
+// Root CLI program; sub-commands are registered below.
+var program = new Command();
+program.name("llm-kb").description("Drop files into a folder. Get a knowledge base you can query.").version("0.2.0");
+// `run <folder>`: scan the folder, parse every PDF into .llm-kb/wiki/sources,
+// build the index, then keep watching the folder for new or changed PDFs.
+program.command("run").description("Scan, parse, index, and watch a folder").argument("<folder>", "Path to your documents folder").action(async (folder) => {
+  console.log(`
+${chalk2.bold("llm-kb")} v0.2.0
+`);
+  // Fail fast with exit code 1 when the folder does not exist.
+  if (!existsSync2(folder)) {
+    console.error(chalk2.red(`Error: Folder not found: ${folder}`));
+    process.exit(1);
+  }
+  console.log(`Scanning ${folder}...`);
+  const files = await scan(folder);
+  if (files.length === 0) {
+    console.log(chalk2.yellow("  No supported files found."));
+    return;
+  }
+  const pdfs = files.filter((f) => f.ext === ".pdf");
+  console.log(`  Found ${chalk2.bold(files.length.toString())} files (${summarize(files)})`);
+  // Without any PDF the command stops here: no parsing, indexing or watching.
+  if (pdfs.length === 0) return;
+  const root = resolve3(folder);
+  const sourcesDir = join5(root, ".llm-kb", "wiki", "sources");
+  await mkdir3(sourcesDir, { recursive: true });
+  // Outcome counters for the summary line printed after the loop.
+  let parsed = 0;
+  let skipped = 0;
+  let failed = 0;
+  const errors = [];
+  // PDFs are parsed one at a time; a failure is recorded and the loop continues.
+  for (let i = 0; i < pdfs.length; i++) {
+    const pdf = pdfs[i];
+    const fullPath = join5(root, pdf.path);
+    const progress = `  Parsing... ${i + 1}/${pdfs.length} \u2014 ${pdf.name}`;
+    // "\r" plus padding to 80 columns rewrites the same terminal line each time.
+    process.stdout.write(`\r${progress.padEnd(80)}`);
+    try {
+      const result = await parsePDF(fullPath, sourcesDir);
+      if (result.skipped) {
+        skipped++;
+      } else {
+        parsed++;
+      }
+    } catch (err) {
+      failed++;
+      errors.push({ name: pdf.name, message: err.message });
+    }
+  }
+  // Blank out the progress line before printing the summary.
+  process.stdout.write(`\r${"".padEnd(80)}\r`);
+  const parts = [];
+  if (parsed > 0) parts.push(chalk2.green(`${parsed} parsed`));
+  if (skipped > 0) parts.push(chalk2.dim(`${skipped} skipped (up to date)`));
+  if (failed > 0) parts.push(chalk2.red(`${failed} failed`));
+  console.log(`  ${parts.join(", ")}`);
+  for (const err of errors) {
+    console.log(chalk2.red(`    \u2717 ${err.name} \u2014 ${err.message}`));
+  }
+  console.log(`
+  Building index...`);
+  // An indexing failure is reported but does not stop the watcher from starting.
+  try {
+    await buildIndex(root, sourcesDir);
+    console.log(chalk2.green(`  Index built: .llm-kb/wiki/index.md`));
+  } catch (err) {
+    console.error(chalk2.red(`  Index failed: ${err.message}`));
+  }
+  console.log(`
+  ${chalk2.dim("Output:")} ${sourcesDir}`);
+  console.log(chalk2.dim(`
+  Watching for new files... (Ctrl+C to stop)`));
+  startWatcher({ folder: root, sourcesDir });
+});
+// `query <question>`: find the knowledge base by walking up from --folder (or
+// the current directory), then answer the question; --save enables research mode.
+program.command("query").description("Ask a question across your knowledge base").argument("<question>", "Your question").option("--folder <path>", "Path to document folder (auto-detects if omitted)").option("--save", "Save the answer to wiki/outputs/ (research mode)").action(async (question, options) => {
+  const root = resolveKnowledgeBase(options.folder || process.cwd());
+  // No `.llm-kb` directory at or above the starting point: exit with code 1.
+  if (!root) {
+    console.error(chalk2.red("No knowledge base found. Run 'llm-kb run <folder>' first."));
+    process.exit(1);
+  }
+  // Any query failure is printed as a single red line and exits with code 1.
+  try {
+    await query(root, question, { save: options.save });
+  } catch (err) {
+    console.error(chalk2.red(err.message));
+    process.exit(1);
+  }
+});
+// Both registered actions are async, so use parseAsync and await it; a
+// rejection from an action then propagates from this statement instead of
+// being left to an un-awaited promise.
+await program.parseAsync();

package/bin/indexer-LSYSZXZX.js ADDED Viewed

@@ -0,0 +1,6 @@
+import {
+  buildIndex
+} from "./chunk-MYQ36JJB.js";
+export {
+  buildIndex
+};

package/package.json CHANGED Viewed

@@ -1,11 +1,15 @@
 {
   "name": "llm-kb",
-  "version": "0.0.1",
+  "version": "0.2.0",
   "description": "LLM-powered knowledge base. Drop documents, build a wiki, ask questions. Inspired by Karpathy.",
   "bin": {
-    "llm-kb": "./bin/cli.mjs"
+    "llm-kb": "./bin/cli.js"
   },
   "type": "module",
+  "scripts": {
+    "build": "tsup src/cli.ts --format esm --out-dir bin --clean",
+    "dev": "tsup src/cli.ts --format esm --out-dir bin --watch"
+  },
   "keywords": [
     "llm",
     "knowledge-base",
@@ -20,5 +24,20 @@
   "repository": {
     "type": "git",
     "url": "https://github.com/satish860/llm-kb"
+  },
+  "dependencies": {
+    "@llamaindex/liteparse": "^1.4.4",
+    "@mariozechner/pi-coding-agent": "^0.65.0",
+    "chalk": "^5.6.2",
+    "chokidar": "^5.0.0",
+    "commander": "^13.1.0",
+    "exceljs": "^4.4.0",
+    "mammoth": "^1.12.0",
+    "officeparser": "^6.0.7",
+    "ora": "^9.3.0"
+  },
+  "devDependencies": {
+    "tsup": "^8.4.0",
+    "typescript": "^5.8.3"
   }
 }

package/plan.md ADDED Viewed

@@ -0,0 +1,55 @@
+# llm-kb — Phase 1 Build Plan
+> Emergent design. Each slice is a thin vertical slice that works end-to-end, is demoable, and informs the next step. Decisions are made at the last responsible moment.
+## Key Learnings
+- **PDF is the only adapter we build.** Everything else (Excel, Word, PPT, CSV, images) handled dynamically by Pi SDK agent at query time.
+- **`@llamaindex/liteparse`** proven (from parser-study). Extracts text + bounding boxes locally.
+- **Two-output pattern**: `.md` (spatial text) + `.json` (bounding boxes for citations).
+- **OCR off by default.** Most PDFs have native text. Enable via `OCR_SERVER_URL` or `OCR_ENABLED=true`.
+- **Pi SDK `createAgentSession()`** with defaults — no auth/model config needed. Uses Pi's existing auth.
+- **AGENTS.md injected via `agentsFilesOverride`** — user's folder stays clean.
+- **NODE_PATH** set so agent's bash scripts can use bundled libraries (exceljs, mammoth, officeparser).
+- **Config file skipped** — nothing reads it yet. Add when Phase 2/3 needs it.
+---
+## Slice 1: "Hello World" CLI ✅
+Commander CLI with `run <folder>`. Scans folder, lists files by extension.
+## Slice 2: PDF → markdown + bounding boxes ✅
+LiteParse parses PDFs → `.md` + `.json` in `.llm-kb/wiki/sources/`. Tested on 9 real PDFs (1000+ pages).
+## Slice 3: Scanned PDF handling (OCR) ✅
+LiteParse has Tesseract.js built-in. `ocrEnabled` + `ocrServerUrl` config. OCR off by default. Azure OCR bridge tested on 16 legal PDFs (3000+ pages).
+## Slice 4: Progress + error handling ✅
+Inline progress. Stderr suppression. Corrupt file skip + warning. Mtime check — re-runs instant.
+## Slice 5: Indexer (Pi SDK) ✅
+`createAgentSession` with cwd = user's folder. AGENTS.md injected. Agent reads sources, writes `index.md` with summary table.
+## Slice 6: File watcher ✅
+chokidar watches folder. New/changed PDFs → parse → re-index. 2s debounce for batch drops.
+## Slice 7: Config + polish → Skipped
+Config file has no readers yet. Deferred to Phase 2/3. README updated instead.
+---
+## Phase 1 Complete ✅
+**What ships:**
+- `llm-kb run ./folder` — scan, parse PDFs, build index, watch for new files
+- Pre-bundled libraries for agent to handle Excel, Word, PowerPoint at query time
+- OCR via env var (local Tesseract or remote Azure bridge)
+- Auth via Pi SDK (zero config)
+**Phase 2 complete ✅:**
+- `llm-kb query "question"` — auto-detects KB, streams cited answers
+- `--save` flag — research mode, saves to `outputs/`, re-indexes
+- Query mode is read-only (read tool only). Research mode adds bash + write.
+**Deferred to Phase 4:**
+- Trace logging (JSON per query: question, filesRead, citations, tokens, duration)
+- Needed for eval, but no eval system yet to consume traces