npm - llm-kb - Versions diffs - 0.0.1 → 0.2.0 - Mend

llm-kb 0.0.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/src/cli.ts ADDED Viewed

@@ -0,0 +1,132 @@
+#!/usr/bin/env node
+import { Command } from "commander";
+import { scan, summarize } from "./scan.js";
+import { parsePDF } from "./pdf.js";
+import { buildIndex } from "./indexer.js";
+import { startWatcher } from "./watcher.js";
+import { query } from "./query.js";
+import { resolveKnowledgeBase } from "./resolve-kb.js";
+import { existsSync } from "node:fs";
+import { mkdir } from "node:fs/promises";
+import { resolve, join } from "node:path";
+import chalk from "chalk";
+// Root CLI object; the `run` and `query` subcommands are registered on it.
+const program = new Command();
+program.name("llm-kb");
+program.description("Drop files into a folder. Get a knowledge base you can query.");
+program.version("0.2.0");
+program
+  .command("run")
+  .description("Scan, parse, index, and watch a folder")
+  .argument("<folder>", "Path to your documents folder")
+  // Pipeline: scan the folder, parse every PDF into .llm-kb/wiki/sources,
+  // build the index, then start watching the folder for new or changed files.
+  .action(async (folder: string) => {
+    // A thrown value is not guaranteed to be an Error, so narrow it before
+    // reading `.message` instead of trusting an `any`-typed catch variable.
+    const describeError = (err: unknown): string =>
+      err instanceof Error ? err.message : String(err);
+    console.log(`\n${chalk.bold("llm-kb")} v0.2.0\n`);
+    if (!existsSync(folder)) {
+      console.error(chalk.red(`Error: Folder not found: ${folder}`));
+      process.exit(1);
+    }
+    console.log(`Scanning ${folder}...`);
+    const files = await scan(folder);
+    if (files.length === 0) {
+      console.log(chalk.yellow("  No supported files found."));
+      return;
+    }
+    const pdfs = files.filter((f) => f.ext === ".pdf");
+    console.log(`  Found ${chalk.bold(files.length.toString())} files (${summarize(files)})`);
+    // Only PDFs are parsed here; without any, the command stops before indexing and watching.
+    if (pdfs.length === 0) return;
+    // Set up .llm-kb folder structure
+    const root = resolve(folder);
+    const sourcesDir = join(root, ".llm-kb", "wiki", "sources");
+    await mkdir(sourcesDir, { recursive: true });
+    // Parse PDFs with inline progress
+    let parsed = 0;
+    let skipped = 0;
+    let failed = 0;
+    const errors: { name: string; message: string }[] = [];
+    for (const [i, pdf] of pdfs.entries()) {
+      // scan() reports paths relative to the scanned root
+      const fullPath = join(root, pdf.path);
+      // Inline progress: "\r" rewinds so each update overwrites the same line
+      const progress = `  Parsing... ${i + 1}/${pdfs.length} — ${pdf.name}`;
+      process.stdout.write(`\r${progress.padEnd(80)}`);
+      try {
+        const result = await parsePDF(fullPath, sourcesDir);
+        if (result.skipped) {
+          skipped++;
+        } else {
+          parsed++;
+        }
+      } catch (err: unknown) {
+        // One bad PDF must not abort the run; collect it for the summary.
+        failed++;
+        errors.push({ name: pdf.name, message: describeError(err) });
+      }
+    }
+    // Clear progress line
+    process.stdout.write(`\r${"".padEnd(80)}\r`);
+    // Summary
+    const parts: string[] = [];
+    if (parsed > 0) parts.push(chalk.green(`${parsed} parsed`));
+    if (skipped > 0) parts.push(chalk.dim(`${skipped} skipped (up to date)`));
+    if (failed > 0) parts.push(chalk.red(`${failed} failed`));
+    console.log(`  ${parts.join(", ")}`);
+    // Show errors
+    for (const err of errors) {
+      console.log(chalk.red(`    ✗ ${err.name} — ${err.message}`));
+    }
+    // Build index; a failure is reported but does not stop the watcher below
+    console.log(`\n  Building index...`);
+    try {
+      await buildIndex(root, sourcesDir);
+      console.log(chalk.green(`  Index built: .llm-kb/wiki/index.md`));
+    } catch (err: unknown) {
+      console.error(chalk.red(`  Index failed: ${describeError(err)}`));
+    }
+    console.log(`\n  ${chalk.dim("Output:")} ${sourcesDir}`);
+    // Start watching for new files
+    console.log(chalk.dim(`\n  Watching for new files... (Ctrl+C to stop)`));
+    startWatcher({ folder: root, sourcesDir });
+  });
+program
+  .command("query")
+  .description("Ask a question across your knowledge base")
+  .argument("<question>", "Your question")
+  .option("--folder <path>", "Path to document folder (auto-detects if omitted)")
+  .option("--save", "Save the answer to wiki/outputs/ (research mode)")
+  // Locate the knowledge base by walking up from --folder (or the current
+  // directory), then hand the question to query(). Any failure exits with code 1.
+  .action(async (question: string, options: { folder?: string; save?: boolean }) => {
+    // `||` means an empty --folder value also falls back to the current directory.
+    const root = resolveKnowledgeBase(options.folder || process.cwd());
+    if (!root) {
+      console.error(chalk.red("No knowledge base found. Run 'llm-kb run <folder>' first."));
+      process.exit(1);
+    }
+    try {
+      await query(root, question, { save: options.save });
+    } catch (err: unknown) {
+      // Thrown values are not guaranteed to be Error instances; narrow before reading `.message`.
+      const message = err instanceof Error ? err.message : String(err);
+      console.error(chalk.red(message));
+      process.exit(1);
+    }
+  });
+// The command actions are async, so use parseAsync(): plain parse() does not
+// wait for the promises they return. A rejection that escapes an action is
+// reported as a one-line error with exit code 1 instead of an unhandled rejection.
+program.parseAsync().catch((err: unknown) => {
+  console.error(chalk.red(err instanceof Error ? err.message : String(err)));
+  process.exit(1);
+});

package/src/indexer.ts ADDED Viewed

@@ -0,0 +1,148 @@
+import {
+  createAgentSession,
+  createBashTool,
+  createReadTool,
+  createWriteTool,
+  DefaultResourceLoader,
+  SessionManager,
+  SettingsManager,
+} from "@mariozechner/pi-coding-agent";
+import { existsSync } from "node:fs";
+import { readdir, readFile } from "node:fs/promises";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+// Derive this module's file path and directory from import.meta.url.
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+/**
+ * Find the node_modules directory for llm-kb's bundled libraries.
+ *
+ * Walks up from this module's directory (at most five levels) and returns the
+ * first `node_modules` directory that exists on disk. An explicit existence
+ * check is required: a bare `return candidate` inside try/catch never throws,
+ * so without the check the walk stops at the very first candidate.
+ *
+ * @returns The nearest existing `node_modules` path, or `<cwd>/node_modules`
+ *   when none is found within five levels.
+ */
+function getNodeModulesPath(): string {
+  let dir = __dirname;
+  for (let i = 0; i < 5; i++) {
+    const candidate = join(dir, "node_modules");
+    if (existsSync(candidate)) return candidate;
+    const parent = dirname(dir);
+    if (parent === dir) break; // reached the filesystem root
+    dir = parent;
+  }
+  return join(process.cwd(), "node_modules");
+}
+/**
+ * Render the AGENTS.md instructions handed to the indexing agent.
+ *
+ * @param sourcesDir Directory of parsed sources (not used in the rendered text).
+ * @param files File names from the sources directory; only `.md` names are listed.
+ * @returns The instruction text, with one bullet per markdown source.
+ */
+function buildAgentsContent(sourcesDir: string, files: string[]): string {
+  const bullets: string[] = [];
+  for (const file of files) {
+    if (file.endsWith(".md")) bullets.push(`  - ${file}`);
+  }
+  const sourceList = bullets.join("\n");
+  return `# llm-kb Knowledge Base
+## How to access documents
+### PDFs (pre-parsed)
+PDFs have been parsed to markdown with bounding boxes.
+Read the markdown versions in \`.llm-kb/wiki/sources/\` instead of the raw PDFs.
+Available parsed sources:
+${sourceList}
+### Other file types (Excel, Word, PowerPoint, CSV, images)
+You have bash and read tools. These libraries are pre-installed and available:
+- **exceljs** — for .xlsx/.xls files
+- **mammoth** — for .docx files
+- **officeparser** — for .pptx files
+- **csv-parse** — built into Node.js, use fs + split for .csv
+Write a quick Node.js script to extract content when needed.
+## Index file
+Write the index to \`.llm-kb/wiki/index.md\`.
+The index should be a markdown file with:
+1. A title and last-updated timestamp
+2. A summary table with columns: Source, Type, Pages/Size, Summary, Key Topics
+3. Each source gets a one-line summary (read the first ~500 chars of each file to generate it)
+4. Total word count across all sources
+`;
+}
+/**
+ * Run an agent session that writes `.llm-kb/wiki/index.md`, then return the file's content.
+ *
+ * Side effect: sets `process.env.NODE_PATH` for the whole process so scripts the
+ * agent runs through bash can resolve the bundled libraries.
+ *
+ * @param folder Working directory for the agent session and its read/bash/write tools.
+ * @param sourcesDir Directory holding the parsed sources; the index is read from its parent.
+ * @param onOutput Optional sink for the assistant's streamed text deltas.
+ * @returns The content of the generated index.md.
+ * @throws Error when `sourcesDir` has no `.md` files, or when index.md cannot be
+ *   read after the prompt completes.
+ */
+export async function buildIndex(
+  folder: string,
+  sourcesDir: string,
+  onOutput?: (text: string) => void
+): Promise<string> {
+  // List source files
+  const files = await readdir(sourcesDir);
+  const mdFiles = files.filter((f) => f.endsWith(".md"));
+  if (mdFiles.length === 0) {
+    throw new Error("No source files found to index");
+  }
+  // Build AGENTS.md content
+  const agentsContent = buildAgentsContent(sourcesDir, files);
+  // Set NODE_PATH so agent's bash scripts can use bundled libraries
+  process.env.NODE_PATH = getNodeModulesPath();
+  const loader = new DefaultResourceLoader({
+    cwd: folder,
+    // Append the generated instructions to whatever agents files were already loaded
+    agentsFilesOverride: (current) => ({
+      agentsFiles: [
+        ...current.agentsFiles,
+        { path: ".llm-kb/AGENTS.md", content: agentsContent },
+      ],
+    }),
+  });
+  await loader.reload();
+  const { session } = await createAgentSession({
+    cwd: folder,
+    resourceLoader: loader,
+    tools: [
+      createReadTool(folder),
+      createBashTool(folder),
+      createWriteTool(folder),
+    ],
+    sessionManager: SessionManager.inMemory(),
+    settingsManager: SettingsManager.inMemory({
+      compaction: { enabled: false },
+    }),
+  });
+  // try/finally: the session is disposed on every exit path, including a
+  // rejected prompt, which would otherwise leak it.
+  try {
+    // Subscribe to streaming output
+    if (onOutput) {
+      session.subscribe((event) => {
+        if (
+          event.type === "message_update" &&
+          event.assistantMessageEvent.type === "text_delta"
+        ) {
+          onOutput(event.assistantMessageEvent.delta);
+        }
+      });
+    }
+    // Build the prompt
+    const prompt = `Read each file in .llm-kb/wiki/sources/ (one at a time, just the first 500 characters of each).
+Then write .llm-kb/wiki/index.md with a summary table of all sources.
+Include: Source filename, Type (PDF/Excel/Word/etc), Pages (from the JSON if available), a one-line summary, and key topics.
+Add a total word count estimate at the bottom.`;
+    await session.prompt(prompt);
+    // Read the generated index (it sits next to the sources directory)
+    const indexPath = join(sourcesDir, "..", "index.md");
+    try {
+      return await readFile(indexPath, "utf-8");
+    } catch {
+      throw new Error("Agent did not create index.md");
+    }
+  } finally {
+    session.dispose();
+  }
+}

package/src/pdf.ts ADDED Viewed

@@ -0,0 +1,119 @@
+import { LiteParse } from "@llamaindex/liteparse";
+import { writeFile, mkdir, stat } from "node:fs/promises";
+import { join, basename } from "node:path";
+import { cpus } from "node:os";
+/** Result of parsePDF for a single PDF. */
+export interface ParsedPDF {
+  /** Base name of the PDF with a trailing ".pdf" removed. */
+  name: string;
+  /** Path of the markdown output file. */
+  mdPath: string;
+  /** Path of the bounding-box JSON output file. */
+  jsonPath: string;
+  /** Number of parsed pages; 0 when parsing was skipped. */
+  totalPages: number;
+  /** Length of the generated markdown string; 0 when parsing was skipped. */
+  textLength: number;
+  /** True when existing output was up to date and nothing was re-parsed. */
+  skipped: boolean;
+}
+/**
+ * Decide whether previously parsed output can be reused for a PDF.
+ *
+ * Resolves to true only when both output files exist and neither has an
+ * older modification time than the PDF. Any stat failure (for example a
+ * missing file) resolves to false, i.e. "parse again".
+ */
+async function isUpToDate(
+  pdfPath: string,
+  mdPath: string,
+  jsonPath: string
+): Promise<boolean> {
+  try {
+    const [source, ...outputs] = await Promise.all([
+      stat(pdfPath),
+      stat(mdPath),
+      stat(jsonPath),
+    ]);
+    return outputs.every((output) => source.mtimeMs <= output.mtimeMs);
+  } catch {
+    return false;
+  }
+}
+/**
+ * Mute process.stderr until the returned function is called.
+ *
+ * While muted, `process.stderr.write` is a no-op that reports success (`true`).
+ * @returns A function that reinstalls the previous writer (bound to stderr).
+ */
+function suppressStderr(): () => void {
+  const savedWrite = process.stderr.write.bind(process.stderr);
+  const discard = (() => true) as any;
+  process.stderr.write = discard;
+  return function restore(): void {
+    process.stderr.write = savedWrite;
+  };
+}
+/**
+ * Parse one PDF into `<name>.md` (page text) and `<name>.json` (bounding boxes)
+ * inside `outputDir`.
+ *
+ * Parsing is skipped when both outputs already exist and are not older than the
+ * PDF. OCR is enabled when OCR_SERVER_URL is set or OCR_ENABLED is "true".
+ *
+ * @param pdfPath Path of the PDF to parse.
+ * @param outputDir Directory (created if missing) that receives the two output files.
+ * @returns Output paths plus page and text counts; `skipped` is true when existing output was reused.
+ */
+export async function parsePDF(
+  pdfPath: string,
+  outputDir: string
+): Promise<ParsedPDF> {
+  // Coordinates and sizes are stored rounded to two decimal places.
+  const round2 = (value: number): number => Math.round(value * 100) / 100;
+  const name = basename(pdfPath, ".pdf");
+  await mkdir(outputDir, { recursive: true });
+  const mdPath = join(outputDir, name + ".md");
+  const jsonPath = join(outputDir, name + ".json");
+  // Skip if already parsed and source hasn't changed
+  const upToDate = await isUpToDate(pdfPath, mdPath, jsonPath);
+  if (upToDate) {
+    return { name, mdPath, jsonPath, totalPages: 0, textLength: 0, skipped: true };
+  }
+  const ocrServerUrl = process.env.OCR_SERVER_URL;
+  const ocrEnabled = Boolean(ocrServerUrl) || process.env.OCR_ENABLED === "true";
+  // Only pass the server URL through when one is configured
+  const serverOption = ocrServerUrl ? { ocrServerUrl } : {};
+  const parser = new LiteParse({
+    ocrEnabled,
+    outputFormat: "json",
+    numWorkers: cpus().length,
+    ...serverOption,
+  });
+  // Suppress noisy Tesseract/PDF.js warnings during parse; stderr is restored
+  // even when parsing throws
+  const restore = suppressStderr();
+  let result;
+  try {
+    result = await parser.parse(pdfPath, true);
+  } finally {
+    restore();
+  }
+  // One markdown section per page, separated by horizontal rules
+  const renderPage = (p: any): string => `# Page ${p.pageNum}\n\n${p.text}`;
+  const markdown = result.pages.map(renderPage).join("\n\n---\n\n");
+  // Text items may expose their fields under alternative names (str/text, width/w, height/h)
+  const toBox = (item: any) => ({
+    text: (item.str ?? item.text ?? "").trim(),
+    x: round2(item.x),
+    y: round2(item.y),
+    width: round2(item.width ?? item.w ?? 0),
+    height: round2(item.height ?? item.h ?? 0),
+    fontName: item.fontName,
+    fontSize: item.fontSize ? round2(item.fontSize) : undefined,
+  });
+  const totalPages = result.pages.length;
+  // Build bounding box JSON
+  const bboxData = {
+    source: basename(pdfPath),
+    totalPages,
+    pages: result.pages.map((p: any) => ({
+      page: p.pageNum,
+      width: p.width,
+      height: p.height,
+      textItems: p.textItems.map(toBox),
+    })),
+  };
+  await writeFile(mdPath, markdown);
+  await writeFile(jsonPath, JSON.stringify(bboxData, null, 2));
+  return {
+    name,
+    mdPath,
+    jsonPath,
+    totalPages,
+    textLength: markdown.length,
+    skipped: false,
+  };
+}

package/src/query.ts ADDED Viewed

@@ -0,0 +1,132 @@
+import {
+  createAgentSession,
+  createBashTool,
+  createReadTool,
+  createWriteTool,
+  DefaultResourceLoader,
+  SessionManager,
+  SettingsManager,
+} from "@mariozechner/pi-coding-agent";
+import { readdir, mkdir } from "node:fs/promises";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+// Directory of this module, derived from import.meta.url.
+const __dirname = dirname(fileURLToPath(import.meta.url));
+/**
+ * Return the node_modules path that is exported as NODE_PATH for the agent.
+ *
+ * NOTE(review): the `return` inside `try` cannot throw, so the first iteration
+ * always returns `<__dirname>/node_modules` without checking that it exists.
+ * The `catch` branch (walking up a level) and the `process.cwd()` fallback
+ * are never reached. An existence check is needed for the walk-up to happen.
+ */
+function getNodeModulesPath(): string {
+  let dir = __dirname;
+  for (let i = 0; i < 5; i++) {
+    const candidate = join(dir, "node_modules");
+    try { return candidate; } catch { dir = dirname(dir); }
+  }
+  return join(process.cwd(), "node_modules");
+}
+/**
+ * Render the AGENTS.md instructions used when answering a question.
+ *
+ * @param sourceFiles Names of the parsed source files, listed one bullet each.
+ * @param save When true, a "Research Mode" section asking the agent to save its analysis is appended.
+ * @returns The instruction text.
+ */
+function buildQueryAgents(sourceFiles: string[], save: boolean): string {
+  const bullets: string[] = [];
+  for (const file of sourceFiles) {
+    bullets.push(`  - ${file}`);
+  }
+  const sourceList = bullets.join("\n");
+  const baseInstructions = `# llm-kb Knowledge Base — Query Mode
+## How to answer questions
+1. FIRST read .llm-kb/wiki/index.md to understand all available sources
+2. Based on the question, select the most relevant source files (usually 2-5)
+3. Read those source files in full from .llm-kb/wiki/sources/
+4. Answer with inline citations: (filename, page number)
+5. If the answer requires cross-referencing multiple files, read additional ones
+6. If you can't find the answer, say so — don't hallucinate
+## Available parsed sources
+${sourceList}
+## Non-PDF files
+If the user's folder has Excel, Word, or PowerPoint files, these libraries are available:
+- **exceljs** — for .xlsx/.xls files
+- **mammoth** — for .docx files
+- **officeparser** — for .pptx files
+Write a quick Node.js script via bash to read them.
+## Rules
+- Always cite sources with filename and page number
+- Read the FULL source file, not just the beginning
+- Prefer primary sources over previous analyses
+`;
+  if (!save) {
+    return baseInstructions;
+  }
+  const researchSection = `
+## Research Mode
+Save your analysis to .llm-kb/wiki/outputs/ with a descriptive filename (e.g., comparison-analysis.md).
+Include the question at the top and all citations.
+`;
+  return baseInstructions + researchSection;
+}
+/**
+ * Answer a question against the knowledge base in `folder`, streaming the
+ * assistant's text to stdout.
+ *
+ * The agent always gets the read tool; the bash and write tools are added only
+ * when `options.save` is set. With `save`, the outputs directory is created up
+ * front and the index is rebuilt after the answer.
+ *
+ * Side effect: sets `process.env.NODE_PATH` for the whole process.
+ *
+ * @param folder Folder that contains the `.llm-kb` directory.
+ * @param question The question, sent to the agent as the prompt.
+ * @param options `save` switches on research mode.
+ * @throws Error when no parsed `.md` sources exist; errors from the agent session propagate.
+ */
+export async function query(
+  folder: string,
+  question: string,
+  options: { save?: boolean }
+): Promise<void> {
+  const sourcesDir = join(folder, ".llm-kb", "wiki", "sources");
+  const files = await readdir(sourcesDir);
+  const mdFiles = files.filter((f) => f.endsWith(".md"));
+  if (mdFiles.length === 0) {
+    throw new Error("No sources found. Run 'llm-kb run' first to parse documents.");
+  }
+  if (options.save) {
+    await mkdir(join(folder, ".llm-kb", "wiki", "outputs"), { recursive: true });
+  }
+  process.env.NODE_PATH = getNodeModulesPath();
+  const agentsContent = buildQueryAgents(mdFiles, !!options.save);
+  const loader = new DefaultResourceLoader({
+    cwd: folder,
+    // Append the generated instructions to whatever agents files were already loaded
+    agentsFilesOverride: (current) => ({
+      agentsFiles: [
+        ...current.agentsFiles,
+        { path: ".llm-kb/AGENTS.md", content: agentsContent },
+      ],
+    }),
+  });
+  await loader.reload();
+  const tools = [createReadTool(folder)];
+  if (options.save) {
+    tools.push(createBashTool(folder), createWriteTool(folder));
+  }
+  const { session } = await createAgentSession({
+    cwd: folder,
+    resourceLoader: loader,
+    tools,
+    sessionManager: SessionManager.inMemory(),
+    settingsManager: SettingsManager.inMemory({
+      compaction: { enabled: false },
+    }),
+  });
+  // try/finally: the session is disposed even when the prompt rejects,
+  // which would otherwise leak it.
+  try {
+    session.subscribe((event) => {
+      if (
+        event.type === "message_update" &&
+        event.assistantMessageEvent.type === "text_delta"
+      ) {
+        process.stdout.write(event.assistantMessageEvent.delta);
+      }
+    });
+    await session.prompt(question);
+    // Finish the streamed answer with a newline
+    console.log();
+  } finally {
+    session.dispose();
+  }
+  // Re-index after save so the compounding loop works
+  if (options.save) {
+    const { buildIndex } = await import("./indexer.js");
+    await buildIndex(folder, sourcesDir);
+  }
+}

package/src/resolve-kb.ts ADDED Viewed

@@ -0,0 +1,19 @@
+import { existsSync } from "node:fs";
+import { resolve, join, dirname } from "node:path";
+/**
+ * Locate the knowledge base that `startDir` belongs to.
+ *
+ * Starting at the resolved `startDir`, checks each directory on the way up to
+ * the filesystem root for a `.llm-kb` entry.
+ *
+ * @returns The first directory that contains `.llm-kb`, or null when the root is reached without a match.
+ */
+export function resolveKnowledgeBase(startDir: string): string | null {
+  let current = resolve(startDir);
+  while (!existsSync(join(current, ".llm-kb"))) {
+    const parent = dirname(current);
+    // dirname() of the root is the root itself: nothing left to check
+    if (parent === current) return null;
+    current = parent;
+  }
+  return current;
+}

package/src/scan.ts ADDED Viewed

@@ -0,0 +1,59 @@
+import { readdir } from "node:fs/promises";
+import { resolve, extname, relative } from "node:path";
+/** One supported file found by scan(). */
+export interface ScannedFile {
+  /** File name of the directory entry. */
+  name: string;
+  /** Path relative to the scanned root folder. */
+  path: string;
+  /** Lower-cased extension of the file name, as returned by extname(). */
+  ext: string;
+}
+// Extensions (lower case, with leading dot) that scan() reports; files with any
+// other extension are ignored.
+const SUPPORTED_EXTENSIONS = new Set([
+  ".pdf",
+  ".xlsx",
+  ".xls",
+  ".docx",
+  ".pptx",
+  ".jpg",
+  ".jpeg",
+  ".png",
+  ".txt",
+  ".md",
+  ".csv",
+]);
+/**
+ * Recursively list the supported files under `folder`.
+ *
+ * Files inside the top-level `.llm-kb` directory are skipped. Extensions are
+ * matched case-insensitively against SUPPORTED_EXTENSIONS.
+ *
+ * @param folder Folder to scan; resolved to an absolute path first.
+ * @returns One entry per supported file, with `path` relative to the resolved folder.
+ */
+export async function scan(folder: string): Promise<ScannedFile[]> {
+  const root = resolve(folder);
+  const entries = await readdir(root, { recursive: true, withFileTypes: true });
+  const files: ScannedFile[] = [];
+  for (const entry of entries) {
+    if (!entry.isFile()) continue;
+    const rel = relative(root, resolve(entry.parentPath, entry.name));
+    // Skip .llm-kb internal folder. Compare the first path segment rather than
+    // a raw string prefix, so root-level names that merely start with
+    // ".llm-kb" (e.g. ".llm-kb-notes.md") are still reported.
+    const [topLevel] = rel.split(/[\\/]/);
+    if (topLevel === ".llm-kb") continue;
+    const ext = extname(entry.name).toLowerCase();
+    if (!SUPPORTED_EXTENSIONS.has(ext)) continue;
+    files.push({ name: entry.name, path: rel, ext });
+  }
+  return files;
+}
+/**
+ * Summarize scanned files as counts per extension, e.g. "2 PDF, 1 MD".
+ *
+ * Extensions are shown upper-cased with their first character dropped, ordered
+ * by descending count.
+ */
+export function summarize(files: ScannedFile[]): string {
+  const tally = new Map<string, number>();
+  for (const { ext } of files) {
+    tally.set(ext, (tally.get(ext) || 0) + 1);
+  }
+  const ranked = [...tally].sort(([, left], [, right]) => right - left);
+  const labels: string[] = [];
+  for (const [ext, count] of ranked) {
+    labels.push(`${count} ${ext.toUpperCase().slice(1)}`);
+  }
+  return labels.join(", ");
+}

package/src/watcher.ts ADDED Viewed

@@ -0,0 +1,84 @@
+import { watch } from "chokidar";
+import { extname, join, basename } from "node:path";
+import { parsePDF } from "./pdf.js";
+import { buildIndex } from "./indexer.js";
+import chalk from "chalk";
+/** Options accepted by startWatcher(). */
+interface WatcherOptions {
+  /** Folder that is watched; also passed to buildIndex() as its folder. */
+  folder: string;
+  /** Directory passed to parsePDF() as its output directory and to buildIndex(). */
+  sourcesDir: string;
+  /** Delay in ms between the last queued file and batch processing; startWatcher() defaults it to 2000. */
+  debounceMs?: number;
+}
+/**
+ * Watch `folder` for added or changed PDFs, parse them in debounced batches,
+ * and rebuild the index after each batch.
+ *
+ * Only one batch runs at a time. If the debounce timer fires while a batch is
+ * still parsing or indexing, the queued files wait and the timer is re-armed,
+ * so two index builds never overlap.
+ *
+ * @returns The chokidar watcher, so the caller can close it.
+ */
+export function startWatcher({ folder, sourcesDir, debounceMs = 2000 }: WatcherOptions) {
+  let pendingFiles: string[] = [];
+  let debounceTimer: ReturnType<typeof setTimeout> | null = null;
+  let processing = false;
+  // Thrown values are not guaranteed to be Error instances; narrow before reading `.message`.
+  const describeError = (err: unknown): string =>
+    err instanceof Error ? err.message : String(err);
+  // (Re)start the debounce window
+  function scheduleBatch() {
+    if (debounceTimer) clearTimeout(debounceTimer);
+    debounceTimer = setTimeout(processBatch, debounceMs);
+  }
+  async function processBatch() {
+    debounceTimer = null;
+    if (processing) {
+      // A previous batch is still running: keep the queue and try again later.
+      scheduleBatch();
+      return;
+    }
+    const files = pendingFiles;
+    pendingFiles = [];
+    if (files.length === 0) return;
+    processing = true;
+    try {
+      console.log();
+      for (const filePath of files) {
+        const name = basename(filePath);
+        process.stdout.write(`  Parsing ${name}...`);
+        try {
+          const result = await parsePDF(filePath, sourcesDir);
+          if (result.skipped) {
+            console.log(chalk.dim(` skipped (up to date)`));
+          } else {
+            console.log(chalk.green(` ✓ ${result.totalPages} pages`));
+          }
+        } catch (err: unknown) {
+          // Report and continue with the remaining files
+          console.log(chalk.red(` ✗ ${describeError(err)}`));
+        }
+      }
+      // Re-index
+      process.stdout.write(`  Re-indexing...`);
+      try {
+        await buildIndex(folder, sourcesDir);
+        console.log(chalk.green(` ✓ index.md updated`));
+      } catch (err: unknown) {
+        console.log(chalk.red(` ✗ ${describeError(err)}`));
+      }
+    } finally {
+      processing = false;
+    }
+  }
+  function queueFile(filePath: string) {
+    // The same path may be reported several times within one debounce window
+    if (!pendingFiles.includes(filePath)) {
+      pendingFiles.push(filePath);
+    }
+    scheduleBatch();
+  }
+  const watcher = watch(folder, {
+    ignoreInitial: true,
+    ignored: [
+      "**/node_modules/**",
+      "**/.llm-kb/**",
+      "**/.git/**",
+    ],
+    depth: 10,
+  });
+  // Shared handler for "add" and "change": only PDFs are queued
+  const onPdfEvent = (label: string) => (filePath: string) => {
+    if (extname(filePath).toLowerCase() !== ".pdf") return;
+    console.log(chalk.dim(`\n  ${label}: ${basename(filePath)}`));
+    queueFile(filePath);
+  };
+  watcher.on("add", onPdfEvent("New file"));
+  watcher.on("change", onPdfEvent("Changed"));
+  return watcher;
+}

package/tsconfig.json ADDED Viewed

@@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ES2022",
+    "moduleResolution": "bundler",
+    "strict": true,
+    "esModuleInterop": true,
+    "outDir": "dist",
+    "rootDir": "src",
+    "declaration": true,
+    "sourceMap": true
+  },
+  "include": ["src"]
+}