@et0and/ovid 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/main.ts ADDED
@@ -0,0 +1,239 @@
1
+ /**
2
+ * CLI entry point and pipeline orchestrator for semantic-navigator.
3
+ *
4
+ * Pipeline:
5
+ * 1. Parse CLI flags (commander)
6
+ * 2. Optionally handle --logout
7
+ * 3. Init UI
8
+ * 4. Discover + read files → progress: "reading"
9
+ * 5. Tokenise + chunk → (synchronous, fast)
10
+ * 6. Embed chunks → progress: "embedding"
11
+ * 7. Spectral cluster → progress: "clustering"
12
+ * 8. Build labelled tree → progress: "labelling"
13
+ * 9. Hand tree to UI
14
+ */
15
+
16
+ import { Command } from "commander"
17
+ import path from "node:path"
18
+
19
+ import { discoverFiles, DEFAULT_EXCLUDES, type FsOptions } from "./fs.ts"
20
+ import { chunkFile } from "./tokenize.ts"
21
+ import { embedChunks, DEFAULT_EMBEDDING_MODEL, type EmbedOptions, type EmbedEntry } from "./embed.ts"
22
+ import { splitCluster, type Cluster } from "./cluster.ts"
23
+ import { buildTree } from "./tree.ts"
24
+ import { clearAuthCache, getCopilotToken } from "./auth.ts"
25
+ import { SemanticNavigatorUI, type ProgressState } from "./ui.ts"
26
+ import type { CopilotConfig } from "./labels.ts"
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // CLI definition
30
+ // ---------------------------------------------------------------------------
31
+
32
// CLI surface. All numeric flags share the same `parseInt(v, 10)` coercion;
// --exclude-glob is variadic and defaults to DEFAULT_EXCLUDES from fs.ts.
// Option order here is the order shown in --help output.
const program = new Command()
  .name("semantic-navigator")
  .description("Browse a repository's files by semantic meaning")
  .argument("[directory]", "Directory to analyse (default: current working directory)", ".")
  // Model used only for labelling; embeddings use DEFAULT_EMBEDDING_MODEL.
  .option("--completion-model <model>", "Copilot model to use for labelling", "gpt-4o-mini")
  .option("--max-files <n>", "Maximum number of files to index", (v) => parseInt(v, 10), 2000)
  .option("--max-file-bytes <n>", "Skip files larger than this many bytes", (v) => parseInt(v, 10), 1_000_000)
  .option("--exclude-glob <pattern...>", "Glob patterns to exclude (repeatable)", DEFAULT_EXCLUDES)
  .option("--read-concurrency <n>", "Concurrent file reads", (v) => parseInt(v, 10), 64)
  .option("--embed-batch-size <n>", "Chunks per embedding batch", (v) => parseInt(v, 10), 32)
  .option("--embed-concurrency <n>", "Concurrent embedding batches", (v) => parseInt(v, 10), 2)
  .option("--logout", "Clear cached GitHub / Copilot credentials and exit")
  .helpOption("-h, --help", "Show help")
45
+
46
+ // ---------------------------------------------------------------------------
47
+ // Main
48
+ // ---------------------------------------------------------------------------
49
+
50
// Module-level handle so the top-level .catch() can tear down the TUI even
// when main() rejects; set as soon as the UI instance exists.
let _ui: SemanticNavigatorUI | undefined

/**
 * Run the full pipeline: parse flags, authenticate, discover/read files,
 * chunk, embed, cluster, label, and hand the finished tree to the UI.
 *
 * All fatal errors funnel through `fatal`, which destroys the TUI before
 * printing so the message lands on a usable terminal.
 */
async function main(): Promise<void> {
  program.parse(process.argv)

  // Typed view over commander's parsed options (flag names camelCased).
  const opts = program.opts<{
    completionModel: string
    maxFiles: number
    maxFileBytes: number
    excludeGlob: string[]
    readConcurrency: number
    embedBatchSize: number
    embedConcurrency: number
    logout: boolean | undefined
  }>()

  // --logout shortcut: no UI, no pipeline — clear credentials and exit.
  if (opts.logout) {
    clearAuthCache()
    console.log("Logged out: cached credentials removed.")
    process.exit(0)
  }

  const directory = path.resolve(program.args[0] ?? ".")

  // Init UI first so all subsequent progress is rendered in-terminal.
  const ui = new SemanticNavigatorUI()
  _ui = ui
  await ui.init()

  // Tear down the TUI, print, and exit. Typed `never` so control-flow
  // analysis knows the `files!` / `treeRaw!` assertions below are safe.
  const fatal = (msg: string): never => {
    ui.destroy()
    console.error(`\nError: ${msg}`)
    process.exit(1)
  }

  // Device-flow callback: rendered as a status line while the user completes
  // the browser flow. Reuses the "reading" phase since auth has none of its own.
  const onVerification = (url: string, code: string): void => {
    ui.updateProgress({
      phase: "reading",
      done: 0,
      total: 0,
      message: `Visit ${url} and enter code: ${code}`,
    })
  }

  // Eagerly authenticate so the token is warm before labelling needs it.
  // NOTE(review): copilotToken is never read after this assignment — the
  // warm-up (and whatever auth.ts caches) appears to be the point; confirm.
  let copilotToken: string
  try {
    copilotToken = await getCopilotToken(onVerification)
  } catch (err) {
    fatal(`Authentication failed: ${err instanceof Error ? err.message : String(err)}`)
  }

  const copilotConfig: CopilotConfig = {
    model: opts.completionModel,
    onVerification,
  }

  // Step 2: Discover + read files.
  ui.updateProgress({ phase: "reading", done: 0, total: 0 })

  const fsOpts: FsOptions = {
    maxFileBytes: opts.maxFileBytes,
    excludePatterns: opts.excludeGlob,
    readConcurrency: opts.readConcurrency,
    maxFiles: opts.maxFiles,
  }

  let files: Awaited<ReturnType<typeof discoverFiles>> | undefined
  try {
    files = await discoverFiles(directory, fsOpts, (done, total) => {
      ui.updateProgress({ phase: "reading", done, total })
    })
  } catch (err) {
    fatal(`File discovery failed: ${err instanceof Error ? err.message : String(err)}`)
  }
  // fatal() is typed as never, so execution only reaches here if try succeeded
  const resolvedFiles = files!

  if (resolvedFiles.length === 0) {
    fatal("No files found in the specified directory.")
  }

  // Step 3: Tokenise + chunk. Synchronous and fast, so chunking gets no phase
  // of its own — the "embedding" progress line is pre-seeded here instead.
  ui.updateProgress({ phase: "embedding", done: 0, total: resolvedFiles.length })

  // chunkFile returns null for empty/oversized-prefix files; filter those out.
  const chunks = resolvedFiles
    .map((f) => chunkFile(f.relativePath, f.content))
    .filter((c): c is NonNullable<typeof c> => c !== null)

  if (chunks.length === 0) {
    fatal("No embeddable chunks produced from the discovered files.")
  }

  // Step 4: Embed chunks.
  const embedOpts: EmbedOptions = {
    model: DEFAULT_EMBEDDING_MODEL,
    batchSize: opts.embedBatchSize,
    concurrency: opts.embedConcurrency,
  }

  let embedEntriesRaw: EmbedEntry[] | undefined
  try {
    embedEntriesRaw = await embedChunks(chunks, embedOpts, (done, total) => {
      ui.updateProgress({ phase: "embedding", done, total })
    })
  } catch (err) {
    fatal(`Embedding failed: ${err instanceof Error ? err.message : String(err)}`)
  }
  const embedEntries = embedEntriesRaw!

  if (embedEntries.length === 0) {
    fatal("Embedding produced no results.")
  }

  // Step 5: Spectral clustering. The actual splitting happens inside
  // buildTree (via labelNodes); here we only build the root cluster and
  // update the status line.
  ui.updateProgress({
    phase: "clustering",
    done: 0,
    total: embedEntries.length,
    message: `Clustering ${embedEntries.length} files…`,
  })

  const rootCluster: Cluster = { entries: embedEntries }

  // splitCluster is synchronous; yield to the event loop first so the UI has
  // a chance to render the "Clustering…" status line.
  await Bun.sleep(0)

  // Step 6: Build labelled tree.
  ui.updateProgress({
    phase: "labelling",
    done: 0,
    total: 0,
    message: "Labelling…",
  })

  // `directory` is already absolute; the inner resolve is a harmless no-op.
  const rootLabel = path.basename(path.resolve(directory))

  let treeRaw: Awaited<ReturnType<typeof buildTree>> | undefined
  try {
    treeRaw = await buildTree(copilotConfig, rootLabel, rootCluster, (msg) => {
      ui.updateProgress({ phase: "labelling", done: 0, total: 0, message: msg })
    })
  } catch (err) {
    fatal(`Labelling failed: ${err instanceof Error ? err.message : String(err)}`)
  }
  const tree = treeRaw!

  // Step 7: Hand the tree to the UI. Its event loop keeps the process alive
  // until the user presses q/Esc.
  ui.setTree(tree)
}
234
+
235
+ main().catch((err) => {
236
+ _ui?.destroy()
237
+ console.error("Unexpected error:", err)
238
+ process.exit(1)
239
+ })
package/src/tokenize.ts ADDED
@@ -0,0 +1,76 @@
1
+ import { get_encoding, type Tiktoken } from "tiktoken"
2
+
3
// For all-MiniLM-L6-v2 the real token limit is 256 word-piece tokens, but
// we chunk by *cl100k* tokens here to stay consistent with the Python port
// which chunked by the OpenAI embedding model's tokenizer. The actual BERT
// tokenizer produces more tokens per word, so this is a conservative upper
// bound that still keeps chunks well within the model's real limit.
// NOTE(review): 8192 cl100k tokens is far above a 256 word-piece window; if
// the embedder really is all-MiniLM-L6-v2, content beyond its window is
// presumably truncated downstream — confirm against embed.ts.
export const MAX_TOKENS_PER_EMBED = 8192

// Token budget per embedding batch (mirrors Python). Note this is a token
// count, not a chunk count; batchChunks() derives chunks-per-batch from it.
export const MAX_TOKENS_PER_BATCH_EMBED = 300_000

// Re-exported alias so callers need not import tiktoken's type directly.
export type TokenEncoding = Tiktoken

// Lazily-created singleton encoder; see getTokenEncoding().
let _encoding: Tiktoken | null = null
16
+
17
+ export function getTokenEncoding(): Tiktoken {
18
+ if (_encoding === null) {
19
+ // cl100k_base is the closest widely-available encoding; used as a
20
+ // conservative proxy for chunking any text content.
21
+ _encoding = get_encoding("cl100k_base")
22
+ }
23
+ return _encoding
24
+ }
25
+
26
/** A single embeddable unit: one file's (truncated) content plus its path. */
export interface Chunk {
  /** Path of the originating file (relative to root) */
  path: string
  /**
   * The text that will be embedded — includes a `path:\n\n` prefix followed
   * by (up to) the first MAX_TOKENS_PER_EMBED tokens of the file content.
   */
  text: string
}
35
+
36
+ /**
37
+ * Split file content into at most one chunk (mirrors Python's [:1] slice).
38
+ * Returns null for empty files.
39
+ */
40
+ export function chunkFile(path: string, content: string): Chunk | null {
41
+ const enc = getTokenEncoding()
42
+
43
+ const prefix = `${path}:\n\n`
44
+ const prefixTokens = enc.encode(prefix)
45
+ const contentTokens = enc.encode(content)
46
+
47
+ const maxContentTokens = MAX_TOKENS_PER_EMBED - prefixTokens.length
48
+ if (maxContentTokens <= 0) return null
49
+
50
+ const trimmedContentTokens = contentTokens.slice(0, maxContentTokens)
51
+
52
+ // Decode back to text
53
+ const prefixBuf = enc.decode(prefixTokens)
54
+ const contentBuf = enc.decode(trimmedContentTokens)
55
+
56
+ const text =
57
+ new TextDecoder().decode(prefixBuf) +
58
+ new TextDecoder().decode(contentBuf)
59
+
60
+ if (trimmedContentTokens.length === 0) return null
61
+
62
+ return { path, text }
63
+ }
64
+
65
+ /**
66
+ * Partition an array of chunks into batches that stay within the
67
+ * MAX_TOKENS_PER_BATCH_EMBED token budget.
68
+ */
69
+ export function batchChunks(chunks: Chunk[]): Chunk[][] {
70
+ const maxPerBatch = Math.floor(MAX_TOKENS_PER_BATCH_EMBED / MAX_TOKENS_PER_EMBED)
71
+ const batches: Chunk[][] = []
72
+ for (let i = 0; i < chunks.length; i += maxPerBatch) {
73
+ batches.push(chunks.slice(i, i + maxPerBatch))
74
+ }
75
+ return batches
76
+ }
package/src/tree.ts ADDED
@@ -0,0 +1,176 @@
1
+ /**
2
+ * Tree data structure and labelling pipeline.
3
+ * Direct port of the Python Tree/to_pattern/label_nodes/tree functions.
4
+ */
5
+
6
+ import { splitCluster, type Cluster } from "./cluster.ts"
7
+ import { labelFiles, labelClusters, type CopilotConfig } from "./labels.ts"
8
+
9
+ // ---------------------------------------------------------------------------
10
+ // Tree type
11
+ // ---------------------------------------------------------------------------
12
+
13
/**
 * A node in the labelled semantic tree. Leaves carry exactly one file;
 * internal nodes aggregate every file beneath them.
 */
export interface Tree {
  /** Display label (e.g. "src/components/*.tsx: UI Components") */
  label: string
  /** All leaf file paths beneath this node */
  files: string[]
  /** Child nodes (empty for leaves) */
  children: Tree[]
}
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Pattern helper (port of Python to_pattern)
24
+ // ---------------------------------------------------------------------------
25
+
26
+ /**
27
+ * Given a list of file paths, return a compact pattern string like
28
+ * `"src/components/*.tsx: "` when all files share a common prefix/suffix,
29
+ * or `""` if there is no meaningful shared pattern.
30
+ */
31
+ export function toPattern(files: string[]): string {
32
+ if (files.length === 0) return ""
33
+
34
+ // Longest common prefix
35
+ let prefix = files[0]!
36
+ for (const f of files) {
37
+ while (!f.startsWith(prefix)) prefix = prefix.slice(0, -1)
38
+ if (prefix === "") break
39
+ }
40
+
41
+ // Longest common suffix (reverse trick from Python)
42
+ const reversed = files.map((f) => f.slice(prefix.length).split("").reverse().join(""))
43
+ let suffix = reversed[0]!
44
+ for (const r of reversed) {
45
+ while (!r.startsWith(suffix)) suffix = suffix.slice(0, -1)
46
+ if (suffix === "") break
47
+ }
48
+ // Re-reverse
49
+ suffix = suffix.split("").reverse().join("")
50
+
51
+ const middles = files.map((f) => {
52
+ const core = f.slice(prefix.length)
53
+ return suffix.length > 0 ? core.slice(0, core.length - suffix.length) : core
54
+ })
55
+ const hasStar = middles.some((m) => m.length > 0)
56
+ const star = hasStar ? "*" : ""
57
+
58
+ if (prefix) {
59
+ if (suffix) return `${prefix}${star}${suffix}: `
60
+ return `${prefix}${star}: `
61
+ } else {
62
+ if (suffix) return `${star}${suffix}: `
63
+ return ""
64
+ }
65
+ }
66
+
67
+ // ---------------------------------------------------------------------------
68
+ // Collect all file paths from a list of Tree nodes
69
+ // ---------------------------------------------------------------------------
70
+
71
+ export function collectFiles(trees: Tree[]): string[] {
72
+ return trees.flatMap((t) => t.files)
73
+ }
74
+
75
+ // ---------------------------------------------------------------------------
76
+ // Recursive labelling pipeline (port of label_nodes + tree)
77
+ // ---------------------------------------------------------------------------
78
+
79
/**
 * Recursively label a Cluster, returning a list of Tree nodes.
 *
 * Base case: splitCluster() returns a single child, i.e. the cluster cannot
 * be subdivided further — each file in it becomes a leaf, labelled
 * individually. Otherwise every child cluster is recursed into in parallel
 * and the resulting groups are labelled as clusters.
 *
 * Both label calls are padded with "unlabelled" placeholders when the model
 * returns fewer labels than requested, so the index accesses below never
 * read out of bounds.
 *
 * @param config Copilot config
 * @param cluster Current cluster to process
 * @param depth Recursion depth (0 = root level; only the root emits status)
 * @param onStatus Optional callback for status messages
 */
export async function labelNodes(
  config: CopilotConfig,
  cluster: Cluster,
  depth: number,
  onStatus?: (msg: string) => void
): Promise<Tree[]> {
  const children = splitCluster(cluster)

  if (children.length === 1) {
    // Leaf cluster: label each file individually
    const entries = cluster.entries
    let labels = await labelFiles(config, entries)

    // Guard: align label count with entry count (Copilot may return fewer)
    if (labels.length < entries.length) {
      const missing = entries.length - labels.length
      labels = [
        ...labels,
        ...Array.from({ length: missing }, () => ({
          overarchingTheme: "",
          distinguishingFeature: "",
          label: "unlabelled",
        })),
      ]
    }

    // One leaf Tree per file: "path: label", no children.
    return entries.map((entry, i) => ({
      label: `${entry.path}: ${labels[i]!.label}`,
      files: [entry.path],
      children: [],
    }))
  }

  // Internal node: recurse into each child cluster, then label the clusters
  const childTreeLists = await Promise.all(
    children.map((child) => labelNodes(config, child, depth + 1, onStatus))
  )

  // Only the root-level call reports progress; deeper recursion stays quiet.
  if (depth === 0) {
    onStatus?.(`Labelling ${children.length} clusters…`)
  }

  // Build input to cluster labeller: for each child, pass its leaf labels
  const clusterLabelInputs = childTreeLists.map((trees) =>
    trees.map((t) => t.label)
  )

  let clusterLabels = await labelClusters(config, clusterLabelInputs)

  // Guard: align label count (same padding as the leaf case above)
  if (clusterLabels.length < children.length) {
    const missing = children.length - clusterLabels.length
    clusterLabels = [
      ...clusterLabels,
      ...Array.from({ length: missing }, () => ({
        overarchingTheme: "",
        distinguishingFeature: "",
        label: "unlabelled",
      })),
    ]
  }

  // One internal Tree per child cluster, labelled "<path pattern><label>".
  return clusterLabels.map((clusterLabel, i) => {
    const trees = childTreeLists[i]!
    const files = collectFiles(trees)
    const pattern = toPattern(files)
    return {
      label: `${pattern}${clusterLabel.label}`,
      files,
      children: trees,
    }
  })
}
160
+
161
+ /**
162
+ * Build the root Tree for a directory.
163
+ */
164
+ export async function buildTree(
165
+ config: CopilotConfig,
166
+ rootLabel: string,
167
+ cluster: Cluster,
168
+ onStatus?: (msg: string) => void
169
+ ): Promise<Tree> {
170
+ const children = await labelNodes(config, cluster, 0, onStatus)
171
+ return {
172
+ label: rootLabel,
173
+ files: collectFiles(children),
174
+ children,
175
+ }
176
+ }