@et0and/ovid 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/embed.ts ADDED
@@ -0,0 +1,143 @@
1
+ import { pipeline, type FeatureExtractionPipeline } from "@huggingface/transformers"
2
+ import type { Chunk } from "./tokenize.ts"
3
+
4
// Default local embedding model: a small, CPU-friendly sentence transformer.
export const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2"

// Embedding dimension for all-MiniLM-L6-v2 (re-exported at the bottom of this file).
const EMBEDDING_DIM = 384
8
+
9
/** One embedded chunk: where it came from, the text itself, and its vector. */
export interface EmbedEntry {
  /** Path relative to root (same as Chunk.path) */
  path: string
  /** The chunked text that was embedded */
  text: string
  /** L2-normalised embedding vector (length EMBEDDING_DIM for the default model) */
  embedding: Float32Array
}
17
+
18
/** Tuning knobs for embedChunks. */
export interface EmbedOptions {
  /** Model id passed to the transformers pipeline (e.g. DEFAULT_EMBEDDING_MODEL). */
  model: string
  /** Number of chunks sent to the pipeline per call. */
  batchSize: number
  /** Number of batches awaited concurrently per round. */
  concurrency: number
}
23
+
24
+ let _pipe: FeatureExtractionPipeline | null = null
25
+
26
+ async function getEmbedPipeline(model: string): Promise<FeatureExtractionPipeline> {
27
+ if (_pipe === null) {
28
+ _pipe = (await pipeline("feature-extraction", model, {
29
+ // Use float32 for full precision on CPU
30
+ dtype: "fp32",
31
+ }) as unknown) as FeatureExtractionPipeline
32
+ }
33
+ return _pipe
34
+ }
35
+
36
+ /** Mean-pool a raw [seqLen × dim] tensor output into a single [dim] vector. */
37
+ function meanPool(data: Float32Array, seqLen: number, dim: number): Float32Array {
38
+ const pooled = new Float32Array(dim)
39
+ for (let t = 0; t < seqLen; t++) {
40
+ for (let d = 0; d < dim; d++) {
41
+ pooled[d] = (pooled[d] ?? 0) + (data[t * dim + d] ?? 0)
42
+ }
43
+ }
44
+ for (let d = 0; d < dim; d++) {
45
+ pooled[d]! /= seqLen
46
+ }
47
+ return pooled
48
+ }
49
+
50
+ /** L2-normalise a vector in place; returns the same array. */
51
+ function l2Normalise(v: Float32Array): Float32Array {
52
+ let norm = 0
53
+ for (let i = 0; i < v.length; i++) norm += v[i]! * v[i]!
54
+ norm = Math.sqrt(norm)
55
+ if (norm > 1e-12) {
56
+ for (let i = 0; i < v.length; i++) v[i]! /= norm
57
+ }
58
+ return v
59
+ }
60
+
61
+ async function embedBatch(
62
+ pipe: FeatureExtractionPipeline,
63
+ texts: string[]
64
+ ): Promise<Float32Array[]> {
65
+ // @huggingface/transformers returns a Tensor with shape [batch, seqLen, dim]
66
+ const output = await pipe(texts, { pooling: "mean", normalize: true })
67
+
68
+ // output.data is a flat Float32Array of shape [batch * dim]
69
+ const data = output.data as Float32Array
70
+ const batchSize = texts.length
71
+ const dim = data.length / batchSize
72
+
73
+ const results: Float32Array[] = []
74
+ for (let i = 0; i < batchSize; i++) {
75
+ // When pooling + normalize are handled by the pipeline we can slice directly
76
+ const vec = data.slice(i * dim, (i + 1) * dim) as Float32Array
77
+ results.push(vec)
78
+ }
79
+ return results
80
+ }
81
+
82
+ /**
83
+ * Embed all chunks using the local model, with batching + concurrency limits.
84
+ * Calls `onProgress(done, total)` after each batch completes.
85
+ */
86
+ export async function embedChunks(
87
+ chunks: Chunk[],
88
+ opts: EmbedOptions,
89
+ onProgress?: (done: number, total: number) => void
90
+ ): Promise<EmbedEntry[]> {
91
+ if (chunks.length === 0) return []
92
+
93
+ const pipe = await getEmbedPipeline(opts.model)
94
+
95
+ const batches: Chunk[][] = []
96
+ for (let i = 0; i < chunks.length; i += opts.batchSize) {
97
+ batches.push(chunks.slice(i, i + opts.batchSize))
98
+ }
99
+
100
+ const entries: EmbedEntry[] = new Array(chunks.length)
101
+ let chunkIndex = 0
102
+ let done = 0
103
+
104
+ for (let i = 0; i < batches.length; i += opts.concurrency) {
105
+ const concurrentBatches = batches.slice(i, i + opts.concurrency)
106
+ const startIndex = chunkIndex
107
+
108
+ const batchResults = await Promise.all(
109
+ concurrentBatches.map((batch) =>
110
+ embedBatch(pipe, batch.map((c) => c.text))
111
+ )
112
+ )
113
+
114
+ let offset = startIndex
115
+ for (let b = 0; b < concurrentBatches.length; b++) {
116
+ const batch = concurrentBatches[b]!
117
+ const embeddings = batchResults[b]!
118
+ for (let j = 0; j < batch.length; j++) {
119
+ const chunk = batch[j]!
120
+ entries[offset] = {
121
+ path: chunk.path,
122
+ text: chunk.text,
123
+ embedding: embeddings[j]!,
124
+ }
125
+ offset++
126
+ }
127
+ chunkIndex += batch.length
128
+ done += batch.length
129
+ onProgress?.(done, chunks.length)
130
+ }
131
+ }
132
+
133
+ return entries
134
+ }
135
+
136
+ /** Compute cosine distance between two L2-normalised vectors: 1 - dot(a,b) */
137
+ export function cosineDist(a: Float32Array, b: Float32Array): number {
138
+ let dot = 0
139
+ for (let i = 0; i < a.length; i++) dot += a[i]! * b[i]!
140
+ return 1 - dot
141
+ }
142
+
143
+ export { EMBEDDING_DIM }
package/src/fs.ts ADDED
@@ -0,0 +1,187 @@
1
+ import path from "node:path"
2
+ import fs from "node:fs"
3
+
4
// Directory names excluded from file discovery by default (dependency, VCS,
// build-output, and cache directories). These are matched as exact path
// segments by shouldExclude, not as globs.
export const DEFAULT_EXCLUDES = [
  "node_modules",
  ".git",
  "dist",
  "build",
  "target",
  ".next",
  ".nuxt",
  ".output",
  "__pycache__",
  ".cache",
  "coverage",
  ".turbo",
]
19
+
20
/** Options controlling file discovery and reading. */
export interface FsOptions {
  /** Files larger than this many bytes are skipped. */
  maxFileBytes: number
  /** Patterns excluded from discovery (see DEFAULT_EXCLUDES / shouldExclude). */
  excludePatterns: string[]
  /** How many files are read concurrently per round. */
  readConcurrency: number
  /** Hard cap on the number of files considered. */
  maxFiles: number
}
26
+
27
/** A discovered file with its decoded UTF-8 content. */
export interface FileEntry {
  /** Path relative to the root directory passed to the CLI */
  relativePath: string
  /** Full file content, decoded as UTF-8. */
  content: string
}
32
+
33
+ function shouldExclude(relPath: string, excludePatterns: string[]): boolean {
34
+ const parts = relPath.split("/")
35
+ return excludePatterns.some((pat) =>
36
+ parts.some((part) => part === pat || part.startsWith(pat + "/"))
37
+ )
38
+ }
39
+
40
+ /** List paths tracked in the git index, relative to `directory`. */
41
+ async function listGitTrackedPaths(directory: string): Promise<string[]> {
42
+ const result = await Bun.spawn(
43
+ ["git", "ls-files", "-z", "--full-name"],
44
+ {
45
+ cwd: directory,
46
+ stdout: "pipe",
47
+ stderr: "pipe",
48
+ }
49
+ )
50
+
51
+ const exitCode = await result.exited
52
+
53
+ if (exitCode !== 0) {
54
+ throw new Error("git ls-files failed")
55
+ }
56
+
57
+ const raw = await new Response(result.stdout).text()
58
+ return raw.split("\0").filter(Boolean)
59
+ }
60
+
61
+ /** Walk a directory non-recursively (top-level files only), like Python fallback. */
62
+ function listDirectoryFiles(directory: string): string[] {
63
+ const entries = fs.readdirSync(directory, { withFileTypes: true })
64
+ return entries
65
+ .filter((e) => e.isFile())
66
+ .map((e) => e.name)
67
+ }
68
+
69
+ async function readFile(
70
+ directory: string,
71
+ relativePath: string,
72
+ maxFileBytes: number
73
+ ): Promise<FileEntry | null> {
74
+ const absolutePath = path.join(directory, relativePath)
75
+
76
+ let stat: fs.Stats
77
+ try {
78
+ stat = fs.statSync(absolutePath)
79
+ } catch {
80
+ return null
81
+ }
82
+
83
+ // Skip directories (e.g. git submodules or symlinks-to-directories)
84
+ if (stat.isDirectory()) return null
85
+
86
+ // Skip very large files
87
+ if (stat.size > maxFileBytes) return null
88
+
89
+ const file = Bun.file(absolutePath)
90
+
91
+ let bytes: ArrayBuffer
92
+ try {
93
+ bytes = await file.arrayBuffer()
94
+ } catch {
95
+ return null
96
+ }
97
+
98
+ // Decode as UTF-8; skip binary files
99
+ let content: string
100
+ try {
101
+ content = new TextDecoder("utf-8", { fatal: true }).decode(bytes)
102
+ } catch {
103
+ return null
104
+ }
105
+
106
+ return { relativePath, content }
107
+ }
108
+
109
+ /** Resolve the git root for a given directory, or null if not in a git repo. */
110
+ async function resolveGitRoot(directory: string): Promise<string | null> {
111
+ const result = await Bun.spawn(
112
+ ["git", "rev-parse", "--show-toplevel"],
113
+ {
114
+ cwd: directory,
115
+ stdout: "pipe",
116
+ stderr: "pipe",
117
+ }
118
+ )
119
+ const exitCode = await result.exited
120
+ if (exitCode !== 0) return null
121
+ const out = await new Response(result.stdout).text()
122
+ return out.trim()
123
+ }
124
+
125
+ export async function discoverFiles(
126
+ directory: string,
127
+ opts: FsOptions,
128
+ onProgress?: (done: number, total: number) => void
129
+ ): Promise<FileEntry[]> {
130
+ const absDir = path.resolve(directory)
131
+
132
+ // --- Path discovery ---
133
+ let relativePaths: string[]
134
+
135
+ const gitRoot = await resolveGitRoot(absDir)
136
+
137
+ if (gitRoot !== null) {
138
+ // Get all tracked paths relative to git root, then filter to those under
139
+ // our target directory (handles monorepo sub-directory invocation).
140
+ const allTracked = await listGitTrackedPaths(gitRoot)
141
+ const relToGitRoot = path.relative(gitRoot, absDir)
142
+
143
+ relativePaths = allTracked
144
+ .map((p) => {
145
+ // Make path relative to our target directory
146
+ if (relToGitRoot === "") return p
147
+ if (p.startsWith(relToGitRoot + "/")) {
148
+ return p.slice(relToGitRoot.length + 1)
149
+ }
150
+ return null
151
+ })
152
+ .filter((p): p is string => p !== null)
153
+ } else {
154
+ // Non-git: only top-level files (matches Python fallback behaviour)
155
+ relativePaths = listDirectoryFiles(absDir)
156
+ }
157
+
158
+ // --- Apply exclude patterns ---
159
+ relativePaths = relativePaths.filter(
160
+ (p) => !shouldExclude(p, opts.excludePatterns)
161
+ )
162
+
163
+ // --- Enforce max-files cap ---
164
+ if (relativePaths.length > opts.maxFiles) {
165
+ relativePaths = relativePaths.slice(0, opts.maxFiles)
166
+ }
167
+
168
+ const total = relativePaths.length
169
+
170
+ // --- Read files with concurrency limit ---
171
+ const results: FileEntry[] = []
172
+ let done = 0
173
+
174
+ for (let i = 0; i < total; i += opts.readConcurrency) {
175
+ const batch = relativePaths.slice(i, i + opts.readConcurrency)
176
+ const entries = await Promise.all(
177
+ batch.map((p) => readFile(absDir, p, opts.maxFileBytes))
178
+ )
179
+ for (const entry of entries) {
180
+ if (entry !== null) results.push(entry)
181
+ }
182
+ done += batch.length
183
+ onProgress?.(done, total)
184
+ }
185
+
186
+ return results
187
+ }
package/src/labels.ts ADDED
@@ -0,0 +1,205 @@
1
+ /**
2
+ * Copilot-backed labelling provider.
3
+ *
4
+ * Uses the GitHub Copilot chat completions endpoint with JSON schema
5
+ * enforcement (response_format: json_schema) to produce structured labels,
6
+ * mirroring the Python `responses.parse(text_format=Labels)` calls.
7
+ */
8
+
9
+ import { z } from "zod"
10
+ import { getCopilotToken } from "./auth.ts"
11
+ import type { EmbedEntry } from "./embed.ts"
12
+
13
+ // ---------------------------------------------------------------------------
14
+ // Zod schemas (mirrors Python Pydantic models)
15
+ // ---------------------------------------------------------------------------
16
+
17
// Shape of a single label emitted by the model. The two "...Theme"/"...Feature"
// fields are chain-of-thought style scaffolding the schema forces the model
// to fill in before committing to the final `label`.
const LabelSchema = z.object({
  overarchingTheme: z.string(),
  distinguishingFeature: z.string(),
  label: z.string(),
})

// Top-level response wrapper: the list of labels.
const LabelsSchema = z.object({
  labels: z.array(LabelSchema),
})

export type Label = z.infer<typeof LabelSchema>
export type Labels = z.infer<typeof LabelsSchema>
29
+
30
+ // ---------------------------------------------------------------------------
31
+ // Copilot endpoint constants
32
+ // ---------------------------------------------------------------------------
33
// GitHub Copilot chat completions endpoint targeted by chatComplete below.
const COPILOT_COMPLETIONS_URL =
  "https://api.githubcopilot.com/chat/completions"
35
+
36
+ // ---------------------------------------------------------------------------
37
+ // Provider config
38
+ // ---------------------------------------------------------------------------
39
/** Configuration for the Copilot labelling provider. */
export interface CopilotConfig {
  /** Copilot model id sent with each completion request. */
  model: string
  /** Passed in from the authentication layer.
   *  Presumably invoked with a device-flow verification URL and user code
   *  when sign-in is required — confirm against getCopilotToken in ./auth.ts. */
  onVerification: (url: string, code: string) => void
}
44
+
45
+ // ---------------------------------------------------------------------------
46
+ // Core completion call
47
+ // ---------------------------------------------------------------------------
48
+
49
/** One chat message in the completions request payload. */
interface ChatMessage {
  role: "system" | "user" | "assistant"
  content: string
}
53
+
54
+ async function chatComplete(
55
+ config: CopilotConfig,
56
+ messages: ChatMessage[]
57
+ ): Promise<string> {
58
+ const token = await getCopilotToken(config.onVerification)
59
+
60
+ const body = {
61
+ model: config.model,
62
+ messages,
63
+ temperature: 0,
64
+ response_format: {
65
+ type: "json_schema",
66
+ json_schema: {
67
+ name: "Labels",
68
+ strict: true,
69
+ schema: {
70
+ type: "object",
71
+ properties: {
72
+ labels: {
73
+ type: "array",
74
+ items: {
75
+ type: "object",
76
+ properties: {
77
+ overarchingTheme: { type: "string" },
78
+ distinguishingFeature: { type: "string" },
79
+ label: { type: "string" },
80
+ },
81
+ required: ["overarchingTheme", "distinguishingFeature", "label"],
82
+ additionalProperties: false,
83
+ },
84
+ },
85
+ },
86
+ required: ["labels"],
87
+ additionalProperties: false,
88
+ },
89
+ },
90
+ },
91
+ }
92
+
93
+ let lastError: unknown
94
+ for (let attempt = 0; attempt < 3; attempt++) {
95
+ const resp = await fetch(COPILOT_COMPLETIONS_URL, {
96
+ method: "POST",
97
+ headers: {
98
+ Authorization: `Bearer ${token}`,
99
+ "Content-Type": "application/json",
100
+ "Editor-Version": "vscode/1.95.0",
101
+ "Editor-Plugin-Version": "copilot/1.246.0",
102
+ "User-Agent": "GitHubCopilotChat/0.22.4",
103
+ "Openai-Intent": "conversation-panel",
104
+ },
105
+ body: JSON.stringify(body),
106
+ })
107
+
108
+ if (!resp.ok) {
109
+ lastError = new Error(`Copilot API error: HTTP ${resp.status} ${resp.statusText}`)
110
+ // Retry on 5xx
111
+ if (resp.status >= 500) {
112
+ await Bun.sleep(1000 * (attempt + 1))
113
+ continue
114
+ }
115
+ throw lastError
116
+ }
117
+
118
+ const data = (await resp.json()) as {
119
+ choices: Array<{ message: { content: string } }>
120
+ }
121
+ const content = data.choices[0]?.message?.content
122
+ if (content === undefined) throw new Error("Empty Copilot response")
123
+ return content
124
+ }
125
+
126
+ throw lastError
127
+ }
128
+
129
+ // ---------------------------------------------------------------------------
130
+ // Parse + validate with fallback
131
+ // ---------------------------------------------------------------------------
132
+
133
+ function parseLabels(raw: string): Labels {
134
+ let parsed: unknown
135
+ try {
136
+ parsed = JSON.parse(raw)
137
+ } catch {
138
+ throw new Error(`Copilot returned non-JSON: ${raw.slice(0, 200)}`)
139
+ }
140
+ return LabelsSchema.parse(parsed)
141
+ }
142
+
143
+ // ---------------------------------------------------------------------------
144
+ // Public labelling API
145
+ // ---------------------------------------------------------------------------
146
+
147
+ /**
148
+ * Label individual files within a leaf cluster.
149
+ * Mirrors: `responses.parse(model, input="Label each file in 3 to 7 words...", text_format=Labels)`
150
+ */
151
+ export async function labelFiles(
152
+ config: CopilotConfig,
153
+ entries: EmbedEntry[]
154
+ ): Promise<Label[]> {
155
+ const renderedEntries = entries
156
+ .map((e) => `# File: ${e.path}\n\n${e.text}`)
157
+ .join("\n\n")
158
+
159
+ const prompt =
160
+ `Label each file in 3 to 7 words. Don't include file path/names in descriptions.\n\n` +
161
+ renderedEntries
162
+
163
+ const raw = await chatComplete(config, [
164
+ {
165
+ role: "system",
166
+ content:
167
+ "You label source code files and documents with brief, descriptive phrases. " +
168
+ "Respond only with valid JSON matching the provided schema.",
169
+ },
170
+ { role: "user", content: prompt },
171
+ ])
172
+
173
+ const result = parseLabels(raw)
174
+ return result.labels
175
+ }
176
+
177
+ /**
178
+ * Label clusters from their child tree labels.
179
+ * Mirrors: `responses.parse(model, input="Label each cluster in 2 words...", text_format=Labels)`
180
+ */
181
+ export async function labelClusters(
182
+ config: CopilotConfig,
183
+ clusterLabels: string[][]
184
+ ): Promise<Label[]> {
185
+ const rendered = clusterLabels
186
+ .map((labels) => `# Cluster\n\n${labels.join("\n")}`)
187
+ .join("\n\n")
188
+
189
+ const prompt =
190
+ `Label each cluster in 2 words. Don't include file path/names in labels.\n\n` +
191
+ rendered
192
+
193
+ const raw = await chatComplete(config, [
194
+ {
195
+ role: "system",
196
+ content:
197
+ "You label clusters of source code files with very short, descriptive phrases. " +
198
+ "Respond only with valid JSON matching the provided schema.",
199
+ },
200
+ { role: "user", content: prompt },
201
+ ])
202
+
203
+ const result = parseLabels(raw)
204
+ return result.labels
205
+ }