@et0and/ovid 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/embed.ts ADDED
@@ -0,0 +1,143 @@
1
+ import { pipeline, type FeatureExtractionPipeline } from "@huggingface/transformers"
2
+ import type { Chunk } from "./tokenize.ts"
3
+
4
// Default local embedding model: a small, CPU-friendly sentence transformer.
export const DEFAULT_EMBEDDING_MODEL = "Xenova/all-MiniLM-L6-v2"

// Embedding dimension for all-MiniLM-L6-v2 (re-exported at the bottom of this file).
const EMBEDDING_DIM = 384
8
+
9
/** One embedded chunk: where it came from, the text itself, and its vector. */
export interface EmbedEntry {
  /** Path relative to root (same as Chunk.path) */
  path: string
  /** The chunked text that was embedded */
  text: string
  /** L2-normalised embedding vector (length EMBEDDING_DIM for the default model) */
  embedding: Float32Array
}
17
+
18
/** Tuning knobs for embedChunks. */
export interface EmbedOptions {
  /** Model id passed to the transformers pipeline (e.g. DEFAULT_EMBEDDING_MODEL). */
  model: string
  /** Number of chunks sent to the pipeline per call. */
  batchSize: number
  /** Number of batches awaited concurrently per round. */
  concurrency: number
}
23
+
24
+ let _pipe: FeatureExtractionPipeline | null = null
25
+
26
+ async function getEmbedPipeline(model: string): Promise<FeatureExtractionPipeline> {
27
+ if (_pipe === null) {
28
+ _pipe = (await pipeline("feature-extraction", model, {
29
+ // Use float32 for full precision on CPU
30
+ dtype: "fp32",
31
+ }) as unknown) as FeatureExtractionPipeline
32
+ }
33
+ return _pipe
34
+ }
35
+
36
+ /** Mean-pool a raw [seqLen × dim] tensor output into a single [dim] vector. */
37
+ function meanPool(data: Float32Array, seqLen: number, dim: number): Float32Array {
38
+ const pooled = new Float32Array(dim)
39
+ for (let t = 0; t < seqLen; t++) {
40
+ for (let d = 0; d < dim; d++) {
41
+ pooled[d] = (pooled[d] ?? 0) + (data[t * dim + d] ?? 0)
42
+ }
43
+ }
44
+ for (let d = 0; d < dim; d++) {
45
+ pooled[d]! /= seqLen
46
+ }
47
+ return pooled
48
+ }
49
+
50
+ /** L2-normalise a vector in place; returns the same array. */
51
+ function l2Normalise(v: Float32Array): Float32Array {
52
+ let norm = 0
53
+ for (let i = 0; i < v.length; i++) norm += v[i]! * v[i]!
54
+ norm = Math.sqrt(norm)
55
+ if (norm > 1e-12) {
56
+ for (let i = 0; i < v.length; i++) v[i]! /= norm
57
+ }
58
+ return v
59
+ }
60
+
61
+ async function embedBatch(
62
+ pipe: FeatureExtractionPipeline,
63
+ texts: string[]
64
+ ): Promise<Float32Array[]> {
65
+ // @huggingface/transformers returns a Tensor with shape [batch, seqLen, dim]
66
+ const output = await pipe(texts, { pooling: "mean", normalize: true })
67
+
68
+ // output.data is a flat Float32Array of shape [batch * dim]
69
+ const data = output.data as Float32Array
70
+ const batchSize = texts.length
71
+ const dim = data.length / batchSize
72
+
73
+ const results: Float32Array[] = []
74
+ for (let i = 0; i < batchSize; i++) {
75
+ // When pooling + normalize are handled by the pipeline we can slice directly
76
+ const vec = data.slice(i * dim, (i + 1) * dim) as Float32Array
77
+ results.push(vec)
78
+ }
79
+ return results
80
+ }
81
+
82
+ /**
83
+ * Embed all chunks using the local model, with batching + concurrency limits.
84
+ * Calls `onProgress(done, total)` after each batch completes.
85
+ */
86
+ export async function embedChunks(
87
+ chunks: Chunk[],
88
+ opts: EmbedOptions,
89
+ onProgress?: (done: number, total: number) => void
90
+ ): Promise<EmbedEntry[]> {
91
+ if (chunks.length === 0) return []
92
+
93
+ const pipe = await getEmbedPipeline(opts.model)
94
+
95
+ const batches: Chunk[][] = []
96
+ for (let i = 0; i < chunks.length; i += opts.batchSize) {
97
+ batches.push(chunks.slice(i, i + opts.batchSize))
98
+ }
99
+
100
+ const entries: EmbedEntry[] = new Array(chunks.length)
101
+ let chunkIndex = 0
102
+ let done = 0
103
+
104
+ for (let i = 0; i < batches.length; i += opts.concurrency) {
105
+ const concurrentBatches = batches.slice(i, i + opts.concurrency)
106
+ const startIndex = chunkIndex
107
+
108
+ const batchResults = await Promise.all(
109
+ concurrentBatches.map((batch) =>
110
+ embedBatch(pipe, batch.map((c) => c.text))
111
+ )
112
+ )
113
+
114
+ let offset = startIndex
115
+ for (let b = 0; b < concurrentBatches.length; b++) {
116
+ const batch = concurrentBatches[b]!
117
+ const embeddings = batchResults[b]!
118
+ for (let j = 0; j < batch.length; j++) {
119
+ const chunk = batch[j]!
120
+ entries[offset] = {
121
+ path: chunk.path,
122
+ text: chunk.text,
123
+ embedding: embeddings[j]!,
124
+ }
125
+ offset++
126
+ }
127
+ chunkIndex += batch.length
128
+ done += batch.length
129
+ onProgress?.(done, chunks.length)
130
+ }
131
+ }
132
+
133
+ return entries
134
+ }
135
+
136
+ /** Compute cosine distance between two L2-normalised vectors: 1 - dot(a,b) */
137
+ export function cosineDist(a: Float32Array, b: Float32Array): number {
138
+ let dot = 0
139
+ for (let i = 0; i < a.length; i++) dot += a[i]! * b[i]!
140
+ return 1 - dot
141
+ }
142
+
143
+ export { EMBEDDING_DIM }
package/src/fs.ts ADDED
@@ -0,0 +1,187 @@
1
+ import path from "node:path"
2
+ import fs from "node:fs"
3
+
4
// Directory names excluded from file discovery by default (dependency, VCS,
// build-output, and cache directories). These are matched as exact path
// segments by shouldExclude, not as globs.
export const DEFAULT_EXCLUDES = [
  "node_modules",
  ".git",
  "dist",
  "build",
  "target",
  ".next",
  ".nuxt",
  ".output",
  "__pycache__",
  ".cache",
  "coverage",
  ".turbo",
]
19
+
20
/** Options controlling file discovery and reading. */
export interface FsOptions {
  /** Files larger than this many bytes are skipped. */
  maxFileBytes: number
  /** Patterns excluded from discovery (see DEFAULT_EXCLUDES / shouldExclude). */
  excludePatterns: string[]
  /** How many files are read concurrently per round. */
  readConcurrency: number
  /** Hard cap on the number of files considered. */
  maxFiles: number
}
26
+
27
/** A discovered file with its decoded UTF-8 content. */
export interface FileEntry {
  /** Path relative to the root directory passed to the CLI */
  relativePath: string
  /** Full file content, decoded as UTF-8. */
  content: string
}
32
+
33
+ function shouldExclude(relPath: string, excludePatterns: string[]): boolean {
34
+ const parts = relPath.split("/")
35
+ return excludePatterns.some((pat) =>
36
+ parts.some((part) => part === pat || part.startsWith(pat + "/"))
37
+ )
38
+ }
39
+
40
+ /** List paths tracked in the git index, relative to `directory`. */
41
+ async function listGitTrackedPaths(directory: string): Promise<string[]> {
42
+ const result = await Bun.spawn(
43
+ ["git", "ls-files", "-z", "--full-name"],
44
+ {
45
+ cwd: directory,
46
+ stdout: "pipe",
47
+ stderr: "pipe",
48
+ }
49
+ )
50
+
51
+ const exitCode = await result.exited
52
+
53
+ if (exitCode !== 0) {
54
+ throw new Error("git ls-files failed")
55
+ }
56
+
57
+ const raw = await new Response(result.stdout).text()
58
+ return raw.split("\0").filter(Boolean)
59
+ }
60
+
61
+ /** Walk a directory non-recursively (top-level files only), like Python fallback. */
62
+ function listDirectoryFiles(directory: string): string[] {
63
+ const entries = fs.readdirSync(directory, { withFileTypes: true })
64
+ return entries
65
+ .filter((e) => e.isFile())
66
+ .map((e) => e.name)
67
+ }
68
+
69
+ async function readFile(
70
+ directory: string,
71
+ relativePath: string,
72
+ maxFileBytes: number
73
+ ): Promise<FileEntry | null> {
74
+ const absolutePath = path.join(directory, relativePath)
75
+
76
+ let stat: fs.Stats
77
+ try {
78
+ stat = fs.statSync(absolutePath)
79
+ } catch {
80
+ return null
81
+ }
82
+
83
+ // Skip directories (e.g. git submodules or symlinks-to-directories)
84
+ if (stat.isDirectory()) return null
85
+
86
+ // Skip very large files
87
+ if (stat.size > maxFileBytes) return null
88
+
89
+ const file = Bun.file(absolutePath)
90
+
91
+ let bytes: ArrayBuffer
92
+ try {
93
+ bytes = await file.arrayBuffer()
94
+ } catch {
95
+ return null
96
+ }
97
+
98
+ // Decode as UTF-8; skip binary files
99
+ let content: string
100
+ try {
101
+ content = new TextDecoder("utf-8", { fatal: true }).decode(bytes)
102
+ } catch {
103
+ return null
104
+ }
105
+
106
+ return { relativePath, content }
107
+ }
108
+
109
+ /** Resolve the git root for a given directory, or null if not in a git repo. */
110
+ async function resolveGitRoot(directory: string): Promise<string | null> {
111
+ const result = await Bun.spawn(
112
+ ["git", "rev-parse", "--show-toplevel"],
113
+ {
114
+ cwd: directory,
115
+ stdout: "pipe",
116
+ stderr: "pipe",
117
+ }
118
+ )
119
+ const exitCode = await result.exited
120
+ if (exitCode !== 0) return null
121
+ const out = await new Response(result.stdout).text()
122
+ return out.trim()
123
+ }
124
+
125
+ export async function discoverFiles(
126
+ directory: string,
127
+ opts: FsOptions,
128
+ onProgress?: (done: number, total: number) => void
129
+ ): Promise<FileEntry[]> {
130
+ const absDir = path.resolve(directory)
131
+
132
+ // --- Path discovery ---
133
+ let relativePaths: string[]
134
+
135
+ const gitRoot = await resolveGitRoot(absDir)
136
+
137
+ if (gitRoot !== null) {
138
+ // Get all tracked paths relative to git root, then filter to those under
139
+ // our target directory (handles monorepo sub-directory invocation).
140
+ const allTracked = await listGitTrackedPaths(gitRoot)
141
+ const relToGitRoot = path.relative(gitRoot, absDir)
142
+
143
+ relativePaths = allTracked
144
+ .map((p) => {
145
+ // Make path relative to our target directory
146
+ if (relToGitRoot === "") return p
147
+ if (p.startsWith(relToGitRoot + "/")) {
148
+ return p.slice(relToGitRoot.length + 1)
149
+ }
150
+ return null
151
+ })
152
+ .filter((p): p is string => p !== null)
153
+ } else {
154
+ // Non-git: only top-level files (matches Python fallback behaviour)
155
+ relativePaths = listDirectoryFiles(absDir)
156
+ }
157
+
158
+ // --- Apply exclude patterns ---
159
+ relativePaths = relativePaths.filter(
160
+ (p) => !shouldExclude(p, opts.excludePatterns)
161
+ )
162
+
163
+ // --- Enforce max-files cap ---
164
+ if (relativePaths.length > opts.maxFiles) {
165
+ relativePaths = relativePaths.slice(0, opts.maxFiles)
166
+ }
167
+
168
+ const total = relativePaths.length
169
+
170
+ // --- Read files with concurrency limit ---
171
+ const results: FileEntry[] = []
172
+ let done = 0
173
+
174
+ for (let i = 0; i < total; i += opts.readConcurrency) {
175
+ const batch = relativePaths.slice(i, i + opts.readConcurrency)
176
+ const entries = await Promise.all(
177
+ batch.map((p) => readFile(absDir, p, opts.maxFileBytes))
178
+ )
179
+ for (const entry of entries) {
180
+ if (entry !== null) results.push(entry)
181
+ }
182
+ done += batch.length
183
+ onProgress?.(done, total)
184
+ }
185
+
186
+ return results
187
+ }
package/src/labels.ts ADDED
@@ -0,0 +1,205 @@
1
+ /**
2
+ * Copilot-backed labelling provider.
3
+ *
4
+ * Uses the GitHub Copilot chat completions endpoint with JSON schema
5
+ * enforcement (response_format: json_schema) to produce structured labels,
6
+ * mirroring the Python `responses.parse(text_format=Labels)` calls.
7
+ */
8
+
9
+ import { z } from "zod"
10
+ import { getCopilotToken } from "./auth.ts"
11
+ import type { EmbedEntry } from "./embed.ts"
12
+
13
+ // ---------------------------------------------------------------------------
14
+ // Zod schemas (mirrors Python Pydantic models)
15
+ // ---------------------------------------------------------------------------
16
+
17
// Shape of a single label emitted by the model. The two "...Theme"/"...Feature"
// fields are chain-of-thought style scaffolding the schema forces the model
// to fill in before committing to the final `label`.
const LabelSchema = z.object({
  overarchingTheme: z.string(),
  distinguishingFeature: z.string(),
  label: z.string(),
})

// Top-level response wrapper: the list of labels.
const LabelsSchema = z.object({
  labels: z.array(LabelSchema),
})

export type Label = z.infer<typeof LabelSchema>
export type Labels = z.infer<typeof LabelsSchema>
29
+
30
+ // ---------------------------------------------------------------------------
31
+ // Copilot endpoint constants
32
+ // ---------------------------------------------------------------------------
33
// GitHub Copilot chat completions endpoint targeted by chatComplete below.
const COPILOT_COMPLETIONS_URL =
  "https://api.githubcopilot.com/chat/completions"
35
+
36
+ // ---------------------------------------------------------------------------
37
+ // Provider config
38
+ // ---------------------------------------------------------------------------
39
/** Configuration for the Copilot labelling provider. */
export interface CopilotConfig {
  /** Copilot model id sent with each completion request. */
  model: string
  /** Passed in from the authentication layer.
   *  Presumably invoked with a device-flow verification URL and user code
   *  when sign-in is required — confirm against getCopilotToken in ./auth.ts. */
  onVerification: (url: string, code: string) => void
}
44
+
45
+ // ---------------------------------------------------------------------------
46
+ // Core completion call
47
+ // ---------------------------------------------------------------------------
48
+
49
/** One chat message in the completions request payload. */
interface ChatMessage {
  role: "system" | "user" | "assistant"
  content: string
}
53
+
54
+ async function chatComplete(
55
+ config: CopilotConfig,
56
+ messages: ChatMessage[]
57
+ ): Promise<string> {
58
+ const token = await getCopilotToken(config.onVerification)
59
+
60
+ const body = {
61
+ model: config.model,
62
+ messages,
63
+ temperature: 0,
64
+ response_format: {
65
+ type: "json_schema",
66
+ json_schema: {
67
+ name: "Labels",
68
+ strict: true,
69
+ schema: {
70
+ type: "object",
71
+ properties: {
72
+ labels: {
73
+ type: "array",
74
+ items: {
75
+ type: "object",
76
+ properties: {
77
+ overarchingTheme: { type: "string" },
78
+ distinguishingFeature: { type: "string" },
79
+ label: { type: "string" },
80
+ },
81
+ required: ["overarchingTheme", "distinguishingFeature", "label"],
82
+ additionalProperties: false,
83
+ },
84
+ },
85
+ },
86
+ required: ["labels"],
87
+ additionalProperties: false,
88
+ },
89
+ },
90
+ },
91
+ }
92
+
93
+ let lastError: unknown
94
+ for (let attempt = 0; attempt < 3; attempt++) {
95
+ const resp = await fetch(COPILOT_COMPLETIONS_URL, {
96
+ method: "POST",
97
+ headers: {
98
+ Authorization: `Bearer ${token}`,
99
+ "Content-Type": "application/json",
100
+ "Editor-Version": "vscode/1.95.0",
101
+ "Editor-Plugin-Version": "copilot/1.246.0",
102
+ "User-Agent": "GitHubCopilotChat/0.22.4",
103
+ "Openai-Intent": "conversation-panel",
104
+ },
105
+ body: JSON.stringify(body),
106
+ })
107
+
108
+ if (!resp.ok) {
109
+ lastError = new Error(`Copilot API error: HTTP ${resp.status} ${resp.statusText}`)
110
+ // Retry on 5xx
111
+ if (resp.status >= 500) {
112
+ await Bun.sleep(1000 * (attempt + 1))
113
+ continue
114
+ }
115
+ throw lastError
116
+ }
117
+
118
+ const data = (await resp.json()) as {
119
+ choices: Array<{ message: { content: string } }>
120
+ }
121
+ const content = data.choices[0]?.message?.content
122
+ if (content === undefined) throw new Error("Empty Copilot response")
123
+ return content
124
+ }
125
+
126
+ throw lastError
127
+ }
128
+
129
+ // ---------------------------------------------------------------------------
130
+ // Parse + validate with fallback
131
+ // ---------------------------------------------------------------------------
132
+
133
+ function parseLabels(raw: string): Labels {
134
+ let parsed: unknown
135
+ try {
136
+ parsed = JSON.parse(raw)
137
+ } catch {
138
+ throw new Error(`Copilot returned non-JSON: ${raw.slice(0, 200)}`)
139
+ }
140
+ return LabelsSchema.parse(parsed)
141
+ }
142
+
143
+ // ---------------------------------------------------------------------------
144
+ // Public labelling API
145
+ // ---------------------------------------------------------------------------
146
+
147
+ /**
148
+ * Label individual files within a leaf cluster.
149
+ * Mirrors: `responses.parse(model, input="Label each file in 3 to 7 words...", text_format=Labels)`
150
+ */
151
+ export async function labelFiles(
152
+ config: CopilotConfig,
153
+ entries: EmbedEntry[]
154
+ ): Promise<Label[]> {
155
+ const renderedEntries = entries
156
+ .map((e) => `# File: ${e.path}\n\n${e.text}`)
157
+ .join("\n\n")
158
+
159
+ const prompt =
160
+ `Label each file in 3 to 7 words. Don't include file path/names in descriptions.\n\n` +
161
+ renderedEntries
162
+
163
+ const raw = await chatComplete(config, [
164
+ {
165
+ role: "system",
166
+ content:
167
+ "You label source code files and documents with brief, descriptive phrases. " +
168
+ "Respond only with valid JSON matching the provided schema.",
169
+ },
170
+ { role: "user", content: prompt },
171
+ ])
172
+
173
+ const result = parseLabels(raw)
174
+ return result.labels
175
+ }
176
+
177
+ /**
178
+ * Label clusters from their child tree labels.
179
+ * Mirrors: `responses.parse(model, input="Label each cluster in 2 words...", text_format=Labels)`
180
+ */
181
+ export async function labelClusters(
182
+ config: CopilotConfig,
183
+ clusterLabels: string[][]
184
+ ): Promise<Label[]> {
185
+ const rendered = clusterLabels
186
+ .map((labels) => `# Cluster\n\n${labels.join("\n")}`)
187
+ .join("\n\n")
188
+
189
+ const prompt =
190
+ `Label each cluster in 2 words. Don't include file path/names in labels.\n\n` +
191
+ rendered
192
+
193
+ const raw = await chatComplete(config, [
194
+ {
195
+ role: "system",
196
+ content:
197
+ "You label clusters of source code files with very short, descriptive phrases. " +
198
+ "Respond only with valid JSON matching the provided schema.",
199
+ },
200
+ { role: "user", content: prompt },
201
+ ])
202
+
203
+ const result = parseLabels(raw)
204
+ return result.labels
205
+ }