@et0and/ovid 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/main.ts ADDED
@@ -0,0 +1,239 @@
1
+ /**
2
+ * CLI entry point and pipeline orchestrator for semantic-navigator.
3
+ *
4
+ * Pipeline:
5
+ * 1. Parse CLI flags (commander)
6
+ * 2. Optionally handle --logout
7
+ * 3. Init UI
8
+ * 4. Discover + read files → progress: "reading"
9
+ * 5. Tokenise + chunk → (synchronous, fast)
10
+ * 6. Embed chunks → progress: "embedding"
11
+ * 7. Spectral cluster → progress: "clustering"
12
+ * 8. Build labelled tree → progress: "labelling"
13
+ * 9. Hand tree to UI
14
+ */
15
+
16
+ import { Command } from "commander"
17
+ import path from "node:path"
18
+
19
+ import { discoverFiles, DEFAULT_EXCLUDES, type FsOptions } from "./fs.ts"
20
+ import { chunkFile } from "./tokenize.ts"
21
+ import { embedChunks, DEFAULT_EMBEDDING_MODEL, type EmbedOptions, type EmbedEntry } from "./embed.ts"
22
+ import { splitCluster, type Cluster } from "./cluster.ts"
23
+ import { buildTree } from "./tree.ts"
24
+ import { clearAuthCache, getCopilotToken } from "./auth.ts"
25
+ import { SemanticNavigatorUI, type ProgressState } from "./ui.ts"
26
+ import type { CopilotConfig } from "./labels.ts"
27
+
28
+ // ---------------------------------------------------------------------------
29
+ // CLI definition
30
+ // ---------------------------------------------------------------------------
31
+
32
// CLI surface. All numeric flags share the same `parseInt(v, 10)` coercion;
// --exclude-glob is variadic and defaults to DEFAULT_EXCLUDES from fs.ts.
// Option order here is the order shown in --help output.
const program = new Command()
  .name("semantic-navigator")
  .description("Browse a repository's files by semantic meaning")
  .argument("[directory]", "Directory to analyse (default: current working directory)", ".")
  // Model used only for labelling; embeddings use DEFAULT_EMBEDDING_MODEL.
  .option("--completion-model <model>", "Copilot model to use for labelling", "gpt-4o-mini")
  .option("--max-files <n>", "Maximum number of files to index", (v) => parseInt(v, 10), 2000)
  .option("--max-file-bytes <n>", "Skip files larger than this many bytes", (v) => parseInt(v, 10), 1_000_000)
  .option("--exclude-glob <pattern...>", "Glob patterns to exclude (repeatable)", DEFAULT_EXCLUDES)
  .option("--read-concurrency <n>", "Concurrent file reads", (v) => parseInt(v, 10), 64)
  .option("--embed-batch-size <n>", "Chunks per embedding batch", (v) => parseInt(v, 10), 32)
  .option("--embed-concurrency <n>", "Concurrent embedding batches", (v) => parseInt(v, 10), 2)
  .option("--logout", "Clear cached GitHub / Copilot credentials and exit")
  .helpOption("-h, --help", "Show help")
45
+
46
+ // ---------------------------------------------------------------------------
47
+ // Main
48
+ // ---------------------------------------------------------------------------
49
+
50
// Module-level handle so the top-level .catch() can tear down the TUI even
// when main() rejects; set as soon as the UI instance exists.
let _ui: SemanticNavigatorUI | undefined

/**
 * Run the full pipeline: parse flags, authenticate, discover/read files,
 * chunk, embed, cluster, label, and hand the finished tree to the UI.
 *
 * All fatal errors funnel through `fatal`, which destroys the TUI before
 * printing so the message lands on a usable terminal.
 */
async function main(): Promise<void> {
  program.parse(process.argv)

  // Typed view over commander's parsed options (flag names camelCased).
  const opts = program.opts<{
    completionModel: string
    maxFiles: number
    maxFileBytes: number
    excludeGlob: string[]
    readConcurrency: number
    embedBatchSize: number
    embedConcurrency: number
    logout: boolean | undefined
  }>()

  // --logout shortcut: no UI, no pipeline — clear credentials and exit.
  if (opts.logout) {
    clearAuthCache()
    console.log("Logged out: cached credentials removed.")
    process.exit(0)
  }

  const directory = path.resolve(program.args[0] ?? ".")

  // Init UI first so all subsequent progress is rendered in-terminal.
  const ui = new SemanticNavigatorUI()
  _ui = ui
  await ui.init()

  // Tear down the TUI, print, and exit. Typed `never` so control-flow
  // analysis knows the `files!` / `treeRaw!` assertions below are safe.
  const fatal = (msg: string): never => {
    ui.destroy()
    console.error(`\nError: ${msg}`)
    process.exit(1)
  }

  // Device-flow callback: rendered as a status line while the user completes
  // the browser flow. Reuses the "reading" phase since auth has none of its own.
  const onVerification = (url: string, code: string): void => {
    ui.updateProgress({
      phase: "reading",
      done: 0,
      total: 0,
      message: `Visit ${url} and enter code: ${code}`,
    })
  }

  // Eagerly authenticate so the token is warm before labelling needs it.
  // NOTE(review): copilotToken is never read after this assignment — the
  // warm-up (and whatever auth.ts caches) appears to be the point; confirm.
  let copilotToken: string
  try {
    copilotToken = await getCopilotToken(onVerification)
  } catch (err) {
    fatal(`Authentication failed: ${err instanceof Error ? err.message : String(err)}`)
  }

  const copilotConfig: CopilotConfig = {
    model: opts.completionModel,
    onVerification,
  }

  // Step 2: Discover + read files.
  ui.updateProgress({ phase: "reading", done: 0, total: 0 })

  const fsOpts: FsOptions = {
    maxFileBytes: opts.maxFileBytes,
    excludePatterns: opts.excludeGlob,
    readConcurrency: opts.readConcurrency,
    maxFiles: opts.maxFiles,
  }

  let files: Awaited<ReturnType<typeof discoverFiles>> | undefined
  try {
    files = await discoverFiles(directory, fsOpts, (done, total) => {
      ui.updateProgress({ phase: "reading", done, total })
    })
  } catch (err) {
    fatal(`File discovery failed: ${err instanceof Error ? err.message : String(err)}`)
  }
  // fatal() is typed as never, so execution only reaches here if try succeeded
  const resolvedFiles = files!

  if (resolvedFiles.length === 0) {
    fatal("No files found in the specified directory.")
  }

  // Step 3: Tokenise + chunk. Synchronous and fast, so chunking gets no phase
  // of its own — the "embedding" progress line is pre-seeded here instead.
  ui.updateProgress({ phase: "embedding", done: 0, total: resolvedFiles.length })

  // chunkFile returns null for empty/oversized-prefix files; filter those out.
  const chunks = resolvedFiles
    .map((f) => chunkFile(f.relativePath, f.content))
    .filter((c): c is NonNullable<typeof c> => c !== null)

  if (chunks.length === 0) {
    fatal("No embeddable chunks produced from the discovered files.")
  }

  // Step 4: Embed chunks.
  const embedOpts: EmbedOptions = {
    model: DEFAULT_EMBEDDING_MODEL,
    batchSize: opts.embedBatchSize,
    concurrency: opts.embedConcurrency,
  }

  let embedEntriesRaw: EmbedEntry[] | undefined
  try {
    embedEntriesRaw = await embedChunks(chunks, embedOpts, (done, total) => {
      ui.updateProgress({ phase: "embedding", done, total })
    })
  } catch (err) {
    fatal(`Embedding failed: ${err instanceof Error ? err.message : String(err)}`)
  }
  const embedEntries = embedEntriesRaw!

  if (embedEntries.length === 0) {
    fatal("Embedding produced no results.")
  }

  // Step 5: Spectral clustering. The actual splitting happens inside
  // buildTree (via labelNodes); here we only build the root cluster and
  // update the status line.
  ui.updateProgress({
    phase: "clustering",
    done: 0,
    total: embedEntries.length,
    message: `Clustering ${embedEntries.length} files…`,
  })

  const rootCluster: Cluster = { entries: embedEntries }

  // splitCluster is synchronous; yield to the event loop first so the UI has
  // a chance to render the "Clustering…" status line.
  await Bun.sleep(0)

  // Step 6: Build labelled tree.
  ui.updateProgress({
    phase: "labelling",
    done: 0,
    total: 0,
    message: "Labelling…",
  })

  // `directory` is already absolute; the inner resolve is a harmless no-op.
  const rootLabel = path.basename(path.resolve(directory))

  let treeRaw: Awaited<ReturnType<typeof buildTree>> | undefined
  try {
    treeRaw = await buildTree(copilotConfig, rootLabel, rootCluster, (msg) => {
      ui.updateProgress({ phase: "labelling", done: 0, total: 0, message: msg })
    })
  } catch (err) {
    fatal(`Labelling failed: ${err instanceof Error ? err.message : String(err)}`)
  }
  const tree = treeRaw!

  // Step 7: Hand the tree to the UI. Its event loop keeps the process alive
  // until the user presses q/Esc.
  ui.setTree(tree)
}
234
+
235
+ main().catch((err) => {
236
+ _ui?.destroy()
237
+ console.error("Unexpected error:", err)
238
+ process.exit(1)
239
+ })
package/src/tokenize.ts ADDED
@@ -0,0 +1,76 @@
1
+ import { get_encoding, type Tiktoken } from "tiktoken"
2
+
3
// For all-MiniLM-L6-v2 the real token limit is 256 word-piece tokens, but
// we chunk by *cl100k* tokens here to stay consistent with the Python port
// which chunked by the OpenAI embedding model's tokenizer. The actual BERT
// tokenizer produces more tokens per word, so this is a conservative upper
// bound that still keeps chunks well within the model's real limit.
// NOTE(review): 8192 cl100k tokens is far above a 256 word-piece window; if
// the embedder really is all-MiniLM-L6-v2, content beyond its window is
// presumably truncated downstream — confirm against embed.ts.
export const MAX_TOKENS_PER_EMBED = 8192

// Token budget per embedding batch (mirrors Python). Note this is a token
// count, not a chunk count; batchChunks() derives chunks-per-batch from it.
export const MAX_TOKENS_PER_BATCH_EMBED = 300_000

// Re-exported alias so callers need not import tiktoken's type directly.
export type TokenEncoding = Tiktoken

// Lazily-created singleton encoder; see getTokenEncoding().
let _encoding: Tiktoken | null = null
16
+
17
+ export function getTokenEncoding(): Tiktoken {
18
+ if (_encoding === null) {
19
+ // cl100k_base is the closest widely-available encoding; used as a
20
+ // conservative proxy for chunking any text content.
21
+ _encoding = get_encoding("cl100k_base")
22
+ }
23
+ return _encoding
24
+ }
25
+
26
/** A single embeddable unit: one file's (truncated) content plus its path. */
export interface Chunk {
  /** Path of the originating file (relative to root) */
  path: string
  /**
   * The text that will be embedded — includes a `path:\n\n` prefix followed
   * by (up to) the first MAX_TOKENS_PER_EMBED tokens of the file content.
   */
  text: string
}
35
+
36
+ /**
37
+ * Split file content into at most one chunk (mirrors Python's [:1] slice).
38
+ * Returns null for empty files.
39
+ */
40
+ export function chunkFile(path: string, content: string): Chunk | null {
41
+ const enc = getTokenEncoding()
42
+
43
+ const prefix = `${path}:\n\n`
44
+ const prefixTokens = enc.encode(prefix)
45
+ const contentTokens = enc.encode(content)
46
+
47
+ const maxContentTokens = MAX_TOKENS_PER_EMBED - prefixTokens.length
48
+ if (maxContentTokens <= 0) return null
49
+
50
+ const trimmedContentTokens = contentTokens.slice(0, maxContentTokens)
51
+
52
+ // Decode back to text
53
+ const prefixBuf = enc.decode(prefixTokens)
54
+ const contentBuf = enc.decode(trimmedContentTokens)
55
+
56
+ const text =
57
+ new TextDecoder().decode(prefixBuf) +
58
+ new TextDecoder().decode(contentBuf)
59
+
60
+ if (trimmedContentTokens.length === 0) return null
61
+
62
+ return { path, text }
63
+ }
64
+
65
+ /**
66
+ * Partition an array of chunks into batches that stay within the
67
+ * MAX_TOKENS_PER_BATCH_EMBED token budget.
68
+ */
69
+ export function batchChunks(chunks: Chunk[]): Chunk[][] {
70
+ const maxPerBatch = Math.floor(MAX_TOKENS_PER_BATCH_EMBED / MAX_TOKENS_PER_EMBED)
71
+ const batches: Chunk[][] = []
72
+ for (let i = 0; i < chunks.length; i += maxPerBatch) {
73
+ batches.push(chunks.slice(i, i + maxPerBatch))
74
+ }
75
+ return batches
76
+ }
package/src/tree.ts ADDED
@@ -0,0 +1,176 @@
1
+ /**
2
+ * Tree data structure and labelling pipeline.
3
+ * Direct port of the Python Tree/to_pattern/label_nodes/tree functions.
4
+ */
5
+
6
+ import { splitCluster, type Cluster } from "./cluster.ts"
7
+ import { labelFiles, labelClusters, type CopilotConfig } from "./labels.ts"
8
+
9
+ // ---------------------------------------------------------------------------
10
+ // Tree type
11
+ // ---------------------------------------------------------------------------
12
+
13
/**
 * A node in the labelled semantic tree. Leaves carry exactly one file;
 * internal nodes aggregate every file beneath them.
 */
export interface Tree {
  /** Display label (e.g. "src/components/*.tsx: UI Components") */
  label: string
  /** All leaf file paths beneath this node */
  files: string[]
  /** Child nodes (empty for leaves) */
  children: Tree[]
}
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Pattern helper (port of Python to_pattern)
24
+ // ---------------------------------------------------------------------------
25
+
26
+ /**
27
+ * Given a list of file paths, return a compact pattern string like
28
+ * `"src/components/*.tsx: "` when all files share a common prefix/suffix,
29
+ * or `""` if there is no meaningful shared pattern.
30
+ */
31
+ export function toPattern(files: string[]): string {
32
+ if (files.length === 0) return ""
33
+
34
+ // Longest common prefix
35
+ let prefix = files[0]!
36
+ for (const f of files) {
37
+ while (!f.startsWith(prefix)) prefix = prefix.slice(0, -1)
38
+ if (prefix === "") break
39
+ }
40
+
41
+ // Longest common suffix (reverse trick from Python)
42
+ const reversed = files.map((f) => f.slice(prefix.length).split("").reverse().join(""))
43
+ let suffix = reversed[0]!
44
+ for (const r of reversed) {
45
+ while (!r.startsWith(suffix)) suffix = suffix.slice(0, -1)
46
+ if (suffix === "") break
47
+ }
48
+ // Re-reverse
49
+ suffix = suffix.split("").reverse().join("")
50
+
51
+ const middles = files.map((f) => {
52
+ const core = f.slice(prefix.length)
53
+ return suffix.length > 0 ? core.slice(0, core.length - suffix.length) : core
54
+ })
55
+ const hasStar = middles.some((m) => m.length > 0)
56
+ const star = hasStar ? "*" : ""
57
+
58
+ if (prefix) {
59
+ if (suffix) return `${prefix}${star}${suffix}: `
60
+ return `${prefix}${star}: `
61
+ } else {
62
+ if (suffix) return `${star}${suffix}: `
63
+ return ""
64
+ }
65
+ }
66
+
67
+ // ---------------------------------------------------------------------------
68
+ // Collect all file paths from a list of Tree nodes
69
+ // ---------------------------------------------------------------------------
70
+
71
+ export function collectFiles(trees: Tree[]): string[] {
72
+ return trees.flatMap((t) => t.files)
73
+ }
74
+
75
+ // ---------------------------------------------------------------------------
76
+ // Recursive labelling pipeline (port of label_nodes + tree)
77
+ // ---------------------------------------------------------------------------
78
+
79
/**
 * Recursively label a Cluster, returning a list of Tree nodes.
 *
 * Base case: splitCluster() returns a single child, i.e. the cluster cannot
 * be subdivided further — each file in it becomes a leaf, labelled
 * individually. Otherwise every child cluster is recursed into in parallel
 * and the resulting groups are labelled as clusters.
 *
 * Both label calls are padded with "unlabelled" placeholders when the model
 * returns fewer labels than requested, so the index accesses below never
 * read out of bounds.
 *
 * @param config Copilot config
 * @param cluster Current cluster to process
 * @param depth Recursion depth (0 = root level; only the root emits status)
 * @param onStatus Optional callback for status messages
 */
export async function labelNodes(
  config: CopilotConfig,
  cluster: Cluster,
  depth: number,
  onStatus?: (msg: string) => void
): Promise<Tree[]> {
  const children = splitCluster(cluster)

  if (children.length === 1) {
    // Leaf cluster: label each file individually
    const entries = cluster.entries
    let labels = await labelFiles(config, entries)

    // Guard: align label count with entry count (Copilot may return fewer)
    if (labels.length < entries.length) {
      const missing = entries.length - labels.length
      labels = [
        ...labels,
        ...Array.from({ length: missing }, () => ({
          overarchingTheme: "",
          distinguishingFeature: "",
          label: "unlabelled",
        })),
      ]
    }

    // One leaf Tree per file: "path: label", no children.
    return entries.map((entry, i) => ({
      label: `${entry.path}: ${labels[i]!.label}`,
      files: [entry.path],
      children: [],
    }))
  }

  // Internal node: recurse into each child cluster, then label the clusters
  const childTreeLists = await Promise.all(
    children.map((child) => labelNodes(config, child, depth + 1, onStatus))
  )

  // Only the root-level call reports progress; deeper recursion stays quiet.
  if (depth === 0) {
    onStatus?.(`Labelling ${children.length} clusters…`)
  }

  // Build input to cluster labeller: for each child, pass its leaf labels
  const clusterLabelInputs = childTreeLists.map((trees) =>
    trees.map((t) => t.label)
  )

  let clusterLabels = await labelClusters(config, clusterLabelInputs)

  // Guard: align label count (same padding as the leaf case above)
  if (clusterLabels.length < children.length) {
    const missing = children.length - clusterLabels.length
    clusterLabels = [
      ...clusterLabels,
      ...Array.from({ length: missing }, () => ({
        overarchingTheme: "",
        distinguishingFeature: "",
        label: "unlabelled",
      })),
    ]
  }

  // One internal Tree per child cluster, labelled "<path pattern><label>".
  return clusterLabels.map((clusterLabel, i) => {
    const trees = childTreeLists[i]!
    const files = collectFiles(trees)
    const pattern = toPattern(files)
    return {
      label: `${pattern}${clusterLabel.label}`,
      files,
      children: trees,
    }
  })
}
160
+
161
+ /**
162
+ * Build the root Tree for a directory.
163
+ */
164
+ export async function buildTree(
165
+ config: CopilotConfig,
166
+ rootLabel: string,
167
+ cluster: Cluster,
168
+ onStatus?: (msg: string) => void
169
+ ): Promise<Tree> {
170
+ const children = await labelNodes(config, cluster, 0, onStatus)
171
+ return {
172
+ label: rootLabel,
173
+ files: collectFiles(children),
174
+ children,
175
+ }
176
+ }