@bastani/atomic 0.6.3-0 → 0.6.4-0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/ast-grep/SKILL.md +323 -0
- package/.agents/skills/ast-grep/references/rule_reference.md +297 -0
- package/.agents/skills/ripgrep/SKILL.md +382 -0
- package/.mcp.json +5 -6
- package/dist/commands/cli/claude-inflight-hook.d.ts +100 -0
- package/dist/commands/cli/claude-inflight-hook.d.ts.map +1 -0
- package/dist/commands/cli/claude-stop-hook.d.ts +2 -0
- package/dist/commands/cli/claude-stop-hook.d.ts.map +1 -1
- package/dist/lib/spawn.d.ts +1 -1
- package/dist/lib/spawn.d.ts.map +1 -1
- package/dist/sdk/providers/claude.d.ts +36 -0
- package/dist/sdk/providers/claude.d.ts.map +1 -1
- package/dist/sdk/providers/copilot.d.ts +17 -1
- package/dist/sdk/providers/copilot.d.ts.map +1 -1
- package/dist/sdk/runtime/executor.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts +49 -34
- package/dist/sdk/workflows/builtin/deep-research-codebase/claude/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts +18 -16
- package/dist/sdk/workflows/builtin/deep-research-codebase/copilot/index.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/batching.d.ts +43 -0
- package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/batching.d.ts.map +1 -0
- package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts +30 -0
- package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts +2 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/helpers/scout.d.ts.map +1 -1
- package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts +18 -16
- package/dist/sdk/workflows/builtin/deep-research-codebase/opencode/index.d.ts.map +1 -1
- package/dist/services/config/additional-instructions.d.ts +67 -0
- package/dist/services/config/additional-instructions.d.ts.map +1 -0
- package/package.json +3 -1
- package/src/cli.ts +18 -1
- package/src/commands/cli/chat/index.ts +52 -2
- package/src/commands/cli/claude-inflight-hook.test.ts +598 -0
- package/src/commands/cli/claude-inflight-hook.ts +359 -0
- package/src/commands/cli/claude-stop-hook.ts +40 -4
- package/src/commands/cli/init/index.ts +9 -0
- package/src/lib/spawn.ts +6 -2
- package/src/sdk/providers/claude.ts +131 -0
- package/src/sdk/providers/copilot.ts +30 -1
- package/src/sdk/runtime/executor.ts +43 -2
- package/src/sdk/workflows/builtin/deep-research-codebase/claude/index.ts +318 -158
- package/src/sdk/workflows/builtin/deep-research-codebase/copilot/index.ts +253 -129
- package/src/sdk/workflows/builtin/deep-research-codebase/helpers/batching.ts +65 -0
- package/src/sdk/workflows/builtin/deep-research-codebase/helpers/ignore-by-default.d.ts +8 -0
- package/src/sdk/workflows/builtin/deep-research-codebase/helpers/prompts.ts +203 -12
- package/src/sdk/workflows/builtin/deep-research-codebase/helpers/scout.ts +248 -78
- package/src/sdk/workflows/builtin/deep-research-codebase/opencode/index.ts +258 -146
- package/src/services/config/additional-instructions.ts +273 -0
- package/src/services/system/auto-sync.ts +10 -1
|
@@ -49,6 +49,14 @@ const TRAILING_PROSE_REMINDER =
|
|
|
49
49
|
"Do NOT end the turn on a tool call — downstream stages read your assistant " +
|
|
50
50
|
"transcript and will see nothing if the final message is a tool invocation.";
|
|
51
51
|
|
|
52
|
+
/**
 * Shared tooling notice embedded in the scout and specialist prompts. Tells
 * the agent that ast-grep is available and should be the default for
 * structure-aware code search. Built from one segment per sentence, joined
 * with single spaces.
 */
const AST_GREP_ENV_NOTICE = [
  "You are operating in an environment where ast-grep is installed.",
  "For any code search that requires understanding of syntax or code structure, you should default to using `ast-grep --lang [language] -p '<pattern>'`.",
  "Rely on your ast-grep skill for best practices.",
  "Adjust the --lang flag as needed for the specific programming language.",
  "Avoid using text-only search tools unless a plain-text search is explicitly requested.",
].join(" ");
|
|
59
|
+
|
|
52
60
|
/** Slugify the user's prompt for use in the final research filename. */
|
|
53
61
|
export function slugifyPrompt(prompt: string): string {
|
|
54
62
|
const slug = prompt
|
|
@@ -78,8 +86,15 @@ function renderPartitionDirs(partition: PartitionUnit[]): string {
|
|
|
78
86
|
}
|
|
79
87
|
|
|
80
88
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
81
|
-
// Stage 1a — codebase-scout (single LLM
|
|
89
|
+
// Stage 1a — codebase-scout + query planner (single LLM call)
|
|
82
90
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
91
|
+
//
|
|
92
|
+
// The scout produces both a ≤300-word architectural orientation AND a list of
|
|
93
|
+
// per-partition ast-grep query seeds. The combined output is embedded verbatim
|
|
94
|
+
// into each specialist sub-agent's prompt as a single <ARCHITECTURAL_ORIENTATION>
|
|
95
|
+
// block — no JSON envelope, no deterministic parser. Specialists locate "their"
|
|
96
|
+
// partition's seeds by searching for the matching section header inside the
|
|
97
|
+
// block and treat them as starting points, not commands.
|
|
83
98
|
|
|
84
99
|
export function buildScoutPrompt(opts: {
|
|
85
100
|
question: string;
|
|
@@ -103,8 +118,9 @@ export function buildScoutPrompt(opts: {
|
|
|
103
118
|
`</RESEARCH_QUESTION>`,
|
|
104
119
|
``,
|
|
105
120
|
`<CONTEXT>`,
|
|
106
|
-
`You are the codebase scout for the deep-research-codebase
|
|
107
|
-
`workflow has already computed the codebase layout
|
|
121
|
+
`You are the codebase scout AND query planner for the deep-research-codebase`,
|
|
122
|
+
`workflow. The workflow has already computed the codebase layout`,
|
|
123
|
+
`deterministically:`,
|
|
108
124
|
``,
|
|
109
125
|
`- Total source files: ${opts.totalFiles.toLocaleString()}`,
|
|
110
126
|
`- Total LOC: ${opts.totalLoc.toLocaleString()}`,
|
|
@@ -119,25 +135,49 @@ export function buildScoutPrompt(opts: {
|
|
|
119
135
|
"```",
|
|
120
136
|
`</CONTEXT>`,
|
|
121
137
|
``,
|
|
138
|
+
`<TOOLING>`,
|
|
139
|
+
AST_GREP_ENV_NOTICE,
|
|
140
|
+
`Consult https://ast-grep.github.io/reference/languages.html for the`,
|
|
141
|
+
`canonical language list, and https://ast-grep.github.io/llms-full.txt for`,
|
|
142
|
+
`the full rule reference, when you need them.`,
|
|
143
|
+
`</TOOLING>`,
|
|
144
|
+
``,
|
|
122
145
|
`<TASK>`,
|
|
123
|
-
`
|
|
124
|
-
`
|
|
146
|
+
`Produce TWO sections — both will be embedded verbatim into the specialist`,
|
|
147
|
+
`sub-agents' prompts. Use the markdown headers shown so specialists can`,
|
|
148
|
+
`find their partition's seeds.`,
|
|
125
149
|
``,
|
|
126
|
-
|
|
150
|
+
`## Orientation`,
|
|
151
|
+
`In ≤300 words, cover:`,
|
|
127
152
|
` 1. The repo's overall shape (monorepo vs single package, polyglot or not)`,
|
|
128
153
|
` 2. The 3-5 most important top-level directories and what each contains`,
|
|
129
|
-
` 3. Architectural boundaries / layering
|
|
154
|
+
` 3. Architectural boundaries / layering visible from the tree`,
|
|
130
155
|
` 4. Where entry points or main modules likely live`,
|
|
131
156
|
``,
|
|
132
|
-
|
|
133
|
-
`
|
|
134
|
-
`
|
|
135
|
-
|
|
157
|
+
`## Query Seeds`,
|
|
158
|
+
`For each of the ${opts.explorerCount} partitions, suggest 2-4 ast-grep`,
|
|
159
|
+
`query seeds the specialists could start from. Format each seed as:`,
|
|
160
|
+
``,
|
|
161
|
+
`### Partition <n>`,
|
|
162
|
+
`- Query: \`ast-grep --lang <language> -p '<pattern>'\``,
|
|
163
|
+
` Why: <one sentence>`,
|
|
164
|
+
``,
|
|
165
|
+
`For structural rules (kind + has/inside), use a fenced YAML block instead`,
|
|
166
|
+
`of the \`-p\` form, with the same Why line.`,
|
|
167
|
+
``,
|
|
168
|
+
`Seeds are starting points, not commands — specialists adapt as they find`,
|
|
169
|
+
`things. If a partition is clearly irrelevant to the question, write a`,
|
|
170
|
+
`single-line note explaining why and skip its seeds.`,
|
|
136
171
|
`</TASK>`,
|
|
137
172
|
``,
|
|
138
173
|
`<CONSTRAINTS>`,
|
|
139
174
|
DOCUMENTARIAN_DISCLAIMER,
|
|
140
|
-
`
|
|
175
|
+
`Do NOT investigate the codebase to answer the question yourself — your`,
|
|
176
|
+
`job is orientation + seeding, not investigation. You may use Read/Glob/`,
|
|
177
|
+
`Grep/ast-grep sparingly to verify guesses about a few key files or to`,
|
|
178
|
+
`confirm a pattern parses, but keep output focused.`,
|
|
179
|
+
`Stay under 300 words for the Orientation section. Plain markdown only —`,
|
|
180
|
+
`no JSON envelope, no structured output.`,
|
|
141
181
|
TRAILING_PROSE_REMINDER,
|
|
142
182
|
`</CONSTRAINTS>`,
|
|
143
183
|
``,
|
|
@@ -186,7 +226,16 @@ export function buildLocatorPrompt(opts: {
|
|
|
186
226
|
`relates to the research question, and return a categorized index.`,
|
|
187
227
|
`</MISSION>`,
|
|
188
228
|
``,
|
|
229
|
+
`<TOOLING>`,
|
|
230
|
+
AST_GREP_ENV_NOTICE,
|
|
231
|
+
`</TOOLING>`,
|
|
232
|
+
``,
|
|
189
233
|
`<ARCHITECTURAL_ORIENTATION>`,
|
|
234
|
+
`The briefing below contains both a high-level orientation AND per-partition`,
|
|
235
|
+
`ast-grep query seeds. Find the **Partition ${opts.index}** section for the`,
|
|
236
|
+
`seeds scoped to your investigation — treat them as starting points, not`,
|
|
237
|
+
`commands. Adapt or skip seeds that don't fit what you actually find.`,
|
|
238
|
+
``,
|
|
190
239
|
orientation,
|
|
191
240
|
`</ARCHITECTURAL_ORIENTATION>`,
|
|
192
241
|
``,
|
|
@@ -267,7 +316,16 @@ export function buildPatternFinderPrompt(opts: {
|
|
|
267
316
|
`Return runnable-looking snippets, not abstract descriptions.`,
|
|
268
317
|
`</MISSION>`,
|
|
269
318
|
``,
|
|
319
|
+
`<TOOLING>`,
|
|
320
|
+
AST_GREP_ENV_NOTICE,
|
|
321
|
+
`</TOOLING>`,
|
|
322
|
+
``,
|
|
270
323
|
`<ARCHITECTURAL_ORIENTATION>`,
|
|
324
|
+
`The briefing below contains both a high-level orientation AND per-partition`,
|
|
325
|
+
`ast-grep query seeds. Find the **Partition ${opts.index}** section for the`,
|
|
326
|
+
`seeds scoped to your investigation — treat them as starting points, not`,
|
|
327
|
+
`commands. Adapt or skip seeds that don't fit what you actually find.`,
|
|
328
|
+
``,
|
|
271
329
|
orientation,
|
|
272
330
|
`</ARCHITECTURAL_ORIENTATION>`,
|
|
273
331
|
``,
|
|
@@ -337,7 +395,16 @@ export function buildAnalyzerPrompt(opts: {
|
|
|
337
395
|
`precise \`file.ts:line\` references throughout.`,
|
|
338
396
|
`</MISSION>`,
|
|
339
397
|
``,
|
|
398
|
+
`<TOOLING>`,
|
|
399
|
+
AST_GREP_ENV_NOTICE,
|
|
400
|
+
`</TOOLING>`,
|
|
401
|
+
``,
|
|
340
402
|
`<ARCHITECTURAL_ORIENTATION>`,
|
|
403
|
+
`The briefing below contains both a high-level orientation AND per-partition`,
|
|
404
|
+
`ast-grep query seeds. Find the **Partition ${opts.index}** section for the`,
|
|
405
|
+
`seeds scoped to your investigation — treat them as starting points, not`,
|
|
406
|
+
`commands. Adapt or skip seeds that don't fit what you actually find.`,
|
|
407
|
+
``,
|
|
341
408
|
orientation,
|
|
342
409
|
`</ARCHITECTURAL_ORIENTATION>`,
|
|
343
410
|
``,
|
|
@@ -765,3 +832,127 @@ export function buildAggregatorPrompt(opts: {
|
|
|
765
832
|
`</RESEARCH_QUESTION_REMINDER>`,
|
|
766
833
|
].join("\n");
|
|
767
834
|
}
|
|
835
|
+
|
|
836
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
837
|
+
// Stage 2 — batched specialist dispatch (Task-tool fan-out)
|
|
838
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
839
|
+
//
|
|
840
|
+
// To cap parallel SDK subprocesses, specialist invocations are grouped into
|
|
841
|
+
// "batch sessions" (see helpers/batching.ts). Each batch session is a single
|
|
842
|
+
// Claude Agent SDK call whose main thread dispatches up to N sub-agents via
|
|
843
|
+
// the Task tool. The sub-agents write their verbatim findings to per-task
|
|
844
|
+
// scratch files and reply with a single confirmation token, so the
|
|
845
|
+
// orchestrator's context grows by N short lines rather than N transcripts
|
|
846
|
+
// (filesystem-context skill).
|
|
847
|
+
|
|
848
|
+
/**
 * Prefix a specialist prompt with the task-output contract: the sub-agent
 * must write its full findings to `outputPath` and answer with only a
 * confirmation token ("DONE", or "FAILED: <reason>" when the file could not
 * be written).
 *
 * @param opts.specialistPrompt Inner prompt, embedded unchanged between the
 *   `<{agentLabel}_TASK>` tags.
 * @param opts.outputPath Path the sub-agent is told to write its findings to.
 * @param opts.agentLabel Label used to name the wrapping `_TASK` tag.
 * @returns The contract block, a blank line, then the tagged specialist prompt.
 */
export function wrapPromptForTaskDispatch(opts: {
  specialistPrompt: string;
  outputPath: string;
  agentLabel: string;
}): string {
  const { specialistPrompt, outputPath, agentLabel } = opts;
  const taskTag = `${agentLabel}_TASK`;

  // Body of the contract; the surrounding tags are added below.
  const contractLines = [
    `Write your COMPLETE response — the verbatim markdown findings exactly as`,
    `the prompt below specifies — to this absolute path using the Write tool:`,
    ``,
    ` ${outputPath}`,
    ``,
    `Then reply with exactly the single token "DONE" and nothing else. Your`,
    `parent only needs confirmation; the file is the real output. Do NOT`,
    `inline your findings into your reply, do NOT add commentary, do NOT`,
    `summarise — just write the file, then reply "DONE".`,
    ``,
    `If you cannot produce useful findings (e.g. the partition has nothing`,
    `relevant to the question), write a one-line sentinel to the file`,
    `explaining why, then still reply "DONE". Reply with`,
    `"FAILED: <one-line reason>" only if you could not even write the file.`,
  ];

  const sections = [
    `<TASK_OUTPUT_CONTRACT>`,
    ...contractLines,
    `</TASK_OUTPUT_CONTRACT>`,
    ``,
    `<${taskTag}>`,
    specialistPrompt,
    `</${taskTag}>`,
  ];
  return sections.join("\n");
}
|
|
883
|
+
|
|
884
|
+
/**
 * Build the orchestrator prompt for a batch session. The orchestrator is
 * told to fire one Task tool call per task in a single assistant message,
 * then report a one-line tally. It is told NOT to inline sub-agent findings,
 * paraphrase the embedded prompts, or retry failures.
 *
 * Each task's prompt is embedded inside a backtick fence. The fence is at
 * least four backticks and always one longer than the longest backtick run
 * inside the prompt, so a prompt that itself contains a four-backtick fence
 * cannot close the block early and truncate the text.
 *
 * @param opts.wave Wave number shown in the mission text.
 * @param opts.batchIndex Batch number shown in the mission text and tally line.
 * @param opts.totalBatches Total batch count shown in the mission text.
 * @param opts.tasks One entry per sub-agent to dispatch: the Task tool's
 *   `subagent_type`, the verbatim prompt, and the path the sub-agent writes to.
 * @returns The mission, dispatch rules, reply format, and per-task blocks
 *   joined with newlines.
 */
export function buildBatchOrchestratorPrompt(opts: {
  wave: 1 | 2;
  batchIndex: number;
  totalBatches: number;
  tasks: Array<{
    subagentType: string;
    prompt: string;
    outputPath: string;
  }>;
}): string {
  const taskBlocks = opts.tasks
    .map((t, i) => {
      // Size the fence so no backtick run inside the prompt can terminate it.
      const longestRun = (t.prompt.match(/`+/g) ?? []).reduce(
        (max, run) => Math.max(max, run.length),
        0,
      );
      const fence = "`".repeat(Math.max(4, longestRun + 1));
      return [
        `### Task ${i + 1} of ${opts.tasks.length} — \`${t.subagentType}\``,
        `Output path the sub-agent will write to: \`${t.outputPath}\``,
        ``,
        `Verbatim prompt to pass as the Task tool's \`prompt\` parameter:`,
        ``,
        fence,
        t.prompt,
        fence,
        ``,
      ].join("\n");
    })
    .join("\n");

  return [
    `<BATCH_DISPATCH_MISSION>`,
    `You are the deterministic dispatcher for batch ${opts.batchIndex} of`,
    `${opts.totalBatches} in wave ${opts.wave} of the deep-research-codebase`,
    `workflow. Your sole job is to spawn the ${opts.tasks.length} sub-agent`,
    `task${opts.tasks.length === 1 ? "" : "s"} listed below using the Task tool.`,
    `</BATCH_DISPATCH_MISSION>`,
    ``,
    `<DISPATCH_RULES>`,
    `1. Issue ALL ${opts.tasks.length} Task tool calls in a SINGLE assistant`,
    ` message (parallel tool use), not sequentially across multiple turns.`,
    ` Parallel dispatch is the only reason this batch exists — sequential`,
    ` calls defeat its purpose.`,
    `2. For each task: set \`subagent_type\` to the value shown, set \`prompt\``,
    ` to the verbatim text inside the fenced block (no paraphrasing,`,
    ` truncating, or added framing), and set \`description\` to a short`,
    ` 3–5 word label.`,
    `3. Dispatch every task even if some look similar to others. Tasks here`,
    ` cover DIFFERENT codebase partitions or DIFFERENT specialist roles —`,
    ` apparent overlap is not real overlap. Do NOT merge, skip, or combine.`,
    `4. Do NOT inline any sub-agent's findings into your reply. The sub-agents`,
    ` write their output to disk; downstream stages read those files.`,
    `5. Do NOT retry failed sub-agents. Siblings still run and the synthesis`,
    ` step tolerates missing files.`,
    `</DISPATCH_RULES>`,
    ``,
    `<FINAL_REPLY_FORMAT>`,
    `After all sub-agents complete, your final assistant message must be`,
    `exactly one line of the form:`,
    ``,
    ` BATCH ${opts.batchIndex} COMPLETE: <ok>/${opts.tasks.length} ok, <failed> failed`,
    ``,
    `where <ok> is the count that replied "DONE" and <failed> is the count`,
    `that replied "FAILED" or otherwise did not produce a file.`,
    `</FINAL_REPLY_FORMAT>`,
    ``,
    `---`,
    ``,
    taskBlocks,
  ].join("\n");
}
|
|
@@ -3,7 +3,8 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Responsibilities:
|
|
5
5
|
* 1. Discover the codebase root (git toplevel, falling back to cwd).
|
|
6
|
-
* 2. List all source files,
|
|
6
|
+
* 2. List all source files, honoring `.gitignore` via git ls-files in repos
|
|
7
|
+
* and via `rg --files` in non-repo directories that still have one.
|
|
7
8
|
* 3. Count lines of code per file using batched `wc -l`.
|
|
8
9
|
* 4. Render a compact directory tree (depth-bounded) for prompt context.
|
|
9
10
|
* 5. Build "partition units" by aggregating LOC at depth-1, then drilling
|
|
@@ -15,32 +16,157 @@
|
|
|
15
16
|
|
|
16
17
|
// Use Bun.spawnSync instead of node:child_process for consistency with the rest of the codebase.
|
|
17
18
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
19
|
+
import * as linguistLanguages from "linguist-languages";
|
|
20
|
+
import type { Language } from "linguist-languages";
|
|
21
|
+
import ignore, { type Ignore } from "ignore";
|
|
22
|
+
import ignoreByDefault from "ignore-by-default";
|
|
23
|
+
import { readdirSync, readFileSync } from "node:fs";
|
|
24
|
+
import { join, posix as posixPath, relative, sep } from "node:path";
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Source-file extensions we treat as "code" for LOC accounting.
|
|
28
|
+
*
|
|
29
|
+
* Derived from GitHub Linguist (`linguist-languages`), filtered to
|
|
30
|
+
* `type === "programming"`. Linguist tracks 500+ programming languages and
|
|
31
|
+
* keeps the canonical extension list per language up to date — using it
|
|
32
|
+
* removes a maintenance burden and picks up obscure-but-legitimate
|
|
33
|
+
* languages we'd never enumerate by hand.
|
|
34
|
+
*
|
|
35
|
+
* Three modifications layered on top of the raw linguist data:
|
|
36
|
+
*
|
|
37
|
+
* 1. **Multi-segment extensions are skipped.** Linguist lists entries like
|
|
38
|
+
* `.coffee.md` (Literate CoffeeScript) and `.gradle.kts` (Gradle Kotlin
|
|
39
|
+
* DSL). Our `isCodeFile()` only sees the tail after the final dot, so
|
|
40
|
+
* collapsing `.coffee.md` to `md` would mis-classify Markdown as code.
|
|
41
|
+
* Skipping them is safe because the base languages they extend always
|
|
42
|
+
* list a single-segment extension as well (`.coffee`, `.kts`).
|
|
43
|
+
* 2. **EXCLUDE_EXTENSIONS denylist.** A handful of single-segment
|
|
44
|
+
* extensions that programming-typed languages claim but which in
|
|
45
|
+
* practice almost always mean a non-code file (`.md` is claimed by
|
|
46
|
+
* GCC Machine Description but means Markdown 99.9% of the time).
|
|
47
|
+
* 3. **SCHEMA_EXTENSIONS allowlist.** Schemas/DSLs that linguist
|
|
48
|
+
* categorises as `type: "data"` but which materially shape codebase
|
|
49
|
+
* behaviour and belong in research scope.
|
|
50
|
+
*/
|
|
51
|
+
/** Extensions added to the code set regardless of what linguist reports. */
const SCHEMA_EXTENSIONS = ["sql", "graphql", "proto"] as const;

/**
 * Single-segment extensions dropped from the linguist-derived set because,
 * although a `programming`-typed language claims them, they almost always
 * denote a non-code file in practice. Every entry carries a one-line reason.
 */
const EXCLUDE_EXTENSIONS = new Set<string>([
  "md", // claimed by "GCC Machine Description"; almost always Markdown.
]);

/**
 * Lower-cased, dot-less extensions treated as source code. Built once at
 * module load: extensions of every linguist language whose `type` is
 * "programming", minus multi-segment and denylisted entries, plus the
 * schema allowlist.
 */
const CODE_EXTENSIONS: Set<string> = (() => {
  // The namespace import's values are cast to `Language[]`, as each named
  // export of `linguist-languages` is expected to be a `Language`.
  const programmingLanguages = (
    Object.values(linguistLanguages) as Language[]
  ).filter((language) => language.type === "programming");

  const fromLinguist = programmingLanguages
    .flatMap((language) => language.extensions ?? [])
    .map((raw) => raw.replace(/^\./, "").toLowerCase())
    // Drop multi-segment extensions (anything still containing a dot) and
    // anything on the denylist.
    .filter((ext) => !ext.includes(".") && !EXCLUDE_EXTENSIONS.has(ext));

  return new Set<string>([...fromLinguist, ...SCHEMA_EXTENSIONS]);
})();
|
|
80
|
+
|
|
81
|
+
/**
 * Recursively walk a directory tree, honoring the `.gitignore` found in each
 * visited directory, seeded with the directory list from `ignore-by-default`.
 * Returns paths relative to `root`.
 *
 * Ignore semantics:
 *   • A `.gitignore` only applies to entries at or below its own directory.
 *   • Rules from ancestor directories keep applying further down.
 *   • Scopes are evaluated from shallowest to deepest and the LAST scope
 *     with a matching rule wins, so a `!negation` in a nested `.gitignore`
 *     can re-include a path that an ancestor rule ignored.
 *   • An ignored directory is never descended into, so nothing beneath it
 *     can be re-included.
 *
 * Entries that are neither regular files nor directories (symlinks among
 * them) are skipped, which also avoids following cycles. Unreadable
 * directories are skipped silently.
 *
 * @param root Directory to walk; returned paths are relative to it.
 * @returns Relative file paths, using the platform's path separator.
 */
function walkWithIgnore(root: string): string[] {
  const out: string[] = [];

  // The `ignore` package requires forward-slash paths.
  const toPosix = (p: string): string =>
    sep === "/" ? p : p.split(sep).join("/");

  const baseline: Ignore = ignore().add(ignoreByDefault.directories());
  walk(root, [{ basePath: "", matcher: baseline }]);

  function walk(
    dir: string,
    inheritedScopes: ReadonlyArray<{ basePath: string; matcher: Ignore }>,
  ): void {
    let scopes = inheritedScopes;
    try {
      const content = readFileSync(join(dir, ".gitignore"), "utf8");
      // basePath is stored in posix form so it can be combined with posix
      // entry paths via `posixPath.relative` below.
      scopes = [
        ...inheritedScopes,
        { basePath: toPosix(relative(root, dir)), matcher: ignore().add(content) },
      ];
    } catch {
      // No readable .gitignore at this level — keep inherited scopes.
    }

    let entries;
    try {
      entries = readdirSync(dir, { withFileTypes: true });
    } catch {
      return;
    }

    for (const entry of entries) {
      // Only regular files and directories are considered; symlinks and
      // other special entries are skipped.
      if (!entry.isFile() && !entry.isDirectory()) continue;

      const full = join(dir, entry.name);
      const rel = relative(root, full);
      const posix = toPosix(rel);
      // Trailing slash so directory-only patterns (`dist/`) match.
      const dirSuffix = entry.isDirectory() ? "/" : "";

      let ignored = false;
      for (const scope of scopes) {
        const within =
          scope.basePath === ""
            ? posix + dirSuffix
            : posixPath.relative(scope.basePath, posix) + dirSuffix;
        // Only a real parent-directory segment means the entry is outside
        // this scope; a name that merely begins with ".." is still inside.
        if (within === ".." || within.startsWith("../")) continue;

        // `test` distinguishes "ignored" from "explicitly un-ignored", which
        // lets a deeper scope override the verdict of a shallower one.
        const verdict = scope.matcher.test(within);
        if (verdict.ignored) {
          ignored = true;
        } else if (verdict.unignored) {
          ignored = false;
        }
      }
      if (ignored) continue;

      if (entry.isDirectory()) {
        walk(full, scopes);
      } else {
        out.push(rel);
      }
    }
  }

  return out;
}
|
|
44
170
|
|
|
45
171
|
/** Per-file LOC + path. */
|
|
46
172
|
export type FileStats = { path: string; loc: number };
|
|
@@ -72,14 +198,19 @@ export type CodebaseScout = {
|
|
|
72
198
|
|
|
73
199
|
/**
 * Resolve the project root. Prefers the output of
 * `git rev-parse --show-toplevel`; falls back to `process.cwd()` when git
 * fails, throws (the call is wrapped for the case where the executable is
 * missing from PATH), or succeeds with empty output.
 *
 * @returns The trimmed git toplevel path, or the current working directory.
 */
export function getCodebaseRoot(): string {
  try {
    const r = Bun.spawnSync({
      cmd: ["git", "rev-parse", "--show-toplevel"],
      stdout: "pipe",
      stderr: "pipe",
    });
    if (r.success && r.stdout) {
      const toplevel = r.stdout.toString().trim();
      // An empty path is not a usable root — fall through to cwd instead of
      // returning "".
      if (toplevel.length > 0) return toplevel;
    }
  } catch { /* git not on PATH — fall back to cwd */ }
  return process.cwd();
}
|
|
85
216
|
|
|
@@ -90,36 +221,53 @@ function isCodeFile(p: string): boolean {
|
|
|
90
221
|
return CODE_EXTENSIONS.has(ext);
|
|
91
222
|
}
|
|
92
223
|
|
|
93
|
-
/**
|
|
224
|
+
/**
 * List all files under `root`, honoring `.gitignore` whenever possible.
 *
 * Three discovery paths, tried in order:
 *
 *   1. **git ls-files** — `--cached` (tracked) plus
 *      `--others --exclude-standard` (untracked but not ignored). Run with
 *      `-z` so paths are NUL-terminated and emitted verbatim: without it git
 *      C-quotes paths containing non-ASCII or special bytes (see
 *      `core.quotePath`) and the caller would receive quoted, escaped
 *      strings instead of real paths.
 *   2. **ripgrep `rg --files --hidden`** — used when the git call fails or
 *      throws. Output is split on newlines.
 *   3. **In-process walker** — `walkWithIgnore(root)`, used when both
 *      commands fail or throw.
 *
 * Each spawn is wrapped in try/catch so a throw (for example, a missing
 * executable) falls through to the next strategy.
 *
 * @param root Directory used as the working directory for discovery.
 * @returns Paths as reported by the first strategy that succeeds.
 */
function listAllFiles(root: string): string[] {
  try {
    const git = Bun.spawnSync({
      cmd: [
        "git",
        "ls-files",
        "-z",
        "--cached",
        "--others",
        "--exclude-standard",
      ],
      cwd: root,
      stdout: "pipe",
      stderr: "pipe",
    });
    if (git.success && git.stdout) {
      // `-z` output is NUL-terminated; the trailing terminator yields one
      // empty segment, which the filter removes.
      return git.stdout.toString().split("\0").filter((p) => p.length > 0);
    }
  } catch { /* git not on PATH — fall through to rg */ }

  try {
    const rg = Bun.spawnSync({
      cmd: ["rg", "--files", "--hidden"],
      cwd: root,
      stdout: "pipe",
      stderr: "pipe",
    });
    if (rg.success && rg.stdout) {
      return rg.stdout.toString().split("\n").filter((l) => l.length > 0);
    }
  } catch { /* rg not on PATH — fall through to in-process walker */ }

  return walkWithIgnore(root);
}
|
|
124
272
|
|
|
125
273
|
/**
|
|
@@ -127,7 +275,11 @@ function listAllFiles(root: string): string[] {
|
|
|
127
275
|
* " N filename"
|
|
128
276
|
* " N total" (when more than one file is passed)
|
|
129
277
|
*
|
|
130
|
-
* We batch to avoid command-line length limits.
|
|
278
|
+
* We batch to avoid command-line length limits. When `wc` is missing from
|
|
279
|
+
* PATH (typical on Windows) `Bun.spawnSync` throws ENOENT — each batch is
|
|
280
|
+
* wrapped so we can fall back to an in-process newline counter rather than
|
|
281
|
+
* aborting the workflow or silently zeroing every file's LOC (which would
|
|
282
|
+
* collapse the partition bin-packer).
|
|
131
283
|
*/
|
|
132
284
|
function countLines(root: string, files: string[]): Map<string, number> {
|
|
133
285
|
const result = new Map<string, number>();
|
|
@@ -136,22 +288,40 @@ function countLines(root: string, files: string[]): Map<string, number> {
|
|
|
136
288
|
const BATCH = 200;
|
|
137
289
|
for (let i = 0; i < files.length; i += BATCH) {
|
|
138
290
|
const batch = files.slice(i, i + BATCH);
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
291
|
+
let wcOk = false;
|
|
292
|
+
try {
|
|
293
|
+
const r = Bun.spawnSync({
|
|
294
|
+
cmd: ["wc", "-l", "--", ...batch],
|
|
295
|
+
cwd: root,
|
|
296
|
+
stdout: "pipe",
|
|
297
|
+
stderr: "pipe",
|
|
298
|
+
});
|
|
299
|
+
if (r.stdout) {
|
|
300
|
+
wcOk = true;
|
|
301
|
+
for (const line of r.stdout.toString().split("\n")) {
|
|
302
|
+
const m = line.match(/^\s*(\d+)\s+(.+)$/);
|
|
303
|
+
// Regex groups are typed `string | undefined` under strict mode even
|
|
304
|
+
// when the whole match succeeded — guard explicitly.
|
|
305
|
+
const countStr = m?.[1];
|
|
306
|
+
const filename = m?.[2]?.trim();
|
|
307
|
+
if (countStr === undefined || filename === undefined) continue;
|
|
308
|
+
if (filename === "total") continue;
|
|
309
|
+
result.set(filename, parseInt(countStr, 10));
|
|
310
|
+
}
|
|
311
|
+
}
|
|
312
|
+
} catch { /* wc not on PATH — fall through to in-process counter */ }
|
|
313
|
+
if (wcOk) continue;
|
|
314
|
+
// In-process fallback: count newline bytes. Matches `wc -l` semantics
|
|
315
|
+
// (a final line without a trailing `\n` is not counted).
|
|
316
|
+
for (const f of batch) {
|
|
317
|
+
try {
|
|
318
|
+
const content = readFileSync(join(root, f), "utf8");
|
|
319
|
+
let count = 0;
|
|
320
|
+
for (let j = 0; j < content.length; j++) {
|
|
321
|
+
if (content.charCodeAt(j) === 10) count++;
|
|
322
|
+
}
|
|
323
|
+
result.set(f, count);
|
|
324
|
+
} catch { /* unreadable — leave unset; consumer treats as 0 */ }
|
|
155
325
|
}
|
|
156
326
|
}
|
|
157
327
|
return result;
|