llm-cost-attribution 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,278 @@
1
+ /**
2
+ * Local git diff-size adapter for cost-driver feature extraction.
3
+ *
4
+ * Limits: this reads only history available in the local checkout, and it can
5
+ * key commits only when their subjects contain issue identifiers.
6
+ */
7
+ import { execFile } from 'node:child_process';
8
+ import { promisify } from 'node:util';
9
+
10
+ const execFileAsync = promisify(execFile);
11
+
12
/**
 * Default issue-key matcher: one uppercase letter, then one or more uppercase
 * letters or digits, a hyphen, and one or more digits (for example `ABC-123`).
 */
export const DEFAULT_KEY_PATTERN = /[A-Z][A-Z0-9]+-\d+/;

/**
 * `git log --format` template: a 0x1e byte opens each commit header, and a
 * 0x1f byte sits between the `%H` and `%s` placeholders.
 */
export const GIT_LOG_FORMAT = '%x1e%H%x1f%s';

// Control characters that the format above asks git to emit; the parser
// splits commit headers on them.
const COMMIT_SEPARATOR = '\u001e';
const FIELD_SEPARATOR = '\u001f';
17
+
18
/**
 * Local-git implementation of the DiffSource port.
 */
export class LocalGitDiffSource {
  /**
   * Stream the aggregated diff records one at a time.
   *
   * The generator's return value is the full summary produced by
   * `readResult`. `options.onSummary`, when it is a function, is invoked from
   * a `finally` block, so it also runs when the consumer stops iterating early.
   *
   * @param {string} [repoPath]
   * @param {object} [options]
   * @returns {AsyncGenerator<object, object, void>}
   */
  async *read(repoPath = process.cwd(), options = {}) {
    const summary = await this.readResult(repoPath, options);
    const notifySummary = () => {
      if (typeof options.onSummary === 'function') options.onSummary(summary);
    };

    try {
      const { records } = summary;
      for (let index = 0; index < records.length; index += 1) {
        yield records[index];
      }
      return summary;
    } finally {
      notifySummary();
    }
  }

  /**
   * Collect all records plus the unmatched/error summary in one call.
   *
   * An invalid `repoPath` or a failing `git` invocation is reported through
   * the `error` field of the returned object instead of being thrown. When
   * `options.gitLogText` is a string it is parsed directly and git is not run.
   *
   * @param {string} [repoPath]
   * @param {object} [options]
   * @returns {Promise<{ records: object[], unmatched: object, error: object | null }>}
   */
  async readResult(repoPath = process.cwd(), options = {}) {
    const repoPathIsUsable = typeof repoPath === 'string' && repoPath !== '';
    if (!repoPathIsUsable) {
      return emptyResult('repoPath must be a non-empty string');
    }

    let logText = options.gitLogText;
    if (typeof logText !== 'string') {
      try {
        const args = gitLogArgs(repoPath, options);
        const execOptions = {
          encoding: 'utf8',
          maxBuffer: options.maxBuffer ?? 1024 * 1024 * 64,
        };
        const output = await execFileAsync('git', args, execOptions);
        logText = output.stdout;
      } catch (err) {
        return emptyResult(gitErrorMessage(err, repoPath));
      }
    }

    // Parsing stays outside the try block so parser errors are not reported
    // as git failures.
    return withNoError(parseGitNumstatLog(logText, options));
  }
}
64
+
65
/**
 * Stream git diff statistics from a local repository, yielding one aggregated
 * diff record for each issue key found in commit subjects.
 *
 * Git failures are not thrown by the generator; they are reported in the
 * `error` field of the summary. When iteration completes, the generator's
 * return value is that summary object `{ records, unmatched, error }`.
 * `for await` callers, which cannot see the return value, can pass
 * `onSummary(summary)` instead.
 *
 * @param {string} [repoPath]
 *   Local git repository path. Defaults to `process.cwd()`.
 * @param {object} [options]
 * @param {RegExp | string} [options.keyPattern]
 *   Pattern used to extract issue keys from commit subjects.
 * @param {string | string[]} [options.revRange]
 *   Optional rev range or list of git rev arguments.
 * @param {string} [options.gitLogText]
 *   Recorded `git log --numstat` text. Intended for fixtures/tests.
 * @param {(summary: object) => void} [options.onSummary]
 *   Receives `{ records, unmatched, error }` after parsing or git failure.
 * @returns {AsyncGenerator<object, object, void>}
 */
export async function *readGitDiffs(repoPath = process.cwd(), options = {}) {
  const source = new LocalGitDiffSource();
  const summary = yield* source.read(repoPath, options);
  return summary;
}
89
+
90
/**
 * Non-streaming counterpart of `readGitDiffs`: resolves with every diff
 * record together with the unmatched/error summary.
 *
 * @param {string} [repoPath]
 * @param {object} [options]
 * @returns {Promise<{ records: object[], unmatched: object, error: object | null }>}
 */
export async function readGitDiffResult(repoPath = process.cwd(), options = {}) {
  const source = new LocalGitDiffSource();
  return source.readResult(repoPath, options);
}
100
+
101
/**
 * Parse recorded `git log --numstat --format=%x1e%H%x1f%s` output.
 *
 * Lines that start with the commit separator open a new commit. Every other
 * non-empty line is offered to the numstat parser and, when accepted, added
 * to the commit currently being accumulated. Finished commits are folded into
 * per-key aggregates, or counted as unmatched/empty, by `consumeCommit`.
 *
 * @param {string} logText
 * @param {object} [options]
 * @param {RegExp | string} [options.keyPattern]
 * @returns {{ records: object[], unmatched: object }}
 */
export function parseGitNumstatLog(logText, options = {}) {
  const keyPattern = globalKeyPattern(options.keyPattern ?? DEFAULT_KEY_PATTERN);
  const aggregates = new Map();
  const unmatched = { count: 0, shas: [] };
  const stats = {
    commits: 0,
    matchedCommits: 0,
    unmatchedCommits: 0,
    skippedEmptyCommits: 0,
  };
  const flush = (commit) => consumeCommit(commit, keyPattern, aggregates, unmatched, stats);

  let pending = null;
  const lines = String(logText).split(/\r?\n/);
  for (const line of lines) {
    if (line.startsWith(COMMIT_SEPARATOR)) {
      // A new header closes out whatever commit was being accumulated.
      flush(pending);
      pending = parseCommitHeader(line);
    } else if (pending !== null && line !== '') {
      const numstat = parseNumstatLine(line);
      if (numstat !== null) {
        pending.additions += numstat.additions;
        pending.deletions += numstat.deletions;
        pending.changedFiles += 1;
      }
    }
  }
  // The last commit has no following header to trigger its flush.
  flush(pending);

  return {
    records: [...aggregates.values()],
    unmatched: { ...unmatched, ...stats },
  };
}
142
+
143
/**
 * Build the argument vector for `git -C <repoPath> log --numstat ...`.
 *
 * NOTE(review): entries from `options.revRange` are appended verbatim, so a
 * caller-supplied value beginning with `-` reaches git as an option. Confirm
 * that callers only pass trusted values.
 *
 * @param {string} repoPath
 * @param {object} options
 * @returns {string[]}
 */
function gitLogArgs(repoPath, options) {
  const extraArgs = options.revRange === undefined ? [] : revRangeArgs(options.revRange);
  return [
    '-C',
    repoPath,
    'log',
    '--numstat',
    `--format=${GIT_LOG_FORMAT}`,
    ...extraArgs,
  ];
}
154
+
155
/**
 * Normalize the `revRange` option into a list of git arguments.
 *
 * Arrays are stringified element by element with empty strings dropped, a
 * non-empty string becomes a single-element list, and anything else yields
 * an empty list.
 *
 * @param {unknown} revRange
 * @returns {string[]}
 */
function revRangeArgs(revRange) {
  if (Array.isArray(revRange)) {
    return revRange.flatMap((item) => {
      const arg = String(item);
      return arg === '' ? [] : [arg];
    });
  }

  const isNonEmptyString = typeof revRange === 'string' && revRange !== '';
  return isNonEmptyString ? [revRange] : [];
}
160
+
161
/**
 * Split a commit header line into sha and subject, and start its counters
 * at zero.
 *
 * The leading commit separator is removed. Text before the first field
 * separator is the sha and everything after it is the subject. Without a
 * field separator the whole remainder is the sha and the subject is empty.
 *
 * @param {string} line
 * @returns {{ sha: string, subject: string, additions: number, deletions: number, changedFiles: number }}
 */
function parseCommitHeader(line) {
  const header = line.slice(COMMIT_SEPARATOR.length);
  const cut = header.indexOf(FIELD_SEPARATOR);
  const hasSubject = cut !== -1;

  return {
    sha: hasSubject ? header.slice(0, cut) : header,
    subject: hasSubject ? header.slice(cut + FIELD_SEPARATOR.length) : '',
    additions: 0,
    deletions: 0,
    changedFiles: 0,
  };
}
174
+
175
// A numstat count field is either `-` or a run of decimal digits.
const NUMSTAT_COUNT_FIELD = /^(?:-|\d+)$/;

/**
 * Parse one `added<TAB>deleted<TAB>path` numstat line.
 *
 * Returns `null` for anything that is not a numstat line: fewer than three
 * tab-separated fields, or count fields that are neither `-` nor plain
 * decimal digits. Rejecting those lines keeps stray tab-containing text from
 * being counted as a changed file with zero additions and deletions.
 *
 * @param {string} line
 * @returns {{ additions: number, deletions: number } | null}
 */
function parseNumstatLine(line) {
  const fields = line.split('\t');
  if (fields.length < 3) return null;

  const [added, deleted] = fields;
  if (!NUMSTAT_COUNT_FIELD.test(added) || !NUMSTAT_COUNT_FIELD.test(deleted)) return null;

  return {
    additions: parseNumstatCount(added),
    deletions: parseNumstatCount(deleted),
  };
}

/**
 * Convert a numstat count field to a number.
 *
 * `-` counts as 0. Any value that is not a plain decimal digit string also
 * yields 0, so lenient `Number()` coercions (empty string, hex, exponent
 * notation, surrounding whitespace) are never taken as counts.
 *
 * @param {string} value
 * @returns {number}
 */
function parseNumstatCount(value) {
  if (value === '-') return 0;
  if (typeof value !== 'string' || !/^\d+$/.test(value)) return 0;
  const count = Number(value);
  return Number.isFinite(count) ? count : 0;
}
189
+
190
/**
 * Fold one finished commit into the running totals.
 *
 * A `null` commit is ignored. Every other commit increments `stats.commits`
 * and then lands in exactly one bucket:
 *  - no changed files: counted in `stats.skippedEmptyCommits`;
 *  - no issue key in the subject: recorded in `unmatched` and
 *    `stats.unmatchedCommits`;
 *  - otherwise: its totals are added to the aggregate record of every distinct
 *    key found, so a commit naming several keys contributes to each of them.
 *
 * @param {object | null} commit
 * @param {RegExp} keyPattern
 * @param {Map<string, object>} aggregates
 * @param {{ count: number, shas: string[] }} unmatched
 * @param {object} stats
 * @returns {void}
 */
function consumeCommit(commit, keyPattern, aggregates, unmatched, stats) {
  if (commit === null) return;
  stats.commits += 1;

  const { sha, subject, additions, deletions, changedFiles } = commit;
  if (changedFiles === 0) {
    stats.skippedEmptyCommits += 1;
    return;
  }

  const keys = uniqueMatches(subject, keyPattern);
  if (keys.length === 0) {
    unmatched.count += 1;
    unmatched.shas.push(sha);
    stats.unmatchedCommits += 1;
    return;
  }

  stats.matchedCommits += 1;
  for (const key of keys) {
    if (!aggregates.has(key)) {
      aggregates.set(key, {
        key,
        additions: 0,
        deletions: 0,
        changedFiles: 0,
        shas: [],
      });
    }
    const record = aggregates.get(key);
    record.additions += additions;
    record.deletions += deletions;
    record.changedFiles += changedFiles;
    record.shas.push(sha);
  }
}
226
+
227
/**
 * Collect the distinct issue keys that `keyPattern` finds in `subject`, in
 * order of first appearance.
 *
 * Zero-length matches are ignored: a caller-supplied pattern that can match
 * the empty string would otherwise produce an empty-string key and cause
 * every commit to be aggregated under it.
 *
 * @param {string} subject
 * @param {RegExp} keyPattern
 *   Must carry the global flag; `matchAll` throws a TypeError otherwise.
 * @returns {string[]}
 */
function uniqueMatches(subject, keyPattern) {
  // matchAll starts its scan at the pattern's current lastIndex, so reset it
  // to scan every subject from the beginning.
  keyPattern.lastIndex = 0;

  const keys = new Set();
  for (const match of String(subject).matchAll(keyPattern)) {
    if (match[0] !== '') keys.add(match[0]);
  }
  // A Set iterates in insertion order, which preserves first-appearance order.
  return [...keys];
}
242
+
243
/**
 * Produce a fresh global-flagged RegExp from the supplied key pattern.
 *
 * A string is compiled with only the `g` flag. A RegExp is copied with its
 * flags kept and `g` appended when missing. Any other value falls back to a
 * copy of `DEFAULT_KEY_PATTERN`.
 *
 * @param {RegExp | string | unknown} pattern
 * @returns {RegExp}
 */
function globalKeyPattern(pattern) {
  if (typeof pattern === 'string') return new RegExp(pattern, 'g');

  if (pattern instanceof RegExp) {
    const { source, flags } = pattern;
    const globalFlags = flags.includes('g') ? flags : `${flags}g`;
    return new RegExp(source, globalFlags);
  }

  return globalKeyPattern(DEFAULT_KEY_PATTERN);
}
249
+
250
/**
 * Return a shallow copy of `result` with its `error` field set to `null`.
 *
 * @param {object} result
 * @returns {object}
 */
function withNoError(result) {
  const copy = { ...result };
  copy.error = null;
  return copy;
}
256
+
257
/**
 * Build the failure result: no records, all counters at zero, and `message`
 * wrapped in the `error` field.
 *
 * @param {string} message
 * @returns {{ records: object[], unmatched: object, error: { message: string } }}
 */
function emptyResult(message) {
  const unmatched = {
    count: 0,
    shas: [],
    commits: 0,
    matchedCommits: 0,
    unmatchedCommits: 0,
    skippedEmptyCommits: 0,
  };
  const error = { message };
  return { records: [], unmatched, error };
}
273
+
274
/**
 * Format a one-line description of a failed `git log` invocation.
 *
 * The trimmed `stderr` string of the error is used when it is non-empty.
 * Otherwise the detail is the `Error` message, or the stringified value when
 * a non-Error was thrown.
 *
 * @param {unknown} err
 * @param {string} repoPath
 * @returns {string}
 */
function gitErrorMessage(err, repoPath) {
  let detail = '';
  if (typeof err?.stderr === 'string') {
    detail = err.stderr.trim();
  }
  if (detail === '') {
    detail = err instanceof Error ? err.message : String(err);
  }
  return `git log --numstat failed for ${repoPath}: ${detail}`;
}
package/src/index.mjs CHANGED
@@ -73,6 +73,15 @@ export {
73
73
  normalizeModelName,
74
74
  ratesForModel,
75
75
  } from './pricing.mjs';
76
+ export { correlateCostWithFeature } from './correlate.mjs';
77
+ export {
78
+ DEFAULT_KEY_PATTERN,
79
+ LocalGitDiffSource,
80
+ parseGitNumstatLog,
81
+ readGitDiffResult,
82
+ readGitDiffs,
83
+ } from './git-diff-source.mjs';
84
+ export { joinCostWithFeature, BUILTIN_JOIN_STRATEGIES } from './cost-feature-join.mjs';
76
85
 
77
86
  /**
78
87
  * Default `PricingTable` adapter for `forecastIssueCost`. Delegates to
@@ -231,6 +240,74 @@ export async function computeIssueCostFromUsage(issueIdentifier, usageSource) {
231
240
  return rollupUsageRecords(issueIdentifier, records);
232
241
  }
233
242
 
243
/**
 * Walk every Claude session + every Codex rollout and yield spec-compliant
 * usage.jsonl records one at a time. Sessions whose cwd doesn't match the
 * configured pattern are skipped.
 *
 * Use this when you want to stream records to a downstream consumer (such as
 * `dump-usage` or an in-process correlator) instead of writing them to a file.
 * `backfillUsageFromTranscripts` is built on top of this generator.
 *
 * The generator's return value (from `gen.next().value` once `done` is true)
 * is `{ recordsYielded, sessionsProcessed, sessionsSkipped }`.
 *
 * Progress is reported once per Claude project directory, and for Codex
 * rollouts on every 100th file and on the last file, whether or not that
 * directory or file was skipped.
 *
 * @param {object} [options]
 * @param {RegExp} [options.cwdPattern]
 * @param {string} [options.claudeProjectsDir]
 * @param {string} [options.codexSessionsDir]
 * @param {(progress: { phase: string, file?: string, processed: number, total: number, recordsYielded: number }) => void} [options.onProgress]
 */
export async function *iterateUsageFromTranscripts(options = {}) {
  const cwdPattern = options.cwdPattern ?? DEFAULT_CWD_PATTERN;
  const claudeRootDir = options.claudeProjectsDir ?? join(homedir(), '.claude', 'projects');
  const codexRootDir = options.codexSessionsDir ?? join(homedir(), '.codex', 'sessions');
  const onProgress = options.onProgress ?? (() => undefined);

  let recordsYielded = 0;
  let sessionsProcessed = 0;
  let sessionsSkipped = 0;

  // Phase 1: walk Claude project dirs and yield records for matching sessions.
  const claudeDirs = await findClaudeProjectDirs(claudeRootDir, (encoded) => issueFromClaudeProjectDirName(encoded, cwdPattern) !== null);
  for (let i = 0; i < claudeDirs.length; i++) {
    const dir = claudeDirs[i];
    // Split on both separators so the directory name is also extracted from
    // backslash-separated paths.
    const encoded = dir.split(/[\\/]/).pop() ?? '';
    const issueIdentifier = issueFromClaudeProjectDirName(encoded, cwdPattern);
    if (issueIdentifier !== null) {
      for (const file of await listJsonlsRecursively(dir)) {
        const session = await parseClaudeSession(file);
        if (session === null) { sessionsSkipped += 1; continue; }
        const records = sessionToUsageRecords(session, issueIdentifier);
        if (records.length === 0) { sessionsSkipped += 1; continue; }
        for (const rec of records) yield rec;
        recordsYielded += records.length;
        sessionsProcessed += 1;
      }
    }
    // Reported for every directory so `processed` always reaches `total`.
    onProgress({ phase: 'claude', file: dir, processed: i + 1, total: claudeDirs.length, recordsYielded });
  }

  // Phase 2: walk Codex rollouts.
  const codexFiles = await listCodexRollouts(codexRootDir);
  for (let i = 0; i < codexFiles.length; i++) {
    const file = codexFiles[i];
    const session = await parseCodexSession(file);
    const issueIdentifier = session === null ? null : issueFromCwd(session.cwd, cwdPattern);
    const records = issueIdentifier === null ? [] : sessionToUsageRecords(session, issueIdentifier);

    if (records.length === 0) {
      // Unparseable session, non-matching cwd, or no usage records.
      sessionsSkipped += 1;
    } else {
      for (const rec of records) yield rec;
      recordsYielded += records.length;
      sessionsProcessed += 1;
    }

    // Kept outside the skip branches so a skipped rollout cannot swallow the
    // periodic tick or the final `processed === total` tick.
    if ((i + 1) % 100 === 0 || i + 1 === codexFiles.length) {
      onProgress({ phase: 'codex', file, processed: i + 1, total: codexFiles.length, recordsYielded });
    }
  }

  return { recordsYielded, sessionsProcessed, sessionsSkipped };
}
310
+
234
311
  /**
235
312
  * Walk every Claude session + every Codex rollout, derive spec-compliant
236
313
  * usage.jsonl records for each, and append them to a single output file.