agentic-dataset-builder 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +106 -77
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +166 -0
- package/dist/labeling.d.ts +12 -0
- package/dist/labeling.js +22 -0
- package/dist/parquet.d.ts +4 -0
- package/dist/parquet.js +115 -0
- package/dist/platform/paths.d.ts +4 -0
- package/dist/platform/paths.js +62 -0
- package/dist/schemas/qwen35.d.ts +338 -0
- package/dist/schemas/qwen35.js +139 -0
- package/dist/schemas/source.d.ts +23 -0
- package/dist/schemas/source.js +19 -0
- package/dist/sources/claude.d.ts +2 -0
- package/dist/sources/claude.js +63 -0
- package/dist/sources/codex.d.ts +2 -0
- package/dist/sources/codex.js +256 -0
- package/dist/sources/pi.d.ts +2 -0
- package/dist/sources/pi.js +271 -0
- package/dist/utils/common.d.ts +7 -0
- package/dist/utils/common.js +46 -0
- package/dist/utils/jsonl.d.ts +2 -0
- package/dist/utils/jsonl.js +17 -0
- package/package.json +44 -13
- package/agentic_dataset/__init__.py +0 -1
- package/agentic_dataset/build_agentic_dataset.py +0 -368
- package/agentic_dataset/export_codex_session_to_qwen35.py +0 -466
- package/agentic_dataset/export_pi_session.py +0 -701
- package/agentic_dataset/export_pi_session_to_qwen35.py +0 -742
- package/agentic_dataset/export_qwen35_training.py +0 -1559
- package/agentic_dataset/label_qwen35_agentic.py +0 -156
- package/agentic_dataset/platform_paths.py +0 -85
- package/agentic_dataset/qwen35_training_record.py +0 -179
- package/bin/agentic-dataset-builder.js +0 -77
- package/requirements.txt +0 -2
- package/run.py +0 -8
package/README.md
CHANGED
|
@@ -1,125 +1,154 @@
|
|
|
1
1
|
# Agentic Dataset Builder
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Pure TypeScript CLI for turning local Pi, Codex, and Claude Code history into one validated `dataset.parquet` file.
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## Goal
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
Use this repo when you want an AI coding assistant to do one job end-to-end:
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
1. discover local session history
|
|
10
|
+
2. normalize it into the local Qwen35-compatible schema
|
|
11
|
+
3. label records by training use
|
|
12
|
+
4. write one final parquet dataset
|
|
10
13
|
|
|
11
|
-
|
|
12
|
-
- Node 18+ if you want to run via `npx`
|
|
14
|
+
The CLI is native Node.js + TypeScript. It does not require Python.
|
|
13
15
|
|
|
14
|
-
##
|
|
16
|
+
## Fastest path
|
|
15
17
|
|
|
16
|
-
|
|
18
|
+
If the package is published on npm:
|
|
17
19
|
|
|
18
20
|
```bash
|
|
19
|
-
npx agentic-dataset-builder --output-root ./out
|
|
21
|
+
npx --registry=https://registry.npmjs.org/ agentic-dataset-builder@0.2.1 --output-root ./out
|
|
20
22
|
```
|
|
21
23
|
|
|
22
|
-
|
|
24
|
+
If working from this repo locally:
|
|
23
25
|
|
|
24
26
|
```bash
|
|
25
|
-
|
|
27
|
+
npm install
|
|
28
|
+
npm run build
|
|
29
|
+
node dist/cli.js --output-root ./out
|
|
26
30
|
```
|
|
27
31
|
|
|
28
|
-
|
|
32
|
+
## What the command does
|
|
29
33
|
|
|
30
|
-
|
|
31
|
-
python run.py --output-root ./out
|
|
32
|
-
```
|
|
33
|
-
|
|
34
|
-
If you want to pre-create the Python environment yourself:
|
|
35
|
-
|
|
36
|
-
```bash
|
|
37
|
-
pip install -r requirements.txt
|
|
38
|
-
python run.py --output-root ./out
|
|
39
|
-
```
|
|
34
|
+
The CLI will:
|
|
40
35
|
|
|
41
|
-
|
|
36
|
+
- detect local session roots for `pi`, `codex`, and `claude`
|
|
37
|
+
- read supported history files
|
|
38
|
+
- validate normalized records with `Zod`
|
|
39
|
+
- keep only the labels you requested
|
|
40
|
+
- write one final parquet file
|
|
41
|
+
- write a manifest and a run log
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
## Default source behavior
|
|
44
44
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
- `pi`
|
|
46
|
+
- full agent traces
|
|
47
|
+
- can produce `cot_eligible` or `agent_only`
|
|
48
|
+
- `codex`
|
|
49
|
+
- full agent traces
|
|
50
|
+
- usually produces `agent_only`
|
|
51
|
+
- `claude`
|
|
52
|
+
- prompt history only for now
|
|
53
|
+
- produces `prompt_only`
|
|
48
54
|
|
|
49
|
-
|
|
55
|
+
Claude is intentionally low-fidelity right now. It is not treated as a full assistant/tool trace source.
|
|
50
56
|
|
|
51
|
-
|
|
52
|
-
- scan `~/.codex/sessions`
|
|
53
|
-
- convert session history into the local Qwen3.5 schema
|
|
54
|
-
- label records as `cot_eligible`, `agent_only`, or `discard`
|
|
55
|
-
- keep `cot_eligible` and `agent_only`
|
|
56
|
-
- merge them into one final parquet file
|
|
57
|
-
- remove intermediate directories automatically after success
|
|
57
|
+
## Default output
|
|
58
58
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
Each run creates one directory under `./out/`:
|
|
59
|
+
Each run creates one directory:
|
|
62
60
|
|
|
63
61
|
```text
|
|
64
|
-
|
|
62
|
+
<output-root>/agentic-dataset-<timestamp>/
|
|
65
63
|
dataset.parquet
|
|
66
64
|
manifest.json
|
|
67
65
|
run.log
|
|
68
66
|
```
|
|
69
67
|
|
|
70
|
-
|
|
68
|
+
Files:
|
|
71
69
|
|
|
72
70
|
- `dataset.parquet`
|
|
71
|
+
- final merged dataset
|
|
72
|
+
- `manifest.json`
|
|
73
|
+
- source roots, source counts, labels kept, output path
|
|
74
|
+
- `run.log`
|
|
75
|
+
- step-by-step execution log for debugging
|
|
73
76
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
- `manifest.json`: what was scanned, what was kept, summary stats
|
|
77
|
-
- `run.log`: full step-by-step execution log
|
|
77
|
+
## Recommended commands
|
|
78
78
|
|
|
79
|
-
|
|
79
|
+
Pi + Codex:
|
|
80
80
|
|
|
81
81
|
```bash
|
|
82
|
-
|
|
83
|
-
|
|
82
|
+
node dist/cli.js --output-root ./out --include-sources pi,codex --include-labels cot_eligible,agent_only
|
|
83
|
+
```
|
|
84
84
|
|
|
85
|
-
|
|
86
|
-
node cli.mjs --output-root ./out --include-sources codex
|
|
85
|
+
Codex + Claude prompt-only:
|
|
87
86
|
|
|
88
|
-
|
|
89
|
-
node cli.
|
|
87
|
+
```bash
|
|
88
|
+
node dist/cli.js --output-root ./out --include-sources codex,claude --include-labels agent_only,prompt_only
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Pi only:
|
|
90
92
|
|
|
91
|
-
|
|
92
|
-
node cli.
|
|
93
|
+
```bash
|
|
94
|
+
node dist/cli.js --output-root ./out --include-sources pi --include-labels cot_eligible,agent_only
|
|
93
95
|
```
|
|
94
96
|
|
|
95
|
-
##
|
|
97
|
+
## Important flags
|
|
96
98
|
|
|
97
|
-
-
|
|
98
|
-
-
|
|
99
|
+
- `--output-root <dir>`
|
|
100
|
+
- output root directory (defaults to `./out` when the flag is omitted)
|
|
101
|
+
- `--include-sources <csv>`
|
|
102
|
+
- any of: `pi,codex,claude`
|
|
103
|
+
- `--include-labels <csv>`
|
|
104
|
+
- any of: `cot_eligible,agent_only,prompt_only,discard`
|
|
105
|
+
- `--pi-root <dir>`
|
|
106
|
+
- override detected Pi session path
|
|
107
|
+
- `--codex-root <dir>`
|
|
108
|
+
- override detected Codex session path
|
|
109
|
+
- `--claude-root <dir>`
|
|
110
|
+
- override detected Claude project-history path
|
|
111
|
+
- `--help`
|
|
112
|
+
- print CLI help
|
|
99
113
|
|
|
100
|
-
|
|
114
|
+
## Auto-detected paths
|
|
101
115
|
|
|
102
|
-
|
|
116
|
+
The CLI tries OS-specific defaults automatically.
|
|
103
117
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
run
|
|
115
|
-
|
|
116
|
-
|
|
118
|
+
Typical paths:
|
|
119
|
+
|
|
120
|
+
- Pi: `~/.pi/agent/sessions`
|
|
121
|
+
- Codex: `~/.codex/sessions`
|
|
122
|
+
- Claude: `~/.claude/projects`
|
|
123
|
+
|
|
124
|
+
On Windows it also checks `APPDATA` and `LOCALAPPDATA` variants.
|
|
125
|
+
|
|
126
|
+
## Verification checklist
|
|
127
|
+
|
|
128
|
+
After a run, verify these three things:
|
|
129
|
+
|
|
130
|
+
1. `dataset.parquet` exists
|
|
131
|
+
2. `manifest.json` exists
|
|
132
|
+
3. `run.log` does not end with an uncaught error
|
|
133
|
+
|
|
134
|
+
Typical quick check:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
ls ./out/agentic-dataset-*/
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Development notes
|
|
141
|
+
|
|
142
|
+
Useful development commands:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
npm run check
|
|
146
|
+
npm run test
|
|
147
|
+
npm run build
|
|
117
148
|
```
|
|
118
149
|
|
|
119
|
-
|
|
150
|
+
This repo currently includes:
|
|
120
151
|
|
|
121
|
-
-
|
|
122
|
-
-
|
|
123
|
-
-
|
|
124
|
-
- Codex traces are still useful for agent-behavior distillation even when reasoning is encrypted-only.
|
|
125
|
-
- Redaction is not included yet. Add it before distributing the tool broadly if users may have sensitive local data.
|
|
152
|
+
- Zod validation for source events and final records
|
|
153
|
+
- Vitest coverage for core schema and labeling paths
|
|
154
|
+
- native parquet writing in TypeScript
|
package/dist/cli.d.ts
ADDED
package/dist/cli.js
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
import { candidateClaudeRoots, candidateCodexRoots, candidatePiRoots, firstExisting } from './platform/paths.js';
|
|
5
|
+
import { collectPiRecords } from './sources/pi.js';
|
|
6
|
+
import { collectCodexRecords } from './sources/codex.js';
|
|
7
|
+
import { collectClaudePromptOnlyRecords } from './sources/claude.js';
|
|
8
|
+
import { labelRecord } from './labeling.js';
|
|
9
|
+
import { Qwen35RecordSchema } from './schemas/qwen35.js';
|
|
10
|
+
import { writeParquet } from './parquet.js';
|
|
11
|
+
/**
 * Parse command-line arguments into the run configuration.
 *
 * `--help` / `-h` prints the usage text and exits the process with code 0.
 * Every other option takes exactly one value. Unlike a lenient parser, this
 * one refuses to guess: an unknown option, an option without a value, or an
 * unknown source/label name raises an error instead of silently falling back
 * to the defaults (which would export data the user did not ask for).
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {{outputRoot: string, includeSources: string[], includeLabels: Set<string>, piRoot?: string, codexRoot?: string, claudeRoot?: string}}
 *   Parsed options; the three `*Root` keys are present only when given.
 * @throws {Error} On an unknown option, a missing value, or an unknown
 *   source/label name.
 */
function parseArgs(argv) {
    if (argv.includes('--help') || argv.includes('-h')) {
        console.log(`agentic-dataset-builder@0.2.1

Usage:
npx agentic-dataset-builder@0.2.1 --output-root ./out

Options:
--output-root <dir> Output directory root
--include-sources <list> Comma-separated: pi,codex,claude
--include-labels <list> Comma-separated: cot_eligible,agent_only,prompt_only,discard
--pi-root <dir> Override Pi session root
--codex-root <dir> Override Codex session root
--claude-root <dir> Override Claude project history root
--help Show this help message
`);
        process.exit(0);
    }
    const knownSources = ['pi', 'codex', 'claude'];
    const knownLabels = ['cot_eligible', 'agent_only', 'prompt_only', 'discard'];
    const args = {
        outputRoot: './out',
        includeSources: ['pi', 'codex'],
        includeLabels: new Set(['cot_eligible', 'agent_only']),
    };
    // Split a comma-separated value and reject names outside the allowed set.
    const parseList = (raw, allowed, kind) => {
        const names = raw.split(',').map((v) => v.trim()).filter(Boolean);
        const unknown = names.filter((name) => !allowed.includes(name));
        if (unknown.length > 0) {
            throw new Error(`Unknown ${kind}: ${unknown.join(', ')} (expected any of: ${allowed.join(',')})`);
        }
        return names;
    };
    // A Map (not a plain object) so inherited keys can never match a flag.
    const setters = new Map([
        ['--output-root', (value) => { args.outputRoot = value; }],
        ['--include-sources', (value) => { args.includeSources = parseList(value, knownSources, 'source'); }],
        ['--include-labels', (value) => { args.includeLabels = new Set(parseList(value, knownLabels, 'label')); }],
        ['--pi-root', (value) => { args.piRoot = value; }],
        ['--codex-root', (value) => { args.codexRoot = value; }],
        ['--claude-root', (value) => { args.claudeRoot = value; }],
    ]);
    for (let i = 0; i < argv.length; i += 2) {
        const flag = argv[i];
        const apply = setters.get(flag);
        if (!apply) {
            throw new Error(`Unknown argument: ${flag} (run with --help for usage)`);
        }
        const value = argv[i + 1];
        // A following "--something" is the next flag, not this flag's value.
        if (!value || value.startsWith('--')) {
            throw new Error(`Missing value for ${flag}`);
        }
        apply(value);
    }
    return args;
}
|
|
64
|
+
/**
 * Build the absolute path of a per-run output directory under `base`.
 *
 * The directory name is `agentic-dataset-` followed by the current UTC ISO
 * timestamp with every `:` and `.` replaced by `-`.
 *
 * @param {string} base - Output root the run directory lives under.
 * @returns {string} Absolute path of the run directory (not created here).
 */
function stampDir(base) {
    const safeStamp = new Date().toISOString().split(/[:.]/).join('-');
    const dirName = `agentic-dataset-${safeStamp}`;
    return path.resolve(base, dirName);
}
|
|
68
|
+
/**
 * Create the run logger and (re)initialise `run.log` inside `runDir`.
 *
 * The log file is truncated to empty on creation. Each `log` call prints
 * `[step] message` to stdout and appends the same line to the file.
 *
 * @param {string} runDir - Existing directory that will hold `run.log`.
 * @returns {{logPath: string, log: (step: string, message: string) => void}}
 */
function createLogger(runDir) {
    const logPath = path.join(runDir, 'run.log');
    fs.writeFileSync(logPath, '', 'utf8');

    const log = (step, message) => {
        const entry = `[${step}] ${message}`;
        console.log(entry);
        fs.appendFileSync(logPath, `${entry}\n`, 'utf8');
    };

    return { logPath, log };
}
|
|
80
|
+
/**
 * CLI entry point: collect records from each requested source, keep the
 * requested labels, and write `dataset.parquet`, `manifest.json` and
 * `run.log` into a fresh timestamped run directory.
 *
 * Sources are processed in the order given. A source name with no registered
 * reader is reported in the log and skipped rather than silently ignored.
 * Any failure after the logger exists is appended to `run.log` before being
 * rethrown, so the log reflects why a run stopped.
 *
 * @returns {Promise<void>} Resolves once the manifest has been written and
 *   printed to stdout as a single JSON line.
 * @throws Whatever the collectors, record validation, or file writes throw.
 */
async function main() {
    const args = parseArgs(process.argv.slice(2));
    const runDir = stampDir(args.outputRoot);
    fs.mkdirSync(runDir, { recursive: true });
    const logger = createLogger(runDir);
    // One entry per supported source; replaces three copy-pasted branches.
    const readers = new Map([
        ['pi', { override: args.piRoot, candidates: candidatePiRoots, collect: collectPiRecords }],
        ['codex', { override: args.codexRoot, candidates: candidateCodexRoots, collect: collectCodexRecords }],
        ['claude', { override: args.claudeRoot, candidates: candidateClaudeRoots, collect: collectClaudePromptOnlyRecords }],
    ]);
    try {
        const allRecords = [];
        const sourceStats = {};
        for (const source of args.includeSources) {
            const reader = readers.get(source);
            if (!reader) {
                logger.log(source, `unknown source, skipping (expected one of: ${[...readers.keys()].join(', ')})`);
                continue;
            }
            // Auto-detection only runs when no explicit root was supplied.
            const root = path.resolve(reader.override ?? firstExisting(reader.candidates()));
            logger.log(source, `reading ${root}`);
            const records = await reader.collect(root);
            sourceStats[source] = { records: records.length };
            // Count by delta so the figure does not depend on which fields
            // survive schema parsing of the stored record.
            const keptBefore = allRecords.length;
            for (const record of records) {
                pushLabeled(record, source, args.includeLabels, allRecords);
            }
            logger.log(source, `kept ${allRecords.length - keptBefore} labeled records`);
        }
        const datasetPath = path.join(runDir, 'dataset.parquet');
        logger.log('write', `writing ${allRecords.length} records to ${datasetPath}`);
        await writeParquet(datasetPath, allRecords);
        const manifest = {
            runDir,
            datasetParquetPath: datasetPath,
            recordCount: allRecords.length,
            includeSources: args.includeSources,
            includeLabels: [...args.includeLabels],
            sourceStats,
            runLog: logger.logPath,
        };
        fs.writeFileSync(path.join(runDir, 'manifest.json'), `${JSON.stringify(manifest, null, 2)}\n`, 'utf8');
        console.log(JSON.stringify(manifest));
    }
    catch (error) {
        const detail = error instanceof Error ? error.stack ?? error.message : String(error);
        try {
            // File only: the caller of main() already prints the error.
            fs.appendFileSync(logger.logPath, `[error] ${detail}\n`, 'utf8');
        }
        catch {
            // Failing to record the error must not mask the original failure.
        }
        throw error;
    }
}
|
|
131
|
+
/**
 * Label one record and, if its label was requested, append the validated,
 * enriched record to `target`.
 *
 * The enriched record carries the label, source system, source bucket
 * (`'lossy'` when `record.meta.lossy_source` is truthy, else `'strict'`),
 * the label statistics under `agentic_label`, and `dataset_*` copies of the
 * same facts inside `meta`. It is passed through `Qwen35RecordSchema.parse`
 * before being stored.
 *
 * @param {object} record - Normalized record with `messages` and `meta`.
 * @param {string} sourceSystem - Name of the source the record came from.
 * @param {Set<string>} includeLabels - Labels to keep.
 * @param {object[]} target - Array that receives the enriched record.
 * @returns {void}
 */
function pushLabeled(record, sourceSystem, includeLabels, target) {
    const stats = labelRecord(record);
    if (includeLabels.has(stats.label)) {
        const { meta } = record;
        const bucket = meta.lossy_source ? 'lossy' : 'strict';
        const agenticLabel = {
            label: stats.label,
            tool_call_count: stats.toolCallCount,
            tool_message_count: stats.toolMessageCount,
            dialogue_rounds_est: stats.dialogueRounds,
            reasoning_chars: stats.reasoningChars,
            has_reasoning: stats.hasReasoning,
            lossy_source: meta.lossy_source,
            lossy_reasons: stats.lossyReasons,
        };
        const enrichedMeta = {
            ...meta,
            dataset_label: stats.label,
            dataset_source_system: sourceSystem,
            dataset_source_bucket: bucket,
            dataset_source_file: meta.source,
            dataset_has_reasoning: stats.hasReasoning,
            dataset_reasoning_chars: stats.reasoningChars,
        };
        const candidate = {
            ...record,
            label: stats.label,
            source_system: sourceSystem,
            source_bucket: bucket,
            source_file: meta.source,
            agentic_label: agenticLabel,
            meta: enrichedMeta,
        };
        target.push(Qwen35RecordSchema.parse(candidate));
    }
}
|
|
163
|
+
// Run the CLI; on any failure print the stack (or message) to stderr and
// exit with a non-zero status.
main().catch((failure) => {
    const detail = failure instanceof Error
        ? failure.stack ?? failure.message
        : String(failure);
    console.error(detail);
    process.exit(1);
});
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { Qwen35Record } from './schemas/qwen35.js';
|
|
2
|
+
/**
 * Training-use label assigned to a record by `labelRecord`.
 *
 * - `prompt_only`: the record's lossy reasons include `prompt_history_only`.
 * - `cot_eligible`: at least one tool call, one tool message and one user
 *   message, plus non-empty assistant reasoning text.
 * - `agent_only`: the same agentic criteria, without reasoning text.
 * - `discard`: none of the above.
 */
export type Label = 'cot_eligible' | 'agent_only' | 'prompt_only' | 'discard';
|
|
3
|
+
/**
 * Result of labeling one record.
 *
 * - `label`: the assigned label.
 * - `toolCallCount`: total `tool_calls` entries across assistant messages.
 * - `toolMessageCount`: number of messages with role `tool`.
 * - `dialogueRounds`: number of messages with role `user`.
 * - `reasoningChars`: summed length of string `reasoning_content` on
 *   assistant messages.
 * - `hasReasoning`: whether `reasoningChars` is greater than zero.
 * - `lossyReasons`: the record's `meta.lossy_reasons` list.
 */
export interface LabelInfo {
|
|
4
|
+
label: Label;
|
|
5
|
+
toolCallCount: number;
|
|
6
|
+
toolMessageCount: number;
|
|
7
|
+
dialogueRounds: number;
|
|
8
|
+
reasoningChars: number;
|
|
9
|
+
hasReasoning: boolean;
|
|
10
|
+
lossyReasons: string[];
|
|
11
|
+
}
|
|
12
|
+
/**
 * Compute the training-use label and supporting message statistics for a
 * record.
 *
 * @param record - Normalized record to inspect.
 * @returns The label together with the counts it was derived from.
 */
export declare function labelRecord(record: Qwen35Record): LabelInfo;
|
package/dist/labeling.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
 * Classify a record by training use and report the statistics behind it.
 *
 * Counts are gathered in one pass over `record.messages`:
 * tool calls and reasoning characters come from assistant messages only,
 * tool messages and user messages are counted by role.
 *
 * Label precedence: `prompt_only` (lossy reasons contain
 * `prompt_history_only`), then `cot_eligible` (agentic with reasoning), then
 * `agent_only` (agentic), otherwise `discard`. "Agentic" means at least one
 * tool call, one tool message and one user message.
 *
 * @param {object} record - Record with `messages` and `meta.lossy_reasons`.
 * @returns {{label: string, toolCallCount: number, toolMessageCount: number, dialogueRounds: number, reasoningChars: number, hasReasoning: boolean, lossyReasons: string[]}}
 */
export function labelRecord(record) {
    let toolCallCount = 0;
    let toolMessageCount = 0;
    let dialogueRounds = 0;
    let reasoningChars = 0;

    record.messages.forEach((message) => {
        if (message.role === 'assistant') {
            toolCallCount += message.tool_calls?.length ?? 0;
            if (typeof message.reasoning_content === 'string') {
                reasoningChars += message.reasoning_content.length;
            }
        }
        else if (message.role === 'tool') {
            toolMessageCount += 1;
        }
        else if (message.role === 'user') {
            dialogueRounds += 1;
        }
    });

    const hasReasoning = reasoningChars > 0;
    const lossyReasons = record.meta.lossy_reasons;
    const isPromptOnly = lossyReasons.includes('prompt_history_only');
    const isAgentic = !(toolCallCount < 1 || toolMessageCount < 1 || dialogueRounds < 1);

    const pickLabel = () => {
        if (isPromptOnly) {
            return 'prompt_only';
        }
        if (!isAgentic) {
            return 'discard';
        }
        return hasReasoning ? 'cot_eligible' : 'agent_only';
    };

    return { label: pickLabel(), toolCallCount, toolMessageCount, dialogueRounds, reasoningChars, hasReasoning, lossyReasons };
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { Qwen35Record } from './schemas/qwen35.js';
|
|
2
|
+
/** Build the parquet schema describing the flat columns of the output dataset. */
export declare function parquetSchema(): any;
|
|
3
|
+
/**
 * Flatten one record into a parquet row: scalar `meta` fields become columns,
 * message statistics are computed, and `messages`, `tools` and `meta` are
 * stored as JSON strings.
 */
export declare function recordToParquetRow(record: Qwen35Record): Record<string, unknown>;
|
|
4
|
+
/**
 * Write all records, one row each, to a parquet file at `filePath`.
 * The writer is closed even when appending a row fails.
 */
export declare function writeParquet(filePath: string, records: Qwen35Record[]): Promise<void>;
|
package/dist/parquet.js
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import parquet from 'parquetjs-lite';
|
|
2
|
+
/**
 * Build the parquet schema for the output dataset.
 *
 * Columns are declared as an ordered table of `[name, type, optional?]`
 * entries; the resulting schema has the columns in exactly this order.
 * Optional columns get `optional: true`, all others only a `type`.
 *
 * @returns {object} A `parquet.ParquetSchema` instance.
 */
export function parquetSchema() {
    const OPTIONAL = true;
    const columns = [
        ['id', 'UTF8'],
        ['request_id', 'UTF8', OPTIONAL],
        ['endpoint', 'UTF8'],
        ['status', 'INT64'],
        ['ts', 'UTF8'],
        ['key', 'UTF8', OPTIONAL],
        ['source', 'UTF8', OPTIONAL],
        ['requested_model', 'UTF8', OPTIONAL],
        ['actual_model', 'UTF8', OPTIONAL],
        ['stream', 'BOOLEAN', OPTIONAL],
        ['thinking_level', 'UTF8', OPTIONAL],
        ['reasoning_summary_mode_json', 'UTF8', OPTIONAL],
        ['thinking_type', 'UTF8', OPTIONAL],
        ['thinking_budget_tokens', 'INT64', OPTIONAL],
        ['max_output_tokens', 'INT64', OPTIONAL],
        ['tool_spec_count', 'INT64', OPTIONAL],
        ['tool_choice_json', 'UTF8', OPTIONAL],
        ['request_contains_non_text_content', 'BOOLEAN'],
        ['request_image_block_count', 'INT64'],
        ['request_video_block_count', 'INT64'],
        ['request_tool_call_block_count', 'INT64'],
        ['request_tool_result_block_count', 'INT64'],
        ['request_thinking_block_count', 'INT64'],
        ['response_contains_non_text_content', 'BOOLEAN'],
        ['response_image_block_count', 'INT64'],
        ['response_video_block_count', 'INT64'],
        ['response_tool_call_block_count', 'INT64'],
        ['response_tool_result_block_count', 'INT64'],
        ['response_thinking_block_count', 'INT64'],
        ['request_truncated', 'BOOLEAN'],
        ['response_truncated', 'BOOLEAN'],
        ['lossy_source', 'BOOLEAN'],
        ['lossy_reasons_json', 'UTF8'],
        ['user_message_count', 'INT64'],
        ['assistant_message_count', 'INT64'],
        ['tool_message_count', 'INT64'],
        ['dialogue_rounds_est', 'INT64'],
        ['tool_call_count', 'INT64'],
        ['has_reasoning', 'BOOLEAN'],
        ['reasoning_chars', 'INT64'],
        ['content_chars_total', 'INT64'],
        ['messages_json', 'UTF8'],
        ['tools_json', 'UTF8'],
        ['meta_json', 'UTF8'],
    ];
    const fields = {};
    for (const [name, type, optional] of columns) {
        fields[name] = optional ? { type, optional: true } : { type };
    }
    return new parquet.ParquetSchema(fields);
}
|
|
50
|
+
/**
 * Flatten one record into a row matching the parquet schema's columns.
 *
 * Scalar `meta` fields are copied to columns, message statistics are
 * computed from `record.messages`, and `messages`, `tools` and `meta` are
 * serialised to JSON strings.
 *
 * Two robustness rules apply:
 * - Every optional column is normalised from `null` to `undefined`, so an
 *   absent value is represented the same way for all of them.
 * - A message without `content` contributes the length of `"null"` to
 *   `content_chars_total`, the same as an explicit `null`, instead of
 *   throwing (`JSON.stringify(undefined)` is not a string).
 *
 * @param {object} record - Record with `id`, `request_id`, `messages`,
 *   `tools` and `meta`.
 * @returns {Record<string, unknown>} Row object keyed by column name.
 */
export function recordToParquetRow(record) {
    const { messages, meta } = record;
    const orAbsent = (value) => value ?? undefined;

    let userMessageCount = 0;
    let assistantMessageCount = 0;
    let toolMessageCount = 0;
    let toolCallCount = 0;
    let reasoningChars = 0;
    let contentCharsTotal = 0;
    for (const message of messages) {
        contentCharsTotal += JSON.stringify(message.content ?? null).length;
        if (message.role === 'user') {
            userMessageCount += 1;
        }
        else if (message.role === 'tool') {
            toolMessageCount += 1;
        }
        else if (message.role === 'assistant') {
            assistantMessageCount += 1;
            toolCallCount += message.tool_calls?.length ?? 0;
            if (typeof message.reasoning_content === 'string') {
                reasoningChars += message.reasoning_content.length;
            }
        }
    }

    return {
        id: record.id,
        request_id: orAbsent(record.request_id),
        endpoint: meta.endpoint,
        status: meta.status,
        ts: meta.ts,
        key: orAbsent(meta.key),
        source: orAbsent(meta.source),
        requested_model: orAbsent(meta.requested_model),
        actual_model: orAbsent(meta.actual_model),
        stream: orAbsent(meta.stream),
        thinking_level: orAbsent(meta.thinking_level),
        reasoning_summary_mode_json: JSON.stringify(meta.reasoning_summary_mode ?? null),
        thinking_type: orAbsent(meta.thinking_type),
        thinking_budget_tokens: orAbsent(meta.thinking_budget_tokens),
        max_output_tokens: orAbsent(meta.max_output_tokens),
        tool_spec_count: orAbsent(meta.tool_spec_count),
        tool_choice_json: JSON.stringify(meta.tool_choice ?? null),
        request_contains_non_text_content: meta.request_contains_non_text_content,
        request_image_block_count: meta.request_image_block_count,
        request_video_block_count: meta.request_video_block_count,
        request_tool_call_block_count: meta.request_tool_call_block_count,
        request_tool_result_block_count: meta.request_tool_result_block_count,
        request_thinking_block_count: meta.request_thinking_block_count,
        response_contains_non_text_content: meta.response_contains_non_text_content,
        response_image_block_count: meta.response_image_block_count,
        response_video_block_count: meta.response_video_block_count,
        response_tool_call_block_count: meta.response_tool_call_block_count,
        response_tool_result_block_count: meta.response_tool_result_block_count,
        response_thinking_block_count: meta.response_thinking_block_count,
        request_truncated: meta.request_truncated,
        response_truncated: meta.response_truncated,
        lossy_source: meta.lossy_source,
        lossy_reasons_json: JSON.stringify(meta.lossy_reasons),
        user_message_count: userMessageCount,
        assistant_message_count: assistantMessageCount,
        tool_message_count: toolMessageCount,
        // Dialogue rounds are estimated as the number of user messages.
        dialogue_rounds_est: userMessageCount,
        tool_call_count: toolCallCount,
        has_reasoning: reasoningChars > 0,
        reasoning_chars: reasoningChars,
        content_chars_total: contentCharsTotal,
        messages_json: JSON.stringify(messages),
        tools_json: JSON.stringify(record.tools),
        meta_json: JSON.stringify(meta),
    };
}
|
|
105
|
+
/**
 * Write `records` to a parquet file at `filePath`, one row per record.
 *
 * Rows are appended sequentially in input order. The writer is closed in a
 * `finally` block, so it is released whether or not appending succeeds.
 *
 * @param {string} filePath - Destination parquet file.
 * @param {object[]} records - Records to flatten and write.
 * @returns {Promise<void>}
 */
export async function writeParquet(filePath, records) {
    const schema = parquetSchema();
    const out = await parquet.ParquetWriter.openFile(schema, filePath);
    try {
        for (let index = 0; index < records.length; index += 1) {
            const row = recordToParquetRow(records[index]);
            await out.appendRow(row);
        }
    }
    finally {
        await out.close();
    }
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import os from 'node:os';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import fs from 'node:fs';
|
|
4
|
+
/**
 * List candidate Pi session directories in priority order, without
 * checking that any of them exists.
 *
 * Order: the `PI_SESSION_ROOT` environment variable, the home-directory
 * default, then `pi` / `.pi` variants under `APPDATA` and `LOCALAPPDATA`
 * when those variables are set. Unset entries and duplicates are dropped.
 *
 * @returns {string[]} Candidate directory paths.
 */
export function candidatePiRoots() {
    const tail = ['agent', 'sessions'];
    const homeDefault = path.join(os.homedir(), '.pi', ...tail);
    const appDataVariants = [process.env.APPDATA, process.env.LOCALAPPDATA].flatMap((base) => ['pi', '.pi'].map((dirName) => (base ? path.join(base, dirName, ...tail) : undefined)));
    return dedupe([process.env.PI_SESSION_ROOT, homeDefault, ...appDataVariants]);
}
|
|
17
|
+
/**
 * List candidate Codex session directories in priority order, without
 * checking that any of them exists.
 *
 * Order: the `CODEX_SESSION_ROOT` environment variable, the home-directory
 * default, then `Codex` / `.codex` variants under `APPDATA` and
 * `LOCALAPPDATA` when those variables are set. Unset entries and duplicates
 * are dropped.
 *
 * @returns {string[]} Candidate directory paths.
 */
export function candidateCodexRoots() {
    const homeDefault = path.join(os.homedir(), '.codex', 'sessions');
    const appDataVariants = [process.env.APPDATA, process.env.LOCALAPPDATA].flatMap((base) => ['Codex', '.codex'].map((dirName) => (base ? path.join(base, dirName, 'sessions') : undefined)));
    return dedupe([process.env.CODEX_SESSION_ROOT, homeDefault, ...appDataVariants]);
}
|
|
30
|
+
/**
 * List candidate Claude project-history directories in priority order,
 * without checking that any of them exists.
 *
 * Order: the `CLAUDE_SESSION_ROOT` environment variable, the home-directory
 * default, then `Claude` / `.claude` variants under `APPDATA` and
 * `LOCALAPPDATA` when those variables are set. Unset entries and duplicates
 * are dropped.
 *
 * @returns {string[]} Candidate directory paths.
 */
export function candidateClaudeRoots() {
    const homeDefault = path.join(os.homedir(), '.claude', 'projects');
    const appDataVariants = [process.env.APPDATA, process.env.LOCALAPPDATA].flatMap((base) => ['Claude', '.claude'].map((dirName) => (base ? path.join(base, dirName, 'projects') : undefined)));
    return dedupe([process.env.CLAUDE_SESSION_ROOT, homeDefault, ...appDataVariants]);
}
|
|
43
|
+
/**
 * Pick the first candidate path that exists on disk.
 *
 * When none of the candidates exists, the first candidate is returned anyway
 * so the caller still has a concrete path to report or attempt.
 *
 * @param {string[]} candidates - Paths to probe, in priority order.
 * @returns {string} Absolute form of the chosen path.
 * @throws {TypeError} If `candidates` is empty; there is nothing to fall
 *   back to, and `path.resolve(undefined)` would otherwise fail with a
 *   message that does not point at the real problem.
 */
export function firstExisting(candidates) {
    if (candidates.length === 0) {
        throw new TypeError('firstExisting: no candidate paths were provided');
    }
    const existing = candidates.find((candidate) => fs.existsSync(candidate));
    return path.resolve(existing ?? candidates[0]);
}
|
|
50
|
+
/**
 * Drop falsy entries and repeated values, keeping first occurrences in
 * their original order.
 *
 * @param {Iterable<string | undefined | null>} values - Raw candidate list.
 * @returns {string[]} Unique truthy values.
 */
function dedupe(values) {
    // A Set iterates in insertion order, so first occurrences keep their place.
    const unique = new Set();
    for (const entry of values) {
        if (entry) {
            unique.add(entry);
        }
    }
    return [...unique];
}
|