agentic-dataset-builder 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,64 +1,43 @@
1
1
  # Agentic Dataset Builder
2
2
 
3
- Build one merged training dataset from local Pi and Codex session history.
3
+ Pure TypeScript CLI for building one merged parquet dataset from local Pi, Codex, and Claude Code history.
4
4
 
5
- ## Runtime
5
+ ## Requirements
6
6
 
7
- The core implementation is Python. User-facing entrypoints do not require bash.
7
+ - Node 18+
8
8
 
9
- Requirements:
9
+ ## Install and run
10
10
 
11
- - Python 3.10+
12
- - Node 18+ if you want to run via `npx`
13
-
14
- ## Recommended usage
15
-
16
- Published npm-style entrypoint:
11
+ Without installing globally:
17
12
 
18
13
  ```bash
19
- npx agentic-dataset-builder --output-root ./out
14
+ npx agentic-dataset-builder@0.2.0 --output-root ./out
20
15
  ```
21
16
 
22
- Local repo usage without bash:
17
+ Local development:
23
18
 
24
19
  ```bash
25
- node cli.mjs --output-root ./out
20
+ npm install
21
+ npm run build
22
+ node dist/cli.js --output-root ./out
26
23
  ```
27
24
 
28
- Direct Python entrypoint:
25
+ ## Examples
29
26
 
30
27
  ```bash
31
- python run.py --output-root ./out
32
- ```
28
+ # Pi + Codex
29
+ npx agentic-dataset-builder@0.2.0 --output-root ./out --include-sources pi,codex --include-labels cot_eligible,agent_only
33
30
 
34
- If you want to pre-create the Python environment yourself:
31
+ # Codex + Claude prompt-only
32
+ npx agentic-dataset-builder@0.2.0 --output-root ./out --include-sources codex,claude --include-labels agent_only,prompt_only
35
33
 
36
- ```bash
37
- pip install -r requirements.txt
38
- python run.py --output-root ./out
34
+ # Pi only
35
+ npx agentic-dataset-builder@0.2.0 --output-root ./out --include-sources pi --include-labels cot_eligible,agent_only
39
36
  ```
40
37
 
41
- ## What users run
38
+ ## Output
42
39
 
43
- From this directory, the simplest no-bash local command is:
44
-
45
- ```bash
46
- node cli.mjs --output-root ./out
47
- ```
48
-
49
- That one command will:
50
-
51
- - scan `~/.pi/agent/sessions`
52
- - scan `~/.codex/sessions`
53
- - convert session history into the local Qwen3.5 schema
54
- - label records as `cot_eligible`, `agent_only`, or `discard`
55
- - keep `cot_eligible` and `agent_only`
56
- - merge them into one final parquet file
57
- - remove intermediate directories automatically after success
58
-
59
- ## Final output
60
-
61
- Each run creates one directory under `./out/`:
40
+ Each run creates a directory like:
62
41
 
63
42
  ```text
64
43
  out/agentic-dataset-<timestamp>/
@@ -67,59 +46,18 @@ out/agentic-dataset-<timestamp>/
67
46
  run.log
68
47
  ```
69
48
 
70
- Default deliverable is just one user-facing dataset file:
71
-
72
- - `dataset.parquet`
73
-
74
- Supporting files:
75
-
76
- - `manifest.json`: what was scanned, what was kept, summary stats
77
- - `run.log`: full step-by-step execution log
78
-
79
- ## Common options
80
-
81
- ```bash
82
- # only Pi
83
- node cli.mjs --output-root ./out --include-sources pi
84
-
85
- # only Codex
86
- node cli.mjs --output-root ./out --include-sources codex
87
-
88
- # keep intermediates for debugging
89
- node cli.mjs --output-root ./out --keep-intermediates
90
-
91
- # also emit final merged jsonl/jsonl.gz
92
- node cli.mjs --output-root ./out --final-format both
93
- ```
94
-
95
- ## What is kept by default
96
-
97
- - `cot_eligible`: agentic traces with visible reasoning
98
- - `agent_only`: agentic traces without visible reasoning
49
+ - `dataset.parquet`: final merged dataset
50
+ - `manifest.json`: source roots, counts, labels, and summary stats
51
+ - `run.log`: step-by-step execution log
99
52
 
100
- `discard` records are excluded from the final dataset by default.
53
+ ## Source support
101
54
 
102
- ## Package layout
103
-
104
- ```text
105
- agentic_dataset/
106
- build_agentic_dataset.py
107
- export_pi_session.py
108
- export_pi_session_to_qwen35.py
109
- export_codex_session_to_qwen35.py
110
- label_qwen35_agentic.py
111
- export_qwen35_training.py
112
- qwen35_training_record.py
113
- run.sh
114
- run.py
115
- cli.mjs
116
- README.md
117
- ```
55
+ - `pi`: full agent trace with visible reasoning when available
56
+ - `codex`: agent trace, often without visible reasoning
57
+ - `claude`: prompt-history only for now, labeled `prompt_only`
118
58
 
119
59
  ## Notes
120
60
 
121
- - default session roots are auto-detected for Linux, macOS, and Windows
122
- - override session paths with `--pi-root` and `--codex-root` if needed
123
- - Pi currently provides much better visible reasoning coverage than Codex.
124
- - Codex traces are still useful for agent-behavior distillation even when reasoning is encrypted-only.
125
- - Redaction is not included yet. Add it before distributing the tool broadly if users may have sensitive local data.
61
+ - default source roots are auto-detected for Linux, macOS, and Windows
62
+ - override paths with `--pi-root`, `--codex-root`, and `--claude-root`
63
+ - Claude is intentionally low-fidelity right now: user prompt history only, not full assistant/tool trace
package/dist/cli.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
package/dist/cli.js ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env node
2
+ import fs from 'node:fs';
3
+ import path from 'node:path';
4
+ import { candidateClaudeRoots, candidateCodexRoots, candidatePiRoots, firstExisting } from './platform/paths.js';
5
+ import { collectPiRecords } from './sources/pi.js';
6
+ import { collectCodexRecords } from './sources/codex.js';
7
+ import { collectClaudePromptOnlyRecords } from './sources/claude.js';
8
+ import { labelRecord } from './labeling.js';
9
+ import { Qwen35RecordSchema } from './schemas/qwen35.js';
10
+ import { writeParquet } from './parquet.js';
11
/**
 * Parse command-line arguments into the option bag used by `main`.
 *
 * Recognised flags: `--output-root`, `--include-sources`, `--include-labels`,
 * `--pi-root`, `--codex-root`, `--claude-root`. Each flag takes its value
 * either from the following argv entry (`--flag value`) or inline
 * (`--flag=value`). A recognised flag with a missing or empty value, and any
 * unrecognised argument, is ignored so the defaults stay in effect.
 *
 * @param {string[]} argv - Arguments after the node executable and script path.
 * @returns {{outputRoot: string, includeSources: string[], includeLabels: Set<string>,
 *   piRoot?: string, codexRoot?: string, claudeRoot?: string}} Parsed options.
 */
function parseArgs(argv) {
    const args = {
        outputRoot: './out',
        includeSources: ['pi', 'codex'],
        includeLabels: new Set(['cot_eligible', 'agent_only']),
    };
    // Comma-separated list -> trimmed, non-empty entries.
    const splitList = (value) => value.split(',').map((v) => v.trim()).filter(Boolean);
    const setters = new Map([
        ['--output-root', (value) => { args.outputRoot = value; }],
        ['--include-sources', (value) => { args.includeSources = splitList(value); }],
        ['--include-labels', (value) => { args.includeLabels = new Set(splitList(value)); }],
        ['--pi-root', (value) => { args.piRoot = value; }],
        ['--codex-root', (value) => { args.codexRoot = value; }],
        ['--claude-root', (value) => { args.claudeRoot = value; }],
    ]);
    for (let i = 0; i < argv.length; i += 1) {
        const arg = argv[i];
        const eq = arg.indexOf('=');
        // Only treat "=" as a separator when the text before it is a known flag,
        // so values such as paths containing "=" are never split.
        const inline = eq !== -1 && setters.has(arg.slice(0, eq));
        const flag = inline ? arg.slice(0, eq) : arg;
        const apply = setters.get(flag);
        if (!apply) {
            continue;
        }
        const value = inline ? arg.slice(eq + 1) : argv[i + 1];
        if (!value) {
            continue;
        }
        apply(value);
        if (!inline) {
            // The value occupied its own argv slot; step over it.
            i += 1;
        }
    }
    return args;
}
47
/**
 * Build the absolute path of a fresh, timestamped run directory.
 *
 * The directory name is `agentic-dataset-` followed by the current ISO-8601
 * timestamp with every `:` and `.` replaced by `-`.
 *
 * @param {string} base - Output root the run directory is resolved against.
 * @returns {string} Absolute path of the run directory (not created here).
 */
function stampDir(base) {
    const timestamp = new Date().toISOString();
    const safeStamp = timestamp.split(/[:.]/).join('-');
    const dirName = `agentic-dataset-${safeStamp}`;
    return path.resolve(base, dirName);
}
51
/**
 * Create a logger that mirrors every line to stdout and to `<runDir>/run.log`.
 *
 * The log file is created (or truncated to empty) immediately.
 *
 * @param {string} runDir - Existing directory that will hold `run.log`.
 * @returns {{logPath: string, log: (step: string, message: string) => void}}
 *   The log file path and a `log` function that emits `[step] message`.
 */
function createLogger(runDir) {
    const logPath = path.join(runDir, 'run.log');
    fs.writeFileSync(logPath, '', 'utf8');

    function log(step, message) {
        const entry = `[${step}] ${message}`;
        console.log(entry);
        fs.appendFileSync(logPath, `${entry}\n`, 'utf8');
    }

    return { logPath, log };
}
63
/**
 * CLI entry point: collect records from each requested source, keep the ones
 * whose label is in the include set, write them to `dataset.parquet`, and
 * emit `manifest.json` (also printed to stdout as single-line JSON).
 *
 * Each distinct source name is processed once, in the order first given, so a
 * repeated name (e.g. `pi,pi`) cannot ingest the same records twice. Names
 * other than `pi`, `codex`, and `claude` are reported in the log and skipped.
 *
 * @returns {Promise<void>} Resolves once the dataset and manifest are written.
 */
async function main() {
    const args = parseArgs(process.argv.slice(2));
    const runDir = stampDir(args.outputRoot);
    fs.mkdirSync(runDir, { recursive: true });
    const logger = createLogger(runDir);

    // One entry per supported source: explicit root override, default-root
    // candidates (only evaluated when no override is given), and the collector.
    const readers = new Map([
        ['pi', { override: args.piRoot, candidates: candidatePiRoots, collect: collectPiRecords }],
        ['codex', { override: args.codexRoot, candidates: candidateCodexRoots, collect: collectCodexRecords }],
        ['claude', { override: args.claudeRoot, candidates: candidateClaudeRoots, collect: collectClaudePromptOnlyRecords }],
    ]);

    const allRecords = [];
    const sourceStats = {};
    for (const source of new Set(args.includeSources)) {
        const reader = readers.get(source);
        if (!reader) {
            logger.log(source, 'unknown source, skipping');
            continue;
        }
        const root = path.resolve(reader.override ?? firstExisting(reader.candidates()));
        logger.log(source, `reading ${root}`);
        const records = await reader.collect(root);
        sourceStats[source] = { records: records.length };
        // Records kept for this source = growth of the shared list during this pass.
        const keptBefore = allRecords.length;
        for (const record of records) {
            pushLabeled(record, source, args.includeLabels, allRecords);
        }
        logger.log(source, `kept ${allRecords.length - keptBefore} labeled records`);
    }

    const datasetPath = path.join(runDir, 'dataset.parquet');
    logger.log('write', `writing ${allRecords.length} records to ${datasetPath}`);
    await writeParquet(datasetPath, allRecords);
    const manifest = {
        runDir,
        datasetParquetPath: datasetPath,
        recordCount: allRecords.length,
        includeSources: args.includeSources,
        includeLabels: [...args.includeLabels],
        sourceStats,
        runLog: logger.logPath,
    };
    fs.writeFileSync(path.join(runDir, 'manifest.json'), `${JSON.stringify(manifest, null, 2)}\n`, 'utf8');
    console.log(JSON.stringify(manifest));
}
114
/**
 * Label one record and, if its label is in `includeLabels`, append an
 * enriched, schema-validated copy to `target`.
 *
 * The enriched copy adds top-level label/source fields, an `agentic_label`
 * summary, and `dataset_*` entries inside `meta`. Records whose label is not
 * included are dropped without touching `target`.
 *
 * @param {object} record - Source record; `meta.lossy_source` and `meta.source` are read.
 * @param {string} sourceSystem - Name of the originating source.
 * @param {Set<string>} includeLabels - Labels to keep.
 * @param {object[]} target - Array that receives the enriched record.
 * @returns {void}
 */
function pushLabeled(record, sourceSystem, includeLabels, target) {
    const info = labelRecord(record);
    const { label } = info;
    if (includeLabels.has(label)) {
        const { meta } = record;
        const bucket = meta.lossy_source ? 'lossy' : 'strict';
        const agenticLabel = {
            label,
            tool_call_count: info.toolCallCount,
            tool_message_count: info.toolMessageCount,
            dialogue_rounds_est: info.dialogueRounds,
            reasoning_chars: info.reasoningChars,
            has_reasoning: info.hasReasoning,
            lossy_source: meta.lossy_source,
            lossy_reasons: info.lossyReasons,
        };
        const enrichedMeta = {
            ...meta,
            dataset_label: label,
            dataset_source_system: sourceSystem,
            dataset_source_bucket: bucket,
            dataset_source_file: meta.source,
            dataset_has_reasoning: info.hasReasoning,
            dataset_reasoning_chars: info.reasoningChars,
        };
        const validated = Qwen35RecordSchema.parse({
            ...record,
            label,
            source_system: sourceSystem,
            source_bucket: bucket,
            source_file: meta.source,
            agentic_label: agenticLabel,
            meta: enrichedMeta,
        });
        target.push(validated);
    }
}
146
// Run the CLI; on any failure print the stack (or message, or stringified
// value for non-Error rejections) to stderr and exit with status 1.
main().catch((failure) => {
    let detail;
    if (failure instanceof Error) {
        detail = failure.stack ?? failure.message;
    } else {
        detail = String(failure);
    }
    console.error(detail);
    process.exit(1);
});
@@ -0,0 +1,12 @@
1
+ import type { Qwen35Record } from './schemas/qwen35.js';
2
+ export type Label = 'cot_eligible' | 'agent_only' | 'prompt_only' | 'discard';
3
+ export interface LabelInfo {
4
+ label: Label;
5
+ toolCallCount: number;
6
+ toolMessageCount: number;
7
+ dialogueRounds: number;
8
+ reasoningChars: number;
9
+ hasReasoning: boolean;
10
+ lossyReasons: string[];
11
+ }
12
+ export declare function labelRecord(record: Qwen35Record): LabelInfo;
@@ -0,0 +1,22 @@
1
/**
 * Classify a record and gather the counts the classification is based on.
 *
 * Labels, in priority order:
 *  - `prompt_only`  when `meta.lossy_reasons` contains `prompt_history_only`
 *  - `cot_eligible` when the record is agentic and has reasoning text
 *  - `agent_only`   when the record is agentic without reasoning text
 *  - `discard`      otherwise
 * "Agentic" means at least one assistant tool call, one tool message, and one
 * user message.
 *
 * @param {object} record - Record with a `messages` array and `meta.lossy_reasons`.
 * @returns {{label: string, toolCallCount: number, toolMessageCount: number,
 *   dialogueRounds: number, reasoningChars: number, hasReasoning: boolean,
 *   lossyReasons: string[]}} The label plus supporting statistics.
 */
export function labelRecord(record) {
    let toolCallCount = 0;
    let toolMessageCount = 0;
    let dialogueRounds = 0;
    let reasoningChars = 0;
    // Single pass over the conversation, tallying by role.
    for (const message of record.messages) {
        switch (message.role) {
            case 'assistant':
                toolCallCount += message.tool_calls?.length ?? 0;
                if (typeof message.reasoning_content === 'string') {
                    reasoningChars += message.reasoning_content.length;
                }
                break;
            case 'tool':
                toolMessageCount += 1;
                break;
            case 'user':
                dialogueRounds += 1;
                break;
            default:
                break;
        }
    }
    const hasReasoning = reasoningChars > 0;
    const lossyReasons = record.meta.lossy_reasons;
    const isAgentic = toolCallCount >= 1 && toolMessageCount >= 1 && dialogueRounds >= 1;

    let label;
    if (lossyReasons.includes('prompt_history_only')) {
        label = 'prompt_only';
    } else if (!isAgentic) {
        label = 'discard';
    } else {
        label = hasReasoning ? 'cot_eligible' : 'agent_only';
    }
    return { label, toolCallCount, toolMessageCount, dialogueRounds, reasoningChars, hasReasoning, lossyReasons };
}
@@ -0,0 +1,4 @@
1
+ import type { Qwen35Record } from './schemas/qwen35.js';
2
+ export declare function parquetSchema(): any;
3
+ export declare function recordToParquetRow(record: Qwen35Record): Record<string, unknown>;
4
+ export declare function writeParquet(filePath: string, records: Qwen35Record[]): Promise<void>;
@@ -0,0 +1,115 @@
1
+ import parquet from 'parquetjs-lite';
2
/**
 * Build the parquet schema for the flattened dataset rows.
 *
 * Columns are declared in the same order as the keys produced by
 * `recordToParquetRow`. Structured values are stored as JSON text in the
 * `*_json` UTF8 columns.
 *
 * @returns {object} A `parquet.ParquetSchema` instance.
 */
export function parquetSchema() {
    const required = (type) => ({ type });
    const optional = (type) => ({ type, optional: true });
    // The request_* and response_* content-statistics columns share one shape.
    const blockColumns = (side) => ({
        [`${side}_contains_non_text_content`]: required('BOOLEAN'),
        [`${side}_image_block_count`]: required('INT64'),
        [`${side}_video_block_count`]: required('INT64'),
        [`${side}_tool_call_block_count`]: required('INT64'),
        [`${side}_tool_result_block_count`]: required('INT64'),
        [`${side}_thinking_block_count`]: required('INT64'),
    });
    const columns = {
        id: required('UTF8'),
        request_id: optional('UTF8'),
        endpoint: required('UTF8'),
        status: required('INT64'),
        ts: required('UTF8'),
        key: optional('UTF8'),
        source: optional('UTF8'),
        requested_model: optional('UTF8'),
        actual_model: optional('UTF8'),
        stream: optional('BOOLEAN'),
        thinking_level: optional('UTF8'),
        reasoning_summary_mode_json: optional('UTF8'),
        thinking_type: optional('UTF8'),
        thinking_budget_tokens: optional('INT64'),
        max_output_tokens: optional('INT64'),
        tool_spec_count: optional('INT64'),
        tool_choice_json: optional('UTF8'),
        ...blockColumns('request'),
        ...blockColumns('response'),
        request_truncated: required('BOOLEAN'),
        response_truncated: required('BOOLEAN'),
        lossy_source: required('BOOLEAN'),
        lossy_reasons_json: required('UTF8'),
        user_message_count: required('INT64'),
        assistant_message_count: required('INT64'),
        tool_message_count: required('INT64'),
        dialogue_rounds_est: required('INT64'),
        tool_call_count: required('INT64'),
        has_reasoning: required('BOOLEAN'),
        reasoning_chars: required('INT64'),
        content_chars_total: required('INT64'),
        messages_json: required('UTF8'),
        tools_json: required('UTF8'),
        meta_json: required('UTF8'),
    };
    return new parquet.ParquetSchema(columns);
}
50
/**
 * Flatten one record into a plain row object for the parquet writer.
 *
 * Scalar metadata is copied from `record.meta`; message statistics are
 * computed from `record.messages`; `messages`, `tools`, `meta`, and a few
 * structured meta fields are serialised to JSON strings.
 *
 * Optional columns have `null` normalised to `undefined`, so every optional
 * field in the row is handled the same way.
 *
 * @param {object} record - Record with `id`, `messages`, `tools`, and `meta`.
 * @returns {Record<string, unknown>} Row keyed by parquet column name.
 */
export function recordToParquetRow(record) {
    const { messages, meta } = record;
    const userMessageCount = messages.filter((m) => m.role === 'user').length;
    const assistantMessages = messages.filter((m) => m.role === 'assistant');
    const toolMessageCount = messages.filter((m) => m.role === 'tool').length;
    const toolCallCount = assistantMessages.reduce((sum, message) => sum + (message.tool_calls?.length ?? 0), 0);
    const reasoningChars = assistantMessages.reduce((sum, message) => sum + (typeof message.reasoning_content === 'string' ? message.reasoning_content.length : 0), 0);
    // JSON.stringify returns undefined (not a string) when content is undefined,
    // e.g. a message that carries only tool calls; count such messages as 0 chars
    // instead of throwing on `.length`.
    const contentCharsTotal = messages.reduce((sum, message) => sum + (JSON.stringify(message.content)?.length ?? 0), 0);
    return {
        id: record.id,
        request_id: record.request_id ?? undefined,
        endpoint: meta.endpoint,
        status: meta.status,
        ts: meta.ts,
        key: meta.key ?? undefined,
        source: meta.source ?? undefined,
        requested_model: meta.requested_model ?? undefined,
        actual_model: meta.actual_model ?? undefined,
        stream: meta.stream ?? undefined,
        thinking_level: meta.thinking_level ?? undefined,
        reasoning_summary_mode_json: JSON.stringify(meta.reasoning_summary_mode ?? null),
        thinking_type: meta.thinking_type ?? undefined,
        thinking_budget_tokens: meta.thinking_budget_tokens ?? undefined,
        max_output_tokens: meta.max_output_tokens ?? undefined,
        tool_spec_count: meta.tool_spec_count ?? undefined,
        tool_choice_json: JSON.stringify(meta.tool_choice ?? null),
        request_contains_non_text_content: meta.request_contains_non_text_content,
        request_image_block_count: meta.request_image_block_count,
        request_video_block_count: meta.request_video_block_count,
        request_tool_call_block_count: meta.request_tool_call_block_count,
        request_tool_result_block_count: meta.request_tool_result_block_count,
        request_thinking_block_count: meta.request_thinking_block_count,
        response_contains_non_text_content: meta.response_contains_non_text_content,
        response_image_block_count: meta.response_image_block_count,
        response_video_block_count: meta.response_video_block_count,
        response_tool_call_block_count: meta.response_tool_call_block_count,
        response_tool_result_block_count: meta.response_tool_result_block_count,
        response_thinking_block_count: meta.response_thinking_block_count,
        request_truncated: meta.request_truncated,
        response_truncated: meta.response_truncated,
        lossy_source: meta.lossy_source,
        lossy_reasons_json: JSON.stringify(meta.lossy_reasons),
        user_message_count: userMessageCount,
        assistant_message_count: assistantMessages.length,
        tool_message_count: toolMessageCount,
        // The number of user messages is used as the dialogue-round estimate.
        dialogue_rounds_est: userMessageCount,
        tool_call_count: toolCallCount,
        has_reasoning: reasoningChars > 0,
        reasoning_chars: reasoningChars,
        content_chars_total: contentCharsTotal,
        messages_json: JSON.stringify(record.messages),
        tools_json: JSON.stringify(record.tools),
        meta_json: JSON.stringify(record.meta),
    };
}
105
/**
 * Write records to a parquet file, one row per record.
 *
 * Rows are appended sequentially in input order. The writer is closed in a
 * `finally` block, so it is closed even when appending a row fails.
 *
 * @param {string} filePath - Destination parquet file.
 * @param {object[]} records - Records to flatten and write.
 * @returns {Promise<void>} Resolves after the writer has been closed.
 */
export async function writeParquet(filePath, records) {
    const schema = parquetSchema();
    const writer = await parquet.ParquetWriter.openFile(schema, filePath);
    try {
        for (const entry of records) {
            const row = recordToParquetRow(entry);
            await writer.appendRow(row);
        }
    } finally {
        await writer.close();
    }
}
@@ -0,0 +1,4 @@
1
+ export declare function candidatePiRoots(): string[];
2
+ export declare function candidateCodexRoots(): string[];
3
+ export declare function candidateClaudeRoots(): string[];
4
+ export declare function firstExisting(candidates: string[]): string;
@@ -0,0 +1,62 @@
1
+ import os from 'node:os';
2
+ import path from 'node:path';
3
+ import fs from 'node:fs';
4
/**
 * List candidate Pi session directories, most preferred first.
 *
 * Order: `PI_SESSION_ROOT` (if set), `<home>/.pi/agent/sessions`, then the
 * `pi` and `.pi` variants under `APPDATA` and `LOCALAPPDATA` when those
 * environment variables are set. Unset entries and repeats are removed.
 *
 * @returns {string[]} Candidate paths; existence is not checked here.
 */
export function candidatePiRoots() {
    const tail = ['agent', 'sessions'];
    const roots = [
        process.env.PI_SESSION_ROOT,
        path.join(os.homedir(), '.pi', ...tail),
    ];
    for (const base of [process.env.APPDATA, process.env.LOCALAPPDATA]) {
        if (!base) {
            continue;
        }
        roots.push(path.join(base, 'pi', ...tail));
        roots.push(path.join(base, '.pi', ...tail));
    }
    return dedupe(roots);
}
17
/**
 * List candidate Codex session directories, most preferred first.
 *
 * Order: `CODEX_SESSION_ROOT` (if set), `<home>/.codex/sessions`, then the
 * `Codex` and `.codex` variants under `APPDATA` and `LOCALAPPDATA` when those
 * environment variables are set. Unset entries and repeats are removed.
 *
 * @returns {string[]} Candidate paths; existence is not checked here.
 */
export function candidateCodexRoots() {
    const roots = [
        process.env.CODEX_SESSION_ROOT,
        path.join(os.homedir(), '.codex', 'sessions'),
    ];
    for (const base of [process.env.APPDATA, process.env.LOCALAPPDATA]) {
        if (!base) {
            continue;
        }
        roots.push(path.join(base, 'Codex', 'sessions'));
        roots.push(path.join(base, '.codex', 'sessions'));
    }
    return dedupe(roots);
}
30
/**
 * List candidate Claude project directories, most preferred first.
 *
 * Order: `CLAUDE_SESSION_ROOT` (if set), `<home>/.claude/projects`, then the
 * `Claude` and `.claude` variants under `APPDATA` and `LOCALAPPDATA` when
 * those environment variables are set. Unset entries and repeats are removed.
 *
 * @returns {string[]} Candidate paths; existence is not checked here.
 */
export function candidateClaudeRoots() {
    const roots = [
        process.env.CLAUDE_SESSION_ROOT,
        path.join(os.homedir(), '.claude', 'projects'),
    ];
    for (const base of [process.env.APPDATA, process.env.LOCALAPPDATA]) {
        if (!base) {
            continue;
        }
        roots.push(path.join(base, 'Claude', 'projects'));
        roots.push(path.join(base, '.claude', 'projects'));
    }
    return dedupe(roots);
}
43
/**
 * Pick the first candidate path that exists on disk.
 *
 * When none exists, the first candidate is returned anyway so callers still
 * get a concrete path to report or attempt to read.
 *
 * @param {string[]} candidates - Paths in preference order; must be non-empty.
 * @returns {string} Absolute form of the chosen path.
 * @throws {TypeError} When `candidates` is empty (previously this surfaced as
 *   an opaque argument-type error from `path.resolve(undefined)`).
 */
export function firstExisting(candidates) {
    if (candidates.length === 0) {
        throw new TypeError('firstExisting: candidate list is empty; nothing to resolve');
    }
    const found = candidates.find((candidate) => fs.existsSync(candidate));
    return path.resolve(found ?? candidates[0]);
}
50
/**
 * Drop falsy entries and repeated values, keeping first-occurrence order.
 *
 * @param {Array<string|undefined>} values - Candidate values, possibly unset or repeated.
 * @returns {string[]} Unique truthy values in their original order.
 */
function dedupe(values) {
    const present = values.filter(Boolean);
    // A Set iterates in insertion order, so the first occurrence wins.
    return [...new Set(present)];
}