agentic-dataset-builder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from collections import Counter
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Sequence, Tuple
10
+
11
+
12
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse the labelling script's command-line options.

    ``--input`` and ``--output-root`` are mandatory. The four ``--min-*``
    options are integer thresholds that each default to 1.

    Args:
        argv: Argument strings, without the program name.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(description='Label Qwen35 records for agentic distillation buckets.')

    # Mandatory locations.
    required_options = (
        ('--input', 'Qwen35 export directory or a single qwen35 jsonl file.'),
        ('--output-root', 'Output directory root for label artifacts.'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    # Integer thresholds; every one of them defaults to 1.
    threshold_options = (
        ('--min-tool-calls', 'Minimum tool calls required to consider a record agentic.'),
        ('--min-tool-messages', 'Minimum tool messages required to consider a record agentic.'),
        ('--min-rounds', 'Minimum dialogue rounds required to consider a record agentic.'),
        ('--min-reasoning-chars', 'Minimum reasoning chars required for cot_eligible.'),
    )
    for flag, help_text in threshold_options:
        cli.add_argument(flag, type=int, default=1, help=help_text)

    return cli.parse_args(argv)
21
+
22
+
23
def iter_input_files(input_path: Path) -> List[Path]:
    """Resolve *input_path* to the list of JSONL files to read.

    * A regular file is returned as a one-element list.
    * For a directory, ``qwen35-train.jsonl`` and ``qwen35-train-lossy.jsonl``
      directly inside it are used when at least one exists (in that order).
    * Otherwise the directory is searched recursively for ``qwen35-*.jsonl``
      and the matches are returned sorted (possibly an empty list).

    Raises:
        FileNotFoundError: *input_path* is neither a file nor a directory.
    """
    if input_path.is_file():
        return [input_path]
    if not input_path.is_dir():
        raise FileNotFoundError(f'Input path does not exist: {input_path}')

    well_known = [
        input_path / file_name
        for file_name in ('qwen35-train.jsonl', 'qwen35-train-lossy.jsonl')
        if (input_path / file_name).exists()
    ]
    # Fall back to a recursive search only when neither well-known file exists.
    return well_known or sorted(input_path.rglob('qwen35-*.jsonl'))
36
+
37
+
38
def load_records(files: List[Path]) -> List[Dict[str, Any]]:
    """Read every JSONL file in *files* into one flat list of records.

    Blank lines are skipped. Each record is tagged with two bookkeeping keys:
    ``_bucket`` ('lossy' when the file name contains 'lossy', else 'strict')
    and ``_source_file`` (the file path as a string).

    Raises:
        json.JSONDecodeError: a line is not valid JSON. The message is
            prefixed with ``<path>:<line number>`` so the bad line can be found.
        TypeError: a line decodes to something other than a JSON object.
    """
    records: List[Dict[str, Any]] = []
    for path in files:
        bucket = 'lossy' if 'lossy' in path.name else 'strict'
        with path.open('r', encoding='utf-8') as handle:
            for line_number, raw_line in enumerate(handle, start=1):
                text = raw_line.strip()
                if not text:
                    continue
                try:
                    record = json.loads(text)
                except json.JSONDecodeError as err:
                    # Same exception type as before, but now pointing at the offending line.
                    raise json.JSONDecodeError(
                        f'{path}:{line_number}: {err.msg}', err.doc, err.pos
                    ) from err
                if not isinstance(record, dict):
                    # The bookkeeping keys below can only be attached to an object.
                    raise TypeError(
                        f'{path}:{line_number}: expected a JSON object, got {type(record).__name__}'
                    )
                record['_bucket'] = bucket
                record['_source_file'] = str(path)
                records.append(record)
    return records
52
+
53
+
54
def count_role(messages: List[Dict[str, Any]], role: str) -> int:
    """Return how many dict entries in *messages* have ``'role' == role``.

    Entries that are not dicts are ignored.
    """
    matching = [entry for entry in messages if isinstance(entry, dict) and entry.get('role') == role]
    return len(matching)
56
+
57
+
58
def tool_call_count(messages: List[Dict[str, Any]]) -> int:
    """Return the total number of tool calls made by assistant messages.

    Only dict entries whose role is 'assistant' are considered, and only a
    list-valued ``tool_calls`` contributes. A missing, null or otherwise
    malformed value (string, number, ...) counts as zero instead of being
    measured by ``len()`` or crashing.
    """
    total = 0
    for message in messages:
        if not isinstance(message, dict) or message.get('role') != 'assistant':
            continue
        calls = message.get('tool_calls')
        if isinstance(calls, list):
            total += len(calls)
    return total
64
+
65
+
66
def reasoning_chars(messages: List[Dict[str, Any]]) -> int:
    """Return the combined length of ``reasoning_content`` over assistant messages.

    Non-dict entries, non-assistant messages and non-string
    ``reasoning_content`` values contribute nothing.
    """
    total = 0
    for entry in messages:
        if not isinstance(entry, dict) or entry.get('role') != 'assistant':
            continue
        text = entry.get('reasoning_content')
        if isinstance(text, str):
            total += len(text)
    return total
74
+
75
+
76
def label_record(record: Dict[str, Any], args: argparse.Namespace) -> Dict[str, Any]:
    """Classify one loaded record and return its label row.

    A record is *agentic* when its tool-call count, tool-message count and
    user-message count each reach the matching ``args.min_*`` threshold.
    The label is then:

    * ``'cot_eligible'`` - agentic and reasoning chars >= ``args.min_reasoning_chars``
    * ``'agent_only'``   - agentic without enough reasoning
    * ``'discard'``      - not agentic

    Args:
        record: A record dict as produced by ``load_records``. A ``messages``
            value that is not a list is treated as no messages; a ``meta``
            value that is not a dict is treated as empty.
        args: Namespace carrying ``min_tool_calls``, ``min_tool_messages``,
            ``min_rounds`` and ``min_reasoning_chars``.

    Returns:
        A dict with the ids, the label, the source bucket/file, the counts
        used for the decision and the lossy flags copied from ``meta``.
    """
    raw_messages = record.get('messages')
    messages = raw_messages if isinstance(raw_messages, list) else []

    # ``meta`` may be present but null or malformed; never call .get() on a non-dict.
    meta = record.get('meta')
    if not isinstance(meta, dict):
        meta = {}

    user_count = count_role(messages, 'user')
    assistant_count = count_role(messages, 'assistant')
    tool_count = count_role(messages, 'tool')
    calls = tool_call_count(messages)
    reasoning = reasoning_chars(messages)

    has_reasoning = reasoning >= args.min_reasoning_chars
    agentic = (
        calls >= args.min_tool_calls
        and tool_count >= args.min_tool_messages
        and user_count >= args.min_rounds
    )

    if agentic and has_reasoning:
        label = 'cot_eligible'
    elif agentic:
        label = 'agent_only'
    else:
        label = 'discard'

    return {
        'id': record.get('id'),
        'request_id': record.get('request_id'),
        'label': label,
        'bucket': record.get('_bucket'),
        'source_file': record.get('_source_file'),
        'user_message_count': user_count,
        'assistant_message_count': assistant_count,
        'tool_message_count': tool_count,
        # The number of user messages doubles as the dialogue-round estimate.
        'dialogue_rounds_est': user_count,
        'tool_call_count': calls,
        'reasoning_chars': reasoning,
        'has_reasoning': has_reasoning,
        'lossy_source': bool(meta.get('lossy_source')),
        'lossy_reasons': meta.get('lossy_reasons', []),
    }
109
+
110
+
111
def main(argv: Sequence[str] | None = None) -> int:
    """Label every record found under ``--input`` and write the artifacts.

    Creates ``<output-root>/qwen35-agentic-labels-<UTC timestamp>/`` holding
    ``labels.jsonl`` (one label row per record) and ``manifest.json`` (inputs,
    rules and counts). The manifest is also printed to stdout as one JSON line.

    Args:
        argv: Command-line arguments without the program name. ``None`` means
            "use ``sys.argv[1:]``"; an explicit sequence, even an empty one,
            is parsed exactly as given.

    Returns:
        0 on success.

    Raises:
        SystemExit: no Qwen35 JSONL file was found, or argparse rejected the
            arguments.
    """
    import sys  # local import: this module's top-level imports do not include sys

    # ``argv or []`` would discard the real command line when argv is None.
    args = parse_args(sys.argv[1:] if argv is None else argv)
    input_path = Path(args.input).expanduser().resolve()
    files = iter_input_files(input_path)
    if not files:
        raise SystemExit('No Qwen35 JSONL files found.')

    records = load_records(files)
    labels = [label_record(record, args) for record in records]

    # One output directory per run, named after the UTC start time (second resolution).
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    out_dir = Path(args.output_root).expanduser().resolve() / f'qwen35-agentic-labels-{stamp}'
    out_dir.mkdir(parents=True, exist_ok=True)
    labels_path = out_dir / 'labels.jsonl'
    manifest_path = out_dir / 'manifest.json'

    # Per-label counts plus overall and per-bucket totals.
    stats = Counter(label['label'] for label in labels)
    stats['records'] = len(labels)
    stats['strict_records'] = sum(1 for label in labels if label['bucket'] == 'strict')
    stats['lossy_records'] = sum(1 for label in labels if label['bucket'] == 'lossy')

    with labels_path.open('w', encoding='utf-8') as handle:
        for label in labels:
            handle.write(json.dumps(label, ensure_ascii=False) + '\n')

    manifest = {
        'input': str(input_path),
        'output_dir': str(out_dir),
        'input_files': [str(path) for path in files],
        'rules': {
            'min_tool_calls': args.min_tool_calls,
            'min_tool_messages': args.min_tool_messages,
            'min_rounds': args.min_rounds,
            'min_reasoning_chars': args.min_reasoning_chars,
            'cot_eligible': 'agentic and has visible reasoning',
            'agent_only': 'agentic without visible reasoning',
            'discard': 'does not meet agentic thresholds',
        },
        'stats': dict(stats),
    }
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
    print(json.dumps(manifest, ensure_ascii=False), flush=True)
    return 0
153
+
154
+
155
# Script entry point: forward the command line (minus the program name) to main().
if __name__ == '__main__':
    import sys

    raise SystemExit(main(sys.argv[1:]))
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import platform
5
+ from pathlib import Path
6
+ from typing import Iterable, List, Optional
7
+
8
+
9
def os_name() -> str:
    """Return ``platform.system()`` converted to lower case."""
    system = platform.system()
    return system.lower()
11
+
12
+
13
def home_dir() -> Path:
    """Return the current user's home directory as given by ``Path.home()``."""
    user_home = Path.home()
    return user_home
15
+
16
+
17
def env_path(name: str) -> Optional[Path]:
    """Return environment variable *name* as a user-expanded ``Path``.

    Returns ``None`` when the variable is unset or empty.
    """
    raw = os.environ.get(name, '')
    return Path(raw).expanduser() if raw else None
22
+
23
+
24
def existing_or_default(candidates: Iterable[Path]) -> Path:
    """Pick the first candidate that exists, resolved to an absolute path.

    All candidates are user-expanded first. When none exists the first
    candidate is returned (resolved); when there are no candidates at all the
    resolved home directory is returned.
    """
    expanded = [entry.expanduser() for entry in candidates]
    if not expanded:
        return home_dir().resolve()
    chosen = next((entry for entry in expanded if entry.exists()), expanded[0])
    return chosen.resolve()
30
+
31
+
32
def candidate_pi_session_roots() -> List[Path]:
    """List the places where Pi session data is looked for, in priority order.

    Order: the ``PI_SESSION_ROOT`` override (when set), ``~/.pi/agent/sessions``,
    then ``pi`` and ``.pi`` variants under ``APPDATA`` and under
    ``LOCALAPPDATA`` (each only when that variable is set). Duplicates are
    removed while keeping the first occurrence.
    """
    home = home_dir()
    roots: List[Path] = []

    explicit = env_path('PI_SESSION_ROOT')
    if explicit is not None:
        roots.append(explicit)

    roots.append(home / '.pi' / 'agent' / 'sessions')

    for variable in ('APPDATA', 'LOCALAPPDATA'):
        base = os.environ.get(variable)
        if base:
            roots.extend(Path(base) / folder / 'agent' / 'sessions' for folder in ('pi', '.pi'))

    return dedupe(roots)
48
+
49
+
50
def candidate_codex_session_roots() -> List[Path]:
    """List the places where Codex session data is looked for, in priority order.

    Order: the ``CODEX_SESSION_ROOT`` override (when set), ``~/.codex/sessions``,
    then ``Codex`` and ``.codex`` variants under ``APPDATA`` and under
    ``LOCALAPPDATA`` (each only when that variable is set). Duplicates are
    removed while keeping the first occurrence.
    """
    home = home_dir()
    roots: List[Path] = []

    explicit = env_path('CODEX_SESSION_ROOT')
    if explicit is not None:
        roots.append(explicit)

    roots.append(home / '.codex' / 'sessions')

    for variable in ('APPDATA', 'LOCALAPPDATA'):
        base = os.environ.get(variable)
        if base:
            roots.extend(Path(base) / folder / 'sessions' for folder in ('Codex', '.codex'))

    return dedupe(roots)
66
+
67
+
68
def default_pi_session_root() -> Path:
    """Return the Pi session root to use: first existing candidate, else the first candidate."""
    candidates = candidate_pi_session_roots()
    return existing_or_default(candidates)
70
+
71
+
72
def default_codex_session_root() -> Path:
    """Return the Codex session root to use: first existing candidate, else the first candidate."""
    candidates = candidate_codex_session_roots()
    return existing_or_default(candidates)
74
+
75
+
76
def dedupe(paths: Iterable[Path]) -> List[Path]:
    """Drop repeated paths while keeping first-seen order.

    Two paths are considered equal when their user-expanded string forms
    match; the path object that was seen first is the one returned.
    """
    first_seen = {}
    for entry in paths:
        # setdefault keeps the earliest path stored under each expanded key.
        first_seen.setdefault(str(entry.expanduser()), entry)
    return list(first_seen.values())
@@ -0,0 +1,179 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, List, Literal, Optional, Union
4
+
5
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
6
+
7
+
8
class Qwen35TextBlock(BaseModel):
    # Plain-text content block. Keys other than the declared ones are rejected.
    model_config = ConfigDict(extra='forbid')

    type: Literal['text']  # tag; must be exactly 'text'
    text: str
13
+
14
+
15
class Qwen35ImageBlock(BaseModel):
    # Image content block. Additional, undeclared keys are accepted.
    model_config = ConfigDict(extra='allow')

    type: Literal['image']  # tag; must be exactly 'image'

    # Everything below is optional.
    image_url: Optional[str] = None
    placeholder: bool = False
    placeholder_token: Optional[str] = None
    source_kind: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
24
+
25
+
26
class Qwen35VideoBlock(BaseModel):
    # Video content block. Additional, undeclared keys are accepted.
    model_config = ConfigDict(extra='allow')

    type: Literal['video']  # tag; must be exactly 'video'

    # Everything below is optional.
    video_url: Optional[str] = None
    placeholder: bool = False
    placeholder_token: Optional[str] = None
    source_kind: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
35
+
36
+
37
# A single content block is a text, image or video block.
Qwen35ContentBlock = Union[
    Qwen35TextBlock,
    Qwen35ImageBlock,
    Qwen35VideoBlock,
]

# Message content is either a bare string or a list of content blocks.
Qwen35MessageContent = Union[
    str,
    List[Qwen35ContentBlock],
]
39
+
40
+
41
class Qwen35ToolFunction(BaseModel):
    # The function part of a tool call. Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    name: str
    # Arguments as a mapping; a fresh empty dict is used when omitted.
    arguments: Dict[str, Any] = Field(default_factory=dict)
46
+
47
+
48
class Qwen35ToolCall(BaseModel):
    # One tool call issued by an assistant message. Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    type: Literal['function'] = 'function'  # only 'function' is accepted; also the default
    function: Qwen35ToolFunction
    id: Optional[str] = None
54
+
55
+
56
class Qwen35ToolSpec(BaseModel):
    # Declaration of a tool available to the model. Additional keys are accepted.
    model_config = ConfigDict(extra='allow')

    name: str
    description: Optional[str] = None
    parameters: Optional[Dict[str, Any]] = None
62
+
63
+
64
class Qwen35SystemMessage(BaseModel):
    # Message with role 'system'. Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    role: Literal['system']
    content: Qwen35MessageContent
69
+
70
+
71
class Qwen35UserMessage(BaseModel):
    # Message with role 'user'. Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    role: Literal['user']
    content: Qwen35MessageContent
76
+
77
+
78
class Qwen35AssistantMessage(BaseModel):
    # Message with role 'assistant'. Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    role: Literal['assistant']
    content: Qwen35MessageContent
    # Reasoning text kept apart from the visible content; optional.
    reasoning_content: Optional[str] = None
    # Tool calls issued by this message; optional.
    tool_calls: Optional[List[Qwen35ToolCall]] = None
85
+
86
+
87
class Qwen35ToolMessage(BaseModel):
    # Message with role 'tool'. Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    role: Literal['tool']
    content: Qwen35MessageContent
    tool_call_id: Optional[str] = None
    name: Optional[str] = None
94
+
95
+
96
# Any message: one of the four role-specific models (system, user, assistant, tool).
Qwen35Message = Union[Qwen35SystemMessage, Qwen35UserMessage, Qwen35AssistantMessage, Qwen35ToolMessage]
102
+
103
+
104
class Qwen35Meta(BaseModel):
    # Per-record metadata. Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    # Required fields.
    endpoint: str
    status: int = Field(ge=100, le=599)  # constrained to 100..599
    ts: str

    # Optional descriptive fields.
    key: Optional[str] = None
    source: Optional[str] = None
    requested_model: Optional[str] = None
    actual_model: Optional[str] = None
    stream: Optional[bool] = None
    thinking_level: Optional[str] = None
    reasoning_summary_mode: Optional[Union[str, List[Any], Dict[str, Any]]] = None
    thinking_type: Optional[str] = None

    # Optional non-negative integers.
    thinking_budget_tokens: Optional[int] = Field(default=None, ge=0)
    max_output_tokens: Optional[int] = Field(default=None, ge=0)
    tool_spec_count: Optional[int] = Field(default=None, ge=0)
    tool_choice: Optional[Union[str, Dict[str, Any], List[Any]]] = None

    # Request-side block counters (non-negative, default 0).
    request_contains_non_text_content: bool = False
    request_image_block_count: int = Field(default=0, ge=0)
    request_video_block_count: int = Field(default=0, ge=0)
    request_tool_call_block_count: int = Field(default=0, ge=0)
    request_tool_result_block_count: int = Field(default=0, ge=0)
    request_thinking_block_count: int = Field(default=0, ge=0)

    # Response-side block counters (non-negative, default 0).
    response_contains_non_text_content: bool = False
    response_image_block_count: int = Field(default=0, ge=0)
    response_video_block_count: int = Field(default=0, ge=0)
    response_tool_call_block_count: int = Field(default=0, ge=0)
    response_tool_result_block_count: int = Field(default=0, ge=0)
    response_thinking_block_count: int = Field(default=0, ge=0)

    # Truncation and lossiness flags.
    request_truncated: bool = False
    response_truncated: bool = False
    lossy_source: bool = False
    lossy_reasons: List[str] = Field(default_factory=list)
138
+
139
+
140
class Qwen35TrainingRecord(BaseModel):
    # One training example: id, message list, tool specs and metadata.
    # Undeclared keys are rejected.
    model_config = ConfigDict(extra='forbid')

    id: str
    request_id: Optional[str] = None
    messages: List[Qwen35Message] = Field(min_length=1)  # at least one message
    tools: List[Qwen35ToolSpec] = Field(default_factory=list)
    meta: Qwen35Meta

    @model_validator(mode='after')
    def validate_qwen35_constraints(self) -> 'Qwen35TrainingRecord':
        """Enforce the cross-message rules that field types cannot express.

        Raises ``ValueError`` when a system message follows a non-system one,
        a system message carries image/video blocks, an assistant message
        with ``reasoning_content`` contains ``<think>``/``</think>`` in that
        reasoning or in its string content, no user message is present, or
        ``meta.lossy_source`` is set without any ``lossy_reasons``.
        """
        think_tags = ('<think>', '</think>')
        user_present = False
        past_system_prefix = False

        for message in self.messages:
            role = message.role

            if role == 'system':
                if past_system_prefix:
                    raise ValueError('system messages must appear only at the beginning')
                if _has_non_text_content(message.content):
                    raise ValueError('system messages cannot contain image/video blocks for Qwen3.5')
                continue

            past_system_prefix = True
            if role == 'user':
                user_present = True
            elif role == 'assistant' and message.reasoning_content:
                # The tag checks apply only when reasoning_content is non-empty.
                reasoning = message.reasoning_content
                if any(tag in reasoning for tag in think_tags):
                    raise ValueError('reasoning_content must not include <think> wrappers')
                body = message.content
                if isinstance(body, str) and any(tag in body for tag in think_tags):
                    raise ValueError('assistant content must not include inline <think> wrappers when reasoning_content is used')

        if not user_present:
            raise ValueError('at least one user message is required')
        if self.meta.lossy_source and not self.meta.lossy_reasons:
            raise ValueError('lossy_source requires at least one lossy_reasons entry')
        return self
174
+
175
+
176
+ def _has_non_text_content(content: Qwen35MessageContent) -> bool:
177
+ if isinstance(content, str):
178
+ return False
179
+ return any(getattr(block, 'type', None) in {'image', 'video'} for block in content)
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env node
2
import { spawnSync } from 'node:child_process';
import { existsSync, writeFileSync } from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
6
+
7
// ES modules have no built-in __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// All launcher paths are resolved relative to this script's own directory.
// NOTE(review): package.json maps the bin entry to ./bin/... while run.py and
// requirements.txt are listed at the package root. If this script is that bin
// entry, these two paths would point inside bin/ - confirm the published layout.
const venvDir = path.join(__dirname, '.venv');
const requirementsPath = path.join(__dirname, 'requirements.txt');
const runPy = path.join(__dirname, 'run.py');
12
+
13
/** Returns true when the current process runs on Windows (`process.platform` is 'win32'). */
function isWindows() {
  const { platform } = process;
  return platform === 'win32';
}
16
+
17
/**
 * Returns the path of the Python executable inside the virtual environment:
 * `Scripts/python.exe` on Windows, `bin/python` elsewhere.
 */
function venvPythonPath() {
  if (isWindows()) {
    return path.join(venvDir, 'Scripts', 'python.exe');
  }
  return path.join(venvDir, 'bin', 'python');
}
22
+
23
/**
 * Finds a Python launcher on PATH by running `<name> --version`
 * (`py -3 --version` for the `py` launcher) and returns the first name whose
 * probe exits with status 0, or null when none does.
 *
 * Candidates are tried in order: py, python, python3 on Windows;
 * python3, python elsewhere. Only the exit status is checked.
 */
function findSystemPython() {
  const launchers = isWindows() ? ['py', 'python', 'python3'] : ['python3', 'python'];
  const working = launchers.find((name) => {
    const versionArgs = name === 'py' ? ['-3', '--version'] : ['--version'];
    return spawnSync(name, versionArgs, { stdio: 'ignore' }).status === 0;
  });
  return working ?? null;
}
36
+
37
/**
 * Runs a command synchronously with inherited stdio, echoing it to stderr first.
 *
 * The child runs with this script's directory as cwd and the current
 * environment unless `options` overrides them. On any failure the whole
 * process exits: with the child's exit status when it has one, otherwise 1.
 * A command that could not be started, or that was killed by a signal, is
 * reported on stderr before exiting instead of failing silently.
 *
 * @param {string} cmd executable to run
 * @param {string[]} args arguments passed to the executable
 * @param {object} [options] extra spawnSync options, merged last
 */
function run(cmd, args, options = {}) {
  const rendered = [cmd, ...args].join(' ');
  console.error(`[agentic-dataset-builder] ${rendered}`);
  const result = spawnSync(cmd, args, {
    stdio: 'inherit',
    cwd: __dirname,
    env: process.env,
    ...options,
  });

  // spawnSync reports a launch failure (e.g. executable not found) via `error`.
  if (result.error) {
    console.error(`[agentic-dataset-builder] failed to run ${cmd}: ${result.error.message}`);
    process.exit(1);
  }
  if (result.status !== 0) {
    // status is null when the child was terminated by a signal.
    if (result.signal) {
      console.error(`[agentic-dataset-builder] ${cmd} was terminated by signal ${result.signal}`);
    }
    process.exit(result.status ?? 1);
  }
}
50
+
51
/**
 * Makes sure the private virtual environment exists and has its dependencies
 * installed, then returns the path of its Python executable.
 *
 * Steps:
 *  1. Create the venv with a system Python when its interpreter is missing.
 *  2. Install requirements unless a marker file inside the venv records that
 *     a previous install completed.
 *
 * The marker matters because `run` exits the process on failure: without it,
 * a venv whose `pip install` failed once would be reused forever with missing
 * packages. Venvs created before the marker existed get one extra
 * `pip install` run.
 *
 * Exits the process with status 1 when no system Python can be found.
 */
function ensureEnv() {
  const venvPython = venvPythonPath();
  const depsMarker = path.join(venvDir, '.deps-installed');

  if (!existsSync(venvPython)) {
    const systemPython = findSystemPython();
    if (!systemPython) {
      console.error('Python 3.10+ is required but was not found in PATH.');
      process.exit(1);
    }
    // The Windows `py` launcher needs `-3` to select a Python 3 interpreter.
    const launcherArgs = systemPython === 'py' ? ['-3'] : [];
    run(systemPython, [...launcherArgs, '-m', 'venv', venvDir]);
  }

  if (!existsSync(depsMarker)) {
    run(venvPython, ['-m', 'pip', 'install', '--upgrade', 'pip']);
    run(venvPython, ['-m', 'pip', 'install', '-r', requirementsPath]);
    // Reached only when both installs succeeded.
    writeFileSync(depsMarker, '');
  }

  return venvPython;
}
74
+
75
// Prepare the environment, then hand every CLI argument over to run.py.
const python = ensureEnv();
const [, , ...args] = process.argv;
run(python, [runPy, ...args]);
package/package.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "name": "agentic-dataset-builder",
3
+ "version": "0.1.0",
4
+ "description": "One-shot local dataset builder for Pi and Codex session histories",
5
+ "homepage": "https://github.com/Dominic789654/agentic-dataset-builder",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "git+https://github.com/Dominic789654/agentic-dataset-builder.git"
9
+ },
10
+ "bugs": {
11
+ "url": "https://github.com/Dominic789654/agentic-dataset-builder/issues"
12
+ },
13
+ "license": "MIT",
14
+ "keywords": [
15
+ "agentic",
16
+ "dataset",
17
+ "pi",
18
+ "codex",
19
+ "qwen",
20
+ "parquet"
21
+ ],
22
+ "files": [
23
+ "bin",
24
+ "run.py",
25
+ "requirements.txt",
26
+ "README.md",
27
+ "LICENSE",
28
+ "agentic_dataset"
29
+ ],
30
+ "bin": {
31
+ "agentic-dataset-builder": "./bin/agentic-dataset-builder.js"
32
+ },
33
+ "type": "module",
34
+ "engines": {
35
+ "node": ">=18"
36
+ },
37
+ "scripts": {
38
+ "pack:check": "npm pack --dry-run"
39
+ }
40
+ }
@@ -0,0 +1,2 @@
1
+ pydantic>=2
2
+ pyarrow>=14
package/run.py ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from agentic_dataset.build_agentic_dataset import main
5
+
6
+
7
# Script entry point: run the builder and use its return value as the exit status.
if __name__ == '__main__':
    exit_code = main()
    raise SystemExit(exit_code)