npm - agentic-dataset-builder - Versions diffs - 0.1.0 - Mend

agentic-dataset-builder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

package/LICENSE +21 -0
package/README.md +125 -0
package/agentic_dataset/__init__.py +1 -0
package/agentic_dataset/build_agentic_dataset.py +368 -0
package/agentic_dataset/export_codex_session_to_qwen35.py +466 -0
package/agentic_dataset/export_pi_session.py +701 -0
package/agentic_dataset/export_pi_session_to_qwen35.py +742 -0
package/agentic_dataset/export_qwen35_training.py +1559 -0
package/agentic_dataset/label_qwen35_agentic.py +156 -0
package/agentic_dataset/platform_paths.py +85 -0
package/agentic_dataset/qwen35_training_record.py +179 -0
package/bin/agentic-dataset-builder.js +77 -0
package/package.json +40 -0
package/requirements.txt +2 -0
package/run.py +8 -0

package/agentic_dataset/label_qwen35_agentic.py ADDED Viewed

@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+from collections import Counter
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Sequence, Tuple
+def parse_args(argv: Sequence[str]) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description='Label Qwen35 records for agentic distillation buckets.')
+    parser.add_argument('--input', required=True, help='Qwen35 export directory or a single qwen35 jsonl file.')
+    parser.add_argument('--output-root', required=True, help='Output directory root for label artifacts.')
+    parser.add_argument('--min-tool-calls', type=int, default=1, help='Minimum tool calls required to consider a record agentic.')
+    parser.add_argument('--min-tool-messages', type=int, default=1, help='Minimum tool messages required to consider a record agentic.')
+    parser.add_argument('--min-rounds', type=int, default=1, help='Minimum dialogue rounds required to consider a record agentic.')
+    parser.add_argument('--min-reasoning-chars', type=int, default=1, help='Minimum reasoning chars required for cot_eligible.')
+    return parser.parse_args(argv)
+def iter_input_files(input_path: Path) -> List[Path]:
+    if input_path.is_file():
+        return [input_path]
+    if not input_path.is_dir():
+        raise FileNotFoundError(f'Input path does not exist: {input_path}')
+    files: List[Path] = []
+    for name in ('qwen35-train.jsonl', 'qwen35-train-lossy.jsonl'):
+        candidate = input_path / name
+        if candidate.exists():
+            files.append(candidate)
+    if files:
+        return files
+    return sorted(input_path.rglob('qwen35-*.jsonl'))
+def load_records(files: List[Path]) -> List[Dict[str, Any]]:
+    records: List[Dict[str, Any]] = []
+    for path in files:
+        bucket = 'lossy' if 'lossy' in path.name else 'strict'
+        with path.open('r', encoding='utf-8') as handle:
+            for line in handle:
+                line = line.strip()
+                if not line:
+                    continue
+                record = json.loads(line)
+                record['_bucket'] = bucket
+                record['_source_file'] = str(path)
+                records.append(record)
+    return records
+def count_role(messages: List[Dict[str, Any]], role: str) -> int:
+    return sum(1 for message in messages if isinstance(message, dict) and message.get('role') == role)
+def tool_call_count(messages: List[Dict[str, Any]]) -> int:
+    return sum(
+        len(message.get('tool_calls') or [])
+        for message in messages
+        if isinstance(message, dict) and message.get('role') == 'assistant'
+    )
+def reasoning_chars(messages: List[Dict[str, Any]]) -> int:
+    return sum(
+        len(message.get('reasoning_content', ''))
+        for message in messages
+        if isinstance(message, dict)
+        and message.get('role') == 'assistant'
+        and isinstance(message.get('reasoning_content'), str)
+    )
+def label_record(record: Dict[str, Any], args: argparse.Namespace) -> Dict[str, Any]:
+    messages = record.get('messages', []) if isinstance(record.get('messages'), list) else []
+    user_count = count_role(messages, 'user')
+    assistant_count = count_role(messages, 'assistant')
+    tool_count = count_role(messages, 'tool')
+    calls = tool_call_count(messages)
+    reasoning = reasoning_chars(messages)
+    has_reasoning = reasoning >= args.min_reasoning_chars
+    agentic = calls >= args.min_tool_calls and tool_count >= args.min_tool_messages and user_count >= args.min_rounds
+    if agentic and has_reasoning:
+        label = 'cot_eligible'
+    elif agentic:
+        label = 'agent_only'
+    else:
+        label = 'discard'
+    return {
+        'id': record.get('id'),
+        'request_id': record.get('request_id'),
+        'label': label,
+        'bucket': record.get('_bucket'),
+        'source_file': record.get('_source_file'),
+        'user_message_count': user_count,
+        'assistant_message_count': assistant_count,
+        'tool_message_count': tool_count,
+        'dialogue_rounds_est': user_count,
+        'tool_call_count': calls,
+        'reasoning_chars': reasoning,
+        'has_reasoning': has_reasoning,
+        'lossy_source': bool(record.get('meta', {}).get('lossy_source')),
+        'lossy_reasons': record.get('meta', {}).get('lossy_reasons', []),
+    }
+def main(argv: Sequence[str] | None = None) -> int:
+    args = parse_args(argv or [])
+    input_path = Path(args.input).expanduser().resolve()
+    files = iter_input_files(input_path)
+    if not files:
+        raise SystemExit('No Qwen35 JSONL files found.')
+    records = load_records(files)
+    labels = [label_record(record, args) for record in records]
+    out_dir = Path(args.output_root).expanduser().resolve() / f'qwen35-agentic-labels-{datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")}'
+    out_dir.mkdir(parents=True, exist_ok=True)
+    labels_path = out_dir / 'labels.jsonl'
+    manifest_path = out_dir / 'manifest.json'
+    stats = Counter(label['label'] for label in labels)
+    stats['records'] = len(labels)
+    stats['strict_records'] = sum(1 for label in labels if label['bucket'] == 'strict')
+    stats['lossy_records'] = sum(1 for label in labels if label['bucket'] == 'lossy')
+    with labels_path.open('w', encoding='utf-8') as handle:
+        for label in labels:
+            handle.write(json.dumps(label, ensure_ascii=False) + '\n')
+    manifest = {
+        'input': str(input_path),
+        'output_dir': str(out_dir),
+        'input_files': [str(path) for path in files],
+        'rules': {
+            'min_tool_calls': args.min_tool_calls,
+            'min_tool_messages': args.min_tool_messages,
+            'min_rounds': args.min_rounds,
+            'min_reasoning_chars': args.min_reasoning_chars,
+            'cot_eligible': 'agentic and has visible reasoning',
+            'agent_only': 'agentic without visible reasoning',
+            'discard': 'does not meet agentic thresholds',
+        },
+        'stats': dict(stats),
+    }
+    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2) + '\n', encoding='utf-8')
+    print(json.dumps(manifest, ensure_ascii=False), flush=True)
+    return 0
+# Script entry point: pass the command-line arguments (without the program
+# name) to main() and use its return value as the process exit status.
+if __name__ == '__main__':
+    raise SystemExit(main(__import__('sys').argv[1:]))

package/agentic_dataset/platform_paths.py ADDED Viewed

@@ -0,0 +1,85 @@
+from __future__ import annotations
+import os
+import platform
+from pathlib import Path
+from typing import Iterable, List, Optional
+def os_name() -> str:
+    return platform.system().lower()
+def home_dir() -> Path:
+    return Path.home()
+def env_path(name: str) -> Optional[Path]:
+    value = os.environ.get(name)
+    if not value:
+        return None
+    return Path(value).expanduser()
+def existing_or_default(candidates: Iterable[Path]) -> Path:
+    collected = [candidate.expanduser() for candidate in candidates]
+    for candidate in collected:
+        if candidate.exists():
+            return candidate.resolve()
+    return collected[0].resolve() if collected else home_dir().resolve()
+def candidate_pi_session_roots() -> List[Path]:
+    home = home_dir()
+    appdata = os.environ.get('APPDATA')
+    localappdata = os.environ.get('LOCALAPPDATA')
+    candidates: List[Path] = []
+    override = env_path('PI_SESSION_ROOT')
+    if override is not None:
+        candidates.append(override)
+    candidates.append(home / '.pi' / 'agent' / 'sessions')
+    if appdata:
+        candidates.append(Path(appdata) / 'pi' / 'agent' / 'sessions')
+        candidates.append(Path(appdata) / '.pi' / 'agent' / 'sessions')
+    if localappdata:
+        candidates.append(Path(localappdata) / 'pi' / 'agent' / 'sessions')
+        candidates.append(Path(localappdata) / '.pi' / 'agent' / 'sessions')
+    return dedupe(candidates)
+def candidate_codex_session_roots() -> List[Path]:
+    home = home_dir()
+    appdata = os.environ.get('APPDATA')
+    localappdata = os.environ.get('LOCALAPPDATA')
+    candidates: List[Path] = []
+    override = env_path('CODEX_SESSION_ROOT')
+    if override is not None:
+        candidates.append(override)
+    candidates.append(home / '.codex' / 'sessions')
+    if appdata:
+        candidates.append(Path(appdata) / 'Codex' / 'sessions')
+        candidates.append(Path(appdata) / '.codex' / 'sessions')
+    if localappdata:
+        candidates.append(Path(localappdata) / 'Codex' / 'sessions')
+        candidates.append(Path(localappdata) / '.codex' / 'sessions')
+    return dedupe(candidates)
+def default_pi_session_root() -> Path:
+    return existing_or_default(candidate_pi_session_roots())
+def default_codex_session_root() -> Path:
+    return existing_or_default(candidate_codex_session_roots())
+def dedupe(paths: Iterable[Path]) -> List[Path]:
+    output: List[Path] = []
+    seen: set[str] = set()
+    for path in paths:
+        key = str(path.expanduser())
+        if key in seen:
+            continue
+        seen.add(key)
+        output.append(path)
+    return output

package/agentic_dataset/qwen35_training_record.py ADDED Viewed

@@ -0,0 +1,179 @@
+from __future__ import annotations
+from typing import Any, Dict, List, Literal, Optional, Union
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+# Plain-text content block: ``type`` must be 'text' and ``text`` is required.
+# Unknown keys are rejected (extra='forbid').
+class Qwen35TextBlock(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    type: Literal['text']
+    text: str
+# Image content block: ``type`` must be 'image'; every other field is optional.
+# Unknown keys are kept rather than rejected (extra='allow').
+class Qwen35ImageBlock(BaseModel):
+    model_config = ConfigDict(extra='allow')
+    type: Literal['image']
+    image_url: Optional[str] = None
+    # NOTE(review): presumably set when the image data was replaced by a
+    # stand-in token (see placeholder_token) — confirm against the exporters.
+    placeholder: bool = False
+    placeholder_token: Optional[str] = None
+    source_kind: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+# Video content block: ``type`` must be 'video'; every other field is optional.
+# Same field layout as the image block, with ``video_url`` in place of ``image_url``.
+# Unknown keys are kept rather than rejected (extra='allow').
+class Qwen35VideoBlock(BaseModel):
+    model_config = ConfigDict(extra='allow')
+    type: Literal['video']
+    video_url: Optional[str] = None
+    placeholder: bool = False
+    placeholder_token: Optional[str] = None
+    source_kind: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+# A content block is one of the text, image or video models.
+Qwen35ContentBlock = Union[Qwen35TextBlock, Qwen35ImageBlock, Qwen35VideoBlock]
+# Message content is either a bare string or a list of content blocks.
+Qwen35MessageContent = Union[str, List[Qwen35ContentBlock]]
+# Function part of a tool call: a required name and an arguments mapping that
+# defaults to an empty dict. Unknown keys are rejected.
+class Qwen35ToolFunction(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    name: str
+    arguments: Dict[str, Any] = Field(default_factory=dict)
+# One tool call issued by the assistant. ``type`` only admits 'function' (also
+# its default); ``id`` is optional. Unknown keys are rejected.
+class Qwen35ToolCall(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    type: Literal['function'] = 'function'
+    function: Qwen35ToolFunction
+    id: Optional[str] = None
+# Declaration of a tool offered to the model: required name, optional
+# description and parameters mapping. Unknown keys are kept (extra='allow').
+class Qwen35ToolSpec(BaseModel):
+    model_config = ConfigDict(extra='allow')
+    name: str
+    description: Optional[str] = None
+    # NOTE(review): presumably a JSON-schema-like description of the arguments — confirm.
+    parameters: Optional[Dict[str, Any]] = None
+# Message with role 'system'. Unknown keys are rejected.
+class Qwen35SystemMessage(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    role: Literal['system']
+    content: Qwen35MessageContent
+# Message with role 'user'. Unknown keys are rejected.
+class Qwen35UserMessage(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    role: Literal['user']
+    content: Qwen35MessageContent
+# Message with role 'assistant'. Besides the content it may carry separate
+# reasoning text and a list of tool calls, both optional. Unknown keys are rejected.
+class Qwen35AssistantMessage(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    role: Literal['assistant']
+    content: Qwen35MessageContent
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[Qwen35ToolCall]] = None
+# Message with role 'tool'. Unknown keys are rejected.
+class Qwen35ToolMessage(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    role: Literal['tool']
+    content: Qwen35MessageContent
+    # NOTE(review): presumably matches Qwen35ToolCall.id of the call being
+    # answered; nothing in this module enforces that link.
+    tool_call_id: Optional[str] = None
+    name: Optional[str] = None
+# Any message accepted in a training record; the four models are told apart by
+# their literal ``role`` value.
+Qwen35Message = Union[
+    Qwen35SystemMessage,
+    Qwen35UserMessage,
+    Qwen35AssistantMessage,
+    Qwen35ToolMessage,
+]
+# Provenance and bookkeeping attached to a training record. Only ``endpoint``,
+# ``status`` and ``ts`` are required; unknown keys are rejected.
+class Qwen35Meta(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    endpoint: str
+    # Constrained to 100..599. NOTE(review): presumably an HTTP status code — confirm.
+    status: int = Field(ge=100, le=599)
+    # Stored as a string; its format is not validated here.
+    ts: str
+    key: Optional[str] = None
+    source: Optional[str] = None
+    requested_model: Optional[str] = None
+    actual_model: Optional[str] = None
+    stream: Optional[bool] = None
+    thinking_level: Optional[str] = None
+    reasoning_summary_mode: Optional[Union[str, List[Any], Dict[str, Any]]] = None
+    thinking_type: Optional[str] = None
+    # Optional counters below must be non-negative when present.
+    thinking_budget_tokens: Optional[int] = Field(default=None, ge=0)
+    max_output_tokens: Optional[int] = Field(default=None, ge=0)
+    tool_spec_count: Optional[int] = Field(default=None, ge=0)
+    tool_choice: Optional[Union[str, Dict[str, Any], List[Any]]] = None
+    # Request-side content statistics; counts default to 0 and must be >= 0.
+    request_contains_non_text_content: bool = False
+    request_image_block_count: int = Field(default=0, ge=0)
+    request_video_block_count: int = Field(default=0, ge=0)
+    request_tool_call_block_count: int = Field(default=0, ge=0)
+    request_tool_result_block_count: int = Field(default=0, ge=0)
+    request_thinking_block_count: int = Field(default=0, ge=0)
+    # Response-side content statistics, mirroring the request-side fields.
+    response_contains_non_text_content: bool = False
+    response_image_block_count: int = Field(default=0, ge=0)
+    response_video_block_count: int = Field(default=0, ge=0)
+    response_tool_call_block_count: int = Field(default=0, ge=0)
+    response_tool_result_block_count: int = Field(default=0, ge=0)
+    response_thinking_block_count: int = Field(default=0, ge=0)
+    request_truncated: bool = False
+    response_truncated: bool = False
+    # Qwen35TrainingRecord requires a non-empty lossy_reasons whenever
+    # lossy_source is true.
+    lossy_source: bool = False
+    lossy_reasons: List[str] = Field(default_factory=list)
+# One training example: an id, at least one message, the tool specs (empty by
+# default) and the metadata. Unknown keys are rejected.
+class Qwen35TrainingRecord(BaseModel):
+    model_config = ConfigDict(extra='forbid')
+    id: str
+    request_id: Optional[str] = None
+    messages: List[Qwen35Message] = Field(min_length=1)
+    tools: List[Qwen35ToolSpec] = Field(default_factory=list)
+    meta: Qwen35Meta
+    # Cross-field checks run after field validation. Raises ValueError when:
+    #   * a system message follows any non-system message;
+    #   * a system message contains image/video blocks;
+    #   * an assistant message has reasoning_content that contains <think> or
+    #     </think>, or has reasoning_content together with string content that
+    #     contains those tags (list content is not inspected);
+    #   * no user message is present;
+    #   * meta.lossy_source is true while meta.lossy_reasons is empty.
+    # The per-message checks fire on the first offending message, in list order.
+    @model_validator(mode='after')
+    def validate_qwen35_constraints(self) -> 'Qwen35TrainingRecord':
+        seen_user = False
+        seen_non_system = False
+        for message in self.messages:
+            if message.role != 'system':
+                seen_non_system = True
+            elif seen_non_system:
+                raise ValueError('system messages must appear only at the beginning')
+            if message.role == 'user':
+                seen_user = True
+            if message.role == 'system' and _has_non_text_content(message.content):
+                raise ValueError('system messages cannot contain image/video blocks for Qwen3.5')
+            # The <think> checks apply only when reasoning_content is non-empty.
+            if message.role == 'assistant' and message.reasoning_content:
+                if '<think>' in message.reasoning_content or '</think>' in message.reasoning_content:
+                    raise ValueError('reasoning_content must not include <think> wrappers')
+                if isinstance(message.content, str) and ('<think>' in message.content or '</think>' in message.content):
+                    raise ValueError('assistant content must not include inline <think> wrappers when reasoning_content is used')
+        if not seen_user:
+            raise ValueError('at least one user message is required')
+        if self.meta.lossy_source and not self.meta.lossy_reasons:
+            raise ValueError('lossy_source requires at least one lossy_reasons entry')
+        return self
+def _has_non_text_content(content: Qwen35MessageContent) -> bool:
+    if isinstance(content, str):
+        return False
+    return any(getattr(block, 'type', None) in {'image', 'video'} for block in content)

package/bin/agentic-dataset-builder.js ADDED Viewed

@@ -0,0 +1,77 @@
+#!/usr/bin/env node
+import { spawnSync } from 'node:child_process';
+import { existsSync } from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const venvDir = path.join(__dirname, '.venv');
+const requirementsPath = path.join(__dirname, 'requirements.txt');
+const runPy = path.join(__dirname, 'run.py');
+function isWindows() {
+  return process.platform === 'win32';
+}
+function venvPythonPath() {
+  return isWindows()
+    ? path.join(venvDir, 'Scripts', 'python.exe')
+    : path.join(venvDir, 'bin', 'python');
+}
+function findSystemPython() {
+  const candidates = isWindows()
+    ? ['py', 'python', 'python3']
+    : ['python3', 'python'];
+  for (const candidate of candidates) {
+    const probeArgs = candidate === 'py' ? ['-3', '--version'] : ['--version'];
+    const result = spawnSync(candidate, probeArgs, { stdio: 'ignore' });
+    if (result.status === 0) {
+      return candidate;
+    }
+  }
+  return null;
+}
+function run(cmd, args, options = {}) {
+  const rendered = [cmd, ...args].join(' ');
+  console.error(`[agentic-dataset-builder] ${rendered}`);
+  const result = spawnSync(cmd, args, {
+    stdio: 'inherit',
+    cwd: __dirname,
+    env: process.env,
+    ...options,
+  });
+  if (result.status !== 0) {
+    process.exit(result.status ?? 1);
+  }
+}
+// Return the path of the virtual environment's Python, creating the
+// environment first when its interpreter is missing.
+// Creation steps: find a system Python, run `-m venv`, upgrade pip, install
+// the requirements file. Any failing step ends the process (via run()).
+// NOTE(review): only the interpreter's existence is checked, so an environment
+// whose pip install failed is reused as-is on later runs.
+function ensureEnv() {
+  const pythonInVenv = venvPythonPath();
+  if (existsSync(pythonInVenv)) {
+    return pythonInVenv;
+  }
+  const systemPython = findSystemPython();
+  if (!systemPython) {
+    // NOTE(review): the message says 3.10+, but findSystemPython() only checks
+    // that `--version` succeeds, not which version is reported.
+    console.error('Python 3.10+ is required but was not found in PATH.');
+    process.exit(1);
+  }
+  // The Windows `py` launcher needs `-3` to select a Python 3 interpreter.
+  if (systemPython === 'py') {
+    run(systemPython, ['-3', '-m', 'venv', venvDir]);
+  } else {
+    run(systemPython, ['-m', 'venv', venvDir]);
+  }
+  const venvPython = venvPythonPath();
+  run(venvPython, ['-m', 'pip', 'install', '--upgrade', 'pip']);
+  run(venvPython, ['-m', 'pip', 'install', '-r', requirementsPath]);
+  return venvPython;
+}
+// Entry point: make sure the virtual environment exists, then run run.py with
+// the arguments given to this launcher (argv without node and the script path).
+const python = ensureEnv();
+const args = process.argv.slice(2);
+run(python, [runPy, ...args]);

package/package.json ADDED Viewed

@@ -0,0 +1,40 @@
+{
+  "name": "agentic-dataset-builder",
+  "version": "0.1.0",
+  "description": "One-shot local dataset builder for Pi and Codex session histories",
+  "homepage": "https://github.com/Dominic789654/agentic-dataset-builder",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/Dominic789654/agentic-dataset-builder.git"
+  },
+  "bugs": {
+    "url": "https://github.com/Dominic789654/agentic-dataset-builder/issues"
+  },
+  "license": "MIT",
+  "keywords": [
+    "agentic",
+    "dataset",
+    "pi",
+    "codex",
+    "qwen",
+    "parquet"
+  ],
+  "files": [
+    "bin",
+    "run.py",
+    "requirements.txt",
+    "README.md",
+    "LICENSE",
+    "agentic_dataset"
+  ],
+  "bin": {
+    "agentic-dataset-builder": "./bin/agentic-dataset-builder.js"
+  },
+  "type": "module",
+  "engines": {
+    "node": ">=18"
+  },
+  "scripts": {
+    "pack:check": "npm pack --dry-run"
+  }
+}

package/requirements.txt ADDED Viewed

@@ -0,0 +1,2 @@
+pydantic>=2
+pyarrow>=14

package/run.py ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+from agentic_dataset.build_agentic_dataset import main
+# Script entry point: run the dataset builder's main() (called without
+# arguments) and use its return value as the process exit status.
+if __name__ == '__main__':
+    raise SystemExit(main())