distill-align 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- distill_align/__init__.py +9 -0
- distill_align/cli/__init__.py +1 -0
- distill_align/cli/main.py +579 -0
- distill_align/core/__init__.py +1 -0
- distill_align/core/cache.py +418 -0
- distill_align/core/checkpoint.py +416 -0
- distill_align/core/config.py +90 -0
- distill_align/core/config_file.py +275 -0
- distill_align/core/exceptions.py +133 -0
- distill_align/core/json_utils.py +62 -0
- distill_align/core/logging.py +133 -0
- distill_align/core/pii_filter.py +377 -0
- distill_align/core/schemas.py +194 -0
- distill_align/core/telemetry.py +44 -0
- distill_align/exporter/__init__.py +1 -0
- distill_align/exporter/dataset_card.py +259 -0
- distill_align/exporter/formatters/__init__.py +1 -0
- distill_align/exporter/formatters/alpaca.py +144 -0
- distill_align/exporter/formatters/base.py +91 -0
- distill_align/exporter/formatters/chatml.py +93 -0
- distill_align/exporter/formatters/conversation.py +88 -0
- distill_align/exporter/formatters/hf_messages.py +134 -0
- distill_align/exporter/formatters/jsonl.py +177 -0
- distill_align/exporter/formatters/parquet.py +283 -0
- distill_align/exporter/formatters/preference.py +137 -0
- distill_align/exporter/formatters/sharegpt.py +116 -0
- distill_align/exporter/hub.py +94 -0
- distill_align/exporter/pipeline.py +305 -0
- distill_align/exporter/preference_generator.py +234 -0
- distill_align/exporter/splitter.py +210 -0
- distill_align/exporter/unsloth_builder.py +283 -0
- distill_align/exporter/validator.py +410 -0
- distill_align/ingestion/__init__.py +1 -0
- distill_align/ingestion/auto.py +323 -0
- distill_align/ingestion/chunkers/__init__.py +1 -0
- distill_align/ingestion/chunkers/base.py +65 -0
- distill_align/ingestion/chunkers/code.py +226 -0
- distill_align/ingestion/chunkers/markdown.py +170 -0
- distill_align/ingestion/loaders/__init__.py +1 -0
- distill_align/ingestion/loaders/base.py +69 -0
- distill_align/ingestion/loaders/code.py +153 -0
- distill_align/ingestion/loaders/csv_loader.py +101 -0
- distill_align/ingestion/loaders/docx.py +92 -0
- distill_align/ingestion/loaders/html.py +111 -0
- distill_align/ingestion/loaders/json_loader.py +136 -0
- distill_align/ingestion/loaders/jupyter.py +112 -0
- distill_align/ingestion/loaders/markdown.py +70 -0
- distill_align/ingestion/loaders/pdf.py +91 -0
- distill_align/ingestion/loaders/text.py +62 -0
- distill_align/ingestion/loaders/web.py +267 -0
- distill_align/ingestion/pipeline.py +228 -0
- distill_align/synthesis/__init__.py +1 -0
- distill_align/synthesis/conversation_builder.py +256 -0
- distill_align/synthesis/judge.py +124 -0
- distill_align/synthesis/models/__init__.py +1 -0
- distill_align/synthesis/models/anthropic.py +153 -0
- distill_align/synthesis/models/azure.py +181 -0
- distill_align/synthesis/models/base.py +189 -0
- distill_align/synthesis/models/gemini.py +168 -0
- distill_align/synthesis/models/ollama.py +215 -0
- distill_align/synthesis/models/openai.py +189 -0
- distill_align/synthesis/models/vllm.py +61 -0
- distill_align/synthesis/pipeline.py +426 -0
- distill_align/synthesis/prompts/__init__.py +1 -0
- distill_align/synthesis/prompts/scaffold/code_extract.j2 +15 -0
- distill_align/synthesis/prompts/scaffold/system.j2 +15 -0
- distill_align/synthesis/prompts/scaffold/tool_call.j2 +12 -0
- distill_align/synthesis/prompts/scaffold.py +87 -0
- distill_align/synthesis/prompts/socratic/code.j2 +19 -0
- distill_align/synthesis/prompts/socratic/markdown.j2 +17 -0
- distill_align/synthesis/prompts/socratic/system.j2 +19 -0
- distill_align/synthesis/prompts/socratic.py +88 -0
- distill_align/synthesis/pruner.py +237 -0
- distill_align/synthesis/tokenizer.py +437 -0
- distill_align/synthesis/worker.py +356 -0
- distill_align/tui/__init__.py +1 -0
- distill_align/tui/app.py +364 -0
- distill_align-0.1.1.dist-info/METADATA +168 -0
- distill_align-0.1.1.dist-info/RECORD +82 -0
- distill_align-0.1.1.dist-info/WHEEL +4 -0
- distill_align-0.1.1.dist-info/entry_points.txt +3 -0
- distill_align-0.1.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI module: Typer entry points for the Distill-Align framework."""
|
|
@@ -0,0 +1,579 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI entry point for Distill-Align.
|
|
3
|
+
|
|
4
|
+
Uses Typer for command-line interface with Rich for beautiful output.
|
|
5
|
+
|
|
6
|
+
Commands:
|
|
7
|
+
- ingest: Load and chunk files
|
|
8
|
+
- synthesize: Generate conversations from chunks
|
|
9
|
+
- export: Format and export to training datasets
|
|
10
|
+
- validate: Validate and analyze a dataset
|
|
11
|
+
- init: Initialize a new project with config file
|
|
12
|
+
- status: Check configuration and connections
|
|
13
|
+
- jobs: Manage synthesis jobs (list, resume, delete, cleanup)
|
|
14
|
+
- config: View configuration (show, path)
|
|
15
|
+
- tui: Launch the interactive dashboard
|
|
16
|
+
- version: Show version
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Literal, cast
|
|
22
|
+
|
|
23
|
+
import typer # type: ignore[import-not-found]
|
|
24
|
+
from loguru import logger
|
|
25
|
+
from rich.console import Console
|
|
26
|
+
from rich.panel import Panel
|
|
27
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
28
|
+
from rich.table import Table
|
|
29
|
+
|
|
30
|
+
from ..core.config_file import find_config_file, generate_default_config, load_config
|
|
31
|
+
from ..core.logging import setup_logging
|
|
32
|
+
from ..core.schemas import ExportConfig, IngestionConfig, SynthesisConfig
|
|
33
|
+
|
|
34
|
+
# Root command group. With no arguments the CLI prints its help text
# (no_args_is_help=True); shell-completion installation options are disabled.
app = typer.Typer(
    name="distill-align",
    help="The Structured Reasoning Extraction Factory",
    add_completion=False,
    no_args_is_help=True,
)

# Rich console shared by every command in this module.
console = Console()

# Nested command groups. Each one is created and immediately mounted on the
# root app; the mount order (init, jobs, config) is preserved.
init_app = typer.Typer(help="Initialize a new project")
app.add_typer(init_app, name="init")

jobs_app = typer.Typer(help="Manage synthesis jobs")
app.add_typer(jobs_app, name="jobs")

config_app = typer.Typer(help="Configuration management")
app.add_typer(config_app, name="config")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@app.callback()
def main(
    ctx: typer.Context,
    log_level: str = typer.Option("INFO", "--log-level", "-l", help="Logging level"),
    log_file: str | None = typer.Option(None, "--log-file", help="Log file path"),
    log_format: str = typer.Option("text", "--log-format", help="Log format: text or json"),
    config_file: str | None = typer.Option(None, "--config", "-c", help="Path to config file"),
):
    """Distill-Align: Generate fine-tuning datasets from raw domain data."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Root callback: configures logging from the global options and stores the
    # --config value on the Typer context for subcommands.
    logging_options = {
        "log_level": log_level,
        "log_file": log_file,
        "log_format": log_format,
    }
    setup_logging(**logging_options)

    # Subcommands that accept `ctx` can read this via ctx.obj["config_file"].
    ctx.obj = dict(config_file=config_file)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@app.command()
def ingest(
    ctx: typer.Context,
    source: str = typer.Argument(..., help="Source file or directory path"),
    output: str = typer.Option("./chunks.json", "--output", "-o", help="Output file path"),
    chunk_size: int = typer.Option(1000, "--chunk-size", "-s", help="Chunk size in characters"),
    chunk_overlap: int = typer.Option(200, "--overlap", help="Chunk overlap in characters"),
    recursive: bool = typer.Option(
        True, "--recursive/--no-recursive", "-r", flag_value=True, help="Search subdirectories"
    ),
    auto_detect: bool = typer.Option(True, "--auto/--no-auto", flag_value=True, help="Auto-detect file types"),
):
    """Ingest files and split into semantic chunks."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Flow: build the ingestion config, verify the source path, run the chosen
    # pipeline on a single file or a directory, dump the chunks as JSON and
    # print a summary table. Exits with code 1 when the source is missing.
    import json

    from ..ingestion.auto import AutoIngestionPipeline
    from ..ingestion.pipeline import IngestionPipeline

    console.print(Panel.fit("📥 Ingestion Pipeline", style="bold blue"))

    ingestion_config = IngestionConfig(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    src = Path(source)
    if not src.exists():
        console.print(f"[red]Error: Source path does not exist: {source}[/red]")
        raise typer.Exit(1)

    ingestor: AutoIngestionPipeline | IngestionPipeline
    if auto_detect:
        ingestor = AutoIngestionPipeline(ingestion_config)
    else:
        ingestor = IngestionPipeline(ingestion_config)

    spinner = Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    )
    with spinner as progress:
        task = progress.add_task("Ingesting files...", total=None)

        def progress_cb(current, total, name):
            # Per-file progress reported by the auto-detecting pipeline.
            progress.update(
                task, description=f"Processing {name} ({current}/{total})", completed=current, total=total
            )

        if source_path_is_file := src.is_file():
            chunks = ingestor.ingest_file(src)
        elif auto_detect and hasattr(ingestor, "ingest_directory"):
            # AutoIngestionPipeline supports progress callback
            chunks = ingestor.ingest_directory(src, recursive=recursive, progress_callback=progress_cb)  # type: ignore[call-arg]
        else:
            chunks = ingestor.ingest_directory(src, recursive=recursive)

        progress.update(task, completed=True)

    # Persist the chunks, creating the destination directory if needed.
    destination = Path(output)
    destination.parent.mkdir(parents=True, exist_ok=True)

    serializable = [chunk.model_dump() for chunk in chunks]
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(serializable, handle, indent=2, ensure_ascii=False)

    size_kb = destination.stat().st_size / 1024
    summary = Table(title="Ingestion Summary")
    summary.add_column("Metric", style="cyan")
    summary.add_column("Value", style="green")
    for metric, value in (
        ("Total Chunks", str(len(chunks))),
        ("Output File", str(destination)),
        ("File Size", f"{size_kb:.1f} KB"),
    ):
        summary.add_row(metric, value)
    console.print(summary)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@app.command()
def synthesize(
    ctx: typer.Context,
    input: str = typer.Argument(..., help="Input chunks JSON file"),
    output: str = typer.Option("./conversations.json", "--output", "-o", help="Output file path"),
    provider: str = typer.Option("openai", "--provider", "-p", help="LLM provider"),
    model: str = typer.Option("gpt-4o", "--model", "-m", help="Model name"),
    base_url: str | None = typer.Option(None, "--base-url", help="API base URL"),
    api_key: str | None = typer.Option(None, "--api-key", help="API key (or use env var)"),
    concurrency: int = typer.Option(5, "--concurrency", "-c", help="Max concurrent requests"),
    rpm: int = typer.Option(60, "--rpm", help="Max requests per minute"),
    job_id: str | None = typer.Option(None, "--job-id", help="Job ID for resume support"),
    resume: bool = typer.Option(False, "--resume", flag_value=True, help="Resume a previous job"),
    no_cache: bool = typer.Option(False, "--no-cache", flag_value=True, help="Disable caching"),
    no_checkpoint: bool = typer.Option(False, "--no-checkpoint", flag_value=True, help="Disable checkpointing"),
    prompt_dir: str | None = typer.Option(None, "--prompts", help="Custom prompts directory"),
    mode: str = typer.Option("default", "--mode", help="Conversation mode: default, teach, debug, review, qa, explain"),
    judge: bool = typer.Option(False, "--judge", flag_value=True, help="Enable LLM-as-judge evaluation"),
    judge_model: str | None = typer.Option(None, "--judge-model", help="Model for judge (defaults to --model)"),
    max_tokens: int | None = typer.Option(
        None, "--max-tokens", help="Max tokens to generate per LLM call (prevents server cutoff)"
    ),
):
    """Synthesize chunks into structured conversations."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Flow: load chunks from the input JSON, build the synthesis pipeline
    # (optionally with cache and checkpoint managers), run it on an asyncio
    # event loop, write the conversations as JSON and print summary/cost
    # tables. Exits with code 1 when the input file is missing.
    #
    # NOTE(review): `ctx` and `prompt_dir` are accepted but not used in this
    # function body.
    from ..core.cache import CacheManager
    from ..core.checkpoint import CheckpointManager
    from ..core.schemas import DataChunk
    from ..synthesis.conversation_builder import ConversationBuilder, ConversationMode
    from ..synthesis.pipeline import SynthesisPipeline

    console.print(Panel.fit("🧠 Synthesis Pipeline", style="bold magenta"))

    import json

    from ..core.json_utils import safe_json_load

    input_path = Path(input)
    if not input_path.exists():
        console.print(f"[red]Error: Input file does not exist: {input}[/red]")
        raise typer.Exit(1)

    chunks_data = safe_json_load(input_path)
    chunks = [DataChunk(**chunk) for chunk in chunks_data]

    # Security: deprecate --api-key in favor of environment variables
    if api_key:
        console.print(
            "[yellow]⚠️ WARNING: --api-key exposes your secret in process listings. "
            "Use the OPENAI_API_KEY or DISTILL_LLM_API_KEY environment variable instead. "
            "This flag will be removed in a future version.[/yellow]"
        )
        import os

        # setdefault: an OPENAI_API_KEY already present in the environment wins.
        os.environ.setdefault("OPENAI_API_KEY", api_key)

    # Setup cache and checkpointing (each can be disabled independently).
    cache = None if no_cache else CacheManager(cache_dir=".cache")
    checkpoint = None if no_checkpoint else CheckpointManager()

    config = SynthesisConfig(
        llm_provider=cast(Literal["openai", "ollama", "vllm", "anthropic", "gemini", "azure"], provider),
        model_name=model,
        base_url=base_url,
        api_key=api_key,
        max_concurrency=concurrency,
        max_rpm=rpm,
        enable_judge=judge,
        judge_model=judge_model,
        max_tokens=max_tokens,
    )
    pipeline = SynthesisPipeline(
        config=config,
        cache_manager=cache,
        checkpoint_manager=checkpoint,
        use_cache=not no_cache,
    )

    # Use custom conversation mode if specified
    use_conversation_builder = mode != "default"

    async def run_synthesis():
        # Runs the whole batch inside one event loop while showing a progress bar.
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as progress:
            task = progress.add_task(f"Synthesizing {len(chunks)} chunks...", total=len(chunks))

            def update_progress(current, total):
                progress.update(task, completed=current)

            if use_conversation_builder:
                # The builder path does not receive the progress callback.
                builder = ConversationBuilder()
                client = pipeline._get_client()
                mode_enum = ConversationMode(mode)
                conversations = await builder.build_batch(chunks, mode_enum, client, max_concurrency=concurrency)  # type: ignore[arg-type]
            else:
                conversations = await pipeline.synthesize_batch(chunks, update_progress, job_id=job_id, resume=resume)
            return conversations

    try:
        conversations = asyncio.run(run_synthesis())

        output_path = Path(output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        conv_data = [conv.model_dump() for conv in conversations]
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(conv_data, f, indent=2, ensure_ascii=False)

        table = Table(title="Synthesis Summary")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")
        table.add_row("Input Chunks", str(len(chunks)))
        table.add_row("Conversations", str(len(conversations)))
        # Guard against division by zero when the input file held no chunks.
        table.add_row("Success Rate", f"{len(conversations) / len(chunks) * 100:.1f}%" if chunks else "0%")
        table.add_row("Output File", str(output_path))

        if cache:
            cache_stats = cache.stats()
            table.add_row("Cache Hit Rate", f"{cache_stats.hit_rate:.1%}")
            table.add_row("Cache Entries", str(cache_stats.total_entries))

        console.print(table)

        # Show cost report if any requests were made
        cost_stats = pipeline.get_cost_stats()
        if cost_stats.num_requests > 0:
            cost_panel = Panel(
                pipeline.get_cost_report(),
                title="💰 Cost Summary",
                style="bold cyan",
            )
            console.print(cost_panel)
    finally:
        # Cleanup HTTP connections to prevent leaks
        try:
            asyncio.run(pipeline.close())
        finally:
            # Close the cache even when closing the pipeline raises; otherwise
            # a failure in pipeline.close() would leave the cache open.
            if cache:
                cache.close()
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
@app.command()
def export(
    ctx: typer.Context,
    input: str = typer.Argument(..., help="Input conversations JSON file"),
    formats: str = typer.Option("sharegpt", "--format", "-f", help="Export formats (comma-separated)"),
    output_dir: str = typer.Option("./output", "--output-dir", "-o", help="Output directory"),
    model_name: str = typer.Option("unsloth/Meta-Llama-3.1-8B-Instruct", "--model", help="Unsloth model name"),
    no_unsloth: bool = typer.Option(False, "--no-unsloth", flag_value=True, help="Skip Unsloth script generation"),
    split: bool = typer.Option(False, "--split", flag_value=True, help="Split into train/val/test"),
    card: bool = typer.Option(False, "--card", flag_value=True, help="Generate dataset card"),
):
    """Export conversations to training formats."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Flow: load conversations from the input JSON, parse the comma-separated
    # format list, run the export pipeline and print one table row per
    # produced file. Exits with code 1 when the input file is missing or no
    # format name was given.
    from ..core.schemas import ConversationSchema
    from ..exporter.pipeline import ExportPipeline

    console.print(Panel.fit("📤 Export Pipeline", style="bold green"))

    from ..core.json_utils import safe_json_load

    input_path = Path(input)
    if not input_path.exists():
        console.print(f"[red]Error: Input file does not exist: {input}[/red]")
        raise typer.Exit(1)

    conv_data = safe_json_load(input_path)
    conversations = [ConversationSchema(**conv) for conv in conv_data]

    # Drop empty entries so stray commas ("sharegpt," or "a,,b") do not turn
    # into an empty format name.
    stripped_names = (part.strip() for part in formats.split(","))
    format_list = [name for name in stripped_names if name]
    if not format_list:
        console.print("[red]Error: No export format specified[/red]")
        raise typer.Exit(1)

    config = ExportConfig(
        formats=cast(list[Literal["sharegpt", "alpaca", "chatml", "conversation", "hf_messages"]], format_list),
        output_dir=output_dir,
        unsloth_model=model_name,
    )
    pipeline = ExportPipeline(config)

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as progress:
        task = progress.add_task("Exporting...", total=None)
        output_files = pipeline.export(
            conversations,
            formats=format_list,
            generate_unsloth=not no_unsloth,
            split=split,
            generate_card=card,
        )
        progress.update(task, completed=True)

    table = Table(title="Export Summary")
    table.add_column("Format", style="cyan")
    table.add_column("File", style="green")
    table.add_column("Size", style="yellow")
    for format_name, file_path in output_files.items():
        # A reported path may not exist on disk; show "N/A" instead of failing.
        size = f"{file_path.stat().st_size / 1024:.1f} KB" if file_path.exists() else "N/A"
        table.add_row(format_name, str(file_path), size)
    console.print(table)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
@app.command()
def validate(
    input: str = typer.Argument(..., help="Input conversations JSON file"),
    dedupe: bool = typer.Option(True, "--dedupe/--no-dedupe", flag_value=True, help="Remove duplicates"),
    output: str | None = typer.Option(None, "--output", "-o", help="Save report to file"),
):
    """Validate and analyze a dataset."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Flow: load conversations, optionally deduplicate them, validate, print
    # the report summary and optionally save it. Exits with code 1 when the
    # input file is missing or the report is not valid.
    from ..core.schemas import ConversationSchema
    from ..exporter.validator import DatasetValidator

    console.print(Panel.fit("🔍 Dataset Validation", style="bold yellow"))

    from ..core.json_utils import safe_json_load

    input_path = Path(input)
    if not input_path.exists():
        console.print(f"[red]Error: Input file does not exist: {input}[/red]")
        raise typer.Exit(1)

    conv_data = safe_json_load(input_path)
    conversations = [ConversationSchema(**conv) for conv in conv_data]

    validator = DatasetValidator()
    if dedupe:
        conversations = validator.deduplicate(conversations)

    report = validator.validate(conversations)
    # Render the summary once so the saved report matches what was printed.
    summary_text = report.summary()
    console.print(summary_text)

    if output:
        report_path = Path(output)
        # Create missing parent directories, as ingest/synthesize do for their
        # output files; without this a nested --output path fails to write.
        report_path.parent.mkdir(parents=True, exist_ok=True)
        report_path.write_text(summary_text, encoding="utf-8")
        console.print(f"\n[green]Report saved to {report_path}[/green]")

    if not report.is_valid:
        raise typer.Exit(1)
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
@app.command()
def status():
    """Check configuration and connections."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Prints a table with the Python version, the discovered config file, the
    # size of the local ".cache" directory and whether an API key env var is set.
    import os
    import sys

    from .. import __version__
    from ..core.config_file import find_config_file

    console.print(Panel.fit(f"🩺 Distill-Align Status v{__version__}", style="bold green"))

    report = Table(title="System Status")
    report.add_column("Check", style="cyan")
    report.add_column("Status", style="green")

    # Check Python version
    info = sys.version_info
    report.add_row("Python Version", f"{info.major}.{info.minor}.{info.micro}")

    # Check config file
    located_config = find_config_file()
    config_cell = str(located_config) if located_config else "[yellow]Not found[/yellow]"
    report.add_row("Config File", config_cell)

    # Check cache directory
    cache_dir = Path(".cache")
    if not cache_dir.exists():
        report.add_row("Cache Directory", "[yellow]Not created yet[/yellow]")
    else:
        # Total size of all regular files below the cache directory, in MB.
        total_bytes = sum(entry.stat().st_size for entry in cache_dir.rglob("*") if entry.is_file())
        size_mb = total_bytes / 1024 / 1024
        report.add_row("Cache Directory", f"[green]{cache_dir}[/green] ({size_mb:.1f} MB)")

    # Check env vars
    has_api_key = any(os.getenv(var) for var in ("OPENAI_API_KEY", "DISTILL_LLM_API_KEY"))
    if has_api_key:
        api_cell = "[green]Set[/green]"
    else:
        api_cell = "[yellow]Not set (use OPENAI_API_KEY env var)[/yellow]"
    report.add_row("API Key", api_cell)

    console.print(report)
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
# Jobs subcommand group
|
|
419
|
+
@jobs_app.command("list")
def jobs_list(
    status: str | None = typer.Option(None, "--status", help="Filter by status"),
    job_type: str | None = typer.Option(None, "--type", help="Filter by job type"),
    limit: int = typer.Option(20, "--limit", help="Max jobs to show"),
):
    """List all synthesis jobs."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Queries the checkpoint manager with the optional filters and prints one
    # table row per job, or a notice when nothing matches.
    from ..core.checkpoint import CheckpointManager, JobStatus

    manager = CheckpointManager()
    wanted_status = None
    if status:
        wanted_status = JobStatus(status)
    found = manager.list_jobs(status=wanted_status, job_type=job_type, limit=limit)

    if not found:
        console.print("[yellow]No jobs found[/yellow]")
        return

    from datetime import datetime

    listing = Table(title=f"Jobs ({len(found)})")
    for heading, colour in (
        ("Job ID", "cyan"),
        ("Type", "blue"),
        ("Status", "green"),
        ("Progress", "yellow"),
        ("Created", "magenta"),
    ):
        listing.add_column(heading, style=colour)

    for entry in found:
        # created_at is passed to datetime.fromtimestamp and shown in local time.
        created_label = datetime.fromtimestamp(entry.created_at).strftime("%Y-%m-%d %H:%M")
        progress_label = f"{entry.processed_items}/{entry.total_items} ({entry.progress_pct:.0f}%)"
        listing.add_row(
            entry.job_id,
            entry.job_type,
            entry.status.value,
            progress_label,
            created_label,
        )

    console.print(listing)
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
@jobs_app.command("resume")
def jobs_resume(
    job_id: str = typer.Argument(..., help="Job ID to resume"),
):
    """Resume a previous synthesis job."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # This command does not restart the job itself: it reports the stored
    # progress and prints the `synthesize` invocation to re-run. Exits with
    # code 1 when no checkpoint exists for the job id.
    from ..core.checkpoint import CheckpointManager

    saved = CheckpointManager().load_job(job_id)

    if saved:
        console.print(f"Resuming job {job_id}: {saved.processed_items}/{saved.total_items} done")
        console.print(f"[yellow]Re-run: distill-align synthesize ... --job-id {job_id} --resume[/yellow]")
        return

    console.print(f"[red]Job {job_id} not found[/red]")
    raise typer.Exit(1)
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
@jobs_app.command("delete")
def jobs_delete(
    job_id: str = typer.Argument(..., help="Job ID to delete"),
    force: bool = typer.Option(False, "--force", "-f", flag_value=True, help="Skip confirmation"),
):
    """Delete a job checkpoint."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Asks for confirmation unless --force is given (declining aborts), then
    # deletes the checkpoint. Exits with code 1 when the job does not exist.
    from ..core.checkpoint import CheckpointManager

    if not force and not typer.confirm(f"Delete job {job_id}?"):
        raise typer.Abort()

    manager = CheckpointManager()
    if manager.delete_job(job_id):
        console.print(f"[green]Deleted job {job_id}[/green]")
        return

    console.print(f"[red]Job {job_id} not found[/red]")
    # Signal failure to the shell, matching `jobs resume` for an unknown job id.
    raise typer.Exit(1)
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
@jobs_app.command("cleanup")
def jobs_cleanup(
    days: int = typer.Option(30, "--days", help="Remove jobs older than N days"),
):
    """Clean up old job checkpoints."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Delegates to the checkpoint manager and reports the value it returns.
    from ..core.checkpoint import CheckpointManager

    removed_count = CheckpointManager().cleanup_old_jobs(older_than_days=days)
    console.print(f"[green]Cleaned up {removed_count} old jobs[/green]")
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
# Config subcommand group
|
|
507
|
+
@config_app.command("show")
def config_show():
    """Show current configuration."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Locates the config file via find_config_file(), loads it and prints both
    # its path and its contents as indented JSON.
    config_path = find_config_file()
    if not config_path:
        # The init group exposes its command as "run", so the full invocation
        # is `distill-align init run`.
        console.print("[yellow]No config file found. Run 'distill-align init run' to create one.[/yellow]")
        return

    config = load_config(config_path)
    console.print(Panel(str(config_path), title="Config File"))
    console.print(config.model_dump_json(indent=2))
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
@config_app.command("path")
def config_path():
    """Show the path to the active config file."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Prints the path returned by find_config_file(), or a notice when it
    # returns nothing.
    located = find_config_file()
    message = str(located) if located else "[yellow]No config file found[/yellow]"
    console.print(message)
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
# Init subcommand
|
|
531
|
+
@init_app.command("run")
def init_run(
    path: str = typer.Option("distill-align.yaml", "--path", "-p", help="Output config path"),
    name: str = typer.Option("my-dataset", "--name", "-n", help="Project name"),
):
    """Initialize a new project config file."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # Generates a default config file and prints a next-step hint.
    output = generate_default_config(project_name=name, path=path)
    console.print(f"[green]✓ Created config file: {output}[/green]")
    console.print("\nEdit it to configure your pipeline, then run:")
    # `ingest` takes the source path as a positional argument; it has no
    # --source option, so the hint must not include one.
    console.print(" [cyan]distill-align ingest ./data[/cyan]")
|
|
541
|
+
|
|
542
|
+
|
|
543
|
+
@app.command()
def tui():
    """Launch the interactive TUI dashboard."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    #
    # The dashboard is skipped when the CI or PYTEST_CURRENT_TEST environment
    # variable is set to a non-empty value.
    import os

    automated_run = any(os.getenv(flag) for flag in ("CI", "PYTEST_CURRENT_TEST"))
    if automated_run:
        console.print("[yellow]TUI requires an interactive terminal. Skipping in non-TTY mode.[/yellow]")
        return

    console.print(Panel.fit("🖥️ Launching TUI...", style="bold cyan"))

    # Imported only when the dashboard is actually launched.
    from ..tui.app import DistillAlignApp

    DistillAlignApp().run()
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
@app.command()
def version():
    """Show version information."""
    # NOTE: the docstring above is shown as the CLI help text, so extra
    # documentation lives in comments instead.
    from .. import __version__

    banner = f"distill-align v{__version__}"
    console.print(banner)
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def entry_point() -> None:
    """CLI entry point with global exception handling for production use.

    Runs the Typer app and turns any unexpected exception into a short error
    message, a logged traceback and process exit status 1.

    Raises:
        SystemExit: with the exit code requested by the CLI, or 1 after an
            unexpected error.
    """
    try:
        app()
    except typer.Exit as exit_request:
        # typer.Exit is only meaningful inside Click's command runner. If one
        # escapes to this level, convert it to a real process exit instead of
        # letting it surface as an uncaught exception with a traceback.
        raise SystemExit(exit_request.exit_code) from None
    except Exception as e:
        console.print(f"\n[red]❌ Unexpected error: {e}[/red]")
        logger.opt(exception=True).error("Unhandled CLI exception")
        # Exit with status 1 without chaining the original exception; it has
        # already been reported and logged above.
        raise SystemExit(1) from None


if __name__ == "__main__":
    entry_point()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core module: configuration, schemas, exceptions, and logging."""
|