themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/cli/commands/mcq_benchmarks.py
@@ -0,0 +1,207 @@
+ """Multiple-choice question benchmark commands."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Annotated, Callable, Literal, Sequence
+
+ from cyclopts import Parameter
+
+ from themis.cli.utils import effective_total, export_outputs
+ from themis.datasets import (
+     mmlu_pro as mmlu_pro_dataset,
+ )
+ from themis.datasets import (
+     super_gpqa as super_gpqa_dataset,
+ )
+ from themis.experiment import mcq as mcq_experiment
+ from themis.experiment import storage as experiment_storage
+ from themis.utils.logging_utils import configure_logging
+ from themis.utils.progress import ProgressReporter
+
+
+ def load_multiple_choice_dataset(
+     *,
+     loader: Callable[..., Sequence],
+     source: Literal["huggingface", "local"],
+     data_dir: Path | None,
+     split: str,
+     limit: int | None,
+     subjects: Sequence[str] | None,
+ ):
+     """Load multiple choice dataset.
+
+     Args:
+         loader: Dataset loader function
+         source: Dataset source
+         data_dir: Directory containing local dataset
+         split: Dataset split
+         limit: Max rows to load
+         subjects: Subjects to filter
+
+     Returns:
+         List of generation examples
+     """
+     if source == "local" and data_dir is None:
+         raise ValueError(
+             "The --data-dir option is required when --source=local so Themis "
+             "knows where to read the dataset."
+         )
+     samples = loader(
+         source=source,
+         data_dir=data_dir,
+         split=split,
+         limit=limit,
+         subjects=subjects,
+     )
+     return [sample.to_generation_example() for sample in samples]
+
+
+ def supergpqa_command(
+     *,
+     source: Annotated[
+         Literal["huggingface", "local"], Parameter(help="Dataset source")
+     ] = "huggingface",
+     split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
+     data_dir: Annotated[
+         Path | None, Parameter(help="Directory containing local dataset")
+     ] = None,
+     limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+     subjects: Annotated[
+         tuple[str, ...], Parameter(help="Subjects or categories to filter")
+     ] = (),
+     max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
+     storage: Annotated[
+         Path | None, Parameter(help="Cache directory for datasets/results")
+     ] = None,
+     run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
+     resume: Annotated[
+         bool, Parameter(help="Reuse cached generations when storage is set")
+     ] = True,
+     temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+     log_level: Annotated[
+         str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+     ] = "info",
+     csv_output: Annotated[
+         Path | None, Parameter(help="Write CSV export to this path")
+     ] = None,
+     html_output: Annotated[
+         Path | None, Parameter(help="Write HTML summary to this path")
+     ] = None,
+     json_output: Annotated[
+         Path | None, Parameter(help="Write JSON export to this path")
+     ] = None,
+ ) -> int:
+     """Run the SuperGPQA multiple-choice evaluation."""
+     configure_logging(log_level)
+     subject_filter = list(subjects) if subjects else None
+     rows = load_multiple_choice_dataset(
+         loader=super_gpqa_dataset.load_super_gpqa,
+         source=source,
+         data_dir=data_dir,
+         split=split,
+         limit=limit,
+         subjects=subject_filter,
+     )
+
+     storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
+     experiment = mcq_experiment.build_multiple_choice_json_experiment(
+         dataset_name="supergpqa",
+         task_id="supergpqa",
+         temperature=temperature,
+         storage=storage_impl,
+     )
+
+     total = effective_total(len(rows), max_samples)
+     with ProgressReporter(total=total, description="Generating") as progress:
+         report = experiment.run(
+             rows,
+             max_samples=max_samples,
+             run_id=run_id,
+             resume=resume,
+             on_result=progress.on_result,
+         )
+     print(mcq_experiment.summarize_report(report))
+     export_outputs(
+         report,
+         csv_output=csv_output,
+         html_output=html_output,
+         json_output=json_output,
+         title="supergpqa experiment",
+     )
+     return 0
+
+
+ def mmlu_pro_command(
+     *,
+     source: Annotated[
+         Literal["huggingface", "local"], Parameter(help="Dataset source")
+     ] = "huggingface",
+     split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
+     data_dir: Annotated[
+         Path | None, Parameter(help="Directory containing local dataset")
+     ] = None,
+     limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+     subjects: Annotated[
+         tuple[str, ...], Parameter(help="Subjects or categories to filter")
+     ] = (),
+     max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
+     storage: Annotated[
+         Path | None, Parameter(help="Cache directory for datasets/results")
+     ] = None,
+     run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
+     resume: Annotated[
+         bool, Parameter(help="Reuse cached generations when storage is set")
+     ] = True,
+     temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+     log_level: Annotated[
+         str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+     ] = "info",
+     csv_output: Annotated[
+         Path | None, Parameter(help="Write CSV export to this path")
+     ] = None,
+     html_output: Annotated[
+         Path | None, Parameter(help="Write HTML summary to this path")
+     ] = None,
+     json_output: Annotated[
+         Path | None, Parameter(help="Write JSON export to this path")
+     ] = None,
+ ) -> int:
+     """Run the MMLU-Pro multiple-choice evaluation."""
+     configure_logging(log_level)
+     subject_filter = list(subjects) if subjects else None
+     rows = load_multiple_choice_dataset(
+         loader=mmlu_pro_dataset.load_mmlu_pro,
+         source=source,
+         data_dir=data_dir,
+         split=split,
+         limit=limit,
+         subjects=subject_filter,
+     )
+
+     storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
+     experiment = mcq_experiment.build_multiple_choice_json_experiment(
+         dataset_name="mmlu-pro",
+         task_id="mmlu_pro",
+         temperature=temperature,
+         storage=storage_impl,
+     )
+
+     total = effective_total(len(rows), max_samples)
+     with ProgressReporter(total=total, description="Generating") as progress:
+         report = experiment.run(
+             rows,
+             max_samples=max_samples,
+             run_id=run_id,
+             resume=resume,
+             on_result=progress.on_result,
+         )
+     print(mcq_experiment.summarize_report(report))
+     export_outputs(
+         report,
+         csv_output=csv_output,
+         html_output=html_output,
+         json_output=json_output,
+         title="mmlu_pro experiment",
+     )
+     return 0
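
Note: the two commands above are plain keyword-only functions, so they can also be driven from Python rather than through the CLI. The sketch below is illustrative only and is not part of the diff; it assumes the wheel is installed, uses only the parameters visible in the signatures above, and the paths, run ID, and subject names are made up.

    # Hypothetical usage sketch (not from the release); parameters follow the
    # signatures shown in the diff above, argument values are illustrative.
    from pathlib import Path

    from themis.cli.commands.mcq_benchmarks import mmlu_pro_command, supergpqa_command

    # Small SuperGPQA smoke test: 25 rows from Hugging Face, cached under ./runs.
    supergpqa_command(
        source="huggingface",
        split="test",
        limit=25,
        storage=Path("runs"),
        run_id="supergpqa-smoke",
        csv_output=Path("supergpqa.csv"),
    )

    # MMLU-Pro from a local directory, restricted to two subjects.
    mmlu_pro_command(
        source="local",
        data_dir=Path("data/mmlu_pro"),
        subjects=("law", "physics"),
        limit=50,
    )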
themis/cli/commands/sample_run.py
@@ -0,0 +1,244 @@
+ """Sample run command for quick testing before full experiments."""
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Annotated
+
+ from cyclopts import Parameter
+
+ from themis.cli.commands.config_commands import run_configured_experiment
+
+
+ def sample_run_command(
+     *,
+     config: Annotated[Path, Parameter(help="Path to experiment configuration file")],
+     n: Annotated[int, Parameter(help="Number of samples to test")] = 5,
+     verbose: Annotated[bool, Parameter(help="Show detailed output")] = False,
+     show_outputs: Annotated[
+         bool, Parameter(help="Display sample outputs and predictions")
+     ] = False,
+     estimate_cost: Annotated[
+         bool, Parameter(help="Estimate full run cost based on sample")
+     ] = True,
+ ) -> int:
+     """Quick test run on N samples before running full experiment.
+
+     This command helps you:
+     - Test your configuration works correctly
+     - Preview sample outputs before full run
+     - Estimate total cost based on actual token usage
+     - Catch configuration errors early
+     - Iterate on prompts quickly
+
+     Examples:
+         # Basic quick test
+         uv run python -m themis.cli sample-run \\
+             --config my_config.yaml \\
+             --n 5
+
+         # Test with verbose output
+         uv run python -m themis.cli sample-run \\
+             --config my_config.yaml \\
+             --n 3 \\
+             --verbose \\
+             --show-outputs
+
+         # Test and estimate full run cost
+         uv run python -m themis.cli sample-run \\
+             --config my_config.yaml \\
+             --n 10 \\
+             --estimate-cost
+     """
+     try:
+         import json
+         import tempfile
+
+         from hydra import compose, initialize_config_dir
+
+         # Load config
+         config_path = Path(config).resolve()
+         if not config_path.exists():
+             print(f"Error: Config file not found: {config_path}")
+             return 1
+
+         config_dir = str(config_path.parent)
+         config_name = config_path.stem
+
+         print("=" * 80)
+         print(f"🧪 Sample Run: Testing {n} samples")
+         print("=" * 80)
+         print(f"Config: {config_path}")
+         print(f"Samples: {n}")
+         print()
+
+         # Initialize Hydra
+         with initialize_config_dir(config_dir=config_dir, version_base=None):
+             cfg = compose(config_name=config_name)
+
+         # Override dataset limit
+         original_limit = cfg.dataset.get("limit")
+         cfg.dataset.limit = n
+
+         # Use temporary storage
+         with tempfile.TemporaryDirectory() as temp_dir:
+             cfg.storage.path = temp_dir
+
+             # Generate temporary run_id
+             cfg.run_id = "sample-run-temp"
+             cfg.resume = False
+
+             print("📋 Configuration:")
+             print(f" Model: {cfg.generation.model_identifier}")
+             print(f" Provider: {cfg.generation.provider.name}")
+             print(f" Temperature: {cfg.generation.sampling.temperature}")
+             print(f" Max tokens: {cfg.generation.sampling.max_tokens}")
+             if hasattr(cfg.dataset, "source"):
+                 print(f" Dataset: {cfg.dataset.source}")
+             print()
+
+             # Run experiment on sample
+             print("🚀 Running sample experiment...")
+             print()
+
+             # Redirect to capture run
+             result = run_configured_experiment(
+                 config_path=config_path,
+                 overrides=[
+                     f"dataset.limit={n}",
+                     f"storage.path={temp_dir}",
+                     "run_id=sample-run-temp",
+                     "resume=false",
+                 ],
+             )
+
+             if result != 0:
+                 print("\n❌ Sample run failed")
+                 return result
+
+             # Load results
+             report_path = Path(temp_dir) / "sample-run-temp" / "report.json"
+             if not report_path.exists():
+                 print("\n⚠️ No report generated")
+                 return 1
+
+             with report_path.open("r") as f:
+                 report_data = json.load(f)
+
+             # Display results
+             print("\n" + "=" * 80)
+             print("✅ Sample Run Complete")
+             print("=" * 80)
+
+             # Metrics
+             metrics = report_data.get("metrics", [])
+             if metrics:
+                 print("\n📊 Metrics:")
+                 for metric in metrics:
+                     name = metric["name"]
+                     mean = metric["mean"]
+                     count = metric["count"]
+                     print(f" {name}: {mean:.4f} (n={count})")
+
+             # Cost analysis
+             cost_data = report_data.get("summary", {}).get("cost")
+             if cost_data:
+                 total_cost = cost_data.get("total_cost", 0)
+                 token_counts = cost_data.get("token_counts", {})
+                 prompt_tokens = token_counts.get("prompt_tokens", 0)
+                 completion_tokens = token_counts.get("completion_tokens", 0)
+
+                 print("\n💰 Cost (sample run):")
+                 print(f" Total: ${total_cost:.4f}")
+                 print(f" Per sample: ${total_cost / n:.6f}")
+                 print(
+                     f" Prompt tokens: {prompt_tokens} ({prompt_tokens / n:.0f} avg)"
+                 )
+                 print(
+                     f" Completion tokens: {completion_tokens} ({completion_tokens / n:.0f} avg)"
+                 )
+
+                 # Estimate full run cost
+                 if estimate_cost and original_limit:
+                     full_cost = (total_cost / n) * original_limit
+                     print("\n📈 Estimated full run cost:")
+                     print(f" Dataset size: {original_limit} samples")
+                     print(f" Estimated cost: ${full_cost:.2f}")
+                     print(
+                         f" 95% CI: ${full_cost * 0.8:.2f} - ${full_cost * 1.2:.2f}"
+                     )
+
+                     if full_cost > 10.0:
+                         print(f"\n⚠️ Warning: Estimated cost is ${full_cost:.2f}")
+                         print(" Consider using --limit for initial testing")
+
+             # Failures
+             failures = report_data.get("run_failures", [])
+             eval_failures = report_data.get("evaluation_failures", [])
+             total_failures = len(failures) + len(eval_failures)
+
+             if total_failures > 0:
+                 print(f"\n⚠️ Failures: {total_failures}")
+                 if failures:
+                     print(f" Generation failures: {len(failures)}")
+                     if verbose:
+                         for failure in failures[:3]:
+                             print(
+                                 f" - {failure.get('sample_id')}: {failure.get('message')}"
+                             )
+                 if eval_failures:
+                     print(f" Evaluation failures: {len(eval_failures)}")
+
+             # Show sample outputs
+             if show_outputs:
+                 samples = report_data.get("samples", [])
+                 print("\n📝 Sample Outputs (showing up to 3):")
+                 for i, sample in enumerate(samples[:3], 1):
+                     sample_id = sample.get("sample_id", f"sample-{i}")
+                     scores = sample.get("scores", [])
+
+                     print(f"\n Sample {i}: {sample_id}")
+                     if scores:
+                         for score in scores:
+                             metric_name = score.get("metric")
+                             value = score.get("value")
+                             print(f" {metric_name}: {value:.4f}")
+
+             # Summary
+             print("\n" + "=" * 80)
+             print("✨ Next Steps:")
+             print("=" * 80)
+
+             if total_failures == 0 and metrics:
+                 avg_metric = metrics[0]["mean"]
+                 if avg_metric > 0.1:  # Reasonable performance
+                     print(" ✅ Configuration looks good!")
+                     print(" Run full experiment with:")
+                     print(
+                         f" uv run python -m themis.cli run-config --config {config_path}"
+                     )
+                 else:
+                     print(" ⚠️ Low performance on sample - consider:")
+                     print(" - Adjusting prompt template")
+                     print(" - Tuning temperature/max_tokens")
+                     print(" - Testing different model")
+             else:
+                 print(" ⚠️ Issues detected:")
+                 if total_failures > 0:
+                     print(" - Fix failures before full run")
+                 if not metrics:
+                     print(" - Check evaluation metrics")
+                 print(" - Review configuration")
+
+         return 0
+
+     except Exception as e:
+         print(f"\n❌ Error: {e}")
+         import traceback
+
+         if verbose:
+             traceback.print_exc()
+         return 1
+
+
+ __all__ = ["sample_run_command"]
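
Note: sample_run_command reads the run's report.json back from temporary storage, and the keys it expects can be inferred from the .get(...) calls in the code above. The snippet below is a hypothetical example of that structure with made-up values, shown only to clarify what the command consumes; it is not taken from the package itself.

    # Hypothetical report.json contents, inferred from the key accesses above.
    example_report = {
        "metrics": [{"name": "exact_match", "mean": 0.80, "count": 5}],
        "summary": {
            "cost": {
                "total_cost": 0.0042,
                "token_counts": {"prompt_tokens": 1200, "completion_tokens": 900},
            }
        },
        "run_failures": [],          # generation failures, each with sample_id/message
        "evaluation_failures": [],   # evaluation-stage failures
        "samples": [
            {
                "sample_id": "sample-1",
                "scores": [{"metric": "exact_match", "value": 1.0}],
            }
        ],
    }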