eval-protocol 0.2.93.dev2__py3-none-any.whl → 0.2.93.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,99 +1,37 @@
1
+ import argparse
1
2
  import json
2
3
  import os
3
4
  import sys
4
5
  import time
5
- import argparse
6
6
  from typing import Any, Dict, Optional
7
7
 
8
8
  import requests
9
+ from pydantic import ValidationError
9
10
 
10
- from ..auth import (
11
- get_fireworks_account_id,
12
- get_fireworks_api_base,
13
- get_fireworks_api_key,
14
- verify_api_key_and_get_account_id,
15
- )
11
+ from ..auth import get_fireworks_api_base, get_fireworks_api_key
16
12
  from ..common_utils import get_user_agent
17
13
  from ..fireworks_rft import (
18
- _map_api_host_to_app_host,
19
14
  build_default_output_model,
20
15
  create_dataset_from_jsonl,
21
16
  create_reinforcement_fine_tuning_job,
17
+ detect_dataset_builder,
18
+ materialize_dataset_via_builder,
22
19
  )
23
- from ..fireworks_rft import detect_dataset_builder, materialize_dataset_via_builder
24
- from .upload import _discover_tests, _normalize_evaluator_id, _prompt_select
25
-
26
-
27
- def _ensure_account_id() -> Optional[str]:
28
- account_id = get_fireworks_account_id()
29
- api_key = get_fireworks_api_key()
30
- if not account_id and api_key:
31
- resolved = verify_api_key_and_get_account_id(api_key=api_key, api_base=get_fireworks_api_base())
32
- if resolved:
33
- os.environ["FIREWORKS_ACCOUNT_ID"] = resolved
34
- return resolved
35
- return account_id
36
-
37
-
38
- def _extract_terminal_segment(resource_name: str) -> str:
39
- """Return the last path segment if a fully-qualified resource name is provided."""
40
- try:
41
- return resource_name.strip("/").split("/")[-1]
42
- except Exception:
43
- return resource_name
44
-
45
-
46
- def _print_links(evaluator_id: str, dataset_id: str, job_name: Optional[str]) -> None:
47
- api_base = get_fireworks_api_base()
48
- app_base = _map_api_host_to_app_host(api_base)
49
- print("\n📊 Dashboard Links:")
50
- evaluator_slug = _extract_terminal_segment(evaluator_id)
51
- print(f" Evaluator: {app_base}/dashboard/evaluators/{evaluator_slug}")
52
- if dataset_id:
53
- print(f" Dataset: {app_base}/dashboard/datasets/{dataset_id}")
54
- if job_name:
55
- # job_name likely like accounts/{account}/reinforcementFineTuningJobs/{id}
56
- try:
57
- job_id = job_name.strip().split("/")[-1]
58
- print(f" RFT Job: {app_base}/dashboard/fine-tuning/reinforcement/{job_id}")
59
- except Exception:
60
- pass
61
-
62
-
63
- def _auto_find_jsonl(cwd: str) -> Optional[str]:
64
- """Find a reasonable JSONL dataset file in the current project.
65
-
66
- Priority order:
67
- - dataset.jsonl in cwd
68
- - data/dataset.jsonl
69
- - first *.jsonl under cwd (depth-first, skipping common vendor/venv/build dirs)
70
- Returns a RELATIVE path from cwd if possible.
71
- """
72
- # Direct candidates
73
- direct_candidates = [
74
- os.path.join(cwd, "dataset.jsonl"),
75
- os.path.join(cwd, "data", "dataset.jsonl"),
76
- ]
77
- for p in direct_candidates:
78
- if os.path.isfile(p):
79
- try:
80
- return os.path.relpath(p, cwd)
81
- except Exception:
82
- return p
83
-
84
- # Walk and find any .jsonl
85
- skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
86
- for dirpath, dirnames, filenames in os.walk(cwd):
87
- # prune
88
- dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
89
- for name in sorted(filenames):
90
- if name.endswith(".jsonl"):
91
- candidate = os.path.join(dirpath, name)
92
- try:
93
- return os.path.relpath(candidate, cwd)
94
- except Exception:
95
- return candidate
96
- return None
20
+ from ..models import EvaluationRow
21
+ from .upload import upload_command
22
+ from .utils import (
23
+ _build_entry_point,
24
+ _build_trimmed_dataset_id,
25
+ _build_evaluator_dashboard_url,
26
+ _discover_and_select_tests,
27
+ _discover_tests,
28
+ _ensure_account_id,
29
+ _extract_terminal_segment,
30
+ _normalize_evaluator_id,
31
+ _print_links,
32
+ _resolve_selected_test,
33
+ )
34
+ from .local_test import run_evaluator_test
97
35
 
98
36
 
99
37
  def _extract_jsonl_from_dataloader(test_file_path: str, test_func_name: str) -> Optional[str]:
@@ -205,83 +143,23 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
205
143
  if isinstance(dataset_path, (list, tuple)) and len(dataset_path) > 0:
206
144
  dataset_path = dataset_path[0]
207
145
  if isinstance(dataset_path, str) and dataset_path:
146
+ candidate_paths = []
208
147
  if os.path.isabs(dataset_path):
209
- return dataset_path
210
- base_dir = os.path.dirname(os.path.abspath(test_file_path))
211
- resolved = os.path.abspath(os.path.join(base_dir, dataset_path))
212
- if os.path.isfile(resolved):
213
- return resolved
214
- # Try resolving from project root if relative to test file doesn't work
215
- if not os.path.isabs(dataset_path):
216
- # Try resolving from current working directory
217
- cwd_path = os.path.abspath(os.path.join(os.getcwd(), dataset_path))
218
- if os.path.isfile(cwd_path):
219
- return cwd_path
148
+ candidate_paths.append(dataset_path)
149
+ else:
150
+ base_dir = os.path.dirname(os.path.abspath(test_file_path))
151
+ candidate_paths.append(os.path.abspath(os.path.join(base_dir, dataset_path)))
152
+ # Also try resolving from current working directory
153
+ candidate_paths.append(os.path.abspath(os.path.join(os.getcwd(), dataset_path)))
154
+
155
+ for candidate in candidate_paths:
156
+ if os.path.isfile(candidate) and _validate_dataset_jsonl(candidate):
157
+ return candidate
220
158
  return None
221
159
  except Exception:
222
160
  return None
223
161
 
224
162
 
225
- def _build_trimmed_dataset_id(evaluator_id: str) -> str:
226
- """Build a dataset id derived from evaluator_id, trimmed to 63 chars.
227
-
228
- Format: <normalized-base>-dataset-YYYYMMDDHHMMSS, where base is trimmed to fit.
229
- """
230
- # Normalize base similarly to evaluator id rules
231
- from .upload import _normalize_evaluator_id # local import to avoid cycle at module import time
232
-
233
- base = _normalize_evaluator_id(evaluator_id)
234
- suffix = f"-dataset-{time.strftime('%Y%m%d%H%M%S')}"
235
- max_total = 63
236
- max_base_len = max_total - len(suffix)
237
- if max_base_len < 1:
238
- max_base_len = 1
239
- if len(base) > max_base_len:
240
- base = base[:max_base_len].rstrip("-")
241
- if not base:
242
- base = "dataset"
243
- # Ensure first char is a letter
244
- if not base:
245
- base = "dataset"
246
- if not base[0].isalpha():
247
- base = f"eval-{base}"
248
- if len(base) > max_base_len:
249
- base = base[:max_base_len]
250
- base = base.rstrip("-") or "dataset"
251
- return f"{base}{suffix}"
252
-
253
-
254
- def _resolve_selected_test(
255
- project_root: str,
256
- evaluator_id: Optional[str],
257
- selected_tests: Optional[list] = None,
258
- ) -> tuple[Optional[str], Optional[str]]:
259
- """
260
- Resolve a single test's source file path and function name to use downstream.
261
- Priority:
262
- 1) If selected_tests provided and length == 1, use it.
263
- 2) Else discover tests; if exactly one test, use it.
264
- 3) Else, if evaluator_id provided, match by normalized '<file-stem>-<func-name>'.
265
- Returns: (file_path, func_name) or (None, None) if unresolved.
266
- """
267
- try:
268
- tests = selected_tests if selected_tests is not None else _discover_tests(project_root)
269
- if not tests:
270
- return None, None
271
- if len(tests) == 1:
272
- return tests[0].file_path, tests[0].qualname.split(".")[-1]
273
- if evaluator_id:
274
- for t in tests:
275
- func_name = t.qualname.split(".")[-1]
276
- source_file_name = os.path.splitext(os.path.basename(t.file_path))[0]
277
- candidate = _normalize_evaluator_id(f"{source_file_name}-{func_name}")
278
- if candidate == evaluator_id:
279
- return t.file_path, func_name
280
- return None, None
281
- except Exception:
282
- return None, None
283
-
284
-
285
163
  def _poll_evaluator_status(
286
164
  evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
287
165
  ) -> bool:
@@ -343,45 +221,96 @@ def _poll_evaluator_status(
343
221
  return False
344
222
 
345
223
 
346
- def create_rft_command(args) -> int:
347
- evaluator_id: Optional[str] = getattr(args, "evaluator", None)
348
- non_interactive: bool = bool(getattr(args, "yes", False))
349
- dry_run: bool = bool(getattr(args, "dry_run", False))
350
- force: bool = bool(getattr(args, "force", False))
351
- # Track the specifically chosen test (if any) to aid dataset inference later
352
- selected_test_file_path: Optional[str] = None
353
- selected_test_func_name: Optional[str] = None
224
+ def _validate_dataset_jsonl(jsonl_path: str, sample_limit: int = 50) -> bool:
225
+ """Validate that a JSONL file contains rows compatible with EvaluationRow.
354
226
 
355
- api_key = get_fireworks_api_key()
356
- if not api_key:
357
- print("Error: FIREWORKS_API_KEY not set.")
358
- return 1
227
+ We stream up to `sample_limit` rows, ensuring each is JSON-decodable and can be
228
+ parsed by the EvaluationRow model. Returns True on success, False on any error.
229
+ """
230
+ try:
231
+ if not os.path.isfile(jsonl_path):
232
+ print(f"Error: dataset JSONL not found at path: {jsonl_path}")
233
+ return False
234
+
235
+ row_count = 0
236
+ with open(jsonl_path, "r", encoding="utf-8") as f:
237
+ for line in f:
238
+ line = line.strip()
239
+ if not line:
240
+ continue
241
+ try:
242
+ data = json.loads(line)
243
+ except json.JSONDecodeError as e:
244
+ print(f"Error: dataset JSONL contains invalid JSON (line {row_count + 1}): {e}")
245
+ return False
359
246
 
360
- account_id = _ensure_account_id()
361
- if not account_id:
362
- print("Error: FIREWORKS_ACCOUNT_ID not set and could not be resolved.")
363
- return 1
247
+ try:
248
+ EvaluationRow.model_validate(data)
249
+ except ValidationError as e:
250
+ print(f"Error: dataset JSONL row {row_count + 1} is not a valid EvaluationRow: {e}")
251
+ return False
364
252
 
365
- api_base = get_fireworks_api_base()
253
+ row_count += 1
254
+ if row_count >= sample_limit:
255
+ break
256
+
257
+ if row_count == 0:
258
+ print(f"Error: dataset JSONL at {jsonl_path} appears to be empty.")
259
+ return False
260
+
261
+ return True
262
+ except Exception as e:
263
+ print(f"Error validating dataset JSONL at {jsonl_path}: {e}")
264
+ return False
265
+
266
+
267
+ def _validate_dataset(dataset_jsonl: Optional[str]) -> bool:
268
+ """Validate dataset JSONL path when available; no-op when using dataset IDs only."""
269
+ if not dataset_jsonl:
270
+ return True
271
+ return _validate_dataset_jsonl(dataset_jsonl)
272
+
273
+
274
+ def _validate_evaluator_locally(
275
+ project_root: str,
276
+ selected_test_file: Optional[str],
277
+ selected_test_func: Optional[str],
278
+ ignore_docker: bool,
279
+ docker_build_extra: str,
280
+ docker_run_extra: str,
281
+ ) -> bool:
282
+ """Run pytest locally for the selected evaluation test to validate the evaluator."""
283
+ if not selected_test_file or not selected_test_func:
284
+ # No local test associated; skip validation but warn the user.
285
+ print("Warning: Could not resolve a local evaluation test for this evaluator; skipping local validation.")
286
+ return True
287
+
288
+ pytest_target = _build_entry_point(project_root, selected_test_file, selected_test_func)
289
+ exit_code = run_evaluator_test(
290
+ project_root=project_root,
291
+ pytest_target=pytest_target,
292
+ ignore_docker=ignore_docker,
293
+ docker_build_extra=docker_build_extra,
294
+ docker_run_extra=docker_run_extra,
295
+ )
296
+ return exit_code == 0
297
+
298
+
299
+ def _resolve_evaluator(
300
+ project_root: str,
301
+ evaluator_arg: Optional[str],
302
+ non_interactive: bool,
303
+ account_id: str,
304
+ ) -> tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
305
+ """Resolve evaluator id/resource and associated local test (file + func)."""
306
+ evaluator_id = evaluator_arg
307
+ selected_test_file_path: Optional[str] = None
308
+ selected_test_func_name: Optional[str] = None
366
309
 
367
- # Resolve evaluator id/entry if omitted (reuse upload's selector flow)
368
- project_root = os.getcwd()
369
310
  if not evaluator_id:
370
- print("Scanning for evaluation tests...")
371
- tests = _discover_tests(project_root)
372
- if not tests:
373
- print("No evaluation tests found.")
374
- print("\nHint: Make sure your tests use the @evaluation_test decorator.")
375
- return 1
376
- # Always interactive selection here
377
- try:
378
- selected_tests = _prompt_select(tests, non_interactive=non_interactive)
379
- except Exception:
380
- print("Error: Failed to open selector UI. Please pass --evaluator or --entry explicitly.")
381
- return 1
311
+ selected_tests = _discover_and_select_tests(project_root, non_interactive=non_interactive)
382
312
  if not selected_tests:
383
- print("No tests selected.")
384
- return 1
313
+ return None, None, None, None
385
314
  if len(selected_tests) != 1:
386
315
  if non_interactive and len(selected_tests) > 1:
387
316
  print("Error: Multiple evaluation tests found in --yes (non-interactive) mode.")
@@ -400,7 +329,8 @@ def create_rft_command(args) -> int:
400
329
  pass
401
330
  else:
402
331
  print("Error: Please select exactly one evaluation test for 'create rft'.")
403
- return 1
332
+ return None, None, None, None
333
+
404
334
  # Derive evaluator_id from user's single selection
405
335
  chosen = selected_tests[0]
406
336
  func_name = chosen.qualname.split(".")[-1]
@@ -410,129 +340,49 @@ def create_rft_command(args) -> int:
410
340
  selected_test_file_path, selected_test_func_name = _resolve_selected_test(
411
341
  project_root, evaluator_id, selected_tests=selected_tests
412
342
  )
413
- # Resolve evaluator resource name to fully-qualified format required by API.
414
- # Allow users to pass either short id or fully-qualified resource.
415
- if evaluator_id and evaluator_id.startswith("accounts/"):
416
- evaluator_resource_name = evaluator_id
417
- evaluator_id = _extract_terminal_segment(evaluator_id)
418
343
  else:
419
- evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
420
-
421
- # Optional short-circuit: if evaluator already exists and not forcing, skip upload path
422
- skip_upload = False
423
- if not force:
424
- try:
425
- headers = {
426
- "Authorization": f"Bearer {api_key}",
427
- "Content-Type": "application/json",
428
- "User-Agent": get_user_agent(),
429
- }
430
- resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
431
- if resp.ok:
432
- state = resp.json().get("state", "STATE_UNSPECIFIED")
433
- print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
434
- # Poll for ACTIVE before proceeding
435
- print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
436
- if not _poll_evaluator_status(
437
- evaluator_resource_name=evaluator_resource_name,
438
- api_key=api_key,
439
- api_base=api_base,
440
- timeout_minutes=10,
441
- ):
442
- app_base = _map_api_host_to_app_host(api_base)
443
- evaluator_slug = _extract_terminal_segment(evaluator_id)
444
- dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}"
445
- print("\n❌ Evaluator is not ready within the timeout period.")
446
- print(f"📊 Please check the evaluator status at: {dashboard_url}")
447
- print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
448
- return 1
449
- skip_upload = True
450
- # Populate selected test info for dataset inference later
451
- st_path, st_func = _resolve_selected_test(project_root, evaluator_id)
452
- if st_path and st_func:
453
- selected_test_file_path = st_path
454
- selected_test_func_name = st_func
455
- except requests.exceptions.RequestException:
456
- pass
457
-
458
- # Ensure evaluator exists by invoking the upload flow programmatically
459
- if not skip_upload:
460
- try:
461
- from .upload import upload_command
462
-
463
- tests = _discover_tests(project_root)
464
- selected_entry: Optional[str] = None
465
- st_path, st_func = _resolve_selected_test(project_root, evaluator_id, selected_tests=tests)
466
- if st_path and st_func:
467
- abs_path = os.path.abspath(st_path)
468
- try:
469
- rel = os.path.relpath(abs_path, project_root)
470
- except Exception:
471
- rel = abs_path
472
- selected_entry = f"{rel}::{st_func}"
473
- selected_test_file_path = st_path
474
- selected_test_func_name = st_func
475
- # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
476
- if selected_entry is None and len(tests) > 1:
477
- print(
478
- f"Error: Multiple evaluation tests found, and the selected evaluator {evaluator_id} does not match any discovered test.\n"
479
- " Please re-run specifying the evaluator.\n"
480
- " Hints:\n"
481
- " - eval-protocol create rft --evaluator <existing-evaluator-id>\n"
482
- )
483
- return 1
344
+ # Caller provided an evaluator id or fully-qualified resource; try to resolve local test
345
+ short_id = evaluator_id
346
+ if evaluator_id.startswith("accounts/"):
347
+ short_id = _extract_terminal_segment(evaluator_id)
348
+ st_path, st_func = _resolve_selected_test(project_root, short_id)
349
+ if st_path and st_func:
350
+ selected_test_file_path = st_path
351
+ selected_test_func_name = st_func
352
+ evaluator_id = short_id
484
353
 
485
- upload_args = argparse.Namespace(
486
- path=project_root,
487
- entry=selected_entry,
488
- id=evaluator_id,
489
- display_name=None,
490
- description=None,
491
- force=force, # Pass through the --force flag
492
- yes=True,
493
- env_file=None, # Add the new env_file parameter
494
- )
495
-
496
- if force:
497
- print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")
498
-
499
- rc = upload_command(upload_args)
500
- if rc == 0:
501
- print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
354
+ if not evaluator_id:
355
+ return None, None, None, None
502
356
 
503
- # Poll for evaluator status
504
- print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
505
- is_active = _poll_evaluator_status(
506
- evaluator_resource_name=evaluator_resource_name,
507
- api_key=api_key,
508
- api_base=api_base,
509
- timeout_minutes=10,
510
- )
357
+ # Resolve evaluator resource name to fully-qualified format required by API.
358
+ if evaluator_arg and evaluator_arg.startswith("accounts/"):
359
+ evaluator_resource_name = evaluator_arg
360
+ else:
361
+ evaluator_resource_name = f"accounts/{account_id}/evaluators/{evaluator_id}"
511
362
 
512
- if not is_active:
513
- # Print helpful message with dashboard link
514
- app_base = _map_api_host_to_app_host(api_base)
515
- evaluator_slug = _extract_terminal_segment(evaluator_id)
516
- dashboard_url = f"{app_base}/dashboard/evaluators/{evaluator_slug}"
363
+ return evaluator_id, evaluator_resource_name, selected_test_file_path, selected_test_func_name
517
364
 
518
- print("\n❌ Evaluator is not ready within the timeout period.")
519
- print(f"📊 Please check the evaluator status at: {dashboard_url}")
520
- print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
521
- return 1
522
- else:
523
- # Evaluator ACTIVE; proceed
524
- pass
525
- else:
526
- print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
527
- except Exception as e:
528
- print(f"Warning: Failed to upload evaluator automatically: {e}")
529
365
 
530
- # Determine dataset id and materialization path
366
+ def _resolve_dataset(
367
+ project_root: str,
368
+ account_id: str,
369
+ evaluator_id: str,
370
+ args: argparse.Namespace,
371
+ selected_test_file_path: Optional[str],
372
+ selected_test_func_name: Optional[str],
373
+ ) -> tuple[Optional[str], Optional[str], Optional[str]]:
374
+ """Resolve dataset source without performing any uploads.
375
+
376
+ Returns a tuple of:
377
+ - dataset_id: existing dataset id when using --dataset or fully-qualified dataset resource
378
+ - dataset_resource: fully-qualified dataset resource for existing datasets; None for JSONL sources
379
+ - dataset_jsonl: local JSONL path when using --dataset-jsonl or inferred sources; None for id-only datasets
380
+ """
531
381
  dataset_id = getattr(args, "dataset", None)
532
382
  dataset_jsonl = getattr(args, "dataset_jsonl", None)
533
383
  dataset_display_name = getattr(args, "dataset_display_name", None)
534
- dataset_builder = getattr(args, "dataset_builder", None) # accepted but unused in simplified flow
535
384
  dataset_resource_override: Optional[str] = None
385
+
536
386
  if isinstance(dataset_id, str) and dataset_id.startswith("accounts/"):
537
387
  # Caller passed a fully-qualified dataset; capture it for body and keep only terminal id for printing
538
388
  dataset_resource_override = dataset_id
@@ -553,23 +403,21 @@ def create_rft_command(args) -> int:
553
403
  test_file_for_infer = tests[0].file_path
554
404
  func_for_infer = tests[0].qualname.split(".")[-1]
555
405
  if test_file_for_infer and func_for_infer:
556
- # Try data_loaders first
406
+ # Block using data loaders as a dataset source
557
407
  dataset_jsonl = _extract_jsonl_from_dataloader(test_file_for_infer, func_for_infer)
408
+ if dataset_jsonl:
409
+ print(
410
+ "Error: Evaluation tests that use 'data_loaders' to provide a dataset JSONL are not supported for 'create rft'.\n"
411
+ " Please switch to a JSONL-based dataset via input_dataset arg in @evaluation_test decorator."
412
+ )
413
+ return None, None, None
414
+ dataset_jsonl = _extract_jsonl_from_input_dataset(test_file_for_infer, func_for_infer)
558
415
  if dataset_jsonl:
559
416
  try:
560
417
  rel = os.path.relpath(dataset_jsonl, project_root)
561
418
  except Exception:
562
419
  rel = dataset_jsonl
563
- print(f"✓ Using JSONL from data loader: {rel}")
564
- if not dataset_jsonl:
565
- # Fall back to input_dataset (dataset_path)
566
- dataset_jsonl = _extract_jsonl_from_input_dataset(test_file_for_infer, func_for_infer)
567
- if dataset_jsonl:
568
- try:
569
- rel = os.path.relpath(dataset_jsonl, project_root)
570
- except Exception:
571
- rel = dataset_jsonl
572
- print(f"✓ Using JSONL from input_dataset: {rel}")
420
+ print(f"✓ Using JSONL from input_dataset: {rel}")
573
421
  if not dataset_jsonl:
574
422
  # Last resort: attempt to detect and run a dataset builder in the test's directory
575
423
  metric_dir = os.path.dirname(test_file_for_infer)
@@ -585,33 +433,182 @@ def create_rft_command(args) -> int:
585
433
  print(
586
434
  "Error: Could not determine dataset. Provide --dataset or --dataset-jsonl, or ensure a JSONL-based data loader or input_dataset is used in your single discovered test."
587
435
  )
588
- return 1
436
+ return None, None, None
589
437
 
590
- inferred_dataset_id = _build_trimmed_dataset_id(evaluator_id)
591
- if dry_run:
592
- print("--dry-run: would create dataset and upload JSONL")
593
- dataset_id = inferred_dataset_id
594
- else:
595
- try:
596
- # Resolve dataset_jsonl path relative to CWD if needed
597
- jsonl_path_for_upload = (
598
- dataset_jsonl
599
- if os.path.isabs(dataset_jsonl)
600
- else os.path.abspath(os.path.join(project_root, dataset_jsonl))
601
- )
602
- dataset_id, _ = create_dataset_from_jsonl(
603
- account_id=account_id,
438
+ # Build dataset resource for existing datasets; JSONL-based datasets will be uploaded later.
439
+ dataset_resource = None
440
+ if dataset_id:
441
+ dataset_resource = dataset_resource_override or f"accounts/{account_id}/datasets/{dataset_id}"
442
+
443
+ return dataset_id, dataset_resource, dataset_jsonl
444
+
445
+
446
+ def _upload_dataset(
447
+ project_root: str,
448
+ account_id: str,
449
+ api_key: str,
450
+ api_base: str,
451
+ evaluator_id: str,
452
+ dataset_id: Optional[str],
453
+ dataset_resource: Optional[str],
454
+ dataset_jsonl: Optional[str],
455
+ args: argparse.Namespace,
456
+ dry_run: bool,
457
+ ) -> tuple[Optional[str], Optional[str]]:
458
+ """Create/upload the dataset when using a local JSONL source.
459
+
460
+ For existing datasets (--dataset or fully-qualified ids), this is a no-op that
461
+ simply ensures dataset_id and dataset_resource are populated.
462
+ """
463
+ # Existing dataset case: nothing to upload
464
+ if not dataset_jsonl:
465
+ if not dataset_id:
466
+ return None, None
467
+ if not dataset_resource:
468
+ dataset_resource = f"accounts/{account_id}/datasets/{dataset_id}"
469
+ return dataset_id, dataset_resource
470
+
471
+ # JSONL-based dataset: upload or simulate upload
472
+ inferred_dataset_id = _build_trimmed_dataset_id(evaluator_id)
473
+ dataset_display_name = getattr(args, "dataset_display_name", None) or inferred_dataset_id
474
+
475
+ # Resolve dataset_jsonl path relative to CWD if needed
476
+ jsonl_path_for_upload = (
477
+ dataset_jsonl if os.path.isabs(dataset_jsonl) else os.path.abspath(os.path.join(project_root, dataset_jsonl))
478
+ )
479
+
480
+ if dry_run:
481
+ print("--dry-run: would create dataset and upload JSONL")
482
+ dataset_id = inferred_dataset_id
483
+ dataset_resource = f"accounts/{account_id}/datasets/{dataset_id}"
484
+ return dataset_id, dataset_resource
485
+
486
+ try:
487
+ dataset_id, _ = create_dataset_from_jsonl(
488
+ account_id=account_id,
489
+ api_key=api_key,
490
+ api_base=api_base,
491
+ dataset_id=inferred_dataset_id,
492
+ display_name=dataset_display_name,
493
+ jsonl_path=jsonl_path_for_upload,
494
+ )
495
+ print(f"✓ Created and uploaded dataset: {dataset_id}")
496
+ dataset_resource = f"accounts/{account_id}/datasets/{dataset_id}"
497
+ return dataset_id, dataset_resource
498
+ except Exception as e:
499
+ print(f"Error creating/uploading dataset: {e}")
500
+ return None, None
501
+
502
+
503
def _wait_for_evaluator_active(
    evaluator_id: str,
    evaluator_resource_name: str,
    api_key: str,
    api_base: str,
) -> bool:
    """Poll until the evaluator is ACTIVE; on timeout print where to check its status."""
    print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
    if _poll_evaluator_status(
        evaluator_resource_name=evaluator_resource_name,
        api_key=api_key,
        api_base=api_base,
        timeout_minutes=10,
    ):
        return True

    dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
    print("\n❌ Evaluator is not ready within the timeout period.")
    print(f"📊 Please check the evaluator status at: {dashboard_url}")
    print(" Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
    return False


def _upload_and_ensure_evaluator(
    project_root: str,
    evaluator_id: str,
    evaluator_resource_name: str,
    api_key: str,
    api_base: str,
    force: bool,
) -> bool:
    """Ensure the evaluator exists and is ACTIVE, uploading it if needed.

    Unless ``force`` is set, the evaluator is first looked up via the API; if it
    already exists the upload is skipped and the function only waits for it to
    become ACTIVE. Otherwise the evaluator is uploaded through ``upload_command``
    and then polled.

    Args:
        project_root: Directory scanned for evaluation tests.
        evaluator_id: Short evaluator id used for upload and messages.
        evaluator_resource_name: Fully-qualified resource name used for API calls.
        api_key: API key sent as a bearer token.
        api_base: Base URL of the API.
        force: Skip the existence check and pass ``force`` through to the upload.

    Returns:
        True when the evaluator is ACTIVE. False when it did not become ACTIVE
        in time, the upload failed, or no matching test could be selected among
        several; the caller treats False as fatal.
    """
    # Optional short-circuit: if evaluator already exists and not forcing, skip upload path
    if not force:
        try:
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
                "User-Agent": get_user_agent(),
            }
            resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
            if resp.ok:
                state = resp.json().get("state", "STATE_UNSPECIFIED")
                print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
                return _wait_for_evaluator_active(evaluator_id, evaluator_resource_name, api_key, api_base)
        except requests.exceptions.RequestException:
            # Deliberate best-effort: if the lookup fails, fall through to the upload path.
            pass

    # Ensure evaluator exists by invoking the upload flow programmatically
    try:
        tests = _discover_tests(project_root)
        selected_entry: Optional[str] = None
        st_path, st_func = _resolve_selected_test(project_root, evaluator_id, selected_tests=tests)
        if st_path and st_func:
            selected_entry = _build_entry_point(project_root, st_path, st_func)
        # If still unresolved and multiple tests exist, fail fast to avoid uploading unintended evaluators
        if selected_entry is None and len(tests) > 1:
            print(
                f"Error: Multiple evaluation tests found, and the selected evaluator {evaluator_id} does not match any discovered test.\n"
                " Please re-run specifying the evaluator.\n"
                " Hints:\n"
                " - eval-protocol create rft --evaluator <existing-evaluator-id>\n"
            )
            return False

        upload_args = argparse.Namespace(
            path=project_root,
            entry=selected_entry,
            id=evaluator_id,
            display_name=None,
            description=None,
            force=force,  # Pass through the --force flag
            yes=True,
            env_file=None,
        )

        if force:
            print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")

        rc = upload_command(upload_args)
        if rc != 0:
            # Returning False makes the caller abort, so the message must not
            # claim that RFT creation proceeds.
            print("Error: Evaluator upload did not complete successfully; aborting RFT creation.")
            return False

        print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
        return _wait_for_evaluator_active(evaluator_id, evaluator_resource_name, api_key, api_base)
    except Exception as e:
        # Also fatal for the caller, hence "Error" rather than "Warning".
        print(f"Error: Failed to upload evaluator automatically: {e}")
        return False
598
+
599
+
600
+ def _create_rft_job(
601
+ account_id: str,
602
+ api_key: str,
603
+ api_base: str,
604
+ evaluator_id: str,
605
+ evaluator_resource_name: str,
606
+ dataset_id: str,
607
+ dataset_resource: str,
608
+ args: argparse.Namespace,
609
+ dry_run: bool,
610
+ ) -> int:
611
+ """Build and submit the RFT job request."""
615
612
  # Build training config/body
616
613
  # Exactly one of base-model or warm-start-from must be provided
617
614
  base_model_raw = getattr(args, "base_model", None)
@@ -682,9 +679,6 @@ def create_rft_command(args) -> int:
682
679
  "runId": getattr(args, "wandb_run_id", None),
683
680
  }
684
681
 
685
- # Build dataset resource (prefer override when provided)
686
- dataset_resource = dataset_resource_override or f"accounts/{account_id}/datasets/{dataset_id}"
687
-
688
682
  body: Dict[str, Any] = {
689
683
  "displayName": getattr(args, "display_name", None),
690
684
  "dataset": dataset_resource,
@@ -732,3 +726,107 @@ def create_rft_command(args) -> int:
732
726
  except Exception as e:
733
727
  print(f"Error creating RFT job: {e}")
734
728
  return 1
729
+
730
+
731
def create_rft_command(args) -> int:
    """Run the 'create rft' flow and return a process exit code.

    The steps run in order: resolve credentials, resolve the evaluator and its
    local test, resolve the dataset source, optionally validate both locally,
    upload the dataset if it comes from a JSONL file, ensure the evaluator is
    uploaded and ACTIVE, and finally submit the RFT job. Any failing step ends
    the flow with exit code 1; otherwise the result of ``_create_rft_job`` is
    returned.
    """
    # Pre-flight: credentials are required by every later step.
    api_key = get_fireworks_api_key()
    if not api_key:
        print("Error: FIREWORKS_API_KEY not set.")
        return 1

    account_id = _ensure_account_id()
    if not account_id:
        print("Error: FIREWORKS_ACCOUNT_ID not set and could not be resolved.")
        return 1

    api_base = get_fireworks_api_base()
    project_root = os.getcwd()

    def _flag(name: str) -> bool:
        # Missing attributes count as "off".
        return bool(getattr(args, name, False))

    def _text(name: str) -> str:
        # Missing or None attributes become an empty string.
        return getattr(args, name, "") or ""

    evaluator_arg: Optional[str] = getattr(args, "evaluator", None)
    non_interactive = _flag("yes")
    dry_run = _flag("dry_run")
    force = _flag("force")
    skip_validation = _flag("skip_validation")
    ignore_docker = _flag("ignore_docker")
    docker_build_extra = _text("docker_build_extra")
    docker_run_extra = _text("docker_run_extra")

    # 1) Evaluator plus the local test it maps to (if any).
    evaluator_id, evaluator_resource_name, test_file, test_func = _resolve_evaluator(
        project_root, evaluator_arg, non_interactive, account_id
    )
    if not (evaluator_id and evaluator_resource_name):
        return 1

    # 2) Dataset source: an existing dataset id or a local JSONL path.
    dataset_id, dataset_resource, dataset_jsonl = _resolve_dataset(
        project_root=project_root,
        account_id=account_id,
        evaluator_id=evaluator_id,
        args=args,
        selected_test_file_path=test_file,
        selected_test_func_name=test_func,
    )
    if dataset_jsonl is None and not dataset_id:
        return 1

    # 3) Local validation unless explicitly skipped; the dataset is checked
    #    first and the evaluator test only runs if the dataset passed.
    if not skip_validation:
        validated = _validate_dataset(dataset_jsonl) and _validate_evaluator_locally(
            project_root=project_root,
            selected_test_file=test_file,
            selected_test_func=test_func,
            ignore_docker=ignore_docker,
            docker_build_extra=docker_build_extra,
            docker_run_extra=docker_run_extra,
        )
        if not validated:
            return 1

    # 4) Upload the JSONL dataset (pass-through for existing datasets).
    dataset_id, dataset_resource = _upload_dataset(
        project_root=project_root,
        account_id=account_id,
        api_key=api_key,
        api_base=api_base,
        evaluator_id=evaluator_id,
        dataset_id=dataset_id,
        dataset_resource=dataset_resource,
        dataset_jsonl=dataset_jsonl,
        args=args,
        dry_run=dry_run,
    )
    if not (dataset_id and dataset_resource):
        return 1

    # 5) The evaluator must exist and be ACTIVE before a job can reference it.
    evaluator_ready = _upload_and_ensure_evaluator(
        project_root=project_root,
        evaluator_id=evaluator_id,
        evaluator_resource_name=evaluator_resource_name,
        api_key=api_key,
        api_base=api_base,
        force=force,
    )
    if not evaluator_ready:
        return 1

    # 6) Submit the job; its exit code is the command's exit code.
    return _create_rft_job(
        account_id=account_id,
        api_key=api_key,
        api_base=api_base,
        evaluator_id=evaluator_id,
        evaluator_resource_name=evaluator_resource_name,
        dataset_id=dataset_id,
        dataset_resource=dataset_resource,
        args=args,
        dry_run=dry_run,
    )