langwatch 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,108 @@
+ """
+ langwatch.experiment - Run experiments on LangWatch platform or via SDK.
+
+ This module provides two ways to run experiments:
+
+ 1. Platform experiments (CI/CD):
+    Run experiments configured in the LangWatch platform UI.
+
+    ```python
+    import langwatch
+
+    result = langwatch.experiment.run("my-experiment-slug")
+    result.print_summary()
+    ```
+
+ 2. SDK-defined experiments:
+    Define and run experiments programmatically.
+
+    ```python
+    import langwatch
+
+    experiment = langwatch.experiment.init("my-experiment")
+
+    for index, row in experiment.loop(df.iterrows(), threads=4):
+        async def task(index, row):
+            result = await my_agent(row["input"])
+            experiment.evaluate(
+                "langevals/exact_match",
+                index=index,
+                data={"output": result, "expected_output": row["expected"]},
+                settings={},
+            )
+        experiment.submit(task, index, row)
+    ```
+ """
+ from typing import Optional
+
+ # Re-export the Experiment class for SDK-defined experiments
+ from langwatch.experiment.experiment import Experiment
+
+ # Re-export the platform run function and related types
+ from langwatch.experiment.platform_run import (
+     run,
+     ExperimentRunResult,
+     ExperimentRunSummary,
+     ExperimentNotFoundError,
+     ExperimentTimeoutError,
+     ExperimentRunFailedError,
+     ExperimentsApiError,
+     TargetStats,
+     EvaluatorStats,
+ )
+
+
+ def init(name: str, *, run_id: Optional[str] = None) -> Experiment:
+     """
+     Initialize an SDK-defined experiment.
+
+     This creates an Experiment instance that you can use to run evaluators
+     programmatically using datasets and custom logic.
+
+     Args:
+         name: Name for this experiment run
+         run_id: Optional custom run ID (auto-generated if not provided)
+
+     Returns:
+         Experiment instance with methods:
+         - loop(): Iterate over dataset rows with parallel execution
+         - evaluate(): Run an evaluator on the current row
+         - log(): Log custom metrics
+         - submit(): Submit async tasks
+
+     Example:
+         ```python
+         import langwatch
+
+         experiment = langwatch.experiment.init("my-experiment")
+
+         for index, row in experiment.loop(df.iterrows(), threads=4):
+             async def task(index, row):
+                 result = await my_agent(row["input"])
+                 experiment.evaluate(
+                     "langevals/exact_match",
+                     index=index,
+                     data={"output": result, "expected_output": row["expected"]},
+                     settings={},
+                 )
+             experiment.submit(task, index, row)
+         ```
+     """
+     experiment = Experiment(name, run_id=run_id)
+     experiment.init()
+     return experiment
+
+
+ __all__ = [
+     "init",
+     "run",
+     "Experiment",
+     "ExperimentRunResult",
+     "ExperimentRunSummary",
+     "ExperimentNotFoundError",
+     "ExperimentTimeoutError",
+     "ExperimentRunFailedError",
+     "ExperimentsApiError",
+     "TargetStats",
+     "EvaluatorStats",
+ ]
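For CI scripts, the exception types re-exported above make the failure modes explicit. Below is a minimal sketch built only on what this module exports (`run`, `print_summary()`, and the exception classes); the exit codes chosen here are illustrative, not SDK behavior.

```python
import sys

import langwatch
from langwatch.experiment import (
    ExperimentNotFoundError,
    ExperimentRunFailedError,
    ExperimentTimeoutError,
)

try:
    # Start the platform-configured experiment and block until it finishes
    result = langwatch.experiment.run("my-experiment-slug")
except ExperimentNotFoundError as e:
    print(f"No experiment with slug '{e.slug}' on the platform")
    sys.exit(2)  # illustrative exit code for configuration errors
except (ExperimentTimeoutError, ExperimentRunFailedError) as e:
    print(f"Experiment did not complete: {e}")
    sys.exit(2)

# Documented above to print results and exit with code 1 on failure
result.print_summary()
```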
@@ -135,7 +135,7 @@ class IterationInfo(TypedDict):
      error: Optional[Exception]
 
 
- class Evaluation:
+ class Experiment:
      _executor: ThreadPoolExecutor
      _futures: List[Future[Any]]
      _current_index: int
@@ -255,7 +255,7 @@ class Evaluation:
              progress_bar.close()
 
          except Exception as e:
-             Evaluation._log_results(
+             Experiment._log_results(
                  langwatch.get_api_key() or "",
                  {
                      "experiment_slug": self.experiment_slug,
@@ -456,7 +456,7 @@ class Evaluation:
 
          # Start a new thread to send the batch
          thread = threading.Thread(
-             target=Evaluation._log_results,
+             target=Experiment._log_results,
              args=(langwatch.get_api_key(), body),
          )
          thread.start()
@@ -485,7 +485,7 @@ class Evaluation:
          better_raise_for_status(response)
 
      def _wait_for_completion(self):
-         async def wait_for_completion(self: Evaluation):
+         async def wait_for_completion(self: Experiment):
              # Send any remaining batch
              self._send_batch(finished=True)
 
@@ -837,7 +837,7 @@ class Evaluation:
          with self.lock:
              self.batch["evaluations"].append(eval)
 
-     def run(
+     def evaluate(
          self,
          evaluator_id: str,
         index: Union[int, Hashable],
@@ -846,6 +846,17 @@ class Evaluation:
          name: Optional[str] = None,
          as_guardrail: bool = False,
      ):
+         """
+         Run an evaluator on the current row.
+
+         Args:
+             evaluator_id: The evaluator type/slug (e.g., "langevals/exact_match", "ragas/faithfulness")
+             index: The row index for this evaluation
+             data: Data to pass to the evaluator (e.g., {"input": ..., "output": ..., "expected_output": ...})
+             settings: Evaluator-specific settings
+             name: Optional display name for the evaluation (defaults to evaluator_id)
+             as_guardrail: Whether to run as a guardrail (stricter pass/fail)
+         """
          duration: Optional[int] = None
 
          start_time = time.time()
@@ -871,3 +882,31 @@ class Evaluation:
              duration=duration,
              cost=result.cost,
          )
+
+     def run(
+         self,
+         evaluator_id: str,
+         index: Union[int, Hashable],
+         data: Dict[str, Any],
+         settings: Dict[str, Any],
+         name: Optional[str] = None,
+         as_guardrail: bool = False,
+     ):
+         """
+         Deprecated: Use `evaluate()` instead.
+         """
+         import warnings
+
+         warnings.warn(
+             "evaluation.run() is deprecated, use evaluation.evaluate() instead",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         return self.evaluate(
+             evaluator_id=evaluator_id,
+             index=index,
+             data=data,
+             settings=settings,
+             name=name,
+             as_guardrail=as_guardrail,
+         )
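The hunks above rename the SDK class from `Evaluation` to `Experiment` and its per-row method from `run()` to `evaluate()`, keeping `run()` as a deprecated alias that forwards to `evaluate()` with identical arguments. A migration sketch following the loop pattern from the module docstring, assuming pandas is available; the dataset and `my_agent` are placeholders.

```python
import langwatch
import pandas as pd

# Illustrative dataset and agent; swap in your own.
df = pd.DataFrame([{"input": "What is the answer?", "expected": "42"}])

async def my_agent(question: str) -> str:
    return "42"

experiment = langwatch.experiment.init("migration-example")

for index, row in experiment.loop(df.iterrows(), threads=2):
    async def task(index, row):
        output = await my_agent(row["input"])
        # 0.9.x name: experiment.run(...) -- still accepted, but now emits a
        # DeprecationWarning and forwards to evaluate() with the same arguments.
        experiment.evaluate(
            "langevals/exact_match",
            index=index,
            data={"output": output, "expected_output": row["expected"]},
            settings={},
        )
    experiment.submit(task, index, row)
```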
@@ -1,5 +1,5 @@
  """
- Runner for platform-configured evaluations (Evaluations V3).
+ Runner for platform-configured experiments (Experiments Workbench).
 
  This module provides the `run()` function to execute evaluations that are
  configured in the LangWatch platform from CI/CD pipelines or scripts.
@@ -35,16 +35,16 @@ def _replace_url_domain(url: str, new_base: str) -> str:
      ))
 
 
- class EvaluationNotFoundError(Exception):
-     """Raised when evaluation slug doesn't exist."""
+ class ExperimentNotFoundError(Exception):
+     """Raised when experiment slug doesn't exist."""
 
      def __init__(self, slug: str):
          self.slug = slug
          super().__init__(f"Evaluation not found: {slug}")
 
 
- class EvaluationTimeoutError(Exception):
-     """Raised when evaluation run times out."""
+ class ExperimentTimeoutError(Exception):
+     """Raised when experiment run times out."""
 
      def __init__(self, run_id: str, progress: int, total: int):
          self.run_id = run_id
@@ -55,8 +55,8 @@ class EvaluationTimeoutError(Exception):
          )
 
 
- class EvaluationRunFailedError(Exception):
-     """Raised when evaluation run fails."""
+ class ExperimentRunFailedError(Exception):
+     """Raised when experiment run fails."""
 
      def __init__(self, run_id: str, error: str):
          self.run_id = run_id
@@ -64,7 +64,7 @@ class EvaluationRunFailedError(Exception):
          super().__init__(f"Evaluation run failed: {error}")
 
 
- class EvaluationsApiError(Exception):
+ class ExperimentsApiError(Exception):
      """Raised for other API errors."""
 
      def __init__(self, message: str, status_code: int):
@@ -97,8 +97,8 @@ class EvaluatorStats:
 
 
  @dataclass
- class EvaluationRunSummary:
-     """Summary of a completed evaluation run."""
+ class ExperimentRunSummary:
+     """Summary of a completed experiment run."""
 
      run_id: str
      total_cells: int
@@ -115,7 +115,7 @@ class EvaluationRunSummary:
 
 
  @dataclass
- class EvaluationRunResult:
+ class ExperimentRunResult:
      """Result of running a platform evaluation."""
 
      run_id: str
@@ -125,7 +125,7 @@ class EvaluationRunResult:
      pass_rate: float
      duration: int
      run_url: str
-     summary: EvaluationRunSummary
+     summary: ExperimentRunSummary
 
      def print_summary(self, exit_on_failure: Optional[bool] = None) -> None:
          """
@@ -161,42 +161,42 @@ def _is_notebook() -> bool:
      return False
 
 
- def evaluate(
+ def run(
      slug: str,
      *,
      poll_interval: float = 2.0,
      timeout: float = 600.0,
      on_progress: Optional[Callable[[int, int], None]] = None,
      api_key: Optional[str] = None,
- ) -> EvaluationRunResult:
+ ) -> ExperimentRunResult:
      """
-     Run a platform-configured evaluation and wait for completion.
+     Run a platform-configured experiment and wait for completion.
 
-     This runs an Evaluation that you have configured in the LangWatch platform.
-     The evaluation will execute all targets and evaluators defined in the configuration.
+     This runs an Experiment that you have configured in the LangWatch platform.
+     The experiment will execute all targets and evaluators defined in the configuration.
 
      Args:
-         slug: The slug of the evaluation to run (found in the evaluation URL)
+         slug: The slug of the experiment to run (found in the experiment URL)
          poll_interval: Seconds between status checks (default: 2.0)
          timeout: Maximum seconds to wait for completion (default: 600.0 = 10 minutes)
          on_progress: Optional callback for progress updates (completed, total)
          api_key: Optional API key override (uses LANGWATCH_API_KEY env var by default)
 
      Returns:
-         EvaluationRunResult with pass rate and summary. Call result.print_summary()
+         ExperimentRunResult with pass rate and summary. Call result.print_summary()
          to display results and exit with code 1 on failure.
 
      Raises:
-         EvaluationNotFoundError: If the evaluation slug doesn't exist
-         EvaluationTimeoutError: If the evaluation doesn't complete within timeout
-         EvaluationRunFailedError: If the evaluation fails
-         EvaluationsApiError: For other API errors
+         ExperimentNotFoundError: If the experiment slug doesn't exist
+         ExperimentTimeoutError: If the experiment doesn't complete within timeout
+         ExperimentRunFailedError: If the experiment fails
+         ExperimentsApiError: For other API errors
 
      Example:
          ```python
         import langwatch
 
-         result = langwatch.evaluation.evaluate("my-evaluation-slug")
+         result = langwatch.experiment.run("my-experiment-slug")
          result.print_summary()
          ```
      """
@@ -219,7 +219,7 @@ def evaluate(
      api_run_url = start_response.get("runUrl", "")
      run_url = _replace_url_domain(api_run_url, endpoint) if api_run_url else ""
 
-     print(f"Started evaluation run: {run_id}")
+     print(f"Started experiment run: {run_id}")
      if run_url:
          print(f"Follow live: {run_url}")
 
@@ -238,7 +238,7 @@ def evaluate(
          if time.time() - start_time > timeout:
              print()  # Newline after progress
              status = _get_run_status(run_id, endpoint, effective_api_key)
-             raise EvaluationTimeoutError(
+             raise ExperimentTimeoutError(
                  run_id, status.get("progress", 0), status.get("total", 0)
              )
 
@@ -267,7 +267,7 @@ def evaluate(
 
          if run_status == "failed":
              print()  # Newline after progress
-             raise EvaluationRunFailedError(
+             raise ExperimentRunFailedError(
                  run_id, status.get("error", "Unknown error")
              )
 
@@ -278,7 +278,7 @@ def evaluate(
 
 
  def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
-     """Start an evaluation run."""
+     """Start an experiment run."""
      with httpx.Client(timeout=60) as client:
          response = client.post(
              f"{endpoint}/api/evaluations/v3/{slug}/run",
@@ -286,12 +286,12 @@ def _start_run(slug: str, endpoint: str, api_key: str) -> dict:
          )
 
          if response.status_code == 404:
-             raise EvaluationNotFoundError(slug)
+             raise ExperimentNotFoundError(slug)
          if response.status_code == 401:
-             raise EvaluationsApiError("Unauthorized - check your API key", 401)
+             raise ExperimentsApiError("Unauthorized - check your API key", 401)
          if not response.is_success:
              error_body = response.json() if response.content else {}
-             raise EvaluationsApiError(
+             raise ExperimentsApiError(
                  error_body.get("error", f"Failed to start evaluation: {response.status_code}"),
                  response.status_code,
              )
@@ -308,12 +308,12 @@ def _get_run_status(run_id: str, endpoint: str, api_key: str) -> dict:
          )
 
          if response.status_code == 404:
-             raise EvaluationsApiError(f"Run not found: {run_id}", 404)
+             raise ExperimentsApiError(f"Run not found: {run_id}", 404)
          if response.status_code == 401:
-             raise EvaluationsApiError("Unauthorized - check your API key", 401)
+             raise ExperimentsApiError("Unauthorized - check your API key", 401)
          if not response.is_success:
              error_body = response.json() if response.content else {}
-             raise EvaluationsApiError(
+             raise ExperimentsApiError(
                  error_body.get("error", f"Failed to get run status: {response.status_code}"),
                  response.status_code,
              )
@@ -326,7 +326,7 @@ def _build_result(
      status: Literal["completed", "failed", "stopped"],
      summary_data: dict,
      run_url: str,
- ) -> EvaluationRunResult:
+ ) -> ExperimentRunResult:
      """Build the result object from API response."""
      total_cells = summary_data.get("totalCells", 0)
      completed_cells = summary_data.get("completedCells", 0)
@@ -368,7 +368,7 @@ def _build_result(
          )
      )
 
-     summary = EvaluationRunSummary(
+     summary = ExperimentRunSummary(
          run_id=run_id,
          total_cells=total_cells,
          completed_cells=completed_cells,
@@ -383,7 +383,7 @@ def _build_result(
          total_cost=summary_data.get("totalCost", 0),
      )
 
-     return EvaluationRunResult(
+     return ExperimentRunResult(
          run_id=run_id,
          status=status,
          passed=total_passed,
@@ -395,12 +395,12 @@ def _build_result(
      )
 
 
- def _print_summary(result: EvaluationRunResult) -> None:
-     """Print a CI-friendly summary of the evaluation results."""
+ def _print_summary(result: ExperimentRunResult) -> None:
+     """Print a CI-friendly summary of the experiment results."""
      summary = result.summary
 
      print("\n" + "═" * 60)
-     print(" EVALUATION RESULTS")
+     print(" EXPERIMENT RESULTS")
      print("═" * 60)
      print(f" Run ID: {result.run_id}")
      print(f" Status: {result.status.upper()}")
@@ -433,30 +433,3 @@ def _print_summary(result: EvaluationRunResult) -> None:
      print("═" * 60 + "\n")
 
 
- def run(
-     slug: str,
-     *,
-     poll_interval: float = 2.0,
-     timeout: float = 600.0,
-     on_progress: Optional[Callable[[int, int], None]] = None,
-     api_key: Optional[str] = None,
- ) -> EvaluationRunResult:
-     """
-     Deprecated: Use `evaluate()` instead.
-
-     Run a platform-configured evaluation and wait for completion.
-     """
-     import warnings
-
-     warnings.warn(
-         "langwatch.evaluation.run() is deprecated, use langwatch.evaluation.evaluate() instead",
-         DeprecationWarning,
-         stacklevel=2,
-     )
-     return evaluate(
-         slug,
-         poll_interval=poll_interval,
-         timeout=timeout,
-         on_progress=on_progress,
-         api_key=api_key,
-     )
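The renamed platform `run()` keeps the polling controls from the old `evaluate()` signature (`poll_interval`, `timeout`, `on_progress`, `api_key`). A small sketch wiring them up; the interval and timeout values are arbitrary choices for illustration.

```python
import langwatch


def report_progress(completed: int, total: int) -> None:
    # Called on each status poll with (completed, total), per the docstring above.
    print(f"progress: {completed}/{total}")


result = langwatch.experiment.run(
    "my-experiment-slug",
    poll_interval=5.0,   # check status every 5 seconds instead of the default 2
    timeout=1800.0,      # allow 30 minutes before ExperimentTimeoutError is raised
    on_progress=report_progress,
)
result.print_summary()
```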
langwatch/litellm.py CHANGED
@@ -246,6 +246,9 @@ class LiteLLMPatch:
                  SpanMetrics(
                      prompt_tokens=safe_get(usage, "prompt_tokens"),
                      completion_tokens=safe_get(usage, "completion_tokens"),
+                     reasoning_tokens=safe_get(
+                         usage, "completion_tokens_details", "reasoning_tokens"
+                     ),
                  )
                  if usage
                  else SpanMetrics()
@@ -281,6 +284,9 @@ class LiteLLMPatch:
              metrics=SpanMetrics(
                  prompt_tokens=safe_get(response, "usage", "prompt_tokens"),
                  completion_tokens=safe_get(response, "usage", "completion_tokens"),
+                 reasoning_tokens=safe_get(
+                     response, "usage", "completion_tokens_details", "reasoning_tokens"
+                 ),
              ),
              timestamps=timestamps,
              **kwargs,
@@ -338,6 +344,7 @@ class LiteLLMPatch:
              "functions",
              "user",
              "response_format",
+             "reasoning_effort",
          ]
          for param in params:
              if kwargs.get(param):
langwatch/openai.py CHANGED
@@ -296,6 +296,9 @@ class OpenAICompletionTracer:
              metrics=SpanMetrics(
                  prompt_tokens=safe_get(response, "usage", "prompt_tokens"),
                  completion_tokens=safe_get(response, "usage", "completion_tokens"),
+                 reasoning_tokens=safe_get(
+                     response, "usage", "completion_tokens_details", "reasoning_tokens"
+                 ),
              ),
              timestamps=timestamps,
              **kwargs,
@@ -336,22 +339,31 @@ class OpenAICompletionTracer:
              if len(outputs) == 0
              else outputs[0] if len(outputs) == 1 else {"type": "list", "value": outputs}
          )
-         params = SpanParams(
-             temperature=kwargs.get("temperature", 1.0),
-             stream=kwargs.get("stream", False),
-         )
-         functions = kwargs.get("functions", None)
-         if functions:
-             params["functions"] = functions
-         tools = kwargs.get("tools", None)
-         if tools:
-             params["tools"] = tools
-         tool_choice = kwargs.get("tool_choice", None)
-         if tool_choice:
-             params["tool_choice"] = tool_choice
-         response_format = kwargs.get("response_format", None)
-         if response_format:
-             params["response_format"] = response_format
+         span_params = SpanParams()
+         param_names = [
+             "frequency_penalty",
+             "logit_bias",
+             "logprobs",
+             "top_logprobs",
+             "max_tokens",
+             "n",
+             "presence_penalty",
+             "seed",
+             "stop",
+             "stream",
+             "temperature",
+             "top_p",
+             "tools",
+             "tool_choice",
+             "parallel_tool_calls",
+             "functions",
+             "user",
+             "response_format",
+             "reasoning_effort",
+         ]
+         for param in param_names:
+             if kwargs.get(param) is not None:
+                 span_params[param] = kwargs.get(param)
 
          vendor = (
              "azure"
@@ -367,7 +379,7 @@ class OpenAICompletionTracer:
              ),
              output=output,
              error=error,
-             params=params,
+             params=span_params,
              metrics=metrics,
              timestamps=timestamps,
          )
@@ -611,6 +623,9 @@ class OpenAIChatCompletionTracer:
                  SpanMetrics(
                      prompt_tokens=usage.prompt_tokens if usage else None,
                      completion_tokens=usage.completion_tokens if usage else None,
+                     reasoning_tokens=safe_get(
+                         usage, "completion_tokens_details", "reasoning_tokens"
+                     ),
                  )
                  if usage
                  else SpanMetrics()
@@ -643,6 +658,9 @@ class OpenAIChatCompletionTracer:
              metrics=SpanMetrics(
                  prompt_tokens=safe_get(response, "usage", "prompt_tokens"),
                  completion_tokens=safe_get(response, "usage", "completion_tokens"),
+                 reasoning_tokens=safe_get(
+                     response, "usage", "completion_tokens_details", "reasoning_tokens"
+                 ),
              ),
              timestamps=timestamps,
              **kwargs,
@@ -683,22 +701,31 @@ class OpenAIChatCompletionTracer:
              if len(outputs) == 0
              else outputs[0] if len(outputs) == 1 else {"type": "list", "value": outputs}
          )
-         params = SpanParams(
-             temperature=kwargs.get("temperature", 1.0),
-             stream=kwargs.get("stream", False),
-         )
-         functions = kwargs.get("functions", None)
-         if functions:
-             params["functions"] = functions
-         tools = kwargs.get("tools", None)
-         if tools:
-             params["tools"] = tools
-         tool_choice = kwargs.get("tool_choice", None)
-         if tool_choice:
-             params["tool_choice"] = tool_choice
-         response_format = kwargs.get("response_format", None)
-         if response_format:
-             params["response_format"] = response_format
+         span_params = SpanParams()
+         param_names = [
+             "frequency_penalty",
+             "logit_bias",
+             "logprobs",
+             "top_logprobs",
+             "max_tokens",
+             "n",
+             "presence_penalty",
+             "seed",
+             "stop",
+             "stream",
+             "temperature",
+             "top_p",
+             "tools",
+             "tool_choice",
+             "parallel_tool_calls",
+             "functions",
+             "user",
+             "response_format",
+             "reasoning_effort",
+         ]
+         for param in param_names:
+             if kwargs.get(param) is not None:
+                 span_params[param] = kwargs.get(param)
 
          vendor = (
              "azure"
@@ -714,7 +741,7 @@ class OpenAIChatCompletionTracer:
              ),
              output=output,
              error=error,
-             params=params,
+             params=span_params,
              metrics=metrics,
              timestamps=timestamps,
          )
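Both the LiteLLM and OpenAI tracers now record `reasoning_tokens` by reading `usage.completion_tokens_details.reasoning_tokens` from the provider response, and `reasoning_effort` joins the captured request params. A quick way to inspect the raw field the tracers pick up, assuming a recent `openai` client and a reasoning-capable model; the model name is illustrative and the field is absent for non-reasoning models.

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

response = client.chat.completions.create(
    model="o4-mini",          # illustrative reasoning model
    reasoning_effort="low",   # now also captured as a span param by the tracers
    messages=[{"role": "user", "content": "Think step by step: what is 17 * 23?"}],
)

# reasoning_tokens is the value the patched tracers store in SpanMetrics
details = response.usage.completion_tokens_details
print("completion tokens:", response.usage.completion_tokens)
print("reasoning tokens:", details.reasoning_tokens if details else None)
```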
@@ -24,6 +24,8 @@ logger = logging.getLogger(__name__)
  class LocalPromptLoader:
      """Loads prompts from local files in CLI format."""
 
+     _warned_no_prompts_path: bool = False
+
      def __init__(self, base_path: Optional[Path] = None):
          """Initialize with base path (defaults to current working directory at load time)."""
          self._base_path = base_path
@@ -43,6 +45,16 @@ class LocalPromptLoader:
          # Check if prompts.json exists
          prompts_json_path = self.base_path / "prompts.json"
          if not prompts_json_path.exists():
+             # Warn once if no prompts_path was configured and prompts.json doesn't exist
+             if self._base_path is None and not LocalPromptLoader._warned_no_prompts_path:
+                 LocalPromptLoader._warned_no_prompts_path = True
+                 warnings.warn(
+                     f"No prompts.json found at {prompts_json_path}. "
+                     f"If you have local prompt files, configure the path with "
+                     f"langwatch.setup(prompts_path='/path/to/prompts') or ensure "
+                     f"prompts.json is in the current working directory.",
+                     UserWarning,
+                 )
              logger.debug(
                  f"No prompts.json found at {prompts_json_path}, falling back to API"
              )
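The new one-time warning points users at `langwatch.setup(prompts_path=...)`. A minimal sketch of configuring the loader so the warning never fires, assuming the usual `setup()` entry point; the path and key below are placeholders.

```python
import langwatch

# Point the local prompt loader at the directory containing prompts.json
# (placeholder path). Without this, the loader now warns once and falls
# back to fetching prompts from the LangWatch API.
langwatch.setup(
    api_key="your-langwatch-api-key",  # placeholder credential
    prompts_path="./prompts",
)
```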