harbor-langsmith 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.4
2
+ Name: harbor-langsmith
3
+ Version: 0.1.0
4
+ Summary: LangSmith plugin for Harbor jobs.
5
+ Author: Alex Shaw
6
+ Author-email: Alex Shaw <alexgshaw64@gmail.com>
7
+ License-Expression: Apache-2.0
8
+ Requires-Dist: harbor>=0.13.0
9
+ Requires-Dist: requests>=2.32.4
10
+ Requires-Python: >=3.12
11
+ Project-URL: Repository, https://github.com/harbor-framework/harbor
12
+ Project-URL: Issues, https://github.com/harbor-framework/harbor/issues
13
+ Description-Content-Type: text/markdown
14
+
15
+ # harbor-langsmith
16
+
17
+ LangSmith plugin for Harbor jobs.
18
+
19
+ ```bash
20
+ pip install "harbor[langsmith]"
21
+ export LANGSMITH_API_KEY=...
22
+ harbor run ... --plugin langsmith
23
+ ```
24
+
25
+ You can also pass the full import path:
26
+
27
+ ```bash
28
+ harbor run ... --plugin harbor_langsmith:LangSmithPlugin
29
+ ```
30
+
31
+ Optional environment variables:
32
+
33
+ - `HARBOR_LANGSMITH_DATASET`
34
+ - `HARBOR_LANGSMITH_EXPERIMENT`
35
+ - `LANGSMITH_ENDPOINT`
36
+ - `LANGSMITH_WORKSPACE_ID`
37
+ - `HARBOR_LANGSMITH_SYNC_DATASET=false`
38
+ - `HARBOR_LANGSMITH_FAIL_FAST=true`
39
+
40
+ Plugin kwargs (CLI `--pk` or job config `kwargs:`) mirror the constructor options:
41
+ `dataset_name`, `experiment_name`, `endpoint`, `api_key`, `workspace_id`,
42
+ `sync_dataset`, and `fail_fast`.
@@ -0,0 +1,28 @@
1
+ # harbor-langsmith
2
+
3
+ LangSmith plugin for Harbor jobs.
4
+
5
+ ```bash
6
+ pip install "harbor[langsmith]"
7
+ export LANGSMITH_API_KEY=...
8
+ harbor run ... --plugin langsmith
9
+ ```
10
+
11
+ You can also pass the full import path:
12
+
13
+ ```bash
14
+ harbor run ... --plugin harbor_langsmith:LangSmithPlugin
15
+ ```
16
+
17
+ Optional environment variables:
18
+
19
+ - `HARBOR_LANGSMITH_DATASET`
20
+ - `HARBOR_LANGSMITH_EXPERIMENT`
21
+ - `LANGSMITH_ENDPOINT`
22
+ - `LANGSMITH_WORKSPACE_ID`
23
+ - `HARBOR_LANGSMITH_SYNC_DATASET=false`
24
+ - `HARBOR_LANGSMITH_FAIL_FAST=true`
25
+
26
+ Plugin kwargs (CLI `--pk` or job config `kwargs:`) mirror the constructor options:
27
+ `dataset_name`, `experiment_name`, `endpoint`, `api_key`, `workspace_id`,
28
+ `sync_dataset`, and `fail_fast`.
@@ -0,0 +1,29 @@
1
+ [project]
2
+ name = "harbor-langsmith"
3
+ version = "0.1.0"
4
+ description = "LangSmith plugin for Harbor jobs."
5
+ readme = "README.md"
6
+ license = "Apache-2.0"
7
+ authors = [{ name = "Alex Shaw", email = "alexgshaw64@gmail.com" }]
8
+ requires-python = ">=3.12"
9
+ dependencies = [
10
+ "harbor>=0.13.0",
11
+ "requests>=2.32.4",
12
+ ]
13
+
14
+ [project.entry-points."harbor.plugins"]
15
+ langsmith = "harbor_langsmith:LangSmithPlugin"
16
+
17
+ [project.urls]
18
+ Repository = "https://github.com/harbor-framework/harbor"
19
+ Issues = "https://github.com/harbor-framework/harbor/issues"
20
+
21
+ [tool.uv.sources]
22
+ harbor = { workspace = true }
23
+
24
+ [build-system]
25
+ requires = ["uv_build>=0.10.8,<0.11.0"]
26
+ build-backend = "uv_build"
27
+
28
+ [tool.uv.build-backend]
29
+ module-name = "harbor_langsmith"
@@ -0,0 +1,3 @@
1
+ from harbor_langsmith.plugin import LangSmithPlugin
2
+
3
+ __all__ = ["LangSmithPlugin"]
@@ -0,0 +1,496 @@
1
+ import asyncio
2
+ import os
3
+ import tomllib
4
+ from datetime import datetime, timezone
5
+ from typing import Any
6
+ from uuid import NAMESPACE_URL, uuid5
7
+
8
+ import requests
9
+
10
+ from harbor.job import Job
11
+ from harbor.models.job.plugin import BaseJobPlugin
12
+ from harbor.models.job.result import JobResult
13
+ from harbor.trial.hooks import TrialEvent, TrialHookEvent
14
+
15
+
16
+ class LangSmithPlugin(BaseJobPlugin):
17
def __init__(
    self,
    *,
    dataset_name: str | None = None,
    experiment_name: str | None = None,
    endpoint: str | None = None,
    api_key: str | None = None,
    workspace_id: str | None = None,
    sync_dataset: bool | None = None,
    fail_fast: bool | None = None,
):
    """Resolve plugin settings from keyword arguments and the environment.

    String options that are ``None`` or empty fall back to an environment
    variable: ``HARBOR_LANGSMITH_DATASET``, ``HARBOR_LANGSMITH_EXPERIMENT``,
    ``LANGSMITH_ENDPOINT``, ``LANGSMITH_API_KEY`` and
    ``LANGSMITH_WORKSPACE_ID``. The two flags consult the environment only
    when they are ``None``: ``sync_dataset`` reads
    ``HARBOR_LANGSMITH_SYNC_DATASET`` (on when unset) and ``fail_fast``
    reads ``HARBOR_LANGSMITH_FAIL_FAST`` (off when unset). The timeout
    handed to every HTTP request comes from
    ``HARBOR_LANGSMITH_REQUEST_TIMEOUT`` and defaults to 120.

    Raises:
        ValueError: If ``HARBOR_LANGSMITH_REQUEST_TIMEOUT`` cannot be
            converted to ``float``.
    """
    super().__init__()

    env = os.getenv
    self.dataset_name = dataset_name or env("HARBOR_LANGSMITH_DATASET")
    self.experiment_name = experiment_name or env("HARBOR_LANGSMITH_EXPERIMENT")
    self.endpoint = endpoint or env("LANGSMITH_ENDPOINT")
    self.api_key = api_key or env("LANGSMITH_API_KEY")
    self.workspace_id = workspace_id or env("LANGSMITH_WORKSPACE_ID")

    # An explicit True/False always wins; only None defers to the environment.
    if sync_dataset is None:
        sync_dataset = self._env_bool("HARBOR_LANGSMITH_SYNC_DATASET", default=True)
    self.sync_dataset = sync_dataset

    if fail_fast is None:
        fail_fast = self._env_bool("HARBOR_LANGSMITH_FAIL_FAST", default=False)
    self.fail_fast = fail_fast

    raw_timeout = env("HARBOR_LANGSMITH_REQUEST_TIMEOUT", "120")
    self.request_timeout = float(raw_timeout)

    # Connection state; the base URL and ids are filled in by _setup().
    self._session = requests.Session()
    self._base_url = ""
    self._dataset_id: str | None = None
    self._experiment_id: str | None = None

    # Lookup tables populated while the job runs.
    self._example_ids: dict[str, str] = {}
    self._run_ids: dict[str, str] = {}
    self._phase_run_ids: dict[tuple[str, TrialEvent], str] = {}
56
+
57
async def on_job_start(self, job: Job) -> None:
    """Run the blocking setup, then subscribe to the job's trial hooks.

    ``_setup`` is executed on a worker thread via ``asyncio.to_thread``.
    Anything it raises propagates to the caller, and in that case no hook
    is registered.
    """
    await asyncio.to_thread(self._setup, job)

    # Every lifecycle hook is routed to the same handler, in this order.
    hook_names = (
        "on_trial_started",
        "on_environment_started",
        "on_agent_started",
        "on_verification_started",
        "on_trial_ended",
        "on_trial_cancelled",
    )
    for hook_name in hook_names:
        register = getattr(job, hook_name)
        register(self._handle_event)
65
+
66
async def on_job_end(self, job_result: JobResult) -> None:
    """Stamp the experiment with its end time and close the HTTP session.

    No request is sent when no experiment was registered. The end time is
    ``job_result.finished_at``, or the current UTC time when that is falsy.
    The PATCH runs on a worker thread because ``_request`` blocks.

    Raises:
        Exception: Whatever the PATCH raised, but only when ``fail_fast``
            is set. Otherwise the failure is logged as a warning and
            dropped so that reporting problems do not fail the job.
    """
    # Local import: the module's import block does not bring in logging.
    import logging

    try:
        if self._experiment_id is None:
            return
        ended_at = job_result.finished_at or datetime.now(timezone.utc)
        await asyncio.to_thread(
            self._request,
            "PATCH",
            f"/sessions/{self._experiment_id}",
            json={"end_time": self._format_time(ended_at)},
            ok_statuses={200, 202, 204},
        )
    except Exception:
        if self.fail_fast:
            raise
        logging.getLogger(__name__).warning(
            "Failed to close LangSmith experiment %s",
            self._experiment_id,
            exc_info=True,
        )
    finally:
        # The session is created in __init__ and was never released before;
        # close it on every exit path, including the early return above.
        self._session.close()
84
+
85
+ def _setup(self, job: Any) -> None:
86
+ if not self.api_key:
87
+ raise RuntimeError("LANGSMITH_API_KEY is required for LangSmithPlugin")
88
+
89
+ endpoint = (self.endpoint or "https://api.smith.langchain.com").rstrip("/")
90
+ self._base_url = (
91
+ endpoint if endpoint.endswith("/api/v1") else f"{endpoint}/api/v1"
92
+ )
93
+
94
+ self._session.headers.update({"x-api-key": self.api_key})
95
+ if self.workspace_id:
96
+ self._session.headers.update({"LANGSMITH-WORKSPACE-ID": self.workspace_id})
97
+
98
+ if self.sync_dataset:
99
+ self._dataset_id = self._get_or_create_dataset(job)
100
+ self._example_ids = self._get_or_create_examples(job)
101
+
102
+ experiment_id = self._stable_uuid(job.id, "experiment")
103
+ payload: dict[str, Any] = {
104
+ "id": experiment_id,
105
+ "name": self.experiment_name or job.config.job_name,
106
+ "start_time": self._format_time(datetime.now(timezone.utc)),
107
+ "extra": {
108
+ "metadata": {
109
+ "harbor_job_id": str(job.id),
110
+ "harbor_job_name": job.config.job_name,
111
+ "harbor_job_dir": str(job.job_dir),
112
+ }
113
+ },
114
+ }
115
+ if self._dataset_id is not None:
116
+ payload["reference_dataset_id"] = self._dataset_id
117
+
118
+ self._request("POST", "/sessions", json=payload, ok_statuses={200, 201, 409})
119
+ self._experiment_id = experiment_id
120
+
121
async def _handle_event(self, event: TrialHookEvent) -> None:
    """Forward a trial hook event to the blocking handler on a worker thread.

    Raises:
        Exception: Whatever the handler raised, but only when ``fail_fast``
            is set. Otherwise the failure is logged as a warning with its
            traceback and dropped, so reporting problems do not fail trials.
    """
    try:
        await asyncio.to_thread(self._handle_event_sync, event)
    except Exception:
        if self.fail_fast:
            raise
        # Local import: the module's import block does not bring in logging.
        import logging

        logging.getLogger(__name__).warning(
            "LangSmith sync failed for trial event %s",
            getattr(event, "event", event),
            exc_info=True,
        )
127
+
128
def _handle_event_sync(self, event: TrialHookEvent) -> None:
    """Dispatch a trial event to the matching run operation.

    Events are ignored until an experiment has been registered. ``START``
    opens the trial's root run, the three phase-start events open a child
    run, and ``END``/``CANCEL`` close the trial. Any other event is a no-op.
    """
    if self._experiment_id is None:
        return

    kind = event.event
    phase_starts = (
        TrialEvent.ENVIRONMENT_START,
        TrialEvent.AGENT_START,
        TrialEvent.VERIFICATION_START,
    )
    if kind == TrialEvent.START:
        self._create_root_run(event)
    elif kind in phase_starts:
        self._create_phase_run(event)
    elif kind in (TrialEvent.END, TrialEvent.CANCEL):
        self._finish_trial(event)
143
+
144
+ def _get_or_create_dataset(self, job: Any) -> str | None:
145
+ dataset_name = self.dataset_name or self._default_dataset_name(job)
146
+ if dataset_name is None:
147
+ return None
148
+
149
+ existing = self._find_dataset(dataset_name)
150
+ if existing is not None:
151
+ return existing
152
+
153
+ payload = {
154
+ "name": dataset_name,
155
+ "description": f"Harbor dataset synced from job {job.config.job_name}",
156
+ "metadata": {"source": "harbor"},
157
+ }
158
+ response = self._request(
159
+ "POST", "/datasets", json=payload, ok_statuses={200, 201, 409}
160
+ )
161
+ if response.status_code == 409:
162
+ return self._find_dataset(dataset_name)
163
+ return self._extract_id(response.json())
164
+
165
+ def _get_or_create_examples(self, job: Any) -> dict[str, str]:
166
+ if self._dataset_id is None:
167
+ return {}
168
+
169
+ example_ids: dict[str, str] = {}
170
+ for task_config in job._task_configs:
171
+ task_id = task_config.get_task_id()
172
+ full_task_name = task_id.get_name()
173
+ task_name = full_task_name.split("/")[-1]
174
+ example_id = self._stable_uuid(self._dataset_id, "example", task_name)
175
+ instruction = self._read_instruction(task_config)
176
+ payload = {
177
+ "id": example_id,
178
+ "dataset_id": self._dataset_id,
179
+ "inputs": {
180
+ "task_name": task_name,
181
+ "instruction": instruction,
182
+ "task_id": task_id.model_dump(mode="json"),
183
+ },
184
+ "outputs": {},
185
+ "metadata": {
186
+ "source": "harbor",
187
+ "task_config": task_config.model_dump(mode="json"),
188
+ },
189
+ }
190
+ response = self._request(
191
+ "POST", "/examples", json=payload, ok_statuses={200, 201, 409}
192
+ )
193
+ if response.status_code == 409:
194
+ self._request(
195
+ "PATCH",
196
+ f"/examples/{example_id}",
197
+ json={
198
+ "inputs": payload["inputs"],
199
+ "outputs": payload["outputs"],
200
+ "metadata": payload["metadata"],
201
+ },
202
+ ok_statuses={200, 202, 204},
203
+ )
204
+ example_ids[task_name] = example_id
205
+ example_ids[full_task_name] = example_id
206
+ configured_task_name = self._read_configured_task_name(task_config)
207
+ if configured_task_name is not None:
208
+ example_ids[configured_task_name] = example_id
209
+ example_ids[configured_task_name.split("/")[-1]] = example_id
210
+ return example_ids
211
+
212
def _create_root_run(self, event: TrialHookEvent) -> None:
    """Create the top-level LangSmith run for a trial and remember its id.

    The run id is derived from the job id and trial name and is stored in
    ``_run_ids`` before the POST is issued. The run is linked to a dataset
    example when one is known under the event's task name or that name's
    short form; otherwise the ``reference_example_id`` key is left out.
    """
    config = event.config
    trial_name = config.trial_name
    run_id = self._stable_uuid(config.job_id, "trial", trial_name)
    self._run_ids[trial_name] = run_id

    task_name = event.task_name
    example_id = self._example_ids.get(task_name) or self._example_ids.get(
        task_name.split("/")[-1]
    )
    # Spliced into the body below so the key keeps its original position.
    example_link = {} if example_id is None else {"reference_example_id": example_id}

    body = {
        "id": run_id,
        "name": trial_name,
        "run_type": "chain",
        "inputs": {
            "task_name": task_name,
            "instruction": self._read_instruction(config.task),
            "task_id": config.task.get_task_id().model_dump(mode="json"),
            "trial_name": trial_name,
            "agent": config.agent.name or config.agent.import_path,
            "model": config.agent.model_name,
        },
        "start_time": self._format_time(event.timestamp),
        "session_id": self._experiment_id,
        **example_link,
        "extra": {
            "metadata": self._trial_metadata(event),
            "tags": ["harbor", "harbor-trial"],
        },
    }
    self._request("POST", "/runs", json=body, ok_statuses={200, 201, 409})
243
+
244
def _create_phase_run(self, event: TrialHookEvent) -> None:
    """Create a child run for one trial phase under the trial's root run.

    If the trial has no recorded root run yet, one is created first. The
    phase run id is derived from the job id, trial name and event value,
    and is stored in ``_phase_run_ids`` keyed by ``(trial_name, event)``.
    """
    trial_name = event.config.trial_name
    if self._run_ids.get(trial_name) is None:
        self._create_root_run(event)
    parent_run_id = self._run_ids[trial_name]

    phase = event.event.value
    run_id = self._stable_uuid(
        event.config.job_id, "trial", trial_name, "phase", phase
    )
    self._phase_run_ids[(trial_name, event.event)] = run_id

    body = {
        "id": run_id,
        "name": phase,
        "run_type": "chain",
        "inputs": {"phase": phase},
        "start_time": self._format_time(event.timestamp),
        "session_id": self._experiment_id,
        "parent_run_id": parent_run_id,
        "extra": {
            "metadata": self._trial_metadata(event),
            "tags": ["harbor", "harbor-phase", phase],
        },
    }
    self._request("POST", "/runs", json=body, ok_statuses={200, 201, 409})
272
+
273
def _finish_trial(self, event: TrialHookEvent) -> None:
    """Close the trial's root run and, when a result exists, its phases and feedback.

    If the trial has no recorded root run, one is created first. The end
    time is the result's ``finished_at`` when present, else the event
    timestamp. An ``error`` field is added when the result carries
    exception info. Phase runs and feedback are only written when the
    event has a result.
    """
    trial_name = event.config.trial_name
    if self._run_ids.get(trial_name) is None:
        self._create_root_run(event)
    run_id = self._run_ids[trial_name]

    result = event.result
    outputs = self._trial_outputs(result)
    if result and result.finished_at:
        ended_at = result.finished_at
    else:
        ended_at = event.timestamp
    body: dict[str, Any] = {
        "outputs": outputs,
        "end_time": self._format_time(ended_at),
    }

    failure = result.exception_info if result else None
    if failure is not None:
        body["error"] = f"{failure.exception_type}: {failure.exception_message}"

    self._request(
        "PATCH", f"/runs/{run_id}", json=body, ok_statuses={200, 202, 204}
    )

    if result is None:
        return
    self._finish_phase_runs(result)
    self._create_feedback(run_id, result)
298
+
299
def _finish_phase_runs(self, result: Any) -> None:
    """Set the end time on each phase run that was opened and has finished.

    A phase is skipped when the result has no timing for it, when no run
    was recorded for it in ``_phase_run_ids``, or when its timing has no
    ``finished_at``.
    """
    timings = (
        (TrialEvent.ENVIRONMENT_START, result.environment_setup),
        (TrialEvent.AGENT_START, result.agent_execution),
        (TrialEvent.VERIFICATION_START, result.verifier),
    )
    for phase_event, timing in timings:
        if timing is None:
            continue
        run_id = self._phase_run_ids.get((result.trial_name, phase_event))
        if run_id is not None and timing.finished_at is not None:
            self._request(
                "PATCH",
                f"/runs/{run_id}",
                json={
                    "outputs": {"phase": phase_event.value},
                    "end_time": self._format_time(timing.finished_at),
                },
                ok_statuses={200, 202, 204},
            )
320
+
321
+ def _create_feedback(self, run_id: str, result: Any) -> None:
322
+ if result.verifier_result is not None:
323
+ for key, score in result.verifier_result.rewards.items():
324
+ self._request(
325
+ "POST",
326
+ "/feedback",
327
+ json={
328
+ "id": self._stable_uuid(run_id, "feedback", key),
329
+ "run_id": run_id,
330
+ "key": key,
331
+ "score": score,
332
+ "feedback_source_type": "api",
333
+ },
334
+ ok_statuses={200, 201, 409},
335
+ )
336
+ if result.exception_info is not None:
337
+ self._request(
338
+ "POST",
339
+ "/feedback",
340
+ json={
341
+ "id": self._stable_uuid(run_id, "feedback", "harbor_error"),
342
+ "run_id": run_id,
343
+ "key": "harbor_error",
344
+ "score": 1,
345
+ "value": result.exception_info.exception_type,
346
+ "comment": result.exception_info.exception_message,
347
+ "feedback_source_type": "api",
348
+ },
349
+ ok_statuses={200, 201, 409},
350
+ )
351
+
352
+ def _trial_outputs(self, result: Any | None) -> dict[str, Any]:
353
+ if result is None:
354
+ return {}
355
+ n_input, n_cache, n_output, cost = result.compute_token_cost_totals()
356
+ agent_metadata = (
357
+ result.agent_result.metadata
358
+ if result.agent_result is not None
359
+ and result.agent_result.metadata is not None
360
+ else None
361
+ )
362
+ return {
363
+ "task_name": result.task_name,
364
+ "trial_name": result.trial_name,
365
+ "agent_output": (
366
+ agent_metadata.get("answer_written") if agent_metadata else None
367
+ ),
368
+ "agent_metadata": agent_metadata,
369
+ "rewards": (
370
+ result.verifier_result.rewards
371
+ if result.verifier_result is not None
372
+ else None
373
+ ),
374
+ "exception": (
375
+ result.exception_info.model_dump(mode="json")
376
+ if result.exception_info is not None
377
+ else None
378
+ ),
379
+ "tokens": {
380
+ "input": n_input,
381
+ "cache": n_cache,
382
+ "output": n_output,
383
+ },
384
+ "cost_usd": cost,
385
+ }
386
+
387
def _trial_metadata(self, event: TrialHookEvent) -> dict[str, Any]:
    """Return the Harbor metadata attached to every run created for a trial.

    Includes the trial id and name, task name, job id (as a string), the
    agent's name (or import path when the name is falsy), the model name
    and the full trial config dumped in JSON mode.
    """
    config = event.config
    agent = config.agent
    metadata: dict[str, Any] = {
        "harbor_trial_id": event.trial_id,
        "harbor_trial_name": config.trial_name,
        "harbor_task_name": event.task_name,
        "harbor_job_id": str(config.job_id),
        "harbor_agent": agent.name or agent.import_path,
        "harbor_model": agent.model_name,
        "harbor_trial_config": config.model_dump(mode="json"),
        "source": "harbor",
    }
    return metadata
398
+
399
+ @staticmethod
400
+ def _read_instruction(task_config: Any) -> str | None:
401
+ try:
402
+ instruction_path = task_config.get_local_path() / "instruction.md"
403
+ if instruction_path.exists():
404
+ return instruction_path.read_text()
405
+ except Exception:
406
+ return None
407
+ return None
408
+
409
+ @staticmethod
410
+ def _read_configured_task_name(task_config: Any) -> str | None:
411
+ try:
412
+ config_path = task_config.get_local_path() / "task.toml"
413
+ if not config_path.exists():
414
+ return None
415
+ data = tomllib.loads(config_path.read_text())
416
+ task = data.get("task")
417
+ if isinstance(task, dict):
418
+ name = task.get("name")
419
+ return name if isinstance(name, str) else None
420
+ except Exception:
421
+ return None
422
+ return None
423
+
424
+ def _default_dataset_name(self, job: Any) -> str | None:
425
+ if len(job.config.datasets) == 1:
426
+ dataset = job.config.datasets[0]
427
+ name = dataset.name or (dataset.path.name if dataset.path else None)
428
+ if name is not None and dataset.version:
429
+ return f"{name}@{dataset.version}"
430
+ return name
431
+ if len(job.config.tasks) == 1:
432
+ task_id = job.config.tasks[0].get_task_id()
433
+ return task_id.get_name().split("/")[-1]
434
+ if len(job.config.tasks) > 1:
435
+ return "harbor-adhoc-tasks"
436
+ return None
437
+
438
+ def _find_dataset(self, dataset_name: str) -> str | None:
439
+ for params in ({"name": dataset_name}, {"dataset_name": dataset_name}):
440
+ response = self._request(
441
+ "GET", "/datasets", params=params, ok_statuses={200, 404}
442
+ )
443
+ if response.status_code == 404:
444
+ continue
445
+ datasets = response.json()
446
+ if isinstance(datasets, dict):
447
+ datasets = datasets.get("datasets") or datasets.get("items") or []
448
+ if not isinstance(datasets, list):
449
+ continue
450
+ for dataset in datasets:
451
+ if dataset.get("name") == dataset_name:
452
+ return self._extract_id(dataset)
453
+ return None
454
+
455
def _request(
    self,
    method: str,
    path: str,
    *,
    ok_statuses: set[int],
    **kwargs: Any,
) -> requests.Response:
    """Send one request to the LangSmith API and return the response.

    ``path`` is appended to the base URL and the configured timeout is
    applied. A status listed in ``ok_statuses`` is returned as-is. For any
    other status ``raise_for_status()`` is called; if that does not raise,
    the response is still returned.
    """
    url = f"{self._base_url}{path}"
    response = self._session.request(
        method, url, timeout=self.request_timeout, **kwargs
    )
    if response.status_code in ok_statuses:
        return response
    response.raise_for_status()
    return response
472
+
473
+ @staticmethod
474
+ def _extract_id(payload: Any) -> str | None:
475
+ if isinstance(payload, dict):
476
+ value = payload.get("id") or payload.get("dataset_id")
477
+ return str(value) if value is not None else None
478
+ return None
479
+
480
+ @staticmethod
481
+ def _stable_uuid(*parts: Any) -> str:
482
+ normalized = ":".join(str(part) for part in parts if part is not None)
483
+ return str(uuid5(NAMESPACE_URL, f"harbor-langsmith:{normalized}"))
484
+
485
+ @staticmethod
486
+ def _format_time(value: datetime) -> str:
487
+ if value.tzinfo is None:
488
+ value = value.replace(tzinfo=timezone.utc)
489
+ return value.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
490
+
491
+ @staticmethod
492
+ def _env_bool(name: str, *, default: bool) -> bool:
493
+ value = os.getenv(name)
494
+ if value is None:
495
+ return default
496
+ return value.lower() in {"1", "true", "yes", "on"}