orca-sdk 0.0.98__py3-none-any.whl → 0.0.101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
orca_sdk/client.py CHANGED
@@ -263,8 +263,8 @@ class MemoryMetrics(TypedDict):
263
263
  neighbor_predicted_label_matches_current_label: NotRequired[bool | None]
264
264
  spread: NotRequired[float]
265
265
  uniformity: NotRequired[float]
266
- subconcept_cluster_id: NotRequired[int | None]
267
- subconcept_name: NotRequired[str | None]
266
+ concept_id: NotRequired[int | None]
267
+ subconcept_id: NotRequired[int | None]
268
268
 
269
269
 
270
270
  MemoryType = Literal["LABELED", "SCORED"]
@@ -301,6 +301,19 @@ class MemorysetClusterMetrics(TypedDict):
301
301
  num_clusters: int
302
302
 
303
303
 
304
+ class MemorysetConceptAnalysisConfig(TypedDict):
305
+ high_level_description: NotRequired[str | None]
306
+ max_sample_rows: NotRequired[int]
307
+ max_trial_count: NotRequired[int]
308
+ min_desired_clusters_per_label: NotRequired[int]
309
+ max_desired_clusters_per_label: NotRequired[int]
310
+ accuracy_importance: NotRequired[float]
311
+ noise_penalty: NotRequired[float]
312
+ naming_examples_count: NotRequired[int]
313
+ naming_counterexample_count: NotRequired[int]
314
+ seed: NotRequired[int]
315
+
316
+
304
317
  class MemorysetDuplicateAnalysisConfig(TypedDict):
305
318
  potential_duplicate_threshold: NotRequired[float]
306
319
 
@@ -339,19 +352,6 @@ class MemorysetProjectionMetrics(TypedDict):
339
352
  pass
340
353
 
341
354
 
342
- class MemorysetSubconceptAnalysisConfig(TypedDict):
343
- high_level_description: NotRequired[str | None]
344
- max_sample_rows: NotRequired[int]
345
- max_trial_count: NotRequired[int]
346
- min_desired_clusters_per_label: NotRequired[int]
347
- max_desired_clusters_per_label: NotRequired[int]
348
- accuracy_importance: NotRequired[float]
349
- noise_penalty: NotRequired[float]
350
- naming_examples_count: NotRequired[int]
351
- naming_counterexample_count: NotRequired[int]
352
- seed: NotRequired[int]
353
-
354
-
355
355
  class MemorysetUpdate(TypedDict):
356
356
  label_names: NotRequired[list[str]]
357
357
  description: NotRequired[str | None]
@@ -628,11 +628,11 @@ class ServiceUnavailableErrorResponse(TypedDict):
628
628
  service: str
629
629
 
630
630
 
631
- class SubconceptClusterMetrics(TypedDict):
632
- cluster_id: int
631
+ class SubConceptMetrics(TypedDict):
632
+ id: int
633
633
  name: str
634
- primary_label_index: NotRequired[int | None]
635
- description: NotRequired[str | None]
634
+ description: str | None
635
+ primary_label: int | None
636
636
  memory_count: int
637
637
 
638
638
 
@@ -840,6 +840,13 @@ class GetDatasourceByNameOrIdEmbeddingEvaluationByTaskIdParams(TypedDict):
840
840
 
841
841
  class GetDatasourceByNameOrIdDownloadParams(TypedDict):
842
842
  name_or_id: str
843
+ file_type: NotRequired[Literal["hf_dataset", "json", "csv"]]
844
+ """
845
+ File type to download:
846
+ * `hf_dataset`: Zipped HuggingFace dataset (default)
847
+ * `json`: Row-oriented JSON array
848
+ * `csv`: CSV file
849
+ """
843
850
 
844
851
 
845
852
  class PatchClassificationModelByNameOrIdParams(TypedDict):
@@ -938,7 +945,6 @@ class GetTelemetryPredictionByPredictionIdExplanationParams(TypedDict):
938
945
 
939
946
  class GetTelemetryPredictionByPredictionIdActionParams(TypedDict):
940
947
  prediction_id: str
941
- refresh: NotRequired[bool]
942
948
 
943
949
 
944
950
  class GetTelemetryPredictionByPredictionIdMemorySuggestionsParams(TypedDict):
@@ -1100,6 +1106,15 @@ class ColumnInfo(TypedDict):
1100
1106
  int_values: NotRequired[list[int] | None]
1101
1107
 
1102
1108
 
1109
+ class ConceptMetrics(TypedDict):
1110
+ id: int
1111
+ name: str
1112
+ description: str | None
1113
+ primary_label: int | None
1114
+ memory_count: int
1115
+ subconcepts: list[SubConceptMetrics]
1116
+
1117
+
1103
1118
  class CreateClassificationModelRequest(TypedDict):
1104
1119
  name: str
1105
1120
  description: NotRequired[str | None]
@@ -1378,7 +1393,7 @@ class MemorysetAnalysisConfigs(TypedDict):
1378
1393
  projection: NotRequired[MemorysetProjectionAnalysisConfig | None]
1379
1394
  cluster: NotRequired[MemorysetClusterAnalysisConfig | None]
1380
1395
  class_patterns: NotRequired[MemorysetClassPatternsAnalysisConfig | None]
1381
- subconcepts: NotRequired[MemorysetSubconceptAnalysisConfig | None]
1396
+ concepts: NotRequired[MemorysetConceptAnalysisConfig | None]
1382
1397
 
1383
1398
 
1384
1399
  class MemorysetAnalysisRequest(TypedDict):
@@ -1388,12 +1403,21 @@ class MemorysetAnalysisRequest(TypedDict):
1388
1403
  configs: MemorysetAnalysisConfigs
1389
1404
 
1390
1405
 
1391
- class MemorysetSubconceptMetrics(TypedDict):
1392
- clusters_by_id: dict[str, SubconceptClusterMetrics]
1393
- num_clusters: int
1406
+ class MemorysetConceptMetrics(TypedDict):
1407
+ concepts: list[ConceptMetrics]
1394
1408
  num_outliers: int
1395
1409
 
1396
1410
 
1411
+ class MemorysetMetrics(TypedDict):
1412
+ neighbor: NotRequired[MemorysetNeighborMetrics | None]
1413
+ label: NotRequired[MemorysetLabelMetrics | None]
1414
+ duplicate: NotRequired[MemorysetDuplicateMetrics | None]
1415
+ projection: NotRequired[MemorysetProjectionMetrics | None]
1416
+ cluster: NotRequired[MemorysetClusterMetrics | None]
1417
+ class_patterns: NotRequired[MemorysetClassPatternsMetrics | None]
1418
+ concepts: NotRequired[MemorysetConceptMetrics | None]
1419
+
1420
+
1397
1421
  class PaginatedUnionLabeledMemoryWithFeedbackMetricsScoredMemoryWithFeedbackMetrics(TypedDict):
1398
1422
  items: list[LabeledMemoryWithFeedbackMetrics | ScoredMemoryWithFeedbackMetrics]
1399
1423
  total: int
@@ -1449,34 +1473,6 @@ class EmbeddingEvaluationResult(TypedDict):
1449
1473
  evaluation_results: list[EmbeddingModelResult]
1450
1474
 
1451
1475
 
1452
- class MemorysetMetrics(TypedDict):
1453
- neighbor: NotRequired[MemorysetNeighborMetrics | None]
1454
- label: NotRequired[MemorysetLabelMetrics | None]
1455
- duplicate: NotRequired[MemorysetDuplicateMetrics | None]
1456
- projection: NotRequired[MemorysetProjectionMetrics | None]
1457
- cluster: NotRequired[MemorysetClusterMetrics | None]
1458
- class_patterns: NotRequired[MemorysetClassPatternsMetrics | None]
1459
- subconcepts: NotRequired[MemorysetSubconceptMetrics | None]
1460
-
1461
-
1462
- class PaginatedTask(TypedDict):
1463
- items: list[Task]
1464
- total: int
1465
- offset: int
1466
- limit: int
1467
-
1468
-
1469
- class DatasourceEmbeddingEvaluationsResponse(TypedDict):
1470
- task_id: str
1471
- org_id: str
1472
- datasource_id: str
1473
- status: TaskStatus
1474
- result: EmbeddingEvaluationResult | None
1475
- payload: DatasourceEmbeddingEvaluationsTaskPayload
1476
- created_at: str
1477
- updated_at: str
1478
-
1479
-
1480
1476
  class MemorysetAnalysisResponse(TypedDict):
1481
1477
  task_id: str
1482
1478
  org_id: str
@@ -1516,6 +1512,13 @@ class MemorysetMetadata(TypedDict):
1516
1512
  query_prompt_override: str | None
1517
1513
 
1518
1514
 
1515
+ class PaginatedTask(TypedDict):
1516
+ items: list[Task]
1517
+ total: int
1518
+ offset: int
1519
+ limit: int
1520
+
1521
+
1519
1522
  class BootstrapClassificationModelMeta(TypedDict):
1520
1523
  datasource_meta: DatasourceMetadata
1521
1524
  memoryset_meta: MemorysetMetadata
@@ -1531,6 +1534,17 @@ class BootstrapClassificationModelResponse(TypedDict):
1531
1534
  input: BootstrapClassificationModelRequest | None
1532
1535
 
1533
1536
 
1537
+ class DatasourceEmbeddingEvaluationsResponse(TypedDict):
1538
+ task_id: str
1539
+ org_id: str
1540
+ datasource_id: str
1541
+ status: TaskStatus
1542
+ result: EmbeddingEvaluationResult | None
1543
+ payload: DatasourceEmbeddingEvaluationsTaskPayload
1544
+ created_at: str
1545
+ updated_at: str
1546
+
1547
+
1534
1548
  class OrcaClient(Client):
1535
1549
  @staticmethod
1536
1550
  def _parse_params(
@@ -1911,6 +1925,40 @@ class OrcaClient(Client):
1911
1925
  """Get an embedding evaluation task by ID."""
1912
1926
  pass
1913
1927
 
1928
+ @overload
1929
+ def GET(
1930
+ self,
1931
+ path: Literal["/datasource/{name_or_id}/download"],
1932
+ *,
1933
+ params: GetDatasourceByNameOrIdDownloadParams,
1934
+ parse_as: Literal["json"] = "json",
1935
+ headers: HeaderTypes | None = None,
1936
+ cookies: CookieTypes | None = None,
1937
+ auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
1938
+ follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
1939
+ timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
1940
+ extensions: RequestExtensions | None = None,
1941
+ ) -> list[dict[str, Any]]:
1942
+ """Download datasource in the specified format."""
1943
+ pass
1944
+
1945
+ @overload
1946
+ def GET(
1947
+ self,
1948
+ path: Literal["/datasource/{name_or_id}/download"],
1949
+ *,
1950
+ params: GetDatasourceByNameOrIdDownloadParams,
1951
+ parse_as: Literal["text"],
1952
+ headers: HeaderTypes | None = None,
1953
+ cookies: CookieTypes | None = None,
1954
+ auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
1955
+ follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
1956
+ timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
1957
+ extensions: RequestExtensions | None = None,
1958
+ ) -> str:
1959
+ """Download datasource in the specified format."""
1960
+ pass
1961
+
1914
1962
  @overload
1915
1963
  def GET(
1916
1964
  self,
@@ -1925,7 +1973,7 @@ class OrcaClient(Client):
1925
1973
  timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
1926
1974
  extensions: RequestExtensions | None = None,
1927
1975
  ) -> bytes:
1928
- """Streams a zipped dataset file response"""
1976
+ """Download datasource in the specified format."""
1929
1977
  pass
1930
1978
 
1931
1979
  @overload
orca_sdk/datasource.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
+ import os
4
5
  import tempfile
5
6
  import zipfile
6
7
  from datetime import datetime
@@ -141,28 +142,6 @@ class Datasource:
141
142
  + "})"
142
143
  )
143
144
 
144
- def download(self, output_path: str | PathLike) -> None:
145
- """
146
- Download the datasource as a ZIP and extract them to a specified path.
147
-
148
- Params:
149
- output_path: The local file path or directory where the downloaded files will be saved.
150
-
151
- Returns:
152
- None
153
- """
154
- # TODO: add progress bar to the download
155
- response = orca_api.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id}, parse_as=None)
156
- with tempfile.NamedTemporaryFile(suffix=".zip") as tmp_zip:
157
- tmp_zip.write(response)
158
- tmp_zip.flush()
159
- with zipfile.ZipFile(tmp_zip.name, "r") as zf:
160
- Path(output_path).mkdir(parents=True, exist_ok=True)
161
- for file in zf.namelist():
162
- out_file = Path(output_path) / Path(file).name
163
- with zf.open(file) as af:
164
- out_file.write_bytes(af.read())
165
-
166
145
  @classmethod
167
146
  def from_hf_dataset(
168
147
  cls, name: str, dataset: Dataset, if_exists: CreateMode = "error", description: str | None = None
@@ -501,3 +480,45 @@ class Datasource:
501
480
 
502
481
  def __len__(self) -> int:
503
482
  return self.length
483
+
484
+ def download(
485
+ self, output_dir: str | PathLike, file_type: Literal["hf_dataset", "json", "csv"] = "hf_dataset"
486
+ ) -> None:
487
+ """
488
+ Download the datasource to a specified path in the specified format type
489
+
490
+ Params:
491
+ output_dir: The local directory where the downloaded file will be saved.
492
+ file_type: The type of file to download.
493
+
494
+ Returns:
495
+ None
496
+ """
497
+ extension = "zip" if file_type == "hf_dataset" else file_type
498
+ output_path = Path(output_dir) / f"{self.name}.{extension}"
499
+ with open(output_path, "wb") as download_file:
500
+ with orca_api.stream("GET", f"/datasource/{self.id}/download", params={"file_type": file_type}) as response:
501
+ total_chunks = int(response.headers["X-Total-Chunks"]) if "X-Total-Chunks" in response.headers else None
502
+ with tqdm(desc=f"Downloading", total=total_chunks, disable=total_chunks is None) as progress:
503
+ for chunk in response.iter_bytes():
504
+ download_file.write(chunk)
505
+ progress.update(1)
506
+
507
+ # extract the zip file
508
+ if extension == "zip":
509
+ extract_dir = Path(output_dir) / self.name
510
+ with zipfile.ZipFile(output_path, "r") as zip_ref:
511
+ zip_ref.extractall(extract_dir)
512
+ output_path.unlink() # Remove the zip file after extraction
513
+ logging.info(f"Downloaded {extract_dir}")
514
+ else:
515
+ logging.info(f"Downloaded {output_path}")
516
+
517
+ def to_list(self) -> list[dict]:
518
+ """
519
+ Convert the datasource to a list of dictionaries.
520
+
521
+ Returns:
522
+ A list of dictionaries representation of the datasource.
523
+ """
524
+ return orca_api.GET("/datasource/{name_or_id}/download", params={"name_or_id": self.id, "file_type": "json"})
@@ -1,10 +1,14 @@
1
+ import json
1
2
  import os
2
3
  import tempfile
4
+ from typing import cast
3
5
  from uuid import uuid4
4
6
 
7
+ import numpy as np
5
8
  import pandas as pd
6
9
  import pyarrow as pa
7
10
  import pytest
11
+ from datasets import Dataset
8
12
 
9
13
  from .datasource import Datasource
10
14
 
@@ -99,13 +103,6 @@ def test_drop_datasource_invalid_input():
99
103
  Datasource.drop("not valid id")
100
104
 
101
105
 
102
- def test_download_datasource(datasource):
103
- with tempfile.TemporaryDirectory() as temp_dir:
104
- output_path = os.path.join(temp_dir, "datasource.zip")
105
- datasource.download(output_path)
106
- assert os.path.exists(output_path)
107
-
108
-
109
106
  def test_from_list():
110
107
  # Test creating datasource from list of dictionaries
111
108
  data = [
@@ -296,3 +293,40 @@ def test_from_disk_already_exists():
296
293
  assert datasource2.name == datasource1.name
297
294
  finally:
298
295
  os.unlink(f.name)
296
+
297
+
298
+ def test_to_list(hf_dataset, datasource):
299
+ assert datasource.to_list() == hf_dataset.to_list()
300
+
301
+
302
+ def test_download_datasource(hf_dataset, datasource):
303
+ with tempfile.TemporaryDirectory() as temp_dir:
304
+ # Dataset download
305
+ datasource.download(temp_dir)
306
+ downloaded_hf_dataset_dir = f"{temp_dir}/{datasource.name}"
307
+ assert os.path.exists(downloaded_hf_dataset_dir)
308
+ assert os.path.isdir(downloaded_hf_dataset_dir)
309
+ assert not os.path.exists(f"{downloaded_hf_dataset_dir}.zip")
310
+ dataset_from_downloaded_hf_dataset = Dataset.load_from_disk(downloaded_hf_dataset_dir)
311
+ assert dataset_from_downloaded_hf_dataset.column_names == hf_dataset.column_names
312
+ assert dataset_from_downloaded_hf_dataset.to_dict() == hf_dataset.to_dict()
313
+
314
+ # JSON download
315
+ datasource.download(temp_dir, file_type="json")
316
+ downloaded_json_file = f"{temp_dir}/{datasource.name}.json"
317
+ assert os.path.exists(downloaded_json_file)
318
+ with open(downloaded_json_file, "r") as f:
319
+ content = json.load(f)
320
+ assert content == hf_dataset.to_list()
321
+
322
+ # CSV download
323
+ datasource.download(temp_dir, file_type="csv")
324
+ downloaded_csv_file = f"{temp_dir}/{datasource.name}.csv"
325
+ assert os.path.exists(downloaded_csv_file)
326
+ dataset_from_downloaded_csv = cast(Dataset, Dataset.from_csv(downloaded_csv_file))
327
+ assert dataset_from_downloaded_csv.column_names == hf_dataset.column_names
328
+ assert (
329
+ dataset_from_downloaded_csv.remove_columns("score").to_dict()
330
+ == hf_dataset.remove_columns("score").to_dict()
331
+ )
332
+ assert np.allclose(dataset_from_downloaded_csv["score"], hf_dataset["score"])
orca_sdk/telemetry.py CHANGED
@@ -562,7 +562,7 @@ class ClassificationPrediction(_Prediction):
562
562
 
563
563
  response = orca_api.GET(
564
564
  "/telemetry/prediction/{prediction_id}/action",
565
- params={"prediction_id": self.prediction_id, "refresh": refresh},
565
+ params={"prediction_id": self.prediction_id},
566
566
  timeout=30,
567
567
  )
568
568
  return (response["action"], response["rationale"])
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: orca_sdk
3
- Version: 0.0.98
3
+ Version: 0.0.101
4
4
  Summary: SDK for interacting with Orca Services
5
5
  License: Apache-2.0
6
6
  Author: Orca DB Inc.
@@ -19,12 +19,12 @@ orca_sdk/_utils/value_parser.py,sha256=c3qMABCCDQcIjn9N1orYYnlRwDW9JWdGwW_2TDZPL
19
19
  orca_sdk/_utils/value_parser_test.py,sha256=OybsiC-Obi32RRi9NIuwrVBRAnlyPMV1xVAaevSrb7M,1079
20
20
  orca_sdk/classification_model.py,sha256=SUiUgv_o3UUngpz3Le_L6DsijhjXVEB3yo84hrD1MX4,31172
21
21
  orca_sdk/classification_model_test.py,sha256=WganVoP-0vw1cqiVWJ2vXyGi4lwYp_hbZHultpxvFqk,19536
22
- orca_sdk/client.py,sha256=LEqdqMHbidh9NeZ_42RbEDR64KqXnqnX0JBjaoo2pnY,115704
22
+ orca_sdk/client.py,sha256=gZ0e0gHoAxpvtsGYRnqH80XAZzi1EaYy2A0BplouOBA,117268
23
23
  orca_sdk/conftest.py,sha256=LHA46gDU_D0T_ogS6XOVQvGDDMD01nVZFWVcBYDConc,8885
24
24
  orca_sdk/credentials.py,sha256=KrmgP_5uqBKJXFJV6utTHIhU2odsr95VEqEXWe277DY,5074
25
25
  orca_sdk/credentials_test.py,sha256=whUweSJIEws6C8Bku-5V31Dv9hD1mFDDW7X2cCsB6g0,1629
26
- orca_sdk/datasource.py,sha256=344gqZsSV_N3RDlqXuLnDrPSizvciaAsSPmsiljNSmI,19329
27
- orca_sdk/datasource_test.py,sha256=nKQGjhX0VwCCLCdwl1ns-6kA5Ow-8pWQkSS9WV3CVww,9975
26
+ orca_sdk/datasource.py,sha256=6wARRq-eNDJVSBABdVzn41z7s69xasTsqbozaVAsf9U,20263
27
+ orca_sdk/datasource_test.py,sha256=wENPourrJvQN-uJJPaJI9EDuof6wVw3GirOhbrY4sFI,11564
28
28
  orca_sdk/embedding_model.py,sha256=YxMXdZ3tvnxnK93nArOr_HZ6QoRB-Mc5VNQJ0mcIdpk,26021
29
29
  orca_sdk/embedding_model_test.py,sha256=1aELyCuIzxSxUg7Z4tYtNOd8-hV5hFb-gsZTNh712OQ,7765
30
30
  orca_sdk/job.py,sha256=wWJPkkQbkNu_ylBtZN4AscU00VwWTfqlSmysRBUlivw,12787
@@ -33,8 +33,8 @@ orca_sdk/memoryset.py,sha256=hnhgHxsuV5JEDMnxazmO03VjxzPBENtHDPppHRfh9z4,85354
33
33
  orca_sdk/memoryset_test.py,sha256=14WG6u_adVmm6CSn2dM5lEfQYxybwh3s9Q7RBeTuoPE,20486
34
34
  orca_sdk/regression_model.py,sha256=je2g1BmoPCuouv5iWqDolja90F2w2vD6TooXA8KjL7c,24552
35
35
  orca_sdk/regression_model_test.py,sha256=8LDhtQeh52grZQ2Xd0ju1MQvb_hwosY_ORDDE3wS2LA,14570
36
- orca_sdk/telemetry.py,sha256=jOOFYYr1s3i5EASbCcmUJ_O469xCK5OFg7NVcVfTAlU,25400
36
+ orca_sdk/telemetry.py,sha256=dRyf8fIvThkSBLDyD8BYkixg3nphsN-HbneWq7nbp_4,25380
37
37
  orca_sdk/telemetry_test.py,sha256=eT66C5lFdNg-pQdo2I__BP7Tn5fTc9aTkVo9ZhWwhU0,5519
38
- orca_sdk-0.0.98.dist-info/METADATA,sha256=z5MKWnC226a5mzHHN7aKJFvpFm9qTe46-gtaRTH8ppM,3613
39
- orca_sdk-0.0.98.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
40
- orca_sdk-0.0.98.dist-info/RECORD,,
38
+ orca_sdk-0.0.101.dist-info/METADATA,sha256=VmuFgh5XlRRIkRJbWd5y9APC3-b6uFZuUril5dys9vI,3614
39
+ orca_sdk-0.0.101.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
40
+ orca_sdk-0.0.101.dist-info/RECORD,,