orca-sdk 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,7 +20,9 @@ from numpy.typing import NDArray
20
20
  def softmax(logits: np.ndarray, axis: int = -1) -> np.ndarray:
21
21
  shifted = logits - np.max(logits, axis=axis, keepdims=True)
22
22
  exps = np.exp(shifted)
23
- return exps / np.sum(exps, axis=axis, keepdims=True)
23
+ sums = np.sum(exps, axis=axis, keepdims=True)
24
+ # Guard against division by zero (can happen if all logits are -inf or NaN)
25
+ return exps / np.where(sums > 0, sums, 1.0)
24
26
 
25
27
 
26
28
  # We don't want to depend on transformers just for the eval_pred type in orca_sdk
@@ -300,7 +302,9 @@ def convert_logits_to_probabilities(logits: NDArray[np.float32]) -> NDArray[np.f
300
302
  probabilities = cast(NDArray[np.float32], softmax(logits))
301
303
  elif not np.allclose(logits.sum(-1, keepdims=True), 1.0):
302
304
  # Rows don't sum to 1: normalize to probabilities
303
- probabilities = cast(NDArray[np.float32], logits / logits.sum(-1, keepdims=True))
305
+ row_sums = logits.sum(-1, keepdims=True)
306
+ # Guard against division by zero (can happen if all values in a row are 0 or NaN)
307
+ probabilities = cast(NDArray[np.float32], logits / np.where(row_sums > 0, row_sums, 1.0))
304
308
  else:
305
309
  # Already normalized probabilities
306
310
  probabilities = logits
@@ -349,7 +353,7 @@ def calculate_classification_metrics(
349
353
  num_classes_references = len(set(references))
350
354
  num_classes_predictions = len(set(predictions))
351
355
  num_none_predictions = np.isnan(probabilities).all(axis=-1).sum()
352
- coverage = 1 - num_none_predictions / len(probabilities)
356
+ coverage = 1 - (num_none_predictions / len(probabilities) if len(probabilities) > 0 else 0)
353
357
 
354
358
  if average is None:
355
359
  average = "binary" if num_classes_references == 2 and num_none_predictions == 0 else "weighted"
@@ -503,7 +507,7 @@ def calculate_regression_metrics(
503
507
  # Filter out NaN values from predictions (expected_scores are already validated to be non-NaN)
504
508
  valid_mask = ~np.isnan(predictions)
505
509
  num_none_predictions = (~valid_mask).sum()
506
- coverage = 1 - num_none_predictions / len(predictions)
510
+ coverage = 1 - (num_none_predictions / len(predictions) if len(predictions) > 0 else 0)
507
511
  if num_none_predictions > 0:
508
512
  references = references[valid_mask]
509
513
  predictions = predictions[valid_mask]
orca_sdk/async_client.py CHANGED
@@ -555,16 +555,7 @@ class PredictiveModelUpdate(TypedDict):
555
555
 
556
556
 
557
557
  PretrainedEmbeddingModelName = Literal[
558
- "CLIP_BASE",
559
- "GTE_BASE",
560
- "CDE_SMALL",
561
- "DISTILBERT",
562
- "GTE_SMALL",
563
- "MXBAI_LARGE",
564
- "E5_LARGE",
565
- "QWEN2_1_5B",
566
- "BGE_BASE",
567
- "GIST_LARGE",
558
+ "CLIP_BASE", "GTE_BASE", "CDE_SMALL", "DISTILBERT", "GTE_SMALL", "MXBAI_LARGE", "E5_LARGE", "BGE_BASE", "GIST_LARGE"
568
559
  ]
569
560
 
570
561
 
@@ -1175,7 +1166,14 @@ class BootstrapClassificationModelRequest(TypedDict):
1175
1166
  num_examples_per_label: NotRequired[int]
1176
1167
 
1177
1168
 
1178
- class BootstrapClassificationModelResult(TypedDict):
1169
+ class BootstrapLabeledMemoryDataInput(TypedDict):
1170
+ model_description: str
1171
+ label_names: list[str]
1172
+ initial_examples: NotRequired[list[LabeledExample]]
1173
+ num_examples_per_label: NotRequired[int]
1174
+
1175
+
1176
+ class BootstrapLabeledMemoryDataResult(TypedDict):
1179
1177
  model_description: str
1180
1178
  label_names: list[str]
1181
1179
  model_name: str
@@ -1680,7 +1678,7 @@ class BootstrapClassificationModelMeta(TypedDict):
1680
1678
  datasource_meta: DatasourceMetadata
1681
1679
  memoryset_meta: MemorysetMetadata
1682
1680
  model_meta: ClassificationModelMetadata
1683
- agent_output: BootstrapClassificationModelResult
1681
+ agent_output: BootstrapLabeledMemoryDataResult
1684
1682
 
1685
1683
 
1686
1684
  class BootstrapClassificationModelResponse(TypedDict):
@@ -2570,7 +2568,7 @@ class OrcaAsyncClient(AsyncClient):
2570
2568
  timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
2571
2569
  extensions: RequestExtensions | None = None,
2572
2570
  ) -> BootstrapClassificationModelResponse:
2573
- """Get the status of a bootstrap classification model job"""
2571
+ """Get the status of a bootstrap labeled memory data job"""
2574
2572
  pass
2575
2573
 
2576
2574
  async def GET(
@@ -3292,6 +3290,32 @@ class OrcaAsyncClient(AsyncClient):
3292
3290
  """Get row count from a specific datasource with optional filtering."""
3293
3291
  pass
3294
3292
 
3293
+ @overload
3294
+ async def POST(
3295
+ self,
3296
+ path: Literal["/datasource/bootstrap_memory_data"],
3297
+ *,
3298
+ params: None = None,
3299
+ json: BootstrapLabeledMemoryDataInput,
3300
+ data: None = None,
3301
+ files: None = None,
3302
+ content: None = None,
3303
+ parse_as: Literal["json"] = "json",
3304
+ headers: HeaderTypes | None = None,
3305
+ cookies: CookieTypes | None = None,
3306
+ auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
3307
+ follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
3308
+ timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
3309
+ extensions: RequestExtensions | None = None,
3310
+ ) -> BootstrapLabeledMemoryDataResult:
3311
+ """
3312
+ Bootstrap memory data using an AI agent.
3313
+
3314
+ This endpoint uses the bootstrap labeled memory data agent to generate
3315
+ high-quality, diverse training examples for a classification model.
3316
+ """
3317
+ pass
3318
+
3295
3319
  @overload
3296
3320
  async def POST(
3297
3321
  self,
@@ -3540,7 +3564,7 @@ class OrcaAsyncClient(AsyncClient):
3540
3564
  """
3541
3565
  Bootstrap a classification model by creating a memoryset with generated memories and a classification model.
3542
3566
 
3543
- This endpoint uses the bootstrap_classification_model agent to generate:
3567
+ This endpoint uses the bootstrap_labeled_memory_data agent to generate:
3544
3568
  1. Memoryset configuration with appropriate settings
3545
3569
  2. Model configuration with optimal parameters
3546
3570
  3. High-quality training memories for each label
@@ -12,12 +12,10 @@ from ._utils.common import UNSET, CreateMode, DropMode
12
12
  from .async_client import OrcaAsyncClient
13
13
  from .client import (
14
14
  BootstrapClassificationModelMeta,
15
- BootstrapClassificationModelResult,
16
- ClassificationEvaluationRequest,
15
+ BootstrapLabeledMemoryDataResult,
17
16
  ClassificationModelMetadata,
18
17
  ClassificationPredictionRequest,
19
18
  OrcaClient,
20
- PostClassificationModelByModelNameOrIdEvaluationParams,
21
19
  PredictiveModelUpdate,
22
20
  RACHeadType,
23
21
  )
@@ -43,7 +41,7 @@ class BootstrappedClassificationModel:
43
41
  datasource: Datasource | None
44
42
  memoryset: LabeledMemoryset | None
45
43
  classification_model: ClassificationModel | None
46
- agent_output: BootstrapClassificationModelResult | None
44
+ agent_output: BootstrapLabeledMemoryDataResult | None
47
45
 
48
46
  def __init__(self, metadata: BootstrapClassificationModelMeta):
49
47
  self.datasource = Datasource.open(metadata["datasource_meta"]["id"])
orca_sdk/client.py CHANGED
@@ -553,16 +553,7 @@ class PredictiveModelUpdate(TypedDict):
553
553
 
554
554
 
555
555
  PretrainedEmbeddingModelName = Literal[
556
- "CLIP_BASE",
557
- "GTE_BASE",
558
- "CDE_SMALL",
559
- "DISTILBERT",
560
- "GTE_SMALL",
561
- "MXBAI_LARGE",
562
- "E5_LARGE",
563
- "QWEN2_1_5B",
564
- "BGE_BASE",
565
- "GIST_LARGE",
556
+ "CLIP_BASE", "GTE_BASE", "CDE_SMALL", "DISTILBERT", "GTE_SMALL", "MXBAI_LARGE", "E5_LARGE", "BGE_BASE", "GIST_LARGE"
566
557
  ]
567
558
 
568
559
 
@@ -1173,7 +1164,14 @@ class BootstrapClassificationModelRequest(TypedDict):
1173
1164
  num_examples_per_label: NotRequired[int]
1174
1165
 
1175
1166
 
1176
- class BootstrapClassificationModelResult(TypedDict):
1167
+ class BootstrapLabeledMemoryDataInput(TypedDict):
1168
+ model_description: str
1169
+ label_names: list[str]
1170
+ initial_examples: NotRequired[list[LabeledExample]]
1171
+ num_examples_per_label: NotRequired[int]
1172
+
1173
+
1174
+ class BootstrapLabeledMemoryDataResult(TypedDict):
1177
1175
  model_description: str
1178
1176
  label_names: list[str]
1179
1177
  model_name: str
@@ -1678,7 +1676,7 @@ class BootstrapClassificationModelMeta(TypedDict):
1678
1676
  datasource_meta: DatasourceMetadata
1679
1677
  memoryset_meta: MemorysetMetadata
1680
1678
  model_meta: ClassificationModelMetadata
1681
- agent_output: BootstrapClassificationModelResult
1679
+ agent_output: BootstrapLabeledMemoryDataResult
1682
1680
 
1683
1681
 
1684
1682
  class BootstrapClassificationModelResponse(TypedDict):
@@ -2568,7 +2566,7 @@ class OrcaClient(Client):
2568
2566
  timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
2569
2567
  extensions: RequestExtensions | None = None,
2570
2568
  ) -> BootstrapClassificationModelResponse:
2571
- """Get the status of a bootstrap classification model job"""
2569
+ """Get the status of a bootstrap labeled memory data job"""
2572
2570
  pass
2573
2571
 
2574
2572
  def GET(
@@ -3290,6 +3288,32 @@ class OrcaClient(Client):
3290
3288
  """Get row count from a specific datasource with optional filtering."""
3291
3289
  pass
3292
3290
 
3291
+ @overload
3292
+ def POST(
3293
+ self,
3294
+ path: Literal["/datasource/bootstrap_memory_data"],
3295
+ *,
3296
+ params: None = None,
3297
+ json: BootstrapLabeledMemoryDataInput,
3298
+ data: None = None,
3299
+ files: None = None,
3300
+ content: None = None,
3301
+ parse_as: Literal["json"] = "json",
3302
+ headers: HeaderTypes | None = None,
3303
+ cookies: CookieTypes | None = None,
3304
+ auth: AuthTypes | UseClientDefault = USE_CLIENT_DEFAULT,
3305
+ follow_redirects: bool | UseClientDefault = USE_CLIENT_DEFAULT,
3306
+ timeout: TimeoutTypes | UseClientDefault = USE_CLIENT_DEFAULT,
3307
+ extensions: RequestExtensions | None = None,
3308
+ ) -> BootstrapLabeledMemoryDataResult:
3309
+ """
3310
+ Bootstrap memory data using an AI agent.
3311
+
3312
+ This endpoint uses the bootstrap labeled memory data agent to generate
3313
+ high-quality, diverse training examples for a classification model.
3314
+ """
3315
+ pass
3316
+
3293
3317
  @overload
3294
3318
  def POST(
3295
3319
  self,
@@ -3538,7 +3562,7 @@ class OrcaClient(Client):
3538
3562
  """
3539
3563
  Bootstrap a classification model by creating a memoryset with generated memories and a classification model.
3540
3564
 
3541
- This endpoint uses the bootstrap_classification_model agent to generate:
3565
+ This endpoint uses the bootstrap_labeled_memory_data agent to generate:
3542
3566
  1. Memoryset configuration with appropriate settings
3543
3567
  2. Model configuration with optimal parameters
3544
3568
  3. High-quality training memories for each label
@@ -340,7 +340,6 @@ class PretrainedEmbeddingModel(EmbeddingModelBase):
340
340
  - **`E5_LARGE`**: E5-Large instruction-tuned embedding model from Hugging Face ([intfloat/multilingual-e5-large-instruct](https://huggingface.co/intfloat/multilingual-e5-large-instruct))
341
341
  - **`GIST_LARGE`**: GIST-Large embedding model from Hugging Face ([avsolatorio/GIST-large-Embedding-v0](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0))
342
342
  - **`MXBAI_LARGE`**: Mixbreas's Large embedding model from Hugging Face ([mixedbread-ai/mxbai-embed-large-v1](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1))
343
- - **`QWEN2_1_5B`**: Alibaba's Qwen2-1.5B instruction-tuned embedding model from Hugging Face ([Alibaba-NLP/gte-Qwen2-1.5B-instruct](https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct))
344
343
  - **`BGE_BASE`**: BAAI's BGE-Base instruction-tuned embedding model from Hugging Face ([BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5))
345
344
 
346
345
  **Instruction Support:**
@@ -373,7 +372,6 @@ class PretrainedEmbeddingModel(EmbeddingModelBase):
373
372
  E5_LARGE = _ModelDescriptor("E5_LARGE")
374
373
  GIST_LARGE = _ModelDescriptor("GIST_LARGE")
375
374
  MXBAI_LARGE = _ModelDescriptor("MXBAI_LARGE")
376
- QWEN2_1_5B = _ModelDescriptor("QWEN2_1_5B")
377
375
  BGE_BASE = _ModelDescriptor("BGE_BASE")
378
376
 
379
377
  name: PretrainedEmbeddingModelName
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: orca_sdk
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: SDK for interacting with Orca Services
5
5
  License-Expression: Apache-2.0
6
6
  Author: Orca DB Inc.
@@ -1,6 +1,6 @@
1
1
  orca_sdk/__init__.py,sha256=xyjNwkLQXaX8A-UYgGwYDjv2btOXArT_yiMTfmW7KA8,1003
2
2
  orca_sdk/_shared/__init__.py,sha256=3Kt0Hu3QLI5FEp9nqGTxqAm3hAoBJKcagfaGQZ-lbJQ,223
3
- orca_sdk/_shared/metrics.py,sha256=m-d2-AsHI12REWev1WeniOcQRhF5cXxNjUgC4skM2o4,19412
3
+ orca_sdk/_shared/metrics.py,sha256=a_FdsPGDjR3CMOEBaEhEBqMfWUg7sqz9Jeh26XzAeg0,19756
4
4
  orca_sdk/_shared/metrics_test.py,sha256=n7eEAT8e6RqbI94ftEDljTBzOuh-YkFpXfF3DOoZA10,12905
5
5
  orca_sdk/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  orca_sdk/_utils/analysis_ui.py,sha256=nT-M_YcNRCVPQzvuqYNFKnNHhYkADYBvq1GlIUePrWw,9232
@@ -17,16 +17,16 @@ orca_sdk/_utils/prediction_result_ui.py,sha256=Ur_FY7dz3oWNmtPiP3Wl3yRlEMgK8q9Uf
17
17
  orca_sdk/_utils/tqdm_file_reader.py,sha256=Lw7Cg1UgNuRUoN6jjqZb-IlV00H-kbRcrZLdudr1GxE,324
18
18
  orca_sdk/_utils/value_parser.py,sha256=c3qMABCCDQcIjn9N1orYYnlRwDW9JWdGwW_2TDZPLdI,1286
19
19
  orca_sdk/_utils/value_parser_test.py,sha256=OybsiC-Obi32RRi9NIuwrVBRAnlyPMV1xVAaevSrb7M,1079
20
- orca_sdk/async_client.py,sha256=V16wWwdJFvAzmKd5zHsFo3ny_-7B34UrONl80bZzKKs,131628
21
- orca_sdk/classification_model.py,sha256=90r-PfJ3ZW7ZJ7jrZPTbhuXRds46f7Ooe8FTp-iUJgg,46350
20
+ orca_sdk/async_client.py,sha256=y2D3fPQZmbwmtYWAk5acJ45atSZen9MNfjP2tKjpP6Q,132737
21
+ orca_sdk/classification_model.py,sha256=4AcQvAm0EN7w0qx0WpgEs7VUoIIPTqIVE86wtkaIAYs,46249
22
22
  orca_sdk/classification_model_test.py,sha256=vBn7KBb9-ACuJEdzW50n54Fn6Mh9iEYbn1197lE8-yI,36997
23
- orca_sdk/client.py,sha256=J3Od1sWO7YK2M5afcRNeJcjzNEgZ4zt6e7vLJdk6Nbs,130695
23
+ orca_sdk/client.py,sha256=oQd8Lm0agetLyAdVRP8IZqe6S5mjxhFSnbVHqhT7dmI,131798
24
24
  orca_sdk/conftest.py,sha256=0O1VY-SPKNAvi9fBLdY1RMnYVgZvMjP92y99bNAqqiw,12461
25
25
  orca_sdk/credentials.py,sha256=80_1r8n5jruEvN_E629SaRrRhKvF_NhWUEZyZzPXkqQ,6620
26
26
  orca_sdk/credentials_test.py,sha256=TLbXJMz3IlThvtSrHeLM7jRsKnrncA_ahOTpHg15Ei4,4089
27
27
  orca_sdk/datasource.py,sha256=6QaccghiyFEUSFcqnwjIJzpgIh9Id0snJk2EqViqPsU,22356
28
28
  orca_sdk/datasource_test.py,sha256=sCk3IcQJbDut5oN4Wf7PXhTxyMwalxMuCXJekSxy9wk,16665
29
- orca_sdk/embedding_model.py,sha256=bZhbNJBimWc9Ryklza3q9HS0MRWsiH5Lhn6p7pff0RI,28165
29
+ orca_sdk/embedding_model.py,sha256=4xxfo26b5X_YJtU8KyqoMmJQ6VgfHEcYftVSz-RfDng,27920
30
30
  orca_sdk/embedding_model_test.py,sha256=-NItbNb3tTVj5jAvSi3WjV3FP448q08lmT5iObg9vwA,8133
31
31
  orca_sdk/job.py,sha256=wHwVt-s7i-v8udhLGybB-90Kp4dwOLrY806bE4Tam5Q,13092
32
32
  orca_sdk/job_test.py,sha256=nRSWxd_1UIfrj9oMVvrXjt6OBkBpddYAjb2y6P-DTUg,4327
@@ -36,6 +36,6 @@ orca_sdk/regression_model.py,sha256=vXdY2Fbfc0MyECUR3fa_IR-nETPrDN7VFAdjvsgHPrs,
36
36
  orca_sdk/regression_model_test.py,sha256=DfWLkqxB835jjwM-sj1uxQ6Yz_ZBMnt8EHjdfnHsRnU,25103
37
37
  orca_sdk/telemetry.py,sha256=ZyCMiyyo_SchjadWZH55TlLrC4Ucq5S316NbW26LL4Y,27834
38
38
  orca_sdk/telemetry_test.py,sha256=eT66C5lFdNg-pQdo2I__BP7Tn5fTc9aTkVo9ZhWwhU0,5519
39
- orca_sdk-0.1.5.dist-info/METADATA,sha256=NsXoCiKQ-frqwZeydk_OzvK-QqD1_SnGdRuERXM1ILc,3659
40
- orca_sdk-0.1.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
41
- orca_sdk-0.1.5.dist-info/RECORD,,
39
+ orca_sdk-0.1.6.dist-info/METADATA,sha256=85QDZDP9Uxda4oZ3BMPP_kI5T4GPy1mFMYtWh1-nI54,3659
40
+ orca_sdk-0.1.6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
41
+ orca_sdk-0.1.6.dist-info/RECORD,,