openaivec 0.14.13-py3-none-any.whl → 0.14.14-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/spark.py +66 -17
- {openaivec-0.14.13.dist-info → openaivec-0.14.14.dist-info}/METADATA +1 -1
- {openaivec-0.14.13.dist-info → openaivec-0.14.14.dist-info}/RECORD +5 -5
- {openaivec-0.14.13.dist-info → openaivec-0.14.14.dist-info}/WHEEL +0 -0
- {openaivec-0.14.13.dist-info → openaivec-0.14.14.dist-info}/licenses/LICENSE +0 -0
openaivec/spark.py CHANGED

```diff
@@ -193,8 +193,6 @@ def setup(
     CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(responses_model_name))
 
     if embeddings_model_name:
-        from openaivec._model import EmbeddingsModelName
-
         CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(embeddings_model_name))
 
     CONTAINER.clear_singletons()
```
```diff
@@ -244,6 +242,50 @@ def setup_azure(
     CONTAINER.clear_singletons()
 
 
+def set_responses_model(model_name: str):
+    """Set the default model name for response generation in the DI container.
+
+    Args:
+        model_name (str): The model name to set as default for responses.
+    """
+    CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName(model_name))
+    CONTAINER.clear_singletons()
+
+
+def get_responses_model() -> str | None:
+    """Get the default model name for response generation from the DI container.
+
+    Returns:
+        str | None: The default model name for responses, or None if not set.
+    """
+    try:
+        return CONTAINER.resolve(ResponsesModelName).value
+    except Exception:
+        return None
+
+
+def set_embeddings_model(model_name: str):
+    """Set the default model name for embeddings in the DI container.
+
+    Args:
+        model_name (str): The model name to set as default for embeddings.
+    """
+    CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName(model_name))
+    CONTAINER.clear_singletons()
+
+
+def get_embeddings_model() -> str | None:
+    """Get the default model name for embeddings from the DI container.
+
+    Returns:
+        str | None: The default model name for embeddings, or None if not set.
+    """
+    try:
+        return CONTAINER.resolve(EmbeddingsModelName).value
+    except Exception:
+        return None
+
+
 def _python_type_to_spark(python_type):
     origin = get_origin(python_type)
 
```
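Taken together, the four new functions form a small get/set API over the DI container. A minimal usage sketch (assuming the helpers are imported from `openaivec.spark` as defined above, and that an OpenAI client has been configured separately via `setup()` or `setup_azure()`; the model names are illustrative):

```python
# Sketch of the new 0.14.14 helpers; model names are illustrative.
from openaivec.spark import (
    get_embeddings_model,
    get_responses_model,
    set_embeddings_model,
    set_responses_model,
)

# Override the container defaults. clear_singletons() inside each setter
# ensures later resolutions pick up the new registration.
set_responses_model("gpt-4.1-mini")
set_embeddings_model("text-embedding-3-small")

# Read the current defaults back; each getter returns None if nothing resolves.
assert get_responses_model() == "gpt-4.1-mini"
assert get_embeddings_model() == "text-embedding-3-small"
```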
```diff
@@ -322,7 +364,7 @@ def _safe_dump(x: BaseModel | None) -> dict:
 def responses_udf(
     instructions: str,
     response_format: type[ResponseFormat] = str,
-    model_name: str =
+    model_name: str | None = None,
     batch_size: int | None = None,
     max_concurrency: int = 8,
     **api_kwargs,
```
```diff
@@ -351,8 +393,9 @@ def responses_udf(
         instructions (str): The system prompt or instructions for the model.
         response_format (type[ResponseFormat]): The desired output format. Either `str` for plain text
             or a Pydantic `BaseModel` for structured JSON output. Defaults to `str`.
-        model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
-            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+        model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+            via ResponsesModelName if not provided.
         batch_size (int | None): Number of rows per async batch request within each partition.
             Larger values reduce API call overhead but increase memory usage.
             Defaults to None (automatic batch size optimization that dynamically
```
```diff
@@ -382,13 +425,15 @@ def responses_udf(
         - Consider your OpenAI tier limits: total_requests = max_concurrency × executors
         - Use Spark UI to optimize partition sizes relative to batch_size
     """
+    _model_name = model_name or CONTAINER.resolve(ResponsesModelName).value
+
     if issubclass(response_format, BaseModel):
         spark_schema = _pydantic_to_spark_schema(response_format)
         json_schema_string = serialize_base_model(response_format)
 
         @pandas_udf(returnType=spark_schema)  # type: ignore[call-overload]
         def structure_udf(col: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
-            pandas_ext.responses_model(
+            pandas_ext.responses_model(_model_name)
             response_format = deserialize_base_model(json_schema_string)
             cache = AsyncBatchingMapProxy[str, response_format](
                 batch_size=batch_size,
```
```diff
@@ -415,7 +460,7 @@ def responses_udf(
 
         @pandas_udf(returnType=StringType())  # type: ignore[call-overload]
         def string_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-            pandas_ext.responses_model(
+            pandas_ext.responses_model(_model_name)
             cache = AsyncBatchingMapProxy[str, str](
                 batch_size=batch_size,
                 max_concurrency=max_concurrency,
```
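The two hunks above show the new resolution order: `_model_name = model_name or CONTAINER.resolve(ResponsesModelName).value` runs in the factory body, so an explicit argument wins and the container default is resolved eagerly on the driver, before the pandas UDF is shipped to executors. A hedged sketch of the relaxed signature (the DataFrame, column name, and model names are illustrative, not from this diff):

```python
# Requires a configured SparkSession plus OpenAI credentials wired up via setup().
from pyspark.sql import SparkSession
from openaivec.spark import responses_udf, set_responses_model

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Spark schedules tasks per partition.",)], ["body"])

set_responses_model("gpt-4.1-mini")

# model_name omitted: falls back to the ResponsesModelName default set above.
summarize = responses_udf(instructions="Summarize the text in one sentence.")

# An explicit model_name still takes precedence over the container default.
summarize_nano = responses_udf(
    instructions="Summarize the text in one sentence.",
    model_name="gpt-4.1-nano",
)

df.withColumn("summary", summarize("body")).show(truncate=False)
```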
```diff
@@ -443,7 +488,7 @@ def responses_udf(
 
 def task_udf(
     task: PreparedTask[ResponseFormat],
-    model_name: str =
+    model_name: str | None = None,
     batch_size: int | None = None,
     max_concurrency: int = 8,
     **api_kwargs,
```
```diff
@@ -459,8 +504,9 @@ def task_udf(
     Args:
         task (PreparedTask): A predefined task configuration containing instructions,
             response format, and API parameters.
-        model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
-            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+        model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+            via ResponsesModelName if not provided.
         batch_size (int | None): Number of rows per async batch request within each partition.
             Larger values reduce API call overhead but increase memory usage.
             Defaults to None (automatic batch size optimization that dynamically
```
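`task_udf` follows the same fallback; a short sketch under the same assumptions (`my_task` is a placeholder for an existing `PreparedTask` instance, whose construction this diff does not show):

```python
from openaivec.spark import task_udf

classify = task_udf(my_task)  # model_name omitted -> ResponsesModelName default
classify_pinned = task_udf(my_task, model_name="gpt-4.1-mini")  # explicit wins
```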
```diff
@@ -550,7 +596,7 @@ def parse_udf(
     example_table_name: str | None = None,
     example_field_name: str | None = None,
     max_examples: int = 100,
-    model_name: str =
+    model_name: str | None = None,
     batch_size: int | None = None,
     max_concurrency: int = 8,
     **api_kwargs,
```
```diff
@@ -574,8 +620,9 @@ def parse_udf(
             If provided, `example_table_name` must also be specified.
         max_examples (int): Maximum number of examples to retrieve for schema inference.
             Defaults to 100.
-        model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
-            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+        model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-gpt4-deployment").
+            For OpenAI, use the model name (e.g., "gpt-4.1-mini"). Defaults to configured model in DI container
+            via ResponsesModelName if not provided.
         batch_size (int | None): Number of rows per async batch request within each partition.
             Larger values reduce API call overhead but increase memory usage.
             Defaults to None (automatic batch size optimization that dynamically
```
```diff
@@ -622,7 +669,7 @@ def parse_udf(
 
 
 def embeddings_udf(
-    model_name: str =
+    model_name: str | None = None,
     batch_size: int | None = None,
     max_concurrency: int = 8,
     **api_kwargs,
```
```diff
@@ -648,9 +695,9 @@ def embeddings_udf(
         sc.environment["AZURE_OPENAI_API_VERSION"] = "preview"
 
     Args:
-        model_name (str): For Azure OpenAI, use your deployment name (e.g., "my-embedding-deployment").
+        model_name (str | None): For Azure OpenAI, use your deployment name (e.g., "my-embedding-deployment").
             For OpenAI, use the model name (e.g., "text-embedding-3-small").
-            Defaults to configured model in DI container.
+            Defaults to configured model in DI container via EmbeddingsModelName if not provided.
         batch_size (int | None): Number of rows per async batch request within each partition.
             Larger values reduce API call overhead but increase memory usage.
             Defaults to None (automatic batch size optimization that dynamically
```
```diff
@@ -678,9 +725,11 @@ def embeddings_udf(
         - Use larger batch_size for embeddings compared to response generation
     """
 
+    _model_name = model_name or CONTAINER.resolve(EmbeddingsModelName).value
+
     @pandas_udf(returnType=ArrayType(FloatType()))  # type: ignore[call-overload,misc]
     def _embeddings_udf(col: Iterator[pd.Series]) -> Iterator[pd.Series]:
-        pandas_ext.embeddings_model(
+        pandas_ext.embeddings_model(_model_name)
         cache = AsyncBatchingMapProxy[str, np.ndarray](
             batch_size=batch_size,
             max_concurrency=max_concurrency,
```
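The embeddings path mirrors the responses path, resolving `EmbeddingsModelName` instead of `ResponsesModelName`. A sketch under the same illustrative assumptions (`df` with a string column "body" as before):

```python
from openaivec.spark import embeddings_udf, set_embeddings_model

set_embeddings_model("text-embedding-3-small")

# model_name omitted: the UDF resolves the container default on the driver
# and returns an array<float> embedding per input row.
embed = embeddings_udf(batch_size=256)
df = df.withColumn("embedding", embed("body"))
```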
{openaivec-0.14.13.dist-info → openaivec-0.14.14.dist-info}/RECORD RENAMED

```diff
@@ -13,7 +13,7 @@ openaivec/_schema.py,sha256=iOeR5J_ihZRDZtzmqvOK1ZtInKcx4OnoR38DB3VmmQw,15666
 openaivec/_serialize.py,sha256=u2Om94Sc_QgJkTlW2BAGw8wd6gYDhc6IRqvS-qevFSs,8399
 openaivec/_util.py,sha256=XfueAycVCQvgRLS7wF7e306b53lebORvZOBzbQjy4vE,6438
 openaivec/pandas_ext.py,sha256=r2jpFqDnWcQYK3pMv5hCtOStOMltccDyLkpprLmIOls,85715
-openaivec/spark.py,sha256=
+openaivec/spark.py,sha256=5-89uy2K-23Z_j1aRa84Gvl8DV0lusnkRI1zxuFeOEA,34020
 openaivec/task/__init__.py,sha256=RkYIKrcE83M_9Um9cSMkeGzL9kPRAovajfRvr31YxLE,6178
 openaivec/task/customer_support/__init__.py,sha256=KWfGyXPdZyfGdRH17x7hPpJJ1N2EP9PPhZx0fvBAwSI,884
 openaivec/task/customer_support/customer_sentiment.py,sha256=d8spZUtImjePK0xWGvIW98ghbdyOZ0KEZmaUpG8QB7M,7532
```
```diff
@@ -31,7 +31,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=u-zpqAaQYcr7I3mqMv_CTJXkfxtoLft3
 openaivec/task/nlp/translation.py,sha256=kgWj2oN8pUId3vuHTJNx636gB49AGEKXWICA_XJgE_0,6628
 openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
 openaivec/task/table/fillna.py,sha256=zL6m5hGD4kamV7qHETnn__B59wIY540Ks0EzNgUJgdI,6888
-openaivec-0.14.
-openaivec-0.14.
-openaivec-0.14.
-openaivec-0.14.
+openaivec-0.14.14.dist-info/METADATA,sha256=SlUl_cvN1l-4ZxO5-g8jXxCupez29wDeRICq0c6qH3k,28216
+openaivec-0.14.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+openaivec-0.14.14.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
+openaivec-0.14.14.dist-info/RECORD,,
```
{openaivec-0.14.13.dist-info → openaivec-0.14.14.dist-info}/WHEEL: file without changes
{openaivec-0.14.13.dist-info → openaivec-0.14.14.dist-info}/licenses/LICENSE: file without changes