openaivec-0.14.10.tar.gz → openaivec-0.14.13.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openaivec-0.14.10 → openaivec-0.14.13}/.github/copilot-instructions.md +18 -3
- {openaivec-0.14.10 → openaivec-0.14.13}/PKG-INFO +40 -16
- {openaivec-0.14.10 → openaivec-0.14.13}/README.md +39 -15
- {openaivec-0.14.10 → openaivec-0.14.13}/pyproject.toml +1 -0
- openaivec-0.14.13/pytest.ini +42 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_di.py +21 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_embeddings.py +17 -4
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_model.py +7 -12
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_prompt.py +3 -6
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_provider.py +8 -29
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_responses.py +39 -117
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_schema.py +27 -23
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/pandas_ext.py +356 -343
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/spark.py +253 -115
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/__init__.py +1 -1
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/customer_support/customer_sentiment.py +4 -9
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/customer_support/inquiry_classification.py +5 -8
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/customer_support/inquiry_summary.py +5 -6
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/customer_support/intent_analysis.py +5 -7
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/customer_support/response_suggestion.py +5 -8
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/customer_support/urgency_analysis.py +5 -8
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/nlp/dependency_parsing.py +1 -2
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/nlp/keyword_extraction.py +1 -2
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/nlp/morphological_analysis.py +1 -2
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/nlp/named_entity_recognition.py +1 -2
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/nlp/sentiment_analysis.py +1 -2
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/nlp/translation.py +1 -1
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/table/fillna.py +8 -3
- openaivec-0.14.13/tests/conftest.py +352 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_dynamic.py +51 -67
- openaivec-0.14.13/tests/test_embeddings.py +159 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_optimize.py +38 -73
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_pandas_ext.py +247 -281
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_prompt.py +14 -13
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_provider.py +93 -107
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_responses.py +72 -48
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_schema.py +107 -133
- openaivec-0.14.13/tests/test_serialize.py +265 -0
- openaivec-0.14.13/tests/test_serialize_pydantic_v2_compliance.py +117 -0
- openaivec-0.14.13/tests/test_spark.py +425 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_task.py +49 -54
- openaivec-0.14.13/tests/test_util.py +228 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/uv.lock +1245 -1229
- openaivec-0.14.10/tests/test_embeddings.py +0 -121
- openaivec-0.14.10/tests/test_serialize.py +0 -282
- openaivec-0.14.10/tests/test_serialize_pydantic_v2_compliance.py +0 -1045
- openaivec-0.14.10/tests/test_spark.py +0 -266
- openaivec-0.14.10/tests/test_util.py +0 -298
- {openaivec-0.14.10 → openaivec-0.14.13}/.env.example +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/.github/workflows/python-mkdocs.yml +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/.github/workflows/python-package.yml +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/.github/workflows/python-test.yml +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/.github/workflows/python-update.yml +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/.gitignore +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/CODE_OF_CONDUCT.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/LICENSE +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/SECURITY.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/SUPPORT.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/main.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/pandas_ext.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/spark.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/task.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/api/tasks/nlp/translation.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/index.md +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/docs/robots.txt +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/mkdocs.yml +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/__init__.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_dynamic.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_log.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_optimize.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_proxy.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_serialize.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/_util.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/customer_support/__init__.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/nlp/__init__.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/src/openaivec/task/table/__init__.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/__init__.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_di.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_proxy.py +0 -0
- {openaivec-0.14.10 → openaivec-0.14.13}/tests/test_proxy_suggester.py +0 -0
|
@@ -24,7 +24,10 @@ Entry points:
|
|
|
24
24
|
- Spark UDF builders in `spark.py`
|
|
25
25
|
- Structured tasks under `task/`
|
|
26
26
|
|
|
27
|
-
Azure note: Use deployment name as `model`.
|
|
27
|
+
Azure note: Use deployment name as `model`. Standard Azure OpenAI configuration uses:
|
|
28
|
+
- Base URL: `https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/`
|
|
29
|
+
- API Version: `"preview"`
|
|
30
|
+
Warn if base URL not v1. Behavior otherwise mirrors OpenAI.
|
|
28
31
|
|
|
29
32
|
---
|
|
30
33
|
|
|
@@ -137,7 +140,16 @@ Public exports (`__init__.py`): `BatchResponses`, `AsyncBatchResponses`, `BatchE
|
|
|
137
140
|
## 10. Provider / Azure Rules
|
|
138
141
|
|
|
139
142
|
- Auto-detect provider from env variables; deployment name = model for Azure.
|
|
140
|
-
-
|
|
143
|
+
- Standard Azure OpenAI configuration:
|
|
144
|
+
- Base URL: `https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/`
|
|
145
|
+
- API Version: `"preview"`
|
|
146
|
+
- Environment variables:
|
|
147
|
+
```bash
|
|
148
|
+
export AZURE_OPENAI_API_KEY="your-azure-key"
|
|
149
|
+
export AZURE_OPENAI_BASE_URL="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
|
|
150
|
+
export AZURE_OPENAI_API_VERSION="preview"
|
|
151
|
+
```
|
|
152
|
+
- Warn (don't fail) if Azure base URL not v1 format; still proceed.
|
|
141
153
|
- Keep code paths unified; avoid forking logic unless behavior diverges.
|
|
142
154
|
|
|
143
155
|
---
|
|
@@ -348,6 +360,9 @@ uv run mkdocs serve
|
|
|
348
360
|
Environment setup notes:
|
|
349
361
|
|
|
350
362
|
- Set `OPENAI_API_KEY` or Azure trio (`AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_BASE_URL`, `AZURE_OPENAI_API_VERSION`).
|
|
363
|
+
- Standard Azure OpenAI configuration:
|
|
364
|
+
- `AZURE_OPENAI_BASE_URL="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"`
|
|
365
|
+
- `AZURE_OPENAI_API_VERSION="preview"`
|
|
351
366
|
- Tests auto-skip live paths when credentials absent.
|
|
352
367
|
- Use separate shell profiles per provider if switching frequently.
|
|
353
|
-
- Azure canonical base URL
|
|
368
|
+
- Azure canonical base URL must end with `/openai/v1/` (e.g. `https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/`); non‑v1 forms emit a warning.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openaivec
|
|
3
|
-
Version: 0.14.10
|
|
3
|
+
Version: 0.14.13
|
|
4
4
|
Summary: Generative mutation for tabular calculation
|
|
5
5
|
Project-URL: Homepage, https://microsoft.github.io/openaivec/
|
|
6
6
|
Project-URL: Repository, https://github.com/microsoft/openaivec
|
|
@@ -334,26 +334,34 @@ Scale to enterprise datasets with distributed processing:
|
|
|
334
334
|
First, obtain a Spark session and configure authentication:
|
|
335
335
|
|
|
336
336
|
```python
|
|
337
|
-
import os
|
|
338
337
|
from pyspark.sql import SparkSession
|
|
338
|
+
from openaivec.spark import setup, setup_azure
|
|
339
339
|
|
|
340
340
|
spark = SparkSession.builder.getOrCreate()
|
|
341
|
-
sc = spark.sparkContext
|
|
342
341
|
|
|
343
|
-
# Configure authentication via SparkContext environment variables
|
|
344
342
|
# Option 1: Using OpenAI
|
|
345
|
-
|
|
343
|
+
setup(
|
|
344
|
+
spark,
|
|
345
|
+
api_key="your-openai-api-key",
|
|
346
|
+
responses_model_name="gpt-4.1-mini", # Optional: set default model
|
|
347
|
+
embeddings_model_name="text-embedding-3-small" # Optional: set default model
|
|
348
|
+
)
|
|
346
349
|
|
|
347
350
|
# Option 2: Using Azure OpenAI
|
|
348
|
-
#
|
|
349
|
-
#
|
|
350
|
-
#
|
|
351
|
+
# setup_azure(
|
|
352
|
+
# spark,
|
|
353
|
+
# api_key="your-azure-openai-api-key",
|
|
354
|
+
# base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
355
|
+
# api_version="preview",
|
|
356
|
+
# responses_model_name="my-gpt4-deployment", # Optional: set default deployment
|
|
357
|
+
# embeddings_model_name="my-embedding-deployment" # Optional: set default deployment
|
|
358
|
+
# )
|
|
351
359
|
```
|
|
352
360
|
|
|
353
361
|
Next, create and register UDFs using the provided functions:
|
|
354
362
|
|
|
355
363
|
```python
|
|
356
|
-
from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf
|
|
364
|
+
from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf, similarity_udf, parse_udf
|
|
357
365
|
from pydantic import BaseModel
|
|
358
366
|
|
|
359
367
|
# --- Register Responses UDF (String Output) ---
|
|
@@ -387,6 +395,9 @@ spark.udf.register(
|
|
|
387
395
|
# --- Register Token Counting UDF ---
|
|
388
396
|
spark.udf.register("count_tokens", count_tokens_udf())
|
|
389
397
|
|
|
398
|
+
# --- Register Similarity UDF ---
|
|
399
|
+
spark.udf.register("compute_similarity", similarity_udf())
|
|
400
|
+
|
|
390
401
|
# --- Register UDFs with Pre-configured Tasks ---
|
|
391
402
|
from openaivec.task import nlp, customer_support
|
|
392
403
|
|
|
@@ -414,6 +425,17 @@ spark.udf.register(
|
|
|
414
425
|
)
|
|
415
426
|
)
|
|
416
427
|
|
|
428
|
+
# --- Register Parse UDF (Dynamic Schema Inference) ---
|
|
429
|
+
spark.udf.register(
|
|
430
|
+
"parse_dynamic",
|
|
431
|
+
parse_udf(
|
|
432
|
+
instructions="Extract key entities and attributes from the text",
|
|
433
|
+
example_table_name="sample_texts", # Infer schema from examples
|
|
434
|
+
example_field_name="text",
|
|
435
|
+
max_examples=50
|
|
436
|
+
)
|
|
437
|
+
)
|
|
438
|
+
|
|
417
439
|
```
|
|
418
440
|
|
|
419
441
|
You can now use these UDFs in Spark SQL:
|
|
@@ -691,17 +713,19 @@ steps:
|
|
|
691
713
|
- In the notebook, import and use `openaivec.spark` functions as you normally would. For example:
|
|
692
714
|
|
|
693
715
|
```python
|
|
694
|
-
import
|
|
695
|
-
from openaivec.spark import responses_udf, embeddings_udf
|
|
716
|
+
from openaivec.spark import setup_azure, responses_udf, embeddings_udf
|
|
696
717
|
|
|
697
718
|
# In Microsoft Fabric, spark session is automatically available
|
|
698
719
|
# spark = SparkSession.builder.getOrCreate()
|
|
699
|
-
|
|
700
|
-
|
|
720
|
+
|
|
701
721
|
# Configure Azure OpenAI authentication
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
722
|
+
setup_azure(
|
|
723
|
+
spark,
|
|
724
|
+
api_key="<your-api-key>",
|
|
725
|
+
base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
726
|
+
api_version="preview",
|
|
727
|
+
responses_model_name="my-gpt4-deployment" # Your Azure deployment name
|
|
728
|
+
)
|
|
705
729
|
|
|
706
730
|
# Register UDFs
|
|
707
731
|
spark.udf.register(
|
|
@@ -308,26 +308,34 @@ Scale to enterprise datasets with distributed processing:
|
|
|
308
308
|
First, obtain a Spark session and configure authentication:
|
|
309
309
|
|
|
310
310
|
```python
|
|
311
|
-
import os
|
|
312
311
|
from pyspark.sql import SparkSession
|
|
312
|
+
from openaivec.spark import setup, setup_azure
|
|
313
313
|
|
|
314
314
|
spark = SparkSession.builder.getOrCreate()
|
|
315
|
-
sc = spark.sparkContext
|
|
316
315
|
|
|
317
|
-
# Configure authentication via SparkContext environment variables
|
|
318
316
|
# Option 1: Using OpenAI
|
|
319
|
-
|
|
317
|
+
setup(
|
|
318
|
+
spark,
|
|
319
|
+
api_key="your-openai-api-key",
|
|
320
|
+
responses_model_name="gpt-4.1-mini", # Optional: set default model
|
|
321
|
+
embeddings_model_name="text-embedding-3-small" # Optional: set default model
|
|
322
|
+
)
|
|
320
323
|
|
|
321
324
|
# Option 2: Using Azure OpenAI
|
|
322
|
-
#
|
|
323
|
-
#
|
|
324
|
-
#
|
|
325
|
+
# setup_azure(
|
|
326
|
+
# spark,
|
|
327
|
+
# api_key="your-azure-openai-api-key",
|
|
328
|
+
# base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
329
|
+
# api_version="preview",
|
|
330
|
+
# responses_model_name="my-gpt4-deployment", # Optional: set default deployment
|
|
331
|
+
# embeddings_model_name="my-embedding-deployment" # Optional: set default deployment
|
|
332
|
+
# )
|
|
325
333
|
```
|
|
326
334
|
|
|
327
335
|
Next, create and register UDFs using the provided functions:
|
|
328
336
|
|
|
329
337
|
```python
|
|
330
|
-
from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf
|
|
338
|
+
from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf, similarity_udf, parse_udf
|
|
331
339
|
from pydantic import BaseModel
|
|
332
340
|
|
|
333
341
|
# --- Register Responses UDF (String Output) ---
|
|
@@ -361,6 +369,9 @@ spark.udf.register(
|
|
|
361
369
|
# --- Register Token Counting UDF ---
|
|
362
370
|
spark.udf.register("count_tokens", count_tokens_udf())
|
|
363
371
|
|
|
372
|
+
# --- Register Similarity UDF ---
|
|
373
|
+
spark.udf.register("compute_similarity", similarity_udf())
|
|
374
|
+
|
|
364
375
|
# --- Register UDFs with Pre-configured Tasks ---
|
|
365
376
|
from openaivec.task import nlp, customer_support
|
|
366
377
|
|
|
@@ -388,6 +399,17 @@ spark.udf.register(
|
|
|
388
399
|
)
|
|
389
400
|
)
|
|
390
401
|
|
|
402
|
+
# --- Register Parse UDF (Dynamic Schema Inference) ---
|
|
403
|
+
spark.udf.register(
|
|
404
|
+
"parse_dynamic",
|
|
405
|
+
parse_udf(
|
|
406
|
+
instructions="Extract key entities and attributes from the text",
|
|
407
|
+
example_table_name="sample_texts", # Infer schema from examples
|
|
408
|
+
example_field_name="text",
|
|
409
|
+
max_examples=50
|
|
410
|
+
)
|
|
411
|
+
)
|
|
412
|
+
|
|
391
413
|
```
|
|
392
414
|
|
|
393
415
|
You can now use these UDFs in Spark SQL:
|
|
@@ -665,17 +687,19 @@ steps:
|
|
|
665
687
|
- In the notebook, import and use `openaivec.spark` functions as you normally would. For example:
|
|
666
688
|
|
|
667
689
|
```python
|
|
668
|
-
import
|
|
669
|
-
from openaivec.spark import responses_udf, embeddings_udf
|
|
690
|
+
from openaivec.spark import setup_azure, responses_udf, embeddings_udf
|
|
670
691
|
|
|
671
692
|
# In Microsoft Fabric, spark session is automatically available
|
|
672
693
|
# spark = SparkSession.builder.getOrCreate()
|
|
673
|
-
|
|
674
|
-
|
|
694
|
+
|
|
675
695
|
# Configure Azure OpenAI authentication
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
696
|
+
setup_azure(
|
|
697
|
+
spark,
|
|
698
|
+
api_key="<your-api-key>",
|
|
699
|
+
base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
700
|
+
api_version="preview",
|
|
701
|
+
responses_model_name="my-gpt4-deployment" # Your Azure deployment name
|
|
702
|
+
)
|
|
679
703
|
|
|
680
704
|
# Register UDFs
|
|
681
705
|
spark.udf.register(
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[tool:pytest]
|
|
2
|
+
# Pytest configuration for openaivec
|
|
3
|
+
|
|
4
|
+
# Test discovery
|
|
5
|
+
testpaths = tests
|
|
6
|
+
python_files = test_*.py
|
|
7
|
+
python_classes = Test*
|
|
8
|
+
python_functions = test_*
|
|
9
|
+
|
|
10
|
+
# Markers
|
|
11
|
+
markers =
|
|
12
|
+
slow: marks tests as slow (deselect with '-m "not slow"')
|
|
13
|
+
requires_api: marks tests as requiring OPENAI_API_KEY environment variable
|
|
14
|
+
asyncio: marks tests as async (handled by pytest-asyncio)
|
|
15
|
+
spark: marks tests as requiring Spark session
|
|
16
|
+
integration: marks tests as integration tests
|
|
17
|
+
|
|
18
|
+
# Output options
|
|
19
|
+
addopts =
|
|
20
|
+
--tb=short
|
|
21
|
+
--strict-markers
|
|
22
|
+
--strict-config
|
|
23
|
+
--disable-warnings
|
|
24
|
+
-ra
|
|
25
|
+
|
|
26
|
+
# Async configuration
|
|
27
|
+
asyncio_mode = auto
|
|
28
|
+
|
|
29
|
+
# Logging
|
|
30
|
+
log_cli = false
|
|
31
|
+
log_cli_level = INFO
|
|
32
|
+
log_cli_format = %(asctime)s [%(levelname)8s] %(name)s: %(message)s
|
|
33
|
+
log_cli_date_format = %Y-%m-%d %H:%M:%S
|
|
34
|
+
|
|
35
|
+
# Minimum version
|
|
36
|
+
minversion = 6.0
|
|
37
|
+
|
|
38
|
+
# Filter warnings
|
|
39
|
+
filterwarnings =
|
|
40
|
+
ignore::UserWarning:openai.*
|
|
41
|
+
ignore::DeprecationWarning:pandas.*
|
|
42
|
+
ignore::RuntimeWarning:numpy.*
|
|
@@ -303,3 +303,24 @@ class Container:
|
|
|
303
303
|
self._providers.clear()
|
|
304
304
|
self._instances.clear()
|
|
305
305
|
self._resolving.clear()
|
|
306
|
+
|
|
307
|
+
def clear_singletons(self) -> None:
|
|
308
|
+
"""Clear all cached singleton instances from the container.
|
|
309
|
+
|
|
310
|
+
Removes all cached singleton instances while keeping the registered
|
|
311
|
+
providers intact. After calling this method, the next resolve call
|
|
312
|
+
for any service will create a new instance using the provider function.
|
|
313
|
+
|
|
314
|
+
Example:
|
|
315
|
+
```python
|
|
316
|
+
container = Container()
|
|
317
|
+
container.register(str, lambda: "Hello")
|
|
318
|
+
instance1 = container.resolve(str)
|
|
319
|
+
container.clear_singletons()
|
|
320
|
+
instance2 = container.resolve(str)
|
|
321
|
+
print(instance1 is instance2)
|
|
322
|
+
# False - different instances after clearing singletons
|
|
323
|
+
```
|
|
324
|
+
"""
|
|
325
|
+
with self._lock:
|
|
326
|
+
self._instances.clear()
|
|
@@ -26,14 +26,16 @@ class BatchEmbeddings:
|
|
|
26
26
|
model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
|
|
27
27
|
(e.g., ``"text-embedding-3-small"``).
|
|
28
28
|
cache (BatchingMapProxy[str, NDArray[np.float32]]): Batching proxy for ordered, cached mapping.
|
|
29
|
+
api_kwargs (dict[str, Any]): Additional OpenAI API parameters stored at initialization.
|
|
29
30
|
"""
|
|
30
31
|
|
|
31
32
|
client: OpenAI
|
|
32
33
|
model_name: str
|
|
33
34
|
cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=None))
|
|
35
|
+
api_kwargs: dict[str, int | float | str | bool] = field(default_factory=dict)
|
|
34
36
|
|
|
35
37
|
@classmethod
|
|
36
|
-
def of(cls, client: OpenAI, model_name: str, batch_size: int | None = None) -> "BatchEmbeddings":
|
|
38
|
+
def of(cls, client: OpenAI, model_name: str, batch_size: int | None = None, **api_kwargs) -> "BatchEmbeddings":
|
|
37
39
|
"""Factory constructor.
|
|
38
40
|
|
|
39
41
|
Args:
|
|
@@ -41,11 +43,17 @@ class BatchEmbeddings:
|
|
|
41
43
|
model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
|
|
42
44
|
batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
|
|
43
45
|
(automatic batch size optimization). Set to a positive integer for fixed batch size.
|
|
46
|
+
**api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
|
|
44
47
|
|
|
45
48
|
Returns:
|
|
46
49
|
BatchEmbeddings: Configured instance backed by a batching proxy.
|
|
47
50
|
"""
|
|
48
|
-
return cls(
|
|
51
|
+
return cls(
|
|
52
|
+
client=client,
|
|
53
|
+
model_name=model_name,
|
|
54
|
+
cache=BatchingMapProxy(batch_size=batch_size),
|
|
55
|
+
api_kwargs=api_kwargs,
|
|
56
|
+
)
|
|
49
57
|
|
|
50
58
|
@observe(_LOGGER)
|
|
51
59
|
@backoff(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
|
|
@@ -62,7 +70,7 @@ class BatchEmbeddings:
|
|
|
62
70
|
Returns:
|
|
63
71
|
list[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
|
|
64
72
|
"""
|
|
65
|
-
responses = self.client.embeddings.create(input=inputs, model=self.model_name)
|
|
73
|
+
responses = self.client.embeddings.create(input=inputs, model=self.model_name, **self.api_kwargs)
|
|
66
74
|
return [np.array(d.embedding, dtype=np.float32) for d in responses.data]
|
|
67
75
|
|
|
68
76
|
@observe(_LOGGER)
|
|
@@ -122,6 +130,7 @@ class AsyncBatchEmbeddings:
|
|
|
122
130
|
client (AsyncOpenAI): Configured OpenAI async client.
|
|
123
131
|
model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
|
|
124
132
|
cache (AsyncBatchingMapProxy[str, NDArray[np.float32]]): Async batching proxy.
|
|
133
|
+
api_kwargs (dict): Additional OpenAI API parameters stored at initialization.
|
|
125
134
|
"""
|
|
126
135
|
|
|
127
136
|
client: AsyncOpenAI
|
|
@@ -129,6 +138,7 @@ class AsyncBatchEmbeddings:
|
|
|
129
138
|
cache: AsyncBatchingMapProxy[str, NDArray[np.float32]] = field(
|
|
130
139
|
default_factory=lambda: AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
|
|
131
140
|
)
|
|
141
|
+
api_kwargs: dict[str, int | float | str | bool] = field(default_factory=dict)
|
|
132
142
|
|
|
133
143
|
@classmethod
|
|
134
144
|
def of(
|
|
@@ -137,6 +147,7 @@ class AsyncBatchEmbeddings:
|
|
|
137
147
|
model_name: str,
|
|
138
148
|
batch_size: int | None = None,
|
|
139
149
|
max_concurrency: int = 8,
|
|
150
|
+
**api_kwargs,
|
|
140
151
|
) -> "AsyncBatchEmbeddings":
|
|
141
152
|
"""Factory constructor.
|
|
142
153
|
|
|
@@ -146,6 +157,7 @@ class AsyncBatchEmbeddings:
|
|
|
146
157
|
batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
|
|
147
158
|
(automatic batch size optimization). Set to a positive integer for fixed batch size.
|
|
148
159
|
max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.
|
|
160
|
+
**api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
|
|
149
161
|
|
|
150
162
|
Returns:
|
|
151
163
|
AsyncBatchEmbeddings: Configured instance with an async batching proxy.
|
|
@@ -154,6 +166,7 @@ class AsyncBatchEmbeddings:
|
|
|
154
166
|
client=client,
|
|
155
167
|
model_name=model_name,
|
|
156
168
|
cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
|
|
169
|
+
api_kwargs=api_kwargs,
|
|
157
170
|
)
|
|
158
171
|
|
|
159
172
|
@backoff_async(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
|
|
@@ -174,7 +187,7 @@ class AsyncBatchEmbeddings:
|
|
|
174
187
|
Raises:
|
|
175
188
|
RateLimitError: Propagated if retries are exhausted.
|
|
176
189
|
"""
|
|
177
|
-
responses = await self.client.embeddings.create(input=inputs, model=self.model_name)
|
|
190
|
+
responses = await self.client.embeddings.create(input=inputs, model=self.model_name, **self.api_kwargs)
|
|
178
191
|
return [np.array(d.embedding, dtype=np.float32) for d in responses.data]
|
|
179
192
|
|
|
180
193
|
@observe(_LOGGER)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
2
|
from typing import Generic, TypeVar
|
|
3
3
|
|
|
4
4
|
__all__ = [
|
|
@@ -14,7 +14,7 @@ class PreparedTask(Generic[ResponseFormat]):
|
|
|
14
14
|
|
|
15
15
|
This class encapsulates all the necessary parameters for executing a task,
|
|
16
16
|
including the instructions to be sent to the model, the expected response
|
|
17
|
-
format using Pydantic models, and
|
|
17
|
+
format using Pydantic models, and API parameters for controlling
|
|
18
18
|
the model's output behavior.
|
|
19
19
|
|
|
20
20
|
Attributes:
|
|
@@ -22,12 +22,9 @@ class PreparedTask(Generic[ResponseFormat]):
|
|
|
22
22
|
This should contain clear, specific directions for the task.
|
|
23
23
|
response_format (type[ResponseFormat]): A Pydantic model class or str type that defines the expected
|
|
24
24
|
structure of the response. Can be either a BaseModel subclass or str.
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
Defaults to
|
|
28
|
-
top_p (float): Controls diversity via nucleus sampling. Only tokens
|
|
29
|
-
comprising the top_p probability mass are considered.
|
|
30
|
-
Range: 0.0 to 1.0. Defaults to 1.0.
|
|
25
|
+
api_kwargs (dict[str, int | float | str | bool]): Additional OpenAI API parameters
|
|
26
|
+
such as temperature, top_p, frequency_penalty, presence_penalty, seed, etc.
|
|
27
|
+
Defaults to an empty dict.
|
|
31
28
|
|
|
32
29
|
Example:
|
|
33
30
|
Creating a custom task:
|
|
@@ -43,8 +40,7 @@ class PreparedTask(Generic[ResponseFormat]):
|
|
|
43
40
|
custom_task = PreparedTask(
|
|
44
41
|
instructions="Translate the following text to French:",
|
|
45
42
|
response_format=TranslationResponse,
|
|
46
|
-
temperature
|
|
47
|
-
top_p=0.9
|
|
43
|
+
api_kwargs={"temperature": 0.1, "top_p": 0.9}
|
|
48
44
|
)
|
|
49
45
|
```
|
|
50
46
|
|
|
@@ -55,8 +51,7 @@ class PreparedTask(Generic[ResponseFormat]):
|
|
|
55
51
|
|
|
56
52
|
instructions: str
|
|
57
53
|
response_format: type[ResponseFormat]
|
|
58
|
-
|
|
59
|
-
top_p: float = 1.0
|
|
54
|
+
api_kwargs: dict[str, int | float | str | bool] = field(default_factory=dict)
|
|
60
55
|
|
|
61
56
|
|
|
62
57
|
@dataclass(frozen=True)
|
|
@@ -445,8 +445,7 @@ class FewShotPromptBuilder:
|
|
|
445
445
|
self,
|
|
446
446
|
client: OpenAI | None = None,
|
|
447
447
|
model_name: str | None = None,
|
|
448
|
-
|
|
449
|
-
top_p: float | None = None,
|
|
448
|
+
**api_kwargs,
|
|
450
449
|
) -> "FewShotPromptBuilder":
|
|
451
450
|
"""Iteratively refine the prompt using an LLM.
|
|
452
451
|
|
|
@@ -460,8 +459,7 @@ class FewShotPromptBuilder:
|
|
|
460
459
|
Args:
|
|
461
460
|
client (OpenAI | None): Configured OpenAI client. If None, uses DI container with environment variables.
|
|
462
461
|
model_name (str | None): Model identifier. If None, uses default ``gpt-4.1-mini``.
|
|
463
|
-
|
|
464
|
-
top_p (float | None): Nucleus sampling parameter. If None, uses model default.
|
|
462
|
+
**api_kwargs: Additional OpenAI API parameters (temperature, top_p, etc.).
|
|
465
463
|
|
|
466
464
|
Returns:
|
|
467
465
|
FewShotPromptBuilder: The current builder instance containing the refined prompt and iteration history.
|
|
@@ -479,9 +477,8 @@ class FewShotPromptBuilder:
|
|
|
479
477
|
model=_model_name,
|
|
480
478
|
instructions=_PROMPT,
|
|
481
479
|
input=Request(prompt=self._prompt).model_dump_json(),
|
|
482
|
-
temperature=temperature,
|
|
483
|
-
top_p=top_p,
|
|
484
480
|
text_format=Response,
|
|
481
|
+
**api_kwargs,
|
|
485
482
|
)
|
|
486
483
|
|
|
487
484
|
# keep the original prompt
|
|
@@ -130,35 +130,9 @@ def provide_async_openai_client() -> AsyncOpenAI:
|
|
|
130
130
|
)
|
|
131
131
|
|
|
132
132
|
|
|
133
|
-
|
|
134
|
-
CONTAINER.register(
|
|
135
|
-
CONTAINER.register(
|
|
136
|
-
CONTAINER.register(AzureOpenAIAPIKey, lambda: AzureOpenAIAPIKey(os.getenv("AZURE_OPENAI_API_KEY")))
|
|
137
|
-
CONTAINER.register(AzureOpenAIBaseURL, lambda: AzureOpenAIBaseURL(os.getenv("AZURE_OPENAI_BASE_URL")))
|
|
138
|
-
CONTAINER.register(
|
|
139
|
-
cls=AzureOpenAIAPIVersion,
|
|
140
|
-
provider=lambda: AzureOpenAIAPIVersion(os.getenv("AZURE_OPENAI_API_VERSION", "preview")),
|
|
141
|
-
)
|
|
142
|
-
CONTAINER.register(OpenAI, provide_openai_client)
|
|
143
|
-
CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
|
|
144
|
-
CONTAINER.register(tiktoken.Encoding, lambda: tiktoken.get_encoding("o200k_base"))
|
|
145
|
-
CONTAINER.register(TextChunker, lambda: TextChunker(CONTAINER.resolve(tiktoken.Encoding)))
|
|
146
|
-
CONTAINER.register(
|
|
147
|
-
SchemaInferer,
|
|
148
|
-
lambda: SchemaInferer(
|
|
149
|
-
client=CONTAINER.resolve(OpenAI),
|
|
150
|
-
model_name=CONTAINER.resolve(ResponsesModelName).value,
|
|
151
|
-
),
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def reset_environment_registrations():
|
|
156
|
-
"""Reset environment variable related registrations in the container.
|
|
157
|
-
|
|
158
|
-
This function re-registers environment variable dependent services to pick up
|
|
159
|
-
current environment variable values. Useful for testing when environment
|
|
160
|
-
variables are changed after initial container setup.
|
|
161
|
-
"""
|
|
133
|
+
def set_default_registrations():
|
|
134
|
+
CONTAINER.register(ResponsesModelName, lambda: ResponsesModelName("gpt-4.1-mini"))
|
|
135
|
+
CONTAINER.register(EmbeddingsModelName, lambda: EmbeddingsModelName("text-embedding-3-small"))
|
|
162
136
|
CONTAINER.register(OpenAIAPIKey, lambda: OpenAIAPIKey(os.getenv("OPENAI_API_KEY")))
|
|
163
137
|
CONTAINER.register(AzureOpenAIAPIKey, lambda: AzureOpenAIAPIKey(os.getenv("AZURE_OPENAI_API_KEY")))
|
|
164
138
|
CONTAINER.register(AzureOpenAIBaseURL, lambda: AzureOpenAIBaseURL(os.getenv("AZURE_OPENAI_BASE_URL")))
|
|
@@ -168,6 +142,8 @@ def reset_environment_registrations():
|
|
|
168
142
|
)
|
|
169
143
|
CONTAINER.register(OpenAI, provide_openai_client)
|
|
170
144
|
CONTAINER.register(AsyncOpenAI, provide_async_openai_client)
|
|
145
|
+
CONTAINER.register(tiktoken.Encoding, lambda: tiktoken.get_encoding("o200k_base"))
|
|
146
|
+
CONTAINER.register(TextChunker, lambda: TextChunker(CONTAINER.resolve(tiktoken.Encoding)))
|
|
171
147
|
CONTAINER.register(
|
|
172
148
|
SchemaInferer,
|
|
173
149
|
lambda: SchemaInferer(
|
|
@@ -175,3 +151,6 @@ def reset_environment_registrations():
|
|
|
175
151
|
model_name=CONTAINER.resolve(ResponsesModelName).value,
|
|
176
152
|
),
|
|
177
153
|
)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
set_default_registrations()
|