openaivec 0.12.5__tar.gz → 0.13.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openaivec-0.12.5 → openaivec-0.13.0}/PKG-INFO +39 -16
- {openaivec-0.12.5 → openaivec-0.13.0}/README.md +38 -15
- openaivec-0.13.0/docs/api/proxy.md +102 -0
- openaivec-0.13.0/src/openaivec/embeddings.py +188 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/model.py +20 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/pandas_ext.py +455 -121
- openaivec-0.13.0/src/openaivec/provider.py +98 -0
- openaivec-0.13.0/src/openaivec/proxy.py +608 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/responses.py +175 -105
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/serialize.py +41 -33
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/spark.py +137 -88
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/__init__.py +3 -3
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/__init__.py +1 -1
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/table/__init__.py +1 -1
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/util.py +1 -69
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_embeddings.py +21 -20
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_pandas_ext.py +215 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_provider.py +183 -84
- openaivec-0.13.0/tests/test_proxy.py +581 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_responses.py +32 -12
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_serialize.py +64 -113
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_spark.py +23 -19
- openaivec-0.13.0/tests/test_util.py +41 -0
- openaivec-0.12.5/src/openaivec/embeddings.py +0 -172
- openaivec-0.12.5/src/openaivec/provider.py +0 -45
- openaivec-0.12.5/tests/test_util.py +0 -176
- {openaivec-0.12.5 → openaivec-0.13.0}/.env.example +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-mkdocs.yml +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-package.yml +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-test.yml +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-update.yml +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/.gitignore +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/CODE_OF_CONDUCT.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/LICENSE +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/SECURITY.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/SUPPORT.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/di.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/embeddings.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/pandas_ext.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/prompt.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/responses.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/spark.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/task.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/translation.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/util.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/index.md +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/docs/robots.txt +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/mkdocs.yml +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/pyproject.toml +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/__init__.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/di.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/log.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/prompt.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/__init__.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/customer_sentiment.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/inquiry_classification.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/inquiry_summary.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/intent_analysis.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/response_suggestion.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/urgency_analysis.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/dependency_parsing.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/keyword_extraction.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/morphological_analysis.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/named_entity_recognition.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/sentiment_analysis.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/translation.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/table/fillna.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/__init__.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_di.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_prompt.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_task.py +0 -0
- {openaivec-0.12.5 → openaivec-0.13.0}/uv.lock +0 -0
{openaivec-0.12.5 → openaivec-0.13.0}/PKG-INFO:

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openaivec
-Version: 0.12.5
+Version: 0.13.0
 Summary: Generative mutation for tabular calculation
 Project-URL: Homepage, https://microsoft.github.io/openaivec/
 Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -153,13 +153,14 @@ from openai import OpenAI
 from openaivec import BatchResponses
 
 # Initialize the batch client
-client = BatchResponses(
+client = BatchResponses.of(
     client=OpenAI(),
     model_name="gpt-4.1-mini",
-    system_message="Please answer only with 'xx family' and do not output anything else."
+    system_message="Please answer only with 'xx family' and do not output anything else.",
+    batch_size=32,
 )
 
-result = client.parse(["panda", "rabbit", "koala"]
+result = client.parse(["panda", "rabbit", "koala"])
 print(result) # Expected output: ['bear family', 'rabbit family', 'koala family']
 ```
 
@@ -170,10 +171,25 @@ print(result) # Expected output: ['bear family', 'rabbit family', 'koala family
 The easiest way to get started with your DataFrames:
 
 ```python
+import os
 import pandas as pd
 from openaivec import pandas_ext
 
-#
+# Authentication Option 1: Environment variables (automatic detection)
+# For OpenAI:
+os.environ["OPENAI_API_KEY"] = "your-api-key-here"
+# Or for Azure OpenAI:
+# os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-key"
+# os.environ["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.services.ai.azure.com"
+# os.environ["AZURE_OPENAI_API_VERSION"] = "2025-04-01-preview"
+
+# Authentication Option 2: Custom client (optional)
+# from openai import OpenAI, AsyncOpenAI
+# pandas_ext.use(OpenAI())
+# For async operations:
+# pandas_ext.use_async(AsyncOpenAI())
+
+# Configure model (optional - defaults to gpt-4.1-mini)
 pandas_ext.responses_model("gpt-4.1-mini")
 
 # Create your data
@@ -230,7 +246,7 @@ extracted_results = (results
 
 **Available Task Categories:**
 
-- **Text Analysis**: `nlp.SENTIMENT_ANALYSIS`, `nlp.
+- **Text Analysis**: `nlp.SENTIMENT_ANALYSIS`, `nlp.MULTILINGUAL_TRANSLATION`, `nlp.NAMED_ENTITY_RECOGNITION`, `nlp.KEYWORD_EXTRACTION`
 - **Content Classification**: `customer_support.INTENT_ANALYSIS`, `customer_support.URGENCY_ANALYSIS`, `customer_support.INQUIRY_CLASSIFICATION`
 
 **Benefits of Pre-configured Tasks:**
@@ -345,7 +361,7 @@ spark.udf.register(
 )
 
 # --- Register Token Counting UDF ---
-spark.udf.register("count_tokens", count_tokens_udf(
+spark.udf.register("count_tokens", count_tokens_udf())
 
 # --- Register UDFs with Pre-configured Tasks ---
 from openaivec.task import nlp, customer_support
@@ -393,16 +409,23 @@ FROM product_reviews;
 
 Example Output (structure might vary slightly):
 
-| id   | review_text                                                                     | brand      | translation                 | sentiment | sentiment_confidence | intent           | action_required
-| ---- | ----------------------------------------------------------------------------- | ---------- | --------------------------- | --------- | -------------------- | ---------------- |
-| 1001 | The new TechPhone X camera quality is amazing, Nexus Corp really outdid...
-| 1002 | Quantum Galaxy has great battery life but the price is too high for what...
-| 1003 | Zephyr mobile phone crashed twice today, very disappointed with this purchase | Zephyr | {en: ..., fr: ..., ja: ...} | negative | 0.88 | complaint | investigate_issue
+| id   | review_text                                                                     | brand      | translation                 | sentiment | sentiment_confidence | intent           | action_required    | embedding              | token_count |
+| ---- | ----------------------------------------------------------------------------- | ---------- | --------------------------- | --------- | -------------------- | ---------------- | ------------------ | ---------------------- | ----------- |
+| 1001 | The new TechPhone X camera quality is amazing, Nexus Corp really outdid...      | Nexus Corp | {en: ..., fr: ..., ja: ...} | positive  | 0.95                 | provide_feedback | acknowledge_review | [0.1, -0.2, ..., 0.5]  | 19          |
+| 1002 | Quantum Galaxy has great battery life but the price is too high for what...     | Quantum    | {en: ..., fr: ..., ja: ...} | mixed     | 0.78                 | provide_feedback | follow_up_pricing  | [-0.3, 0.1, ..., -0.1] | 16          |
+| 1003 | Zephyr mobile phone crashed twice today, very disappointed with this purchase   | Zephyr     | {en: ..., fr: ..., ja: ...} | negative  | 0.88                 | complaint        | investigate_issue  | [0.0, 0.4, ..., 0.2]   | 12          |
 
 ### Spark Performance Tuning
 
 When using openaivec with Spark, proper configuration of `batch_size` and `max_concurrency` is crucial for optimal performance:
 
+**Automatic Caching** (New):
+
+- **Duplicate Detection**: All AI-powered UDFs (`responses_udf`, `task_udf`, `embeddings_udf`) automatically cache duplicate inputs within each partition
+- **Cost Reduction**: Significantly reduces API calls and costs on datasets with repeated content
+- **Transparent**: Works automatically without code changes - your existing UDFs become more efficient
+- **Partition-Level**: Each partition maintains its own cache, optimal for distributed processing patterns
+
 **`batch_size`** (default: 128):
 
 - Controls how many rows are processed together in each API request within a partition
@@ -635,16 +658,16 @@ steps:
 
 ```python
 import os
-from pyspark.sql import SparkSession
 from openaivec.spark import responses_udf, embeddings_udf
 
-spark
+# In Microsoft Fabric, spark session is automatically available
+# spark = SparkSession.builder.getOrCreate()
 sc = spark.sparkContext
 
 # Configure Azure OpenAI authentication
 sc.environment["AZURE_OPENAI_API_KEY"] = "<your-api-key>"
-sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.
-sc.environment["AZURE_OPENAI_API_VERSION"] = "
+sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.services.ai.azure.com"
+sc.environment["AZURE_OPENAI_API_VERSION"] = "2025-04-01-preview"
 
 # Register UDFs
 spark.udf.register(
````
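For reference, the corrected quickstart from the hunks above assembles into this runnable snippet (requires `OPENAI_API_KEY` set in the environment; the model name, prompt, and expected output are taken verbatim from the README):

```python
from openai import OpenAI
from openaivec import BatchResponses

# 0.13.0 switches construction to the `.of` factory and exposes `batch_size`,
# which caps how many unique prompts go into a single API request.
client = BatchResponses.of(
    client=OpenAI(),
    model_name="gpt-4.1-mini",
    system_message="Please answer only with 'xx family' and do not output anything else.",
    batch_size=32,
)

result = client.parse(["panda", "rabbit", "koala"])
print(result)  # Expected output: ['bear family', 'rabbit family', 'koala family']
```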
{openaivec-0.12.5 → openaivec-0.13.0}/README.md:

The README.md hunks are the same changes as the PKG-INFO hunks above (PKG-INFO embeds the README), offset by the metadata header: `@@ -129,13 +129,14 @@`, `@@ -146,10 +147,25 @@`, `@@ -206,7 +222,7 @@`, `@@ -321,7 +337,7 @@`, `@@ -369,16 +385,23 @@`, and `@@ -611,16 +634,16 @@`.
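The "Automatic Caching" bullets in the Spark tuning section above describe per-partition de-duplication inside the UDFs. As an illustration only (this is not openaivec's implementation, just a minimal sketch of the pattern using PySpark's iterator-of-series pandas UDF, with a hypothetical `expensive_call` standing in for the API request), a cache scoped to the UDF closure survives across batches within one partition:

```python
from typing import Iterator

import pandas as pd
from pyspark.sql.functions import pandas_udf


def expensive_call(text: str) -> str:
    # Hypothetical stand-in for an API call; openaivec batches rows and
    # calls OpenAI here instead.
    return text.upper()


@pandas_udf("string")
def cached_udf(batches: Iterator[pd.Series]) -> Iterator[pd.Series]:
    cache: dict = {}  # lives for the lifetime of one partition
    for batch in batches:
        out = []
        for text in batch:
            if text not in cache:  # duplicate inputs are served from the cache
                cache[text] = expensive_call(text)
            out.append(cache[text])
        yield pd.Series(out)
```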
openaivec-0.13.0/docs/api/proxy.md (new file, +102 lines):

````markdown
# proxy

Batching proxies for order-preserving, cached batch mapping.

This module provides two helpers:

- BatchingMapProxy: thread-safe synchronous batching with caching and de-duplication.
- AsyncBatchingMapProxy: asyncio-friendly batching with optional concurrency limits.

Both proxies accept the mapping function as the second argument to map(). The function must:

- Accept a list of inputs and return a list of outputs in the same order.
- Be pure relative to a single call (side effects should be idempotent or safe).

## Synchronous usage (BatchingMapProxy)

```python
from typing import List
from openaivec.proxy import BatchingMapProxy

# Define your batch mapping function
def fetch_many(keys: List[int]) -> List[str]:
    # Example: echo values as strings
    return [f"val:{k}" for k in keys]

# Create proxy with an optional batch size hint
proxy = BatchingMapProxy[int, str](batch_size=3)

# Map items using the proxy. Duplicates are de-duplicated and order preserved.
items = [1, 2, 2, 3, 4, 4, 5]
outputs = proxy.map(items, fetch_many)
assert outputs == ["val:1", "val:2", "val:2", "val:3", "val:4", "val:4", "val:5"]

# Cache is reused across calls
outputs2 = proxy.map([5, 4, 3, 2, 1], fetch_many)
assert outputs2 == ["val:5", "val:4", "val:3", "val:2", "val:1"]
```

### Notes

- If `batch_size` is None or <= 0, all unique items are processed in a single call.
- Under concurrency, the proxy prevents duplicate work by coordinating in-flight keys.

## Asynchronous usage (AsyncBatchingMapProxy)

```python
import asyncio
from typing import List
from openaivec.proxy import AsyncBatchingMapProxy

# Define your async batch mapping function
async def fetch_many_async(keys: List[int]) -> List[str]:
    # Simulate I/O
    await asyncio.sleep(0.01)
    return [f"val:{k}" for k in keys]

# Create proxy with batch size and an optional concurrency cap for map_func calls
proxy = AsyncBatchingMapProxy[int, str](batch_size=3, max_concurrency=2)

async def main():
    items = [1, 2, 3, 4, 5]
    out = await proxy.map(items, fetch_many_async)
    assert out == ["val:1", "val:2", "val:3", "val:4", "val:5"]

    # Overlapping requests deduplicate work and share results via the cache
    r1 = proxy.map([1, 2, 3, 4], fetch_many_async)
    r2 = proxy.map([3, 4, 5], fetch_many_async)
    o1, o2 = await asyncio.gather(r1, r2)
    assert o1 == ["val:1", "val:2", "val:3", "val:4"]
    assert o2 == ["val:3", "val:4", "val:5"]

asyncio.run(main())
```

### Notes

- `max_concurrency` limits concurrent invocations of `map_func` across overlapping `map()` calls.
- The proxy rechecks the cache immediately before each batch call to avoid redundant work.

## API summary

```python
class BatchingMapProxy[S: Hashable, T]:
    batch_size: int | None

    def map(self, items: list[S], map_func: Callable[[list[S]], list[T]]) -> list[T]:
        ...

class AsyncBatchingMapProxy[S: Hashable, T]:
    batch_size: int | None
    max_concurrency: int

    async def map(self, items: list[S], map_func: Callable[[list[S]], Awaitable[list[T]]]) -> list[T]:
        ...
```

Implementation details:

- Inputs are de-duplicated with first-occurrence order preserved.
- Cache is filled atomically and shared across calls.
- In-flight keys are coordinated (threading.Event / asyncio.Event) to prevent duplicated computation.
- Errors from `map_func` propagate; in-flight keys are released to avoid deadlocks.
````
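The "Implementation details" bullets reduce to a small core. A minimal sketch of the order-preserving de-duplication they describe (omitting the caching, locking, and in-flight coordination the real proxies add):

```python
from typing import Callable, Hashable, List, TypeVar

S = TypeVar("S", bound=Hashable)
T = TypeVar("T")


def dedup_map(items: List[S], map_func: Callable[[List[S]], List[T]]) -> List[T]:
    # De-duplicate with first-occurrence order preserved.
    unique: List[S] = list(dict.fromkeys(items))
    # One call on the unique batch; results keyed back to their inputs.
    results = dict(zip(unique, map_func(unique)))
    # Re-expand to the original positions, duplicates included.
    return [results[item] for item in items]


assert dedup_map([1, 2, 2, 3], lambda ks: [f"val:{k}" for k in ks]) == ["val:1", "val:2", "val:2", "val:3"]
```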
openaivec-0.13.0/src/openaivec/embeddings.py (new file, +188 lines):

````python
from dataclasses import dataclass, field
from logging import Logger, getLogger
from typing import List

import numpy as np
from numpy.typing import NDArray
from openai import AsyncOpenAI, OpenAI, RateLimitError

from .log import observe
from .proxy import AsyncBatchingMapProxy, BatchingMapProxy
from .util import backoff, backoff_async

__all__ = [
    "BatchEmbeddings",
    "AsyncBatchEmbeddings",
]

_LOGGER: Logger = getLogger(__name__)


@dataclass(frozen=True)
class BatchEmbeddings:
    """Thin wrapper around the OpenAI embeddings endpoint (synchronous).

    Attributes:
        client (OpenAI): Configured OpenAI client.
        model_name (str): Model identifier (e.g., ``"text-embedding-3-small"``).
        cache (BatchingMapProxy[str, NDArray[np.float32]]): Batching proxy for ordered, cached mapping.
    """

    client: OpenAI
    model_name: str
    cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=128))

    @classmethod
    def of(cls, client: OpenAI, model_name: str, batch_size: int = 128) -> "BatchEmbeddings":
        """Factory constructor.

        Args:
            client (OpenAI): OpenAI client.
            model_name (str): Embeddings model name.
            batch_size (int, optional): Max unique inputs per API call. Defaults to 128.

        Returns:
            BatchEmbeddings: Configured instance backed by a batching proxy.
        """
        return cls(client=client, model_name=model_name, cache=BatchingMapProxy(batch_size=batch_size))

    @observe(_LOGGER)
    @backoff(exception=RateLimitError, scale=15, max_retries=8)
    def _embed_chunk(self, inputs: List[str]) -> List[NDArray[np.float32]]:
        """Embed one minibatch of strings.

        This private helper is the unit of work used by the map/parallel
        utilities. Exponential back-off is applied automatically when
        ``openai.RateLimitError`` is raised.

        Args:
            inputs (List[str]): Input strings to be embedded. Duplicates allowed.

        Returns:
            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
        """
        responses = self.client.embeddings.create(input=inputs, model=self.model_name)
        return [np.array(d.embedding, dtype=np.float32) for d in responses.data]

    @observe(_LOGGER)
    def create(self, inputs: List[str]) -> List[NDArray[np.float32]]:
        """Generate embeddings for inputs using cached, ordered batching.

        Args:
            inputs (List[str]): Input strings. Duplicates allowed.

        Returns:
            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
        """
        return self.cache.map(inputs, self._embed_chunk)


@dataclass(frozen=True)
class AsyncBatchEmbeddings:
    """Thin wrapper around the OpenAI embeddings endpoint (asynchronous).

    This class provides an asynchronous interface for generating embeddings using
    OpenAI models. It manages concurrency, handles rate limits automatically,
    and efficiently processes batches of inputs, including de-duplication.

    Example:
        ```python
        import asyncio
        import numpy as np
        from openai import AsyncOpenAI
        from openaivec import AsyncBatchEmbeddings

        # Assuming openai_async_client is an initialized AsyncOpenAI client
        openai_async_client = AsyncOpenAI()  # Replace with your actual client initialization

        embedder = AsyncBatchEmbeddings.of(
            client=openai_async_client,
            model_name="text-embedding-3-small",
            batch_size=128,
            max_concurrency=8,
        )
        texts = ["This is the first document.", "This is the second document.", "This is the first document."]

        # Asynchronous call
        async def main():
            embeddings = await embedder.create(texts)
            # embeddings will be a list of numpy arrays (float32)
            # The embedding for the third text will be identical to the first
            # due to automatic de-duplication.
            print(f"Generated {len(embeddings)} embeddings.")
            print(f"Shape of first embedding: {embeddings[0].shape}")
            assert np.array_equal(embeddings[0], embeddings[2])

        # Run the async function
        asyncio.run(main())
        ```

    Attributes:
        client (AsyncOpenAI): Configured OpenAI async client.
        model_name (str): Embeddings model name.
        cache (AsyncBatchingMapProxy[str, NDArray[np.float32]]): Async batching proxy.
    """

    client: AsyncOpenAI
    model_name: str
    cache: AsyncBatchingMapProxy[str, NDArray[np.float32]] = field(
        default_factory=lambda: AsyncBatchingMapProxy(batch_size=128, max_concurrency=8)
    )

    @classmethod
    def of(
        cls,
        client: AsyncOpenAI,
        model_name: str,
        batch_size: int = 128,
        max_concurrency: int = 8,
    ) -> "AsyncBatchEmbeddings":
        """Factory constructor.

        Args:
            client (AsyncOpenAI): OpenAI async client.
            model_name (str): Embeddings model name.
            batch_size (int, optional): Max unique inputs per API call. Defaults to 128.
            max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.

        Returns:
            AsyncBatchEmbeddings: Configured instance with an async batching proxy.
        """
        return cls(
            client=client,
            model_name=model_name,
            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
        )

    @observe(_LOGGER)
    @backoff_async(exception=RateLimitError, scale=15, max_retries=8)
    async def _embed_chunk(self, inputs: List[str]) -> List[NDArray[np.float32]]:
        """Embed one minibatch of strings asynchronously.

        This private helper handles the actual API call for a batch of inputs.
        Exponential back-off is applied automatically when ``openai.RateLimitError``
        is raised.

        Args:
            inputs (List[str]): Input strings to be embedded. Duplicates allowed.

        Returns:
            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.

        Raises:
            RateLimitError: Propagated if retries are exhausted.
        """
        responses = await self.client.embeddings.create(input=inputs, model=self.model_name)
        return [np.array(d.embedding, dtype=np.float32) for d in responses.data]

    @observe(_LOGGER)
    async def create(self, inputs: List[str]) -> List[NDArray[np.float32]]:
        """Generate embeddings for inputs using proxy batching (async).

        Args:
            inputs (List[str]): Input strings. Duplicates allowed.

        Returns:
            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
        """
        return await self.cache.map(inputs, self._embed_chunk)
````
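The module's docstring example covers only the async class; the synchronous counterpart mirrors it. A minimal usage sketch, assuming `OPENAI_API_KEY` is set in the environment:

```python
import numpy as np
from openai import OpenAI
from openaivec.embeddings import BatchEmbeddings

embedder = BatchEmbeddings.of(
    client=OpenAI(),
    model_name="text-embedding-3-small",
    batch_size=128,
)

texts = ["first document", "second document", "first document"]
vectors = embedder.create(texts)  # the duplicate is served from the proxy cache

print(len(vectors), vectors[0].shape)  # 3 float32 vectors, aligned to `texts`
assert np.array_equal(vectors[0], vectors[2])
```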
{openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/model.py:

````diff
@@ -65,3 +65,23 @@ class ResponsesModelName:
 @dataclass(frozen=True)
 class EmbeddingsModelName:
     value: str
+
+
+@dataclass(frozen=True)
+class OpenAIAPIKey:
+    value: str
+
+
+@dataclass(frozen=True)
+class AzureOpenAIAPIKey:
+    value: str
+
+
+@dataclass(frozen=True)
+class AzureOpenAIEndpoint:
+    value: str
+
+
+@dataclass(frozen=True)
+class AzureOpenAIAPIVersion:
+    value: str
````