openaivec 0.12.5__tar.gz → 0.13.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {openaivec-0.12.5 → openaivec-0.13.0}/PKG-INFO +39 -16
  2. {openaivec-0.12.5 → openaivec-0.13.0}/README.md +38 -15
  3. openaivec-0.13.0/docs/api/proxy.md +102 -0
  4. openaivec-0.13.0/src/openaivec/embeddings.py +188 -0
  5. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/model.py +20 -0
  6. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/pandas_ext.py +455 -121
  7. openaivec-0.13.0/src/openaivec/provider.py +98 -0
  8. openaivec-0.13.0/src/openaivec/proxy.py +608 -0
  9. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/responses.py +175 -105
  10. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/serialize.py +41 -33
  11. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/spark.py +137 -88
  12. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/__init__.py +3 -3
  13. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/__init__.py +1 -1
  14. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/table/__init__.py +1 -1
  15. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/util.py +1 -69
  16. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_embeddings.py +21 -20
  17. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_pandas_ext.py +215 -0
  18. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_provider.py +183 -84
  19. openaivec-0.13.0/tests/test_proxy.py +581 -0
  20. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_responses.py +32 -12
  21. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_serialize.py +64 -113
  22. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_spark.py +23 -19
  23. openaivec-0.13.0/tests/test_util.py +41 -0
  24. openaivec-0.12.5/src/openaivec/embeddings.py +0 -172
  25. openaivec-0.12.5/src/openaivec/provider.py +0 -45
  26. openaivec-0.12.5/tests/test_util.py +0 -176
  27. {openaivec-0.12.5 → openaivec-0.13.0}/.env.example +0 -0
  28. {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-mkdocs.yml +0 -0
  29. {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-package.yml +0 -0
  30. {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-test.yml +0 -0
  31. {openaivec-0.12.5 → openaivec-0.13.0}/.github/workflows/python-update.yml +0 -0
  32. {openaivec-0.12.5 → openaivec-0.13.0}/.gitignore +0 -0
  33. {openaivec-0.12.5 → openaivec-0.13.0}/CODE_OF_CONDUCT.md +0 -0
  34. {openaivec-0.12.5 → openaivec-0.13.0}/LICENSE +0 -0
  35. {openaivec-0.12.5 → openaivec-0.13.0}/SECURITY.md +0 -0
  36. {openaivec-0.12.5 → openaivec-0.13.0}/SUPPORT.md +0 -0
  37. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/di.md +0 -0
  38. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/embeddings.md +0 -0
  39. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/pandas_ext.md +0 -0
  40. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/prompt.md +0 -0
  41. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/responses.md +0 -0
  42. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/spark.md +0 -0
  43. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/task.md +0 -0
  44. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
  45. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
  46. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
  47. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
  48. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
  49. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
  50. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
  51. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
  52. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
  53. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
  54. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
  55. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/tasks/nlp/translation.md +0 -0
  56. {openaivec-0.12.5 → openaivec-0.13.0}/docs/api/util.md +0 -0
  57. {openaivec-0.12.5 → openaivec-0.13.0}/docs/index.md +0 -0
  58. {openaivec-0.12.5 → openaivec-0.13.0}/docs/robots.txt +0 -0
  59. {openaivec-0.12.5 → openaivec-0.13.0}/mkdocs.yml +0 -0
  60. {openaivec-0.12.5 → openaivec-0.13.0}/pyproject.toml +0 -0
  61. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/__init__.py +0 -0
  62. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/di.py +0 -0
  63. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/log.py +0 -0
  64. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/prompt.py +0 -0
  65. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/__init__.py +0 -0
  66. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/customer_sentiment.py +0 -0
  67. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/inquiry_classification.py +0 -0
  68. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/inquiry_summary.py +0 -0
  69. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/intent_analysis.py +0 -0
  70. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/response_suggestion.py +0 -0
  71. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/customer_support/urgency_analysis.py +0 -0
  72. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/dependency_parsing.py +0 -0
  73. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/keyword_extraction.py +0 -0
  74. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/morphological_analysis.py +0 -0
  75. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/named_entity_recognition.py +0 -0
  76. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/sentiment_analysis.py +0 -0
  77. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/nlp/translation.py +0 -0
  78. {openaivec-0.12.5 → openaivec-0.13.0}/src/openaivec/task/table/fillna.py +0 -0
  79. {openaivec-0.12.5 → openaivec-0.13.0}/tests/__init__.py +0 -0
  80. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_di.py +0 -0
  81. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_prompt.py +0 -0
  82. {openaivec-0.12.5 → openaivec-0.13.0}/tests/test_task.py +0 -0
  83. {openaivec-0.12.5 → openaivec-0.13.0}/uv.lock +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openaivec
-Version: 0.12.5
+Version: 0.13.0
 Summary: Generative mutation for tabular calculation
 Project-URL: Homepage, https://microsoft.github.io/openaivec/
 Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -153,13 +153,14 @@ from openai import OpenAI
 from openaivec import BatchResponses
 
 # Initialize the batch client
-client = BatchResponses(
+client = BatchResponses.of(
     client=OpenAI(),
     model_name="gpt-4.1-mini",
-    system_message="Please answer only with 'xx family' and do not output anything else."
+    system_message="Please answer only with 'xx family' and do not output anything else.",
+    batch_size=32,
 )
 
-result = client.parse(["panda", "rabbit", "koala"], batch_size=32)
+result = client.parse(["panda", "rabbit", "koala"])
 print(result) # Expected output: ['bear family', 'rabbit family', 'koala family']
 ```
 
@@ -170,10 +171,25 @@ print(result) # Expected output: ['bear family', 'rabbit family', 'koala family
 The easiest way to get started with your DataFrames:
 
 ```python
+import os
 import pandas as pd
 from openaivec import pandas_ext
 
-# Setup (optional - uses OPENAI_API_KEY environment variable by default)
+# Authentication Option 1: Environment variables (automatic detection)
+# For OpenAI:
+os.environ["OPENAI_API_KEY"] = "your-api-key-here"
+# Or for Azure OpenAI:
+# os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-key"
+# os.environ["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.services.ai.azure.com"
+# os.environ["AZURE_OPENAI_API_VERSION"] = "2025-04-01-preview"
+
+# Authentication Option 2: Custom client (optional)
+# from openai import OpenAI, AsyncOpenAI
+# pandas_ext.use(OpenAI())
+# For async operations:
+# pandas_ext.use_async(AsyncOpenAI())
+
+# Configure model (optional - defaults to gpt-4.1-mini)
 pandas_ext.responses_model("gpt-4.1-mini")
 
 # Create your data
@@ -230,7 +246,7 @@ extracted_results = (results
 
 **Available Task Categories:**
 
-- **Text Analysis**: `nlp.SENTIMENT_ANALYSIS`, `nlp.TRANSLATION`, `nlp.NAMED_ENTITY_RECOGNITION`, `nlp.KEYWORD_EXTRACTION`
+- **Text Analysis**: `nlp.SENTIMENT_ANALYSIS`, `nlp.MULTILINGUAL_TRANSLATION`, `nlp.NAMED_ENTITY_RECOGNITION`, `nlp.KEYWORD_EXTRACTION`
 - **Content Classification**: `customer_support.INTENT_ANALYSIS`, `customer_support.URGENCY_ANALYSIS`, `customer_support.INQUIRY_CLASSIFICATION`
 
 **Benefits of Pre-configured Tasks:**
@@ -345,7 +361,7 @@ spark.udf.register(
 )
 
 # --- Register Token Counting UDF ---
-spark.udf.register("count_tokens", count_tokens_udf("gpt-4o"))
+spark.udf.register("count_tokens", count_tokens_udf())
 
 # --- Register UDFs with Pre-configured Tasks ---
 from openaivec.task import nlp, customer_support
@@ -393,16 +409,23 @@ FROM product_reviews;
 
 Example Output (structure might vary slightly):
 
-| id | review_text | brand | translation | sentiment | sentiment_confidence | intent | action_required | embedding | token_count |
-| ---- | ----------------------------------------------------------------------------- | ---------- | --------------------------- | --------- | -------------------- | ---------------- | ------------------- | ---------------------- | ----------- |
-| 1001 | The new TechPhone X camera quality is amazing, Nexus Corp really outdid... | Nexus Corp | {en: ..., fr: ..., ja: ...} | positive | 0.95 | provide_feedback | acknowledge_review | [0.1, -0.2, ..., 0.5] | 19 |
-| 1002 | Quantum Galaxy has great battery life but the price is too high for what... | Quantum | {en: ..., fr: ..., ja: ...} | mixed | 0.78 | provide_feedback | follow_up_pricing | [-0.3, 0.1, ..., -0.1] | 16 |
-| 1003 | Zephyr mobile phone crashed twice today, very disappointed with this purchase | Zephyr | {en: ..., fr: ..., ja: ...} | negative | 0.88 | complaint | investigate_issue | [0.0, 0.4, ..., 0.2] | 12 |
+| id | review_text | brand | translation | sentiment | sentiment_confidence | intent | action_required | embedding | token_count |
+| ---- | ----------------------------------------------------------------------------- | ---------- | --------------------------- | --------- | -------------------- | ---------------- | ------------------ | ---------------------- | ----------- |
+| 1001 | The new TechPhone X camera quality is amazing, Nexus Corp really outdid... | Nexus Corp | {en: ..., fr: ..., ja: ...} | positive | 0.95 | provide_feedback | acknowledge_review | [0.1, -0.2, ..., 0.5] | 19 |
+| 1002 | Quantum Galaxy has great battery life but the price is too high for what... | Quantum | {en: ..., fr: ..., ja: ...} | mixed | 0.78 | provide_feedback | follow_up_pricing | [-0.3, 0.1, ..., -0.1] | 16 |
+| 1003 | Zephyr mobile phone crashed twice today, very disappointed with this purchase | Zephyr | {en: ..., fr: ..., ja: ...} | negative | 0.88 | complaint | investigate_issue | [0.0, 0.4, ..., 0.2] | 12 |
 
 ### Spark Performance Tuning
 
 When using openaivec with Spark, proper configuration of `batch_size` and `max_concurrency` is crucial for optimal performance:
 
+**Automatic Caching** (New):
+
+- **Duplicate Detection**: All AI-powered UDFs (`responses_udf`, `task_udf`, `embeddings_udf`) automatically cache duplicate inputs within each partition
+- **Cost Reduction**: Significantly reduces API calls and costs on datasets with repeated content
+- **Transparent**: Works automatically without code changes - your existing UDFs become more efficient
+- **Partition-Level**: Each partition maintains its own cache, optimal for distributed processing patterns
+
 **`batch_size`** (default: 128):
 
 - Controls how many rows are processed together in each API request within a partition
@@ -635,16 +658,16 @@ steps:
 
 ```python
 import os
-from pyspark.sql import SparkSession
 from openaivec.spark import responses_udf, embeddings_udf
 
-spark = SparkSession.builder.getOrCreate()
+# In Microsoft Fabric, spark session is automatically available
+# spark = SparkSession.builder.getOrCreate()
 sc = spark.sparkContext
 
 # Configure Azure OpenAI authentication
 sc.environment["AZURE_OPENAI_API_KEY"] = "<your-api-key>"
-sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.openai.azure.com"
-sc.environment["AZURE_OPENAI_API_VERSION"] = "2024-10-21"
+sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.services.ai.azure.com"
+sc.environment["AZURE_OPENAI_API_VERSION"] = "2025-04-01-preview"
 
 # Register UDFs
 spark.udf.register(
@@ -129,13 +129,14 @@ from openai import OpenAI
 from openaivec import BatchResponses
 
 # Initialize the batch client
-client = BatchResponses(
+client = BatchResponses.of(
    client=OpenAI(),
     model_name="gpt-4.1-mini",
-    system_message="Please answer only with 'xx family' and do not output anything else."
+    system_message="Please answer only with 'xx family' and do not output anything else.",
+    batch_size=32,
 )
 
-result = client.parse(["panda", "rabbit", "koala"], batch_size=32)
+result = client.parse(["panda", "rabbit", "koala"])
 print(result) # Expected output: ['bear family', 'rabbit family', 'koala family']
 ```
 
@@ -146,10 +147,25 @@ print(result) # Expected output: ['bear family', 'rabbit family', 'koala family
 The easiest way to get started with your DataFrames:
 
 ```python
+import os
 import pandas as pd
 from openaivec import pandas_ext
 
-# Setup (optional - uses OPENAI_API_KEY environment variable by default)
+# Authentication Option 1: Environment variables (automatic detection)
+# For OpenAI:
+os.environ["OPENAI_API_KEY"] = "your-api-key-here"
+# Or for Azure OpenAI:
+# os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-key"
+# os.environ["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.services.ai.azure.com"
+# os.environ["AZURE_OPENAI_API_VERSION"] = "2025-04-01-preview"
+
+# Authentication Option 2: Custom client (optional)
+# from openai import OpenAI, AsyncOpenAI
+# pandas_ext.use(OpenAI())
+# For async operations:
+# pandas_ext.use_async(AsyncOpenAI())
+
+# Configure model (optional - defaults to gpt-4.1-mini)
 pandas_ext.responses_model("gpt-4.1-mini")
 
 # Create your data
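Once authentication is configured as in the hunk above, the accessor operates directly on a Series. A minimal sketch of the next step, assuming the `.ai.responses` accessor this README demonstrates elsewhere; the column name and instruction text are illustrative:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401  (importing registers the .ai accessor)

df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})

# One vectorized, batched call; results stay aligned with the original index.
df["family"] = df["name"].ai.responses("Identify the biological family of this animal.")
print(df)
```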
@@ -206,7 +222,7 @@ extracted_results = (results
 
 **Available Task Categories:**
 
-- **Text Analysis**: `nlp.SENTIMENT_ANALYSIS`, `nlp.TRANSLATION`, `nlp.NAMED_ENTITY_RECOGNITION`, `nlp.KEYWORD_EXTRACTION`
+- **Text Analysis**: `nlp.SENTIMENT_ANALYSIS`, `nlp.MULTILINGUAL_TRANSLATION`, `nlp.NAMED_ENTITY_RECOGNITION`, `nlp.KEYWORD_EXTRACTION`
 - **Content Classification**: `customer_support.INTENT_ANALYSIS`, `customer_support.URGENCY_ANALYSIS`, `customer_support.INQUIRY_CLASSIFICATION`
 
 **Benefits of Pre-configured Tasks:**
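For reference, a hedged sketch of running one of the tasks listed above over a pandas Series; the `.ai.task` accessor name is assumed from the library's pandas integration and may differ:

```python
import pandas as pd
from openaivec import pandas_ext  # noqa: F401  (registers the .ai accessor)
from openaivec.task import nlp

reviews = pd.Series(["Great battery life!", "Screen cracked after a week."])

# SENTIMENT_ANALYSIS is one of the pre-configured tasks listed above.
results = reviews.ai.task(nlp.SENTIMENT_ANALYSIS)
```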
@@ -321,7 +337,7 @@ spark.udf.register(
 )
 
 # --- Register Token Counting UDF ---
-spark.udf.register("count_tokens", count_tokens_udf("gpt-4o"))
+spark.udf.register("count_tokens", count_tokens_udf())
 
 # --- Register UDFs with Pre-configured Tasks ---
 from openaivec.task import nlp, customer_support
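After registration, the now argument-free token counter is callable from Spark SQL. A sketch, reusing the illustrative `product_reviews` table and `review_text` column this README uses in its SQL examples:

```python
# Assumes the registrations above have already run on this Spark session.
token_counts = spark.sql(
    """
    SELECT review_text,
           count_tokens(review_text) AS token_count
    FROM product_reviews
    """
)
token_counts.show()
```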
@@ -369,16 +385,23 @@ FROM product_reviews;
 
 Example Output (structure might vary slightly):
 
-| id | review_text | brand | translation | sentiment | sentiment_confidence | intent | action_required | embedding | token_count |
-| ---- | ----------------------------------------------------------------------------- | ---------- | --------------------------- | --------- | -------------------- | ---------------- | ------------------- | ---------------------- | ----------- |
-| 1001 | The new TechPhone X camera quality is amazing, Nexus Corp really outdid... | Nexus Corp | {en: ..., fr: ..., ja: ...} | positive | 0.95 | provide_feedback | acknowledge_review | [0.1, -0.2, ..., 0.5] | 19 |
-| 1002 | Quantum Galaxy has great battery life but the price is too high for what... | Quantum | {en: ..., fr: ..., ja: ...} | mixed | 0.78 | provide_feedback | follow_up_pricing | [-0.3, 0.1, ..., -0.1] | 16 |
-| 1003 | Zephyr mobile phone crashed twice today, very disappointed with this purchase | Zephyr | {en: ..., fr: ..., ja: ...} | negative | 0.88 | complaint | investigate_issue | [0.0, 0.4, ..., 0.2] | 12 |
+| id | review_text | brand | translation | sentiment | sentiment_confidence | intent | action_required | embedding | token_count |
+| ---- | ----------------------------------------------------------------------------- | ---------- | --------------------------- | --------- | -------------------- | ---------------- | ------------------ | ---------------------- | ----------- |
+| 1001 | The new TechPhone X camera quality is amazing, Nexus Corp really outdid... | Nexus Corp | {en: ..., fr: ..., ja: ...} | positive | 0.95 | provide_feedback | acknowledge_review | [0.1, -0.2, ..., 0.5] | 19 |
+| 1002 | Quantum Galaxy has great battery life but the price is too high for what... | Quantum | {en: ..., fr: ..., ja: ...} | mixed | 0.78 | provide_feedback | follow_up_pricing | [-0.3, 0.1, ..., -0.1] | 16 |
+| 1003 | Zephyr mobile phone crashed twice today, very disappointed with this purchase | Zephyr | {en: ..., fr: ..., ja: ...} | negative | 0.88 | complaint | investigate_issue | [0.0, 0.4, ..., 0.2] | 12 |
 
 ### Spark Performance Tuning
 
 When using openaivec with Spark, proper configuration of `batch_size` and `max_concurrency` is crucial for optimal performance:
 
+**Automatic Caching** (New):
+
+- **Duplicate Detection**: All AI-powered UDFs (`responses_udf`, `task_udf`, `embeddings_udf`) automatically cache duplicate inputs within each partition
+- **Cost Reduction**: Significantly reduces API calls and costs on datasets with repeated content
+- **Transparent**: Works automatically without code changes - your existing UDFs become more efficient
+- **Partition-Level**: Each partition maintains its own cache, optimal for distributed processing patterns
+
 **`batch_size`** (default: 128):
 
 - Controls how many rows are processed together in each API request within a partition
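The automatic caching added in this hunk is implemented by the batching proxies introduced in `src/openaivec/proxy.py` (diffed below). A self-contained sketch of the cost effect on duplicate-heavy input, with a stub standing in for the API:

```python
from typing import List

from openaivec.proxy import BatchingMapProxy

calls: List[List[str]] = []

def fake_api(batch: List[str]) -> List[str]:
    calls.append(batch)  # record what actually reaches the "API"
    return [s.upper() for s in batch]

proxy = BatchingMapProxy[str, str](batch_size=128)

# Six rows but only three unique values: only three items hit fake_api.
rows = ["hot", "cold", "hot", "warm", "cold", "hot"]
assert proxy.map(rows, fake_api) == ["HOT", "COLD", "HOT", "WARM", "COLD", "HOT"]
assert sum(len(batch) for batch in calls) == 3  # unique inputs only
```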
@@ -611,16 +634,16 @@ steps:
 
 ```python
 import os
-from pyspark.sql import SparkSession
 from openaivec.spark import responses_udf, embeddings_udf
 
-spark = SparkSession.builder.getOrCreate()
+# In Microsoft Fabric, spark session is automatically available
+# spark = SparkSession.builder.getOrCreate()
 sc = spark.sparkContext
 
 # Configure Azure OpenAI authentication
 sc.environment["AZURE_OPENAI_API_KEY"] = "<your-api-key>"
-sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.openai.azure.com"
-sc.environment["AZURE_OPENAI_API_VERSION"] = "2024-10-21"
+sc.environment["AZURE_OPENAI_API_ENDPOINT"] = "https://<your-resource-name>.services.ai.azure.com"
+sc.environment["AZURE_OPENAI_API_VERSION"] = "2025-04-01-preview"
 
 # Register UDFs
 spark.udf.register(
@@ -0,0 +1,102 @@
+# proxy
+
+Batching proxies for order-preserving, cached batch mapping.
+
+This module provides two helpers:
+
+- BatchingMapProxy: thread-safe synchronous batching with caching and de-duplication.
+- AsyncBatchingMapProxy: asyncio-friendly batching with optional concurrency limits.
+
+Both proxies accept the mapping function as the second argument to map(). The function must:
+
+- Accept a list of inputs and return a list of outputs in the same order.
+- Be pure relative to a single call (side effects should be idempotent or safe).
+
+## Synchronous usage (BatchingMapProxy)
+
+```python
+from typing import List
+from openaivec.proxy import BatchingMapProxy
+
+# Define your batch mapping function
+def fetch_many(keys: List[int]) -> List[str]:
+    # Example: echo values as strings
+    return [f"val:{k}" for k in keys]
+
+# Create proxy with an optional batch size hint
+proxy = BatchingMapProxy[int, str](batch_size=3)
+
+# Map items using the proxy. Duplicates are de-duplicated and order preserved.
+items = [1, 2, 2, 3, 4, 4, 5]
+outputs = proxy.map(items, fetch_many)
+assert outputs == ["val:1", "val:2", "val:2", "val:3", "val:4", "val:4", "val:5"]
+
+# Cache is reused across calls
+outputs2 = proxy.map([5, 4, 3, 2, 1], fetch_many)
+assert outputs2 == ["val:5", "val:4", "val:3", "val:2", "val:1"]
+```
+
+### Notes
+
+- If `batch_size` is None or <= 0, all unique items are processed in a single call.
+- Under concurrency, the proxy prevents duplicate work by coordinating in-flight keys.
+
+## Asynchronous usage (AsyncBatchingMapProxy)
+
+```python
+import asyncio
+from typing import List
+from openaivec.proxy import AsyncBatchingMapProxy
+
+# Define your async batch mapping function
+async def fetch_many_async(keys: List[int]) -> List[str]:
+    # Simulate I/O
+    await asyncio.sleep(0.01)
+    return [f"val:{k}" for k in keys]
+
+# Create proxy with batch size and an optional concurrency cap for map_func calls
+proxy = AsyncBatchingMapProxy[int, str](batch_size=3, max_concurrency=2)
+
+async def main():
+    items = [1, 2, 3, 4, 5]
+    out = await proxy.map(items, fetch_many_async)
+    assert out == ["val:1", "val:2", "val:3", "val:4", "val:5"]
+
+    # Overlapping requests deduplicate work and share results via the cache
+    r1 = proxy.map([1, 2, 3, 4], fetch_many_async)
+    r2 = proxy.map([3, 4, 5], fetch_many_async)
+    o1, o2 = await asyncio.gather(r1, r2)
+    assert o1 == ["val:1", "val:2", "val:3", "val:4"]
+    assert o2 == ["val:3", "val:4", "val:5"]
+
+asyncio.run(main())
+```
+
+### Notes
+
+- `max_concurrency` limits concurrent invocations of `map_func` across overlapping `map()` calls.
+- The proxy rechecks the cache immediately before each batch call to avoid redundant work.
+
+## API summary
+
+```python
+class BatchingMapProxy[S: Hashable, T]:
+    batch_size: int | None
+
+    def map(self, items: list[S], map_func: Callable[[list[S]], list[T]]) -> list[T]:
+        ...
+
+class AsyncBatchingMapProxy[S: Hashable, T]:
+    batch_size: int | None
+    max_concurrency: int
+
+    async def map(self, items: list[S], map_func: Callable[[list[S]], Awaitable[list[T]]]) -> list[T]:
+        ...
+```
+
+Implementation details:
+
+- Inputs are de-duplicated with first-occurrence order preserved.
+- Cache is filled atomically and shared across calls.
+- In-flight keys are coordinated (threading.Event / asyncio.Event) to prevent duplicated computation.
+- Errors from `map_func` propagate; in-flight keys are released to avoid deadlocks.
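For intuition, a minimal sketch of the de-duplication and order-restoration step those implementation notes describe, with the caching and Event coordination omitted:

```python
from typing import Callable, Hashable, List, TypeVar

S = TypeVar("S", bound=Hashable)
T = TypeVar("T")

def map_dedup(items: List[S], map_func: Callable[[List[S]], List[T]]) -> List[T]:
    # Unique keys, first-occurrence order preserved.
    unique: List[S] = list(dict.fromkeys(items))
    # One ordered call on the unique keys; outputs align positionally.
    by_key = dict(zip(unique, map_func(unique)))
    # Re-expand to the original order, duplicates included.
    return [by_key[item] for item in items]

assert map_dedup([1, 2, 2, 3], lambda ks: [k * 10 for k in ks]) == [10, 20, 20, 30]
```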
@@ -0,0 +1,188 @@
+from dataclasses import dataclass, field
+from logging import Logger, getLogger
+from typing import List
+
+import numpy as np
+from numpy.typing import NDArray
+from openai import AsyncOpenAI, OpenAI, RateLimitError
+
+from .log import observe
+from .proxy import AsyncBatchingMapProxy, BatchingMapProxy
+from .util import backoff, backoff_async
+
+__all__ = [
+    "BatchEmbeddings",
+    "AsyncBatchEmbeddings",
+]
+
+_LOGGER: Logger = getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class BatchEmbeddings:
+    """Thin wrapper around the OpenAI embeddings endpoint (synchronous).
+
+    Attributes:
+        client (OpenAI): Configured OpenAI client.
+        model_name (str): Model identifier (e.g., ``"text-embedding-3-small"``).
+        cache (BatchingMapProxy[str, NDArray[np.float32]]): Batching proxy for ordered, cached mapping.
+    """
+
+    client: OpenAI
+    model_name: str
+    cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=128))
+
+    @classmethod
+    def of(cls, client: OpenAI, model_name: str, batch_size: int = 128) -> "BatchEmbeddings":
+        """Factory constructor.
+
+        Args:
+            client (OpenAI): OpenAI client.
+            model_name (str): Embeddings model name.
+            batch_size (int, optional): Max unique inputs per API call. Defaults to 128.
+
+        Returns:
+            BatchEmbeddings: Configured instance backed by a batching proxy.
+        """
+        return cls(client=client, model_name=model_name, cache=BatchingMapProxy(batch_size=batch_size))
+
+    @observe(_LOGGER)
+    @backoff(exception=RateLimitError, scale=15, max_retries=8)
+    def _embed_chunk(self, inputs: List[str]) -> List[NDArray[np.float32]]:
+        """Embed one minibatch of strings.
+
+        This private helper is the unit of work used by the map/parallel
+        utilities. Exponential back-off is applied automatically when
+        ``openai.RateLimitError`` is raised.
+
+        Args:
+            inputs (List[str]): Input strings to be embedded. Duplicates allowed.
+
+        Returns:
+            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+        """
+        responses = self.client.embeddings.create(input=inputs, model=self.model_name)
+        return [np.array(d.embedding, dtype=np.float32) for d in responses.data]
+
+    @observe(_LOGGER)
+    def create(self, inputs: List[str]) -> List[NDArray[np.float32]]:
+        """Generate embeddings for inputs using cached, ordered batching.
+
+        Args:
+            inputs (List[str]): Input strings. Duplicates allowed.
+
+        Returns:
+            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+        """
+        return self.cache.map(inputs, self._embed_chunk)
+
+
+@dataclass(frozen=True)
+class AsyncBatchEmbeddings:
+    """Thin wrapper around the OpenAI embeddings endpoint (asynchronous).
+
+    This class provides an asynchronous interface for generating embeddings using
+    OpenAI models. It manages concurrency, handles rate limits automatically,
+    and efficiently processes batches of inputs, including de-duplication.
+
+    Example:
+        ```python
+        import asyncio
+        import numpy as np
+        from openai import AsyncOpenAI
+        from openaivec import AsyncBatchEmbeddings
+
+        # Assuming openai_async_client is an initialized AsyncOpenAI client
+        openai_async_client = AsyncOpenAI()  # Replace with your actual client initialization
+
+        embedder = AsyncBatchEmbeddings.of(
+            client=openai_async_client,
+            model_name="text-embedding-3-small",
+            batch_size=128,
+            max_concurrency=8,
+        )
+        texts = ["This is the first document.", "This is the second document.", "This is the first document."]
+
+        # Asynchronous call
+        async def main():
+            embeddings = await embedder.create(texts)
+            # embeddings will be a list of numpy arrays (float32)
+            # The embedding for the third text will be identical to the first
+            # due to automatic de-duplication.
+            print(f"Generated {len(embeddings)} embeddings.")
+            print(f"Shape of first embedding: {embeddings[0].shape}")
+            assert np.array_equal(embeddings[0], embeddings[2])
+
+        # Run the async function
+        asyncio.run(main())
+        ```
+
+    Attributes:
+        client (AsyncOpenAI): Configured OpenAI async client.
+        model_name (str): Embeddings model name.
+        cache (AsyncBatchingMapProxy[str, NDArray[np.float32]]): Async batching proxy.
+    """
+
+    client: AsyncOpenAI
+    model_name: str
+    cache: AsyncBatchingMapProxy[str, NDArray[np.float32]] = field(
+        default_factory=lambda: AsyncBatchingMapProxy(batch_size=128, max_concurrency=8)
+    )
+
+    @classmethod
+    def of(
+        cls,
+        client: AsyncOpenAI,
+        model_name: str,
+        batch_size: int = 128,
+        max_concurrency: int = 8,
+    ) -> "AsyncBatchEmbeddings":
+        """Factory constructor.
+
+        Args:
+            client (AsyncOpenAI): OpenAI async client.
+            model_name (str): Embeddings model name.
+            batch_size (int, optional): Max unique inputs per API call. Defaults to 128.
+            max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.
+
+        Returns:
+            AsyncBatchEmbeddings: Configured instance with an async batching proxy.
+        """
+        return cls(
+            client=client,
+            model_name=model_name,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
+        )
+
+    @observe(_LOGGER)
+    @backoff_async(exception=RateLimitError, scale=15, max_retries=8)
+    async def _embed_chunk(self, inputs: List[str]) -> List[NDArray[np.float32]]:
+        """Embed one minibatch of strings asynchronously.
+
+        This private helper handles the actual API call for a batch of inputs.
+        Exponential back-off is applied automatically when ``openai.RateLimitError``
+        is raised.
+
+        Args:
+            inputs (List[str]): Input strings to be embedded. Duplicates allowed.
+
+        Returns:
+            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+
+        Raises:
+            RateLimitError: Propagated if retries are exhausted.
+        """
+        responses = await self.client.embeddings.create(input=inputs, model=self.model_name)
+        return [np.array(d.embedding, dtype=np.float32) for d in responses.data]
+
+    @observe(_LOGGER)
+    async def create(self, inputs: List[str]) -> List[NDArray[np.float32]]:
+        """Generate embeddings for inputs using proxy batching (async).
+
+        Args:
+            inputs (List[str]): Input strings. Duplicates allowed.
+
+        Returns:
+            List[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+        """
+        return await self.cache.map(inputs, self._embed_chunk)
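The synchronous class carries no docstring example. A usage sketch mirroring the async example above, based only on the `of()` and `create()` signatures in this file:

```python
import numpy as np
from openai import OpenAI
from openaivec.embeddings import BatchEmbeddings

embedder = BatchEmbeddings.of(
    client=OpenAI(),
    model_name="text-embedding-3-small",
    batch_size=128,
)

texts = ["first document", "second document", "first document"]
vectors = embedder.create(texts)  # duplicate inputs are served from the proxy cache

assert len(vectors) == 3
assert np.array_equal(vectors[0], vectors[2])  # identical inputs, identical vectors
```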
@@ -65,3 +65,23 @@ class ResponsesModelName:
 @dataclass(frozen=True)
 class EmbeddingsModelName:
     value: str
+
+
+@dataclass(frozen=True)
+class OpenAIAPIKey:
+    value: str
+
+
+@dataclass(frozen=True)
+class AzureOpenAIAPIKey:
+    value: str
+
+
+@dataclass(frozen=True)
+class AzureOpenAIEndpoint:
+    value: str
+
+
+@dataclass(frozen=True)
+class AzureOpenAIAPIVersion:
+    value: str
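These frozen single-value wrappers plausibly give each configuration string its own type so the DI container (`openaivec.di`) can resolve them unambiguously; the wiring lives in `src/openaivec/provider.py`, which is not shown in detail here. A hedged sketch of constructing them from the environment:

```python
import os

from openaivec.model import (
    AzureOpenAIAPIKey,
    AzureOpenAIAPIVersion,
    AzureOpenAIEndpoint,
    OpenAIAPIKey,
)

# Distinct types for values that would otherwise all be plain strings.
openai_key = OpenAIAPIKey(value=os.environ["OPENAI_API_KEY"])
azure_key = AzureOpenAIAPIKey(value=os.environ["AZURE_OPENAI_API_KEY"])
azure_endpoint = AzureOpenAIEndpoint(value=os.environ["AZURE_OPENAI_API_ENDPOINT"])
azure_version = AzureOpenAIAPIVersion(value=os.environ["AZURE_OPENAI_API_VERSION"])
```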