openaivec 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openaivec/_cache/proxy.py +52 -15
- openaivec/spark.py +74 -0
- openaivec-1.0.2.dist-info/METADATA +377 -0
- {openaivec-1.0.0.dist-info → openaivec-1.0.2.dist-info}/RECORD +6 -6
- openaivec-1.0.0.dist-info/METADATA +0 -807
- {openaivec-1.0.0.dist-info → openaivec-1.0.2.dist-info}/WHEEL +0 -0
- {openaivec-1.0.0.dist-info → openaivec-1.0.2.dist-info}/licenses/LICENSE +0 -0
openaivec/_cache/proxy.py
CHANGED
|
@@ -186,11 +186,15 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
186
186
|
performance (targeting 30-60 seconds per batch).
|
|
187
187
|
|
|
188
188
|
Example:
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
189
|
+
```python
|
|
190
|
+
p = BatchingMapProxy[int, str](batch_size=3)
|
|
191
|
+
|
|
192
|
+
def f(xs: list[int]) -> list[str]:
|
|
193
|
+
return [f"v:{x}" for x in xs]
|
|
194
|
+
|
|
195
|
+
p.map([1, 2, 2, 3, 4], f)
|
|
196
|
+
# ['v:1', 'v:2', 'v:2', 'v:3', 'v:4']
|
|
197
|
+
```
|
|
194
198
|
"""
|
|
195
199
|
|
|
196
200
|
# Number of items to process per call to map_func.
|
|
@@ -449,6 +453,21 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
449
453
|
|
|
450
454
|
Raises:
|
|
451
455
|
Exception: Propagates any exception raised by ``map_func``.
|
|
456
|
+
|
|
457
|
+
Example:
|
|
458
|
+
```python
|
|
459
|
+
proxy: BatchingMapProxy[int, str] = BatchingMapProxy(batch_size=2)
|
|
460
|
+
calls: list[list[int]] = []
|
|
461
|
+
|
|
462
|
+
def mapper(chunk: list[int]) -> list[str]:
|
|
463
|
+
calls.append(chunk)
|
|
464
|
+
return [f"v:{x}" for x in chunk]
|
|
465
|
+
|
|
466
|
+
proxy.map([1, 2, 2, 3], mapper)
|
|
467
|
+
# ['v:1', 'v:2', 'v:2', 'v:3']
|
|
468
|
+
calls # duplicate ``2`` is only computed once
|
|
469
|
+
# [[1, 2], [3]]
|
|
470
|
+
```
|
|
452
471
|
"""
|
|
453
472
|
if self.__all_cached(items):
|
|
454
473
|
return self.__values(items)
|
|
@@ -490,16 +509,21 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
490
509
|
performance (targeting 30-60 seconds per batch).
|
|
491
510
|
|
|
492
511
|
Example:
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
512
|
+
```python
|
|
513
|
+
import asyncio
|
|
514
|
+
|
|
515
|
+
p = AsyncBatchingMapProxy[int, str](batch_size=2)
|
|
516
|
+
|
|
517
|
+
async def af(xs: list[int]) -> list[str]:
|
|
518
|
+
await asyncio.sleep(0)
|
|
519
|
+
return [f"v:{x}" for x in xs]
|
|
520
|
+
|
|
521
|
+
async def run():
|
|
522
|
+
return await p.map([1, 2, 3], af)
|
|
523
|
+
|
|
524
|
+
asyncio.run(run())
|
|
525
|
+
# ['v:1', 'v:2', 'v:3']
|
|
526
|
+
```
|
|
503
527
|
"""
|
|
504
528
|
|
|
505
529
|
# Number of items to process per call to map_func.
|
|
@@ -747,6 +771,19 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
747
771
|
|
|
748
772
|
Returns:
|
|
749
773
|
list[T]: Mapped values corresponding to ``items`` in the same order.
|
|
774
|
+
|
|
775
|
+
Example:
|
|
776
|
+
```python
|
|
777
|
+
import asyncio
|
|
778
|
+
|
|
779
|
+
async def mapper(chunk: list[int]) -> list[str]:
|
|
780
|
+
await asyncio.sleep(0)
|
|
781
|
+
return [f"v:{x}" for x in chunk]
|
|
782
|
+
|
|
783
|
+
proxy: AsyncBatchingMapProxy[int, str] = AsyncBatchingMapProxy(batch_size=2)
|
|
784
|
+
asyncio.run(proxy.map([1, 1, 2], mapper))
|
|
785
|
+
# ['v:1', 'v:1', 'v:2']
|
|
786
|
+
```
|
|
750
787
|
"""
|
|
751
788
|
if await self.__all_cached(items):
|
|
752
789
|
return await self.__values(items)
|
openaivec/spark.py
CHANGED
|
@@ -181,6 +181,20 @@ def setup(
|
|
|
181
181
|
If provided, registers `ResponsesModelName` in the DI container.
|
|
182
182
|
embeddings_model_name (str | None): Default model name for embeddings.
|
|
183
183
|
If provided, registers `EmbeddingsModelName` in the DI container.
|
|
184
|
+
|
|
185
|
+
Example:
|
|
186
|
+
```python
|
|
187
|
+
from pyspark.sql import SparkSession
|
|
188
|
+
from openaivec.spark import setup
|
|
189
|
+
|
|
190
|
+
spark = SparkSession.builder.getOrCreate()
|
|
191
|
+
setup(
|
|
192
|
+
spark,
|
|
193
|
+
api_key="sk-***",
|
|
194
|
+
responses_model_name="gpt-4.1-mini",
|
|
195
|
+
embeddings_model_name="text-embedding-3-small",
|
|
196
|
+
)
|
|
197
|
+
```
|
|
184
198
|
"""
|
|
185
199
|
|
|
186
200
|
CONTAINER.register(SparkSession, lambda: spark)
|
|
@@ -221,6 +235,22 @@ def setup_azure(
|
|
|
221
235
|
If provided, registers `ResponsesModelName` in the DI container.
|
|
222
236
|
embeddings_model_name (str | None): Default model name for embeddings.
|
|
223
237
|
If provided, registers `EmbeddingsModelName` in the DI container.
|
|
238
|
+
|
|
239
|
+
Example:
|
|
240
|
+
```python
|
|
241
|
+
from pyspark.sql import SparkSession
|
|
242
|
+
from openaivec.spark import setup_azure
|
|
243
|
+
|
|
244
|
+
spark = SparkSession.builder.getOrCreate()
|
|
245
|
+
setup_azure(
|
|
246
|
+
spark,
|
|
247
|
+
api_key="azure-key",
|
|
248
|
+
base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
249
|
+
api_version="preview",
|
|
250
|
+
responses_model_name="gpt4-deployment",
|
|
251
|
+
embeddings_model_name="embedding-deployment",
|
|
252
|
+
)
|
|
253
|
+
```
|
|
224
254
|
"""
|
|
225
255
|
|
|
226
256
|
CONTAINER.register(SparkSession, lambda: spark)
|
|
@@ -375,6 +405,19 @@ def responses_udf(
|
|
|
375
405
|
Raises:
|
|
376
406
|
ValueError: If `response_format` is not `str` or a Pydantic `BaseModel`.
|
|
377
407
|
|
|
408
|
+
Example:
|
|
409
|
+
```python
|
|
410
|
+
from pyspark.sql import SparkSession
|
|
411
|
+
from openaivec.spark import responses_udf, setup
|
|
412
|
+
|
|
413
|
+
spark = SparkSession.builder.getOrCreate()
|
|
414
|
+
setup(spark, api_key="sk-***", responses_model_name="gpt-4.1-mini")
|
|
415
|
+
udf = responses_udf("Reply with one word.")
|
|
416
|
+
spark.udf.register("short_answer", udf)
|
|
417
|
+
df = spark.createDataFrame([("hello",), ("bye",)], ["text"])
|
|
418
|
+
df.selectExpr("short_answer(text) as reply").show()
|
|
419
|
+
```
|
|
420
|
+
|
|
378
421
|
Note:
|
|
379
422
|
For optimal performance in distributed environments:
|
|
380
423
|
- **Automatic Caching**: Duplicate inputs within each partition are cached,
|
|
@@ -533,6 +576,20 @@ def infer_schema(
|
|
|
533
576
|
|
|
534
577
|
Returns:
|
|
535
578
|
InferredSchema: An object containing the inferred schema and response format.
|
|
579
|
+
|
|
580
|
+
Example:
|
|
581
|
+
```python
|
|
582
|
+
from pyspark.sql import SparkSession
|
|
583
|
+
|
|
584
|
+
spark = SparkSession.builder.getOrCreate()
|
|
585
|
+
spark.createDataFrame([("great product",), ("bad service",)], ["text"]).createOrReplaceTempView("examples")
|
|
586
|
+
infer_schema(
|
|
587
|
+
instructions="Classify sentiment as positive or negative.",
|
|
588
|
+
example_table_name="examples",
|
|
589
|
+
example_field_name="text",
|
|
590
|
+
max_examples=2,
|
|
591
|
+
)
|
|
592
|
+
```
|
|
536
593
|
"""
|
|
537
594
|
|
|
538
595
|
spark = CONTAINER.resolve(SparkSession)
|
|
@@ -595,6 +652,23 @@ def parse_udf(
|
|
|
595
652
|
forwarded verbatim to the underlying API calls. These parameters are applied to
|
|
596
653
|
all API requests made by the UDF and override any parameters set in the
|
|
597
654
|
response_format or example data.
|
|
655
|
+
Example:
|
|
656
|
+
```python
|
|
657
|
+
from pyspark.sql import SparkSession
|
|
658
|
+
|
|
659
|
+
spark = SparkSession.builder.getOrCreate()
|
|
660
|
+
spark.createDataFrame(
|
|
661
|
+
[("Order #123 delivered",), ("Order #456 delayed",)],
|
|
662
|
+
["body"],
|
|
663
|
+
).createOrReplaceTempView("messages")
|
|
664
|
+
udf = parse_udf(
|
|
665
|
+
instructions="Extract order id as `order_id` and status as `status`.",
|
|
666
|
+
example_table_name="messages",
|
|
667
|
+
example_field_name="body",
|
|
668
|
+
)
|
|
669
|
+
spark.udf.register("parse_ticket", udf)
|
|
670
|
+
spark.sql("SELECT parse_ticket(body) AS parsed FROM messages").show()
|
|
671
|
+
```
|
|
598
672
|
Returns:
|
|
599
673
|
UserDefinedFunction: A Spark pandas UDF configured to parse responses asynchronously.
|
|
600
674
|
Output schema is `StringType` for str response format or a struct derived from
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: openaivec
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Summary: Generative mutation for tabular calculation
|
|
5
|
+
Project-URL: Homepage, https://microsoft.github.io/openaivec/
|
|
6
|
+
Project-URL: Repository, https://github.com/microsoft/openaivec
|
|
7
|
+
Author-email: Hiroki Mizukami <hmizukami@microsoft.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: llm,openai,openai-api,openai-python,pandas,pyspark
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: ipywidgets>=8.1.7
|
|
19
|
+
Requires-Dist: openai>=1.74.0
|
|
20
|
+
Requires-Dist: pandas>=2.2.3
|
|
21
|
+
Requires-Dist: tiktoken>=0.9.0
|
|
22
|
+
Requires-Dist: tqdm>=4.67.1
|
|
23
|
+
Provides-Extra: spark
|
|
24
|
+
Requires-Dist: pyspark>=3.5.5; extra == 'spark'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# openaivec
|
|
28
|
+
|
|
29
|
+
Transform pandas and Spark workflows with AI-powered text processing—batching, caching, and guardrails included.
|
|
30
|
+
|
|
31
|
+
[Contributor guidelines](AGENTS.md)
|
|
32
|
+
|
|
33
|
+
## Quick start
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install openaivec
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
import os
|
|
41
|
+
import pandas as pd
|
|
42
|
+
from openaivec import pandas_ext
|
|
43
|
+
|
|
44
|
+
# Auth: choose OpenAI or Azure OpenAI
|
|
45
|
+
os.environ["OPENAI_API_KEY"] = "your-api-key"
|
|
46
|
+
# Azure alternative:
|
|
47
|
+
# os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-key"
|
|
48
|
+
# os.environ["AZURE_OPENAI_BASE_URL"] = "https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
|
|
49
|
+
# os.environ["AZURE_OPENAI_API_VERSION"] = "preview"
|
|
50
|
+
|
|
51
|
+
pandas_ext.set_responses_model("gpt-5.1") # Optional override (use deployment name for Azure)
|
|
52
|
+
|
|
53
|
+
reviews = pd.Series([
|
|
54
|
+
"Great coffee and friendly staff.",
|
|
55
|
+
"Delivery was late and the package was damaged.",
|
|
56
|
+
])
|
|
57
|
+
|
|
58
|
+
sentiment = reviews.ai.responses(
|
|
59
|
+
"Summarize sentiment in one short sentence.",
|
|
60
|
+
reasoning={"effort": "none"}, # Mirrors OpenAI SDK for reasoning models
|
|
61
|
+
)
|
|
62
|
+
print(sentiment.tolist())
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Try it live:** https://microsoft.github.io/openaivec/examples/pandas/
|
|
66
|
+
|
|
67
|
+
## Contents
|
|
68
|
+
|
|
69
|
+
- [Why openaivec?](#why-openaivec)
|
|
70
|
+
- [Core Workflows](#core-workflows)
|
|
71
|
+
- [Using with Apache Spark UDFs](#using-with-apache-spark-udfs)
|
|
72
|
+
- [Building Prompts](#building-prompts)
|
|
73
|
+
- [Using with Microsoft Fabric](#using-with-microsoft-fabric)
|
|
74
|
+
- [Contributing](#contributing)
|
|
75
|
+
- [Additional Resources](#additional-resources)
|
|
76
|
+
- [Community](#community)
|
|
77
|
+
|
|
78
|
+
## Why openaivec?
|
|
79
|
+
|
|
80
|
+
- Drop-in `.ai` and `.aio` accessors keep pandas analysts in familiar tooling.
|
|
81
|
+
- Smart batching (`BatchingMapProxy`/`AsyncBatchingMapProxy`) dedupes prompts, preserves order, and releases waiters on failure.
|
|
82
|
+
- Reasoning support mirrors the OpenAI SDK; structured outputs accept Pydantic `response_format`.
|
|
83
|
+
- Built-in caches and retries remove boilerplate; helpers reuse caches across pandas, Spark, and async flows.
|
|
84
|
+
- Spark UDFs and Microsoft Fabric guides move notebooks into production-scale ETL.
|
|
85
|
+
- Prompt tooling (`FewShotPromptBuilder`, `improve`) and the task library ship curated prompts with validated outputs.
|
|
86
|
+
|
|
87
|
+
## Overview
|
|
88
|
+
|
|
89
|
+
Vectorized OpenAI access so you process many inputs per call instead of one-by-one. Batching proxies dedupe inputs, enforce ordered outputs, and unblock waiters even on upstream errors. Cache helpers (`responses_with_cache`, Spark UDF builders) plug into the same layer so expensive prompts are reused across pandas, Spark, and async flows. Reasoning models honor SDK semantics. Requires Python 3.10+.
|
|
90
|
+
|
|
91
|
+
## Core Workflows
|
|
92
|
+
|
|
93
|
+
### Direct API usage
|
|
94
|
+
|
|
95
|
+
For maximum control over batch processing:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import os
|
|
99
|
+
from openai import OpenAI
|
|
100
|
+
from openaivec import BatchResponses
|
|
101
|
+
|
|
102
|
+
# Initialize the batch client
|
|
103
|
+
client = BatchResponses.of(
|
|
104
|
+
client=OpenAI(),
|
|
105
|
+
model_name="gpt-5.1",
|
|
106
|
+
system_message="Please answer only with 'xx family' and do not output anything else.",
|
|
107
|
+
# batch_size defaults to None (automatic optimization)
|
|
108
|
+
)
|
|
109
|
+
|
|
110
|
+
result = client.parse(
|
|
111
|
+
["panda", "rabbit", "koala"],
|
|
112
|
+
reasoning={"effort": "none"},
|
|
113
|
+
)
|
|
114
|
+
print(result) # Expected output: ['bear family', 'rabbit family', 'koala family']
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
📓 **[Complete tutorial →](https://microsoft.github.io/openaivec/examples/pandas/)**
|
|
118
|
+
|
|
119
|
+
### pandas integration (recommended)
|
|
120
|
+
|
|
121
|
+
The easiest way to get started with your DataFrames:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
import os
|
|
125
|
+
import pandas as pd
|
|
126
|
+
from openaivec import pandas_ext
|
|
127
|
+
|
|
128
|
+
# Authentication Option 1: Environment variables (automatic detection)
|
|
129
|
+
os.environ["OPENAI_API_KEY"] = "your-api-key-here"
|
|
130
|
+
# Or for Azure OpenAI:
|
|
131
|
+
# os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-key"
|
|
132
|
+
# os.environ["AZURE_OPENAI_BASE_URL"] = "https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/"
|
|
133
|
+
# os.environ["AZURE_OPENAI_API_VERSION"] = "preview"
|
|
134
|
+
|
|
135
|
+
# Authentication Option 2: Custom client (optional)
|
|
136
|
+
# from openai import OpenAI, AsyncOpenAI
|
|
137
|
+
# pandas_ext.set_client(OpenAI())
|
|
138
|
+
# pandas_ext.set_async_client(AsyncOpenAI())
|
|
139
|
+
|
|
140
|
+
# Configure model (optional - defaults to gpt-5.1; use deployment name for Azure)
|
|
141
|
+
pandas_ext.set_responses_model("gpt-5.1")
|
|
142
|
+
|
|
143
|
+
# Create your data
|
|
144
|
+
df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
|
|
145
|
+
|
|
146
|
+
# Add AI-powered columns
|
|
147
|
+
result = df.assign(
|
|
148
|
+
family=lambda df: df.name.ai.responses(
|
|
149
|
+
"What animal family? Answer with 'X family'",
|
|
150
|
+
reasoning={"effort": "none"},
|
|
151
|
+
),
|
|
152
|
+
habitat=lambda df: df.name.ai.responses(
|
|
153
|
+
"Primary habitat in one word",
|
|
154
|
+
reasoning={"effort": "none"},
|
|
155
|
+
),
|
|
156
|
+
fun_fact=lambda df: df.name.ai.responses(
|
|
157
|
+
"One interesting fact in 10 words or less",
|
|
158
|
+
reasoning={"effort": "none"},
|
|
159
|
+
),
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
| name | family | habitat | fun_fact |
|
|
164
|
+
| ------ | ---------------- | ------- | -------------------------- |
|
|
165
|
+
| panda | bear family | forest | Eats bamboo 14 hours daily |
|
|
166
|
+
| rabbit | rabbit family | meadow | Can see nearly 360 degrees |
|
|
167
|
+
| koala | marsupial family | tree | Sleeps 22 hours per day |
|
|
168
|
+
|
|
169
|
+
📓 **[Interactive pandas examples →](https://microsoft.github.io/openaivec/examples/pandas/)**
|
|
170
|
+
|
|
171
|
+
### Using with reasoning models
|
|
172
|
+
|
|
173
|
+
Reasoning models (o1-preview, o1-mini, o3-mini, etc.) work without special flags. `reasoning` mirrors the OpenAI SDK.
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
|
|
177
|
+
|
|
178
|
+
result = df.assign(
|
|
179
|
+
analysis=lambda df: df.text.ai.responses(
|
|
180
|
+
"Analyze this text step by step",
|
|
181
|
+
reasoning={"effort": "none"} # Optional: mirrors the OpenAI SDK argument
|
|
182
|
+
)
|
|
183
|
+
)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
You can omit `reasoning` to use the model defaults, or tune it per request using the same shape as the OpenAI SDK (a `dict` with an `effort` key).
|
|
187
|
+
|
|
188
|
+
### Using pre-configured tasks
|
|
189
|
+
|
|
190
|
+
For common text processing operations, openaivec provides ready-to-use tasks that eliminate the need to write custom prompts:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from openaivec.task import nlp, customer_support
|
|
194
|
+
|
|
195
|
+
text_df = pd.DataFrame({
|
|
196
|
+
"text": [
|
|
197
|
+
"Great product, fast delivery!",
|
|
198
|
+
"Need help with billing issue",
|
|
199
|
+
"How do I reset my password?"
|
|
200
|
+
]
|
|
201
|
+
})
|
|
202
|
+
|
|
203
|
+
results = text_df.assign(
|
|
204
|
+
sentiment=lambda df: df.text.ai.task(nlp.SENTIMENT_ANALYSIS),
|
|
205
|
+
intent=lambda df: df.text.ai.task(customer_support.INTENT_ANALYSIS),
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
# Extract structured results into separate columns
|
|
209
|
+
extracted_results = results.ai.extract("sentiment")
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
**Task categories:** Text analysis (`nlp.SENTIMENT_ANALYSIS`, `nlp.MULTILINGUAL_TRANSLATION`, `nlp.NAMED_ENTITY_RECOGNITION`, `nlp.KEYWORD_EXTRACTION`); Content classification (`customer_support.INTENT_ANALYSIS`, `customer_support.URGENCY_ANALYSIS`, `customer_support.INQUIRY_CLASSIFICATION`).
|
|
213
|
+
|
|
214
|
+
### Asynchronous processing with `.aio`
|
|
215
|
+
|
|
216
|
+
High-throughput workloads use the `.aio` accessor for async versions of all operations:
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
import asyncio
|
|
220
|
+
import pandas as pd
|
|
221
|
+
from openaivec import pandas_ext
|
|
222
|
+
|
|
223
|
+
pandas_ext.set_responses_model("gpt-5.1")
|
|
224
|
+
|
|
225
|
+
df = pd.DataFrame({"text": [
|
|
226
|
+
"This product is amazing!",
|
|
227
|
+
"Terrible customer service",
|
|
228
|
+
"Good value for money",
|
|
229
|
+
"Not what I expected"
|
|
230
|
+
] * 250}) # 1000 rows for demonstration
|
|
231
|
+
|
|
232
|
+
async def process_data():
|
|
233
|
+
return await df["text"].aio.responses(
|
|
234
|
+
"Analyze sentiment and classify as positive/negative/neutral",
|
|
235
|
+
reasoning={"effort": "none"}, # Recommended with gpt-5.1
|
|
236
|
+
max_concurrency=12 # Allow up to 12 concurrent requests
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
sentiments = asyncio.run(process_data())
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
**Performance benefits:** Parallel processing with automatic batching/deduplication, built-in rate limiting and error handling, and memory-efficient streaming for large datasets.
|
|
243
|
+
|
|
244
|
+
## Using with Apache Spark UDFs
|
|
245
|
+
|
|
246
|
+
Scale to enterprise datasets with distributed processing.
|
|
247
|
+
|
|
248
|
+
📓 **[Spark tutorial →](https://microsoft.github.io/openaivec/examples/spark/)**
|
|
249
|
+
|
|
250
|
+
First, obtain a Spark session and configure authentication:
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
from pyspark.sql import SparkSession
|
|
254
|
+
from openaivec.spark import setup, setup_azure
|
|
255
|
+
|
|
256
|
+
spark = SparkSession.builder.getOrCreate()
|
|
257
|
+
|
|
258
|
+
# Option 1: Using OpenAI
|
|
259
|
+
setup(
|
|
260
|
+
spark,
|
|
261
|
+
api_key="your-openai-api-key",
|
|
262
|
+
responses_model_name="gpt-5.1", # Optional: set default model
|
|
263
|
+
embeddings_model_name="text-embedding-3-small" # Optional: set default model
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
# Option 2: Using Azure OpenAI
|
|
267
|
+
# setup_azure(
|
|
268
|
+
# spark,
|
|
269
|
+
# api_key="your-azure-openai-api-key",
|
|
270
|
+
# base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
271
|
+
# api_version="preview",
|
|
272
|
+
# responses_model_name="my-gpt4-deployment", # Optional: set default deployment
|
|
273
|
+
# embeddings_model_name="my-embedding-deployment" # Optional: set default deployment
|
|
274
|
+
# )
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Create and register UDFs using the provided helpers:
|
|
278
|
+
|
|
279
|
+
```python
|
|
280
|
+
from openaivec.spark import responses_udf, task_udf, embeddings_udf, count_tokens_udf, similarity_udf, parse_udf
|
|
281
|
+
from pydantic import BaseModel
|
|
282
|
+
|
|
283
|
+
spark.udf.register(
|
|
284
|
+
"extract_brand",
|
|
285
|
+
responses_udf(
|
|
286
|
+
instructions="Extract the brand name from the product. Return only the brand name.",
|
|
287
|
+
reasoning={"effort": "none"}, # Recommended with gpt-5.1
|
|
288
|
+
)
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
class Translation(BaseModel):
|
|
292
|
+
en: str
|
|
293
|
+
fr: str
|
|
294
|
+
ja: str
|
|
295
|
+
|
|
296
|
+
spark.udf.register(
|
|
297
|
+
"translate_struct",
|
|
298
|
+
responses_udf(
|
|
299
|
+
instructions="Translate the text to English, French, and Japanese.",
|
|
300
|
+
response_format=Translation,
|
|
301
|
+
reasoning={"effort": "none"}, # Recommended with gpt-5.1
|
|
302
|
+
)
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
spark.udf.register("embed_text", embeddings_udf())
|
|
306
|
+
spark.udf.register("count_tokens", count_tokens_udf())
|
|
307
|
+
spark.udf.register("compute_similarity", similarity_udf())
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
### Spark performance tips
|
|
311
|
+
|
|
312
|
+
- Duplicate detection automatically caches repeated inputs per partition for UDFs.
|
|
313
|
+
- `batch_size=None` auto-optimizes; set 32–128 for fixed sizes if needed.
|
|
314
|
+
- `max_concurrency` is per executor; total concurrency = executors × max_concurrency. Start with 4–12.
|
|
315
|
+
- Monitor rate limits and adjust concurrency to your OpenAI tier.
|
|
316
|
+
|
|
317
|
+
## Building Prompts
|
|
318
|
+
|
|
319
|
+
Few-shot prompts improve LLM quality. `FewShotPromptBuilder` structures purpose, cautions, and examples; `improve()` iterates with OpenAI to remove contradictions.
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
from openaivec import FewShotPromptBuilder
|
|
323
|
+
|
|
324
|
+
prompt = (
|
|
325
|
+
FewShotPromptBuilder()
|
|
326
|
+
.purpose("Return the smallest category that includes the given word")
|
|
327
|
+
.caution("Never use proper nouns as categories")
|
|
328
|
+
.example("Apple", "Fruit")
|
|
329
|
+
.example("Car", "Vehicle")
|
|
330
|
+
.improve(max_iter=1) # optional
|
|
331
|
+
.build()
|
|
332
|
+
)
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
📓 **[Advanced prompting techniques →](https://microsoft.github.io/openaivec/examples/prompt/)**
|
|
336
|
+
|
|
337
|
+
## Using with Microsoft Fabric
|
|
338
|
+
|
|
339
|
+
[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
|
|
340
|
+
|
|
341
|
+
## Contributing
|
|
342
|
+
|
|
343
|
+
We welcome contributions! Please:
|
|
344
|
+
|
|
345
|
+
1. Fork and branch from `main`.
|
|
346
|
+
2. Add or update tests when you change code.
|
|
347
|
+
3. Run formatting and tests before opening a PR.
|
|
348
|
+
|
|
349
|
+
Install dev deps:
|
|
350
|
+
|
|
351
|
+
```bash
|
|
352
|
+
uv sync --all-extras --dev
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
Lint and format:
|
|
356
|
+
|
|
357
|
+
```bash
|
|
358
|
+
uv run ruff check . --fix
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
Quick test pass:
|
|
362
|
+
|
|
363
|
+
```bash
|
|
364
|
+
uv run pytest -m "not slow and not requires_api"
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
## Additional Resources
|
|
368
|
+
|
|
369
|
+
📓 **[Customer feedback analysis →](https://microsoft.github.io/openaivec/examples/customer_analysis/)** - Sentiment analysis & prioritization
|
|
370
|
+
📓 **[Survey data transformation →](https://microsoft.github.io/openaivec/examples/survey_transformation/)** - Unstructured to structured data
|
|
371
|
+
📓 **[Asynchronous processing examples →](https://microsoft.github.io/openaivec/examples/aio/)** - High-performance async workflows
|
|
372
|
+
📓 **[Auto-generate FAQs from documents →](https://microsoft.github.io/openaivec/examples/generate_faq/)** - Create FAQs using AI
|
|
373
|
+
📓 **[All examples →](https://microsoft.github.io/openaivec/examples/pandas/)** - Complete collection of tutorials and use cases
|
|
374
|
+
|
|
375
|
+
## Community
|
|
376
|
+
|
|
377
|
+
Join our Discord community for support and announcements: https://discord.gg/hXCS9J6Qek
|
|
@@ -9,10 +9,10 @@ openaivec/_responses.py,sha256=Lb37ajlFQoVVac_p9oVf3scUDS3AI1ro4tRlk_UBqVg,20412
|
|
|
9
9
|
openaivec/_serialize.py,sha256=u2Om94Sc_QgJkTlW2BAGw8wd6gYDhc6IRqvS-qevFSs,8399
|
|
10
10
|
openaivec/_util.py,sha256=XfueAycVCQvgRLS7wF7e306b53lebORvZOBzbQjy4vE,6438
|
|
11
11
|
openaivec/pandas_ext.py,sha256=XEmB08FS6lFtk6V7zzM4XHnzPkLCZ08OFFlkX-f0Oko,86730
|
|
12
|
-
openaivec/spark.py,sha256=
|
|
12
|
+
openaivec/spark.py,sha256=OsEuwRRBzNs8Zv-hwDDsXR7A-dqZT651sdRmGkziTOo,35236
|
|
13
13
|
openaivec/_cache/__init__.py,sha256=IYUH5GKsJXuCX-k3XtT259rEz49EZm9KW2TIOTGW4uQ,314
|
|
14
14
|
openaivec/_cache/optimize.py,sha256=3nS8VehbS7iGC1tPDDQh-iAgyKHbVYmMbCRBWM77U_U,3827
|
|
15
|
-
openaivec/_cache/proxy.py,sha256=
|
|
15
|
+
openaivec/_cache/proxy.py,sha256=aVjH_hmJIIso6SetV_-Ct3VaOSG-n9Dpil7TttnbYkE,30556
|
|
16
16
|
openaivec/_schema/__init__.py,sha256=XUj3Jv6ZVDjyYzSmH6Q5lmDj-hBMfUg_eBNeZACXR6Q,368
|
|
17
17
|
openaivec/_schema/infer.py,sha256=gcrpw0OVJMWdmUlzimP-C14cuCAAOnHQd8-bUNR220o,15705
|
|
18
18
|
openaivec/_schema/spec.py,sha256=7ZaC59w2Edemnao57XeZVO4qmSOA-Kus6TchZC3Dd5o,14821
|
|
@@ -33,7 +33,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=1igoAhns-VgsDE8XI47Dw-zeOcR5wEY9
|
|
|
33
33
|
openaivec/task/nlp/translation.py,sha256=TtV7F6bmKPqLi3_Ok7GoOqT_GKJiemotVq-YEbKd6IA,6617
|
|
34
34
|
openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
|
|
35
35
|
openaivec/task/table/fillna.py,sha256=4j27fWT5IzOhQqCPwLhomjBOAWPBslyIBbBMspjqtbw,6877
|
|
36
|
-
openaivec-1.0.
|
|
37
|
-
openaivec-1.0.
|
|
38
|
-
openaivec-1.0.
|
|
39
|
-
openaivec-1.0.
|
|
36
|
+
openaivec-1.0.2.dist-info/METADATA,sha256=jemoWZqfVq031gFXafWoJVjMlo2OYxvGSGhl7MzlSAw,12844
|
|
37
|
+
openaivec-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
38
|
+
openaivec-1.0.2.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
|
|
39
|
+
openaivec-1.0.2.dist-info/RECORD,,
|