openaivec 1.0.1__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {openaivec-1.0.1 → openaivec-1.0.3}/PKG-INFO +26 -12
- {openaivec-1.0.1 → openaivec-1.0.3}/README.md +25 -11
- {openaivec-1.0.1 → openaivec-1.0.3}/mkdocs.yml +2 -1
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_cache/proxy.py +52 -15
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/spark.py +74 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/uv.lock +671 -656
- {openaivec-1.0.1 → openaivec-1.0.3}/.env.example +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/.github/copilot-instructions.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/.github/dependabot.yml +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/.github/workflows/docs.yml +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/.github/workflows/publish.yml +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/.github/workflows/test.yml +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/.gitignore +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/AGENTS.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/CODE_OF_CONDUCT.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/LICENSE +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/SECURITY.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/SUPPORT.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/main.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/pandas_ext.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/spark.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/task.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/translation.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/contributor-guide.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/index.md +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/docs/robots.txt +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/pyproject.toml +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/pytest.ini +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_cache/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_cache/optimize.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_di.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_embeddings.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_log.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_model.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_prompt.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_provider.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_responses.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_schema/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_schema/infer.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_schema/spec.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_serialize.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_util.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/pandas_ext.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/customer_sentiment.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/inquiry_classification.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/inquiry_summary.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/intent_analysis.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/response_suggestion.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/urgency_analysis.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/dependency_parsing.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/keyword_extraction.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/morphological_analysis.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/named_entity_recognition.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/sentiment_analysis.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/translation.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/table/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/table/fillna.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/__init__.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/_cache/test_optimize.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/_cache/test_proxy.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/_cache/test_proxy_suggester.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/_schema/test_infer.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/_schema/test_spec.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/conftest.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_di.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_embeddings.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_pandas_ext.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_prompt.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_provider.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_responses.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_serialize.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_serialize_pydantic_v2_compliance.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_spark.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_task.py +0 -0
- {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_util.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: openaivec
|
|
3
|
-
Version: 1.0.1
|
|
3
|
+
Version: 1.0.3
|
|
4
4
|
Summary: Generative mutation for tabular calculation
|
|
5
5
|
Project-URL: Homepage, https://microsoft.github.io/openaivec/
|
|
6
6
|
Project-URL: Repository, https://github.com/microsoft/openaivec
|
|
@@ -57,13 +57,27 @@ reviews = pd.Series([
|
|
|
57
57
|
|
|
58
58
|
sentiment = reviews.ai.responses(
|
|
59
59
|
"Summarize sentiment in one short sentence.",
|
|
60
|
-
reasoning={"effort": "
|
|
60
|
+
reasoning={"effort": "none"}, # Mirrors OpenAI SDK for reasoning models
|
|
61
61
|
)
|
|
62
62
|
print(sentiment.tolist())
|
|
63
63
|
```
|
|
64
64
|
|
|
65
65
|
**Try it live:** https://microsoft.github.io/openaivec/examples/pandas/
|
|
66
66
|
|
|
67
|
+
## Benchmarks
|
|
68
|
+
|
|
69
|
+
Simple task benchmark from [benchmark.ipynb](https://github.com/microsoft/openaivec/blob/main/docs/examples/benchmark.ipynb) (100 numeric strings → integer literals, `Series.aio.responses`, model `gpt-5.1`):
|
|
70
|
+
|
|
71
|
+
| Mode | Settings | Time (s) |
|
|
72
|
+
| ------------------- | ----------------------------------------------- | -------- |
|
|
73
|
+
| Serial | `batch_size=1`, `max_concurrency=1` | ~141 |
|
|
74
|
+
| Batching | default `batch_size`, `max_concurrency=1` | ~15 |
|
|
75
|
+
| Concurrent batching | default `batch_size`, default `max_concurrency` | ~6 |
|
|
76
|
+
|
|
77
|
+
Batching alone removes most HTTP overhead, and letting batching overlap with concurrency cuts total runtime to a few seconds while still yielding one output per input.
|
|
78
|
+
|
|
79
|
+

|
|
80
|
+
|
|
67
81
|
## Contents
|
|
68
82
|
|
|
69
83
|
- [Why openaivec?](#why-openaivec)
|
|
@@ -109,7 +123,7 @@ client = BatchResponses.of(
|
|
|
109
123
|
|
|
110
124
|
result = client.parse(
|
|
111
125
|
["panda", "rabbit", "koala"],
|
|
112
|
-
reasoning={"effort": "
|
|
126
|
+
reasoning={"effort": "none"},
|
|
113
127
|
)
|
|
114
128
|
print(result) # Expected output: ['bear family', 'rabbit family', 'koala family']
|
|
115
129
|
```
|
|
@@ -147,15 +161,15 @@ df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
|
|
|
147
161
|
result = df.assign(
|
|
148
162
|
family=lambda df: df.name.ai.responses(
|
|
149
163
|
"What animal family? Answer with 'X family'",
|
|
150
|
-
reasoning={"effort": "
|
|
164
|
+
reasoning={"effort": "none"},
|
|
151
165
|
),
|
|
152
166
|
habitat=lambda df: df.name.ai.responses(
|
|
153
167
|
"Primary habitat in one word",
|
|
154
|
-
reasoning={"effort": "
|
|
168
|
+
reasoning={"effort": "none"},
|
|
155
169
|
),
|
|
156
170
|
fun_fact=lambda df: df.name.ai.responses(
|
|
157
171
|
"One interesting fact in 10 words or less",
|
|
158
|
-
reasoning={"effort": "
|
|
172
|
+
reasoning={"effort": "none"},
|
|
159
173
|
),
|
|
160
174
|
)
|
|
161
175
|
```
|
|
@@ -178,7 +192,7 @@ pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
|
|
|
178
192
|
result = df.assign(
|
|
179
193
|
analysis=lambda df: df.text.ai.responses(
|
|
180
194
|
"Analyze this text step by step",
|
|
181
|
-
reasoning={"effort": "
|
|
195
|
+
reasoning={"effort": "none"} # Optional: mirrors the OpenAI SDK argument
|
|
182
196
|
)
|
|
183
197
|
)
|
|
184
198
|
```
|
|
@@ -232,7 +246,7 @@ df = pd.DataFrame({"text": [
|
|
|
232
246
|
async def process_data():
|
|
233
247
|
return await df["text"].aio.responses(
|
|
234
248
|
"Analyze sentiment and classify as positive/negative/neutral",
|
|
235
|
-
reasoning={"effort": "
|
|
249
|
+
reasoning={"effort": "none"}, # Required for gpt-5.1
|
|
236
250
|
max_concurrency=12 # Allow up to 12 concurrent requests
|
|
237
251
|
)
|
|
238
252
|
|
|
@@ -284,7 +298,7 @@ spark.udf.register(
|
|
|
284
298
|
"extract_brand",
|
|
285
299
|
responses_udf(
|
|
286
300
|
instructions="Extract the brand name from the product. Return only the brand name.",
|
|
287
|
-
reasoning={"effort": "
|
|
301
|
+
reasoning={"effort": "none"}, # Recommended with gpt-5.1
|
|
288
302
|
)
|
|
289
303
|
)
|
|
290
304
|
|
|
@@ -298,7 +312,7 @@ spark.udf.register(
|
|
|
298
312
|
responses_udf(
|
|
299
313
|
instructions="Translate the text to English, French, and Japanese.",
|
|
300
314
|
response_format=Translation,
|
|
301
|
-
reasoning={"effort": "
|
|
315
|
+
reasoning={"effort": "none"}, # Recommended with gpt-5.1
|
|
302
316
|
)
|
|
303
317
|
)
|
|
304
318
|
|
|
@@ -336,7 +350,7 @@ prompt = (
|
|
|
336
350
|
|
|
337
351
|
## Using with Microsoft Fabric
|
|
338
352
|
|
|
339
|
-
[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
|
|
353
|
+
[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
|
|
340
354
|
|
|
341
355
|
## Contributing
|
|
342
356
|
|
|
@@ -374,4 +388,4 @@ uv run pytest -m "not slow and not requires_api"
|
|
|
374
388
|
|
|
375
389
|
## Community
|
|
376
390
|
|
|
377
|
-
Join our Discord community for support and announcements: https://discord.gg/
|
|
391
|
+
Join our Discord community for support and announcements: https://discord.gg/hXCS9J6Qek
|
|
@@ -31,13 +31,27 @@ reviews = pd.Series([
|
|
|
31
31
|
|
|
32
32
|
sentiment = reviews.ai.responses(
|
|
33
33
|
"Summarize sentiment in one short sentence.",
|
|
34
|
-
reasoning={"effort": "
|
|
34
|
+
reasoning={"effort": "none"}, # Mirrors OpenAI SDK for reasoning models
|
|
35
35
|
)
|
|
36
36
|
print(sentiment.tolist())
|
|
37
37
|
```
|
|
38
38
|
|
|
39
39
|
**Try it live:** https://microsoft.github.io/openaivec/examples/pandas/
|
|
40
40
|
|
|
41
|
+
## Benchmarks
|
|
42
|
+
|
|
43
|
+
Simple task benchmark from [benchmark.ipynb](https://github.com/microsoft/openaivec/blob/main/docs/examples/benchmark.ipynb) (100 numeric strings → integer literals, `Series.aio.responses`, model `gpt-5.1`):
|
|
44
|
+
|
|
45
|
+
| Mode | Settings | Time (s) |
|
|
46
|
+
| ------------------- | ----------------------------------------------- | -------- |
|
|
47
|
+
| Serial | `batch_size=1`, `max_concurrency=1` | ~141 |
|
|
48
|
+
| Batching | default `batch_size`, `max_concurrency=1` | ~15 |
|
|
49
|
+
| Concurrent batching | default `batch_size`, default `max_concurrency` | ~6 |
|
|
50
|
+
|
|
51
|
+
Batching alone removes most HTTP overhead, and letting batching overlap with concurrency cuts total runtime to a few seconds while still yielding one output per input.
|
|
52
|
+
|
|
53
|
+

|
|
54
|
+
|
|
41
55
|
## Contents
|
|
42
56
|
|
|
43
57
|
- [Why openaivec?](#why-openaivec)
|
|
@@ -83,7 +97,7 @@ client = BatchResponses.of(
|
|
|
83
97
|
|
|
84
98
|
result = client.parse(
|
|
85
99
|
["panda", "rabbit", "koala"],
|
|
86
|
-
reasoning={"effort": "
|
|
100
|
+
reasoning={"effort": "none"},
|
|
87
101
|
)
|
|
88
102
|
print(result) # Expected output: ['bear family', 'rabbit family', 'koala family']
|
|
89
103
|
```
|
|
@@ -121,15 +135,15 @@ df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
|
|
|
121
135
|
result = df.assign(
|
|
122
136
|
family=lambda df: df.name.ai.responses(
|
|
123
137
|
"What animal family? Answer with 'X family'",
|
|
124
|
-
reasoning={"effort": "
|
|
138
|
+
reasoning={"effort": "none"},
|
|
125
139
|
),
|
|
126
140
|
habitat=lambda df: df.name.ai.responses(
|
|
127
141
|
"Primary habitat in one word",
|
|
128
|
-
reasoning={"effort": "
|
|
142
|
+
reasoning={"effort": "none"},
|
|
129
143
|
),
|
|
130
144
|
fun_fact=lambda df: df.name.ai.responses(
|
|
131
145
|
"One interesting fact in 10 words or less",
|
|
132
|
-
reasoning={"effort": "
|
|
146
|
+
reasoning={"effort": "none"},
|
|
133
147
|
),
|
|
134
148
|
)
|
|
135
149
|
```
|
|
@@ -152,7 +166,7 @@ pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
|
|
|
152
166
|
result = df.assign(
|
|
153
167
|
analysis=lambda df: df.text.ai.responses(
|
|
154
168
|
"Analyze this text step by step",
|
|
155
|
-
reasoning={"effort": "
|
|
169
|
+
reasoning={"effort": "none"} # Optional: mirrors the OpenAI SDK argument
|
|
156
170
|
)
|
|
157
171
|
)
|
|
158
172
|
```
|
|
@@ -206,7 +220,7 @@ df = pd.DataFrame({"text": [
|
|
|
206
220
|
async def process_data():
|
|
207
221
|
return await df["text"].aio.responses(
|
|
208
222
|
"Analyze sentiment and classify as positive/negative/neutral",
|
|
209
|
-
reasoning={"effort": "
|
|
223
|
+
reasoning={"effort": "none"}, # Required for gpt-5.1
|
|
210
224
|
max_concurrency=12 # Allow up to 12 concurrent requests
|
|
211
225
|
)
|
|
212
226
|
|
|
@@ -258,7 +272,7 @@ spark.udf.register(
|
|
|
258
272
|
"extract_brand",
|
|
259
273
|
responses_udf(
|
|
260
274
|
instructions="Extract the brand name from the product. Return only the brand name.",
|
|
261
|
-
reasoning={"effort": "
|
|
275
|
+
reasoning={"effort": "none"}, # Recommended with gpt-5.1
|
|
262
276
|
)
|
|
263
277
|
)
|
|
264
278
|
|
|
@@ -272,7 +286,7 @@ spark.udf.register(
|
|
|
272
286
|
responses_udf(
|
|
273
287
|
instructions="Translate the text to English, French, and Japanese.",
|
|
274
288
|
response_format=Translation,
|
|
275
|
-
reasoning={"effort": "
|
|
289
|
+
reasoning={"effort": "none"}, # Recommended with gpt-5.1
|
|
276
290
|
)
|
|
277
291
|
)
|
|
278
292
|
|
|
@@ -310,7 +324,7 @@ prompt = (
|
|
|
310
324
|
|
|
311
325
|
## Using with Microsoft Fabric
|
|
312
326
|
|
|
313
|
-
[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
|
|
327
|
+
[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
|
|
314
328
|
|
|
315
329
|
## Contributing
|
|
316
330
|
|
|
@@ -348,4 +362,4 @@ uv run pytest -m "not slow and not requires_api"
|
|
|
348
362
|
|
|
349
363
|
## Community
|
|
350
364
|
|
|
351
|
-
Join our Discord community for support and announcements: https://discord.gg/
|
|
365
|
+
Join our Discord community for support and announcements: https://discord.gg/hXCS9J6Qek
|
|
@@ -63,6 +63,7 @@ nav:
|
|
|
63
63
|
- Prompt Engineering: examples/prompt.ipynb
|
|
64
64
|
- FAQ Generation: examples/generate_faq.ipynb
|
|
65
65
|
- Token Count and Processing Time: examples/batch_size.ipynb
|
|
66
|
+
- Request Batching Benchmark: examples/benchmark.ipynb
|
|
66
67
|
- API Reference:
|
|
67
68
|
- Main Package: api/main.md
|
|
68
69
|
- pandas_ext: api/pandas_ext.md
|
|
@@ -121,7 +122,7 @@ extra:
|
|
|
121
122
|
- icon: fontawesome/brands/python
|
|
122
123
|
link: https://pypi.org/project/openaivec/
|
|
123
124
|
- icon: fontawesome/brands/discord
|
|
124
|
-
link: https://discord.gg/
|
|
125
|
+
link: https://discord.gg/hXCS9J6Qek
|
|
125
126
|
|
|
126
127
|
plugins:
|
|
127
128
|
- search:
|
|
@@ -186,11 +186,15 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
186
186
|
performance (targeting 30-60 seconds per batch).
|
|
187
187
|
|
|
188
188
|
Example:
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
189
|
+
```python
|
|
190
|
+
p = BatchingMapProxy[int, str](batch_size=3)
|
|
191
|
+
|
|
192
|
+
def f(xs: list[int]) -> list[str]:
|
|
193
|
+
return [f"v:{x}" for x in xs]
|
|
194
|
+
|
|
195
|
+
p.map([1, 2, 2, 3, 4], f)
|
|
196
|
+
# ['v:1', 'v:2', 'v:2', 'v:3', 'v:4']
|
|
197
|
+
```
|
|
194
198
|
"""
|
|
195
199
|
|
|
196
200
|
# Number of items to process per call to map_func.
|
|
@@ -449,6 +453,21 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
449
453
|
|
|
450
454
|
Raises:
|
|
451
455
|
Exception: Propagates any exception raised by ``map_func``.
|
|
456
|
+
|
|
457
|
+
Example:
|
|
458
|
+
```python
|
|
459
|
+
proxy: BatchingMapProxy[int, str] = BatchingMapProxy(batch_size=2)
|
|
460
|
+
calls: list[list[int]] = []
|
|
461
|
+
|
|
462
|
+
def mapper(chunk: list[int]) -> list[str]:
|
|
463
|
+
calls.append(chunk)
|
|
464
|
+
return [f"v:{x}" for x in chunk]
|
|
465
|
+
|
|
466
|
+
proxy.map([1, 2, 2, 3], mapper)
|
|
467
|
+
# ['v:1', 'v:2', 'v:2', 'v:3']
|
|
468
|
+
calls # duplicate ``2`` is only computed once
|
|
469
|
+
# [[1, 2], [3]]
|
|
470
|
+
```
|
|
452
471
|
"""
|
|
453
472
|
if self.__all_cached(items):
|
|
454
473
|
return self.__values(items)
|
|
@@ -490,16 +509,21 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
490
509
|
performance (targeting 30-60 seconds per batch).
|
|
491
510
|
|
|
492
511
|
Example:
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
512
|
+
```python
|
|
513
|
+
import asyncio
|
|
514
|
+
|
|
515
|
+
p = AsyncBatchingMapProxy[int, str](batch_size=2)
|
|
516
|
+
|
|
517
|
+
async def af(xs: list[int]) -> list[str]:
|
|
518
|
+
await asyncio.sleep(0)
|
|
519
|
+
return [f"v:{x}" for x in xs]
|
|
520
|
+
|
|
521
|
+
async def run():
|
|
522
|
+
return await p.map([1, 2, 3], af)
|
|
523
|
+
|
|
524
|
+
asyncio.run(run())
|
|
525
|
+
# ['v:1', 'v:2', 'v:3']
|
|
526
|
+
```
|
|
503
527
|
"""
|
|
504
528
|
|
|
505
529
|
# Number of items to process per call to map_func.
|
|
@@ -747,6 +771,19 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
|
|
|
747
771
|
|
|
748
772
|
Returns:
|
|
749
773
|
list[T]: Mapped values corresponding to ``items`` in the same order.
|
|
774
|
+
|
|
775
|
+
Example:
|
|
776
|
+
```python
|
|
777
|
+
import asyncio
|
|
778
|
+
|
|
779
|
+
async def mapper(chunk: list[int]) -> list[str]:
|
|
780
|
+
await asyncio.sleep(0)
|
|
781
|
+
return [f"v:{x}" for x in chunk]
|
|
782
|
+
|
|
783
|
+
proxy: AsyncBatchingMapProxy[int, str] = AsyncBatchingMapProxy(batch_size=2)
|
|
784
|
+
asyncio.run(proxy.map([1, 1, 2], mapper))
|
|
785
|
+
# ['v:1', 'v:1', 'v:2']
|
|
786
|
+
```
|
|
750
787
|
"""
|
|
751
788
|
if await self.__all_cached(items):
|
|
752
789
|
return await self.__values(items)
|
|
@@ -181,6 +181,20 @@ def setup(
|
|
|
181
181
|
If provided, registers `ResponsesModelName` in the DI container.
|
|
182
182
|
embeddings_model_name (str | None): Default model name for embeddings.
|
|
183
183
|
If provided, registers `EmbeddingsModelName` in the DI container.
|
|
184
|
+
|
|
185
|
+
Example:
|
|
186
|
+
```python
|
|
187
|
+
from pyspark.sql import SparkSession
|
|
188
|
+
from openaivec.spark import setup
|
|
189
|
+
|
|
190
|
+
spark = SparkSession.builder.getOrCreate()
|
|
191
|
+
setup(
|
|
192
|
+
spark,
|
|
193
|
+
api_key="sk-***",
|
|
194
|
+
responses_model_name="gpt-4.1-mini",
|
|
195
|
+
embeddings_model_name="text-embedding-3-small",
|
|
196
|
+
)
|
|
197
|
+
```
|
|
184
198
|
"""
|
|
185
199
|
|
|
186
200
|
CONTAINER.register(SparkSession, lambda: spark)
|
|
@@ -221,6 +235,22 @@ def setup_azure(
|
|
|
221
235
|
If provided, registers `ResponsesModelName` in the DI container.
|
|
222
236
|
embeddings_model_name (str | None): Default model name for embeddings.
|
|
223
237
|
If provided, registers `EmbeddingsModelName` in the DI container.
|
|
238
|
+
|
|
239
|
+
Example:
|
|
240
|
+
```python
|
|
241
|
+
from pyspark.sql import SparkSession
|
|
242
|
+
from openaivec.spark import setup_azure
|
|
243
|
+
|
|
244
|
+
spark = SparkSession.builder.getOrCreate()
|
|
245
|
+
setup_azure(
|
|
246
|
+
spark,
|
|
247
|
+
api_key="azure-key",
|
|
248
|
+
base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
|
|
249
|
+
api_version="preview",
|
|
250
|
+
responses_model_name="gpt4-deployment",
|
|
251
|
+
embeddings_model_name="embedding-deployment",
|
|
252
|
+
)
|
|
253
|
+
```
|
|
224
254
|
"""
|
|
225
255
|
|
|
226
256
|
CONTAINER.register(SparkSession, lambda: spark)
|
|
@@ -375,6 +405,19 @@ def responses_udf(
|
|
|
375
405
|
Raises:
|
|
376
406
|
ValueError: If `response_format` is not `str` or a Pydantic `BaseModel`.
|
|
377
407
|
|
|
408
|
+
Example:
|
|
409
|
+
```python
|
|
410
|
+
from pyspark.sql import SparkSession
|
|
411
|
+
from openaivec.spark import responses_udf, setup
|
|
412
|
+
|
|
413
|
+
spark = SparkSession.builder.getOrCreate()
|
|
414
|
+
setup(spark, api_key="sk-***", responses_model_name="gpt-4.1-mini")
|
|
415
|
+
udf = responses_udf("Reply with one word.")
|
|
416
|
+
spark.udf.register("short_answer", udf)
|
|
417
|
+
df = spark.createDataFrame([("hello",), ("bye",)], ["text"])
|
|
418
|
+
df.selectExpr("short_answer(text) as reply").show()
|
|
419
|
+
```
|
|
420
|
+
|
|
378
421
|
Note:
|
|
379
422
|
For optimal performance in distributed environments:
|
|
380
423
|
- **Automatic Caching**: Duplicate inputs within each partition are cached,
|
|
@@ -533,6 +576,20 @@ def infer_schema(
|
|
|
533
576
|
|
|
534
577
|
Returns:
|
|
535
578
|
InferredSchema: An object containing the inferred schema and response format.
|
|
579
|
+
|
|
580
|
+
Example:
|
|
581
|
+
```python
|
|
582
|
+
from pyspark.sql import SparkSession
|
|
583
|
+
|
|
584
|
+
spark = SparkSession.builder.getOrCreate()
|
|
585
|
+
spark.createDataFrame([("great product",), ("bad service",)], ["text"]).createOrReplaceTempView("examples")
|
|
586
|
+
infer_schema(
|
|
587
|
+
instructions="Classify sentiment as positive or negative.",
|
|
588
|
+
example_table_name="examples",
|
|
589
|
+
example_field_name="text",
|
|
590
|
+
max_examples=2,
|
|
591
|
+
)
|
|
592
|
+
```
|
|
536
593
|
"""
|
|
537
594
|
|
|
538
595
|
spark = CONTAINER.resolve(SparkSession)
|
|
@@ -595,6 +652,23 @@ def parse_udf(
|
|
|
595
652
|
forwarded verbatim to the underlying API calls. These parameters are applied to
|
|
596
653
|
all API requests made by the UDF and override any parameters set in the
|
|
597
654
|
response_format or example data.
|
|
655
|
+
Example:
|
|
656
|
+
```python
|
|
657
|
+
from pyspark.sql import SparkSession
|
|
658
|
+
|
|
659
|
+
spark = SparkSession.builder.getOrCreate()
|
|
660
|
+
spark.createDataFrame(
|
|
661
|
+
[("Order #123 delivered",), ("Order #456 delayed",)],
|
|
662
|
+
["body"],
|
|
663
|
+
).createOrReplaceTempView("messages")
|
|
664
|
+
udf = parse_udf(
|
|
665
|
+
instructions="Extract order id as `order_id` and status as `status`.",
|
|
666
|
+
example_table_name="messages",
|
|
667
|
+
example_field_name="body",
|
|
668
|
+
)
|
|
669
|
+
spark.udf.register("parse_ticket", udf)
|
|
670
|
+
spark.sql("SELECT parse_ticket(body) AS parsed FROM messages").show()
|
|
671
|
+
```
|
|
598
672
|
Returns:
|
|
599
673
|
UserDefinedFunction: A Spark pandas UDF configured to parse responses asynchronously.
|
|
600
674
|
Output schema is `StringType` for str response format or a struct derived from
|