openaivec 1.0.1.tar.gz → 1.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {openaivec-1.0.1 → openaivec-1.0.3}/PKG-INFO +26 -12
  2. {openaivec-1.0.1 → openaivec-1.0.3}/README.md +25 -11
  3. {openaivec-1.0.1 → openaivec-1.0.3}/mkdocs.yml +2 -1
  4. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_cache/proxy.py +52 -15
  5. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/spark.py +74 -0
  6. {openaivec-1.0.1 → openaivec-1.0.3}/uv.lock +671 -656
  7. {openaivec-1.0.1 → openaivec-1.0.3}/.env.example +0 -0
  8. {openaivec-1.0.1 → openaivec-1.0.3}/.github/copilot-instructions.md +0 -0
  9. {openaivec-1.0.1 → openaivec-1.0.3}/.github/dependabot.yml +0 -0
  10. {openaivec-1.0.1 → openaivec-1.0.3}/.github/workflows/docs.yml +0 -0
  11. {openaivec-1.0.1 → openaivec-1.0.3}/.github/workflows/publish.yml +0 -0
  12. {openaivec-1.0.1 → openaivec-1.0.3}/.github/workflows/test.yml +0 -0
  13. {openaivec-1.0.1 → openaivec-1.0.3}/.gitignore +0 -0
  14. {openaivec-1.0.1 → openaivec-1.0.3}/AGENTS.md +0 -0
  15. {openaivec-1.0.1 → openaivec-1.0.3}/CODE_OF_CONDUCT.md +0 -0
  16. {openaivec-1.0.1 → openaivec-1.0.3}/LICENSE +0 -0
  17. {openaivec-1.0.1 → openaivec-1.0.3}/SECURITY.md +0 -0
  18. {openaivec-1.0.1 → openaivec-1.0.3}/SUPPORT.md +0 -0
  19. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/main.md +0 -0
  20. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/pandas_ext.md +0 -0
  21. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/spark.md +0 -0
  22. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/task.md +0 -0
  23. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/customer_sentiment.md +0 -0
  24. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/inquiry_classification.md +0 -0
  25. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/inquiry_summary.md +0 -0
  26. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/intent_analysis.md +0 -0
  27. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/response_suggestion.md +0 -0
  28. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/customer_support/urgency_analysis.md +0 -0
  29. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/dependency_parsing.md +0 -0
  30. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/keyword_extraction.md +0 -0
  31. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/morphological_analysis.md +0 -0
  32. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/named_entity_recognition.md +0 -0
  33. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/sentiment_analysis.md +0 -0
  34. {openaivec-1.0.1 → openaivec-1.0.3}/docs/api/tasks/nlp/translation.md +0 -0
  35. {openaivec-1.0.1 → openaivec-1.0.3}/docs/contributor-guide.md +0 -0
  36. {openaivec-1.0.1 → openaivec-1.0.3}/docs/index.md +0 -0
  37. {openaivec-1.0.1 → openaivec-1.0.3}/docs/robots.txt +0 -0
  38. {openaivec-1.0.1 → openaivec-1.0.3}/pyproject.toml +0 -0
  39. {openaivec-1.0.1 → openaivec-1.0.3}/pytest.ini +0 -0
  40. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/__init__.py +0 -0
  41. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_cache/__init__.py +0 -0
  42. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_cache/optimize.py +0 -0
  43. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_di.py +0 -0
  44. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_embeddings.py +0 -0
  45. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_log.py +0 -0
  46. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_model.py +0 -0
  47. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_prompt.py +0 -0
  48. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_provider.py +0 -0
  49. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_responses.py +0 -0
  50. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_schema/__init__.py +0 -0
  51. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_schema/infer.py +0 -0
  52. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_schema/spec.py +0 -0
  53. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_serialize.py +0 -0
  54. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/_util.py +0 -0
  55. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/pandas_ext.py +0 -0
  56. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/__init__.py +0 -0
  57. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/__init__.py +0 -0
  58. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/customer_sentiment.py +0 -0
  59. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/inquiry_classification.py +0 -0
  60. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/inquiry_summary.py +0 -0
  61. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/intent_analysis.py +0 -0
  62. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/response_suggestion.py +0 -0
  63. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/customer_support/urgency_analysis.py +0 -0
  64. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/__init__.py +0 -0
  65. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/dependency_parsing.py +0 -0
  66. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/keyword_extraction.py +0 -0
  67. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/morphological_analysis.py +0 -0
  68. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/named_entity_recognition.py +0 -0
  69. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/sentiment_analysis.py +0 -0
  70. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/nlp/translation.py +0 -0
  71. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/table/__init__.py +0 -0
  72. {openaivec-1.0.1 → openaivec-1.0.3}/src/openaivec/task/table/fillna.py +0 -0
  73. {openaivec-1.0.1 → openaivec-1.0.3}/tests/__init__.py +0 -0
  74. {openaivec-1.0.1 → openaivec-1.0.3}/tests/_cache/test_optimize.py +0 -0
  75. {openaivec-1.0.1 → openaivec-1.0.3}/tests/_cache/test_proxy.py +0 -0
  76. {openaivec-1.0.1 → openaivec-1.0.3}/tests/_cache/test_proxy_suggester.py +0 -0
  77. {openaivec-1.0.1 → openaivec-1.0.3}/tests/_schema/test_infer.py +0 -0
  78. {openaivec-1.0.1 → openaivec-1.0.3}/tests/_schema/test_spec.py +0 -0
  79. {openaivec-1.0.1 → openaivec-1.0.3}/tests/conftest.py +0 -0
  80. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_di.py +0 -0
  81. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_embeddings.py +0 -0
  82. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_pandas_ext.py +0 -0
  83. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_prompt.py +0 -0
  84. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_provider.py +0 -0
  85. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_responses.py +0 -0
  86. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_serialize.py +0 -0
  87. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_serialize_pydantic_v2_compliance.py +0 -0
  88. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_spark.py +0 -0
  89. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_task.py +0 -0
  90. {openaivec-1.0.1 → openaivec-1.0.3}/tests/test_util.py +0 -0
--- openaivec-1.0.1/PKG-INFO
+++ openaivec-1.0.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openaivec
-Version: 1.0.1
+Version: 1.0.3
 Summary: Generative mutation for tabular calculation
 Project-URL: Homepage, https://microsoft.github.io/openaivec/
 Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -57,13 +57,27 @@ reviews = pd.Series([
 
 sentiment = reviews.ai.responses(
     "Summarize sentiment in one short sentence.",
-    reasoning={"effort": "medium"},  # Mirrors OpenAI SDK for reasoning models
+    reasoning={"effort": "none"},  # Mirrors OpenAI SDK for reasoning models
 )
 print(sentiment.tolist())
 ```
 
 **Try it live:** https://microsoft.github.io/openaivec/examples/pandas/
 
+## Benchmarks
+
+Simple task benchmark from [benchmark.ipynb](https://github.com/microsoft/openaivec/blob/main/docs/examples/benchmark.ipynb) (100 numeric strings → integer literals, `Series.aio.responses`, model `gpt-5.1`):
+
+| Mode                | Settings                                         | Time (s) |
+| ------------------- | ------------------------------------------------ | -------- |
+| Serial              | `batch_size=1`, `max_concurrency=1`              | ~141     |
+| Batching            | default `batch_size`, `max_concurrency=1`        | ~15      |
+| Concurrent batching | default `batch_size`, default `max_concurrency`  | ~6       |
+
+Batching alone removes most HTTP overhead, and letting batching overlap with concurrency cuts total runtime to a few seconds while still yielding one output per input.
+
+![Benchmark comparison for simple task](https://private-user-images.githubusercontent.com/6128022/519474214-d1931e34-6f9e-4695-8042-88b771e002c3.png?jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3NjQyMDc5ODAsIm5iZiI6MTc2NDIwNzY4MCwicGF0aCI6Ii82MTI4MDIyLzUxOTQ3NDIxNC1kMTkzMWUzNC02ZjllLTQ2OTUtODA0Mi04OGI3NzFlMDAyYzMucG5nP1gtQW16LUFsZ29yaXRobT1BV1M0LUhNQUMtU0hBMjU2JlgtQW16LUNyZWRlbnRpYWw9QUtJQVZDT0RZTFNBNTNQUUs0WkElMkYyMDI1MTEyNyUyRnVzLWVhc3QtMSUyRnMzJTJGYXdzNF9yZXF1ZXN0JlgtQW16LURhdGU9MjAyNTExMjdUMDE0MTIwWiZYLUFtei1FeHBpcmVzPTMwMCZYLUFtei1TaWduYXR1cmU9Y2JhYmU2YjZhNDUxNDkxZDg5NGMxZGI1OTUzODgyYjQ4OTVhYzEzZjU3NmRkMjE1M2Y1ZDI3ZTdiNWI0M2VlMCZYLUFtei1TaWduZWRIZWFkZXJzPWhvc3QifQ.yuxT4AbDIBNsRGCIxPMjpGiHFqLcQUCLg_DjpqH02Lw)
+
 ## Contents
 
 - [Why openaivec?](#why-openaivec)
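The three rows in the new benchmark table correspond to `Series.aio.responses` calls along the following lines — a minimal sketch, assuming a notebook context (top-level `await`), an `OPENAI_API_KEY` in the environment, and illustrative prompt wording (the exact code lives in the linked benchmark.ipynb):

```python
# Sketch of the three benchmark modes; timings are the table's, not guaranteed.
import pandas as pd
from openaivec import pandas_ext  # registers the .ai / .aio accessors

numbers = pd.Series([str(i) for i in range(100)])
instruction = "Return the value as an integer literal."  # illustrative prompt

# Serial: one item per request, one request in flight (~141 s in the table).
await numbers.aio.responses(instruction, batch_size=1, max_concurrency=1)

# Batching only: many items per request, still one request at a time (~15 s).
await numbers.aio.responses(instruction, max_concurrency=1)

# Concurrent batching: default batch_size and max_concurrency (~6 s).
await numbers.aio.responses(instruction)
```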
@@ -109,7 +123,7 @@ client = BatchResponses.of(
 
 result = client.parse(
     ["panda", "rabbit", "koala"],
-    reasoning={"effort": "medium"},  # Required for gpt-5.1
+    reasoning={"effort": "none"},
 )
 print(result)  # Expected output: ['bear family', 'rabbit family', 'koala family']
 ```
@@ -147,15 +161,15 @@ df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
 result = df.assign(
     family=lambda df: df.name.ai.responses(
         "What animal family? Answer with 'X family'",
-        reasoning={"effort": "medium"},
+        reasoning={"effort": "none"},
     ),
     habitat=lambda df: df.name.ai.responses(
         "Primary habitat in one word",
-        reasoning={"effort": "medium"},
+        reasoning={"effort": "none"},
     ),
     fun_fact=lambda df: df.name.ai.responses(
         "One interesting fact in 10 words or less",
-        reasoning={"effort": "medium"},
+        reasoning={"effort": "none"},
     ),
 )
 ```
@@ -178,7 +192,7 @@ pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
 result = df.assign(
     analysis=lambda df: df.text.ai.responses(
         "Analyze this text step by step",
-        reasoning={"effort": "medium"}  # Optional: mirrors the OpenAI SDK argument
+        reasoning={"effort": "none"}  # Optional: mirrors the OpenAI SDK argument
     )
 )
 ```
@@ -232,7 +246,7 @@ df = pd.DataFrame({"text": [
 async def process_data():
     return await df["text"].aio.responses(
         "Analyze sentiment and classify as positive/negative/neutral",
-        reasoning={"effort": "medium"},  # Required for gpt-5.1
+        reasoning={"effort": "none"},  # Required for gpt-5.1
         max_concurrency=12  # Allow up to 12 concurrent requests
     )
 
@@ -284,7 +298,7 @@ spark.udf.register(
     "extract_brand",
     responses_udf(
         instructions="Extract the brand name from the product. Return only the brand name.",
-        reasoning={"effort": "medium"},  # Recommended with gpt-5.1
+        reasoning={"effort": "none"},  # Recommended with gpt-5.1
     )
 )
 
@@ -298,7 +312,7 @@ spark.udf.register(
     responses_udf(
         instructions="Translate the text to English, French, and Japanese.",
         response_format=Translation,
-        reasoning={"effort": "medium"},  # Recommended with gpt-5.1
+        reasoning={"effort": "none"},  # Recommended with gpt-5.1
     )
 )
 
@@ -336,7 +350,7 @@ prompt = (
 
 ## Using with Microsoft Fabric
 
-[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark. Detailed walkthrough: 📓 **[Fabric guide →](https://microsoft.github.io/openaivec/examples/fabric/)**.
+[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
 
 ## Contributing
 
@@ -374,4 +388,4 @@ uv run pytest -m "not slow and not requires_api"
 
 ## Community
 
-Join our Discord community for support and announcements: https://discord.gg/vbb83Pgn
+Join our Discord community for support and announcements: https://discord.gg/hXCS9J6Qek
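The Fabric paragraph above is terse; a minimal notebook cell might look like the sketch below. It reuses the `setup_azure` and `responses_udf` signatures added to `spark.py` later in this diff; the deployment names and the `reviews` table are placeholders:

```python
# Hypothetical Microsoft Fabric notebook cell: a sketch, assuming Fabric's
# built-in `spark` session and placeholder Azure OpenAI deployment names.
from openaivec.spark import responses_udf, setup_azure

setup_azure(
    spark,  # provided by the Fabric notebook runtime
    api_key="azure-key",
    base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
    api_version="preview",
    responses_model_name="gpt4-deployment",
)
spark.udf.register("summarize", responses_udf("Summarize in one short sentence."))
spark.sql("SELECT summarize(text) AS summary FROM reviews").show()  # hypothetical table
```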
--- openaivec-1.0.1/README.md
+++ openaivec-1.0.3/README.md
@@ -31,13 +31,27 @@ reviews = pd.Series([
 
 sentiment = reviews.ai.responses(
     "Summarize sentiment in one short sentence.",
-    reasoning={"effort": "medium"},  # Mirrors OpenAI SDK for reasoning models
+    reasoning={"effort": "none"},  # Mirrors OpenAI SDK for reasoning models
 )
 print(sentiment.tolist())
 ```
 
 **Try it live:** https://microsoft.github.io/openaivec/examples/pandas/
 
+## Benchmarks
+
+Simple task benchmark from [benchmark.ipynb](https://github.com/microsoft/openaivec/blob/main/docs/examples/benchmark.ipynb) (100 numeric strings → integer literals, `Series.aio.responses`, model `gpt-5.1`):
+
+| Mode                | Settings                                         | Time (s) |
+| ------------------- | ------------------------------------------------ | -------- |
+| Serial              | `batch_size=1`, `max_concurrency=1`              | ~141     |
+| Batching            | default `batch_size`, `max_concurrency=1`        | ~15      |
+| Concurrent batching | default `batch_size`, default `max_concurrency`  | ~6       |
+
+Batching alone removes most HTTP overhead, and letting batching overlap with concurrency cuts total runtime to a few seconds while still yielding one output per input.
+
+![Benchmark comparison for simple task](https://private-user-images.githubusercontent.com/6128022/519474214-d1931e34-6f9e-4695-8042-88b771e002c3.png?jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3NjQyMDc5ODAsIm5iZiI6MTc2NDIwNzY4MCwicGF0aCI6Ii82MTI4MDIyLzUxOTQ3NDIxNC1kMTkzMWUzNC02ZjllLTQ2OTUtODA0Mi04OGI3NzFlMDAyYzMucG5nP1gtQW16LUFsZ29yaXRobT1BV1M0LUhNQUMtU0hBMjU2JlgtQW16LUNyZWRlbnRpYWw9QUtJQVZDT0RZTFNBNTNQUUs0WkElMkYyMDI1MTEyNyUyRnVzLWVhc3QtMSUyRnMzJTJGYXdzNF9yZXF1ZXN0JlgtQW16LURhdGU9MjAyNTExMjdUMDE0MTIwWiZYLUFtei1FeHBpcmVzPTMwMCZYLUFtei1TaWduYXR1cmU9Y2JhYmU2YjZhNDUxNDkxZDg5NGMxZGI1OTUzODgyYjQ4OTVhYzEzZjU3NmRkMjE1M2Y1ZDI3ZTdiNWI0M2VlMCZYLUFtei1TaWduZWRIZWFkZXJzPWhvc3QifQ.yuxT4AbDIBNsRGCIxPMjpGiHFqLcQUCLg_DjpqH02Lw)
+
 ## Contents
 
 - [Why openaivec?](#why-openaivec)
@@ -83,7 +97,7 @@ client = BatchResponses.of(
 
 result = client.parse(
     ["panda", "rabbit", "koala"],
-    reasoning={"effort": "medium"},  # Required for gpt-5.1
+    reasoning={"effort": "none"},
 )
 print(result)  # Expected output: ['bear family', 'rabbit family', 'koala family']
 ```
@@ -121,15 +135,15 @@ df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
 result = df.assign(
     family=lambda df: df.name.ai.responses(
         "What animal family? Answer with 'X family'",
-        reasoning={"effort": "medium"},
+        reasoning={"effort": "none"},
     ),
     habitat=lambda df: df.name.ai.responses(
         "Primary habitat in one word",
-        reasoning={"effort": "medium"},
+        reasoning={"effort": "none"},
     ),
     fun_fact=lambda df: df.name.ai.responses(
         "One interesting fact in 10 words or less",
-        reasoning={"effort": "medium"},
+        reasoning={"effort": "none"},
     ),
 )
 ```
@@ -152,7 +166,7 @@ pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
 result = df.assign(
     analysis=lambda df: df.text.ai.responses(
         "Analyze this text step by step",
-        reasoning={"effort": "medium"}  # Optional: mirrors the OpenAI SDK argument
+        reasoning={"effort": "none"}  # Optional: mirrors the OpenAI SDK argument
     )
 )
 ```
@@ -206,7 +220,7 @@ df = pd.DataFrame({"text": [
 async def process_data():
     return await df["text"].aio.responses(
         "Analyze sentiment and classify as positive/negative/neutral",
-        reasoning={"effort": "medium"},  # Required for gpt-5.1
+        reasoning={"effort": "none"},  # Required for gpt-5.1
         max_concurrency=12  # Allow up to 12 concurrent requests
     )
 
@@ -258,7 +272,7 @@ spark.udf.register(
     "extract_brand",
     responses_udf(
         instructions="Extract the brand name from the product. Return only the brand name.",
-        reasoning={"effort": "medium"},  # Recommended with gpt-5.1
+        reasoning={"effort": "none"},  # Recommended with gpt-5.1
     )
 )
 
@@ -272,7 +286,7 @@ spark.udf.register(
     responses_udf(
         instructions="Translate the text to English, French, and Japanese.",
         response_format=Translation,
-        reasoning={"effort": "medium"},  # Recommended with gpt-5.1
+        reasoning={"effort": "none"},  # Recommended with gpt-5.1
     )
 )
 
@@ -310,7 +324,7 @@ prompt = (
 
 ## Using with Microsoft Fabric
 
-[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark. Detailed walkthrough: 📓 **[Fabric guide →](https://microsoft.github.io/openaivec/examples/fabric/)**.
+[Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
 
 ## Contributing
 
@@ -348,4 +362,4 @@ uv run pytest -m "not slow and not requires_api"
 
 ## Community
 
-Join our Discord community for support and announcements: https://discord.gg/vbb83Pgn
+Join our Discord community for support and announcements: https://discord.gg/hXCS9J6Qek
--- openaivec-1.0.1/mkdocs.yml
+++ openaivec-1.0.3/mkdocs.yml
@@ -63,6 +63,7 @@ nav:
     - Prompt Engineering: examples/prompt.ipynb
    - FAQ Generation: examples/generate_faq.ipynb
     - Token Count and Processing Time: examples/batch_size.ipynb
+    - Request Batching Benchmark: examples/benchmark.ipynb
   - API Reference:
     - Main Package: api/main.md
     - pandas_ext: api/pandas_ext.md
@@ -121,7 +122,7 @@ extra:
     - icon: fontawesome/brands/python
      link: https://pypi.org/project/openaivec/
     - icon: fontawesome/brands/discord
-      link: https://discord.gg/vbb83Pgn
+      link: https://discord.gg/hXCS9J6Qek
 
 plugins:
   - search:
--- openaivec-1.0.1/src/openaivec/_cache/proxy.py
+++ openaivec-1.0.3/src/openaivec/_cache/proxy.py
@@ -186,11 +186,15 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         performance (targeting 30-60 seconds per batch).
 
     Example:
-        >>> p = BatchingMapProxy[int, str](batch_size=3)
-        >>> def f(xs: list[int]) -> list[str]:
-        ...     return [f"v:{x}" for x in xs]
-        >>> p.map([1, 2, 2, 3, 4], f)
-        ['v:1', 'v:2', 'v:2', 'v:3', 'v:4']
+        ```python
+        p = BatchingMapProxy[int, str](batch_size=3)
+
+        def f(xs: list[int]) -> list[str]:
+            return [f"v:{x}" for x in xs]
+
+        p.map([1, 2, 2, 3, 4], f)
+        # ['v:1', 'v:2', 'v:2', 'v:3', 'v:4']
+        ```
     """
 
     # Number of items to process per call to map_func.
@@ -449,6 +453,21 @@
 
         Raises:
             Exception: Propagates any exception raised by ``map_func``.
+
+        Example:
+            ```python
+            proxy: BatchingMapProxy[int, str] = BatchingMapProxy(batch_size=2)
+            calls: list[list[int]] = []
+
+            def mapper(chunk: list[int]) -> list[str]:
+                calls.append(chunk)
+                return [f"v:{x}" for x in chunk]
+
+            proxy.map([1, 2, 2, 3], mapper)
+            # ['v:1', 'v:2', 'v:2', 'v:3']
+            calls  # duplicate ``2`` is only computed once
+            # [[1, 2], [3]]
+            ```
         """
         if self.__all_cached(items):
             return self.__values(items)
@@ -490,16 +509,21 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         performance (targeting 30-60 seconds per batch).
 
     Example:
-        >>> import asyncio
-        >>> from typing import List
-        >>> p = AsyncBatchingMapProxy[int, str](batch_size=2)
-        >>> async def af(xs: list[int]) -> list[str]:
-        ...     await asyncio.sleep(0)
-        ...     return [f"v:{x}" for x in xs]
-        >>> async def run():
-        ...     return await p.map([1, 2, 3], af)
-        >>> asyncio.run(run())
-        ['v:1', 'v:2', 'v:3']
+        ```python
+        import asyncio
+
+        p = AsyncBatchingMapProxy[int, str](batch_size=2)
+
+        async def af(xs: list[int]) -> list[str]:
+            await asyncio.sleep(0)
+            return [f"v:{x}" for x in xs]
+
+        async def run():
+            return await p.map([1, 2, 3], af)
+
+        asyncio.run(run())
+        # ['v:1', 'v:2', 'v:3']
+        ```
     """
 
     # Number of items to process per call to map_func.
@@ -747,6 +771,19 @@
 
         Returns:
             list[T]: Mapped values corresponding to ``items`` in the same order.
+
+        Example:
+            ```python
+            import asyncio
+
+            async def mapper(chunk: list[int]) -> list[str]:
+                await asyncio.sleep(0)
+                return [f"v:{x}" for x in chunk]
+
+            proxy: AsyncBatchingMapProxy[int, str] = AsyncBatchingMapProxy(batch_size=2)
+            asyncio.run(proxy.map([1, 1, 2], mapper))
+            # ['v:1', 'v:1', 'v:2']
+            ```
         """
         if await self.__all_cached(items):
            return await self.__values(items)
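The new docstring examples above each exercise a single `map` call. As a further illustration, here is a hedged sketch of two overlapping callers sharing one proxy — assuming, per the caching notes elsewhere in this diff, that the shared cache computes each unique item only once even across concurrent calls (the private import path is for illustration only):

```python
# Sketch: two concurrent callers share one AsyncBatchingMapProxy.
import asyncio

from openaivec._cache.proxy import AsyncBatchingMapProxy  # private module, for illustration


async def mapper(chunk: list[int]) -> list[str]:
    await asyncio.sleep(0)  # stand-in for a real batched API call
    return [f"v:{x}" for x in chunk]


async def main() -> None:
    proxy: AsyncBatchingMapProxy[int, str] = AsyncBatchingMapProxy(batch_size=2)
    a, b = await asyncio.gather(
        proxy.map([1, 2, 3], mapper),
        proxy.map([2, 3, 4], mapper),  # overlaps with the first call on 2 and 3
    )
    print(a)  # ['v:1', 'v:2', 'v:3']
    print(b)  # ['v:2', 'v:3', 'v:4']


asyncio.run(main())
```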
--- openaivec-1.0.1/src/openaivec/spark.py
+++ openaivec-1.0.3/src/openaivec/spark.py
@@ -181,6 +181,20 @@ def setup(
             If provided, registers `ResponsesModelName` in the DI container.
         embeddings_model_name (str | None): Default model name for embeddings.
             If provided, registers `EmbeddingsModelName` in the DI container.
+
+    Example:
+        ```python
+        from pyspark.sql import SparkSession
+        from openaivec.spark import setup
+
+        spark = SparkSession.builder.getOrCreate()
+        setup(
+            spark,
+            api_key="sk-***",
+            responses_model_name="gpt-4.1-mini",
+            embeddings_model_name="text-embedding-3-small",
+        )
+        ```
     """
 
     CONTAINER.register(SparkSession, lambda: spark)
@@ -221,6 +235,22 @@ def setup_azure(
             If provided, registers `ResponsesModelName` in the DI container.
         embeddings_model_name (str | None): Default model name for embeddings.
             If provided, registers `EmbeddingsModelName` in the DI container.
+
+    Example:
+        ```python
+        from pyspark.sql import SparkSession
+        from openaivec.spark import setup_azure
+
+        spark = SparkSession.builder.getOrCreate()
+        setup_azure(
+            spark,
+            api_key="azure-key",
+            base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
+            api_version="preview",
+            responses_model_name="gpt4-deployment",
+            embeddings_model_name="embedding-deployment",
+        )
+        ```
     """
 
     CONTAINER.register(SparkSession, lambda: spark)
@@ -375,6 +405,19 @@ def responses_udf(
     Raises:
         ValueError: If `response_format` is not `str` or a Pydantic `BaseModel`.
 
+    Example:
+        ```python
+        from pyspark.sql import SparkSession
+        from openaivec.spark import responses_udf, setup
+
+        spark = SparkSession.builder.getOrCreate()
+        setup(spark, api_key="sk-***", responses_model_name="gpt-4.1-mini")
+        udf = responses_udf("Reply with one word.")
+        spark.udf.register("short_answer", udf)
+        df = spark.createDataFrame([("hello",), ("bye",)], ["text"])
+        df.selectExpr("short_answer(text) as reply").show()
+        ```
+
     Note:
         For optimal performance in distributed environments:
         - **Automatic Caching**: Duplicate inputs within each partition are cached,
@@ -533,6 +576,20 @@ def infer_schema(
 
     Returns:
         InferredSchema: An object containing the inferred schema and response format.
+
+    Example:
+        ```python
+        from pyspark.sql import SparkSession
+
+        spark = SparkSession.builder.getOrCreate()
+        spark.createDataFrame([("great product",), ("bad service",)], ["text"]).createOrReplaceTempView("examples")
+        infer_schema(
+            instructions="Classify sentiment as positive or negative.",
+            example_table_name="examples",
+            example_field_name="text",
+            max_examples=2,
+        )
+        ```
     """
 
     spark = CONTAINER.resolve(SparkSession)
@@ -595,6 +652,23 @@ def parse_udf(
             forwarded verbatim to the underlying API calls. These parameters are applied to
             all API requests made by the UDF and override any parameters set in the
             response_format or example data.
+    Example:
+        ```python
+        from pyspark.sql import SparkSession
+
+        spark = SparkSession.builder.getOrCreate()
+        spark.createDataFrame(
+            [("Order #123 delivered",), ("Order #456 delayed",)],
+            ["body"],
+        ).createOrReplaceTempView("messages")
+        udf = parse_udf(
+            instructions="Extract order id as `order_id` and status as `status`.",
+            example_table_name="messages",
+            example_field_name="body",
+        )
+        spark.udf.register("parse_ticket", udf)
+        spark.sql("SELECT parse_ticket(body) AS parsed FROM messages").show()
+        ```
     Returns:
         UserDefinedFunction: A Spark pandas UDF configured to parse responses asynchronously.
         Output schema is `StringType` for str response format or a struct derived from