openaivec 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openaivec/_cache/proxy.py CHANGED
@@ -186,11 +186,15 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
186
186
  performance (targeting 30-60 seconds per batch).
187
187
 
188
188
  Example:
189
- >>> p = BatchingMapProxy[int, str](batch_size=3)
190
- >>> def f(xs: list[int]) -> list[str]:
191
- ... return [f"v:{x}" for x in xs]
192
- >>> p.map([1, 2, 2, 3, 4], f)
193
- ['v:1', 'v:2', 'v:2', 'v:3', 'v:4']
189
+ ```python
190
+ p = BatchingMapProxy[int, str](batch_size=3)
191
+
192
+ def f(xs: list[int]) -> list[str]:
193
+ return [f"v:{x}" for x in xs]
194
+
195
+ p.map([1, 2, 2, 3, 4], f)
196
+ # ['v:1', 'v:2', 'v:2', 'v:3', 'v:4']
197
+ ```
194
198
  """
195
199
 
196
200
  # Number of items to process per call to map_func.
@@ -449,6 +453,21 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
449
453
 
450
454
  Raises:
451
455
  Exception: Propagates any exception raised by ``map_func``.
456
+
457
+ Example:
458
+ ```python
459
+ proxy: BatchingMapProxy[int, str] = BatchingMapProxy(batch_size=2)
460
+ calls: list[list[int]] = []
461
+
462
+ def mapper(chunk: list[int]) -> list[str]:
463
+ calls.append(chunk)
464
+ return [f"v:{x}" for x in chunk]
465
+
466
+ proxy.map([1, 2, 2, 3], mapper)
467
+ # ['v:1', 'v:2', 'v:2', 'v:3']
468
+ calls # duplicate ``2`` is only computed once
469
+ # [[1, 2], [3]]
470
+ ```
452
471
  """
453
472
  if self.__all_cached(items):
454
473
  return self.__values(items)
@@ -490,16 +509,21 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
490
509
  performance (targeting 30-60 seconds per batch).
491
510
 
492
511
  Example:
493
- >>> import asyncio
494
- >>> from typing import List
495
- >>> p = AsyncBatchingMapProxy[int, str](batch_size=2)
496
- >>> async def af(xs: list[int]) -> list[str]:
497
- ... await asyncio.sleep(0)
498
- ... return [f"v:{x}" for x in xs]
499
- >>> async def run():
500
- ... return await p.map([1, 2, 3], af)
501
- >>> asyncio.run(run())
502
- ['v:1', 'v:2', 'v:3']
512
+ ```python
513
+ import asyncio
514
+
515
+ p = AsyncBatchingMapProxy[int, str](batch_size=2)
516
+
517
+ async def af(xs: list[int]) -> list[str]:
518
+ await asyncio.sleep(0)
519
+ return [f"v:{x}" for x in xs]
520
+
521
+ async def run():
522
+ return await p.map([1, 2, 3], af)
523
+
524
+ asyncio.run(run())
525
+ # ['v:1', 'v:2', 'v:3']
526
+ ```
503
527
  """
504
528
 
505
529
  # Number of items to process per call to map_func.
@@ -747,6 +771,19 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
747
771
 
748
772
  Returns:
749
773
  list[T]: Mapped values corresponding to ``items`` in the same order.
774
+
775
+ Example:
776
+ ```python
777
+ import asyncio
778
+
779
+ async def mapper(chunk: list[int]) -> list[str]:
780
+ await asyncio.sleep(0)
781
+ return [f"v:{x}" for x in chunk]
782
+
783
+ proxy: AsyncBatchingMapProxy[int, str] = AsyncBatchingMapProxy(batch_size=2)
784
+ asyncio.run(proxy.map([1, 1, 2], mapper))
785
+ # ['v:1', 'v:1', 'v:2']
786
+ ```
750
787
  """
751
788
  if await self.__all_cached(items):
752
789
  return await self.__values(items)
openaivec/spark.py CHANGED
@@ -181,6 +181,20 @@ def setup(
181
181
  If provided, registers `ResponsesModelName` in the DI container.
182
182
  embeddings_model_name (str | None): Default model name for embeddings.
183
183
  If provided, registers `EmbeddingsModelName` in the DI container.
184
+
185
+ Example:
186
+ ```python
187
+ from pyspark.sql import SparkSession
188
+ from openaivec.spark import setup
189
+
190
+ spark = SparkSession.builder.getOrCreate()
191
+ setup(
192
+ spark,
193
+ api_key="sk-***",
194
+ responses_model_name="gpt-4.1-mini",
195
+ embeddings_model_name="text-embedding-3-small",
196
+ )
197
+ ```
184
198
  """
185
199
 
186
200
  CONTAINER.register(SparkSession, lambda: spark)
@@ -221,6 +235,22 @@ def setup_azure(
221
235
  If provided, registers `ResponsesModelName` in the DI container.
222
236
  embeddings_model_name (str | None): Default model name for embeddings.
223
237
  If provided, registers `EmbeddingsModelName` in the DI container.
238
+
239
+ Example:
240
+ ```python
241
+ from pyspark.sql import SparkSession
242
+ from openaivec.spark import setup_azure
243
+
244
+ spark = SparkSession.builder.getOrCreate()
245
+ setup_azure(
246
+ spark,
247
+ api_key="azure-key",
248
+ base_url="https://YOUR-RESOURCE-NAME.services.ai.azure.com/openai/v1/",
249
+ api_version="preview",
250
+ responses_model_name="gpt4-deployment",
251
+ embeddings_model_name="embedding-deployment",
252
+ )
253
+ ```
224
254
  """
225
255
 
226
256
  CONTAINER.register(SparkSession, lambda: spark)
@@ -375,6 +405,19 @@ def responses_udf(
375
405
  Raises:
376
406
  ValueError: If `response_format` is not `str` or a Pydantic `BaseModel`.
377
407
 
408
+ Example:
409
+ ```python
410
+ from pyspark.sql import SparkSession
411
+ from openaivec.spark import responses_udf, setup
412
+
413
+ spark = SparkSession.builder.getOrCreate()
414
+ setup(spark, api_key="sk-***", responses_model_name="gpt-4.1-mini")
415
+ udf = responses_udf("Reply with one word.")
416
+ spark.udf.register("short_answer", udf)
417
+ df = spark.createDataFrame([("hello",), ("bye",)], ["text"])
418
+ df.selectExpr("short_answer(text) as reply").show()
419
+ ```
420
+
378
421
  Note:
379
422
  For optimal performance in distributed environments:
380
423
  - **Automatic Caching**: Duplicate inputs within each partition are cached,
@@ -533,6 +576,20 @@ def infer_schema(
533
576
 
534
577
  Returns:
535
578
  InferredSchema: An object containing the inferred schema and response format.
579
+
580
+ Example:
581
+ ```python
582
+ from pyspark.sql import SparkSession
583
+
584
+ spark = SparkSession.builder.getOrCreate()
585
+ spark.createDataFrame([("great product",), ("bad service",)], ["text"]).createOrReplaceTempView("examples")
586
+ infer_schema(
587
+ instructions="Classify sentiment as positive or negative.",
588
+ example_table_name="examples",
589
+ example_field_name="text",
590
+ max_examples=2,
591
+ )
592
+ ```
536
593
  """
537
594
 
538
595
  spark = CONTAINER.resolve(SparkSession)
@@ -595,6 +652,23 @@ def parse_udf(
595
652
  forwarded verbatim to the underlying API calls. These parameters are applied to
596
653
  all API requests made by the UDF and override any parameters set in the
597
654
  response_format or example data.
655
+ Example:
656
+ ```python
657
+ from pyspark.sql import SparkSession
658
+
659
+ spark = SparkSession.builder.getOrCreate()
660
+ spark.createDataFrame(
661
+ [("Order #123 delivered",), ("Order #456 delayed",)],
662
+ ["body"],
663
+ ).createOrReplaceTempView("messages")
664
+ udf = parse_udf(
665
+ instructions="Extract order id as `order_id` and status as `status`.",
666
+ example_table_name="messages",
667
+ example_field_name="body",
668
+ )
669
+ spark.udf.register("parse_ticket", udf)
670
+ spark.sql("SELECT parse_ticket(body) AS parsed FROM messages").show()
671
+ ```
598
672
  Returns:
599
673
  UserDefinedFunction: A Spark pandas UDF configured to parse responses asynchronously.
600
674
  Output schema is `StringType` for str response format or a struct derived from
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: openaivec
3
- Version: 1.0.1
3
+ Version: 1.0.3
4
4
  Summary: Generative mutation for tabular calculation
5
5
  Project-URL: Homepage, https://microsoft.github.io/openaivec/
6
6
  Project-URL: Repository, https://github.com/microsoft/openaivec
@@ -57,13 +57,27 @@ reviews = pd.Series([
57
57
 
58
58
  sentiment = reviews.ai.responses(
59
59
  "Summarize sentiment in one short sentence.",
60
- reasoning={"effort": "medium"}, # Mirrors OpenAI SDK for reasoning models
60
+ reasoning={"effort": "none"}, # Mirrors OpenAI SDK for reasoning models
61
61
  )
62
62
  print(sentiment.tolist())
63
63
  ```
64
64
 
65
65
  **Try it live:** https://microsoft.github.io/openaivec/examples/pandas/
66
66
 
67
+ ## Benchmarks
68
+
69
+ Simple task benchmark from [benchmark.ipynb](https://github.com/microsoft/openaivec/blob/main/docs/examples/benchmark.ipynb) (100 numeric strings → integer literals, `Series.aio.responses`, model `gpt-5.1`):
70
+
71
+ | Mode | Settings | Time (s) |
72
+ | ------------------- | ----------------------------------------------- | -------- |
73
+ | Serial | `batch_size=1`, `max_concurrency=1` | ~141 |
74
+ | Batching | default `batch_size`, `max_concurrency=1` | ~15 |
75
+ | Concurrent batching | default `batch_size`, default `max_concurrency` | ~6 |
76
+
77
+ Batching alone removes most HTTP overhead, and letting batching overlap with concurrency cuts total runtime to a few seconds while still yielding one output per input.
78
+
79
+ For the full benchmark comparison for the simple task, see [benchmark.ipynb](https://github.com/microsoft/openaivec/blob/main/docs/examples/benchmark.ipynb).
80
+
67
81
  ## Contents
68
82
 
69
83
  - [Why openaivec?](#why-openaivec)
@@ -109,7 +123,7 @@ client = BatchResponses.of(
109
123
 
110
124
  result = client.parse(
111
125
  ["panda", "rabbit", "koala"],
112
- reasoning={"effort": "medium"}, # Required for gpt-5.1
126
+ reasoning={"effort": "none"},
113
127
  )
114
128
  print(result) # Expected output: ['bear family', 'rabbit family', 'koala family']
115
129
  ```
@@ -147,15 +161,15 @@ df = pd.DataFrame({"name": ["panda", "rabbit", "koala"]})
147
161
  result = df.assign(
148
162
  family=lambda df: df.name.ai.responses(
149
163
  "What animal family? Answer with 'X family'",
150
- reasoning={"effort": "medium"},
164
+ reasoning={"effort": "none"},
151
165
  ),
152
166
  habitat=lambda df: df.name.ai.responses(
153
167
  "Primary habitat in one word",
154
- reasoning={"effort": "medium"},
168
+ reasoning={"effort": "none"},
155
169
  ),
156
170
  fun_fact=lambda df: df.name.ai.responses(
157
171
  "One interesting fact in 10 words or less",
158
- reasoning={"effort": "medium"},
172
+ reasoning={"effort": "none"},
159
173
  ),
160
174
  )
161
175
  ```
@@ -178,7 +192,7 @@ pandas_ext.set_responses_model("o1-mini") # Set your reasoning model
178
192
  result = df.assign(
179
193
  analysis=lambda df: df.text.ai.responses(
180
194
  "Analyze this text step by step",
181
- reasoning={"effort": "medium"} # Optional: mirrors the OpenAI SDK argument
195
+ reasoning={"effort": "none"} # Optional: mirrors the OpenAI SDK argument
182
196
  )
183
197
  )
184
198
  ```
@@ -232,7 +246,7 @@ df = pd.DataFrame({"text": [
232
246
  async def process_data():
233
247
  return await df["text"].aio.responses(
234
248
  "Analyze sentiment and classify as positive/negative/neutral",
235
- reasoning={"effort": "medium"}, # Required for gpt-5.1
249
+ reasoning={"effort": "none"}, # Required for gpt-5.1
236
250
  max_concurrency=12 # Allow up to 12 concurrent requests
237
251
  )
238
252
 
@@ -284,7 +298,7 @@ spark.udf.register(
284
298
  "extract_brand",
285
299
  responses_udf(
286
300
  instructions="Extract the brand name from the product. Return only the brand name.",
287
- reasoning={"effort": "medium"}, # Recommended with gpt-5.1
301
+ reasoning={"effort": "none"}, # Recommended with gpt-5.1
288
302
  )
289
303
  )
290
304
 
@@ -298,7 +312,7 @@ spark.udf.register(
298
312
  responses_udf(
299
313
  instructions="Translate the text to English, French, and Japanese.",
300
314
  response_format=Translation,
301
- reasoning={"effort": "medium"}, # Recommended with gpt-5.1
315
+ reasoning={"effort": "none"}, # Recommended with gpt-5.1
302
316
  )
303
317
  )
304
318
 
@@ -336,7 +350,7 @@ prompt = (
336
350
 
337
351
  ## Using with Microsoft Fabric
338
352
 
339
- [Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark. Detailed walkthrough: 📓 **[Fabric guide →](https://microsoft.github.io/openaivec/examples/fabric/)**.
353
+ [Microsoft Fabric](https://www.microsoft.com/en-us/microsoft-fabric/) is a unified, cloud-based analytics platform. Add `openaivec` from PyPI in your Fabric environment, select it in your notebook, and use `openaivec.spark` like standard Spark.
340
354
 
341
355
  ## Contributing
342
356
 
@@ -374,4 +388,4 @@ uv run pytest -m "not slow and not requires_api"
374
388
 
375
389
  ## Community
376
390
 
377
- Join our Discord community for support and announcements: https://discord.gg/vbb83Pgn
391
+ Join our Discord community for support and announcements: https://discord.gg/hXCS9J6Qek
@@ -9,10 +9,10 @@ openaivec/_responses.py,sha256=Lb37ajlFQoVVac_p9oVf3scUDS3AI1ro4tRlk_UBqVg,20412
9
9
  openaivec/_serialize.py,sha256=u2Om94Sc_QgJkTlW2BAGw8wd6gYDhc6IRqvS-qevFSs,8399
10
10
  openaivec/_util.py,sha256=XfueAycVCQvgRLS7wF7e306b53lebORvZOBzbQjy4vE,6438
11
11
  openaivec/pandas_ext.py,sha256=XEmB08FS6lFtk6V7zzM4XHnzPkLCZ08OFFlkX-f0Oko,86730
12
- openaivec/spark.py,sha256=PIZxy3pVSrUv9PB2KBXQNM8beEvn_abaCYGS1DZmanY,32764
12
+ openaivec/spark.py,sha256=OsEuwRRBzNs8Zv-hwDDsXR7A-dqZT651sdRmGkziTOo,35236
13
13
  openaivec/_cache/__init__.py,sha256=IYUH5GKsJXuCX-k3XtT259rEz49EZm9KW2TIOTGW4uQ,314
14
14
  openaivec/_cache/optimize.py,sha256=3nS8VehbS7iGC1tPDDQh-iAgyKHbVYmMbCRBWM77U_U,3827
15
- openaivec/_cache/proxy.py,sha256=mBUaYNFLrix6ZDblSHXmKlrd4qraaoVpbHGJ-_RlK-s,29666
15
+ openaivec/_cache/proxy.py,sha256=aVjH_hmJIIso6SetV_-Ct3VaOSG-n9Dpil7TttnbYkE,30556
16
16
  openaivec/_schema/__init__.py,sha256=XUj3Jv6ZVDjyYzSmH6Q5lmDj-hBMfUg_eBNeZACXR6Q,368
17
17
  openaivec/_schema/infer.py,sha256=gcrpw0OVJMWdmUlzimP-C14cuCAAOnHQd8-bUNR220o,15705
18
18
  openaivec/_schema/spec.py,sha256=7ZaC59w2Edemnao57XeZVO4qmSOA-Kus6TchZC3Dd5o,14821
@@ -33,7 +33,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=1igoAhns-VgsDE8XI47Dw-zeOcR5wEY9
33
33
  openaivec/task/nlp/translation.py,sha256=TtV7F6bmKPqLi3_Ok7GoOqT_GKJiemotVq-YEbKd6IA,6617
34
34
  openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
35
35
  openaivec/task/table/fillna.py,sha256=4j27fWT5IzOhQqCPwLhomjBOAWPBslyIBbBMspjqtbw,6877
36
- openaivec-1.0.1.dist-info/METADATA,sha256=xLWrSZd9aX_mgAElLZgHft-jUbvrZfumvx_uJbr8C1Y,12991
37
- openaivec-1.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
- openaivec-1.0.1.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
39
- openaivec-1.0.1.dist-info/RECORD,,
36
+ openaivec-1.0.3.dist-info/METADATA,sha256=p07OFXtO0GMy4uiWIs-XooTcpJr-SnFIvd9-4x2W9-8,14484
37
+ openaivec-1.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
+ openaivec-1.0.3.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
39
+ openaivec-1.0.3.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any