openaivec-0.13.3-py3-none-any.whl → openaivec-0.13.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openaivec/optimize.py ADDED
@@ -0,0 +1,108 @@
+import threading
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import List
+
+
+@dataclass(frozen=True)
+class PerformanceMetric:
+    duration: float
+    batch_size: int
+    executed_at: datetime
+    exception: BaseException | None = None
+
+
+@dataclass
+class BatchSizeSuggester:
+    current_batch_size: int = 10
+    min_batch_size: int = 10
+    min_duration: float = 30.0
+    max_duration: float = 60.0
+    step_ratio: float = 0.1
+    sample_size: int = 10
+    _history: List[PerformanceMetric] = field(default_factory=list)
+    _lock: threading.RLock = field(default_factory=threading.RLock, repr=False)
+    _batch_size_changed_at: datetime | None = field(default=None, init=False)
+
+    def __post_init__(self) -> None:
+        if self.min_batch_size <= 0:
+            raise ValueError("min_batch_size must be > 0")
+        if self.current_batch_size < self.min_batch_size:
+            raise ValueError("current_batch_size must be >= min_batch_size")
+        if self.sample_size <= 0:
+            raise ValueError("sample_size must be > 0")
+        if self.step_ratio <= 0:
+            raise ValueError("step_ratio must be > 0")
+        if self.min_duration <= 0 or self.max_duration <= 0:
+            raise ValueError("min_duration and max_duration must be > 0")
+        if self.min_duration >= self.max_duration:
+            raise ValueError("min_duration must be < max_duration")
+
+    @contextmanager
+    def record(self, batch_size: int):
+        start_time = time.perf_counter()
+        executed_at = datetime.now(timezone.utc)
+        caught_exception: BaseException | None = None
+        try:
+            yield
+        except BaseException as e:
+            caught_exception = e
+            raise
+        finally:
+            duration = time.perf_counter() - start_time
+            with self._lock:
+                self._history.append(
+                    PerformanceMetric(
+                        duration=duration,
+                        batch_size=batch_size,
+                        executed_at=executed_at,
+                        exception=caught_exception,
+                    )
+                )
+
+    @property
+    def samples(self) -> List[PerformanceMetric]:
+        with self._lock:
+            selected: List[PerformanceMetric] = []
+            for metric in reversed(self._history):
+                if metric.exception is not None:
+                    continue
+                if self._batch_size_changed_at and metric.executed_at < self._batch_size_changed_at:
+                    continue
+                selected.append(metric)
+                if len(selected) >= self.sample_size:
+                    break
+            return list(reversed(selected))
+
+    def clear_history(self):
+        with self._lock:
+            self._history.clear()
+
+    def suggest_batch_size(self) -> int:
+        selected = self.samples
+
+        if len(selected) < self.sample_size:
+            with self._lock:
+                return self.current_batch_size
+
+        average_duration = sum(m.duration for m in selected) / len(selected)
+
+        with self._lock:
+            current_size = self.current_batch_size
+
+            if average_duration < self.min_duration:
+                new_batch_size = int(current_size * (1 + self.step_ratio))
+            elif average_duration > self.max_duration:
+                new_batch_size = int(current_size * (1 - self.step_ratio))
+            else:
+                new_batch_size = current_size
+
+            new_batch_size = max(new_batch_size, self.min_batch_size)
+
+            if new_batch_size != self.current_batch_size:
+                self._batch_size_changed_at = datetime.now(timezone.utc)
+                self.current_batch_size = new_batch_size
+
+            return self.current_batch_size
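The new module adapts batch size from observed latency: record() stores the wall-clock duration of each call as a PerformanceMetric, samples keeps the most recent sample_size successful metrics recorded since the last size change, and suggest_batch_size() scales the current size by (1 ± step_ratio) whenever the average duration leaves the [min_duration, max_duration] window, never dropping below min_batch_size. A minimal usage sketch, using only the API shown above; the sleep stands in for a real batched API call, and the thresholds are shrunk so the example runs in milliseconds:

    import time

    from openaivec.optimize import BatchSizeSuggester

    # Shipped defaults are min_duration=30.0, max_duration=60.0, sample_size=10.
    suggester = BatchSizeSuggester(min_duration=0.01, max_duration=0.02, sample_size=3)

    for _ in range(3):
        with suggester.record(suggester.current_batch_size):
            time.sleep(0.005)  # consistently faster than min_duration

    # Three successful samples averaging below min_duration trigger a
    # multiplicative increase: int(10 * (1 + 0.1)) == 11.
    print(suggester.suggest_batch_size())  # -> 11

Failed calls are excluded from samples, so a burst of exceptions stalls adaptation rather than skewing it.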
openaivec/proxy.py CHANGED
@@ -4,6 +4,8 @@ from collections.abc import Hashable
 from dataclasses import dataclass, field
 from typing import Awaitable, Callable, Dict, Generic, List, Optional, TypeVar
 
+from openaivec.optimize import BatchSizeSuggester
+
 S = TypeVar("S", bound=Hashable)
 T = TypeVar("T")
 
@@ -22,6 +24,7 @@ class ProxyBase(Generic[S, T]):
 
     batch_size: Optional[int] = None  # subclasses may override via dataclass
    show_progress: bool = False  # Enable progress bar display
+    suggester: BatchSizeSuggester = None  # Batch size optimization, initialized by subclasses
 
     def _is_notebook_environment(self) -> bool:
         """Check if running in a Jupyter notebook environment.
@@ -125,7 +128,7 @@ class ProxyBase(Generic[S, T]):
             progress_bar.close()
 
     @staticmethod
-    def __unique_in_order(seq: List[S]) -> List[S]:
+    def _unique_in_order(seq: List[S]) -> List[S]:
         """Return unique items preserving their first-occurrence order.
 
         Args:
@@ -143,11 +146,11 @@ class ProxyBase(Generic[S, T]):
                 out.append(x)
         return out
 
-    def __normalized_batch_size(self, total: int) -> int:
+    def _normalized_batch_size(self, total: int) -> int:
         """Compute the effective batch size used for processing.
 
-        If ``batch_size`` is not set or non-positive, the entire ``total`` is
-        processed in a single call.
+        If ``batch_size`` is None, use the suggester to determine optimal batch size.
+        If ``batch_size`` is non-positive, process the entire ``total`` in a single call.
 
         Args:
             total (int): Number of items intended to be processed.
@@ -155,7 +158,15 @@
 
         Returns:
             int: The positive batch size to use.
         """
-        return self.batch_size if (self.batch_size and self.batch_size > 0) else total
+        if self.batch_size and self.batch_size > 0:
+            return self.batch_size
+        elif self.batch_size is None:
+            # Use suggester to determine optimal batch size
+            suggested = self.suggester.suggest_batch_size()
+            return min(suggested, total)  # Don't exceed total items
+        else:
+            # batch_size is 0 or negative, process all at once
+            return total
 
 
 @dataclass
@@ -180,19 +191,13 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
     # Number of items to process per call to map_func. If None or <= 0, process all at once.
     batch_size: Optional[int] = None
     show_progress: bool = False
+    suggester: BatchSizeSuggester = field(default_factory=BatchSizeSuggester, repr=False)
+
+    # internals
     __cache: Dict[S, T] = field(default_factory=dict)
-    # Thread-safety primitives (not part of public API)
     __lock: threading.RLock = field(default_factory=threading.RLock, repr=False)
     __inflight: Dict[S, threading.Event] = field(default_factory=dict, repr=False)
 
-    # ---- private helpers -------------------------------------------------
-    # expose base helpers under subclass private names for compatibility
-    __unique_in_order = staticmethod(ProxyBase._ProxyBase__unique_in_order)
-    __normalized_batch_size = ProxyBase._ProxyBase__normalized_batch_size
-    _create_progress_bar = ProxyBase._create_progress_bar
-    _update_progress_bar = ProxyBase._update_progress_bar
-    _close_progress_bar = ProxyBase._close_progress_bar
-
     def __all_cached(self, items: List[S]) -> bool:
         """Check whether all items are present in the cache.
 
@@ -320,16 +325,17 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         """
         if not owned:
             return
-        batch_size = self.__normalized_batch_size(len(owned))
+        # Setup progress bar
+        progress_bar = self._create_progress_bar(len(owned))
 
         # Accumulate uncached items to maximize batch size utilization
         pending_to_call: List[S] = []
 
-        # Setup progress bar
-        progress_bar = self._create_progress_bar(len(owned))
-
-        for i in range(0, len(owned), batch_size):
-            batch = owned[i : i + batch_size]
+        i = 0
+        while i < len(owned):
+            # Get dynamic batch size for each iteration
+            current_batch_size = self._normalized_batch_size(len(owned))
+            batch = owned[i : i + current_batch_size]
             # Double-check cache right before processing
             with self.__lock:
                 uncached_in_batch = [x for x in batch if x not in self.__cache]
@@ -337,14 +343,16 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
             pending_to_call.extend(uncached_in_batch)
 
             # Process accumulated items when we reach batch_size or at the end
-            is_last_batch = i + batch_size >= len(owned)
-            if len(pending_to_call) >= batch_size or (is_last_batch and pending_to_call):
+            is_last_batch = i + current_batch_size >= len(owned)
+            if len(pending_to_call) >= current_batch_size or (is_last_batch and pending_to_call):
                 # Take up to batch_size items to process
-                to_call = pending_to_call[:batch_size]
-                pending_to_call = pending_to_call[batch_size:]
+                to_call = pending_to_call[:current_batch_size]
+                pending_to_call = pending_to_call[current_batch_size:]
 
                 try:
-                    results = map_func(to_call)
+                    # Always measure execution time using suggester
+                    with self.suggester.record(len(to_call)):
+                        results = map_func(to_call)
                 except Exception:
                     self.__finalize_failure(to_call)
                     raise
@@ -353,13 +361,19 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
                 # Update progress bar
                 self._update_progress_bar(progress_bar, len(to_call))
 
+            # Move to next batch
+            i += current_batch_size
+
         # Process any remaining items
         while pending_to_call:
-            to_call = pending_to_call[:batch_size]
-            pending_to_call = pending_to_call[batch_size:]
+            # Get dynamic batch size for remaining items
+            remaining_batch_size = self._normalized_batch_size(len(pending_to_call))
+            to_call = pending_to_call[:remaining_batch_size]
+            pending_to_call = pending_to_call[remaining_batch_size:]
 
             try:
-                results = map_func(to_call)
+                with self.suggester.record(len(to_call)):
+                    results = map_func(to_call)
             except Exception:
                 self.__finalize_failure(to_call)
                 raise
@@ -430,7 +444,7 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         if self.__all_cached(items):
             return self.__values(items)
 
-        unique_items = self.__unique_in_order(items)
+        unique_items = self._unique_in_order(items)
        owned, wait_for = self.__acquire_ownership(unique_items)
 
         self.__process_owned(owned, map_func)
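The synchronous path keeps its accumulate-and-flush structure but now re-reads the batch size on every loop iteration, so a changed suggestion takes effect mid-run. A stripped-down sketch of that control flow, with local stand-in names and the caching, locking, and progress reporting omitted:

    def process(owned, map_func, normalized_batch_size):
        # Mirrors the while-loop above: re-read the size each iteration and
        # flush accumulated items once a full batch is available or at the end.
        pending = []
        i = 0
        while i < len(owned):
            size = normalized_batch_size(len(owned))
            pending.extend(owned[i : i + size])
            if len(pending) >= size or (i + size >= len(owned) and pending):
                to_call, pending = pending[:size], pending[size:]
                map_func(to_call)
            i += size
        while pending:  # drain any remainder with a freshly suggested size
            size = normalized_batch_size(len(pending))
            to_call, pending = pending[:size], pending[size:]
            map_func(to_call)

    # 25 items with a fixed suggestion of 10 yields calls of 10, 10, and 5.
    process(list(range(25)), lambda xs: print(len(xs)), lambda total: min(10, total))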
@@ -465,6 +479,7 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
     batch_size: Optional[int] = None
     max_concurrency: int = 8
     show_progress: bool = False
+    suggester: BatchSizeSuggester = field(default_factory=BatchSizeSuggester, repr=False)
 
     # internals
     __cache: Dict[S, T] = field(default_factory=dict, repr=False)
@@ -490,14 +505,6 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         else:
             self.__sema = None
 
-    # ---- private helpers -------------------------------------------------
-    # expose base helpers under subclass private names for compatibility
-    __unique_in_order = staticmethod(ProxyBase._ProxyBase__unique_in_order)
-    __normalized_batch_size = ProxyBase._ProxyBase__normalized_batch_size
-    _create_progress_bar = ProxyBase._create_progress_bar
-    _update_progress_bar = ProxyBase._update_progress_bar
-    _close_progress_bar = ProxyBase._close_progress_bar
-
     async def __all_cached(self, items: List[S]) -> bool:
         """Check whether all items are present in the cache.
 
@@ -602,69 +609,43 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         await self.clear()
 
     async def __process_owned(self, owned: List[S], map_func: Callable[[List[S]], Awaitable[List[T]]]) -> None:
-        """Process owned keys in mini-batches, re-checking cache before awaits.
-
-        Before calling ``map_func`` for each batch, the cache is re-checked to
-        skip any keys that may have been filled in the meantime. Items
-        are accumulated across multiple original batches to maximize batch
-        size utilization when some items are cached. On exceptions raised
-        by ``map_func``, all corresponding in-flight events are released
-        to prevent deadlocks, and the exception is propagated.
+        """Process owned keys using Producer-Consumer pattern with dynamic batch sizing.
 
         Args:
-            owned (list[S]): Items for which this coroutine holds computation
-                ownership.
+            owned (list[S]): Items for which this coroutine holds computation ownership.
 
         Raises:
             Exception: Propagates any exception raised by ``map_func``.
         """
         if not owned:
             return
-        batch_size = self.__normalized_batch_size(len(owned))
 
-        # Accumulate uncached items to maximize batch size utilization
-        pending_to_call: List[S] = []
-
-        # Setup progress bar
         progress_bar = self._create_progress_bar(len(owned))
+        batch_queue: asyncio.Queue = asyncio.Queue(maxsize=self.max_concurrency)
+
+        async def producer():
+            index = 0
+            while index < len(owned):
+                batch_size = self._normalized_batch_size(len(owned) - index)
+                batch = owned[index : index + batch_size]
+                await batch_queue.put(batch)
+                index += batch_size
+            # Send completion signals
+            for _ in range(self.max_concurrency):
+                await batch_queue.put(None)
+
+        async def consumer():
+            while True:
+                batch = await batch_queue.get()
+                try:
+                    if batch is None:
+                        break
+                    await self.__process_single_batch(batch, map_func, progress_bar)
+                finally:
+                    batch_queue.task_done()
 
-        # Collect all batches to process
-        batches_to_process: List[List[S]] = []
-
-        for i in range(0, len(owned), batch_size):
-            batch = owned[i : i + batch_size]
-            async with self.__lock:
-                uncached_in_batch = [x for x in batch if x not in self.__cache]
-
-            pending_to_call.extend(uncached_in_batch)
-
-            # Process accumulated items when we reach batch_size or at the end
-            is_last_batch = i + batch_size >= len(owned)
-            if len(pending_to_call) >= batch_size or (is_last_batch and pending_to_call):
-                # Take up to batch_size items to process
-                to_call = pending_to_call[:batch_size]
-                pending_to_call = pending_to_call[batch_size:]
-                if to_call:  # Only add non-empty batches
-                    batches_to_process.append(to_call)
-
-        # Process any remaining items
-        while pending_to_call:
-            to_call = pending_to_call[:batch_size]
-            pending_to_call = pending_to_call[batch_size:]
-            if to_call:  # Only add non-empty batches
-                batches_to_process.append(to_call)
-
-        # Process all batches concurrently
-        if batches_to_process:
-            tasks = []
-            for batch in batches_to_process:
-                task = self.__process_single_batch(batch, map_func, progress_bar)
-                tasks.append(task)
-
-            # Wait for all batches to complete
-            await asyncio.gather(*tasks)
+        await asyncio.gather(producer(), *[consumer() for _ in range(self.max_concurrency)])
 
-        # Close progress bar
         self._close_progress_bar(progress_bar)
 
     async def __process_single_batch(
@@ -676,7 +657,9 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
             if self.__sema:
                 await self.__sema.acquire()
                 acquired = True
-            results = await map_func(to_call)
+            # Measure async map_func execution using suggester
+            with self.suggester.record(len(to_call)):
+                results = await map_func(to_call)
         except Exception:
             await self.__finalize_failure(to_call)
             raise
@@ -737,7 +720,7 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         if await self.__all_cached(items):
             return await self.__values(items)
 
-        unique_items = self.__unique_in_order(items)
+        unique_items = self._unique_in_order(items)
         owned, wait_for = await self.__acquire_ownership(unique_items)
 
         await self.__process_owned(owned, map_func)
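On the async side, the collect-then-gather strategy is replaced by a bounded producer/consumer queue: one producer slices owned into batches sized by the suggestion at enqueue time, max_concurrency consumers drain the queue through __process_single_batch, and one None sentinel per consumer shuts the pool down. Because the queue is bounded at max_concurrency, the producer blocks until earlier batches are consumed, so later slices are sized with fresher timing data. A standalone sketch of that shape, independent of openaivec (all names below are local to the example):

    import asyncio

    async def process_all(items, handle_batch, max_concurrency=3, batch_size=10):
        queue: asyncio.Queue = asyncio.Queue(maxsize=max_concurrency)

        async def producer():
            index = 0
            while index < len(items):
                await queue.put(items[index : index + batch_size])
                index += batch_size
            for _ in range(max_concurrency):  # one sentinel per consumer
                await queue.put(None)

        async def consumer():
            while True:
                batch = await queue.get()
                try:
                    if batch is None:
                        break
                    await handle_batch(batch)
                finally:
                    queue.task_done()

        await asyncio.gather(producer(), *[consumer() for _ in range(max_concurrency)])

    async def handle(batch):
        await asyncio.sleep(0.01)  # stands in for an awaited map_func call
        print(f"processed {len(batch)} items")

    asyncio.run(process_all(list(range(25)), handle))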
openaivec-0.13.3.dist-info/METADATA → openaivec-0.13.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openaivec
-Version: 0.13.3
+Version: 0.13.4
 Summary: Generative mutation for tabular calculation
 Project-URL: Homepage, https://microsoft.github.io/openaivec/
 Project-URL: Repository, https://github.com/microsoft/openaivec
openaivec-0.13.3.dist-info/RECORD → openaivec-0.13.4.dist-info/RECORD CHANGED
@@ -3,10 +3,11 @@ openaivec/di.py,sha256=eNewaSRx7f_O0IQcyjzGpIMak6O-bc6OeMqjytCfr88,10634
 openaivec/embeddings.py,sha256=ypED2-MkC6t4Fvuopeldab8kNoR3-hN8BLVzRPDaWhk,7210
 openaivec/log.py,sha256=GofgzUpv_xDVuGC-gYmit5Oyu06it1SBXRck6COR5go,1439
 openaivec/model.py,sha256=wu1UGetqLbUGvGqmOiQna4SJnO5VvyMoCHdAQhSG6MY,3295
+openaivec/optimize.py,sha256=-9h03O_bDjtHYHg5L9M76gmaEddkE87HmN964XCO4bU,3838
 openaivec/pandas_ext.py,sha256=H4DswYgfTf0NrBl-L1LgR78ZIfgHONs3YIEWptahh8U,56361
 openaivec/prompt.py,sha256=3-fcmFW-yroKL_Yt-wE0u1FwZ22ja8ul3o6Llhefzzo,18544
 openaivec/provider.py,sha256=kkC9eYgXRUwb88EvN4dhEc00FDKT3l5D_ZDsW0Ty7SM,6218
-openaivec/proxy.py,sha256=yBEylh3hpb_Q5H-AiaWNMgconQbhPkT6F6ChTChit0k,28953
+openaivec/proxy.py,sha256=Y-ig4hEf_r0sBvb0fuKqvUkT2hFRV3Zj_6mSu9yphrs,28113
 openaivec/responses.py,sha256=_465ufTtA58DwI5KQg9YIkEfLT0XYHvyWSZfjU8hNSs,20457
 openaivec/serialize.py,sha256=HXi5l_4b_4eUMNL7f7o_suyHHF_hz3RYsUsri5CQ7_4,7325
 openaivec/spark.py,sha256=ZCoR5_7FKX_fC_rhV6X9N4lQ927nLScEKVqhFYGkiWU,23169
@@ -28,7 +29,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=9HdMpi7FkjHNXAswaN98k8jeKsatBBXT
 openaivec/task/nlp/translation.py,sha256=4fjKtbVvOvivWMrpZfreIsdg8d0DplDujO8kAdLbAKI,6625
 openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
 openaivec/task/table/fillna.py,sha256=vi8t5QEIU-W3e05wwpATb3MEUDyf8luVnE8U-5VebZo,6582
-openaivec-0.13.3.dist-info/METADATA,sha256=oaM12LUwa_zsrYnlEbOhlEXRP_Xexh7Wkw7cYVuFQQQ,27329
-openaivec-0.13.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-openaivec-0.13.3.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
-openaivec-0.13.3.dist-info/RECORD,,
+openaivec-0.13.4.dist-info/METADATA,sha256=86Y8Pmh1y3cTTxKx9OWNRl5xwY3AAV8AHlAsZ80FHv4,27329
+openaivec-0.13.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+openaivec-0.13.4.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
+openaivec-0.13.4.dist-info/RECORD,,
+ openaivec-0.13.4.dist-info/RECORD,,