openaivec-0.13.3-py3-none-any.whl → openaivec-0.13.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
openaivec/optimize.py ADDED
@@ -0,0 +1,108 @@
+import threading
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import List
+
+
+@dataclass(frozen=True)
+class PerformanceMetric:
+    duration: float
+    batch_size: int
+    executed_at: datetime
+    exception: BaseException | None = None
+
+
+@dataclass
+class BatchSizeSuggester:
+    current_batch_size: int = 10
+    min_batch_size: int = 10
+    min_duration: float = 30.0
+    max_duration: float = 60.0
+    step_ratio: float = 0.1
+    sample_size: int = 10
+    _history: List[PerformanceMetric] = field(default_factory=list)
+    _lock: threading.RLock = field(default_factory=threading.RLock, repr=False)
+    _batch_size_changed_at: datetime | None = field(default=None, init=False)
+
+    def __post_init__(self) -> None:
+        if self.min_batch_size <= 0:
+            raise ValueError("min_batch_size must be > 0")
+        if self.current_batch_size < self.min_batch_size:
+            raise ValueError("current_batch_size must be >= min_batch_size")
+        if self.sample_size <= 0:
+            raise ValueError("sample_size must be > 0")
+        if self.step_ratio <= 0:
+            raise ValueError("step_ratio must be > 0")
+        if self.min_duration <= 0 or self.max_duration <= 0:
+            raise ValueError("min_duration and max_duration must be > 0")
+        if self.min_duration >= self.max_duration:
+            raise ValueError("min_duration must be < max_duration")
+
+    @contextmanager
+    def record(self, batch_size: int):
+        start_time = time.perf_counter()
+        executed_at = datetime.now(timezone.utc)
+        caught_exception: BaseException | None = None
+        try:
+            yield
+        except BaseException as e:
+            caught_exception = e
+            raise
+        finally:
+            duration = time.perf_counter() - start_time
+            with self._lock:
+                self._history.append(
+                    PerformanceMetric(
+                        duration=duration,
+                        batch_size=batch_size,
+                        executed_at=executed_at,
+                        exception=caught_exception,
+                    )
+                )
+
+    @property
+    def samples(self) -> List[PerformanceMetric]:
+        with self._lock:
+            selected: List[PerformanceMetric] = []
+            for metric in reversed(self._history):
+                if metric.exception is not None:
+                    continue
+                if self._batch_size_changed_at and metric.executed_at < self._batch_size_changed_at:
+                    continue
+                selected.append(metric)
+                if len(selected) >= self.sample_size:
+                    break
+            return list(reversed(selected))
+
+    def clear_history(self):
+        with self._lock:
+            self._history.clear()
+
+    def suggest_batch_size(self) -> int:
+        selected = self.samples
+
+        if len(selected) < self.sample_size:
+            with self._lock:
+                return self.current_batch_size
+
+        average_duration = sum(m.duration for m in selected) / len(selected)
+
+        with self._lock:
+            current_size = self.current_batch_size
+
+            if average_duration < self.min_duration:
+                new_batch_size = int(current_size * (1 + self.step_ratio))
+            elif average_duration > self.max_duration:
+                new_batch_size = int(current_size * (1 - self.step_ratio))
+            else:
+                new_batch_size = current_size
+
+            new_batch_size = max(new_batch_size, self.min_batch_size)
+
+            if new_batch_size != self.current_batch_size:
+                self._batch_size_changed_at = datetime.now(timezone.utc)
+                self.current_batch_size = new_batch_size
+
+            return self.current_batch_size
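The new module adapts batch size from observed latency: record() stores the wall-clock duration of each call as a PerformanceMetric, samples keeps the most recent sample_size successful metrics recorded since the last size change, and suggest_batch_size() scales the current size by (1 ± step_ratio) whenever the average duration leaves the [min_duration, max_duration] window, never dropping below min_batch_size. A minimal usage sketch, using only the API shown above; the sleep stands in for a real batched API call, and the thresholds are shrunk so the example runs in milliseconds:

    import time

    from openaivec.optimize import BatchSizeSuggester

    # Shipped defaults are min_duration=30.0, max_duration=60.0, sample_size=10.
    suggester = BatchSizeSuggester(min_duration=0.01, max_duration=0.02, sample_size=3)

    for _ in range(3):
        with suggester.record(suggester.current_batch_size):
            time.sleep(0.005)  # consistently faster than min_duration

    # Three successful samples averaging below min_duration trigger a
    # multiplicative increase: int(10 * (1 + 0.1)) == 11.
    print(suggester.suggest_batch_size())  # -> 11

Failed calls are excluded from samples, so a burst of exceptions stalls adaptation rather than skewing it.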
openaivec/proxy.py CHANGED
@@ -4,6 +4,8 @@ from collections.abc import Hashable
 from dataclasses import dataclass, field
 from typing import Awaitable, Callable, Dict, Generic, List, Optional, TypeVar
 
+from openaivec.optimize import BatchSizeSuggester
+
 S = TypeVar("S", bound=Hashable)
 T = TypeVar("T")
 
@@ -22,6 +24,7 @@ class ProxyBase(Generic[S, T]):
 
     batch_size: Optional[int] = None  # subclasses may override via dataclass
    show_progress: bool = False  # Enable progress bar display
+    suggester: BatchSizeSuggester = None  # Batch size optimization, initialized by subclasses
 
     def _is_notebook_environment(self) -> bool:
         """Check if running in a Jupyter notebook environment.
@@ -125,7 +128,7 @@ class ProxyBase(Generic[S, T]):
             progress_bar.close()
 
     @staticmethod
-    def __unique_in_order(seq: List[S]) -> List[S]:
+    def _unique_in_order(seq: List[S]) -> List[S]:
         """Return unique items preserving their first-occurrence order.
 
         Args:
@@ -143,11 +146,11 @@ class ProxyBase(Generic[S, T]):
                 out.append(x)
         return out
 
-    def __normalized_batch_size(self, total: int) -> int:
+    def _normalized_batch_size(self, total: int) -> int:
         """Compute the effective batch size used for processing.
 
-        If ``batch_size`` is not set or non-positive, the entire ``total`` is
-        processed in a single call.
+        If ``batch_size`` is None, use the suggester to determine optimal batch size.
+        If ``batch_size`` is non-positive, process the entire ``total`` in a single call.
 
         Args:
             total (int): Number of items intended to be processed.
@@ -155,7 +158,15 @@
 
         Returns:
             int: The positive batch size to use.
         """
-        return self.batch_size if (self.batch_size and self.batch_size > 0) else total
+        if self.batch_size and self.batch_size > 0:
+            return self.batch_size
+        elif self.batch_size is None:
+            # Use suggester to determine optimal batch size
+            suggested = self.suggester.suggest_batch_size()
+            return min(suggested, total)  # Don't exceed total items
+        else:
+            # batch_size is 0 or negative, process all at once
+            return total
 
 
 @dataclass
@@ -180,19 +191,13 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
     # Number of items to process per call to map_func. If None or <= 0, process all at once.
     batch_size: Optional[int] = None
     show_progress: bool = False
+    suggester: BatchSizeSuggester = field(default_factory=BatchSizeSuggester, repr=False)
+
+    # internals
     __cache: Dict[S, T] = field(default_factory=dict)
-    # Thread-safety primitives (not part of public API)
     __lock: threading.RLock = field(default_factory=threading.RLock, repr=False)
     __inflight: Dict[S, threading.Event] = field(default_factory=dict, repr=False)
 
-    # ---- private helpers -------------------------------------------------
-    # expose base helpers under subclass private names for compatibility
-    __unique_in_order = staticmethod(ProxyBase._ProxyBase__unique_in_order)
-    __normalized_batch_size = ProxyBase._ProxyBase__normalized_batch_size
-    _create_progress_bar = ProxyBase._create_progress_bar
-    _update_progress_bar = ProxyBase._update_progress_bar
-    _close_progress_bar = ProxyBase._close_progress_bar
-
     def __all_cached(self, items: List[S]) -> bool:
         """Check whether all items are present in the cache.
 
@@ -320,16 +325,17 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         """
         if not owned:
             return
-        batch_size = self.__normalized_batch_size(len(owned))
+        # Setup progress bar
+        progress_bar = self._create_progress_bar(len(owned))
 
         # Accumulate uncached items to maximize batch size utilization
         pending_to_call: List[S] = []
 
-        # Setup progress bar
-        progress_bar = self._create_progress_bar(len(owned))
-
-        for i in range(0, len(owned), batch_size):
-            batch = owned[i : i + batch_size]
+        i = 0
+        while i < len(owned):
+            # Get dynamic batch size for each iteration
+            current_batch_size = self._normalized_batch_size(len(owned))
+            batch = owned[i : i + current_batch_size]
             # Double-check cache right before processing
             with self.__lock:
                 uncached_in_batch = [x for x in batch if x not in self.__cache]
@@ -337,14 +343,16 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
             pending_to_call.extend(uncached_in_batch)
 
             # Process accumulated items when we reach batch_size or at the end
-            is_last_batch = i + batch_size >= len(owned)
-            if len(pending_to_call) >= batch_size or (is_last_batch and pending_to_call):
+            is_last_batch = i + current_batch_size >= len(owned)
+            if len(pending_to_call) >= current_batch_size or (is_last_batch and pending_to_call):
                 # Take up to batch_size items to process
-                to_call = pending_to_call[:batch_size]
-                pending_to_call = pending_to_call[batch_size:]
+                to_call = pending_to_call[:current_batch_size]
+                pending_to_call = pending_to_call[current_batch_size:]
 
                 try:
-                    results = map_func(to_call)
+                    # Always measure execution time using suggester
+                    with self.suggester.record(len(to_call)):
+                        results = map_func(to_call)
                 except Exception:
                     self.__finalize_failure(to_call)
                     raise
@@ -353,13 +361,19 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
                 # Update progress bar
                 self._update_progress_bar(progress_bar, len(to_call))
 
+            # Move to next batch
+            i += current_batch_size
+
         # Process any remaining items
         while pending_to_call:
-            to_call = pending_to_call[:batch_size]
-            pending_to_call = pending_to_call[batch_size:]
+            # Get dynamic batch size for remaining items
+            remaining_batch_size = self._normalized_batch_size(len(pending_to_call))
+            to_call = pending_to_call[:remaining_batch_size]
+            pending_to_call = pending_to_call[remaining_batch_size:]
 
             try:
-                results = map_func(to_call)
+                with self.suggester.record(len(to_call)):
+                    results = map_func(to_call)
             except Exception:
                 self.__finalize_failure(to_call)
                 raise
@@ -430,7 +444,7 @@ class BatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         if self.__all_cached(items):
             return self.__values(items)
 
-        unique_items = self.__unique_in_order(items)
+        unique_items = self._unique_in_order(items)
        owned, wait_for = self.__acquire_ownership(unique_items)
 
         self.__process_owned(owned, map_func)
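The synchronous path keeps its accumulate-and-flush structure but now re-reads the batch size on every loop iteration, so a changed suggestion takes effect mid-run. A stripped-down sketch of that control flow, with local stand-in names and the caching, locking, and progress reporting omitted:

    def process(owned, map_func, normalized_batch_size):
        # Mirrors the while-loop above: re-read the size each iteration and
        # flush accumulated items once a full batch is available or at the end.
        pending = []
        i = 0
        while i < len(owned):
            size = normalized_batch_size(len(owned))
            pending.extend(owned[i : i + size])
            if len(pending) >= size or (i + size >= len(owned) and pending):
                to_call, pending = pending[:size], pending[size:]
                map_func(to_call)
            i += size
        while pending:  # drain any remainder with a freshly suggested size
            size = normalized_batch_size(len(pending))
            to_call, pending = pending[:size], pending[size:]
            map_func(to_call)

    # 25 items with a fixed suggestion of 10 yields calls of 10, 10, and 5.
    process(list(range(25)), lambda xs: print(len(xs)), lambda total: min(10, total))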
@@ -465,6 +479,7 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
     batch_size: Optional[int] = None
     max_concurrency: int = 8
     show_progress: bool = False
+    suggester: BatchSizeSuggester = field(default_factory=BatchSizeSuggester, repr=False)
 
     # internals
     __cache: Dict[S, T] = field(default_factory=dict, repr=False)
@@ -490,14 +505,6 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         else:
             self.__sema = None
 
-    # ---- private helpers -------------------------------------------------
-    # expose base helpers under subclass private names for compatibility
-    __unique_in_order = staticmethod(ProxyBase._ProxyBase__unique_in_order)
-    __normalized_batch_size = ProxyBase._ProxyBase__normalized_batch_size
-    _create_progress_bar = ProxyBase._create_progress_bar
-    _update_progress_bar = ProxyBase._update_progress_bar
-    _close_progress_bar = ProxyBase._close_progress_bar
-
     async def __all_cached(self, items: List[S]) -> bool:
         """Check whether all items are present in the cache.
 
@@ -602,69 +609,43 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         await self.clear()
 
     async def __process_owned(self, owned: List[S], map_func: Callable[[List[S]], Awaitable[List[T]]]) -> None:
-        """Process owned keys in mini-batches, re-checking cache before awaits.
-
-        Before calling ``map_func`` for each batch, the cache is re-checked to
-        skip any keys that may have been filled in the meantime. Items
-        are accumulated across multiple original batches to maximize batch
-        size utilization when some items are cached. On exceptions raised
-        by ``map_func``, all corresponding in-flight events are released
-        to prevent deadlocks, and the exception is propagated.
+        """Process owned keys using Producer-Consumer pattern with dynamic batch sizing.
 
         Args:
-            owned (list[S]): Items for which this coroutine holds computation
-                ownership.
+            owned (list[S]): Items for which this coroutine holds computation ownership.
 
         Raises:
             Exception: Propagates any exception raised by ``map_func``.
         """
         if not owned:
             return
-        batch_size = self.__normalized_batch_size(len(owned))
 
-        # Accumulate uncached items to maximize batch size utilization
-        pending_to_call: List[S] = []
-
-        # Setup progress bar
         progress_bar = self._create_progress_bar(len(owned))
+        batch_queue: asyncio.Queue = asyncio.Queue(maxsize=self.max_concurrency)
+
+        async def producer():
+            index = 0
+            while index < len(owned):
+                batch_size = self._normalized_batch_size(len(owned) - index)
+                batch = owned[index : index + batch_size]
+                await batch_queue.put(batch)
+                index += batch_size
+            # Send completion signals
+            for _ in range(self.max_concurrency):
+                await batch_queue.put(None)
+
+        async def consumer():
+            while True:
+                batch = await batch_queue.get()
+                try:
+                    if batch is None:
+                        break
+                    await self.__process_single_batch(batch, map_func, progress_bar)
+                finally:
+                    batch_queue.task_done()
 
-        # Collect all batches to process
-        batches_to_process: List[List[S]] = []
-
-        for i in range(0, len(owned), batch_size):
-            batch = owned[i : i + batch_size]
-            async with self.__lock:
-                uncached_in_batch = [x for x in batch if x not in self.__cache]
-
-            pending_to_call.extend(uncached_in_batch)
-
-            # Process accumulated items when we reach batch_size or at the end
-            is_last_batch = i + batch_size >= len(owned)
-            if len(pending_to_call) >= batch_size or (is_last_batch and pending_to_call):
-                # Take up to batch_size items to process
-                to_call = pending_to_call[:batch_size]
-                pending_to_call = pending_to_call[batch_size:]
-                if to_call:  # Only add non-empty batches
-                    batches_to_process.append(to_call)
-
-        # Process any remaining items
-        while pending_to_call:
-            to_call = pending_to_call[:batch_size]
-            pending_to_call = pending_to_call[batch_size:]
-            if to_call:  # Only add non-empty batches
-                batches_to_process.append(to_call)
-
-        # Process all batches concurrently
-        if batches_to_process:
-            tasks = []
-            for batch in batches_to_process:
-                task = self.__process_single_batch(batch, map_func, progress_bar)
-                tasks.append(task)
-
-            # Wait for all batches to complete
-            await asyncio.gather(*tasks)
+        await asyncio.gather(producer(), *[consumer() for _ in range(self.max_concurrency)])
 
-        # Close progress bar
         self._close_progress_bar(progress_bar)
 
     async def __process_single_batch(
@@ -676,7 +657,9 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
             if self.__sema:
                 await self.__sema.acquire()
                 acquired = True
-            results = await map_func(to_call)
+            # Measure async map_func execution using suggester
+            with self.suggester.record(len(to_call)):
+                results = await map_func(to_call)
         except Exception:
             await self.__finalize_failure(to_call)
             raise
@@ -737,7 +720,7 @@ class AsyncBatchingMapProxy(ProxyBase[S, T], Generic[S, T]):
         if await self.__all_cached(items):
             return await self.__values(items)
 
-        unique_items = self.__unique_in_order(items)
+        unique_items = self._unique_in_order(items)
         owned, wait_for = await self.__acquire_ownership(unique_items)
 
         await self.__process_owned(owned, map_func)
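On the async side, the collect-then-gather strategy is replaced by a bounded producer/consumer queue: one producer slices owned into batches sized by the suggestion at enqueue time, max_concurrency consumers drain the queue through __process_single_batch, and one None sentinel per consumer shuts the pool down. Because the queue is bounded at max_concurrency, the producer blocks until earlier batches are consumed, so later slices are sized with fresher timing data. A standalone sketch of that shape, independent of openaivec (all names below are local to the example):

    import asyncio

    async def process_all(items, handle_batch, max_concurrency=3, batch_size=10):
        queue: asyncio.Queue = asyncio.Queue(maxsize=max_concurrency)

        async def producer():
            index = 0
            while index < len(items):
                await queue.put(items[index : index + batch_size])
                index += batch_size
            for _ in range(max_concurrency):  # one sentinel per consumer
                await queue.put(None)

        async def consumer():
            while True:
                batch = await queue.get()
                try:
                    if batch is None:
                        break
                    await handle_batch(batch)
                finally:
                    queue.task_done()

        await asyncio.gather(producer(), *[consumer() for _ in range(max_concurrency)])

    async def handle(batch):
        await asyncio.sleep(0.01)  # stands in for an awaited map_func call
        print(f"processed {len(batch)} items")

    asyncio.run(process_all(list(range(25)), handle))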
openaivec-0.13.3.dist-info/METADATA → openaivec-0.13.4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openaivec
-Version: 0.13.3
+Version: 0.13.4
 Summary: Generative mutation for tabular calculation
 Project-URL: Homepage, https://microsoft.github.io/openaivec/
 Project-URL: Repository, https://github.com/microsoft/openaivec
openaivec-0.13.3.dist-info/RECORD → openaivec-0.13.4.dist-info/RECORD CHANGED
@@ -3,10 +3,11 @@ openaivec/di.py,sha256=eNewaSRx7f_O0IQcyjzGpIMak6O-bc6OeMqjytCfr88,10634
 openaivec/embeddings.py,sha256=ypED2-MkC6t4Fvuopeldab8kNoR3-hN8BLVzRPDaWhk,7210
 openaivec/log.py,sha256=GofgzUpv_xDVuGC-gYmit5Oyu06it1SBXRck6COR5go,1439
 openaivec/model.py,sha256=wu1UGetqLbUGvGqmOiQna4SJnO5VvyMoCHdAQhSG6MY,3295
+openaivec/optimize.py,sha256=-9h03O_bDjtHYHg5L9M76gmaEddkE87HmN964XCO4bU,3838
 openaivec/pandas_ext.py,sha256=H4DswYgfTf0NrBl-L1LgR78ZIfgHONs3YIEWptahh8U,56361
 openaivec/prompt.py,sha256=3-fcmFW-yroKL_Yt-wE0u1FwZ22ja8ul3o6Llhefzzo,18544
 openaivec/provider.py,sha256=kkC9eYgXRUwb88EvN4dhEc00FDKT3l5D_ZDsW0Ty7SM,6218
-openaivec/proxy.py,sha256=yBEylh3hpb_Q5H-AiaWNMgconQbhPkT6F6ChTChit0k,28953
+openaivec/proxy.py,sha256=Y-ig4hEf_r0sBvb0fuKqvUkT2hFRV3Zj_6mSu9yphrs,28113
 openaivec/responses.py,sha256=_465ufTtA58DwI5KQg9YIkEfLT0XYHvyWSZfjU8hNSs,20457
 openaivec/serialize.py,sha256=HXi5l_4b_4eUMNL7f7o_suyHHF_hz3RYsUsri5CQ7_4,7325
 openaivec/spark.py,sha256=ZCoR5_7FKX_fC_rhV6X9N4lQ927nLScEKVqhFYGkiWU,23169
@@ -28,7 +29,7 @@ openaivec/task/nlp/sentiment_analysis.py,sha256=9HdMpi7FkjHNXAswaN98k8jeKsatBBXT
 openaivec/task/nlp/translation.py,sha256=4fjKtbVvOvivWMrpZfreIsdg8d0DplDujO8kAdLbAKI,6625
 openaivec/task/table/__init__.py,sha256=kJz15WDJXjyC7UIHKBvlTRhCf347PCDMH5T5fONV2sU,83
 openaivec/task/table/fillna.py,sha256=vi8t5QEIU-W3e05wwpATb3MEUDyf8luVnE8U-5VebZo,6582
-openaivec-0.13.3.dist-info/METADATA,sha256=oaM12LUwa_zsrYnlEbOhlEXRP_Xexh7Wkw7cYVuFQQQ,27329
-openaivec-0.13.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-openaivec-0.13.3.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
-openaivec-0.13.3.dist-info/RECORD,,
+openaivec-0.13.4.dist-info/METADATA,sha256=86Y8Pmh1y3cTTxKx9OWNRl5xwY3AAV8AHlAsZ80FHv4,27329
+openaivec-0.13.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+openaivec-0.13.4.dist-info/licenses/LICENSE,sha256=ws_MuBL-SCEBqPBFl9_FqZkaaydIJmxHrJG2parhU4M,1141
+openaivec-0.13.4.dist-info/RECORD,,
+ openaivec-0.13.4.dist-info/RECORD,,