openaivec 0.12.5__py3-none-any.whl → 1.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. openaivec/__init__.py +13 -4
  2. openaivec/_cache/__init__.py +12 -0
  3. openaivec/_cache/optimize.py +109 -0
  4. openaivec/_cache/proxy.py +806 -0
  5. openaivec/{di.py → _di.py} +36 -12
  6. openaivec/_embeddings.py +203 -0
  7. openaivec/{log.py → _log.py} +2 -2
  8. openaivec/_model.py +113 -0
  9. openaivec/{prompt.py → _prompt.py} +95 -28
  10. openaivec/_provider.py +207 -0
  11. openaivec/_responses.py +511 -0
  12. openaivec/_schema/__init__.py +9 -0
  13. openaivec/_schema/infer.py +340 -0
  14. openaivec/_schema/spec.py +350 -0
  15. openaivec/_serialize.py +234 -0
  16. openaivec/{util.py → _util.py} +25 -85
  17. openaivec/pandas_ext.py +1496 -318
  18. openaivec/spark.py +485 -183
  19. openaivec/task/__init__.py +9 -7
  20. openaivec/task/customer_support/__init__.py +9 -15
  21. openaivec/task/customer_support/customer_sentiment.py +17 -15
  22. openaivec/task/customer_support/inquiry_classification.py +23 -22
  23. openaivec/task/customer_support/inquiry_summary.py +14 -13
  24. openaivec/task/customer_support/intent_analysis.py +21 -19
  25. openaivec/task/customer_support/response_suggestion.py +16 -16
  26. openaivec/task/customer_support/urgency_analysis.py +24 -25
  27. openaivec/task/nlp/__init__.py +4 -4
  28. openaivec/task/nlp/dependency_parsing.py +10 -12
  29. openaivec/task/nlp/keyword_extraction.py +11 -14
  30. openaivec/task/nlp/morphological_analysis.py +12 -14
  31. openaivec/task/nlp/named_entity_recognition.py +16 -18
  32. openaivec/task/nlp/sentiment_analysis.py +14 -11
  33. openaivec/task/nlp/translation.py +6 -9
  34. openaivec/task/table/__init__.py +2 -2
  35. openaivec/task/table/fillna.py +11 -11
  36. openaivec-1.0.10.dist-info/METADATA +399 -0
  37. openaivec-1.0.10.dist-info/RECORD +39 -0
  38. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/WHEEL +1 -1
  39. openaivec/embeddings.py +0 -172
  40. openaivec/model.py +0 -67
  41. openaivec/provider.py +0 -45
  42. openaivec/responses.py +0 -393
  43. openaivec/serialize.py +0 -225
  44. openaivec-0.12.5.dist-info/METADATA +0 -696
  45. openaivec-0.12.5.dist-info/RECORD +0 -33
  46. {openaivec-0.12.5.dist-info → openaivec-1.0.10.dist-info}/licenses/LICENSE +0 -0
openaivec/{di.py → _di.py} RENAMED
@@ -1,6 +1,9 @@
+from collections.abc import Callable
 from dataclasses import dataclass, field
 from threading import RLock
-from typing import Any, Callable, Dict, Set, Type, TypeVar
+from typing import Any, TypeVar
+
+__all__ = []
 
 """Simple dependency injection container with singleton lifecycle management.
 
@@ -11,14 +14,14 @@ are created once and reused across multiple resolve calls.
 Example:
     ```python
     from openaivec.di import Container
-
+
     class DatabaseService:
         def __init__(self):
            self.connection = "database://localhost"
-
+
    container = Container()
    container.register(DatabaseService, lambda: DatabaseService())
-
+
    db1 = container.resolve(DatabaseService)
    db2 = container.resolve(DatabaseService)
    print(db1 is db2)  # True - same instance
@@ -117,12 +120,12 @@ class Container:
     ```
     """
 
-    _instances: Dict[Type[Any], Any] = field(default_factory=dict)
-    _providers: Dict[Type[Any], Provider[Any]] = field(default_factory=dict)
+    _instances: dict[type[Any], Any] = field(default_factory=dict)
+    _providers: dict[type[Any], Provider[Any]] = field(default_factory=dict)
     _lock: RLock = field(default_factory=RLock)
-    _resolving: Set[Type[Any]] = field(default_factory=set)
+    _resolving: set[type[Any]] = field(default_factory=set)
 
-    def register(self, cls: Type[T], provider: Provider[T]) -> None:
+    def register(self, cls: type[T], provider: Provider[T]) -> None:
         """Register a provider function for a service type.
 
         The provider function will be called once to create the singleton instance
@@ -148,7 +151,7 @@ class Container:
 
         self._providers[cls] = provider
 
-    def register_instance(self, cls: Type[T], instance: T) -> None:
+    def register_instance(self, cls: type[T], instance: T) -> None:
         """Register a pre-created instance for a service type.
 
         The provided instance will be stored directly in the container and returned
@@ -176,7 +179,7 @@ class Container:
         self._instances[cls] = instance
         self._providers[cls] = lambda: instance
 
-    def resolve(self, cls: Type[T]) -> T:
+    def resolve(self, cls: type[T]) -> T:
         """Resolve a service instance, creating it if necessary.
 
         Returns the singleton instance for the requested service type. If this is
@@ -230,7 +233,7 @@ class Container:
         finally:
             self._resolving.discard(cls)
 
-    def is_registered(self, cls: Type[Any]) -> bool:
+    def is_registered(self, cls: type[Any]) -> bool:
         """Check if a service type is registered in the container.
 
         Args:
@@ -250,7 +253,7 @@ class Container:
         with self._lock:
             return cls in self._providers
 
-    def unregister(self, cls: Type[Any]) -> None:
+    def unregister(self, cls: type[Any]) -> None:
         """Unregister a service type from the container.
 
         Removes the provider function and any cached singleton instance for
@@ -300,3 +303,24 @@ class Container:
         self._providers.clear()
         self._instances.clear()
         self._resolving.clear()
+
+    def clear_singletons(self) -> None:
+        """Clear all cached singleton instances from the container.
+
+        Removes all cached singleton instances while keeping the registered
+        providers intact. After calling this method, the next resolve call
+        for any service will create a new instance using the provider function.
+
+        Example:
+            ```python
+            container = Container()
+            container.register(str, lambda: "Hello")
+            instance1 = container.resolve(str)
+            container.clear_singletons()
+            instance2 = container.resolve(str)
+            print(instance1 is instance2)
+            # False - different instances after clearing singletons
+            ```
+        """
+        with self._lock:
+            self._instances.clear()
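
The `_resolving: set[type[Any]]` field and the `finally: self._resolving.discard(cls)` cleanup visible above suggest a re-entrancy guard around provider construction. A minimal sketch of that pattern, assuming (this diff does not show the full `resolve` body) that a provider chain looping back to a type under construction raises; names here are illustrative, not the library's code:

```python
from threading import RLock


class MiniContainer:
    """Illustrative sketch of a singleton container with a resolve guard."""

    def __init__(self):
        self._providers = {}     # type -> zero-arg factory
        self._instances = {}     # type -> cached singleton
        self._resolving = set()  # types whose providers are mid-construction
        self._lock = RLock()     # RLock so a provider may call resolve() re-entrantly

    def register(self, cls, provider):
        with self._lock:
            self._providers[cls] = provider

    def resolve(self, cls):
        with self._lock:
            if cls in self._instances:  # singleton already built
                return self._instances[cls]
            if cls in self._resolving:  # provider looped back to the same type
                raise RuntimeError(f"circular dependency while resolving {cls.__name__}")
            self._resolving.add(cls)
            try:
                instance = self._providers[cls]()
                self._instances[cls] = instance
                return instance
            finally:
                self._resolving.discard(cls)  # mirrors the finally block in the hunk
```

Without the `finally`, a provider that raises would leave its type stranded in `_resolving`, so every later `resolve` for that type would report a spurious circular dependency.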
openaivec/_embeddings.py ADDED
@@ -0,0 +1,203 @@
+from dataclasses import dataclass, field
+from logging import Logger, getLogger
+
+import numpy as np
+from numpy.typing import NDArray
+from openai import AsyncOpenAI, InternalServerError, OpenAI, RateLimitError
+
+from openaivec._cache import AsyncBatchingMapProxy, BatchingMapProxy
+from openaivec._log import observe
+from openaivec._util import backoff, backoff_async
+
+__all__ = [
+    "BatchEmbeddings",
+    "AsyncBatchEmbeddings",
+]
+
+_LOGGER: Logger = getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class BatchEmbeddings:
+    """Thin wrapper around the OpenAI embeddings endpoint (synchronous).
+
+    Attributes:
+        client (OpenAI): Configured OpenAI client.
+        model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name
+            (e.g., ``"text-embedding-3-small"``).
+        cache (BatchingMapProxy[str, NDArray[np.float32]]): Batching proxy for ordered, cached mapping.
+        api_kwargs (dict[str, Any]): Additional OpenAI API parameters stored at initialization.
+    """
+
+    client: OpenAI
+    model_name: str
+    cache: BatchingMapProxy[str, NDArray[np.float32]] = field(default_factory=lambda: BatchingMapProxy(batch_size=None))
+    api_kwargs: dict[str, int | float | str | bool] = field(default_factory=dict)
+
+    @classmethod
+    def of(cls, client: OpenAI, model_name: str, batch_size: int | None = None, **api_kwargs) -> "BatchEmbeddings":
+        """Factory constructor.
+
+        Args:
+            client (OpenAI): OpenAI client.
+            model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
+            batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
+                (automatic batch size optimization). Set to a positive integer for fixed batch size.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
+
+        Returns:
+            BatchEmbeddings: Configured instance backed by a batching proxy.
+        """
+        return cls(
+            client=client,
+            model_name=model_name,
+            cache=BatchingMapProxy(batch_size=batch_size),
+            api_kwargs=api_kwargs,
+        )
+
+    @observe(_LOGGER)
+    @backoff(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
+    def _embed_chunk(self, inputs: list[str]) -> list[NDArray[np.float32]]:
+        """Embed one minibatch of strings.
+
+        This private helper is the unit of work used by the map/parallel
+        utilities. Exponential back-off is applied automatically when
+        ``openai.RateLimitError`` is raised.
+
+        Args:
+            inputs (list[str]): Input strings to be embedded. Duplicates allowed.
+
+        Returns:
+            list[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+        """
+        responses = self.client.embeddings.create(input=inputs, model=self.model_name, **self.api_kwargs)
+        return [np.array(d.embedding, dtype=np.float32) for d in responses.data]
+
+    @observe(_LOGGER)
+    def create(self, inputs: list[str]) -> list[NDArray[np.float32]]:
+        """Generate embeddings for inputs using cached, ordered batching.
+
+        Args:
+            inputs (list[str]): Input strings. Duplicates allowed.
+
+        Returns:
+            list[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+        """
+        return self.cache.map(inputs, self._embed_chunk)
+
+
+@dataclass(frozen=True)
+class AsyncBatchEmbeddings:
+    """Thin wrapper around the OpenAI embeddings endpoint (asynchronous).
+
+    This class provides an asynchronous interface for generating embeddings using
+    OpenAI models. It manages concurrency, handles rate limits automatically,
+    and efficiently processes batches of inputs, including de-duplication.
+
+    Example:
+        ```python
+        import asyncio
+        import numpy as np
+        from openai import AsyncOpenAI
+        from openaivec import AsyncBatchEmbeddings
+
+        # Assuming openai_async_client is an initialized AsyncOpenAI client
+        openai_async_client = AsyncOpenAI()  # Replace with your actual client initialization
+
+        embedder = AsyncBatchEmbeddings.of(
+            client=openai_async_client,
+            model_name="text-embedding-3-small",
+            batch_size=128,
+            max_concurrency=8,
+        )
+        texts = ["This is the first document.", "This is the second document.", "This is the first document."]
+
+        # Asynchronous call
+        async def main():
+            embeddings = await embedder.create(texts)
+            # embeddings will be a list of numpy arrays (float32)
+            # The embedding for the third text will be identical to the first
+            # due to automatic de-duplication.
+            print(f"Generated {len(embeddings)} embeddings.")
+            print(f"Shape of first embedding: {embeddings[0].shape}")
+            assert np.array_equal(embeddings[0], embeddings[2])
+
+        # Run the async function
+        asyncio.run(main())
+        ```
+
+    Attributes:
+        client (AsyncOpenAI): Configured OpenAI async client.
+        model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
+        cache (AsyncBatchingMapProxy[str, NDArray[np.float32]]): Async batching proxy.
+        api_kwargs (dict): Additional OpenAI API parameters stored at initialization.
+    """
+
+    client: AsyncOpenAI
+    model_name: str
+    cache: AsyncBatchingMapProxy[str, NDArray[np.float32]] = field(
+        default_factory=lambda: AsyncBatchingMapProxy(batch_size=None, max_concurrency=8)
+    )
+    api_kwargs: dict[str, int | float | str | bool] = field(default_factory=dict)
+
+    @classmethod
+    def of(
+        cls,
+        client: AsyncOpenAI,
+        model_name: str,
+        batch_size: int | None = None,
+        max_concurrency: int = 8,
+        **api_kwargs,
+    ) -> "AsyncBatchEmbeddings":
+        """Factory constructor.
+
+        Args:
+            client (AsyncOpenAI): OpenAI async client.
+            model_name (str): For Azure OpenAI, use your deployment name. For OpenAI, use the model name.
+            batch_size (int | None, optional): Max unique inputs per API call. Defaults to None
+                (automatic batch size optimization). Set to a positive integer for fixed batch size.
+            max_concurrency (int, optional): Max concurrent API calls. Defaults to 8.
+            **api_kwargs: Additional OpenAI API parameters (e.g., dimensions for text-embedding-3 models).
+
+        Returns:
+            AsyncBatchEmbeddings: Configured instance with an async batching proxy.
+        """
+        return cls(
+            client=client,
+            model_name=model_name,
+            cache=AsyncBatchingMapProxy(batch_size=batch_size, max_concurrency=max_concurrency),
+            api_kwargs=api_kwargs,
+        )
+
+    @backoff_async(exceptions=[RateLimitError, InternalServerError], scale=1, max_retries=12)
+    @observe(_LOGGER)
+    async def _embed_chunk(self, inputs: list[str]) -> list[NDArray[np.float32]]:
+        """Embed one minibatch of strings asynchronously.
+
+        This private helper handles the actual API call for a batch of inputs.
+        Exponential back-off is applied automatically when ``openai.RateLimitError``
+        is raised.
+
+        Args:
+            inputs (list[str]): Input strings to be embedded. Duplicates allowed.
+
+        Returns:
+            list[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+
+        Raises:
+            RateLimitError: Propagated if retries are exhausted.
+        """
+        responses = await self.client.embeddings.create(input=inputs, model=self.model_name, **self.api_kwargs)
+        return [np.array(d.embedding, dtype=np.float32) for d in responses.data]
+
+    @observe(_LOGGER)
+    async def create(self, inputs: list[str]) -> list[NDArray[np.float32]]:
+        """Generate embeddings for inputs using proxy batching (async).
+
+        Args:
+            inputs (list[str]): Input strings. Duplicates allowed.
+
+        Returns:
+            list[NDArray[np.float32]]: Embedding vectors aligned to ``inputs``.
+        """
+        return await self.cache.map(inputs, self._embed_chunk)  # type: ignore[arg-type]
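
Unlike its async sibling, `BatchEmbeddings` ships no usage example in its docstring. A minimal sketch built from the `of` signature above; the model name and the `dimensions` kwarg are illustrative, and an `OPENAI_API_KEY` environment variable is assumed:

```python
import numpy as np
from openai import OpenAI

from openaivec import BatchEmbeddings  # assumed re-export, mirroring the async docstring

client = OpenAI()  # reads OPENAI_API_KEY from the environment

embedder = BatchEmbeddings.of(
    client=client,
    model_name="text-embedding-3-small",
    batch_size=128,  # fixed batch size; None enables automatic optimization
    dimensions=256,  # forwarded to embeddings.create via **api_kwargs
)

texts = ["first document", "second document", "first document"]
vectors = embedder.create(texts)

# Outputs are aligned to inputs; the duplicate third input is served from the
# cache, so its vector is identical to the first.
assert np.array_equal(vectors[0], vectors[2])
print(len(vectors), vectors[0].shape)  # 3 (256,)
```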
openaivec/{log.py → _log.py} RENAMED
@@ -2,10 +2,10 @@ import functools
 import json
 import time
 import uuid
+from collections.abc import Callable
 from logging import Logger
-from typing import Callable
 
-__all__ = ["observe"]
+__all__ = []
 
 
 def observe(logger: Logger):
openaivec/_model.py ADDED
@@ -0,0 +1,113 @@
+from dataclasses import dataclass
+from typing import Generic, TypeVar
+
+__all__ = [
+    "PreparedTask",
+]
+
+ResponseFormat = TypeVar("ResponseFormat")
+
+
+@dataclass(frozen=True)
+class PreparedTask(Generic[ResponseFormat]):
+    """A data class representing a complete task configuration for OpenAI API calls.
+
+    This class encapsulates the instructions and expected response format for
+    executing a task against the OpenAI Responses API.
+
+    Attributes:
+        instructions (str): The prompt or instructions to send to the OpenAI model.
+            This should contain clear, specific directions for the task.
+        response_format (type[ResponseFormat]): A Pydantic model class or str type that defines the expected
+            structure of the response. Can be either a BaseModel subclass or str.
+
+    Example:
+        Creating a custom task:
+
+        ```python
+        from pydantic import BaseModel
+
+        class TranslationResponse(BaseModel):
+            translated_text: str
+            source_language: str
+            target_language: str
+
+        custom_task = PreparedTask(
+            instructions="Translate the following text to French:",
+            response_format=TranslationResponse,
+        )
+        ```
+
+    Note:
+        This class is frozen (immutable) to ensure task configurations
+        cannot be accidentally modified after creation.
+    """
+
+    instructions: str
+    response_format: type[ResponseFormat]
+
+
+@dataclass(frozen=True)
+class ResponsesModelName:
+    """Container for responses model name configuration.
+
+    Attributes:
+        value (str): The model name for OpenAI responses API.
+    """
+
+    value: str
+
+
+@dataclass(frozen=True)
+class EmbeddingsModelName:
+    """Container for embeddings model name configuration.
+
+    Attributes:
+        value (str): The model name for OpenAI embeddings API.
+    """
+
+    value: str
+
+
+@dataclass(frozen=True)
+class OpenAIAPIKey:
+    """Container for OpenAI API key configuration.
+
+    Attributes:
+        value (str | None): The API key for OpenAI services.
+    """
+
+    value: str | None
+
+
+@dataclass(frozen=True)
+class AzureOpenAIAPIKey:
+    """Container for Azure OpenAI API key configuration.
+
+    Attributes:
+        value (str | None): The API key for Azure OpenAI services.
+    """
+
+    value: str | None
+
+
+@dataclass(frozen=True)
+class AzureOpenAIBaseURL:
+    """Container for Azure OpenAI base URL configuration.
+
+    Attributes:
+        value (str | None): The base URL for Azure OpenAI services.
+    """
+
+    value: str | None
+
+
+@dataclass(frozen=True)
+class AzureOpenAIAPIVersion:
+    """Container for Azure OpenAI API version configuration.
+
+    Attributes:
+        value (str): The API version for Azure OpenAI services.
+    """
+
+    value: str
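
The single-field frozen dataclasses after `PreparedTask` read like typed configuration keys: wrapping a plain `str` in a distinct type lets the `Container` from `_di.py` hold several string-valued settings without collisions. A hypothetical wiring sketch using only the `register_instance`/`resolve` API shown earlier; the real registration presumably lives in the new `openaivec/_provider.py`, which this diff does not display, and the model names are placeholders:

```python
from openaivec._di import Container  # private module; the public surface may differ
from openaivec._model import EmbeddingsModelName, ResponsesModelName

container = Container()

# register_instance stores pre-built values directly (see the _di.py hunk above);
# each wrapper type doubles as its own lookup key.
container.register_instance(ResponsesModelName, ResponsesModelName("gpt-4.1-mini"))
container.register_instance(EmbeddingsModelName, EmbeddingsModelName("text-embedding-3-small"))

print(container.resolve(ResponsesModelName).value)   # gpt-4.1-mini
print(container.resolve(EmbeddingsModelName).value)  # text-embedding-3-small
```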