promptbuilder 0.4.18__py3-none-any.whl → 0.4.19__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
promptbuilder/embeddings.py (new file)
@@ -0,0 +1,206 @@
+ import os
+ import asyncio
+ from copy import deepcopy
+ from typing import Literal, get_args
+
+ import numpy as np
+ from google import genai
+ from google.genai.types import EmbedContentConfig, EmbedContentResponse
+ from openai import AsyncOpenAI
+
+ import promptbuilder.llm_client.utils as utils
+
+
+ type EMBS_TASK_TYPE = Literal["RETRIEVAL_QUERY", "RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY"]
+ type EMBEDDING = list[float]
+ EMBS_TASKS = get_args(EMBS_TASK_TYPE.__value__)
+
+
+ def normalize_embeddings(embs: list[list[float]] | list[float]) -> list[list[float]] | list[float]:
+     embs_np = np.array(embs)
+     emb_norms = np.sqrt(np.sum(embs_np * embs_np, axis=-1, keepdims=True))
+     embs_np = embs_np / emb_norms
+     return embs_np.tolist()
+
+
+ class EmbeddingsApi(utils.InheritDecoratorsMixin):
+     available_model_dims: dict[str, list[int]] = {}
+     default_model_dim: dict[str, int] = {}
+     model_name_prefix: str = ""
+
+     def __init__(self, model_name: str, embs_dim: int | None = None, *args, retry_times: int = 0, retry_delay: float = 0, **kwargs):
+         if model_name not in self.available_model_dims:
+             raise ValueError(f"Model {model_name} is not supported.")
+         if embs_dim is None:
+             embs_dim = self.default_model_dim[model_name]
+         else:
+             if embs_dim not in self.available_model_dims[model_name]:
+                 raise ValueError(f"Model {model_name} does not support embedding dimension {embs_dim}.")
+
+         self._model_name = model_name
+         self._embs_dim = embs_dim
+         self._retry_times = retry_times
+         self._retry_delay = retry_delay
+
+     @property
+     def embeddings_dim(self) -> int:
+         return self._embs_dim
+
+     @property
+     def model_name(self) -> str:
+         return self.model_name_prefix + self._model_name
+
+     @utils.retry_cls_async
+     async def get_embeddings(
+         self,
+         texts: list[str] | str,
+         task_types: list[EMBS_TASK_TYPE] | EMBS_TASK_TYPE = ["SEMANTIC_SIMILARITY"],
+         normalize: bool = True,
+     ) -> dict[EMBS_TASK_TYPE, list[EMBEDDING]] | dict[EMBS_TASK_TYPE, EMBEDDING] | list[EMBEDDING] | EMBEDDING:
+         raise NotImplementedError
+
+
+ class GoogleEmbsApi(EmbeddingsApi):
+     available_model_dims: dict[str, list[int]] = {"text-embedding-004": [768]}
+     default_model_dim: dict[str, int] = {"text-embedding-004": 768}
+     model_name_prefix: str = "google:"
+
+     def __init__(
+         self,
+         model_name: str = "text-embedding-004",
+         embs_dim: int | None = None,
+         *,
+         retry_times: int = 0,
+         retry_delay: float = 0,
+         **kwargs,
+     ):
+         super().__init__(model_name, embs_dim, retry_times=retry_times, retry_delay=retry_delay)
+         self._client = genai.Client(api_key=os.getenv("GOOGLEAI_API_KEY"))
+         self._rpm_limit = 145
+
+     async def get_embeddings(
+         self,
+         texts: list[str] | str,
+         task_types: list[EMBS_TASK_TYPE] | EMBS_TASK_TYPE = ["SEMANTIC_SIMILARITY"],
+         normalize: bool = True,
+         **kwargs,
+     ) -> dict[EMBS_TASK_TYPE, list[EMBEDDING]] | dict[EMBS_TASK_TYPE, EMBEDDING] | list[EMBEDDING] | EMBEDDING:
+         batch_size = 10
+
+         if isinstance(task_types, list):
+             task_types = list(set(task_types))
+             embeddings = await asyncio.gather(*[self.get_embeddings(texts, task_type, normalize) for task_type in task_types])
+             response = {task_type: embs for task_type, embs in zip(task_types, embeddings)}
+             return response
+
+         task_type = task_types
+         if isinstance(texts, str):
+             response = await self._api_request(
+                 model=self._model_name,
+                 contents=texts,
+                 config=EmbedContentConfig(task_type=task_type),
+             )
+             if normalize:
+                 return normalize_embeddings(response.embeddings[0].values)
+             else:
+                 return response.embeddings[0].values
+         elif isinstance(texts, list):
+             batches_num = len(texts) // batch_size + 1
+             result_embeddings: list[list[float]] = []
+
+             for i in range(batches_num):
+                 first_idx = i * batch_size
+                 last_idx = (i + 1) * batch_size
+                 batch = texts[first_idx: last_idx]
+                 if len(batch) > 0:
+                     response = await self._api_request(
+                         model=self._model_name,
+                         contents=batch,
+                         config=EmbedContentConfig(task_type=task_type),
+                     )
+                     result_embeddings += [embeddings.values for embeddings in response.embeddings]
+
+             if normalize:
+                 return normalize_embeddings(result_embeddings)
+             else:
+                 return result_embeddings
+         else:
+             raise ValueError("'texts' must be a string or a list of strings.")
+
+     @utils.rpm_limit_cls_async
+     async def _api_request(self, model: str, contents: str | list[str], config: EmbedContentConfig) -> EmbedContentResponse:
+         return await self._client.aio.models.embed_content(
+             model=model,
+             contents=contents,
+             config=config,
+         )
+
+
+ class OpenAIEmbsApi(EmbeddingsApi):
+     available_model_dims: dict[str, list[int]] = {
+         "text-embedding-3-small": [512, 1536],
+         "text-embedding-3-large": [1024, 3072],
+     }
+     default_model_dim: dict[str, int] = {
+         "text-embedding-3-small": 1536,
+         "text-embedding-3-large": 3072,
+     }
+     model_name_prefix: str = "openai:"
+
+     def __init__(
+         self,
+         model_name: str = "text-embedding-3-small",
+         embs_dim: int | None = None,
+         *,
+         retry_times: int = 0,
+         retry_delay: float = 0,
+         **kwargs,
+     ):
+         super().__init__(model_name, embs_dim, retry_times=retry_times, retry_delay=retry_delay)
+         self._client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
+     async def get_embeddings(
+         self,
+         texts: list[str] | str,
+         task_types: list[EMBS_TASK_TYPE] | EMBS_TASK_TYPE = ["SEMANTIC_SIMILARITY"],
+         normalize: bool = True,
+         **kwargs,
+     ) -> dict[EMBS_TASK_TYPE, list[EMBEDDING]] | dict[EMBS_TASK_TYPE, EMBEDDING] | list[EMBEDDING] | EMBEDDING:
+         if isinstance(task_types, list):
+             task_types = list(set(task_types))
+             embeddings = await self.get_embeddings(texts, "SEMANTIC_SIMILARITY", normalize)
+             response = {task_type: deepcopy(embeddings) for task_type in task_types}
+             return response
+
+         if isinstance(texts, str):
+             response = await self._client.embeddings.create(
+                 input=texts,
+                 model=self._model_name,
+                 dimensions=self._embs_dim,
+             )
+             if normalize:
+                 return normalize_embeddings(response.data[0].embedding)
+             else:
+                 return response.data[0].embedding
+         elif isinstance(texts, list):
+             batches_num = len(texts) // 100 + 1
+             result_embeddings = []
+
+             for i in range(batches_num):
+                 first_idx = i * 100
+                 last_idx = (i + 1) * 100
+                 batch = texts[first_idx: last_idx]
+                 if len(batch) > 0:
+                     response = await self._client.embeddings.create(
+                         input=batch,
+                         model=self._model_name,
+                         dimensions=self._embs_dim,
+                     )
+                     result_embeddings += [emb.embedding for emb in response.data]
+
+             if normalize:
+                 return normalize_embeddings(result_embeddings)
+             else:
+                 return result_embeddings
+         else:
+             raise ValueError("'texts' must be a string or a list of strings.")
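
For context, a minimal usage sketch of the new module follows. It is illustrative only and not part of the released diff; it assumes the module is importable as promptbuilder.embeddings (as the RECORD entry below indicates) and that OPENAI_API_KEY is set in the environment.

import asyncio
from promptbuilder.embeddings import OpenAIEmbsApi

async def main():
    # 512-dimensional embeddings from OpenAI's text-embedding-3-small model
    api = OpenAIEmbsApi(model_name="text-embedding-3-small", embs_dim=512)
    # Passing a single task type as a string returns a plain list of normalized vectors
    embs = await api.get_embeddings(["first text", "second text"], "SEMANTIC_SIMILARITY")
    print(len(embs), len(embs[0]))  # 2 vectors, 512 dimensions each

asyncio.run(main())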
promptbuilder-{0.4.18 → 0.4.19}.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: promptbuilder
- Version: 0.4.18
+ Version: 0.4.19
  Summary: Library for building prompts for LLMs
  Home-page: https://github.com/kapulkin/promptbuilder
  Author: Kapulkin Stanislav

promptbuilder-{0.4.18 → 0.4.19}.dist-info/RECORD
@@ -1,4 +1,5 @@
  promptbuilder/__init__.py,sha256=o_NdXl7NppM399-fy5VGfYkSN8iYDAaFAwJNhdkW3bI,56
+ promptbuilder/embeddings.py,sha256=bu-soCNYiHxshc1jejGmI5iJTIdotqEhmvpImSjlFTY,8087
  promptbuilder/prompt_builder.py,sha256=kK6WHr2umYmsanYb2fQVxqEajs_dzGPXRulTo40g36E,12428
  promptbuilder/agent/__init__.py,sha256=qG4Jq4wbmCH5NKLOX6ZMtZ7lFURhJXf464BntR-u5rU,56
  promptbuilder/agent/agent.py,sha256=dVu251C1r9w5LS2P_shsIRH9tFz1Jq93MDv3Uu41_4E,9274
@@ -17,8 +18,8 @@ promptbuilder/llm_client/main.py,sha256=k4JTyKq2atNyFtI1bjjqXEnGSEugj4xk0AJEvHJi
  promptbuilder/llm_client/openai_client.py,sha256=5yvjp-Zzp4JsBC9_ffSb1A9-iMG4Lu2B2et2CdtK9R0,22864
  promptbuilder/llm_client/types.py,sha256=2E-aPRb5uAkLFJocmjF1Lh2aQRq9r8a5JRIw-duHfjA,7460
  promptbuilder/llm_client/utils.py,sha256=79lvSppjrrItHB5MIozbp_5Oq7TsOK4Qzt9Ae3XMLFw,7624
- promptbuilder-0.4.18.dist-info/licenses/LICENSE,sha256=fqXmInzgsvEOIaKSBgcrwKyYCGYF0MKErJ0YivtODcc,1096
- promptbuilder-0.4.18.dist-info/METADATA,sha256=bbynjS91gKgHZKKCzw1VgD2FgI54Orn5OLRUqZJsQmA,3738
- promptbuilder-0.4.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- promptbuilder-0.4.18.dist-info/top_level.txt,sha256=UBVcYn4UgrPy3O3fmmnPEU_kieuplBMgheetIMei4EI,14
- promptbuilder-0.4.18.dist-info/RECORD,,
+ promptbuilder-0.4.19.dist-info/licenses/LICENSE,sha256=fqXmInzgsvEOIaKSBgcrwKyYCGYF0MKErJ0YivtODcc,1096
+ promptbuilder-0.4.19.dist-info/METADATA,sha256=H7BlzTYhhJi7NGunmjiYhaUqWAhWS-6ELC682S14VKY,3738
+ promptbuilder-0.4.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ promptbuilder-0.4.19.dist-info/top_level.txt,sha256=UBVcYn4UgrPy3O3fmmnPEU_kieuplBMgheetIMei4EI,14
+ promptbuilder-0.4.19.dist-info/RECORD,,