nomic 3.0.41__tar.gz → 3.0.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nomic might be problematic. Click here for more details.
- {nomic-3.0.41 → nomic-3.0.43}/PKG-INFO +1 -1
- {nomic-3.0.41 → nomic-3.0.43}/nomic/aws/sagemaker.py +37 -20
- {nomic-3.0.41 → nomic-3.0.43}/nomic/dataset.py +5 -5
- {nomic-3.0.41 → nomic-3.0.43}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.0.41 → nomic-3.0.43}/setup.py +1 -1
- {nomic-3.0.41 → nomic-3.0.43}/README.md +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/__init__.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/atlas.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/aws/__init__.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/cli.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/data_inference.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/data_operations.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/embed.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/settings.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic/utils.py +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/pyproject.toml +0 -0
- {nomic-3.0.41 → nomic-3.0.43}/setup.cfg +0 -0
|
{nomic-3.0.41 → nomic-3.0.43}/nomic/aws/sagemaker.py
@@ -4,7 +4,7 @@ import json
|
|
|
4
4
|
import logging
|
|
5
5
|
import multiprocessing as mp
|
|
6
6
|
from pathlib import PosixPath
|
|
7
|
-
from typing import List, Optional, Union
|
|
7
|
+
from typing import List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
9
|
import boto3
|
|
10
10
|
import PIL
|
|
@@ -187,7 +187,22 @@ def embed_text(
|
|
|
187
187
|
}
|
|
188
188
|
|
|
189
189
|
|
|
190
|
-
|
|
190
|
+
# only way I could get sagemaker with multipart to work
|
|
191
|
+
def prepare_multipart_request(images: List[Tuple[str, bytes]]) -> Tuple[bytes, bytes]:
|
|
192
|
+
# Prepare the multipart body
|
|
193
|
+
boundary = b"---------------------------Boundary"
|
|
194
|
+
body = b""
|
|
195
|
+
for i, (name, img_bytes) in enumerate(images):
|
|
196
|
+
body += b"--" + boundary + b"\r\n"
|
|
197
|
+
body += f'Content-Disposition: form-data; name="{name}"; filename="image_{i}.jpg"\r\n'.encode("utf-8")
|
|
198
|
+
body += b"Content-Type: image/jpeg\r\n\r\n"
|
|
199
|
+
body += img_bytes + b"\r\n"
|
|
200
|
+
body += b"--" + boundary + b"--\r\n"
|
|
201
|
+
|
|
202
|
+
return body, boundary
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> Tuple[bytes, bytes]:
|
|
191
206
|
"""
|
|
192
207
|
Preprocess a list of images for embedding using a sagemaker model.
|
|
193
208
|
|
|
@@ -210,17 +225,22 @@ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List
|
|
|
210
225
|
image = image.convert("RGB")
|
|
211
226
|
buffered = io.BytesIO()
|
|
212
227
|
image.save(buffered, format="JPEG")
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
228
|
+
encoded_images.append(("image_data", buffered.getvalue()))
|
|
229
|
+
|
|
230
|
+
body, boundary = prepare_multipart_request(encoded_images)
|
|
231
|
+
return body, boundary
|
|
216
232
|
|
|
217
233
|
|
|
218
|
-
def sagemaker_image_request(
|
|
219
|
-
|
|
234
|
+
def sagemaker_image_request(
|
|
235
|
+
images: List[Union[str, bytes, "PIL.Image.Image"]], sagemaker_endpoint: str, region_name: str
|
|
236
|
+
):
|
|
237
|
+
body, boundary = preprocess_image(images)
|
|
220
238
|
|
|
221
239
|
client = boto3.client("sagemaker-runtime", region_name=region_name)
|
|
222
240
|
response = client.invoke_endpoint(
|
|
223
|
-
EndpointName=sagemaker_endpoint,
|
|
241
|
+
EndpointName=sagemaker_endpoint,
|
|
242
|
+
Body=body,
|
|
243
|
+
ContentType=f'multipart/form-data; boundary={boundary.decode("utf-8")}',
|
|
224
244
|
)
|
|
225
245
|
|
|
226
246
|
return parse_sagemaker_response(response)
|
|
@@ -230,21 +250,18 @@ def embed_image(
|
|
|
230
250
|
images: List[Union[str, "PIL.Image.Image", bytes]],
|
|
231
251
|
sagemaker_endpoint: str,
|
|
232
252
|
region_name: str,
|
|
233
|
-
model_name="nomic-embed-vision-v1",
|
|
253
|
+
model_name="nomic-embed-vision-v1.5",
|
|
254
|
+
batch_size=16,
|
|
234
255
|
) -> dict:
|
|
235
256
|
embeddings = []
|
|
236
257
|
|
|
237
|
-
max_workers = mp.cpu_count()
|
|
238
258
|
pbar = tqdm(total=len(images))
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
for future in concurrent.futures.as_completed(futures):
|
|
247
|
-
embeddings.extend(future.result())
|
|
259
|
+
for i in range(0, len(images), batch_size):
|
|
260
|
+
batch = images[i : i + batch_size]
|
|
261
|
+
embeddings.extend(
|
|
262
|
+
sagemaker_image_request(batch, sagemaker_endpoint=sagemaker_endpoint, region_name=region_name)
|
|
263
|
+
)
|
|
264
|
+
pbar.update(len(batch))
|
|
248
265
|
|
|
249
266
|
return {
|
|
250
267
|
"embeddings": embeddings,
|
|
@@ -260,7 +277,7 @@ def batch_transform_image(
|
|
|
260
277
|
arn: Optional[str] = None,
|
|
261
278
|
role: Optional[str] = None,
|
|
262
279
|
max_payload: Optional[int] = 6,
|
|
263
|
-
instance_type: str = "ml.
|
|
280
|
+
instance_type: str = "ml.g4dn.xlarge",
|
|
264
281
|
n_instances: int = 1,
|
|
265
282
|
wait: bool = True,
|
|
266
283
|
logs: bool = True,
|
|
{nomic-3.0.41 → nomic-3.0.43}/nomic/dataset.py
@@ -1089,6 +1089,11 @@ class AtlasDataset(AtlasClass):
|
|
|
1089
1089
|
if modality is None:
|
|
1090
1090
|
modality = self.meta["modality"]
|
|
1091
1091
|
|
|
1092
|
+
if modality == "image":
|
|
1093
|
+
indexed_field = "_blob_hash"
|
|
1094
|
+
if indexed_field is not None:
|
|
1095
|
+
logger.warning("Ignoring indexed_field for image datasets. Only _blob_hash is supported.")
|
|
1096
|
+
|
|
1092
1097
|
colorable_fields = []
|
|
1093
1098
|
|
|
1094
1099
|
for field in self.dataset_fields:
|
|
@@ -1155,11 +1160,6 @@ class AtlasDataset(AtlasClass):
|
|
|
1155
1160
|
if indexed_field is None and modality == "text":
|
|
1156
1161
|
raise Exception("You did not specify a field to index. Specify an 'indexed_field'.")
|
|
1157
1162
|
|
|
1158
|
-
if modality == "image":
|
|
1159
|
-
indexed_field = "_blob_hash"
|
|
1160
|
-
if indexed_field is not None:
|
|
1161
|
-
logger.warning("Ignoring indexed_field for image datasets. Only _blob_hash is supported.")
|
|
1162
|
-
|
|
1163
1163
|
if indexed_field not in self.dataset_fields:
|
|
1164
1164
|
raise Exception(f"Indexing on {indexed_field} not allowed. Valid options are: {self.dataset_fields}")
|
|
1165
1165
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|