nomic 3.0.41__tar.gz → 3.0.43__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nomic might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nomic
3
- Version: 3.0.41
3
+ Version: 3.0.43
4
4
  Summary: The official Nomic python client.
5
5
  Home-page: https://github.com/nomic-ai/nomic
6
6
  Author: nomic.ai
@@ -4,7 +4,7 @@ import json
4
4
  import logging
5
5
  import multiprocessing as mp
6
6
  from pathlib import PosixPath
7
- from typing import List, Optional, Union
7
+ from typing import List, Optional, Tuple, Union
8
8
 
9
9
  import boto3
10
10
  import PIL
@@ -187,7 +187,22 @@ def embed_text(
187
187
  }
188
188
 
189
189
 
190
- def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List[bytes]:
190
+ # only way I could get sagemaker with multipart to work
191
+ def prepare_multipart_request(images: List[Tuple[str, bytes]]) -> Tuple[bytes, bytes]:
192
+ # Prepare the multipart body
193
+ boundary = b"---------------------------Boundary"
194
+ body = b""
195
+ for i, (name, img_bytes) in enumerate(images):
196
+ body += b"--" + boundary + b"\r\n"
197
+ body += f'Content-Disposition: form-data; name="{name}"; filename="image_{i}.jpg"\r\n'.encode("utf-8")
198
+ body += b"Content-Type: image/jpeg\r\n\r\n"
199
+ body += img_bytes + b"\r\n"
200
+ body += b"--" + boundary + b"--\r\n"
201
+
202
+ return body, boundary
203
+
204
+
205
+ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> Tuple[bytes, bytes]:
191
206
  """
192
207
  Preprocess a list of images for embedding using a sagemaker model.
193
208
 
@@ -210,17 +225,22 @@ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List
210
225
  image = image.convert("RGB")
211
226
  buffered = io.BytesIO()
212
227
  image.save(buffered, format="JPEG")
213
- encoded_image = buffered.getvalue()
214
- encoded_images.append(encoded_image)
215
- return encoded_images
228
+ encoded_images.append(("image_data", buffered.getvalue()))
229
+
230
+ body, boundary = prepare_multipart_request(encoded_images)
231
+ return body, boundary
216
232
 
217
233
 
218
- def sagemaker_image_request(image: Union[str, bytes, "PIL.Image.Image"], sagemaker_endpoint: str, region_name: str):
219
- preprocessed_image = preprocess_image([image])
234
+ def sagemaker_image_request(
235
+ images: List[Union[str, bytes, "PIL.Image.Image"]], sagemaker_endpoint: str, region_name: str
236
+ ):
237
+ body, boundary = preprocess_image(images)
220
238
 
221
239
  client = boto3.client("sagemaker-runtime", region_name=region_name)
222
240
  response = client.invoke_endpoint(
223
- EndpointName=sagemaker_endpoint, Body=preprocessed_image[0], ContentType="image/jpeg"
241
+ EndpointName=sagemaker_endpoint,
242
+ Body=body,
243
+ ContentType=f'multipart/form-data; boundary={boundary.decode("utf-8")}',
224
244
  )
225
245
 
226
246
  return parse_sagemaker_response(response)
@@ -230,21 +250,18 @@ def embed_image(
230
250
  images: List[Union[str, "PIL.Image.Image", bytes]],
231
251
  sagemaker_endpoint: str,
232
252
  region_name: str,
233
- model_name="nomic-embed-vision-v1",
253
+ model_name="nomic-embed-vision-v1.5",
254
+ batch_size=16,
234
255
  ) -> dict:
235
256
  embeddings = []
236
257
 
237
- max_workers = mp.cpu_count()
238
258
  pbar = tqdm(total=len(images))
239
- with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
240
- futures = []
241
- for image in images:
242
- future = executor.submit(sagemaker_image_request, image, sagemaker_endpoint, region_name)
243
- future.add_done_callback(lambda p: pbar.update())
244
- futures.append(future)
245
-
246
- for future in concurrent.futures.as_completed(futures):
247
- embeddings.extend(future.result())
259
+ for i in range(0, len(images), batch_size):
260
+ batch = images[i : i + batch_size]
261
+ embeddings.extend(
262
+ sagemaker_image_request(batch, sagemaker_endpoint=sagemaker_endpoint, region_name=region_name)
263
+ )
264
+ pbar.update(len(batch))
248
265
 
249
266
  return {
250
267
  "embeddings": embeddings,
@@ -260,7 +277,7 @@ def batch_transform_image(
260
277
  arn: Optional[str] = None,
261
278
  role: Optional[str] = None,
262
279
  max_payload: Optional[int] = 6,
263
- instance_type: str = "ml.p3.2xlarge",
280
+ instance_type: str = "ml.g4dn.xlarge",
264
281
  n_instances: int = 1,
265
282
  wait: bool = True,
266
283
  logs: bool = True,
@@ -1089,6 +1089,11 @@ class AtlasDataset(AtlasClass):
1089
1089
  if modality is None:
1090
1090
  modality = self.meta["modality"]
1091
1091
 
1092
+ if modality == "image":
1093
+ indexed_field = "_blob_hash"
1094
+ if indexed_field is not None:
1095
+ logger.warning("Ignoring indexed_field for image datasets. Only _blob_hash is supported.")
1096
+
1092
1097
  colorable_fields = []
1093
1098
 
1094
1099
  for field in self.dataset_fields:
@@ -1155,11 +1160,6 @@ class AtlasDataset(AtlasClass):
1155
1160
  if indexed_field is None and modality == "text":
1156
1161
  raise Exception("You did not specify a field to index. Specify an 'indexed_field'.")
1157
1162
 
1158
- if modality == "image":
1159
- indexed_field = "_blob_hash"
1160
- if indexed_field is not None:
1161
- logger.warning("Ignoring indexed_field for image datasets. Only _blob_hash is supported.")
1162
-
1163
1163
  if indexed_field not in self.dataset_fields:
1164
1164
  raise Exception(f"Indexing on {indexed_field} not allowed. Valid options are: {self.dataset_fields}")
1165
1165
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nomic
3
- Version: 3.0.41
3
+ Version: 3.0.43
4
4
  Summary: The official Nomic python client.
5
5
  Home-page: https://github.com/nomic-ai/nomic
6
6
  Author: nomic.ai
@@ -8,7 +8,7 @@ description = "The official Nomic python client."
8
8
 
9
9
  setup(
10
10
  name="nomic",
11
- version="3.0.41",
11
+ version="3.0.43",
12
12
  url="https://github.com/nomic-ai/nomic",
13
13
  description=description,
14
14
  long_description=description,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes