nomic 3.0.40__tar.gz → 3.0.42__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nomic might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nomic
3
- Version: 3.0.40
3
+ Version: 3.0.42
4
4
  Summary: The official Nomic python client.
5
5
  Home-page: https://github.com/nomic-ai/nomic
6
6
  Author: nomic.ai
@@ -38,7 +38,7 @@ def map_data(
38
38
 
39
39
  Args:
40
40
  data: An ordered collection of the datapoints you are structuring. Can be a list of dictionaries, Pandas Dataframe or PyArrow Table.
41
- blobs: A list of image paths, bytes, or PIL images to add to your image dataset.
41
+ blobs: A list of image paths, bytes, or PIL images to add to your image dataset that are stored locally.
42
42
  embeddings: An [N,d] numpy array containing the N embeddings to add.
43
43
  identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
44
44
  description: The description of your dataset
@@ -4,7 +4,7 @@ import json
4
4
  import logging
5
5
  import multiprocessing as mp
6
6
  from pathlib import PosixPath
7
- from typing import List, Optional, Union
7
+ from typing import List, Optional, Tuple, Union
8
8
 
9
9
  import boto3
10
10
  import PIL
@@ -187,7 +187,22 @@ def embed_text(
187
187
  }
188
188
 
189
189
 
190
- def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List[bytes]:
190
+ # only way I could get sagemaker with multipart to work
191
+ def prepare_multipart_request(images: List[Tuple[str, bytes]]) -> Tuple[bytes, bytes]:
192
+ # Prepare the multipart body
193
+ boundary = b"---------------------------Boundary"
194
+ body = b""
195
+ for i, (name, img_bytes) in enumerate(images):
196
+ body += b"--" + boundary + b"\r\n"
197
+ body += f'Content-Disposition: form-data; name="{name}"; filename="image_{i}.jpg"\r\n'.encode("utf-8")
198
+ body += b"Content-Type: image/jpeg\r\n\r\n"
199
+ body += img_bytes + b"\r\n"
200
+ body += b"--" + boundary + b"--\r\n"
201
+
202
+ return body, boundary
203
+
204
+
205
+ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> Tuple[bytes, bytes]:
191
206
  """
192
207
  Preprocess a list of images for embedding using a sagemaker model.
193
208
 
@@ -210,17 +225,22 @@ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List
210
225
  image = image.convert("RGB")
211
226
  buffered = io.BytesIO()
212
227
  image.save(buffered, format="JPEG")
213
- encoded_image = buffered.getvalue()
214
- encoded_images.append(encoded_image)
215
- return encoded_images
228
+ encoded_images.append(("image_data", buffered.getvalue()))
229
+
230
+ body, boundary = prepare_multipart_request(encoded_images)
231
+ return body, boundary
216
232
 
217
233
 
218
- def sagemaker_image_request(image: Union[str, bytes, "PIL.Image.Image"], sagemaker_endpoint: str, region_name: str):
219
- preprocessed_image = preprocess_image([image])
234
+ def sagemaker_image_request(
235
+ images: List[Union[str, bytes, "PIL.Image.Image"]], sagemaker_endpoint: str, region_name: str
236
+ ):
237
+ body, boundary = preprocess_image(images)
220
238
 
221
239
  client = boto3.client("sagemaker-runtime", region_name=region_name)
222
240
  response = client.invoke_endpoint(
223
- EndpointName=sagemaker_endpoint, Body=preprocessed_image[0], ContentType="image/jpeg"
241
+ EndpointName=sagemaker_endpoint,
242
+ Body=body,
243
+ ContentType=f'multipart/form-data; boundary={boundary.decode("utf-8")}',
224
244
  )
225
245
 
226
246
  return parse_sagemaker_response(response)
@@ -230,21 +250,18 @@ def embed_image(
230
250
  images: List[Union[str, "PIL.Image.Image", bytes]],
231
251
  sagemaker_endpoint: str,
232
252
  region_name: str,
233
- model_name="nomic-embed-vision-v1",
253
+ model_name="nomic-embed-vision-v1.5",
254
+ batch_size=16,
234
255
  ) -> dict:
235
256
  embeddings = []
236
257
 
237
- max_workers = mp.cpu_count()
238
258
  pbar = tqdm(total=len(images))
239
- with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
240
- futures = []
241
- for image in images:
242
- future = executor.submit(sagemaker_image_request, image, sagemaker_endpoint, region_name)
243
- future.add_done_callback(lambda p: pbar.update())
244
- futures.append(future)
245
-
246
- for future in concurrent.futures.as_completed(futures):
247
- embeddings.extend(future.result())
259
+ for i in range(0, len(images), batch_size):
260
+ batch = images[i : i + batch_size]
261
+ embeddings.extend(
262
+ sagemaker_image_request(batch, sagemaker_endpoint=sagemaker_endpoint, region_name=region_name)
263
+ )
264
+ pbar.update(len(batch))
248
265
 
249
266
  return {
250
267
  "embeddings": embeddings,
@@ -260,7 +277,7 @@ def batch_transform_image(
260
277
  arn: Optional[str] = None,
261
278
  role: Optional[str] = None,
262
279
  max_payload: Optional[int] = 6,
263
- instance_type: str = "ml.p3.2xlarge",
280
+ instance_type: str = "ml.g4dn.xlarge",
264
281
  n_instances: int = 1,
265
282
  wait: bool = True,
266
283
  logs: bool = True,
@@ -1356,6 +1356,7 @@ class AtlasDataset(AtlasClass):
1356
1356
  Args:
1357
1357
  data: A pandas DataFrame, list of dictionaries, or pyarrow Table matching the dataset schema.
1358
1358
  embeddings: A numpy array of embeddings: each row corresponds to a row in the table. Use if you already have embeddings for your datapoints.
1359
+ blobs: A list of image paths, bytes, or PIL Images. Use if you want to create an AtlasDataset using image embeddings over your images. Note: Blobs are stored locally only.
1359
1360
  pbar: (Optional). A tqdm progress bar to update.
1360
1361
  """
1361
1362
  if embeddings is not None:
@@ -1374,6 +1375,7 @@ class AtlasDataset(AtlasClass):
1374
1375
  """
1375
1376
  Add data, with associated blobs, to the dataset.
1376
1377
  Uploads blobs to the server and associates them with the data.
1378
+ Blobs must reference objects stored locally
1377
1379
  """
1378
1380
  if isinstance(data, DataFrame):
1379
1381
  data = pa.Table.from_pandas(data)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nomic
3
- Version: 3.0.40
3
+ Version: 3.0.42
4
4
  Summary: The official Nomic python client.
5
5
  Home-page: https://github.com/nomic-ai/nomic
6
6
  Author: nomic.ai
@@ -8,7 +8,7 @@ description = "The official Nomic python client."
8
8
 
9
9
  setup(
10
10
  name="nomic",
11
- version="3.0.40",
11
+ version="3.0.42",
12
12
  url="https://github.com/nomic-ai/nomic",
13
13
  description=description,
14
14
  long_description=description,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes