PyPI - nomic - Versions diffs - 3.0.40__tar.gz → 3.0.42__tar.gz - Mend

nomic 3.0.40.tar.gz → 3.0.42.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nomic might be problematic. Click here for more details.

Files changed (24) hide show

{nomic-3.0.40 → nomic-3.0.42}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nomic
-Version: 3.0.40
+Version: 3.0.42
 Summary: The official Nomic python client.
 Home-page: https://github.com/nomic-ai/nomic
 Author: nomic.ai

{nomic-3.0.40 → nomic-3.0.42}/nomic/atlas.py RENAMED Viewed

@@ -38,7 +38,7 @@ def map_data(
     Args:
         data: An ordered collection of the datapoints you are structuring. Can be a list of dictionaries, Pandas Dataframe or PyArrow Table.
-        blobs: A list of image paths, bytes, or PIL images to add to your image dataset.
+        blobs: A list of image paths, bytes, or PIL images to add to your image dataset that are stored locally.
         embeddings: An [N,d] numpy array containing the N embeddings to add.
         identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
         description: The description of your dataset

{nomic-3.0.40 → nomic-3.0.42}/nomic/aws/sagemaker.py RENAMED Viewed

@@ -4,7 +4,7 @@ import json
 import logging
 import multiprocessing as mp
 from pathlib import PosixPath
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 import boto3
 import PIL
@@ -187,7 +187,22 @@ def embed_text(
     }
-def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List[bytes]:
+# only way I could get sagemaker with multipart to work
+def prepare_multipart_request(images: List[Tuple[str, bytes]]) -> Tuple[bytes, bytes]:
+    # Prepare the multipart body
+    boundary = b"---------------------------Boundary"
+    body = b""
+    for i, (name, img_bytes) in enumerate(images):
+        body += b"--" + boundary + b"\r\n"
+        body += f'Content-Disposition: form-data; name="{name}"; filename="image_{i}.jpg"\r\n'.encode("utf-8")
+        body += b"Content-Type: image/jpeg\r\n\r\n"
+        body += img_bytes + b"\r\n"
+    body += b"--" + boundary + b"--\r\n"
+    return body, boundary
+def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> Tuple[bytes, bytes]:
     """
     Preprocess a list of images for embedding using a sagemaker model.
@@ -210,17 +225,22 @@ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List
         image = image.convert("RGB")
         buffered = io.BytesIO()
         image.save(buffered, format="JPEG")
-        encoded_image = buffered.getvalue()
-        encoded_images.append(encoded_image)
-    return encoded_images
+        encoded_images.append(("image_data", buffered.getvalue()))
+    body, boundary = prepare_multipart_request(encoded_images)
+    return body, boundary
-def sagemaker_image_request(image: Union[str, bytes, "PIL.Image.Image"], sagemaker_endpoint: str, region_name: str):
-    preprocessed_image = preprocess_image([image])
+def sagemaker_image_request(
+    images: List[Union[str, bytes, "PIL.Image.Image"]], sagemaker_endpoint: str, region_name: str
+):
+    body, boundary = preprocess_image(images)
     client = boto3.client("sagemaker-runtime", region_name=region_name)
     response = client.invoke_endpoint(
-        EndpointName=sagemaker_endpoint, Body=preprocessed_image[0], ContentType="image/jpeg"
+        EndpointName=sagemaker_endpoint,
+        Body=body,
+        ContentType=f'multipart/form-data; boundary={boundary.decode("utf-8")}',
     )
     return parse_sagemaker_response(response)
@@ -230,21 +250,18 @@ def embed_image(
     images: List[Union[str, "PIL.Image.Image", bytes]],
     sagemaker_endpoint: str,
     region_name: str,
-    model_name="nomic-embed-vision-v1",
+    model_name="nomic-embed-vision-v1.5",
+    batch_size=16,
 ) -> dict:
     embeddings = []
-    max_workers = mp.cpu_count()
     pbar = tqdm(total=len(images))
-    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
-        futures = []
-        for image in images:
-            future = executor.submit(sagemaker_image_request, image, sagemaker_endpoint, region_name)
-            future.add_done_callback(lambda p: pbar.update())
-            futures.append(future)
-        for future in concurrent.futures.as_completed(futures):
-            embeddings.extend(future.result())
+    for i in range(0, len(images), batch_size):
+        batch = images[i : i + batch_size]
+        embeddings.extend(
+            sagemaker_image_request(batch, sagemaker_endpoint=sagemaker_endpoint, region_name=region_name)
+        )
+        pbar.update(len(batch))
     return {
         "embeddings": embeddings,
@@ -260,7 +277,7 @@ def batch_transform_image(
     arn: Optional[str] = None,
     role: Optional[str] = None,
     max_payload: Optional[int] = 6,
-    instance_type: str = "ml.p3.2xlarge",
+    instance_type: str = "ml.g4dn.xlarge",
     n_instances: int = 1,
     wait: bool = True,
     logs: bool = True,

{nomic-3.0.40 → nomic-3.0.42}/nomic/dataset.py RENAMED Viewed

@@ -1356,6 +1356,7 @@ class AtlasDataset(AtlasClass):
         Args:
             data: A pandas DataFrame, list of dictionaries, or pyarrow Table matching the dataset schema.
             embeddings: A numpy array of embeddings: each row corresponds to a row in the table. Use if you already have embeddings for your datapoints.
+            blobs: A list of image paths, bytes, or PIL Images. Use if you want to create an AtlasDataset using image embeddings over your images. Note: Blobs are stored locally only.
             pbar: (Optional). A tqdm progress bar to update.
         """
         if embeddings is not None:
@@ -1374,6 +1375,7 @@ class AtlasDataset(AtlasClass):
         """
         Add data, with associated blobs, to the dataset.
         Uploads blobs to the server and associates them with the data.
+        Blobs must reference objects stored locally
         """
         if isinstance(data, DataFrame):
             data = pa.Table.from_pandas(data)

{nomic-3.0.40 → nomic-3.0.42}/nomic.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nomic
-Version: 3.0.40
+Version: 3.0.42
 Summary: The official Nomic python client.
 Home-page: https://github.com/nomic-ai/nomic
 Author: nomic.ai

{nomic-3.0.40 → nomic-3.0.42}/setup.py RENAMED Viewed

@@ -8,7 +8,7 @@ description = "The official Nomic python client."
 setup(
     name="nomic",
-    version="3.0.40",
+    version="3.0.42",
     url="https://github.com/nomic-ai/nomic",
     description=description,
     long_description=description,