nomic 3.3.3.tar.gz → 3.4.0.tar.gz
- {nomic-3.3.3 → nomic-3.4.0}/PKG-INFO +1 -1
- {nomic-3.3.3 → nomic-3.4.0}/nomic/atlas.py +5 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/dataset.py +46 -11
- {nomic-3.3.3 → nomic-3.4.0}/nomic/embed.py +22 -11
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.3.3 → nomic-3.4.0}/setup.py +1 -1
- {nomic-3.3.3 → nomic-3.4.0}/README.md +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/__init__.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/aws/__init__.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/cli.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/data_inference.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/data_operations.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/settings.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/utils.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/pyproject.toml +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/setup.cfg +0 -0
nomic/atlas.py

@@ -44,6 +44,7 @@ def map_data(
         description: The description of your dataset
         id_field: Specify your data unique id field. This field can be up to 36 characters in length. If not specified, one will be created for you named `id_`.
         is_public: Should the dataset be accessible outside your Nomic Atlas organization.
+        indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
         projection: Options to adjust Nomic Project - the dimensionality algorithm organizing your dataset.
         topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
         duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
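For context, a minimal sketch of how the new `indexed_field` argument is meant to be used, assuming the 3.4.0 `map_data` signature shown in this diff. The dataset name and records are made-up placeholders, and an Atlas API key is assumed to be configured (e.g. via `nomic login`).

```python
# Hypothetical usage of map_data() with the indexed_field documented above.
from nomic import atlas

records = [
    {"id_": "0", "title": "First note", "text": "Ducks migrate south in autumn."},
    {"id_": "1", "title": "Second note", "text": "Geese fly in V formations."},
]

dataset = atlas.map_data(
    data=records,
    indexed_field="text",  # embeddings (and the map layout) come from this column
    identifier="example-waterfowl-notes",
    description="Toy dataset illustrating indexed_field",
)
```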
@@ -62,6 +63,10 @@ def map_data(
     modality = "text"

     if blobs is not None:
+        if embeddings is not None:
+            raise ValueError(
+                "You cannot pass both `blobs` and `embeddings` to map_data(). To create a map of images, include `blobs` and not `embeddings`. To create a map of embeddings with images as metadata, include your images as a field in your `data` parameter."
+            )
         # change this when we support other modalities
         modality = "image"
         indexed_field = "_blob_hash"
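To illustrate the new guard, a small sketch with placeholder paths and arrays, showing the call that now fails fast and the two supported alternatives:

```python
# The combination rejected by the new check, plus the two supported patterns.
import numpy as np
from nomic import atlas

images = ["photos/cat.jpg", "photos/dog.jpg"]  # placeholder image paths
vectors = np.random.rand(2, 768)               # placeholder embeddings

try:
    atlas.map_data(blobs=images, embeddings=vectors, data=[{"id_": "0"}, {"id_": "1"}])
except ValueError as err:
    print(err)  # raised by the guard added in 3.4.0

# Supported: a map of images...
# atlas.map_data(blobs=images, data=...)
# ...or a map of embeddings with the images attached as metadata:
# atlas.map_data(embeddings=vectors, data=[{"img": p} for p in images])
```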
nomic/dataset.py

@@ -1,10 +1,13 @@
 import base64
 import concurrent
 import concurrent.futures
+import importlib.metadata
 import io
 import json
 import os
+import re
 import time
+import unicodedata
 from contextlib import contextmanager
 from datetime import datetime
 from io import BytesIO
@@ -72,13 +75,20 @@ class AtlasClass(object):
         token = self.credentials["token"]
         self.token = token

-        self.header = {"Authorization": f"Bearer {token}"}
+        try:
+            version = importlib.metadata.version("nomic")
+        except Exception:
+            version = "unknown"
+
+        self.header = {"Authorization": f"Bearer {token}", "User-Agent": f"py-nomic/{version}"}

         if self.token:
             response = requests.get(
                 self.atlas_api_path + "/v1/user",
                 headers=self.header,
             )
+            if "X-AtlasWarning" in response.headers:
+                logger.warning(response.headers["X-AtlasWarning"])
             response = validate_api_http_response(response)
             if not response.status_code == 200:
                 logger.warning(str(response))
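The header change is self-contained enough to demonstrate standalone. A sketch (the helper name is ours), assuming only that the `nomic` package is installed; the fallback covers the case where package metadata is unavailable, e.g. a source checkout:

```python
# Rebuilds the 3.4.0 header logic outside the client for illustration.
import importlib.metadata

def build_headers(token: str) -> dict:
    try:
        version = importlib.metadata.version("nomic")
    except Exception:  # PackageNotFoundError or anything else unexpected
        version = "unknown"
    return {"Authorization": f"Bearer {token}", "User-Agent": f"py-nomic/{version}"}

print(build_headers("demo-token")["User-Agent"])  # e.g. "py-nomic/3.4.0"
```

Every request now identifies the client version, which pairs with the `X-AtlasWarning` response header the client logs above.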
@@ -664,14 +674,22 @@ class AtlasProjection:
         sidecar_suffix = "feather"
         if sidecar_name != "":
             sidecar_suffix = f"{sidecar_name}.feather"
-
-
-
-        self.
-
-
-
-
+        with concurrent.futures.ThreadPoolExecutor(4) as ex:
+            futures = []
+            for key in tqdm(self._manifest["key"].to_pylist()):
+                sidecar_path = self.tile_destination / f"{key}.{sidecar_suffix}"
+                sidecar_url = (
+                    self.dataset.atlas_api_path
+                    + f"/v1/project/{self.dataset.id}/index/projection/{self.id}/quadtree/{key}.{sidecar_suffix}"
+                )
+                futures.append(
+                    ex.submit(
+                        download_feather, sidecar_url, sidecar_path, headers=self.dataset.header, overwrite=overwrite
+                    )
+                )
+                downloaded_files.append(sidecar_path)
+            for f in futures:
+                f.result()
         return downloaded_files

     @property
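The shape of that change, reduced to a generic sketch: submit every download to a small thread pool, collect the futures, then block on each so any exception is re-raised. `fetch` and the URL list are placeholders for `download_feather` and the quadtree keys in the real code.

```python
import concurrent.futures
from pathlib import Path

def fetch(url: str, dest: Path) -> Path:
    # Placeholder for download_feather(url, dest, headers=..., overwrite=...).
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text(url)  # stand-in for writing the downloaded bytes
    return dest

urls = {f"https://example.invalid/tiles/{i}.feather": Path(f"tiles/{i}.feather") for i in range(8)}

with concurrent.futures.ThreadPoolExecutor(4) as ex:
    futures = [ex.submit(fetch, u, p) for u, p in urls.items()]
    for f in futures:
        f.result()  # re-raises the first download error, if any
```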
@@ -753,6 +771,15 @@ class AtlasDataset(AtlasClass):
         * **dataset_id** - An alternative way to load a dataset is by passing the dataset_id directly. This only works if a dataset exists.
         """
         assert identifier is not None or dataset_id is not None, "You must pass a dataset identifier"
+        # Normalize identifier.
+        if identifier is not None:
+            s = identifier.split("/", 1)
+            identifier = unicodedata.normalize("NFD", s[-1])  # normalize accents
+            identifier = identifier.lower().replace(" ", "-").replace("_", "-")
+            identifier = re.sub(r"[^a-z0-9-]", "", identifier)
+            identifier = re.sub(r"-+", "-", identifier)
+            if len(s) == 2:
+                identifier = f"{s[0]}/{identifier}"

         super().__init__()
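The normalization is pure string manipulation, so it can be lifted out and tested directly. A standalone copy (the function name is ours): accents are decomposed by NFD and then stripped by the character filter, spaces and underscores become hyphens, hyphen runs collapse, and an `org/` prefix passes through untouched.

```python
import re
import unicodedata

def normalize_identifier(identifier: str) -> str:
    s = identifier.split("/", 1)
    name = unicodedata.normalize("NFD", s[-1])  # decompose accented characters
    name = name.lower().replace(" ", "-").replace("_", "-")
    name = re.sub(r"[^a-z0-9-]", "", name)      # this also drops the combining accents
    name = re.sub(r"-+", "-", name)
    return f"{s[0]}/{name}" if len(s) == 2 else name

print(normalize_identifier("My Dataset_2024"))      # my-dataset-2024
print(normalize_identifier("Org/Café -- Données"))  # Org/cafe-donnees
```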
@@ -761,6 +788,8 @@ class AtlasDataset(AtlasClass):
             f"Passing organization_name has been removed in Nomic Python client 3.0. Instead identify your dataset with `organization_name/project_name` (e.g. sterling-cooper/november-ads)."
         )

+        # Set this before possible early return.
+        self._schema = None
         if dataset_id is not None:
             self.meta = self._get_project_by_id(dataset_id)
             return
@@ -793,7 +822,6 @@ class AtlasDataset(AtlasClass):
         )

         self.meta = self._get_project_by_id(project_id=dataset_id)
-        self._schema = None

     def delete(self):
         """
@@ -1074,6 +1102,7 @@ class AtlasDataset(AtlasClass):
         else:
             projection = NomicProjectOptions()

+        topic_model_was_false = topic_model is False
         if isinstance(topic_model, Dict):
             topic_model = NomicTopicOptions(**topic_model)
         elif isinstance(topic_model, NomicTopicOptions):
@@ -1117,7 +1146,7 @@ class AtlasDataset(AtlasClass):

         build_template = {}
         if modality == "embedding":
-            if topic_model.topic_label_field is None:
+            if (not topic_model_was_false) and topic_model.topic_label_field is None:
                 logger.warning(
                     "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
                 )
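Together with the `topic_model_was_false` flag above, this means explicitly disabling topic modeling no longer produces the misleading warning. A usage sketch; the dataset name and embeddings are placeholders, and `create_index` is assumed to be the method containing this hunk:

```python
import numpy as np
from nomic import AtlasDataset

dataset = AtlasDataset("example-embedding-map")  # hypothetical dataset
dataset.add_data(
    embeddings=np.random.rand(100, 256),
    data=[{"id_": str(i)} for i in range(100)],
)

dataset.create_index(topic_model=False)  # 3.4.0: silent, topics intentionally off
# dataset.create_index()  # would still warn unless topic_label_field is set
```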
@@ -1620,6 +1649,7 @@ class AtlasDataset(AtlasClass):
         close_pbar = True
         pbar = tqdm(total=int(len(data)) // shard_size)
         failed = 0
+        failed_reqs = 0
         succeeded = 0
         errors_504 = 0
         with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
@@ -1668,6 +1698,11 @@ class AtlasDataset(AtlasClass):
                     failed += shard_size
                     pbar.update(1)
                     response.close()
+                    failed_reqs += 1
+                    if failed_reqs > 10:
+                        raise RuntimeError(
+                            f"{self.identifier}: Too many upload requests have failed at this time. Please try again later."
+                        )
                 else:
                     # A successful upload.
                     succeeded += shard_size
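Previously a persistent outage meant every remaining shard was attempted, and failed, one by one; now the upload aborts once more than ten requests have failed. The policy in isolation, with `send_shard` as a stand-in for the real HTTP request:

```python
import random

def send_shard(shard: list) -> bool:
    # Stand-in for the real upload request; simulates a 5% failure rate.
    return random.random() > 0.05

failed_reqs = 0
for shard in ([i] * 64 for i in range(1000)):  # placeholder shards
    if not send_shard(shard):
        failed_reqs += 1
        if failed_reqs > 10:
            raise RuntimeError("Too many upload requests have failed at this time. Please try again later.")
```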
nomic/embed.py

@@ -12,6 +12,7 @@ from urllib.parse import urlparse
 import PIL
 import PIL.Image
 import requests
+from tqdm import tqdm

 from .dataset import AtlasClass
 from .settings import *
@@ -286,18 +287,28 @@ def _text_embed4all(
         limits = {"cpu": 16, "kompute": 32, "metal": 1024}
         return n_tokens > limits[backend]

-
-
-
-
-
-
-
-
-
-
+    pb = tqdm(total=len(texts), desc="Embedding texts", unit="inputs")
+    output_embeddings = []
+    ntok = 0
+    batch_size = 64
+    for start in range(0, len(texts), batch_size):
+        end = min(len(texts), start + batch_size)
+        b = end - start
+        out = _embed4all.embed(
+            texts[start:end],
+            prefix=task_type,
+            dimensionality=dimensionality,
+            long_text_mode=long_text_mode,
+            return_dict=True,
+            atlas=True,
+            cancel_cb=cancel_cb if dynamic_mode else None,
+        )
+        ntok += out["n_prompt_tokens"]
+        output_embeddings.extend(out["embeddings"])
+        pb.update(b)
+    pb.close()
     usage = {"prompt_tokens": ntok, "total_tokens": ntok}
-    return {"embeddings":
+    return {"embeddings": output_embeddings, "usage": usage, "model": model, "inference_mode": "local"}


 def free_embedding_model() -> None:
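From the caller's side nothing changes except the progress bar: local inference now embeds in batches of 64 and reports progress per batch. A usage sketch, assuming the model name and `inference_mode` values documented by Nomic:

```python
from nomic import embed

texts = [f"document number {i}" for i in range(200)]

out = embed.text(texts, model="nomic-embed-text-v1.5", inference_mode="local")
print(len(out["embeddings"]), out["usage"])  # 200 {'prompt_tokens': ..., 'total_tokens': ...}
```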