nomic-3.5.0.tar.gz → nomic-3.5.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nomic-3.5.0 → nomic-3.5.2}/PKG-INFO +1 -1
- {nomic-3.5.0 → nomic-3.5.2}/nomic/atlas.py +3 -39
- {nomic-3.5.0 → nomic-3.5.2}/nomic/data_operations.py +337 -87
- {nomic-3.5.0 → nomic-3.5.2}/nomic/dataset.py +138 -156
- {nomic-3.5.0 → nomic-3.5.2}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.5.0 → nomic-3.5.2}/setup.py +1 -1
- {nomic-3.5.0 → nomic-3.5.2}/README.md +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/__init__.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/aws/__init__.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/cli.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/data_inference.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/embed.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/settings.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic/utils.py +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/pyproject.toml +0 -0
- {nomic-3.5.0 → nomic-3.5.2}/setup.cfg +0 -0
{nomic-3.5.0 → nomic-3.5.2}/nomic/atlas.py
@@ -3,7 +3,6 @@ This class allows for programmatic interactions with Atlas - Nomic's neural data
 or in a Jupyter Notebook to organize and interact with your unstructured data.
 """
 
-import uuid
 from typing import Dict, Iterable, List, Optional, Union
 
 import numpy as np
@@ -42,7 +41,7 @@ def map_data(
 embeddings: An [N,d] numpy array containing the N embeddings to add.
 identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
 description: The description of your dataset
-id_field: Specify
+id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
 is_public: Should the dataset be accessible outside your Nomic Atlas organization.
 indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
 projection: Options for configuring the 2D projection algorithm.
@@ -86,9 +85,6 @@ def map_data(
 # default to vision v1.5
 embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")
 
-if id_field is None:
-    id_field = ATLAS_DEFAULT_ID_FIELD
-
 project_name = get_random_name()
 
 dataset_name = project_name
@@ -100,38 +96,6 @@ def map_data(
 if description:
     description = description
 
-# no metadata was specified
-added_id_field = False
-
-if data is None:
-    added_id_field = True
-    if embeddings is not None:
-        data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
-    elif blobs is not None:
-        data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(blobs))]
-    else:
-        raise ValueError("You must specify either data, embeddings, or blobs")
-
-if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
-    if isinstance(data, list) and id_field not in data[0]:
-        added_id_field = True
-        for i in range(len(data)):
-            # do not modify object the user passed in - also ensures IDs are unique if two input datums are the same *object*
-            data[i] = data[i].copy()
-            data[i][id_field] = b64int(i)
-    elif isinstance(data, DataFrame) and id_field not in data.columns:
-        data[id_field] = [b64int(i) for i in range(data.shape[0])]
-        added_id_field = True
-    elif isinstance(data, pa.Table) and not id_field in data.column_names:  # type: ignore
-        ids = pa.array([b64int(i) for i in range(len(data))])
-        data = data.append_column(id_field, ids)  # type: ignore
-        added_id_field = True
-    elif id_field not in data[0]:
-        raise ValueError("map_data data must be a list of dicts, a pandas dataframe, or a pyarrow table")
-
-if added_id_field:
-    logger.warning("An ID field was not specified in your data so one was generated for you in insertion order.")
-
 dataset = AtlasDataset(
     identifier=dataset_name, description=description, unique_id_field=id_field, is_public=is_public
 )
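A minimal usage sketch (not part of the diff) of what the change above means for callers: the client no longer injects a default ID column, so `id_field` stays optional, and when one is supplied its values must be unique and at most 36 characters long (enforced during upload validation in dataset.py below). The dataset identifier and field names here are hypothetical.

    from nomic import atlas

    documents = [{"doc_id": f"doc-{i}", "text": f"example document {i}"} for i in range(100)]

    dataset = atlas.map_data(
        data=documents,
        indexed_field="text",   # text field used to build embeddings
        id_field="doc_id",      # optional; omit to let Atlas manage row identity
        identifier="example-mapped-docs",
        is_public=False,
    )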
@@ -202,7 +166,7 @@ def map_embeddings(
 Args:
     embeddings: An [N,d] numpy array containing the batch of N embeddings to add.
     data: An [N,] element list of dictionaries containing metadata for each embedding.
-    id_field: Specify
+    id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
     name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
     description: A description for your map.
     is_public: Should this embedding map be public? Private maps can only be accessed by members of your organization.
@@ -250,7 +214,7 @@ def map_text(
 Args:
     data: An [N,] element iterable of dictionaries containing metadata for each embedding.
     indexed_field: The name the data field containing the text your want to map.
-    id_field: Specify
+    id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
     name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
     description: A description for your map.
     build_topic_model: Builds a hierarchical topic model over your data to discover patterns.
{nomic-3.5.0 → nomic-3.5.2}/nomic/data_operations.py
@@ -25,7 +25,6 @@ class AtlasMapDuplicates:
 
 def __init__(self, projection: "AtlasProjection"):  # type: ignore
     self.projection = projection
-    self.id_field = self.projection.dataset.id_field
 
     duplicate_columns = [
         (field, sidecar)
@@ -40,7 +39,13 @@ class AtlasMapDuplicates:
 
     self._duplicate_column = duplicate_columns[0]
     self._cluster_column = cluster_columns[0]
+    self.duplicate_field = None
+    self.cluster_field = None
     self._tb = None
+    self._has_unique_id_field = (
+        "unique_id_field" in self.projection.dataset.meta
+        and self.projection.dataset.meta["unique_id_field"] is not None
+    )
 
 def _load_duplicates(self):
     """
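The `_has_unique_id_field` flag added above is just a metadata check; a standalone sketch of the same test:

    def has_unique_id_field(meta: dict) -> bool:
        # A dataset only carries real datum IDs when its metadata names a unique_id_field.
        return "unique_id_field" in meta and meta["unique_id_field"] is not None

    assert has_unique_id_field({"unique_id_field": "doc_id"}) is True
    assert has_unique_id_field({"unique_id_field": None}) is False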
@@ -51,30 +56,80 @@ class AtlasMapDuplicates:
     self.duplicate_field = self._duplicate_column[0].lstrip("_")
     self.cluster_field = self._cluster_column[0].lstrip("_")
     logger.info("Loading duplicates")
+    id_field_name = "_position_index"
+    if self._has_unique_id_field:
+        id_field_name = self.projection.dataset.meta["unique_id_field"]
+
     for key in tqdm(self.projection._manifest["key"].to_pylist()):
-        # Use datum id as root table
-
-
-
-
+        # Use datum id as root table if available, otherwise create synthetic IDs
+        if self._has_unique_id_field:
+            try:
+                datum_id_path = self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather")
+                tb = feather.read_table(datum_id_path, memory_map=True)
+            except (FileNotFoundError, pa.ArrowInvalid):
+                # Create a synthetic ID table
+                tb = self._create_synthetic_id_table(key, duplicate_sidecar)
+        else:
+            # Create synthetic IDs when no unique_id_field is available
+            tb = self._create_synthetic_id_table(key, duplicate_sidecar)
 
+        path = self.projection.tile_destination
         if duplicate_sidecar == "":
            path = path / Path(key).with_suffix(".feather")
         else:
            path = path / Path(key).with_suffix(f".{duplicate_sidecar}.feather")
 
-
-
-
-
-
+        try:
+            duplicate_tb = feather.read_table(path, memory_map=True)
+            for field in (self._duplicate_column[0], self._cluster_column[0]):
+                tb = tb.append_column(field, duplicate_tb[field])
+            tbs.append(tb)
+        except (FileNotFoundError, pa.ArrowInvalid) as e:
+            logger.warning(f"Error loading duplicate data for key {key}: {e}")
+            continue
+
+    if not tbs:
+        raise ValueError("No duplicate data could be loaded. Duplicate data files may be missing or corrupt.")
+
+    self._tb = pa.concat_tables(tbs).rename_columns([id_field_name, self.duplicate_field, self.cluster_field])
+
+def _create_synthetic_id_table(self, key, sidecar):
+    """
+    Create a synthetic table with position indices when datum_id file isn't available
+    or when unique_id_field isn't specified.
+    """
+    # Try to determine the size of the table by loading the duplicate sidecar
+    path = self.projection.tile_destination
+    if sidecar == "":
+        path = path / Path(key).with_suffix(".feather")
+    else:
+        path = path / Path(key).with_suffix(f".{sidecar}.feather")
+
+    try:
+        sidecar_tb = feather.read_table(path, memory_map=True)
+        size = len(sidecar_tb)
+        # Create a table with position indices as IDs
+        position_indices = [f"pos_{i}" for i in range(size)]
+        return pa.Table.from_arrays([pa.array(position_indices)], names=["_position_index"])
+    except Exception as e:
+        logger.error(f"Failed to create synthetic IDs for {key}: {e}")
+        # Return an empty table as fallback
+        return pa.Table.from_arrays([pa.array([])], names=["_position_index"])
 
 def _download_duplicates(self):
     """
     Downloads the feather tree for duplicates.
     """
     logger.info("Downloading duplicates")
-
+
+    # Only download datum_id if we have a unique ID field
+    if self._has_unique_id_field:
+        try:
+            self.projection._download_sidecar("datum_id", overwrite=False)
+        except ValueError as e:
+            logger.warning(f"Failed to download datum_id files: {e}. Will use synthetic IDs instead.")
+            self._has_unique_id_field = False
+
     assert self._cluster_column[1] == self._duplicate_column[1], "Cluster and duplicate should be in same sidecar"
     self.projection._download_sidecar(self._duplicate_column[1], overwrite=False)
 
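The `_create_synthetic_id_table` helper introduced above derives row identities purely from positions in an existing sidecar file; a standalone sketch of the same idea (path handling simplified):

    import pyarrow as pa
    from pyarrow import feather

    def synthetic_id_table(sidecar_path: str) -> pa.Table:
        # Position-based IDs ("pos_0", "pos_1", ...) sized to the sidecar table.
        sidecar_tb = feather.read_table(sidecar_path, memory_map=True)
        position_indices = [f"pos_{i}" for i in range(len(sidecar_tb))]
        return pa.Table.from_arrays([pa.array(position_indices)], names=["_position_index"])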
@@ -100,17 +155,24 @@ class AtlasMapDuplicates:
 
 def deletion_candidates(self) -> List[str]:
     """
-
     Returns:
         The ids for all data points which are semantic duplicates and are candidates for being deleted from the dataset. If you remove these data points from your dataset, your dataset will be semantically deduplicated.
     """
-
+    id_field_name = "_position_index"
+    if self._has_unique_id_field:
+        id_field_name = self.projection.dataset.meta["unique_id_field"]
+
+    dupes = self.tb[id_field_name].filter(pa.compute.equal(self.tb[self.duplicate_field], "deletion candidate"))  # type: ignore
     return dupes.to_pylist()
 
 def __repr__(self) -> str:
+    id_field_name = "_position_index"
+    if self._has_unique_id_field:
+        id_field_name = self.projection.dataset.meta["unique_id_field"]
+
     repr = f"===Atlas Duplicates for ({self.projection})\n"
     duplicate_count = len(
-        self.tb[
+        self.tb[id_field_name].filter(pa.compute.equal(self.tb[self.duplicate_field], "deletion candidate"))  # type: ignore
     )
     cluster_count = len(self.tb[self.cluster_field].value_counts())
     repr += f"{duplicate_count} deletion candidates in {cluster_count} clusters\n"
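The filter used by `deletion_candidates()` above can be exercised on its own; a sketch with a small in-memory table (column names chosen for illustration):

    import pyarrow as pa
    import pyarrow.compute as pc

    tb = pa.table({
        "_position_index": ["pos_0", "pos_1", "pos_2"],
        "duplicate_class": ["singleton", "deletion candidate", "retention candidate"],
    })
    mask = pc.equal(tb["duplicate_class"], "deletion candidate")
    dupes = tb["_position_index"].filter(mask).to_pylist()
    # dupes == ["pos_1"]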
@@ -125,7 +187,6 @@ class AtlasMapTopics:
 def __init__(self, projection: "AtlasProjection"):  # type: ignore
     self.projection = projection
     self.dataset = projection.dataset
-    self.id_field = self.projection.dataset.id_field
     self._metadata = None
     self._hierarchy = None
     self._topic_columns = [
@@ -134,6 +195,9 @@ class AtlasMapTopics:
     assert len(self._topic_columns) > 0, "Topic modeling has not yet been run on this map."
     self.depth = len(self._topic_columns)
     self._tb = None
+    self._has_unique_id_field = (
+        "unique_id_field" in self.dataset.meta and self.dataset.meta["unique_id_field"] is not None
+    )
 
 def _load_topics(self):
     """
@@ -149,45 +213,96 @@ class AtlasMapTopics:
|
|
|
149
213
|
# Should just be one sidecar
|
|
150
214
|
topic_sidecar = set([sidecar for _, sidecar in self._topic_columns]).pop()
|
|
151
215
|
logger.info("Loading topics")
|
|
216
|
+
id_field_name = "_position_index"
|
|
217
|
+
if self._has_unique_id_field:
|
|
218
|
+
id_field_name = self.dataset.meta["unique_id_field"]
|
|
219
|
+
|
|
152
220
|
for key in tqdm(self.projection._manifest["key"].to_pylist()):
|
|
153
|
-
# Use datum id as root table
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
221
|
+
# Use datum id as root table if available, otherwise create a synthetic index
|
|
222
|
+
if self._has_unique_id_field:
|
|
223
|
+
try:
|
|
224
|
+
datum_id_path = self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather")
|
|
225
|
+
tb = feather.read_table(datum_id_path, memory_map=True)
|
|
226
|
+
except (FileNotFoundError, pa.ArrowInvalid):
|
|
227
|
+
# If datum_id file doesn't exist, create a table with a position index
|
|
228
|
+
logger.warning(f"Datum ID file not found for key {key}. Creating synthetic IDs.")
|
|
229
|
+
tb = self._create_synthetic_id_table(key, topic_sidecar)
|
|
230
|
+
else:
|
|
231
|
+
# Create a table with a position index when no unique_id_field is available
|
|
232
|
+
tb = self._create_synthetic_id_table(key, topic_sidecar)
|
|
233
|
+
|
|
157
234
|
path = self.projection.tile_destination
|
|
158
235
|
if topic_sidecar == "":
|
|
159
236
|
path = path / Path(key).with_suffix(".feather")
|
|
160
237
|
else:
|
|
161
238
|
path = path / Path(key).with_suffix(f".{topic_sidecar}.feather")
|
|
162
239
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
topic_ids_to_label
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
240
|
+
try:
|
|
241
|
+
topic_tb = feather.read_table(path, memory_map=True)
|
|
242
|
+
# Do this in depth order
|
|
243
|
+
for d in range(1, self.depth + 1):
|
|
244
|
+
column = f"_topic_depth_{d}"
|
|
245
|
+
if integer_topics:
|
|
246
|
+
column = f"_topic_depth_{d}_int"
|
|
247
|
+
topic_ids_to_label = topic_tb[column].to_pandas().rename("topic_id")
|
|
248
|
+
assert label_df is not None
|
|
249
|
+
topic_ids_to_label = pd.DataFrame(label_df[label_df["depth"] == d]).merge(
|
|
250
|
+
topic_ids_to_label, on="topic_id", how="right"
|
|
251
|
+
)
|
|
252
|
+
new_column = f"_topic_depth_{d}"
|
|
253
|
+
tb = tb.append_column(
|
|
254
|
+
new_column, pa.Array.from_pandas(topic_ids_to_label["topic_short_description"])
|
|
255
|
+
)
|
|
256
|
+
else:
|
|
257
|
+
tb = tb.append_column(f"_topic_depth_1", topic_tb["_topic_depth_1"])
|
|
258
|
+
tbs.append(tb)
|
|
259
|
+
except (FileNotFoundError, pa.ArrowInvalid) as e:
|
|
260
|
+
logger.warning(f"Error loading topic data for key {key}: {e}")
|
|
261
|
+
continue
|
|
181
262
|
|
|
182
|
-
|
|
263
|
+
if not tbs:
|
|
264
|
+
raise ValueError("No topic data could be loaded. Topic data files may be missing or corrupt.")
|
|
265
|
+
|
|
266
|
+
renamed_columns = [id_field_name] + [f"topic_depth_{i}" for i in range(1, self.depth + 1)]
|
|
183
267
|
self._tb = pa.concat_tables(tbs).rename_columns(renamed_columns)
|
|
184
268
|
|
|
269
|
+
def _create_synthetic_id_table(self, key, sidecar):
|
|
270
|
+
"""
|
|
271
|
+
Create a synthetic table with position indices when datum_id file isn't available
|
|
272
|
+
or when unique_id_field isn't specified.
|
|
273
|
+
"""
|
|
274
|
+
# Try to determine the size of the table by loading the topic sidecar
|
|
275
|
+
path = self.projection.tile_destination
|
|
276
|
+
if sidecar == "":
|
|
277
|
+
path = path / Path(key).with_suffix(".feather")
|
|
278
|
+
else:
|
|
279
|
+
path = path / Path(key).with_suffix(f".{sidecar}.feather")
|
|
280
|
+
|
|
281
|
+
try:
|
|
282
|
+
topic_tb = feather.read_table(path, memory_map=True)
|
|
283
|
+
size = len(topic_tb)
|
|
284
|
+
# Create a table with position indices as IDs
|
|
285
|
+
position_indices = [f"pos_{i}" for i in range(size)]
|
|
286
|
+
return pa.Table.from_arrays([pa.array(position_indices)], names=["_position_index"])
|
|
287
|
+
except Exception as e:
|
|
288
|
+
logger.error(f"Failed to create synthetic IDs for {key}: {e}")
|
|
289
|
+
# Return an empty table as fallback
|
|
290
|
+
return pa.Table.from_arrays([pa.array([])], names=["_position_index"])
|
|
291
|
+
|
|
185
292
|
def _download_topics(self):
|
|
186
293
|
"""
|
|
187
294
|
Downloads the feather tree for topics.
|
|
188
295
|
"""
|
|
189
296
|
logger.info("Downloading topics")
|
|
190
|
-
|
|
297
|
+
|
|
298
|
+
# Only download datum_id if we have a unique ID field
|
|
299
|
+
if self._has_unique_id_field:
|
|
300
|
+
try:
|
|
301
|
+
self.projection._download_sidecar("datum_id", overwrite=False)
|
|
302
|
+
except ValueError as e:
|
|
303
|
+
logger.warning(f"Failed to download datum_id files: {e}. Will use synthetic IDs instead.")
|
|
304
|
+
self._has_unique_id_field = False
|
|
305
|
+
|
|
191
306
|
topic_sidecars = set([sidecar for _, sidecar in self._topic_columns])
|
|
192
307
|
assert len(topic_sidecars) == 1, "Multiple topic sidecars found."
|
|
193
308
|
self.projection._download_sidecar(topic_sidecars.pop(), overwrite=False)
|
|
@@ -286,7 +401,10 @@ class AtlasMapTopics:
     raise ValueError("Topic depth out of range.")
 
 # Unique datum id column to aggregate
-datum_id_col =
+datum_id_col = "_position_index"
+if "unique_id_field" in self.dataset.meta and self.dataset.meta["unique_id_field"] is not None:
+    datum_id_col = self.dataset.meta["unique_id_field"]
+
 df = self.df
 
 topic_datum_dict = df.groupby(f"topic_depth_{topic_depth}")[datum_id_col].apply(set).to_dict()
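The aggregation above groups the map dataframe by topic label and collects the datum IDs under each topic; a compact pandas sketch of the same pattern:

    import pandas as pd

    df = pd.DataFrame({
        "topic_depth_1": ["a", "a", "b"],
        "_position_index": ["pos_0", "pos_1", "pos_2"],
    })
    topic_datum_dict = df.groupby("topic_depth_1")["_position_index"].apply(set).to_dict()
    # {'a': {'pos_0', 'pos_1'}, 'b': {'pos_2'}}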
@@ -333,8 +451,12 @@ class AtlasMapTopics:
     A list of `{topic, count}` dictionaries, sorted from largest count to smallest count.
 """
 data = AtlasMapData(self.projection, fields=[time_field])
-
-
+id_field_name = "_position_index"
+if "unique_id_field" in self.dataset.meta and self.dataset.meta["unique_id_field"] is not None:
+    id_field_name = self.dataset.meta["unique_id_field"]
+
+time_data = data.tb.select([id_field_name, time_field])
+merged_tb = self.tb.join(time_data, id_field_name, join_type="inner").combine_chunks()
 
 del time_data  # free up memory
 
@@ -343,12 +465,12 @@ class AtlasMapTopics:
     topic_densities = {}
     for depth in range(1, self.depth + 1):
         topic_column = f"topic_depth_{depth}"
-        topic_counts = merged_tb.group_by(topic_column).aggregate([(
+        topic_counts = merged_tb.group_by(topic_column).aggregate([(id_field_name, "count")]).to_pandas()
         for _, row in topic_counts.iterrows():
             topic = row[topic_column]
             if topic not in topic_densities:
                 topic_densities[topic] = 0
-            topic_densities[topic] += row[
+            topic_densities[topic] += row[id_field_name + "_count"]
     return topic_densities
 
 def vector_search_topics(self, queries: np.ndarray, k: int = 32, depth: int = 3) -> Dict:
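The counting step above relies on pyarrow's `Table.group_by(...).aggregate([(column, "count")])`, which emits a `<column>_count` output column; a small self-contained sketch:

    import pyarrow as pa

    merged_tb = pa.table({
        "topic_depth_1": ["a", "a", "b"],
        "_position_index": ["pos_0", "pos_1", "pos_2"],
    })
    topic_counts = merged_tb.group_by("topic_depth_1").aggregate([("_position_index", "count")]).to_pandas()
    # columns: topic_depth_1, _position_index_count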
@@ -448,10 +570,12 @@ class AtlasMapEmbeddings:
 
 def __init__(self, projection: "AtlasProjection"):  # type: ignore
     self.projection = projection
-    self.id_field = self.projection.dataset.id_field
     self.dataset = projection.dataset
     self._tb: pa.Table = None
     self._latent = None
+    self._has_unique_id_field = (
+        "unique_id_field" in self.dataset.meta and self.dataset.meta["unique_id_field"] is not None
+    )
 
 @property
 def df(self):
@@ -481,23 +605,81 @@ class AtlasMapEmbeddings:
|
|
|
481
605
|
tbs = []
|
|
482
606
|
coord_sidecar = self.projection._get_sidecar_from_field("x")
|
|
483
607
|
for key in tqdm(self.projection._manifest["key"].to_pylist()):
|
|
484
|
-
# Use datum id as root table
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
608
|
+
# Use datum id as root table if available, otherwise create synthetic IDs
|
|
609
|
+
if self._has_unique_id_field:
|
|
610
|
+
try:
|
|
611
|
+
datum_id_path = self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather")
|
|
612
|
+
tb = feather.read_table(datum_id_path, memory_map=True)
|
|
613
|
+
except (FileNotFoundError, pa.ArrowInvalid):
|
|
614
|
+
# Create a synthetic ID table
|
|
615
|
+
tb = self._create_synthetic_id_table(key, coord_sidecar)
|
|
616
|
+
else:
|
|
617
|
+
# Create synthetic IDs when no unique_id_field is available
|
|
618
|
+
tb = self._create_synthetic_id_table(key, coord_sidecar)
|
|
619
|
+
|
|
488
620
|
path = self.projection.tile_destination
|
|
489
621
|
if coord_sidecar == "":
|
|
490
622
|
path = path / Path(key).with_suffix(".feather")
|
|
491
623
|
else:
|
|
492
624
|
path = path / Path(key).with_suffix(f".{coord_sidecar}.feather")
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
625
|
+
|
|
626
|
+
try:
|
|
627
|
+
carfile = feather.read_table(path, memory_map=True)
|
|
628
|
+
for col in carfile.column_names:
|
|
629
|
+
if col in ["x", "y"]:
|
|
630
|
+
tb = tb.append_column(col, carfile[col])
|
|
631
|
+
tbs.append(tb)
|
|
632
|
+
except (FileNotFoundError, pa.ArrowInvalid) as e:
|
|
633
|
+
logger.warning(f"Error loading embedding data for key {key}: {e}")
|
|
634
|
+
continue
|
|
635
|
+
|
|
636
|
+
if not tbs:
|
|
637
|
+
raise ValueError("No embedding data could be loaded. Embedding data files may be missing or corrupt.")
|
|
638
|
+
|
|
498
639
|
self._tb = pa.concat_tables(tbs)
|
|
499
640
|
return self._tb
|
|
500
641
|
|
|
642
|
+
def _create_synthetic_id_table(self, key, sidecar):
|
|
643
|
+
"""
|
|
644
|
+
Create a synthetic table with position indices when datum_id file isn't available
|
|
645
|
+
or when unique_id_field isn't specified.
|
|
646
|
+
"""
|
|
647
|
+
# Try to determine the size of the table by loading the coordinate sidecar
|
|
648
|
+
path = self.projection.tile_destination
|
|
649
|
+
if sidecar == "":
|
|
650
|
+
path = path / Path(key).with_suffix(".feather")
|
|
651
|
+
else:
|
|
652
|
+
path = path / Path(key).with_suffix(f".{sidecar}.feather")
|
|
653
|
+
|
|
654
|
+
try:
|
|
655
|
+
coord_tb = feather.read_table(path, memory_map=True)
|
|
656
|
+
size = len(coord_tb)
|
|
657
|
+
# Create a table with position indices as IDs
|
|
658
|
+
position_indices = [f"pos_{i}" for i in range(size)]
|
|
659
|
+
return pa.Table.from_arrays([pa.array(position_indices)], names=["_position_index"])
|
|
660
|
+
except Exception as e:
|
|
661
|
+
logger.error(f"Failed to create synthetic IDs for {key}: {e}")
|
|
662
|
+
# Return an empty table as fallback
|
|
663
|
+
return pa.Table.from_arrays([pa.array([])], names=["_position_index"])
|
|
664
|
+
|
|
665
|
+
def _download_projected(self) -> List[Path]:
|
|
666
|
+
"""
|
|
667
|
+
Downloads the feather tree for projection coordinates.
|
|
668
|
+
"""
|
|
669
|
+
logger.info("Downloading projected embeddings")
|
|
670
|
+
# Note that y coord should be in same sidecar
|
|
671
|
+
coord_sidecar = self.projection._get_sidecar_from_field("x")
|
|
672
|
+
|
|
673
|
+
# Only download datum_id if we have a unique ID field
|
|
674
|
+
if self._has_unique_id_field:
|
|
675
|
+
try:
|
|
676
|
+
self.projection._download_sidecar("datum_id", overwrite=False)
|
|
677
|
+
except ValueError as e:
|
|
678
|
+
logger.warning(f"Failed to download datum_id files: {e}. Will use synthetic IDs instead.")
|
|
679
|
+
self._has_unique_id_field = False
|
|
680
|
+
|
|
681
|
+
return self.projection._download_sidecar(coord_sidecar, overwrite=False)
|
|
682
|
+
|
|
501
683
|
@property
|
|
502
684
|
def projected(self) -> pd.DataFrame:
|
|
503
685
|
"""
|
|
@@ -531,16 +713,6 @@ class AtlasMapEmbeddings:
|
|
|
531
713
|
all_embeddings.append(pa.compute.list_flatten(tb["_embeddings"]).to_numpy().reshape(-1, dims)) # type: ignore
|
|
532
714
|
return np.vstack(all_embeddings)
|
|
533
715
|
|
|
534
|
-
def _download_projected(self) -> List[Path]:
|
|
535
|
-
"""
|
|
536
|
-
Downloads the feather tree for projection coordinates.
|
|
537
|
-
"""
|
|
538
|
-
logger.info("Downloading projected embeddings")
|
|
539
|
-
# Note that y coord should be in same sidecar
|
|
540
|
-
coord_sidecar = self.projection._get_sidecar_from_field("x")
|
|
541
|
-
self.projection._download_sidecar("datum_id", overwrite=False)
|
|
542
|
-
return self.projection._download_sidecar(coord_sidecar, overwrite=False)
|
|
543
|
-
|
|
544
716
|
def _download_latent(self) -> List[Path]:
|
|
545
717
|
"""
|
|
546
718
|
Downloads the feather tree for embeddings.
|
|
@@ -670,12 +842,17 @@ class AtlasMapTags:
|
|
|
670
842
|
def __init__(self, projection: "AtlasProjection", auto_cleanup: Optional[bool] = False): # type: ignore
|
|
671
843
|
self.projection = projection
|
|
672
844
|
self.dataset = projection.dataset
|
|
673
|
-
self.id_field = self.projection.dataset.id_field
|
|
674
845
|
# Pre-fetch datum ids first upon initialization
|
|
675
|
-
|
|
676
|
-
self.
|
|
677
|
-
|
|
678
|
-
|
|
846
|
+
self._has_unique_id_field = (
|
|
847
|
+
"unique_id_field" in self.dataset.meta and self.dataset.meta["unique_id_field"] is not None
|
|
848
|
+
)
|
|
849
|
+
|
|
850
|
+
if self._has_unique_id_field:
|
|
851
|
+
try:
|
|
852
|
+
self.projection._download_sidecar("datum_id")
|
|
853
|
+
except Exception as e:
|
|
854
|
+
logger.warning(f"Failed to fetch datum ids: {e}. Will use synthetic IDs.")
|
|
855
|
+
self._has_unique_id_field = False
|
|
679
856
|
self.auto_cleanup = auto_cleanup
|
|
680
857
|
|
|
681
858
|
@property
|
|
@@ -748,20 +925,38 @@ class AtlasMapTags:
|
|
|
748
925
|
"""
|
|
749
926
|
tag_paths = self._download_tag(tag_name, overwrite=overwrite)
|
|
750
927
|
datum_ids = []
|
|
928
|
+
id_field_name = "_position_index"
|
|
929
|
+
if self._has_unique_id_field:
|
|
930
|
+
id_field_name = self.dataset.meta["unique_id_field"]
|
|
931
|
+
|
|
751
932
|
for path in tag_paths:
|
|
752
933
|
tb = feather.read_table(path)
|
|
753
934
|
last_coord = path.name.split(".")[0]
|
|
754
|
-
|
|
755
|
-
|
|
935
|
+
|
|
936
|
+
# Get ID information - either from datum_id file or create synthetic IDs
|
|
937
|
+
if self._has_unique_id_field:
|
|
938
|
+
try:
|
|
939
|
+
tile_path = path.with_name(last_coord + ".datum_id.feather")
|
|
940
|
+
tile_tb = feather.read_table(tile_path).select([id_field_name])
|
|
941
|
+
except (FileNotFoundError, pa.ArrowInvalid):
|
|
942
|
+
# Create synthetic IDs if datum_id file not found
|
|
943
|
+
size = len(tb)
|
|
944
|
+
position_indices = [f"pos_{i}" for i in range(size)]
|
|
945
|
+
tile_tb = pa.Table.from_arrays([pa.array(position_indices)], names=[id_field_name])
|
|
946
|
+
else:
|
|
947
|
+
# Create synthetic IDs when no unique_id_field is available
|
|
948
|
+
size = len(tb)
|
|
949
|
+
position_indices = [f"pos_{i}" for i in range(size)]
|
|
950
|
+
tile_tb = pa.Table.from_arrays([pa.array(position_indices)], names=[id_field_name])
|
|
756
951
|
|
|
757
952
|
if "all_set" in tb.column_names:
|
|
758
953
|
if tb["all_set"][0].as_py() == True:
|
|
759
|
-
datum_ids.extend(tile_tb[
|
|
954
|
+
datum_ids.extend(tile_tb[id_field_name].to_pylist())
|
|
760
955
|
else:
|
|
761
956
|
# filter on rows
|
|
762
957
|
try:
|
|
763
|
-
tb = tb.append_column(
|
|
764
|
-
datum_ids.extend(tb.filter(pc.field("bitmask") == True)[
|
|
958
|
+
tb = tb.append_column(id_field_name, tile_tb[id_field_name])
|
|
959
|
+
datum_ids.extend(tb.filter(pc.field("bitmask") == True)[id_field_name].to_pylist())
|
|
765
960
|
except Exception as e:
|
|
766
961
|
raise Exception(f"Failed to fetch datums in tag. {e}")
|
|
767
962
|
return datum_ids
|
|
@@ -849,7 +1044,6 @@ class AtlasMapData:
 def __init__(self, projection: "AtlasProjection", fields=None):  # type: ignore
     self.projection = projection
     self.dataset = projection.dataset
-    self.id_field = self.projection.dataset.id_field
     if fields is None:
         # TODO: fall back on something more reliable here
         self.fields = self.dataset.dataset_fields
@@ -858,6 +1052,9 @@ class AtlasMapData:
         assert field in self.dataset.dataset_fields, f"Field {field} not found in dataset fields."
     self.fields = fields
     self._tb = None
+    self._has_unique_id_field = (
+        "unique_id_field" in self.dataset.meta and self.dataset.meta["unique_id_field"] is not None
+    )
 
 def _load_data(self, data_columns: List[Tuple[str, str]]):
     """
@@ -871,24 +1068,67 @@ class AtlasMapData:
|
|
|
871
1068
|
sidecars_to_load = set([sidecar for _, sidecar in data_columns if sidecar != "datum_id"])
|
|
872
1069
|
logger.info("Loading data")
|
|
873
1070
|
for key in tqdm(self.projection._manifest["key"].to_pylist()):
|
|
874
|
-
# Use datum id as root table
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
1071
|
+
# Use datum id as root table if available, otherwise create synthetic IDs
|
|
1072
|
+
if self._has_unique_id_field:
|
|
1073
|
+
try:
|
|
1074
|
+
datum_id_path = self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather")
|
|
1075
|
+
tb = feather.read_table(datum_id_path, memory_map=True)
|
|
1076
|
+
except (FileNotFoundError, pa.ArrowInvalid):
|
|
1077
|
+
# Create a synthetic ID table
|
|
1078
|
+
# Using the first sidecar to determine table size
|
|
1079
|
+
first_sidecar = next(iter(sidecars_to_load)) if sidecars_to_load else ""
|
|
1080
|
+
tb = self._create_synthetic_id_table(key, first_sidecar)
|
|
1081
|
+
else:
|
|
1082
|
+
# Create synthetic IDs when no unique_id_field is available
|
|
1083
|
+
first_sidecar = next(iter(sidecars_to_load)) if sidecars_to_load else ""
|
|
1084
|
+
tb = self._create_synthetic_id_table(key, first_sidecar)
|
|
1085
|
+
|
|
878
1086
|
for sidecar in sidecars_to_load:
|
|
879
1087
|
path = self.projection.tile_destination
|
|
880
1088
|
if sidecar == "":
|
|
881
1089
|
path = path / Path(key).with_suffix(".feather")
|
|
882
1090
|
else:
|
|
883
1091
|
path = path / Path(key).with_suffix(f".{sidecar}.feather")
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
1092
|
+
|
|
1093
|
+
try:
|
|
1094
|
+
carfile = feather.read_table(path, memory_map=True)
|
|
1095
|
+
for col in carfile.column_names:
|
|
1096
|
+
if col in self.fields:
|
|
1097
|
+
tb = tb.append_column(col, carfile[col])
|
|
1098
|
+
except (FileNotFoundError, pa.ArrowInvalid) as e:
|
|
1099
|
+
logger.warning(f"Error loading data for key {key}, sidecar {sidecar}: {e}")
|
|
1100
|
+
continue
|
|
1101
|
+
|
|
888
1102
|
tbs.append(tb)
|
|
889
1103
|
|
|
1104
|
+
if not tbs:
|
|
1105
|
+
raise ValueError("No data could be loaded. Data files may be missing or corrupt.")
|
|
1106
|
+
|
|
890
1107
|
self._tb = pa.concat_tables(tbs)
|
|
891
1108
|
|
|
1109
|
+
def _create_synthetic_id_table(self, key, sidecar):
|
|
1110
|
+
"""
|
|
1111
|
+
Create a synthetic table with position indices when datum_id file isn't available
|
|
1112
|
+
or when unique_id_field isn't specified.
|
|
1113
|
+
"""
|
|
1114
|
+
# Try to determine the size of the table by loading a sidecar file
|
|
1115
|
+
path = self.projection.tile_destination
|
|
1116
|
+
if sidecar == "":
|
|
1117
|
+
path = path / Path(key).with_suffix(".feather")
|
|
1118
|
+
else:
|
|
1119
|
+
path = path / Path(key).with_suffix(f".{sidecar}.feather")
|
|
1120
|
+
|
|
1121
|
+
try:
|
|
1122
|
+
sidecar_tb = feather.read_table(path, memory_map=True)
|
|
1123
|
+
size = len(sidecar_tb)
|
|
1124
|
+
# Create a table with position indices as IDs
|
|
1125
|
+
position_indices = [f"pos_{i}" for i in range(size)]
|
|
1126
|
+
return pa.Table.from_arrays([pa.array(position_indices)], names=["_position_index"])
|
|
1127
|
+
except Exception as e:
|
|
1128
|
+
logger.error(f"Failed to create synthetic IDs for {key}: {e}")
|
|
1129
|
+
# Return an empty table as fallback
|
|
1130
|
+
return pa.Table.from_arrays([pa.array([])], names=["_position_index"])
|
|
1131
|
+
|
|
892
1132
|
def _download_data(self, fields: Optional[List[str]] = None) -> List[Tuple[str, str]]:
|
|
893
1133
|
"""
|
|
894
1134
|
Downloads the feather tree for user uploaded data.
|
|
@@ -902,16 +1142,26 @@ class AtlasMapData:
|
|
|
902
1142
|
logger.info("Downloading data")
|
|
903
1143
|
self.projection.tile_destination.mkdir(parents=True, exist_ok=True)
|
|
904
1144
|
|
|
905
|
-
# Download specified or all sidecar fields + always download datum_id
|
|
1145
|
+
# Download specified or all sidecar fields + always download datum_id if available
|
|
906
1146
|
data_columns_to_load = [
|
|
907
1147
|
(str(field), str(sidecar))
|
|
908
1148
|
for field, sidecar in self.projection._registered_columns
|
|
909
1149
|
if field[0] != "_" and ((field in fields) or sidecar == "datum_id")
|
|
910
1150
|
]
|
|
911
1151
|
|
|
912
|
-
#
|
|
913
|
-
|
|
1152
|
+
# Only download datum_id if we have a unique ID field
|
|
1153
|
+
if self._has_unique_id_field:
|
|
1154
|
+
try:
|
|
1155
|
+
self.projection._download_sidecar("datum_id")
|
|
1156
|
+
except ValueError as e:
|
|
1157
|
+
logger.warning(f"Failed to download datum_id files: {e}. Will use synthetic IDs instead.")
|
|
1158
|
+
self._has_unique_id_field = False
|
|
1159
|
+
|
|
1160
|
+
# Download all required sidecars for the fields
|
|
1161
|
+
sidecars_to_download = set(sidecar for _, sidecar in data_columns_to_load if sidecar != "datum_id")
|
|
1162
|
+
for sidecar in sidecars_to_download:
|
|
914
1163
|
self.projection._download_sidecar(sidecar)
|
|
1164
|
+
|
|
915
1165
|
return data_columns_to_load
|
|
916
1166
|
|
|
917
1167
|
@property
|
|
{nomic-3.5.0 → nomic-3.5.2}/nomic/dataset.py
@@ -115,15 +115,12 @@ class AtlasClass(object):
 
     return response.json()
 
-def _validate_map_data_inputs(self, colorable_fields,
+def _validate_map_data_inputs(self, colorable_fields, data_sample):
     """Validates inputs to map data calls."""
 
     if not isinstance(colorable_fields, list):
         raise ValueError("colorable_fields must be a list of fields")
 
-    if id_field in colorable_fields:
-        raise Exception(f"Cannot color by unique id field: {id_field}")
-
     for field in colorable_fields:
         if field not in data_sample:
             raise Exception(f"Cannot color by field `{field}` as it is not present in the metadata.")
@@ -274,14 +271,12 @@ class AtlasClass(object):
 """
 Private method. validates upload data against the dataset arrow schema, and associated other checks.
 
-1. If unique_id_field is specified, validates that each datum has that field. If not, adds it and then notifies the user that it was added.
-
 Args:
     data: an arrow table.
     project: the atlas dataset you are validating the data for.
 
 Returns:
-
+    Validated pyarrow table.
 """
 if not isinstance(data, pa.Table):
     raise Exception("Invalid data type for upload: {}".format(type(data)))
@@ -295,8 +290,32 @@ class AtlasClass(object):
     msg = "Must include embeddings in embedding dataset upload."
     raise ValueError(msg)
 
-
-
+# Check and validate ID field if specified
+if "unique_id_field" in project.meta and project.meta["unique_id_field"] is not None:
+    id_field = project.meta["unique_id_field"]
+
+    # Check if ID field exists in data
+    if id_field not in data.column_names:
+        raise ValueError(
+            f"Data must contain the ID column `{id_field}` as specified in dataset's unique_id_field"
+        )
+
+    # Check for null values in ID field
+    if data[id_field].null_count > 0:
+        raise ValueError(
+            f"As your unique id field, {id_field} must not contain null values, but {data[id_field].null_count} found."
+        )
+
+    # Check ID field length (36 characters max)
+    if pa.types.is_string(data[id_field].type):
+        # Use a safer alternative to check string length
+        utf8_length_values = pc.utf8_length(data[id_field])  # type: ignore
+        max_length_scalar = pc.max(utf8_length_values)  # type: ignore
+        max_length = max_length_scalar.as_py()
+        if max_length > 36:
+            raise ValueError(
+                f"The id_field contains values greater than 36 characters. Atlas does not support id_fields longer than 36 characters."
+            )
 
 seen = set()
 for col in data.column_names:
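The new validation block above can be read as a standalone routine: the configured ID column must exist, contain no nulls, and (for string columns) hold values no longer than 36 characters. A sketch of the same checks outside the class:

    import pyarrow as pa
    import pyarrow.compute as pc

    def validate_id_column(data: pa.Table, id_field: str) -> None:
        if id_field not in data.column_names:
            raise ValueError(f"Data must contain the ID column `{id_field}`")
        if data[id_field].null_count > 0:
            raise ValueError(f"{id_field} must not contain null values")
        if pa.types.is_string(data[id_field].type):
            max_length = pc.max(pc.utf8_length(data[id_field])).as_py()
            if max_length is not None and max_length > 36:
                raise ValueError("id_field values may be at most 36 characters")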
@@ -313,11 +332,6 @@ class AtlasClass(object):
 # filling in nulls, etc.
 reformatted = {}
 
-if data[project.id_field].null_count > 0:
-    raise ValueError(
-        f"{project.id_field} must not contain null values, but {data[project.id_field].null_count} found."
-    )
-
 assert project.schema is not None, "Project schema not found."
 
 for field in project.schema:
@@ -335,10 +349,15 @@ class AtlasClass(object):
     f"Replacing {data[field.name].null_count} null values for field {field.name} with string 'null'. This behavior will change in a future version."
 )
 reformatted[field.name] = pc.fill_null(reformatted[field.name], "null")
-
-
-
-
+
+# Check for empty strings and replace with "null"
+# Separate the operations for better type checking
+binary_length_values = pc.binary_length(reformatted[field.name])  # type: ignore
+has_empty_strings = pc.equal(binary_length_values, 0)  # type: ignore
+if pc.any(has_empty_strings).as_py():  # type: ignore
+    mask = has_empty_strings.combine_chunks()
+    assert pa.types.is_boolean(mask.type)
+    reformatted[field.name] = pc.replace_with_mask(reformatted[field.name], mask, "null")  # type: ignore
 for field in data.schema:
     if not field.name in reformatted:
         if field.name == "_embeddings":
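The empty-string handling above mirrors the existing null fill: zero-length strings are detected with `binary_length` and replaced with the literal "null" via `replace_with_mask`. A minimal sketch mirroring the same calls on a plain chunked array:

    import pyarrow as pa
    import pyarrow.compute as pc

    values = pa.chunked_array([pa.array(["ok", "", "also ok"])])
    has_empty = pc.equal(pc.binary_length(values), 0)
    if pc.any(has_empty).as_py():
        values = pc.replace_with_mask(values, has_empty.combine_chunks(), "null")
    # values -> ["ok", "null", "also ok"]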
@@ -350,27 +369,12 @@ class AtlasClass(object):
     if project.meta["insert_update_delete_lock"]:
         raise Exception("Project is currently indexing and cannot ingest new datums. Try again later.")
 
-    # The following two conditions should never occur given the above, but just in case...
-    assert project.id_field in data.column_names, f"Upload does not contain your specified id_field"
-
-    if not pa.types.is_string(data[project.id_field].type):
-        logger.warning(f"id_field is not a string. Converting to string from {data[project.id_field].type}")
-        data = data.drop([project.id_field]).append_column(
-            project.id_field, data[project.id_field].cast(pa.string())
-        )
-
     for key in data.column_names:
         if key.startswith("_"):
             if key == "_embeddings" or key == "_blob_hash":
                 continue
             raise ValueError("Metadata fields cannot start with _")
-
-    first_match = data.filter(
-        pa.compute.greater(pa.compute.utf8_length(data[project.id_field]), 36)  # type: ignore
-    ).to_pylist()[0][project.id_field]
-    raise ValueError(
-        f"The id_field {first_match} is greater than 36 characters. Atlas does not support id_fields longer than 36 characters."
-    )
+
     return data
 
 def _get_organization(self, organization_slug=None, organization_id=None) -> Tuple[str, str]:
@@ -696,36 +700,6 @@ class AtlasProjection:
 def tile_destination(self):
     return Path("~/.nomic/cache", self.id).expanduser()
 
-@property
-def datum_id_field(self):
-    return self.dataset.meta["unique_id_field"]
-
-def _get_atoms(self, ids: List[str]) -> List[Dict]:
-    """
-    Retrieves atoms by id
-
-    Args:
-        ids: list of atom ids
-
-    Returns:
-        A dictionary containing the resulting atoms, keyed by atom id.
-
-    """
-
-    if not isinstance(ids, list):
-        raise ValueError("You must specify a list of ids when getting data.")
-
-    response = requests.post(
-        self.dataset.atlas_api_path + "/v1/project/atoms/get",
-        headers=self.dataset.header,
-        json={"project_id": self.dataset.id, "index_id": self.atlas_index_id, "atom_ids": ids},
-    )
-
-    if response.status_code == 200:
-        return response.json()["atoms"]
-    else:
-        raise Exception(response.text)
-
 
 class AtlasDataStream(AtlasClass):
     def __init__(self, name: Optional[str] = "contrastors"):
@@ -766,7 +740,7 @@ class AtlasDataset(AtlasClass):
 
 * **identifier** - The dataset identifier in the form `dataset` or `organization/dataset`. If no organization is passed, the organization tied to the API key you logged in to Nomic with will be used.
 * **description** - A description for the dataset.
-* **unique_id_field** -
+* **unique_id_field** - A field that uniquely identifies each data point.
 * **is_public** - Should this dataset be publicly accessible for viewing (read only). If False, only members of your Nomic organization can view.
 * **dataset_id** - An alternative way to load a dataset is by passing the dataset_id directly. This only works if a dataset exists.
 """
@@ -805,13 +779,6 @@ class AtlasDataset(AtlasClass):
 dataset_id = dataset["id"]
 
 if dataset_id is None:  # if there is no existing project, make a new one.
-    if unique_id_field is None:  # if not all parameters are specified, we weren't trying to make a project
-        raise ValueError(f"Dataset `{identifier}` does not exist.")
-
-    # if modality is None:
-    #     raise ValueError("You must specify a modality when creating a new dataset.")
-    #
-    # assert modality in ['text', 'embedding'], "Modality must be either `text` or `embedding`"
     assert identifier is not None
 
     dataset_id = self._create_project(
@@ -840,7 +807,7 @@ class AtlasDataset(AtlasClass):
     self,
     identifier: str,
     description: Optional[str],
-    unique_id_field: str,
+    unique_id_field: Optional[str] = None,
     is_public: bool = True,
 ):
     """
@@ -852,7 +819,7 @@ class AtlasDataset(AtlasClass):
 
 * **identifier** - The identifier for the dataset.
 * **description** - A description for the dataset.
-* **unique_id_field** -
+* **unique_id_field** - A field that uniquely identifies each data point.
 * **is_public** - Should this dataset be publicly accessible for viewing (read only). If False, only members of your Nomic organization can view.
 
 **Returns:** project_id on success.
@@ -865,15 +832,6 @@ class AtlasDataset(AtlasClass):
 if "/" in identifier:
     org_name = identifier.split("/")[0]
     logger.info(f"Organization name: `{org_name}`")
-# supported_modalities = ['text', 'embedding']
-# if modality not in supported_modalities:
-#     msg = 'Tried to create dataset with modality: {}, but Atlas only supports: {}'.format(
-#         modality, supported_modalities
-#     )
-#     raise ValueError(msg)
-
-if unique_id_field is None:
-    raise ValueError("You must specify a unique id field")
 if description is None:
     description = ""
 response = requests.post(
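With the guard removed above, `unique_id_field` becomes genuinely optional when creating a dataset. A hypothetical construction sketch (organization and dataset names are placeholders, and an authenticated Nomic session is assumed):

    from nomic import AtlasDataset

    dataset = AtlasDataset(
        identifier="my-org/my-dataset",
        description="Example dataset without an explicit unique ID field",
        is_public=False,
    )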
@@ -884,7 +842,6 @@ class AtlasDataset(AtlasClass):
     "project_name": project_slug,
     "description": description,
     "unique_id_field": unique_id_field,
-    # 'modality': modality,
     "is_public": is_public,
 },
 )
@@ -939,11 +896,12 @@ class AtlasDataset(AtlasClass):
 
 @property
 def id(self) -> str:
-    """The
+    """The ID of the dataset."""
     return self.meta["id"]
 
 @property
 def id_field(self) -> str:
+    """The unique_id_field of the dataset."""
     return self.meta["unique_id_field"]
 
 @property
@@ -1147,7 +1105,7 @@ class AtlasDataset(AtlasClass):
 colorable_fields = []
 
 for field in self.dataset_fields:
-    if field not in [
+    if field not in [indexed_field] and not field.startswith("_"):
         colorable_fields.append(field)
 
 build_template = {}
@@ -1410,98 +1368,126 @@ class AtlasDataset(AtlasClass):
|
|
|
1410
1368
|
Uploads blobs to the server and associates them with the data.
|
|
1411
1369
|
Blobs must reference objects stored locally
|
|
1412
1370
|
"""
|
|
1371
|
+
data_as_table: pa.Table
|
|
1413
1372
|
if isinstance(data, DataFrame):
|
|
1414
|
-
|
|
1373
|
+
data_as_table = pa.Table.from_pandas(data)
|
|
1415
1374
|
elif isinstance(data, list):
|
|
1416
|
-
|
|
1417
|
-
elif
|
|
1375
|
+
data_as_table = pa.Table.from_pylist(data)
|
|
1376
|
+
elif isinstance(data, pa.Table):
|
|
1377
|
+
data_as_table = data
|
|
1378
|
+
else:
|
|
1418
1379
|
raise ValueError("Data must be a pandas DataFrame, list of dictionaries, or a pyarrow Table.")
|
|
1419
1380
|
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
# add hash to data as _blob_hash
|
|
1425
|
-
# set indexed_field to _blob_hash
|
|
1426
|
-
# call _add_data
|
|
1427
|
-
|
|
1428
|
-
# Cast self id field to string for merged data lower down on function
|
|
1429
|
-
data = data.set_column( # type: ignore
|
|
1430
|
-
data.schema.get_field_index(self.id_field), self.id_field, pc.cast(data[self.id_field], pa.string()) # type: ignore
|
|
1431
|
-
)
|
|
1381
|
+
# Compute dataset length
|
|
1382
|
+
data_length = len(data_as_table)
|
|
1383
|
+
if data_length != len(blobs):
|
|
1384
|
+
raise ValueError(f"Number of data points ({data_length}) must match number of blobs ({len(blobs)})")
|
|
1432
1385
|
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
images = []
|
|
1439
|
-
for
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1386
|
+
TEMP_ID_COLUMN = "_nomic_internal_temp_id"
|
|
1387
|
+
temp_id_values = [str(i) for i in range(data_length)]
|
|
1388
|
+
data_as_table = data_as_table.append_column(TEMP_ID_COLUMN, pa.array(temp_id_values, type=pa.string()))
|
|
1389
|
+
blob_upload_endpoint = "/v1/project/data/add/blobs"
|
|
1390
|
+
actual_temp_ids = data_as_table[TEMP_ID_COLUMN].to_pylist()
|
|
1391
|
+
images = [] # List of (temp_id, image_bytes)
|
|
1392
|
+
for i in tqdm(range(data_length), desc="Processing images"):
|
|
1393
|
+
current_temp_id = actual_temp_ids[i]
|
|
1394
|
+
blob_item = blobs[i]
|
|
1395
|
+
|
|
1396
|
+
processed_blob_value = None
|
|
1397
|
+
if (isinstance(blob_item, str) or isinstance(blob_item, Path)) and os.path.exists(blob_item):
|
|
1398
|
+
image = Image.open(blob_item).convert("RGB")
|
|
1444
1399
|
if image.height > 512 or image.width > 512:
|
|
1445
1400
|
image = image.resize((512, 512))
|
|
1446
1401
|
buffered = BytesIO()
|
|
1447
1402
|
image.save(buffered, format="JPEG")
|
|
1448
|
-
|
|
1449
|
-
elif isinstance(
|
|
1450
|
-
|
|
1451
|
-
elif isinstance(
|
|
1452
|
-
|
|
1453
|
-
if
|
|
1454
|
-
|
|
1403
|
+
processed_blob_value = buffered.getvalue()
|
|
1404
|
+
elif isinstance(blob_item, bytes):
|
|
1405
|
+
processed_blob_value = blob_item
|
|
1406
|
+
elif isinstance(blob_item, Image.Image):
|
|
1407
|
+
img_pil = blob_item.convert("RGB") # Ensure it's PIL Image for methods
|
|
1408
|
+
if img_pil.height > 512 or img_pil.width > 512:
|
|
1409
|
+
img_pil = img_pil.resize((512, 512))
|
|
1455
1410
|
buffered = BytesIO()
|
|
1456
|
-
|
|
1457
|
-
|
|
1411
|
+
img_pil.save(buffered, format="JPEG")
|
|
1412
|
+
processed_blob_value = buffered.getvalue()
|
|
1458
1413
|
else:
|
|
1459
|
-
raise ValueError(
|
|
1414
|
+
raise ValueError(
|
|
1415
|
+
f"Invalid blob type for item at index {i} (temp_id: {current_temp_id}). Must be a path, bytes, or PIL Image. Got: {type(blob_item)}"
|
|
1416
|
+
)
|
|
1417
|
+
|
|
1418
|
+
if processed_blob_value is not None:
|
|
1419
|
+
images.append((current_temp_id, processed_blob_value))
|
|
1460
1420
|
|
|
1461
1421
|
batch_size = 40
|
|
1462
|
-
num_workers =
|
|
1422
|
+
num_workers = 2
|
|
1423
|
+
|
|
1424
|
+
def send_request(batch_start_index):
|
|
1425
|
+
image_batch = images[batch_start_index : batch_start_index + batch_size]
|
|
1426
|
+
temp_ids_in_batch = [item_id for item_id, _ in image_batch]
|
|
1427
|
+
blobs_for_api = [("blobs", blob_val) for _, blob_val in image_batch]
|
|
1463
1428
|
|
|
1464
|
-
def send_request(i):
|
|
1465
|
-
image_batch = images[i : i + batch_size]
|
|
1466
|
-
ids = [uuid for uuid, _ in image_batch]
|
|
1467
|
-
blobs = [("blobs", blob) for _, blob in image_batch]
|
|
1468
1429
|
response = requests.post(
|
|
1469
1430
|
self.atlas_api_path + blob_upload_endpoint,
|
|
1470
1431
|
headers=self.header,
|
|
1471
|
-
data={"dataset_id": self.id},
|
|
1472
|
-
files=
|
|
1432
|
+
data={"dataset_id": self.id}, # self.id is project_id
|
|
1433
|
+
files=blobs_for_api,
|
|
1473
1434
|
)
|
|
1474
1435
|
if response.status_code != 200:
|
|
1475
|
-
|
|
1476
|
-
|
|
1436
|
+
failed_ids_sample = temp_ids_in_batch[:5]
|
|
1437
|
+
logger.error(
|
|
1438
|
+
f"Blob upload request failed for batch starting with temp_ids: {failed_ids_sample}. Status: {response.status_code}, Response: {response.text}"
|
|
1439
|
+
)
|
|
1440
|
+
raise Exception(f"Blob upload failed: {response.text}")
|
|
1441
|
+
return {temp_id: blob_hash for temp_id, blob_hash in zip(temp_ids_in_batch, response.json()["hashes"])}
|
|
1477
1442
|
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1443
|
+
upload_pbar = pbar # Use passed-in pbar if available
|
|
1444
|
+
close_upload_pbar_locally = False
|
|
1445
|
+
if upload_pbar is None:
|
|
1446
|
+
upload_pbar = tqdm(total=len(images), desc="Uploading blobs to Atlas")
|
|
1447
|
+
close_upload_pbar_locally = True
|
|
1481
1448
|
|
|
1482
|
-
|
|
1483
|
-
returned_ids = []
|
|
1449
|
+
returned_temp_ids = []
|
|
1484
1450
|
returned_hashes = []
|
|
1451
|
+
succeeded_uploads = 0
|
|
1485
1452
|
|
|
1486
|
-
succeeded = 0
|
|
1487
1453
|
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
|
|
1488
|
-
futures = {executor.submit(send_request, i): i for i in range(0, len(
|
|
1454
|
+
futures = {executor.submit(send_request, i): i for i in range(0, len(images), batch_size)}
|
|
1489
1455
|
|
|
1490
1456
|
for future in concurrent.futures.as_completed(futures):
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1457
|
+
try:
|
|
1458
|
+
response_dict = future.result() # This is {temp_id: blob_hash}
|
|
1459
|
+
for temp_id, blob_hash_val in response_dict.items():
|
|
1460
|
+
returned_temp_ids.append(temp_id)
|
|
1461
|
+
returned_hashes.append(blob_hash_val)
|
|
1462
|
+
succeeded_uploads += len(response_dict)
|
|
1463
|
+
if upload_pbar:
|
|
1464
|
+
upload_pbar.update(len(response_dict))
|
|
1465
|
+
except Exception as e:
|
|
1466
|
+
logger.error(f"An error occurred during blob upload processing for a batch: {e}")
|
|
1467
|
+
# Optionally, collect failed batch info here if needed for partial success
|
|
1468
|
+
|
|
1469
|
+
if close_upload_pbar_locally and upload_pbar:
|
|
1470
|
+
upload_pbar.close()
|
|
1471
|
+
|
|
1472
|
+
hash_schema = pa.schema([(TEMP_ID_COLUMN, pa.string()), ("_blob_hash", pa.string())])
|
|
1473
|
+
|
|
1474
|
+
merged_data_as_table: pa.Table
|
|
1475
|
+
if succeeded_uploads > 0: # Only create hash_tb if there are successful uploads
|
|
1476
|
+
hash_tb = pa.Table.from_pydict(
|
|
1477
|
+
{TEMP_ID_COLUMN: returned_temp_ids, "_blob_hash": returned_hashes}, schema=hash_schema
|
|
1478
|
+
)
|
|
1479
|
+
merged_data_as_table = data_as_table.join(right_table=hash_tb, keys=TEMP_ID_COLUMN, join_type="left outer")
|
|
1480
|
+
else: # No successful uploads, so no hashes to merge, but keep original data structure
|
|
1481
|
+
# Need to ensure _blob_hash column is added with nulls, and id_field is present
|
|
1482
|
+
if "_blob_hash" not in data_as_table.column_names:
|
|
1483
|
+
data_as_table = data_as_table.append_column(
|
|
1484
|
+
"_blob_hash", pa.nulls(data_as_table.num_rows, type=pa.string())
|
|
1485
|
+
)
|
|
1486
|
+
merged_data_as_table = data_as_table
|
|
1500
1487
|
|
|
1501
|
-
|
|
1502
|
-
merged_data = data.join(right_table=hash_tb, keys=self.id_field) # type: ignore
|
|
1488
|
+
merged_data_as_table = merged_data_as_table.drop_columns([TEMP_ID_COLUMN])
|
|
1503
1489
|
|
|
1504
|
-
self._add_data(
|
|
1490
|
+
self._add_data(merged_data_as_table, pbar=pbar) # Pass original pbar argument
|
|
1505
1491
|
|
|
1506
1492
|
def _add_text(self, data=Union[DataFrame, List[Dict], pa.Table], pbar=None):
|
|
1507
1493
|
"""
|
|
@@ -1580,12 +1566,8 @@ class AtlasDataset(AtlasClass):
     None
 """
 
-
-
-num_workers = 10
-
+num_workers = 2
 # Each worker currently is too slow beyond a shard_size of 10000
-
 # The heuristic here is: Never let shards be more than 10,000 items,
 # OR more than 16MB uncompressed. Whichever is smaller.
 
@@ -1701,7 +1683,7 @@ class AtlasDataset(AtlasClass):
     else:
         logger.info("Upload succeeded.")
 
-def update_maps(self, data: List[Dict], embeddings: Optional[np.ndarray] = None, num_workers: int =
+def update_maps(self, data: List[Dict], embeddings: Optional[np.ndarray] = None, num_workers: int = 2):
     """
     Utility method to update a project's maps by adding the given data.
 
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|