nomic 3.0.31__tar.gz → 3.0.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: nomic
- Version: 3.0.31
+ Version: 3.0.33
  Summary: The official Nomic python client.
  Home-page: https://github.com/nomic-ai/nomic
  Author: nomic.ai
@@ -232,7 +232,6 @@ def embed_image(
      region_name: str,
      model_name="nomic-embed-vision-v1",
  ) -> dict:
-
      embeddings = []

      max_workers = mp.cpu_count()
@@ -84,11 +84,11 @@ class NomicTopicOptions(BaseModel):

      Args:
          build_topic_model: If True, builds a topic model over your dataset's embeddings.
-         community_description_target_field: The dataset field/column that Atlas will use to assign a human-readable description to each topic.
+         topic_label_field: The dataset column (usually the column you embedded) that Atlas will use to assign a human-readable description to each topic.
      """

      build_topic_model: bool = True
-     community_description_target_field: Optional[str] = Field(default=None, alias="topic_label_field")
+     topic_label_field: Optional[str] = Field(default=None, alias="community_description_target_field")
      cluster_method: str = "fast"
      enforce_topic_hierarchy: bool = False

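The swap above leans on pydantic field aliasing: `topic_label_field` becomes the public attribute while the old `community_description_target_field` key survives as an alias. A minimal, standalone sketch of that mechanism (pydantic v2 syntax; `TopicOptions` is a stand-in class, and whether nomic enables population by field name is not visible in this diff):

    # Stand-in model, not the nomic class; populate_by_name=True is an assumption
    # needed so both the field name and the legacy alias are accepted on input.
    from typing import Optional
    from pydantic import BaseModel, ConfigDict, Field

    class TopicOptions(BaseModel):
        model_config = ConfigDict(populate_by_name=True)
        topic_label_field: Optional[str] = Field(default=None, alias="community_description_target_field")

    print(TopicOptions(topic_label_field="text").topic_label_field)                   # text
    print(TopicOptions(community_description_target_field="text").topic_label_field)  # text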
@@ -1,15 +1,10 @@
  import base64
- import concurrent
- import concurrent.futures
- import glob
  import io
  import json
- import os
  from collections import defaultdict
  from datetime import datetime
- from io import BytesIO
  from pathlib import Path
- from typing import Dict, Iterable, List, Optional, Tuple
+ from typing import Dict, Iterable, List, Optional, Tuple, Union

  import numpy as np
  import pandas as pd
@@ -17,12 +12,9 @@ import pyarrow as pa
  import requests
  from loguru import logger
  from pyarrow import compute as pc
- from pyarrow import feather, ipc
+ from pyarrow import feather
  from tqdm import tqdm

- from .settings import EMBEDDING_PAGINATION_LIMIT
- from .utils import download_feather
-

  class AtlasMapDuplicates:
      """
@@ -34,22 +26,57 @@ class AtlasMapDuplicates:
      def __init__(self, projection: "AtlasProjection"):  # type: ignore
          self.projection = projection
          self.id_field = self.projection.dataset.id_field
-         try:
-             duplicate_fields = [
-                 field for field in projection._fetch_tiles().column_names if "_duplicate_class" in field
-             ]
-             cluster_fields = [field for field in projection._fetch_tiles().column_names if "_cluster" in field]
-             assert len(duplicate_fields) > 0, "Duplicate detection has not yet been run on this map."
-             self.duplicate_field = duplicate_fields[0]
-             self.cluster_field = cluster_fields[0]
-             self._tb: pa.Table = projection._fetch_tiles().select(
-                 [self.id_field, self.duplicate_field, self.cluster_field]
+
+         duplicate_columns = [
+             (field, sidecar)
+             for field, sidecar in self.projection._registered_columns
+             if field.startswith("_duplicate_class")
+         ]
+         cluster_columns = [
+             (field, sidecar) for field, sidecar in self.projection._registered_columns if field.startswith("_cluster")
+         ]
+
+         assert len(duplicate_columns) > 0, "Duplicate detection has not yet been run on this map."
+
+         self._duplicate_column = duplicate_columns[0]
+         self._cluster_column = cluster_columns[0]
+         self._tb = None
+
+     def _load_duplicates(self):
+         """
+         Loads duplicates from the feather tree.
+         """
+         tbs = []
+         duplicate_sidecar = self._duplicate_column[1]
+         self.duplicate_field = self._duplicate_column[0].lstrip("_")
+         self.cluster_field = self._cluster_column[0].lstrip("_")
+         logger.info("Loading duplicates")
+         for key in tqdm(self.projection._manifest["key"].to_pylist()):
+             # Use datum id as root table
+             tb = feather.read_table(
+                 self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather"), memory_map=True
              )
-         except pa.lib.ArrowInvalid as e:  # type: ignore
-             raise ValueError("Duplicate detection has not yet been run on this map.")
-         self.duplicate_field = self.duplicate_field.lstrip("_")
-         self.cluster_field = self.cluster_field.lstrip("_")
-         self._tb = self._tb.rename_columns([self.id_field, self.duplicate_field, self.cluster_field])
+             path = self.projection.tile_destination
+
+             if duplicate_sidecar == "":
+                 path = path / Path(key).with_suffix(".feather")
+             else:
+                 path = path / Path(key).with_suffix(f".{duplicate_sidecar}.feather")
+
+             duplicate_tb = feather.read_table(path, memory_map=True)
+             for field in (self._duplicate_column[0], self._cluster_column[0]):
+                 tb = tb.append_column(field, duplicate_tb[field])
+             tbs.append(tb)
+         self._tb = pa.concat_tables(tbs).rename_columns([self.id_field, self.duplicate_field, self.cluster_field])
+
+     def _download_duplicates(self):
+         """
+         Downloads the feather tree for duplicates.
+         """
+         logger.info("Downloading duplicates")
+         self.projection._download_sidecar("datum_id", overwrite=False)
+         assert self._cluster_column[1] == self._duplicate_column[1], "Cluster and duplicate should be in same sidecar"
+         self.projection._download_sidecar(self._duplicate_column[1], overwrite=False)

      @property
      def df(self) -> pd.DataFrame:
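The `_download_duplicates`/`_load_duplicates` pair above follows the per-tile pattern this release adopts throughout: the tile's `datum_id` table acts as the root, sidecar columns are appended onto it, and the per-tile tables are concatenated at the end. A minimal sketch of the assembly step for one tile (the key and sidecar values are hypothetical placeholders, not nomic code):

    from pathlib import Path
    from typing import List

    import pyarrow as pa
    from pyarrow import feather

    def load_tile(tile_destination: Path, key: str, sidecar: str, columns: List[str]) -> pa.Table:
        # The datum_id table is the root every other column hangs off.
        root = feather.read_table(
            tile_destination / Path(key).with_suffix(".datum_id.feather"), memory_map=True
        )
        # An empty sidecar name means the column lives in the tile's base feather file.
        suffix = ".feather" if sidecar == "" else f".{sidecar}.feather"
        car = feather.read_table(tile_destination / Path(key).with_suffix(suffix), memory_map=True)
        for col in columns:
            root = root.append_column(col, car[col])
        return root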
@@ -65,6 +92,10 @@ class AtlasMapDuplicates:
          This table is memmapped from the underlying files and is the most efficient way to
          access duplicate information.
          """
+         if isinstance(self._tb, pa.Table):
+             return self._tb
+         self._download_duplicates()
+         self._load_duplicates()
          return self._tb

      def deletion_candidates(self) -> List[str]:
@@ -97,34 +128,69 @@ class AtlasMapTopics:
          self.id_field = self.projection.dataset.id_field
          self._metadata = None
          self._hierarchy = None
+         self._topic_columns = [
+             column for column in self.projection._registered_columns if column[0].startswith("_topic_depth_")
+         ]
+         assert len(self._topic_columns) > 0, "Topic modeling has not yet been run on this map."
+         self.depth = len(self._topic_columns)
+         self._tb = None
+
+     def _load_topics(self):
+         """
+         Loads topics from the feather tree.
+         """
+         integer_topics = False
+         # pd.Series to match pd typing
+         label_df: Optional[Union[pd.DataFrame, pd.Series]] = None
+         if "int" in self._topic_columns[0][0]:
+             integer_topics = True
+             label_df = self.metadata[["topic_id", "depth", "topic_short_description"]]
+         tbs = []
+         # Should just be one sidecar
+         topic_sidecar = set([sidecar for _, sidecar in self._topic_columns]).pop()
+         logger.info("Loading topics")
+         for key in tqdm(self.projection._manifest["key"].to_pylist()):
+             # Use datum id as root table
+             tb = feather.read_table(
+                 self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather"), memory_map=True
+             )
+             path = self.projection.tile_destination
+             if topic_sidecar == "":
+                 path = path / Path(key).with_suffix(".feather")
+             else:
+                 path = path / Path(key).with_suffix(f".{topic_sidecar}.feather")

-         try:
-             self._tb: pa.Table = projection._fetch_tiles()
-             topic_fields = [column for column in self._tb.column_names if column.startswith("_topic_depth_")]
-             self.depth = len(topic_fields)
-
-             # If using topic ids, fetch topic labels
-             if "int" in topic_fields[0]:
-                 new_topic_fields = []
-                 label_df = self.metadata[["topic_id", "depth", "topic_short_description"]]
-                 for d in range(1, self.depth + 1):
+             topic_tb = feather.read_table(path, memory_map=True)
+             # Do this in depth order
+             for d in range(1, self.depth + 1):
+                 column = f"_topic_depth_{d}"
+                 if integer_topics:
                      column = f"_topic_depth_{d}_int"
-                     topic_ids_to_label = self._tb[column].to_pandas().rename("topic_id")
+                     topic_ids_to_label = topic_tb[column].to_pandas().rename("topic_id")
+                     assert label_df is not None
                      topic_ids_to_label = pd.DataFrame(label_df[label_df["depth"] == d]).merge(
                          topic_ids_to_label, on="topic_id", how="right"
                      )
                      new_column = f"_topic_depth_{d}"
-                     self._tb = self._tb.append_column(
+                     tb = tb.append_column(
                          new_column, pa.Array.from_pandas(topic_ids_to_label["topic_short_description"])
                      )
-                 new_topic_fields.append(new_column)
-             topic_fields = new_topic_fields
+                 else:
+                     tb = tb.append_column(f"_topic_depth_1", topic_tb["_topic_depth_1"])
+             tbs.append(tb)

-             renamed_fields = [f"topic_depth_{i}" for i in range(1, self.depth + 1)]
-             self._tb = self._tb.select([self.id_field] + topic_fields).rename_columns([self.id_field] + renamed_fields)
+         renamed_columns = [self.id_field] + [f"topic_depth_{i}" for i in range(1, self.depth + 1)]
+         self._tb = pa.concat_tables(tbs).rename_columns(renamed_columns)

-         except pa.lib.ArrowInvalid as e:  # type: ignore
-             raise ValueError("Topic modeling has not yet been run on this map.")
+     def _download_topics(self):
+         """
+         Downloads the feather tree for topics.
+         """
+         logger.info("Downloading topics")
+         self.projection._download_sidecar("datum_id", overwrite=False)
+         topic_sidecars = set([sidecar for _, sidecar in self._topic_columns])
+         assert len(topic_sidecars) == 1, "Multiple topic sidecars found."
+         self.projection._download_sidecar(topic_sidecars.pop(), overwrite=False)

      @property
      def df(self) -> pd.DataFrame:
@@ -140,6 +206,10 @@ class AtlasMapTopics:
          This table is memmapped from the underlying files and is the most efficient way to
          access topic information.
          """
+         if isinstance(self._tb, pa.Table):
+             return self._tb
+         self._download_topics()
+         self._load_topics()
          return self._tb

      @property
@@ -263,8 +333,8 @@ class AtlasMapTopics:
              A list of `{topic, count}` dictionaries, sorted from largest count to smallest count.
          """
          data = AtlasMapData(self.projection, fields=[time_field])
-         time_data = data._tb.select([self.id_field, time_field])
-         merged_tb = self._tb.join(time_data, self.id_field, join_type="inner").combine_chunks()
+         time_data = data.tb.select([self.id_field, time_field])
+         merged_tb = self.tb.join(time_data, self.id_field, join_type="inner").combine_chunks()

          del time_data  # free up memory
@@ -379,8 +449,8 @@ class AtlasMapEmbeddings:
      def __init__(self, projection: "AtlasProjection"):  # type: ignore
          self.projection = projection
          self.id_field = self.projection.dataset.id_field
-         self._tb: pa.Table = projection._fetch_tiles().select([self.id_field, "x", "y"])
          self.dataset = projection.dataset
+         self._tb: pa.Table = None
          self._latent = None

      @property
@@ -401,6 +471,31 @@ class AtlasMapEmbeddings:

          Does not include high-dimensional embeddings.
          """
+         if isinstance(self._tb, pa.Table):
+             return self._tb
+
+         self._download_projected()
+
+         logger.info("Loading projected embeddings")
+
+         tbs = []
+         coord_sidecar = self.projection._get_sidecar_from_field("x")
+         for key in tqdm(self.projection._manifest["key"].to_pylist()):
+             # Use datum id as root table
+             tb = feather.read_table(
+                 self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather"), memory_map=True
+             )
+             path = self.projection.tile_destination
+             if coord_sidecar == "":
+                 path = path / Path(key).with_suffix(".feather")
+             else:
+                 path = path / Path(key).with_suffix(f".{coord_sidecar}.feather")
+             carfile = feather.read_table(path, memory_map=True)
+             for col in carfile.column_names:
+                 if col in ["x", "y"]:
+                     tb = tb.append_column(col, carfile[col])
+             tbs.append(tb)
+         self._tb = pa.concat_tables(tbs)
          return self._tb

      @property
@@ -426,53 +521,43 @@ class AtlasMapEmbeddings:
          if self._latent is not None:
              return self._latent

-         root_embedding = self.projection.tile_destination / "0/0/0-0.embeddings.feather"
-         # Not the most complete check, hence the warning below.
-         if not root_embedding.exists():
-             self._download_latent()
+         downloaded_files_in_tile_order = self._download_latent()
+         assert len(downloaded_files_in_tile_order) > 0, "No embeddings found for this map."
          all_embeddings = []
-
-         for path in self.projection._tiles_in_order(coords_only=False):
-             # double with-suffix to remove '.embeddings.feather'
-             files = path.parent.glob(path.with_suffix("").stem + "-*.embeddings.feather")
-             # Should there be more than 10, we need to sort by int values, not string values
-             sortable = sorted(files, key=lambda x: int(x.with_suffix("").stem.split("-")[-1]))
-             if len(sortable) == 0:
-                 raise FileNotFoundError(
-                     "Could not find any embeddings for tile {}".format(path)
-                     + " If you possibly downloaded only some of the embeddings, run '[map_name].download_latent()'."
-                 )
-             for file in sortable:
-                 tb = feather.read_table(file, memory_map=True)
-                 dims = tb["_embeddings"].type.list_size
-                 all_embeddings.append(pa.compute.list_flatten(tb["_embeddings"]).to_numpy().reshape(-1, dims))  # type: ignore
+         logger.info("Loading latent embeddings")
+         for path in tqdm(downloaded_files_in_tile_order):
+             tb = feather.read_table(path, memory_map=True)
+             dims = tb["_embeddings"].type.list_size
+             all_embeddings.append(pa.compute.list_flatten(tb["_embeddings"]).to_numpy().reshape(-1, dims))  # type: ignore
          return np.vstack(all_embeddings)

-     def _download_latent(self):
+     def _download_projected(self) -> List[Path]:
          """
-         Downloads the latent embeddings one file at a time.
+         Downloads the feather tree for projection coordinates.
          """
-         logger.warning("Downloading latent embeddings of all datapoints.")
-         limit = 10_000
-         route = self.projection.dataset.atlas_api_path + "/v1/project/data/get/embedding/paged"
-         last = None
+         logger.info("Downloading projected embeddings")
+         # Note that y coord should be in same sidecar
+         coord_sidecar = self.projection._get_sidecar_from_field("x")
+         self.projection._download_sidecar("datum_id", overwrite=False)
+         return self.projection._download_sidecar(coord_sidecar, overwrite=False)

-         with tqdm(total=self.dataset.total_datums // limit) as pbar:
-             while True:
-                 params = {"projection_id": self.projection.id, "last_file": last, "page_size": limit}
-                 r = requests.post(route, headers=self.projection.dataset.header, json=params)
-                 if r.status_code == 204:
-                     # Download complete!
-                     break
-                 fin = BytesIO(r.content)
-                 tb = feather.read_table(fin, memory_map=True)
+     def _download_latent(self) -> List[Path]:
+         """
+         Downloads the feather tree for embeddings.
+         Returns the paths to the downloaded embeddings.
+         """
+         # TODO: Is size of the embedding files (several hundreds of MBs) going to be a problem here?
+         logger.info("Downloading latent embeddings")
+         embedding_sidecar = None
+         for field, sidecar in self.projection._registered_columns:
+             # NOTE: may be _embeddings or _embedding
+             if field == "_embeddings":
+                 embedding_sidecar = sidecar
+                 break

-                 tilename = tb.schema.metadata[b"tile"].decode("utf-8")
-                 dest = (self.projection.tile_destination / tilename).with_suffix(".embeddings.feather")
-                 dest.parent.mkdir(parents=True, exist_ok=True)
-                 feather.write_feather(tb, dest)
-                 last = tilename
-                 pbar.update(1)
+         if embedding_sidecar is None:
+             raise ValueError("No embeddings found for this map.")
+         return self.projection._download_sidecar(embedding_sidecar, overwrite=False)

      def vector_search(
          self, queries: Optional[np.ndarray] = None, ids: Optional[List[str]] = None, k: int = 5
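The latent loader above recovers a matrix from Arrow's fixed-size-list storage: `list_size` gives the embedding dimension, `list_flatten` yields one long flat array, and a reshape restores the rows. A self-contained toy illustration of that step (not nomic code):

    import pyarrow as pa
    import pyarrow.compute as pc

    tb = pa.table({"_embeddings": pa.array([[0.0, 1.0], [2.0, 3.0]], type=pa.list_(pa.float32(), 2))})
    dims = tb["_embeddings"].type.list_size                            # embedding dimension: 2
    mat = pc.list_flatten(tb["_embeddings"]).to_numpy().reshape(-1, dims)
    print(mat.shape)                                                   # (2, 2)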
@@ -586,12 +671,15 @@ class AtlasMapTags:
          self.projection = projection
          self.dataset = projection.dataset
          self.id_field = self.projection.dataset.id_field
-         # Pre-fetch tiles first upon initialization
-         self.projection._fetch_tiles(overwrite=False)
+         # Pre-fetch datum ids first upon initialization
+         try:
+             self.projection._download_sidecar("datum_id")
+         except Exception:
+             raise ValueError("Failed to fetch datum ids which is required to load tags.")
          self.auto_cleanup = auto_cleanup

      @property
-     def df(self, overwrite: Optional[bool] = False) -> pd.DataFrame:
+     def df(self, overwrite: bool = False) -> pd.DataFrame:
          """
          Pandas DataFrame mapping each data point to its tags.
          """
@@ -602,16 +690,13 @@ class AtlasMapTags:
          for tag in tags:
              self._download_tag(tag["tag_name"], overwrite=overwrite)
          tbs = []
-         all_quads = list(self.projection._tiles_in_order(coords_only=True))
-         for quad in tqdm(all_quads):
-             quad_str = os.path.join(*[str(q) for q in quad])
-             datum_id_filename = quad_str + "." + "datum_id" + ".feather"
-             path = self.projection.tile_destination / Path(datum_id_filename)
-             tb = feather.read_table(path, memory_map=True)
+         logger.info("Loading tags")
+         for key in tqdm(self.projection._manifest["key"].to_pylist()):
+             datum_id_path = self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather")
+             tb = feather.read_table(datum_id_path, memory_map=True)
              for tag in tags:
                  tag_definition_id = tag["tag_definition_id"]
-                 tag_filename = quad_str + "." + f"_tag.{tag_definition_id}" + ".feather"
-                 path = self.projection.tile_destination / Path(tag_filename)
+                 path = self.projection.tile_destination / Path(key).with_suffix(f"._tag.{tag_definition_id}.feather")
                  tag_tb = feather.read_table(path, memory_map=True)
                  bitmask = None
                  if "all_set" in tag_tb.column_names:
@@ -650,7 +735,7 @@ class AtlasMapTags:
                  keep_tags.append(tag)
          return keep_tags

-     def get_datums_in_tag(self, tag_name: str, overwrite: Optional[bool] = False):
+     def get_datums_in_tag(self, tag_name: str, overwrite: bool = False):
          """
          Returns the datum ids in a given tag.

@@ -661,9 +746,9 @@ class AtlasMapTags:
          Returns:
              List of datum ids.
          """
-         ordered_tag_paths = self._download_tag(tag_name, overwrite=overwrite)
+         tag_paths = self._download_tag(tag_name, overwrite=overwrite)
          datum_ids = []
-         for path in ordered_tag_paths:
+         for path in tag_paths:
              tb = feather.read_table(path)
              last_coord = path.name.split(".")[0]
              tile_path = path.with_name(last_coord + ".datum_id.feather")
@@ -690,38 +775,14 @@ class AtlasMapTags:
              return tag
          raise ValueError(f"Tag {name} not found in projection {self.projection.id}.")

-     def _download_tag(self, tag_name: str, overwrite: Optional[bool] = False):
+     def _download_tag(self, tag_name: str, overwrite: bool = False):
          """
          Downloads the feather tree for large sidecar columns.
          """
-         self.projection.tile_destination.mkdir(parents=True, exist_ok=True)
-         root_url = f"{self.dataset.atlas_api_path}/v1/project/{self.dataset.id}/index/projection/{self.projection.id}/quadtree/"
-
+         logger.info("Downloading tags")
          tag = self._get_tag_by_name(tag_name)
          tag_definition_id = tag["tag_definition_id"]
-
-         all_quads = list(self.projection._tiles_in_order(coords_only=True))
-         ordered_tag_paths = []
-         for quad in tqdm(all_quads):
-             quad_str = os.path.join(*[str(q) for q in quad])
-             filename = quad_str + "." + f"_tag.{tag_definition_id}" + ".feather"
-             path = self.projection.tile_destination / Path(filename)
-             download_attempt = 0
-             download_success = False
-             while download_attempt < 3 and not download_success:
-                 download_attempt += 1
-                 if not path.exists() or overwrite:
-                     download_feather(root_url + filename, path, headers=self.dataset.header)
-                 try:
-                     ipc.open_file(path).schema
-                     download_success = True
-                 except pa.ArrowInvalid:
-                     path.unlink(missing_ok=True)
-
-             if not download_success:
-                 raise Exception(f"Failed to download tag {tag_name}.")
-             ordered_tag_paths.append(path)
-         return ordered_tag_paths
+         return self.projection._download_sidecar(f"_tag.{tag_definition_id}", overwrite=overwrite)

      def _remove_outdated_tag_files(self, tag_definition_ids: List[str]):
          """
@@ -732,14 +793,12 @@ class AtlasMapTags:
              tag_definition_ids: A list of tag definition ids to keep.
          """
          # NOTE: This currently only gets triggered on `df` property
-         all_quads = list(self.projection._tiles_in_order(coords_only=True))
-         for quad in tqdm(all_quads):
-             quad_str = os.path.join(*[str(q) for q in quad])
-             tile = self.projection.tile_destination / Path(quad_str)
+         for key in self.projection._manifest["key"].to_pylist():
+             tile = self.projection.tile_destination / Path(key)
              tile_dir = tile.parent
              if tile_dir.exists():
-                 tagged_files = tile_dir.glob("*_tag*")
-                 for file in tagged_files:
+                 tag_files = tile_dir.glob("*_tag*")
+                 for file in tag_files:
                      tag_definition_id = file.name.split(".")[-2]
                      if tag_definition_id in tag_definition_ids:
                          try:
@@ -791,81 +850,69 @@ class AtlasMapData:
          self.projection = projection
          self.dataset = projection.dataset
          self.id_field = self.projection.dataset.id_field
-         self.fields = fields
-         try:
-             # Run fetch_tiles first to guarantee existence of quad feather files
-             self._basic_data: pa.Table = self.projection._fetch_tiles()
-             sidecars = self._download_data(fields=fields)
-             self._tb = self._read_prefetched_tiles_with_sidecars(sidecars)
+         if fields is None:
+             # TODO: fall back on something more reliable here
+             self.fields = self.dataset.dataset_fields
+         else:
+             for field in fields:
+                 assert field in self.dataset.dataset_fields, f"Field {field} not found in dataset fields."
+             self.fields = fields
+         self._tb = None

-         except pa.lib.ArrowInvalid as e:  # type: ignore
-             raise ValueError("Failed to fetch tiles for this map")
+     def _load_data(self, data_columns: List[Tuple[str, str]]):
+         """
+         Loads data from a list of data columns (field and sidecar name tuples).

-     def _read_prefetched_tiles_with_sidecars(self, additional_sidecars):
+         Args:
+             data_columns: A list of tuples containing field name and sidecar name.
+         """
          tbs = []
-         root = feather.read_table(self.projection.tile_destination / Path("0/0/0.feather"))  # type: ignore
-         try:
-             small_sidecars = set([v for k, v in json.loads(root.schema.metadata[b"sidecars"]).items()])
-         except KeyError:
-             small_sidecars = set([])
-         for path in self.projection._tiles_in_order():
-             tb = pa.feather.read_table(path).drop(["_id", "ix", "x", "y"])  # type: ignore
-             for col in tb.column_names:
-                 if col[0] == "_":
-                     tb = tb.drop([col])
-             for sidecar_file in small_sidecars:
-                 carfile = pa.feather.read_table(path.parent / f"{path.stem}.{sidecar_file}.feather", memory_map=True)  # type: ignore
-                 for col in carfile.column_names:
-                     tb = tb.append_column(col, carfile[col])
-             for big_sidecar in additional_sidecars:
-                 fname = (
-                     base64.urlsafe_b64encode(big_sidecar.encode("utf-8")).decode("utf-8")
-                     if big_sidecar != "datum_id"
-                     else big_sidecar
-                 )
-                 carfile = pa.feather.read_table(path.parent / f"{path.stem}.{fname}.feather", memory_map=True)  # type: ignore
+
+         sidecars_to_load = set([sidecar for _, sidecar in data_columns if sidecar != "datum_id"])
+         logger.info("Loading data")
+         for key in tqdm(self.projection._manifest["key"].to_pylist()):
+             # Use datum id as root table
+             tb = feather.read_table(
+                 self.projection.tile_destination / Path(key).with_suffix(".datum_id.feather"), memory_map=True
+             )
+             for sidecar in sidecars_to_load:
+                 path = self.projection.tile_destination
+                 if sidecar == "":
+                     path = path / Path(key).with_suffix(".feather")
+                 else:
+                     path = path / Path(key).with_suffix(f".{sidecar}.feather")
+                 carfile = feather.read_table(path, memory_map=True)
                  for col in carfile.column_names:
-                     tb = tb.append_column(col, carfile[col])
+                     if col in self.fields:
+                         tb = tb.append_column(col, carfile[col])
              tbs.append(tb)
-         self._tb = pa.concat_tables(tbs)

-         return self._tb
+         self._tb = pa.concat_tables(tbs)

-     def _download_data(self, fields=None):
+     def _download_data(self, fields: Optional[List[str]] = None) -> List[Tuple[str, str]]:
          """
-         Downloads the feather tree for large sidecar columns.
+         Downloads the feather tree for user uploaded data.
+
+         fields:
+             A list of fields to download. If None, downloads all fields.
+
+         Returns:
+             List of downloaded columns
          """
+         logger.info("Downloading data")
          self.projection.tile_destination.mkdir(parents=True, exist_ok=True)
-         root = f"{self.dataset.atlas_api_path}/v1/project/{self.dataset.id}/index/projection/{self.projection.id}/quadtree/"
-
-         all_quads = list(self.projection._tiles_in_order(coords_only=True))
-         sidecars = fields
-         registered_sidecars = self.projection._registered_sidecars()
-         if sidecars is None:
-             sidecars = [
-                 field
-                 for field in self.dataset.dataset_fields
-                 if field not in self._basic_data.column_names and field != "_embeddings"
-             ]
-         else:
-             for field in sidecars:
-                 assert field in self.dataset.dataset_fields, f"Field {field} not found in dataset fields."
-         encoded_sidecars = [base64.urlsafe_b64encode(sidecar.encode("utf-8")).decode("utf-8") for sidecar in sidecars]
-         if any(sidecar == "datum_id" for (field, sidecar) in registered_sidecars):
-             sidecars.append("datum_id")
-             encoded_sidecars.append("datum_id")
-
-         for quad in tqdm(all_quads):
-             for encoded_colname in encoded_sidecars:
-                 quad_str = os.path.join(*[str(q) for q in quad])
-                 filename = quad_str + "." + encoded_colname + ".feather"
-                 path = self.projection.tile_destination / Path(filename)

-                 if not os.path.exists(path):
-                     # WARNING: Potentially large data request here
-                     download_feather(root + filename, path, headers=self.dataset.header)
+         # Download specified or all sidecar fields + always download datum_id
+         data_columns_to_load = [
+             (str(field), str(sidecar))
+             for field, sidecar in self.projection._registered_columns
+             if field[0] != "_" and ((field in fields) or sidecar == "datum_id")
+         ]

-         return sidecars
+         # TODO: less confusing progress bar
+         for sidecar in set([sidecar for _, sidecar in data_columns_to_load]):
+             self.projection._download_sidecar(sidecar)
+         return data_columns_to_load

      @property
      def df(self) -> pd.DataFrame:
@@ -873,7 +920,8 @@ class AtlasMapData:
          A pandas DataFrame associating each datapoint on your map to their metadata.
          Converting to pandas DataFrame may materialize a large amount of data into memory.
          """
-         return self._tb.to_pandas()
+         logger.warning("Converting to pandas dataframe. This may materialize a large amount of data into memory.")
+         return self.tb.to_pandas()

      @property
      def tb(self) -> pa.Table:
@@ -882,4 +930,9 @@ class AtlasMapData:
          This table is memmapped from the underlying files and is the most efficient way to
          access metadata information.
          """
+         if isinstance(self._tb, pa.Table):
+             return self._tb
+
+         columns = self._download_data(fields=self.fields)
+         self._load_data(columns)
          return self._tb
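The selection rule in `_download_data` above keeps user columns (field names not starting with an underscore) that were requested, plus anything registered under the `datum_id` sidecar. A quick illustration of the filter on hypothetical registrations:

    # Hypothetical (field, sidecar) registrations; only the filter expression mirrors the diff.
    registered = [("text", ""), ("_embeddings", "embeddings"), ("id_", "datum_id")]
    fields = ["text"]

    selected = [
        (field, sidecar)
        for field, sidecar in registered
        if field[0] != "_" and ((field in fields) or sidecar == "datum_id")
    ]
    print(selected)  # [('text', ''), ('id_', 'datum_id')]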
@@ -4,28 +4,21 @@ import concurrent.futures
  import io
  import json
  import os
- import pickle
  import time
- import uuid
- from collections import defaultdict
  from contextlib import contextmanager
- from datetime import date, datetime
+ from datetime import datetime
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union, overload
+ from typing import Dict, List, Optional, Tuple, Union

  import numpy as np
- import pandas as pd
  import pyarrow as pa
  import requests
  from loguru import logger
  from pandas import DataFrame
  from pyarrow import compute as pc
  from pyarrow import feather, ipc
- from pydantic import BaseModel, Field
  from tqdm import tqdm

- import nomic
-
  from .cli import refresh_bearer_token, validate_api_http_response
  from .data_inference import (
      NomicDuplicatesOptions,
@@ -36,7 +29,7 @@ from .data_inference import (
  )
  from .data_operations import AtlasMapData, AtlasMapDuplicates, AtlasMapEmbeddings, AtlasMapTags, AtlasMapTopics
  from .settings import *
- from .utils import assert_valid_project_id, get_object_size_in_bytes
+ from .utils import assert_valid_project_id, download_feather


  class AtlasUser:
@@ -433,6 +426,8 @@ class AtlasProjection:
          self._tile_data = None
          self._data = None
          self._schema = None
+         self._manifest_tb: Optional[pa.Table] = None
+         self._columns: List[Tuple[str, str]] = []

      @property
      def map_link(self):
@@ -590,147 +585,77 @@ class AtlasProjection:
          self._schema = ipc.read_schema(io.BytesIO(content))
          return self._schema

-     def _registered_sidecars(self) -> List[Tuple[str, str]]:
+     @property
+     def _registered_columns(self) -> List[Tuple[str, str]]:
          "Returns [(field_name, sidecar_name), ...]"
-         sidecars = []
+         if self._columns:
+             return self._columns
+         self._columns = []
          for field in self.schema:
              sidecar_name = json.loads(field.metadata.get(b"sidecar_name", b'""'))
-             if sidecar_name:
-                 sidecars.append((field.name, sidecar_name))
-         return sidecars
+             if sidecar_name is not None:
+                 self._columns.append((field.name, sidecar_name))
+         return self._columns

-     def _fetch_tiles(self, overwrite: bool = False):
+     @property
+     def _manifest(self) -> pa.Table:
          """
-         Downloads all web data for the projection to the specified directory and returns it as a memmapped arrow table.
-
-         Args:
-             overwrite: If True then overwrite web tile files.
-
-         Returns:
-             An Arrow table containing information for all data points in the index.
+         Returns the tile manifest for the projection.
+         Tile manifest is in quadtree order. All quadtree operations should
+         depend on tile manifest to ensure consistency.
          """
-         if self._tile_data is not None:
-             return self._tile_data
-         self._download_large_feather(overwrite=overwrite)
-         tbs = []
-         root = feather.read_table(self.tile_destination / "0/0/0.feather", memory_map=True)
-         try:
-             sidecars = set([v for k, v in json.loads(root.schema.metadata[b"sidecars"]).items()])
-         except KeyError:
-             sidecars = set([])
-         sidecars |= set(sidecar_name for (_, sidecar_name) in self._registered_sidecars())
-         for path in self._tiles_in_order():
-             tb = pa.feather.read_table(path, memory_map=True)  # type: ignore
-             for sidecar_file in sidecars:
-                 carfile = pa.feather.read_table(  # type: ignore
-                     path.parent / f"{path.stem}.{sidecar_file}.feather", memory_map=True
-                 )
-                 for col in carfile.column_names:
-                     tb = tb.append_column(col, carfile[col])
-             tbs.append(tb)
-         self._tile_data = pa.concat_tables(tbs)
+         if self._manifest_tb is not None:
+             return self._manifest_tb

-         return self._tile_data
+         manifest_path = self.tile_destination / "manifest.feather"
+         manifest_url = (
+             self.dataset.atlas_api_path
+             + f"/v1/project/{self.dataset.id}/index/projection/{self.id}/quadtree/manifest.feather"
+         )

-     @overload
-     def _tiles_in_order(self, *, coords_only: Literal[False] = ...) -> Iterator[Path]: ...
-     @overload
-     def _tiles_in_order(self, *, coords_only: Literal[True]) -> Iterator[Tuple[int, int, int]]: ...
-     @overload
-     def _tiles_in_order(self, *, coords_only: bool) -> Iterator[Any]: ...
+         download_feather(manifest_url, manifest_path, headers=self.dataset.header, overwrite=False)
+         self._manifest_tb = feather.read_table(manifest_path, memory_map=False)
+         return self._manifest_tb

-     def _tiles_in_order(self, *, coords_only: bool = False) -> Iterator[Any]:
+     def _get_sidecar_from_field(self, field: str) -> str:
          """
-         Returns:
-             A list of all tiles in the projection in a fixed order so that all
-             datasets are guaranteed to be aligned.
-         """
-
-         def children(z, x, y):
-             # This is the definition of a quadtree.
-             return [
-                 (z + 1, x * 2, y * 2),
-                 (z + 1, x * 2 + 1, y * 2),
-                 (z + 1, x * 2, y * 2 + 1),
-                 (z + 1, x * 2 + 1, y * 2 + 1),
-             ]
-
-         # start with the root
-         paths = [(0, 0, 0)]
-         # Pop off the front, extend the back (breadth first traversal)
-         while len(paths) > 0:
-             z, x, y = paths.pop(0)
-             path = Path(self.tile_destination, str(z), str(x), str(y)).with_suffix(".feather")
-             if path.exists():
-                 if coords_only:
-                     yield (z, x, y)
-                 else:
-                     yield path
-                 paths.extend(children(z, x, y))  # pyright: ignore
+         Returns the sidecar name for a given field.

-     @property
-     def tile_destination(self):
-         return Path("~/.nomic/cache", self.id).expanduser()
+         Args:
+             field: the name of the field
+         """
+         for f, sidecar in self._registered_columns:
+             if field == f:
+                 return sidecar
+         raise ValueError(f"Field {field} not found in registered columns.")

-     def _download_large_feather(self, dest: Optional[Union[str, Path]] = None, overwrite: bool = True):
+     def _download_sidecar(self, sidecar_name, overwrite: bool = False) -> List[Path]:
          """
-         Downloads the feather tree.
+         Downloads sidecar files from the quadtree
          Args:
+             sidecar_name: the name of the sidecar file
              overwrite: if True then overwrite existing feather files.

          Returns:
-             A list containing all quadtiles downloads.
-         """
-         # TODO: change overwrite default to False once updating projection is removed.
-         quads = [f"0/0/0"]
-         self.tile_destination.mkdir(parents=True, exist_ok=True)
-         root = f"{self.dataset.atlas_api_path}/v1/project/{self.dataset.id}/index/projection/{self.id}/quadtree/"
-         all_quads = []
-         sidecars = None
-         registered_sidecars = set(sidecar_name for (_, sidecar_name) in self._registered_sidecars())
-         while len(quads) > 0:
-             rawquad = quads.pop(0)
-             quad = rawquad + ".feather"
-             all_quads.append(quad)
-             path = self.tile_destination / quad
-
-             download_attempt = 0
-             download_success = False
-             schema = None
-             while download_attempt < 3 and not download_success:
-                 download_attempt += 1
-                 if not path.exists() or overwrite:
-                     data = requests.get(root + quad, headers=self.dataset.header)
-                     readable = io.BytesIO(data.content)
-                     readable.seek(0)
-                     tb = feather.read_table(readable, memory_map=True)
-                     path.parent.mkdir(parents=True, exist_ok=True)
-                     feather.write_feather(tb, path)
-                 try:
-                     schema = ipc.open_file(path).schema
-                     download_success = True
-                 except pa.ArrowInvalid:
-                     path.unlink(missing_ok=True)
-
-             if not download_success or schema is None:
-                 raise Exception(f"Failed to download tiles. Aborting...")
-
-             if sidecars is None and b"sidecars" in schema.metadata:
-                 # Grab just the filenames
-                 sidecars = set([v for k, v in json.loads(schema.metadata.get(b"sidecars")).items()])
-             elif sidecars is None:
-                 sidecars = set()
-             if not "." in rawquad:
-                 for sidecar in sidecars | registered_sidecars:
-                     # The sidecar loses the feather suffix because it's supposed to be raw.
-                     quads.append(quad.replace(".feather", f".{sidecar}"))
-             if not schema.metadata or b"children" not in schema.metadata:
-                 # Sidecars don't have children.
-                 continue
-             kids = schema.metadata.get(b"children")
-             children = json.loads(kids)
-             quads.extend(children)
-         return all_quads
+             List of downloaded feather files.
+         """
+         downloaded_files = []
+         sidecar_suffix = "feather"
+         if sidecar_name != "":
+             sidecar_suffix = f"{sidecar_name}.feather"
+         for key in tqdm(self._manifest["key"].to_pylist()):
+             sidecar_path = self.tile_destination / f"{key}.{sidecar_suffix}"
+             sidecar_url = (
+                 self.dataset.atlas_api_path
+                 + f"/v1/project/{self.dataset.id}/index/projection/{self.id}/quadtree/{key}.{sidecar_suffix}"
+             )
+             download_feather(sidecar_url, sidecar_path, headers=self.dataset.header, overwrite=overwrite)
+             downloaded_files.append(sidecar_path)
+         return downloaded_files
+
+     @property
+     def tile_destination(self):
+         return Path("~/.nomic/cache", self.id).expanduser()

      @property
      def datum_id_field(self):
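With the manifest in place, quadtree traversal reduces to iterating manifest keys and deriving file names, replacing the old breadth-first discovery of children tiles. A small sketch of the path scheme `_download_sidecar` uses (the key and sidecar names are placeholders):

    # Sketch of the tile-path scheme above; "0/0/0" and "datum_id" are placeholders.
    from pathlib import Path

    def sidecar_path(tile_destination: Path, key: str, sidecar_name: str) -> Path:
        # The base tile has no sidecar infix; named sidecars get ".<name>.feather".
        suffix = "feather" if sidecar_name == "" else f"{sidecar_name}.feather"
        return tile_destination / f"{key}.{suffix}"

    print(sidecar_path(Path("~/.nomic/cache/projection-id").expanduser(), "0/0/0", "datum_id"))
    # e.g. /home/user/.nomic/cache/projection-id/0/0/0.datum_id.feather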
@@ -1160,7 +1085,7 @@ class AtlasDataset(AtlasClass):

          build_template = {}
          if self.modality == "embedding":
-             if topic_model.community_description_target_field is None:
+             if topic_model.topic_label_field is None:
                  logger.warning(
                      "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
                  )
@@ -1188,7 +1113,7 @@ class AtlasDataset(AtlasClass):
                  "topic_model_hyperparameters": json.dumps(
                      {
                          "build_topic_model": topic_model.build_topic_model,
-                         "community_description_target_field": topic_model.community_description_target_field,
+                         "community_description_target_field": topic_model.topic_label_field,  # TODO change key to topic_label_field post v0.0.85
                          "cluster_method": topic_model.cluster_method,
                          "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
                      }
@@ -1253,7 +1178,7 @@ class AtlasDataset(AtlasClass):
                  "topic_model_hyperparameters": json.dumps(
                      {
                          "build_topic_model": topic_model.build_topic_model,
-                         "community_description_target_field": indexed_field,
+                         "community_description_target_field": indexed_field,  # TODO change key to topic_label_field post v0.0.85
                          "cluster_method": topic_model.build_topic_model,
                          "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
                      }
@@ -4,11 +4,12 @@ import random
  import sys
  from io import BytesIO
  from pathlib import Path
- from typing import Optional
+ from typing import Optional, Union
  from uuid import UUID

  import pyarrow as pa
  import requests
+ from pyarrow import ipc

  nouns = [
      "newton",
@@ -241,10 +242,49 @@ def get_object_size_in_bytes(obj):

  # Helpful function for downloading feather files
  # Best for small feather files
- def download_feather(url: str, path: Path, headers: Optional[dict] = None):
-     data = requests.get(url, headers=headers)
-     readable = BytesIO(data.content)
-     readable.seek(0)
-     tb = pa.feather.read_table(readable, memory_map=True)  # type: ignore
-     path.parent.mkdir(parents=True, exist_ok=True)
-     pa.feather.write_feather(tb, path)  # type: ignore
+ def download_feather(
+     url: Union[str, Path], path: Path, headers: Optional[dict] = None, num_attempts=1, overwrite=False
+ ) -> pa.Schema:
+     """
+     Download a feather file from a URL to a local path.
+     Returns the schema of the feather file if successful.
+
+     Parameters:
+         url (str): URL to download feather file from.
+         path (Path): Local path to save feather file to.
+         headers (dict): Optional headers to include in request.
+         num_attempts (int): Number of download attempts before raising an error.
+         overwrite (bool): Whether to overwrite existing file.
+     Returns:
+         Feather schema.
+     """
+     assert num_attempts > 0, "Num attempts must be greater than 0"
+     download_attempt = 0
+     download_success = False
+     schema = None
+     while download_attempt < num_attempts and not download_success:
+         download_attempt += 1
+         if not path.exists() or overwrite:
+             # Attempt download
+             try:
+                 data = requests.get(str(url), headers=headers)
+                 readable = BytesIO(data.content)
+                 readable.seek(0)
+                 tb = pa.feather.read_table(readable, memory_map=False)  # type: ignore
+                 schema = tb.schema
+                 path.parent.mkdir(parents=True, exist_ok=True)
+                 pa.feather.write_feather(tb, path)  # type: ignore
+                 download_success = True
+             except pa.ArrowInvalid:
+                 # failed, try again
+                 path.unlink(missing_ok=True)
+         else:
+             # Load existing file
+             try:
+                 schema = ipc.open_file(path).schema
+                 download_success = True
+             except pa.ArrowInvalid:
+                 path.unlink(missing_ok=True)
+     if not download_success or schema is None:
+         raise ValueError(f"Failed to download feather file from {url} after {num_attempts} attempts.")
+     return schema
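Going by the new signature above, a hedged usage sketch (the URL and cache path are hypothetical; `num_attempts` and `overwrite` are the two new knobs, and the module path is assumed from the `.utils` imports elsewhere in this diff):

    from pathlib import Path

    from nomic.utils import download_feather  # module path assumed

    schema = download_feather(
        "https://api.example.com/quadtree/0/0/0.datum_id.feather",  # placeholder URL
        Path("~/.nomic/cache/example/0/0/0.datum_id.feather").expanduser(),
        num_attempts=3,   # retry corrupted downloads up to three times
        overwrite=False,  # a valid cached file is reused instead of re-downloaded
    )
    print(schema.names)  # column names from the feather schema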
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: nomic
- Version: 3.0.31
+ Version: 3.0.33
  Summary: The official Nomic python client.
  Home-page: https://github.com/nomic-ai/nomic
  Author: nomic.ai
@@ -20,7 +20,7 @@ sagemaker

  [dev]
  nomic[all]
- black
+ black==24.3.0
  coverage
  pylint
  pytest
@@ -8,7 +8,7 @@ description = "The official Nomic python client."

  setup(
      name="nomic",
-     version="3.0.31",
+     version="3.0.33",
      url="https://github.com/nomic-ai/nomic",
      description=description,
      long_description=description,
@@ -43,7 +43,7 @@ setup(
      ],
      "dev": [
          "nomic[all]",
-         "black",
+         "black==24.3.0",
          "coverage",
          "pylint",
          "pytest",