nomic 3.0.31.tar.gz → 3.0.32.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nomic
-Version: 3.0.31
+Version: 3.0.32
 Summary: The official Nomic python client.
 Home-page: https://github.com/nomic-ai/nomic
 Author: nomic.ai
@@ -232,7 +232,6 @@ def embed_image(
     region_name: str,
     model_name="nomic-embed-vision-v1",
 ) -> dict:
-
     embeddings = []

     max_workers = mp.cpu_count()
@@ -84,11 +84,11 @@ class NomicTopicOptions(BaseModel):

     Args:
         build_topic_model: If True, builds a topic model over your dataset's embeddings.
-        community_description_target_field: The dataset field/column that Atlas will use to assign a human-readable description to each topic.
+        topic_label_field: The dataset column (usually the column you embedded) that Atlas will use to assign a human-readable description to each topic.
     """

     build_topic_model: bool = True
-    community_description_target_field: Optional[str] = Field(default=None, alias="topic_label_field")
+    topic_label_field: Optional[str] = Field(default=None, alias="community_description_target_field")
     cluster_method: str = "fast"
     enforce_topic_hierarchy: bool = False

@@ -1,13 +1,9 @@
 import base64
-import concurrent
-import concurrent.futures
-import glob
 import io
 import json
 import os
 from collections import defaultdict
 from datetime import datetime
-from io import BytesIO
 from pathlib import Path
 from typing import Dict, Iterable, List, Optional, Tuple

@@ -17,7 +13,7 @@ import pyarrow as pa
 import requests
 from loguru import logger
 from pyarrow import compute as pc
-from pyarrow import feather, ipc
+from pyarrow import feather
 from tqdm import tqdm

 from .settings import EMBEDDING_PAGINATION_LIMIT
@@ -99,6 +95,7 @@ class AtlasMapTopics:
         self._hierarchy = None

         try:
+            logger.info("Downloading topics")
             self._tb: pa.Table = projection._fetch_tiles()
             topic_fields = [column for column in self._tb.column_names if column.startswith("_topic_depth_")]
             self.depth = len(topic_fields)
@@ -426,53 +423,41 @@ class AtlasMapEmbeddings:
         if self._latent is not None:
             return self._latent

-        root_embedding = self.projection.tile_destination / "0/0/0-0.embeddings.feather"
-        # Not the most complete check, hence the warning below.
-        if not root_embedding.exists():
-            self._download_latent()
+        downloaded_files_in_tile_order = self._download_latent()
+        assert len(downloaded_files_in_tile_order) > 0, "No embeddings found for this map."
         all_embeddings = []

-        for path in self.projection._tiles_in_order(coords_only=False):
-            # double with-suffix to remove '.embeddings.feather'
-            files = path.parent.glob(path.with_suffix("").stem + "-*.embeddings.feather")
+        for path in downloaded_files_in_tile_order:
             # Should there be more than 10, we need to sort by int values, not string values
-            sortable = sorted(files, key=lambda x: int(x.with_suffix("").stem.split("-")[-1]))
-            if len(sortable) == 0:
-                raise FileNotFoundError(
-                    "Could not find any embeddings for tile {}".format(path)
-                    + " If you possibly downloaded only some of the embeddings, run '[map_name].download_latent()'."
-                )
-            for file in sortable:
-                tb = feather.read_table(file, memory_map=True)
-                dims = tb["_embeddings"].type.list_size
-                all_embeddings.append(pa.compute.list_flatten(tb["_embeddings"]).to_numpy().reshape(-1, dims))  # type: ignore
+            tb = feather.read_table(path, memory_map=True)
+            dims = tb["_embeddings"].type.list_size
+            all_embeddings.append(pa.compute.list_flatten(tb["_embeddings"]).to_numpy().reshape(-1, dims))  # type: ignore
         return np.vstack(all_embeddings)

-    def _download_latent(self):
+    def _download_latent(self) -> List[Path]:
         """
-        Downloads the latent embeddings one file at a time.
+        Downloads the feather tree for embeddings.
+        Returns the paths to the downloaded embedding files.
         """
-        logger.warning("Downloading latent embeddings of all datapoints.")
-        limit = 10_000
-        route = self.projection.dataset.atlas_api_path + "/v1/project/data/get/embedding/paged"
-        last = None
+        # TODO: Is size of the embedding files (several hundreds of MBs) going to be a problem here?
+        self.projection.tile_destination.mkdir(parents=True, exist_ok=True)
+        root_url = Path(
+            f"{self.dataset.atlas_api_path}/v1/project/{self.dataset.id}/index/projection/{self.projection.id}/quadtree/"
+        )

-        with tqdm(total=self.dataset.total_datums // limit) as pbar:
-            while True:
-                params = {"projection_id": self.projection.id, "last_file": last, "page_size": limit}
-                r = requests.post(route, headers=self.projection.dataset.header, json=params)
-                if r.status_code == 204:
-                    # Download complete!
-                    break
-                fin = BytesIO(r.content)
-                tb = feather.read_table(fin, memory_map=True)
+        registered_sidecar_names = [sidecar[1] for sidecar in self.projection._registered_sidecars()]
+        assert "embeddings" in registered_sidecar_names, "Embeddings not found in sidecars."

-                tilename = tb.schema.metadata[b"tile"].decode("utf-8")
-                dest = (self.projection.tile_destination / tilename).with_suffix(".embeddings.feather")
-                dest.parent.mkdir(parents=True, exist_ok=True)
-                feather.write_feather(tb, dest)
-                last = tilename
-                pbar.update(1)
+        downloaded_files_in_tile_order = []
+        logger.info("Downloading latent embeddings...")
+        all_quads = list(self.projection._tiles_in_order())
+        for quad in tqdm(all_quads):
+            path = quad.with_suffix(".embeddings.feather")
+            # WARNING: Potentially large data request here
+            quadtree_loc = Path(*path.parts[-3:])
+            download_feather(root_url / quadtree_loc, path, headers=self.dataset.header, overwrite=False)
+            downloaded_files_in_tile_order.append(path)
+        return downloaded_files_in_tile_order

     def vector_search(
         self, queries: Optional[np.ndarray] = None, ids: Optional[List[str]] = None, k: int = 5
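
For reference, the list_flatten/reshape idiom above turns an Arrow FixedSizeList column of embeddings into a 2-D numpy matrix. A self-contained illustration with toy data (not Atlas output):

    import pyarrow as pa
    import pyarrow.compute as pc

    dims = 4
    # Two 4-dimensional embeddings stored as a FixedSizeList column.
    col = pa.array([[0.0, 1.0, 2.0, 3.0], [4.0, 5.0, 6.0, 7.0]], type=pa.list_(pa.float32(), dims))
    tb = pa.table({"_embeddings": col})

    # Flatten to one long array, then reshape to (num_rows, dims).
    flat = pc.list_flatten(tb["_embeddings"]).to_numpy()
    matrix = flat.reshape(-1, tb["_embeddings"].type.list_size)
    assert matrix.shape == (2, dims)
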
@@ -694,6 +679,7 @@ class AtlasMapTags:
         """
         Downloads the feather tree for large sidecar columns.
         """
+        logger.info("Downloading tags")
         self.projection.tile_destination.mkdir(parents=True, exist_ok=True)
         root_url = f"{self.dataset.atlas_api_path}/v1/project/{self.dataset.id}/index/projection/{self.projection.id}/quadtree/"

@@ -706,20 +692,7 @@ class AtlasMapTags:
             quad_str = os.path.join(*[str(q) for q in quad])
             filename = quad_str + "." + f"_tag.{tag_definition_id}" + ".feather"
             path = self.projection.tile_destination / Path(filename)
-            download_attempt = 0
-            download_success = False
-            while download_attempt < 3 and not download_success:
-                download_attempt += 1
-                if not path.exists() or overwrite:
-                    download_feather(root_url + filename, path, headers=self.dataset.header)
-                try:
-                    ipc.open_file(path).schema
-                    download_success = True
-                except pa.ArrowInvalid:
-                    path.unlink(missing_ok=True)
-
-            if not download_success:
-                raise Exception(f"Failed to download tag {tag_name}.")
+            download_feather(root_url + filename, path, headers=self.dataset.header, overwrite=True)
             ordered_tag_paths.append(path)
         return ordered_tag_paths

@@ -791,7 +764,6 @@ class AtlasMapData:
         self.projection = projection
         self.dataset = projection.dataset
         self.id_field = self.projection.dataset.id_field
-        self.fields = fields
         try:
             # Run fetch_tiles first to guarantee existence of quad feather files
             self._basic_data: pa.Table = self.projection._fetch_tiles()
@@ -801,29 +773,15 @@ class AtlasMapData:
         except pa.lib.ArrowInvalid as e:  # type: ignore
             raise ValueError("Failed to fetch tiles for this map")

-    def _read_prefetched_tiles_with_sidecars(self, additional_sidecars):
+    def _read_prefetched_tiles_with_sidecars(self, sidecars):
         tbs = []
-        root = feather.read_table(self.projection.tile_destination / Path("0/0/0.feather"))  # type: ignore
-        try:
-            small_sidecars = set([v for k, v in json.loads(root.schema.metadata[b"sidecars"]).items()])
-        except KeyError:
-            small_sidecars = set([])
         for path in self.projection._tiles_in_order():
             tb = pa.feather.read_table(path).drop(["_id", "ix", "x", "y"])  # type: ignore
             for col in tb.column_names:
                 if col[0] == "_":
                     tb = tb.drop([col])
-            for sidecar_file in small_sidecars:
-                carfile = pa.feather.read_table(path.parent / f"{path.stem}.{sidecar_file}.feather", memory_map=True)  # type: ignore
-                for col in carfile.column_names:
-                    tb = tb.append_column(col, carfile[col])
-            for big_sidecar in additional_sidecars:
-                fname = (
-                    base64.urlsafe_b64encode(big_sidecar.encode("utf-8")).decode("utf-8")
-                    if big_sidecar != "datum_id"
-                    else big_sidecar
-                )
-                carfile = pa.feather.read_table(path.parent / f"{path.stem}.{fname}.feather", memory_map=True)  # type: ignore
+            for _, sidecar in sidecars:
+                carfile = pa.feather.read_table(path.parent / f"{path.stem}.{sidecar}.feather", memory_map=True)  # type: ignore
                 for col in carfile.column_names:
                     tb = tb.append_column(col, carfile[col])
             tbs.append(tb)
@@ -835,36 +793,31 @@ class AtlasMapData:
         """
         Downloads the feather tree for large sidecar columns.
         """
+        logger.info("Downloading dataset")
         self.projection.tile_destination.mkdir(parents=True, exist_ok=True)
         root = f"{self.dataset.atlas_api_path}/v1/project/{self.dataset.id}/index/projection/{self.projection.id}/quadtree/"

         all_quads = list(self.projection._tiles_in_order(coords_only=True))
-        sidecars = fields
-        registered_sidecars = self.projection._registered_sidecars()
-        if sidecars is None:
-            sidecars = [
-                field
-                for field in self.dataset.dataset_fields
-                if field not in self._basic_data.column_names and field != "_embeddings"
-            ]
+        sidecars = None
+        if fields is None:
+            fields = self.dataset.dataset_fields
         else:
-            for field in sidecars:
+            for field in fields:
                 assert field in self.dataset.dataset_fields, f"Field {field} not found in dataset fields."
-        encoded_sidecars = [base64.urlsafe_b64encode(sidecar.encode("utf-8")).decode("utf-8") for sidecar in sidecars]
-        if any(sidecar == "datum_id" for (field, sidecar) in registered_sidecars):
-            sidecars.append("datum_id")
-            encoded_sidecars.append("datum_id")
+
+        sidecars = [
+            (field, sidecar)
+            for field, sidecar in self.projection._registered_sidecars()
+            if field[0] != "_" and field in fields
+        ]

         for quad in tqdm(all_quads):
-            for encoded_colname in encoded_sidecars:
+            for field, encoded_colname in sidecars:
                 quad_str = os.path.join(*[str(q) for q in quad])
                 filename = quad_str + "." + encoded_colname + ".feather"
                 path = self.projection.tile_destination / Path(filename)
-
-                if not os.path.exists(path):
-                    # WARNING: Potentially large data request here
-                    download_feather(root + filename, path, headers=self.dataset.header)
-
+                # WARNING: Potentially large data request here
+                download_feather(root + filename, path, headers=self.dataset.header, overwrite=False)
         return sidecars

     @property
@@ -36,7 +36,7 @@ from .data_inference import (
 )
 from .data_operations import AtlasMapData, AtlasMapDuplicates, AtlasMapEmbeddings, AtlasMapTags, AtlasMapTopics
 from .settings import *
-from .utils import assert_valid_project_id, get_object_size_in_bytes
+from .utils import assert_valid_project_id, download_feather


 class AtlasUser:
@@ -611,6 +611,7 @@ class AtlasProjection:
         """
         if self._tile_data is not None:
             return self._tile_data
+        logger.info(f"Downloading files for projection {self.projection_id}")
         self._download_large_feather(overwrite=overwrite)
         tbs = []
         root = feather.read_table(self.tile_destination / "0/0/0.feather", memory_map=True)
@@ -634,8 +635,10 @@ class AtlasProjection:

     @overload
     def _tiles_in_order(self, *, coords_only: Literal[False] = ...) -> Iterator[Path]: ...
+
     @overload
     def _tiles_in_order(self, *, coords_only: Literal[True]) -> Iterator[Tuple[int, int, int]]: ...
+
     @overload
     def _tiles_in_order(self, *, coords_only: bool) -> Iterator[Any]: ...

@@ -693,27 +696,7 @@ class AtlasProjection:
             quad = rawquad + ".feather"
             all_quads.append(quad)
             path = self.tile_destination / quad
-
-            download_attempt = 0
-            download_success = False
-            schema = None
-            while download_attempt < 3 and not download_success:
-                download_attempt += 1
-                if not path.exists() or overwrite:
-                    data = requests.get(root + quad, headers=self.dataset.header)
-                    readable = io.BytesIO(data.content)
-                    readable.seek(0)
-                    tb = feather.read_table(readable, memory_map=True)
-                    path.parent.mkdir(parents=True, exist_ok=True)
-                    feather.write_feather(tb, path)
-                try:
-                    schema = ipc.open_file(path).schema
-                    download_success = True
-                except pa.ArrowInvalid:
-                    path.unlink(missing_ok=True)
-
-            if not download_success or schema is None:
-                raise Exception(f"Failed to download tiles. Aborting...")
+            schema = download_feather(root + quad, path, headers=self.dataset.header, overwrite=overwrite)

             if sidecars is None and b"sidecars" in schema.metadata:
                 # Grab just the filenames
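
The schema returned by download_feather matters here because the root tile advertises its sidecar files through Arrow schema metadata, the same JSON mapping the removed _read_prefetched_tiles_with_sidecars code parsed. A hypothetical read of that metadata from a downloaded tile (the path is a placeholder):

    import json

    from pyarrow import ipc

    schema = ipc.open_file("tiles/0/0/0.feather").schema
    if schema.metadata is not None and b"sidecars" in schema.metadata:
        # Metadata values are bytes; the payload maps field names to sidecar filename suffixes.
        sidecar_suffixes = list(json.loads(schema.metadata[b"sidecars"]).values())
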
@@ -1160,7 +1143,7 @@ class AtlasDataset(AtlasClass):

         build_template = {}
         if self.modality == "embedding":
-            if topic_model.community_description_target_field is None:
+            if topic_model.topic_label_field is None:
                 logger.warning(
                     "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
                 )
@@ -1188,7 +1171,7 @@ class AtlasDataset(AtlasClass):
             "topic_model_hyperparameters": json.dumps(
                 {
                     "build_topic_model": topic_model.build_topic_model,
-                    "community_description_target_field": topic_model.community_description_target_field,
+                    "community_description_target_field": topic_model.topic_label_field,  # TODO change key to topic_label_field post v0.0.85
                     "cluster_method": topic_model.cluster_method,
                     "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
                 }
@@ -1253,7 +1236,7 @@ class AtlasDataset(AtlasClass):
             "topic_model_hyperparameters": json.dumps(
                 {
                     "build_topic_model": topic_model.build_topic_model,
-                    "community_description_target_field": indexed_field,
+                    "community_description_target_field": indexed_field,  # TODO change key to topic_label_field post v0.0.85
                     "cluster_method": topic_model.build_topic_model,
                     "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
                 }
@@ -4,11 +4,12 @@ import random
 import sys
 from io import BytesIO
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Union
 from uuid import UUID

 import pyarrow as pa
 import requests
+from pyarrow import ipc

 nouns = [
     "newton",
@@ -241,10 +242,31 @@ def get_object_size_in_bytes(obj):

 # Helpful function for downloading feather files
 # Best for small feather files
-def download_feather(url: str, path: Path, headers: Optional[dict] = None):
-    data = requests.get(url, headers=headers)
-    readable = BytesIO(data.content)
-    readable.seek(0)
-    tb = pa.feather.read_table(readable, memory_map=True)  # type: ignore
-    path.parent.mkdir(parents=True, exist_ok=True)
-    pa.feather.write_feather(tb, path)  # type: ignore
+def download_feather(
+    url: Union[str, Path], path: Path, headers: Optional[dict] = None, retries=3, overwrite=False
+) -> pa.Schema:
+    """
+    Download a feather file from a URL to a local path.
+    Returns the schema of the feather file if successful.
+    """
+    assert retries > 0, "Retries must be greater than 0"
+    download_attempt = 0
+    download_success = False
+    schema = None
+    while download_attempt < retries and not download_success:
+        download_attempt += 1
+        if not path.exists() or overwrite:
+            data = requests.get(str(url), headers=headers)
+            readable = BytesIO(data.content)
+            readable.seek(0)
+            tb = pa.feather.read_table(readable, memory_map=False)  # type: ignore
+            path.parent.mkdir(parents=True, exist_ok=True)
+            pa.feather.write_feather(tb, path)  # type: ignore
+        try:
+            schema = ipc.open_file(path).schema
+            download_success = True
+        except pa.ArrowInvalid:
+            path.unlink(missing_ok=True)
+    if not download_success or schema is None:
+        raise ValueError(f"Failed to download feather file from {url} after {retries} attempts.")
+    return schema
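
A hypothetical call to the reworked helper (URL, token, and local path are placeholders): with overwrite=False an existing file is not re-fetched, but it is still validated and its schema returned.

    from pathlib import Path

    schema = download_feather(
        "https://api-atlas.nomic.ai/v1/project/<dataset-id>/index/projection/<projection-id>/quadtree/0/0/0.feather",
        Path("tiles/0/0/0.feather"),
        headers={"Authorization": "Bearer <api-token>"},
        retries=3,
        overwrite=False,
    )
    print(schema.names)  # column names of the downloaded tile
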
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nomic
-Version: 3.0.31
+Version: 3.0.32
 Summary: The official Nomic python client.
 Home-page: https://github.com/nomic-ai/nomic
 Author: nomic.ai
@@ -20,7 +20,7 @@ sagemaker

 [dev]
 nomic[all]
-black
+black==24.3.0
 coverage
 pylint
 pytest
@@ -8,7 +8,7 @@ description = "The official Nomic python client."

 setup(
     name="nomic",
-    version="3.0.31",
+    version="3.0.32",
     url="https://github.com/nomic-ai/nomic",
     description=description,
     long_description=description,
@@ -43,7 +43,7 @@ setup(
     ],
     "dev": [
         "nomic[all]",
-        "black",
+        "black==24.3.0",
         "coverage",
         "pylint",
         "pytest",
8 files without changes
File without changes