nomic 3.3.2.tar.gz → 3.3.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nomic might be problematic.
- {nomic-3.3.2 → nomic-3.3.4}/PKG-INFO +1 -1
- {nomic-3.3.2 → nomic-3.3.4}/nomic/dataset.py +59 -11
- {nomic-3.3.2 → nomic-3.3.4}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.3.2 → nomic-3.3.4}/setup.py +1 -1
- {nomic-3.3.2 → nomic-3.3.4}/README.md +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/__init__.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/atlas.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/aws/__init__.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/cli.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/data_inference.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/data_operations.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/embed.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/settings.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic/utils.py +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/pyproject.toml +0 -0
- {nomic-3.3.2 → nomic-3.3.4}/setup.cfg +0 -0
nomic/dataset.py

@@ -1,10 +1,13 @@
 import base64
 import concurrent
 import concurrent.futures
+import importlib.metadata
 import io
 import json
 import os
+import re
 import time
+import unicodedata
 from contextlib import contextmanager
 from datetime import datetime
 from io import BytesIO
@@ -12,6 +15,7 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
+import pandas as pd
 import pyarrow as pa
 import requests
 from loguru import logger
@@ -71,13 +75,20 @@ class AtlasClass(object):
         token = self.credentials["token"]
         self.token = token
 
-
+        try:
+            version = importlib.metadata.version("nomic")
+        except Exception:
+            version = "unknown"
+
+        self.header = {"Authorization": f"Bearer {token}", "User-Agent": f"py-nomic/{version}"}
 
         if self.token:
             response = requests.get(
                 self.atlas_api_path + "/v1/user",
                 headers=self.header,
             )
+            if "X-AtlasWarning" in response.headers:
+                logger.warning(response.headers["X-AtlasWarning"])
             response = validate_api_http_response(response)
             if not response.status_code == 200:
                 logger.warning(str(response))
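The gist of this hunk: the client now reports its own version in a User-Agent header and surfaces any X-AtlasWarning header returned by the API. Below is a minimal standalone sketch of the same behavior; build_headers and check_user are illustrative helpers, not functions from the nomic package, and the API base URL is whatever the client's atlas_api_path points at.

import importlib.metadata

import requests
from loguru import logger


def build_headers(token: str, package: str = "nomic") -> dict:
    """Attach a py-nomic/<version> User-Agent next to the bearer token."""
    try:
        version = importlib.metadata.version(package)
    except Exception:  # PackageNotFoundError, unusual installs, etc.
        version = "unknown"
    return {"Authorization": f"Bearer {token}", "User-Agent": f"py-nomic/{version}"}


def check_user(api_path: str, headers: dict) -> requests.Response:
    """Call /v1/user and log any server-side X-AtlasWarning header."""
    response = requests.get(api_path + "/v1/user", headers=headers)
    if "X-AtlasWarning" in response.headers:
        logger.warning(response.headers["X-AtlasWarning"])
    return response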
@@ -663,14 +674,22 @@ class AtlasProjection:
         sidecar_suffix = "feather"
         if sidecar_name != "":
             sidecar_suffix = f"{sidecar_name}.feather"
-
-
-
-        self.
-
-
-
-
+        with concurrent.futures.ThreadPoolExecutor(4) as ex:
+            futures = []
+            for key in tqdm(self._manifest["key"].to_pylist()):
+                sidecar_path = self.tile_destination / f"{key}.{sidecar_suffix}"
+                sidecar_url = (
+                    self.dataset.atlas_api_path
+                    + f"/v1/project/{self.dataset.id}/index/projection/{self.id}/quadtree/{key}.{sidecar_suffix}"
+                )
+                futures.append(
+                    ex.submit(
+                        download_feather, sidecar_url, sidecar_path, headers=self.dataset.header, overwrite=overwrite
+                    )
+                )
+                downloaded_files.append(sidecar_path)
+            for f in futures:
+                f.result()
         return downloaded_files
 
     @property
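The sidecar download here becomes a fan-out/join over a four-thread pool: one download_feather call is submitted per quadtree tile key, and f.result() is called on every future so a failed download raises in the caller. The sketch below shows the same pattern with a generic download_one helper standing in for the client's download_feather; the URLs and paths are placeholders.

import concurrent.futures
from pathlib import Path

import requests


def download_one(url: str, dest: Path, headers: dict, overwrite: bool = False) -> Path:
    """Fetch one tile file to disk, skipping files that already exist."""
    if dest.exists() and not overwrite:
        return dest
    dest.parent.mkdir(parents=True, exist_ok=True)
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    dest.write_bytes(resp.content)
    return dest


def download_all(base_url: str, keys: list, dest_dir: Path, headers: dict) -> list:
    """Download every key with a small thread pool, then join on the futures."""
    downloaded = []
    with concurrent.futures.ThreadPoolExecutor(4) as ex:
        futures = []
        for key in keys:
            path = dest_dir / f"{key}.feather"
            futures.append(ex.submit(download_one, f"{base_url}/{key}.feather", path, headers))
            downloaded.append(path)
        for f in futures:
            f.result()  # re-raises any exception from a worker thread
    return downloaded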
@@ -752,6 +771,15 @@ class AtlasDataset(AtlasClass):
             * **dataset_id** - An alternative way to load a dataset is by passing the dataset_id directly. This only works if a dataset exists.
         """
         assert identifier is not None or dataset_id is not None, "You must pass a dataset identifier"
+        # Normalize identifier.
+        if identifier is not None:
+            s = identifier.split("/", 1)
+            identifier = unicodedata.normalize("NFD", s[-1])  # normalize accents
+            identifier = identifier.lower().replace(" ", "-").replace("_", "-")
+            identifier = re.sub(r"[^a-z0-9-]", "", identifier)
+            identifier = re.sub(r"-+", "-", identifier)
+            if len(s) == 2:
+                identifier = f"{s[0]}/{identifier}"
 
         super().__init__()
 
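The normalization added to the AtlasDataset constructor reads as a small slugging step: the part of the identifier after an optional organization prefix is accent-stripped via NFD, lower-cased, spaces and underscores become hyphens, anything outside [a-z0-9-] is dropped, and runs of hyphens collapse. A standalone sketch that mirrors those steps (normalize_identifier is not a function in the package):

import re
import unicodedata


def normalize_identifier(identifier: str) -> str:
    """Slug the dataset name while leaving an optional org/ prefix untouched."""
    s = identifier.split("/", 1)
    name = unicodedata.normalize("NFD", s[-1])  # decompose accented characters
    name = name.lower().replace(" ", "-").replace("_", "-")
    name = re.sub(r"[^a-z0-9-]", "", name)  # also drops the combining marks left by NFD
    name = re.sub(r"-+", "-", name)
    return f"{s[0]}/{name}" if len(s) == 2 else name


print(normalize_identifier("My Org/Résumé Data_2024!"))  # -> My Org/resume-data-2024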
@@ -760,6 +788,8 @@ class AtlasDataset(AtlasClass):
                 f"Passing organization_name has been removed in Nomic Python client 3.0. Instead identify your dataset with `organization_name/project_name` (e.g. sterling-cooper/november-ads)."
             )
 
+        # Set this before possible early return.
+        self._schema = None
         if dataset_id is not None:
             self.meta = self._get_project_by_id(dataset_id)
             return
@@ -792,7 +822,6 @@ class AtlasDataset(AtlasClass):
             )
 
         self.meta = self._get_project_by_id(project_id=dataset_id)
-        self._schema = None
 
     def delete(self):
         """
@@ -1073,6 +1102,7 @@ class AtlasDataset(AtlasClass):
         else:
             projection = NomicProjectOptions()
 
+        topic_model_was_false = topic_model is False
         if isinstance(topic_model, Dict):
             topic_model = NomicTopicOptions(**topic_model)
         elif isinstance(topic_model, NomicTopicOptions):
@@ -1116,7 +1146,7 @@ class AtlasDataset(AtlasClass):
 
         build_template = {}
         if modality == "embedding":
-            if topic_model.topic_label_field is None:
+            if (not topic_model_was_false) and topic_model.topic_label_field is None:
                 logger.warning(
                     "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
                 )
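This warning change works together with the topic_model_was_false flag introduced a few hunks earlier (new line 1105): a caller who passes topic_model=False no longer gets warned about a missing topic_label_field. A rough standalone sketch of that flow, using a stand-in TopicOptions dataclass rather than the client's NomicTopicOptions:

from dataclasses import dataclass
from typing import Optional, Union

from loguru import logger


@dataclass
class TopicOptions:
    build_topic_model: bool = True
    topic_label_field: Optional[str] = None


def resolve_topic_options(topic_model: Union[bool, dict, TopicOptions]) -> TopicOptions:
    # Remember the caller's explicit opt-out before coercing to an options object.
    topic_model_was_false = topic_model is False
    if isinstance(topic_model, dict):
        topic_model = TopicOptions(**topic_model)
    elif not isinstance(topic_model, TopicOptions):
        topic_model = TopicOptions(build_topic_model=bool(topic_model))
    # Only nag about auto-labeled topics when topic modeling is actually wanted.
    if (not topic_model_was_false) and topic_model.topic_label_field is None:
        logger.warning("No topic_label_field set; the dataset will not contain auto-labeled topics.")
    return topic_model


resolve_topic_options(False)                          # silent
resolve_topic_options({"topic_label_field": "text"})  # silent
resolve_topic_options(True)                           # warns about the missing label field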
@@ -1373,6 +1403,18 @@ class AtlasDataset(AtlasClass):
             blobs: A list of image paths, bytes, or PIL Images. Use if you want to create an AtlasDataset using image embeddings over your images. Note: Blobs are stored locally only.
             pbar: (Optional). A tqdm progress bar to update.
         """
+        if isinstance(data, DataFrame):
+            cols_before = set(data.columns)
+            for col in cols_before:
+                if col.startswith("_"):
+                    raise ValueError(
+                        f"You are attempting to upload a pandas dataframe with the column name {col}, but columns beginning with '_' are reserved for Atlas internal use. Please rename your column and try again."
+                    )
+            data = pa.Table.from_pandas(data)
+            for newcol in set(data.column_names).difference(cols_before):
+                logger.warning(f"Dropping column {newcol} added in pandas conversion to pyarrow")
+                data = data.drop([newcol])
+
         if embeddings is not None:
             self._add_embeddings(data=data, embeddings=embeddings, pbar=pbar)
         elif isinstance(data, pa.Table) and "_embeddings" in data.column_names:  # type: ignore
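The new DataFrame handling added here rejects reserved underscore-prefixed columns, converts the frame to a pyarrow Table, and drops any extra columns the conversion introduces (pandas stores a non-default index as a column such as __index_level_0__). A self-contained sketch of the same checks, written as a plain function rather than the client method:

import pandas as pd
import pyarrow as pa
from loguru import logger


def dataframe_to_arrow(df: pd.DataFrame) -> pa.Table:
    """Validate column names, convert to Arrow, and drop index-derived columns."""
    cols_before = set(df.columns)
    for col in cols_before:
        if col.startswith("_"):
            raise ValueError(f"Column {col!r} is reserved for Atlas internal use; please rename it.")
    table = pa.Table.from_pandas(df)
    for newcol in set(table.column_names).difference(cols_before):
        logger.warning(f"Dropping column {newcol} added in pandas conversion to pyarrow")
        table = table.drop([newcol])
    return table


table = dataframe_to_arrow(pd.DataFrame({"text": ["a", "b"]}, index=[10, 20]))
print(table.column_names)  # ['text'] - the index-derived column is dropped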
@@ -1607,6 +1649,7 @@ class AtlasDataset(AtlasClass):
             close_pbar = True
             pbar = tqdm(total=int(len(data)) // shard_size)
         failed = 0
+        failed_reqs = 0
         succeeded = 0
         errors_504 = 0
         with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
@@ -1655,6 +1698,11 @@ class AtlasDataset(AtlasClass):
                     failed += shard_size
                     pbar.update(1)
                     response.close()
+                    failed_reqs += 1
+                    if failed_reqs > 10:
+                        raise RuntimeError(
+                            f"{self.identifier}: Too many upload requests have failed at this time. Please try again later."
+                        )
                 else:
                     # A successful upload.
                     succeeded += shard_size
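The upload loop now counts failed requests separately from failed rows and aborts once more than 10 requests have failed, rather than continuing to push every remaining shard. A simplified sketch of that fail-fast accounting; upload_all, upload_shard, and the shard list are illustrative placeholders, not the client's sharded upload code:

import concurrent.futures


def upload_all(shards, upload_shard, identifier="my-org/my-dataset", max_failures=10):
    """Upload shards in parallel, giving up after too many failed requests."""
    failed = succeeded = failed_reqs = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        for ok, size in executor.map(upload_shard, shards):
            if not ok:
                failed += size
                failed_reqs += 1
                if failed_reqs > max_failures:
                    raise RuntimeError(
                        f"{identifier}: Too many upload requests have failed at this time. Please try again later."
                    )
            else:
                succeeded += size
    return succeeded, failed


shards = [list(range(100))] * 3
print(upload_all(shards, lambda shard: (True, len(shard))))  # (300, 0)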