nomic 3.3.3.tar.gz → 3.4.0.tar.gz
- {nomic-3.3.3 → nomic-3.4.0}/PKG-INFO +1 -1
- {nomic-3.3.3 → nomic-3.4.0}/nomic/atlas.py +5 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/dataset.py +46 -11
- {nomic-3.3.3 → nomic-3.4.0}/nomic/embed.py +22 -11
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.3.3 → nomic-3.4.0}/setup.py +1 -1
- {nomic-3.3.3 → nomic-3.4.0}/README.md +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/__init__.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/aws/__init__.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/cli.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/data_inference.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/data_operations.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/settings.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic/utils.py +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/pyproject.toml +0 -0
- {nomic-3.3.3 → nomic-3.4.0}/setup.cfg +0 -0
nomic/atlas.py

@@ -44,6 +44,7 @@ def map_data(
         description: The description of your dataset
         id_field: Specify your data unique id field. This field can be up to 36 characters in length. If not specified, one will be created for you named `id_`.
         is_public: Should the dataset be accessible outside your Nomic Atlas organization.
+        indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
         projection: Options to adjust Nomic Project - the dimensionality algorithm organizing your dataset.
         topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
         duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
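For context, a minimal sketch of how the new `indexed_field` argument is meant to be used, assuming the 3.4.0 `map_data` signature shown in this diff. The dataset name and records are made-up placeholders, and an Atlas API key is assumed to be configured (e.g. via `nomic login`).

```python
# Hypothetical usage of map_data() with the indexed_field documented above.
from nomic import atlas

records = [
    {"id_": "0", "title": "First note", "text": "Ducks migrate south in autumn."},
    {"id_": "1", "title": "Second note", "text": "Geese fly in V formations."},
]

dataset = atlas.map_data(
    data=records,
    indexed_field="text",  # embeddings (and the map layout) come from this column
    identifier="example-waterfowl-notes",
    description="Toy dataset illustrating indexed_field",
)
```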
@@ -62,6 +63,10 @@ def map_data(
     modality = "text"

     if blobs is not None:
+        if embeddings is not None:
+            raise ValueError(
+                "You cannot pass both `blobs` and `embeddings` to map_data(). To create a map of images, include `blobs` and not `embeddings`. To create a map of embeddings with images as metadata, include your images as a field in your `data` parameter."
+            )
         # change this when we support other modalities
         modality = "image"
         indexed_field = "_blob_hash"
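To illustrate the new guard, a small sketch with placeholder paths and arrays, showing the call that now fails fast and the two supported alternatives:

```python
# The combination rejected by the new check, plus the two supported patterns.
import numpy as np
from nomic import atlas

images = ["photos/cat.jpg", "photos/dog.jpg"]  # placeholder image paths
vectors = np.random.rand(2, 768)               # placeholder embeddings

try:
    atlas.map_data(blobs=images, embeddings=vectors, data=[{"id_": "0"}, {"id_": "1"}])
except ValueError as err:
    print(err)  # raised by the guard added in 3.4.0

# Supported: a map of images...
# atlas.map_data(blobs=images, data=...)
# ...or a map of embeddings with the images attached as metadata:
# atlas.map_data(embeddings=vectors, data=[{"img": p} for p in images])
```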
nomic/dataset.py

@@ -1,10 +1,13 @@
 import base64
 import concurrent
 import concurrent.futures
+import importlib.metadata
 import io
 import json
 import os
+import re
 import time
+import unicodedata
 from contextlib import contextmanager
 from datetime import datetime
 from io import BytesIO
@@ -72,13 +75,20 @@ class AtlasClass(object):
         token = self.credentials["token"]
         self.token = token

-        self.header = {"Authorization": f"Bearer {token}"}
+        try:
+            version = importlib.metadata.version("nomic")
+        except Exception:
+            version = "unknown"
+
+        self.header = {"Authorization": f"Bearer {token}", "User-Agent": f"py-nomic/{version}"}

         if self.token:
             response = requests.get(
                 self.atlas_api_path + "/v1/user",
                 headers=self.header,
             )
+            if "X-AtlasWarning" in response.headers:
+                logger.warning(response.headers["X-AtlasWarning"])
             response = validate_api_http_response(response)
             if not response.status_code == 200:
                 logger.warning(str(response))
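The header change is self-contained enough to demonstrate standalone. A sketch (the helper name is ours), assuming only that the `nomic` package is installed; the fallback covers the case where package metadata is unavailable, e.g. a source checkout:

```python
# Rebuilds the 3.4.0 header logic outside the client for illustration.
import importlib.metadata

def build_headers(token: str) -> dict:
    try:
        version = importlib.metadata.version("nomic")
    except Exception:  # PackageNotFoundError or anything else unexpected
        version = "unknown"
    return {"Authorization": f"Bearer {token}", "User-Agent": f"py-nomic/{version}"}

print(build_headers("demo-token")["User-Agent"])  # e.g. "py-nomic/3.4.0"
```

Every request now identifies the client version, which pairs with the `X-AtlasWarning` response header the client logs above.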
@@ -664,14 +674,22 @@ class AtlasProjection:
         sidecar_suffix = "feather"
         if sidecar_name != "":
             sidecar_suffix = f"{sidecar_name}.feather"
-
-
-
-        self.
-
-
-
-
+        with concurrent.futures.ThreadPoolExecutor(4) as ex:
+            futures = []
+            for key in tqdm(self._manifest["key"].to_pylist()):
+                sidecar_path = self.tile_destination / f"{key}.{sidecar_suffix}"
+                sidecar_url = (
+                    self.dataset.atlas_api_path
+                    + f"/v1/project/{self.dataset.id}/index/projection/{self.id}/quadtree/{key}.{sidecar_suffix}"
+                )
+                futures.append(
+                    ex.submit(
+                        download_feather, sidecar_url, sidecar_path, headers=self.dataset.header, overwrite=overwrite
+                    )
+                )
+                downloaded_files.append(sidecar_path)
+            for f in futures:
+                f.result()
         return downloaded_files

     @property
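The shape of that change, reduced to a generic sketch: submit every download to a small thread pool, collect the futures, then block on each so any exception is re-raised. `fetch` and the URL list are placeholders for `download_feather` and the quadtree keys in the real code.

```python
import concurrent.futures
from pathlib import Path

def fetch(url: str, dest: Path) -> Path:
    # Placeholder for download_feather(url, dest, headers=..., overwrite=...).
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_text(url)  # stand-in for writing the downloaded bytes
    return dest

urls = {f"https://example.invalid/tiles/{i}.feather": Path(f"tiles/{i}.feather") for i in range(8)}

with concurrent.futures.ThreadPoolExecutor(4) as ex:
    futures = [ex.submit(fetch, u, p) for u, p in urls.items()]
    for f in futures:
        f.result()  # re-raises the first download error, if any
```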
@@ -753,6 +771,15 @@ class AtlasDataset(AtlasClass):
         * **dataset_id** - An alternative way to load a dataset is by passing the dataset_id directly. This only works if a dataset exists.
         """
         assert identifier is not None or dataset_id is not None, "You must pass a dataset identifier"
+        # Normalize identifier.
+        if identifier is not None:
+            s = identifier.split("/", 1)
+            identifier = unicodedata.normalize("NFD", s[-1])  # normalize accents
+            identifier = identifier.lower().replace(" ", "-").replace("_", "-")
+            identifier = re.sub(r"[^a-z0-9-]", "", identifier)
+            identifier = re.sub(r"-+", "-", identifier)
+            if len(s) == 2:
+                identifier = f"{s[0]}/{identifier}"

         super().__init__()
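The normalization is pure string manipulation, so it can be lifted out and tested directly. A standalone copy (the function name is ours): accents are decomposed by NFD and then stripped by the character filter, spaces and underscores become hyphens, hyphen runs collapse, and an `org/` prefix passes through untouched.

```python
import re
import unicodedata

def normalize_identifier(identifier: str) -> str:
    s = identifier.split("/", 1)
    name = unicodedata.normalize("NFD", s[-1])  # decompose accented characters
    name = name.lower().replace(" ", "-").replace("_", "-")
    name = re.sub(r"[^a-z0-9-]", "", name)      # this also drops the combining accents
    name = re.sub(r"-+", "-", name)
    return f"{s[0]}/{name}" if len(s) == 2 else name

print(normalize_identifier("My Dataset_2024"))      # my-dataset-2024
print(normalize_identifier("Org/Café -- Données"))  # Org/cafe-donnees
```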
@@ -761,6 +788,8 @@ class AtlasDataset(AtlasClass):
             f"Passing organization_name has been removed in Nomic Python client 3.0. Instead identify your dataset with `organization_name/project_name` (e.g. sterling-cooper/november-ads)."
         )

+        # Set this before possible early return.
+        self._schema = None
         if dataset_id is not None:
             self.meta = self._get_project_by_id(dataset_id)
             return
@@ -793,7 +822,6 @@ class AtlasDataset(AtlasClass):
         )

         self.meta = self._get_project_by_id(project_id=dataset_id)
-        self._schema = None

     def delete(self):
         """
@@ -1074,6 +1102,7 @@ class AtlasDataset(AtlasClass):
         else:
             projection = NomicProjectOptions()

+        topic_model_was_false = topic_model is False
         if isinstance(topic_model, Dict):
             topic_model = NomicTopicOptions(**topic_model)
         elif isinstance(topic_model, NomicTopicOptions):
@@ -1117,7 +1146,7 @@ class AtlasDataset(AtlasClass):

         build_template = {}
         if modality == "embedding":
-            if topic_model.topic_label_field is None:
+            if (not topic_model_was_false) and topic_model.topic_label_field is None:
                 logger.warning(
                     "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
                 )
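Together with the `topic_model_was_false` flag above, this means explicitly disabling topic modeling no longer produces the misleading warning. A usage sketch; the dataset name and embeddings are placeholders, and `create_index` is assumed to be the method containing this hunk:

```python
import numpy as np
from nomic import AtlasDataset

dataset = AtlasDataset("example-embedding-map")  # hypothetical dataset
dataset.add_data(
    embeddings=np.random.rand(100, 256),
    data=[{"id_": str(i)} for i in range(100)],
)

dataset.create_index(topic_model=False)  # 3.4.0: silent, topics intentionally off
# dataset.create_index()  # would still warn unless topic_label_field is set
```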
@@ -1620,6 +1649,7 @@ class AtlasDataset(AtlasClass):
         close_pbar = True
         pbar = tqdm(total=int(len(data)) // shard_size)
         failed = 0
+        failed_reqs = 0
         succeeded = 0
         errors_504 = 0
         with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
@@ -1668,6 +1698,11 @@ class AtlasDataset(AtlasClass):
                     failed += shard_size
                     pbar.update(1)
                     response.close()
+                    failed_reqs += 1
+                    if failed_reqs > 10:
+                        raise RuntimeError(
+                            f"{self.identifier}: Too many upload requests have failed at this time. Please try again later."
+                        )
                 else:
                     # A successful upload.
                     succeeded += shard_size
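Previously a persistent outage meant every remaining shard was attempted, and failed, one by one; now the upload aborts once more than ten requests have failed. The policy in isolation, with `send_shard` as a stand-in for the real HTTP request:

```python
import random

def send_shard(shard: list) -> bool:
    # Stand-in for the real upload request; simulates a 5% failure rate.
    return random.random() > 0.05

failed_reqs = 0
for shard in ([i] * 64 for i in range(1000)):  # placeholder shards
    if not send_shard(shard):
        failed_reqs += 1
        if failed_reqs > 10:
            raise RuntimeError("Too many upload requests have failed at this time. Please try again later.")
```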
nomic/embed.py

@@ -12,6 +12,7 @@ from urllib.parse import urlparse
 import PIL
 import PIL.Image
 import requests
+from tqdm import tqdm

 from .dataset import AtlasClass
 from .settings import *
@@ -286,18 +287,28 @@ def _text_embed4all(
         limits = {"cpu": 16, "kompute": 32, "metal": 1024}
         return n_tokens > limits[backend]

-
-
-
-
-
-
-
-
-
-
+    pb = tqdm(total=len(texts), desc="Embedding texts", unit="inputs")
+    output_embeddings = []
+    ntok = 0
+    batch_size = 64
+    for start in range(0, len(texts), batch_size):
+        end = min(len(texts), start + batch_size)
+        b = end - start
+        out = _embed4all.embed(
+            texts[start:end],
+            prefix=task_type,
+            dimensionality=dimensionality,
+            long_text_mode=long_text_mode,
+            return_dict=True,
+            atlas=True,
+            cancel_cb=cancel_cb if dynamic_mode else None,
+        )
+        ntok += out["n_prompt_tokens"]
+        output_embeddings.extend(out["embeddings"])
+        pb.update(b)
+    pb.close()
     usage = {"prompt_tokens": ntok, "total_tokens": ntok}
-    return {"embeddings":
+    return {"embeddings": output_embeddings, "usage": usage, "model": model, "inference_mode": "local"}


 def free_embedding_model() -> None:
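From the caller's side nothing changes except the progress bar: local inference now embeds in batches of 64 and reports progress per batch. A usage sketch, assuming the model name and `inference_mode` values documented by Nomic:

```python
from nomic import embed

texts = [f"document number {i}" for i in range(200)]

out = embed.text(texts, model="nomic-embed-text-v1.5", inference_mode="local")
print(len(out["embeddings"]), out["usage"])  # 200 {'prompt_tokens': ..., 'total_tokens': ...}
```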