nomic 3.0.34.tar.gz → 3.0.36.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {nomic-3.0.34 → nomic-3.0.36}/PKG-INFO +1 -1
- {nomic-3.0.34 → nomic-3.0.36}/nomic/atlas.py +34 -2
- {nomic-3.0.34 → nomic-3.0.36}/nomic/data_inference.py +6 -4
- {nomic-3.0.34 → nomic-3.0.36}/nomic/dataset.py +133 -6
- {nomic-3.0.34 → nomic-3.0.36}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.0.34 → nomic-3.0.36}/setup.py +1 -1
- {nomic-3.0.34 → nomic-3.0.36}/README.md +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/__init__.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/aws/__init__.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/cli.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/data_operations.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/embed.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/settings.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic/utils.py +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/pyproject.toml +0 -0
- {nomic-3.0.34 → nomic-3.0.36}/setup.cfg +0 -0
{nomic-3.0.34 → nomic-3.0.36}/nomic/atlas.py +34 -2

@@ -10,6 +10,7 @@ import numpy as np
 import pyarrow as pa
 from loguru import logger
 from pandas import DataFrame
+from PIL import Image
 from pyarrow import Table
 from tqdm import tqdm
 
@@ -21,6 +22,7 @@ from .utils import arrow_iterator, b64int, get_random_name
 
 def map_data(
     data: Optional[Union[DataFrame, List[Dict], Table]] = None,
+    blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
     embeddings: Optional[np.ndarray] = None,
     identifier: Optional[str] = None,
     description: str = "",
@@ -54,8 +56,30 @@ def map_data(
         raise Exception("Your embeddings cannot be empty")
 
     if indexed_field is not None:
+        if embeddings is not None:
+            logger.warning("You have specified an indexed field but are using embeddings. Embeddings will be ignored.")
         modality = "text"
 
+    if blobs is not None:
+        # change this when we support other modalities
+        modality = "image"
+        indexed_field = "_blob_hash"
+        if embedding_model is not None:
+            if isinstance(embedding_model, str):
+                model_name = embedding_model
+            elif isinstance(embedding_model, dict):
+                model_name = embedding_model["model"]
+            elif isinstance(embedding_model, NomicEmbedOptions):
+                model_name = embedding_model.model
+            else:
+                raise ValueError("embedding_model must be a string, dictionary, or NomicEmbedOptions object")
+
+            if model_name in ["nomic-embed-text-v1", "nomic-embed-text-v1.5"]:
+                raise Exception("You cannot use a text embedding model with blobs")
+        else:
+            # default to vision v1.5
+            embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")
+
     if id_field is None:
         id_field = ATLAS_DEFAULT_ID_FIELD
 
@@ -73,9 +97,14 @@ def map_data(
     # no metadata was specified
     added_id_field = False
 
-    if data is None
-        data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
+    if data is None:
         added_id_field = True
+        if embeddings is not None:
+            data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
+        elif blobs is not None:
+            data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(blobs))]
+        else:
+            raise ValueError("You must specify either data, embeddings, or blobs")
 
     if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
         if isinstance(data, list) and id_field not in data[0]:
@@ -116,6 +145,9 @@ def map_data(
             embeddings=embeddings,
             data=data,
         )
+        elif modality == "image":
+            dataset.add_data(blobs=blobs, data=data)
+
     except BaseException as e:
         if number_of_datums_before_upload == 0:
             logger.info(f"{dataset.identifier}: Deleting dataset due to failure in initial upload.")
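Read together, these hunks give `map_data` an image path: passing `blobs` switches the modality to "image", forces `indexed_field` to the reserved `_blob_hash` column, rejects the text embedding models, and defaults to `nomic-embed-vision-v1.5` when no embedder is specified. A minimal sketch of the new call, assuming images on local disk (paths and identifier are illustrative, not taken from this diff):

    from nomic import atlas

    # blobs may be file paths, raw bytes, or PIL.Image objects, per the new annotation
    dataset = atlas.map_data(
        blobs=["images/cat.jpg", "images/dog.jpg"],  # hypothetical local files
        identifier="my-image-map",
    )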
{nomic-3.0.34 → nomic-3.0.36}/nomic/data_inference.py +6 -4

@@ -1,7 +1,7 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import pyarrow as pa
-from pydantic import BaseModel, Field
+from pydantic import AliasChoices, BaseModel, Field
 
 from .settings import DEFAULT_DUPLICATE_THRESHOLD
 
@@ -29,7 +29,7 @@ def convert_pyarrow_schema_for_atlas(schema: pa.Schema) -> pa.Schema:
     for field in schema:
         if field.name.startswith("_"):
             # Underscore fields are private to Atlas and will be handled with their own logic.
-            if not field.name in {"_embeddings"}:
+            if not field.name in {"_embeddings", "_blob_hash"}:
                 raise ValueError(f"Underscore fields are reserved for Atlas internal use: {field.name}")
             whitelist[field.name] = field.type
         elif pa.types.is_boolean(field.type):
@@ -114,4 +114,6 @@ class NomicEmbedOptions(BaseModel):
         model: The Nomic Embedding Model to use.
     """
 
-    model:
+    model: Literal[
+        "nomic-embed-text-v1", "nomic-embed-vision-v1", "nomic-embed-text-v1.5", "nomic-embed-vision-v1.5"
+    ] = "nomic-embed-text-v1.5"
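Since `model` is now a `Literal`, pydantic rejects unknown model names when the options object is constructed rather than letting them flow to the server. A small sketch of the validation this implies:

    from pydantic import ValidationError

    from nomic.data_inference import NomicEmbedOptions

    options = NomicEmbedOptions(model="nomic-embed-vision-v1.5")  # one of the four allowed names

    try:
        NomicEmbedOptions(model="not-a-real-model")  # fails the Literal constraint
    except ValidationError as err:
        print(err)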
{nomic-3.0.34 → nomic-3.0.36}/nomic/dataset.py +133 -6

@@ -7,6 +7,7 @@ import os
 import time
 from contextlib import contextmanager
 from datetime import datetime
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -15,6 +16,7 @@ import pyarrow as pa
 import requests
 from loguru import logger
 from pandas import DataFrame
+from PIL import Image
 from pyarrow import compute as pc
 from pyarrow import feather, ipc
 from tqdm import tqdm
@@ -337,7 +339,7 @@ class AtlasClass(object):
 
         for key in data.column_names:
             if key.startswith("_"):
-                if key == "_embeddings":
+                if key == "_embeddings" or key == "_blob_hash":
                     continue
                 raise ValueError("Metadata fields cannot start with _")
         if pa.compute.max(pa.compute.utf8_length(data[project.id_field])).as_py() > 36:  # type: ignore
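This check now agrees with the schema whitelist in data_inference.py: `_embeddings` and `_blob_hash` are the only underscore-prefixed columns a user table may carry. A sketch against the converter shown earlier (field names illustrative):

    import pyarrow as pa

    from nomic.data_inference import convert_pyarrow_schema_for_atlas

    schema = pa.schema([("id", pa.string()), ("_blob_hash", pa.string())])
    converted = convert_pyarrow_schema_for_atlas(schema)  # passes in 3.0.36; raised ValueError in 3.0.34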
@@ -437,6 +439,13 @@ class AtlasProjection:
         return f"{self.dataset.web_path}/data/{self.dataset.meta['organization_slug']}/{self.dataset.meta['slug']}/map"
         # return f"{self.project.web_path}/data/{self.project.meta['organization_slug']}/{self.project.meta['slug']}/map"
 
+    @property
+    def dataset_link(self):
+        """
+        Retrieves a dataset link.
+        """
+        return f"{self.dataset.web_path}/data/{self.dataset.meta['organization_slug']}/{self.dataset.meta['slug']}"
+
     @property
     def _status(self):
         response = requests.get(
@@ -450,7 +459,7 @@ class AtlasProjection:
         return content
 
     def __str__(self):
-        return f"{self.name}: {self.
+        return f"{self.name}: {self.dataset_link}"
 
     def __repr__(self):
         return self.__str__()
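The new `dataset_link` property mirrors the map link above without the trailing `/map`, and `__str__` now renders it in place of the previously truncated f-string. Usage is plain attribute access (slugs illustrative):

    # projection is an AtlasProjection
    print(projection.dataset_link)  # e.g. https://atlas.nomic.ai/data/my-org/my-dataset
    print(projection)               # "<name>: <dataset_link>" via the new __str__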
@@ -1073,7 +1082,7 @@ class AtlasDataset(AtlasClass):
         elif isinstance(embedding_model, NomicEmbedOptions):
             pass
         elif isinstance(embedding_model, str):
-            embedding_model = NomicEmbedOptions(model=embedding_model)
+            embedding_model = NomicEmbedOptions(model=embedding_model)  # type: ignore
         else:
             embedding_model = NomicEmbedOptions()
 
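The added `# type: ignore` is a consequence of the `Literal` type on `NomicEmbedOptions.model`: a plain `str` argument can no longer be statically proven to be one of the allowed names, though pydantic still validates it at runtime. By this branch's logic, the surrounding method (`create_index`, judging from the `reuse_embeddings_from_index` context below) accepts either form (sketch; `dataset` is an existing AtlasDataset):

    dataset.create_index(embedding_model="nomic-embed-text-v1.5")  # str, coerced to NomicEmbedOptions
    dataset.create_index(embedding_model=NomicEmbedOptions(model="nomic-embed-text-v1.5"))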
@@ -1126,7 +1135,7 @@ class AtlasDataset(AtlasClass):
             ),
         }
 
-        elif self.modality == "text":
+        elif self.modality == "text" or self.modality == "image":
             # find the index id of the index with name reuse_embeddings_from_index
             reuse_embedding_from_index_id = None
             indices = self.indices
@@ -1146,6 +1155,18 @@ class AtlasDataset(AtlasClass):
             if indexed_field not in self.dataset_fields:
                 raise Exception(f"Indexing on {indexed_field} not allowed. Valid options are: {self.dataset_fields}")
 
+            if self.modality == "image":
+                if topic_model.topic_label_field is None:
+                    print(
+                        "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
+                    )
+                    topic_field = None
+                    topic_model.build_topic_model = False
+                else:
+                    topic_field = topic_model.topic_label_field
+            else:
+                topic_field = topic_model.topic_label_field
+
         build_template = {
             "project_id": self.id,
             "index_name": name,
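Image indices reuse the text indexing path but need a text column to auto-label topic clusters; when `topic_label_field` is unset, topic modeling is disabled with a warning instead of failing. A sketch of opting in, assuming the topic options class this package uses for `topic_model` is `NomicTopicOptions`, with `caption` as a hypothetical metadata column:

    from nomic.data_inference import NomicTopicOptions

    dataset.create_index(
        name="image-index",
        topic_model=NomicTopicOptions(topic_label_field="caption"),
    )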
@@ -1178,7 +1199,7 @@ class AtlasDataset(AtlasClass):
                 "topic_model_hyperparameters": json.dumps(
                     {
                         "build_topic_model": topic_model.build_topic_model,
-                        "community_description_target_field":
+                        "community_description_target_field": topic_field,
                         "cluster_method": topic_model.build_topic_model,
                         "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
                     }
@@ -1313,7 +1334,13 @@ class AtlasDataset(AtlasClass):
         else:
             raise Exception(response.text)
 
-    def add_data(
+    def add_data(
+        self,
+        data=Union[DataFrame, List[Dict], pa.Table],
+        embeddings: Optional[np.ndarray] = None,
+        blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
+        pbar=None,
+    ):
         """
         Adds data of varying modality to an Atlas dataset.
         Args:
@@ -1326,9 +1353,109 @@ class AtlasDataset(AtlasClass):
         elif isinstance(data, pa.Table) and "_embeddings" in data.column_names:  # type: ignore
             embeddings = np.array(data.column("_embeddings").to_pylist())  # type: ignore
             self._add_embeddings(data=data, embeddings=embeddings, pbar=pbar)
+        elif blobs is not None:
+            self._add_blobs(data=data, blobs=blobs, pbar=pbar)
         else:
             self._add_text(data=data, pbar=pbar)
 
+    def _add_blobs(
+        self, data: Union[DataFrame, List[Dict], pa.Table], blobs: List[Union[str, bytes, Image.Image]], pbar=None
+    ):
+        """
+        Add data, with associated blobs, to the dataset.
+        Uploads blobs to the server and associates them with the data.
+        """
+        if isinstance(data, DataFrame):
+            data = pa.Table.from_pandas(data)
+        elif isinstance(data, list):
+            data = pa.Table.from_pylist(data)
+        elif not isinstance(data, pa.Table):
+            raise ValueError("Data must be a pandas DataFrame, list of dictionaries, or a pyarrow Table.")
+
+        blob_upload_endpoint = "/v1/project/data/add/blobs"
+
+        # upload batch of blobs
+        # return hash of blob
+        # add hash to data as _blob_hash
+        # set indexed_field to _blob_hash
+        # call _add_data
+
+        # Cast self id field to string for merged data lower down on function
+        data = data.set_column(  # type: ignore
+            data.schema.get_field_index(self.id_field), self.id_field, pc.cast(data[self.id_field], pa.string())  # type: ignore
+        )
+
+        ids = data[self.id_field].to_pylist()  # type: ignore
+        if not isinstance(ids[0], str):
+            ids = [str(uuid) for uuid in ids]
+
+        # TODO: add support for other modalities
+        images = []
+        for uuid, blob in tqdm(zip(ids, blobs), total=len(ids), desc="Loading images"):
+            if isinstance(blob, str) and os.path.exists(blob):
+                # Auto resize to max 512x512
+                image = Image.open(blob)
+                if image.height > 512 or image.width > 512:
+                    image = image.resize((512, 512))
+                buffered = BytesIO()
+                image.save(buffered, format="JPEG")
+                images.append((uuid, buffered.getvalue()))
+            elif isinstance(blob, bytes):
+                images.append((uuid, blob))
+            elif isinstance(blob, Image.Image):
+                if blob.height > 512 or blob.width > 512:
+                    blob = blob.resize((512, 512))
+                buffered = BytesIO()
+                blob.save(buffered, format="JPEG")
+                images.append((uuid, buffered.getvalue()))
+            else:
+                raise ValueError(f"Invalid blob type for {uuid}. Must be a path to an image, bytes, or PIL Image.")
+
+        batch_size = 40
+        num_workers = 10
+
+        def send_request(i):
+            image_batch = images[i : i + batch_size]
+            ids = [uuid for uuid, _ in image_batch]
+            blobs = [("blobs", blob) for _, blob in image_batch]
+            response = requests.post(
+                self.atlas_api_path + blob_upload_endpoint,
+                headers=self.header,
+                data={"dataset_id": self.id},
+                files=blobs,
+            )
+            if response.status_code != 200:
+                raise Exception(response.text)
+            return {uuid: blob_hash for uuid, blob_hash in zip(ids, response.json()["hashes"])}
+
+        # if this method is being called internally, we pass a global progress bar
+        if pbar is None:
+            pbar = tqdm(total=len(data), desc="Uploading blobs to Atlas")
+
+        hash_schema = pa.schema([(self.id_field, pa.string()), ("_blob_hash", pa.string())])
+        returned_ids = []
+        returned_hashes = []
+
+        succeeded = 0
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = {executor.submit(send_request, i): i for i in range(0, len(data), batch_size)}
+
+            for future in concurrent.futures.as_completed(futures):
+                response = future.result()
+                # add hash to data as _blob_hash
+                for uuid, blob_hash in response.items():
+                    returned_ids.append(uuid)
+                    returned_hashes.append(blob_hash)
+
+                # A successful upload.
+                succeeded += len(response)
+                pbar.update(len(response))
+
+        hash_tb = pa.Table.from_pydict({self.id_field: returned_ids, "_blob_hash": returned_hashes}, schema=hash_schema)
+        merged_data = data.join(right_table=hash_tb, keys=self.id_field)  # type: ignore
+
+        self._add_data(merged_data, pbar=pbar)
+
     def _add_text(self, data=Union[DataFrame, List[Dict], pa.Table], pbar=None):
         """
         Add text data to the dataset.
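End to end, `_add_blobs` normalizes metadata to a pyarrow Table, JPEG-encodes each image (resizing anything larger than 512x512), uploads batches of 40 blobs across 10 threads to `/v1/project/data/add/blobs`, joins the returned hashes back onto the metadata as `_blob_hash` keyed on the id field, and forwards the merged table to `_add_data`. Callers go through `add_data` (sketch; identifier, paths, and metadata are illustrative):

    from nomic import AtlasDataset

    dataset = AtlasDataset("my-image-dataset", unique_id_field="id")
    dataset.add_data(
        data=[{"id": "0", "caption": "a cat"}, {"id": "1", "caption": "a dog"}],
        blobs=["images/cat.jpg", "images/dog.jpg"],
    )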