nomic 3.0.35.tar.gz → 3.0.37.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nomic might be problematic.
- {nomic-3.0.35 → nomic-3.0.37}/PKG-INFO +1 -1
- {nomic-3.0.35 → nomic-3.0.37}/nomic/atlas.py +34 -2
- {nomic-3.0.35 → nomic-3.0.37}/nomic/data_inference.py +6 -4
- {nomic-3.0.35 → nomic-3.0.37}/nomic/dataset.py +126 -6
- {nomic-3.0.35 → nomic-3.0.37}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.0.35 → nomic-3.0.37}/setup.py +1 -1
- {nomic-3.0.35 → nomic-3.0.37}/README.md +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/__init__.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/aws/__init__.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/cli.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/data_operations.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/embed.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/settings.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic/utils.py +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/pyproject.toml +0 -0
- {nomic-3.0.35 → nomic-3.0.37}/setup.cfg +0 -0
{nomic-3.0.35 → nomic-3.0.37}/nomic/atlas.py

@@ -10,6 +10,7 @@ import numpy as np
 import pyarrow as pa
 from loguru import logger
 from pandas import DataFrame
+from PIL import Image
 from pyarrow import Table
 from tqdm import tqdm
 
@@ -21,6 +22,7 @@ from .utils import arrow_iterator, b64int, get_random_name
 
 def map_data(
     data: Optional[Union[DataFrame, List[Dict], Table]] = None,
+    blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
     embeddings: Optional[np.ndarray] = None,
     identifier: Optional[str] = None,
     description: str = "",
@@ -54,8 +56,30 @@ def map_data
         raise Exception("Your embeddings cannot be empty")
 
     if indexed_field is not None:
+        if embeddings is not None:
+            logger.warning("You have specified an indexed field but are using embeddings. Embeddings will be ignored.")
         modality = "text"
 
+    if blobs is not None:
+        # change this when we support other modalities
+        modality = "image"
+        indexed_field = "_blob_hash"
+        if embedding_model is not None:
+            if isinstance(embedding_model, str):
+                model_name = embedding_model
+            elif isinstance(embedding_model, dict):
+                model_name = embedding_model["model"]
+            elif isinstance(embedding_model, NomicEmbedOptions):
+                model_name = embedding_model.model
+            else:
+                raise ValueError("embedding_model must be a string, dictionary, or NomicEmbedOptions object")
+
+            if model_name in ["nomic-embed-text-v1", "nomic-embed-text-v1.5"]:
+                raise Exception("You cannot use a text embedding model with blobs")
+        else:
+            # default to vision v1.5
+            embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")
+
     if id_field is None:
         id_field = ATLAS_DEFAULT_ID_FIELD
 
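The net effect: passing blobs switches the modality to image and rejects text embedding models. A short sketch (the image path is hypothetical; outcomes follow the branches in the hunk above):

    from nomic import atlas

    blobs = ["images/cat.jpg"]  # hypothetical path
    atlas.map_data(blobs=blobs, embedding_model="nomic-embed-text-v1.5")    # raises: text model with blobs
    atlas.map_data(blobs=blobs, embedding_model="nomic-embed-vision-v1.5")  # accepted
    atlas.map_data(blobs=blobs)                                             # defaults to nomic-embed-vision-v1.5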
@@ -73,9 +97,14 @@ def map_data(
     # no metadata was specified
     added_id_field = False
 
-    if data is None:
-        data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
+    if data is None:
         added_id_field = True
+        if embeddings is not None:
+            data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
+        elif blobs is not None:
+            data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(blobs))]
+        else:
+            raise ValueError("You must specify either data, embeddings, or blobs")
 
     if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
         if isinstance(data, list) and id_field not in data[0]:
@@ -116,6 +145,9 @@ def map_data(
                 embeddings=embeddings,
                 data=data,
             )
+        elif modality == "image":
+            dataset.add_data(blobs=blobs, data=data)
+
     except BaseException as e:
         if number_of_datums_before_upload == 0:
             logger.info(f"{dataset.identifier}: Deleting dataset due to failure in initial upload.")
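Taken together, the atlas.py changes let map_data build an image map end to end. A minimal usage sketch (paths and identifier are hypothetical):

    from nomic import atlas

    # modality is inferred as "image" from blobs, indexed_field becomes
    # _blob_hash, and upload goes through dataset.add_data(blobs=..., data=...)
    dataset = atlas.map_data(
        blobs=["images/cat.jpg", "images/dog.jpg"],  # hypothetical paths
        identifier="my-image-map",                   # hypothetical dataset name
    )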
{nomic-3.0.35 → nomic-3.0.37}/nomic/data_inference.py

@@ -1,7 +1,7 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import pyarrow as pa
-from pydantic import BaseModel, Field
+from pydantic import AliasChoices, BaseModel, Field
 
 from .settings import DEFAULT_DUPLICATE_THRESHOLD
 
@@ -29,7 +29,7 @@ def convert_pyarrow_schema_for_atlas(schema: pa.Schema) -> pa.Schema:
     for field in schema:
         if field.name.startswith("_"):
             # Underscore fields are private to Atlas and will be handled with their own logic.
-            if not field.name in {"_embeddings"}:
+            if not field.name in {"_embeddings", "_blob_hash"}:
                 raise ValueError(f"Underscore fields are reserved for Atlas internal use: {field.name}")
             whitelist[field.name] = field.type
         elif pa.types.is_boolean(field.type):
@@ -114,4 +114,6 @@ class NomicEmbedOptions(BaseModel):
         model: The Nomic Embedding Model to use.
     """
 
-    model:
+    model: Literal[
+        "nomic-embed-text-v1", "nomic-embed-vision-v1", "nomic-embed-text-v1.5", "nomic-embed-vision-v1.5"
+    ] = "nomic-embed-text-v1.5"
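To see what the Literal annotation buys, a short sketch (assumes pydantic enforces Literal values at construction time, which it does in both v1 and v2):

    from nomic.data_inference import NomicEmbedOptions

    NomicEmbedOptions(model="nomic-embed-vision-v1.5")  # accepted
    NomicEmbedOptions()                                 # defaults to "nomic-embed-text-v1.5"
    NomicEmbedOptions(model="not-a-model")              # raises a ValidationError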
{nomic-3.0.35 → nomic-3.0.37}/nomic/dataset.py

@@ -7,6 +7,7 @@ import os
 import time
 from contextlib import contextmanager
 from datetime import datetime
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -15,6 +16,7 @@ import pyarrow as pa
 import requests
 from loguru import logger
 from pandas import DataFrame
+from PIL import Image
 from pyarrow import compute as pc
 from pyarrow import feather, ipc
 from tqdm import tqdm
@@ -337,7 +339,7 @@ class AtlasClass(object):
 
         for key in data.column_names:
             if key.startswith("_"):
-                if key == "_embeddings":
+                if key == "_embeddings" or key == "_blob_hash":
                     continue
                 raise ValueError("Metadata fields cannot start with _")
         if pa.compute.max(pa.compute.utf8_length(data[project.id_field])).as_py() > 36:  # type: ignore
@@ -1080,7 +1082,7 @@ class AtlasDataset(AtlasClass):
         elif isinstance(embedding_model, NomicEmbedOptions):
             pass
         elif isinstance(embedding_model, str):
-            embedding_model = NomicEmbedOptions(model=embedding_model)
+            embedding_model = NomicEmbedOptions(model=embedding_model)  # type: ignore
         else:
             embedding_model = NomicEmbedOptions()
 
@@ -1133,7 +1135,7 @@ class AtlasDataset(AtlasClass):
             ),
         }
 
-        elif self.modality == "text":
+        elif self.modality == "text" or self.modality == "image":
             # find the index id of the index with name reuse_embeddings_from_index
             reuse_embedding_from_index_id = None
             indices = self.indices
@@ -1153,6 +1155,18 @@
             if indexed_field not in self.dataset_fields:
                 raise Exception(f"Indexing on {indexed_field} not allowed. Valid options are: {self.dataset_fields}")
 
+            if self.modality == "image":
+                if topic_model.topic_label_field is None:
+                    print(
+                        "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
+                    )
+                    topic_field = None
+                    topic_model.build_topic_model = False
+                else:
+                    topic_field = topic_model.topic_label_field
+            else:
+                topic_field = topic_model.topic_label_field
+
             build_template = {
                 "project_id": self.id,
                 "index_name": name,
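For image datasets this means topic labeling only happens when a text column is named. A sketch (assumes NomicTopicOptions is importable from nomic.data_inference alongside NomicEmbedOptions, and that a "caption" column exists; both are assumptions):

    from nomic.data_inference import NomicTopicOptions
    from nomic.dataset import AtlasDataset

    dataset = AtlasDataset("my-org/my-image-dataset")  # hypothetical identifier
    # Without topic_label_field, build_topic_model is forced off for image maps;
    # with it, topic labels are drawn from the named text column.
    dataset.create_index(
        topic_model=NomicTopicOptions(build_topic_model=True, topic_label_field="caption"),
    )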
@@ -1185,7 +1199,7 @@
                 "topic_model_hyperparameters": json.dumps(
                     {
                         "build_topic_model": topic_model.build_topic_model,
-                        "community_description_target_field":
+                        "community_description_target_field": topic_field,
                         "cluster_method": topic_model.build_topic_model,
                         "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
                     }
@@ -1231,7 +1245,7 @@
             logger.warning("Could not find a map being built for this dataset.")
         else:
             logger.info(
-                f"Created map `{atlas_projection.name}` in dataset `{self.identifier}`: {atlas_projection.
+                f"Created map `{atlas_projection.name}` in dataset `{self.identifier}`: {atlas_projection.dataset_link}"
             )
         return atlas_projection
 
@@ -1320,7 +1334,13 @@
         else:
             raise Exception(response.text)
 
-    def add_data(
+    def add_data(
+        self,
+        data=Union[DataFrame, List[Dict], pa.Table],
+        embeddings: Optional[np.ndarray] = None,
+        blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
+        pbar=None,
+    ):
         """
         Adds data of varying modality to an Atlas dataset.
         Args:
@@ -1333,9 +1353,109 @@
         elif isinstance(data, pa.Table) and "_embeddings" in data.column_names:  # type: ignore
             embeddings = np.array(data.column("_embeddings").to_pylist())  # type: ignore
             self._add_embeddings(data=data, embeddings=embeddings, pbar=pbar)
+        elif blobs is not None:
+            self._add_blobs(data=data, blobs=blobs, pbar=pbar)
         else:
             self._add_text(data=data, pbar=pbar)
 
+    def _add_blobs(
+        self, data: Union[DataFrame, List[Dict], pa.Table], blobs: List[Union[str, bytes, Image.Image]], pbar=None
+    ):
+        """
+        Add data, with associated blobs, to the dataset.
+        Uploads blobs to the server and associates them with the data.
+        """
+        if isinstance(data, DataFrame):
+            data = pa.Table.from_pandas(data)
+        elif isinstance(data, list):
+            data = pa.Table.from_pylist(data)
+        elif not isinstance(data, pa.Table):
+            raise ValueError("Data must be a pandas DataFrame, list of dictionaries, or a pyarrow Table.")
+
+        blob_upload_endpoint = "/v1/project/data/add/blobs"
+
+        # uploda batch of blobs
+        # return hash of blob
+        # add hash to data as _blob_hash
+        # set indexed_field to _blob_hash
+        # call _add_data
+
+        # Cast self id field to string for merged data lower down on function
+        data = data.set_column(  # type: ignore
+            data.schema.get_field_index(self.id_field), self.id_field, pc.cast(data[self.id_field], pa.string())  # type: ignore
+        )
+
+        ids = data[self.id_field].to_pylist()  # type: ignore
+        if not isinstance(ids[0], str):
+            ids = [str(uuid) for uuid in ids]
+
+        # TODO: add support for other modalities
+        images = []
+        for uuid, blob in tqdm(zip(ids, blobs), total=len(ids), desc="Loading images"):
+            if isinstance(blob, str) and os.path.exists(blob):
+                # Auto resize to max 512x512
+                image = Image.open(blob)
+                if image.height > 512 or image.width > 512:
+                    image = image.resize((512, 512))
+                buffered = BytesIO()
+                image.save(buffered, format="JPEG")
+                images.append((uuid, buffered.getvalue()))
+            elif isinstance(blob, bytes):
+                images.append((uuid, blob))
+            elif isinstance(blob, Image.Image):
+                if blob.height > 512 or blob.width > 512:
+                    blob = blob.resize((512, 512))
+                buffered = BytesIO()
+                blob.save(buffered, format="JPEG")
+                images.append((uuid, buffered.getvalue()))
+            else:
+                raise ValueError(f"Invalid blob type for {uuid}. Must be a path to an image, bytes, or PIL Image.")
+
+        batch_size = 40
+        num_workers = 10
+
+        def send_request(i):
+            image_batch = images[i : i + batch_size]
+            ids = [uuid for uuid, _ in image_batch]
+            blobs = [("blobs", blob) for _, blob in image_batch]
+            response = requests.post(
+                self.atlas_api_path + blob_upload_endpoint,
+                headers=self.header,
+                data={"dataset_id": self.id},
+                files=blobs,
+            )
+            if response.status_code != 200:
+                raise Exception(response.text)
+            return {uuid: blob_hash for uuid, blob_hash in zip(ids, response.json()["hashes"])}
+
+        # if this method is being called internally, we pass a global progress bar
+        if pbar is None:
+            pbar = tqdm(total=len(data), desc="Uploading blobs to Atlas")
+
+        hash_schema = pa.schema([(self.id_field, pa.string()), ("_blob_hash", pa.string())])
+        returned_ids = []
+        returned_hashes = []
+
+        succeeded = 0
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = {executor.submit(send_request, i): i for i in range(0, len(data), batch_size)}
+
+            for future in concurrent.futures.as_completed(futures):
+                response = future.result()
+                # add hash to data as _blob_hash
+                for uuid, blob_hash in response.items():
+                    returned_ids.append(uuid)
+                    returned_hashes.append(blob_hash)
+
+                # A successful upload.
+                succeeded += len(response)
+                pbar.update(len(response))
+
+        hash_tb = pa.Table.from_pydict({self.id_field: returned_ids, "_blob_hash": returned_hashes}, schema=hash_schema)
+        merged_data = data.join(right_table=hash_tb, keys=self.id_field)  # type: ignore
+
+        self._add_data(merged_data, pbar=pbar)
+
     def _add_text(self, data=Union[DataFrame, List[Dict], pa.Table], pbar=None):
         """
         Add text data to the dataset.
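A minimal sketch of the new blob path on an existing dataset (identifier, id values, and image paths are hypothetical; add_data routes to _add_blobs whenever blobs is passed):

    from nomic.dataset import AtlasDataset

    dataset = AtlasDataset("my-org/my-image-dataset", unique_id_field="id")  # hypothetical
    dataset.add_data(
        data=[{"id": "0", "caption": "a cat"}, {"id": "1", "caption": "a dog"}],
        blobs=["images/cat.jpg", "images/dog.jpg"],
    )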