nomic 3.0.35.tar.gz → 3.0.37.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
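The headline change in 3.0.37 is image support: map_data() and AtlasDataset.add_data() gain a blobs parameter, uploaded blobs are hashed server-side and joined back into the metadata as a _blob_hash column, and NomicEmbedOptions.model is now validated against the nomic-embed text and vision model names.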


@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nomic
-Version: 3.0.35
+Version: 3.0.37
 Summary: The official Nomic python client.
 Home-page: https://github.com/nomic-ai/nomic
 Author: nomic.ai
@@ -10,6 +10,7 @@ import numpy as np
 import pyarrow as pa
 from loguru import logger
 from pandas import DataFrame
+from PIL import Image
 from pyarrow import Table
 from tqdm import tqdm
 
@@ -21,6 +22,7 @@ from .utils import arrow_iterator, b64int, get_random_name
 
 def map_data(
     data: Optional[Union[DataFrame, List[Dict], Table]] = None,
+    blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
     embeddings: Optional[np.ndarray] = None,
     identifier: Optional[str] = None,
     description: str = "",
@@ -54,8 +56,30 @@ def map_data(
         raise Exception("Your embeddings cannot be empty")
 
     if indexed_field is not None:
+        if embeddings is not None:
+            logger.warning("You have specified an indexed field but are using embeddings. Embeddings will be ignored.")
         modality = "text"
 
+    if blobs is not None:
+        # change this when we support other modalities
+        modality = "image"
+        indexed_field = "_blob_hash"
+        if embedding_model is not None:
+            if isinstance(embedding_model, str):
+                model_name = embedding_model
+            elif isinstance(embedding_model, dict):
+                model_name = embedding_model["model"]
+            elif isinstance(embedding_model, NomicEmbedOptions):
+                model_name = embedding_model.model
+            else:
+                raise ValueError("embedding_model must be a string, dictionary, or NomicEmbedOptions object")
+
+            if model_name in ["nomic-embed-text-v1", "nomic-embed-text-v1.5"]:
+                raise Exception("You cannot use a text embedding model with blobs")
+        else:
+            # default to vision v1.5
+            embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")
+
     if id_field is None:
         id_field = ATLAS_DEFAULT_ID_FIELD
 
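The hunk above normalizes embedding_model across its three accepted forms and rejects text models when blobs are present. A minimal sketch of the equivalent call spellings (the paths list is illustrative, and the NomicEmbedOptions import path is assumed):

    from nomic import atlas
    from nomic.data_inference import NomicEmbedOptions  # import path assumed

    paths = ["images/0.jpg", "images/1.jpg"]
    # these three calls select the same vision model
    atlas.map_data(blobs=paths, embedding_model="nomic-embed-vision-v1.5")
    atlas.map_data(blobs=paths, embedding_model={"model": "nomic-embed-vision-v1.5"})
    atlas.map_data(blobs=paths, embedding_model=NomicEmbedOptions(model="nomic-embed-vision-v1.5"))
    # a text model now fails fast
    atlas.map_data(blobs=paths, embedding_model="nomic-embed-text-v1.5")  # raises Exception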
@@ -73,9 +97,14 @@ def map_data(
     # no metadata was specified
     added_id_field = False
 
-    if data is None and embeddings is not None:
-        data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
+    if data is None:
         added_id_field = True
+        if embeddings is not None:
+            data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
+        elif blobs is not None:
+            data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(blobs))]
+        else:
+            raise ValueError("You must specify either data, embeddings, or blobs")
 
     if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
         if isinstance(data, list) and id_field not in data[0]:
@@ -116,6 +145,9 @@ def map_data(
                 embeddings=embeddings,
                 data=data,
             )
+        elif modality == "image":
+            dataset.add_data(blobs=blobs, data=data)
+
     except BaseException as e:
         if number_of_datums_before_upload == 0:
             logger.info(f"{dataset.identifier}: Deleting dataset due to failure in initial upload.")
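Taken together, the map_data changes let a caller build an image map directly from local files, raw bytes, or PIL images, with ids generated automatically when no metadata is passed. A usage sketch (the identifier and file names are made up):

    from nomic import atlas

    dataset = atlas.map_data(
        blobs=["images/cat.jpg", "images/dog.jpg"],
        data=[{"caption": "a cat"}, {"caption": "a dog"}],  # optional; omit to auto-generate ids
        identifier="my-org/pets",
    )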
@@ -1,7 +1,7 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import pyarrow as pa
-from pydantic import BaseModel, Field
+from pydantic import AliasChoices, BaseModel, Field
 
 from .settings import DEFAULT_DUPLICATE_THRESHOLD
 
@@ -29,7 +29,7 @@ def convert_pyarrow_schema_for_atlas(schema: pa.Schema) -> pa.Schema:
     for field in schema:
         if field.name.startswith("_"):
             # Underscore fields are private to Atlas and will be handled with their own logic.
-            if not field.name in {"_embeddings"}:
+            if not field.name in {"_embeddings", "_blob_hash"}:
                 raise ValueError(f"Underscore fields are reserved for Atlas internal use: {field.name}")
             whitelist[field.name] = field.type
         elif pa.types.is_boolean(field.type):
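The effect of whitelisting _blob_hash, as a sketch (assuming convert_pyarrow_schema_for_atlas is imported from this module):

    import pyarrow as pa

    schema = pa.schema([("id", pa.string()), ("_blob_hash", pa.string())])
    convert_pyarrow_schema_for_atlas(schema)  # raised ValueError in 3.0.35; now accepted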
@@ -114,4 +114,6 @@ class NomicEmbedOptions(BaseModel):
         model: The Nomic Embedding Model to use.
     """
 
-    model: str = "NomicEmbed"
+    model: Literal[
+        "nomic-embed-text-v1", "nomic-embed-vision-v1", "nomic-embed-text-v1.5", "nomic-embed-vision-v1.5"
+    ] = "nomic-embed-text-v1.5"
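Because model is now a Literal, pydantic rejects unknown model names at construction time, including the old default string. A sketch (the import path is an assumption from the package layout):

    from nomic.data_inference import NomicEmbedOptions  # import path assumed

    NomicEmbedOptions()                                 # defaults to "nomic-embed-text-v1.5"
    NomicEmbedOptions(model="nomic-embed-vision-v1.5")  # OK
    NomicEmbedOptions(model="NomicEmbed")               # the 3.0.35 default now raises a ValidationError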
@@ -7,6 +7,7 @@ import os
 import time
 from contextlib import contextmanager
 from datetime import datetime
+from io import BytesIO
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -15,6 +16,7 @@ import pyarrow as pa
 import requests
 from loguru import logger
 from pandas import DataFrame
+from PIL import Image
 from pyarrow import compute as pc
 from pyarrow import feather, ipc
 from tqdm import tqdm
@@ -337,7 +339,7 @@ class AtlasClass(object):
 
         for key in data.column_names:
             if key.startswith("_"):
-                if key == "_embeddings":
+                if key == "_embeddings" or key == "_blob_hash":
                     continue
                 raise ValueError("Metadata fields cannot start with _")
         if pa.compute.max(pa.compute.utf8_length(data[project.id_field])).as_py() > 36:  # type: ignore
@@ -1080,7 +1082,7 @@ class AtlasDataset(AtlasClass):
         elif isinstance(embedding_model, NomicEmbedOptions):
             pass
         elif isinstance(embedding_model, str):
-            embedding_model = NomicEmbedOptions(model=embedding_model)
+            embedding_model = NomicEmbedOptions(model=embedding_model)  # type: ignore
         else:
             embedding_model = NomicEmbedOptions()
 
@@ -1133,7 +1135,7 @@
             ),
         }
 
-        elif self.modality == "text":
+        elif self.modality == "text" or self.modality == "image":
             # find the index id of the index with name reuse_embeddings_from_index
             reuse_embedding_from_index_id = None
             indices = self.indices
@@ -1153,6 +1155,18 @@
             if indexed_field not in self.dataset_fields:
                 raise Exception(f"Indexing on {indexed_field} not allowed. Valid options are: {self.dataset_fields}")
 
+            if self.modality == "image":
+                if topic_model.topic_label_field is None:
+                    print(
+                        "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
+                    )
+                    topic_field = None
+                    topic_model.build_topic_model = False
+                else:
+                    topic_field = topic_model.topic_label_field
+            else:
+                topic_field = topic_model.topic_label_field
+
             build_template = {
                 "project_id": self.id,
                 "index_name": name,
@@ -1185,7 +1199,7 @@
                 "topic_model_hyperparameters": json.dumps(
                     {
                         "build_topic_model": topic_model.build_topic_model,
-                        "community_description_target_field": indexed_field,  # TODO change key to topic_label_field post v0.0.85
+                        "community_description_target_field": topic_field,
                         "cluster_method": topic_model.build_topic_model,
                         "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
                     }
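For image maps, auto-labeled topics therefore require pointing the topic model at a text column. A sketch, assuming map_data forwards a topic_model option to index creation as elsewhere in the client (the NomicTopicOptions class name and import path are assumptions; topic_label_field is taken from the diff):

    from nomic import atlas
    from nomic.data_inference import NomicTopicOptions  # import path assumed

    atlas.map_data(
        blobs=paths,
        data=[{"caption": c} for c in captions],
        topic_model=NomicTopicOptions(topic_label_field="caption"),
    )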
@@ -1231,7 +1245,7 @@
             logger.warning("Could not find a map being built for this dataset.")
         else:
             logger.info(
-                f"Created map `{atlas_projection.name}` in dataset `{self.identifier}`: {atlas_projection.map_link}"
+                f"Created map `{atlas_projection.name}` in dataset `{self.identifier}`: {atlas_projection.dataset_link}"
             )
         return atlas_projection
 
@@ -1320,7 +1334,13 @@
         else:
             raise Exception(response.text)
 
-    def add_data(self, data=Union[DataFrame, List[Dict], pa.Table], embeddings: Optional[np.ndarray] = None, pbar=None):
+    def add_data(
+        self,
+        data=Union[DataFrame, List[Dict], pa.Table],
+        embeddings: Optional[np.ndarray] = None,
+        blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
+        pbar=None,
+    ):
         """
         Adds data of varying modality to an Atlas dataset.
         Args:
@@ -1333,9 +1353,109 @@
         elif isinstance(data, pa.Table) and "_embeddings" in data.column_names:  # type: ignore
             embeddings = np.array(data.column("_embeddings").to_pylist())  # type: ignore
             self._add_embeddings(data=data, embeddings=embeddings, pbar=pbar)
+        elif blobs is not None:
+            self._add_blobs(data=data, blobs=blobs, pbar=pbar)
         else:
             self._add_text(data=data, pbar=pbar)
 
+    def _add_blobs(
+        self, data: Union[DataFrame, List[Dict], pa.Table], blobs: List[Union[str, bytes, Image.Image]], pbar=None
+    ):
+        """
+        Add data, with associated blobs, to the dataset.
+        Uploads blobs to the server and associates them with the data.
+        """
+        if isinstance(data, DataFrame):
+            data = pa.Table.from_pandas(data)
+        elif isinstance(data, list):
+            data = pa.Table.from_pylist(data)
+        elif not isinstance(data, pa.Table):
+            raise ValueError("Data must be a pandas DataFrame, list of dictionaries, or a pyarrow Table.")
+
+        blob_upload_endpoint = "/v1/project/data/add/blobs"
+
+        # upload batch of blobs
+        # return hash of blob
+        # add hash to data as _blob_hash
+        # set indexed_field to _blob_hash
+        # call _add_data
+
+        # Cast self id field to string for merged data lower down on function
+        data = data.set_column(  # type: ignore
+            data.schema.get_field_index(self.id_field), self.id_field, pc.cast(data[self.id_field], pa.string())  # type: ignore
+        )
+
+        ids = data[self.id_field].to_pylist()  # type: ignore
+        if not isinstance(ids[0], str):
+            ids = [str(uuid) for uuid in ids]
+
+        # TODO: add support for other modalities
+        images = []
+        for uuid, blob in tqdm(zip(ids, blobs), total=len(ids), desc="Loading images"):
+            if isinstance(blob, str) and os.path.exists(blob):
+                # Auto resize to max 512x512
+                image = Image.open(blob)
+                if image.height > 512 or image.width > 512:
+                    image = image.resize((512, 512))
+                buffered = BytesIO()
+                image.save(buffered, format="JPEG")
+                images.append((uuid, buffered.getvalue()))
+            elif isinstance(blob, bytes):
+                images.append((uuid, blob))
+            elif isinstance(blob, Image.Image):
+                if blob.height > 512 or blob.width > 512:
+                    blob = blob.resize((512, 512))
+                buffered = BytesIO()
+                blob.save(buffered, format="JPEG")
+                images.append((uuid, buffered.getvalue()))
+            else:
+                raise ValueError(f"Invalid blob type for {uuid}. Must be a path to an image, bytes, or PIL Image.")
+
+        batch_size = 40
+        num_workers = 10
+
+        def send_request(i):
+            image_batch = images[i : i + batch_size]
+            ids = [uuid for uuid, _ in image_batch]
+            blobs = [("blobs", blob) for _, blob in image_batch]
+            response = requests.post(
+                self.atlas_api_path + blob_upload_endpoint,
+                headers=self.header,
+                data={"dataset_id": self.id},
+                files=blobs,
+            )
+            if response.status_code != 200:
+                raise Exception(response.text)
+            return {uuid: blob_hash for uuid, blob_hash in zip(ids, response.json()["hashes"])}
+
+        # if this method is being called internally, we pass a global progress bar
+        if pbar is None:
+            pbar = tqdm(total=len(data), desc="Uploading blobs to Atlas")
+
+        hash_schema = pa.schema([(self.id_field, pa.string()), ("_blob_hash", pa.string())])
+        returned_ids = []
+        returned_hashes = []
+
+        succeeded = 0
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = {executor.submit(send_request, i): i for i in range(0, len(data), batch_size)}
+
+            for future in concurrent.futures.as_completed(futures):
+                response = future.result()
+                # add hash to data as _blob_hash
+                for uuid, blob_hash in response.items():
+                    returned_ids.append(uuid)
+                    returned_hashes.append(blob_hash)
+
+                # A successful upload.
+                succeeded += len(response)
+                pbar.update(len(response))
+
+        hash_tb = pa.Table.from_pydict({self.id_field: returned_ids, "_blob_hash": returned_hashes}, schema=hash_schema)
+        merged_data = data.join(right_table=hash_tb, keys=self.id_field)  # type: ignore
+
+        self._add_data(merged_data, pbar=pbar)
+
     def _add_text(self, data=Union[DataFrame, List[Dict], pa.Table], pbar=None):
         """
         Add text data to the dataset.
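The client-side preparation in _add_blobs resizes and re-encodes images before upload. A standalone sketch of just that step (the helper name is ours, not part of the package):

    from io import BytesIO
    from PIL import Image

    def blob_to_jpeg_bytes(blob, max_side=512):
        # Mirrors the logic above: pass bytes through, open paths, re-encode PIL images.
        if isinstance(blob, bytes):
            return blob
        image = Image.open(blob) if isinstance(blob, str) else blob
        if image.height > max_side or image.width > max_side:
            image = image.resize((max_side, max_side))  # fixed square, as in the diff
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        return buffered.getvalue()

Note the fixed (512, 512) resize does not preserve aspect ratio, and JPEG encoding assumes an RGB-compatible mode; the sketch mirrors the shipped behavior rather than correcting it.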
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nomic
-Version: 3.0.35
+Version: 3.0.37
 Summary: The official Nomic python client.
 Home-page: https://github.com/nomic-ai/nomic
 Author: nomic.ai
@@ -8,7 +8,7 @@ description = "The official Nomic python client."
 
 setup(
     name="nomic",
-    version="3.0.35",
+    version="3.0.37",
     url="https://github.com/nomic-ai/nomic",
     description=description,
     long_description=description,
11 files without changes