nomic 3.0.40__tar.gz → 3.0.42__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nomic might be problematic. Click here for more details.
- {nomic-3.0.40 → nomic-3.0.42}/PKG-INFO +1 -1
- {nomic-3.0.40 → nomic-3.0.42}/nomic/atlas.py +1 -1
- {nomic-3.0.40 → nomic-3.0.42}/nomic/aws/sagemaker.py +37 -20
- {nomic-3.0.40 → nomic-3.0.42}/nomic/dataset.py +2 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.0.40 → nomic-3.0.42}/setup.py +1 -1
- {nomic-3.0.40 → nomic-3.0.42}/README.md +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/__init__.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/aws/__init__.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/cli.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/data_inference.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/data_operations.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/embed.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/settings.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic/utils.py +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/pyproject.toml +0 -0
- {nomic-3.0.40 → nomic-3.0.42}/setup.cfg +0 -0
|
@@ -38,7 +38,7 @@ def map_data(
|
|
|
38
38
|
|
|
39
39
|
Args:
|
|
40
40
|
data: An ordered collection of the datapoints you are structuring. Can be a list of dictionaries, Pandas Dataframe or PyArrow Table.
|
|
41
|
-
blobs: A list of image paths, bytes, or PIL images to add to your image dataset.
|
|
41
|
+
blobs: A list of image paths, bytes, or PIL images to add to your image dataset that are stored locally.
|
|
42
42
|
embeddings: An [N,d] numpy array containing the N embeddings to add.
|
|
43
43
|
identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
|
|
44
44
|
description: The description of your dataset
|
|
@@ -4,7 +4,7 @@ import json
|
|
|
4
4
|
import logging
|
|
5
5
|
import multiprocessing as mp
|
|
6
6
|
from pathlib import PosixPath
|
|
7
|
-
from typing import List, Optional, Union
|
|
7
|
+
from typing import List, Optional, Tuple, Union
|
|
8
8
|
|
|
9
9
|
import boto3
|
|
10
10
|
import PIL
|
|
@@ -187,7 +187,22 @@ def embed_text(
|
|
|
187
187
|
}
|
|
188
188
|
|
|
189
189
|
|
|
190
|
-
|
|
190
|
+
# only way I could get sagemaker with multipart to work
|
|
191
|
+
def prepare_multipart_request(images: List[Tuple[str, bytes]]) -> Tuple[bytes, bytes]:
|
|
192
|
+
# Prepare the multipart body
|
|
193
|
+
boundary = b"---------------------------Boundary"
|
|
194
|
+
body = b""
|
|
195
|
+
for i, (name, img_bytes) in enumerate(images):
|
|
196
|
+
body += b"--" + boundary + b"\r\n"
|
|
197
|
+
body += f'Content-Disposition: form-data; name="{name}"; filename="image_{i}.jpg"\r\n'.encode("utf-8")
|
|
198
|
+
body += b"Content-Type: image/jpeg\r\n\r\n"
|
|
199
|
+
body += img_bytes + b"\r\n"
|
|
200
|
+
body += b"--" + boundary + b"--\r\n"
|
|
201
|
+
|
|
202
|
+
return body, boundary
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> Tuple[bytes, bytes]:
|
|
191
206
|
"""
|
|
192
207
|
Preprocess a list of images for embedding using a sagemaker model.
|
|
193
208
|
|
|
@@ -210,17 +225,22 @@ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List
|
|
|
210
225
|
image = image.convert("RGB")
|
|
211
226
|
buffered = io.BytesIO()
|
|
212
227
|
image.save(buffered, format="JPEG")
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
228
|
+
encoded_images.append(("image_data", buffered.getvalue()))
|
|
229
|
+
|
|
230
|
+
body, boundary = prepare_multipart_request(encoded_images)
|
|
231
|
+
return body, boundary
|
|
216
232
|
|
|
217
233
|
|
|
218
|
-
def sagemaker_image_request(
|
|
219
|
-
|
|
234
|
+
def sagemaker_image_request(
|
|
235
|
+
images: List[Union[str, bytes, "PIL.Image.Image"]], sagemaker_endpoint: str, region_name: str
|
|
236
|
+
):
|
|
237
|
+
body, boundary = preprocess_image(images)
|
|
220
238
|
|
|
221
239
|
client = boto3.client("sagemaker-runtime", region_name=region_name)
|
|
222
240
|
response = client.invoke_endpoint(
|
|
223
|
-
EndpointName=sagemaker_endpoint,
|
|
241
|
+
EndpointName=sagemaker_endpoint,
|
|
242
|
+
Body=body,
|
|
243
|
+
ContentType=f'multipart/form-data; boundary={boundary.decode("utf-8")}',
|
|
224
244
|
)
|
|
225
245
|
|
|
226
246
|
return parse_sagemaker_response(response)
|
|
@@ -230,21 +250,18 @@ def embed_image(
|
|
|
230
250
|
images: List[Union[str, "PIL.Image.Image", bytes]],
|
|
231
251
|
sagemaker_endpoint: str,
|
|
232
252
|
region_name: str,
|
|
233
|
-
model_name="nomic-embed-vision-v1",
|
|
253
|
+
model_name="nomic-embed-vision-v1.5",
|
|
254
|
+
batch_size=16,
|
|
234
255
|
) -> dict:
|
|
235
256
|
embeddings = []
|
|
236
257
|
|
|
237
|
-
max_workers = mp.cpu_count()
|
|
238
258
|
pbar = tqdm(total=len(images))
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
for future in concurrent.futures.as_completed(futures):
|
|
247
|
-
embeddings.extend(future.result())
|
|
259
|
+
for i in range(0, len(images), batch_size):
|
|
260
|
+
batch = images[i : i + batch_size]
|
|
261
|
+
embeddings.extend(
|
|
262
|
+
sagemaker_image_request(batch, sagemaker_endpoint=sagemaker_endpoint, region_name=region_name)
|
|
263
|
+
)
|
|
264
|
+
pbar.update(len(batch))
|
|
248
265
|
|
|
249
266
|
return {
|
|
250
267
|
"embeddings": embeddings,
|
|
@@ -260,7 +277,7 @@ def batch_transform_image(
|
|
|
260
277
|
arn: Optional[str] = None,
|
|
261
278
|
role: Optional[str] = None,
|
|
262
279
|
max_payload: Optional[int] = 6,
|
|
263
|
-
instance_type: str = "ml.
|
|
280
|
+
instance_type: str = "ml.g4dn.xlarge",
|
|
264
281
|
n_instances: int = 1,
|
|
265
282
|
wait: bool = True,
|
|
266
283
|
logs: bool = True,
|
|
@@ -1356,6 +1356,7 @@ class AtlasDataset(AtlasClass):
|
|
|
1356
1356
|
Args:
|
|
1357
1357
|
data: A pandas DataFrame, list of dictionaries, or pyarrow Table matching the dataset schema.
|
|
1358
1358
|
embeddings: A numpy array of embeddings: each row corresponds to a row in the table. Use if you already have embeddings for your datapoints.
|
|
1359
|
+
blobs: A list of image paths, bytes, or PIL Images. Use if you want to create an AtlasDataset using image embeddings over your images. Note: Blobs are stored locally only.
|
|
1359
1360
|
pbar: (Optional). A tqdm progress bar to update.
|
|
1360
1361
|
"""
|
|
1361
1362
|
if embeddings is not None:
|
|
@@ -1374,6 +1375,7 @@ class AtlasDataset(AtlasClass):
|
|
|
1374
1375
|
"""
|
|
1375
1376
|
Add data, with associated blobs, to the dataset.
|
|
1376
1377
|
Uploads blobs to the server and associates them with the data.
|
|
1378
|
+
Blobs must reference objects stored locally
|
|
1377
1379
|
"""
|
|
1378
1380
|
if isinstance(data, DataFrame):
|
|
1379
1381
|
data = pa.Table.from_pandas(data)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|