nomic 3.0.32__tar.gz → 3.8.0__tar.gz
This diff shows the changes between two publicly released versions of this package, as they appear in the public registry. It is provided for informational purposes only.
- nomic-3.8.0/MANIFEST.in +1 -0
- nomic-3.8.0/PKG-INFO +247 -0
- nomic-3.8.0/nomic/__init__.py +10 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic/atlas.py +41 -37
- {nomic-3.0.32 → nomic-3.8.0}/nomic/aws/sagemaker.py +45 -41
- {nomic-3.0.32 → nomic-3.8.0}/nomic/cli.py +1 -1
- nomic-3.8.0/nomic/client.py +430 -0
- nomic-3.8.0/nomic/client_models.py +103 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic/data_inference.py +36 -19
- {nomic-3.0.32 → nomic-3.8.0}/nomic/data_operations.py +514 -164
- {nomic-3.0.32 → nomic-3.8.0}/nomic/dataset.py +422 -314
- {nomic-3.0.32 → nomic-3.8.0}/nomic/embed.py +25 -14
- nomic-3.8.0/nomic/py.typed +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic/settings.py +0 -3
- {nomic-3.0.32 → nomic-3.8.0}/nomic/utils.py +33 -15
- nomic-3.8.0/nomic.egg-info/PKG-INFO +247 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/SOURCES.txt +4 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/requires.txt +3 -2
- {nomic-3.0.32 → nomic-3.8.0}/setup.py +22 -4
- nomic-3.0.32/PKG-INFO +0 -18
- nomic-3.0.32/nomic/__init__.py +0 -2
- nomic-3.0.32/nomic.egg-info/PKG-INFO +0 -18
- {nomic-3.0.32 → nomic-3.8.0}/README.md +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic/aws/__init__.py +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/pyproject.toml +0 -0
- {nomic-3.0.32 → nomic-3.8.0}/setup.cfg +0 -0
nomic-3.8.0/MANIFEST.in
ADDED

@@ -0,0 +1 @@
+include nomic/py.typed
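
The new MANIFEST.in entry ships the `py.typed` marker, which per PEP 561 tells type checkers that the package's inline annotations can be used directly. A minimal downstream sketch (the file name and snippet are illustrative, not taken from the package):

```python
# example_typed_usage.py: with py.typed shipped in 3.8.0, a checker such as
# mypy reads nomic's own annotations instead of treating the package as untyped.
import numpy as np
from nomic import atlas

embeddings: np.ndarray = np.random.rand(100, 256)
dataset = atlas.map_data(embeddings=embeddings)
```

Running `mypy example_typed_usage.py` against 3.0.32 would typically report the module as missing stubs or a py.typed marker; against 3.8.0 it can type-check using the shipped annotations.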
nomic-3.8.0/PKG-INFO
ADDED

@@ -0,0 +1,247 @@
+Metadata-Version: 2.1
+Name: nomic
+Version: 3.8.0
+Summary: The official Nomic python client.
+Home-page: https://github.com/nomic-ai/nomic
+Author: nomic.ai
+Author-email: support@nomic.ai
+License: UNKNOWN
+Platform: UNKNOWN
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Description-Content-Type: text/markdown
+Provides-Extra: local
+Provides-Extra: aws
+Provides-Extra: all
+Provides-Extra: dev
+
+<h1 align="center">Nomic Atlas Python Client</h1>
+<h3 align="center">Explore, label, search and share massive datasets in your web browser.</h3>
+<p>This repository contains Python bindings for working with <a href="https://atlas.nomic.ai/">Nomic Atlas</a>, the world’s most powerful unstructured data interaction platform. Atlas supports datasets from hundreds to tens of millions of points, and supports data modalities ranging from text to image to audio to video. </p>
+
+With Nomic Atlas, you can:
+
+- Generate, store and retrieve embeddings for your unstructured data.
+- Find insights in your unstructured data and embeddings all from your web browser.
+- Share and present your datasets and data findings to anyone.
+
+### Where to find us?
+
+[https://atlas.nomic.ai/](https://atlas.nomic.ai/)
+
+
+
+## Table of Contents
+
+- [Quick resources](#quick-resources)
+- [Example maps](#example-maps)
+- [Features](#features)
+- [Quickstart](#quickstart)
+- [Installation](#installation)
+- [Make your first map](#make-your-first-map)
+- [Atlas usage examples](#atlas-usage-examples)
+- [Access your embeddings](#access-your-embeddings)
+- [View your data's topic model](#view-your-datas-topic-model)
+- [Search for data semantically](#search-for-data-semantically)
+- [Documentation](#documentation)
+- [Discussion](#discussion)
+- [Community](#community)
+
+## Quick Resources
+
+<p >
+Try the <a href="https://colab.research.google.com/drive/1CZBo3LV0FoRTVRN3v068tvNJgbeWpcSX?usp=sharing">:notebook: Colab Demo</a> to get started in Python
+</p>
+
+<p>
+Read the <a href="https://docs.nomic.ai">:closed_book: Atlas Docs</a>
+</p>
+
+<p>
+Join our <a href="https://discord.gg/myY5YDR8z8">:hut: Discord</a> to start chatting and get help
+</p>
+
+#### Example maps
+
+<a href="https://atlas.nomic.ai/map/twitter">:world_map: Map of Twitter</a> (5.4 million tweets)
+<br> <br>
+<a href="https://atlas.nomic.ai/map/stablediffusion">:world_map: Map of StableDiffusion Generations</a> (6.4 million images)
+<br> <br>
+<a href="https://atlas.nomic.ai/map/neurips">:world_map: Map of NeurIPS Proceedings</a> (16,623 abstracts)
+
+</p>
+
+## Features
+
+Here are just a few of the features which Atlas offers:
+
+- Organize your **text, image, and embedding data**
+- Create **beautiful and shareable** maps **with or without coding knowledge**
+- Have easy access to both **high-level data structures** and **individual datapoints**
+- **Search** millions of datapoints **instantly**
+- **Cluster data** into semantic topics
+- **Tag and clean** your dataset
+- **Deduplicate** text, images, video, audio
+
+
+
+## Quickstart
+
+### Installation
+
+1. Install the Nomic library
+
+```bash
+pip install nomic
+```
+
+2. Login or create your Nomic account:
+
+```bash
+nomic login
+```
+
+3. Follow the instructions to obtain your access token.
+
+```bash
+nomic login [token]
+```
+
+### Make your first map
+
+```python
+from nomic import atlas
+import numpy as np
+
+# Randomly generate a set of 10,000 high-dimensional embeddings
+num_embeddings = 10000
+embeddings = np.random.rand(num_embeddings, 256)
+
+# Create Atlas project
+dataset = atlas.map_data(embeddings=embeddings)
+
+print(dataset)
+```
+
+## Atlas usage examples
+
+### Access your embeddings
+
+Atlas stores, manages and generates embeddings for your unstructured data.
+
+You can access Atlas latent embeddings (e.g. high dimensional) or the two-dimensional embeddings generated for web display.
+
+```python
+# Access your Atlas map and download your embeddings
+map = dataset.maps[0]
+
+projected_embeddings = map.embeddings.projected
+latent_embeddings = map.embeddings.latent
+```
+
+```python
+print(projected_embeddings)
+```
+
+```
+# Response:
+id            x           y
+0      9.815330   -8.105308
+1     -8.725819    5.980628
+2     13.199472   -1.103389
+...         ...         ...
+```
+
+```python
+print(latent_embeddings)
+```
+
+```
+# Response:
+n x d numpy.ndarray where n = number of datapoints and d = number of latent dimensions
+```
+
+### View your data’s topic model
+
+Atlas automatically organizes your data into topics informed by the latent contents of your embeddings. Visually, these are represented by regions of homogenous color on an Atlas map.
+
+You can access and operate on topics programmatically by using the `topics` attribute
+of an AtlasMap.
+
+```python
+# Access your Atlas map
+map = dataset.maps[0]
+
+# Access a pandas DataFrame associating each datum on your map to their topics at each topic depth.
+topic_df = map.topics.df
+
+print(map.topics.df)
+
+```
+
+```
+Response:
+
+id    topic_depth_1       topic_depth_2
+0     Oil Prices          mergers and acquisitions
+1     Iraq War            Trial of Thatcher
+2     Oil Prices          Economic Growth
+...   ...                 ...
+9997  Oil Prices          Economic Growth
+9998  Baseball            Giambi's contract
+9999  Olympic Gold Medal  European Football
+
+```
+
+### Search for data semantically
+
+Use Atlas to automatically find nearest neighbors in your vector database.
+
+```python
+# Load map and perform vector search for the five nearest neighbors of datum with id "my_query_point"
+map = dataset.maps[0]
+
+with dataset.wait_for_dataset_lock():
+    neighbors, _ = map.embeddings.vector_search(ids=['my_query_point'], k=5)
+
+# Return similar data points
+similar_datapoints = dataset.get_data(ids=neighbors[0])
+
+print(similar_datapoints)
+```
+
+```
+Response:
+
+Original query point:
+"Intel abandons digital TV chip project NEW YORK, October 22 (newratings.com) - Global semiconductor giant Intel Corporation (INTC.NAS) has called off its plan to develop a new chip for the digital projection televisions."
+
+Nearest neighbors:
+"Intel awaits government move on expensing options Figuring it's had enough of fighting over options, the chip giant is waiting to see what Congress comes up with."
+"Citigroup Takes On Intel The financial services giant takes over non-memory semiconductor chip production."
+"Intel Seen Readying New Wi-Fi Chips SAN FRANCISCO (Reuters) - Intel Corp. this week is expected to introduce a chip that adds support for a relatively obscure version of Wi-Fi, analysts said on Monday, in a move that could help ease congestion on wireless networks."
+"Intel pledges to bring Itanic down to Xeon price-point EM64T a stand-in until the real anti-AMD64 kit arrives"
+```
+
+## Background
+
+Atlas is developed by the [Nomic AI](https://home.nomic.ai/) team, which is based in NYC. Nomic also developed and maintains [GPT4All](https://gpt4all.io/index.html), an open-source LLM chatbot ecosystem.
+
+## Discussion
+
+Join the discussion on our [:hut: Discord](https://discord.gg/myY5YDR8z8) to ask questions, get help, and chat with others about Atlas, Nomic, GPT4All, and related topics. Our doors are open to enthusiasts of all skill levels.
+
+## Community
+
+- Blog: [https://blog.nomic.ai/](https://blog.nomic.ai/)
+- Twitter: [https://twitter.com/nomic_ai](https://twitter.com/nomic_ai)
+- Nomic Website: [https://home.nomic.ai/](https://home.nomic.ai/)
+- Atlas Website: [https://atlas.nomic.ai/](https://atlas.nomic.ai/)
+- GPT4All Website: [https://gpt4all.io/index.html](https://gpt4all.io/index.html)
+- LinkedIn: [https://www.linkedin.com/company/nomic-ai](https://www.linkedin.com/company/nomic-ai)
+
+<br>
+
+[Go to top](#)
+
+
{nomic-3.0.32 → nomic-3.8.0}/nomic/atlas.py
CHANGED

@@ -3,17 +3,17 @@ This class allows for programmatic interactions with Atlas - Nomic's neural data
 or in a Jupyter Notebook to organize and interact with your unstructured data.
 """
 
-import uuid
 from typing import Dict, Iterable, List, Optional, Union
 
 import numpy as np
 import pyarrow as pa
 from loguru import logger
 from pandas import DataFrame
+from PIL import Image
 from pyarrow import Table
 from tqdm import tqdm
 
-from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions,
+from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicTopicOptions, ProjectionOptions
 from .dataset import AtlasDataset, AtlasDataStream
 from .settings import *
 from .utils import arrow_iterator, b64int, get_random_name
@@ -21,13 +21,14 @@ from .utils import arrow_iterator, b64int, get_random_name
 
 def map_data(
     data: Optional[Union[DataFrame, List[Dict], Table]] = None,
+    blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
     embeddings: Optional[np.ndarray] = None,
     identifier: Optional[str] = None,
     description: str = "",
     id_field: Optional[str] = None,
     is_public: bool = True,
     indexed_field: Optional[str] = None,
-    projection: Union[
+    projection: Optional[Union[Dict, ProjectionOptions]] = None,
     topic_model: Union[bool, Dict, NomicTopicOptions] = True,
     duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
     embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
@@ -36,12 +37,14 @@ def map_data(
 
     Args:
         data: An ordered collection of the datapoints you are structuring. Can be a list of dictionaries, Pandas Dataframe or PyArrow Table.
+        blobs: A list of image paths, bytes, or PIL images to add to your image dataset that are stored locally.
         embeddings: An [N,d] numpy array containing the N embeddings to add.
         identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
         description: The description of your dataset
-        id_field: Specify
+        id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
         is_public: Should the dataset be accessible outside your Nomic Atlas organization.
-
+        indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
+        projection: Options for configuring the 2D projection algorithm.
         topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
         duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
        embedding_model: Options to adjust the embedding model used to embed your dataset.
@@ -54,10 +57,33 @@ def map_data(
         raise Exception("Your embeddings cannot be empty")
 
     if indexed_field is not None:
+        if embeddings is not None:
+            logger.warning("You have specified an indexed field but are using embeddings. Embeddings will be ignored.")
         modality = "text"
 
-    if
-
+    if blobs is not None:
+        if embeddings is not None:
+            raise ValueError(
+                "You cannot pass both `blobs` and `embeddings` to map_data(). To create a map of images, include `blobs` and not `embeddings`. To create a map of embeddings with images as metadata, include your images as a field in your `data` parameter."
+            )
+        # change this when we support other modalities
+        modality = "image"
+        indexed_field = "_blob_hash"
+        if embedding_model is not None:
+            if isinstance(embedding_model, str):
+                model_name = embedding_model
+            elif isinstance(embedding_model, dict):
+                model_name = embedding_model["model"]
+            elif isinstance(embedding_model, NomicEmbedOptions):
+                model_name = embedding_model.model
+            else:
+                raise ValueError("embedding_model must be a string, dictionary, or NomicEmbedOptions object")
+
+            if model_name in ["nomic-embed-text-v1", "nomic-embed-text-v1.5"]:
+                raise Exception("You cannot use a text embedding model with blobs")
+        else:
+            # default to vision v1.5
+            embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")
 
     project_name = get_random_name()
 
@@ -70,33 +96,6 @@ def map_data(
     if description:
         description = description
 
-    # no metadata was specified
-    added_id_field = False
-
-    if data is None and embeddings is not None:
-        data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
-        added_id_field = True
-
-    if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
-        if isinstance(data, list) and id_field not in data[0]:
-            added_id_field = True
-            for i in range(len(data)):
-                # do not modify object the user passed in - also ensures IDs are unique if two input datums are the same *object*
-                data[i] = data[i].copy()
-                data[i][id_field] = b64int(i)
-        elif isinstance(data, DataFrame) and id_field not in data.columns:
-            data[id_field] = [b64int(i) for i in range(data.shape[0])]
-            added_id_field = True
-        elif isinstance(data, pa.Table) and not id_field in data.column_names:  # type: ignore
-            ids = pa.array([b64int(i) for i in range(len(data))])
-            data = data.append_column(id_field, ids)  # type: ignore
-            added_id_field = True
-        elif id_field not in data[0]:
-            raise ValueError("map_data data must be a list of dicts, a pandas dataframe, or a pyarrow table")
-
-    if added_id_field:
-        logger.warning("An ID field was not specified in your data so one was generated for you in insertion order.")
-
     dataset = AtlasDataset(
         identifier=dataset_name, description=description, unique_id_field=id_field, is_public=is_public
     )
@@ -109,6 +108,8 @@ def map_data(
     # Add data by modality
     logger.info("Uploading data to Atlas.")
     try:
+        if isinstance(data, DataFrame):
+            data = data.to_dict(orient="records")
         if modality == "text":
             dataset.add_data(data=data)
         elif modality == "embedding":
@@ -116,6 +117,9 @@ def map_data(
                 embeddings=embeddings,
                 data=data,
             )
+        elif modality == "image":
+            dataset.add_data(blobs=blobs, data=data)
+
     except BaseException as e:
         if number_of_datums_before_upload == 0:
             logger.info(f"{dataset.identifier}: Deleting dataset due to failure in initial upload.")
@@ -128,7 +132,7 @@ def map_data(
         name=index_name,
         indexed_field=indexed_field,
         modality=modality,
-        projection=projection,
+        projection=projection,  # type: ignore[arg-type]
        topic_model=topic_model,
        duplicate_detection=duplicate_detection,
        embedding_model=embedding_model,
@@ -162,7 +166,7 @@ def map_embeddings(
     Args:
         embeddings: An [N,d] numpy array containing the batch of N embeddings to add.
         data: An [N,] element list of dictionaries containing metadata for each embedding.
-        id_field: Specify
+        id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
         name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
         description: A description for your map.
         is_public: Should this embedding map be public? Private maps can only be accessed by members of your organization.
@@ -210,7 +214,7 @@ def map_text(
     Args:
         data: An [N,] element iterable of dictionaries containing metadata for each embedding.
         indexed_field: The name the data field containing the text your want to map.
-        id_field: Specify
+        id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
         name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
         description: A description for your map.
         build_topic_model: Builds a hierarchical topic model over your data to discover patterns.
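
The `blobs` parameter added to `map_data()` above is the main user-facing change in nomic/atlas.py. A minimal usage sketch, assuming local image files and metadata that are purely illustrative (not taken from the package docs):

```python
from nomic import atlas

# Hypothetical local images plus optional per-image metadata.
image_paths = ["images/0001.jpg", "images/0002.jpg"]
metadata = [{"caption": "a cat"}, {"caption": "a dog"}]

# Per the hunks above: blobs and embeddings are mutually exclusive, the modality
# is set to "image", and the embedding model defaults to nomic-embed-vision-v1.5
# unless a vision model is passed explicitly.
dataset = atlas.map_data(blobs=image_paths, data=metadata)
print(dataset)
```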
{nomic-3.0.32 → nomic-3.8.0}/nomic/aws/sagemaker.py
CHANGED

@@ -4,7 +4,7 @@ import json
 import logging
 import multiprocessing as mp
 from pathlib import PosixPath
-from typing import List, Optional, Union
+from typing import List, Optional, Tuple, Union
 
 import boto3
 import PIL
@@ -38,26 +38,6 @@ def parse_sagemaker_response(response):
     return resp["embeddings"]
 
 
-def preprocess_texts(texts: List[str], task_type: str = "search_document"):
-    """
-    Preprocess a list of texts for embedding using a sagemaker model.
-
-    Args:
-        texts: List of texts to be embedded.
-        task_type: The task type to use when embedding. One of `search_query`, `search_document`, `classification`, `clustering`
-
-    Returns:
-        List of texts formatted for sagemaker embedding.
-    """
-    assert task_type in [
-        "search_query",
-        "search_document",
-        "classification",
-        "clustering",
-    ], f"Invalid task type: {task_type}"
-    return [f"{task_type}: {text}" for text in texts]
-
-
 def batch_transform_text(
     s3_input_path: str,
     s3_output_path: str,
@@ -157,7 +137,13 @@ def embed_text(
         logger.warning("No texts to embed.")
         return None
 
-
+    assert task_type in [
+        "search_query",
+        "search_document",
+        "classification",
+        "clustering",
+    ], f"Invalid task type: {task_type}"
+
     assert dimensionality in (
         64,
         128,
@@ -175,6 +161,7 @@ def embed_text(
                 "texts": texts[i : i + batch_size],
                 "binary": binary,
                 "dimensionality": dimensionality,
+                "task_type": task_type,
             }
         )
         response = client.invoke_endpoint(EndpointName=sagemaker_endpoint, Body=batch, ContentType="application/json")
@@ -187,7 +174,22 @@ def embed_text(
     }
 
 
-
+# only way I could get sagemaker with multipart to work
+def prepare_multipart_request(images: List[Tuple[str, bytes]]) -> Tuple[bytes, bytes]:
+    # Prepare the multipart body
+    boundary = b"---------------------------Boundary"
+    body = b""
+    for i, (name, img_bytes) in enumerate(images):
+        body += b"--" + boundary + b"\r\n"
+        body += f'Content-Disposition: form-data; name="{name}"; filename="image_{i}.jpg"\r\n'.encode("utf-8")
+        body += b"Content-Type: image/jpeg\r\n\r\n"
+        body += img_bytes + b"\r\n"
+    body += b"--" + boundary + b"--\r\n"
+
+    return body, boundary
+
+
+def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> Tuple[bytes, bytes]:
     """
     Preprocess a list of images for embedding using a sagemaker model.
 
@@ -210,17 +212,22 @@ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List
         image = image.convert("RGB")
         buffered = io.BytesIO()
         image.save(buffered, format="JPEG")
-
-        encoded_images.append(encoded_image)
-    return encoded_images
+        encoded_images.append(("image_data", buffered.getvalue()))
 
+    body, boundary = prepare_multipart_request(encoded_images)
+    return body, boundary
 
-
-
+
+def sagemaker_image_request(
+    images: List[Union[str, bytes, "PIL.Image.Image"]], sagemaker_endpoint: str, region_name: str
+):
+    body, boundary = preprocess_image(images)
 
     client = boto3.client("sagemaker-runtime", region_name=region_name)
     response = client.invoke_endpoint(
-        EndpointName=sagemaker_endpoint,
+        EndpointName=sagemaker_endpoint,
+        Body=body,
+        ContentType=f'multipart/form-data; boundary={boundary.decode("utf-8")}',
     )
 
     return parse_sagemaker_response(response)
@@ -230,21 +237,18 @@ def embed_image(
     images: List[Union[str, "PIL.Image.Image", bytes]],
     sagemaker_endpoint: str,
     region_name: str,
-    model_name="nomic-embed-vision-v1",
+    model_name="nomic-embed-vision-v1.5",
+    batch_size=16,
 ) -> dict:
     embeddings = []
 
-    max_workers = mp.cpu_count()
     pbar = tqdm(total=len(images))
-
-
-
-
-
-
-
-        for future in concurrent.futures.as_completed(futures):
-            embeddings.extend(future.result())
+    for i in range(0, len(images), batch_size):
+        batch = images[i : i + batch_size]
+        embeddings.extend(
+            sagemaker_image_request(batch, sagemaker_endpoint=sagemaker_endpoint, region_name=region_name)
+        )
+        pbar.update(len(batch))
 
     return {
         "embeddings": embeddings,
@@ -260,7 +264,7 @@ def batch_transform_image(
     arn: Optional[str] = None,
     role: Optional[str] = None,
     max_payload: Optional[int] = 6,
-    instance_type: str = "ml.
+    instance_type: str = "ml.g4dn.xlarge",
     n_instances: int = 1,
     wait: bool = True,
     logs: bool = True,
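
A hedged sketch of the reworked image path in nomic/aws/sagemaker.py: `embed_image()` now iterates over fixed-size batches and sends each batch as a single multipart request through `sagemaker_image_request()`, rather than fanning work out across a thread pool. The endpoint name, region, and file paths below are placeholders:

```python
from nomic.aws.sagemaker import embed_image

# Placeholders: substitute a deployed Nomic vision endpoint and its region.
result = embed_image(
    images=["photos/a.jpg", "photos/b.jpg"],  # paths, bytes, or PIL images
    sagemaker_endpoint="my-nomic-vision-endpoint",
    region_name="us-east-1",
    batch_size=16,  # new parameter introduced in this release
)

# Per the diff above, the return value is a dict with an "embeddings" list.
print(len(result["embeddings"]))
```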
{nomic-3.0.32 → nomic-3.8.0}/nomic/cli.py
CHANGED

@@ -53,7 +53,7 @@ def login(token, tenant="production", domain=None):
     console.print("Authenticate with the Nomic API", style=style, justify="center")
     console.print(auth0_auth_endpoint, style=style, justify="center")
     console.print(
-        "Click the above link to retrieve your access token and then run `nomic login [token]`",
+        "Click the above link to retrieve your access token and then run `nomic login \\[token]`",
         style=style,
         justify="center",
     )
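
The only change in nomic/cli.py escapes the square brackets in the login hint. A minimal sketch of why, assuming rich's default console markup handling (the message text here is abbreviated):

```python
from rich.console import Console

console = Console()

# rich parses [ ... ] as markup tags, so the backslash escape is needed for the
# literal placeholder "[token]" to survive in the rendered output.
console.print("retrieve your access token and then run `nomic login \\[token]`")
```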