nomic 3.0.32.tar.gz → 3.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. nomic-3.8.0/MANIFEST.in +1 -0
  2. nomic-3.8.0/PKG-INFO +247 -0
  3. nomic-3.8.0/nomic/__init__.py +10 -0
  4. {nomic-3.0.32 → nomic-3.8.0}/nomic/atlas.py +41 -37
  5. {nomic-3.0.32 → nomic-3.8.0}/nomic/aws/sagemaker.py +45 -41
  6. {nomic-3.0.32 → nomic-3.8.0}/nomic/cli.py +1 -1
  7. nomic-3.8.0/nomic/client.py +430 -0
  8. nomic-3.8.0/nomic/client_models.py +103 -0
  9. {nomic-3.0.32 → nomic-3.8.0}/nomic/data_inference.py +36 -19
  10. {nomic-3.0.32 → nomic-3.8.0}/nomic/data_operations.py +514 -164
  11. {nomic-3.0.32 → nomic-3.8.0}/nomic/dataset.py +422 -314
  12. {nomic-3.0.32 → nomic-3.8.0}/nomic/embed.py +25 -14
  13. nomic-3.8.0/nomic/py.typed +0 -0
  14. {nomic-3.0.32 → nomic-3.8.0}/nomic/settings.py +0 -3
  15. {nomic-3.0.32 → nomic-3.8.0}/nomic/utils.py +33 -15
  16. nomic-3.8.0/nomic.egg-info/PKG-INFO +247 -0
  17. {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/SOURCES.txt +4 -0
  18. {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/requires.txt +3 -2
  19. {nomic-3.0.32 → nomic-3.8.0}/setup.py +22 -4
  20. nomic-3.0.32/PKG-INFO +0 -18
  21. nomic-3.0.32/nomic/__init__.py +0 -2
  22. nomic-3.0.32/nomic.egg-info/PKG-INFO +0 -18
  23. {nomic-3.0.32 → nomic-3.8.0}/README.md +0 -0
  24. {nomic-3.0.32 → nomic-3.8.0}/nomic/aws/__init__.py +0 -0
  25. {nomic-3.0.32 → nomic-3.8.0}/nomic/pl_callbacks/__init__.py +0 -0
  26. {nomic-3.0.32 → nomic-3.8.0}/nomic/pl_callbacks/pl_callback.py +0 -0
  27. {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/dependency_links.txt +0 -0
  28. {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/entry_points.txt +0 -0
  29. {nomic-3.0.32 → nomic-3.8.0}/nomic.egg-info/top_level.txt +0 -0
  30. {nomic-3.0.32 → nomic-3.8.0}/pyproject.toml +0 -0
  31. {nomic-3.0.32 → nomic-3.8.0}/setup.cfg +0 -0
nomic-3.8.0/MANIFEST.in ADDED
@@ -0,0 +1 @@
+ include nomic/py.typed
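The new `nomic/py.typed` marker (shipped via the MANIFEST.in entry above) is the PEP 561 signal that the package carries inline type annotations, so type checkers read nomic's own hints instead of treating the package as untyped. A minimal sketch of what that enables, assuming `AtlasDataset` exposes the `identifier` attribute referenced elsewhere in this diff:

```python
# check.py -- hypothetical snippet; with py.typed present, `mypy check.py`
# validates this against nomic's inline annotations rather than treating them as Any.
from nomic import AtlasDataset

def dataset_name(ds: AtlasDataset) -> str:
    # Attribute access is now checked against nomic's own type hints.
    return ds.identifier
```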
nomic-3.8.0/PKG-INFO ADDED
@@ -0,0 +1,247 @@
+ Metadata-Version: 2.1
+ Name: nomic
+ Version: 3.8.0
+ Summary: The official Nomic python client.
+ Home-page: https://github.com/nomic-ai/nomic
+ Author: nomic.ai
+ Author-email: support@nomic.ai
+ License: UNKNOWN
+ Platform: UNKNOWN
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Programming Language :: Python :: 3
+ Description-Content-Type: text/markdown
+ Provides-Extra: local
+ Provides-Extra: aws
+ Provides-Extra: all
+ Provides-Extra: dev
+
+ <h1 align="center">Nomic Atlas Python Client</h1>
+ <h3 align="center">Explore, label, search and share massive datasets in your web browser.</h3>
+ <p>This repository contains Python bindings for working with <a href="https://atlas.nomic.ai/">Nomic Atlas</a>, the world’s most powerful unstructured data interaction platform. Atlas supports datasets from hundreds to tens of millions of points, and supports data modalities ranging from text to image to audio to video. </p>
+
+ With Nomic Atlas, you can:
+
+ - Generate, store and retrieve embeddings for your unstructured data.
+ - Find insights in your unstructured data and embeddings all from your web browser.
+ - Share and present your datasets and data findings to anyone.
+
+ ### Where to find us?
+
+ [https://atlas.nomic.ai/](https://atlas.nomic.ai/)
+
+
+
+ ## Table of Contents
+
+ - [Quick resources](#quick-resources)
+ - [Example maps](#example-maps)
+ - [Features](#features)
+ - [Quickstart](#quickstart)
+ - [Installation](#installation)
+ - [Make your first map](#make-your-first-map)
+ - [Atlas usage examples](#atlas-usage-examples)
+ - [Access your embeddings](#access-your-embeddings)
+ - [View your data's topic model](#view-your-datas-topic-model)
+ - [Search for data semantically](#search-for-data-semantically)
+ - [Documentation](#documentation)
+ - [Discussion](#discussion)
+ - [Community](#community)
+
+ ## Quick Resources
+
+ <p >
+ Try the <a href="https://colab.research.google.com/drive/1CZBo3LV0FoRTVRN3v068tvNJgbeWpcSX?usp=sharing">:notebook: Colab Demo</a> to get started in Python
+ </p>
+
+ <p>
+ Read the <a href="https://docs.nomic.ai">:closed_book: Atlas Docs</a>
+ </p>
+
+ <p>
+ Join our <a href="https://discord.gg/myY5YDR8z8">:hut: Discord</a> to start chatting and get help
+ </p>
+
+ #### Example maps
+
+ <a href="https://atlas.nomic.ai/map/twitter">:world_map: Map of Twitter</a> (5.4 million tweets)
+ <br> <br>
+ <a href="https://atlas.nomic.ai/map/stablediffusion">:world_map: Map of StableDiffusion Generations</a> (6.4 million images)
+ <br> <br>
+ <a href="https://atlas.nomic.ai/map/neurips">:world_map: Map of NeurIPS Proceedings</a> (16,623 abstracts)
+
+ </p>
+
+ ## Features
+
+ Here are just a few of the features which Atlas offers:
+
+ - Organize your **text, image, and embedding data**
+ - Create **beautiful and shareable** maps **with or without coding knowledge**
+ - Have easy access to both **high-level data structures** and **individual datapoints**
+ - **Search** millions of datapoints **instantly**
+ - **Cluster data** into semantic topics
+ - **Tag and clean** your dataset
+ - **Deduplicate** text, images, video, audio
+
+
+
+ ## Quickstart
+
+ ### Installation
+
+ 1. Install the Nomic library
+
+ ```bash
+ pip install nomic
+ ```
+
+ 2. Login or create your Nomic account:
+
+ ```bash
+ nomic login
+ ```
+
+ 3. Follow the instructions to obtain your access token.
+
+ ```bash
+ nomic login [token]
+ ```
+
+ ### Make your first map
+
+ ```python
+ from nomic import atlas
+ import numpy as np
+
+ # Randomly generate a set of 10,000 high-dimensional embeddings
+ num_embeddings = 10000
+ embeddings = np.random.rand(num_embeddings, 256)
+
+ # Create Atlas project
+ dataset = atlas.map_data(embeddings=embeddings)
+
+ print(dataset)
+ ```
+
+ ## Atlas usage examples
+
+ ### Access your embeddings
+
+ Atlas stores, manages and generates embeddings for your unstructured data.
+
+ You can access Atlas latent embeddings (e.g. high dimensional) or the two-dimensional embeddings generated for web display.
+
+ ```python
+ # Access your Atlas map and download your embeddings
+ map = dataset.maps[0]
+
+ projected_embeddings = map.embeddings.projected
+ latent_embeddings = map.embeddings.latent
+ ```
+
+ ```python
+ print(projected_embeddings)
+ ```
+
+ ```
+ # Response:
+ id x y
+ 0 9.815330 -8.105308
+ 1 -8.725819 5.980628
+ 2 13.199472 -1.103389
+ ... ... ... ...
+ ```
+
+ ```python
+ print(latent_embeddings)
+ ```
+
+ ```
+ # Response:
+ n x d numpy.ndarray where n = number of datapoints and d = number of latent dimensions
+ ```
+
+ ### View your data’s topic model
+
+ Atlas automatically organizes your data into topics informed by the latent contents of your embeddings. Visually, these are represented by regions of homogenous color on an Atlas map.
+
+ You can access and operate on topics programmatically by using the `topics` attribute
+ of an AtlasMap.
+
+ ```python
+ # Access your Atlas map
+ map = dataset.maps[0]
+
+ # Access a pandas DataFrame associating each datum on your map to their topics at each topic depth.
+ topic_df = map.topics.df
+
+ print(map.topics.df)
+
+ ```
+
+ ```
+ Response:
+
+ id topic_depth_1 topic_depth_2
+ 0 Oil Prices mergers and acquisitions
+ 1 Iraq War Trial of Thatcher
+ 2 Oil Prices Economic Growth
+ ... ... ... ...
+ 9997 Oil Prices Economic Growth
+ 9998 Baseball Giambi's contract
+ 9999 Olympic Gold Medal European Football
+
+ ```
+
+ ### Search for data semantically
+
+ Use Atlas to automatically find nearest neighbors in your vector database.
+
+ ```python
+ # Load map and perform vector search for the five nearest neighbors of datum with id "my_query_point"
+ map = dataset.maps[0]
+
+ with dataset.wait_for_dataset_lock():
+ neighbors, _ = map.embeddings.vector_search(ids=['my_query_point'], k=5)
+
+ # Return similar data points
+ similar_datapoints = dataset.get_data(ids=neighbors[0])
+
+ print(similar_datapoints)
+ ```
+
+ ```
+ Response:
+
+ Original query point:
+ "Intel abandons digital TV chip project NEW YORK, October 22 (newratings.com) - Global semiconductor giant Intel Corporation (INTC.NAS) has called off its plan to develop a new chip for the digital projection televisions."
+
+ Nearest neighbors:
+ "Intel awaits government move on expensing options Figuring it's had enough of fighting over options, the chip giant is waiting to see what Congress comes up with."
+ "Citigroup Takes On Intel The financial services giant takes over non-memory semiconductor chip production."
+ "Intel Seen Readying New Wi-Fi Chips SAN FRANCISCO (Reuters) - Intel Corp. this week is expected to introduce a chip that adds support for a relatively obscure version of Wi-Fi, analysts said on Monday, in a move that could help ease congestion on wireless networks."
+ "Intel pledges to bring Itanic down to Xeon price-point EM64T a stand-in until the real anti-AMD64 kit arrives"
+ ```
+
+ ## Background
+
+ Atlas is developed by the [Nomic AI](https://home.nomic.ai/) team, which is based in NYC. Nomic also developed and maintains [GPT4All](https://gpt4all.io/index.html), an open-source LLM chatbot ecosystem.
+
+ ## Discussion
+
+ Join the discussion on our [:hut: Discord](https://discord.gg/myY5YDR8z8) to ask questions, get help, and chat with others about Atlas, Nomic, GPT4All, and related topics. Our doors are open to enthusiasts of all skill levels.
+
+ ## Community
+
+ - Blog: [https://blog.nomic.ai/](https://blog.nomic.ai/)
+ - Twitter: [https://twitter.com/nomic_ai](https://twitter.com/nomic_ai)
+ - Nomic Website: [https://home.nomic.ai/](https://home.nomic.ai/)
+ - Atlas Website: [https://atlas.nomic.ai/](https://atlas.nomic.ai/)
+ - GPT4All Website: [https://gpt4all.io/index.html](https://gpt4all.io/index.html)
+ - LinkedIn: [https://www.linkedin.com/company/nomic-ai](https://www.linkedin.com/company/nomic-ai)
+
+ <br>
+
+ [Go to top](#)
+
+
nomic-3.8.0/nomic/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .cli import login
+ from .client import NomicClient
+ from .dataset import AtlasDataset, AtlasUser
+
+ __all__ = [
+ "AtlasDataset",
+ "AtlasUser",
+ "NomicClient",
+ "login",
+ ]
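The new top-level `__init__.py` re-exports the package's public entry points. A rough sketch of what imports look like in 3.8.0, based only on the `__all__` list above (the `NomicClient` constructor is not shown in this diff, so it is left unused, and the dataset identifier is a placeholder):

```python
# Top-level API surface in 3.8.0, per the __init__.py shown above.
from nomic import AtlasDataset, AtlasUser, NomicClient, login

# `login` is the helper the `nomic login` CLI command wraps; call it with your API token.
# AtlasDataset takes an identifier, as atlas.map_data() does elsewhere in this diff;
# this creates or connects to a hosted dataset and requires being logged in first.
dataset = AtlasDataset(identifier="my-org/my-first-dataset")
```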
{nomic-3.0.32 → nomic-3.8.0}/nomic/atlas.py
@@ -3,17 +3,17 @@ This class allows for programmatic interactions with Atlas - Nomic's neural data
  or in a Jupyter Notebook to organize and interact with your unstructured data.
  """

- import uuid
  from typing import Dict, Iterable, List, Optional, Union

  import numpy as np
  import pyarrow as pa
  from loguru import logger
  from pandas import DataFrame
+ from PIL import Image
  from pyarrow import Table
  from tqdm import tqdm

- from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicProjectOptions, NomicTopicOptions
+ from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicTopicOptions, ProjectionOptions
  from .dataset import AtlasDataset, AtlasDataStream
  from .settings import *
  from .utils import arrow_iterator, b64int, get_random_name
@@ -21,13 +21,14 @@ from .utils import arrow_iterator, b64int, get_random_name

  def map_data(
  data: Optional[Union[DataFrame, List[Dict], Table]] = None,
+ blobs: Optional[List[Union[str, bytes, Image.Image]]] = None,
  embeddings: Optional[np.ndarray] = None,
  identifier: Optional[str] = None,
  description: str = "",
  id_field: Optional[str] = None,
  is_public: bool = True,
  indexed_field: Optional[str] = None,
- projection: Union[bool, Dict, NomicProjectOptions] = True,
+ projection: Optional[Union[Dict, ProjectionOptions]] = None,
  topic_model: Union[bool, Dict, NomicTopicOptions] = True,
  duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
  embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
@@ -36,12 +37,14 @@

  Args:
  data: An ordered collection of the datapoints you are structuring. Can be a list of dictionaries, Pandas Dataframe or PyArrow Table.
+ blobs: A list of image paths, bytes, or PIL images to add to your image dataset that are stored locally.
  embeddings: An [N,d] numpy array containing the N embeddings to add.
  identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
  description: The description of your dataset
- id_field: Specify your data unique id field. This field can be up 36 characters in length. If not specified, one will be created for you named `id_`.
+ id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
  is_public: Should the dataset be accessible outside your Nomic Atlas organization.
- projection: Options to adjust Nomic Project - the dimensionality algorithm organizing your dataset.
+ indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
+ projection: Options for configuring the 2D projection algorithm.
  topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
  duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
  embedding_model: Options to adjust the embedding model used to embed your dataset.
@@ -54,10 +57,33 @@
  raise Exception("Your embeddings cannot be empty")

  if indexed_field is not None:
+ if embeddings is not None:
+ logger.warning("You have specified an indexed field but are using embeddings. Embeddings will be ignored.")
  modality = "text"

- if id_field is None:
- id_field = ATLAS_DEFAULT_ID_FIELD
+ if blobs is not None:
+ if embeddings is not None:
+ raise ValueError(
+ "You cannot pass both `blobs` and `embeddings` to map_data(). To create a map of images, include `blobs` and not `embeddings`. To create a map of embeddings with images as metadata, include your images as a field in your `data` parameter."
+ )
+ # change this when we support other modalities
+ modality = "image"
+ indexed_field = "_blob_hash"
+ if embedding_model is not None:
+ if isinstance(embedding_model, str):
+ model_name = embedding_model
+ elif isinstance(embedding_model, dict):
+ model_name = embedding_model["model"]
+ elif isinstance(embedding_model, NomicEmbedOptions):
+ model_name = embedding_model.model
+ else:
+ raise ValueError("embedding_model must be a string, dictionary, or NomicEmbedOptions object")
+
+ if model_name in ["nomic-embed-text-v1", "nomic-embed-text-v1.5"]:
+ raise Exception("You cannot use a text embedding model with blobs")
+ else:
+ # default to vision v1.5
+ embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")

  project_name = get_random_name()

@@ -70,33 +96,6 @@
  if description:
  description = description

- # no metadata was specified
- added_id_field = False
-
- if data is None and embeddings is not None:
- data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
- added_id_field = True
-
- if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
- if isinstance(data, list) and id_field not in data[0]:
- added_id_field = True
- for i in range(len(data)):
- # do not modify object the user passed in - also ensures IDs are unique if two input datums are the same *object*
- data[i] = data[i].copy()
- data[i][id_field] = b64int(i)
- elif isinstance(data, DataFrame) and id_field not in data.columns:
- data[id_field] = [b64int(i) for i in range(data.shape[0])]
- added_id_field = True
- elif isinstance(data, pa.Table) and not id_field in data.column_names: # type: ignore
- ids = pa.array([b64int(i) for i in range(len(data))])
- data = data.append_column(id_field, ids) # type: ignore
- added_id_field = True
- elif id_field not in data[0]:
- raise ValueError("map_data data must be a list of dicts, a pandas dataframe, or a pyarrow table")
-
- if added_id_field:
- logger.warning("An ID field was not specified in your data so one was generated for you in insertion order.")
-
  dataset = AtlasDataset(
  identifier=dataset_name, description=description, unique_id_field=id_field, is_public=is_public
  )
@@ -109,6 +108,8 @@
  # Add data by modality
  logger.info("Uploading data to Atlas.")
  try:
+ if isinstance(data, DataFrame):
+ data = data.to_dict(orient="records")
  if modality == "text":
  dataset.add_data(data=data)
  elif modality == "embedding":
@@ -116,6 +117,9 @@
  embeddings=embeddings,
  data=data,
  )
+ elif modality == "image":
+ dataset.add_data(blobs=blobs, data=data)
+
  except BaseException as e:
  if number_of_datums_before_upload == 0:
  logger.info(f"{dataset.identifier}: Deleting dataset due to failure in initial upload.")
@@ -128,7 +132,7 @@
  name=index_name,
  indexed_field=indexed_field,
  modality=modality,
- projection=projection,
+ projection=projection, # type: ignore[arg-type]
  topic_model=topic_model,
  duplicate_detection=duplicate_detection,
  embedding_model=embedding_model,
@@ -162,7 +166,7 @@ def map_embeddings(
  Args:
  embeddings: An [N,d] numpy array containing the batch of N embeddings to add.
  data: An [N,] element list of dictionaries containing metadata for each embedding.
- id_field: Specify your data unique id field. This field can be up 36 characters in length. If not specified, one will be created for you named `id_`.
+ id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
  name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
  description: A description for your map.
  is_public: Should this embedding map be public? Private maps can only be accessed by members of your organization.
@@ -210,7 +214,7 @@ def map_text(
  Args:
  data: An [N,] element iterable of dictionaries containing metadata for each embedding.
  indexed_field: The name the data field containing the text your want to map.
- id_field: Specify your data unique id field. This field can be up 36 characters in length. If not specified, one will be created for you named `id_`.
+ id_field: Specify a field that uniquely identifies each datapoint. This field can be up 36 characters in length.
  name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
  description: A description for your map.
  build_topic_model: Builds a hierarchical topic model over your data to discover patterns.
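Taken together, the atlas.py changes add first-class image uploads: passing `blobs` switches `map_data` to the image modality, indexes on the internal `_blob_hash` field, and defaults the embedding model to `nomic-embed-vision-v1.5` (text embedding models are rejected for blobs). A hedged usage sketch of the new parameter; the file paths, metadata fields, and dataset name below are illustrative only:

```python
from nomic import atlas

# Illustrative local image files plus per-image metadata; extra fields ride
# along as metadata on the resulting Atlas map.
image_paths = ["photos/cat_001.jpg", "photos/dog_042.jpg"]
metadata = [
    {"id": "cat_001", "label": "cat"},
    {"id": "dog_042", "label": "dog"},
]

# Per the hunks above: `blobs` and `embeddings` are mutually exclusive, and the
# vision model (nomic-embed-vision-v1.5 by default) generates the embeddings.
dataset = atlas.map_data(
    blobs=image_paths,
    data=metadata,
    id_field="id",
    identifier="my-first-image-map",  # optional; a random name is chosen if omitted
)
```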
{nomic-3.0.32 → nomic-3.8.0}/nomic/aws/sagemaker.py
@@ -4,7 +4,7 @@ import json
  import logging
  import multiprocessing as mp
  from pathlib import PosixPath
- from typing import List, Optional, Union
+ from typing import List, Optional, Tuple, Union

  import boto3
  import PIL
@@ -38,26 +38,6 @@ def parse_sagemaker_response(response):
  return resp["embeddings"]


- def preprocess_texts(texts: List[str], task_type: str = "search_document"):
- """
- Preprocess a list of texts for embedding using a sagemaker model.
-
- Args:
- texts: List of texts to be embedded.
- task_type: The task type to use when embedding. One of `search_query`, `search_document`, `classification`, `clustering`
-
- Returns:
- List of texts formatted for sagemaker embedding.
- """
- assert task_type in [
- "search_query",
- "search_document",
- "classification",
- "clustering",
- ], f"Invalid task type: {task_type}"
- return [f"{task_type}: {text}" for text in texts]
-
-
  def batch_transform_text(
  s3_input_path: str,
  s3_output_path: str,
@@ -157,7 +137,13 @@ def embed_text(
  logger.warning("No texts to embed.")
  return None

- texts = preprocess_texts(texts, task_type)
+ assert task_type in [
+ "search_query",
+ "search_document",
+ "classification",
+ "clustering",
+ ], f"Invalid task type: {task_type}"
+
  assert dimensionality in (
  64,
  128,
@@ -175,6 +161,7 @@
  "texts": texts[i : i + batch_size],
  "binary": binary,
  "dimensionality": dimensionality,
+ "task_type": task_type,
  }
  )
  response = client.invoke_endpoint(EndpointName=sagemaker_endpoint, Body=batch, ContentType="application/json")
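With `preprocess_texts` removed, the client no longer prepends `"{task_type}: "` to each string; it validates `task_type` locally and forwards it in the JSON payload so the endpoint can handle prefixing. A sketch of the request one batch now produces, using the field names from the hunk above (the endpoint name and region are placeholders, and `dimensionality` must be a value accepted by the assertion in `embed_text`):

```python
import json

import boto3

# Shape of a single batch request as built inside embed_text() above.
payload = json.dumps(
    {
        "texts": ["first document", "second document"],
        "binary": False,
        "dimensionality": 768,
        "task_type": "search_document",  # sent to the endpoint instead of being prefixed client-side
    }
)

client = boto3.client("sagemaker-runtime", region_name="us-east-1")
response = client.invoke_endpoint(
    EndpointName="my-nomic-embed-text-endpoint",  # placeholder endpoint name
    Body=payload,
    ContentType="application/json",
)
embeddings = json.loads(response["Body"].read())["embeddings"]
```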
@@ -187,7 +174,22 @@
  }


- def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List[bytes]:
+ # only way I could get sagemaker with multipart to work
+ def prepare_multipart_request(images: List[Tuple[str, bytes]]) -> Tuple[bytes, bytes]:
+ # Prepare the multipart body
+ boundary = b"---------------------------Boundary"
+ body = b""
+ for i, (name, img_bytes) in enumerate(images):
+ body += b"--" + boundary + b"\r\n"
+ body += f'Content-Disposition: form-data; name="{name}"; filename="image_{i}.jpg"\r\n'.encode("utf-8")
+ body += b"Content-Type: image/jpeg\r\n\r\n"
+ body += img_bytes + b"\r\n"
+ body += b"--" + boundary + b"--\r\n"
+
+ return body, boundary
+
+
+ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> Tuple[bytes, bytes]:
  """
  Preprocess a list of images for embedding using a sagemaker model.

@@ -210,17 +212,22 @@ def preprocess_image(images: List[Union[str, "PIL.Image.Image", bytes]]) -> List
  image = image.convert("RGB")
  buffered = io.BytesIO()
  image.save(buffered, format="JPEG")
- encoded_image = buffered.getvalue()
- encoded_images.append(encoded_image)
- return encoded_images
+ encoded_images.append(("image_data", buffered.getvalue()))

+ body, boundary = prepare_multipart_request(encoded_images)
+ return body, boundary

- def sagemaker_image_request(image: Union[str, bytes, "PIL.Image.Image"], sagemaker_endpoint: str, region_name: str):
- preprocessed_image = preprocess_image([image])
+
+ def sagemaker_image_request(
+ images: List[Union[str, bytes, "PIL.Image.Image"]], sagemaker_endpoint: str, region_name: str
+ ):
+ body, boundary = preprocess_image(images)

  client = boto3.client("sagemaker-runtime", region_name=region_name)
  response = client.invoke_endpoint(
- EndpointName=sagemaker_endpoint, Body=preprocessed_image[0], ContentType="image/jpeg"
+ EndpointName=sagemaker_endpoint,
+ Body=body,
+ ContentType=f'multipart/form-data; boundary={boundary.decode("utf-8")}',
  )

  return parse_sagemaker_response(response)
@@ -230,21 +237,18 @@ def embed_image(
  images: List[Union[str, "PIL.Image.Image", bytes]],
  sagemaker_endpoint: str,
  region_name: str,
- model_name="nomic-embed-vision-v1",
+ model_name="nomic-embed-vision-v1.5",
+ batch_size=16,
  ) -> dict:
  embeddings = []

- max_workers = mp.cpu_count()
  pbar = tqdm(total=len(images))
- with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
- futures = []
- for image in images:
- future = executor.submit(sagemaker_image_request, image, sagemaker_endpoint, region_name)
- future.add_done_callback(lambda p: pbar.update())
- futures.append(future)
-
- for future in concurrent.futures.as_completed(futures):
- embeddings.extend(future.result())
+ for i in range(0, len(images), batch_size):
+ batch = images[i : i + batch_size]
+ embeddings.extend(
+ sagemaker_image_request(batch, sagemaker_endpoint=sagemaker_endpoint, region_name=region_name)
+ )
+ pbar.update(len(batch))

  return {
  "embeddings": embeddings,
@@ -260,7 +264,7 @@ def batch_transform_image(
  arn: Optional[str] = None,
  role: Optional[str] = None,
  max_payload: Optional[int] = 6,
- instance_type: str = "ml.p3.2xlarge",
+ instance_type: str = "ml.g4dn.xlarge",
  n_instances: int = 1,
  wait: bool = True,
  logs: bool = True,
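The reworked `embed_image` above sends images in multipart batches (16 per request by default) instead of one process-pool request per image, which removes the per-image HTTP overhead. A hedged usage sketch; the file paths, endpoint name, and region are placeholders:

```python
from nomic.aws.sagemaker import embed_image

# Placeholders: point these at your own deployed Nomic embedding endpoint.
result = embed_image(
    images=["photos/cat_001.jpg", "photos/dog_042.jpg"],
    sagemaker_endpoint="my-nomic-embed-vision-endpoint",
    region_name="us-east-1",
    batch_size=16,  # images per multipart request, per the loop above
)

# The function returns a dict with an "embeddings" list, as shown above.
print(len(result["embeddings"]))
```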
{nomic-3.0.32 → nomic-3.8.0}/nomic/cli.py
@@ -53,7 +53,7 @@ def login(token, tenant="production", domain=None):
  console.print("Authenticate with the Nomic API", style=style, justify="center")
  console.print(auth0_auth_endpoint, style=style, justify="center")
  console.print(
- "Click the above link to retrieve your access token and then run `nomic login [token]`",
+ "Click the above link to retrieve your access token and then run `nomic login \\[token]`",
  style=style,
  justify="center",
  )