nomic 3.4.1__tar.gz → 3.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nomic might be problematic. Click here for more details.
- {nomic-3.4.1 → nomic-3.5.1}/PKG-INFO +1 -1
- {nomic-3.4.1 → nomic-3.5.1}/nomic/atlas.py +9 -43
- {nomic-3.4.1 → nomic-3.5.1}/nomic/data_inference.py +25 -14
- {nomic-3.4.1 → nomic-3.5.1}/nomic/data_operations.py +337 -87
- {nomic-3.4.1 → nomic-3.5.1}/nomic/dataset.py +157 -157
- {nomic-3.4.1 → nomic-3.5.1}/nomic/settings.py +0 -3
- {nomic-3.4.1 → nomic-3.5.1}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.4.1 → nomic-3.5.1}/setup.py +1 -1
- {nomic-3.4.1 → nomic-3.5.1}/README.md +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/__init__.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/aws/__init__.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/cli.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/embed.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic/utils.py +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic.egg-info/SOURCES.txt +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic.egg-info/requires.txt +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/pyproject.toml +0 -0
- {nomic-3.4.1 → nomic-3.5.1}/setup.cfg +0 -0
|
@@ -3,7 +3,6 @@ This class allows for programmatic interactions with Atlas - Nomic's neural data
|
|
|
3
3
|
or in a Jupyter Notebook to organize and interact with your unstructured data.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import uuid
|
|
7
6
|
from typing import Dict, Iterable, List, Optional, Union
|
|
8
7
|
|
|
9
8
|
import numpy as np
|
|
@@ -14,7 +13,7 @@ from PIL import Image
|
|
|
14
13
|
from pyarrow import Table
|
|
15
14
|
from tqdm import tqdm
|
|
16
15
|
|
|
17
|
-
from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions,
|
|
16
|
+
from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicTopicOptions, ProjectionOptions
|
|
18
17
|
from .dataset import AtlasDataset, AtlasDataStream
|
|
19
18
|
from .settings import *
|
|
20
19
|
from .utils import arrow_iterator, b64int, get_random_name
|
|
@@ -29,7 +28,7 @@ def map_data(
|
|
|
29
28
|
id_field: Optional[str] = None,
|
|
30
29
|
is_public: bool = True,
|
|
31
30
|
indexed_field: Optional[str] = None,
|
|
32
|
-
projection: Union[
|
|
31
|
+
projection: Optional[Union[Dict, ProjectionOptions]] = None,
|
|
33
32
|
topic_model: Union[bool, Dict, NomicTopicOptions] = True,
|
|
34
33
|
duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
|
|
35
34
|
embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
|
|
@@ -42,10 +41,10 @@ def map_data(
|
|
|
42
41
|
embeddings: An [N,d] numpy array containing the N embeddings to add.
|
|
43
42
|
identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
|
|
44
43
|
description: The description of your dataset
|
|
45
|
-
id_field: Specify
|
|
44
|
+
id_field: Specify a field that uniquely identifies each datapoint. This field can be up to 36 characters in length.
|
|
46
45
|
is_public: Should the dataset be accessible outside your Nomic Atlas organization.
|
|
47
46
|
indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
|
|
48
|
-
projection: Options
|
|
47
|
+
projection: Options for configuring the 2D projection algorithm.
|
|
49
48
|
topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
|
|
50
49
|
duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
|
|
51
50
|
embedding_model: Options to adjust the embedding model used to embed your dataset.
|
|
@@ -86,9 +85,6 @@ def map_data(
|
|
|
86
85
|
# default to vision v1.5
|
|
87
86
|
embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")
|
|
88
87
|
|
|
89
|
-
if id_field is None:
|
|
90
|
-
id_field = ATLAS_DEFAULT_ID_FIELD
|
|
91
|
-
|
|
92
88
|
project_name = get_random_name()
|
|
93
89
|
|
|
94
90
|
dataset_name = project_name
|
|
@@ -100,38 +96,6 @@ def map_data(
|
|
|
100
96
|
if description:
|
|
101
97
|
description = description
|
|
102
98
|
|
|
103
|
-
# no metadata was specified
|
|
104
|
-
added_id_field = False
|
|
105
|
-
|
|
106
|
-
if data is None:
|
|
107
|
-
added_id_field = True
|
|
108
|
-
if embeddings is not None:
|
|
109
|
-
data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
|
|
110
|
-
elif blobs is not None:
|
|
111
|
-
data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(blobs))]
|
|
112
|
-
else:
|
|
113
|
-
raise ValueError("You must specify either data, embeddings, or blobs")
|
|
114
|
-
|
|
115
|
-
if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
|
|
116
|
-
if isinstance(data, list) and id_field not in data[0]:
|
|
117
|
-
added_id_field = True
|
|
118
|
-
for i in range(len(data)):
|
|
119
|
-
# do not modify object the user passed in - also ensures IDs are unique if two input datums are the same *object*
|
|
120
|
-
data[i] = data[i].copy()
|
|
121
|
-
data[i][id_field] = b64int(i)
|
|
122
|
-
elif isinstance(data, DataFrame) and id_field not in data.columns:
|
|
123
|
-
data[id_field] = [b64int(i) for i in range(data.shape[0])]
|
|
124
|
-
added_id_field = True
|
|
125
|
-
elif isinstance(data, pa.Table) and not id_field in data.column_names: # type: ignore
|
|
126
|
-
ids = pa.array([b64int(i) for i in range(len(data))])
|
|
127
|
-
data = data.append_column(id_field, ids) # type: ignore
|
|
128
|
-
added_id_field = True
|
|
129
|
-
elif id_field not in data[0]:
|
|
130
|
-
raise ValueError("map_data data must be a list of dicts, a pandas dataframe, or a pyarrow table")
|
|
131
|
-
|
|
132
|
-
if added_id_field:
|
|
133
|
-
logger.warning("An ID field was not specified in your data so one was generated for you in insertion order.")
|
|
134
|
-
|
|
135
99
|
dataset = AtlasDataset(
|
|
136
100
|
identifier=dataset_name, description=description, unique_id_field=id_field, is_public=is_public
|
|
137
101
|
)
|
|
@@ -144,6 +108,8 @@ def map_data(
|
|
|
144
108
|
# Add data by modality
|
|
145
109
|
logger.info("Uploading data to Atlas.")
|
|
146
110
|
try:
|
|
111
|
+
if isinstance(data, DataFrame):
|
|
112
|
+
data = data.to_dict(orient="records")
|
|
147
113
|
if modality == "text":
|
|
148
114
|
dataset.add_data(data=data)
|
|
149
115
|
elif modality == "embedding":
|
|
@@ -166,7 +132,7 @@ def map_data(
|
|
|
166
132
|
name=index_name,
|
|
167
133
|
indexed_field=indexed_field,
|
|
168
134
|
modality=modality,
|
|
169
|
-
projection=projection,
|
|
135
|
+
projection=projection, # type: ignore[arg-type]
|
|
170
136
|
topic_model=topic_model,
|
|
171
137
|
duplicate_detection=duplicate_detection,
|
|
172
138
|
embedding_model=embedding_model,
|
|
@@ -200,7 +166,7 @@ def map_embeddings(
|
|
|
200
166
|
Args:
|
|
201
167
|
embeddings: An [N,d] numpy array containing the batch of N embeddings to add.
|
|
202
168
|
data: An [N,] element list of dictionaries containing metadata for each embedding.
|
|
203
|
-
id_field: Specify
|
|
169
|
+
id_field: Specify a field that uniquely identifies each datapoint. This field can be up to 36 characters in length.
|
|
204
170
|
name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
|
|
205
171
|
description: A description for your map.
|
|
206
172
|
is_public: Should this embedding map be public? Private maps can only be accessed by members of your organization.
|
|
@@ -248,7 +214,7 @@ def map_text(
|
|
|
248
214
|
Args:
|
|
249
215
|
data: An [N,] element iterable of dictionaries containing metadata for each embedding.
|
|
250
216
|
indexed_field: The name of the data field containing the text you want to map.
|
|
251
|
-
id_field: Specify
|
|
217
|
+
id_field: Specify a field that uniquely identifies each datapoint. This field can be up to 36 characters in length.
|
|
252
218
|
name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
|
|
253
219
|
description: A description for your map.
|
|
254
220
|
build_topic_model: Builds a hierarchical topic model over your data to discover patterns.
|
|
@@ -57,25 +57,36 @@ def convert_pyarrow_schema_for_atlas(schema: pa.Schema) -> pa.Schema:
|
|
|
57
57
|
return pa.schema({**usertypes, **whitelist})
|
|
58
58
|
|
|
59
59
|
|
|
60
|
-
class
|
|
60
|
+
class ProjectionOptions(BaseModel):
|
|
61
61
|
"""
|
|
62
|
-
|
|
62
|
+
Generic options for 2D Dimensionality Reduction
|
|
63
63
|
|
|
64
64
|
Args:
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
spread: Determines how tight together points appear.
|
|
70
|
-
|
|
65
|
+
model: The projection model to use.
|
|
66
|
+
n_neighbors: The number of neighbors to use for the projection algorithm.
|
|
67
|
+
n_epochs: How many dataset passes to train the projection model.
|
|
68
|
+
min_dist: Controls how tightly points are packed together.
|
|
69
|
+
spread: Nomic Project specific: Determines how tight together points appear.
|
|
70
|
+
local_neighborhood_size: Nomic Project v2 specific: Controls the local neighborhood size.
|
|
71
|
+
rho: Nomic Project v2 specific: Controls the spread in local structure.
|
|
71
72
|
"""
|
|
72
73
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
74
|
+
model: Optional[str] = Field(
|
|
75
|
+
default=None,
|
|
76
|
+
description="Projection model to use (e.g., 'umap', 'nomic-project-v1', 'nomic-project-v2').",
|
|
77
|
+
)
|
|
78
|
+
n_neighbors: Optional[int] = Field(default=None, description="Number of neighbors for the projection algorithm.")
|
|
79
|
+
n_epochs: Optional[int] = Field(default=None, description="Number of epochs for training the projection model.")
|
|
80
|
+
min_dist: Optional[float] = Field(default=None, description="Minimum distance between points.")
|
|
81
|
+
spread: Optional[float] = Field(default=None, description="Nomic Project specific: Spread of the point layout.")
|
|
82
|
+
local_neighborhood_size: Optional[int] = Field(
|
|
83
|
+
default=None,
|
|
84
|
+
description="Nomic Project v2 specific: Local neighborhood size. Only used when model is 'nomic-project-v2'.",
|
|
85
|
+
)
|
|
86
|
+
rho: Optional[float] = Field(
|
|
87
|
+
default=None,
|
|
88
|
+
description="Nomic Project v2 specific: Rho parameter. Only used when model is 'nomic-project-v2'.",
|
|
89
|
+
)
|
|
79
90
|
|
|
80
91
|
|
|
81
92
|
class NomicTopicOptions(BaseModel):
|