nomic 3.4.1__tar.gz → 3.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nomic might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nomic
3
- Version: 3.4.1
3
+ Version: 3.5.1
4
4
  Summary: The official Nomic python client.
5
5
  Home-page: https://github.com/nomic-ai/nomic
6
6
  Author: nomic.ai
@@ -3,7 +3,6 @@ This class allows for programmatic interactions with Atlas - Nomic's neural data
3
3
  or in a Jupyter Notebook to organize and interact with your unstructured data.
4
4
  """
5
5
 
6
- import uuid
7
6
  from typing import Dict, Iterable, List, Optional, Union
8
7
 
9
8
  import numpy as np
@@ -14,7 +13,7 @@ from PIL import Image
14
13
  from pyarrow import Table
15
14
  from tqdm import tqdm
16
15
 
17
- from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicProjectOptions, NomicTopicOptions
16
+ from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicTopicOptions, ProjectionOptions
18
17
  from .dataset import AtlasDataset, AtlasDataStream
19
18
  from .settings import *
20
19
  from .utils import arrow_iterator, b64int, get_random_name
@@ -29,7 +28,7 @@ def map_data(
29
28
  id_field: Optional[str] = None,
30
29
  is_public: bool = True,
31
30
  indexed_field: Optional[str] = None,
32
- projection: Union[bool, Dict, NomicProjectOptions] = True,
31
+ projection: Optional[Union[Dict, ProjectionOptions]] = None,
33
32
  topic_model: Union[bool, Dict, NomicTopicOptions] = True,
34
33
  duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
35
34
  embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
@@ -42,10 +41,10 @@ def map_data(
42
41
  embeddings: An [N,d] numpy array containing the N embeddings to add.
43
42
  identifier: A name for your dataset that is used to generate the dataset identifier. A unique name will be chosen if not supplied.
44
43
  description: The description of your dataset
45
- id_field: Specify your data unique id field. This field can be up 36 characters in length. If not specified, one will be created for you named `id_`.
44
+ id_field: Specify a field that uniquely identifies each datapoint. This field can be up to 36 characters in length.
46
45
  is_public: Should the dataset be accessible outside your Nomic Atlas organization.
47
46
  indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
48
- projection: Options to adjust Nomic Project - the dimensionality algorithm organizing your dataset.
47
+ projection: Options for configuring the 2D projection algorithm.
49
48
  topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
50
49
  duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
51
50
  embedding_model: Options to adjust the embedding model used to embed your dataset.
@@ -86,9 +85,6 @@ def map_data(
86
85
  # default to vision v1.5
87
86
  embedding_model = NomicEmbedOptions(model="nomic-embed-vision-v1.5")
88
87
 
89
- if id_field is None:
90
- id_field = ATLAS_DEFAULT_ID_FIELD
91
-
92
88
  project_name = get_random_name()
93
89
 
94
90
  dataset_name = project_name
@@ -100,38 +96,6 @@ def map_data(
100
96
  if description:
101
97
  description = description
102
98
 
103
- # no metadata was specified
104
- added_id_field = False
105
-
106
- if data is None:
107
- added_id_field = True
108
- if embeddings is not None:
109
- data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(embeddings))]
110
- elif blobs is not None:
111
- data = [{ATLAS_DEFAULT_ID_FIELD: b64int(i)} for i in range(len(blobs))]
112
- else:
113
- raise ValueError("You must specify either data, embeddings, or blobs")
114
-
115
- if id_field == ATLAS_DEFAULT_ID_FIELD and data is not None:
116
- if isinstance(data, list) and id_field not in data[0]:
117
- added_id_field = True
118
- for i in range(len(data)):
119
- # do not modify object the user passed in - also ensures IDs are unique if two input datums are the same *object*
120
- data[i] = data[i].copy()
121
- data[i][id_field] = b64int(i)
122
- elif isinstance(data, DataFrame) and id_field not in data.columns:
123
- data[id_field] = [b64int(i) for i in range(data.shape[0])]
124
- added_id_field = True
125
- elif isinstance(data, pa.Table) and not id_field in data.column_names: # type: ignore
126
- ids = pa.array([b64int(i) for i in range(len(data))])
127
- data = data.append_column(id_field, ids) # type: ignore
128
- added_id_field = True
129
- elif id_field not in data[0]:
130
- raise ValueError("map_data data must be a list of dicts, a pandas dataframe, or a pyarrow table")
131
-
132
- if added_id_field:
133
- logger.warning("An ID field was not specified in your data so one was generated for you in insertion order.")
134
-
135
99
  dataset = AtlasDataset(
136
100
  identifier=dataset_name, description=description, unique_id_field=id_field, is_public=is_public
137
101
  )
@@ -144,6 +108,8 @@ def map_data(
144
108
  # Add data by modality
145
109
  logger.info("Uploading data to Atlas.")
146
110
  try:
111
+ if isinstance(data, DataFrame):
112
+ data = data.to_dict(orient="records")
147
113
  if modality == "text":
148
114
  dataset.add_data(data=data)
149
115
  elif modality == "embedding":
@@ -166,7 +132,7 @@ def map_data(
166
132
  name=index_name,
167
133
  indexed_field=indexed_field,
168
134
  modality=modality,
169
- projection=projection,
135
+ projection=projection, # type: ignore[arg-type]
170
136
  topic_model=topic_model,
171
137
  duplicate_detection=duplicate_detection,
172
138
  embedding_model=embedding_model,
@@ -200,7 +166,7 @@ def map_embeddings(
200
166
  Args:
201
167
  embeddings: An [N,d] numpy array containing the batch of N embeddings to add.
202
168
  data: An [N,] element list of dictionaries containing metadata for each embedding.
203
- id_field: Specify your data unique id field. This field can be up 36 characters in length. If not specified, one will be created for you named `id_`.
169
+ id_field: Specify a field that uniquely identifies each datapoint. This field can be up to 36 characters in length.
204
170
  name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
205
171
  description: A description for your map.
206
172
  is_public: Should this embedding map be public? Private maps can only be accessed by members of your organization.
@@ -248,7 +214,7 @@ def map_text(
248
214
  Args:
249
215
  data: An [N,] element iterable of dictionaries containing metadata for each embedding.
250
216
  indexed_field: The name of the data field containing the text you want to map.
251
- id_field: Specify your data unique id field. This field can be up 36 characters in length. If not specified, one will be created for you named `id_`.
217
+ id_field: Specify a field that uniquely identifies each datapoint. This field can be up to 36 characters in length.
252
218
  name: A name for your dataset. Specify in the format `organization/project` to create in a specific organization.
253
219
  description: A description for your map.
254
220
  build_topic_model: Builds a hierarchical topic model over your data to discover patterns.
@@ -57,25 +57,36 @@ def convert_pyarrow_schema_for_atlas(schema: pa.Schema) -> pa.Schema:
57
57
  return pa.schema({**usertypes, **whitelist})
58
58
 
59
59
 
60
- class NomicProjectOptions(BaseModel):
60
+ class ProjectionOptions(BaseModel):
61
61
  """
62
- Options for Nomic 2D Dimensionality Reduction Model
62
+ Generic options for 2D Dimensionality Reduction
63
63
 
64
64
  Args:
65
- n_neighbors: The number of neighbors to use when approximating the high dimensional embedding space during reduction. Default: `None` (Auto-inferred).
66
- n_epochs: How many dataset passes to train the projection model. Default: `None` (Auto-inferred).
67
- model: The model to use when generating the 2D projected embedding space layout. Possible values: `None` or `nomic-project-v1` or `nomic-project-v2`. Default: `None`.
68
- local_neighborhood_size: Only used when model is `nomic-project-v2`. Controls the size of the neighborhood used in the local structure optimizing step of `nomic-project-v2` algorithm. Min value: `max(n_neighbors, 1)`; max value: `128`. Default: `None` (Auto-inferred).
69
- spread: Determines how tight together points appear. Larger values result a more spread out point layout. Min value: `0`. It is recommended leaving this value as the default `None` (Auto-inferred).
70
- rho: Only used when model is nomic-project-v2. Controls the spread in the local structure optimizing step of `nomic-project-v2`. Min value: `0`; max value: `1`. It is recommended to leave this value as the default `None` (Auto-inferred).
65
+ model: The projection model to use.
66
+ n_neighbors: The number of neighbors to use for the projection algorithm.
67
+ n_epochs: How many dataset passes to train the projection model.
68
+ min_dist: Controls how tightly points are packed together.
69
+ spread: Nomic Project specific: Determines how tight together points appear.
70
+ local_neighborhood_size: Nomic Project v2 specific: Controls the local neighborhood size.
71
+ rho: Nomic Project v2 specific: Controls the spread in local structure.
71
72
  """
72
73
 
73
- n_neighbors: Optional[int] = None
74
- n_epochs: Optional[int] = None
75
- spread: Optional[float] = None
76
- local_neighborhood_size: Optional[int] = None
77
- model: Optional[str] = None
78
- rho: Optional[float] = None
74
+ model: Optional[str] = Field(
75
+ default=None,
76
+ description="Projection model to use (e.g., 'umap', 'nomic-project-v1', 'nomic-project-v2').",
77
+ )
78
+ n_neighbors: Optional[int] = Field(default=None, description="Number of neighbors for the projection algorithm.")
79
+ n_epochs: Optional[int] = Field(default=None, description="Number of epochs for training the projection model.")
80
+ min_dist: Optional[float] = Field(default=None, description="Minimum distance between points.")
81
+ spread: Optional[float] = Field(default=None, description="Nomic Project specific: Spread of the point layout.")
82
+ local_neighborhood_size: Optional[int] = Field(
83
+ default=None,
84
+ description="Nomic Project v2 specific: Local neighborhood size. Only used when model is 'nomic-project-v2'.",
85
+ )
86
+ rho: Optional[float] = Field(
87
+ default=None,
88
+ description="Nomic Project v2 specific: Rho parameter. Only used when model is 'nomic-project-v2'.",
89
+ )
79
90
 
80
91
 
81
92
  class NomicTopicOptions(BaseModel):