nomic 3.4.1__tar.gz → 3.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nomic might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nomic
3
- Version: 3.4.1
3
+ Version: 3.5.0
4
4
  Summary: The official Nomic python client.
5
5
  Home-page: https://github.com/nomic-ai/nomic
6
6
  Author: nomic.ai
@@ -14,7 +14,7 @@ from PIL import Image
14
14
  from pyarrow import Table
15
15
  from tqdm import tqdm
16
16
 
17
- from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicProjectOptions, NomicTopicOptions
17
+ from .data_inference import NomicDuplicatesOptions, NomicEmbedOptions, NomicTopicOptions, ProjectionOptions
18
18
  from .dataset import AtlasDataset, AtlasDataStream
19
19
  from .settings import *
20
20
  from .utils import arrow_iterator, b64int, get_random_name
@@ -29,7 +29,7 @@ def map_data(
29
29
  id_field: Optional[str] = None,
30
30
  is_public: bool = True,
31
31
  indexed_field: Optional[str] = None,
32
- projection: Union[bool, Dict, NomicProjectOptions] = True,
32
+ projection: Optional[Union[Dict, ProjectionOptions]] = None,
33
33
  topic_model: Union[bool, Dict, NomicTopicOptions] = True,
34
34
  duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
35
35
  embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
@@ -45,7 +45,7 @@ def map_data(
45
45
  id_field: Specify your data unique id field. This field can be up to 36 characters in length. If not specified, one will be created for you named `id_`.
46
46
  is_public: Should the dataset be accessible outside your Nomic Atlas organization.
47
47
  indexed_field: The text field from the dataset that will be used to create embeddings, which determines the layout of the data map in Atlas. Required for text data but won't have an impact if uploading embeddings or image blobs.
48
- projection: Options to adjust Nomic Project - the dimensionality algorithm organizing your dataset.
48
+ projection: Options for configuring the 2D projection algorithm.
49
49
  topic_model: Options to adjust Nomic Topic - the topic model organizing your dataset.
50
50
  duplicate_detection: Options to adjust Nomic Duplicates - the duplicate detection algorithm.
51
51
  embedding_model: Options to adjust the embedding model used to embed your dataset.
@@ -144,6 +144,8 @@ def map_data(
144
144
  # Add data by modality
145
145
  logger.info("Uploading data to Atlas.")
146
146
  try:
147
+ if isinstance(data, DataFrame):
148
+ data = data.to_dict(orient="records")
147
149
  if modality == "text":
148
150
  dataset.add_data(data=data)
149
151
  elif modality == "embedding":
@@ -166,7 +168,7 @@ def map_data(
166
168
  name=index_name,
167
169
  indexed_field=indexed_field,
168
170
  modality=modality,
169
- projection=projection,
171
+ projection=projection, # type: ignore[arg-type]
170
172
  topic_model=topic_model,
171
173
  duplicate_detection=duplicate_detection,
172
174
  embedding_model=embedding_model,
@@ -57,25 +57,36 @@ def convert_pyarrow_schema_for_atlas(schema: pa.Schema) -> pa.Schema:
57
57
  return pa.schema({**usertypes, **whitelist})
58
58
 
59
59
 
60
- class NomicProjectOptions(BaseModel):
60
+ class ProjectionOptions(BaseModel):
61
61
  """
62
- Options for Nomic 2D Dimensionality Reduction Model
62
+ Generic options for 2D Dimensionality Reduction
63
63
 
64
64
  Args:
65
- n_neighbors: The number of neighbors to use when approximating the high dimensional embedding space during reduction. Default: `None` (Auto-inferred).
66
- n_epochs: How many dataset passes to train the projection model. Default: `None` (Auto-inferred).
67
- model: The model to use when generating the 2D projected embedding space layout. Possible values: `None` or `nomic-project-v1` or `nomic-project-v2`. Default: `None`.
68
- local_neighborhood_size: Only used when model is `nomic-project-v2`. Controls the size of the neighborhood used in the local structure optimizing step of `nomic-project-v2` algorithm. Min value: `max(n_neighbors, 1)`; max value: `128`. Default: `None` (Auto-inferred).
69
- spread: Determines how tight together points appear. Larger values result a more spread out point layout. Min value: `0`. It is recommended leaving this value as the default `None` (Auto-inferred).
70
- rho: Only used when model is nomic-project-v2. Controls the spread in the local structure optimizing step of `nomic-project-v2`. Min value: `0`; max value: `1`. It is recommended to leave this value as the default `None` (Auto-inferred).
65
+ model: The projection model to use.
66
+ n_neighbors: The number of neighbors to use for the projection algorithm.
67
+ n_epochs: How many dataset passes to train the projection model.
68
+ min_dist: Controls how tightly points are packed together.
69
+ spread: Nomic Project specific: Determines how tight together points appear.
70
+ local_neighborhood_size: Nomic Project v2 specific: Controls the local neighborhood size.
71
+ rho: Nomic Project v2 specific: Controls the spread in local structure.
71
72
  """
72
73
 
73
- n_neighbors: Optional[int] = None
74
- n_epochs: Optional[int] = None
75
- spread: Optional[float] = None
76
- local_neighborhood_size: Optional[int] = None
77
- model: Optional[str] = None
78
- rho: Optional[float] = None
74
+ model: Optional[str] = Field(
75
+ default=None,
76
+ description="Projection model to use (e.g., 'umap', 'nomic-project-v1', 'nomic-project-v2').",
77
+ )
78
+ n_neighbors: Optional[int] = Field(default=None, description="Number of neighbors for the projection algorithm.")
79
+ n_epochs: Optional[int] = Field(default=None, description="Number of epochs for training the projection model.")
80
+ min_dist: Optional[float] = Field(default=None, description="Minimum distance between points.")
81
+ spread: Optional[float] = Field(default=None, description="Nomic Project specific: Spread of the point layout.")
82
+ local_neighborhood_size: Optional[int] = Field(
83
+ default=None,
84
+ description="Nomic Project v2 specific: Local neighborhood size. Only used when model is 'nomic-project-v2'.",
85
+ )
86
+ rho: Optional[float] = Field(
87
+ default=None,
88
+ description="Nomic Project v2 specific: Rho parameter. Only used when model is 'nomic-project-v2'.",
89
+ )
79
90
 
80
91
 
81
92
  class NomicTopicOptions(BaseModel):
@@ -29,8 +29,8 @@ from .cli import refresh_bearer_token, validate_api_http_response
29
29
  from .data_inference import (
30
30
  NomicDuplicatesOptions,
31
31
  NomicEmbedOptions,
32
- NomicProjectOptions,
33
32
  NomicTopicOptions,
33
+ ProjectionOptions,
34
34
  convert_pyarrow_schema_for_atlas,
35
35
  )
36
36
  from .data_operations import AtlasMapData, AtlasMapDuplicates, AtlasMapEmbeddings, AtlasMapTags, AtlasMapTopics
@@ -764,7 +764,7 @@ class AtlasDataset(AtlasClass):
764
764
 
765
765
  **Parameters:**
766
766
 
767
- * **identifier** - The dataset identifier in the form `dataset` or `organization/dataset`. If no organization is passed, your default organization will be used.
767
+ * **identifier** - The dataset identifier in the form `dataset` or `organization/dataset`. If no organization is passed, the organization tied to the API key you logged in to Nomic with will be used.
768
768
  * **description** - A description for the dataset.
769
769
  * **unique_id_field** - The field that uniquely identifies each data point.
770
770
  * **is_public** - Should this dataset be publicly accessible for viewing (read only). If False, only members of your Nomic organization can view.
@@ -1049,7 +1049,7 @@ class AtlasDataset(AtlasClass):
1049
1049
  for projection in index.projections:
1050
1050
  if projection.id == projection_id:
1051
1051
  return projection
1052
- raise ValueError(f"Could not find a map with projection_id='{atlas_index_id}'")
1052
+ raise ValueError(f"Could not find a map with projection_id='{projection_id}'")
1053
1053
 
1054
1054
  if len(indices) == 0:
1055
1055
  raise ValueError("You have no maps built in your project")
@@ -1071,7 +1071,7 @@ class AtlasDataset(AtlasClass):
1071
1071
  name: Optional[str] = None,
1072
1072
  indexed_field: Optional[str] = None,
1073
1073
  modality: Optional[str] = None,
1074
- projection: Union[bool, Dict, NomicProjectOptions] = True,
1074
+ projection: Union[Dict, ProjectionOptions, None] = None,
1075
1075
  topic_model: Union[bool, Dict, NomicTopicOptions] = True,
1076
1076
  duplicate_detection: Union[bool, Dict, NomicDuplicatesOptions] = True,
1077
1077
  embedding_model: Optional[Union[str, Dict, NomicEmbedOptions]] = None,
@@ -1085,7 +1085,7 @@ class AtlasDataset(AtlasClass):
1085
1085
  indexed_field: For text datasets, name the data field corresponding to the text to be mapped.
1086
1086
  reuse_embeddings_from_index: the name of the index to reuse embeddings from.
1087
1087
  modality: The data modality of this index. Currently, Atlas supports either `text`, `image`, or `embedding` indices.
1088
- projection: Options for configuring the 2D projection algorithm
1088
+ projection: Options for configuring the 2D projection algorithm or None to let cloud decide
1089
1089
  topic_model: Options for configuring the topic model
1090
1090
  duplicate_detection: Options for configuring semantic duplicate detection
1091
1091
  embedding_model: Options for configuring the embedding model
@@ -1097,10 +1097,16 @@ class AtlasDataset(AtlasClass):
1097
1097
 
1098
1098
  self._latest_dataset_state()
1099
1099
 
1100
- if isinstance(projection, Dict):
1101
- projection = NomicProjectOptions(**projection)
1102
- else:
1103
- projection = NomicProjectOptions()
1100
+ projection_options: Optional[ProjectionOptions] = None
1101
+
1102
+ if isinstance(projection, ProjectionOptions):
1103
+ projection_options = projection
1104
+ elif isinstance(projection, dict):
1105
+ projection_options = ProjectionOptions(**projection)
1106
+
1107
+ projection_hyperparameters: dict = {}
1108
+ if projection_options is not None:
1109
+ projection_hyperparameters = projection_options.model_dump()
1104
1110
 
1105
1111
  topic_model_was_false = topic_model is False
1106
1112
  if isinstance(topic_model, Dict):
@@ -1134,9 +1140,9 @@ class AtlasDataset(AtlasClass):
1134
1140
  modality = self.meta["modality"]
1135
1141
 
1136
1142
  if modality == "image":
1143
+ if indexed_field is not None and indexed_field != "_blob_hash":
1144
+ logger.warning("Ignoring user-provided indexed_field for image datasets. Using _blob_hash.")
1137
1145
  indexed_field = "_blob_hash"
1138
- if indexed_field is not None:
1139
- logger.warning("Ignoring indexed_field for image datasets. Only _blob_hash is supported.")
1140
1146
 
1141
1147
  colorable_fields = []
1142
1148
 
@@ -1150,6 +1156,7 @@ class AtlasDataset(AtlasClass):
1150
1156
  logger.warning(
1151
1157
  "You did not specify the `topic_label_field` option in your topic_model, your dataset will not contain auto-labeled topics."
1152
1158
  )
1159
+
1153
1160
  build_template = {
1154
1161
  "project_id": self.id,
1155
1162
  "index_name": name,
@@ -1161,20 +1168,11 @@ class AtlasDataset(AtlasClass):
1161
1168
  "nearest_neighbor_index": "HNSWIndex",
1162
1169
  "nearest_neighbor_index_hyperparameters": json.dumps({"space": "l2", "ef_construction": 100, "M": 16}),
1163
1170
  "projection": "NomicProject",
1164
- "projection_hyperparameters": json.dumps(
1165
- {
1166
- "n_neighbors": projection.n_neighbors,
1167
- "n_epochs": projection.n_epochs,
1168
- "spread": projection.spread,
1169
- "local_neighborhood_size": projection.local_neighborhood_size,
1170
- "rho": projection.rho,
1171
- "model": projection.model,
1172
- }
1173
- ),
1171
+ "projection_hyperparameters": json.dumps(projection_hyperparameters),
1174
1172
  "topic_model_hyperparameters": json.dumps(
1175
1173
  {
1176
1174
  "build_topic_model": topic_model.build_topic_model,
1177
- "community_description_target_field": topic_model.topic_label_field, # TODO change key to topic_label_field post v0.0.85
1175
+ "community_description_target_field": topic_model.topic_label_field,
1178
1176
  "cluster_method": topic_model.cluster_method,
1179
1177
  "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
1180
1178
  }
@@ -1188,7 +1186,6 @@ class AtlasDataset(AtlasClass):
1188
1186
  }
1189
1187
 
1190
1188
  elif modality == "text" or modality == "image":
1191
- # find the index id of the index with name reuse_embeddings_from_index
1192
1189
  reuse_embedding_from_index_id = None
1193
1190
  indices = self.indices
1194
1191
  if reuse_embeddings_from_index is not None:
@@ -1240,21 +1237,12 @@ class AtlasDataset(AtlasClass):
1240
1237
  "nearest_neighbor_index": "HNSWIndex",
1241
1238
  "nearest_neighbor_index_hyperparameters": json.dumps({"space": "l2", "ef_construction": 100, "M": 16}),
1242
1239
  "projection": "NomicProject",
1243
- "projection_hyperparameters": json.dumps(
1244
- {
1245
- "n_neighbors": projection.n_neighbors,
1246
- "n_epochs": projection.n_epochs,
1247
- "spread": projection.spread,
1248
- "local_neighborhood_size": projection.local_neighborhood_size,
1249
- "rho": projection.rho,
1250
- "model": projection.model,
1251
- }
1252
- ),
1240
+ "projection_hyperparameters": json.dumps(projection_hyperparameters),
1253
1241
  "topic_model_hyperparameters": json.dumps(
1254
1242
  {
1255
1243
  "build_topic_model": topic_model.build_topic_model,
1256
1244
  "community_description_target_field": topic_field,
1257
- "cluster_method": topic_model.build_topic_model,
1245
+ "cluster_method": topic_model.cluster_method,
1258
1246
  "enforce_topic_hierarchy": topic_model.enforce_topic_hierarchy,
1259
1247
  }
1260
1248
  ),
@@ -12,8 +12,5 @@ DEFAULT_LARGE_PROJECTION_N_NEIGHBORS = 128
12
12
  DEFAULT_LARGE_PROJECTION_EPOCHS = 128
13
13
  DEFAULT_INDEX_N_NEIGHBORS = 32
14
14
  DEFAULT_PROJECTION_RHO = 0.2
15
-
16
- DEFAULT_PROJECTION_MODEL = "nomic-project-v1"
17
-
18
15
  DEFAULT_PROJECTION_SPREAD = 1.0
19
16
  DEFAULT_DUPLICATE_THRESHOLD = 0.1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nomic
3
- Version: 3.4.1
3
+ Version: 3.5.0
4
4
  Summary: The official Nomic python client.
5
5
  Home-page: https://github.com/nomic-ai/nomic
6
6
  Author: nomic.ai
@@ -23,7 +23,7 @@ with open("README.md") as f:
23
23
 
24
24
  setup(
25
25
  name="nomic",
26
- version="3.4.1",
26
+ version="3.5.0",
27
27
  url="https://github.com/nomic-ai/nomic",
28
28
  description=description,
29
29
  long_description=long_description,
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes