scale-nucleus 0.1.22__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. cli/client.py +14 -0
  2. cli/datasets.py +77 -0
  3. cli/helpers/__init__.py +0 -0
  4. cli/helpers/nucleus_url.py +10 -0
  5. cli/helpers/web_helper.py +40 -0
  6. cli/install_completion.py +33 -0
  7. cli/jobs.py +42 -0
  8. cli/models.py +35 -0
  9. cli/nu.py +42 -0
  10. cli/reference.py +8 -0
  11. cli/slices.py +62 -0
  12. cli/tests.py +121 -0
  13. nucleus/__init__.py +453 -699
  14. nucleus/annotation.py +435 -80
  15. nucleus/autocurate.py +9 -0
  16. nucleus/connection.py +87 -0
  17. nucleus/constants.py +12 -2
  18. nucleus/data_transfer_object/__init__.py +0 -0
  19. nucleus/data_transfer_object/dataset_details.py +9 -0
  20. nucleus/data_transfer_object/dataset_info.py +26 -0
  21. nucleus/data_transfer_object/dataset_size.py +5 -0
  22. nucleus/data_transfer_object/scenes_list.py +18 -0
  23. nucleus/dataset.py +1139 -215
  24. nucleus/dataset_item.py +130 -26
  25. nucleus/dataset_item_uploader.py +297 -0
  26. nucleus/deprecation_warning.py +32 -0
  27. nucleus/errors.py +21 -1
  28. nucleus/job.py +71 -3
  29. nucleus/logger.py +9 -0
  30. nucleus/metadata_manager.py +45 -0
  31. nucleus/metrics/__init__.py +10 -0
  32. nucleus/metrics/base.py +117 -0
  33. nucleus/metrics/categorization_metrics.py +197 -0
  34. nucleus/metrics/errors.py +7 -0
  35. nucleus/metrics/filters.py +40 -0
  36. nucleus/metrics/geometry.py +198 -0
  37. nucleus/metrics/metric_utils.py +28 -0
  38. nucleus/metrics/polygon_metrics.py +480 -0
  39. nucleus/metrics/polygon_utils.py +299 -0
  40. nucleus/model.py +121 -15
  41. nucleus/model_run.py +34 -57
  42. nucleus/payload_constructor.py +30 -18
  43. nucleus/prediction.py +259 -17
  44. nucleus/pydantic_base.py +26 -0
  45. nucleus/retry_strategy.py +4 -0
  46. nucleus/scene.py +204 -19
  47. nucleus/slice.py +230 -67
  48. nucleus/upload_response.py +20 -9
  49. nucleus/url_utils.py +4 -0
  50. nucleus/utils.py +139 -35
  51. nucleus/validate/__init__.py +24 -0
  52. nucleus/validate/client.py +168 -0
  53. nucleus/validate/constants.py +20 -0
  54. nucleus/validate/data_transfer_objects/__init__.py +0 -0
  55. nucleus/validate/data_transfer_objects/eval_function.py +81 -0
  56. nucleus/validate/data_transfer_objects/scenario_test.py +19 -0
  57. nucleus/validate/data_transfer_objects/scenario_test_evaluations.py +11 -0
  58. nucleus/validate/data_transfer_objects/scenario_test_metric.py +12 -0
  59. nucleus/validate/errors.py +6 -0
  60. nucleus/validate/eval_functions/__init__.py +0 -0
  61. nucleus/validate/eval_functions/available_eval_functions.py +212 -0
  62. nucleus/validate/eval_functions/base_eval_function.py +60 -0
  63. nucleus/validate/scenario_test.py +143 -0
  64. nucleus/validate/scenario_test_evaluation.py +114 -0
  65. nucleus/validate/scenario_test_metric.py +14 -0
  66. nucleus/validate/utils.py +8 -0
  67. {scale_nucleus-0.1.22.dist-info → scale_nucleus-0.6.4.dist-info}/LICENSE +0 -0
  68. scale_nucleus-0.6.4.dist-info/METADATA +213 -0
  69. scale_nucleus-0.6.4.dist-info/RECORD +71 -0
  70. {scale_nucleus-0.1.22.dist-info → scale_nucleus-0.6.4.dist-info}/WHEEL +1 -1
  71. scale_nucleus-0.6.4.dist-info/entry_points.txt +3 -0
  72. scale_nucleus-0.1.22.dist-info/METADATA +0 -85
  73. scale_nucleus-0.1.22.dist-info/RECORD +0 -21
nucleus/dataset.py CHANGED
@@ -1,65 +1,122 @@
1
+ import os
1
2
  from typing import Any, Dict, List, Optional, Sequence, Union
2
3
 
3
4
  import requests
4
5
 
5
6
  from nucleus.job import AsyncJob
7
+ from nucleus.prediction import (
8
+ BoxPrediction,
9
+ CategoryPrediction,
10
+ CuboidPrediction,
11
+ PolygonPrediction,
12
+ SegmentationPrediction,
13
+ from_json,
14
+ )
6
15
  from nucleus.url_utils import sanitize_string_args
7
16
  from nucleus.utils import (
8
17
  convert_export_payload,
9
18
  format_dataset_item_response,
19
+ format_prediction_response,
10
20
  serialize_and_write_to_presigned_url,
11
21
  )
12
22
 
13
23
  from .annotation import (
14
24
  Annotation,
25
+ BoxAnnotation,
26
+ CategoryAnnotation,
27
+ CuboidAnnotation,
28
+ MultiCategoryAnnotation,
29
+ PolygonAnnotation,
30
+ SegmentationAnnotation,
15
31
  check_all_mask_paths_remote,
16
32
  )
17
33
  from .constants import (
18
- DATASET_LENGTH_KEY,
19
- DATASET_MODEL_RUNS_KEY,
20
- DATASET_NAME_KEY,
21
- DATASET_SLICES_KEY,
34
+ ANNOTATIONS_KEY,
35
+ AUTOTAG_SCORE_THRESHOLD,
36
+ BACKFILL_JOB_KEY,
37
+ DATASET_ID_KEY,
38
+ DATASET_IS_SCENE_KEY,
22
39
  DEFAULT_ANNOTATION_UPDATE_MODE,
40
+ EMBEDDING_DIMENSION_KEY,
41
+ EMBEDDINGS_URL_KEY,
23
42
  EXPORTED_ROWS,
43
+ KEEP_HISTORY_KEY,
44
+ MESSAGE_KEY,
24
45
  NAME_KEY,
25
46
  REFERENCE_IDS_KEY,
26
47
  REQUEST_ID_KEY,
27
- AUTOTAG_SCORE_THRESHOLD,
48
+ SLICE_ID_KEY,
28
49
  UPDATE_KEY,
29
50
  )
51
+ from .data_transfer_object.dataset_info import DatasetInfo
52
+ from .data_transfer_object.dataset_size import DatasetSize
53
+ from .data_transfer_object.scenes_list import ScenesList, ScenesListEntry
30
54
  from .dataset_item import (
31
55
  DatasetItem,
32
56
  check_all_paths_remote,
33
57
  check_for_duplicate_reference_ids,
34
58
  )
35
- from .scene import LidarScene, check_all_scene_paths_remote
59
+ from .dataset_item_uploader import DatasetItemUploader
60
+ from .deprecation_warning import deprecated
61
+ from .errors import DatasetItemRetrievalError
62
+ from .metadata_manager import ExportMetadataType, MetadataManager
36
63
  from .payload_constructor import (
37
64
  construct_append_scenes_payload,
38
65
  construct_model_run_creation_payload,
39
66
  construct_taxonomy_payload,
40
67
  )
68
+ from .scene import LidarScene, Scene, check_all_scene_paths_remote
69
+ from .slice import Slice
70
+ from .upload_response import UploadResponse
71
+
72
+ # TODO: refactor to reduce this file to under 1000 lines.
73
+ # pylint: disable=C0302
74
+
41
75
 
42
76
  WARN_FOR_LARGE_UPLOAD = 50000
43
77
  WARN_FOR_LARGE_SCENES_UPLOAD = 5
44
78
 
45
79
 
46
80
  class Dataset:
47
- """
48
- Nucleus Dataset. You can append images with metadata to your dataset,
49
- annotate it with ground truth and upload model predictions to evaluate and
50
- compare model performance on you data.
81
+ """Datasets are collections of your data that can be associated with models.
82
+
83
+ You can append :class:`DatasetItems<DatasetItem>` or :class:`Scenes<LidarScene>`
84
+ with metadata to your dataset, annotate it with ground truth, and upload
85
+ model predictions to evaluate and compare model performance on your data.
86
+
87
+ Make sure that the dataset is set up correctly supporting the required datatype (see code sample below).
88
+
89
+ Datasets cannot be instantiated directly and instead must be created via API
90
+ endpoint using :meth:`NucleusClient.create_dataset`, or in the dashboard.
91
+
92
+ ::
93
+
94
+ import nucleus
95
+
96
+ client = nucleus.NucleusClient(YOUR_SCALE_API_KEY)
97
+
98
+ # Create new dataset supporting DatasetItems
99
+ dataset = client.create_dataset(YOUR_DATASET_NAME, is_scene=False)
100
+
101
+ # OR create new dataset supporting LidarScenes
102
+ dataset = client.create_dataset(YOUR_DATASET_NAME, is_scene=True)
103
+
104
+ # Or, retrieve existing dataset by ID
105
+ # This ID can be fetched using client.list_datasets() or from a dashboard URL
106
+ existing_dataset = client.get_dataset("YOUR_DATASET_ID")
51
107
  """
52
108
 
53
- def __init__(
54
- self,
55
- dataset_id: str,
56
- client: "NucleusClient", # type:ignore # noqa: F821
57
- ):
109
+ def __init__(self, dataset_id, client, name=None):
58
110
  self.id = dataset_id
59
111
  self._client = client
112
+ # NOTE: Optionally set name on creation such that the property access doesn't need to hit the server
113
+ self._name = name
60
114
 
61
115
  def __repr__(self):
62
- return f"Dataset(dataset_id='{self.id}', client={self._client})"
116
+ if os.environ.get("NUCLEUS_DEBUG", None):
117
+ return f"Dataset(name='{self.name}, dataset_id='{self.id}', is_scene='{self.is_scene}', client={self._client})"
118
+ else:
119
+ return f"Dataset(name='{self.name}, dataset_id='{self.id}', is_scene='{self.is_scene}')"
63
120
 
64
121
  def __eq__(self, other):
65
122
  if self.id == other.id:
@@ -69,44 +126,107 @@ class Dataset:
69
126
 
70
127
  @property
71
128
  def name(self) -> str:
72
- return self.info().get(DATASET_NAME_KEY, "")
129
+ """User-defined name of the Dataset."""
130
+ if self._name is None:
131
+ self._name = self._client.make_request(
132
+ {}, f"dataset/{self.id}/name", requests.get
133
+ )["name"]
134
+ return self._name
135
+
136
+ @property
137
+ def is_scene(self) -> bool:
138
+ """If the dataset can contain scenes or not."""
139
+ response = self._client.make_request(
140
+ {}, f"dataset/{self.id}/is_scene", requests.get
141
+ )[DATASET_IS_SCENE_KEY]
142
+ return response
73
143
 
74
144
  @property
75
145
  def model_runs(self) -> List[str]:
76
- return self.info().get(DATASET_MODEL_RUNS_KEY, [])
146
+ """List of all model runs associated with the Dataset."""
147
+ # TODO: model_runs -> models
148
+ response = self._client.make_request(
149
+ {}, f"dataset/{self.id}/model_runs", requests.get
150
+ )
151
+ return response
77
152
 
78
153
  @property
79
154
  def slices(self) -> List[str]:
80
- return self.info().get(DATASET_SLICES_KEY, [])
155
+ """List of all Slice IDs created from the Dataset."""
156
+ response = self._client.make_request(
157
+ {}, f"dataset/{self.id}/slices", requests.get
158
+ )
159
+ return response
81
160
 
82
161
  @property
83
162
  def size(self) -> int:
84
- return self.info().get(DATASET_LENGTH_KEY, 0)
163
+ """Number of items in the Dataset."""
164
+ response = self._client.make_request(
165
+ {}, f"dataset/{self.id}/size", requests.get
166
+ )
167
+ dataset_size = DatasetSize.parse_obj(response)
168
+ return dataset_size.count
85
169
 
86
170
  @property
87
171
  def items(self) -> List[DatasetItem]:
88
- return self._client.get_dataset_items(self.id)
172
+ """List of all DatasetItem objects in the Dataset."""
173
+ response = self._client.make_request(
174
+ {}, f"dataset/{self.id}/datasetItems", requests.get
175
+ )
176
+ dataset_items = response.get("dataset_items", None)
177
+ error = response.get("error", None)
178
+ constructed_dataset_items = []
179
+ if dataset_items:
180
+ for item in dataset_items:
181
+ image_url = item.get("original_image_url")
182
+ metadata = item.get("metadata", None)
183
+ ref_id = item.get("ref_id", None)
184
+ dataset_item = DatasetItem(image_url, ref_id, metadata)
185
+ constructed_dataset_items.append(dataset_item)
186
+ elif error:
187
+ raise DatasetItemRetrievalError(message=error)
188
+ return constructed_dataset_items
189
+
190
+ @property
191
+ def scenes(self) -> List[ScenesListEntry]:
192
+ """List of ID, reference ID, type, and metadata for all scenes in the Dataset."""
193
+ response = self._client.make_request(
194
+ {}, f"dataset/{self.id}/scenes_list", requests.get
195
+ )
196
+
197
+ scenes_list = ScenesList.parse_obj(response)
198
+ return scenes_list.scenes
89
199
 
90
200
  @sanitize_string_args
91
201
  def autotag_items(self, autotag_name, for_scores_greater_than=0):
92
- """For a given Autotag of this dataset, export its tagged items with scores above a threshold, largest scores first.
202
+ """Fetches the autotag's items above the score threshold, sorted by descending score.
93
203
 
94
- :return: dictionary of the form
95
- {
96
- 'autotagItems': {
97
- ref_id: str,
98
- score: float,
99
- model_prediction_annotation_id: str | None
100
- ground_truth_annotation_id: str | None,
101
- }[],
102
- 'autotag': {
103
- id: str,
104
- name: str,
105
- status: 'started' | 'completed',
106
- autotag_level: 'Image' | 'Object'
204
+ Parameters:
205
+ autotag_name: The user-defined name of the autotag.
206
+ for_scores_greater_than (Optional[int]): Score threshold between -1
207
+ and 1 above which to include autotag items.
208
+
209
+ Returns:
210
+ List of autotagged items above the given score threshold, sorted by
211
+ descending score, and autotag info, packaged into a dict as follows::
212
+
213
+ {
214
+ "autotagItems": List[{
215
+ ref_id: str,
216
+ score: float,
217
+ model_prediction_annotation_id: str | None
218
+ ground_truth_annotation_id: str | None,
219
+ }],
220
+ "autotag": {
221
+ id: str,
222
+ name: str,
223
+ status: "started" | "completed",
224
+ autotag_level: "Image" | "Object"
225
+ }
107
226
  }
108
- }
109
- See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-items for more details on the return types.
227
+
228
+ Note ``model_prediction_annotation_id`` and ``ground_truth_annotation_id``
229
+ are only relevant for object autotags.
110
230
  """
111
231
  response = self._client.make_request(
112
232
  payload={AUTOTAG_SCORE_THRESHOLD: for_scores_greater_than},
@@ -116,23 +236,31 @@ class Dataset:
116
236
  return response
117
237
 
118
238
  def autotag_training_items(self, autotag_name):
119
- """For a given Autotag of this dataset, export its training items. These are user selected positives during refinement.
239
+ """Fetches items that were manually selected during refinement of the autotag.
120
240
 
121
- :return: dictionary of the form
122
- {
123
- 'autotagPositiveTrainingItems': {
124
- ref_id: str,
125
- model_prediction_annotation_id: str | None,
126
- ground_truth_annotation_id: str | None,
127
- }[],
128
- 'autotag': {
129
- id: str,
130
- name: str,
131
- status: 'started' | 'completed',
132
- autotag_level: 'Image' | 'Object'
241
+ Parameters:
242
+ autotag_name: The user-defined name of the autotag.
243
+
244
+ Returns:
245
+ List of user-selected positives and autotag info, packaged into a
246
+ dict as follows::
247
+
248
+ {
249
+ "autotagPositiveTrainingItems": {
250
+ ref_id: str,
251
+ model_prediction_annotation_id: str | None,
252
+ ground_truth_annotation_id: str | None,
253
+ }[],
254
+ "autotag": {
255
+ id: str,
256
+ name: str,
257
+ status: "started" | "completed",
258
+ autotag_level: "Image" | "Object"
259
+ }
133
260
  }
134
- }
135
- See https://dashboard.nucleus.scale.com/nucleus/docs/api#export-autotag-training-items for more details on the return types.
261
+
262
+ Note ``model_prediction_annotation_id`` and ``ground_truth_annotation_id``
263
+ are only relevant for object autotags.
136
264
  """
137
265
  response = self._client.make_request(
138
266
  payload={},
@@ -141,19 +269,21 @@ class Dataset:
141
269
  )
142
270
  return response
143
271
 
144
- def info(self) -> dict:
145
- """
146
- Returns information about existing dataset
147
- :return: dictionary of the form
148
- {
149
- 'name': str,
150
- 'length': int,
151
- 'model_run_ids': List[str],
152
- 'slice_ids': List[str]
153
- }
272
+ def info(self) -> DatasetInfo:
273
+ """Retrieve information about the dataset
274
+
275
+ Returns:
276
+ :class:`DatasetInfo`
154
277
  """
155
- return self._client.dataset_info(self.id)
278
+ response = self._client.make_request(
279
+ {}, f"dataset/{self.id}/info", requests.get
280
+ )
281
+ dataset_info = DatasetInfo.parse_obj(response)
282
+ return dataset_info
156
283
 
284
+ @deprecated(
285
+ "Model runs have been deprecated and will be removed. Use a Model instead"
286
+ )
157
287
  def create_model_run(
158
288
  self,
159
289
  name: str,
@@ -162,22 +292,6 @@ class Dataset:
162
292
  metadata: Optional[Dict[str, Any]] = None,
163
293
  annotation_metadata_schema: Optional[Dict] = None,
164
294
  ):
165
- """
166
- :param name: A name for the model run.
167
- :param reference_id: The user-specified reference identifier to associate with the model.
168
- The 'model_id' field should be empty if this field is populated,
169
- :param model_id: The internally-controlled identifier of the model.
170
- The 'reference_id' field should be empty if this field is populated,
171
- :param metadata: An arbitrary metadata blob for the current run.
172
- :param annotation_metadata_schema: A dictionary that defines schema for annotations.
173
- :param segmentation_metadata_schema: A dictionary that defines schema for segmentation.
174
-
175
- :return:
176
- {
177
- "model_id": str,
178
- "model_run_id": str,
179
- }
180
- """
181
295
  payload = construct_model_run_creation_payload(
182
296
  name,
183
297
  reference_id,
@@ -189,22 +303,64 @@ class Dataset:
189
303
 
190
304
  def annotate(
191
305
  self,
192
- annotations: Sequence[Annotation],
193
- update: Optional[bool] = DEFAULT_ANNOTATION_UPDATE_MODE,
306
+ annotations: Sequence[
307
+ Union[
308
+ BoxAnnotation,
309
+ PolygonAnnotation,
310
+ CuboidAnnotation,
311
+ CategoryAnnotation,
312
+ MultiCategoryAnnotation,
313
+ SegmentationAnnotation,
314
+ ]
315
+ ],
316
+ update: bool = DEFAULT_ANNOTATION_UPDATE_MODE,
194
317
  batch_size: int = 5000,
195
318
  asynchronous: bool = False,
196
319
  ) -> Union[Dict[str, Any], AsyncJob]:
197
- """
198
- Uploads ground truth annotations for a given dataset.
199
- :param annotations: ground truth annotations for a given dataset to upload
200
- :param batch_size: batch parameter for long uploads
201
- :return:
202
- {
203
- "dataset_id: str,
204
- "new_items": int,
205
- "updated_items": int,
206
- "ignored_items": int,
207
- }
320
+ """Uploads ground truth annotations to the dataset.
321
+
322
+ Adding ground truth to your dataset in Nucleus allows you to visualize
323
+ annotations, query dataset items based on the annotations they contain,
324
+ and evaluate models by comparing their predictions to ground truth.
325
+
326
+ Nucleus supports :class:`Box<BoxAnnotation>`, :class:`Polygon<PolygonAnnotation>`,
327
+ :class:`Cuboid<CuboidAnnotation>`, :class:`Segmentation<SegmentationAnnotation>`,
328
+ and :class:`Category<CategoryAnnotation>` annotations. Cuboid annotations
329
+ can only be uploaded to a :class:`pointcloud DatasetItem<LidarScene>`.
330
+
331
+ When uploading an annotation, you need to specify which item you are
332
+ annotating via the reference_id you provided when uploading the image
333
+ or pointcloud.
334
+
335
+ Ground truth uploads can be made idempotent by specifying an optional
336
+ annotation_id for each annotation. This id should be unique within the
337
+ dataset_item so that (reference_id, annotation_id) is unique within the
338
+ dataset.
339
+
340
+ See :class:`SegmentationAnnotation` for specific requirements to upload
341
+ segmentation annotations.
342
+
343
+ For ingesting large annotation payloads, see the `Guide for Large Ingestions
344
+ <https://nucleus.scale.com/docs/large-ingestion>`_.
345
+
346
+ Parameters:
347
+ annotations (Sequence[:class:`Annotation`]): List of annotation
348
+ objects to upload.
349
+ update: Whether to ignore or overwrite metadata for conflicting annotations.
350
+ batch_size: Number of annotations processed in each concurrent batch.
351
+ Default is 5000.
352
+ asynchronous: Whether or not to process the upload asynchronously (and
353
+ return an :class:`AsyncJob` object). Default is False.
354
+
355
+ Returns:
356
+ If synchronous, payload describing the upload result::
357
+
358
+ {
359
+ "dataset_id": str,
360
+ "annotations_processed": int
361
+ }
362
+
363
+ Otherwise, returns an :class:`AsyncJob` object.
208
364
  """
209
365
  check_all_mask_paths_remote(annotations)
210
366
 
@@ -217,45 +373,138 @@ class Dataset:
217
373
  route=f"dataset/{self.id}/annotate?async=1",
218
374
  )
219
375
  return AsyncJob.from_json(response, self._client)
220
-
221
376
  return self._client.annotate_dataset(
222
377
  self.id, annotations, update=update, batch_size=batch_size
223
378
  )
224
379
 
225
- def ingest_tasks(self, task_ids: dict):
226
- """
227
- If you already submitted tasks to Scale for annotation this endpoint ingests your completed tasks
228
- annotated by Scale into your Nucleus Dataset.
229
- Right now we support ingestion from Videobox Annotation and 2D Box Annotation projects.
230
- Lated we'll support more annotation types.
231
- :param task_ids: list of task ids
232
- :return: {"ingested_tasks": int, "ignored_tasks": int, "pending_tasks": int}
380
+ def ingest_tasks(self, task_ids: List[str]) -> dict:
381
+ """Ingest specific tasks from an existing Scale or Rapid project into the dataset.
382
+
383
+ Note: if you would like to create a new Dataset from an exisiting Scale
384
+ labeling project, use :meth:`NucleusClient.create_dataset_from_project`.
385
+
386
+ For more info, see our `Ingest From Labeling Guide
387
+ <https://nucleus.scale.com/docs/ingest-from-labeling>`_.
388
+
389
+ Parameters:
390
+ task_ids: List of task IDs to ingest.
391
+
392
+ Returns:
393
+ Payload describing the asynchronous upload result::
394
+
395
+ {
396
+ "ingested_tasks": int,
397
+ "ignored_tasks": int,
398
+ "pending_tasks": int
399
+ }
233
400
  """
234
- return self._client.ingest_tasks(self.id, {"tasks": task_ids})
401
+ # TODO(gunnar): Validate right behaviour. Pydantic?
402
+ return self._client.make_request(
403
+ {"tasks": task_ids}, f"dataset/{self.id}/ingest_tasks"
404
+ )
235
405
 
236
406
  def append(
237
407
  self,
238
408
  items: Union[Sequence[DatasetItem], Sequence[LidarScene]],
239
- update: Optional[bool] = False,
240
- batch_size: Optional[int] = 20,
241
- asynchronous=False,
242
- ) -> Union[dict, AsyncJob]:
243
- """
244
- Appends images with metadata (dataset items) or scenes to the dataset. Overwrites images on collision if forced.
409
+ update: bool = False,
410
+ batch_size: int = 20,
411
+ asynchronous: bool = False,
412
+ ) -> Union[Dict[Any, Any], AsyncJob, UploadResponse]:
413
+ """Appends items or scenes to a dataset.
414
+
415
+ .. note::
416
+ Datasets can only accept one of :class:`DatasetItems <DatasetItem>`
417
+ or :class:`Scenes <LidarScene>`, never both.
418
+
419
+ This behavior is set during Dataset :meth:`creation
420
+ <NucleusClient.create_dataset>` with the ``is_scene`` flag.
421
+
422
+ ::
423
+
424
+ import nucleus
425
+
426
+ client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
427
+ dataset = client.get_dataset("YOUR_DATASET_ID")
428
+
429
+ local_item = nucleus.DatasetItem(
430
+ image_location="./1.jpg",
431
+ reference_id="image_1",
432
+ metadata={"key": "value"}
433
+ )
434
+ remote_item = nucleus.DatasetItem(
435
+ image_location="s3://your-bucket/2.jpg",
436
+ reference_id="image_2",
437
+ metadata={"key": "value"}
438
+ )
439
+
440
+ # default is synchronous upload
441
+ sync_response = dataset.append(items=[local_item])
442
+
443
+ # async jobs have higher throughput but can be more difficult to debug
444
+ async_job = dataset.append(
445
+ items=[remote_item], # all items must be remote for async
446
+ asynchronous=True
447
+ )
448
+ print(async_job.status())
449
+
450
+ A :class:`Dataset` can be populated with labeled and unlabeled
451
+ data. Using Nucleus, you can filter down the data inside your dataset
452
+ using custom metadata about your images.
453
+
454
+ For instance, your local dataset may contain ``Sunny``, ``Foggy``, and
455
+ ``Rainy`` folders of images. All of these images can be uploaded into a
456
+ single Nucleus ``Dataset``, with (queryable) metadata like ``{"weather":
457
+ "Sunny"}``.
458
+
459
+ To update an item's metadata, you can re-ingest the same items with the
460
+ ``update`` argument set to true. Existing metadata will be overwritten
461
+ for ``DatasetItems`` in the payload that share a ``reference_id`` with a
462
+ previously uploaded ``DatasetItem``. To retrieve your existing
463
+ ``reference_ids``, use :meth:`Dataset.items`.
464
+
465
+ ::
466
+
467
+ # overwrite metadata by reuploading the item
468
+ remote_item.metadata["weather"] = "Sunny"
469
+
470
+ async_job_2 = dataset.append(
471
+ items=[remote_item],
472
+ update=True,
473
+ asynchronous=True
474
+ )
245
475
 
246
476
  Parameters:
247
- :param items: items to upload
248
- :param update: if True overwrites images and metadata on collision
249
- :param batch_size: batch parameter for long uploads
250
- :param aynchronous: if True, return a job object representing asynchronous ingestion job.
251
- :return:
252
- {
253
- 'dataset_id': str,
254
- 'new_items': int,
255
- 'updated_items': int,
256
- 'ignored_items': int,
257
- }
477
+ dataset_items ( \
478
+ Union[ \
479
+ Sequence[:class:`DatasetItem`], \
480
+ Sequence[:class:`LidarScene`] \
481
+ ]): List of items or scenes to upload.
482
+ batch_size: Size of the batch for larger uploads. Default is 20.
483
+ update: Whether or not to overwrite metadata on reference ID collision.
484
+ Default is False.
485
+ asynchronous: Whether or not to process the upload asynchronously (and
486
+ return an :class:`AsyncJob` object). This is highly encouraged for
487
+ 3D data to drastically increase throughput. Default is False.
488
+
489
+ Returns:
490
+ For scenes
491
+ If synchronous, returns a payload describing the upload result::
492
+
493
+ {
494
+ "dataset_id: str,
495
+ "new_items": int,
496
+ "updated_items": int,
497
+ "ignored_items": int,
498
+ "upload_errors": int
499
+ }
500
+
501
+ Otherwise, returns an :class:`AsyncJob` object.
502
+ For images
503
+ If synchronous returns UploadResponse otherwise :class:`AsyncJob`
258
504
  """
505
+ assert (
506
+ batch_size is None or batch_size < 30
507
+ ), "Please specify a batch size smaller than 30 to avoid timeouts."
259
508
  dataset_items = [
260
509
  item for item in items if isinstance(item, DatasetItem)
261
510
  ]
@@ -265,7 +514,11 @@ class Dataset:
265
514
  "You must append either DatasetItems or Scenes to the dataset."
266
515
  )
267
516
  if scenes:
268
- return self.append_scenes(scenes, update, asynchronous)
517
+ assert (
518
+ asynchronous
519
+ ), "In order to avoid timeouts, you must set asynchronous=True when uploading scenes."
520
+
521
+ return self._append_scenes(scenes, update, asynchronous)
269
522
 
270
523
  check_for_duplicate_reference_ids(dataset_items)
271
524
 
@@ -288,42 +541,44 @@ class Dataset:
288
541
  )
289
542
  return AsyncJob.from_json(response, self._client)
290
543
 
291
- return self._client.populate_dataset(
292
- self.id,
544
+ return self._upload_items(
293
545
  dataset_items,
294
546
  update=update,
295
547
  batch_size=batch_size,
296
548
  )
297
549
 
550
+ @deprecated("Prefer using Dataset.append instead.")
298
551
  def append_scenes(
299
552
  self,
300
553
  scenes: List[LidarScene],
301
554
  update: Optional[bool] = False,
302
555
  asynchronous: Optional[bool] = False,
303
556
  ) -> Union[dict, AsyncJob]:
304
- """
305
- Appends scenes with given frames (containing pointclouds and optional images) to the dataset
557
+ return self._append_scenes(scenes, update, asynchronous)
558
+
559
+ def _append_scenes(
560
+ self,
561
+ scenes: List[LidarScene],
562
+ update: Optional[bool] = False,
563
+ asynchronous: Optional[bool] = False,
564
+ ) -> Union[dict, AsyncJob]:
565
+ # TODO: make private in favor of Dataset.append invocation
566
+ if not self.is_scene:
567
+ raise Exception(
568
+ "Your dataset is not a scene dataset but only supports single dataset items. "
569
+ "In order to be able to add scenes, please create another dataset with "
570
+ "client.create_dataset(<dataset_name>, is_scene=True) or add the scenes to "
571
+ "an existing scene dataset."
572
+ )
306
573
 
307
- Parameters:
308
- :param scenes: scenes to upload
309
- :param update: if True, overwrite scene on collision
310
- :param asynchronous: if True, return a job object representing asynchronous ingestion job
311
- :return:
312
- {
313
- 'dataset_id': str,
314
- 'new_scenes': int,
315
- 'ignored_scenes': int,
316
- 'scenes_errored': int,
317
- 'errors': List[str],
318
- }
319
- """
320
574
  for scene in scenes:
321
575
  scene.validate()
322
576
 
323
- if len(scenes) > WARN_FOR_LARGE_SCENES_UPLOAD and not asynchronous:
577
+ if not asynchronous:
324
578
  print(
325
- "Tip: for large uploads, get faster performance by importing your data "
326
- "into Nucleus directly from a cloud storage provider. See "
579
+ "WARNING: Processing lidar pointclouds usually takes several seconds. As a result, sychronous scene upload"
580
+ "requests are likely to timeout. For large uploads, we recommend using the flag asynchronous=True "
581
+ "to avoid HTTP timeouts. Please see"
327
582
  "https://dashboard.scale.com/nucleus/docs/api?language=python#guide-for-large-ingestions"
328
583
  " for details."
329
584
  )
@@ -347,51 +602,98 @@ class Dataset:
347
602
  return response
348
603
 
349
604
  def iloc(self, i: int) -> dict:
605
+ """Retrieves dataset item by absolute numerical index.
606
+
607
+ Parameters:
608
+ i: Absolute numerical index of the dataset item within the dataset.
609
+
610
+ Returns:
611
+ Payload describing the dataset item and associated annotations::
612
+
613
+ {
614
+ "item": DatasetItem
615
+ "annotations": {
616
+ "box": Optional[List[BoxAnnotation]],
617
+ "cuboid": Optional[List[CuboidAnnotation]],
618
+ "polygon": Optional[List[PolygonAnnotation]],
619
+ "segmentation": Optional[List[SegmentationAnnotation]],
620
+ "category": Optional[List[CategoryAnnotation]],
621
+ }
622
+ }
350
623
  """
351
- Returns Dataset Item Info By Dataset Item Number.
352
- :param i: absolute number of dataset item for the given dataset.
353
- :return:
354
- {
355
- "item": DatasetItem,
356
- "annotations": List[Union[BoxAnnotation, PolygonAnnotation, CuboidAnnotation, SegmentationAnnotation]],
357
- }
358
- """
359
- response = self._client.dataitem_iloc(self.id, i)
624
+ response = self._client.make_request(
625
+ {}, f"dataset/{self.id}/iloc/{i}", requests.get
626
+ )
360
627
  return format_dataset_item_response(response)
361
628
 
629
+ @sanitize_string_args
362
630
  def refloc(self, reference_id: str) -> dict:
631
+ """Retrieves a dataset item by reference ID.
632
+
633
+ Parameters:
634
+ reference_id: User-defined reference ID of the dataset item.
635
+
636
+ Returns:
637
+ Payload containing the dataset item and associated annotations::
638
+
639
+ {
640
+ "item": DatasetItem
641
+ "annotations": {
642
+ "box": Optional[List[BoxAnnotation]],
643
+ "cuboid": Optional[List[CuboidAnnotation]],
644
+ "polygon": Optional[List[PolygonAnnotation]],
645
+ "segmentation": Optional[List[SegmentationAnnotation]],
646
+ "category": Optional[List[CategoryAnnotation]],
647
+ }
648
+ }
363
649
  """
364
- Returns Dataset Item Info By Dataset Item Reference Id.
365
- :param reference_id: reference_id of dataset item.
366
- :return:
367
- {
368
- "item": DatasetItem,
369
- "annotations": List[Union[BoxAnnotation, PolygonAnnotation, CuboidAnnotation, SegmentationAnnotation]],
370
- }
371
- """
372
- response = self._client.dataitem_ref_id(self.id, reference_id)
650
+ response = self._client.make_request(
651
+ {}, f"dataset/{self.id}/refloc/{reference_id}", requests.get
652
+ )
373
653
  return format_dataset_item_response(response)
374
654
 
375
655
  def loc(self, dataset_item_id: str) -> dict:
656
+ """Retrieves a dataset item by Nucleus-generated ID.
657
+
658
+ Parameters:
659
+ dataset_item_id: Nucleus-generated dataset item ID (starts with ``di_``).
660
+ This can be retrieved via :meth:`Dataset.items` or a Nucleus dashboard URL.
661
+
662
+ Returns:
663
+ Payload containing the dataset item and associated annotations::
664
+
665
+ {
666
+ "item": DatasetItem
667
+ "annotations": {
668
+ "box": Optional[List[BoxAnnotation]],
669
+ "cuboid": Optional[List[CuboidAnnotation]],
670
+ "polygon": Optional[List[PolygonAnnotation]],
671
+ "segmentation": Optional[List[SegmentationAnnotation]],
672
+ "category": Optional[List[CategoryAnnotation]],
673
+ }
674
+ }
376
675
  """
377
- Returns Dataset Item Info By Dataset Item Id.
378
- :param dataset_item_id: internally controlled id for the dataset item.
379
- :return:
380
- {
381
- "item": DatasetItem,
382
- "annotations": List[Union[BoxAnnotation, PolygonAnnotation, CuboidAnnotation, SegmentationAnnotation]],
383
- }
384
- """
385
- response = self._client.dataitem_loc(self.id, dataset_item_id)
676
+ response = self._client.make_request(
677
+ {}, f"dataset/{self.id}/loc/{dataset_item_id}", requests.get
678
+ )
386
679
  return format_dataset_item_response(response)
387
680
 
388
681
  def ground_truth_loc(self, reference_id: str, annotation_id: str):
389
- """
390
- Returns info for single ground truth Annotation by its id.
391
- :param reference_id: User specified id for the dataset item the ground truth is attached to
392
- :param annotation_id: User specified, or auto-generated id for the annotation
393
- :return:
394
- BoxAnnotation | PolygonAnnotation | CuboidAnnotation
682
+ """Fetches a single ground truth annotation by id.
683
+
684
+ Parameters:
685
+ reference_id: User-defined reference ID of the dataset item associated
686
+ with the ground truth annotation.
687
+ annotation_id: User-defined ID of the ground truth annotation.
688
+
689
+ Returns:
690
+ Union[\
691
+ :class:`BoxAnnotation`, \
692
+ :class:`PolygonAnnotation`, \
693
+ :class:`CuboidAnnotation`, \
694
+ :class:`SegmentationAnnotation` \
695
+ :class:`CategoryAnnotation` \
696
+ ]: Ground truth annotation object with the specified annotation ID.
395
697
  """
396
698
  response = self._client.make_request(
397
699
  {},
@@ -404,67 +706,296 @@ class Dataset:
404
706
  self,
405
707
  name: str,
406
708
  reference_ids: List[str],
407
- ):
408
- """
409
- Creates a slice from items already present in a dataset.
410
- The caller must exclusively use either datasetItemIds or reference_ids
411
- as a means of identifying items in the dataset.
709
+ ) -> Slice:
710
+ """Creates a :class:`Slice` of dataset items within a dataset.
412
711
 
413
- :param name: The human-readable name of the slice.
414
- :param reference_ids: A list of user-specified identifier for the items in the slice
712
+ Parameters:
713
+ name: A human-readable name for the slice.
714
+ reference_ids: List of reference IDs of dataset items to add to the slice::
415
715
 
416
- :return: new Slice object
716
+ Returns:
717
+ :class:`Slice`: The newly constructed slice item.
417
718
  """
418
- return self._client.create_slice(
419
- self.id, {NAME_KEY: name, REFERENCE_IDS_KEY: reference_ids}
719
+ payload = {NAME_KEY: name, REFERENCE_IDS_KEY: reference_ids}
720
+ response = self._client.make_request(
721
+ payload, f"dataset/{self.id}/create_slice"
420
722
  )
723
+ return Slice(response[SLICE_ID_KEY], self._client)
421
724
 
422
- def delete_item(self, reference_id: str):
423
- return self._client.delete_dataset_item(
424
- self.id, reference_id=reference_id
725
+ @sanitize_string_args
726
+ def delete_item(self, reference_id: str) -> dict:
727
+ """Deletes an item from the dataset by item reference ID.
728
+
729
+ All annotations and predictions associated with the item will be deleted
730
+ as well.
731
+
732
+ Parameters:
733
+ reference_id: The user-defined reference ID of the item to delete.
734
+
735
+ Returns:
736
+ Payload to indicate deletion invocation.
737
+ """
738
+ return self._client.make_request(
739
+ {},
740
+ f"dataset/{self.id}/refloc/{reference_id}",
741
+ requests.delete,
425
742
  )
426
743
 
744
+ @sanitize_string_args
745
+ def delete_scene(self, reference_id: str):
746
+ """Deletes a Scene associated with the Dataset
747
+
748
+ All items, annotations and predictions associated with the scene will be
749
+ deleted as well.
750
+
751
+ Parameters:
752
+ reference_id: The user-defined reference ID of the item to delete.
753
+ """
754
+ self._client.delete(f"dataset/{self.id}/scene/{reference_id}")
755
+
427
756
  def list_autotags(self):
757
+ """Fetches all autotags of the dataset.
758
+
759
+ Returns:
760
+ List of autotag payloads::
761
+
762
+ List[{
763
+ "id": str,
764
+ "name": str,
765
+ "status": "completed" | "pending",
766
+ "autotag_level": "Image" | "Object"
767
+ }]
768
+ """
428
769
  return self._client.list_autotags(self.id)
429
770
 
430
- def create_custom_index(self, embeddings_urls: list, embedding_dim: int):
431
- return self._client.create_custom_index(
432
- self.id,
433
- embeddings_urls,
434
- embedding_dim,
771
+ def update_autotag(self, autotag_id):
772
+ """Will rerun inference on all dataset items in the dataset.
773
+ For now this endpoint does not try to skip already inferenced items, but this
774
+ improvement is planned for the future. This means that for now, you can only
775
+ have one job running at time, so please await the result using job.sleep_until_complete()
776
+ before launching another job.
777
+
778
+ Parameters:
779
+ autotag_id: Id of the autotag to re-inference. You can figure out which
780
+ id you want by using dataset.list_autotags, or by looking at the URL in the
781
+ manage autotag page.
782
+
783
+ Returns:
784
+ :class:`AsyncJob`: Asynchronous job object to track processing status.
785
+ """
786
+ return AsyncJob.from_json(
787
+ payload=self._client.make_request(
788
+ {}, f"autotag/{autotag_id}", requests.post
789
+ ),
790
+ client=self._client,
791
+ )
792
+
793
+ def create_custom_index(
794
+ self, embeddings_urls: List[str], embedding_dim: int
795
+ ):
796
+ """Processes user-provided embeddings for the dataset to use with autotag and simsearch.
797
+
798
+ ::
799
+
800
+ import nucleus
801
+
802
+ client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
803
+ dataset = client.get_dataset("YOUR_DATASET_ID")
804
+
805
+ embeddings = {
806
+ "reference_id_0": [0.1, 0.2, 0.3],
807
+ "reference_id_1": [0.4, 0.5, 0.6],
808
+ } # uploaded to s3 with the below URL
809
+
810
+ embeddings_url = "s3://dataset/embeddings_map.json"
811
+
812
+ response = dataset.create_custom_index(
813
+ embeddings_urls=[embeddings_url],
814
+ embedding_dim=3
815
+ )
816
+
817
+ Parameters:
818
+ embeddings_urls: List of URLs, each of which pointing to
819
+ a JSON mapping reference_id -> embedding vector.
820
+ embedding_dim: The dimension of the embedding vectors. Must be consistent
821
+ across all embedding vectors in the index.
822
+
823
+ Returns:
824
+ :class:`AsyncJob`: Asynchronous job object to track processing status.
825
+ """
826
+ res = self._client.post(
827
+ {
828
+ EMBEDDINGS_URL_KEY: embeddings_urls,
829
+ EMBEDDING_DIMENSION_KEY: embedding_dim,
830
+ },
831
+ f"indexing/{self.id}",
832
+ )
833
+ return AsyncJob.from_json(
834
+ res,
835
+ self._client,
435
836
  )
436
837
 
437
838
  def delete_custom_index(self):
839
+ """Deletes the custom index uploaded to the dataset.
840
+
841
+ Returns:
842
+ Payload containing information that can be used to track the job's status::
843
+
844
+ {
845
+ "dataset_id": str,
846
+ "job_id": str,
847
+ "message": str
848
+ }
849
+ """
438
850
  return self._client.delete_custom_index(self.id)
439
851
 
440
852
  def set_continuous_indexing(self, enable: bool = True):
441
- return self._client.set_continuous_indexing(self.id, enable)
853
+ """Toggle whether embeddings are automatically generated for new data.
854
+
855
+ Sets continuous indexing for a given dataset, which will automatically
856
+ generate embeddings for use with autotag whenever new images are uploaded.
857
+
858
+ Parameters:
859
+ enable: Whether to enable or disable continuous indexing. Default is
860
+ True.
861
+
862
+ Returns:
863
+ Response payload::
864
+
865
+ {
866
+ "dataset_id": str,
867
+ "message": str
868
+ "backfill_job": AsyncJob,
869
+ }
870
+ """
871
+ preprocessed_response = self._client.set_continuous_indexing(
872
+ self.id, enable
873
+ )
874
+ response = {
875
+ DATASET_ID_KEY: preprocessed_response[DATASET_ID_KEY],
876
+ MESSAGE_KEY: preprocessed_response[MESSAGE_KEY],
877
+ }
878
+ if enable:
879
+ response[BACKFILL_JOB_KEY] = (
880
+ AsyncJob.from_json(preprocessed_response, self._client),
881
+ )
882
+ return response
442
883
 
443
884
  def create_image_index(self):
885
+ """Creates or updates image index by generating embeddings for images that do not already have embeddings.
886
+
887
+ The embeddings are used for autotag and similarity search.
888
+
889
+ This endpoint is limited to index up to 2 million images at a time and the
890
+ job will fail for payloads that exceed this limit.
891
+
892
+ Returns:
893
+ :class:`AsyncJob`: Asynchronous job object to track processing status.
894
+ """
444
895
  response = self._client.create_image_index(self.id)
445
896
  return AsyncJob.from_json(response, self._client)
446
897
 
898
+ def create_object_index(
899
+ self, model_run_id: str = None, gt_only: bool = None
900
+ ):
901
+ """Creates or updates object index by generating embeddings for objects that do not already have embeddings.
902
+
903
+ These embeddings are used for autotag and similarity search. This endpoint
904
+ only supports indexing objects sourced from the predictions of a specific
905
+ model or the ground truth annotations of the dataset.
906
+
907
+ This endpoint is idempotent. If this endpoint is called again for a model
908
+ whose predictions were indexed in the past, the previously indexed predictions
909
+ will not have new embeddings recomputed. The same is true for ground truth
910
+ annotations.
911
+
912
+ Note that this means if you update a prediction or ground truth
913
+ bounding box that already has an associated embedding, the embedding will
914
+ not be updated, even with another call to this endpoint. For now, we
915
+ recommend deleting the prediction or ground truth annotation and
916
+ re-inserting it to force generate a new embedding.
917
+
918
+ This endpoint is limited to generating embeddings for 3 million objects
919
+ at a time and the job will fail for payloads that exceed this limit.
920
+
921
+ Parameters:
922
+ model_run_id: The ID of the model whose predictions should be indexed.
923
+ Default is None, but must be supplied in the absence of ``gt_only``.
924
+
925
+ .. todo ::
926
+ Deprecate model run
927
+
928
+ gt_only: Whether to only generate embeddings for the ground truth
929
+ annotations of the dataset. Default is None, but must be supplied
930
+ in the absence of ``model_run_id``.
931
+
932
+ Returns:
933
+ Payload containing an :class:`AsyncJob` object to monitor progress.
934
+ """
935
+ response = self._client.create_object_index(
936
+ self.id, model_run_id, gt_only
937
+ )
938
+ return AsyncJob.from_json(response, self._client)
939
+
447
940
  def add_taxonomy(
448
941
  self,
449
942
  taxonomy_name: str,
450
943
  taxonomy_type: str,
451
944
  labels: List[str],
945
+ update: bool = False,
452
946
  ):
453
- """
454
- Creates a new taxonomy.
455
- Returns a response with dataset_id, taxonomy_name and type for the new taxonomy.
456
- :param taxonomy_name: name of the taxonomy
457
- :param type: type of the taxonomy
458
- :param labels: list of possible labels for the taxonomy
947
+ """Creates a new taxonomy.
948
+ ::
949
+
950
+ import nucleus
951
+ client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
952
+ dataset = client.get_dataset("YOUR_DATASET_ID")
953
+
954
+ response = dataset.add_taxonomy(
955
+ taxonomy_name="clothing_type",
956
+ taxonomy_type="category",
957
+ labels=["shirt", "trousers", "dress"],
958
+ update=False
959
+ )
960
+
961
+ Parameters:
962
+ taxonomy_name: The name of the taxonomy. Taxonomy names must be
963
+ unique within a dataset.
964
+ taxonomy_type: The type of this taxonomy as a string literal.
965
+ Currently, the only supported taxonomy type is "category".
966
+ labels: The list of possible labels for the taxonomy.
967
+ update: Whether or not to update taxonomy labels on taxonomy name collision. Default is False. Note that taxonomy labels will not be deleted on update, they can only be appended.
968
+
969
+ Returns:
970
+ Returns a response with dataset_id, taxonomy_name and status of the add taxonomy operation.
459
971
  """
460
972
  return self._client.make_request(
461
- construct_taxonomy_payload(taxonomy_name, taxonomy_type, labels),
973
+ construct_taxonomy_payload(
974
+ taxonomy_name, taxonomy_type, labels, update
975
+ ),
462
976
  f"dataset/{self.id}/add_taxonomy",
463
977
  requests_command=requests.post,
464
978
  )
465
979
 
466
- def check_index_status(self, job_id: str):
467
- return self._client.check_index_status(job_id)
980
+ def delete_taxonomy(
981
+ self,
982
+ taxonomy_name: str,
983
+ ):
984
+ """Deletes the given taxonomy.
985
+
986
+ All annotations and predictions associated with the taxonomy will be deleted as well.
987
+
988
+ Parameters:
989
+ taxonomy_name: The name of the taxonomy.
990
+
991
+ Returns:
992
+ Returns a response with dataset_id, taxonomy_name and status of the delete taxonomy operation.
993
+ """
994
+ return self._client.make_request(
995
+ {},
996
+ f"dataset/{self.id}/taxonomy/{taxonomy_name}",
997
+ requests.delete,
998
+ )
468
999
 
469
1000
  def items_and_annotations(
470
1001
  self,
@@ -472,12 +1003,18 @@ class Dataset:
472
1003
  """Returns a list of all DatasetItems and Annotations in this slice.
473
1004
 
474
1005
  Returns:
475
- A list, where each item is a dict with two keys representing a row
476
- in the dataset.
477
- * One value in the dict is the DatasetItem, containing a reference to the
478
- item that was annotated.
479
- * The other value is a dictionary containing all the annotations for this
480
- dataset item, sorted by annotation type.
1006
+ A list of dicts, each with two keys representing a row in the dataset::
1007
+
1008
+ List[{
1009
+ "item": DatasetItem,
1010
+ "annotations": {
1011
+ "box": Optional[List[BoxAnnotation]],
1012
+ "cuboid": Optional[List[CuboidAnnotation]],
1013
+ "polygon": Optional[List[PolygonAnnotation]],
1014
+ "segmentation": Optional[List[SegmentationAnnotation]],
1015
+ "category": Optional[List[CategoryAnnotation]],
1016
+ }
1017
+ }]
481
1018
  """
482
1019
  api_payload = self._client.make_request(
483
1020
  payload=None,
@@ -489,25 +1026,412 @@ class Dataset:
489
1026
  def export_embeddings(
490
1027
  self,
491
1028
  ) -> List[Dict[str, Union[str, List[float]]]]:
492
- """Returns a pd.Dataframe-ready format of dataset embeddings.
1029
+ """Fetches a pd.DataFrame-ready list of dataset embeddings.
493
1030
 
494
1031
  Returns:
495
1032
  A list, where each item is a dict with two keys representing a row
496
- in the dataset.
497
- * One value in the dict is the reference id
498
- * The other value is a list of the embedding values
1033
+ in the dataset::
1034
+
1035
+ List[{
1036
+ "reference_id": str,
1037
+ "embedding_vector": List[float]
1038
+ }]
499
1039
  """
500
1040
  api_payload = self._client.make_request(
501
1041
  payload=None,
502
1042
  route=f"dataset/{self.id}/embeddings",
503
1043
  requests_command=requests.get,
504
1044
  )
505
- return api_payload
1045
+ return api_payload # type: ignore
506
1046
 
507
1047
  def delete_annotations(
508
1048
  self, reference_ids: list = None, keep_history=False
509
- ):
510
- response = self._client.delete_annotations(
511
- self.id, reference_ids, keep_history
1049
+ ) -> AsyncJob:
1050
+ """Deletes all annotations associated with the specified item reference IDs.
1051
+
1052
+ Parameters:
1053
+ reference_ids: List of user-defined reference IDs of the dataset items
1054
+ from which to delete annotations.
1055
+ keep_history: Whether to preserve version history. If False, all
1056
+ previous versions will be deleted along with the annotations. If
1057
+ True, the version history (including deletion) will persist.
1058
+ Default is False.
1059
+
1060
+ Returns:
1061
+ :class:`AsyncJob`: Empty payload response.
1062
+ """
1063
+ payload = {KEEP_HISTORY_KEY: keep_history}
1064
+ if reference_ids:
1065
+ payload[REFERENCE_IDS_KEY] = reference_ids
1066
+ response = self._client.make_request(
1067
+ payload,
1068
+ f"annotation/{self.id}",
1069
+ requests_command=requests.delete,
512
1070
  )
513
1071
  return AsyncJob.from_json(response, self._client)
1072
+
1073
+ def get_scene(self, reference_id: str) -> Scene:
1074
+ """Fetches a single scene in the dataset by its reference ID.
1075
+
1076
+ Parameters:
1077
+ reference_id: User-defined reference ID of the scene.
1078
+
1079
+ Returns:
1080
+ :class:`Scene<LidarScene>`: A scene object containing frames, which
1081
+ in turn contain pointcloud or image items.
1082
+ """
1083
+ return LidarScene.from_json(
1084
+ self._client.make_request(
1085
+ payload=None,
1086
+ route=f"dataset/{self.id}/scene/{reference_id}",
1087
+ requests_command=requests.get,
1088
+ )
1089
+ )
1090
+
1091
+ def export_predictions(self, model):
1092
+ """Fetches all predictions of a model that were uploaded to the dataset.
1093
+
1094
+ Parameters:
1095
+ model (:class:`Model`): The model whose predictions to retrieve.
1096
+
1097
+ Returns:
1098
+ List[Union[\
1099
+ :class:`BoxPrediction`, \
1100
+ :class:`PolygonPrediction`, \
1101
+ :class:`CuboidPrediction`, \
1102
+ :class:`SegmentationPrediction` \
1103
+ ]]: List of prediction objects from the model.
1104
+
1105
+ """
1106
+ json_response = self._client.make_request(
1107
+ payload=None,
1108
+ route=f"dataset/{self.id}/model/{model.id}/export",
1109
+ requests_command=requests.get,
1110
+ )
1111
+ return format_prediction_response({ANNOTATIONS_KEY: json_response})
1112
+
1113
+ def calculate_evaluation_metrics(self, model, options: dict = None):
1114
+ """Starts computation of evaluation metrics for a model on the dataset.
1115
+
1116
+ To update matches and metrics calculated for a model on a given dataset you
1117
+ can call this endpoint. This is required in order to sort by IOU, view false
1118
+ positives/false negatives, and view model insights.
1119
+
1120
+ You can add predictions from a model to a dataset after running the
1121
+ calculation of the metrics. However, the calculation of metrics will have
1122
+ to be retriggered for the new predictions to be matched with ground truth
1123
+ and appear as false positives/negatives, or for the new predictions effect
1124
+ on metrics to be reflected in model run insights.
1125
+
1126
+ During IoU calculation, bounding box Predictions are compared to
1127
+ GroundTruth using a greedy matching algorithm that matches prediction and
1128
+ ground truth boxes that have the highest ious first. By default the
1129
+ matching algorithm is class-agnostic: it will greedily create matches
1130
+ regardless of the class labels.
1131
+
1132
+ The algorithm can be tuned to classify true positives between certain
1133
+ classes, but not others. This is useful if the labels in your ground truth
1134
+ do not match the exact strings of your model predictions, or if you want
1135
+ to associate multiple predictions with one ground truth label, or multiple
1136
+ ground truth labels with one prediction. To recompute metrics based on
1137
+ different matching, you can re-commit the run with new request parameters.
1138
+
1139
+ ::
1140
+
1141
+ import nucleus
1142
+
1143
+ client = nucleus.NucleusClient("YOUR_SCALE_API_KEY")
1144
+ dataset = client.get_dataset(dataset_id="YOUR_DATASET_ID")
1145
+
1146
+ model = client.get_model(
1147
+ model_id="YOUR_MODEL_PRJ_ID",
1148
+ dataset_id="YOUR_DATASET_ID"
1149
+ )
1150
+
1151
+ # Compute all evaluation metrics including IOU-based matching:
1152
+ dataset.calculate_evaluation_metrics(model)
1153
+
1154
+ # Match car and bus bounding boxes (for IOU computation)
1155
+ # Otherwise enforce that class labels must match
1156
+ dataset.calculate_evaluation_metrics(model, options={
1157
+ 'allowed_label_matches': [
1158
+ {
1159
+ 'ground_truth_label': 'car',
1160
+ 'model_prediction_label': 'bus'
1161
+ },
1162
+ {
1163
+ 'ground_truth_label': 'bus',
1164
+ 'model_prediction_label': 'car'
1165
+ }
1166
+ ]
1167
+ })
1168
+
1169
+ Parameters:
1170
+ model (:class:`Model`): The model object for which to calculate metrics.
1171
+ options: Dictionary of specific options to configure metrics calculation.
1172
+
1173
+ class_agnostic
1174
+ Whether ground truth and prediction classes can differ when
1175
+ being matched for evaluation metrics. Default is True.
1176
+
1177
+ allowed_label_matches
1178
+ Pairs of ground truth and prediction classes that should
1179
+ be considered matchable when computing metrics. If supplied,
1180
+ ``class_agnostic`` must be False.
1181
+
1182
+ ::
1183
+
1184
+ {
1185
+ "class_agnostic": bool,
1186
+ "allowed_label_matches": List[{
1187
+ "ground_truth_label": str,
1188
+ "model_prediction_label": str
1189
+ }]
1190
+ }
1191
+ """
1192
+ if options is None:
1193
+ options = {}
1194
+ return self._client.make_request(
1195
+ payload=options,
1196
+ route=f"dataset/{self.id}/model/{model.id}/calculateEvaluationMetrics",
1197
+ )
1198
+
1199
+ def upload_predictions(
1200
+ self,
1201
+ model,
1202
+ predictions: List[
1203
+ Union[
1204
+ BoxPrediction,
1205
+ PolygonPrediction,
1206
+ CuboidPrediction,
1207
+ SegmentationPrediction,
1208
+ CategoryPrediction,
1209
+ ]
1210
+ ],
1211
+ update: bool = False,
1212
+ asynchronous: bool = False,
1213
+ ):
1214
+ """Uploads predictions and associates them with an existing :class:`Model`.
1215
+
1216
+ Adding predictions to your dataset in Nucleus allows you to visualize
1217
+ discrepancies against ground truth, query dataset items based on the
1218
+ predictions they contain, and evaluate your models by comparing their
1219
+ predictions to ground truth.
1220
+
1221
+ Nucleus supports :class:`Box<BoxPrediction>`, :class:`Polygon<PolygonPrediction>`,
1222
+ :class:`Cuboid<CuboidPrediction>`, :class:`Segmentation<SegmentationPrediction>`,
1223
+ and :class:`Category<CategoryPrediction>` predictions. Cuboid predictions
1224
+ can only be uploaded to a :class:`pointcloud DatasetItem<LidarScene>`.
1225
+
1226
+ When uploading a prediction, you need to specify which item you are
1227
+ annotating via the reference_id you provided when uploading the image
1228
+ or pointcloud.
1229
+
1230
+ Prediction uploads can be made idempotent by specifying an optional
1231
+ annotation_id for each prediction. This id should be unique within the
1232
+ dataset_item so that (reference_id, annotation_id) is unique within the
1233
+ dataset.
1234
+
1235
+ See :class:`SegmentationPrediction` for specific requirements to upload
1236
+ segmentation predictions.
1237
+
1238
+ For ingesting large prediction payloads, see the `Guide for Large Ingestions
1239
+ <https://nucleus.scale.com/docs/large-ingestion>`_.
1240
+
1241
+ Parameters:
1242
+ model (:class:`Model`): Nucleus-generated model ID (starts with ``prj_``). This can
1243
+ be retrieved via :meth:`list_models` or a Nucleus dashboard URL.
1244
+ predictions (List[Union[\
1245
+ :class:`BoxPrediction`, \
1246
+ :class:`PolygonPrediction`, \
1247
+ :class:`CuboidPrediction`, \
1248
+ :class:`SegmentationPrediction`, \
1249
+ :class:`CategoryPrediction` \
1250
+ ]]): List of prediction objects to upload.
1251
+ update: Whether or not to overwrite metadata or ignore on reference ID
1252
+ collision. Default is False.
1253
+ asynchronous: Whether or not to process the upload asynchronously (and
1254
+ return an :class:`AsyncJob` object). Default is False.
1255
+
1256
+ Returns:
1257
+ Payload describing the synchronous upload::
1258
+
1259
+ {
1260
+ "dataset_id": str,
1261
+ "model_run_id": str,
1262
+ "predictions_processed": int,
1263
+ "predictions_ignored": int,
1264
+ }
1265
+ """
1266
+ if asynchronous:
1267
+ check_all_mask_paths_remote(predictions)
1268
+
1269
+ request_id = serialize_and_write_to_presigned_url(
1270
+ predictions, self.id, self._client
1271
+ )
1272
+ response = self._client.make_request(
1273
+ payload={REQUEST_ID_KEY: request_id, UPDATE_KEY: update},
1274
+ route=f"dataset/{self.id}/model/{model.id}/uploadPredictions?async=1",
1275
+ )
1276
+ return AsyncJob.from_json(response, self._client)
1277
+ else:
1278
+ return self._client.predict(
1279
+ model_run_id=None,
1280
+ dataset_id=self.id,
1281
+ model_id=model.id,
1282
+ annotations=predictions,
1283
+ update=update,
1284
+ )
1285
+
1286
+ def predictions_iloc(self, model, index):
1287
+ """Fetches all predictions of a dataset item by its absolute index.
1288
+
1289
+ Parameters:
1290
+ model (:class:`Model`): Model object from which to fetch the prediction.
1291
+ index (int): Absolute index of the dataset item within the dataset.
1292
+
1293
+ Returns:
1294
+ Dict[str, List[Union[BoxPrediction, PolygonPrediction, CuboidPrediction,
1295
+ SegmentationPrediction, CategoryPrediction]]]: Dictionary mapping prediction
1296
+ type to a list of such prediction objects from the given model::
1297
+
1298
+ {
1299
+ "box": List[BoxPrediction],
1300
+ "polygon": List[PolygonPrediction],
1301
+ "cuboid": List[CuboidPrediction],
1302
+ "segmentation": List[SegmentationPrediction],
1303
+ "category": List[CategoryPrediction],
1304
+ }
1305
+ """
1306
+ return format_prediction_response(
1307
+ self._client.make_request(
1308
+ payload=None,
1309
+ route=f"dataset/{self.id}/model/{model.id}/iloc/{index}",
1310
+ requests_command=requests.get,
1311
+ )
1312
+ )
1313
+
1314
+ def predictions_refloc(self, model, reference_id):
1315
+ """Fetches all predictions of a dataset item by its reference ID.
1316
+
1317
+ Parameters:
1318
+ model (:class:`Model`): Model object from which to fetch the prediction.
1319
+ reference_id (str): User-defined ID of the dataset item from which to fetch
1320
+ all predictions.
1321
+
1322
+ Returns:
1323
+ Dict[str, List[Union[BoxPrediction, PolygonPrediction, CuboidPrediction,
1324
+ SegmentationPrediction, CategoryPrediction]]]: Dictionary mapping prediction
1325
+ type to a list of such prediction objects from the given model::
1326
+
1327
+ {
1328
+ "box": List[BoxPrediction],
1329
+ "polygon": List[PolygonPrediction],
1330
+ "cuboid": List[CuboidPrediction],
1331
+ "segmentation": List[SegmentationPrediction],
1332
+ "category": List[CategoryPrediction],
1333
+ }
1334
+ """
1335
+ return format_prediction_response(
1336
+ self._client.make_request(
1337
+ payload=None,
1338
+ route=f"dataset/{self.id}/model/{model.id}/referenceId/{reference_id}",
1339
+ requests_command=requests.get,
1340
+ )
1341
+ )
1342
+
1343
+ def prediction_loc(self, model, reference_id, annotation_id):
1344
+ """Fetches a single ground truth annotation by id.
1345
+
1346
+ Parameters:
1347
+ model (:class:`Model`): Model object from which to fetch the prediction.
1348
+ reference_id (str): User-defined reference ID of the dataset item
1349
+ associated with the model prediction.
1350
+ annotation_id (str): User-defined ID of the model prediction.
1351
+
1352
+ Returns:
1353
+ Union[\
1354
+ :class:`BoxPrediction`, \
1355
+ :class:`PolygonPrediction`, \
1356
+ :class:`CuboidPrediction`, \
1357
+ :class:`SegmentationPrediction` \
1358
+ :class:`CategoryPrediction` \
1359
+ ]: Model prediction object with the specified annotation ID.
1360
+ """
1361
+ return from_json(
1362
+ self._client.make_request(
1363
+ payload=None,
1364
+ route=f"dataset/{self.id}/model/{model.id}/loc/{reference_id}/{annotation_id}",
1365
+ requests_command=requests.get,
1366
+ )
1367
+ )
1368
+
1369
+ def _upload_items(
1370
+ self,
1371
+ dataset_items: List[DatasetItem],
1372
+ batch_size: int = 20,
1373
+ update: bool = False,
1374
+ ) -> UploadResponse:
1375
+ """
1376
+ Appends images to a dataset with given dataset_id.
1377
+ Overwrites images on collision if updated.
1378
+
1379
+ Args:
1380
+ dataset_items: Items to Upload
1381
+ batch_size: size of the batch for long payload
1382
+ update: Update records on conflict otherwise overwrite
1383
+ Returns:
1384
+ UploadResponse
1385
+ """
1386
+ if self.is_scene:
1387
+ raise Exception(
1388
+ "Your dataset is a scene dataset and does not support the upload of single dataset items. "
1389
+ "In order to be able to add dataset items, please create another dataset with "
1390
+ "client.create_dataset(<dataset_name>, is_scene=False) or add the dataset items to "
1391
+ "an existing dataset supporting dataset items."
1392
+ )
1393
+
1394
+ populator = DatasetItemUploader(self.id, self._client)
1395
+ return populator.upload(dataset_items, batch_size, update)
1396
+
1397
+ def update_scene_metadata(self, mapping: Dict[str, dict]):
1398
+ """
1399
+ Update (merge) scene metadata for each reference_id given in the mapping.
1400
+ The backend will join the specified mapping metadata to the existing metadata.
1401
+ If there is a key-collision, the value given in the mapping will take precedence.
1402
+
1403
+ Args:
1404
+ mapping: key-value pair of <reference_id>: <metadata>
1405
+
1406
+ Examples:
1407
+ >>> mapping = {"scene_ref_1": {"new_key": "foo"}, "scene_ref_2": {"some_value": 123}}
1408
+ >>> dataset.update_scene_metadata(mapping)
1409
+
1410
+ Returns:
1411
+ A dictionary outlining success or failures.
1412
+ """
1413
+ mm = MetadataManager(
1414
+ self.id, self._client, mapping, ExportMetadataType.SCENES
1415
+ )
1416
+ return mm.update()
1417
+
1418
+ def update_item_metadata(self, mapping: Dict[str, dict]):
1419
+ """
1420
+ Update (merge) dataset item metadata for each reference_id given in the mapping.
1421
+ The backend will join the specified mapping metadata to the existing metadata.
1422
+ If there is a key-collision, the value given in the mapping will take precedence.
1423
+
1424
+ Args:
1425
+ mapping: key-value pair of <reference_id>: <metadata>
1426
+
1427
+ Examples:
1428
+ >>> mapping = {"item_ref_1": {"new_key": "foo"}, "item_ref_2": {"some_value": 123}}
1429
+ >>> dataset.update_item_metadata(mapping)
1430
+
1431
+ Returns:
1432
+ A dictionary outlining success or failures.
1433
+ """
1434
+ mm = MetadataManager(
1435
+ self.id, self._client, mapping, ExportMetadataType.DATASET_ITEMS
1436
+ )
1437
+ return mm.update()