scale-nucleus 0.1.10__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nucleus/scene.py ADDED
@@ -0,0 +1,241 @@
+ import json
+ from abc import ABC
+ from dataclasses import dataclass, field
+ from typing import Optional, Any, Dict, List
+ from nucleus.constants import (
+     FRAMES_KEY,
+     LENGTH_KEY,
+     METADATA_KEY,
+     NUM_SENSORS_KEY,
+     REFERENCE_ID_KEY,
+     POINTCLOUD_LOCATION_KEY,
+     IMAGE_LOCATION_KEY,
+ )
+ from .annotation import is_local_path
+ from .dataset_item import DatasetItemType, DatasetItem
+
+
+ class Frame:
+     def __init__(self, **kwargs):
+         self.items = {}
+         for key, value in kwargs.items():
+             self.items[key] = value
+
+     def __post_init__(self):
+         for key, value in self.items.items():
+             assert isinstance(key, str), "All keys must be names of sensors"
+             assert isinstance(
+                 value, DatasetItem
+             ), "All values must be DatasetItems"
+
+     def __repr__(self) -> str:
+         return f"Frame(items={self.items})"
+
+     def add_item(self, item: DatasetItem, sensor_name: str):
+         self.items[sensor_name] = item
+
+     def get_item(self, sensor_name: str):
+         if sensor_name not in self.items:
+             raise ValueError(
+                 f"This frame does not have a {sensor_name} sensor"
+             )
+         return self.items[sensor_name]
+
+     def get_items(self):
+         return list(self.items.values())
+
+     def get_sensors(self):
+         return list(self.items.keys())
+
+     @classmethod
+     def from_json(cls, payload: dict):
+         items = {
+             sensor: DatasetItem.from_json(item, is_scene=True)
+             for sensor, item in payload.items()
+         }
+         return cls(**items)
+
+     def to_payload(self) -> dict:
+         return {
+             sensor: dataset_item.to_payload(is_scene=True)
+             for sensor, dataset_item in self.items.items()
+         }
+
+
+ @dataclass
+ class Scene(ABC):
+     reference_id: str
+     frames: List[Frame] = field(default_factory=list)
+     metadata: Optional[dict] = None
+
+     def __post_init__(self):
+         self.sensors = set(
+             flatten([frame.get_sensors() for frame in self.frames])
+         )
+         self.frames_dict = dict(enumerate(self.frames))
+
+     @property
+     def length(self) -> int:
+         return len(self.frames_dict)
+
+     @property
+     def num_sensors(self) -> int:
+         return len(self.get_sensors())
+
+     def validate(self):
+         assert self.length > 0, "Must have at least 1 frame in a scene"
+         for frame in self.frames_dict.values():
+             assert isinstance(
+                 frame, Frame
+             ), "Each frame in a scene must be a Frame object"
+
+     def add_item(self, index: int, sensor_name: str, item: DatasetItem):
+         self.sensors.add(sensor_name)
+         if index not in self.frames_dict:
+             new_frame = Frame(**{sensor_name: item})
+             self.frames_dict[index] = new_frame
+         else:
+             self.frames_dict[index].items[sensor_name] = item
+
+     def add_frame(self, frame: Frame, index: int, update: bool = False):
+         if (
+             index not in self.frames_dict
+             or index in self.frames_dict
+             and update
+         ):
+             self.frames_dict[index] = frame
+             self.sensors.update(frame.get_sensors())
+
+     def get_frame(self, index: int):
+         if index not in self.frames_dict:
+             raise ValueError(
+                 f"This scene does not have a frame at index {index}"
+             )
+         return self.frames_dict[index]
+
+     def get_frames(self):
+         return [
+             frame
+             for _, frame in sorted(
+                 self.frames_dict.items(), key=lambda x: x[0]
+             )
+         ]
+
+     def get_sensors(self):
+         return list(self.sensors)
+
+     def get_item(self, index: int, sensor_name: str):
+         frame = self.get_frame(index)
+         return frame.get_item(sensor_name)
+
+     def get_items_from_sensor(self, sensor_name: str):
+         if sensor_name not in self.sensors:
+             raise ValueError(
+                 f"This scene does not have a {sensor_name} sensor"
+             )
+         items_from_sensor = []
+         for frame in self.frames_dict.values():
+             try:
+                 sensor_item = frame.get_item(sensor_name)
+                 items_from_sensor.append(sensor_item)
+             except ValueError:
+                 # This sensor is not present at current frame
+                 items_from_sensor.append(None)
+         return items_from_sensor
+
+     def get_items(self):
+         return flatten([frame.get_items() for frame in self.get_frames()])
+
+     def info(self):
+         return {
+             REFERENCE_ID_KEY: self.reference_id,
+             LENGTH_KEY: self.length,
+             NUM_SENSORS_KEY: self.num_sensors,
+         }
+
+     def validate_frames_dict(self):
+         is_continuous = set(list(range(len(self.frames_dict)))) == set(
+             self.frames_dict.keys()
+         )
+         assert (
+             is_continuous
+         ), "frames must be 0-indexed and continuous (no missing frames)"
+
+     @classmethod
+     def from_json(cls, payload: dict):
+         frames_payload = payload.get(FRAMES_KEY, [])
+         frames = [Frame.from_json(frame) for frame in frames_payload]
+         return cls(
+             reference_id=payload[REFERENCE_ID_KEY],
+             frames=frames,
+             metadata=payload.get(METADATA_KEY, None),
+         )
+
+     def to_payload(self) -> dict:
+         self.validate_frames_dict()
+         ordered_frames = self.get_frames()
+         frames_payload = [frame.to_payload() for frame in ordered_frames]
+         payload: Dict[str, Any] = {
+             REFERENCE_ID_KEY: self.reference_id,
+             FRAMES_KEY: frames_payload,
+         }
+         if self.metadata:
+             payload[METADATA_KEY] = self.metadata
+         return payload
+
+     def to_json(self) -> str:
+         return json.dumps(self.to_payload(), allow_nan=False)
+
+
+ @dataclass
+ class LidarScene(Scene):
+     def __repr__(self) -> str:
+         return f"LidarScene(reference_id='{self.reference_id}', frames={self.get_frames()}, metadata={self.metadata})"
+
+     def validate(self):
+         super().validate()
+         lidar_sensors = flatten(
+             [
+                 [
+                     sensor
+                     for sensor in frame.items.keys()
+                     if frame.items[sensor].type == DatasetItemType.POINTCLOUD
+                 ]
+                 for frame in self.frames_dict.values()
+             ]
+         )
+         assert (
+             len(set(lidar_sensors)) == 1
+         ), "Each lidar scene must have exactly one lidar sensor"
+
+         for frame in self.frames_dict.values():
+             num_pointclouds = sum(
+                 [
+                     int(item.type == DatasetItemType.POINTCLOUD)
+                     for item in frame.get_items()
+                 ]
+             )
+             assert (
+                 num_pointclouds == 1
+             ), "Each frame of a lidar scene must have exactly 1 pointcloud"
+
+
+ def flatten(t):
+     return [item for sublist in t for item in sublist]
+
+
+ def check_all_scene_paths_remote(scenes: List[LidarScene]):
+     for scene in scenes:
+         for item in scene.get_items():
+             pointcloud_location = getattr(item, POINTCLOUD_LOCATION_KEY)
+             if pointcloud_location and is_local_path(pointcloud_location):
+                 raise ValueError(
+                     f"All paths for DatasetItems in a Scene must be remote, but {item.pointcloud_location} is either "
+                     "local, or a remote URL type that is not supported."
+                 )
+             image_location = getattr(item, IMAGE_LOCATION_KEY)
+             if image_location and is_local_path(image_location):
+                 raise ValueError(
+                     f"All paths for DatasetItems in a Scene must be remote, but {item.image_location} is either "
+                     "local, or a remote URL type that is not supported."
+                 )
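
For orientation, here is a minimal sketch of the new scene API. It assumes `DatasetItem` accepts `pointcloud_location`, `image_location`, and `reference_id` keyword arguments, as their use in this module suggests; every id and URL below is a placeholder.

```python
# Hypothetical usage sketch of Frame/LidarScene; ids and URLs are placeholders.
from nucleus.dataset_item import DatasetItem
from nucleus.scene import Frame, LidarScene

# One frame holding two sensors: a lidar pointcloud and a camera image.
frame_0 = Frame(
    lidar=DatasetItem(
        pointcloud_location="s3://bucket/scene-1/frame-0.json",
        reference_id="scene-1-frame-0-lidar",
    ),
    camera=DatasetItem(
        image_location="s3://bucket/scene-1/frame-0.jpg",
        reference_id="scene-1-frame-0-camera",
    ),
)

scene = LidarScene(reference_id="scene-1", frames=[frame_0])
scene.validate()              # exactly one pointcloud sensor per frame
payload = scene.to_payload()  # {"reference_id": ..., "frames": [...]}
```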
nucleus/slice.py CHANGED
@@ -6,7 +6,9 @@ from nucleus.annotation import Annotation
  from nucleus.dataset_item import DatasetItem
  from nucleus.job import AsyncJob
  from nucleus.utils import convert_export_payload, format_dataset_item_response
- from nucleus.constants import EXPORTED_ROWS
+ from nucleus.constants import (
+     EXPORTED_ROWS,
+ )


  class Slice:
@@ -52,7 +54,6 @@ class Slice:

      def append(
          self,
-         dataset_item_ids: List[str] = None,
          reference_ids: List[str] = None,
      ) -> dict:
          """
@@ -61,7 +62,6 @@ class Slice:
          as a means of identifying items in the dataset.

          :param
-         dataset_item_ids: List[str],
          reference_ids: List[str],

          :return:
@@ -71,7 +71,6 @@ class Slice:
          """
          response = self._client.append_to_slice(
              slice_id=self.slice_id,
-             dataset_item_ids=dataset_item_ids,
              reference_ids=reference_ids,
          )
          return response
@@ -122,12 +121,30 @@ class Slice:
          response = self._client.make_request(
              {}, f"slice/{self.slice_id}/{project_id}/send_to_labeling"
          )
-         return AsyncJob(response["job_id"], self._client)
+         return AsyncJob.from_json(response, self._client)
+
+     def export_embeddings(
+         self,
+     ) -> List[Dict[str, Union[str, List[float]]]]:
+         """Returns a pd.Dataframe-ready format of dataset embeddings.
+
+         Returns:
+             A list, where each item is a dict with two keys representing a row
+             in the dataset.
+             * One value in the dict is the reference id
+             * The other value is a list of the embedding values
+         """
+         api_payload = self._client.make_request(
+             payload=None,
+             route=f"slice/{self.slice_id}/embeddings",
+             requests_command=requests.get,
+         )
+         return api_payload


  def check_annotations_are_in_slice(
      annotations: List[Annotation], slice_to_check: Slice
- ) -> Tuple[bool, Set[str], Set[str]]:
+ ) -> Tuple[bool, Set[str]]:
      """Check membership of the annotation targets within this slice.

      annotations: Annnotations with ids referring to targets.
@@ -142,13 +159,6 @@ def check_annotations_are_in_slice(
      """
      info = slice_to_check.info()

-     item_ids_not_found_in_slice = {
-         annotation.item_id
-         for annotation in annotations
-         if annotation.item_id is not None
-     }.difference(
-         {item_metadata["id"] for item_metadata in info["dataset_items"]}
-     )
      reference_ids_not_found_in_slice = {
          annotation.reference_id
          for annotation in annotations
@@ -156,13 +166,12 @@ def check_annotations_are_in_slice(
      }.difference(
          {item_metadata["ref_id"] for item_metadata in info["dataset_items"]}
      )
-     if item_ids_not_found_in_slice or reference_ids_not_found_in_slice:
+     if reference_ids_not_found_in_slice:
          annotations_are_in_slice = False
      else:
          annotations_are_in_slice = True

      return (
          annotations_are_in_slice,
-         item_ids_not_found_in_slice,
          reference_ids_not_found_in_slice,
      )
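
The new `export_embeddings` pairs naturally with pandas, as its docstring notes. A hedged sketch, assuming a client and slice id (the docstring fixes the shape of each row, one reference-id key plus one embedding list, but not the exact key names):

```python
# Sketch only: the API key and slice id are placeholders, and the resulting
# column names depend on the server response.
import pandas as pd
from nucleus import NucleusClient

client = NucleusClient("YOUR_API_KEY")
slc = client.get_slice("slc_...")   # placeholder slice id

rows = slc.export_embeddings()      # one dict per dataset item in the slice
df = pd.DataFrame(rows)
print(df.columns)
```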
nucleus/url_utils.py ADDED
@@ -0,0 +1,22 @@
+ import urllib.request
+
+
+ def sanitize_field(field):
+     return urllib.request.quote(field.encode("UTF-8"), safe="")
+
+
+ def sanitize_string_args(function):
+     def sanitized_function(*args, **kwargs):
+         sanitized_args = []
+         sanitized_kwargs = {}
+         for arg in args:
+             if isinstance(arg, str):
+                 arg = sanitize_field(arg)
+             sanitized_args.append(arg)
+         for key, value in kwargs.items():
+             if isinstance(value, str):
+                 value = sanitize_field(value)
+             sanitized_kwargs[key] = value
+         return function(*sanitized_args, **sanitized_kwargs)
+
+     return sanitized_function
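
`sanitize_string_args` percent-encodes every string argument (with `safe=""`) before the wrapped function runs, so characters like `/` in reference ids cannot break apart URL routes. A small illustration; the route builder below is invented:

```python
# Illustration of sanitize_string_args; build_route and its format are invented.
from nucleus.url_utils import sanitize_string_args

@sanitize_string_args
def build_route(dataset_id, reference_id):
    return f"dataset/{dataset_id}/refloc/{reference_id}"

# The "/" inside the reference id is escaped, keeping the route segments intact:
print(build_route("ds_123", "img/01.jpg"))  # dataset/ds_123/refloc/img%2F01.jpg
```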
nucleus/utils.py CHANGED
@@ -3,6 +3,7 @@
  from collections import defaultdict
  import io
  import uuid
+ import json
  from typing import IO, Dict, List, Sequence, Union

  import requests
@@ -11,7 +12,9 @@ from requests.models import HTTPError
  from nucleus.annotation import (
      Annotation,
      BoxAnnotation,
+     CuboidAnnotation,
      PolygonAnnotation,
+     CategoryAnnotation,
      SegmentationAnnotation,
  )

@@ -19,13 +22,16 @@ from .constants import (
      ANNOTATION_TYPES,
      ANNOTATIONS_KEY,
      BOX_TYPE,
+     CUBOID_TYPE,
+     CATEGORY_TYPE,
      ITEM_KEY,
      POLYGON_TYPE,
      REFERENCE_ID_KEY,
      SEGMENTATION_TYPE,
  )
  from .dataset_item import DatasetItem
- from .prediction import BoxPrediction, PolygonPrediction
+ from .prediction import BoxPrediction, CuboidPrediction, PolygonPrediction
+ from .scene import LidarScene


  def _get_all_field_values(metadata_list: List[dict], key: str):
@@ -34,7 +40,10 @@ def _get_all_field_values(metadata_list: List[dict], key: str):

  def suggest_metadata_schema(
      data: Union[
-         List[DatasetItem], List[BoxPrediction], List[PolygonPrediction]
+         List[DatasetItem],
+         List[BoxPrediction],
+         List[PolygonPrediction],
+         List[CuboidPrediction],
      ]
  ):
      metadata_list: List[dict] = [
@@ -106,17 +115,29 @@ def convert_export_payload(api_payload):
          for box in row[BOX_TYPE]:
              box[REFERENCE_ID_KEY] = row[ITEM_KEY][REFERENCE_ID_KEY]
              annotations[BOX_TYPE].append(BoxAnnotation.from_json(box))
+         for cuboid in row[CUBOID_TYPE]:
+             cuboid[REFERENCE_ID_KEY] = row[ITEM_KEY][REFERENCE_ID_KEY]
+             annotations[CUBOID_TYPE].append(CuboidAnnotation.from_json(cuboid))
+         for category in row[CATEGORY_TYPE]:
+             category[REFERENCE_ID_KEY] = row[ITEM_KEY][REFERENCE_ID_KEY]
+             annotations[CATEGORY_TYPE].append(
+                 CategoryAnnotation.from_json(category)
+             )
          return_payload_row[ANNOTATIONS_KEY] = annotations
          return_payload.append(return_payload_row)
      return return_payload


  def serialize_and_write(
-     upload_units: Sequence[Union[DatasetItem, Annotation]], file_pointer
+     upload_units: Sequence[Union[DatasetItem, Annotation, LidarScene]],
+     file_pointer,
  ):
      for unit in upload_units:
          try:
-             file_pointer.write(unit.to_json() + "\n")
+             if isinstance(unit, (DatasetItem, Annotation, LidarScene)):
+                 file_pointer.write(unit.to_json() + "\n")
+             else:
+                 file_pointer.write(json.dumps(unit) + "\n")
          except TypeError as e:
              type_name = type(unit).__name__
              message = (
@@ -143,7 +164,7 @@ def upload_to_presigned_url(presigned_url: str, file_pointer: IO):


  def serialize_and_write_to_presigned_url(
-     upload_units: Sequence[Union["DatasetItem", Annotation]],
+     upload_units: Sequence[Union[DatasetItem, Annotation, LidarScene]],
      dataset_id: str,
      client,
  ):
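
With the widened signature, `serialize_and_write` keeps using each unit's own `to_json` for `DatasetItem`, `Annotation`, and `LidarScene`, and falls back to `json.dumps` for anything else. A short sketch of the fallback branch:

```python
# Sketch of the json.dumps fallback: a plain dict is not a DatasetItem,
# Annotation, or LidarScene, so it is serialized directly, one JSON per line.
import io
from nucleus.utils import serialize_and_write

buf = io.StringIO()
serialize_and_write([{"url": "s3://bucket/frame-0.json"}], buf)
print(buf.getvalue())  # {"url": "s3://bucket/frame-0.json"}
```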
scale_nucleus-0.1.24.dist-info/METADATA ADDED
@@ -0,0 +1,85 @@
+ Metadata-Version: 2.1
+ Name: scale-nucleus
+ Version: 0.1.24
+ Summary: The official Python client library for Nucleus, the Data Platform for AI
+ Home-page: https://scale.com/nucleus
+ License: MIT
+ Author: Scale AI Nucleus Team
+ Author-email: nucleusapi@scaleapi.com
+ Requires-Python: >=3.6.2,<4.0.0
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Requires-Dist: aiohttp (>=3.7.4,<4.0.0)
+ Requires-Dist: dataclasses (>=0.7,<0.8); python_full_version >= "3.6.1" and python_version < "3.7"
+ Requires-Dist: nest-asyncio (>=1.5.1,<2.0.0)
+ Requires-Dist: requests (>=2.23.0,<3.0.0)
+ Requires-Dist: tqdm (>=4.41.0,<5.0.0)
+ Project-URL: Documentation, https://dashboard.scale.com/nucleus/docs/api
+ Project-URL: Repository, https://github.com/scaleapi/nucleus-python-client
+ Description-Content-Type: text/markdown
+
+ # Nucleus
+
+ https://dashboard.scale.com/nucleus
+
+ Aggregate metrics in ML are not good enough. To improve production ML, you need to understand your models' qualitative failure modes, fix them by gathering more data, and curate diverse scenarios.
+
+ Scale Nucleus helps you:
+
+ - Visualize your data
+ - Curate interesting slices within your dataset
+ - Review and manage annotations
+ - Measure and debug your model performance
+
+ Nucleus is a new way—the right way—to develop ML models, helping us move away from the concept of one dataset and towards a paradigm of collections of scenarios.
+
+ ## Installation
+
+ `$ pip install scale-nucleus`
+
+ ## Common issues/FAQ
+
+ ### Outdated Client
+
+ Nucleus is iterating rapidly, and as a result we do not always perfectly preserve backwards compatibility with older versions of the client. If you run into any unexpected errors, it's a good idea to upgrade your version of the client by running
+ ```
+ pip install --upgrade scale-nucleus
+ ```
+
+ ## Usage
+
+ For the most up-to-date documentation, reference: https://dashboard.scale.com/nucleus/docs/api?language=python.
+
+ ## For Developers
+
+ Clone from GitHub and install as editable:
+
+ ```
+ git clone git@github.com:scaleapi/nucleus-python-client.git
+ cd nucleus-python-client
+ pip3 install poetry
+ poetry install
+ ```
+
+ Please install the pre-commit hooks by running the following command:
+
+ ```
+ poetry run pre-commit install
+ ```
+
+ **Best practices for testing:**
+
+ (1) Run pytest from the root directory of the repo, i.e.
+
+ ```
+ poetry run pytest tests/test_dataset.py
+ ```
+
+ (2) To skip slow integration tests that have to wait for an async job to start, run
+
+ ```
+ poetry run pytest -m "not integration"
+ ```
scale_nucleus-0.1.24.dist-info/RECORD ADDED
@@ -0,0 +1,21 @@
+ nucleus/__init__.py,sha256=105pVyWKhc34vRxhXTFbL9APvyH9Ka6FWOMOCElFsp8,40780
+ nucleus/annotation.py,sha256=tjkO_DCJIXQTTMI9gkWXe9W3lveyFsIQjlsM5jfyFyw,10007
+ nucleus/autocurate.py,sha256=ogEX3kbuKCciWODOnTjUHU-JSwhQ_34wbNvW4xA79oY,854
+ nucleus/constants.py,sha256=86tEkPqITYgd3SB_OWcG5LDcuAUGuc78kBtS5WOqo64,3026
+ nucleus/dataset.py,sha256=0amQbRnC3JbcDz_coJNvQsZsmfp41EYiqbXEtVh_m00,18290
+ nucleus/dataset_item.py,sha256=lKMMwNH9Iz5jxf1beIJSWrcD1UYNXbMbnPwenVW1He0,5781
+ nucleus/errors.py,sha256=quBOj9Dwi8NrC6SIqSI6DLv-fT49e315OSLirSiF4kQ,2338
+ nucleus/job.py,sha256=N2Ei3zJflcUyiZBavJOph3eLvckLANMrL7SwYzLUYAA,2301
+ nucleus/model.py,sha256=akuWKehw6u5fp-FfBuI2RobkSoceNN-huh9_G3rxWPo,2147
+ nucleus/model_run.py,sha256=-m_YzEqv253foD_ZQAIvD66CuDipvtKedzq9Pk0IBs4,7983
+ nucleus/payload_constructor.py,sha256=UN9J0NEL6gJqh-EAvwEc51eXJSTaK9ZMH1p0FDgMDsI,3567
+ nucleus/prediction.py,sha256=WJu5echvJKBjL67lQ6U9jM_LlbXvA1SPhUHyzdTeVpE,6276
+ nucleus/scene.py,sha256=w8mNU5Pt7U-jn9WQCL4Ch7AaZ2RHVPW8nTtIhlqTx0k,7803
+ nucleus/slice.py,sha256=zVLF6YyxU0ShJTERGTydcm1XiEx1yaVfJ1coq4H5KrI,5737
+ nucleus/upload_response.py,sha256=pwOb3iS6TbpoumC1Mao6Pyli7dXBRDcI0zjNfCMU4_c,2729
+ nucleus/url_utils.py,sha256=6iODEEVAa061-ROkqYM_Zhc4RbPHqOSYMczqYGVv4y0,660
+ nucleus/utils.py,sha256=WDBx8tw5MEFA1afS9Z0difBi6SQCk56SJX-hfDkBq5k,6194
+ scale_nucleus-0.1.24.dist-info/LICENSE,sha256=jaTGyQSQIZeWMo5iyYqgbAYHR9Bdy7nOzgE-Up3m_-g,1075
+ scale_nucleus-0.1.24.dist-info/WHEEL,sha256=DRf8A_Psd1SF2kVqTQOOFU1Xzl3-A2qljAxBMTOusUs,83
+ scale_nucleus-0.1.24.dist-info/METADATA,sha256=sxWeNc6pC9LBbOll4dfwRyqymKYOljHVHy8LslAoZvM,2656
+ scale_nucleus-0.1.24.dist-info/RECORD,,
scale_nucleus-0.1.24.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry 1.0.3
+ Generator: poetry 1.0.6
  Root-Is-Purelib: true
  Tag: py3-none-any