maite-datasets 0.0.4.tar.gz → 0.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/PKG-INFO +1 -1
  2. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_builder.py +5 -4
  3. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_reader/_base.py +2 -2
  4. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_reader/_coco.py +45 -41
  5. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_reader/_yolo.py +46 -43
  6. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/.gitignore +0 -0
  7. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/LICENSE +0 -0
  8. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/README.md +0 -0
  9. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/pyproject.toml +0 -0
  10. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/__init__.py +0 -0
  11. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_base.py +0 -0
  12. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_collate.py +0 -0
  13. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_fileio.py +0 -0
  14. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_mixin/__init__.py +0 -0
  15. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_mixin/_numpy.py +0 -0
  16. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_mixin/_torch.py +0 -0
  17. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_protocols.py +0 -0
  18. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_reader/__init__.py +0 -0
  19. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_reader/_factory.py +0 -0
  20. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_types.py +0 -0
  21. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/_validate.py +0 -0
  22. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/image_classification/__init__.py +0 -0
  23. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/image_classification/_cifar10.py +0 -0
  24. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/image_classification/_mnist.py +0 -0
  25. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/image_classification/_ships.py +0 -0
  26. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/object_detection/__init__.py +0 -0
  27. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/object_detection/_antiuav.py +0 -0
  28. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/object_detection/_milco.py +0 -0
  29. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/object_detection/_seadrone.py +0 -0
  30. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/object_detection/_voc.py +0 -0
  31. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/object_detection/_voc_torch.py +0 -0
  32. {maite_datasets-0.0.4 → maite_datasets-0.0.5}/src/maite_datasets/py.typed +0 -0
--- maite_datasets-0.0.4/PKG-INFO
+++ maite_datasets-0.0.5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: maite-datasets
-Version: 0.0.4
+Version: 0.0.5
 Summary: A collection of Image Classification and Object Detection task datasets conforming to the MAITE protocol.
 Author-email: Andrew Weng <andrew.weng@ariacoustics.com>, Ryan Wood <ryan.wood@ariacoustics.com>, Shaun Jullens <shaun.jullens@ariacoustics.com>
 License-Expression: MIT
--- maite_datasets-0.0.4/src/maite_datasets/_builder.py
+++ maite_datasets-0.0.5/src/maite_datasets/_builder.py
@@ -22,11 +22,12 @@ from maite_datasets._protocols import (
     DatasetMetadata,
     ImageClassificationDataset,
     ObjectDetectionDataset,
+    DatumMetadata,
 )
 
 
-def _ensure_id(index: int, metadata: dict[str, Any]) -> dict[str, Any]:
-    return {"id": index, **metadata} if "id" not in metadata else metadata
+def _ensure_id(index: int, metadata: dict[str, Any]) -> DatumMetadata:
+    return DatumMetadata(**({"id": index, **metadata} if "id" not in metadata else metadata))
 
 
 def _validate_data(
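The `_ensure_id` change tightens the return type: per-datum metadata now comes back as a MAITE `DatumMetadata` instead of a bare dict, with the positional index injected as the `id` only when the caller did not supply one. A minimal runnable sketch of that behavior, using a stand-in TypedDict for the protocol type:

from typing import Any, TypedDict

class DatumMetadata(TypedDict, total=False):
    # Stand-in for maite_datasets._protocols.DatumMetadata.
    id: int | str

def _ensure_id(index: int, metadata: dict[str, Any]) -> DatumMetadata:
    # Inject the index as "id" only when the caller has not set one.
    return DatumMetadata(**({"id": index, **metadata} if "id" not in metadata else metadata))

assert _ensure_id(7, {"source": "cam0"}) == {"id": 7, "source": "cam0"}
assert _ensure_id(7, {"id": "img_001"}) == {"id": "img_001"}

Since a TypedDict is a plain dict at runtime, the construction succeeds even when `metadata` carries keys the protocol does not declare.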
@@ -141,7 +142,7 @@ class CustomImageClassificationDataset(BaseAnnotatedDataset[Sequence[int]], Imag
         self.__class__.__name__ = name
         self.__class__.__qualname__ = name
 
-    def __getitem__(self, idx: int, /) -> tuple[Array, Array, dict[str, Any]]:
+    def __getitem__(self, idx: int, /) -> tuple[Array, Array, DatumMetadata]:
         one_hot = [0.0] * len(self._index2label)
         one_hot[self._labels[idx]] = 1.0
         return (
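For context, the surrounding method builds a one-hot target from the stored integer label before returning the datum triple; the construction is equivalent to this standalone sketch (names are illustrative):

import numpy as np

def one_hot_target(label: int, num_classes: int) -> np.ndarray:
    # label=2, num_classes=4 -> array([0., 0., 1., 0.], dtype=float32)
    target = np.zeros(num_classes, dtype=np.float32)
    target[label] = 1.0
    return target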
@@ -206,7 +207,7 @@ class CustomObjectDetectionDataset(BaseAnnotatedDataset[Sequence[Sequence[int]]]
     def metadata(self) -> DatasetMetadata:
         return DatasetMetadata(id=self._id, index2label=self._index2label)
 
-    def __getitem__(self, idx: int, /) -> tuple[Array, ObjectDetectionTarget, dict[str, Any]]:
+    def __getitem__(self, idx: int, /) -> tuple[Array, ObjectDetectionTarget, DatumMetadata]:
         return (
             self._images[idx],
             self.ObjectDetectionTarget(self._labels[idx], self._bboxes[idx], len(self._classes)),
--- maite_datasets-0.0.4/src/maite_datasets/_reader/_base.py
+++ maite_datasets-0.0.5/src/maite_datasets/_reader/_base.py
@@ -49,8 +49,8 @@ class BaseDatasetReader(ABC):
     """
 
     def __init__(self, dataset_path: str | Path, dataset_id: str | None = None) -> None:
-        self.dataset_path = Path(dataset_path)
-        self._dataset_id = dataset_id or self.dataset_path.name
+        self.dataset_path: Path = Path(dataset_path)
+        self.dataset_id: str = dataset_id or self.dataset_path.name
 
         # Basic path validation
         if not self.dataset_path.exists():
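This rename promotes the dataset identifier to the public API (`dataset_id` instead of `_dataset_id`) and adds explicit annotations. Condensed from the diff, the attribute semantics are:

from pathlib import Path

class ReaderSketch:
    # Condensed from BaseDatasetReader.__init__ in this diff.
    def __init__(self, dataset_path: str | Path, dataset_id: str | None = None) -> None:
        self.dataset_path: Path = Path(dataset_path)
        self.dataset_id: str = dataset_id or self.dataset_path.name

assert ReaderSketch("data/my_dataset").dataset_id == "my_dataset"
assert ReaderSketch("data/my_dataset", "custom-id").dataset_id == "custom-id"

The rename matters downstream: the `_COCODataset` and `_YOLODataset` metadata properties later in this diff switch from `self.reader._dataset_id` to `self.reader.dataset_id` accordingly.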
--- maite_datasets-0.0.4/src/maite_datasets/_reader/_coco.py
+++ maite_datasets-0.0.5/src/maite_datasets/_reader/_coco.py
@@ -107,23 +107,23 @@ class COCODatasetReader(BaseDatasetReader):
         classes_file: str | None = "classes.txt",
         dataset_id: str | None = None,
     ) -> None:
-        self.annotation_file = annotation_file
-        self.images_dir = images_dir
-        self.classes_file = classes_file
+        self._annotation_file = annotation_file
+        self._images_dir = images_dir
+        self._classes_file = classes_file
 
         # Initialize base class
         super().__init__(dataset_path, dataset_id)
 
     def _initialize_format_specific(self) -> None:
         """Initialize COCO-specific components."""
-        self.images_path = self.dataset_path / self.images_dir
-        self.annotation_path = self.dataset_path / self.annotation_file
-        self.classes_path = self.dataset_path / self.classes_file if self.classes_file else None
+        self._images_path = self.dataset_path / self._images_dir
+        self._annotation_path = self.dataset_path / self._annotation_file
+        self._classes_path = self.dataset_path / self._classes_file if self._classes_file else None
 
-        if not self.annotation_path.exists():
-            raise FileNotFoundError(f"Annotation file not found: {self.annotation_path}")
-        if not self.images_path.exists():
-            raise FileNotFoundError(f"Images directory not found: {self.images_path}")
+        if not self._annotation_path.exists():
+            raise FileNotFoundError(f"Annotation file not found: {self._annotation_path}")
+        if not self._images_path.exists():
+            raise FileNotFoundError(f"Images directory not found: {self._images_path}")
 
         self._load_annotations()
 
@@ -141,62 +141,62 @@ class COCODatasetReader(BaseDatasetReader):
         issues = []
         stats = {}
 
-        annotation_path = self.dataset_path / self.annotation_file
+        annotation_path = self.dataset_path / self._annotation_file
         if not annotation_path.exists():
-            issues.append(f"Missing {self.annotation_file} file")
+            issues.append(f"Missing {self._annotation_file} file")
             return issues, stats
 
         try:
             with open(annotation_path) as f:
                 coco_data = json.load(f)
         except json.JSONDecodeError as e:
-            issues.append(f"Invalid JSON in {self.annotation_file}: {e}")
+            issues.append(f"Invalid JSON in {self._annotation_file}: {e}")
             return issues, stats
 
         # Check required keys
         required_keys = ["images", "annotations", "categories"]
         for key in required_keys:
             if key not in coco_data:
-                issues.append(f"Missing required key '{key}' in {self.annotation_file}")
+                issues.append(f"Missing required key '{key}' in {self._annotation_file}")
             else:
                 stats[f"num_{key}"] = len(coco_data[key])
 
         # Check optional classes.txt
-        if self.classes_file:
-            classes_path = self.dataset_path / self.classes_file
+        if self._classes_file:
+            classes_path = self.dataset_path / self._classes_file
             if classes_path.exists():
                 try:
                     with open(classes_path) as f:
                         class_lines = [line.strip() for line in f if line.strip()]
                     stats["num_class_names"] = len(class_lines)
                 except Exception as e:
-                    issues.append(f"Error reading {self.classes_file}: {e}")
+                    issues.append(f"Error reading {self._classes_file}: {e}")
 
         return issues, stats
 
     def _load_annotations(self) -> None:
         """Load and parse COCO annotations."""
-        with open(self.annotation_path) as f:
-            self.coco_data = json.load(f)
+        with open(self._annotation_path) as f:
+            self._coco_data = json.load(f)
 
         # Build mappings
-        self.image_id_to_info = {img["id"]: img for img in self.coco_data["images"]}
-        self.category_id_to_idx = {cat["id"]: idx for idx, cat in enumerate(self.coco_data["categories"])}
+        self._image_id_to_info = {img["id"]: img for img in self._coco_data["images"]}
+        self._category_id_to_idx = {cat["id"]: idx for idx, cat in enumerate(self._coco_data["categories"])}
 
         # Group annotations by image
         self.image_id_to_annotations: dict[int, list[dict[str, Any]]] = {}
-        for ann in self.coco_data["annotations"]:
+        for ann in self._coco_data["annotations"]:
            img_id = ann["image_id"]
            if img_id not in self.image_id_to_annotations:
                self.image_id_to_annotations[img_id] = []
            self.image_id_to_annotations[img_id].append(ann)
 
         # Load class names
-        if self.classes_path and self.classes_path.exists():
-            with open(self.classes_path) as f:
+        if self._classes_path and self._classes_path.exists():
+            with open(self._classes_path) as f:
                 class_names = [line.strip() for line in f if line.strip()]
         else:
-            class_names = [cat["name"] for cat in self.coco_data["categories"]]
+            class_names = [cat["name"] for cat in self._coco_data["categories"]]
 
         self._index2label = {idx: name for idx, name in enumerate(class_names)}
 
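The `_category_id_to_idx` mapping exists because COCO category ids need not be contiguous or zero-based, while model targets want dense class indices. With made-up categories:

categories = [{"id": 1, "name": "person"}, {"id": 3, "name": "car"}, {"id": 7, "name": "dog"}]

category_id_to_idx = {cat["id"]: idx for idx, cat in enumerate(categories)}
index2label = {idx: cat["name"] for idx, cat in enumerate(categories)}

assert category_id_to_idx == {1: 0, 3: 1, 7: 2}   # sparse COCO ids -> dense indices
assert index2label == {0: "person", 1: "car", 2: "dog"}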
@@ -206,12 +206,12 @@ class _COCODataset:
 
     def __init__(self, reader: COCODatasetReader) -> None:
         self.reader = reader
-        self.image_ids = list(reader.image_id_to_info.keys())
+        self.image_ids = list(reader._image_id_to_info.keys())
 
     @property
     def metadata(self) -> DatasetMetadata:
         return DatasetMetadata(
-            id=self.reader._dataset_id,
+            id=self.reader.dataset_id,
             index2label=self.reader.index2label,
         )
 
@@ -220,10 +220,10 @@ class _COCODataset:
 
     def __getitem__(self, index: int) -> ObjectDetectionDatum:
         image_id = self.image_ids[index]
-        image_info = self.reader.image_id_to_info[image_id]
+        image_info = self.reader._image_id_to_info[image_id]
 
         # Load image
-        image_path = self.reader.images_path / image_info["file_name"]
+        image_path = self.reader._images_path / image_info["file_name"]
         image = np.array(Image.open(image_path).convert("RGB"))
         image = np.transpose(image, (2, 0, 1))  # Convert to CHW format
 
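The transpose converts PIL's height-width-channel array layout into the channel-first layout MAITE expects; a standalone check:

import numpy as np
from PIL import Image

img = Image.new("RGB", (640, 480))   # PIL size is (width, height)
arr = np.array(img)                  # shape (480, 640, 3): HWC
chw = np.transpose(arr, (2, 0, 1))   # shape (3, 480, 640): CHW
assert chw.shape == (3, 480, 640)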
@@ -241,7 +241,7 @@ class _COCODataset:
             boxes.append([x, y, x + w, y + h])
 
             # Map category_id to class index
-            cat_idx = self.reader.category_id_to_idx[ann["category_id"]]
+            cat_idx = self.reader._category_id_to_idx[ann["category_id"]]
             labels.append(cat_idx)
 
             # Collect annotation metadata
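The `[x, y, x + w, y + h]` line reflects that COCO stores boxes as a top-left corner plus width and height, while the targets here use corner coordinates:

# COCO bbox: [x_top_left, y_top_left, width, height]
x, y, w, h = 10.0, 20.0, 30.0, 40.0
xyxy = [x, y, x + w, y + h]
assert xyxy == [10.0, 20.0, 40.0, 60.0]  # [x1, y1, x2, y2]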
@@ -271,17 +271,21 @@ class _COCODataset:
 
         # Create comprehensive datum metadata
         datum_metadata = DatumMetadata(
-            id=f"{self.reader._dataset_id}_{image_id}",
-            # Image-level metadata
-            coco_image_id=image_id,
-            file_name=image_info["file_name"],
-            width=image_info["width"],
-            height=image_info["height"],
-            # Optional COCO image fields
-            **{key: value for key, value in image_info.items() if key not in ["id", "file_name", "width", "height"]},
-            # Annotation metadata
-            annotations=annotation_metadata,
-            num_annotations=len(annotations),
+            **{
+                "id": f"{self.reader.dataset_id}_{image_id}",
+                # Image-level metadata
+                "coco_image_id": image_id,
+                "file_name": image_info["file_name"],
+                "width": image_info["width"],
+                "height": image_info["height"],
+                # Optional COCO image fields
+                **{
+                    key: value for key, value in image_info.items() if key not in ["id", "file_name", "width", "height"]
+                },
+                # Annotation metadata
+                "annotations": annotation_metadata,
+                "num_annotations": len(annotations),
+            }
         )
 
         return image, target, datum_metadata
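Both readers now build `DatumMetadata` by unpacking a single dict literal instead of passing literal keyword arguments. A plausible motivation: `DatumMetadata` is a TypedDict, and static type checkers reject undeclared keys (such as `coco_image_id`) written as literal kwargs, whereas a `**`-unpacked `dict[str, Any]` passes through; at runtime a TypedDict is a plain dict either way. A minimal sketch with a stand-in type:

from typing import Any, TypedDict

class Meta(TypedDict, total=False):
    # Stand-in for DatumMetadata; only "id" is declared.
    id: str

image_info: dict[str, Any] = {"license": 2, "flickr_url": ""}  # hypothetical COCO extras

meta = Meta(
    **{
        "id": "dataset_42",
        "coco_image_id": 42,  # undeclared key, accepted at runtime
        **image_info,
    }
)
assert meta["coco_image_id"] == 42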
--- maite_datasets-0.0.4/src/maite_datasets/_reader/_yolo.py
+++ maite_datasets-0.0.5/src/maite_datasets/_reader/_yolo.py
@@ -88,29 +88,29 @@ class YOLODatasetReader(BaseDatasetReader):
         dataset_id: str | None = None,
         image_extensions: list[str] | None = None,
     ) -> None:
-        self.images_dir = images_dir
-        self.labels_dir = labels_dir
-        self.classes_file = classes_file
+        self._images_dir = images_dir
+        self._labels_dir = labels_dir
+        self._classes_file = classes_file
 
         if image_extensions is None:
             image_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
-        self.image_extensions = [ext.lower() for ext in image_extensions]
+        self._image_extensions = [ext.lower() for ext in image_extensions]
 
         # Initialize base class
         super().__init__(dataset_path, dataset_id)
 
     def _initialize_format_specific(self) -> None:
         """Initialize YOLO-specific components."""
-        self.images_path = self.dataset_path / self.images_dir
-        self.labels_path = self.dataset_path / self.labels_dir
-        self.classes_path = self.dataset_path / self.classes_file
+        self._images_path = self.dataset_path / self._images_dir
+        self._labels_path = self.dataset_path / self._labels_dir
+        self._classes_path = self.dataset_path / self._classes_file
 
-        if not self.images_path.exists():
-            raise FileNotFoundError(f"Images directory not found: {self.images_path}")
-        if not self.labels_path.exists():
-            raise FileNotFoundError(f"Labels directory not found: {self.labels_path}")
-        if not self.classes_path.exists():
-            raise FileNotFoundError(f"Classes file not found: {self.classes_path}")
+        if not self._images_path.exists():
+            raise FileNotFoundError(f"Images directory not found: {self._images_path}")
+        if not self._labels_path.exists():
+            raise FileNotFoundError(f"Labels directory not found: {self._labels_path}")
+        if not self._classes_path.exists():
+            raise FileNotFoundError(f"Classes file not found: {self._classes_path}")
 
         self._load_class_names()
         self._find_image_files()
@@ -130,32 +130,32 @@ class YOLODatasetReader(BaseDatasetReader):
         stats = {}
 
         # Check labels directory
-        labels_path = self.dataset_path / self.labels_dir
+        labels_path = self.dataset_path / self._labels_dir
         if not labels_path.exists():
-            issues.append(f"Missing {self.labels_dir}/ directory")
+            issues.append(f"Missing {self._labels_dir}/ directory")
         else:
             label_files = list(labels_path.glob("*.txt"))
             stats["num_label_files"] = len(label_files)
             if len(label_files) == 0:
-                issues.append(f"No label files found in {self.labels_dir}/ directory")
+                issues.append(f"No label files found in {self._labels_dir}/ directory")
             else:
                 # Validate label file format (sample check)
                 label_issues = self._validate_yolo_label_format(labels_path)
                 issues.extend(label_issues)
 
         # Check required classes.txt
-        classes_path = self.dataset_path / self.classes_file
+        classes_path = self.dataset_path / self._classes_file
         if not classes_path.exists():
-            issues.append(f"Missing required {self.classes_file} file")
+            issues.append(f"Missing required {self._classes_file} file")
         else:
             try:
                 with open(classes_path) as f:
                     class_lines = [line.strip() for line in f if line.strip()]
                 stats["num_classes"] = len(class_lines)
                 if len(class_lines) == 0:
-                    issues.append(f"{self.classes_file} is empty")
+                    issues.append(f"{self._classes_file} is empty")
             except Exception as e:
-                issues.append(f"Error reading {self.classes_file}: {e}")
+                issues.append(f"Error reading {self._classes_file}: {e}")
 
         return issues, stats
 
@@ -167,6 +167,7 @@ class YOLODatasetReader(BaseDatasetReader):
         if not label_files:
             return issues
 
+        label_files.sort()
         sample_label = label_files[0]
         try:
             with open(sample_label) as f:
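The one functional addition here, `label_files.sort()`, makes the sample check deterministic: `Path.glob` yields entries in arbitrary, filesystem-dependent order, so without sorting a different label file could be inspected on each run. Illustration:

from pathlib import Path

label_files = list(Path("labels").glob("*.txt"))  # order not guaranteed
label_files.sort()                                # stable lexicographic order
sample = label_files[0] if label_files else None  # reproducible across runs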
@@ -197,19 +198,19 @@ class YOLODatasetReader(BaseDatasetReader):
 
     def _load_class_names(self) -> None:
         """Load class names from classes file."""
-        with open(self.classes_path) as f:
+        with open(self._classes_path) as f:
             class_names = [line.strip() for line in f if line.strip()]
         self._index2label = {idx: name for idx, name in enumerate(class_names)}
 
     def _find_image_files(self) -> None:
         """Find all valid image files."""
-        self.image_files = []
-        for ext in self.image_extensions:
-            self.image_files.extend(self.images_path.glob(f"*{ext}"))
-        self.image_files.sort()
+        self._image_files = []
+        for ext in self._image_extensions:
+            self._image_files.extend(self._images_path.glob(f"*{ext}"))
+        self._image_files.sort()
 
-        if not self.image_files:
-            raise ValueError(f"No image files found in {self.images_path}")
+        if not self._image_files:
+            raise ValueError(f"No image files found in {self._images_path}")
 
 
 class _YOLODataset:
@@ -221,15 +222,15 @@ class _YOLODataset:
     @property
     def metadata(self) -> DatasetMetadata:
         return DatasetMetadata(
-            id=self.reader._dataset_id,
+            id=self.reader.dataset_id,
             index2label=self.reader.index2label,
         )
 
     def __len__(self) -> int:
-        return len(self.reader.image_files)
+        return len(self.reader._image_files)
 
     def __getitem__(self, index: int) -> ObjectDetectionDatum:
-        image_path = self.reader.image_files[index]
+        image_path = self.reader._image_files[index]
 
         # Load image
         image = np.array(Image.open(image_path).convert("RGB"))
@@ -237,7 +238,7 @@ class _YOLODataset:
         image = np.transpose(image, (2, 0, 1))  # Convert to CHW format
 
         # Load corresponding label file
-        label_path = self.reader.labels_path / f"{image_path.stem}.txt"
+        label_path = self.reader._labels_path / f"{image_path.stem}.txt"
 
         annotation_metadata = []
         if label_path.exists():
@@ -295,18 +296,20 @@ class _YOLODataset:
 
         # Create comprehensive datum metadata
         datum_metadata = DatumMetadata(
-            id=f"{self.reader._dataset_id}_{image_path.stem}",
-            # Image-level metadata
-            file_name=image_path.name,
-            file_path=str(image_path),
-            width=img_width,
-            height=img_height,
-            # Label file metadata
-            label_file=label_path.name if label_path.exists() else None,
-            label_file_exists=label_path.exists(),
-            # Annotation metadata
-            annotations=annotation_metadata,
-            num_annotations=len(annotation_metadata),
+            **{
+                "id": f"{self.reader.dataset_id}_{image_path.stem}",
+                # Image-level metadata
+                "file_name": image_path.name,
+                "file_path": str(image_path),
+                "width": img_width,
+                "height": img_height,
+                # Label file metadata
+                "label_file": label_path.name if label_path.exists() else None,
+                "label_file_exists": label_path.exists(),
+                # Annotation metadata
+                "annotations": annotation_metadata,
+                "num_annotations": len(annotation_metadata),
+            }
        )
 
         return image, target, datum_metadata
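The label parsing that produces `annotation_metadata` and the boxes is unchanged and therefore elided from this diff. For reference, YOLO label lines store `class cx cy w h` normalized to [0, 1]; converting one line to pixel corner coordinates looks roughly like this (a generic sketch, not this package's exact code):

def yolo_line_to_xyxy(line: str, img_width: int, img_height: int) -> tuple[int, list[float]]:
    # "2 0.5 0.5 0.25 0.5" -> class 2, box centered in the image
    cls, cx, cy, w, h = line.split()
    cx, cy, w, h = (float(v) for v in (cx, cy, w, h))
    x1 = (cx - w / 2) * img_width
    y1 = (cy - h / 2) * img_height
    x2 = (cx + w / 2) * img_width
    y2 = (cy + h / 2) * img_height
    return int(cls), [x1, y1, x2, y2]

assert yolo_line_to_xyxy("2 0.5 0.5 0.25 0.5", 640, 480) == (2, [240.0, 120.0, 400.0, 360.0])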