megadetector 10.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
@@ -0,0 +1,984 @@
1
+ """
2
+
3
+ yolo_to_coco.py
4
+
5
+ Converts a folder of YOLO-formatted annotation files to a COCO-formatted dataset.
6
+
7
+ """
8
+
9
+ #%% Imports and constants
10
+
11
+ import json
12
+ import os
13
+ import argparse
14
+ import sys
15
+
16
+ from multiprocessing.pool import ThreadPool
17
+ from multiprocessing.pool import Pool
18
+ from functools import partial
19
+
20
+ from tqdm import tqdm
21
+
22
+ from megadetector.utils.path_utils import find_images
23
+ from megadetector.utils.path_utils import recursive_file_list
24
+ from megadetector.utils.path_utils import find_image_strings
25
+ from megadetector.utils.ct_utils import invert_dictionary
26
+ from megadetector.utils.ct_utils import write_json
27
+ from megadetector.visualization.visualization_utils import open_image
28
+ from megadetector.data_management.yolo_output_to_md_output import \
29
+ read_classes_from_yolo_dataset_file
30
+
31
+
32
+ #%% Support functions
33
+
34
+ def _filename_to_image_id(fn):
35
+ """
36
+ Image IDs can't have spaces in them, replace spaces with underscores
37
+ """
38
+
39
+ return fn.replace(' ','_').replace('\\','/')
40
+
41
+
42
def _process_image(fn_abs,input_folder,category_id_to_name,label_folder):
    """
    Internal support function for processing one image's labels.

    Args:
        fn_abs (str): path to the image file
        input_folder (str): the image folder; [fn_abs] is made relative to this folder
            to produce the image's 'file_name' and 'id'
        category_id_to_name (dict): dict mapping integer category IDs to names; label
            lines that use a category ID that's not in this dict trigger an assertion
        label_folder (str): folder in which to look for label files, in which case the
            [input_folder] portion of [fn_abs] is replaced with [label_folder]; None if
            label files live next to the images

    Returns:
        tuple: a tuple (im, annotations), where [im] is a dict with keys 'file_name',
        'id', 'width', 'height', and 'error', and [annotations] is a list of annotation
        dicts (keys 'id', 'image_id', 'category_id', 'sequence_level_annotation', 'bbox')
        for this image. [annotations] is empty if the image can't be opened (in which case
        'width' and 'height' are -1 and 'error' is the exception string) or if there
        is no label file for this image.
    """

    # Create the image object for this image
    #
    # Always use forward slashes in image filenames and IDs
    image_fn_relative = os.path.relpath(fn_abs,input_folder).replace('\\','/')
    image_id = _filename_to_image_id(image_fn_relative)

    # Uniqueness of image IDs is verified in a separate loop in yolo_to_coco(), not here

    im = {}
    im['file_name'] = image_fn_relative
    im['id'] = image_id

    annotations_this_image = []

    try:
        pil_im = open_image(fn_abs)
        im_width, im_height = pil_im.size
        im['width'] = im_width
        im['height'] = im_height
        im['error'] = None
    except Exception as e:
        print('Warning: error reading {}:\n{}'.format(image_fn_relative,str(e)))
        im['width'] = -1
        im['height'] = -1
        im['error'] = str(e)
        return (im,annotations_this_image)

    # Is there an annotation file for this image?
    if label_folder is not None:
        assert input_folder in fn_abs, \
            'Annotation file {} is not inside folder {}'.format(
                fn_abs,input_folder)
        label_file_abs_base = fn_abs.replace(input_folder,label_folder)
    else:
        label_file_abs_base = fn_abs

    # Look for .txt first, then .TXT.  Both candidates are derived from the label file
    # location, so the upper-case fallback also works when labels are in [label_folder].
    label_file_abs_no_ext = os.path.splitext(label_file_abs_base)[0]
    annotation_file = label_file_abs_no_ext + '.txt'
    if not os.path.isfile(annotation_file):
        annotation_file = label_file_abs_no_ext + '.TXT'

    if os.path.isfile(annotation_file):

        with open(annotation_file,'r') as f:
            lines = [s.strip() for s in f.readlines()]

        # Only lines that produce an annotation consume an annotation number
        annotation_number = 0

        for s in lines:

            # Skip blank lines and comment lines; validate_label_file() treats
            # lines starting with '#' as comments, so we do the same here.
            if len(s) == 0 or s[0] == '#':
                continue

            tokens = s.split()
            assert len(tokens) == 5, \
                'Illegal line in annotation file {}:\n{}'.format(
                    annotation_file,s)
            category_id = int(tokens[0])
            assert category_id in category_id_to_name, \
                'Unrecognized category ID {} in annotation file {}'.format(
                    category_id,annotation_file)
            ann = {}
            ann['id'] = im['id'] + '_' + str(annotation_number)
            ann['image_id'] = im['id']
            ann['category_id'] = category_id
            ann['sequence_level_annotation'] = False

            # COCO: [x_min, y_min, width, height] in absolute coordinates
            # YOLO: [class, x_center, y_center, width, height] in normalized coordinates

            yolo_bbox = [float(x) for x in tokens[1:]]

            normalized_x_center = yolo_bbox[0]
            normalized_y_center = yolo_bbox[1]
            normalized_width = yolo_bbox[2]
            normalized_height = yolo_bbox[3]

            absolute_x_center = normalized_x_center * im_width
            absolute_y_center = normalized_y_center * im_height
            absolute_width = normalized_width * im_width
            absolute_height = normalized_height * im_height
            absolute_x_min = absolute_x_center - absolute_width / 2
            absolute_y_min = absolute_y_center - absolute_height / 2

            coco_bbox = [absolute_x_min, absolute_y_min, absolute_width, absolute_height]

            ann['bbox'] = coco_bbox
            annotation_number += 1

            annotations_this_image.append(ann)

        # ...for each annotation

    # ...if this image has annotations

    return (im,annotations_this_image)

# ...def _process_image(...)
151
+
152
+
153
def load_yolo_class_list(class_name_file):
    """
    Loads a dictionary mapping zero-indexed IDs to class names from the text/yaml file
    [class_name_file].

    Args:
        class_name_file (str or list): this can be:
            - a .yml or .yaml file in YOLO's dataset.yaml format
            - a .txt or .data file containing a flat list of class names, one per line;
              blank lines are allowed only at the end of the file
            - a list of class names

    Returns:
        dict: A dict mapping zero-indexed integer IDs to class names

    Raises:
        AssertionError: if the file extension is not recognized, or if a .txt/.data
            file is empty or starts with a blank line
        ValueError: if a .txt/.data file has a non-blank line after a blank line
    """

    # class_name_file can also be a list of class names
    if isinstance(class_name_file,list):
        return dict(enumerate(class_name_file))

    ext = os.path.splitext(class_name_file)[1][1:]
    assert ext in ('yml','txt','yaml','data'), \
        'Unrecognized class name file type {}'.format(
            class_name_file)

    if ext in ('yml','yaml'):
        return read_classes_from_yolo_dataset_file(class_name_file)

    # Otherwise this is a flat list of class names (.txt or .data)
    with open(class_name_file,'r') as f:
        lines = [s.strip() for s in f.readlines()]

    assert len(lines) > 0, \
        'Empty class name file {}'.format(class_name_file)
    assert len(lines[0]) > 0, \
        'Empty class name file {} (empty first line)'.format(class_name_file)

    # Blank lines are allowed at the end of the file; drop them so they don't turn
    # into (empty) category names.  The first line is non-blank, so at least one
    # line always survives this loop.
    while len(lines[-1]) == 0:
        lines.pop()

    # Any blank line that's left was followed by a non-blank line, which is not allowed
    if any(len(s) == 0 for s in lines):
        raise ValueError('Invalid class name file {}, non-blank line after the last blank line'.format(
            class_name_file))

    return dict(enumerate(lines))

# ...load_yolo_class_list(...)
214
+
215
+
216
def validate_label_file(label_file,category_id_to_name=None,verbose=False):
    """
    Verifies that [label_file] is a valid YOLO label file.  Does not check the extension.

    Blank lines and lines starting with '#' are ignored.  Every other line should have
    five whitespace-separated tokens: an integer category ID, followed by the normalized
    x center, y center, width, and height of a box.

    Args:
        label_file (str): the .txt file to validate
        category_id_to_name (dict, optional): a dict mapping integer category IDs to names;
            if this is not None, this function reports an error for any line that uses a
            category that's not in this dict
        verbose (bool, optional): enable additional debug console output

    Returns:
        dict: a dict with keys 'file' (the same as [label_file]) and 'errors' (a list of
        errors (if any) that we found in this file).  Line numbers in error strings are
        zero-based.
    """

    label_result = {}
    label_result['file'] = label_file
    label_result['errors'] = []

    try:
        with open(label_file,'r') as f:
            lines = f.readlines()
    except Exception as e:
        label_result['errors'].append('Read error: {}'.format(str(e)))
        return label_result

    for i_line,line in enumerate(lines):

        s = line.strip()
        if len(s) == 0 or s[0] == '#':
            continue

        try:

            tokens = s.split()
            assert len(tokens) == 5, \
                'YOLO label lines should have five tokens, found {} on line {} of file {}'.format(
                len(tokens),i_line,label_file)

            # Always require an integer category ID, even when we have no category
            # list to check it against; the converter (_process_image) parses this
            # token with int(), so anything else would fail at conversion time.
            category_id = int(tokens[0])

            if category_id_to_name is not None:
                assert category_id in category_id_to_name, \
                    'Unrecognized category ID {}'.format(category_id)

            yolo_bbox = [float(x) for x in tokens[1:]]

        except Exception as e:
            label_result['errors'].append('Token error at line {}: {}'.format(i_line,str(e)))
            continue

        normalized_x_center = yolo_bbox[0]
        normalized_y_center = yolo_bbox[1]
        normalized_width = yolo_bbox[2]
        normalized_height = yolo_bbox[3]

        normalized_x_min = normalized_x_center - normalized_width / 2.0
        normalized_x_max = normalized_x_center + normalized_width / 2.0
        normalized_y_min = normalized_y_center - normalized_height / 2.0
        normalized_y_max = normalized_y_center + normalized_height / 2.0

        # This is phrased as "not (everything is in range)" rather than "something is out
        # of range" so that NaN values, for which every comparison is False, are reported.
        box_in_bounds = (normalized_x_min >= 0 and normalized_y_min >= 0 and
                         normalized_x_max <= 1 and normalized_y_max <= 1)
        if not box_in_bounds:
            label_result['errors'].append('Invalid bounding box: {} {} {} {}'.format(
                normalized_x_min,normalized_y_min,normalized_x_max,normalized_y_max))

        # A negative width or height just swaps min and max, so the bounds check above
        # can't catch it
        if normalized_width < 0 or normalized_height < 0:
            label_result['errors'].append('Invalid bounding box size: {} {}'.format(
                normalized_width,normalized_height))

    # ...for each line

    if verbose:
        if len(label_result['errors']) > 0:
            print('Errors for {}:'.format(label_file))
            for error in label_result['errors']:
                print(error)

    return label_result

# ...def validate_label_file(...)
293
+
294
+
295
def validate_yolo_dataset(input_folder,
                          class_name_file,
                          n_workers=1,
                          pool_type='thread',
                          verbose=False):
    """
    Validates every label file in a YOLO dataset folder.  Images and labels are expected
    to live side by side; the case where labels and images are in different folders is not
    yet supported here (yolo_to_coco() supports this).

    Checks for:

    * Image files without label files
    * Text files without image files
    * Illegal classes in label files
    * Invalid boxes in label files

    Args:
        input_folder (str): the YOLO dataset folder to validate
        class_name_file (str or list): a list of classes, a flat text file, or a yolo
            dataset.yml/.yaml file.  If it's a dataset.yml file, that file should point to
            input_folder as the base folder, though this is not explicitly checked.
        n_workers (int, optional): number of concurrent workers, set to <= 1 to disable
            parallelization
        pool_type (str, optional): 'thread' or 'process', worker type to use for parallelization;
            not used if [n_workers] <= 1
        verbose (bool, optional): enable additional debug console output

    Returns:
        dict: validation results, as a dict with fields:

            - image_files_without_label_files (list)
            - label_files_without_images (list)
            - label_results (list of dicts with fields 'file', 'errors')
    """

    ## Validate arguments

    assert os.path.isdir(input_folder), \
        'Could not find input folder {}'.format(input_folder)

    use_pool = (n_workers > 1)

    # The pool type only matters (and is only checked) when we'll actually create a pool
    if use_pool:
        assert pool_type in ('thread','process'), \
            'Illegal pool type {}'.format(pool_type)

    category_id_to_name = load_yolo_class_list(class_name_file)


    ## Enumerate image and label files

    print('Enumerating files in {}'.format(input_folder))

    all_files = recursive_file_list(input_folder,
                                    recursive=True,
                                    return_relative_paths=False,
                                    convert_slashes=True)
    label_files = [fn for fn in all_files if fn.endswith('.txt')]
    image_files = find_image_strings(all_files)
    print('Found {} images files and {} label files in {}'.format(
        len(image_files),len(label_files),input_folder))

    label_files_set = set(label_files)

    # Images are matched to labels by path-without-extension, so two images that
    # differ only in extension would be ambiguous.
    image_stems = set()
    for fn in image_files:
        stem = os.path.splitext(fn)[0]
        assert stem not in image_stems, \
            'Duplicate image file, likely with different extensions: {}'.format(fn)
        image_stems.add(stem)


    ## Match images to labels and vice-versa

    print('Looking for missing image/label files')

    image_files_without_label_files = [
        image_file for image_file in tqdm(image_files)
        if (os.path.splitext(image_file)[0] + '.txt') not in label_files_set
    ]

    label_files_without_images = [
        label_file for label_file in tqdm(label_files)
        if os.path.splitext(label_file)[0] not in image_stems
    ]

    print('Found {} image files without labels, {} labels without images'.format(
        len(image_files_without_label_files),len(label_files_without_images)))


    ## Validate the contents of each label file

    print('Validating label files')

    validate_one_file = partial(validate_label_file,
                                category_id_to_name=category_id_to_name,
                                verbose=verbose)

    if not use_pool:

        label_results = [validate_one_file(fn_abs) for fn_abs in tqdm(label_files)]

    else:

        pool = None
        try:
            pool = ThreadPool(n_workers) if (pool_type == 'thread') else Pool(n_workers)

            print('Starting a {} pool of {} workers'.format(pool_type,n_workers))

            label_results = list(tqdm(pool.imap(validate_one_file, label_files),
                                      total=len(label_files)))
        finally:
            # Clean up the workers whether or not validation succeeded
            if pool is not None:
                pool.close()
                pool.join()
                print('Pool closed and joined for label file validation')

    assert len(label_results) == len(label_files), \
        'Mismatch: {} results for {} files'.format(
            len(label_results),len(label_files))

    validation_results = {
        'image_files_without_label_files': image_files_without_label_files,
        'label_files_without_images': label_files_without_images,
        'label_results': label_results
    }

    return validation_results

# ...validate_yolo_dataset(...)
422
+
423
+
424
+ #%% Main conversion function
425
+
426
+ def yolo_to_coco(input_folder,
427
+ class_name_file,
428
+ output_file=None,
429
+ empty_image_handling='no_annotations',
430
+ empty_image_category_name='empty',
431
+ error_image_handling='no_annotations',
432
+ allow_images_without_label_files=True,
433
+ n_workers=1,
434
+ pool_type='thread',
435
+ recursive=True,
436
+ exclude_string=None,
437
+ include_string=None,
438
+ overwrite_handling='overwrite',
439
+ label_folder=None,
440
+ supercategory=None,
441
+ force_integer_ids=False,
442
+ include_area=False,
443
+ include_crowd=False,
444
+ invalid_annotation_handling='error'):
445
+ """
446
+ Converts a YOLO-formatted dataset to a COCO-formatted dataset.
447
+
448
+ All images will be assigned an "error" value, usually None.
449
+
450
+ Args:
451
+ input_folder (str): the YOLO dataset folder to convert. If the image and label
452
+ folders are different, this is the image folder, and [label_folder] is the
453
+ label folder.
454
+ class_name_file (str or list): a list of classes, a flat text file, or a yolo
455
+ dataset.yml/.yaml file. If it's a dataset.yml file, that file should point to
456
+ input_folder as the base folder, though this is not explicitly checked.
457
+ output_file (str, optional): .json file to which we should write COCO .json data
458
+ empty_image_handling (str, optional): how to handle images with no boxes; whether
459
+ this includes images with no .txt files depends on the value of
460
+ [allow_images_without_label_files]. Can be:
461
+
462
+ - 'no_annotations': include the image in the image list, with no annotations
463
+ - 'empty_annotations': include the image in the image list, and add an annotation without
464
+ any bounding boxes, using a category called [empty_image_category_name].
465
+ - 'skip': don't include the image in the image list
466
+ - 'error': there shouldn't be any empty images
467
+ empty_image_category_name (str, optional): if we're going to be inserting annotations for
468
+ images with no boxes, what category name should we use?
469
+ error_image_handling (str, optional): how to handle images that don't load properly; can
470
+ be:
471
+
472
+ - 'skip': don't include the image at all
473
+ - 'no_annotations': include with no annotations
474
+ allow_images_without_label_files (bool, optional): whether to silently allow images with
475
+ no label files (True) or raise errors for images with no label files (False)
476
+ n_workers (int, optional): number of concurrent workers, set to <= 1 to disable
477
+ parallelization
478
+ pool_type (str, optional): 'thread' or 'process', worker type to use for parallelization;
479
+ not used if [n_workers] <= 1
480
+ recursive (bool, optional): whether to recurse into [input_folder]
481
+ exclude_string (str, optional): exclude any images whose filename contains a string
482
+ include_string (str, optional): include only images whose filename contains a string
483
+ overwrite_handling (bool, optional): behavior if output_file exists ('load', 'overwrite', or
484
+ 'error')
485
+ label_folder (str, optional): label folder, if different from the image folder
486
+ supercategory (str, optional): populate the 'supercategory' field, currently only supports
487
+ None (don't populate) or a single supercategory for the whole dataset. This is mostly
488
+ only here because RF-DETR requires something to be populated in this field.
489
+ force_integer_ids (bool, optional): force image and annotation IDs to be integers
490
+ include_area (bool, optional): add the "area" field for boxes
491
+ include_crowd (bool, optional): include the "iscrowd" field (always 0) for annotations
492
+ invalid_annotation_handling (str, optional): how to handle invalid annotations, e.g.
493
+ negative-height bounding boxes. Can be 'error', 'warn', or 'exclude'. 'exclude'
494
+ implies 'warn'.
495
+
496
+
497
+ Returns:
498
+ dict: COCO-formatted data, the same as what's written to [output_file]
499
+ """
500
+
501
+ ## Validate input
502
+
503
+ input_folder = input_folder.replace('\\','/')
504
+
505
+ assert os.path.isdir(input_folder), \
506
+ 'Input folder {} does not exist or is not a folder'.format(input_folder)
507
+
508
+ if isinstance(class_name_file,str):
509
+ assert os.path.isfile(class_name_file), \
510
+ 'Class name file {} does not exist or is not a file'.format(class_name_file)
511
+
512
+ assert empty_image_handling in \
513
+ ('no_annotations','empty_annotations','skip','error'), \
514
+ 'Unrecognized empty image handling spec: {}'.format(empty_image_handling)
515
+
516
+ assert invalid_annotation_handling in ('error','warn','exclude')
517
+
518
+ if (output_file is not None) and os.path.isfile(output_file):
519
+
520
+ if overwrite_handling == 'overwrite':
521
+ print('Warning: output file {} exists, over-writing'.format(output_file))
522
+ elif overwrite_handling == 'load':
523
+ print('Output file {} exists, loading and returning'.format(output_file))
524
+ with open(output_file,'r') as f:
525
+ d = json.load(f)
526
+ return d
527
+ elif overwrite_handling == 'error':
528
+ raise ValueError('Output file {} exists'.format(output_file))
529
+ else:
530
+ raise ValueError('Unrecognized overwrite_handling value: {}'.format(overwrite_handling))
531
+
532
+
533
+ ## Read class names
534
+
535
+ category_id_to_name = load_yolo_class_list(class_name_file)
536
+
537
+
538
+ # Find or create the empty image category, if necessary
539
+ empty_category_id = None
540
+
541
+ if empty_image_handling == 'empty_annotations':
542
+ category_name_to_id = invert_dictionary(category_id_to_name)
543
+ if empty_image_category_name in category_name_to_id:
544
+ empty_category_id = category_name_to_id[empty_image_category_name]
545
+ print('Using existing empty image category with name {}, ID {}'.format(
546
+ empty_image_category_name,empty_category_id))
547
+ else:
548
+ empty_category_id = len(category_id_to_name)
549
+ print('Adding an empty category with name {}, ID {}'.format(
550
+ empty_image_category_name,empty_category_id))
551
+ category_id_to_name[empty_category_id] = empty_image_category_name
552
+
553
+
554
+ ## Enumerate images
555
+
556
+ print('Enumerating images...')
557
+
558
+ image_files_abs = find_images(input_folder,recursive=recursive,convert_slashes=True)
559
+
560
+ n_files_original = len(image_files_abs)
561
+
562
+ # Optionally include/exclude images matching specific strings
563
+ if exclude_string is not None:
564
+ image_files_abs = [fn for fn in image_files_abs if exclude_string not in fn]
565
+ if include_string is not None:
566
+ image_files_abs = [fn for fn in image_files_abs if include_string in fn]
567
+
568
+ if len(image_files_abs) != n_files_original or exclude_string is not None or include_string is not None:
569
+ n_excluded = n_files_original - len(image_files_abs)
570
+ print('Excluded {} of {} images based on filenames'.format(n_excluded,n_files_original))
571
+
572
+ categories = []
573
+
574
+ for category_id in category_id_to_name:
575
+ categories.append({'id':category_id,'name':category_id_to_name[category_id]})
576
+
577
+ if supercategory is not None:
578
+ for cat in categories:
579
+ cat['supercategory'] = supercategory
580
+
581
+ info = {}
582
+ info['version'] = '1.0'
583
+ info['description'] = 'Converted from YOLO format'
584
+
585
+ image_ids = set()
586
+
587
+
588
+ ## If we're expected to have labels for every image, check before we process all the images
589
+
590
+ if not allow_images_without_label_files:
591
+ print('Verifying that label files exist')
592
+ # image_file_abs = image_files_abs[0]
593
+ for image_file_abs in tqdm(image_files_abs):
594
+ if label_folder is not None:
595
+ assert input_folder in image_file_abs, \
596
+ 'File {} is not in folder {}'.format(image_file_abs,input_folder)
597
+ label_file_abs_base = image_file_abs.replace(input_folder,label_folder)
598
+ else:
599
+ label_file_abs_base = image_file_abs
600
+ label_file_abs = os.path.splitext(label_file_abs_base)[0] + '.txt'
601
+ assert os.path.isfile(label_file_abs), \
602
+ 'No annotation file for {}'.format(image_file_abs)
603
+
604
+
605
+ ## Initial loop to make sure image IDs will be unique
606
+
607
+ print('Validating image IDs...')
608
+
609
+ for fn_abs in tqdm(image_files_abs):
610
+
611
+ fn_relative = os.path.relpath(fn_abs,input_folder).replace('\\','/')
612
+ image_id = _filename_to_image_id(fn_relative)
613
+ assert image_id not in image_ids, \
614
+ 'Oops, you have hit a very esoteric case where you have the same filename ' + \
615
+ 'with both spaces and underscores, this is not currently handled.'
616
+ image_ids.add(image_id)
617
+
618
+
619
+ ## Main loop to process labels
620
+
621
+ print('Processing labels...')
622
+
623
+ if n_workers <= 1:
624
+
625
+ image_results = []
626
+ # fn_abs = image_files_abs[0]
627
+ for fn_abs in tqdm(image_files_abs):
628
+ image_results.append(_process_image(fn_abs,
629
+ input_folder,
630
+ category_id_to_name,
631
+ label_folder))
632
+
633
+ else:
634
+
635
+ assert pool_type in ('process','thread'), \
636
+ 'Illegal pool type {}'.format(pool_type)
637
+
638
+ pool = None
639
+ try:
640
+ if pool_type == 'thread':
641
+ pool = ThreadPool(n_workers)
642
+ else:
643
+ pool = Pool(n_workers)
644
+
645
+ print('Starting a {} pool of {} workers'.format(pool_type,n_workers))
646
+
647
+ p = partial(_process_image,
648
+ input_folder=input_folder,
649
+ category_id_to_name=category_id_to_name,
650
+ label_folder=label_folder)
651
+ image_results = list(tqdm(pool.imap(p, image_files_abs),
652
+ total=len(image_files_abs)))
653
+ finally:
654
+ if pool is not None:
655
+ pool.close()
656
+ pool.join()
657
+ print('Pool closed and joined for YOLO to COCO conversion')
658
+
659
+
660
+ assert len(image_results) == len(image_files_abs), \
661
+ 'Result count mismatch: {} results for {} image files'.format(
662
+ len(image_results),len(image_files_abs))
663
+
664
+
665
+ ## Re-assembly of results into a COCO dict
666
+
667
+ print('Assembling labels...')
668
+
669
+ images = []
670
+ annotations = []
671
+
672
+ input_id_to_output_id = None
673
+ if force_integer_ids:
674
+ input_id_to_output_id = {}
675
+
676
+ for image_result in tqdm(image_results):
677
+
678
+ im = image_result[0]
679
+ annotations_this_image = image_result[1]
680
+
681
+ skip_image = False
682
+
683
+ # Validate annotations
684
+ for ann in annotations_this_image:
685
+ if 'bbox' not in ann:
686
+ continue
687
+ # coco_bbox = [absolute_x_min, absolute_y_min, absolute_width, absolute_height]
688
+ box_is_valid = True
689
+ if len(ann['bbox']) != 4:
690
+ box_is_valid = False
691
+ elif ann['bbox'][2] < 0:
692
+ box_is_valid = False
693
+ elif ann['bbox'][3] < 0:
694
+ box_is_valid = False
695
+
696
+ if not box_is_valid:
697
+
698
+ s = 'Illegal bounding box {} for image {}'.format(
699
+ str(ann['bbox']),im['file_name'])
700
+ if invalid_annotation_handling == 'error':
701
+ raise ValueError(s)
702
+ if invalid_annotation_handling in ('warn','exclude'):
703
+ print('Warning: {}'.format(s))
704
+ if invalid_annotation_handling == 'exclude':
705
+ skip_image = True
706
+ break
707
+
708
+ if include_area:
709
+ ann['area'] = ann['bbox'][2] * ann['bbox'][3]
710
+
711
+ # ...for each annotation
712
+
713
+ if skip_image:
714
+ continue
715
+
716
+ # If we need to constrain image IDs to be integers
717
+ if force_integer_ids:
718
+ input_id = im['id']
719
+ output_id = len(input_id_to_output_id)
720
+ input_id_to_output_id[input_id] = output_id
721
+ im['id'] = output_id
722
+ for ann in annotations_this_image:
723
+ ann['image_id'] = im['id']
724
+
725
+ # If we have annotations for this image
726
+ if len(annotations_this_image) > 0:
727
+ assert im['error'] is None, \
728
+ "We shouldn't have errors for images that have annotations"
729
+ images.append(im)
730
+ for ann in annotations_this_image:
731
+ if include_crowd:
732
+ ann['iscrowd'] = 0
733
+ annotations.append(ann)
734
+
735
+ # If this image failed to read
736
+ elif im['error'] is not None:
737
+
738
+ if error_image_handling == 'skip':
739
+ pass
740
+ elif error_image_handling == 'no_annotations':
741
+ images.append(im)
742
+
743
+ # If this image read successfully, but there are no annotations
744
+ else:
745
+
746
+ if empty_image_handling == 'skip':
747
+ pass
748
+ elif empty_image_handling == 'no_annotations':
749
+ images.append(im)
750
+ elif empty_image_handling == 'empty_annotations':
751
+ assert empty_category_id is not None, \
752
+ 'An empty category ID must be supplied if we are including empty annotations'
753
+ ann = {}
754
+ if include_crowd:
755
+ ann['iscrowd'] = 0
756
+ ann['id'] = im['id'] + '_0'
757
+ ann['image_id'] = im['id']
758
+ ann['category_id'] = empty_category_id
759
+ ann['sequence_level_annotation'] = False
760
+ # This would also be a reasonable thing to do, but it's not the convention
761
+ # we're adopting, i.e. we are not including fake boxes for annotations
762
+ # on empty images.
763
+ # ann['bbox'] = [0,0,0,0]
764
+ annotations.append(ann)
765
+ images.append(im)
766
+
767
+ # ...if we do/don't have annotations for this image
768
+
769
+ # ...for each image result
770
+
771
+ # Create integer IDs for annotations if necessary
772
+ #
773
+ # Annotation IDs don't really mean anything, so just assign incrementing
774
+ # integers.
775
+ if force_integer_ids:
776
+ for i_ann,ann in enumerate(annotations):
777
+ ann['id'] = i_ann
778
+
779
+ # Clean up unnecessary error fields
780
+ for im in images:
781
+ if 'error' in im and im['error'] is None:
782
+ del im['error']
783
+
784
+ print('Read {} annotations for {} images'.format(len(annotations),
785
+ len(images)))
786
+
787
+ d = {}
788
+ d['images'] = images
789
+ d['annotations'] = annotations
790
+ d['categories'] = categories
791
+ d['info'] = info
792
+
793
+ if output_file is not None:
794
+ print('Writing to {}'.format(output_file))
795
+ write_json(output_file,d)
796
+
797
+ return d
798
+
799
+ # ...def yolo_to_coco()
800
+
801
+
802
+ #%% Interactive driver
803
+
804
if False:

    # Scratch cells intended to be run one at a time from an interactive
    # session; the "if False" guard keeps them from running on import.
    pass

    #%% Convert YOLO folders to COCO

    # Both splits share a single class list
    class_name_file = '/home/user/data/noaa-fish/AllImagesWithAnnotations/classes.txt'
    preview_folder = '/home/user/data/noaa-fish/val-coco-conversion-preview'

    # Validation split
    input_folder = '/home/user/data/noaa-fish/val'
    output_file = '/home/user/data/noaa-fish/val.json'

    d = yolo_to_coco(input_folder,class_name_file,output_file)

    # Training split; input_folder and output_file are left pointing at this
    # split for the cells below.
    input_folder = '/home/user/data/noaa-fish/train'
    output_file = '/home/user/data/noaa-fish/train.json'

    d = yolo_to_coco(input_folder,class_name_file,output_file)


    #%% Check DB integrity

    from megadetector.data_management.databases import integrity_check_json_db

    options = integrity_check_json_db.IntegrityCheckOptions()
    options.baseDir = input_folder
    options.bCheckImageExistence = True
    options.bFindUnusedImages = True
    options.bCheckImageSizes = False

    # The three return values are not needed here
    _, _, _ = integrity_check_json_db.integrity_check_json_db(output_file, options)


    #%% Preview some images

    from megadetector.visualization import visualize_db
    from megadetector.utils.path_utils import open_file

    viz_options = visualize_db.DbVizOptions()
    viz_options.num_to_visualize = None
    viz_options.parallelize_rendering = True
    viz_options.include_filename_links = True
    viz_options.trim_to_images_with_bboxes = False
    viz_options.add_search_links = False
    viz_options.sort_by_filename = False

    html_output_file, _ = visualize_db.visualize_db(db_path=output_file,
                                                    output_dir=preview_folder,
                                                    image_base_dir=input_folder,
                                                    options=viz_options)

    # Open the generated HTML preview
    open_file(html_output_file)
856
+
857
+
858
+ #%% Command-line driver
859
+
860
def main():
    """
    Command-line entry point for YOLO to COCO conversion.

    Builds the argument parser, prints the help text and exits if no
    arguments were supplied, and otherwise forwards the parsed arguments
    to yolo_to_coco().  The 'true'/'false' string options are converted
    to booleans before the call.
    """

    parser = argparse.ArgumentParser(
        description='Convert a YOLO-formatted dataset to COCO format')

    # Required positional arguments, in command-line order
    positional_args = (
        ('input_folder',
         'Path to the YOLO dataset folder (image folder)'),
        ('class_name_file',
         'Path to the file containing class names (e.g., classes.txt or dataset.yaml)'),
        ('output_file',
         'Path to the output COCO .json file.'),
    )
    for arg_name, help_text in positional_args:
        parser.add_argument(arg_name, type=str, help=help_text)

    # Optional arguments: (flag, keyword arguments for add_argument), listed in
    # the order they should appear in the help text
    optional_args = (
        ('--label_folder', {
            'type': str,
            'default': None,
            'help': 'Label folder, if different from the image folder. Default: None (labels are in the image folder)',
        }),
        ('--empty_image_handling', {
            'type': str,
            'default': 'no_annotations',
            'choices': ['no_annotations', 'empty_annotations', 'skip', 'error'],
            'help': 'How to handle images with no bounding boxes.',
        }),
        ('--empty_image_category_name', {
            'type': str,
            'default': 'empty',
            'help': 'Category name for empty images if empty_image_handling is "empty_annotations"',
        }),
        ('--error_image_handling', {
            'type': str,
            'default': 'no_annotations',
            'choices': ['skip', 'no_annotations'],
            'help': 'How to handle images that fail to load',
        }),
        ('--allow_images_without_label_files', {
            'type': str,
            'default': 'true',
            'choices': ['true', 'false'],
            'help': 'Whether to allow images that do not have corresponding label files (true/false)',
        }),
        ('--n_workers', {
            'type': int,
            'default': 1,
            'help': 'Number of workers for parallel processing. <=1 for sequential',
        }),
        ('--pool_type', {
            'type': str,
            'default': 'thread',
            'choices': ['thread', 'process'],
            'help': 'Type of multiprocessing pool if n_workers > 1',
        }),
        ('--recursive', {
            'type': str,
            'default': 'true',
            'choices': ['true', 'false'],
            'help': 'Whether to search for images recursively in the input folder (true/false)',
        }),
        ('--exclude_string', {
            'type': str,
            'default': None,
            'help': 'Exclude images whose filename contains this string',
        }),
        ('--include_string', {
            'type': str,
            'default': None,
            'help': 'Include images only if filename contains this string',
        }),
        ('--overwrite_handling', {
            'type': str,
            'default': 'overwrite',
            'choices': ['load', 'overwrite', 'error'],
            'help': 'Behavior if output_file exists.',
        }),
    )
    for flag, add_argument_kwargs in optional_args:
        parser.add_argument(flag, **add_argument_kwargs)

    # With no arguments at all, show the help text instead of an argparse error
    if not sys.argv[1:]:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()

    # Boolean-like options arrive as the strings 'true'/'false'
    allow_missing_label_files = (args.allow_images_without_label_files.lower() == 'true')
    search_recursively = (args.recursive.lower() == 'true')

    conversion_kwargs = {
        'output_file': args.output_file,
        'label_folder': args.label_folder,
        'empty_image_handling': args.empty_image_handling,
        'empty_image_category_name': args.empty_image_category_name,
        'error_image_handling': args.error_image_handling,
        'allow_images_without_label_files': allow_missing_label_files,
        'n_workers': args.n_workers,
        'pool_type': args.pool_type,
        'recursive': search_recursively,
        'exclude_string': args.exclude_string,
        'include_string': args.include_string,
        'overwrite_handling': args.overwrite_handling,
    }

    yolo_to_coco(args.input_folder, args.class_name_file, **conversion_kwargs)

    print(f"Dataset conversion complete, output written to {args.output_file}")
982
+
983
# Run the command-line driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()