megadetector-10.0.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
@@ -0,0 +1,696 @@
+ """
+
+ json_validator.py
+
+ Validates a classification label specification JSON file and optionally
+ queries MegaDB to find matching image files.
+
+ See README.md for an example of a classification label specification JSON file.
+
+ The validation step takes the classification label specification JSON file and
+ finds the dataset labels that belong to each classification label. It checks
+ that the following conditions hold:
+
+ 1) Each classification label specification matches at least 1 dataset label.
+
+ 2) If the classification label includes a taxonomic specification, then the
+    taxon is actually part of our master taxonomy.
+
+ 3) If the 'prioritize' key is found for a given label, then the label must
+    also have a 'max_count' key.
+
+ 4) If --allow-multilabel=False, then no dataset label is included in more than
+    one classification label.
+
+ If --output-dir <output_dir> is given, then we query MegaDB for images
+ that match the dataset labels identified during the validation step. We filter
+ out images that have unaccepted file extensions and images that don't actually
+ exist in Azure Blob Storage. In total, we output the following files:
+
+ <output_dir>/
+
+ - included_dataset_labels.txt
+   lists the original dataset classes included for each classification label
+
+ - image_counts_by_label_presample.json
+   number of images for each classification label after filtering bad
+   images, but before sampling
+
+ - image_counts_by_label_sampled.json
+   number of images for each classification label in queried_images.json
+
+ - json_validator_log_{timestamp}.json
+   log of excluded images / labels
+
+ - queried_images.json
+   main output file, ex:
+
+   {
+     "caltech/cct_images/59f5fe2b-23d2-11e8-a6a3-ec086b02610b.jpg": {
+       "dataset": "caltech",
+       "location": 13,
+       "class": "mountain_lion",   // class from dataset
+       "label": ["mountain_lion"]  // labels to use in classifier
+     },
+     "caltech/cct_images/59f79901-23d2-11e8-a6a3-ec086b02610b.jpg": {
+       "dataset": "caltech",
+       "location": 13,
+       "class": "mountain_lion",   // class from dataset
+       "bbox": [{"category": "animal",
+                 "bbox": [0, 0.347, 0.237, 0.257]}],
+       "label": ["mountain_lion"]  // labels to use in classifier
+     },
+     ...
+   }
+
+ """
+
+ #%% Imports
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import os
+ import pprint
+ import random
+
+ import pandas as pd
+
+ from tqdm import tqdm
+ from collections import defaultdict
+ from collections.abc import Container, Iterable, Mapping, MutableMapping
+ from concurrent import futures
+ from datetime import datetime
+ from typing import Any
+
+ from megadetector.utils import path_utils
+ from megadetector.utils import sas_blob_utils
+ from megadetector.utils import ct_utils
+
+ from megadetector.data_management.megadb import megadb_utils
+ from megadetector.taxonomy_mapping.taxonomy_graph import (
+     build_taxonomy_graph, dag_to_tree, TaxonNode)
+
+
+ #%% Example usage
+
+ """
+ python json_validator.py label_spec.json \
+     $HOME/camera-traps-private/camera_trap_taxonomy_mapping.csv \
+     --output-dir run --json-indent 2
+ """
+
+
+ #%% Main function
+
+ def main(label_spec_json_path: str,
+          taxonomy_csv_path: str,
+          allow_multilabel: bool = False,
+          single_parent_taxonomy: bool = False,
+          check_blob_exists: bool | str = False,
+          min_locs: int | None = None,
+          output_dir: str | None = None,
+          json_indent: int | None = None,
+          seed: int = 123,
+          mislabeled_images_dir: str | None = None) -> None:
+
+     # input validation
+     assert os.path.exists(label_spec_json_path)
+     assert os.path.exists(taxonomy_csv_path)
+     if mislabeled_images_dir is not None:
+         assert os.path.isdir(mislabeled_images_dir)
+
+     random.seed(seed)
+
+     print('Building taxonomy hierarchy')
+     taxonomy_df = pd.read_csv(taxonomy_csv_path)
+     if single_parent_taxonomy:
+         TaxonNode.single_parent_only = True
+     graph, taxonomy_dict, _ = build_taxonomy_graph(taxonomy_df)
+     dag_to_tree(graph, taxonomy_dict)
+
+     print('Validating input json')
+     with open(label_spec_json_path, 'r') as f:
+         input_js = json.load(f)
+     label_to_inclusions = validate_json(
+         input_js, taxonomy_dict, allow_multilabel=allow_multilabel)
+
+     if output_dir is None:
+         pprint.pprint(label_to_inclusions)
+         return
+
+     os.makedirs(output_dir, exist_ok=True)
+     labels_path = os.path.join(output_dir, 'included_dataset_labels.txt')
+     with open(labels_path, 'w') as f:
+         pprint.pprint(label_to_inclusions, stream=f)
+
+     # use MegaDB to generate list of images
+     print('Generating output json')
+     output_js = get_output_json(label_to_inclusions, mislabeled_images_dir)
+     print(f'In total found {len(output_js)} images')
+
+     # only keep images that:
+     # 1) end in a supported file extension,
+     # 2) actually exist in Azure Blob Storage, and
+     # 3) belong to a label with at least min_locs locations
+     log: dict[str, Any] = {}
+     remove_non_images(output_js, log)
+     if isinstance(check_blob_exists, str):
+         remove_nonexistent_images(output_js, log, check_local=check_blob_exists)
+     elif check_blob_exists:
+         remove_nonexistent_images(output_js, log)
+     if min_locs is not None:
+         remove_images_insufficient_locs(output_js, log, min_locs)
+
+     # write out log of images / labels that were removed
+     date = datetime.now().strftime('%Y%m%d_%H%M%S')  # ex: '20200722_110816'
+     log_path = os.path.join(output_dir, f'json_validator_log_{date}.json')
+     print(f'Saving log of bad images to {log_path}')
+     ct_utils.write_json(log_path, log)
+
+     # save label counts, pre-subsampling
+     print('Saving pre-sampling label counts')
+     save_path = os.path.join(output_dir, 'image_counts_by_label_presample.json')
+     image_counts_by_label_presample = {
+         label: len(filter_images(output_js, label))
+         for label in sorted(input_js.keys())
+     }
+     ct_utils.write_json(save_path, image_counts_by_label_presample)
+
+     print('Sampling with priority (if needed)')
+     output_js = sample_with_priority(input_js, output_js)
+
+     print('Saving queried_images.json')
+     output_json_path = os.path.join(output_dir, 'queried_images.json')
+     ct_utils.write_json(output_json_path, output_js, indent=json_indent)
+
+     # save label counts, post-subsampling
+     print('Saving post-sampling label counts')
+     save_path = os.path.join(output_dir, 'image_counts_by_label_sampled.json')
+     image_counts_by_label_sampled = {
+         label: len(filter_images(output_js, label))
+         for label in sorted(input_js.keys())
+     }
+     ct_utils.write_json(save_path, image_counts_by_label_sampled)
+
+
+ #%% Support functions
+
+ def parse_spec(spec_dict: Mapping[str, Any],
+                taxonomy_dict: dict[tuple[str, str], TaxonNode]
+                ) -> set[tuple[str, str]]:
+     """
+     Gathers the dataset labels corresponding to a particular classification
+     label specification.
+
+     Args:
+         spec_dict: dict, contains keys ['taxa', 'dataset_labels']
+         taxonomy_dict: dict, maps (taxon_level, taxon_name) to a TaxonNode
+
+     Returns: set of (ds, ds_label), dataset labels requested by the spec
+
+     Raises: ValueError, if the specification does not match any dataset labels
+     """
+
+     results = set()
+     if 'taxa' in spec_dict:
+         # spec_dict['taxa']: list of dict
+         # [
+         #     {'level': 'family', 'name': 'cervidae', 'datasets': ['idfg']},
+         #     {'level': 'genus', 'name': 'meleagris'}
+         # ]
+         for taxon in spec_dict['taxa']:
+             key = (taxon['level'].lower(), taxon['name'].lower())
+             datasets = taxon.get('datasets', None)
+             results |= taxonomy_dict[key].get_dataset_labels(datasets)
+
+     if 'dataset_labels' in spec_dict:
+         # spec_dict['dataset_labels']: dict, dataset => list of dataset_label
+         # {
+         #     "idfg": ["deer", "elk", "prong"],
+         #     "idfg_swwlf_2019": ["elk", "muledeer", "whitetaileddeer"]
+         # }
+         for ds, ds_labels in spec_dict['dataset_labels'].items():
+             for ds_label in ds_labels:
+                 results.add((ds, ds_label))
+
+     if len(results) == 0:
+         raise ValueError('specification matched no dataset labels')
+     return results
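To make the two spec forms concrete, here is a hedged, self-contained sketch of parse_spec() on toy inputs; FakeTaxon is a hypothetical stand-in exposing only the get_dataset_labels() method that parse_spec() actually calls (assumes the module's own imports resolve in your environment):

    from megadetector.classification.json_validator import parse_spec

    class FakeTaxon:
        # hypothetical stand-in for TaxonNode
        def __init__(self, labels):
            self.labels = labels  # set of (dataset, dataset_label) tuples
        def get_dataset_labels(self, datasets=None):
            if datasets is None:
                return set(self.labels)
            return {(ds, lbl) for (ds, lbl) in self.labels if ds in datasets}

    taxonomy_dict = {
        ('family', 'cervidae'): FakeTaxon({('idfg', 'deer'), ('caltech', 'deer')})
    }
    spec = {
        'taxa': [{'level': 'family', 'name': 'cervidae', 'datasets': ['idfg']}],
        'dataset_labels': {'idfg_swwlf_2019': ['elk']},
    }
    print(parse_spec(spec, taxonomy_dict))
    # {('idfg', 'deer'), ('idfg_swwlf_2019', 'elk')}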
+
+
+ def validate_json(input_js: dict[str, dict[str, Any]],
+                   taxonomy_dict: dict[tuple[str, str], TaxonNode],
+                   allow_multilabel: bool) -> dict[str, set[tuple[str, str]]]:
+     """
+     Validates JSON.
+
+     Args:
+         input_js: dict, Python dict representation of JSON file,
+             see classification/README.md
+         taxonomy_dict: dict, maps (taxon_level, taxon_name) to a TaxonNode
+         allow_multilabel: bool, whether to allow a dataset label to be assigned
+             to multiple output labels
+
+     Returns: dict, maps label name to set of (dataset, dataset_label) tuples
+
+     Raises: ValueError, if a classification label specification matches no
+         dataset labels, or if allow_multilabel=False but a dataset label is
+         included in two or more classification labels
+     """
+
+     # maps output label name to set of (dataset, dataset_label) tuples
+     label_to_inclusions: dict[str, set[tuple[str, str]]] = {}
+     for label, spec_dict in input_js.items():
+         include_set = parse_spec(spec_dict, taxonomy_dict)
+         if 'exclude' in spec_dict:
+             include_set -= parse_spec(spec_dict['exclude'], taxonomy_dict)
+
+         for label_b, set_b in label_to_inclusions.items():
+             shared = include_set.intersection(set_b)
+             if len(shared) > 0:
+                 print(f'Labels {label} and {label_b} share dataset labels:',
+                       shared)
+                 if not allow_multilabel:
+                     raise ValueError('Intersection between sets!')
+
+         label_to_inclusions[label] = include_set
+     return label_to_inclusions
+
+
+ def get_output_json(label_to_inclusions: dict[str, set[tuple[str, str]]],
+                     mislabeled_images_dir: str | None = None
+                     ) -> dict[str, dict[str, Any]]:
+     """
+     Queries MegaDB to get image paths matching dataset_labels.
+
+     Args:
+         label_to_inclusions: dict, maps label name to set of
+             (dataset, dataset_label) tuples, output of validate_json()
+         mislabeled_images_dir: str, path to directory of CSVs with known
+             mislabeled images
+
+     Returns: dict, maps sorted image_path <dataset>/<img_file> to a dict of
+         properties
+         - 'dataset': str, name of dataset that image is from
+         - 'location': str or int, optional
+         - 'class': str, class label from the dataset
+         - 'label': list of str, assigned output labels
+         - 'bbox': list of dicts, optional
+     """
+
+     # Because MegaDB is organized by dataset, we do the same...
+     #
+     # ds_to_labels = {
+     #     'dataset_name': {
+     #         'dataset_label': [output_label1, output_label2]
+     #     }
+     # }
+     ds_to_labels: dict[str, dict[str, list[str]]] = {}
+     for output_label, ds_dslabels_set in label_to_inclusions.items():
+         for (ds, ds_label) in ds_dslabels_set:
+             if ds not in ds_to_labels:
+                 ds_to_labels[ds] = {}
+             if ds_label not in ds_to_labels[ds]:
+                 ds_to_labels[ds][ds_label] = []
+             ds_to_labels[ds][ds_label].append(output_label)
+
+     # we need the datasets table for getting full image paths
+     megadb = megadb_utils.MegadbUtils()
+     datasets_table = megadb.get_datasets_table()
+
+     # The line
+     #     [img.class[0], seq.class[0]][0] as class
+     # selects the image-level class label if available; otherwise it selects
+     # the sequence-level class label. This line assumes the following
+     # conditions, expressed in the WHERE clause:
+     # - at least one of the image or sequence class labels is given
+     # - the image and sequence class labels are arrays with length at most 1
+     # - the image class label takes priority over the sequence class label
+     #
+     # In Azure Cosmos DB, if a field is not defined, then it is simply excluded
+     # from the result. For example, on the following JSON object,
+     #     {
+     #         "dataset": "camera_traps",
+     #         "seq_id": "1234",
+     #         "location": "A1",
+     #         "images": [{"file": "abcd.jpeg"}],
+     #         "class": ["deer"],
+     #     }
+     # the array [img.class[0], seq.class[0]] just gives ['deer'] because
+     # img.class is undefined and therefore excluded.
+     query = '''
+     SELECT
+         seq.dataset,
+         seq.location,
+         img.file,
+         [img.class[0], seq.class[0]][0] as class,
+         img.bbox
+     FROM sequences seq JOIN img IN seq.images
+     WHERE (ARRAY_LENGTH(img.class) = 1
+            AND ARRAY_CONTAINS(@dataset_labels, img.class[0])
+           )
+        OR (ARRAY_LENGTH(seq.class) = 1
+            AND ARRAY_CONTAINS(@dataset_labels, seq.class[0])
+            AND (NOT IS_DEFINED(img.class))
+           )
+     '''
+
+     output_json = {}  # maps full image path to json object
+
+     for ds in tqdm(sorted(ds_to_labels.keys())):  # sort for determinism
+
+         mislabeled_images: Mapping[str, Any] = {}
+         if mislabeled_images_dir is not None:
+             csv_path = os.path.join(mislabeled_images_dir, f'{ds}.csv')
+             if os.path.exists(csv_path):
+                 # read_csv()'s 'squeeze' kwarg was removed in pandas 2.0;
+                 # .squeeze('columns') is the equivalent
+                 mislabeled_images = pd.read_csv(
+                     csv_path, index_col='file').squeeze('columns')
+
+         ds_labels = sorted(ds_to_labels[ds].keys())
+         tqdm.write(f'Querying dataset "{ds}" for dataset labels: {ds_labels}')
+
+         start = datetime.now()
+         parameters = [dict(name='@dataset_labels', value=ds_labels)]
+         results = megadb.query_sequences_table(
+             query, partition_key=ds, parameters=parameters)
+         elapsed = (datetime.now() - start).total_seconds()
+         tqdm.write(f'- query took {elapsed:.0f}s, found {len(results)} images')
+
+         # if no path prefix, set it to the empty string '', because
+         # os.path.join('', x, '') = '{x}/'
+         path_prefix = datasets_table[ds].get('path_prefix', '')
+         count_corrected = 0
+         count_removed = 0
+         for result in results:
+             # result keys
+             # - already has: ['dataset', 'location', 'file', 'class', 'bbox']
+             # - add ['label'], remove ['file']
+             img_file = os.path.join(path_prefix, result['file'])
+
+             # if the img is mislabeled but we don't know the correct class,
+             # skip it; otherwise, update the img with the correct class, but
+             # skip the img if the correct class is not one we queried for
+             if img_file in mislabeled_images:
+                 new_class = mislabeled_images[img_file]
+                 if pd.isna(new_class) or new_class not in ds_to_labels[ds]:
+                     count_removed += 1
+                     continue
+
+                 count_corrected += 1
+                 result['class'] = new_class
+
+             img_path = os.path.join(ds, img_file)
+             del result['file']
+             ds_label = result['class']
+             result['label'] = ds_to_labels[ds][ds_label]
+             output_json[img_path] = result
+
+         tqdm.write(f'- Removed {count_removed} mislabeled images.')
+         tqdm.write(f'- Corrected labels for {count_corrected} images.')
+
+     # sort keys for determinism
+     output_json = {k: output_json[k] for k in sorted(output_json.keys())}
+     return output_json
+
+
+ def get_image_sas_uris(img_paths: Iterable[str]) -> list[str]:
+     """
+     Converts image paths to Azure Blob Storage blob URIs with SAS tokens.
+
+     Args:
+         img_paths: list of str, <dataset-name>/<image-filename>
+
+     Returns:
+         image_sas_uris: list of str, image blob URIs with SAS tokens, ready to
+             pass to the batch detection API
+     """
+
+     # we need the datasets table for getting SAS keys
+     datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
+
+     image_sas_uris = []
+     for img_path in img_paths:
+         dataset, img_file = img_path.split('/', maxsplit=1)
+
+         # strip the leading '?' from the SAS token, if present
+         sas_token = datasets_table[dataset]['container_sas_key']
+         if sas_token[0] == '?':
+             sas_token = sas_token[1:]
+
+         image_sas_uri = sas_blob_utils.build_azure_storage_uri(
+             account=datasets_table[dataset]['storage_account'],
+             container=datasets_table[dataset]['container'],
+             blob=img_file,
+             sas_token=sas_token)
+         image_sas_uris.append(image_sas_uri)
+     return image_sas_uris
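build_azure_storage_uri() itself is not part of this diff; for orientation, the standard Azure Blob Storage URI shape it should produce is sketched below (a hedged illustration, not the library's implementation; the account and container names are placeholders):

    def build_azure_storage_uri_sketch(account, container, blob, sas_token=None):
        # standard Azure blob URI layout; the real helper lives in
        # megadetector.utils.sas_blob_utils
        uri = f'https://{account}.blob.core.windows.net/{container}/{blob}'
        if sas_token is not None:
            uri += f'?{sas_token}'
        return uri

    print(build_azure_storage_uri_sketch(
        'myaccount', 'mycontainer', 'cct_images/1234.jpg', 'sv=placeholder'))
    # https://myaccount.blob.core.windows.net/mycontainer/cct_images/1234.jpg?sv=placeholder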
+
+
+ def remove_non_images(js: MutableMapping[str, dict[str, Any]],
+                       log: MutableMapping[str, Any]) -> None:
+     """
+     Removes images with non-image file extensions. Modifies [js] and [log]
+     in-place.
+
+     Args:
+         js: dict, img_path => info dict
+         log: dict, maps str description to log info
+     """
+
+     print('Removing images with invalid image file extensions...')
+     nonimg_paths = [k for k in js.keys() if not path_utils.is_image_file(k)]
+     for img_path in nonimg_paths:
+         del js[img_path]
+     print(f'Removed {len(nonimg_paths)} files with non-image extensions.')
+     if len(nonimg_paths) > 0:
+         log['nonimage_files'] = sorted(nonimg_paths)
+
+
+ def remove_nonexistent_images(js: MutableMapping[str, dict[str, Any]],
+                               log: MutableMapping[str, Any],
+                               check_local: str | None = None,
+                               num_threads: int = 50) -> None:
+     """
+     Removes images that don't actually exist locally or in Azure Blob Storage.
+     Modifies [js] and [log] in-place.
+
+     Args:
+         js: dict, image paths <dataset>/<img_file> => info dict
+         log: dict, maps str description to log info
+         check_local: optional str, path to a local dir to check before falling
+             back to Azure Blob Storage
+         num_threads: int, number of threads to use for checking blob existence
+     """
+
+     def check_local_then_azure(local_path: str, blob_url: str) -> bool:
+         return (os.path.exists(local_path)
+                 or sas_blob_utils.check_blob_exists(blob_url))
+
+     pool = futures.ThreadPoolExecutor(max_workers=num_threads)
+     future_to_img_path = {}
+     blob_urls = get_image_sas_uris(js.keys())
+     total = len(js)
+     print(f'Checking {total} images for existence...')
+     pbar = tqdm(zip(js.keys(), blob_urls), total=total)
+     if check_local is None:
+         # only check Azure Blob Storage
+         for img_path, blob_url in pbar:
+             future = pool.submit(sas_blob_utils.check_blob_exists, blob_url)
+             future_to_img_path[future] = img_path
+     else:
+         # check the local directory before checking Azure Blob Storage
+         for img_path, blob_url in pbar:
+             local_path = os.path.join(check_local, img_path)
+             future = pool.submit(check_local_then_azure, local_path, blob_url)
+             future_to_img_path[future] = img_path
+
+     nonexistent_images = []
+     print('Fetching results...')
+     for future in tqdm(futures.as_completed(future_to_img_path), total=total):
+         img_path = future_to_img_path[future]
+         try:
+             if future.result():  # blob_url exists
+                 continue
+         except Exception as e:  # pylint: disable=broad-except
+             exception_type = type(e).__name__
+             tqdm.write(f'{img_path} - generated {exception_type}: {e}')
+         nonexistent_images.append(img_path)
+         del js[img_path]
+     pool.shutdown()
+
+     print(f'Found {len(nonexistent_images)} nonexistent blobs.')
+     if len(nonexistent_images) > 0:
+         log['nonexistent_images'] = sorted(nonexistent_images)
+
+
+ def remove_images_insufficient_locs(js: MutableMapping[str, dict[str, Any]],
+                                     log: MutableMapping[str, Any],
+                                     min_locs: int) -> None:
+     """
+     Removes images whose labels don't span at least min_locs locations.
+     Modifies [js] and [log] in-place.
+
+     Args:
+         js: dict, image paths <dataset>/<img_file> => info dict
+         log: dict, maps str description to log info
+         min_locs: int, minimum # of locations that each label must have in
+             order to be included
+     """
+
+     # 1st pass: populate label_to_locs
+     # label (tuple of str) => set of (dataset, location)
+     label_to_locs = defaultdict(set)
+     for img_info in js.values():
+         label = tuple(img_info['label'])
+         loc = (img_info['dataset'], img_info.get('location', ''))
+         label_to_locs[label].add(loc)
+
+     bad_labels = set(label for label, locs in label_to_locs.items()
+                      if len(locs) < min_locs)
+     print(f'Found {len(bad_labels)} labels with < {min_locs} locations.')
+
+     # 2nd pass: eliminate bad images
+     if len(bad_labels) > 0:
+         log[f'labels with < {min_locs} locs'] = sorted(bad_labels)
+         for img_path in list(js.keys()):  # copy keys to modify js in-place
+             label = tuple(js[img_path]['label'])
+             if label in bad_labels:
+                 del js[img_path]
+
+
+ def filter_images(output_js: Mapping[str, Mapping[str, Any]], label: str,
+                   datasets: Container[str] | None = None) -> set[str]:
+     """
+     Finds image files from output_js that have a given label and are from
+     a given set of datasets.
+
+     Args:
+         output_js: dict, output of get_output_json()
+         label: str, desired label
+         datasets: optional container of str, dataset names; images from any
+             dataset are allowed if datasets=None
+
+     Returns: set of str, image files that match the filtering criteria
+     """
+
+     img_files: set[str] = set()
+     for img_file, img_dict in output_js.items():
+         cond1 = (label in img_dict['label'])
+         cond2 = (datasets is None or img_dict['dataset'] in datasets)
+         if cond1 and cond2:
+             img_files.add(img_file)
+     return img_files
+
+
+ def sample_with_priority(input_js: Mapping[str, Mapping[str, Any]],
+                          output_js: Mapping[str, dict[str, Any]]
+                          ) -> dict[str, dict[str, Any]]:
+     """
+     Uses the optional 'max_count' and 'prioritize' keys from the input
+     classification label specification JSON file to sample images for each
+     classification label.
+
+     Returns: dict, keys are image file names, sorted alphabetically
+     """
+
+     filtered_imgs: set[str] = set()
+     for label, spec_dict in input_js.items():
+         if 'prioritize' in spec_dict and 'max_count' not in spec_dict:
+             raise ValueError('prioritize is invalid without a max_count value.')
+
+         if 'max_count' not in spec_dict:
+             filtered_imgs.update(filter_images(output_js, label, datasets=None))
+             continue
+         quota = spec_dict['max_count']
+
+         # prioritize is a list of prioritization levels; copy it so that
+         # appending the catch-all level doesn't mutate the input spec
+         prioritize = list(spec_dict.get('prioritize', []))
+         prioritize.append(None)
+
+         for level in prioritize:
+             img_files = filter_images(output_js, label, datasets=level)
+
+             # images already selected for other labels count against the quota
+             num_already_matching = len(img_files & filtered_imgs)
+             quota = max(0, quota - num_already_matching)
+             img_files -= filtered_imgs
+
+             # sort before sampling for determinism (random.sample() also
+             # requires a sequence rather than a set as of Python 3.11)
+             num_to_sample = min(quota, len(img_files))
+             sample = random.sample(sorted(img_files), k=num_to_sample)
+             filtered_imgs.update(sample)
+
+             quota -= num_to_sample
+             if quota == 0:
+                 break
+
+     output_js = {
+         img_file: output_js[img_file]
+         for img_file in sorted(filtered_imgs)
+     }
+     return output_js
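A hedged toy walk-through of the prioritized sampling above (file and dataset names are made up; assumes the module's own imports resolve in your environment):

    import random
    from megadetector.classification.json_validator import sample_with_priority

    random.seed(0)

    # toy queried-images dict: three 'deer' images across two datasets
    output_js = {
        'idfg/a.jpg': {'dataset': 'idfg', 'label': ['deer']},
        'idfg/b.jpg': {'dataset': 'idfg', 'label': ['deer']},
        'caltech/c.jpg': {'dataset': 'caltech', 'label': ['deer']},
    }

    # cap 'deer' at 2 images, drawing from 'idfg' before any other dataset
    input_js = {'deer': {'max_count': 2, 'prioritize': [['idfg']]}}

    print(sorted(sample_with_priority(input_js, output_js)))
    # ['idfg/a.jpg', 'idfg/b.jpg']  -- the idfg level fills the whole quota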
+
+
+ #%% Command-line driver
+
+ def _parse_args() -> argparse.Namespace:
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         description='Validates JSON.')
+     parser.add_argument(
+         'label_spec_json',
+         help='path to JSON file containing label specification')
+     parser.add_argument(
+         'taxonomy_csv',
+         help='path to taxonomy CSV file')
+     parser.add_argument(
+         '--allow-multilabel', action='store_true',
+         help='allow assigning a (dataset, dataset_label) pair to multiple '
+              'output labels')
+     parser.add_argument(
+         '--single-parent-taxonomy', action='store_true',
+         help='flag that restricts the taxonomy to only allow a single parent '
+              'for each taxon node')
+     parser.add_argument(
+         '-c', '--check-blob-exists', nargs='?', const=True,
+         help='check that the blob for each queried image actually exists. Can '
+              'be very slow if reaching throttling limits. Optionally pass in a '
+              'local directory to check before checking Azure Blob Storage.')
+     parser.add_argument(
+         '--min-locs', type=int,
+         help='minimum number of locations that each label must have in order '
+              'to be included')
+     parser.add_argument(
+         '-o', '--output-dir',
+         help='path to directory to save outputs. The output JSON file is saved '
+              'at <output-dir>/queried_images.json, and the mapping from '
+              'classification labels to dataset labels is saved at '
+              '<output-dir>/included_dataset_labels.txt.')
+     parser.add_argument(
+         '--json-indent', type=int,
+         help='number of spaces to use for JSON indent (default no indent), '
+              'only used if --output-dir is given')
+     parser.add_argument(
+         '--seed', type=int, default=123,
+         help='random seed for sampling images, only used if --output-dir is '
+              'given and a label specification includes a "max_count" key')
+     parser.add_argument(
+         '-m', '--mislabeled-images',
+         help='path to `megadb_mislabeled` directory of locally mounted '
+              '`classifier-training` Azure Blob Storage container where known '
+              'mislabeled images are tracked')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+
+     args = _parse_args()
+     main(label_spec_json_path=args.label_spec_json,
+          taxonomy_csv_path=args.taxonomy_csv,
+          allow_multilabel=args.allow_multilabel,
+          single_parent_taxonomy=args.single_parent_taxonomy,
+          check_blob_exists=args.check_blob_exists,
+          min_locs=args.min_locs,
+          output_dir=args.output_dir,
+          json_indent=args.json_indent,
+          seed=args.seed,
+          mislabeled_images_dir=args.mislabeled_images)