megadetector 5.0.10__py3-none-any.whl → 5.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (226)
  1. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/LICENSE +0 -0
  2. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/METADATA +12 -11
  3. megadetector-5.0.11.dist-info/RECORD +5 -0
  4. megadetector-5.0.11.dist-info/top_level.txt +1 -0
  5. api/__init__.py +0 -0
  6. api/batch_processing/__init__.py +0 -0
  7. api/batch_processing/api_core/__init__.py +0 -0
  8. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  9. api/batch_processing/api_core/batch_service/score.py +0 -439
  10. api/batch_processing/api_core/server.py +0 -294
  11. api/batch_processing/api_core/server_api_config.py +0 -98
  12. api/batch_processing/api_core/server_app_config.py +0 -55
  13. api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  14. api/batch_processing/api_core/server_job_status_table.py +0 -152
  15. api/batch_processing/api_core/server_orchestration.py +0 -360
  16. api/batch_processing/api_core/server_utils.py +0 -92
  17. api/batch_processing/api_core_support/__init__.py +0 -0
  18. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  19. api/batch_processing/api_support/__init__.py +0 -0
  20. api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  21. api/batch_processing/data_preparation/__init__.py +0 -0
  22. api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
  23. api/batch_processing/data_preparation/manage_video_batch.py +0 -327
  24. api/batch_processing/integration/digiKam/setup.py +0 -6
  25. api/batch_processing/integration/digiKam/xmp_integration.py +0 -465
  26. api/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -5
  27. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -126
  28. api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -55
  29. api/batch_processing/postprocessing/__init__.py +0 -0
  30. api/batch_processing/postprocessing/add_max_conf.py +0 -64
  31. api/batch_processing/postprocessing/categorize_detections_by_size.py +0 -163
  32. api/batch_processing/postprocessing/combine_api_outputs.py +0 -249
  33. api/batch_processing/postprocessing/compare_batch_results.py +0 -958
  34. api/batch_processing/postprocessing/convert_output_format.py +0 -397
  35. api/batch_processing/postprocessing/load_api_results.py +0 -195
  36. api/batch_processing/postprocessing/md_to_coco.py +0 -310
  37. api/batch_processing/postprocessing/md_to_labelme.py +0 -330
  38. api/batch_processing/postprocessing/merge_detections.py +0 -401
  39. api/batch_processing/postprocessing/postprocess_batch_results.py +0 -1904
  40. api/batch_processing/postprocessing/remap_detection_categories.py +0 -170
  41. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +0 -661
  42. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +0 -211
  43. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +0 -82
  44. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +0 -1631
  45. api/batch_processing/postprocessing/separate_detections_into_folders.py +0 -731
  46. api/batch_processing/postprocessing/subset_json_detector_output.py +0 -696
  47. api/batch_processing/postprocessing/top_folders_to_bottom.py +0 -223
  48. api/synchronous/__init__.py +0 -0
  49. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  50. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -152
  51. api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -266
  52. api/synchronous/api_core/animal_detection_api/config.py +0 -35
  53. api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
  54. api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
  55. api/synchronous/api_core/animal_detection_api/detection/detector_training/model_main_tf2.py +0 -114
  56. api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
  57. api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
  58. api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
  59. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
  60. api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
  61. api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
  62. api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
  63. api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
  64. api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
  65. api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
  66. api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
  67. api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
  68. api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
  69. api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
  70. api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
  71. api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
  72. api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
  73. api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
  74. api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
  75. api/synchronous/api_core/tests/__init__.py +0 -0
  76. api/synchronous/api_core/tests/load_test.py +0 -110
  77. classification/__init__.py +0 -0
  78. classification/aggregate_classifier_probs.py +0 -108
  79. classification/analyze_failed_images.py +0 -227
  80. classification/cache_batchapi_outputs.py +0 -198
  81. classification/create_classification_dataset.py +0 -627
  82. classification/crop_detections.py +0 -516
  83. classification/csv_to_json.py +0 -226
  84. classification/detect_and_crop.py +0 -855
  85. classification/efficientnet/__init__.py +0 -9
  86. classification/efficientnet/model.py +0 -415
  87. classification/efficientnet/utils.py +0 -610
  88. classification/evaluate_model.py +0 -520
  89. classification/identify_mislabeled_candidates.py +0 -152
  90. classification/json_to_azcopy_list.py +0 -63
  91. classification/json_validator.py +0 -695
  92. classification/map_classification_categories.py +0 -276
  93. classification/merge_classification_detection_output.py +0 -506
  94. classification/prepare_classification_script.py +0 -194
  95. classification/prepare_classification_script_mc.py +0 -228
  96. classification/run_classifier.py +0 -286
  97. classification/save_mislabeled.py +0 -110
  98. classification/train_classifier.py +0 -825
  99. classification/train_classifier_tf.py +0 -724
  100. classification/train_utils.py +0 -322
  101. data_management/__init__.py +0 -0
  102. data_management/annotations/__init__.py +0 -0
  103. data_management/annotations/annotation_constants.py +0 -34
  104. data_management/camtrap_dp_to_coco.py +0 -238
  105. data_management/cct_json_utils.py +0 -395
  106. data_management/cct_to_md.py +0 -176
  107. data_management/cct_to_wi.py +0 -289
  108. data_management/coco_to_labelme.py +0 -272
  109. data_management/coco_to_yolo.py +0 -662
  110. data_management/databases/__init__.py +0 -0
  111. data_management/databases/add_width_and_height_to_db.py +0 -33
  112. data_management/databases/combine_coco_camera_traps_files.py +0 -206
  113. data_management/databases/integrity_check_json_db.py +0 -477
  114. data_management/databases/subset_json_db.py +0 -115
  115. data_management/generate_crops_from_cct.py +0 -149
  116. data_management/get_image_sizes.py +0 -188
  117. data_management/importers/add_nacti_sizes.py +0 -52
  118. data_management/importers/add_timestamps_to_icct.py +0 -79
  119. data_management/importers/animl_results_to_md_results.py +0 -158
  120. data_management/importers/auckland_doc_test_to_json.py +0 -372
  121. data_management/importers/auckland_doc_to_json.py +0 -200
  122. data_management/importers/awc_to_json.py +0 -189
  123. data_management/importers/bellevue_to_json.py +0 -273
  124. data_management/importers/cacophony-thermal-importer.py +0 -796
  125. data_management/importers/carrizo_shrubfree_2018.py +0 -268
  126. data_management/importers/carrizo_trail_cam_2017.py +0 -287
  127. data_management/importers/cct_field_adjustments.py +0 -57
  128. data_management/importers/channel_islands_to_cct.py +0 -913
  129. data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  130. data_management/importers/eMammal/eMammal_helpers.py +0 -249
  131. data_management/importers/eMammal/make_eMammal_json.py +0 -223
  132. data_management/importers/ena24_to_json.py +0 -275
  133. data_management/importers/filenames_to_json.py +0 -385
  134. data_management/importers/helena_to_cct.py +0 -282
  135. data_management/importers/idaho-camera-traps.py +0 -1407
  136. data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  137. data_management/importers/jb_csv_to_json.py +0 -150
  138. data_management/importers/mcgill_to_json.py +0 -250
  139. data_management/importers/missouri_to_json.py +0 -489
  140. data_management/importers/nacti_fieldname_adjustments.py +0 -79
  141. data_management/importers/noaa_seals_2019.py +0 -181
  142. data_management/importers/pc_to_json.py +0 -365
  143. data_management/importers/plot_wni_giraffes.py +0 -123
  144. data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  145. data_management/importers/prepare_zsl_imerit.py +0 -131
  146. data_management/importers/rspb_to_json.py +0 -356
  147. data_management/importers/save_the_elephants_survey_A.py +0 -320
  148. data_management/importers/save_the_elephants_survey_B.py +0 -332
  149. data_management/importers/snapshot_safari_importer.py +0 -758
  150. data_management/importers/snapshot_safari_importer_reprise.py +0 -665
  151. data_management/importers/snapshot_serengeti_lila.py +0 -1067
  152. data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  153. data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  154. data_management/importers/sulross_get_exif.py +0 -65
  155. data_management/importers/timelapse_csv_set_to_json.py +0 -490
  156. data_management/importers/ubc_to_json.py +0 -399
  157. data_management/importers/umn_to_json.py +0 -507
  158. data_management/importers/wellington_to_json.py +0 -263
  159. data_management/importers/wi_to_json.py +0 -441
  160. data_management/importers/zamba_results_to_md_results.py +0 -181
  161. data_management/labelme_to_coco.py +0 -548
  162. data_management/labelme_to_yolo.py +0 -272
  163. data_management/lila/__init__.py +0 -0
  164. data_management/lila/add_locations_to_island_camera_traps.py +0 -97
  165. data_management/lila/add_locations_to_nacti.py +0 -147
  166. data_management/lila/create_lila_blank_set.py +0 -557
  167. data_management/lila/create_lila_test_set.py +0 -151
  168. data_management/lila/create_links_to_md_results_files.py +0 -106
  169. data_management/lila/download_lila_subset.py +0 -177
  170. data_management/lila/generate_lila_per_image_labels.py +0 -515
  171. data_management/lila/get_lila_annotation_counts.py +0 -170
  172. data_management/lila/get_lila_image_counts.py +0 -111
  173. data_management/lila/lila_common.py +0 -300
  174. data_management/lila/test_lila_metadata_urls.py +0 -132
  175. data_management/ocr_tools.py +0 -874
  176. data_management/read_exif.py +0 -681
  177. data_management/remap_coco_categories.py +0 -84
  178. data_management/remove_exif.py +0 -66
  179. data_management/resize_coco_dataset.py +0 -189
  180. data_management/wi_download_csv_to_coco.py +0 -246
  181. data_management/yolo_output_to_md_output.py +0 -441
  182. data_management/yolo_to_coco.py +0 -676
  183. detection/__init__.py +0 -0
  184. detection/detector_training/__init__.py +0 -0
  185. detection/detector_training/model_main_tf2.py +0 -114
  186. detection/process_video.py +0 -703
  187. detection/pytorch_detector.py +0 -337
  188. detection/run_detector.py +0 -779
  189. detection/run_detector_batch.py +0 -1219
  190. detection/run_inference_with_yolov5_val.py +0 -917
  191. detection/run_tiled_inference.py +0 -935
  192. detection/tf_detector.py +0 -188
  193. detection/video_utils.py +0 -606
  194. docs/source/conf.py +0 -43
  195. md_utils/__init__.py +0 -0
  196. md_utils/azure_utils.py +0 -174
  197. md_utils/ct_utils.py +0 -612
  198. md_utils/directory_listing.py +0 -246
  199. md_utils/md_tests.py +0 -968
  200. md_utils/path_utils.py +0 -1044
  201. md_utils/process_utils.py +0 -157
  202. md_utils/sas_blob_utils.py +0 -509
  203. md_utils/split_locations_into_train_val.py +0 -228
  204. md_utils/string_utils.py +0 -92
  205. md_utils/url_utils.py +0 -323
  206. md_utils/write_html_image_list.py +0 -225
  207. md_visualization/__init__.py +0 -0
  208. md_visualization/plot_utils.py +0 -293
  209. md_visualization/render_images_with_thumbnails.py +0 -275
  210. md_visualization/visualization_utils.py +0 -1537
  211. md_visualization/visualize_db.py +0 -551
  212. md_visualization/visualize_detector_output.py +0 -406
  213. megadetector-5.0.10.dist-info/RECORD +0 -224
  214. megadetector-5.0.10.dist-info/top_level.txt +0 -8
  215. taxonomy_mapping/__init__.py +0 -0
  216. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +0 -491
  217. taxonomy_mapping/map_new_lila_datasets.py +0 -154
  218. taxonomy_mapping/prepare_lila_taxonomy_release.py +0 -142
  219. taxonomy_mapping/preview_lila_taxonomy.py +0 -591
  220. taxonomy_mapping/retrieve_sample_image.py +0 -71
  221. taxonomy_mapping/simple_image_download.py +0 -218
  222. taxonomy_mapping/species_lookup.py +0 -834
  223. taxonomy_mapping/taxonomy_csv_checker.py +0 -159
  224. taxonomy_mapping/taxonomy_graph.py +0 -346
  225. taxonomy_mapping/validate_lila_category_mappings.py +0 -83
  226. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/WHEEL +0 -0
classification/json_validator.py (file removed)
@@ -1,695 +0,0 @@
- """
-
- json_validator.py
-
- Validates a classification label specification JSON file and optionally
- queries MegaDB to find matching image files.
-
- See README.md for an example of a classification label specification JSON file.
-
- The validation step takes the classification label specification JSON file and
- finds the dataset labels that belong to each classification label. It checks
- that the following conditions hold:
-
- 1) Each classification label specification matches at least 1 dataset label.
-
- 2) If the classification label includes a taxonomic specification, then the
-    taxon is actually part of our master taxonomy.
-
- 3) If the 'prioritize' key is found for a given label, then the label must
-    also have a 'max_count' key.
-
- 4) If --allow-multilabel=False, then no dataset label is included in more than
-    one classification label.
-
- If --output-dir <output_dir> is given, then we query MegaDB for images
- that match the dataset labels identified during the validation step. We filter
- out images that have unaccepted file extensions and images that don't actually
- exist in Azure Blob Storage. In total, we output the following files:
-
- <output_dir>/
-
- - included_dataset_labels.txt
-   lists the original dataset classes included for each classification label
-
- - image_counts_by_label_presample.json
-   number of images for each classification label after filtering bad
-   images, but before sampling
-
- - image_counts_by_label_sampled.json
-   number of images for each classification label in queried_images.json
-
- - json_validator_log_{timestamp}.json
-   log of excluded images / labels
-
- - queried_images.json
-   main output file, ex:
-
-   {
-       "caltech/cct_images/59f5fe2b-23d2-11e8-a6a3-ec086b02610b.jpg": {
-           "dataset": "caltech",
-           "location": 13,
-           "class": "mountain_lion",   // class from dataset
-           "label": ["mountain_lion"]  // labels to use in classifier
-       },
-       "caltech/cct_images/59f79901-23d2-11e8-a6a3-ec086b02610b.jpg": {
-           "dataset": "caltech",
-           "location": 13,
-           "class": "mountain_lion",   // class from dataset
-           "bbox": [{"category": "animal",
-                     "bbox": [0, 0.347, 0.237, 0.257]}],
-           "label": ["mountain_lion"]  // labels to use in classifier
-       },
-       ...
-   }
-
- """
-
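For reference, the label specification format described in the docstring boils down to entries like the following (a minimal sketch reusing the caltech names from the example above; the taxon entry and 'max_count' value are illustrative assumptions, not taken from the file):

    {
        "mountain_lion": {
            "taxa": [{"level": "species", "name": "puma concolor"}],
            "dataset_labels": {"caltech": ["mountain_lion"]},
            "max_count": 5000
        }
    }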
- from __future__ import annotations
-
- import argparse
- from collections import defaultdict
- from collections.abc import Container, Iterable, Mapping, MutableMapping
- from concurrent import futures
- from datetime import datetime
- import json
- import os
- import pprint
- import random
- from typing import Any
-
- import pandas as pd
- from md_utils import path_utils
- from md_utils import sas_blob_utils
- from tqdm import tqdm
-
- from data_management.megadb import megadb_utils
- from taxonomy_mapping.taxonomy_graph import (
-     build_taxonomy_graph, dag_to_tree, TaxonNode)
-
-
- #%% Example usage
-
- """
- python json_validator.py label_spec.json \
-     $HOME/camera-traps-private/camera_trap_taxonomy_mapping.csv \
-     --output-dir run --json-indent 2
- """
-
-
- #%% Main function
-
- def main(label_spec_json_path: str,
-          taxonomy_csv_path: str,
-          allow_multilabel: bool = False,
-          single_parent_taxonomy: bool = False,
-          check_blob_exists: bool | str = False,
-          min_locs: int | None = None,
-          output_dir: str | None = None,
-          json_indent: int | None = None,
-          seed: int = 123,
-          mislabeled_images_dir: str | None = None) -> None:
-
-     # input validation
-     assert os.path.exists(label_spec_json_path)
-     assert os.path.exists(taxonomy_csv_path)
-     if mislabeled_images_dir is not None:
-         assert os.path.isdir(mislabeled_images_dir)
-
-     random.seed(seed)
-
-     print('Building taxonomy hierarchy')
-     taxonomy_df = pd.read_csv(taxonomy_csv_path)
-     if single_parent_taxonomy:
-         TaxonNode.single_parent_only = True
-     graph, taxonomy_dict, _ = build_taxonomy_graph(taxonomy_df)
-     dag_to_tree(graph, taxonomy_dict)
-
-     print('Validating input json')
-     with open(label_spec_json_path, 'r') as f:
-         input_js = json.load(f)
-     label_to_inclusions = validate_json(
-         input_js, taxonomy_dict, allow_multilabel=allow_multilabel)
-
-     if output_dir is None:
-         pprint.pprint(label_to_inclusions)
-         return
-
-     os.makedirs(output_dir, exist_ok=True)
-     labels_path = os.path.join(output_dir, 'included_dataset_labels.txt')
-     with open(labels_path, 'w') as f:
-         pprint.pprint(label_to_inclusions, stream=f)
-
-     # use MegaDB to generate list of images
-     print('Generating output json')
-     output_js = get_output_json(label_to_inclusions, mislabeled_images_dir)
-     print(f'In total found {len(output_js)} images')
-
-     # only keep images that:
-     # 1) end in a supported file extension,
-     # 2) actually exist in Azure Blob Storage, and
-     # 3) belong to a label with at least min_locs locations
-     log: dict[str, Any] = {}
-     remove_non_images(output_js, log)
-     if isinstance(check_blob_exists, str):
-         remove_nonexistent_images(output_js, log, check_local=check_blob_exists)
-     elif check_blob_exists:
-         remove_nonexistent_images(output_js, log)
-     if min_locs is not None:
-         remove_images_insufficient_locs(output_js, log, min_locs)
-
-     # write out log of images / labels that were removed
-     date = datetime.now().strftime('%Y%m%d_%H%M%S')  # ex: '20200722_110816'
-     log_path = os.path.join(output_dir, f'json_validator_log_{date}.json')
-     print(f'Saving log of bad images to {log_path}')
-     with open(log_path, 'w') as f:
-         json.dump(log, f, indent=1)
-
-     # save label counts, pre-subsampling
-     print('Saving pre-sampling label counts')
-     save_path = os.path.join(output_dir, 'image_counts_by_label_presample.json')
-     with open(save_path, 'w') as f:
-         image_counts_by_label = {
-             label: len(filter_images(output_js, label))
-             for label in sorted(input_js.keys())
-         }
-         json.dump(image_counts_by_label, f, indent=1)
-
-     print('Sampling with priority (if needed)')
-     output_js = sample_with_priority(input_js, output_js)
-
-     print('Saving queried_images.json')
-     output_json_path = os.path.join(output_dir, 'queried_images.json')
-     with open(output_json_path, 'w') as f:
-         json.dump(output_js, f, indent=json_indent)
-
-     # save label counts, post-subsampling
-     print('Saving post-sampling label counts')
-     save_path = os.path.join(output_dir, 'image_counts_by_label_sampled.json')
-     with open(save_path, 'w') as f:
-         image_counts_by_label = {
-             label: len(filter_images(output_js, label))
-             for label in sorted(input_js.keys())
-         }
-         json.dump(image_counts_by_label, f, indent=1)
-
-
- #%% Support functions
-
- def parse_spec(spec_dict: Mapping[str, Any],
-                taxonomy_dict: dict[tuple[str, str], TaxonNode]
-                ) -> set[tuple[str, str]]:
-     """
-     Gathers the dataset labels corresponding to a particular classification
-     label specification.
-
-     Args:
-         spec_dict: dict, contains keys ['taxa', 'dataset_labels']
-         taxonomy_dict: dict, maps (taxon_level, taxon_name) to a TaxonNode
-
-     Returns: set of (ds, ds_label), dataset labels requested by the spec
-
-     Raises: ValueError, if specification does not match any dataset labels
-     """
-
-     results = set()
-     if 'taxa' in spec_dict:
-         # spec_dict['taxa']: list of dict
-         # [
-         #     {'level': 'family', 'name': 'cervidae', 'datasets': ['idfg']},
-         #     {'level': 'genus', 'name': 'meleagris'}
-         # ]
-         for taxon in spec_dict['taxa']:
-             key = (taxon['level'].lower(), taxon['name'].lower())
-             datasets = taxon.get('datasets', None)
-             results |= taxonomy_dict[key].get_dataset_labels(datasets)
-
-     if 'dataset_labels' in spec_dict:
-         # spec_dict['dataset_labels']: dict, dataset => list of dataset_label
-         # {
-         #     "idfg": ["deer", "elk", "prong"],
-         #     "idfg_swwlf_2019": ["elk", "muledeer", "whitetaileddeer"]
-         # }
-         for ds, ds_labels in spec_dict['dataset_labels'].items():
-             for ds_label in ds_labels:
-                 results.add((ds, ds_label))
-
-     if len(results) == 0:
-         raise ValueError('specification matched no dataset labels')
-     return results
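As a quick illustration of parse_spec (hypothetical dataset names; the taxonomy dict is never consulted when a spec only lists 'dataset_labels'):

    spec = {'dataset_labels': {'idfg': ['deer', 'elk']}}
    parse_spec(spec, taxonomy_dict={})
    # -> {('idfg', 'deer'), ('idfg', 'elk')}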
-
-
- def validate_json(input_js: dict[str, dict[str, Any]],
-                   taxonomy_dict: dict[tuple[str, str], TaxonNode],
-                   allow_multilabel: bool) -> dict[str, set[tuple[str, str]]]:
-     """
-     Validates JSON.
-
-     Args:
-         input_js: dict, Python dict representation of JSON file,
-             see classification/README.md
-         taxonomy_dict: dict, maps (taxon_level, taxon_name) to a TaxonNode
-         allow_multilabel: bool, whether to allow a dataset label to be assigned
-             to multiple output labels
-
-     Returns: dict, maps label name to set of (dataset, dataset_label) tuples
-
-     Raises: ValueError, if a classification label specification matches no
-         dataset labels, or if allow_multilabel=False but a dataset label is
-         included in two or more classification labels
-     """
-
-     # maps output label name to set of (dataset, dataset_label) tuples
-     label_to_inclusions: dict[str, set[tuple[str, str]]] = {}
-     for label, spec_dict in input_js.items():
-         include_set = parse_spec(spec_dict, taxonomy_dict)
-         if 'exclude' in spec_dict:
-             include_set -= parse_spec(spec_dict['exclude'], taxonomy_dict)
-
-         for label_b, set_b in label_to_inclusions.items():
-             shared = include_set.intersection(set_b)
-             if len(shared) > 0:
-                 print(f'Labels {label} and {label_b} share images:', shared)
-                 if not allow_multilabel:
-                     raise ValueError('Intersection between sets!')
-
-         label_to_inclusions[label] = include_set
-     return label_to_inclusions
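A sketch of the multilabel check (hypothetical labels): with allow_multilabel=False, two classification labels that both claim ('idfg', 'deer') trigger the ValueError above:

    input_js = {
        'deer': {'dataset_labels': {'idfg': ['deer']}},
        'ungulate': {'dataset_labels': {'idfg': ['deer', 'elk']}},
    }
    validate_json(input_js, taxonomy_dict={}, allow_multilabel=False)
    # raises ValueError: both labels include ('idfg', 'deer')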
-
-
- def get_output_json(label_to_inclusions: dict[str, set[tuple[str, str]]],
-                     mislabeled_images_dir: str | None = None
-                     ) -> dict[str, dict[str, Any]]:
-     """
-     Queries MegaDB to get image paths matching dataset_labels.
-
-     Args:
-         label_to_inclusions: dict, maps label name to set of
-             (dataset, dataset_label) tuples, output of validate_json()
-         mislabeled_images_dir: str, path to directory of CSVs with known
-             mislabeled images
-
-     Returns: dict, maps sorted image_path <dataset>/<img_file> to a dict of
-         properties
-         - 'dataset': str, name of dataset that image is from
-         - 'location': str or int, optional
-         - 'class': str, class label from the dataset
-         - 'label': list of str, assigned output label
-         - 'bbox': list of dicts, optional
-     """
-
-     # Because MegaDB is organized by dataset, we do the same...
-     #
-     # ds_to_labels = {
-     #     'dataset_name': {
-     #         'dataset_label': [output_label1, output_label2]
-     #     }
-     # }
-     ds_to_labels: dict[str, dict[str, list[str]]] = {}
-     for output_label, ds_dslabels_set in label_to_inclusions.items():
-         for (ds, ds_label) in ds_dslabels_set:
-             if ds not in ds_to_labels:
-                 ds_to_labels[ds] = {}
-             if ds_label not in ds_to_labels[ds]:
-                 ds_to_labels[ds][ds_label] = []
-             ds_to_labels[ds][ds_label].append(output_label)
-
-     # we need the datasets table for getting full image paths
-     megadb = megadb_utils.MegadbUtils()
-     datasets_table = megadb.get_datasets_table()
-
-     # The line
-     #     [img.class[0], seq.class[0]][0] as class
-     # selects the image-level class label if available. Otherwise it selects
-     # the sequence-level class label. This line assumes the following
-     # conditions, expressed in the WHERE clause:
-     # - at least one of the image or sequence class labels is given
-     # - the image and sequence class labels are arrays with length at most 1
-     # - the image class label takes priority over the sequence class label
-     #
-     # In Azure Cosmos DB, if a field is not defined, then it is simply excluded
-     # from the result. For example, on the following JSON object,
-     #     {
-     #         "dataset": "camera_traps",
-     #         "seq_id": "1234",
-     #         "location": "A1",
-     #         "images": [{"file": "abcd.jpeg"}],
-     #         "class": ["deer"],
-     #     }
-     # the array [img.class[0], seq.class[0]] just gives ['deer'] because
-     # img.class is undefined and therefore excluded.
-     query = '''
-     SELECT
-         seq.dataset,
-         seq.location,
-         img.file,
-         [img.class[0], seq.class[0]][0] as class,
-         img.bbox
-     FROM sequences seq JOIN img IN seq.images
-     WHERE (ARRAY_LENGTH(img.class) = 1
-            AND ARRAY_CONTAINS(@dataset_labels, img.class[0]))
-         OR (ARRAY_LENGTH(seq.class) = 1
-             AND ARRAY_CONTAINS(@dataset_labels, seq.class[0])
-             AND (NOT IS_DEFINED(img.class)))
-     '''
-
-     output_json = {}  # maps full image path to json object
-
-     for ds in tqdm(sorted(ds_to_labels.keys())):  # sort for determinism
-
-         mislabeled_images: Mapping[str, Any] = {}
-         if mislabeled_images_dir is not None:
-             csv_path = os.path.join(mislabeled_images_dir, f'{ds}.csv')
-             if os.path.exists(csv_path):
-                 mislabeled_images = pd.read_csv(csv_path, index_col='file',
-                                                 squeeze=True)
-
-         ds_labels = sorted(ds_to_labels[ds].keys())
-         tqdm.write(f'Querying dataset "{ds}" for dataset labels: {ds_labels}')
-
-         start = datetime.now()
-         parameters = [dict(name='@dataset_labels', value=ds_labels)]
-         results = megadb.query_sequences_table(
-             query, partition_key=ds, parameters=parameters)
-         elapsed = (datetime.now() - start).total_seconds()
-         tqdm.write(f'- query took {elapsed:.0f}s, found {len(results)} images')
-
-         # if no path prefix, set it to the empty string '', because
-         # os.path.join('', x, '') = '{x}/'
-         path_prefix = datasets_table[ds].get('path_prefix', '')
-         count_corrected = 0
-         count_removed = 0
-         for result in results:
-             # result keys
-             # - already has: ['dataset', 'location', 'file', 'class', 'bbox']
-             # - add ['label'], remove ['file']
-             img_file = os.path.join(path_prefix, result['file'])
-
-             # if img is mislabeled but we don't know the correct class, skip
-             # it; otherwise, update the img with the correct class, but skip
-             # the img if the correct class is not one we queried for
-             if img_file in mislabeled_images:
-                 new_class = mislabeled_images[img_file]
-                 if pd.isna(new_class) or new_class not in ds_to_labels[ds]:
-                     count_removed += 1
-                     continue
-
-                 count_corrected += 1
-                 result['class'] = new_class
-
-             img_path = os.path.join(ds, img_file)
-             del result['file']
-             ds_label = result['class']
-             result['label'] = ds_to_labels[ds][ds_label]
-             output_json[img_path] = result
-
-         tqdm.write(f'- Removed {count_removed} mislabeled images.')
-         tqdm.write(f'- Corrected labels for {count_corrected} images.')
-
-     # sort keys for determinism
-     output_json = {k: output_json[k] for k in sorted(output_json.keys())}
-     return output_json
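The class-coalescing rule in the Cosmos DB query above can be mirrored in plain Python; this is an illustrative sketch, not part of the package:

    def coalesce_class(img: dict, seq: dict) -> str:
        # Mirrors [img.class[0], seq.class[0]][0]: undefined entries drop out
        # of the array in Cosmos DB, so an image-level label wins when present,
        # and the sequence-level label is the fallback.
        candidates = img.get('class', []) + seq.get('class', [])
        return candidates[0]

    assert coalesce_class({'class': ['elk']}, {'class': ['deer']}) == 'elk'
    assert coalesce_class({}, {'class': ['deer']}) == 'deer'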
-
-
- def get_image_sas_uris(img_paths: Iterable[str]) -> list[str]:
-     """
-     Converts image paths to Azure Blob Storage blob URIs with SAS tokens.
-
-     Args:
-         img_paths: list of str, <dataset-name>/<image-filename>
-
-     Returns:
-         image_sas_uris: list of str, image blob URIs with SAS tokens, ready to
-             pass to the batch detection API
-     """
-
-     # we need the datasets table for getting SAS keys
-     datasets_table = megadb_utils.MegadbUtils().get_datasets_table()
-
-     image_sas_uris = []
-     for img_path in img_paths:
-         dataset, img_file = img_path.split('/', maxsplit=1)
-
-         # strip leading '?' from SAS token
-         sas_token = datasets_table[dataset]['container_sas_key']
-         if sas_token[0] == '?':
-             sas_token = sas_token[1:]
-
-         image_sas_uri = sas_blob_utils.build_azure_storage_uri(
-             account=datasets_table[dataset]['storage_account'],
-             container=datasets_table[dataset]['container'],
-             blob=img_file,
-             sas_token=sas_token)
-         image_sas_uris.append(image_sas_uri)
-     return image_sas_uris
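For a path like 'caltech/cct_images/abc.jpg' (hypothetical blob name), the split yields dataset='caltech' and blob='cct_images/abc.jpg'; assuming build_azure_storage_uri composes the standard Azure Blob Storage URL layout, the result looks like:

    https://<storage_account>.blob.core.windows.net/<container>/cct_images/abc.jpg?<sas_token>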
-
-
- def remove_non_images(js: MutableMapping[str, dict[str, Any]],
-                       log: MutableMapping[str, Any]) -> None:
-     """
-     Remove images with non-image file extensions. Modifies [js] and [log]
-     in-place.
-
-     Args:
-         js: dict, img_path => info dict
-         log: dict, maps str description to log info
-     """
-     print('Removing images with invalid image file extensions...')
-     nonimg_paths = [k for k in js.keys() if not path_utils.is_image_file(k)]
-     for img_path in nonimg_paths:
-         del js[img_path]
-     print(f'Removed {len(nonimg_paths)} files with non-image extensions.')
-     if len(nonimg_paths) > 0:
-         log['nonimage_files'] = sorted(nonimg_paths)
-
-
- def remove_nonexistent_images(js: MutableMapping[str, dict[str, Any]],
-                               log: MutableMapping[str, Any],
-                               check_local: str | None = None,
-                               num_threads: int = 50) -> None:
-     """
-     Remove images that don't actually exist locally or on Azure Blob Storage.
-     Modifies [js] and [log] in-place.
-
-     Args:
-         js: dict, image paths <dataset>/<img_file> => info dict
-         log: dict, maps str description to log info
-         check_local: optional str, path to local dir
-         num_threads: int, number of threads to use for checking blob existence
-     """
-
-     def check_local_then_azure(local_path: str, blob_url: str) -> bool:
-         return (os.path.exists(local_path)
-                 or sas_blob_utils.check_blob_exists(blob_url))
-
-     pool = futures.ThreadPoolExecutor(max_workers=num_threads)
-     future_to_img_path = {}
-     blob_urls = get_image_sas_uris(js.keys())
-     total = len(js)
-     print(f'Checking {total} images for existence...')
-     pbar = tqdm(zip(js.keys(), blob_urls), total=total)
-     if check_local is None:
-         # only check Azure Blob Storage
-         for img_path, blob_url in pbar:
-             future = pool.submit(sas_blob_utils.check_blob_exists, blob_url)
-             future_to_img_path[future] = img_path
-     else:
-         # check local directory first before checking Azure Blob Storage
-         for img_path, blob_url in pbar:
-             local_path = os.path.join(check_local, img_path)
-             future = pool.submit(check_local_then_azure, local_path, blob_url)
-             future_to_img_path[future] = img_path
-
-     nonexistent_images = []
-     print('Fetching results...')
-     for future in tqdm(futures.as_completed(future_to_img_path), total=total):
-         img_path = future_to_img_path[future]
-         try:
-             if future.result():  # blob_url exists
-                 continue
-         except Exception as e:  # pylint: disable=broad-except
-             exception_type = type(e).__name__
-             tqdm.write(f'{img_path} - generated {exception_type}: {e}')
-         nonexistent_images.append(img_path)
-         del js[img_path]
-     pool.shutdown()
-
-     print(f'Found {len(nonexistent_images)} nonexistent blobs.')
-     if len(nonexistent_images) > 0:
-         log['nonexistent_images'] = sorted(nonexistent_images)
-
-
- def remove_images_insufficient_locs(js: MutableMapping[str, dict[str, Any]],
-                                     log: MutableMapping[str, Any],
-                                     min_locs: int) -> None:
-     """
-     Removes images whose labels don't have at least min_locs locations.
-     Modifies [js] and [log] in-place.
-
-     Args:
-         js: dict, image paths <dataset>/<img_file> => info dict
-         log: dict, maps str description to log info
-         min_locs: int, minimum # of locations that each label must have in
-             order to be included
-     """
-
-     # 1st pass: populate label_to_locs,
-     # label (tuple of str) => set of (dataset, location)
-     label_to_locs = defaultdict(set)
-     for img_path, img_info in js.items():
-         label = tuple(img_info['label'])
-         loc = (img_info['dataset'], img_info.get('location', ''))
-         label_to_locs[label].add(loc)
-
-     bad_labels = set(label for label, locs in label_to_locs.items()
-                      if len(locs) < min_locs)
-     print(f'Found {len(bad_labels)} labels with < {min_locs} locations.')
-
-     # 2nd pass: eliminate bad images
-     if len(bad_labels) > 0:
-         log[f'labels with < {min_locs} locs'] = sorted(bad_labels)
-         for img_path in list(js.keys()):  # copy keys to modify js in-place
-             label = tuple(js[img_path]['label'])
-             if label in bad_labels:
-                 del js[img_path]
-
-
- def filter_images(output_js: Mapping[str, Mapping[str, Any]], label: str,
-                   datasets: Container[str] | None = None) -> set[str]:
-     """
-     Finds image files from output_js that have a given label and are from
-     a given set of datasets.
-
-     Args:
-         output_js: dict, output of get_output_json()
-         label: str, desired label
-         datasets: optional container of str, dataset names; images from any
-             dataset are allowed if datasets=None
-
-     Returns: set of str, image files that match the filtering criteria
-     """
-
-     img_files: set[str] = set()
-     for img_file, img_dict in output_js.items():
-         cond1 = (label in img_dict['label'])
-         cond2 = (datasets is None or img_dict['dataset'] in datasets)
-         if cond1 and cond2:
-             img_files.add(img_file)
-     return img_files
-
-
- def sample_with_priority(input_js: Mapping[str, Mapping[str, Any]],
-                          output_js: Mapping[str, dict[str, Any]]
-                          ) -> dict[str, dict[str, Any]]:
-     """
-     Uses the optional 'max_count' and 'prioritize' keys from the input
-     classification label specification JSON file to sample images for each
-     classification label.
-
-     Returns: dict, keys are image file names, sorted alphabetically
-     """
-
-     filtered_imgs: set[str] = set()
-     for label, spec_dict in input_js.items():
-         if 'prioritize' in spec_dict and 'max_count' not in spec_dict:
-             raise ValueError('prioritize is invalid without a max_count value.')
-
-         if 'max_count' not in spec_dict:
-             filtered_imgs.update(filter_images(output_js, label, datasets=None))
-             continue
-         quota = spec_dict['max_count']
-
-         # prioritize is a list of prioritization levels
-         prioritize = spec_dict.get('prioritize', [])
-         prioritize.append(None)
-
-         for level in prioritize:
-             img_files = filter_images(output_js, label, datasets=level)
-
-             # don't count images that were already selected earlier
-             num_already_matching = len(img_files & filtered_imgs)
-             quota = max(0, quota - num_already_matching)
-             img_files -= filtered_imgs
-
-             num_to_sample = min(quota, len(img_files))
-             sample = random.sample(img_files, k=num_to_sample)
-             filtered_imgs.update(sample)
-
-             quota -= num_to_sample
-             if quota == 0:
-                 break
-
-     output_js = {
-         img_file: output_js[img_file]
-         for img_file in sorted(filtered_imgs)
-     }
-     return output_js
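For example (a hypothetical spec, reusing names from parse_spec's docstring), this input caps 'deer' at 1000 images, sampling from the 'idfg' dataset first and only then from all remaining datasets (the implicit None level appended above):

    input_js = {
        'deer': {
            'dataset_labels': {'idfg': ['deer'], 'caltech': ['deer']},
            'max_count': 1000,
            'prioritize': [['idfg']],
        }
    }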
-
-
- #%% Command-line driver
-
- def _parse_args() -> argparse.Namespace:
-
-     parser = argparse.ArgumentParser(
-         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-         description='Validates JSON.')
-     parser.add_argument(
-         'label_spec_json',
-         help='path to JSON file containing label specification')
-     parser.add_argument(
-         'taxonomy_csv',
-         help='path to taxonomy CSV file')
-     parser.add_argument(
-         '--allow-multilabel', action='store_true',
-         help='allow assigning a (dataset, dataset_label) pair to multiple '
-              'output labels')
-     parser.add_argument(
-         '--single-parent-taxonomy', action='store_true',
-         help='flag that restricts the taxonomy to only allow a single parent '
-              'for each taxon node')
-     parser.add_argument(
-         '-c', '--check-blob-exists', nargs='?', const=True,
-         help='check that the blob for each queried image actually exists. Can '
-              'be very slow if reaching throttling limits. Optionally pass in a '
-              'local directory to check before checking Azure Blob Storage.')
-     parser.add_argument(
-         '--min-locs', type=int,
-         help='minimum number of locations that each label must have in order '
-              'to be included')
-     parser.add_argument(
-         '-o', '--output-dir',
-         help='path to directory to save outputs. The output JSON file is saved '
-              'at <output-dir>/queried_images.json, and the mapping from '
-              'classification labels to dataset labels is saved at '
-              '<output-dir>/included_dataset_labels.txt.')
-     parser.add_argument(
-         '--json-indent', type=int,
-         help='number of spaces to use for JSON indent (default no indent), '
-              'only used if --output-dir is given')
-     parser.add_argument(
-         '--seed', type=int, default=123,
-         help='random seed for sampling images, only used if --output-dir is '
-              'given and a label specification includes a "max_count" key')
-     parser.add_argument(
-         '-m', '--mislabeled-images',
-         help='path to `megadb_mislabeled` directory of locally mounted '
-              '`classifier-training` Azure Blob Storage container where known '
-              'mislabeled images are tracked')
-     return parser.parse_args()
-
-
- if __name__ == '__main__':
-
-     args = _parse_args()
-     main(label_spec_json_path=args.label_spec_json,
-          taxonomy_csv_path=args.taxonomy_csv,
-          allow_multilabel=args.allow_multilabel,
-          single_parent_taxonomy=args.single_parent_taxonomy,
-          check_blob_exists=args.check_blob_exists,
-          min_locs=args.min_locs,
-          output_dir=args.output_dir,
-          json_indent=args.json_indent,
-          seed=args.seed,
-          mislabeled_images_dir=args.mislabeled_images)