megadetector-5.0.10-py3-none-any.whl → megadetector-5.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (226)
  1. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/LICENSE +0 -0
  2. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/METADATA +12 -11
  3. megadetector-5.0.11.dist-info/RECORD +5 -0
  4. megadetector-5.0.11.dist-info/top_level.txt +1 -0
  5. api/__init__.py +0 -0
  6. api/batch_processing/__init__.py +0 -0
  7. api/batch_processing/api_core/__init__.py +0 -0
  8. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  9. api/batch_processing/api_core/batch_service/score.py +0 -439
  10. api/batch_processing/api_core/server.py +0 -294
  11. api/batch_processing/api_core/server_api_config.py +0 -98
  12. api/batch_processing/api_core/server_app_config.py +0 -55
  13. api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  14. api/batch_processing/api_core/server_job_status_table.py +0 -152
  15. api/batch_processing/api_core/server_orchestration.py +0 -360
  16. api/batch_processing/api_core/server_utils.py +0 -92
  17. api/batch_processing/api_core_support/__init__.py +0 -0
  18. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  19. api/batch_processing/api_support/__init__.py +0 -0
  20. api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  21. api/batch_processing/data_preparation/__init__.py +0 -0
  22. api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
  23. api/batch_processing/data_preparation/manage_video_batch.py +0 -327
  24. api/batch_processing/integration/digiKam/setup.py +0 -6
  25. api/batch_processing/integration/digiKam/xmp_integration.py +0 -465
  26. api/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -5
  27. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -126
  28. api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -55
  29. api/batch_processing/postprocessing/__init__.py +0 -0
  30. api/batch_processing/postprocessing/add_max_conf.py +0 -64
  31. api/batch_processing/postprocessing/categorize_detections_by_size.py +0 -163
  32. api/batch_processing/postprocessing/combine_api_outputs.py +0 -249
  33. api/batch_processing/postprocessing/compare_batch_results.py +0 -958
  34. api/batch_processing/postprocessing/convert_output_format.py +0 -397
  35. api/batch_processing/postprocessing/load_api_results.py +0 -195
  36. api/batch_processing/postprocessing/md_to_coco.py +0 -310
  37. api/batch_processing/postprocessing/md_to_labelme.py +0 -330
  38. api/batch_processing/postprocessing/merge_detections.py +0 -401
  39. api/batch_processing/postprocessing/postprocess_batch_results.py +0 -1904
  40. api/batch_processing/postprocessing/remap_detection_categories.py +0 -170
  41. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +0 -661
  42. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +0 -211
  43. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +0 -82
  44. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +0 -1631
  45. api/batch_processing/postprocessing/separate_detections_into_folders.py +0 -731
  46. api/batch_processing/postprocessing/subset_json_detector_output.py +0 -696
  47. api/batch_processing/postprocessing/top_folders_to_bottom.py +0 -223
  48. api/synchronous/__init__.py +0 -0
  49. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  50. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -152
  51. api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -266
  52. api/synchronous/api_core/animal_detection_api/config.py +0 -35
  53. api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
  54. api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
  55. api/synchronous/api_core/animal_detection_api/detection/detector_training/model_main_tf2.py +0 -114
  56. api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
  57. api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
  58. api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
  59. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
  60. api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
  61. api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
  62. api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
  63. api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
  64. api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
  65. api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
  66. api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
  67. api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
  68. api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
  69. api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
  70. api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
  71. api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
  72. api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
  73. api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
  74. api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
  75. api/synchronous/api_core/tests/__init__.py +0 -0
  76. api/synchronous/api_core/tests/load_test.py +0 -110
  77. classification/__init__.py +0 -0
  78. classification/aggregate_classifier_probs.py +0 -108
  79. classification/analyze_failed_images.py +0 -227
  80. classification/cache_batchapi_outputs.py +0 -198
  81. classification/create_classification_dataset.py +0 -627
  82. classification/crop_detections.py +0 -516
  83. classification/csv_to_json.py +0 -226
  84. classification/detect_and_crop.py +0 -855
  85. classification/efficientnet/__init__.py +0 -9
  86. classification/efficientnet/model.py +0 -415
  87. classification/efficientnet/utils.py +0 -610
  88. classification/evaluate_model.py +0 -520
  89. classification/identify_mislabeled_candidates.py +0 -152
  90. classification/json_to_azcopy_list.py +0 -63
  91. classification/json_validator.py +0 -695
  92. classification/map_classification_categories.py +0 -276
  93. classification/merge_classification_detection_output.py +0 -506
  94. classification/prepare_classification_script.py +0 -194
  95. classification/prepare_classification_script_mc.py +0 -228
  96. classification/run_classifier.py +0 -286
  97. classification/save_mislabeled.py +0 -110
  98. classification/train_classifier.py +0 -825
  99. classification/train_classifier_tf.py +0 -724
  100. classification/train_utils.py +0 -322
  101. data_management/__init__.py +0 -0
  102. data_management/annotations/__init__.py +0 -0
  103. data_management/annotations/annotation_constants.py +0 -34
  104. data_management/camtrap_dp_to_coco.py +0 -238
  105. data_management/cct_json_utils.py +0 -395
  106. data_management/cct_to_md.py +0 -176
  107. data_management/cct_to_wi.py +0 -289
  108. data_management/coco_to_labelme.py +0 -272
  109. data_management/coco_to_yolo.py +0 -662
  110. data_management/databases/__init__.py +0 -0
  111. data_management/databases/add_width_and_height_to_db.py +0 -33
  112. data_management/databases/combine_coco_camera_traps_files.py +0 -206
  113. data_management/databases/integrity_check_json_db.py +0 -477
  114. data_management/databases/subset_json_db.py +0 -115
  115. data_management/generate_crops_from_cct.py +0 -149
  116. data_management/get_image_sizes.py +0 -188
  117. data_management/importers/add_nacti_sizes.py +0 -52
  118. data_management/importers/add_timestamps_to_icct.py +0 -79
  119. data_management/importers/animl_results_to_md_results.py +0 -158
  120. data_management/importers/auckland_doc_test_to_json.py +0 -372
  121. data_management/importers/auckland_doc_to_json.py +0 -200
  122. data_management/importers/awc_to_json.py +0 -189
  123. data_management/importers/bellevue_to_json.py +0 -273
  124. data_management/importers/cacophony-thermal-importer.py +0 -796
  125. data_management/importers/carrizo_shrubfree_2018.py +0 -268
  126. data_management/importers/carrizo_trail_cam_2017.py +0 -287
  127. data_management/importers/cct_field_adjustments.py +0 -57
  128. data_management/importers/channel_islands_to_cct.py +0 -913
  129. data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  130. data_management/importers/eMammal/eMammal_helpers.py +0 -249
  131. data_management/importers/eMammal/make_eMammal_json.py +0 -223
  132. data_management/importers/ena24_to_json.py +0 -275
  133. data_management/importers/filenames_to_json.py +0 -385
  134. data_management/importers/helena_to_cct.py +0 -282
  135. data_management/importers/idaho-camera-traps.py +0 -1407
  136. data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  137. data_management/importers/jb_csv_to_json.py +0 -150
  138. data_management/importers/mcgill_to_json.py +0 -250
  139. data_management/importers/missouri_to_json.py +0 -489
  140. data_management/importers/nacti_fieldname_adjustments.py +0 -79
  141. data_management/importers/noaa_seals_2019.py +0 -181
  142. data_management/importers/pc_to_json.py +0 -365
  143. data_management/importers/plot_wni_giraffes.py +0 -123
  144. data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  145. data_management/importers/prepare_zsl_imerit.py +0 -131
  146. data_management/importers/rspb_to_json.py +0 -356
  147. data_management/importers/save_the_elephants_survey_A.py +0 -320
  148. data_management/importers/save_the_elephants_survey_B.py +0 -332
  149. data_management/importers/snapshot_safari_importer.py +0 -758
  150. data_management/importers/snapshot_safari_importer_reprise.py +0 -665
  151. data_management/importers/snapshot_serengeti_lila.py +0 -1067
  152. data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  153. data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  154. data_management/importers/sulross_get_exif.py +0 -65
  155. data_management/importers/timelapse_csv_set_to_json.py +0 -490
  156. data_management/importers/ubc_to_json.py +0 -399
  157. data_management/importers/umn_to_json.py +0 -507
  158. data_management/importers/wellington_to_json.py +0 -263
  159. data_management/importers/wi_to_json.py +0 -441
  160. data_management/importers/zamba_results_to_md_results.py +0 -181
  161. data_management/labelme_to_coco.py +0 -548
  162. data_management/labelme_to_yolo.py +0 -272
  163. data_management/lila/__init__.py +0 -0
  164. data_management/lila/add_locations_to_island_camera_traps.py +0 -97
  165. data_management/lila/add_locations_to_nacti.py +0 -147
  166. data_management/lila/create_lila_blank_set.py +0 -557
  167. data_management/lila/create_lila_test_set.py +0 -151
  168. data_management/lila/create_links_to_md_results_files.py +0 -106
  169. data_management/lila/download_lila_subset.py +0 -177
  170. data_management/lila/generate_lila_per_image_labels.py +0 -515
  171. data_management/lila/get_lila_annotation_counts.py +0 -170
  172. data_management/lila/get_lila_image_counts.py +0 -111
  173. data_management/lila/lila_common.py +0 -300
  174. data_management/lila/test_lila_metadata_urls.py +0 -132
  175. data_management/ocr_tools.py +0 -874
  176. data_management/read_exif.py +0 -681
  177. data_management/remap_coco_categories.py +0 -84
  178. data_management/remove_exif.py +0 -66
  179. data_management/resize_coco_dataset.py +0 -189
  180. data_management/wi_download_csv_to_coco.py +0 -246
  181. data_management/yolo_output_to_md_output.py +0 -441
  182. data_management/yolo_to_coco.py +0 -676
  183. detection/__init__.py +0 -0
  184. detection/detector_training/__init__.py +0 -0
  185. detection/detector_training/model_main_tf2.py +0 -114
  186. detection/process_video.py +0 -703
  187. detection/pytorch_detector.py +0 -337
  188. detection/run_detector.py +0 -779
  189. detection/run_detector_batch.py +0 -1219
  190. detection/run_inference_with_yolov5_val.py +0 -917
  191. detection/run_tiled_inference.py +0 -935
  192. detection/tf_detector.py +0 -188
  193. detection/video_utils.py +0 -606
  194. docs/source/conf.py +0 -43
  195. md_utils/__init__.py +0 -0
  196. md_utils/azure_utils.py +0 -174
  197. md_utils/ct_utils.py +0 -612
  198. md_utils/directory_listing.py +0 -246
  199. md_utils/md_tests.py +0 -968
  200. md_utils/path_utils.py +0 -1044
  201. md_utils/process_utils.py +0 -157
  202. md_utils/sas_blob_utils.py +0 -509
  203. md_utils/split_locations_into_train_val.py +0 -228
  204. md_utils/string_utils.py +0 -92
  205. md_utils/url_utils.py +0 -323
  206. md_utils/write_html_image_list.py +0 -225
  207. md_visualization/__init__.py +0 -0
  208. md_visualization/plot_utils.py +0 -293
  209. md_visualization/render_images_with_thumbnails.py +0 -275
  210. md_visualization/visualization_utils.py +0 -1537
  211. md_visualization/visualize_db.py +0 -551
  212. md_visualization/visualize_detector_output.py +0 -406
  213. megadetector-5.0.10.dist-info/RECORD +0 -224
  214. megadetector-5.0.10.dist-info/top_level.txt +0 -8
  215. taxonomy_mapping/__init__.py +0 -0
  216. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +0 -491
  217. taxonomy_mapping/map_new_lila_datasets.py +0 -154
  218. taxonomy_mapping/prepare_lila_taxonomy_release.py +0 -142
  219. taxonomy_mapping/preview_lila_taxonomy.py +0 -591
  220. taxonomy_mapping/retrieve_sample_image.py +0 -71
  221. taxonomy_mapping/simple_image_download.py +0 -218
  222. taxonomy_mapping/species_lookup.py +0 -834
  223. taxonomy_mapping/taxonomy_csv_checker.py +0 -159
  224. taxonomy_mapping/taxonomy_graph.py +0 -346
  225. taxonomy_mapping/validate_lila_category_mappings.py +0 -83
  226. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/WHEEL +0 -0
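
Reading the list above: 5.0.10 installed 224 files across eight top-level packages (api, classification, data_management, detection, docs, md_utils, md_visualization, taxonomy_mapping), while the 5.0.11 wheel's RECORD lists only 5 entries and top_level.txt shrinks from 8 names to 1. The new wheel therefore appears to stop shipping these modules at the top level of site-packages entirely; presumably the code now lives under a single package namespace, but the new layout is not visible in this diff. A quick, standard-library-only sketch for checking what any installed release of the distribution actually ships:

from importlib.metadata import files, version

# Print every file recorded for the installed 'megadetector' distribution;
# expect ~224 entries for 5.0.10 and only a handful for 5.0.11.
print(version('megadetector'))
for f in files('megadetector') or []:
    print(f)
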
data_management/lila/create_lila_blank_set.py
@@ -1,557 +0,0 @@
- """
-
- create_lila_blank_set.py
-
- Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
- locations will be oversampled relative to more common locations. We'll also run MegaDetector
- (with manual review) to remove some incorrectly-labeled, not-actually-empty images from our
- blank set.
-
- We'll store location information for each image in a .json file, so we can split locations
- into train/val in downstream tasks.
-
- """
-
- #%% Constants and imports
-
- import os
- import random
- import math
- import json
-
- import numpy as np
- from tqdm import tqdm
- from multiprocessing.pool import ThreadPool
- from urllib.parse import urlparse
- from collections import defaultdict
-
- from data_management.lila.lila_common import read_lila_all_images_file
- from md_utils.url_utils import download_url
- from md_visualization import visualization_utils as vis_utils
- from md_utils.path_utils import recursive_file_list
-
- # We'll write images, metadata downloads, and temporary files here
- lila_local_base = os.path.expanduser('~/lila')
-
- metadata_dir = os.path.join(lila_local_base,'metadata')
- os.makedirs(metadata_dir,exist_ok=True)
-
- project_base = os.path.join(lila_local_base,'lila_blanks')
-
- candidate_blanks_base = os.path.join(project_base,'candidate_blanks')
- os.makedirs(candidate_blanks_base,exist_ok=True)
-
- confirmed_blanks_base = os.path.join(project_base,'confirmed_blanks')
- os.makedirs(confirmed_blanks_base,exist_ok=True)
-
- md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
- os.makedirs(md_possible_non_blanks_folder,exist_ok=True)
-
- location_to_blank_image_urls_cache_file = os.path.join(project_base,
-                                                        'location_to_blank_image_urls.json')
-
- md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
-
- all_fn_relative_to_location_file = os.path.join(project_base,'all_fn_relative_to_location.json')
- confirmed_fn_relative_to_location_file = os.path.join(project_base,'confirmed_fn_relative_to_location.json')
-
- preferred_image_download_source = 'gcp'
-
- # Number of concurrent download threads
- n_download_threads = 20
-
- n_blanks = 100000
-
- random.seed(0)
-
-
- #%% Download and open the giant table of image URLs and labels
-
- # ~60 seconds to download, unzip, and open
- df = read_lila_all_images_file(metadata_dir)
-
-
- #%% Explore blank labels
-
- # Original labels we're treating as blank:
- blank_original_labels = (
-     'empty','misfire'
- )
-
- # Some notable original labels we're *not* treating as blank:
- nonblank_original_labels = (
-     'unclassifiable', 'unidentifiable', 'unidentified', 'unknown', 'fire',
-     'foggy lens', 'foggy weather', 'blurred', 'end', 'eye_shine', 'ignore',
-     'lens obscured', 'misdirected', 'other', 'start', 'sun', 'problem',
-     'tilted', 'vegetation obstruction', 'snow on lens', 'malfunction'
- )
-
- other_labels_without_common_names = (
-     'car', 'motorcycle', 'vehicle'
- )
-
- common_names = sorted(list(df['common_name'].unique()),
-                       key=lambda x:str(x) if isinstance(x,float) else x)
- original_labels = sorted(list(df['original_label'].unique()),
-                          key=lambda x:str(x) if isinstance(x,float) else x)
-
- # Blanks are represented as NaN in the "common_name" column (though not all NaN's are blanks)
- assert '' not in common_names
- assert all([s not in common_names for s in blank_original_labels])
- assert all([s not in common_names for s in nonblank_original_labels])
- assert np.nan in common_names
-
- # Blanks are represented as "empty" or "misfire" in the "original_label" column
- assert all([s in original_labels for s in blank_original_labels])
- assert all([s in original_labels for s in nonblank_original_labels])
- assert all([s in original_labels for s in other_labels_without_common_names])
- assert all([s not in original_labels for s in ('','blank','none',np.nan)])
-
-
- #%% Count empty labels and common names
-
- common_names_with_empty_original_labels = set()
- original_labels_with_nan_common_names = set()
-
- common_name_to_count = defaultdict(int)
- original_label_to_count = defaultdict(int)
-
- # This loop takes ~10 mins
- for i_row,row in tqdm(df.iterrows(),total=len(df)):
-
-     common_name = row['common_name']
-     original_label = row['original_label']
-
-     if isinstance(common_name,float):
-         assert np.isnan(common_name)
-         original_labels_with_nan_common_names.add(original_label)
-
-     common_name = str(common_name)
-
-     assert isinstance(original_label,str)
-     if original_label in blank_original_labels:
-         common_names_with_empty_original_labels.add(common_name)
-     common_name_to_count[common_name] += 1
-     original_label_to_count[original_label] += 1
-
-
- #%% Look at the most common labels and common names
-
- from md_utils.ct_utils import sort_dictionary_by_value
- common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
- original_label_to_count = sort_dictionary_by_value(original_label_to_count,reverse=True)
-
- k = 10
-
- print('\nMost frequent common names:\n')
-
- i_label = 0
- for i_label,s in enumerate(common_name_to_count):
-     if i_label >= k:
-         break
-     print('{}: {}'.format(s,common_name_to_count[s]))
-
- print('\nMost frequent original labels:\n')
-
- i_label = 0
- for i_label,s in enumerate(original_label_to_count):
-     if i_label >= k:
-         break
-     print('{}: {}'.format(s,original_label_to_count[s]))
-
-
- #%% Do some consistency checks over the empty labels and stats
-
- # All images called 'empty' should have NaN as their common name
- assert (len(common_names_with_empty_original_labels) == 1)
- assert next(iter(common_names_with_empty_original_labels)) == 'nan'
-
- # 'empty' should be the most frequent original label overall
- assert next(iter(original_label_to_count)) == 'empty'
-
- # NaN should be the most frequent common name overall
- assert next(iter(common_name_to_count)) == 'nan'
-
- for s in original_labels_with_nan_common_names:
-     assert \
-         (s in blank_original_labels) or \
-         (s in nonblank_original_labels) or \
-         (s in other_labels_without_common_names)
-
-
- #%% Map locations to blank images
-
- force_map_locations = False
-
- # Load from .json if available
- if (not force_map_locations) and (os.path.isfile(location_to_blank_image_urls_cache_file)):
-
-     with open(location_to_blank_image_urls_cache_file,'r') as f:
-         location_to_blank_image_urls = json.load(f)
-
- else:
-
-     location_to_blank_image_urls = defaultdict(list)
-
-     # i_row = 0; row = df.iloc[i_row]
-     for i_row,row in tqdm(df.iterrows(),total=len(df)):
-
-         location_id = row['location_id']
-         url = row['url']
-
-         original_label = row['original_label']
-         if original_label in blank_original_labels:
-             assert np.isnan(row['common_name'])
-             location_to_blank_image_urls[location_id].append(url)
-
-     with open(location_to_blank_image_urls_cache_file,'w') as f:
-         json.dump(location_to_blank_image_urls,f,indent=1)
-
- n_locations_with_blanks = len(location_to_blank_image_urls)
- print('Found {} locations with blank images'.format(n_locations_with_blanks))
-
-
- #%% Sample blanks
-
- random.seed(0)
-
- # Make a fresh copy of the lists
- location_to_unsampled_blank_image_urls = {}
-
- # location = next(iter(location_to_blank_image_urls.keys()))
- for location in location_to_blank_image_urls:
-     blank_image_urls_this_location = location_to_blank_image_urls[location]
-     unsampled_blank_image_urls_this_location = blank_image_urls_this_location.copy()
-     location_to_unsampled_blank_image_urls[location] = unsampled_blank_image_urls_this_location
-
- # Put locations in a random order
- location_ids = list(location_to_unsampled_blank_image_urls.keys())
- random.shuffle(location_ids)
-
- blank_urls = []
- location_to_sampled_blanks = defaultdict(list)
- fully_sampled_locations = set()
-
- # Pick from each location until we hit our limit or have no blanks left
- while(True):
-
-     found_sample = False
-
-     # location = location_ids[0]
-     for location in location_ids:
-
-         unsampled_images_this_location = location_to_unsampled_blank_image_urls[location]
-         if len(unsampled_images_this_location) == 0:
-             fully_sampled_locations.add(location)
-             continue
-
-         url = random.choice(unsampled_images_this_location)
-         blank_urls.append(url)
-         location_to_unsampled_blank_image_urls[location].remove(url)
-         location_to_sampled_blanks[location].append(url)
-         found_sample = True
-
-         if len(blank_urls) == n_blanks:
-             break
-
-     # ...for each location
-
-     if not found_sample:
-         print('Terminating after {} blanks, we ran out before hitting {}'.format(
-             len(blank_urls),n_blanks))
-
-     if len(blank_urls) == n_blanks:
-         break
-
- # ...while(True)
-
- assert len(blank_urls) <= n_blanks
- min_blanks_per_location = math.floor(n_blanks/n_locations_with_blanks)
- max_blanks_per_location = -1
- for location in location_to_sampled_blanks:
-     n_blanks_this_location = len(location_to_sampled_blanks[location])
-     if n_blanks_this_location >= max_blanks_per_location:
-         max_blanks_per_location = n_blanks_this_location
-     assert (location in fully_sampled_locations) or \
-         n_blanks_this_location >= min_blanks_per_location
-
- print('Choose {} blanks from {} locations'.format(n_blanks,len(location_ids)))
- print('Fully sampled {} locations'.format(len(fully_sampled_locations)))
- print('Max samples per location: {}'.format(max_blanks_per_location))
-
-
- #%% Download those image files (prep)
-
- container_to_url_base = {
-     'lilablobssc.blob.core.windows.net':'/',
-     'storage.googleapis.com':'/public-datasets-lila/'
- }
-
- def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
-     """
-     Download a URL to output_base, preserving relative path
-     """
-
-     result = {'status':'unknown','url':url,'destination_filename':None}
-
-     if url_base is None:
-         assert url.startswith('https://')
-         container = url.split('/')[2]
-         assert container in container_to_url_base
-         url_base = container_to_url_base[container]
-
-     assert url_base.startswith('/') and url_base.endswith('/')
-
-     p = urlparse(url)
-     relative_filename = str(p.path)
-     # remove the leading '/'
-     assert relative_filename.startswith(url_base)
-     relative_filename = relative_filename.replace(url_base,'',1)
-
-     destination_filename = os.path.join(output_base,relative_filename)
-     result['destination_filename'] = destination_filename
-
-     if ((os.path.isfile(destination_filename)) and (not overwrite)):
-         result['status'] = 'skipped'
-         return result
-     try:
-         download_url(url, destination_filename, verbose=verbose)
-     except Exception as e:
-         print('Warning: error downloading URL {}: {}'.format(
-             url,str(e)))
-         result['status'] = 'error: {}'.format(str(e))
-         return result
-
-     result['status'] = 'success'
-     return result
-
- def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
-     """
-     Most URLs point to Azure by default, but most files are available on both Azure and GCP.
-     This function converts an Azure URL to the corresponding GCP http:// url.
-     """
-
-     lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
-     gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
-     error_if_not_azure_url = False
-
-     if error_if_not_azure_url:
-         assert url.startswith(lila_azure_storage_account)
-     gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
-     return gcp_url
-
- # Convert Azure URLs to GCP URLs if necessary
- if preferred_image_download_source != 'azure':
-     assert preferred_image_download_source == 'gcp'
-     blank_urls = [azure_url_to_gcp_http_url(url) for url in blank_urls]
-
-
- #%% Download those image files (execution)
-
- print('Downloading {} images on {} workers'.format(len(blank_urls),n_download_threads))
-
- if n_download_threads <= 1:
-
-     results = []
-
-     # url = all_urls[0]
-     for url in tqdm(blank_urls):
-         results.append(download_relative_filename(url,candidate_blanks_base,url_base=None))
-
- else:
-
-     pool = ThreadPool(n_download_threads)
-     results = list(tqdm(pool.imap(lambda s: download_relative_filename(
-         s,candidate_blanks_base,url_base=None),
-         blank_urls), total=len(blank_urls)))
-
-     # pool.terminate()
-
-
- #%% Review results
-
- error_urls = []
- for r in results:
-     if r['status'] != 'success':
-         error_urls.append(r['url'])
-
- print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))
-
-
- #%% Run MegaDetector on the folder
-
- cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
-     candidate_blanks_base,md_results_file)
- cmd += ' --recursive --output_relative_filenames'
-
- import clipboard; clipboard.copy(cmd); print(cmd)
-
-
- #%% Review MD results that suggests images are non-empty
-
- assert os.path.isfile(md_results_file)
-
- category_name_to_threshold = {'animal':0.25,'person':0.25,'vehicle':0.25}
- min_threshold = min(category_name_to_threshold.values())
- with open(md_results_file,'r') as f:
-     md_results = json.load(f)
-
- images_to_review_to_detections = {}
-
- category_id_to_threshold = {}
- for category_id in md_results['detection_categories']:
-     category_name = md_results['detection_categories'][category_id]
-     category_id_to_threshold[category_id] = category_name_to_threshold[category_name]
-
- # im = md_results['images'][0]
- for im in md_results['images']:
-
-     if 'detections' not in im:
-         continue
-
-     found_object = False
-     for det in im['detections']:
-         threshold = category_id_to_threshold[det['category']]
-         if det['conf'] >= threshold:
-             found_object = True
-             break
-     if found_object:
-         images_to_review_to_detections[im['file']] = im['detections']
-
- print('Flagging {} of {} images for review'.format(len(images_to_review_to_detections),len(md_results['images'])))
-
- output_file_to_source_file = {}
-
- # i_fn = 0; source_file_relative = images_to_review[i_fn]
- for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
-                                       total=len(images_to_review_to_detections)):
-
-     source_file_abs = os.path.join(candidate_blanks_base,source_file_relative)
-     assert os.path.isfile(source_file_abs)
-     ext = os.path.splitext(source_file_abs)[1]
-     target_file_relative = str(i_fn).zfill(8) + ext
-     target_file_abs = os.path.join(md_possible_non_blanks_folder,target_file_relative)
-     output_file_to_source_file[target_file_relative] = source_file_relative
-     # shutil.copyfile(source_file_abs,target_file_abs)
-     vis_utils.draw_bounding_boxes_on_file(input_file=source_file_abs,
-                                           output_file=target_file_abs,
-                                           detections=images_to_review_to_detections[source_file_relative],
-                                           confidence_threshold=min_threshold,
-                                           target_size=(1280,-1))
-
- # This is a temporary file I just used during debugging
- with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
-     json.dump(output_file_to_source_file,f,indent=1)
-
-
- #%% Manual review
-
- # Delete images that are *not* empty
-
-
- #%% Figure out which images are still there; these are the actually-blank ones
-
- remaining_images = set(os.listdir(md_possible_non_blanks_folder))
- print('Kept {} of {} candidate blank images'.format(len(remaining_images),
-                                                     len(images_to_review_to_detections)))
-
- removed_blank_images_relative = []
-
- # output_file = next(iter(output_file_to_source_file.keys()))
- for output_file in tqdm(output_file_to_source_file.keys()):
-     if output_file not in remaining_images:
-         source_file_relative = output_file_to_source_file[output_file]
-         removed_blank_images_relative.append(source_file_relative)
-
- removed_blank_images_relative_set = set(removed_blank_images_relative)
- assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)
-
-
- #%% Copy only the confirmed blanks to the confirmed folder
-
- from md_utils.path_utils import is_image_file
-
- all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
- print('Found {} candidate blanks'.format(len(all_candidate_blanks)))
-
- skipped_images_relative = []
- skipped_non_images = []
-
- for source_fn_relative in tqdm(all_candidate_blanks):
-
-     # Skip anything we removed from the "candidate non-blanks" folder; these weren't really
-     # blank.
-     if source_fn_relative in removed_blank_images_relative_set:
-         skipped_images_relative.append(source_fn_relative)
-         continue
-
-     if not is_image_file(source_fn_relative):
-         # Not a typo; "skipped images" really means "skipped files"
-         skipped_images_relative.append(source_fn_relative)
-         skipped_non_images.append(source_fn_relative)
-
-
-     source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
-     assert os.path.isfile(source_fn_abs)
-     target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
-     os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
-     # shutil.copyfile(source_fn_abs,target_fn_abs)
-
- print('Skipped {} files ({} non-image files)'.format(len(skipped_images_relative),
-                                                      len(skipped_non_images)))
-
-
- #%% Validate the folder of confirmed blanks
-
- from md_utils.path_utils import find_images
- # all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
- all_confirmed_blanks = find_images(confirmed_blanks_base,return_relative_paths=True,recursive=True)
- assert len(all_confirmed_blanks) < len(all_candidate_blanks)
- print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))
-
-
- #%% Manually review a few of the images we skipped
-
- # ...to make sure they're non-blank
- i_image = random.randint(0, len(skipped_images_relative))
- fn_relative = skipped_images_relative[i_image]
- fn_abs = os.path.join(candidate_blanks_base,fn_relative)
- assert os.path.isfile(fn_abs)
- import clipboard
- clipboard.copy('feh --scale-down "{}"'.format(fn_abs))
-
-
- #%% Record location information for each confirmed file
-
- # Map every URL's path to the corresponding location
- #
- # This is *all empty URLs*, not just the ones we downloaded
- all_fn_relative_to_location = {}
-
- # location = next(iter(location_to_blank_image_urls.keys()))
- for location in tqdm(location_to_blank_image_urls):
-     urls_this_location = location_to_blank_image_urls[location]
-
-     # url = urls_this_location[0]
-     for url in urls_this_location:
-         # Turn:
-         #
-         # https://lilablobssc.blob.core.windows.net/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg'
-         #
-         # ...into:
-         #
-         # caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg'
-         p = urlparse(url)
-         fn_relative = str(p.path)[1:]
-         all_fn_relative_to_location[fn_relative] = location
-
- # Build a much smaller mapping of just the confirmed blanks
- confirmed_fn_relative_to_location = {}
- for i_fn,fn_relative in tqdm(enumerate(all_confirmed_blanks),total=len(all_confirmed_blanks)):
-     confirmed_fn_relative_to_location[fn_relative] = all_fn_relative_to_location[fn_relative]
-
- with open(all_fn_relative_to_location_file,'w') as f:
-     json.dump(all_fn_relative_to_location,f,indent=1)
-
- with open(confirmed_fn_relative_to_location_file,'w') as f:
-     json.dump(confirmed_fn_relative_to_location,f,indent=1)
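
The hunk above is the deleted data_management/lila/create_lila_blank_set.py. Its core algorithm is the location-balanced sampler in the "#%% Sample blanks" cell: shuffle the locations once, then repeatedly take one random image from every location that still has images left, so rare locations are oversampled relative to their share of the data, exactly as the docstring promises. A self-contained sketch of that loop (the function name and toy data below are illustrative, not part of the package):

import random

def sample_across_locations(location_to_urls, n_samples, seed=0):
    """Round-robin over locations: one random image per location per pass."""
    rng = random.Random(seed)
    # Work on copies so the caller's lists aren't mutated
    unsampled = {loc: list(urls) for loc, urls in location_to_urls.items()}
    locations = list(unsampled)
    rng.shuffle(locations)
    sampled = []
    while len(sampled) < n_samples:
        found = False
        for loc in locations:
            if not unsampled[loc]:
                continue
            # Remove one random image from this location and keep it
            sampled.append(unsampled[loc].pop(rng.randrange(len(unsampled[loc]))))
            found = True
            if len(sampled) == n_samples:
                break
        if not found:
            break  # every location ran dry before we reached n_samples
    return sampled

# 'tiny' contributes its one image even though 'big' has four times as many
print(sample_across_locations({'tiny': ['t1'], 'big': ['b1', 'b2', 'b3', 'b4']}, 3))

On the toy input, 'tiny' still contributes an image despite holding a fifth of the candidates, which is the diversity property the script is after.
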
data_management/lila/create_lila_test_set.py
@@ -1,151 +0,0 @@
- """
-
- create_lila_test_set.py
-
- Create a test set of camera trap images, containing N empty and N non-empty
- images from each LILA data set.
-
- """
-
- #%% Constants and imports
-
- import json
- import os
- import random
-
- from data_management.lila.lila_common import read_lila_metadata, read_metadata_file_for_dataset
-
- from md_utils.url_utils import download_url
-
- n_empty_images_per_dataset = 1
- n_non_empty_images_per_dataset = 1
-
- # We'll write images, metadata downloads, and temporary files here
- lila_local_base = os.path.expanduser('~/lila')
-
- output_dir = os.path.join(lila_local_base,'lila_test_set')
- os.makedirs(output_dir,exist_ok=True)
-
- metadata_dir = os.path.join(lila_local_base,'metadata')
- os.makedirs(metadata_dir,exist_ok=True)
-
- random.seed(0)
-
-
- #%% Download and parse the metadata file
-
- metadata_table = read_lila_metadata(metadata_dir)
-
-
- #%% Download and extract metadata for every dataset
-
- for ds_name in metadata_table.keys():
-     metadata_table[ds_name]['metadata_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
-                                                                                   metadata_dir=metadata_dir,
-                                                                                   metadata_table=metadata_table)
-
-
- #%% Choose images from each dataset
-
- # ds_name = (list(metadata_table.keys()))[0]
- for ds_name in metadata_table.keys():
-
-     print('Choosing images for {}'.format(ds_name))
-
-     json_filename = metadata_table[ds_name]['metadata_filename']
-
-     with open(json_filename,'r') as f:
-         d = json.load(f)
-
-     category_id_to_name = {c['id']:c['name'] for c in d['categories']}
-     category_name_to_id = {c['name']:c['id'] for c in d['categories']}
-
-     ## Find empty images
-
-     if 'empty' not in category_name_to_id:
-         empty_annotations_to_download = []
-     else:
-         empty_category_id = category_name_to_id['empty']
-         empty_annotations = [ann for ann in d['annotations'] if ann['category_id'] == empty_category_id]
-         try:
-             empty_annotations_to_download = random.sample(empty_annotations,n_empty_images_per_dataset)
-         except ValueError:
-             print('No empty images available for dataset {}'.format(ds_name))
-             empty_annotations_to_download = []
-
-     ## Find non-empty images
-
-     non_empty_annotations = [ann for ann in d['annotations'] if ann['category_id'] != empty_category_id]
-     try:
-         non_empty_annotations_to_download = random.sample(non_empty_annotations,n_non_empty_images_per_dataset)
-     except ValueError:
-         print('No non-empty images available for dataset {}'.format(ds_name))
-         non_empty_annotations_to_download = []
-
-
-     annotations_to_download = empty_annotations_to_download + non_empty_annotations_to_download
-
-     image_ids_to_download = set([ann['image_id'] for ann in annotations_to_download])
-     assert len(image_ids_to_download) == len(set(image_ids_to_download))
-
-     images_to_download = []
-     for im in d['images']:
-         if im['id'] in image_ids_to_download:
-             images_to_download.append(im)
-     assert len(images_to_download) == len(image_ids_to_download)
-
-     metadata_table[ds_name]['images_to_download'] = images_to_download
-
- # ...for each dataset
-
-
- #%% Convert to URLs
-
- # ds_name = (list(metadata_table.keys()))[0]
- for ds_name in metadata_table.keys():
-
-     base_url = metadata_table[ds_name]['image_base_url']
-     assert not base_url.endswith('/')
-
-     # Retrieve image file names
-     filenames = [im['file_name'] for im in metadata_table[ds_name]['images_to_download']]
-
-     urls_to_download = []
-
-     # Convert to URLs
-     for fn in filenames:
-         url = base_url + '/' + fn
-         urls_to_download.append(url)
-
-     metadata_table[ds_name]['urls_to_download'] = urls_to_download
-
- # ...for each dataset
-
-
- #%% Download those image files
-
- # TODO: trivially parallelizable
- #
- # ds_name = (list(metadata_table.keys()))[0]
- for ds_name in metadata_table.keys():
-
-     base_url = metadata_table[ds_name]['image_base_url']
-     assert not base_url.endswith('/')
-     base_url += '/'
-
-     urls_to_download = metadata_table[ds_name]['urls_to_download']
-
-     # url = urls_to_download[0]
-     for url in urls_to_download:
-
-         assert base_url in url
-         output_file_relative = ds_name.lower().replace(' ','_') + '_' + url.replace(base_url,'').replace('/','_').replace('\\','_')
-         output_file_absolute = os.path.join(output_dir,output_file_relative)
-         try:
-             download_url(url, destination_filename=output_file_absolute, force_download=False, verbose=True)
-         except Exception as e:
-             print('\n*** Error downloading {} ***\n{}'.format(url,str(e)))
-
-     # ...for each url
-
- # ...for each dataset
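
The second hunk is the deleted data_management/lila/create_lila_test_set.py, which reduces to one step per dataset: parse the COCO Camera Traps metadata, sample N 'empty' and N non-empty annotations, and download the matching images. A condensed sketch of the sampling step (the helper and the toy metadata dict are illustrative; the real script reads per-dataset metadata via read_metadata_file_for_dataset and catches ValueError from random.sample rather than clamping with min):

import random

def sample_empty_and_nonempty(d, n_empty=1, n_non_empty=1, seed=0):
    """Pick up to n_empty 'empty' and n_non_empty non-empty annotations
    from a COCO Camera Traps dict; a missing 'empty' category means
    every annotation counts as non-empty."""
    rng = random.Random(seed)
    name_to_id = {c['name']: c['id'] for c in d['categories']}
    empty_id = name_to_id.get('empty')
    empty = [a for a in d['annotations'] if a['category_id'] == empty_id]
    non_empty = [a for a in d['annotations'] if a['category_id'] != empty_id]
    picked = rng.sample(empty, min(n_empty, len(empty)))
    picked += rng.sample(non_empty, min(n_non_empty, len(non_empty)))
    return picked

# Toy two-image dataset: one 'empty' annotation, one 'deer' annotation
toy = {'categories': [{'id': 0, 'name': 'empty'}, {'id': 1, 'name': 'deer'}],
       'annotations': [{'image_id': 'im0', 'category_id': 0},
                       {'image_id': 'im1', 'category_id': 1}]}
print(sample_empty_and_nonempty(toy))  # one empty + one non-empty annotation
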