megadetector 5.0.8-py3-none-any.whl → 5.0.9-py3-none-any.whl

This diff reflects the changes between package versions as published to their respective public registries, and is provided for informational purposes only.

Files changed (190)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +65 -65
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
  20. api/batch_processing/postprocessing/compare_batch_results.py +113 -43
  21. api/batch_processing/postprocessing/convert_output_format.py +41 -16
  22. api/batch_processing/postprocessing/load_api_results.py +16 -17
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +52 -22
  25. api/batch_processing/postprocessing/merge_detections.py +14 -14
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
  27. api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +102 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -263
  71. data_management/coco_to_yolo.py +79 -58
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +62 -24
  76. data_management/databases/subset_json_db.py +24 -15
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -162
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -158
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +7 -7
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +65 -24
  120. data_management/labelme_to_yolo.py +8 -8
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +13 -13
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +44 -110
  128. data_management/lila/generate_lila_per_image_labels.py +55 -42
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +96 -33
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +110 -97
  135. data_management/remap_coco_categories.py +83 -83
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +30 -23
  138. data_management/wi_download_csv_to_coco.py +246 -239
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +300 -60
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +179 -113
  147. detection/run_inference_with_yolov5_val.py +108 -48
  148. detection/run_tiled_inference.py +111 -40
  149. detection/tf_detector.py +51 -29
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +228 -68
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -871
  157. md_utils/path_utils.py +460 -134
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +176 -60
  163. md_utils/write_html_image_list.py +40 -33
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +597 -291
  168. md_visualization/visualize_db.py +76 -48
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/METADATA +13 -7
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  173. taxonomy_mapping/__init__.py +0 -0
  174. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  175. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  176. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  177. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  178. taxonomy_mapping/retrieve_sample_image.py +12 -12
  179. taxonomy_mapping/simple_image_download.py +11 -11
  180. taxonomy_mapping/species_lookup.py +10 -10
  181. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  182. taxonomy_mapping/taxonomy_graph.py +47 -47
  183. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  184. data_management/cct_json_to_filename_json.py +0 -89
  185. data_management/cct_to_csv.py +0 -140
  186. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  187. detection/detector_training/copy_checkpoints.py +0 -43
  188. megadetector-5.0.8.dist-info/RECORD +0 -205
  189. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
  190. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/WHEEL +0 -0
data_management/lila/create_links_to_md_results_files.py
@@ -1,106 +1,106 @@
- ########
- #
- # create_links_to_md_results_files.py
- #
- # One-off script to populate the columns in the camera trap data .csv file that point to MD results.
- #
- ########
+ """
+
+ create_links_to_md_results_files.py
+
+ One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+
+ """

  #%% Imports and constants

  import os

  import pandas as pd

  input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
  output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'

  md_results_local_folder = r'g:\temp\lila-md-results'
  md_base_url = 'https://lila.science/public/lila-md-results/'
  assert md_base_url.endswith('/')

  # No RDE files for datasets with no location information
  datasets_without_location_info = ('ena24','missouri-camera-traps')

  md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']

  validate_urls = False


  #%% Read input data

  df = pd.read_csv(input_csv_file)
  for s in md_results_column_names:
      df[s] = ''


  #%% Find matching files locally, and create URLs

  local_files = os.listdir(md_results_local_folder)
  local_files = [fn for fn in local_files if fn.endswith('.zip')]

  # i_row = 0; row = df.iloc[i_row]
  for i_row,row in df.iterrows():

      if not isinstance(row['name'],str):
          continue

      dataset_shortname = row['short_name']
      matching_files = [fn for fn in local_files if dataset_shortname in fn]

      # No RDE files for datasets with no location information
      if dataset_shortname in datasets_without_location_info:
          assert len(matching_files) == 2
          mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
          mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
          assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
          df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
          df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
      else:
          # Exclude single-season files for snapshot-serengeti
          if dataset_shortname == 'snapshot-serengeti':
              matching_files = [fn for fn in matching_files if '_S' not in fn]
              assert len(matching_files) == 2
              assert all(['mdv4' in fn for fn in matching_files])
              rde_files = [fn for fn in matching_files if 'rde' in fn]
              raw_files = [fn for fn in matching_files if 'rde' not in fn]
              assert len(rde_files) == 1 and len(raw_files) == 1
              df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
              df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
          else:
              assert len(matching_files) == 3
              mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
              mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
              rde_files = [fn for fn in matching_files if 'rde' in fn]
              assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
              df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
              df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
              df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]

      print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))

  # ...for each row


  #%% Validate URLs

  if validate_urls:

      from md_utils.url_utils import test_urls

      urls = set()

      for i_row,row in df.iterrows():
          for column_name in md_results_column_names:
              if len(row[column_name]) > 0:
                  assert row[column_name] not in urls
                  urls.add(row[column_name])

      test_urls(urls,error_on_failure=True)

      print('Validated {} URLs'.format(len(urls)))


  #%% Write new .csv file

  df.to_csv(output_csv_file,header=True,index=False)
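The only substantive change in this file is the header: the '#' comment banner became a module docstring. The same conversion appears in the other scripts below, and this release also adds docs/source/conf.py (a Sphinx configuration), so the docstrings are presumably what the generated documentation picks up. A minimal sketch of the difference, using a hypothetical module:

    """
    example_script.py

    One-line summary of what the script does.
    """

    # A module docstring is introspectable at runtime and visible to documentation
    # generators such as Sphinx autodoc; a '#' comment banner is not.
    print(__doc__)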
data_management/lila/download_lila_subset.py
@@ -1,17 +1,11 @@
- ########
- #
- # download_lila_subset.py
- #
- # Example of how to download a list of files from LILA, e.g. all the files
- # in a data set corresponding to a particular species.
- #
- # Organizes the downloaded images by dataset. How you actually want to organize files,
- # what you want to query for, etc., is very application-specific; this is just meant as a
- # demo.
- #
- # Can download from GCP (all datasets), AWS (all datasets), or Azure (most datasets).
- #
- ########
+ """
+
+ download_lila_subset.py
+
+ Example of how to download a list of files from LILA, e.g. all the files
+ in a data set corresponding to a particular species.
+
+ """

  #%% Constants and imports

@@ -19,11 +13,9 @@ import os
  import random

  from tqdm import tqdm
- from multiprocessing.pool import ThreadPool
  from collections import defaultdict

  from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
- from md_utils.url_utils import download_url

  for s in lila_base_urls.values():
      assert s.endswith('/')
@@ -43,70 +35,22 @@ os.makedirs(output_dir,exist_ok=True)
  # Number of concurrent download threads
  n_download_threads = 20

- verbose = False
-
  max_images_per_dataset = 10 # None

- # This impacts the data download, but not the metadata download
- #
- # Setting this to "Azure" really means "Azure if available"; some datasets are
- # not available on Azure.
  preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'

  random.seed(0)


- #%% Support functions
-
- def download_relative_url(relative_url, output_base, provider='gcp',
-                           verbose=False, overwrite=False):
-     """
-     Download a URL to output_base, preserving the path relative to the common LILA root.
-     """
-
-     assert not relative_url.startswith('/')
-
-     # Not all datasets are available on Azure, fall back in these cases. The decision
-     # to fall back to GCP rather than AWS is arbitrary.
-     if provider == 'azure':
-         nominal_provider = relative_url_to_nominal_provider[relative_url]
-         if nominal_provider != 'azure':
-             if verbose:
-                 print('URL {} not available on Azure, falling back to GCP'.format(
-                     relative_url))
-             provider = 'gcp'
-
-     url = lila_base_urls[provider] + relative_url
-
-     result = {'status':'unknown','url':url,'destination_filename':None}
-
-     destination_filename = os.path.join(output_base,relative_url)
-     result['destination_filename'] = destination_filename
-
-     if ((os.path.isfile(destination_filename)) and (not overwrite)):
-         result['status'] = 'skipped'
-         return result
-     try:
-         download_url(url, destination_filename, verbose=verbose, force_download=overwrite)
-     except Exception as e:
-         print('Warning: error downloading URL {}: {}'.format(
-             url,str(e)))
-         result['status'] = 'error: {}'.format(str(e))
-         return result
-
-     result['status'] = 'success'
-     return result
-
-
  #%% Download and open the giant table of image URLs and labels

- # ~60 seconds to download, unzip, and open
+ # Takes ~60 seconds to download, unzip, and open
  df = read_lila_all_images_file(metadata_dir)


  #%% Find all the images we want to download

- # ~2 minutes
+ # Takes ~2 minutes

  common_name_to_count = defaultdict(int)

@@ -119,6 +63,8 @@ def find_items(row):

      match = False

+     # This is the only bit of this file that's specific to a particular query. In this case
+     # we're checking whether each row is on a list of species of interest, but you do you.
      for species_name in species_of_interest:
          if species_name in row['common_name']:
              match = True
@@ -126,7 +72,7 @@ def find_items(row):
              break

      if match:
-         ds_name_to_urls[row['dataset_name']].append(row['url'])
+         ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])

  tqdm.pandas()
  _ = df.progress_apply(find_items,axis=1)
@@ -154,58 +100,47 @@ else:
      ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)


- #%% Convert URLs to be relative to the common LILA base
+ #%% Choose target files for each URL

- all_urls = list(ds_name_to_urls.values())
- all_urls = [item for sublist in all_urls for item in sublist]
+ from data_management.lila.lila_common import lila_base_urls

- all_urls_relative = []
+ # We have a list of URLs per dataset, flatten that into a single list of URLs
+ urls_to_download = set()
+ for ds_name in ds_name_to_urls:
+     for url in ds_name_to_urls[ds_name]:
+         urls_to_download.add(url)
+ urls_to_download = sorted(list(urls_to_download))

- # Each file has a nominal URL in the .csv file. For now, the only thing this tells is
- # is that if the nominal URL isn't an Azure URL, the file isn't on Azure. All files are on
- # GCP and AWS.
+ # A URL might look like this:
  #
- # Keep track of the nominal provider for each URL.
- relative_url_to_nominal_provider = {}
-
- for url in all_urls:
-     found_base = False
-     for provider in lila_base_urls.keys():
-         base = lila_base_urls[provider]
-         if url.startswith(base):
-             relative_url = url.replace(base,'')
-             all_urls_relative.append(relative_url)
-             relative_url_to_nominal_provider[relative_url] = provider
-             found_base = True
-             break
-     assert found_base
-
- assert len(all_urls) == len(all_urls_relative)
+ # https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+ #
+ # We'll write that to an output file that looks like this (relative to output_dir):
+ #
+ # wcs-unzipped/animals/0667/0302.jpg
+ #
+ # ...so we need to remove the base URL to get the target file.
+ base_url = lila_base_urls[preferred_provider]
+ assert base_url.endswith('/')

+ url_to_target_file = {}

- #%% Download image files
+ for url in urls_to_download:
+     assert url.startswith(base_url)
+     target_fn_relative = url.replace(base_url,'')
+     target_fn_abs = os.path.join(output_dir,target_fn_relative)
+     url_to_target_file[url] = target_fn_abs

- print('Downloading {} images on {} workers, preferred provider is {}'.format(
-     len(all_urls),n_download_threads,preferred_provider))

- if n_download_threads <= 1:
+ #%% Download image files

-     results = []
-
-     # url_relative = all_urls_relative[0]
-     for url_relative in tqdm(all_urls_relative):
-         result = download_relative_url(url_relative,
-                                        output_base=output_dir,
-                                        provider=preferred_provider,
-                                        verbose=verbose)
-         results.append(result)
-
- else:
+ from md_utils.url_utils import parallel_download_urls

-     pool = ThreadPool(n_download_threads)
-     results = list(tqdm(pool.imap(lambda s: download_relative_url(
-         s,output_base=output_dir,provider=preferred_provider,verbose=verbose),
-         all_urls_relative), total=len(all_urls_relative)))
+ download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                           verbose=False,
+                                           overwrite=False,
+                                           n_workers=n_download_threads,
+                                           pool_type='thread')


  #%% Scrap
@@ -240,4 +175,3 @@ if False:
      print('\nDatasets by count:\n')
      for k in dataset_to_count:
          print('{} ({})'.format(k,dataset_to_count[k]))
-
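Across these hunks, the hand-rolled ThreadPool loop and the local download_relative_url helper are replaced by a single call to md_utils.url_utils.parallel_download_urls, driven by a URL-to-target-file dict. A minimal sketch of the pattern such a helper wraps is below; download_one and download_all are hypothetical names, and this illustrates the technique rather than the package's actual implementation:

    import os
    import urllib.request
    from concurrent.futures import ThreadPoolExecutor

    def download_one(url, target_file, overwrite=False):
        # Skip files we've already downloaded unless overwrite is requested
        if os.path.isfile(target_file) and not overwrite:
            return {'url': url, 'status': 'skipped'}
        os.makedirs(os.path.dirname(target_file) or '.', exist_ok=True)
        try:
            urllib.request.urlretrieve(url, target_file)
            return {'url': url, 'status': 'success'}
        except Exception as e:
            return {'url': url, 'status': 'error: {}'.format(e)}

    def download_all(url_to_target_file, n_workers=20):
        # Map each (url, target) pair onto a worker thread; downloads are
        # I/O-bound, so threads help despite the GIL
        with ThreadPoolExecutor(max_workers=n_workers) as pool:
            return list(pool.map(lambda item: download_one(*item),
                                 url_to_target_file.items()))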
data_management/lila/generate_lila_per_image_labels.py
@@ -1,19 +1,19 @@
- ########
- #
- # generate_lila_per_image_labels.py
- #
- # Generate a .csv file with one row per annotation, containing full URLs to every
- # camera trap image on LILA, with taxonomically expanded labels.
- #
- # Typically there will be one row per image, though images with multiple annotations
- # will have multiple rows.
- #
- # Some images may not physically exist, particularly images that are labeled as "human".
- # This script does not validate image URLs.
- #
- # Does not include bounding box annotations.
- #
- ########
+ """
+
+ generate_lila_per_image_labels.py
+
+ Generate a .csv file with one row per annotation, containing full URLs to every
+ camera trap image on LILA, with taxonomically expanded labels.
+
+ Typically there will be one row per image, though images with multiple annotations
+ will have multiple rows.
+
+ Some images may not physically exist, particularly images that are labeled as "human".
+ This script does not validate image URLs.
+
+ Does not include bounding box annotations.
+
+ """

  #%% Constants and imports

@@ -23,8 +23,6 @@ import pandas as pd
  import numpy as np
  import dateparser
  import csv
- import urllib
- import urllib.request

  from collections import defaultdict
  from tqdm import tqdm
@@ -36,7 +34,6 @@ from data_management.lila.lila_common import read_lila_metadata, \
  from md_utils import write_html_image_list
  from md_utils.path_utils import zip_file
  from md_utils.path_utils import open_file
- from md_utils.url_utils import download_url

  # We'll write images, metadata downloads, and temporary files here
  lila_local_base = os.path.expanduser('~/lila')
@@ -107,12 +104,15 @@ for i_row,row in taxonomy_df.iterrows():

  # Takes several hours

- header = ['dataset_name','url','image_id','sequence_id','location_id','frame_num','original_label',\
-           'scientific_name','common_name','datetime','annotation_level']
+ # The order of these headers needs to match the order in which fields are added later in this cell;
+ # don't mess with this order.
+ header = ['dataset_name','url_gcp','url_aws','url_azure',
+           'image_id','sequence_id','location_id','frame_num',
+           'original_label','scientific_name','common_name','datetime','annotation_level']

  taxonomy_levels_to_include = \
      ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-      'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',\
+      'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
       'variety']

  header.extend(taxonomy_levels_to_include)
@@ -179,10 +179,17 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                  break

          file_name = im['file_name'].replace('\\','/')
-         base_url = metadata_table[ds_name]['image_base_url']
-         assert not base_url.endswith('/')
-         url = base_url + '/' + file_name
+         base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
+         base_url_aws = metadata_table[ds_name]['image_base_url_aws']
+         base_url_azure = metadata_table[ds_name]['image_base_url_azure']
+         assert not base_url_gcp.endswith('/')
+         assert not base_url_aws.endswith('/')
+         assert not base_url_azure.endswith('/')

+         url_gcp = base_url_gcp + '/' + file_name
+         url_aws = base_url_aws + '/' + file_name
+         url_azure = base_url_azure + '/' + file_name
+
          for k in im.keys():
              if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                  raise ValueError('Unrecognized datetime field')
@@ -297,7 +304,9 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:

          row = []
          row.append(ds_name)
-         row.append(url)
+         row.append(url_gcp)
+         row.append(url_aws)
+         row.append(url_azure)
          row.append(image_id)
          row.append(sequence_id)
          row.append(location_id)
@@ -365,7 +374,8 @@ dataset_name_to_locations = defaultdict(set)
  def check_row(row):

      assert row['dataset_name'] in metadata_table.keys()
-     assert row['url'].startswith('https://')
+     for url_column in ['url_gcp','url_aws','url_azure']:
+         assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
      assert ' : ' in row['image_id']
      assert 'seq' not in row['location_id'].lower()
      assert row['annotation_level'] in valid_annotation_levels
@@ -446,28 +456,31 @@ for ds_name in metadata_table.keys():
  print('Selected {} total images'.format(len(images_to_download)))


- #%% Download images
+ #%% Download images (prep)

  # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)

- # TODO: trivially parallelizable
- #
+ preferred_cloud = 'aws'
+
+ url_to_target_file = {}
+
  # i_image = 10; image = images_to_download[i_image]
  for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):

-     url = image['url']
+     url = image['url_' + preferred_cloud]
      ext = os.path.splitext(url)[1]
-     image_file = os.path.join(preview_folder,'image_{}'.format(str(i_image).zfill(4)) + ext)
-     relative_file = os.path.relpath(image_file,preview_folder)
-     try:
-         download_url(url,image_file,verbose=False)
-         image['relative_file'] = relative_file
-     except urllib.error.HTTPError:
-         print('Image {} does not exist ({}:{})'.format(
-             i_image,image['dataset_name'],image['original_label']))
-         image['relative_file'] = None
+     fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
+     fn_abs = os.path.join(preview_folder,fn_relative)
+     image['relative_file'] = fn_relative
+     image['url'] = url
+     url_to_target_file[url] = fn_abs
+
+
+ #%% Download images (execution)

- # ...for each image we need to download
+ from md_utils.url_utils import parallel_download_urls
+ download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
+                                           n_workers=20,pool_type='thread')


  #%% Write preview HTML
@@ -499,4 +512,4 @@ open_file(html_filename)

  zipped_output_file = zip_file(output_file,verbose=True)

- print('Zipped {} to {}'.format(output_file,zipped_output_file))
+ print('Zipped {} to {}'.format(output_file,zipped_output_file))
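The single 'url' column is gone; the output .csv now carries url_gcp, url_aws, and url_azure, and consumers pick a provider at read time. A minimal sketch of reading the result, assuming a local copy of the output file (the filename below is a placeholder; the real path is configured earlier in the script):

    import pandas as pd

    # Placeholder filename; the script writes its output under ~/lila
    df = pd.read_csv('lila_image_urls_and_labels.csv')

    preferred_cloud = 'gcp'  # 'gcp', 'aws', or 'azure'
    urls = df['url_' + preferred_cloud]
    print(urls.head())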
data_management/lila/get_lila_annotation_counts.py
@@ -1,16 +1,16 @@
- ########
- #
- # get_lila_annotation_counts.py
- #
- # Generates a .json-formatted dictionary mapping each LILA dataset to all categories
- # that exist for that dataset, with counts for the number of occurrences of each category
- # (the number of *annotations* for each category, not the number of *images*).
- #
- # Also loads the taxonomy mapping file, to include scientific names for each category.
- #
- # get_lila_image_counts.py counts the number of *images* for each category in each dataset.
- #
- ########
+ """
+
+ get_lila_annotation_counts.py
+
+ Generates a .json-formatted dictionary mapping each LILA dataset to all categories
+ that exist for that dataset, with counts for the number of occurrences of each category
+ (the number of *annotations* for each category, not the number of *images*).
+
+ Also loads the taxonomy mapping file, to include scientific names for each category.
+
+ get_lila_image_counts.py counts the number of *images* for each category in each dataset.
+
+ """

  #%% Constants and imports

@@ -20,6 +20,9 @@ import os
  from data_management.lila.lila_common import read_lila_metadata,\
      read_metadata_file_for_dataset, read_lila_taxonomy_mapping

+ # cloud provider to use for downloading images; options are 'gcp', 'azure', or 'aws'
+ preferred_cloud = 'gcp'
+
  # array to fill for output
  category_list = []

@@ -96,9 +99,9 @@ for ds_name in metadata_table.keys():
          print('Warning: taxonomy mapping not available for {}'.format(ds_name))

      print('Finding categories in {}'.format(ds_name))
-
+
      json_filename = metadata_table[ds_name]['json_filename']
-     base_url = metadata_table[ds_name]['image_base_url']
+     base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
      assert not base_url.endswith('/')

      # Open the metadata file
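The per-dataset lookup is now keyed by provider ('image_base_url_' + preferred_cloud) rather than a single 'image_base_url' field. A sketch of the metadata_table shape this implies, inferred from the diff; the entry below is hypothetical (the real table comes from read_lila_metadata), and only the GCP base URL matches one seen elsewhere in this diff:

    # Hypothetical entry, shaped to match the keys used above
    metadata_table = {
        'ena24': {
            'json_filename': '/path/to/ena24.json',
            'image_base_url_gcp': 'https://storage.googleapis.com/public-datasets-lila/ena24',
            'image_base_url_aws': 'https://example-bucket.s3.amazonaws.com/ena24',    # hypothetical
            'image_base_url_azure': 'https://example.blob.core.windows.net/ena24',    # hypothetical
        },
    }

    preferred_cloud = 'gcp'
    base_url = metadata_table['ena24']['image_base_url_' + preferred_cloud]
    assert not base_url.endswith('/')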