megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (191)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +93 -79
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
  20. api/batch_processing/postprocessing/compare_batch_results.py +114 -44
  21. api/batch_processing/postprocessing/convert_output_format.py +62 -19
  22. api/batch_processing/postprocessing/load_api_results.py +17 -20
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +165 -68
  25. api/batch_processing/postprocessing/merge_detections.py +40 -15
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
  27. api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +107 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -0
  71. data_management/coco_to_yolo.py +86 -62
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +130 -83
  76. data_management/databases/subset_json_db.py +25 -16
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -144
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -160
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +8 -8
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +309 -159
  120. data_management/labelme_to_yolo.py +103 -60
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +114 -31
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +92 -90
  128. data_management/lila/generate_lila_per_image_labels.py +56 -43
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +103 -70
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +161 -99
  135. data_management/remap_coco_categories.py +84 -0
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +32 -44
  138. data_management/wi_download_csv_to_coco.py +246 -0
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +535 -95
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +189 -114
  147. detection/run_inference_with_yolov5_val.py +118 -51
  148. detection/run_tiled_inference.py +113 -42
  149. detection/tf_detector.py +51 -28
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +249 -70
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -862
  157. md_utils/path_utils.py +655 -155
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +208 -27
  163. md_utils/write_html_image_list.py +51 -35
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +908 -311
  168. md_visualization/visualize_db.py +109 -58
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
  173. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  174. taxonomy_mapping/__init__.py +0 -0
  175. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  176. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  177. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  178. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  179. taxonomy_mapping/retrieve_sample_image.py +12 -12
  180. taxonomy_mapping/simple_image_download.py +11 -11
  181. taxonomy_mapping/species_lookup.py +10 -10
  182. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  183. taxonomy_mapping/taxonomy_graph.py +47 -47
  184. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  185. data_management/cct_json_to_filename_json.py +0 -89
  186. data_management/cct_to_csv.py +0 -140
  187. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  188. detection/detector_training/copy_checkpoints.py +0 -43
  189. md_visualization/visualize_megadb.py +0 -183
  190. megadetector-5.0.7.dist-info/RECORD +0 -202
  191. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
data_management/lila/download_lila_subset.py +92 -90

@@ -1,17 +1,11 @@
-########
-#
-# download_lila_subset.py
-#
-# Example of how to download a list of files from LILA, e.g. all the files
-# in a data set corresponding to a particular species.
-#
-# Organizes the downloaded images by dataset. How you actually want to organize files,
-# what you want to query for, etc., is very application-specific; this is just meant as a
-# demo.
-#
-# Can download from either Azure or GCP.
-#
-########
+"""
+
+download_lila_subset.py
+
+Example of how to download a list of files from LILA, e.g. all the files
+in a data set corresponding to a particular species.
+
+"""
 
 #%% Constants and imports
 
@@ -19,16 +13,15 @@ import os
 import random
 
 from tqdm import tqdm
-from multiprocessing.pool import ThreadPool
-from urllib.parse import urlparse
 from collections import defaultdict
 
-from data_management.lila.lila_common import \
-    read_lila_all_images_file, is_empty, azure_url_to_gcp_http_url
-from md_utils.url_utils import download_url
+from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
+
+for s in lila_base_urls.values():
+    assert s.endswith('/')
 
 # If any of these strings appear in the common name of a species, we'll download that image
-species_of_interest = ['grey fox','red fox','leopard cat','kiwi']
+species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -44,24 +37,22 @@ n_download_threads = 20
 
 max_images_per_dataset = 10 # None
 
-# This impacts the data download, but not the metadata download
-#
-# "Azure" really means "Azure if available"; recent datasets are only available
-# on GCP.
-image_download_source = 'azure' # 'azure' or 'gcp'
+preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'
 
 random.seed(0)
 
 
 #%% Download and open the giant table of image URLs and labels
 
-# ~60 seconds to download, unzip, and open
+# Takes ~60 seconds to download, unzip, and open
 df = read_lila_all_images_file(metadata_dir)
 
 
 #%% Find all the images we want to download
 
-# ~2 minutes
+# Takes ~2 minutes
+
+common_name_to_count = defaultdict(int)
 
 ds_name_to_urls = defaultdict(list)
 
@@ -72,26 +63,33 @@ def find_items(row):
 
     match = False
 
+    # This is the only bit of this file that's specific to a particular query. In this case
+    # we're checking whether each row is on a list of species of interest, but you do you.
     for species_name in species_of_interest:
         if species_name in row['common_name']:
             match = True
+            common_name_to_count[species_name] += 1
             break
 
     if match:
-        ds_name_to_urls[row['dataset_name']].append(row['url'])
+        ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])
 
 tqdm.pandas()
 _ = df.progress_apply(find_items,axis=1)
 
+# We have a list of URLs for each dataset, flatten them all into a list of URLs
 all_urls = list(ds_name_to_urls.values())
 all_urls = [item for sublist in all_urls for item in sublist]
 print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))
 
+for common_name in common_name_to_count:
+    print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
 from copy import deepcopy
 ds_name_to_urls_raw = deepcopy(ds_name_to_urls)
 
 
-#%% Trim to a fixed number of URLs per dataset
+#%% Optionally trim to a fixed number of URLs per dataset
 
 if max_images_per_dataset is None:
     pass
@@ -102,74 +100,78 @@ else:
        ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)
 
 
-#%% Download those image files
+#%% Choose target files for each URL
 
-container_to_url_base = {
-    'lilablobssc.blob.core.windows.net':'/',
-    'storage.googleapis.com':'/public-datasets-lila/'
-    }
+from data_management.lila.lila_common import lila_base_urls
 
-def download_relative_filename(url, output_base, verbose=False, url_base=None, overwrite=False):
-    """
-    Download a URL to output_base, preserving relative path
-    """
-
-    result = {'status':'unknown','url':url,'destination_filename':None}
-
-    if url_base is None:
-        assert url.startswith('https://')
-        container = url.split('/')[2]
-        assert container in container_to_url_base
-        url_base = container_to_url_base[container]
-
-    assert url_base.startswith('/') and url_base.endswith('/')
-
-    p = urlparse(url)
-    relative_filename = str(p.path)
-    # remove the leading '/'
-    assert relative_filename.startswith(url_base)
-    relative_filename = relative_filename.replace(url_base,'',1)
-
-    destination_filename = os.path.join(output_base,relative_filename)
-    result['destination_filename'] = destination_filename
-
-    if ((os.path.isfile(destination_filename)) and (not overwrite)):
-        result['status'] = 'skipped'
-        return result
-    try:
-        download_url(url, destination_filename, verbose=verbose)
-    except Exception as e:
-        print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
-        result['status'] = 'error: {}'.format(str(e))
-        return result
-
-    result['status'] = 'success'
-    return result
+# We have a list of URLs per dataset, flatten that into a single list of URLs
+urls_to_download = set()
+for ds_name in ds_name_to_urls:
+    for url in ds_name_to_urls[ds_name]:
+        urls_to_download.add(url)
+urls_to_download = sorted(list(urls_to_download))
 
+# A URL might look like this:
+#
+# https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+#
+# We'll write that to an output file that looks like this (relative to output_dir):
+#
+# wcs-unzipped/animals/0667/0302.jpg
+#
+# ...so we need to remove the base URL to get the target file.
+base_url = lila_base_urls[preferred_provider]
+assert base_url.endswith('/')
 
-# ds_name_to_urls maps dataset names to lists of URLs; flatten to a single list of URLs
-all_urls = list(ds_name_to_urls.values())
-all_urls = [item for sublist in all_urls for item in sublist]
+url_to_target_file = {}
 
-# Convert Azure URLs to GCP URLs if necessary
-if image_download_source != 'azure':
-    assert image_download_source == 'gcp'
-    all_urls = [azure_url_to_gcp_http_url(url) for url in all_urls]
+for url in urls_to_download:
+    assert url.startswith(base_url)
+    target_fn_relative = url.replace(base_url,'')
+    target_fn_abs = os.path.join(output_dir,target_fn_relative)
+    url_to_target_file[url] = target_fn_abs
 
-print('Downloading {} images on {} workers'.format(len(all_urls),n_download_threads))
 
-if n_download_threads <= 1:
+#%% Download image files
 
-    results = []
-
-    # url = all_urls[0]
-    for url in tqdm(all_urls):
-        results.append(download_relative_filename(url,output_dir,url_base=None))
+from md_utils.url_utils import parallel_download_urls
+
+download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=n_download_threads,
+                                          pool_type='thread')
+
+
+#%% Scrap
+
+if False:
 
-else:
+    pass
+
+    #%% Find all the reptiles on LILA
 
-    pool = ThreadPool(n_download_threads)
-    results = list(tqdm(pool.imap(lambda s: download_relative_filename(
-        s,output_dir,url_base=None),
-        all_urls), total=len(all_urls)))
+    reptile_rows = df.loc[df['class'] == 'reptilia']
+
+    # i_row = 0; row = reptile_rows.iloc[i_row]
+
+    common_name_to_count = defaultdict(int)
+    dataset_to_count = defaultdict(int)
+    for i_row,row in reptile_rows.iterrows():
+        common_name_to_count[row['common_name']] += 1
+        dataset_to_count[row['dataset_name']] += 1
+
+    from md_utils.ct_utils import sort_dictionary_by_value
+
+    print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+    common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+    dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+    print('Common names by count:\n')
+    for k in common_name_to_count:
+        print('{} ({})'.format(k,common_name_to_count[k]))
+
+    print('\nDatasets by count:\n')
+    for k in dataset_to_count:
+        print('{} ({})'.format(k,dataset_to_count[k]))
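The rewrite above drops the bespoke download_relative_filename() helper and the Azure-or-GCP switch in favor of provider-keyed URL columns (url_gcp, url_aws, url_azure) plus the parallel_download_urls() utility from md_utils.url_utils. The following is a minimal end-to-end sketch of the new flow, using only the names and the parallel_download_urls() signature that appear in the diff; the metadata/output directories and the 'fox' filter are placeholders, not part of the package.

import os
from data_management.lila.lila_common import read_lila_all_images_file, lila_base_urls
from md_utils.url_utils import parallel_download_urls

metadata_dir = os.path.expanduser('~/lila/metadata')   # placeholder
output_dir = os.path.expanduser('~/lila/downloads')    # placeholder
provider = 'gcp'                                       # 'gcp', 'aws', or 'azure'

# One row per image, with one URL column per provider
df = read_lila_all_images_file(metadata_dir)

# Pick the rows you care about; this filter is just an example
matches = df[df['common_name'].str.contains('fox', na=False)]

# Map each URL to a local path that preserves the path relative to the provider's base URL
base_url = lila_base_urls[provider]
url_to_target_file = {
    url: os.path.join(output_dir, url.replace(base_url, ''))
    for url in matches['url_' + provider]
}

download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
                                          verbose=False, overwrite=False,
                                          n_workers=20, pool_type='thread')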
data_management/lila/generate_lila_per_image_labels.py +56 -43

@@ -1,19 +1,19 @@
-########
-#
-# generate_lila_per_image_labels.py
-#
-# Generate a .csv file with one row per annotation, containing full URLs to every
-# camera trap image on LILA, with taxonomically expanded labels.
-#
-# Typically there will be one row per image, though images with multiple annotations
-# will have multiple rows.
-#
-# Some images may not physically exist, particularly images that are labeled as "human".
-# This script does not validate image URLs.
-#
-# Does not include bounding box annotations.
-#
-########
+"""
+
+generate_lila_per_image_labels.py
+
+Generate a .csv file with one row per annotation, containing full URLs to every
+camera trap image on LILA, with taxonomically expanded labels.
+
+Typically there will be one row per image, though images with multiple annotations
+will have multiple rows.
+
+Some images may not physically exist, particularly images that are labeled as "human".
+This script does not validate image URLs.
+
+Does not include bounding box annotations.
+
+"""
 
 #%% Constants and imports
 
@@ -23,8 +23,6 @@ import pandas as pd
 import numpy as np
 import dateparser
 import csv
-import urllib
-import urllib.request
 
 from collections import defaultdict
 from tqdm import tqdm
@@ -36,7 +34,6 @@ from data_management.lila.lila_common import read_lila_metadata, \
 from md_utils import write_html_image_list
 from md_utils.path_utils import zip_file
 from md_utils.path_utils import open_file
-from md_utils.url_utils import download_url
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -107,12 +104,15 @@ for i_row,row in taxonomy_df.iterrows():
 
 # Takes several hours
 
-header = ['dataset_name','url','image_id','sequence_id','location_id','frame_num','original_label',\
-          'scientific_name','common_name','datetime','annotation_level']
+# The order of these headers needs to match the order in which fields are added later in this cell;
+# don't mess with this order.
+header = ['dataset_name','url_gcp','url_aws','url_azure',
+          'image_id','sequence_id','location_id','frame_num',
+          'original_label','scientific_name','common_name','datetime','annotation_level']
 
 taxonomy_levels_to_include = \
     ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',\
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
      'variety']
 
 header.extend(taxonomy_levels_to_include)
@@ -179,10 +179,17 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                break
 
            file_name = im['file_name'].replace('\\','/')
-           base_url = metadata_table[ds_name]['image_base_url']
-           assert not base_url.endswith('/')
-           url = base_url + '/' + file_name
+           base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
+           base_url_aws = metadata_table[ds_name]['image_base_url_aws']
+           base_url_azure = metadata_table[ds_name]['image_base_url_azure']
+           assert not base_url_gcp.endswith('/')
+           assert not base_url_aws.endswith('/')
+           assert not base_url_azure.endswith('/')
 
+           url_gcp = base_url_gcp + '/' + file_name
+           url_aws = base_url_aws + '/' + file_name
+           url_azure = base_url_azure + '/' + file_name
+
           for k in im.keys():
               if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                   raise ValueError('Unrecognized datetime field')
@@ -297,7 +304,9 @@
 
           row = []
           row.append(ds_name)
-          row.append(url)
+          row.append(url_gcp)
+          row.append(url_aws)
+          row.append(url_azure)
          row.append(image_id)
          row.append(sequence_id)
          row.append(location_id)
@@ -338,7 +347,7 @@
 
 # ...with open()
 
-print('Processed {} datsets'.format(len(metadata_table)))
+print('Processed {} datasets'.format(len(metadata_table)))
 
 
 #%% Read the .csv back
@@ -365,7 +374,8 @@ dataset_name_to_locations = defaultdict(set)
 def check_row(row):
 
     assert row['dataset_name'] in metadata_table.keys()
-    assert row['url'].startswith('https://')
+    for url_column in ['url_gcp','url_aws','url_azure']:
+        assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
    assert ' : ' in row['image_id']
    assert 'seq' not in row['location_id'].lower()
    assert row['annotation_level'] in valid_annotation_levels
@@ -446,28 +456,31 @@ for ds_name in metadata_table.keys():
 print('Selected {} total images'.format(len(images_to_download)))
 
 
-#%% Download images
+#%% Download images (prep)
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-# TODO: trivially parallelizable
-#
+preferred_cloud = 'aws'
+
+url_to_target_file = {}
+
 # i_image = 10; image = images_to_download[i_image]
 for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
 
-    url = image['url']
+    url = image['url_' + preferred_cloud]
     ext = os.path.splitext(url)[1]
-    image_file = os.path.join(preview_folder,'image_{}'.format(str(i_image).zfill(4)) + ext)
-    relative_file = os.path.relpath(image_file,preview_folder)
-    try:
-        download_url(url,image_file,verbose=False)
-        image['relative_file'] = relative_file
-    except urllib.error.HTTPError:
-        print('Image {} does not exist ({}:{})'.format(
-            i_image,image['dataset_name'],image['original_label']))
-        image['relative_file'] = None
+    fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
+    fn_abs = os.path.join(preview_folder,fn_relative)
+    image['relative_file'] = fn_relative
+    image['url'] = url
+    url_to_target_file[url] = fn_abs
+
+
+#%% Download images (execution)
 
-# ...for each image we need to download
+from md_utils.url_utils import parallel_download_urls
+download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
+                                          n_workers=20,pool_type='thread')
 
 
 #%% Write preview HTML
@@ -499,4 +512,4 @@ open_file(html_filename)
 
 zipped_output_file = zip_file(output_file,verbose=True)
 
-print('Zipped {} to {}'.format(output_file,zipped_output_file))
+print('Zipped {} to {}'.format(output_file,zipped_output_file))
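With this change, the per-image .csv written by generate_lila_per_image_labels.py carries three provider-specific URL columns (url_gcp, url_aws, url_azure) instead of a single url column. A small consumer-side sketch follows; the .csv filename is a placeholder, and the column names are the ones defined in the header list above.

import pandas as pd

# Placeholder path; use whatever output_file the script actually wrote
df = pd.read_csv('lila_image_urls_and_labels.csv')

preferred_cloud = 'aws'   # 'gcp', 'aws', or 'azure'

# Recreate the old single-URL view by selecting one provider's column
df['url'] = df['url_' + preferred_cloud]

print(df[['dataset_name', 'common_name', 'url']].head())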
data_management/lila/get_lila_annotation_counts.py +18 -15

@@ -1,16 +1,16 @@
-########
-#
-# get_lila_annotation_counts.py
-#
-# Generates a .json-formatted dictionary mapping each LILA dataset to all categories
-# that exist for that dataset, with counts for the number of occurrences of each category
-# (the number of *annotations* for each category, not the number of *images*).
-#
-# Also loads the taxonomy mapping file, to include scientific names for each category.
-#
-# get_lila_image_counts.py counts the number of *images* for each category in each dataset.
-#
-########
+"""
+
+get_lila_annotation_counts.py
+
+Generates a .json-formatted dictionary mapping each LILA dataset to all categories
+that exist for that dataset, with counts for the number of occurrences of each category
+(the number of *annotations* for each category, not the number of *images*).
+
+Also loads the taxonomy mapping file, to include scientific names for each category.
+
+get_lila_image_counts.py counts the number of *images* for each category in each dataset.
+
+"""
 
 #%% Constants and imports
 
@@ -20,6 +20,9 @@ import os
 from data_management.lila.lila_common import read_lila_metadata,\
     read_metadata_file_for_dataset, read_lila_taxonomy_mapping
 
+# cloud provider to use for downloading images; options are 'gcp', 'azure', or 'aws'
+preferred_cloud = 'gcp'
+
 # array to fill for output
 category_list = []
 
@@ -96,9 +99,9 @@ for ds_name in metadata_table.keys():
        print('Warning: taxonomy mapping not available for {}'.format(ds_name))
 
    print('Finding categories in {}'.format(ds_name))
-
+
    json_filename = metadata_table[ds_name]['json_filename']
-   base_url = metadata_table[ds_name]['image_base_url']
+   base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
    assert not base_url.endswith('/')
 
    # Open the metadata file
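The only functional change here is that the per-dataset base URL is now looked up per provider. A short sketch of that lookup, assuming (as the diff indicates) that read_lila_metadata() returns a per-dataset dict whose entries include image_base_url_gcp, image_base_url_aws, and image_base_url_azure keys; metadata_dir is a placeholder.

import os
from data_management.lila.lila_common import read_lila_metadata

preferred_cloud = 'gcp'   # 'gcp', 'azure', or 'aws'
metadata_dir = os.path.expanduser('~/lila/metadata')   # placeholder

metadata_table = read_lila_metadata(metadata_dir)

# Print the chosen provider's image base URL for each dataset
for ds_name in metadata_table.keys():
    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
    assert not base_url.endswith('/')
    print('{}: {}'.format(ds_name, base_url))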
data_management/lila/get_lila_image_counts.py +11 -11

@@ -1,14 +1,14 @@
-########
-#
-# get_lila_image_counts.py
-#
-# Count the number of images and bounding boxes with each label in one or more LILA datasets.
-#
-# This script doesn't write these counts out anywhere other than the console, it's just intended
-# as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
-# information out to a .json file, but it counts *annotations*, not *images*, for each category.
-#
-########
+"""
+
+get_lila_image_counts.py
+
+Count the number of images and bounding boxes with each label in one or more LILA datasets.
+
+This script doesn't write these counts out anywhere other than the console, it's just intended
+as a template for doing operations like this on LILA data. get_lila_annotation_counts.py writes
+information out to a .json file, but it counts *annotations*, not *images*, for each category.
+
+"""
 
 #%% Constants and imports
 