megadetector-10.0.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +702 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +528 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +187 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +663 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +876 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2159 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1494 -0
  81. megadetector/detection/run_tiled_inference.py +1038 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1752 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2077 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +213 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +224 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2832 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1759 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1940 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +479 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.13.dist-info/METADATA +134 -0
  144. megadetector-10.0.13.dist-info/RECORD +147 -0
  145. megadetector-10.0.13.dist-info/WHEEL +5 -0
  146. megadetector-10.0.13.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.13.dist-info/top_level.txt +1 -0
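
For orientation before the per-file diffs below: once the wheel is installed, the modules listed above are importable as the megadetector package. The following is a minimal sketch of the package's documented batch-inference workflow, not part of this diff; the model name 'MDV5A' refers to MegaDetector v5a, and the image folder and output path are illustrative assumptions.

import os

# Entry points from megadetector/detection/run_detector_batch.py and
# megadetector/utils/path_utils.py (files 78 and 128 above); treat the exact
# signatures as assumptions based on the package's documented usage
from megadetector.detection.run_detector_batch import \
    load_and_run_detector_batch, write_results_to_file
from megadetector.utils import path_utils

image_folder = os.path.expanduser('~/camera_trap_images')  # hypothetical folder
output_file = os.path.expanduser('~/md_results.json')      # hypothetical path

# Enumerate images, run the detector (downloading the model if necessary),
# and write a MegaDetector-format .json results file
image_file_names = path_utils.find_images(image_folder, recursive=True)
results = load_and_run_detector_batch('MDV5A', image_file_names)
write_results_to_file(results, output_file,
                      relative_path_base=image_folder,
                      detector_file='MDV5A')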
megadetector/data_management/lila/create_lila_test_set.py
@@ -0,0 +1,187 @@
+"""
+
+create_lila_test_set.py
+
+Create a test set of camera trap images, containing N empty and N non-empty
+images from each LILA data set.
+
+"""
+
+#%% Constants and imports
+
+import json
+import os
+import random
+
+from megadetector.data_management.lila.lila_common import \
+    read_lila_metadata, read_metadata_file_for_dataset
+from megadetector.utils.url_utils import parallel_download_urls
+from megadetector.utils.path_utils import open_file
+
+n_empty_images_per_dataset = 1
+n_non_empty_images_per_dataset = 1
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+output_dir = os.path.join(lila_local_base,'lila_test_set')
+os.makedirs(output_dir,exist_ok=True)
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+random.seed(0)
+
+
+#%% Download and parse the metadata file
+
+metadata_table = read_lila_metadata(metadata_dir)
+
+
+#%% Download and extract metadata for every dataset
+
+for ds_name in metadata_table.keys():
+    metadata_table[ds_name]['metadata_filename'] = \
+        read_metadata_file_for_dataset(ds_name=ds_name,
+                                       metadata_dir=metadata_dir,
+                                       metadata_table=metadata_table)
+
+
+#%% Choose images from each dataset
+
+# Takes ~60 seconds
+
+empty_category_names = ['empty','blank']
+
+# ds_name = (list(metadata_table.keys()))[0]
+for ds_name in metadata_table.keys():
+
+    print('Choosing images for {}'.format(ds_name))
+
+    json_filename = metadata_table[ds_name]['metadata_filename']
+
+    with open(json_filename,'r') as f:
+        d = json.load(f)
+
+    category_id_to_name = {c['id']:c['name'] for c in d['categories']}
+    category_name_to_id = {c['name']:c['id'] for c in d['categories']}
+
+    ## Find empty images
+
+    # Initialized here so the non-empty filter below is well-defined even for
+    # datasets with no empty category
+    empty_category_id = None
+
+    empty_category_present = False
+    for category_name in category_name_to_id:
+        if category_name in empty_category_names:
+            empty_category_present = True
+            break
+
+    if not empty_category_present:
+        empty_annotations_to_download = []
+    else:
+        for category_name in empty_category_names:
+            if category_name in category_name_to_id:
+                if empty_category_id is not None:
+                    print('Warning: multiple empty categories in dataset {}'.format(ds_name))
+                else:
+                    empty_category_id = category_name_to_id[category_name]
+        assert empty_category_id is not None
+        empty_annotations = [ann for ann in d['annotations'] if ann['category_id'] == empty_category_id]
+        try:
+            empty_annotations_to_download = random.sample(empty_annotations,n_empty_images_per_dataset)
+        except ValueError:
+            print('No empty images available for dataset {}'.format(ds_name))
+            empty_annotations_to_download = []
+
+    ## Find non-empty images
+
+    non_empty_annotations = [ann for ann in d['annotations'] if ann['category_id'] != empty_category_id]
+    try:
+        non_empty_annotations_to_download = random.sample(non_empty_annotations,n_non_empty_images_per_dataset)
+    except ValueError:
+        print('No non-empty images available for dataset {}'.format(ds_name))
+        non_empty_annotations_to_download = []
+
+    annotations_to_download = empty_annotations_to_download + non_empty_annotations_to_download
+
+    # Sampled annotations should refer to distinct images
+    image_ids_to_download = set([ann['image_id'] for ann in annotations_to_download])
+    assert len(image_ids_to_download) == len(annotations_to_download)
+
+    images_to_download = []
+    for im in d['images']:
+        if im['id'] in image_ids_to_download:
+            images_to_download.append(im)
+    assert len(images_to_download) == len(image_ids_to_download)
+
+    metadata_table[ds_name]['images_to_download'] = images_to_download
+
+# ...for each dataset
+
+
+#%% Convert to URLs
+
+preferred_cloud = 'gcp'
+
+# ds_name = (list(metadata_table.keys()))[0]
+for ds_name in metadata_table.keys():
+
+    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not base_url.endswith('/')
+
+    # Retrieve image file names
+    filenames = [im['file_name'] for im in metadata_table[ds_name]['images_to_download']]
+
+    urls_to_download = []
+
+    # Convert to URLs
+    for fn in filenames:
+        url = base_url + '/' + fn
+        urls_to_download.append(url)
+
+    metadata_table[ds_name]['urls_to_download'] = urls_to_download
+
+# ...for each dataset
+
+
+#%% Download image files (prep)
+
+url_to_target_file = {}
+
+# ds_name = (list(metadata_table.keys()))[0]
+for ds_name in metadata_table.keys():
+
+    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
+    assert not base_url.endswith('/')
+    base_url += '/'
+
+    urls_to_download = metadata_table[ds_name]['urls_to_download']
+
+    # url = urls_to_download[0]
+    for url in urls_to_download:
+
+        assert base_url in url
+        output_file_relative = ds_name.lower().replace(' ','_') + \
+            '_' + url.replace(base_url,'').replace('/','_').replace('\\','_')
+        output_file_absolute = os.path.join(output_dir,output_file_relative)
+        url_to_target_file[url] = output_file_absolute
+
+    # ...for each url
+
+# ...for each dataset
+
+
+#%% Download image files (execution)
+
+download_results = parallel_download_urls(url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=20,
+                                          pool_type='thread')
+
+# r = download_results[0]
+for r in download_results:
+    assert r['status'] in ('skipped','success')
+
+
+#%% Open the test set
+
+open_file(output_dir)
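
The status assertions at the end of create_lila_test_set.py raise on the first failed download. A minimal sketch of a gentler post-download check, using only names already defined in the script above (os is already imported there):

# Hedged alternative to the status assertions: report any target files that
# didn't land on disk, rather than raising on the first failure
missing_files = [fn for fn in url_to_target_file.values() if not os.path.isfile(fn)]
print('{} of {} downloads missing'.format(len(missing_files), len(url_to_target_file)))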
megadetector/data_management/lila/create_links_to_md_results_files.py
@@ -0,0 +1,106 @@
+"""
+
+create_links_to_md_results_files.py
+
+One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+
+"""
+
+#%% Imports and constants
+
+import os
+
+import pandas as pd
+
+input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
+output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
+
+md_results_local_folder = r'g:\temp\lila-md-results'
+md_base_url = 'https://lila.science/public/lila-md-results/'
+assert md_base_url.endswith('/')
+
+# No RDE files for datasets with no location information
+datasets_without_location_info = ('ena24','missouri-camera-traps')
+
+md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+validate_urls = False
+
+
+#%% Read input data
+
+df = pd.read_csv(input_csv_file)
+for s in md_results_column_names:
+    df[s] = ''
+
+
+#%% Find matching files locally, and create URLs
+
+local_files = os.listdir(md_results_local_folder)
+local_files = [fn for fn in local_files if fn.endswith('.zip')]
+
+# i_row = 0; row = df.iloc[i_row]
+for i_row,row in df.iterrows():
+
+    if not isinstance(row['name'],str):
+        continue
+
+    dataset_shortname = row['short_name']
+    matching_files = [fn for fn in local_files if dataset_shortname in fn]
+
+    # No RDE files for datasets with no location information
+    if dataset_shortname in datasets_without_location_info:
+        assert len(matching_files) == 2
+        mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
+        mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
+        assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
+        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+    else:
+        # Exclude single-season files for snapshot-serengeti
+        if dataset_shortname == 'snapshot-serengeti':
+            matching_files = [fn for fn in matching_files if '_S' not in fn]
+            assert len(matching_files) == 2
+            assert all(['mdv4' in fn for fn in matching_files])
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            raw_files = [fn for fn in matching_files if 'rde' not in fn]
+            assert len(rde_files) == 1 and len(raw_files) == 1
+            df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+        else:
+            assert len(matching_files) == 3
+            mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
+            mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
+            df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+            df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+
+    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
+
+# ...for each row
+
+
+#%% Validate URLs
+
+if validate_urls:
+
+    from megadetector.utils.url_utils import test_urls
+
+    urls = set()
+
+    for i_row,row in df.iterrows():
+        for column_name in md_results_column_names:
+            if len(row[column_name]) > 0:
+                assert row[column_name] not in urls
+                urls.add(row[column_name])
+
+    test_urls(urls,error_on_failure=True)
+
+    print('Validated {} URLs'.format(len(urls)))
+
+
+#%% Write new .csv file

+df.to_csv(output_csv_file,header=True,index=False)
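
Because create_links_to_md_results_files.py asserts its way through the matching logic, it can be useful to confirm, before writing the output .csv, that every named dataset picked up at least one results URL. A minimal sketch, using only the DataFrame and column names defined in the script above:

# Hedged sanity check: list dataset rows whose MD-results columns all remain empty
for i_row, row in df.iterrows():
    if not isinstance(row['name'], str):
        continue
    if all(len(row[c]) == 0 for c in md_results_column_names):
        print('No MD results linked for {}'.format(row['short_name']))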
megadetector/data_management/lila/download_lila_subset.py
@@ -0,0 +1,182 @@
+"""
+
+download_lila_subset.py
+
+Example of how to download a list of files from LILA, e.g. all the files
+in a data set corresponding to a particular species.
+
+"""
+
+#%% Constants and imports
+
+import os
+import random
+
+from tqdm import tqdm
+from collections import defaultdict
+from copy import deepcopy
+
+from megadetector.data_management.lila.lila_common import \
+    read_lila_all_images_file, is_empty, lila_base_urls
+from megadetector.utils.url_utils import parallel_download_urls
+from megadetector.utils.path_utils import open_file
+
+for s in lila_base_urls.values():
+    assert s.endswith('/')
+
+# If any of these strings appear in the common name of a species, we'll download that image
+# species_of_interest = ['grey fox','gray fox','cape fox','red fox','kit fox']
+species_of_interest = ['bear']
+
+# We'll write images, metadata downloads, and temporary files here
+lila_local_base = os.path.expanduser('~/lila')
+
+metadata_dir = os.path.join(lila_local_base,'metadata')
+os.makedirs(metadata_dir,exist_ok=True)
+
+output_dir = os.path.join(lila_local_base,'lila_downloads_by_dataset')
+os.makedirs(output_dir,exist_ok=True)
+
+# Number of concurrent download threads
+n_download_threads = 20
+
+max_images_per_dataset = 10 # None
+
+preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'
+
+random.seed(0)
+
+
+#%% Download and open the giant table of image URLs and labels
+
+# Takes ~2 minutes to download, unzip, and open
+df = read_lila_all_images_file(metadata_dir)
+
+
+#%% Find all the images we want to download
+
+# Takes ~2 minutes
+
+common_name_to_count = defaultdict(int)
+
+ds_name_to_urls = defaultdict(list)
+
+def find_items(row): # noqa
+
+    if is_empty(row['common_name']):
+        return
+
+    match = False
+
+    # This is the only bit of this file that's specific to a particular query. In this case
+    # we're checking whether each row is on a list of species of interest, but you do you
+    # (there's a sketch of an alternative query after this file's diff).
+    for species_name in species_of_interest:
+        if species_name in row['common_name']:
+            match = True
+            common_name_to_count[species_name] += 1
+            break
+
+    if match:
+        ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])
+
+tqdm.pandas()
+_ = df.progress_apply(find_items,axis=1)
+
+# We have a list of URLs for each dataset; flatten them all into a single list of URLs
+all_urls = list(ds_name_to_urls.values())
+all_urls = [item for sublist in all_urls for item in sublist]
+print('Found {} matching URLs across {} datasets'.format(len(all_urls),len(ds_name_to_urls)))
+
+for common_name in common_name_to_count:
+    print('{}: {}'.format(common_name,common_name_to_count[common_name]))
+
+ds_name_to_urls_raw = deepcopy(ds_name_to_urls)
+
+
+#%% Optionally trim to a fixed number of URLs per dataset
+
+if max_images_per_dataset is None:
+    pass
+else:
+    # ds_name = next(iter(ds_name_to_urls.keys()))
+    for ds_name in ds_name_to_urls:
+        if len(ds_name_to_urls[ds_name]) > max_images_per_dataset:
+            ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)
+
+
+#%% Choose target files for each URL
+
+# We have a list of URLs per dataset; flatten that into a single sorted list
+urls_to_download = set()
+for ds_name in ds_name_to_urls:
+    for url in ds_name_to_urls[ds_name]:
+        urls_to_download.add(url)
+urls_to_download = sorted(list(urls_to_download))
+
+# A URL might look like this:
+#
+# https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+#
+# We'll write that to an output file that looks like this (relative to output_dir):
+#
+# wcs-unzipped/animals/0667/0302.jpg
+#
+# ...so we need to remove the base URL to get the target file.
+base_url = lila_base_urls[preferred_provider]
+assert base_url.endswith('/')
+
+url_to_target_file = {}
+
+for url in urls_to_download:
+    assert url.startswith(base_url)
+    target_fn_relative = url.replace(base_url,'')
+    target_fn_abs = os.path.join(output_dir,target_fn_relative)
+    url_to_target_file[url] = target_fn_abs
+
+
+#%% Download image files
+
+download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=n_download_threads,
+                                          pool_type='thread')
+
+
+#%% Open output folder
+
+open_file(output_dir)
+
+
+#%% Scrap
+
+if False:
+
+    pass
+
+    #%% Find all the reptiles on LILA
+
+    reptile_rows = df.loc[df['class'] == 'reptilia']
+
+    # i_row = 0; row = reptile_rows.iloc[i_row]
+
+    common_name_to_count = defaultdict(int)
+    dataset_to_count = defaultdict(int)
+    for i_row,row in reptile_rows.iterrows():
+        common_name_to_count[row['common_name']] += 1
+        dataset_to_count[row['dataset_name']] += 1
+
+    from megadetector.utils.ct_utils import sort_dictionary_by_value
+
+    print('Found {} reptiles\n'.format(len(reptile_rows)))
+
+    common_name_to_count = sort_dictionary_by_value(common_name_to_count,reverse=True)
+    dataset_to_count = sort_dictionary_by_value(dataset_to_count,reverse=True)
+
+    print('Common names by count:\n')
+    for k in common_name_to_count:
+        print('{} ({})'.format(k,common_name_to_count[k]))
+
+    print('\nDatasets by count:\n')
+    for k in dataset_to_count:
+        print('{} ({})'.format(k,dataset_to_count[k]))
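
As the find_items comment above notes, the species filter is the only query-specific code in download_lila_subset.py. Here is a hedged sketch of one alternative query, selecting every image from a fixed list of datasets instead of matching species names; the dataset name used here is an illustrative assumption, and valid values are whatever appears in the dataset_name column of df.

# Hypothetical alternative query: take all rows from specific datasets rather
# than matching species names; reuses ds_name_to_urls and preferred_provider
# from the script above
datasets_of_interest = ['Caltech Camera Traps']  # illustrative value

def find_items(row):
    if row['dataset_name'] not in datasets_of_interest:
        return
    ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])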