megadetector 5.0.11-py3-none-any.whl → 5.0.12-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Note: this release of megadetector has been flagged as potentially problematic.

Files changed (201)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +98 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +152 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +92 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +126 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +610 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +239 -0
  58. megadetector/data_management/cct_json_utils.py +395 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +272 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +477 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +796 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +874 -0
  129. megadetector/data_management/read_exif.py +681 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/resize_coco_dataset.py +189 -0
  133. megadetector/data_management/wi_download_csv_to_coco.py +246 -0
  134. megadetector/data_management/yolo_output_to_md_output.py +441 -0
  135. megadetector/data_management/yolo_to_coco.py +676 -0
  136. megadetector/detection/__init__.py +0 -0
  137. megadetector/detection/detector_training/__init__.py +0 -0
  138. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  139. megadetector/detection/process_video.py +702 -0
  140. megadetector/detection/pytorch_detector.py +341 -0
  141. megadetector/detection/run_detector.py +779 -0
  142. megadetector/detection/run_detector_batch.py +1219 -0
  143. megadetector/detection/run_inference_with_yolov5_val.py +917 -0
  144. megadetector/detection/run_tiled_inference.py +934 -0
  145. megadetector/detection/tf_detector.py +189 -0
  146. megadetector/detection/video_utils.py +606 -0
  147. megadetector/postprocessing/__init__.py +0 -0
  148. megadetector/postprocessing/add_max_conf.py +64 -0
  149. megadetector/postprocessing/categorize_detections_by_size.py +163 -0
  150. megadetector/postprocessing/combine_api_outputs.py +249 -0
  151. megadetector/postprocessing/compare_batch_results.py +958 -0
  152. megadetector/postprocessing/convert_output_format.py +396 -0
  153. megadetector/postprocessing/load_api_results.py +195 -0
  154. megadetector/postprocessing/md_to_coco.py +310 -0
  155. megadetector/postprocessing/md_to_labelme.py +330 -0
  156. megadetector/postprocessing/merge_detections.py +401 -0
  157. megadetector/postprocessing/postprocess_batch_results.py +1902 -0
  158. megadetector/postprocessing/remap_detection_categories.py +170 -0
  159. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  160. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  161. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  162. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1631 -0
  163. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  164. megadetector/postprocessing/subset_json_detector_output.py +696 -0
  165. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  166. megadetector/taxonomy_mapping/__init__.py +0 -0
  167. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  168. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  169. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  170. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +590 -0
  171. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  172. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  173. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  174. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  175. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  176. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  177. megadetector/utils/__init__.py +0 -0
  178. megadetector/utils/azure_utils.py +178 -0
  179. megadetector/utils/ct_utils.py +612 -0
  180. megadetector/utils/directory_listing.py +246 -0
  181. megadetector/utils/md_tests.py +968 -0
  182. megadetector/utils/path_utils.py +1044 -0
  183. megadetector/utils/process_utils.py +157 -0
  184. megadetector/utils/sas_blob_utils.py +509 -0
  185. megadetector/utils/split_locations_into_train_val.py +228 -0
  186. megadetector/utils/string_utils.py +92 -0
  187. megadetector/utils/url_utils.py +323 -0
  188. megadetector/utils/write_html_image_list.py +225 -0
  189. megadetector/visualization/__init__.py +0 -0
  190. megadetector/visualization/plot_utils.py +293 -0
  191. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  192. megadetector/visualization/visualization_utils.py +1536 -0
  193. megadetector/visualization/visualize_db.py +550 -0
  194. megadetector/visualization/visualize_detector_output.py +405 -0
  195. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/METADATA +1 -1
  196. megadetector-5.0.12.dist-info/RECORD +199 -0
  197. megadetector-5.0.12.dist-info/top_level.txt +1 -0
  198. megadetector-5.0.11.dist-info/RECORD +0 -5
  199. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  200. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/LICENSE +0 -0
  201. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/WHEEL +0 -0
megadetector/taxonomy_mapping/preview_lila_taxonomy.py
@@ -0,0 +1,590 @@
+ """
+
+ preview_lila_taxonomy.py
+
+ Does some consistency-checking on the LILA taxonomy file, and generates
+ an HTML preview page that we can use to determine whether the mappings
+ make sense.
+
+ """
+
+ #%% Imports and constants
+
+ from tqdm import tqdm
+
+ import os
+ import pandas as pd
+
+ # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
+ lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
+
+ preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
+ os.makedirs(preview_base,exist_ok=True)
+ html_output_file = os.path.join(preview_base,'index.html')
+
+
+ #%% Support functions
+
+ def parse_taxonomy_string(taxonomy_string):
+
+     taxonomic_match = eval(taxonomy_string)
+     matched_entity = taxonomic_match[0]
+     assert len(matched_entity) == 4
+
+     level = matched_entity[1]
+
+     scientific_name = matched_entity[2]
+
+     common_names = matched_entity[3]
+     if len(common_names) == 1:
+         common_name = common_names[0]
+     else:
+         common_name = str(common_names)
+
+     return scientific_name,common_name,level,taxonomic_match
+
+ def taxonomy_string_to_common_name(taxonomy_string):
+     _,cn,_,_ = parse_taxonomy_string(taxonomy_string)
+     return cn
+
+ def taxonomy_string_to_scientific(taxonomy_string):
+     sn,_,_,_ = parse_taxonomy_string(taxonomy_string)
+     return sn
+
+ def taxonomy_string_to_level(taxonomy_string):
+     _,_,level,_ = parse_taxonomy_string(taxonomy_string)
+     return level
+
+
+ #%% Read the taxonomy mapping file
+
+ df = pd.read_csv(lila_taxonomy_file)
+
+
+ #%% Prepare taxonomy lookup
+
+ from megadetector.taxonomy_mapping.species_lookup import \
+     initialize_taxonomy_lookup, get_preferred_taxonomic_match
+
+ # from taxonomy_mapping.species_lookup import (
+ #     get_taxonomic_info, print_taxonomy_matche)
+
+ initialize_taxonomy_lookup()
+
+
+ #%% Optionally remap all gbif-based mappings to inat (or vice-versa)
+
+ if False:
+
+     #%%
+
+     source_mappings = ['gbif','manual']
+     target_mapping = 'inat'
+     valid_mappings = ['gbif','inat','manual']
+
+     assert target_mapping in valid_mappings
+     for source_mapping in source_mappings:
+         assert source_mapping != target_mapping and \
+             source_mapping in valid_mappings
+
+     n_remappings = 0
+
+     # i_row = 1; row = df.iloc[i_row]; row
+     for i_row,row in df.iterrows():
+
+         if row['source'] not in source_mappings:
+             continue
+
+         scientific_name = row['scientific_name']
+         old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
+
+         m = get_preferred_taxonomic_match(scientific_name,target_mapping)
+
+         if m is None or m.source != target_mapping:
+             print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
+             continue
+
+         assert m.scientific_name == row['scientific_name']
+
+         if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
+             pass
+         else:
+             assert m.taxonomic_level == row['taxonomy_level']
+
+         new_common = taxonomy_string_to_common_name(m.taxonomy_string)
+
+         if row['taxonomy_string'] != m.taxonomy_string:
+             print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
+             n_remappings += 1
+             df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
+
+             if row['source'] != 'manual':
+                 df.loc[i_row,'source'] = m.source
+
+     # This should be zero for the release .csv
+     print('Made {} remappings'.format(n_remappings))
+
+     #%%
+
+     df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
+
+
+ #%% Check for mappings that disagree with the taxonomy string
+
+ df = pd.read_csv(lila_taxonomy_file)
+
+ n_taxonomy_changes = 0
+
+ # Look for internal inconsistency
+ for i_row,row in df.iterrows():
+
+     sn = row['scientific_name']
+     if not isinstance(sn,str):
+         continue
+
+     ts = row['taxonomy_string']
+     assert sn == taxonomy_string_to_scientific(ts)
+
+     assert row['taxonomy_level'] == taxonomy_string_to_level(ts)
+
+ # Look for outdated mappings
+ taxonomy_preference = 'inat'
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+     sn = row['scientific_name']
+     if not isinstance(sn,str):
+         continue
+
+     m = get_preferred_taxonomic_match(sn,taxonomy_preference)
+     assert m.scientific_name == sn
+
+     ts = row['taxonomy_string']
+     assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
+         row['dataset_name'],ts,m.taxonomy_string)
+
+     if ts != m.taxonomy_string:
+         n_taxonomy_changes += 1
+         df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
+
+ print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))
+
+ # Optionally re-write
+ if False:
+     df.to_csv(lila_taxonomy_file,header=True,index=False)
+
+
+ #%% List null mappings
+
+ # These should all be things like "empty", "unidentified", "fire", "car", etc.
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+     if (not isinstance(row['taxonomy_string'],str)) or (len(row['taxonomy_string']) == 0):
+         print('No mapping for {}:{}'.format(row['dataset_name'],row['query']))
+
+
+ #%% List mappings with scientific names but no common names
+
+ for i_row,row in df.iterrows():
+     cn = row['common_name']
+     sn = row['scientific_name']
+     ts = row['taxonomy_string']
+     if (isinstance(ts,str)) and (len(ts) > 0):
+         if (not isinstance(cn,str)) or (len(cn) == 0):
+             print('No common name for {}:{}:{}'.format(row['dataset_name'],row['query'],row['scientific_name']))
+
+
+ #%% List mappings that map to different things in different data sets
+
+ import numpy as np
+ def isnan(x):
+     if not isinstance(x,float):
+         return False
+     return np.isnan(x)
+
+ from collections import defaultdict
+ query_to_rows = defaultdict(list)
+
+ queries_with_multiple_mappings = set()
+
+ n_suppressed = 0
+
+ suppress_multiple_matches = [
+     ['porcupine','Snapshot Camdeboo','Idaho Camera Traps'],
+     ['porcupine','Snapshot Enonkishu','Idaho Camera Traps'],
+     ['porcupine','Snapshot Karoo','Idaho Camera Traps'],
+     ['porcupine','Snapshot Kgalagadi','Idaho Camera Traps'],
+     ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
+     ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
+     ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
+
+     ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
+     ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
+     ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
+     ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
+     ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
+
+     ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
+
+     ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
+     ['kudu','Snapshot Serengeti','Snapshot Kruger'],
+     ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
+     ['kudu','Snapshot Serengeti','Snapshot Karoo'],
+     ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
+
+     ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
+     ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
+     ['fox','Idaho Camera Traps','Caltech Camera Traps'],
+
+     ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
+
+     ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
+     ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
+
+     ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
+
+ ]
+
+ for i_row,row in df.iterrows():
+
+     query = row['query']
+     taxonomy_string = row['taxonomy_string']
+
+     for previous_i_row in query_to_rows[query]:
+
+         previous_row = df.iloc[previous_i_row]
+         assert previous_row['query'] == query
+         query_match = False
+         if isnan(row['taxonomy_string']):
+             query_match = isnan(previous_row['taxonomy_string'])
+         elif isnan(previous_row['taxonomy_string']):
+             query_match = isnan(row['taxonomy_string'])
+         else:
+             query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
+
+         if not query_match:
+
+             suppress = False
+
+             # x = suppress_multiple_matches[-1]
+             for x in suppress_multiple_matches:
+                 if x[0] == query and \
+                     ( \
+                      (x[1] == row['dataset_name'] and x[2] == previous_row['dataset_name']) \
+                      or \
+                      (x[2] == row['dataset_name'] and x[1] == previous_row['dataset_name']) \
+                     ):
+                     suppress = True
+                     n_suppressed += 1
+                     break
+
+             if not suppress:
+                 print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
+                     query, row['dataset_name'], previous_row['dataset_name'],
+                     taxonomy_string, previous_row['taxonomy_string']))
+
+                 queries_with_multiple_mappings.add(query)
+
+     # ...for each row where we saw this query
+
+     query_to_rows[query].append(i_row)
+
+ # ...for each row
+
+ print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
+     len(queries_with_multiple_mappings),n_suppressed))
+
+
+ #%% Verify that nothing "unidentified" maps to a species or subspecies
+
+ # E.g., "unidentified skunk" should never map to a specific species of skunk
+
+ allowable_unknown_species = [
+     'unknown_tayra' # AFAIK this is a unique species, I'm not sure what's implied here
+ ]
+
+ unk_queries = ['skunk']
+ for i_row,row in df.iterrows():
+
+     query = row['query']
+     level = row['taxonomy_level']
+
+     if not isinstance(level,str):
+         assert not isinstance(row['taxonomy_string'],str)
+         continue
+
+     if ( \
+         'unidentified' in query or \
+         ('unk' in query and ('skunk' not in query and 'chipmunk' not in query)) \
+        ) \
+        and \
+        ('species' in level):
+
+         if query not in allowable_unknown_species:
+
+             print('Warning: query {}:{} maps to {} {}'.format(
+                 row['dataset_name'],
+                 row['query'],
+                 row['taxonomy_level'],
+                 row['scientific_name']
+                 ))
+
+
+ #%% Make sure there are valid source and level values for everything with a mapping
+
+ for i_row,row in df.iterrows():
+     if isinstance(row['scientific_name'],str):
+         if 'source' in row:
+             assert isinstance(row['source'],str)
+         assert isinstance(row['taxonomy_level'],str)
+
+
+ #%% Find WCS mappings that aren't species or aren't the same as the input
+
+ # WCS used scientific names, so these remappings are slightly more controversial
+ # than the standard remappings.
+
+ # row = df.iloc[-500]
+ for i_row,row in df.iterrows():
+
+     if not isinstance(row['scientific_name'],str):
+         continue
+     if 'WCS' not in row['dataset_name']:
+         continue
+
+     query = row['query']
+     scientific_name = row['scientific_name']
+     common_name = row['common_name']
+     level = row['taxonomy_level']
+     taxonomy_string = row['taxonomy_string']
+
+     common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
+     query_string = query.replace(' sp','')
+     query_string = query_string.replace('unknown ','')
+
+     # Anything marked "species" or "unknown" by definition doesn't map to a species,
+     # so ignore these.
+     if (' sp' not in query) and ('unknown' not in query) and \
+         (level != 'species') and (level != 'subspecies'):
+         print('WCS query {} ({}) remapped to {} {} ({})'.format(
+             query,common_name,level,scientific_name,common_name_from_taxonomy))
+
+     if query_string != scientific_name:
+         pass
+         # print('WCS query {} ({}) remapped to {} ({})'.format(
+         #     query,common_name,scientific_name,common_names_from_taxonomy))
+
+
+ #%% Download sample images for all scientific names
+
+ remapped_queries = {'papio':'papio+baboon',
+                     'damaliscus lunatus jimela':'damaliscus lunatus',
+                     'mazama':'genus+mazama',
+                     'mirafra':'genus+mirafra'}
+
+ import os
+ from megadetector.taxonomy_mapping import retrieve_sample_image
+
+ scientific_name_to_paths = {}
+ image_base = os.path.join(preview_base,'images')
+ images_per_query = 15
+ min_valid_images_per_query = 3
+ min_valid_image_size = 3000
+
+ # TODO: trivially parallelizable
+ #
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+
+     s = row['scientific_name']
+
+     if (not isinstance(s,str)) or (len(s)==0):
+         continue
+
+     query = s.replace(' ','+')
+
+     if query in remapped_queries:
+         query = remapped_queries[query]
+
+     query_folder = os.path.join(image_base,query)
+     os.makedirs(query_folder,exist_ok=True)
+
+     # Check whether we already have enough images for this query
+     image_files = os.listdir(query_folder)
+     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+     sizes = [os.path.getsize(p) for p in image_fullpaths]
+     sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
+     if len(sizes_above_threshold) > min_valid_images_per_query:
+         print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
+         continue
+
+     # Check whether we've already run this query for a previous row
+     if query in scientific_name_to_paths:
+         continue
+
+     print('Processing query {} of {} ({})'.format(i_row,len(df),query))
+     paths = retrieve_sample_image.download_images(query=query,
+                                                   output_directory=image_base,
+                                                   limit=images_per_query,
+                                                   verbose=True)
+     print('Downloaded {} images for {}'.format(len(paths),query))
+     scientific_name_to_paths[query] = paths
+
+ # ...for each row in the mapping table
+
+
+ #%% Rename .jpeg to .jpg
+
+ from megadetector.utils import path_utils
+ all_images = path_utils.recursive_file_list(image_base,False)
+
+ for fn in tqdm(all_images):
+     if fn.lower().endswith('.jpeg'):
+         new_fn = fn[0:-5] + '.jpg'
+         os.rename(fn, new_fn)
+
+
+ #%% Choose representative images for each scientific name
+
+ # Specifically, sort by size, and take the largest unique sizes. Very small files tend
+ # to be bogus thumbnails, etc.
+
+ max_images_per_query = 4
+ scientific_name_to_preferred_images = {}
+
+ # s = list(scientific_name_to_paths.keys())[0]
+ for s in list(df.scientific_name):
+
+     if not isinstance(s,str):
+         continue
+
+     query = s.replace(' ','+')
+
+     if query in remapped_queries:
+         query = remapped_queries[query]
+
+     query_folder = os.path.join(image_base,query)
+     assert os.path.isdir(query_folder)
+     image_files = os.listdir(query_folder)
+     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+     sizes = [os.path.getsize(p) for p in image_fullpaths]
+     path_to_size = {}
+     for i_fp,fp in enumerate(image_fullpaths):
+         path_to_size[fp] = sizes[i_fp]
+     paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
+
+     # Be suspicious of duplicate sizes
+     b_duplicate_sizes = [False] * len(paths_by_size)
+
+     for i_path,p in enumerate(paths_by_size):
+         if i_path == len(paths_by_size) - 1:
+             continue
+         if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
+             b_duplicate_sizes[i_path] = True
+
+     paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
+
+     preferred_paths = paths_by_size_non_dup[:max_images_per_query]
+     scientific_name_to_preferred_images[s] = preferred_paths
+
+ # ...for each scientific name
+
+
+ #%% Delete unused images
+
+ used_images = []
+ for images in scientific_name_to_preferred_images.values():
+     used_images.extend(images)
+
+ print('Using a total of {} images'.format(len(used_images)))
+ used_images_set = set(used_images)
+
+ from megadetector.utils import path_utils
+ all_images = path_utils.recursive_file_list(image_base,False)
+
+ unused_images = []
+ for fn in all_images:
+     if fn not in used_images_set:
+         unused_images.append(fn)
+
+ print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images),
+     len(all_images) - len(unused_images)))
+
+ for fn in tqdm(unused_images):
+     os.remove(fn)
+
+
+ #%% Produce HTML preview
+
+ with open(html_output_file, 'w', encoding='utf-8') as f:
+
+     f.write('<html><head></head><body>\n')
+
+     names = scientific_name_to_preferred_images.keys()
+     names = sorted(names)
+
+     f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
+             'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name '
+             '(taxonomic_common_name) (manual_common_name)</p>\n')
+
+     # i_row = 2; row = df.iloc[i_row]
+     for i_row, row in tqdm(df.iterrows(), total=len(df)):
+
+         s = row['scientific_name']
+
+         taxonomy_string = row['taxonomy_string']
+         if isinstance(taxonomy_string,str):
+             taxonomic_match = eval(taxonomy_string)
+             matched_entity = taxonomic_match[0]
+             assert len(matched_entity) == 4
+             common_names = matched_entity[3]
+             if len(common_names) == 1:
+                 common_name_string = common_names[0]
+             else:
+                 common_name_string = str(common_names)
+         else:
+             common_name_string = ''
+
+         f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">')
+
+         if isinstance(row.scientific_name,str):
+             output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
+                 row.dataset_name, row.query,
+                 row.taxonomy_level, row.scientific_name, common_name_string,
+                 row.common_name)
+             f.write(output_string)
+         else:
+             f.write('{}: <b><u>{}</u></b> unmapped</p>\n'.format(row.dataset_name,row.query))
+
+         if s is None or s not in names:
+             f.write('<p class="content_p">no images available</p>')
+         else:
+             image_paths = scientific_name_to_preferred_images[s]
+             basedir = os.path.dirname(html_output_file)
+             relative_paths = [os.path.relpath(p,basedir) for p in image_paths]
+             image_paths = [s.replace('\\','/') for s in relative_paths]
+             n_images = len(image_paths)
+             # image_paths = [os.path.relpath(p, output_base) for p in image_paths]
+             image_width_percent = round(100 / n_images)
+             f.write('<table class="image_table"><tr>\n')
+             for image_path in image_paths:
+                 f.write('<td style="vertical-align:top;" width="{}%">'
+                         '<img src="{}" style="display:block; width:100%; vertical-align:top; height:auto;">'
+                         '</td>\n'.format(image_width_percent, image_path))
+             f.write('</tr></table>\n')
+
+     # ...for each row
+
+     f.write('</body></html>\n')
+
+
+ #%% Open HTML preview
+
+ from megadetector.utils.path_utils import open_file
+ open_file(html_output_file)
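
A note for readers skimming the diff: the support functions at the top of preview_lila_taxonomy.py assume that each taxonomy_string in the CSV eval()s to a list whose first element is a 4-element record (id, taxonomic level, scientific name, list of common names). The standalone sketch below is not part of the package; the sample string and taxon id are hypothetical, and the record layout is inferred purely from the indexing in parse_taxonomy_string().

    import ast

    # Hypothetical example of the format parse_taxonomy_string() appears to expect:
    # a list of 4-element records (taxon_id, level, scientific_name, common_names)
    sample_taxonomy_string = \
        "[(42007, 'species', 'puma concolor', ['puma', 'mountain lion', 'cougar'])]"

    # The script uses eval(); ast.literal_eval is a safer equivalent for literals
    taxonomic_match = ast.literal_eval(sample_taxonomy_string)
    taxon_id, level, scientific_name, common_names = taxonomic_match[0]

    assert level == 'species'
    assert scientific_name == 'puma concolor'
    print('{} ({})'.format(scientific_name, common_names[0]))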
megadetector/taxonomy_mapping/retrieve_sample_image.py
@@ -0,0 +1,71 @@
+ """
+
+ retrieve_sample_image.py
+
+ Downloader that retrieves images from Google images, used for verifying taxonomy
+ lookups and looking for egregious mismappings (e.g., "snake" being mapped to a fish
+ called "snake").
+
+ Simple wrapper around simple_image_download, but I've had to swap in and out the
+ underlying downloader a few times.
+
+ """
+
+ #%% Imports and environment
+
+ import os
+
+ output_folder = os.path.expanduser('~/tmp/image-download-test')
+ os.makedirs(output_folder,exist_ok=True)
+
+ method = 'simple_image_download' # 'google_images_download'
+
+ if method == 'simple_image_download':
+
+     from megadetector.taxonomy_mapping import simple_image_download
+     google_image_downloader = simple_image_download.Downloader()
+     google_image_downloader.directory = output_folder
+
+ elif method == 'google_images_download':
+
+     from google_images_download import google_images_download
+
+ else:
+
+     raise ValueError('Unrecognized method {}'.format(method))
+
+
+ #%% Main entry point
+
+ def download_images(query,output_directory,limit=100,verbose=False):
+
+     query = query.replace(' ','+')
+
+     if method == 'simple_image_download':
+
+         google_image_downloader.directory = output_directory
+         paths = google_image_downloader.download(query, limit=limit,
+                                                  verbose=verbose, cache=False,
+                                                  download_cache=False)
+         return paths
+
+     elif method == 'google_images_download':
+
+         response = google_images_download.googleimagesdownload()
+         arguments = {'keywords':query,'limit':limit,'print_urls':verbose,
+                      'image-directory':output_directory}
+         response.download(arguments)
+         return None
+
+     else:
+
+         raise ValueError('Unrecognized method {}'.format(method))
+
+
+ #%% Test driver
+
+ if False:
+
+     #%%
+
+     paths = download_images(query='redunca',output_directory=output_folder,
+                             limit=20,verbose=True)
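
As a usage note (not part of the diff): with the default 'simple_image_download' method, download_images() returns the list of downloaded paths, while the 'google_images_download' path returns None, which is why preview_lila_taxonomy.py calls len(paths) on the result. A minimal hypothetical driver, assuming the megadetector package is installed and the output directory (an illustrative path, not from the source) is writable:

    import os
    from megadetector.taxonomy_mapping import retrieve_sample_image

    # Hypothetical output location; any writable directory works
    out_dir = os.path.expanduser('~/tmp/taxonomy-image-test')
    os.makedirs(out_dir, exist_ok=True)

    # Spaces in the query are converted to '+' internally
    paths = retrieve_sample_image.download_images(query='puma concolor',
                                                  output_directory=out_dir,
                                                  limit=5,
                                                  verbose=True)
    print('Downloaded {} images'.format(len(paths) if paths is not None else 0))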