megadetector 5.0.10__py3-none-any.whl → 5.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megadetector might be problematic. See the package registry page for more details.

Files changed (226)
  1. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/LICENSE +0 -0
  2. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/METADATA +12 -11
  3. megadetector-5.0.11.dist-info/RECORD +5 -0
  4. megadetector-5.0.11.dist-info/top_level.txt +1 -0
  5. api/__init__.py +0 -0
  6. api/batch_processing/__init__.py +0 -0
  7. api/batch_processing/api_core/__init__.py +0 -0
  8. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  9. api/batch_processing/api_core/batch_service/score.py +0 -439
  10. api/batch_processing/api_core/server.py +0 -294
  11. api/batch_processing/api_core/server_api_config.py +0 -98
  12. api/batch_processing/api_core/server_app_config.py +0 -55
  13. api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  14. api/batch_processing/api_core/server_job_status_table.py +0 -152
  15. api/batch_processing/api_core/server_orchestration.py +0 -360
  16. api/batch_processing/api_core/server_utils.py +0 -92
  17. api/batch_processing/api_core_support/__init__.py +0 -0
  18. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  19. api/batch_processing/api_support/__init__.py +0 -0
  20. api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  21. api/batch_processing/data_preparation/__init__.py +0 -0
  22. api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
  23. api/batch_processing/data_preparation/manage_video_batch.py +0 -327
  24. api/batch_processing/integration/digiKam/setup.py +0 -6
  25. api/batch_processing/integration/digiKam/xmp_integration.py +0 -465
  26. api/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -5
  27. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -126
  28. api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -55
  29. api/batch_processing/postprocessing/__init__.py +0 -0
  30. api/batch_processing/postprocessing/add_max_conf.py +0 -64
  31. api/batch_processing/postprocessing/categorize_detections_by_size.py +0 -163
  32. api/batch_processing/postprocessing/combine_api_outputs.py +0 -249
  33. api/batch_processing/postprocessing/compare_batch_results.py +0 -958
  34. api/batch_processing/postprocessing/convert_output_format.py +0 -397
  35. api/batch_processing/postprocessing/load_api_results.py +0 -195
  36. api/batch_processing/postprocessing/md_to_coco.py +0 -310
  37. api/batch_processing/postprocessing/md_to_labelme.py +0 -330
  38. api/batch_processing/postprocessing/merge_detections.py +0 -401
  39. api/batch_processing/postprocessing/postprocess_batch_results.py +0 -1904
  40. api/batch_processing/postprocessing/remap_detection_categories.py +0 -170
  41. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +0 -661
  42. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +0 -211
  43. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +0 -82
  44. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +0 -1631
  45. api/batch_processing/postprocessing/separate_detections_into_folders.py +0 -731
  46. api/batch_processing/postprocessing/subset_json_detector_output.py +0 -696
  47. api/batch_processing/postprocessing/top_folders_to_bottom.py +0 -223
  48. api/synchronous/__init__.py +0 -0
  49. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  50. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -152
  51. api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -266
  52. api/synchronous/api_core/animal_detection_api/config.py +0 -35
  53. api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
  54. api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
  55. api/synchronous/api_core/animal_detection_api/detection/detector_training/model_main_tf2.py +0 -114
  56. api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
  57. api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
  58. api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
  59. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
  60. api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
  61. api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
  62. api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
  63. api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
  64. api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
  65. api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
  66. api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
  67. api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
  68. api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
  69. api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
  70. api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
  71. api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
  72. api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
  73. api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
  74. api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
  75. api/synchronous/api_core/tests/__init__.py +0 -0
  76. api/synchronous/api_core/tests/load_test.py +0 -110
  77. classification/__init__.py +0 -0
  78. classification/aggregate_classifier_probs.py +0 -108
  79. classification/analyze_failed_images.py +0 -227
  80. classification/cache_batchapi_outputs.py +0 -198
  81. classification/create_classification_dataset.py +0 -627
  82. classification/crop_detections.py +0 -516
  83. classification/csv_to_json.py +0 -226
  84. classification/detect_and_crop.py +0 -855
  85. classification/efficientnet/__init__.py +0 -9
  86. classification/efficientnet/model.py +0 -415
  87. classification/efficientnet/utils.py +0 -610
  88. classification/evaluate_model.py +0 -520
  89. classification/identify_mislabeled_candidates.py +0 -152
  90. classification/json_to_azcopy_list.py +0 -63
  91. classification/json_validator.py +0 -695
  92. classification/map_classification_categories.py +0 -276
  93. classification/merge_classification_detection_output.py +0 -506
  94. classification/prepare_classification_script.py +0 -194
  95. classification/prepare_classification_script_mc.py +0 -228
  96. classification/run_classifier.py +0 -286
  97. classification/save_mislabeled.py +0 -110
  98. classification/train_classifier.py +0 -825
  99. classification/train_classifier_tf.py +0 -724
  100. classification/train_utils.py +0 -322
  101. data_management/__init__.py +0 -0
  102. data_management/annotations/__init__.py +0 -0
  103. data_management/annotations/annotation_constants.py +0 -34
  104. data_management/camtrap_dp_to_coco.py +0 -238
  105. data_management/cct_json_utils.py +0 -395
  106. data_management/cct_to_md.py +0 -176
  107. data_management/cct_to_wi.py +0 -289
  108. data_management/coco_to_labelme.py +0 -272
  109. data_management/coco_to_yolo.py +0 -662
  110. data_management/databases/__init__.py +0 -0
  111. data_management/databases/add_width_and_height_to_db.py +0 -33
  112. data_management/databases/combine_coco_camera_traps_files.py +0 -206
  113. data_management/databases/integrity_check_json_db.py +0 -477
  114. data_management/databases/subset_json_db.py +0 -115
  115. data_management/generate_crops_from_cct.py +0 -149
  116. data_management/get_image_sizes.py +0 -188
  117. data_management/importers/add_nacti_sizes.py +0 -52
  118. data_management/importers/add_timestamps_to_icct.py +0 -79
  119. data_management/importers/animl_results_to_md_results.py +0 -158
  120. data_management/importers/auckland_doc_test_to_json.py +0 -372
  121. data_management/importers/auckland_doc_to_json.py +0 -200
  122. data_management/importers/awc_to_json.py +0 -189
  123. data_management/importers/bellevue_to_json.py +0 -273
  124. data_management/importers/cacophony-thermal-importer.py +0 -796
  125. data_management/importers/carrizo_shrubfree_2018.py +0 -268
  126. data_management/importers/carrizo_trail_cam_2017.py +0 -287
  127. data_management/importers/cct_field_adjustments.py +0 -57
  128. data_management/importers/channel_islands_to_cct.py +0 -913
  129. data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  130. data_management/importers/eMammal/eMammal_helpers.py +0 -249
  131. data_management/importers/eMammal/make_eMammal_json.py +0 -223
  132. data_management/importers/ena24_to_json.py +0 -275
  133. data_management/importers/filenames_to_json.py +0 -385
  134. data_management/importers/helena_to_cct.py +0 -282
  135. data_management/importers/idaho-camera-traps.py +0 -1407
  136. data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  137. data_management/importers/jb_csv_to_json.py +0 -150
  138. data_management/importers/mcgill_to_json.py +0 -250
  139. data_management/importers/missouri_to_json.py +0 -489
  140. data_management/importers/nacti_fieldname_adjustments.py +0 -79
  141. data_management/importers/noaa_seals_2019.py +0 -181
  142. data_management/importers/pc_to_json.py +0 -365
  143. data_management/importers/plot_wni_giraffes.py +0 -123
  144. data_management/importers/prepare-noaa-fish-data-for-lila.py +0 -359
  145. data_management/importers/prepare_zsl_imerit.py +0 -131
  146. data_management/importers/rspb_to_json.py +0 -356
  147. data_management/importers/save_the_elephants_survey_A.py +0 -320
  148. data_management/importers/save_the_elephants_survey_B.py +0 -332
  149. data_management/importers/snapshot_safari_importer.py +0 -758
  150. data_management/importers/snapshot_safari_importer_reprise.py +0 -665
  151. data_management/importers/snapshot_serengeti_lila.py +0 -1067
  152. data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  153. data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  154. data_management/importers/sulross_get_exif.py +0 -65
  155. data_management/importers/timelapse_csv_set_to_json.py +0 -490
  156. data_management/importers/ubc_to_json.py +0 -399
  157. data_management/importers/umn_to_json.py +0 -507
  158. data_management/importers/wellington_to_json.py +0 -263
  159. data_management/importers/wi_to_json.py +0 -441
  160. data_management/importers/zamba_results_to_md_results.py +0 -181
  161. data_management/labelme_to_coco.py +0 -548
  162. data_management/labelme_to_yolo.py +0 -272
  163. data_management/lila/__init__.py +0 -0
  164. data_management/lila/add_locations_to_island_camera_traps.py +0 -97
  165. data_management/lila/add_locations_to_nacti.py +0 -147
  166. data_management/lila/create_lila_blank_set.py +0 -557
  167. data_management/lila/create_lila_test_set.py +0 -151
  168. data_management/lila/create_links_to_md_results_files.py +0 -106
  169. data_management/lila/download_lila_subset.py +0 -177
  170. data_management/lila/generate_lila_per_image_labels.py +0 -515
  171. data_management/lila/get_lila_annotation_counts.py +0 -170
  172. data_management/lila/get_lila_image_counts.py +0 -111
  173. data_management/lila/lila_common.py +0 -300
  174. data_management/lila/test_lila_metadata_urls.py +0 -132
  175. data_management/ocr_tools.py +0 -874
  176. data_management/read_exif.py +0 -681
  177. data_management/remap_coco_categories.py +0 -84
  178. data_management/remove_exif.py +0 -66
  179. data_management/resize_coco_dataset.py +0 -189
  180. data_management/wi_download_csv_to_coco.py +0 -246
  181. data_management/yolo_output_to_md_output.py +0 -441
  182. data_management/yolo_to_coco.py +0 -676
  183. detection/__init__.py +0 -0
  184. detection/detector_training/__init__.py +0 -0
  185. detection/detector_training/model_main_tf2.py +0 -114
  186. detection/process_video.py +0 -703
  187. detection/pytorch_detector.py +0 -337
  188. detection/run_detector.py +0 -779
  189. detection/run_detector_batch.py +0 -1219
  190. detection/run_inference_with_yolov5_val.py +0 -917
  191. detection/run_tiled_inference.py +0 -935
  192. detection/tf_detector.py +0 -188
  193. detection/video_utils.py +0 -606
  194. docs/source/conf.py +0 -43
  195. md_utils/__init__.py +0 -0
  196. md_utils/azure_utils.py +0 -174
  197. md_utils/ct_utils.py +0 -612
  198. md_utils/directory_listing.py +0 -246
  199. md_utils/md_tests.py +0 -968
  200. md_utils/path_utils.py +0 -1044
  201. md_utils/process_utils.py +0 -157
  202. md_utils/sas_blob_utils.py +0 -509
  203. md_utils/split_locations_into_train_val.py +0 -228
  204. md_utils/string_utils.py +0 -92
  205. md_utils/url_utils.py +0 -323
  206. md_utils/write_html_image_list.py +0 -225
  207. md_visualization/__init__.py +0 -0
  208. md_visualization/plot_utils.py +0 -293
  209. md_visualization/render_images_with_thumbnails.py +0 -275
  210. md_visualization/visualization_utils.py +0 -1537
  211. md_visualization/visualize_db.py +0 -551
  212. md_visualization/visualize_detector_output.py +0 -406
  213. megadetector-5.0.10.dist-info/RECORD +0 -224
  214. megadetector-5.0.10.dist-info/top_level.txt +0 -8
  215. taxonomy_mapping/__init__.py +0 -0
  216. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +0 -491
  217. taxonomy_mapping/map_new_lila_datasets.py +0 -154
  218. taxonomy_mapping/prepare_lila_taxonomy_release.py +0 -142
  219. taxonomy_mapping/preview_lila_taxonomy.py +0 -591
  220. taxonomy_mapping/retrieve_sample_image.py +0 -71
  221. taxonomy_mapping/simple_image_download.py +0 -218
  222. taxonomy_mapping/species_lookup.py +0 -834
  223. taxonomy_mapping/taxonomy_csv_checker.py +0 -159
  224. taxonomy_mapping/taxonomy_graph.py +0 -346
  225. taxonomy_mapping/validate_lila_category_mappings.py +0 -83
  226. {megadetector-5.0.10.dist-info → megadetector-5.0.11.dist-info}/WHEEL +0 -0
@@ -1,591 +0,0 @@
1
- """
2
-
3
- preview_lila_taxonomy.py
4
-
5
- Does some consistency-checking on the LILA taxonomy file, and generates
6
- an HTML preview page that we can use to determine whether the mappings
7
- make sense.
8
-
9
- """
10
-
11
- #%% Imports and constants
12
-
13
- from tqdm import tqdm
14
-
15
- import os
16
- import pandas as pd
17
-
18
- # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
19
- lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
20
-
21
- preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
22
- os.makedirs(preview_base,exist_ok=True)
23
- html_output_file = os.path.join(preview_base,'index.html')
24
-
25
-
26
- #%% Support functions
27
-
28
- def parse_taxonomy_string(taxonomy_string):
29
-
30
- taxonomic_match = eval(taxonomy_string)
31
- matched_entity = taxonomic_match[0]
32
- assert len(matched_entity) == 4
33
-
34
- level = matched_entity[1]
35
-
36
- scientific_name = matched_entity[2]
37
-
38
- common_names = matched_entity[3]
39
- if len(common_names) == 1:
40
- common_name = common_names[0]
41
- else:
42
- common_name = str(common_names)
43
-
44
- return scientific_name,common_name,level,taxonomic_match
45
-
46
- def taxonomy_string_to_common_name(taxonomy_string):
47
- _,cn,_,_ = parse_taxonomy_string(taxonomy_string)
48
- return cn
49
-
50
- def taxonomy_string_to_scientific(taxonomy_string):
51
- sn,_,_,_ = parse_taxonomy_string(taxonomy_string)
52
- return sn
53
-
54
- def taxonomy_string_to_level(taxonomy_string):
55
- _,_,level,_ = parse_taxonomy_string(taxonomy_string)
56
- return level
57
-
58
-
59
- #%% Read the taxonomy mapping file
60
-
61
- df = pd.read_csv(lila_taxonomy_file)
62
-
63
-
64
- #%% Prepare taxonomy lookup
65
-
66
- from taxonomy_mapping.species_lookup import (
67
- initialize_taxonomy_lookup,
68
- get_preferred_taxonomic_match)
69
-
70
- # from taxonomy_mapping.species_lookup import (
71
- # get_taxonomic_info, print_taxonomy_matche)
72
-
73
- initialize_taxonomy_lookup()
74
-
75
-
76
- #%% Optionally remap all gbif-based mappings to inat (or vice-versa)
77
-
78
- if False:
79
-
80
- #%%
81
-
82
- source_mappings = ['gbif','manual']
83
- target_mapping = 'inat'
84
- valid_mappings = ['gbif','inat','manual']
85
-
86
- assert target_mapping in valid_mappings
87
- for source_mapping in source_mappings:
88
- assert source_mapping != target_mapping and \
89
- source_mapping in valid_mappings
90
-
91
- n_remappings = 0
92
-
93
- # i_row = 1; row = df.iloc[i_row]; row
94
- for i_row,row in df.iterrows():
95
-
96
- if row['source'] not in source_mappings:
97
- continue
98
-
99
- scientific_name = row['scientific_name']
100
- old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
101
-
102
- m = get_preferred_taxonomic_match(scientific_name,target_mapping)
103
-
104
- if m is None or m.source != target_mapping:
105
- print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
106
- continue
107
-
108
- assert m.scientific_name == row['scientific_name']
109
-
110
- if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
111
- pass
112
- else:
113
- assert m.taxonomic_level == row['taxonomy_level']
114
-
115
- new_common = taxonomy_string_to_common_name(m.taxonomy_string)
116
-
117
- if row['taxonomy_string'] != m.taxonomy_string:
118
- print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
119
- n_remappings += 1
120
- df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
121
-
122
- if row['source'] != 'manual':
123
- df.loc[i_row,'source'] = m.source
124
-
125
- # This should be zero for the release .csv
126
- print('Made {} remappings'.format(n_remappings))
127
-
128
- #%%
129
-
130
- df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
131
-
132
-
133
- #%% Check for mappings that disagree with the taxonomy string
134
-
135
- df = pd.read_csv(lila_taxonomy_file)
136
-
137
- n_taxonomy_changes = 0
138
-
139
- # Look for internal inconsistency
140
- for i_row,row in df.iterrows():
141
-
142
- sn = row['scientific_name']
143
- if not isinstance(sn,str):
144
- continue
145
-
146
- ts = row['taxonomy_string']
147
- assert sn == taxonomy_string_to_scientific(ts)
148
-
149
- assert row['taxonomy_level'] == taxonomy_string_to_level(ts)
150
-
151
- # Look for outdated mappings
152
- taxonomy_preference = 'inat'
153
-
154
- # i_row = 0; row = df.iloc[i_row]
155
- for i_row,row in tqdm(df.iterrows(),total=len(df)):
156
-
157
- sn = row['scientific_name']
158
- if not isinstance(sn,str):
159
- continue
160
-
161
- m = get_preferred_taxonomic_match(sn,taxonomy_preference)
162
- assert m.scientific_name == sn
163
-
164
- ts = row['taxonomy_string']
165
- assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
166
- row['dataset_name'],ts,m.taxonomy_string)
167
-
168
- if ts != m.taxonomy_string:
169
- n_taxonomy_changes += 1
170
- df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
171
-
172
- print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))
173
-
174
- # Optionally re-write
175
- if False:
176
- df.to_csv(lila_taxonomy_file,header=True,index=False)
177
-
178
-
179
- #%% List null mappings
180
-
181
- # These should all be things like "empty", "unidentified", "fire", "car", etc.
182
-
183
- # i_row = 0; row = df.iloc[i_row]
184
- for i_row,row in df.iterrows():
185
- if (not isinstance(row['taxonomy_string'],str)) or (len(row['taxonomy_string']) == 0):
186
- print('No mapping for {}:{}'.format(row['dataset_name'],row['query']))
187
-
188
-
189
- #%% List mappings with scientific names but no common names
190
-
191
- for i_row,row in df.iterrows():
192
- cn = row['common_name']
193
- sn = row['scientific_name']
194
- ts = row['taxonomy_string']
195
- if (isinstance(ts,str)) and (len(ts) >= 0):
196
- if (not isinstance(cn,str)) or (len(cn) == 0):
197
- print('No mapping for {}:{}:{}'.format(row['dataset_name'],row['query'],row['scientific_name']))
198
-
199
-
200
- #%% List mappings that map to different things in different data sets
201
-
202
- import numpy as np
203
- def isnan(x):
204
- if not isinstance(x,float):
205
- return False
206
- return np.isnan(x)
207
-
208
- from collections import defaultdict
209
- query_to_rows = defaultdict(list)
210
-
211
- queries_with_multiple_mappings = set()
212
-
213
- n_suppressed = 0
214
-
215
- suppress_multiple_matches = [
216
- ['porcupine','Snapshot Camdeboo','Idaho Camera Traps'],
217
- ['porcupine','Snapshot Enonkishu','Idaho Camera Traps'],
218
- ['porcupine','Snapshot Karoo','Idaho Camera Traps'],
219
- ['porcupine','Snapshot Kgalagadi','Idaho Camera Traps'],
220
- ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
221
- ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
222
- ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
223
-
224
- ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
225
- ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
226
- ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
227
- ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
228
- ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
229
-
230
- ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
231
- ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
232
- ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
233
- ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
234
- ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
235
-
236
- ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
237
- ['kudu','Snapshot Serengeti','Snapshot Kruger'],
238
- ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
239
- ['kudu','Snapshot Serengeti','Snapshot Karoo'],
240
- ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
241
-
242
- ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
243
- ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
244
- ['fox','Idaho Camera Traps','Caltech Camera Traps'],
245
-
246
- ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
247
-
248
- ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
249
- ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
250
-
251
- ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
252
-
253
- ]
254
-
255
- for i_row,row in df.iterrows():
256
-
257
- query = row['query']
258
- taxonomy_string = row['taxonomy_string']
259
-
260
- for previous_i_row in query_to_rows[query]:
261
-
262
- previous_row = df.iloc[previous_i_row]
263
- assert previous_row['query'] == query
264
- query_match = False
265
- if isnan(row['taxonomy_string']):
266
- query_match = isnan(previous_row['taxonomy_string'])
267
- elif isnan(previous_row['taxonomy_string']):
268
- query_match = isnan(row['taxonomy_string'])
269
- else:
270
- query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
271
-
272
- if not query_match:
273
-
274
- suppress = False
275
-
276
- # x = suppress_multiple_matches[-1]
277
- for x in suppress_multiple_matches:
278
- if x[0] == query and \
279
- ( \
280
- (x[1] == row['dataset_name'] and x[2] == previous_row['dataset_name']) \
281
- or \
282
- (x[2] == row['dataset_name'] and x[1] == previous_row['dataset_name']) \
283
- ):
284
- suppress = True
285
- n_suppressed += 1
286
- break
287
-
288
- if not suppress:
289
- print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
290
- query, row['dataset_name'], previous_row['dataset_name'],
291
- taxonomy_string, previous_row['taxonomy_string']))
292
-
293
- queries_with_multiple_mappings.add(query)
294
-
295
- # ...for each row where we saw this query
296
-
297
- query_to_rows[query].append(i_row)
298
-
299
- # ...for each row
300
-
301
- print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
302
- len(queries_with_multiple_mappings),n_suppressed))
303
-
304
-
305
- #%% Verify that nothing "unidentified" maps to a species or subspecies
306
-
307
- # E.g., "unidentified skunk" should never map to a specific species of skunk
308
-
309
- allowable_unknown_species = [
310
- 'unknown_tayra' # AFAIK this is a unique species, I'm not sure what's implied here
311
- ]
312
-
313
- unk_queries = ['skunk']
314
- for i_row,row in df.iterrows():
315
-
316
- query = row['query']
317
- level = row['taxonomy_level']
318
-
319
- if not isinstance(level,str):
320
- assert not isinstance(row['taxonomy_string'],str)
321
- continue
322
-
323
- if ( \
324
- 'unidentified' in query or \
325
- ('unk' in query and ('skunk' not in query and 'chipmunk' not in query))\
326
- ) \
327
- and \
328
- ('species' in level):
329
-
330
- if query not in allowable_unknown_species:
331
-
332
- print('Warning: query {}:{} maps to {} {}'.format(
333
- row['dataset_name'],
334
- row['query'],
335
- row['taxonomy_level'],
336
- row['scientific_name']
337
- ))
338
-
339
-
340
- #%% Make sure there are valid source and level values for everything with a mapping
341
-
342
- for i_row,row in df.iterrows():
343
- if isinstance(row['scientific_name'],str):
344
- if 'source' in row:
345
- assert isinstance(row['source'],str)
346
- assert isinstance(row['taxonomy_level'],str)
347
-
348
-
349
- #%% Find WCS mappings that aren't species or aren't the same as the input
350
-
351
- # WCS used scientific names, so these remappings are slightly more controversial
352
- than the standard remappings.
353
-
354
- # row = df.iloc[-500]
355
- for i_row,row in df.iterrows():
356
-
357
- if not isinstance(row['scientific_name'],str):
358
- continue
359
- if 'WCS' not in row['dataset_name']:
360
- continue
361
-
362
- query = row['query']
363
- scientific_name = row['scientific_name']
364
- common_name = row['common_name']
365
- level = row['taxonomy_level']
366
- taxonomy_string = row['taxonomy_string']
367
-
368
- common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
369
- query_string = query.replace(' sp','')
370
- query_string = query_string.replace('unknown ','')
371
-
372
- # Anything marked "species" or "unknown" by definition doesn't map to a species,
373
- # so ignore these.
374
- if (' sp' not in query) and ('unknown' not in query) and \
375
- (level != 'species') and (level != 'subspecies'):
376
- print('WCS query {} ({}) remapped to {} {} ({})'.format(
377
- query,common_name,level,scientific_name,common_name_from_taxonomy))
378
-
379
- if query_string != scientific_name:
380
- pass
381
- # print('WCS query {} ({}) remapped to {} ({})'.format(
382
- # query,common_name,scientific_name,common_names_from_taxonomy))
383
-
384
-
385
- #%% Download sample images for all scientific names
386
-
387
- remapped_queries = {'papio':'papio+baboon',
388
- 'damaliscus lunatus jimela':'damaliscus lunatus',
389
- 'mazama':'genus+mazama',
390
- 'mirafra':'genus+mirafra'}
391
-
392
- import os
393
- from taxonomy_mapping import retrieve_sample_image
394
-
395
- scientific_name_to_paths = {}
396
- image_base = os.path.join(preview_base,'images')
397
- images_per_query = 15
398
- min_valid_images_per_query = 3
399
- min_valid_image_size = 3000
400
-
401
- # TODO: trivially parallelizable
402
- #
403
- # i_row = 0; row = df.iloc[i_row]
404
- for i_row,row in df.iterrows():
405
-
406
- s = row['scientific_name']
407
-
408
- if (not isinstance(s,str)) or (len(s)==0):
409
- continue
410
-
411
- query = s.replace(' ','+')
412
-
413
- if query in remapped_queries:
414
- query = remapped_queries[query]
415
-
416
- query_folder = os.path.join(image_base,query)
417
- os.makedirs(query_folder,exist_ok=True)
418
-
419
- # Check whether we already have enough images for this query
420
- image_files = os.listdir(query_folder)
421
- image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
422
- sizes = [os.path.getsize(p) for p in image_fullpaths]
423
- sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
424
- if len(sizes_above_threshold) > min_valid_images_per_query:
425
- print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
426
- continue
427
-
428
- # Check whether we've already run this query for a previous row
429
- if query in scientific_name_to_paths:
430
- continue
431
-
432
- print('Processing query {} of {} ({})'.format(i_row,len(df),query))
433
- paths = retrieve_sample_image.download_images(query=query,
434
- output_directory=image_base,
435
- limit=images_per_query,
436
- verbose=True)
437
- print('Downloaded {} images for {}'.format(len(paths),query))
438
- scientific_name_to_paths[query] = paths
439
-
440
- # ...for each row in the mapping table
441
-
442
-
443
- #%% Rename .jpeg to .jpg
444
-
445
- from md_utils import path_utils
446
- all_images = path_utils.recursive_file_list(image_base,False)
447
-
448
- for fn in tqdm(all_images):
449
- if fn.lower().endswith('.jpeg'):
450
- new_fn = fn[0:-5] + '.jpg'
451
- os.rename(fn, new_fn)
452
-
453
-
454
- #%% Choose representative images for each scientific name
455
-
456
- # Specifically, sort by size, and take the largest unique sizes. Very small files tend
457
- # to be bogus thumbnails, etc.
458
-
459
- max_images_per_query = 4
460
- scientific_name_to_preferred_images = {}
461
-
462
- # s = list(scientific_name_to_paths.keys())[0]
463
- for s in list(df.scientific_name):
464
-
465
- if not isinstance(s,str):
466
- continue
467
-
468
- query = s.replace(' ','+')
469
-
470
- if query in remapped_queries:
471
- query = remapped_queries[query]
472
-
473
- query_folder = os.path.join(image_base,query)
474
- assert os.path.isdir(query_folder)
475
- image_files = os.listdir(query_folder)
476
- image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
477
- sizes = [os.path.getsize(p) for p in image_fullpaths]
478
- path_to_size = {}
479
- for i_fp,fp in enumerate(image_fullpaths):
480
- path_to_size[fp] = sizes[i_fp]
481
- paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
482
-
483
- # Be suspicious of duplicate sizes
484
- b_duplicate_sizes = [False] * len(paths_by_size)
485
-
486
- for i_path,p in enumerate(paths_by_size):
487
- if i_path == len(paths_by_size) - 1:
488
- continue
489
- if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
490
- b_duplicate_sizes[i_path] = True
491
-
492
- paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
493
-
494
- preferred_paths = paths_by_size_non_dup[:max_images_per_query]
495
- scientific_name_to_preferred_images[s] = preferred_paths
496
-
497
- # ...for each scientific name
498
-
499
-
500
- #%% Delete unused images
501
-
502
- used_images = []
503
- for images in scientific_name_to_preferred_images.values():
504
- used_images.extend(images)
505
-
506
- print('Using a total of {} images'.format(len(used_images)))
507
- used_images_set = set(used_images)
508
-
509
- from md_utils import path_utils
510
- all_images = path_utils.recursive_file_list(image_base,False)
511
-
512
- unused_images = []
513
- for fn in all_images:
514
- if fn not in used_images_set:
515
- unused_images.append(fn)
516
-
517
- print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images),
518
- len(all_images) - len(unused_images)))
519
-
520
- for fn in tqdm(unused_images):
521
- os.remove(fn)
522
-
523
-
524
- #%% Produce HTML preview
525
-
526
- with open(html_output_file, 'w', encoding='utf-8') as f:
527
-
528
- f.write('<html><head></head><body>\n')
529
-
530
- names = scientific_name_to_preferred_images.keys()
531
- names = sorted(names)
532
-
533
- f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
534
- 'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name (taxonomic_common_name) (manual_common_name)</p>\n'
535
- '</p>')
536
-
537
- # i_row = 2; row = df.iloc[i_row]
538
- for i_row, row in tqdm(df.iterrows(), total=len(df)):
539
-
540
- s = row['scientific_name']
541
-
542
- taxonomy_string = row['taxonomy_string']
543
- if isinstance(taxonomy_string,str):
544
- taxonomic_match = eval(taxonomy_string)
545
- matched_entity = taxonomic_match[0]
546
- assert len(matched_entity) == 4
547
- common_names = matched_entity[3]
548
- if len(common_names) == 1:
549
- common_name_string = common_names[0]
550
- else:
551
- common_name_string = str(common_names)
552
- else:
553
- common_name_string = ''
554
-
555
- f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">')
556
-
557
- if isinstance(row.scientific_name,str):
558
- output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
559
- row.dataset_name, row.query,
560
- row.taxonomy_level, row.scientific_name, common_name_string,
561
- row.common_name)
562
- f.write(output_string)
563
- else:
564
- f.write('{}: <b><u>{}</u></b> unmapped'.format(row.dataset_name,row.query))
565
-
566
- if s is None or s not in names:
567
- f.write('<p class="content_p">no images available</p>')
568
- else:
569
- image_paths = scientific_name_to_preferred_images[s]
570
- basedir = os.path.dirname(html_output_file)
571
- relative_paths = [os.path.relpath(p,basedir) for p in image_paths]
572
- image_paths = [s.replace('\\','/') for s in relative_paths]
573
- n_images = len(image_paths)
574
- # image_paths = [os.path.relpath(p, output_base) for p in image_paths]
575
- image_width_percent = round(100 / n_images)
576
- f.write('<table class="image_table"><tr>\n')
577
- for image_path in image_paths:
578
- f.write('<td style="vertical-align:top;" width="{}%">'
579
- '<img src="{}" style="display:block; width:100%; vertical-align:top; height:auto;">'
580
- '</td>\n'.format(image_width_percent, image_path))
581
- f.write('</tr></table>\n')
582
-
583
- # ...for each row
584
-
585
- f.write('</body></html>\n')
586
-
587
-
588
- #%% Open HTML preview
589
-
590
- from md_utils.path_utils import open_file
591
- open_file(html_output_file)
@@ -1,71 +0,0 @@
"""

retrieve_sample_image.py

Downloader that retrieves images from Google images, used for verifying taxonomy
lookups and looking for egregious mismappings (e.g., "snake" being mapped to a fish called
"snake").

Simple wrapper around simple_image_download, but I've had to swap in and out the underlying
downloader a few times.

"""

#%% Imports and environment

import os

# All downloaded images land under this folder (per-query subfolders are
# created by the downloader itself).
output_folder = os.path.expanduser('~/tmp/image-download-test')
os.makedirs(output_folder, exist_ok=True)

# Which backend to use; the alternative is 'google_images_download'.
method = 'simple_image_download'

if method == 'simple_image_download':

    # Project-local copy of the simple_image_download package; a single
    # Downloader instance is shared by all calls to download_images().
    from taxonomy_mapping import simple_image_download
    google_image_downloader = simple_image_download.Downloader()
    google_image_downloader.directory = output_folder

elif method == 'google_images_download':

    # Third-party backend; instantiated lazily inside download_images().
    from google_images_download import google_images_download

else:

    raise ValueError('Unrecognized method {}'.format(method))
#%% Main entry point

def download_images(query,output_directory,limit=100,verbose=False):
    """
    Download up to [limit] images matching [query] into [output_directory],
    using whichever backend the module-level [method] variable selects.

    Args:
        query (str): search terms; spaces are converted to '+' before querying
        output_directory (str): folder into which images should be downloaded
        limit (int, optional): maximum number of images to retrieve
        verbose (bool, optional): enable per-image progress/URL printing

    Returns:
        list or None: downloaded file paths when the 'simple_image_download'
        backend is in use; None for the 'google_images_download' backend,
        which does not report paths.

    Raises:
        ValueError: if the module-level [method] is not a recognized backend
    """

    # Search queries are '+'-delimited rather than space-delimited
    query = query.replace(' ','+')

    if method == 'simple_image_download':

        # Re-point the shared downloader at the requested output folder
        google_image_downloader.directory = output_directory
        paths = google_image_downloader.download(query, limit=limit,
            verbose=verbose, cache=False, download_cache=False)
        return paths

    elif method == 'google_images_download':

        response = google_images_download.googleimagesdownload()
        # Bug fix: this previously passed 'image-directory', which is not a
        # recognized argument name (the library's argument names use
        # underscores), so downloads silently went to the default 'downloads'
        # folder; 'output_directory' is the documented argument that controls
        # the download location.
        arguments = {'keywords':query,'limit':limit,'print_urls':verbose,
                     'output_directory':output_directory}
        response.download(arguments)
        return None

    else:

        raise ValueError('Unrecognized method {}'.format(method))
#%% Test driver

# Interactive-only sanity check; never runs on import.
if False:

    #%%

    sample_paths = download_images(query='redunca',
                                   output_directory=output_folder,
                                   limit=20,
                                   verbose=True)