megadetector 5.0.8-py3-none-any.whl → 5.0.9-py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in their respective public registries.
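A per-file comparison like the one on this page can be reproduced locally with nothing but the Python standard library, since a wheel is just a zip archive. A minimal sketch, assuming local copies of both wheels; the wheel filenames and the member path are illustrative, not part of this page:

import difflib
import zipfile

OLD_WHL = 'megadetector-5.0.8-py3-none-any.whl'  # assumed local download
NEW_WHL = 'megadetector-5.0.9-py3-none-any.whl'  # assumed local download
MEMBER = 'taxonomy_mapping/preview_lila_taxonomy.py'  # any path from the file list below

def read_member_lines(wheel_path, member):
    # A wheel is a zip archive; read one member's text out of it.
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode('utf-8').splitlines(keepends=True)

diff = difflib.unified_diff(
    read_member_lines(OLD_WHL, MEMBER),
    read_member_lines(NEW_WHL, MEMBER),
    fromfile=OLD_WHL + '/' + MEMBER,
    tofile=NEW_WHL + '/' + MEMBER)
print(''.join(diff), end='')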

Files changed (190)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +65 -65
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
  20. api/batch_processing/postprocessing/compare_batch_results.py +113 -43
  21. api/batch_processing/postprocessing/convert_output_format.py +41 -16
  22. api/batch_processing/postprocessing/load_api_results.py +16 -17
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +52 -22
  25. api/batch_processing/postprocessing/merge_detections.py +14 -14
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
  27. api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +102 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -263
  71. data_management/coco_to_yolo.py +79 -58
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +62 -24
  76. data_management/databases/subset_json_db.py +24 -15
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -162
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -158
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +7 -7
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +65 -24
  120. data_management/labelme_to_yolo.py +8 -8
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +13 -13
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +44 -110
  128. data_management/lila/generate_lila_per_image_labels.py +55 -42
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +96 -33
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +110 -97
  135. data_management/remap_coco_categories.py +83 -83
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +30 -23
  138. data_management/wi_download_csv_to_coco.py +246 -239
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +300 -60
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +179 -113
  147. detection/run_inference_with_yolov5_val.py +108 -48
  148. detection/run_tiled_inference.py +111 -40
  149. detection/tf_detector.py +51 -29
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +228 -68
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -871
  157. md_utils/path_utils.py +460 -134
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +176 -60
  163. md_utils/write_html_image_list.py +40 -33
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +597 -291
  168. md_visualization/visualize_db.py +76 -48
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/METADATA +13 -7
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  173. taxonomy_mapping/__init__.py +0 -0
  174. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  175. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  176. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  177. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  178. taxonomy_mapping/retrieve_sample_image.py +12 -12
  179. taxonomy_mapping/simple_image_download.py +11 -11
  180. taxonomy_mapping/species_lookup.py +10 -10
  181. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  182. taxonomy_mapping/taxonomy_graph.py +47 -47
  183. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  184. data_management/cct_json_to_filename_json.py +0 -89
  185. data_management/cct_to_csv.py +0 -140
  186. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  187. detection/detector_training/copy_checkpoints.py +0 -43
  188. megadetector-5.0.8.dist-info/RECORD +0 -205
  189. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
  190. {megadetector-5.0.8.dist-info → megadetector-5.0.9.dist-info}/WHEEL +0 -0
@@ -1,591 +1,591 @@
- ########
- #
- # preview_lila_taxonomy.py
- #
- # Does some consistency-checking on the LILA taxonomy file, and generates
- # an HTML preview page that we can use to determine whether the mappings
- # make sense.
- #
- ########
-
- #%% Imports and constants
-
- from tqdm import tqdm
-
- import os
- import pandas as pd
-
- # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
- lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
-
- preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
- os.makedirs(preview_base,exist_ok=True)
- html_output_file = os.path.join(preview_base,'index.html')
-
-
- #%% Support functions
-
- def parse_taxonomy_string(taxonomy_string):
-
-     taxonomic_match = eval(taxonomy_string)
-     matched_entity = taxonomic_match[0]
-     assert len(matched_entity) == 4
-
-     level = matched_entity[1]
-
-     scientific_name = matched_entity[2]
-
-     common_names = matched_entity[3]
-     if len(common_names) == 1:
-         common_name = common_names[0]
-     else:
-         common_name = str(common_names)
-
-     return scientific_name,common_name,level,taxonomic_match
-
- def taxonomy_string_to_common_name(taxonomy_string):
-     _,cn,_,_ = parse_taxonomy_string(taxonomy_string)
-     return cn
-
- def taxonomy_string_to_scientific(taxonomy_string):
-     sn,_,_,_ = parse_taxonomy_string(taxonomy_string)
-     return sn
-
- def taxonomy_string_to_level(taxonomy_string):
-     _,_,level,_ = parse_taxonomy_string(taxonomy_string)
-     return level
-
-
- #%% Read the taxonomy mapping file
-
- df = pd.read_csv(lila_taxonomy_file)
-
-
- #%% Prepare taxonomy lookup
-
- from taxonomy_mapping.species_lookup import (
-     initialize_taxonomy_lookup,
-     get_preferred_taxonomic_match)
-
- # from taxonomy_mapping.species_lookup import (
- #     get_taxonomic_info, print_taxonomy_matche)
-
- initialize_taxonomy_lookup()
-
-
- #%% Optionally remap all gbif-based mappings to inat (or vice-versa)
-
- if False:
-
-     #%%
-
-     source_mappings = ['gbif','manual']
-     target_mapping = 'inat'
-     valid_mappings = ['gbif','inat','manual']
-
-     assert target_mapping in valid_mappings
-     for source_mapping in source_mappings:
-         assert source_mapping != target_mapping and \
-             source_mapping in valid_mappings
-
-     n_remappings = 0
-
-     # i_row = 1; row = df.iloc[i_row]; row
-     for i_row,row in df.iterrows():
-
-         if row['source'] not in source_mappings:
-             continue
-
-         scientific_name = row['scientific_name']
-         old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
-
-         m = get_preferred_taxonomic_match(scientific_name,target_mapping)
-
-         if m is None or m.source != target_mapping:
-             print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
-             continue
-
-         assert m.scientific_name == row['scientific_name']
-
-         if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
-             pass
-         else:
-             assert m.taxonomic_level == row['taxonomy_level']
-
-         new_common = taxonomy_string_to_common_name(m.taxonomy_string)
-
-         if row['taxonomy_string'] != m.taxonomy_string:
-             print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
-             n_remappings += 1
-             df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
-
-             if row['source'] != 'manual':
-                 df.loc[i_row,'source'] = m.source
-
-     # This should be zero for the release .csv
-     print('Made {} remappings'.format(n_remappings))
-
-     #%%
-
-     df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
-
-
- #%% Check for mappings that disagree with the taxonomy string
-
- df = pd.read_csv(lila_taxonomy_file)
-
- n_taxonomy_changes = 0
-
- # Look for internal inconsistency
- for i_row,row in df.iterrows():
-
-     sn = row['scientific_name']
-     if not isinstance(sn,str):
-         continue
-
-     ts = row['taxonomy_string']
-     assert sn == taxonomy_string_to_scientific(ts)
-
-     assert row['taxonomy_level'] == taxonomy_string_to_level(ts)
-
- # Look for outdated mappings
- taxonomy_preference = 'inat'
-
- # i_row = 0; row = df.iloc[i_row]
- for i_row,row in tqdm(df.iterrows(),total=len(df)):
-
-     sn = row['scientific_name']
-     if not isinstance(sn,str):
-         continue
-
-     m = get_preferred_taxonomic_match(sn,taxonomy_preference)
-     assert m.scientific_name == sn
-
-     ts = row['taxonomy_string']
-     assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
-         row['dataset_name'],ts,m.taxonomy_string)
-
-     if ts != m.taxonomy_string:
-         n_taxonomy_changes += 1
-         df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
-
- print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))
-
- # Optionally re-write
- if False:
-     df.to_csv(lila_taxonomy_file,header=True,index=False)
-
-
- #%% List null mappings
-
- # These should all be things like "empty", "unidentified", "fire", "car", etc.
-
- # i_row = 0; row = df.iloc[i_row]
- for i_row,row in df.iterrows():
-     if (not isinstance(row['taxonomy_string'],str)) or (len(row['taxonomy_string']) == 0):
-         print('No mapping for {}:{}'.format(row['dataset_name'],row['query']))
-
-
- #%% List mappings with scientific names but no common names
-
- for i_row,row in df.iterrows():
-     cn = row['common_name']
-     sn = row['scientific_name']
-     ts = row['taxonomy_string']
-     if (isinstance(ts,str)) and (len(ts) >= 0):
-         if (not isinstance(cn,str)) or (len(cn) == 0):
-             print('No mapping for {}:{}:{}'.format(row['dataset_name'],row['query'],row['scientific_name']))
-
-
- #%% List mappings that map to different things in different data sets
-
- import numpy as np
- def isnan(x):
-     if not isinstance(x,float):
-         return False
-     return np.isnan(x)
-
- from collections import defaultdict
- query_to_rows = defaultdict(list)
-
- queries_with_multiple_mappings = set()
-
- n_suppressed = 0
-
- suppress_multiple_matches = [
-     ['porcupine','Snapshot Camdeboo','Idaho Camera Traps'],
-     ['porcupine','Snapshot Enonkishu','Idaho Camera Traps'],
-     ['porcupine','Snapshot Karoo','Idaho Camera Traps'],
-     ['porcupine','Snapshot Kgalagadi','Idaho Camera Traps'],
-     ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
-     ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
-     ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
-
-     ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
-     ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
-     ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
-     ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
-     ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
-
-     ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
-     ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
-     ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
-     ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
-     ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
-
-     ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
-     ['kudu','Snapshot Serengeti','Snapshot Kruger'],
-     ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
-     ['kudu','Snapshot Serengeti','Snapshot Karoo'],
-     ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
-
-     ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
-     ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
-     ['fox','Idaho Camera Traps','Caltech Camera Traps'],
-
-     ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
-
-     ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
-     ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
-
-     ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
-
- ]
-
- for i_row,row in df.iterrows():
-
-     query = row['query']
-     taxonomy_string = row['taxonomy_string']
-
-     for previous_i_row in query_to_rows[query]:
-
-         previous_row = df.iloc[previous_i_row]
-         assert previous_row['query'] == query
-         query_match = False
-         if isnan(row['taxonomy_string']):
-             query_match = isnan(previous_row['taxonomy_string'])
-         elif isnan(previous_row['taxonomy_string']):
-             query_match = isnan(row['taxonomy_string'])
-         else:
-             query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
-
-         if not query_match:
-
-             suppress = False
-
-             # x = suppress_multiple_matches[-1]
-             for x in suppress_multiple_matches:
-                 if x[0] == query and \
-                     ( \
-                     (x[1] == row['dataset_name'] and x[2] == previous_row['dataset_name']) \
-                     or \
-                     (x[2] == row['dataset_name'] and x[1] == previous_row['dataset_name']) \
-                     ):
-                     suppress = True
-                     n_suppressed += 1
-                     break
-
-             if not suppress:
-                 print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
-                     query, row['dataset_name'], previous_row['dataset_name'],
-                     taxonomy_string, previous_row['taxonomy_string']))
-
-                 queries_with_multiple_mappings.add(query)
-
-     # ...for each row where we saw this query
-
-     query_to_rows[query].append(i_row)
-
- # ...for each row
-
- print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
-     len(queries_with_multiple_mappings),n_suppressed))
-
-
- #%% Verify that nothing "unidentified" maps to a species or subspecies
-
- # E.g., "unidentified skunk" should never map to a specific species of skunk
-
- allowable_unknown_species = [
-     'unknown_tayra' # AFAIK this is a unique species, I'm not sure what's implied here
- ]
-
- unk_queries = ['skunk']
- for i_row,row in df.iterrows():
-
-     query = row['query']
-     level = row['taxonomy_level']
-
-     if not isinstance(level,str):
-         assert not isinstance(row['taxonomy_string'],str)
-         continue
-
-     if ( \
-         'unidentified' in query or \
-         ('unk' in query and ('skunk' not in query and 'chipmunk' not in query))\
-         ) \
-         and \
-         ('species' in level):
-
-         if query not in allowable_unknown_species:
-
-             print('Warning: query {}:{} maps to {} {}'.format(
-                 row['dataset_name'],
-                 row['query'],
-                 row['taxonomy_level'],
-                 row['scientific_name']
-                 ))
-
-
- #%% Make sure there are valid source and level values for everything with a mapping
-
- for i_row,row in df.iterrows():
-     if isinstance(row['scientific_name'],str):
-         if 'source' in row:
-             assert isinstance(row['source'],str)
-         assert isinstance(row['taxonomy_level'],str)
-
-
- #%% Find WCS mappings that aren't species or aren't the same as the input
-
- # WCS used scientific names, so these remappings are slightly more controversial
- # than the standard remappings.
-
- # row = df.iloc[-500]
- for i_row,row in df.iterrows():
-
-     if not isinstance(row['scientific_name'],str):
-         continue
-     if 'WCS' not in row['dataset_name']:
-         continue
-
-     query = row['query']
-     scientific_name = row['scientific_name']
-     common_name = row['common_name']
-     level = row['taxonomy_level']
-     taxonomy_string = row['taxonomy_string']
-
-     common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
-     query_string = query.replace(' sp','')
-     query_string = query_string.replace('unknown ','')
-
-     # Anything marked "species" or "unknown" by definition doesn't map to a species,
-     # so ignore these.
-     if (' sp' not in query) and ('unknown' not in query) and \
-         (level != 'species') and (level != 'subspecies'):
-         print('WCS query {} ({}) remapped to {} {} ({})'.format(
-             query,common_name,level,scientific_name,common_name_from_taxonomy))
-
-     if query_string != scientific_name:
-         pass
-         # print('WCS query {} ({}) remapped to {} ({})'.format(
-         #     query,common_name,scientific_name,common_names_from_taxonomy))
-
-
- #%% Download sample images for all scientific names
-
- remapped_queries = {'papio':'papio+baboon',
-                     'damaliscus lunatus jimela':'damaliscus lunatus',
-                     'mazama':'genus+mazama',
-                     'mirafra':'genus+mirafra'}
-
- import os
- from taxonomy_mapping import retrieve_sample_image
-
- scientific_name_to_paths = {}
- image_base = os.path.join(preview_base,'images')
- images_per_query = 15
- min_valid_images_per_query = 3
- min_valid_image_size = 3000
-
- # TODO: trivially parallelizable
- #
- # i_row = 0; row = df.iloc[i_row]
- for i_row,row in df.iterrows():
-
-     s = row['scientific_name']
-
-     if (not isinstance(s,str)) or (len(s)==0):
-         continue
-
-     query = s.replace(' ','+')
-
-     if query in remapped_queries:
-         query = remapped_queries[query]
-
-     query_folder = os.path.join(image_base,query)
-     os.makedirs(query_folder,exist_ok=True)
-
-     # Check whether we already have enough images for this query
-     image_files = os.listdir(query_folder)
-     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
-     sizes = [os.path.getsize(p) for p in image_fullpaths]
-     sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
-     if len(sizes_above_threshold) > min_valid_images_per_query:
-         print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
-         continue
-
-     # Check whether we've already run this query for a previous row
-     if query in scientific_name_to_paths:
-         continue
-
-     print('Processing query {} of {} ({})'.format(i_row,len(df),query))
-     paths = retrieve_sample_image.download_images(query=query,
-         output_directory=image_base,
-         limit=images_per_query,
-         verbose=True)
-     print('Downloaded {} images for {}'.format(len(paths),query))
-     scientific_name_to_paths[query] = paths
-
- # ...for each row in the mapping table
-
-
- #%% Rename .jpeg to .jpg
-
- from md_utils import path_utils
- all_images = path_utils.recursive_file_list(image_base,False)
-
- for fn in tqdm(all_images):
-     if fn.lower().endswith('.jpeg'):
-         new_fn = fn[0:-5] + '.jpg'
-         os.rename(fn, new_fn)
-
-
- #%% Choose representative images for each scientific name
-
- # Specifically, sort by size, and take the largest unique sizes. Very small files tend
- # to be bogus thumbnails, etc.
-
- max_images_per_query = 4
- scientific_name_to_preferred_images = {}
-
- # s = list(scientific_name_to_paths.keys())[0]
- for s in list(df.scientific_name):
-
-     if not isinstance(s,str):
-         continue
-
-     query = s.replace(' ','+')
-
-     if query in remapped_queries:
-         query = remapped_queries[query]
-
-     query_folder = os.path.join(image_base,query)
-     assert os.path.isdir(query_folder)
-     image_files = os.listdir(query_folder)
-     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
-     sizes = [os.path.getsize(p) for p in image_fullpaths]
-     path_to_size = {}
-     for i_fp,fp in enumerate(image_fullpaths):
-         path_to_size[fp] = sizes[i_fp]
-     paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
-
-     # Be suspicious of duplicate sizes
-     b_duplicate_sizes = [False] * len(paths_by_size)
-
-     for i_path,p in enumerate(paths_by_size):
-         if i_path == len(paths_by_size) - 1:
-             continue
-         if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
-             b_duplicate_sizes[i_path] = True
-
-     paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
-
-     preferred_paths = paths_by_size_non_dup[:max_images_per_query]
-     scientific_name_to_preferred_images[s] = preferred_paths
-
- # ...for each scientific name
-
-
- #%% Delete unused images
-
- used_images = []
- for images in scientific_name_to_preferred_images.values():
-     used_images.extend(images)
-
- print('Using a total of {} images'.format(len(used_images)))
- used_images_set = set(used_images)
-
- from md_utils import path_utils
- all_images = path_utils.recursive_file_list(image_base,False)
-
- unused_images = []
- for fn in all_images:
-     if fn not in used_images_set:
-         unused_images.append(fn)
-
- print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images),
-     len(all_images) - len(unused_images)))
-
- for fn in tqdm(unused_images):
-     os.remove(fn)
-
-
- #%% Produce HTML preview
-
- with open(html_output_file, 'w', encoding='utf-8') as f:
-
-     f.write('<html><head></head><body>\n')
-
-     names = scientific_name_to_preferred_images.keys()
-     names = sorted(names)
-
-     f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
-             'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name (taxonomic_common_name) (manual_common_name)</p>\n'
-             '</p>')
-
-     # i_row = 2; row = df.iloc[i_row]
-     for i_row, row in tqdm(df.iterrows(), total=len(df)):
-
-         s = row['scientific_name']
-
-         taxonomy_string = row['taxonomy_string']
-         if isinstance(taxonomy_string,str):
-             taxonomic_match = eval(taxonomy_string)
-             matched_entity = taxonomic_match[0]
-             assert len(matched_entity) == 4
-             common_names = matched_entity[3]
-             if len(common_names) == 1:
-                 common_name_string = common_names[0]
-             else:
-                 common_name_string = str(common_names)
-         else:
-             common_name_string = ''
-
-         f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">')
-
-         if isinstance(row.scientific_name,str):
-             output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
-                 row.dataset_name, row.query,
-                 row.taxonomy_level, row.scientific_name, common_name_string,
-                 row.common_name)
-             f.write(output_string)
-         else:
-             f.write('{}: <b><u>{}</u></b> unmapped'.format(row.dataset_name,row.query))
-
-         if s is None or s not in names:
-             f.write('<p class="content_p">no images available</p>')
-         else:
-             image_paths = scientific_name_to_preferred_images[s]
-             basedir = os.path.dirname(html_output_file)
-             relative_paths = [os.path.relpath(p,basedir) for p in image_paths]
-             image_paths = [s.replace('\\','/') for s in relative_paths]
-             n_images = len(image_paths)
-             # image_paths = [os.path.relpath(p, output_base) for p in image_paths]
-             image_width_percent = round(100 / n_images)
-             f.write('<table class="image_table"><tr>\n')
-             for image_path in image_paths:
-                 f.write('<td style="vertical-align:top;" width="{}%">'
-                         '<img src="{}" style="display:block; width:100%; vertical-align:top; height:auto;">'
-                         '</td>\n'.format(image_width_percent, image_path))
-             f.write('</tr></table>\n')
-
-     # ...for each row
-
-     f.write('</body></html>\n')
-
-
- #%% Open HTML preview
-
- from md_utils.path_utils import open_file
- open_file(html_output_file)
+ """
+
+ preview_lila_taxonomy.py
+
+ Does some consistency-checking on the LILA taxonomy file, and generates
+ an HTML preview page that we can use to determine whether the mappings
+ make sense.
+
+ """
+
+ #%% Imports and constants
+
+ from tqdm import tqdm
+
+ import os
+ import pandas as pd
+
+ # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
+ lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
+
+ preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
+ os.makedirs(preview_base,exist_ok=True)
+ html_output_file = os.path.join(preview_base,'index.html')
+
+
+ #%% Support functions
+
+ def parse_taxonomy_string(taxonomy_string):
+
+     taxonomic_match = eval(taxonomy_string)
+     matched_entity = taxonomic_match[0]
+     assert len(matched_entity) == 4
+
+     level = matched_entity[1]
+
+     scientific_name = matched_entity[2]
+
+     common_names = matched_entity[3]
+     if len(common_names) == 1:
+         common_name = common_names[0]
+     else:
+         common_name = str(common_names)
+
+     return scientific_name,common_name,level,taxonomic_match
+
+ def taxonomy_string_to_common_name(taxonomy_string):
+     _,cn,_,_ = parse_taxonomy_string(taxonomy_string)
+     return cn
+
+ def taxonomy_string_to_scientific(taxonomy_string):
+     sn,_,_,_ = parse_taxonomy_string(taxonomy_string)
+     return sn
+
+ def taxonomy_string_to_level(taxonomy_string):
+     _,_,level,_ = parse_taxonomy_string(taxonomy_string)
+     return level
+
+
+ #%% Read the taxonomy mapping file
+
+ df = pd.read_csv(lila_taxonomy_file)
+
+
+ #%% Prepare taxonomy lookup
+
+ from taxonomy_mapping.species_lookup import (
+     initialize_taxonomy_lookup,
+     get_preferred_taxonomic_match)
+
+ # from taxonomy_mapping.species_lookup import (
+ #     get_taxonomic_info, print_taxonomy_matche)
+
+ initialize_taxonomy_lookup()
+
+
+ #%% Optionally remap all gbif-based mappings to inat (or vice-versa)
+
+ if False:
+
+     #%%
+
+     source_mappings = ['gbif','manual']
+     target_mapping = 'inat'
+     valid_mappings = ['gbif','inat','manual']
+
+     assert target_mapping in valid_mappings
+     for source_mapping in source_mappings:
+         assert source_mapping != target_mapping and \
+             source_mapping in valid_mappings
+
+     n_remappings = 0
+
+     # i_row = 1; row = df.iloc[i_row]; row
+     for i_row,row in df.iterrows():
+
+         if row['source'] not in source_mappings:
+             continue
+
+         scientific_name = row['scientific_name']
+         old_common = taxonomy_string_to_common_name(row['taxonomy_string'])
+
+         m = get_preferred_taxonomic_match(scientific_name,target_mapping)
+
+         if m is None or m.source != target_mapping:
+             print('No mapping for {} ({}) ({})'.format(scientific_name,row['query'],old_common))
+             continue
+
+         assert m.scientific_name == row['scientific_name']
+
+         if m.taxonomic_level == 'variety' and row['taxonomy_level'] == 'subspecies':
+             pass
+         else:
+             assert m.taxonomic_level == row['taxonomy_level']
+
+         new_common = taxonomy_string_to_common_name(m.taxonomy_string)
+
+         if row['taxonomy_string'] != m.taxonomy_string:
+             print('Remapping {} ({} to {})'.format(scientific_name, old_common, new_common))
+             n_remappings += 1
+             df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
+
+             if row['source'] != 'manual':
+                 df.loc[i_row,'source'] = m.source
+
+     # This should be zero for the release .csv
+     print('Made {} remappings'.format(n_remappings))
+
+     #%%
+
+     df.to_csv(lila_taxonomy_file.replace('.csv','_remapped.csv'),header=True,index=False)
+
+
+ #%% Check for mappings that disagree with the taxonomy string
+
+ df = pd.read_csv(lila_taxonomy_file)
+
+ n_taxonomy_changes = 0
+
+ # Look for internal inconsistency
+ for i_row,row in df.iterrows():
+
+     sn = row['scientific_name']
+     if not isinstance(sn,str):
+         continue
+
+     ts = row['taxonomy_string']
+     assert sn == taxonomy_string_to_scientific(ts)
+
+     assert row['taxonomy_level'] == taxonomy_string_to_level(ts)
+
+ # Look for outdated mappings
+ taxonomy_preference = 'inat'
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+     sn = row['scientific_name']
+     if not isinstance(sn,str):
+         continue
+
+     m = get_preferred_taxonomic_match(sn,taxonomy_preference)
+     assert m.scientific_name == sn
+
+     ts = row['taxonomy_string']
+     assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
+         row['dataset_name'],ts,m.taxonomy_string)
+
+     if ts != m.taxonomy_string:
+         n_taxonomy_changes += 1
+         df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
+
+ print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))
+
+ # Optionally re-write
+ if False:
+     df.to_csv(lila_taxonomy_file,header=True,index=False)
+
+
+ #%% List null mappings
+
+ # These should all be things like "empty", "unidentified", "fire", "car", etc.
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+     if (not isinstance(row['taxonomy_string'],str)) or (len(row['taxonomy_string']) == 0):
+         print('No mapping for {}:{}'.format(row['dataset_name'],row['query']))
+
+
+ #%% List mappings with scientific names but no common names
+
+ for i_row,row in df.iterrows():
+     cn = row['common_name']
+     sn = row['scientific_name']
+     ts = row['taxonomy_string']
+     if (isinstance(ts,str)) and (len(ts) >= 0):
+         if (not isinstance(cn,str)) or (len(cn) == 0):
+             print('No mapping for {}:{}:{}'.format(row['dataset_name'],row['query'],row['scientific_name']))
+
+
+ #%% List mappings that map to different things in different data sets
+
+ import numpy as np
+ def isnan(x):
+     if not isinstance(x,float):
+         return False
+     return np.isnan(x)
+
+ from collections import defaultdict
+ query_to_rows = defaultdict(list)
+
+ queries_with_multiple_mappings = set()
+
+ n_suppressed = 0
+
+ suppress_multiple_matches = [
+     ['porcupine','Snapshot Camdeboo','Idaho Camera Traps'],
+     ['porcupine','Snapshot Enonkishu','Idaho Camera Traps'],
+     ['porcupine','Snapshot Karoo','Idaho Camera Traps'],
+     ['porcupine','Snapshot Kgalagadi','Idaho Camera Traps'],
+     ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
+     ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
+     ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
+
+     ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
+     ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
+     ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
+     ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
+     ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
+
+     ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
+
+     ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
+     ['kudu','Snapshot Serengeti','Snapshot Kruger'],
+     ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
+     ['kudu','Snapshot Serengeti','Snapshot Karoo'],
+     ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
+
+     ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
+     ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
+     ['fox','Idaho Camera Traps','Caltech Camera Traps'],
+
+     ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
+
+     ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
+     ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
+
+     ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
+
+ ]
+
+ for i_row,row in df.iterrows():
+
+     query = row['query']
+     taxonomy_string = row['taxonomy_string']
+
+     for previous_i_row in query_to_rows[query]:
+
+         previous_row = df.iloc[previous_i_row]
+         assert previous_row['query'] == query
+         query_match = False
+         if isnan(row['taxonomy_string']):
+             query_match = isnan(previous_row['taxonomy_string'])
+         elif isnan(previous_row['taxonomy_string']):
+             query_match = isnan(row['taxonomy_string'])
+         else:
+             query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
+
+         if not query_match:
+
+             suppress = False
+
+             # x = suppress_multiple_matches[-1]
+             for x in suppress_multiple_matches:
+                 if x[0] == query and \
+                     ( \
+                     (x[1] == row['dataset_name'] and x[2] == previous_row['dataset_name']) \
+                     or \
+                     (x[2] == row['dataset_name'] and x[1] == previous_row['dataset_name']) \
+                     ):
+                     suppress = True
+                     n_suppressed += 1
+                     break
+
+             if not suppress:
+                 print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
+                     query, row['dataset_name'], previous_row['dataset_name'],
+                     taxonomy_string, previous_row['taxonomy_string']))
+
+                 queries_with_multiple_mappings.add(query)
+
+     # ...for each row where we saw this query
+
+     query_to_rows[query].append(i_row)
+
+ # ...for each row
+
+ print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
+     len(queries_with_multiple_mappings),n_suppressed))
+
+
+ #%% Verify that nothing "unidentified" maps to a species or subspecies
+
+ # E.g., "unidentified skunk" should never map to a specific species of skunk
+
+ allowable_unknown_species = [
+     'unknown_tayra' # AFAIK this is a unique species, I'm not sure what's implied here
+ ]
+
+ unk_queries = ['skunk']
+ for i_row,row in df.iterrows():
+
+     query = row['query']
+     level = row['taxonomy_level']
+
+     if not isinstance(level,str):
+         assert not isinstance(row['taxonomy_string'],str)
+         continue
+
+     if ( \
+         'unidentified' in query or \
+         ('unk' in query and ('skunk' not in query and 'chipmunk' not in query))\
+         ) \
+         and \
+         ('species' in level):
+
+         if query not in allowable_unknown_species:
+
+             print('Warning: query {}:{} maps to {} {}'.format(
+                 row['dataset_name'],
+                 row['query'],
+                 row['taxonomy_level'],
+                 row['scientific_name']
+                 ))
+
+
+ #%% Make sure there are valid source and level values for everything with a mapping
+
+ for i_row,row in df.iterrows():
+     if isinstance(row['scientific_name'],str):
+         if 'source' in row:
+             assert isinstance(row['source'],str)
+         assert isinstance(row['taxonomy_level'],str)
+
+
+ #%% Find WCS mappings that aren't species or aren't the same as the input
+
+ # WCS used scientific names, so these remappings are slightly more controversial
+ # than the standard remappings.
+
+ # row = df.iloc[-500]
+ for i_row,row in df.iterrows():
+
+     if not isinstance(row['scientific_name'],str):
+         continue
+     if 'WCS' not in row['dataset_name']:
+         continue
+
+     query = row['query']
+     scientific_name = row['scientific_name']
+     common_name = row['common_name']
+     level = row['taxonomy_level']
+     taxonomy_string = row['taxonomy_string']
+
+     common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
+     query_string = query.replace(' sp','')
+     query_string = query_string.replace('unknown ','')
+
+     # Anything marked "species" or "unknown" by definition doesn't map to a species,
+     # so ignore these.
+     if (' sp' not in query) and ('unknown' not in query) and \
+         (level != 'species') and (level != 'subspecies'):
+         print('WCS query {} ({}) remapped to {} {} ({})'.format(
+             query,common_name,level,scientific_name,common_name_from_taxonomy))
+
+     if query_string != scientific_name:
+         pass
+         # print('WCS query {} ({}) remapped to {} ({})'.format(
+         #     query,common_name,scientific_name,common_names_from_taxonomy))
+
+
+ #%% Download sample images for all scientific names
+
+ remapped_queries = {'papio':'papio+baboon',
+                     'damaliscus lunatus jimela':'damaliscus lunatus',
+                     'mazama':'genus+mazama',
+                     'mirafra':'genus+mirafra'}
+
+ import os
+ from taxonomy_mapping import retrieve_sample_image
+
+ scientific_name_to_paths = {}
+ image_base = os.path.join(preview_base,'images')
+ images_per_query = 15
+ min_valid_images_per_query = 3
+ min_valid_image_size = 3000
+
+ # TODO: trivially parallelizable
+ #
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+
+     s = row['scientific_name']
+
+     if (not isinstance(s,str)) or (len(s)==0):
+         continue
+
+     query = s.replace(' ','+')
+
+     if query in remapped_queries:
+         query = remapped_queries[query]
+
+     query_folder = os.path.join(image_base,query)
+     os.makedirs(query_folder,exist_ok=True)
+
+     # Check whether we already have enough images for this query
+     image_files = os.listdir(query_folder)
+     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+     sizes = [os.path.getsize(p) for p in image_fullpaths]
+     sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
+     if len(sizes_above_threshold) > min_valid_images_per_query:
+         print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
+         continue
+
+     # Check whether we've already run this query for a previous row
+     if query in scientific_name_to_paths:
+         continue
+
+     print('Processing query {} of {} ({})'.format(i_row,len(df),query))
+     paths = retrieve_sample_image.download_images(query=query,
+         output_directory=image_base,
+         limit=images_per_query,
+         verbose=True)
+     print('Downloaded {} images for {}'.format(len(paths),query))
+     scientific_name_to_paths[query] = paths
+
+ # ...for each row in the mapping table
+
+
+ #%% Rename .jpeg to .jpg
+
+ from md_utils import path_utils
+ all_images = path_utils.recursive_file_list(image_base,False)
+
+ for fn in tqdm(all_images):
+     if fn.lower().endswith('.jpeg'):
+         new_fn = fn[0:-5] + '.jpg'
+         os.rename(fn, new_fn)
+
+
+ #%% Choose representative images for each scientific name
+
+ # Specifically, sort by size, and take the largest unique sizes. Very small files tend
+ # to be bogus thumbnails, etc.
+
+ max_images_per_query = 4
+ scientific_name_to_preferred_images = {}
+
+ # s = list(scientific_name_to_paths.keys())[0]
+ for s in list(df.scientific_name):
+
+     if not isinstance(s,str):
+         continue
+
+     query = s.replace(' ','+')
+
+     if query in remapped_queries:
+         query = remapped_queries[query]
+
+     query_folder = os.path.join(image_base,query)
+     assert os.path.isdir(query_folder)
+     image_files = os.listdir(query_folder)
+     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+     sizes = [os.path.getsize(p) for p in image_fullpaths]
+     path_to_size = {}
+     for i_fp,fp in enumerate(image_fullpaths):
+         path_to_size[fp] = sizes[i_fp]
+     paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
+
+     # Be suspicious of duplicate sizes
+     b_duplicate_sizes = [False] * len(paths_by_size)
+
+     for i_path,p in enumerate(paths_by_size):
+         if i_path == len(paths_by_size) - 1:
+             continue
+         if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
+             b_duplicate_sizes[i_path] = True
+
+     paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
+
+     preferred_paths = paths_by_size_non_dup[:max_images_per_query]
+     scientific_name_to_preferred_images[s] = preferred_paths
+
+ # ...for each scientific name
+
+
+ #%% Delete unused images
+
+ used_images = []
+ for images in scientific_name_to_preferred_images.values():
+     used_images.extend(images)
+
+ print('Using a total of {} images'.format(len(used_images)))
+ used_images_set = set(used_images)
+
+ from md_utils import path_utils
+ all_images = path_utils.recursive_file_list(image_base,False)
+
+ unused_images = []
+ for fn in all_images:
+     if fn not in used_images_set:
+         unused_images.append(fn)
+
+ print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images),
+     len(all_images) - len(unused_images)))
+
+ for fn in tqdm(unused_images):
+     os.remove(fn)
+
+
+ #%% Produce HTML preview
+
+ with open(html_output_file, 'w', encoding='utf-8') as f:
+
+     f.write('<html><head></head><body>\n')
+
+     names = scientific_name_to_preferred_images.keys()
+     names = sorted(names)
+
+     f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
+             'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name (taxonomic_common_name) (manual_common_name)</p>\n'
+             '</p>')
+
+     # i_row = 2; row = df.iloc[i_row]
+     for i_row, row in tqdm(df.iterrows(), total=len(df)):
+
+         s = row['scientific_name']
+
+         taxonomy_string = row['taxonomy_string']
+         if isinstance(taxonomy_string,str):
+             taxonomic_match = eval(taxonomy_string)
+             matched_entity = taxonomic_match[0]
+             assert len(matched_entity) == 4
+             common_names = matched_entity[3]
+             if len(common_names) == 1:
+                 common_name_string = common_names[0]
+             else:
+                 common_name_string = str(common_names)
+         else:
+             common_name_string = ''
+
+         f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">')
+
+         if isinstance(row.scientific_name,str):
+             output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
+                 row.dataset_name, row.query,
+                 row.taxonomy_level, row.scientific_name, common_name_string,
+                 row.common_name)
+             f.write(output_string)
+         else:
+             f.write('{}: <b><u>{}</u></b> unmapped'.format(row.dataset_name,row.query))
+
+         if s is None or s not in names:
+             f.write('<p class="content_p">no images available</p>')
+         else:
+             image_paths = scientific_name_to_preferred_images[s]
+             basedir = os.path.dirname(html_output_file)
+             relative_paths = [os.path.relpath(p,basedir) for p in image_paths]
+             image_paths = [s.replace('\\','/') for s in relative_paths]
+             n_images = len(image_paths)
+             # image_paths = [os.path.relpath(p, output_base) for p in image_paths]
+             image_width_percent = round(100 / n_images)
+             f.write('<table class="image_table"><tr>\n')
+             for image_path in image_paths:
+                 f.write('<td style="vertical-align:top;" width="{}%">'
+                         '<img src="{}" style="display:block; width:100%; vertical-align:top; height:auto;">'
+                         '</td>\n'.format(image_width_percent, image_path))
+             f.write('</tr></table>\n')
+
+     # ...for each row
+
+     f.write('</body></html>\n')
+
+
+ #%% Open HTML preview
+
+ from md_utils.path_utils import open_file
+ open_file(html_output_file)
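
Although the diff viewer treats this file as a full rewrite (-591/+591), the only substantive change is the module header, converted from a `####` comment banner in 5.0.8 to a module docstring in 5.0.9; the rest of the file is unchanged. The same header conversion plausibly accounts for many of the small, symmetric +N/-N deltas in the file list above, and it matters because a docstring, unlike a comment, is visible to Python and to documentation tooling; presumably this accompanies the Sphinx-style docs/source/conf.py added in this release. A minimal sketch of the difference, using abbreviated stand-in headers (illustrative only, not the full text from the diff):

import ast

# Abbreviated stand-ins for the two header styles shown in the diff above.
comment_banner = (
    '########\n'
    '#\n'
    '# preview_lila_taxonomy.py\n'
    '#\n'
    '########\n')
module_docstring = (
    '"""\n'
    '\n'
    'preview_lila_taxonomy.py\n'
    '\n'
    '"""\n')

# ast.get_docstring() sees only real docstrings, not comment banners; this is
# also what documentation generators rely on when extracting module summaries.
print(ast.get_docstring(ast.parse(comment_banner)))    # -> None
print(ast.get_docstring(ast.parse(module_docstring)))  # -> 'preview_lila_taxonomy.py'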