megadetector-5.0.28-py3-none-any.whl → megadetector-5.0.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megadetector might be problematic.

Files changed (176)
  1. megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
  2. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
  3. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  7. megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
  8. megadetector/classification/aggregate_classifier_probs.py +3 -3
  9. megadetector/classification/analyze_failed_images.py +5 -5
  10. megadetector/classification/cache_batchapi_outputs.py +5 -5
  11. megadetector/classification/create_classification_dataset.py +11 -12
  12. megadetector/classification/crop_detections.py +10 -10
  13. megadetector/classification/csv_to_json.py +8 -8
  14. megadetector/classification/detect_and_crop.py +13 -15
  15. megadetector/classification/evaluate_model.py +7 -7
  16. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  17. megadetector/classification/json_to_azcopy_list.py +1 -1
  18. megadetector/classification/json_validator.py +29 -32
  19. megadetector/classification/map_classification_categories.py +9 -9
  20. megadetector/classification/merge_classification_detection_output.py +12 -9
  21. megadetector/classification/prepare_classification_script.py +19 -19
  22. megadetector/classification/prepare_classification_script_mc.py +23 -23
  23. megadetector/classification/run_classifier.py +4 -4
  24. megadetector/classification/save_mislabeled.py +6 -6
  25. megadetector/classification/train_classifier.py +1 -1
  26. megadetector/classification/train_classifier_tf.py +9 -9
  27. megadetector/classification/train_utils.py +10 -10
  28. megadetector/data_management/annotations/annotation_constants.py +1 -1
  29. megadetector/data_management/camtrap_dp_to_coco.py +45 -45
  30. megadetector/data_management/cct_json_utils.py +101 -101
  31. megadetector/data_management/cct_to_md.py +49 -49
  32. megadetector/data_management/cct_to_wi.py +33 -33
  33. megadetector/data_management/coco_to_labelme.py +75 -75
  34. megadetector/data_management/coco_to_yolo.py +189 -189
  35. megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
  36. megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
  37. megadetector/data_management/databases/integrity_check_json_db.py +202 -188
  38. megadetector/data_management/databases/subset_json_db.py +33 -33
  39. megadetector/data_management/generate_crops_from_cct.py +38 -38
  40. megadetector/data_management/get_image_sizes.py +54 -49
  41. megadetector/data_management/labelme_to_coco.py +130 -124
  42. megadetector/data_management/labelme_to_yolo.py +78 -72
  43. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  44. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  45. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  46. megadetector/data_management/lila/download_lila_subset.py +21 -24
  47. megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
  48. megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
  49. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  50. megadetector/data_management/lila/lila_common.py +70 -70
  51. megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
  52. megadetector/data_management/mewc_to_md.py +339 -340
  53. megadetector/data_management/ocr_tools.py +258 -252
  54. megadetector/data_management/read_exif.py +231 -224
  55. megadetector/data_management/remap_coco_categories.py +26 -26
  56. megadetector/data_management/remove_exif.py +31 -20
  57. megadetector/data_management/rename_images.py +187 -187
  58. megadetector/data_management/resize_coco_dataset.py +41 -41
  59. megadetector/data_management/speciesnet_to_md.py +41 -41
  60. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  61. megadetector/data_management/yolo_output_to_md_output.py +117 -120
  62. megadetector/data_management/yolo_to_coco.py +195 -188
  63. megadetector/detection/change_detection.py +831 -0
  64. megadetector/detection/process_video.py +340 -337
  65. megadetector/detection/pytorch_detector.py +304 -262
  66. megadetector/detection/run_detector.py +177 -164
  67. megadetector/detection/run_detector_batch.py +364 -363
  68. megadetector/detection/run_inference_with_yolov5_val.py +328 -325
  69. megadetector/detection/run_tiled_inference.py +256 -249
  70. megadetector/detection/tf_detector.py +24 -24
  71. megadetector/detection/video_utils.py +290 -282
  72. megadetector/postprocessing/add_max_conf.py +15 -11
  73. megadetector/postprocessing/categorize_detections_by_size.py +44 -44
  74. megadetector/postprocessing/classification_postprocessing.py +415 -415
  75. megadetector/postprocessing/combine_batch_outputs.py +20 -21
  76. megadetector/postprocessing/compare_batch_results.py +528 -517
  77. megadetector/postprocessing/convert_output_format.py +97 -97
  78. megadetector/postprocessing/create_crop_folder.py +219 -146
  79. megadetector/postprocessing/detector_calibration.py +173 -168
  80. megadetector/postprocessing/generate_csv_report.py +508 -499
  81. megadetector/postprocessing/load_api_results.py +23 -20
  82. megadetector/postprocessing/md_to_coco.py +129 -98
  83. megadetector/postprocessing/md_to_labelme.py +89 -83
  84. megadetector/postprocessing/md_to_wi.py +40 -40
  85. megadetector/postprocessing/merge_detections.py +87 -114
  86. megadetector/postprocessing/postprocess_batch_results.py +313 -298
  87. megadetector/postprocessing/remap_detection_categories.py +36 -36
  88. megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
  89. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  90. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  91. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
  92. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  93. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  94. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  95. megadetector/postprocessing/validate_batch_results.py +70 -70
  96. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  97. megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
  98. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
  99. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -66
  100. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  101. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  102. megadetector/taxonomy_mapping/species_lookup.py +33 -33
  103. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  104. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  105. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  106. megadetector/utils/azure_utils.py +22 -22
  107. megadetector/utils/ct_utils.py +1018 -200
  108. megadetector/utils/directory_listing.py +21 -77
  109. megadetector/utils/gpu_test.py +22 -22
  110. megadetector/utils/md_tests.py +541 -518
  111. megadetector/utils/path_utils.py +1457 -398
  112. megadetector/utils/process_utils.py +41 -41
  113. megadetector/utils/sas_blob_utils.py +53 -49
  114. megadetector/utils/split_locations_into_train_val.py +61 -61
  115. megadetector/utils/string_utils.py +147 -26
  116. megadetector/utils/url_utils.py +463 -173
  117. megadetector/utils/wi_utils.py +2629 -2526
  118. megadetector/utils/write_html_image_list.py +137 -137
  119. megadetector/visualization/plot_utils.py +21 -21
  120. megadetector/visualization/render_images_with_thumbnails.py +37 -73
  121. megadetector/visualization/visualization_utils.py +401 -397
  122. megadetector/visualization/visualize_db.py +197 -190
  123. megadetector/visualization/visualize_detector_output.py +79 -73
  124. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/METADATA +135 -132
  125. megadetector-5.0.29.dist-info/RECORD +163 -0
  126. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
  127. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
  128. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
  129. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  130. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  131. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  132. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  133. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  134. megadetector/data_management/importers/awc_to_json.py +0 -191
  135. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  136. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  137. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  138. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  139. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  140. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  141. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  142. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  143. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  144. megadetector/data_management/importers/ena24_to_json.py +0 -276
  145. megadetector/data_management/importers/filenames_to_json.py +0 -386
  146. megadetector/data_management/importers/helena_to_cct.py +0 -283
  147. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  148. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  149. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  150. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  151. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  152. megadetector/data_management/importers/missouri_to_json.py +0 -490
  153. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  154. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  155. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  156. megadetector/data_management/importers/pc_to_json.py +0 -365
  157. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  158. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  159. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  160. megadetector/data_management/importers/rspb_to_json.py +0 -356
  161. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  162. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  163. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  164. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  165. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  166. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  167. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  168. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  169. megadetector/data_management/importers/ubc_to_json.py +0 -399
  170. megadetector/data_management/importers/umn_to_json.py +0 -507
  171. megadetector/data_management/importers/wellington_to_json.py +0 -263
  172. megadetector/data_management/importers/wi_to_json.py +0 -442
  173. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  174. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  175. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  176. megadetector-5.0.28.dist-info/RECORD +0 -209
@@ -27,20 +27,20 @@ html_output_file = os.path.join(preview_base,'index.html')
 
 def parse_taxonomy_string(taxonomy_string):
 
- taxonomic_match = eval(taxonomy_string)
+ taxonomic_match = eval(taxonomy_string)
 matched_entity = taxonomic_match[0]
 assert len(matched_entity) == 4
-
+
 level = matched_entity[1]
-
+
 scientific_name = matched_entity[2]
-
+
 common_names = matched_entity[3]
 if len(common_names) == 1:
 common_name = common_names[0]
 else:
 common_name = str(common_names)
-
+
 return scientific_name,common_name,level,taxonomic_match
 
 def taxonomy_string_to_common_name(taxonomy_string):
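For orientation, parse_taxonomy_string() above evaluates a stored taxonomy string and unpacks the first matched entity as a 4-tuple whose last three elements are the taxonomic level, the scientific name, and a list of common names (the first element is not used by this function). A minimal sketch of that layout; the sample string and leading ID below are hypothetical, not taken from the package:

    # Hypothetical example of the structure parse_taxonomy_string() expects;
    # real taxonomy strings are produced elsewhere in the taxonomy-mapping pipeline.
    taxonomy_string = "[(12345, 'species', 'puma concolor', ['mountain lion', 'puma', 'cougar'])]"

    taxonomic_match = eval(taxonomy_string)   # list of matched entities
    matched_entity = taxonomic_match[0]       # (<unused here>, level, scientific_name, common_names)
    level = matched_entity[1]
    scientific_name = matched_entity[2]
    common_names = matched_entity[3]
    common_name = common_names[0] if len(common_names) == 1 else str(common_names)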
@@ -79,14 +79,14 @@ n_taxonomy_changes = 0
 
 # Look for internal inconsistency
 for i_row,row in df.iterrows():
-
+
 sn = row['scientific_name']
 if not isinstance(sn,str):
 continue
-
- ts = row['taxonomy_string']
+
+ ts = row['taxonomy_string']
 assert sn == taxonomy_string_to_scientific(ts)
-
+
 assert row['taxonomy_level'] == taxonomy_string_to_level(ts)
 
 # Look for outdated mappings
@@ -94,18 +94,18 @@ taxonomy_preference = 'inat'
 
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in tqdm(df.iterrows(),total=len(df)):
-
+
 sn = row['scientific_name']
 if not isinstance(sn,str):
 continue
-
+
 m = get_preferred_taxonomic_match(sn,taxonomy_preference)
 assert m.scientific_name == sn
-
+
 ts = row['taxonomy_string']
 assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
 row['dataset_name'],ts,m.taxonomy_string)
-
+
 if ts != m.taxonomy_string:
 n_taxonomy_changes += 1
 df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
@@ -161,45 +161,45 @@ suppress_multiple_matches = [
 ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
 ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
 ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
-
+
 ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
 ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
 ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
 ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
 ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
-
+
 ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
 ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
 ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
 ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
 ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
-
+
 ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
 ['kudu','Snapshot Serengeti','Snapshot Kruger'],
 ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
 ['kudu','Snapshot Serengeti','Snapshot Karoo'],
 ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
-
+
 ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
 ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
 ['fox','Idaho Camera Traps','Caltech Camera Traps'],
-
+
 ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
-
+
 ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
 ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
-
+
 ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
-
+
 ]
 
 for i_row,row in df.iterrows():
-
+
 query = row['query']
 taxonomy_string = row['taxonomy_string']
-
+
 for previous_i_row in query_to_rows[query]:
-
+
 previous_row = df.iloc[previous_i_row]
 assert previous_row['query'] == query
 query_match = False
@@ -209,11 +209,11 @@ for i_row,row in df.iterrows():
 query_match = isnan(row['taxonomy_string'])
 else:
 query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
-
+
 if not query_match:
-
+
 suppress = False
-
+
 # x = suppress_multiple_matches[-1]
 for x in suppress_multiple_matches:
 if x[0] == query and \
@@ -225,18 +225,18 @@ for i_row,row in df.iterrows():
 suppress = True
 n_suppressed += 1
 break
-
+
 if not suppress:
 print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
 query, row['dataset_name'], previous_row['dataset_name'],
 taxonomy_string, previous_row['taxonomy_string']))
-
+
 queries_with_multiple_mappings.add(query)
-
+
 # ...for each row where we saw this query
-
+
 query_to_rows[query].append(i_row)
-
+
 # ...for each row
 
 print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
@@ -267,9 +267,9 @@ for i_row,row in df.iterrows():
 ) \
 and \
 ('species' in level):
-
+
 if query not in allowable_unknown_species:
-
+
 print('Warning: query {}:{} maps to {} {}'.format(
 row['dataset_name'],
 row['query'],
@@ -285,7 +285,7 @@ for i_row,row in df.iterrows():
 if 'source' in row:
 assert isinstance(row['source'],str)
 assert isinstance(row['taxonomy_level'],str)
-
+
 
 #%% Find WCS mappings that aren't species or aren't the same as the input
 
@@ -294,22 +294,22 @@ for i_row,row in df.iterrows():
 
 # row = df.iloc[-500]
 for i_row,row in df.iterrows():
-
+
 if not isinstance(row['scientific_name'],str):
 continue
 if 'WCS' not in row['dataset_name']:
 continue
-
+
 query = row['query']
 scientific_name = row['scientific_name']
 common_name = row['common_name']
- level = row['taxonomy_level']
+ level = row['taxonomy_level']
 taxonomy_string = row['taxonomy_string']
-
- common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
+
+ common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
 query_string = query.replace(' sp','')
 query_string = query_string.replace('unknown ','')
-
+
 # Anything marked "species" or "unknown" by definition doesn't map to a species,
 # so ignore these.
 if (' sp' not in query) and ('unknown' not in query) and \
@@ -317,7 +317,7 @@ for i_row,row in df.iterrows():
 print('WCS query {} ({}) remapped to {} {} ({})'.format(
 query,common_name,level,scientific_name,common_name_from_taxonomy))
 
- if query_string != scientific_name:
+ if query_string != scientific_name:
 pass
 # print('WCS query {} ({}) remapped to {} ({})'.format(
 # query,common_name,scientific_name,common_names_from_taxonomy))
@@ -345,20 +345,20 @@ min_valid_image_size = 3000
 #
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in df.iterrows():
-
+
 s = row['scientific_name']
-
+
 if (not isinstance(s,str)) or (len(s)==0):
 continue
-
+
 query = s.replace(' ','+')
-
+
 if query in remapped_queries:
 query = remapped_queries[query]
-
+
 query_folder = os.path.join(image_base,query)
 os.makedirs(query_folder,exist_ok=True)
-
+
 # Check whether we already have enough images for this query
 image_files = os.listdir(query_folder)
 image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
@@ -371,7 +371,7 @@ for i_row,row in df.iterrows():
 # Check whether we've already run this query for a previous row
 if query in scientific_name_to_paths:
 continue
-
+
 print('Processing query {} of {} ({})'.format(i_row,len(df),query))
 paths = retrieve_sample_image.download_images(query=query,
 output_directory=image_base,
@@ -404,40 +404,40 @@ scientific_name_to_preferred_images = {}
 
 # s = list(scientific_name_to_paths.keys())[0]
 for s in list(df.scientific_name):
-
+
 if not isinstance(s,str):
 continue
-
+
 query = s.replace(' ','+')
-
+
 if query in remapped_queries:
 query = remapped_queries[query]
-
+
 query_folder = os.path.join(image_base,query)
 assert os.path.isdir(query_folder)
 image_files = os.listdir(query_folder)
- image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+ image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
 sizes = [os.path.getsize(p) for p in image_fullpaths]
 path_to_size = {}
 for i_fp,fp in enumerate(image_fullpaths):
 path_to_size[fp] = sizes[i_fp]
 paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
-
+
 # Be suspicious of duplicate sizes
 b_duplicate_sizes = [False] * len(paths_by_size)
-
+
 for i_path,p in enumerate(paths_by_size):
 if i_path == len(paths_by_size) - 1:
 continue
 if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
 b_duplicate_sizes[i_path] = True
-
+
 paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
-
+
 preferred_paths = paths_by_size_non_dup[:max_images_per_query]
 scientific_name_to_preferred_images[s] = preferred_paths
 
- # ...for each scientific name
+ # ...for each scientific name
 
 
 #%% Delete unused images
@@ -445,7 +445,7 @@ for s in list(df.scientific_name):
 used_images = []
 for images in scientific_name_to_preferred_images.values():
 used_images.extend(images)
-
+
 print('Using a total of {} images'.format(len(used_images)))
 used_images_set = set(used_images)
 
@@ -461,18 +461,18 @@ print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images
 len(all_images) - len(unused_images)))
 
 for fn in tqdm(unused_images):
- os.remove(fn)
+ os.remove(fn)
 
 
 #%% Produce HTML preview
 
 with open(html_output_file, 'w', encoding='utf-8') as f:
-
+
 f.write('<html><head></head><body>\n')
 
 names = scientific_name_to_preferred_images.keys()
 names = sorted(names)
-
+
 f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
 'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name (taxonomic_common_name) (manual_common_name)</p>\n'
 '</p>')
@@ -481,10 +481,10 @@ with open(html_output_file, 'w', encoding='utf-8') as f:
 for i_row, row in tqdm(df.iterrows(), total=len(df)):
 
 s = row['scientific_name']
-
+
 taxonomy_string = row['taxonomy_string']
 if isinstance(taxonomy_string,str):
- taxonomic_match = eval(taxonomy_string)
+ taxonomic_match = eval(taxonomy_string)
 matched_entity = taxonomic_match[0]
 assert len(matched_entity) == 4
 common_names = matched_entity[3]
@@ -499,7 +499,7 @@ with open(html_output_file, 'w', encoding='utf-8') as f:
 
 if isinstance(row.scientific_name,str):
 output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
- row.dataset_name, row.query,
+ row.dataset_name, row.query,
 row.taxonomy_level, row.scientific_name, common_name_string,
 row.common_name)
 f.write(output_string)
@@ -17,21 +17,21 @@ import os
 
 output_folder = os.path.expanduser('~/tmp/image-download-test')
 os.makedirs(output_folder,exist_ok=True)
-
+
 method = 'simple_image_download' # 'google_images_download'
 
 if method == 'simple_image_download':
-
+
 from megadetector.taxonomy_mapping import simple_image_download
 google_image_downloader = simple_image_download.Downloader()
 google_image_downloader.directory = output_folder
-
+
 elif method == 'google_images_download':
-
+
 from google_images_download import google_images_download
 
 else:
-
+
 raise ValueError('Unrecognized method {}'.format(method))
 
 
@@ -39,33 +39,33 @@ else:
 
 def download_images(query,output_directory,limit=100,verbose=False):
 
- query = query.replace(' ','+')
-
+ query = query.replace(' ','+')
+
 if method == 'simple_image_download':
-
+
 google_image_downloader.directory = output_directory
 paths = google_image_downloader.download(query, limit=limit,
 verbose=verbose, cache=False, download_cache=False)
 return paths
-
+
 elif method == 'google_images_download':
-
- response = google_images_download.googleimagesdownload()
+
+ response = google_images_download.googleimagesdownload()
 arguments = {'keywords':query,'limit':limit,'print_urls':verbose,
 'image-directory':output_directory}
 response.download(arguments)
 return None
 
 else:
-
+
 raise ValueError('Unrecognized method {}'.format(method))
-
+
 
 #%% Test driver
 
 if False:
-
+
 #%%
-
+
 paths = download_images(query='redunca',output_directory=output_folder,
- limit=20,verbose=True)
+ limit=20,verbose=True)
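The test driver above exercises download_images() from retrieve_sample_image.py. A minimal standalone sketch of the same call, using an arbitrary scratch folder; the returned list of paths assumes the 'simple_image_download' backend selected at module load, as in the driver:

    import os
    from megadetector.taxonomy_mapping import retrieve_sample_image

    # Arbitrary output location; the calling scripts in this package expect
    # one subfolder per query to be created under it
    output_folder = os.path.expanduser('~/tmp/image-download-test')
    os.makedirs(output_folder, exist_ok=True)

    # Returns a list of downloaded file paths (None for the google_images_download backend)
    paths = retrieve_sample_image.download_images(query='redunca',
                                                  output_directory=output_folder,
                                                  limit=20,
                                                  verbose=True)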
@@ -49,7 +49,7 @@ def generate_urls(search):
 """
 Generate Google search URLs for all tokens in the list [search]
 """
-
+
 return [(BASE_URL+quote(word)+GOOGLE_PICTURE_ID) for word in search]
 
 
@@ -60,7 +60,7 @@ def check_webpage(url):
 if 'html' not in str(request.content):
 checked_url = request
 except Exception as err:
- print(err)
+ print(err)
 return checked_url
 
 
@@ -68,7 +68,7 @@ def scan_webpage(webpage, extensions, timer):
 """
 Scan for pictures to download based on keywords
 """
-
+
 global SCANNER_COUNTER
 scanner = webpage.find
 found = False
@@ -143,7 +143,7 @@ class Downloader:
 urls_ = generate_urls(search)
 timer = timer if timer else 1000
 # max_progressbar = count * (list(range(limit+1))[-1]+1)
-
+
 # bar = progressbar.ProgressBar(maxval=max_progressbar,
 # widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()]).start()
 i = 0
@@ -172,7 +172,7 @@
 print('==='*15 + ' < ' + 'NO PICTURES FOUND' + ' > ' + '==='*15)
 return cache_out
 
- def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False,
+ def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False,
 timer=None):
 if not download_cache:
 content = self.search_urls(keywords, limit, verbose, cache, timer)
@@ -180,16 +180,16 @@
 content = self._cached_urls
 if not content:
 print('Downloader has not URLs saved in Memory yet, run Downloader.search_urls to find pics first')
- paths = []
+ paths = []
 for name, (path, url) in content.items():
 fullpath = os.path.join(path, name)
 paths.append(fullpath)
 with open(fullpath, 'wb') as file:
 file.write(url.content)
 if verbose:
- print(f'File Name={name}, Downloaded from {url.url}')
+ print(f'File Name={name}, Downloaded from {url.url}')
 return paths
-
+
 def _create_directories(self, name):
 dir_path = os.path.join(self._directory, name)
 try:
@@ -114,7 +114,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 if (not force_init) and (inat_taxonomy is not None):
 print('Skipping taxonomy re-init')
 return
-
+
 if (not force_init) and (os.path.isfile(serialized_structures_file)):
 
 print(f'De-serializing taxonomy data from {serialized_structures_file}')
@@ -135,7 +135,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 gbif_vernacular_to_taxon_id,\
 gbif_taxon_id_to_scientific,\
 gbif_scientific_to_taxon_id = structures_to_serialize
-
+
 return
 
 
@@ -146,7 +146,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 for taxonomy_name, zip_url in taxonomy_urls.items():
 
 need_to_download = False
-
+
 if force_init:
 need_to_download = True
 
@@ -267,7 +267,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 # Build iNat dictionaries
 
 print('Building lookup dictionaries for iNat taxonomy')
-
+
 for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
 
 taxon_id = row['taxonID']
@@ -286,7 +286,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 # Build GBIF dictionaries
 
 print('Building lookup dictionaries for GBIF taxonomy')
-
+
 for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
 
 taxon_id = row['taxonID']
@@ -596,21 +596,21 @@ class TaxonomicMatch:
 
 
 hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
- 'ruffed', 'browed', 'eating', 'striped', 'shanked',
+ 'ruffed', 'browed', 'eating', 'striped', 'shanked',
 'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
 'necked']
 
 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
 """
- Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+ Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
 and preferences that are specific to our scenario.
-
+
 Args:
 query (str): The common or scientific name we want to look up
 taxonomy_preference (str, optional): 'inat' or 'gbif'
- retry (bool, optional): if the initial lookup fails, should we try heuristic
+ retry (bool, optional): if the initial lookup fails, should we try heuristic
 substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
-
+
 Returns:
 TaxonomicMatch: the best taxonomic match, or None
 """
@@ -618,31 +618,31 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
 m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
 if (len(m.scientific_name) > 0) or (not retry):
 return m
-
+
 for s in hyphenated_terms:
 query = query.replace(' ' + s,'-' + s)
 m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
 return m
-
-
+
+
 def validate_and_convert(data):
 """
 Recursively validates that all elements in the nested structure are only
 tuples, lists, ints, or np.int64, and converts np.int64 to int.
-
+
 Args:
 data: The nested structure to validate and convert
-
+
 Returns:
 The validated and converted structure
-
+
 Raises:
 TypeError: If an invalid type is encountered
 """
-
- if isinstance(data, np.int64):
+
+ if isinstance(data, np.int64):
 return int(data)
- elif isinstance(data, int) or isinstance(data, str):
+ elif isinstance(data, int) or isinstance(data, str):
 return data
 elif isinstance(data, (list, tuple)):
 # Process lists and tuples recursively
@@ -654,17 +654,17 @@
 
 # ...def validate_and_convert(...)
 
-
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
+
 query = query.lower().strip().replace('_', ' ')
 query = query.replace('unidentified','')
 query = query.replace('unknown','')
 if query.endswith(' sp'):
 query = query.replace(' sp','')
 if query.endswith(' group'):
- query = query.replace(' group','')
-
+ query = query.replace(' group','')
+
 query = query.strip()
 
 # query = 'person'
@@ -686,17 +686,17 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 
 n_inat_matches = len(inat_matches)
 n_gbif_matches = len(gbif_matches)
-
+
 selected_matches = None
-
+
 assert taxonomy_preference in ['gbif','inat'],\
 'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
+
 if n_inat_matches > 0 and taxonomy_preference == 'inat':
 selected_matches = 'inat'
 elif n_gbif_matches > 0:
 selected_matches = 'gbif'
-
+
 if selected_matches == 'inat':
 
 i_match = 0
@@ -802,7 +802,7 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
 # Convert np.int64's to ints
 if match is not None:
 match = validate_and_convert(match)
-
+
 taxonomy_string = str(match)
 
 return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
@@ -828,15 +828,15 @@ if False:
 # print(matches)
 
 print_taxonomy_matches(matches,verbose=True)
-
+
 print('\n\n')
-
+
 # Print the taxonomy in the taxonomy spreadsheet format
 assert matches[1]['source'] == 'inat'
 t = str(matches[1]['taxonomy'])
 print(t)
 import clipboard; clipboard.copy(t)
-
+
 
 #%% Directly access the taxonomy tables
 
@@ -848,12 +848,12 @@ if False:
 
 #%% Command-line driver
 
- def main():
+ def main(): # noqa
 
 # Read command line inputs (absolute path)
 parser = argparse.ArgumentParser()
 parser.add_argument('input_file')
-
+
 if len(sys.argv[1:]) == 0:
 parser.print_help()
 parser.exit()