megadetector 5.0.28-py3-none-any.whl → 5.0.29-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of megadetector has been flagged as potentially problematic.
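A comparison like this can be reproduced locally: wheels are zip archives, so the standard library is enough to diff the Python sources of two downloaded wheels. The sketch below is illustrative only; the wheel filenames are assumed to sit in the current directory (for example fetched with pip download megadetector==5.0.28 --no-deps).

# Minimal sketch: diff the .py files shared by two wheels (wheels are zip archives).
import difflib
import zipfile

def wheel_text(path, member):
    # Decode one archive member into a list of lines suitable for difflib
    with zipfile.ZipFile(path) as z:
        return z.read(member).decode('utf-8', errors='replace').splitlines(keepends=True)

old_whl = 'megadetector-5.0.28-py3-none-any.whl'
new_whl = 'megadetector-5.0.29-py3-none-any.whl'

with zipfile.ZipFile(old_whl) as zo, zipfile.ZipFile(new_whl) as zn:
    common = sorted(set(zo.namelist()) & set(zn.namelist()))

for member in common:
    if not member.endswith('.py'):
        continue
    diff = difflib.unified_diff(wheel_text(old_whl, member),
                                wheel_text(new_whl, member),
                                fromfile=member, tofile=member)
    for line in diff:
        print(line, end='')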
- megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +23 -23
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -1
- megadetector/data_management/camtrap_dp_to_coco.py +45 -45
- megadetector/data_management/cct_json_utils.py +101 -101
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +189 -189
- megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
- megadetector/data_management/databases/integrity_check_json_db.py +202 -188
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +38 -38
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +130 -124
- megadetector/data_management/labelme_to_yolo.py +78 -72
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
- megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +70 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
- megadetector/data_management/mewc_to_md.py +339 -340
- megadetector/data_management/ocr_tools.py +258 -252
- megadetector/data_management/read_exif.py +231 -224
- megadetector/data_management/remap_coco_categories.py +26 -26
- megadetector/data_management/remove_exif.py +31 -20
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +41 -41
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +117 -120
- megadetector/data_management/yolo_to_coco.py +195 -188
- megadetector/detection/change_detection.py +831 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +304 -262
- megadetector/detection/run_detector.py +177 -164
- megadetector/detection/run_detector_batch.py +364 -363
- megadetector/detection/run_inference_with_yolov5_val.py +328 -325
- megadetector/detection/run_tiled_inference.py +256 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +290 -282
- megadetector/postprocessing/add_max_conf.py +15 -11
- megadetector/postprocessing/categorize_detections_by_size.py +44 -44
- megadetector/postprocessing/classification_postprocessing.py +415 -415
- megadetector/postprocessing/combine_batch_outputs.py +20 -21
- megadetector/postprocessing/compare_batch_results.py +528 -517
- megadetector/postprocessing/convert_output_format.py +97 -97
- megadetector/postprocessing/create_crop_folder.py +219 -146
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +23 -20
- megadetector/postprocessing/md_to_coco.py +129 -98
- megadetector/postprocessing/md_to_labelme.py +89 -83
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +87 -114
- megadetector/postprocessing/postprocess_batch_results.py +313 -298
- megadetector/postprocessing/remap_detection_categories.py +36 -36
- megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -66
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +33 -33
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/azure_utils.py +22 -22
- megadetector/utils/ct_utils.py +1018 -200
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +541 -518
- megadetector/utils/path_utils.py +1457 -398
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/sas_blob_utils.py +53 -49
- megadetector/utils/split_locations_into_train_val.py +61 -61
- megadetector/utils/string_utils.py +147 -26
- megadetector/utils/url_utils.py +463 -173
- megadetector/utils/wi_utils.py +2629 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +21 -21
- megadetector/visualization/render_images_with_thumbnails.py +37 -73
- megadetector/visualization/visualization_utils.py +401 -397
- megadetector/visualization/visualize_db.py +197 -190
- megadetector/visualization/visualize_detector_output.py +79 -73
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/METADATA +135 -132
- megadetector-5.0.29.dist-info/RECORD +163 -0
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector-5.0.28.dist-info/RECORD +0 -209
megadetector/taxonomy_mapping/preview_lila_taxonomy.py

Most of the changes in this release are whitespace-only: paired removed/added lines that look identical appear to differ only in trailing whitespace.

@@ -27,20 +27,20 @@ html_output_file = os.path.join(preview_base,'index.html')


 def parse_taxonomy_string(taxonomy_string):

-    taxonomic_match = eval(taxonomy_string)
+    taxonomic_match = eval(taxonomy_string)
     matched_entity = taxonomic_match[0]
     assert len(matched_entity) == 4
-
+
     level = matched_entity[1]
-
+
     scientific_name = matched_entity[2]
-
+
     common_names = matched_entity[3]
     if len(common_names) == 1:
         common_name = common_names[0]
     else:
         common_name = str(common_names)
-
+
     return scientific_name,common_name,level,taxonomic_match

 def taxonomy_string_to_common_name(taxonomy_string):
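Since taxonomy_string holds a stringified Python literal, ast.literal_eval is a safer stand-in for the eval() call above. The helper below is a hypothetical sketch (parse_taxonomy_string_safe is not part of the package); it follows the same field layout the assertions rely on, with field 0 assumed to be a taxon identifier.

# Illustrative only: same parsing logic as above, without eval()
import ast

def parse_taxonomy_string_safe(taxonomy_string):
    # taxonomy_string is a stringified list of matches; each match is a 4-tuple
    taxonomic_match = ast.literal_eval(taxonomy_string)
    matched_entity = taxonomic_match[0]
    assert len(matched_entity) == 4
    _, level, scientific_name, common_names = matched_entity
    common_name = common_names[0] if len(common_names) == 1 else str(common_names)
    return scientific_name, common_name, level, taxonomic_match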
@@ -79,14 +79,14 @@ n_taxonomy_changes = 0

 # Look for internal inconsistency
 for i_row,row in df.iterrows():
-
+
     sn = row['scientific_name']
     if not isinstance(sn,str):
         continue
-
-    ts = row['taxonomy_string']
+
+    ts = row['taxonomy_string']
     assert sn == taxonomy_string_to_scientific(ts)
-
+
     assert row['taxonomy_level'] == taxonomy_string_to_level(ts)

 # Look for outdated mappings
@@ -94,18 +94,18 @@ taxonomy_preference = 'inat'

 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in tqdm(df.iterrows(),total=len(df)):
-
+
     sn = row['scientific_name']
     if not isinstance(sn,str):
         continue
-
+
     m = get_preferred_taxonomic_match(sn,taxonomy_preference)
     assert m.scientific_name == sn
-
+
     ts = row['taxonomy_string']
     assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
         row['dataset_name'],ts,m.taxonomy_string)
-
+
     if ts != m.taxonomy_string:
         n_taxonomy_changes += 1
         df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
@@ -161,45 +161,45 @@ suppress_multiple_matches = [
     ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
     ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
     ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
-
+
     ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
     ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
     ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
     ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
     ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
-
+
     ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
     ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
     ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
     ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
     ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
-
+
     ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
     ['kudu','Snapshot Serengeti','Snapshot Kruger'],
     ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
     ['kudu','Snapshot Serengeti','Snapshot Karoo'],
     ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
-
+
     ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
     ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
     ['fox','Idaho Camera Traps','Caltech Camera Traps'],
-
+
     ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
-
+
     ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
     ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
-
+
     ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
-
+
 ]

 for i_row,row in df.iterrows():
-
+
     query = row['query']
     taxonomy_string = row['taxonomy_string']
-
+
     for previous_i_row in query_to_rows[query]:
-
+
         previous_row = df.iloc[previous_i_row]
         assert previous_row['query'] == query
         query_match = False
@@ -209,11 +209,11 @@ for i_row,row in df.iterrows():
             query_match = isnan(row['taxonomy_string'])
         else:
             query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
-
+
         if not query_match:
-
+
             suppress = False
-
+
             # x = suppress_multiple_matches[-1]
             for x in suppress_multiple_matches:
                 if x[0] == query and \
@@ -225,18 +225,18 @@ for i_row,row in df.iterrows():
                     suppress = True
                     n_suppressed += 1
                     break
-
+
             if not suppress:
                 print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
                     query, row['dataset_name'], previous_row['dataset_name'],
                     taxonomy_string, previous_row['taxonomy_string']))
-
+
                 queries_with_multiple_mappings.add(query)
-
+
     # ...for each row where we saw this query
-
+
     query_to_rows[query].append(i_row)
-
+
 # ...for each row

 print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
@@ -267,9 +267,9 @@ for i_row,row in df.iterrows():
         ) \
         and \
         ('species' in level):
-
+
         if query not in allowable_unknown_species:
-
+
             print('Warning: query {}:{} maps to {} {}'.format(
                 row['dataset_name'],
                 row['query'],
@@ -285,7 +285,7 @@ for i_row,row in df.iterrows():
     if 'source' in row:
         assert isinstance(row['source'],str)
     assert isinstance(row['taxonomy_level'],str)
-
+

 #%% Find WCS mappings that aren't species or aren't the same as the input

@@ -294,22 +294,22 @@ for i_row,row in df.iterrows():

 # row = df.iloc[-500]
 for i_row,row in df.iterrows():
-
+
     if not isinstance(row['scientific_name'],str):
         continue
     if 'WCS' not in row['dataset_name']:
         continue
-
+
     query = row['query']
     scientific_name = row['scientific_name']
     common_name = row['common_name']
-    level = row['taxonomy_level']
+    level = row['taxonomy_level']
     taxonomy_string = row['taxonomy_string']
-
-    common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
+
+    common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
     query_string = query.replace(' sp','')
     query_string = query_string.replace('unknown ','')
-
+
     # Anything marked "species" or "unknown" by definition doesn't map to a species,
     # so ignore these.
     if (' sp' not in query) and ('unknown' not in query) and \
@@ -317,7 +317,7 @@ for i_row,row in df.iterrows():
         print('WCS query {} ({}) remapped to {} {} ({})'.format(
             query,common_name,level,scientific_name,common_name_from_taxonomy))

-    if query_string != scientific_name:
+    if query_string != scientific_name:
         pass
         # print('WCS query {} ({}) remapped to {} ({})'.format(
         #    query,common_name,scientific_name,common_names_from_taxonomy))
@@ -345,20 +345,20 @@ min_valid_image_size = 3000
 #
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in df.iterrows():
-
+
     s = row['scientific_name']
-
+
     if (not isinstance(s,str)) or (len(s)==0):
         continue
-
+
     query = s.replace(' ','+')
-
+
     if query in remapped_queries:
         query = remapped_queries[query]
-
+
     query_folder = os.path.join(image_base,query)
     os.makedirs(query_folder,exist_ok=True)
-
+
     # Check whether we already have enough images for this query
     image_files = os.listdir(query_folder)
     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
@@ -371,7 +371,7 @@ for i_row,row in df.iterrows():
     # Check whether we've already run this query for a previous row
     if query in scientific_name_to_paths:
         continue
-
+
     print('Processing query {} of {} ({})'.format(i_row,len(df),query))
     paths = retrieve_sample_image.download_images(query=query,
                                                   output_directory=image_base,
@@ -404,40 +404,40 @@ scientific_name_to_preferred_images = {}

 # s = list(scientific_name_to_paths.keys())[0]
 for s in list(df.scientific_name):
-
+
     if not isinstance(s,str):
         continue
-
+
     query = s.replace(' ','+')
-
+
     if query in remapped_queries:
         query = remapped_queries[query]
-
+
     query_folder = os.path.join(image_base,query)
     assert os.path.isdir(query_folder)
     image_files = os.listdir(query_folder)
-    image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+    image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
     sizes = [os.path.getsize(p) for p in image_fullpaths]
     path_to_size = {}
     for i_fp,fp in enumerate(image_fullpaths):
         path_to_size[fp] = sizes[i_fp]
     paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
-
+
     # Be suspicious of duplicate sizes
     b_duplicate_sizes = [False] * len(paths_by_size)
-
+
     for i_path,p in enumerate(paths_by_size):
         if i_path == len(paths_by_size) - 1:
             continue
         if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
             b_duplicate_sizes[i_path] = True
-
+
     paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
-
+
     preferred_paths = paths_by_size_non_dup[:max_images_per_query]
     scientific_name_to_preferred_images[s] = preferred_paths

-# ...for each scientific name
+# ...for each scientific name


 #%% Delete unused images
@@ -445,7 +445,7 @@ for s in list(df.scientific_name):
 used_images = []
 for images in scientific_name_to_preferred_images.values():
     used_images.extend(images)
-
+
 print('Using a total of {} images'.format(len(used_images)))
 used_images_set = set(used_images)

@@ -461,18 +461,18 @@ print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images
     len(all_images) - len(unused_images)))

 for fn in tqdm(unused_images):
-    os.remove(fn)
+    os.remove(fn)


 #%% Produce HTML preview

 with open(html_output_file, 'w', encoding='utf-8') as f:
-
+
     f.write('<html><head></head><body>\n')

     names = scientific_name_to_preferred_images.keys()
     names = sorted(names)
-
+
     f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
             'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name (taxonomic_common_name) (manual_common_name)</p>\n'
             '</p>')
@@ -481,10 +481,10 @@ with open(html_output_file, 'w', encoding='utf-8') as f:
     for i_row, row in tqdm(df.iterrows(), total=len(df)):

         s = row['scientific_name']
-
+
         taxonomy_string = row['taxonomy_string']
         if isinstance(taxonomy_string,str):
-            taxonomic_match = eval(taxonomy_string)
+            taxonomic_match = eval(taxonomy_string)
             matched_entity = taxonomic_match[0]
             assert len(matched_entity) == 4
             common_names = matched_entity[3]
@@ -499,7 +499,7 @@ with open(html_output_file, 'w', encoding='utf-8') as f:

         if isinstance(row.scientific_name,str):
             output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
-                row.dataset_name, row.query,
+                row.dataset_name, row.query,
                 row.taxonomy_level, row.scientific_name, common_name_string,
                 row.common_name)
             f.write(output_string)
megadetector/taxonomy_mapping/retrieve_sample_image.py

@@ -17,21 +17,21 @@ import os

 output_folder = os.path.expanduser('~/tmp/image-download-test')
 os.makedirs(output_folder,exist_ok=True)
-
+
 method = 'simple_image_download' # 'google_images_download'

 if method == 'simple_image_download':
-
+
     from megadetector.taxonomy_mapping import simple_image_download
     google_image_downloader = simple_image_download.Downloader()
     google_image_downloader.directory = output_folder
-
+
 elif method == 'google_images_download':
-
+
     from google_images_download import google_images_download

 else:
-
+
     raise ValueError('Unrecognized method {}'.format(method))


@@ -39,33 +39,33 @@ else:

 def download_images(query,output_directory,limit=100,verbose=False):

-    query = query.replace(' ','+')
-
+    query = query.replace(' ','+')
+
     if method == 'simple_image_download':
-
+
         google_image_downloader.directory = output_directory
         paths = google_image_downloader.download(query, limit=limit,
             verbose=verbose, cache=False, download_cache=False)
         return paths
-
+
     elif method == 'google_images_download':
-
-        response = google_images_download.googleimagesdownload()
+
+        response = google_images_download.googleimagesdownload()
         arguments = {'keywords':query,'limit':limit,'print_urls':verbose,
             'image-directory':output_directory}
         response.download(arguments)
         return None

     else:
-
+
         raise ValueError('Unrecognized method {}'.format(method))
-
+

 #%% Test driver

 if False:
-
+
     #%%
-
+
     paths = download_images(query='redunca',output_directory=output_folder,
-        limit=20,verbose=True)
+        limit=20,verbose=True)
megadetector/taxonomy_mapping/simple_image_download.py

@@ -49,7 +49,7 @@ def generate_urls(search):
     """
     Generate Google search URLs for all tokens in the list [search]
     """
-
+
     return [(BASE_URL+quote(word)+GOOGLE_PICTURE_ID) for word in search]


@@ -60,7 +60,7 @@ def check_webpage(url):
         if 'html' not in str(request.content):
             checked_url = request
     except Exception as err:
-        print(err)
+        print(err)
     return checked_url


@@ -68,7 +68,7 @@ def scan_webpage(webpage, extensions, timer):
     """
     Scan for pictures to download based on keywords
     """
-
+
     global SCANNER_COUNTER
     scanner = webpage.find
     found = False
@@ -143,7 +143,7 @@ class Downloader:
         urls_ = generate_urls(search)
         timer = timer if timer else 1000
         # max_progressbar = count * (list(range(limit+1))[-1]+1)
-
+
         # bar = progressbar.ProgressBar(maxval=max_progressbar,
         #     widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()]).start()
         i = 0
@@ -172,7 +172,7 @@ class Downloader:
             print('==='*15 + ' < ' + 'NO PICTURES FOUND' + ' > ' + '==='*15)
         return cache_out

-    def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False,
+    def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False,
                  timer=None):
         if not download_cache:
             content = self.search_urls(keywords, limit, verbose, cache, timer)
@@ -180,16 +180,16 @@ class Downloader:
             content = self._cached_urls
             if not content:
                 print('Downloader has not URLs saved in Memory yet, run Downloader.search_urls to find pics first')
-        paths = []
+        paths = []
         for name, (path, url) in content.items():
             fullpath = os.path.join(path, name)
             paths.append(fullpath)
             with open(fullpath, 'wb') as file:
                 file.write(url.content)
             if verbose:
-                print(f'File Name={name}, Downloaded from {url.url}')
+                print(f'File Name={name}, Downloaded from {url.url}')
         return paths
-
+
     def _create_directories(self, name):
         dir_path = os.path.join(self._directory, name)
         try:
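Pieced together from the calls that retrieve_sample_image.py makes against this class, a usage sketch for Downloader might look like the following; the output directory and query string are illustrative.

# Illustrative only: query/limit values and output path are made up
from megadetector.taxonomy_mapping import simple_image_download

downloader = simple_image_download.Downloader()
downloader.directory = '/tmp/image-download-test'   # where downloaded files land
paths = downloader.download('redunca', limit=20, verbose=True,
                            cache=False, download_cache=False)
print('Downloaded {} files'.format(len(paths)))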
megadetector/taxonomy_mapping/species_lookup.py

@@ -114,7 +114,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     if (not force_init) and (inat_taxonomy is not None):
         print('Skipping taxonomy re-init')
         return
-
+
     if (not force_init) and (os.path.isfile(serialized_structures_file)):

         print(f'De-serializing taxonomy data from {serialized_structures_file}')
@@ -135,7 +135,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
             gbif_vernacular_to_taxon_id,\
             gbif_taxon_id_to_scientific,\
             gbif_scientific_to_taxon_id = structures_to_serialize
-
+
         return


@@ -146,7 +146,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     for taxonomy_name, zip_url in taxonomy_urls.items():

         need_to_download = False
-
+
         if force_init:
             need_to_download = True

@@ -267,7 +267,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     # Build iNat dictionaries

     print('Building lookup dictionaries for iNat taxonomy')
-
+
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):

         taxon_id = row['taxonID']
@@ -286,7 +286,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     # Build GBIF dictionaries

     print('Building lookup dictionaries for GBIF taxonomy')
-
+
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):

         taxon_id = row['taxonID']
@@ -596,21 +596,21 @@ class TaxonomicMatch:


 hyphenated_terms = ['crowned', 'backed', 'throated', 'tailed', 'headed', 'cheeked',
-                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
+                    'ruffed', 'browed', 'eating', 'striped', 'shanked',
                     'fronted', 'bellied', 'spotted', 'eared', 'collared', 'breasted',
                     'necked']

 def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retry=True) -> TaxonomicMatch:
     """
-    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
+    Wrapper for _get_preferred_taxonomic_match, but expressing a variety of heuristics
     and preferences that are specific to our scenario.
-
+
     Args:
         query (str): The common or scientific name we want to look up
         taxonomy_preference (str, optional): 'inat' or 'gbif'
-        retry (bool, optional): if the initial lookup fails, should we try heuristic
+        retry (bool, optional): if the initial lookup fails, should we try heuristic
             substitutions, e.g. replacing "_" with " ", or "spp" with "species"?
-
+
     Returns:
         TaxonomicMatch: the best taxonomic match, or None
     """
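A usage sketch based on the docstring above; the query string is illustrative, and the assumption that initialize_taxonomy_lookup() needs to run first is an inference from that function's role.

# Illustrative only: look up a common name, preferring the iNat taxonomy
from megadetector.taxonomy_mapping.species_lookup import (
    get_preferred_taxonomic_match, initialize_taxonomy_lookup)

initialize_taxonomy_lookup()  # assumed prerequisite: loads/deserializes the iNat and GBIF tables
m = get_preferred_taxonomic_match('plains zebra', taxonomy_preference='inat')
if m is not None and len(m.scientific_name) > 0:
    print(m.scientific_name)        # scientific name of the best match
    print(m.taxonomy_string[0:50])  # stringified taxonomy, as stored in the mapping files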
@@ -618,31 +618,31 @@ def get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat', retr
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
     if (len(m.scientific_name) > 0) or (not retry):
         return m
-
+
     for s in hyphenated_terms:
         query = query.replace(' ' + s,'-' + s)
     m,query = _get_preferred_taxonomic_match(query=query,taxonomy_preference=taxonomy_preference)
     return m
-
-
+
+
 def validate_and_convert(data):
     """
     Recursively validates that all elements in the nested structure are only
     tuples, lists, ints, or np.int64, and converts np.int64 to int.
-
+
     Args:
         data: The nested structure to validate and convert
-
+
     Returns:
         The validated and converted structure
-
+
     Raises:
         TypeError: If an invalid type is encountered
     """
-
-    if isinstance(data, np.int64):
+
+    if isinstance(data, np.int64):
         return int(data)
-    elif isinstance(data, int) or isinstance(data, str):
+    elif isinstance(data, int) or isinstance(data, str):
         return data
     elif isinstance(data, (list, tuple)):
         # Process lists and tuples recursively
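The recursive branch is cut off in the hunk above; a minimal stand-alone sketch of the same conversion (not the package implementation) completes the idea.

# Illustrative only: recursively replace np.int64 with int, preserving list/tuple shape
import numpy as np

def to_plain_ints(data):
    if isinstance(data, np.int64):
        return int(data)
    elif isinstance(data, (int, str)):
        return data
    elif isinstance(data, (list, tuple)):
        converted = [to_plain_ints(item) for item in data]
        return tuple(converted) if isinstance(data, tuple) else converted
    else:
        raise TypeError('Invalid type: {}'.format(type(data)))

assert to_plain_ints([(np.int64(1), 'species')]) == [(1, 'species')]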
@@ -654,17 +654,17 @@ def validate_and_convert(data):

 # ...def validate_and_convert(...)

-
+
 def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') -> TaxonomicMatch:
-
+
     query = query.lower().strip().replace('_', ' ')
     query = query.replace('unidentified','')
     query = query.replace('unknown','')
     if query.endswith(' sp'):
         query = query.replace(' sp','')
     if query.endswith(' group'):
-        query = query.replace(' group','')
-
+        query = query.replace(' group','')
+
     query = query.strip()

     # query = 'person'
@@ -686,17 +686,17 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->

     n_inat_matches = len(inat_matches)
     n_gbif_matches = len(gbif_matches)
-
+
     selected_matches = None
-
+
     assert taxonomy_preference in ['gbif','inat'],\
         'Unrecognized taxonomy preference: {}'.format(taxonomy_preference)
-
+
     if n_inat_matches > 0 and taxonomy_preference == 'inat':
         selected_matches = 'inat'
     elif n_gbif_matches > 0:
         selected_matches = 'gbif'
-
+
     if selected_matches == 'inat':

         i_match = 0
@@ -802,7 +802,7 @@ def _get_preferred_taxonomic_match(query: str, taxonomy_preference = 'inat') ->
     # Convert np.int64's to ints
     if match is not None:
         match = validate_and_convert(match)
-
+
     taxonomy_string = str(match)

     return TaxonomicMatch(scientific_name, common_name, taxonomic_level, source,
@@ -828,15 +828,15 @@ if False:
     # print(matches)

     print_taxonomy_matches(matches,verbose=True)
-
+
     print('\n\n')
-
+
     # Print the taxonomy in the taxonomy spreadsheet format
     assert matches[1]['source'] == 'inat'
     t = str(matches[1]['taxonomy'])
     print(t)
     import clipboard; clipboard.copy(t)
-
+

     #%% Directly access the taxonomy tables

@@ -848,12 +848,12 @@ if False:

 #%% Command-line driver

-def main():
+def main(): # noqa

     # Read command line inputs (absolute path)
     parser = argparse.ArgumentParser()
     parser.add_argument('input_file')
-
+
     if len(sys.argv[1:]) == 0:
         parser.print_help()
         parser.exit()
|