megadetector-5.0.27-py3-none-any.whl → megadetector-5.0.29-py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
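To explore a diff like this locally, the two wheels can be fetched and compared directly. A minimal sketch, assuming both versions are still downloadable from PyPI; the helper names and the 'wheels' directory are illustrative:

import difflib
import pathlib
import subprocess
import zipfile

def fetch_wheel(version, dest='wheels'):
    # 'pip download --no-deps' fetches a wheel without installing it
    subprocess.run(['pip', 'download', 'megadetector==' + version,
                    '--no-deps', '--only-binary=:all:', '-d', dest], check=True)
    return next(pathlib.Path(dest).glob('megadetector-' + version + '-*.whl'))

def read_member(wheel_path, member):
    # Wheels are zip archives, so members can be read without extraction
    with zipfile.ZipFile(wheel_path) as z:
        return z.read(member).decode('utf-8').splitlines()

member = 'megadetector/taxonomy_mapping/preview_lila_taxonomy.py'
old = read_member(fetch_wheel('5.0.27'), member)
new = read_member(fetch_wheel('5.0.29'), member)
print('\n'.join(difflib.unified_diff(old, new, lineterm='')))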
- megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +23 -23
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -1
- megadetector/data_management/camtrap_dp_to_coco.py +45 -45
- megadetector/data_management/cct_json_utils.py +101 -101
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +189 -189
- megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
- megadetector/data_management/databases/integrity_check_json_db.py +202 -188
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +38 -38
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +130 -124
- megadetector/data_management/labelme_to_yolo.py +78 -72
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
- megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +70 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
- megadetector/data_management/mewc_to_md.py +339 -340
- megadetector/data_management/ocr_tools.py +258 -252
- megadetector/data_management/read_exif.py +232 -223
- megadetector/data_management/remap_coco_categories.py +26 -26
- megadetector/data_management/remove_exif.py +31 -20
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +41 -41
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +117 -120
- megadetector/data_management/yolo_to_coco.py +195 -188
- megadetector/detection/change_detection.py +831 -0
- megadetector/detection/process_video.py +341 -338
- megadetector/detection/pytorch_detector.py +308 -266
- megadetector/detection/run_detector.py +186 -166
- megadetector/detection/run_detector_batch.py +366 -364
- megadetector/detection/run_inference_with_yolov5_val.py +328 -325
- megadetector/detection/run_tiled_inference.py +312 -253
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +291 -283
- megadetector/postprocessing/add_max_conf.py +15 -11
- megadetector/postprocessing/categorize_detections_by_size.py +44 -44
- megadetector/postprocessing/classification_postprocessing.py +808 -311
- megadetector/postprocessing/combine_batch_outputs.py +20 -21
- megadetector/postprocessing/compare_batch_results.py +528 -517
- megadetector/postprocessing/convert_output_format.py +97 -97
- megadetector/postprocessing/create_crop_folder.py +220 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -0
- megadetector/postprocessing/load_api_results.py +25 -22
- megadetector/postprocessing/md_to_coco.py +129 -98
- megadetector/postprocessing/md_to_labelme.py +89 -83
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +87 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -302
- megadetector/postprocessing/remap_detection_categories.py +36 -36
- megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -69
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +33 -33
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +11 -11
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/azure_utils.py +22 -22
- megadetector/utils/ct_utils.py +1019 -200
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +541 -518
- megadetector/utils/path_utils.py +1511 -406
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/sas_blob_utils.py +53 -49
- megadetector/utils/split_locations_into_train_val.py +73 -60
- megadetector/utils/string_utils.py +147 -26
- megadetector/utils/url_utils.py +463 -173
- megadetector/utils/wi_utils.py +2629 -2868
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +21 -21
- megadetector/visualization/render_images_with_thumbnails.py +37 -73
- megadetector/visualization/visualization_utils.py +424 -404
- megadetector/visualization/visualize_db.py +197 -190
- megadetector/visualization/visualize_detector_output.py +126 -98
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/METADATA +6 -3
- megadetector-5.0.29.dist-info/RECORD +163 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector-5.0.27.dist-info/RECORD +0 -208
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
megadetector/taxonomy_mapping/preview_lila_taxonomy.py

(Most of the paired -/+ lines in the hunks below differ only in trailing whitespace.)

@@ -27,20 +27,20 @@ html_output_file = os.path.join(preview_base,'index.html')
 
 def parse_taxonomy_string(taxonomy_string):
 
-    taxonomic_match = eval(taxonomy_string)
+    taxonomic_match = eval(taxonomy_string)
     matched_entity = taxonomic_match[0]
     assert len(matched_entity) == 4
-
+
     level = matched_entity[1]
-
+
     scientific_name = matched_entity[2]
-
+
     common_names = matched_entity[3]
     if len(common_names) == 1:
         common_name = common_names[0]
     else:
         common_name = str(common_names)
-
+
     return scientific_name,common_name,level,taxonomic_match
 
 def taxonomy_string_to_common_name(taxonomy_string):
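One note on the hunk above: parse_taxonomy_string still deserializes the cached string with eval, so the whitespace cleanup leaves behavior unchanged. If taxonomy_string holds a printed Python literal, as the indexing after the eval suggests, ast.literal_eval would accept the same input while refusing arbitrary expressions. A hedged sketch of that variant, not code from the package:

import ast

def parse_taxonomy_string_literal(taxonomy_string):
    # ast.literal_eval parses literals only and executes no code; the field
    # layout (level, scientific name, common names) mirrors the hunk above.
    taxonomic_match = ast.literal_eval(taxonomy_string)
    matched_entity = taxonomic_match[0]
    assert len(matched_entity) == 4
    level = matched_entity[1]
    scientific_name = matched_entity[2]
    common_names = matched_entity[3]
    common_name = common_names[0] if len(common_names) == 1 else str(common_names)
    return scientific_name, common_name, level, taxonomic_match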
@@ -66,9 +66,6 @@ df = pd.read_csv(lila_taxonomy_file)
 from megadetector.taxonomy_mapping.species_lookup import \
     initialize_taxonomy_lookup, get_preferred_taxonomic_match
 
-# from taxonomy_mapping.species_lookup import (
-#     get_taxonomic_info, print_taxonomy_matche)
-
 initialize_taxonomy_lookup()
 
 
@@ -82,14 +79,14 @@ n_taxonomy_changes = 0
 
 # Look for internal inconsistency
 for i_row,row in df.iterrows():
-
+
     sn = row['scientific_name']
     if not isinstance(sn,str):
         continue
-
-    ts = row['taxonomy_string']
+
+    ts = row['taxonomy_string']
     assert sn == taxonomy_string_to_scientific(ts)
-
+
     assert row['taxonomy_level'] == taxonomy_string_to_level(ts)
 
 # Look for outdated mappings
@@ -97,18 +94,18 @@ taxonomy_preference = 'inat'
 
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in tqdm(df.iterrows(),total=len(df)):
-
+
     sn = row['scientific_name']
     if not isinstance(sn,str):
         continue
-
+
     m = get_preferred_taxonomic_match(sn,taxonomy_preference)
     assert m.scientific_name == sn
-
+
     ts = row['taxonomy_string']
     assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
         row['dataset_name'],ts,m.taxonomy_string)
-
+
     if ts != m.taxonomy_string:
         n_taxonomy_changes += 1
         df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
@@ -164,45 +161,45 @@ suppress_multiple_matches = [
     ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
     ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
     ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
-
+
     ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
     ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
     ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
     ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
     ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
-
+
     ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
     ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
     ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
     ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
     ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
-
+
     ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
     ['kudu','Snapshot Serengeti','Snapshot Kruger'],
     ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
     ['kudu','Snapshot Serengeti','Snapshot Karoo'],
     ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
-
+
     ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
     ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
     ['fox','Idaho Camera Traps','Caltech Camera Traps'],
-
+
     ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
-
+
     ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
     ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
-
+
     ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
-
+
 ]
 
 for i_row,row in df.iterrows():
-
+
     query = row['query']
     taxonomy_string = row['taxonomy_string']
-
+
     for previous_i_row in query_to_rows[query]:
-
+
         previous_row = df.iloc[previous_i_row]
         assert previous_row['query'] == query
         query_match = False
@@ -212,11 +209,11 @@ for i_row,row in df.iterrows():
             query_match = isnan(row['taxonomy_string'])
         else:
             query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
-
+
         if not query_match:
-
+
             suppress = False
-
+
             # x = suppress_multiple_matches[-1]
             for x in suppress_multiple_matches:
                 if x[0] == query and \
@@ -228,18 +225,18 @@ for i_row,row in df.iterrows():
                     suppress = True
                     n_suppressed += 1
                     break
-
+
             if not suppress:
                 print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
                     query, row['dataset_name'], previous_row['dataset_name'],
                     taxonomy_string, previous_row['taxonomy_string']))
-
+
                 queries_with_multiple_mappings.add(query)
-
+
     # ...for each row where we saw this query
-
+
     query_to_rows[query].append(i_row)
-
+
 # ...for each row
 
 print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
@@ -270,9 +267,9 @@ for i_row,row in df.iterrows():
         ) \
         and \
         ('species' in level):
-
+
         if query not in allowable_unknown_species:
-
+
             print('Warning: query {}:{} maps to {} {}'.format(
                 row['dataset_name'],
                 row['query'],
@@ -288,7 +285,7 @@ for i_row,row in df.iterrows():
     if 'source' in row:
         assert isinstance(row['source'],str)
     assert isinstance(row['taxonomy_level'],str)
-
+
 
 #%% Find WCS mappings that aren't species or aren't the same as the input
 
@@ -297,22 +294,22 @@ for i_row,row in df.iterrows():
 
 # row = df.iloc[-500]
 for i_row,row in df.iterrows():
-
+
     if not isinstance(row['scientific_name'],str):
         continue
     if 'WCS' not in row['dataset_name']:
         continue
-
+
     query = row['query']
     scientific_name = row['scientific_name']
     common_name = row['common_name']
-    level = row['taxonomy_level']
+    level = row['taxonomy_level']
     taxonomy_string = row['taxonomy_string']
-
-    common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
+
+    common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
     query_string = query.replace(' sp','')
     query_string = query_string.replace('unknown ','')
-
+
     # Anything marked "species" or "unknown" by definition doesn't map to a species,
     # so ignore these.
     if (' sp' not in query) and ('unknown' not in query) and \
@@ -320,7 +317,7 @@ for i_row,row in df.iterrows():
         print('WCS query {} ({}) remapped to {} {} ({})'.format(
             query,common_name,level,scientific_name,common_name_from_taxonomy))
 
-    if query_string != scientific_name:
+    if query_string != scientific_name:
         pass
         # print('WCS query {} ({}) remapped to {} ({})'.format(
         #     query,common_name,scientific_name,common_names_from_taxonomy))
@@ -348,20 +345,20 @@ min_valid_image_size = 3000
 #
 # i_row = 0; row = df.iloc[i_row]
 for i_row,row in df.iterrows():
-
+
     s = row['scientific_name']
-
+
     if (not isinstance(s,str)) or (len(s)==0):
         continue
-
+
     query = s.replace(' ','+')
-
+
     if query in remapped_queries:
         query = remapped_queries[query]
-
+
     query_folder = os.path.join(image_base,query)
     os.makedirs(query_folder,exist_ok=True)
-
+
     # Check whether we already have enough images for this query
     image_files = os.listdir(query_folder)
     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
@@ -374,7 +371,7 @@ for i_row,row in df.iterrows():
     # Check whether we've already run this query for a previous row
     if query in scientific_name_to_paths:
         continue
-
+
     print('Processing query {} of {} ({})'.format(i_row,len(df),query))
     paths = retrieve_sample_image.download_images(query=query,
         output_directory=image_base,
@@ -407,40 +404,40 @@ scientific_name_to_preferred_images = {}
 
 # s = list(scientific_name_to_paths.keys())[0]
 for s in list(df.scientific_name):
-
+
     if not isinstance(s,str):
         continue
-
+
     query = s.replace(' ','+')
-
+
     if query in remapped_queries:
         query = remapped_queries[query]
-
+
     query_folder = os.path.join(image_base,query)
     assert os.path.isdir(query_folder)
     image_files = os.listdir(query_folder)
-    image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+    image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
     sizes = [os.path.getsize(p) for p in image_fullpaths]
     path_to_size = {}
     for i_fp,fp in enumerate(image_fullpaths):
         path_to_size[fp] = sizes[i_fp]
     paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
-
+
     # Be suspicious of duplicate sizes
     b_duplicate_sizes = [False] * len(paths_by_size)
-
+
     for i_path,p in enumerate(paths_by_size):
         if i_path == len(paths_by_size) - 1:
             continue
         if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
             b_duplicate_sizes[i_path] = True
-
+
     paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
-
+
     preferred_paths = paths_by_size_non_dup[:max_images_per_query]
     scientific_name_to_preferred_images[s] = preferred_paths
 
-# ...for each scientific name
+# ...for each scientific name
 
 
 #%% Delete unused images
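The "be suspicious of duplicate sizes" block in the hunk above is easy to lose in the whitespace noise. Distilled into a standalone function, it amounts to the sketch below; this is an interpretation of the hunk with illustrative names, not code from the package. Files whose byte size equals the next entry in the size-sorted list are treated as probable duplicates and dropped before the top images are kept.

import os

def preferred_images(query_folder, max_images_per_query):
    # Sort largest-first; a file whose size matches the next file in the
    # sorted list is treated as a likely duplicate and skipped.
    paths = [os.path.join(query_folder, fn) for fn in os.listdir(query_folder)]
    paths.sort(key=os.path.getsize, reverse=True)
    keep = []
    for i, p in enumerate(paths):
        is_duplicate = (i < len(paths) - 1 and
                        os.path.getsize(p) == os.path.getsize(paths[i + 1]))
        if not is_duplicate:
            keep.append(p)
    return keep[:max_images_per_query]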
@@ -448,7 +445,7 @@ for s in list(df.scientific_name):
 used_images = []
 for images in scientific_name_to_preferred_images.values():
     used_images.extend(images)
-
+
 print('Using a total of {} images'.format(len(used_images)))
 used_images_set = set(used_images)
 
@@ -464,18 +461,18 @@ print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images
     len(all_images) - len(unused_images)))
 
 for fn in tqdm(unused_images):
-    os.remove(fn)
+    os.remove(fn)
 
 
 #%% Produce HTML preview
 
 with open(html_output_file, 'w', encoding='utf-8') as f:
-
+
     f.write('<html><head></head><body>\n')
 
     names = scientific_name_to_preferred_images.keys()
     names = sorted(names)
-
+
     f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
         'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name (taxonomic_common_name) (manual_common_name)</p>\n'
         '</p>')
@@ -484,10 +481,10 @@ with open(html_output_file, 'w', encoding='utf-8') as f:
     for i_row, row in tqdm(df.iterrows(), total=len(df)):
 
         s = row['scientific_name']
-
+
         taxonomy_string = row['taxonomy_string']
         if isinstance(taxonomy_string,str):
-            taxonomic_match = eval(taxonomy_string)
+            taxonomic_match = eval(taxonomy_string)
             matched_entity = taxonomic_match[0]
             assert len(matched_entity) == 4
             common_names = matched_entity[3]
@@ -502,7 +499,7 @@ with open(html_output_file, 'w', encoding='utf-8') as f:
 
         if isinstance(row.scientific_name,str):
             output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
-                row.dataset_name, row.query,
+                row.dataset_name, row.query,
                 row.taxonomy_level, row.scientific_name, common_name_string,
                 row.common_name)
             f.write(output_string)
megadetector/taxonomy_mapping/retrieve_sample_image.py

@@ -17,21 +17,21 @@ import os
 
 output_folder = os.path.expanduser('~/tmp/image-download-test')
 os.makedirs(output_folder,exist_ok=True)
-
+
 method = 'simple_image_download' # 'google_images_download'
 
 if method == 'simple_image_download':
-
+
     from megadetector.taxonomy_mapping import simple_image_download
     google_image_downloader = simple_image_download.Downloader()
     google_image_downloader.directory = output_folder
-
+
 elif method == 'google_images_download':
-
+
     from google_images_download import google_images_download
 
 else:
-
+
     raise ValueError('Unrecognized method {}'.format(method))
 
 
@@ -39,33 +39,33 @@ else:
 
 def download_images(query,output_directory,limit=100,verbose=False):
 
-    query = query.replace(' ','+')
-
+    query = query.replace(' ','+')
+
     if method == 'simple_image_download':
-
+
         google_image_downloader.directory = output_directory
         paths = google_image_downloader.download(query, limit=limit,
             verbose=verbose, cache=False, download_cache=False)
         return paths
-
+
     elif method == 'google_images_download':
-
-        response = google_images_download.googleimagesdownload()
+
+        response = google_images_download.googleimagesdownload()
         arguments = {'keywords':query,'limit':limit,'print_urls':verbose,
             'image-directory':output_directory}
         response.download(arguments)
         return None
 
     else:
-
+
         raise ValueError('Unrecognized method {}'.format(method))
-
+
 
 #%% Test driver
 
 if False:
-
+
     #%%
-
+
     paths = download_images(query='redunca',output_directory=output_folder,
-        limit=20,verbose=True)
+        limit=20,verbose=True)
megadetector/taxonomy_mapping/simple_image_download.py

@@ -49,7 +49,7 @@ def generate_urls(search):
     """
     Generate Google search URLs for all tokens in the list [search]
     """
-
+
     return [(BASE_URL+quote(word)+GOOGLE_PICTURE_ID) for word in search]
 
 
@@ -60,7 +60,7 @@ def check_webpage(url):
         if 'html' not in str(request.content):
             checked_url = request
     except Exception as err:
-        print(err)
+        print(err)
     return checked_url
 
 
@@ -68,7 +68,7 @@ def scan_webpage(webpage, extensions, timer):
     """
     Scan for pictures to download based on keywords
    """
-
+
    global SCANNER_COUNTER
    scanner = webpage.find
    found = False
@@ -143,7 +143,7 @@ class Downloader:
        urls_ = generate_urls(search)
        timer = timer if timer else 1000
        # max_progressbar = count * (list(range(limit+1))[-1]+1)
-
+
        # bar = progressbar.ProgressBar(maxval=max_progressbar,
        #     widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()]).start()
        i = 0
@@ -172,7 +172,7 @@ class Downloader:
            print('==='*15 + ' < ' + 'NO PICTURES FOUND' + ' > ' + '==='*15)
        return cache_out
 
-    def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False,
+    def download(self, keywords=None, limit=1, verbose=False, cache=True, download_cache=False,
                 timer=None):
        if not download_cache:
            content = self.search_urls(keywords, limit, verbose, cache, timer)
@@ -180,16 +180,16 @@ class Downloader:
            content = self._cached_urls
        if not content:
            print('Downloader has not URLs saved in Memory yet, run Downloader.search_urls to find pics first')
-        paths = []
+        paths = []
        for name, (path, url) in content.items():
            fullpath = os.path.join(path, name)
            paths.append(fullpath)
            with open(fullpath, 'wb') as file:
                file.write(url.content)
            if verbose:
-                print(f'File Name={name}, Downloaded from {url.url}')
+                print(f'File Name={name}, Downloaded from {url.url}')
        return paths
-
+
    def _create_directories(self, name):
        dir_path = os.path.join(self._directory, name)
        try:
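For reference, a short usage sketch for the Downloader class touched above, mirroring how the retrieve_sample_image.py hunks earlier in this diff drive it; the query, directory, and limit are illustrative values:

from megadetector.taxonomy_mapping import simple_image_download

downloader = simple_image_download.Downloader()
downloader.directory = '/tmp/image-download-test'
# cache=False / download_cache=False matches the call in download_images above
paths = downloader.download('redunca', limit=5, verbose=True,
                            cache=False, download_cache=False)
print('Downloaded {} files'.format(len(paths)))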
|