megadetector-5.0.8-py3-none-any.whl → megadetector-5.0.10-py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.
- api/__init__.py +0 -0
- api/batch_processing/__init__.py +0 -0
- api/batch_processing/api_core/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/__init__.py +0 -0
- api/batch_processing/api_core/batch_service/score.py +0 -1
- api/batch_processing/api_core/server_job_status_table.py +0 -1
- api/batch_processing/api_core_support/__init__.py +0 -0
- api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
- api/batch_processing/api_support/__init__.py +0 -0
- api/batch_processing/api_support/summarize_daily_activity.py +0 -1
- api/batch_processing/data_preparation/__init__.py +0 -0
- api/batch_processing/data_preparation/manage_local_batch.py +65 -65
- api/batch_processing/data_preparation/manage_video_batch.py +8 -8
- api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
- api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
- api/batch_processing/postprocessing/__init__.py +0 -0
- api/batch_processing/postprocessing/add_max_conf.py +12 -12
- api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
- api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
- api/batch_processing/postprocessing/compare_batch_results.py +113 -43
- api/batch_processing/postprocessing/convert_output_format.py +41 -16
- api/batch_processing/postprocessing/load_api_results.py +16 -17
- api/batch_processing/postprocessing/md_to_coco.py +31 -21
- api/batch_processing/postprocessing/md_to_labelme.py +52 -22
- api/batch_processing/postprocessing/merge_detections.py +14 -14
- api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
- api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
- api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
- api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
- api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
- api/synchronous/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
- api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
- api/synchronous/api_core/animal_detection_api/config.py +35 -35
- api/synchronous/api_core/tests/__init__.py +0 -0
- api/synchronous/api_core/tests/load_test.py +109 -109
- classification/__init__.py +0 -0
- classification/aggregate_classifier_probs.py +21 -24
- classification/analyze_failed_images.py +11 -13
- classification/cache_batchapi_outputs.py +51 -51
- classification/create_classification_dataset.py +69 -68
- classification/crop_detections.py +54 -53
- classification/csv_to_json.py +97 -100
- classification/detect_and_crop.py +105 -105
- classification/evaluate_model.py +43 -42
- classification/identify_mislabeled_candidates.py +47 -46
- classification/json_to_azcopy_list.py +10 -10
- classification/json_validator.py +72 -71
- classification/map_classification_categories.py +44 -43
- classification/merge_classification_detection_output.py +68 -68
- classification/prepare_classification_script.py +157 -154
- classification/prepare_classification_script_mc.py +228 -228
- classification/run_classifier.py +27 -26
- classification/save_mislabeled.py +30 -30
- classification/train_classifier.py +20 -20
- classification/train_classifier_tf.py +21 -22
- classification/train_utils.py +10 -10
- data_management/__init__.py +0 -0
- data_management/annotations/__init__.py +0 -0
- data_management/annotations/annotation_constants.py +18 -31
- data_management/camtrap_dp_to_coco.py +238 -0
- data_management/cct_json_utils.py +102 -59
- data_management/cct_to_md.py +176 -158
- data_management/cct_to_wi.py +247 -219
- data_management/coco_to_labelme.py +272 -263
- data_management/coco_to_yolo.py +79 -58
- data_management/databases/__init__.py +0 -0
- data_management/databases/add_width_and_height_to_db.py +20 -16
- data_management/databases/combine_coco_camera_traps_files.py +35 -31
- data_management/databases/integrity_check_json_db.py +62 -24
- data_management/databases/subset_json_db.py +24 -15
- data_management/generate_crops_from_cct.py +27 -45
- data_management/get_image_sizes.py +188 -162
- data_management/importers/add_nacti_sizes.py +8 -8
- data_management/importers/add_timestamps_to_icct.py +78 -78
- data_management/importers/animl_results_to_md_results.py +158 -158
- data_management/importers/auckland_doc_test_to_json.py +9 -9
- data_management/importers/auckland_doc_to_json.py +8 -8
- data_management/importers/awc_to_json.py +7 -7
- data_management/importers/bellevue_to_json.py +15 -15
- data_management/importers/cacophony-thermal-importer.py +13 -13
- data_management/importers/carrizo_shrubfree_2018.py +8 -8
- data_management/importers/carrizo_trail_cam_2017.py +8 -8
- data_management/importers/cct_field_adjustments.py +9 -9
- data_management/importers/channel_islands_to_cct.py +10 -10
- data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
- data_management/importers/ena24_to_json.py +7 -7
- data_management/importers/filenames_to_json.py +8 -8
- data_management/importers/helena_to_cct.py +7 -7
- data_management/importers/idaho-camera-traps.py +7 -7
- data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
- data_management/importers/jb_csv_to_json.py +9 -9
- data_management/importers/mcgill_to_json.py +8 -8
- data_management/importers/missouri_to_json.py +18 -18
- data_management/importers/nacti_fieldname_adjustments.py +10 -10
- data_management/importers/noaa_seals_2019.py +7 -7
- data_management/importers/pc_to_json.py +7 -7
- data_management/importers/plot_wni_giraffes.py +7 -7
- data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
- data_management/importers/prepare_zsl_imerit.py +7 -7
- data_management/importers/rspb_to_json.py +8 -8
- data_management/importers/save_the_elephants_survey_A.py +8 -8
- data_management/importers/save_the_elephants_survey_B.py +9 -9
- data_management/importers/snapshot_safari_importer.py +26 -26
- data_management/importers/snapshot_safari_importer_reprise.py +665 -665
- data_management/importers/snapshot_serengeti_lila.py +14 -14
- data_management/importers/sulross_get_exif.py +8 -9
- data_management/importers/timelapse_csv_set_to_json.py +11 -11
- data_management/importers/ubc_to_json.py +13 -13
- data_management/importers/umn_to_json.py +7 -7
- data_management/importers/wellington_to_json.py +8 -8
- data_management/importers/wi_to_json.py +9 -9
- data_management/importers/zamba_results_to_md_results.py +181 -181
- data_management/labelme_to_coco.py +65 -24
- data_management/labelme_to_yolo.py +8 -8
- data_management/lila/__init__.py +0 -0
- data_management/lila/add_locations_to_island_camera_traps.py +9 -9
- data_management/lila/add_locations_to_nacti.py +147 -147
- data_management/lila/create_lila_blank_set.py +13 -13
- data_management/lila/create_lila_test_set.py +8 -8
- data_management/lila/create_links_to_md_results_files.py +106 -106
- data_management/lila/download_lila_subset.py +44 -110
- data_management/lila/generate_lila_per_image_labels.py +55 -42
- data_management/lila/get_lila_annotation_counts.py +18 -15
- data_management/lila/get_lila_image_counts.py +11 -11
- data_management/lila/lila_common.py +96 -33
- data_management/lila/test_lila_metadata_urls.py +132 -116
- data_management/ocr_tools.py +173 -128
- data_management/read_exif.py +110 -97
- data_management/remap_coco_categories.py +83 -83
- data_management/remove_exif.py +58 -62
- data_management/resize_coco_dataset.py +30 -23
- data_management/wi_download_csv_to_coco.py +246 -239
- data_management/yolo_output_to_md_output.py +86 -73
- data_management/yolo_to_coco.py +300 -60
- detection/__init__.py +0 -0
- detection/detector_training/__init__.py +0 -0
- detection/process_video.py +85 -33
- detection/pytorch_detector.py +43 -25
- detection/run_detector.py +157 -72
- detection/run_detector_batch.py +179 -113
- detection/run_inference_with_yolov5_val.py +108 -48
- detection/run_tiled_inference.py +111 -40
- detection/tf_detector.py +51 -29
- detection/video_utils.py +606 -521
- docs/source/conf.py +43 -0
- md_utils/__init__.py +0 -0
- md_utils/azure_utils.py +9 -9
- md_utils/ct_utils.py +228 -68
- md_utils/directory_listing.py +59 -64
- md_utils/md_tests.py +968 -871
- md_utils/path_utils.py +460 -134
- md_utils/process_utils.py +157 -133
- md_utils/sas_blob_utils.py +20 -20
- md_utils/split_locations_into_train_val.py +45 -32
- md_utils/string_utils.py +33 -10
- md_utils/url_utils.py +176 -60
- md_utils/write_html_image_list.py +40 -33
- md_visualization/__init__.py +0 -0
- md_visualization/plot_utils.py +102 -109
- md_visualization/render_images_with_thumbnails.py +34 -34
- md_visualization/visualization_utils.py +597 -291
- md_visualization/visualize_db.py +76 -48
- md_visualization/visualize_detector_output.py +61 -42
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/METADATA +13 -7
- megadetector-5.0.10.dist-info/RECORD +224 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/top_level.txt +1 -0
- taxonomy_mapping/__init__.py +0 -0
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
- taxonomy_mapping/map_new_lila_datasets.py +154 -154
- taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
- taxonomy_mapping/preview_lila_taxonomy.py +591 -591
- taxonomy_mapping/retrieve_sample_image.py +12 -12
- taxonomy_mapping/simple_image_download.py +11 -11
- taxonomy_mapping/species_lookup.py +10 -10
- taxonomy_mapping/taxonomy_csv_checker.py +18 -18
- taxonomy_mapping/taxonomy_graph.py +47 -47
- taxonomy_mapping/validate_lila_category_mappings.py +83 -76
- data_management/cct_json_to_filename_json.py +0 -89
- data_management/cct_to_csv.py +0 -140
- data_management/databases/remove_corrupted_images_from_db.py +0 -191
- detection/detector_training/copy_checkpoints.py +0 -43
- megadetector-5.0.8.dist-info/RECORD +0 -205
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/LICENSE +0 -0
- {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/WHEEL +0 -0
data_management/lila/create_links_to_md_results_files.py

```diff
@@ -1,106 +1,106 @@
-
-
-
-
-
-
-
-
-#%% Imports and constants
-
-import os
-
-import pandas as pd
-
-input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
-output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
-
-md_results_local_folder = r'g:\temp\lila-md-results'
-md_base_url = 'https://lila.science/public/lila-md-results/'
-assert md_base_url.endswith('/')
-
-# No RDE files for datasets with no location information
-datasets_without_location_info = ('ena24','missouri-camera-traps')
-
-md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
-
-validate_urls = False
-
-
-#%% Read input data
-
-df = pd.read_csv(input_csv_file)
-for s in md_results_column_names:
-    df[s] = ''
-
-
-#%% Find matching files locally, and create URLs
-
-local_files = os.listdir(md_results_local_folder)
-local_files = [fn for fn in local_files if fn.endswith('.zip')]
-
-# i_row = 0; row = df.iloc[i_row]
-for i_row,row in df.iterrows():
-
-    if not isinstance(row['name'],str):
-        continue
-
-    dataset_shortname = row['short_name']
-    matching_files = [fn for fn in local_files if dataset_shortname in fn]
-
-    # No RDE files for datasets with no location information
-    if dataset_shortname in datasets_without_location_info:
-        assert len(matching_files) == 2
-        mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
-        mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
-        assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
-        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
-        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
-    else:
-        # Exclude single-season files for snapshot-serengeti
-        if dataset_shortname == 'snapshot-serengeti':
-            matching_files = [fn for fn in matching_files if '_S' not in fn]
-            assert len(matching_files) == 2
-            assert all(['mdv4' in fn for fn in matching_files])
-            rde_files = [fn for fn in matching_files if 'rde' in fn]
-            raw_files = [fn for fn in matching_files if 'rde' not in fn]
-            assert len(rde_files) == 1 and len(raw_files) == 1
-            df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
-            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
-        else:
-            assert len(matching_files) == 3
-            mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
-            mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
-            rde_files = [fn for fn in matching_files if 'rde' in fn]
-            assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
-            df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
-            df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
-            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
-
-    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
-
-# ...for each row
-
-
-#%% Validate URLs
-
-if validate_urls:
-
-    from md_utils.url_utils import test_urls
-
-    urls = set()
-
-    for i_row,row in df.iterrows():
-        for column_name in md_results_column_names:
-            if len(row[column_name]) > 0:
-                assert row[column_name] not in urls
-                urls.add(row[column_name])
-
-    test_urls(urls,error_on_failure=True)
-
-    print('Validated {} URLs'.format(len(urls)))
-
-
-#%% Write new .csv file
-
-df.to_csv(output_csv_file,header=True,index=False)
+"""
+
+create_links_to_md_results_files.py
+
+One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+
+"""
+
+#%% Imports and constants
+
+import os
+
+import pandas as pd
+
+input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
+output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
+
+md_results_local_folder = r'g:\temp\lila-md-results'
+md_base_url = 'https://lila.science/public/lila-md-results/'
+assert md_base_url.endswith('/')
+
+# No RDE files for datasets with no location information
+datasets_without_location_info = ('ena24','missouri-camera-traps')
+
+md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+validate_urls = False
+
+
+#%% Read input data
+
+df = pd.read_csv(input_csv_file)
+for s in md_results_column_names:
+    df[s] = ''
+
+
+#%% Find matching files locally, and create URLs
+
+local_files = os.listdir(md_results_local_folder)
+local_files = [fn for fn in local_files if fn.endswith('.zip')]
+
+# i_row = 0; row = df.iloc[i_row]
+for i_row,row in df.iterrows():
+
+    if not isinstance(row['name'],str):
+        continue
+
+    dataset_shortname = row['short_name']
+    matching_files = [fn for fn in local_files if dataset_shortname in fn]
+
+    # No RDE files for datasets with no location information
+    if dataset_shortname in datasets_without_location_info:
+        assert len(matching_files) == 2
+        mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
+        mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
+        assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
+        df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+        df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+    else:
+        # Exclude single-season files for snapshot-serengeti
+        if dataset_shortname == 'snapshot-serengeti':
+            matching_files = [fn for fn in matching_files if '_S' not in fn]
+            assert len(matching_files) == 2
+            assert all(['mdv4' in fn for fn in matching_files])
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            raw_files = [fn for fn in matching_files if 'rde' not in fn]
+            assert len(rde_files) == 1 and len(raw_files) == 1
+            df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+        else:
+            assert len(matching_files) == 3
+            mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
+            mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
+            rde_files = [fn for fn in matching_files if 'rde' in fn]
+            assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
+            df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+            df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+            df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+
+    print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
+
+# ...for each row
+
+
+#%% Validate URLs
+
+if validate_urls:
+
+    from md_utils.url_utils import test_urls
+
+    urls = set()
+
+    for i_row,row in df.iterrows():
+        for column_name in md_results_column_names:
+            if len(row[column_name]) > 0:
+                assert row[column_name] not in urls
+                urls.add(row[column_name])
+
+    test_urls(urls,error_on_failure=True)
+
+    print('Validated {} URLs'.format(len(urls)))
+
+
+#%% Write new .csv file
+
+df.to_csv(output_csv_file,header=True,index=False)
```
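For reference, the validation cell in the script above checks its generated links with `test_urls` from `md_utils.url_utils`. A minimal standalone sketch of that call, assuming hypothetical result-file names (only the function name, the `error_on_failure` argument, and the base URL are taken from the script; the zip filenames are placeholders):

```python
# Sketch only: verify that a couple of (hypothetical) MD-results URLs resolve,
# mirroring the validate_urls cell in the script above.
from md_utils.url_utils import test_urls

urls = {
    'https://lila.science/public/lila-md-results/example-dataset.mdv5a.json.zip',  # placeholder
    'https://lila.science/public/lila-md-results/example-dataset.mdv5b.json.zip',  # placeholder
}

# error_on_failure=True is how the script above invokes it; presumably this
# raises on the first failing URL rather than just reporting it.
test_urls(urls, error_on_failure=True)
```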
data_management/lila/download_lila_subset.py

```diff
@@ -1,17 +1,11 @@
-
-
-
-
-
-
-
-
-# what you want to query for, etc., is very application-specific; this is just meant as a
-# demo.
-#
-# Can download from GCP (all datasets), AWS (all datasets), or Azure (most datasets).
-#
-########
+"""
+
+download_lila_subset.py
+
+Example of how to download a list of files from LILA, e.g. all the files
+in a data set corresponding to a particular species.
+
+"""
 
 #%% Constants and imports
 
@@ -19,11 +13,9 @@ import os
 import random
 
 from tqdm import tqdm
-from multiprocessing.pool import ThreadPool
 from collections import defaultdict
 
 from data_management.lila.lila_common import read_lila_all_images_file, is_empty, lila_base_urls
-from md_utils.url_utils import download_url
 
 for s in lila_base_urls.values():
     assert s.endswith('/')
@@ -43,70 +35,22 @@ os.makedirs(output_dir,exist_ok=True)
 # Number of concurrent download threads
 n_download_threads = 20
 
-verbose = False
-
 max_images_per_dataset = 10 # None
 
-# This impacts the data download, but not the metadata download
-#
-# Setting this to "Azure" really means "Azure if available"; some datasets are
-# not available on Azure.
 preferred_provider = 'gcp' # 'azure', 'gcp', 'aws'
 
 random.seed(0)
 
 
-#%% Support functions
-
-def download_relative_url(relative_url, output_base, provider='gcp',
-                          verbose=False, overwrite=False):
-    """
-    Download a URL to output_base, preserving the path relative to the common LILA root.
-    """
-
-    assert not relative_url.startswith('/')
-
-    # Not all datasets are available on Azure, fall back in these cases. The decision
-    # to fall back to GCP rather than AWS is arbitrary.
-    if provider == 'azure':
-        nominal_provider = relative_url_to_nominal_provider[relative_url]
-        if nominal_provider != 'azure':
-            if verbose:
-                print('URL {} not available on Azure, falling back to GCP'.format(
-                    relative_url))
-            provider = 'gcp'
-
-    url = lila_base_urls[provider] + relative_url
-
-    result = {'status':'unknown','url':url,'destination_filename':None}
-
-    destination_filename = os.path.join(output_base,relative_url)
-    result['destination_filename'] = destination_filename
-
-    if ((os.path.isfile(destination_filename)) and (not overwrite)):
-        result['status'] = 'skipped'
-        return result
-    try:
-        download_url(url, destination_filename, verbose=verbose, force_download=overwrite)
-    except Exception as e:
-        print('Warning: error downloading URL {}: {}'.format(
-            url,str(e)))
-        result['status'] = 'error: {}'.format(str(e))
-        return result
-
-    result['status'] = 'success'
-    return result
-
-
 #%% Download and open the giant table of image URLs and labels
 
-# ~60 seconds to download, unzip, and open
+# Takes ~60 seconds to download, unzip, and open
 df = read_lila_all_images_file(metadata_dir)
 
 
 #%% Find all the images we want to download
 
-# ~2 minutes
+# Takes ~2 minutes
 
 common_name_to_count = defaultdict(int)
 
@@ -119,6 +63,8 @@ def find_items(row):
 
     match = False
 
+    # This is the only bit of this file that's specific to a particular query. In this case
+    # we're checking whether each row is on a list of species of interest, but you do you.
     for species_name in species_of_interest:
         if species_name in row['common_name']:
             match = True
@@ -126,7 +72,7 @@ def find_items(row):
             break
 
     if match:
-        ds_name_to_urls[row['dataset_name']].append(row['
+        ds_name_to_urls[row['dataset_name']].append(row['url_' + preferred_provider])
 
 tqdm.pandas()
 _ = df.progress_apply(find_items,axis=1)
@@ -154,58 +100,47 @@ else:
     ds_name_to_urls[ds_name] = random.sample(ds_name_to_urls[ds_name],max_images_per_dataset)
 
 
-#%%
+#%% Choose target files for each URL
 
-
-all_urls = [item for sublist in all_urls for item in sublist]
+from data_management.lila.lila_common import lila_base_urls
 
-
+# We have a list of URLs per dataset, flatten that into a single list of URLs
+urls_to_download = set()
+for ds_name in ds_name_to_urls:
+    for url in ds_name_to_urls[ds_name]:
+        urls_to_download.add(url)
+urls_to_download = sorted(list(urls_to_download))
 
-#
-# is that if the nominal URL isn't an Azure URL, the file isn't on Azure. All files are on
-# GCP and AWS.
+# A URL might look like this:
 #
-#
-
-
-
-
-
-
-
-
-    all_urls_relative.append(relative_url)
-    relative_url_to_nominal_provider[relative_url] = provider
-    found_base = True
-    break
-assert found_base
-
-assert len(all_urls) == len(all_urls_relative)
+# https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg
+#
+# We'll write that to an output file that looks like this (relative to output_dir):
+#
+# wcs-unzipped/animals/0667/0302.jpg
+#
+# ...so we need to remove the base URL to get the target file.
+base_url = lila_base_urls[preferred_provider]
+assert base_url.endswith('/')
 
+url_to_target_file = {}
 
-
+for url in urls_to_download:
+    assert url.startswith(base_url)
+    target_fn_relative = url.replace(base_url,'')
+    target_fn_abs = os.path.join(output_dir,target_fn_relative)
+    url_to_target_file[url] = target_fn_abs
 
-print('Downloading {} images on {} workers, preferred provider is {}'.format(
-    len(all_urls),n_download_threads,preferred_provider))
 
-
+#%% Download image files
 
-
-
-# url_relative = all_urls_relative[0]
-for url_relative in tqdm(all_urls_relative):
-    result = download_relative_url(url_relative,
-                                   output_base=output_dir,
-                                   provider=preferred_provider,
-                                   verbose=verbose)
-    results.append(result)
-
-else:
+from md_utils.url_utils import parallel_download_urls
 
-
-
-
-
+download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
+                                          verbose=False,
+                                          overwrite=False,
+                                          n_workers=n_download_threads,
+                                          pool_type='thread')
 
 
 #%% Scrap
@@ -240,4 +175,3 @@ if False:
     print('\nDatasets by count:\n')
     for k in dataset_to_count:
         print('{} ({})'.format(k,dataset_to_count[k]))
-
```
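The main functional change in download_lila_subset.py is that the hand-rolled `download_relative_url` helper is gone, replaced by `parallel_download_urls` from `md_utils.url_utils`, driven by a URL-to-target-file dictionary. A condensed sketch of that pattern, under the same assumptions the updated script makes (the example URL and the base-URL-stripping step come from the diff's own comments; `output_dir` is a placeholder):

```python
import os

from data_management.lila.lila_common import lila_base_urls
from md_utils.url_utils import parallel_download_urls

output_dir = os.path.expanduser('~/lila/downloads')  # placeholder path
preferred_provider = 'gcp'  # 'azure', 'gcp', 'aws'

# Absolute URLs to fetch; in the real script this list is built from the
# per-dataset URL lists assembled earlier.
urls_to_download = [
    'https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/animals/0667/0302.jpg'
]

# Map each URL to a local target file, preserving the path relative to the
# provider's base URL (e.g. wcs-unzipped/animals/0667/0302.jpg).
base_url = lila_base_urls[preferred_provider]
assert base_url.endswith('/')
url_to_target_file = {url: os.path.join(output_dir, url.replace(base_url, ''))
                      for url in urls_to_download}

# Threaded parallel download, as in the updated script
download_results = parallel_download_urls(url_to_target_file=url_to_target_file,
                                          verbose=False,
                                          overwrite=False,
                                          n_workers=20,
                                          pool_type='thread')
```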
data_management/lila/generate_lila_per_image_labels.py

```diff
@@ -1,19 +1,19 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+"""
+
+generate_lila_per_image_labels.py
+
+Generate a .csv file with one row per annotation, containing full URLs to every
+camera trap image on LILA, with taxonomically expanded labels.
+
+Typically there will be one row per image, though images with multiple annotations
+will have multiple rows.
+
+Some images may not physically exist, particularly images that are labeled as "human".
+This script does not validate image URLs.
+
+Does not include bounding box annotations.
+
+"""
 
 #%% Constants and imports
 
@@ -23,8 +23,6 @@ import pandas as pd
 import numpy as np
 import dateparser
 import csv
-import urllib
-import urllib.request
 
 from collections import defaultdict
 from tqdm import tqdm
@@ -36,7 +34,6 @@ from data_management.lila.lila_common import read_lila_metadata, \
 from md_utils import write_html_image_list
 from md_utils.path_utils import zip_file
 from md_utils.path_utils import open_file
-from md_utils.url_utils import download_url
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -107,12 +104,15 @@ for i_row,row in taxonomy_df.iterrows():
 
 # Takes several hours
 
-
-
+# The order of these headers needs to match the order in which fields are added later in this cell;
+# don't mess with this order.
+header = ['dataset_name','url_gcp','url_aws','url_azure',
+          'image_id','sequence_id','location_id','frame_num',
+          'original_label','scientific_name','common_name','datetime','annotation_level']
 
 taxonomy_levels_to_include = \
     ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies'
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
      'variety']
 
 header.extend(taxonomy_levels_to_include)
@@ -179,10 +179,17 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
             break
 
         file_name = im['file_name'].replace('\\','/')
-
-
-
+        base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
+        base_url_aws = metadata_table[ds_name]['image_base_url_aws']
+        base_url_azure = metadata_table[ds_name]['image_base_url_azure']
+        assert not base_url_gcp.endswith('/')
+        assert not base_url_aws.endswith('/')
+        assert not base_url_azure.endswith('/')
 
+        url_gcp = base_url_gcp + '/' + file_name
+        url_aws = base_url_aws + '/' + file_name
+        url_azure = base_url_azure + '/' + file_name
+
        for k in im.keys():
            if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                raise ValueError('Unrecognized datetime field')
@@ -297,7 +304,9 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
 
         row = []
         row.append(ds_name)
-        row.append(
+        row.append(url_gcp)
+        row.append(url_aws)
+        row.append(url_azure)
         row.append(image_id)
         row.append(sequence_id)
         row.append(location_id)
@@ -365,7 +374,8 @@ dataset_name_to_locations = defaultdict(set)
 def check_row(row):
 
     assert row['dataset_name'] in metadata_table.keys()
-
+    for url_column in ['url_gcp','url_aws','url_azure']:
+        assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
     assert ' : ' in row['image_id']
     assert 'seq' not in row['location_id'].lower()
     assert row['annotation_level'] in valid_annotation_levels
@@ -446,28 +456,31 @@ for ds_name in metadata_table.keys():
 print('Selected {} total images'.format(len(images_to_download)))
 
 
-#%% Download images
+#%% Download images (prep)
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-
-
+preferred_cloud = 'aws'
+
+url_to_target_file = {}
+
 # i_image = 10; image = images_to_download[i_image]
 for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
 
-    url = image['
+    url = image['url_' + preferred_cloud]
     ext = os.path.splitext(url)[1]
-
-
-
-
-
-
-
-
-    image['relative_file'] = None
+    fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
+    fn_abs = os.path.join(preview_folder,fn_relative)
+    image['relative_file'] = fn_relative
+    image['url'] = url
+    url_to_target_file[url] = fn_abs
+
+
+#%% Download images (execution)
 
-
+from md_utils.url_utils import parallel_download_urls
+download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
+                                          n_workers=20,pool_type='thread')
 
 
 #%% Write preview HTML
@@ -499,4 +512,4 @@ open_file(html_filename)
 
 zipped_output_file = zip_file(output_file,verbose=True)
 
-print('Zipped {} to {}'.format(output_file,zipped_output_file))
+print('Zipped {} to {}'.format(output_file,zipped_output_file))
```
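generate_lila_per_image_labels.py now writes three per-row URL columns (`url_gcp`, `url_aws`, `url_azure`) instead of a single URL column. A small sketch of how a consumer of the generated .csv might select one of them; the filename is a placeholder, and only the column names come from the header list in the diff:

```python
import pandas as pd

per_image_csv = 'lila_image_urls_and_labels.csv'  # placeholder filename
preferred_cloud = 'gcp'  # 'gcp', 'aws', or 'azure'

df = pd.read_csv(per_image_csv)

# Each row now carries one URL per cloud provider; pick the preferred one.
url_column = 'url_' + preferred_cloud
assert url_column in ('url_gcp', 'url_aws', 'url_azure')

some_urls = df[url_column].head(5).tolist()
print('\n'.join(some_urls))
```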
data_management/lila/get_lila_annotation_counts.py

```diff
@@ -1,16 +1,16 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
+"""
+
+get_lila_annotation_counts.py
+
+Generates a .json-formatted dictionary mapping each LILA dataset to all categories
+that exist for that dataset, with counts for the number of occurrences of each category
+(the number of *annotations* for each category, not the number of *images*).
+
+Also loads the taxonomy mapping file, to include scientific names for each category.
+
+get_lila_image_counts.py counts the number of *images* for each category in each dataset.
+
+"""
 
 #%% Constants and imports
 
@@ -20,6 +20,9 @@ import os
 from data_management.lila.lila_common import read_lila_metadata,\
     read_metadata_file_for_dataset, read_lila_taxonomy_mapping
 
+# cloud provider to use for downloading images; options are 'gcp', 'azure', or 'aws'
+preferred_cloud = 'gcp'
+
 # array to fill for output
 category_list = []
 
@@ -96,9 +99,9 @@ for ds_name in metadata_table.keys():
     print('Warning: taxonomy mapping not available for {}'.format(ds_name))
 
     print('Finding categories in {}'.format(ds_name))
-
+
     json_filename = metadata_table[ds_name]['json_filename']
-    base_url = metadata_table[ds_name]['
+    base_url = metadata_table[ds_name]['image_base_url_' + preferred_cloud]
     assert not base_url.endswith('/')
 
     # Open the metadata file
```