megadetector-5.0.28-py3-none-any.whl → megadetector-10.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ from megadetector.data_management.lila.lila_common import \
 from megadetector.utils import write_html_image_list
 from megadetector.utils.path_utils import zip_file
 from megadetector.utils.path_utils import open_file
+from megadetector.utils.url_utils import parallel_download_urls
 
 # We'll write images, metadata downloads, and temporary files here
 lila_local_base = os.path.expanduser('~/lila')
@@ -47,7 +48,7 @@ os.makedirs(metadata_dir,exist_ok=True)
 
 output_file = os.path.join(lila_local_base,'lila_image_urls_and_labels.csv')
 
-# Some datasets don't have "sequence_level_annotation" fields populated, but we know their
+# Some datasets don't have "sequence_level_annotation" fields populated, but we know their
 # annotation level
 ds_name_to_annotation_level = {}
 ds_name_to_annotation_level['Caltech Camera Traps'] = 'image'
@@ -66,6 +67,18 @@ if debug_max_images_per_dataset > 0:
     print('Running in debug mode')
     output_file = output_file.replace('.csv','_debug.csv')
 
+taxonomy_levels_to_include = \
+    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
+     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','subgenus',
+     'species','subspecies','variety']
+
+def _clearnan(v):
+    if isinstance(v,float):
+        assert np.isnan(v)
+        v = ''
+    assert isinstance(v,str)
+    return v
+
 
 #%% Download and parse the metadata file
 
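The _clearnan helper added in the hunk above exists because pandas reads empty cells in the taxonomy table as float NaN rather than as empty strings. A minimal standalone usage sketch (the sample values are invented for illustration, not part of the package):

    import numpy as np

    # Behavior of the _clearnan helper added above: empty cells come back from
    # pandas as float NaN; normalize them to empty strings, pass strings through.
    def _clearnan(v):
        if isinstance(v, float):
            assert np.isnan(v)
            v = ''
        assert isinstance(v, str)
        return v

    assert _clearnan(float('nan')) == ''
    assert _clearnan('puma concolor') == 'puma concolor'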
@@ -79,14 +92,14 @@ if False:
 
 #%% Download and extract metadata for each dataset
 
-for ds_name in metadata_table.keys():
+for ds_name in metadata_table.keys():
     metadata_table[ds_name]['metadata_filename'] = read_metadata_file_for_dataset(ds_name=ds_name,
                                                        metadata_dir=metadata_dir,
                                                        metadata_table=metadata_table)
-
+
 #%% Load taxonomy data
 
-taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
+taxonomy_df = read_lila_taxonomy_mapping(metadata_dir, force_download=True)
 
 
 #%% Build a dictionary that maps each [dataset,query] pair to the full taxonomic label set
@@ -95,12 +108,12 @@ ds_label_to_taxonomy = {}
 
 # i_row = 0; row = taxonomy_df.iloc[i_row]
 for i_row,row in taxonomy_df.iterrows():
-
+
     ds_label = row['dataset_name'] + ':' + row['query']
     assert ds_label.strip() == ds_label
     assert ds_label not in ds_label_to_taxonomy
     ds_label_to_taxonomy[ds_label] = row.to_dict()
-
+
 
 #%% Process annotations for each dataset
 
@@ -112,74 +125,62 @@ header = ['dataset_name','url_gcp','url_aws','url_azure',
           'image_id','sequence_id','location_id','frame_num',
           'original_label','scientific_name','common_name','datetime','annotation_level']
 
-taxonomy_levels_to_include = \
-    ['kingdom','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order',
-     'suborder','infraorder','superfamily','family','subfamily','tribe','genus','species','subspecies',
-     'variety']
-
 header.extend(taxonomy_levels_to_include)
 
 missing_annotations = set()
 
-def clearnan(v):
-    if isinstance(v,float):
-        assert np.isnan(v)
-        v = ''
-    assert isinstance(v,str)
-    return v
-
 with open(output_file,'w',encoding='utf-8',newline='') as f:
-
+
     csv_writer = csv.writer(f)
     csv_writer.writerow(header)
-
+
     # ds_name = list(metadata_table.keys())[0]
     for ds_name in metadata_table.keys():
-
+
         if 'bbox' in ds_name:
             print('Skipping bbox dataset {}'.format(ds_name))
            continue
-
+
         print('Processing dataset {}'.format(ds_name))
-
+
         json_filename = metadata_table[ds_name]['metadata_filename']
         with open(json_filename, 'r') as f:
             data = json.load(f)
-
+
         categories = data['categories']
         category_ids = [c['id'] for c in categories]
         for c in categories:
            category_id_to_name = {c['id']:c['name'] for c in categories}
-
+
         annotations = data['annotations']
         images = data['images']
-
+
         image_id_to_annotations = defaultdict(list)
-
+
         # Go through annotations, marking each image with the categories that are present
         #
         # ann = annotations[0]
-        for ann in annotations:
+        for ann in annotations:
            image_id_to_annotations[ann['image_id']].append(ann)
-
+
         unannotated_images = []
-
+
         found_date = False
         found_location = False
         found_annotation_level = False
-
+
         if ds_name in ds_name_to_annotation_level:
            expected_annotation_level = ds_name_to_annotation_level[ds_name]
         else:
            expected_annotation_level = None
-
+
         # im = images[10]
         for i_image,im in tqdm(enumerate(images),total=len(images)):
-
+
             if (debug_max_images_per_dataset is not None) and (debug_max_images_per_dataset > 0) \
                 and (i_image >= debug_max_images_per_dataset):
                 break
-
+
             file_name = im['file_name'].replace('\\','/')
             base_url_gcp = metadata_table[ds_name]['image_base_url_gcp']
             base_url_aws = metadata_table[ds_name]['image_base_url_aws']
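The loop above builds image_id_to_annotations so each image's categories can be looked up without rescanning the annotation list. A small, self-contained sketch of that indexing pattern (the demo records are invented for illustration):

    from collections import defaultdict

    # Group COCO-style annotations by image_id, as in the hunk above.
    annotations_demo = [{'image_id': 'img1', 'category_id': 1},
                        {'image_id': 'img1', 'category_id': 2},
                        {'image_id': 'img2', 'category_id': 1}]

    image_id_to_annotations = defaultdict(list)
    for ann in annotations_demo:
        image_id_to_annotations[ann['image_id']].append(ann)

    assert len(image_id_to_annotations['img1']) == 2
    assert len(image_id_to_annotations['img2']) == 1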
@@ -187,21 +188,21 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
             assert not base_url_gcp.endswith('/')
             assert not base_url_aws.endswith('/')
             assert not base_url_azure.endswith('/')
-
+
             url_gcp = base_url_gcp + '/' + file_name
             url_aws = base_url_aws + '/' + file_name
             url_azure = base_url_azure + '/' + file_name
-
+
             for k in im.keys():
                 if ('date' in k or 'time' in k) and (k not in ['datetime','date_captured']):
                     raise ValueError('Unrecognized datetime field')
-
+
             # This field name was only used for Caltech Camera Traps
             if 'date_captured' in im:
                 assert ds_name == 'Caltech Camera Traps'
                 im['datetime'] = im['date_captured']
-
-            def has_valid_datetime(im):
+
+            def _has_valid_datetime(im):
                 if 'datetime' not in im:
                     return False
                 v = im['datetime']
@@ -212,29 +213,29 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                 else:
                     assert isinstance(v,float) and np.isnan(v)
                     return False
-
-            dt_string = ''
-            if (has_valid_datetime(im)):
-
+
+            dt_string = ''
+            if (_has_valid_datetime(im)):
+
                 dt = dateparser.parse(im['datetime'])
-
+
                 if dt is None or dt.year < 1990 or dt.year > 2025:
-
+
                     # raise ValueError('Suspicious date parsing result')
-
-                    # Special case we don't want to print a warning about... this is
+
+                    # Special case we don't want to print a warning about... this is
                     # in invalid date that very likely originates on the camera, not at
                     # some intermediate processing step.
                     #
                     # print('Suspicious date for image {}: {} ({})'.format(
                     #     im['id'], im['datetime'], ds_name))
-                    pass
-
+                    pass
+
                 else:
-
+
                     found_date = True
                     dt_string = dt.strftime("%m-%d-%Y %H:%M:%S")
-
+
             # Location, sequence, and image IDs are only guaranteed to be unique within
             # a dataset, so for the output .csv file, include both
             if 'location' in im:
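The date handling in this hunk parses each image's datetime string with dateparser, treats years outside 1990-2025 as suspicious, and re-serializes valid dates in the format written to the .csv. A short sketch of that flow, assuming the dateparser package the script already uses (the timestamp string is invented):

    import dateparser

    dt = dateparser.parse('2013-01-15 08:00:00')
    if dt is None or dt.year < 1990 or dt.year > 2025:
        dt_string = ''  # unparseable or suspicious date
    else:
        dt_string = dt.strftime('%m-%d-%Y %H:%M:%S')

    assert dt_string == '01-15-2013 08:00:00'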
@@ -242,25 +243,25 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                 location_id = ds_name + ' : ' + str(im['location'])
             else:
                 location_id = ds_name
-
+
             image_id = ds_name + ' : ' + str(im['id'])
-
+
             if 'seq_id' in im:
                 sequence_id = ds_name + ' : ' + str(im['seq_id'])
             else:
                 sequence_id = ds_name + ' : ' + 'unknown'
-
+
             if 'frame_num' in im:
                 frame_num = im['frame_num']
             else:
                 frame_num = -1
-
+
             annotations_this_image = image_id_to_annotations[im['id']]
-
+
             categories_this_image = set()
-
+
             annotation_level = 'unknown'
-
+
             for ann in annotations_this_image:
                 assert ann['image_id'] == im['id']
                 categories_this_image.add(category_id_to_name[ann['category_id']])
@@ -275,35 +276,35 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                     'Unexpected annotation level'
             elif expected_annotation_level is not None:
                 annotation_level = expected_annotation_level
-
+
             if len(categories_this_image) == 0:
                 unannotated_images.append(im)
                 continue
-
+
             # category_name = list(categories_this_image)[0]
             for category_name in categories_this_image:
-
+
                 ds_label = ds_name + ':' + category_name.lower()
-
+
                 if ds_label not in ds_label_to_taxonomy:
-
+
                     assert ds_label in known_unmapped_labels
-
+
                     # Only print a warning the first time we see an unmapped label
                     if ds_label not in missing_annotations:
                         print('Warning: {} not in taxonomy file'.format(ds_label))
                         missing_annotations.add(ds_label)
                     continue
-
+
                 taxonomy_labels = ds_label_to_taxonomy[ds_label]
-
+
                 """
-                header =
+                header =
                 ['dataset_name','url','image_id','sequence_id','location_id',
                 'frame_num','original_label','scientific_name','common_name',
                 'datetime','annotation_level']
                 """
-
+
                 row = []
                 row.append(ds_name)
                 row.append(url_gcp)
@@ -314,37 +315,37 @@ with open(output_file,'w',encoding='utf-8',newline='') as f:
                 row.append(location_id)
                 row.append(frame_num)
                 row.append(taxonomy_labels['query'])
-                row.append(clearnan(taxonomy_labels['scientific_name']))
-                row.append(clearnan(taxonomy_labels['common_name']))
+                row.append(_clearnan(taxonomy_labels['scientific_name']))
+                row.append(_clearnan(taxonomy_labels['common_name']))
                 row.append(dt_string)
                 row.append(annotation_level)
-
+
                 for s in taxonomy_levels_to_include:
-                    row.append(clearnan(taxonomy_labels[s]))
-
+                    row.append(_clearnan(taxonomy_labels[s]))
+
                 assert len(row) == len(header)
-
+
                 csv_writer.writerow(row)
-
+
             # ...for each category that was applied at least once to this image
-
+
         # ...for each image in this dataset
-
+
         if not found_date:
             pass
             # print('Warning: no date information available for this dataset')
-
+
         if not found_location:
             pass
             # print('Warning: no location information available for this dataset')
-
+
         if not found_annotation_level and (ds_name not in ds_name_to_annotation_level):
             print('Warning: no annotation level information available for this dataset')
-
+
         if len(unannotated_images) > 0:
             print('Warning: {} of {} images are un-annotated\n'.\
                   format(len(unannotated_images),len(images)))
-
+
     # ...for each dataset
 
 # ...with open()
@@ -360,11 +361,14 @@ print('Read {} rows from {}'.format(len(df),output_file))
 
 #%% Do some post-hoc integrity checking
 
-# Takes ~10 minutes without
+# Takes ~5 minutes with apply(), or ~10 minutes without apply()
+#
+# Using apply() is faster, but more annoying to debug.
+use_pandas_apply_for_integrity_checking = True
 
 tqdm.pandas()
 
-def isint(v):
+def _isint(v):
     return isinstance(v,int) or isinstance(v,np.int64)
 
 valid_annotation_levels = set(['sequence','image','unknown'])
@@ -373,8 +377,8 @@ valid_annotation_levels = set(['sequence','image','unknown'])
 # in the next cell to look for datasets that only have a single location
 dataset_name_to_locations = defaultdict(set)
 
-def check_row(row):
-
+def _check_row(row):
+
     assert row['dataset_name'] in metadata_table.keys()
     for url_column in ['url_gcp','url_aws','url_azure']:
         assert row[url_column].startswith('https://') or row[url_column].startswith('http://')
@@ -387,21 +391,20 @@ def check_row(row):
         assert np.isnan(row['frame_num'])
     else:
         # -1 is sometimes used for sequences of unknown length
-        assert isint(row['frame_num']) and row['frame_num'] >= -1
+        assert _isint(row['frame_num']) and row['frame_num'] >= -1
 
     ds_name = row['dataset_name']
     dataset_name_to_locations[ds_name].add(row['location_id'])
-
-
-
-
-    df.progress_apply(check_row, axis=1)
+
+if use_pandas_apply_for_integrity_checking:
+
+    df.progress_apply(_check_row, axis=1)
 
 else:
-
+
     # i_row = 0; row = df.iloc[i_row]
     for i_row,row in tqdm(df.iterrows(),total=len(df)):
-        check_row(row)
+        _check_row(row)
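The change in this hunk puts the row-level integrity check behind a use_pandas_apply_for_integrity_checking flag: tqdm's progress_apply() is faster, while the explicit iterrows() loop is easier to step through in a debugger. A self-contained sketch of the two paths (the demo DataFrame and helper are invented; only the tqdm.pandas()/progress_apply pattern mirrors the script):

    import numpy as np
    import pandas as pd
    from tqdm import tqdm

    df_demo = pd.DataFrame({'frame_num': [0, 1, -1],
                            'location_id': ['loc-a', 'loc-b', 'loc-c']})

    def _check_row_demo(row):
        # Same shape of check as _check_row above, on made-up columns
        assert isinstance(row['frame_num'], (int, np.int64)) and row['frame_num'] >= -1

    tqdm.pandas()  # registers DataFrame.progress_apply

    use_apply = True
    if use_apply:
        df_demo.progress_apply(_check_row_demo, axis=1)
    else:
        for _, row in tqdm(df_demo.iterrows(), total=len(df_demo)):
            _check_row_demo(row)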
@@ -428,31 +431,32 @@ images_to_download = []
 
 # ds_name = list(metadata_table.keys())[2]
 for ds_name in metadata_table.keys():
-
+
     if 'bbox' in ds_name:
         continue
-
+
     # Find all rows for this dataset
     ds_rows = df.loc[df['dataset_name'] == ds_name]
-
+
     print('{} rows available for {}'.format(len(ds_rows),ds_name))
     assert len(ds_rows) > 0
-
+
     empty_rows = ds_rows[ds_rows['scientific_name'].isnull()]
     non_empty_rows = ds_rows[~ds_rows['scientific_name'].isnull()]
-
+
     if len(empty_rows) == 0:
         print('No empty images available for {}'.format(ds_name))
     elif len(empty_rows) > n_empty_images_per_dataset:
         empty_rows = empty_rows.sample(n=n_empty_images_per_dataset)
     images_to_download.extend(empty_rows.to_dict('records'))
 
+    # All LILA datasets have non-empty images
     if len(non_empty_rows) == 0:
-
+        raise ValueError('No non-empty images available for {}'.format(ds_name))
     elif len(non_empty_rows) > n_non_empty_images_per_dataset:
         non_empty_rows = non_empty_rows.sample(n=n_non_empty_images_per_dataset)
     images_to_download.extend(non_empty_rows.to_dict('records'))
-
+
 # ...for each dataset
 
 print('Selected {} total images'.format(len(images_to_download)))
@@ -462,13 +466,13 @@ print('Selected {} total images'.format(len(images_to_download)))
 
 # Expect a few errors for images with human or vehicle labels (or things like "ignore" that *could* be humans)
 
-preferred_cloud = '
+preferred_cloud = 'gcp'
 
 url_to_target_file = {}
 
 # i_image = 10; image = images_to_download[i_image]
 for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_download)):
-
+
     url = image['url_' + preferred_cloud]
     ext = os.path.splitext(url)[1]
     fn_relative = 'image_{}'.format(str(i_image).zfill(4)) + ext
@@ -476,14 +480,26 @@ for i_image,image in tqdm(enumerate(images_to_download),total=len(images_to_down
     image['relative_file'] = fn_relative
     image['url'] = url
     url_to_target_file[url] = fn_abs
-
+
 
 #%% Download images (execution)
 
-from megadetector.utils.url_utils import parallel_download_urls
 download_results = parallel_download_urls(url_to_target_file,verbose=False,overwrite=True,
                                           n_workers=20,pool_type='thread')
 
+# 10-20 errors is normal; they should all be images that are labeled as "human"
+errors = []
+
+for r in download_results:
+    if r['status'] != 'success':
+        errors.append(r)
+
+assert len(download_results) == len(url_to_target_file)
+print('Errors on {} of {} downloads:\n'.format(len(errors),len(download_results)))
+
+for err in errors:
+    print(err['url'])
+
 
 
 #%% Write preview HTML
@@ -493,10 +509,10 @@ html_images = []
 
 # im = images_to_download[0]
 for im in images_to_download:
-
+
     if im['relative_file'] is None:
         continue
-
+
     output_im = {}
     output_im['filename'] = im['relative_file']
     output_im['linkTarget'] = im['url']
@@ -504,7 +520,7 @@ for im in images_to_download:
     output_im['imageStyle'] = 'width:600px;'
     output_im['textStyle'] = 'font-weight:normal;font-size:100%;'
     html_images.append(output_im)
-
+
 write_html_image_list.write_html_image_list(html_filename,html_images)
 
 open_file(html_filename)
@@ -515,3 +531,245 @@ open_file(html_filename)
 zipped_output_file = zip_file(output_file,verbose=True,overwrite=True)
 
 print('Zipped {} to {}'.format(output_file,zipped_output_file))
+
+
+#%% Convert to .json
+
+"""
+The .csv file "output_file" (already loaded into the variable "df" at this point) has the following columns:
+
+dataset_name,url_gcp,url_aws,url_azure,image_id,sequence_id,location_id,frame_num,original_label,scientific_name,common_name,datetime,annotation_level,kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+Each row in the .csv represents an image. The URL columns represent the location of that
+image on three different clouds; for a given image, the value of those columns differs only
+in the prefix. The columns starting with "kingdom" represent a taxonomic wildlife identifier. Not
+all rows have values in all of these columns; some rows represent non-wildlife images where all of these
+columns are blank.
+
+This cell converts this to a .json dictionary, with the following top-level keys:
+
+## datasets (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "dataset_name" column should become an element in this dict with a unique ID.
+
+## sequences (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "sequence_id" column should become an element in this dict with a unique ID.
+
+## locations (dict)
+
+A dict mapping integer IDs to strings.
+
+Each unique value in the "location_id" column should become an element in this dict with a unique ID.
+
+## base_urls (dict)
+
+This key should point to the following dict:
+
+{
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+All values in the url_gcp, url_aws, and url_azure columns start with these values, respectively.
+
+## taxa (dict)
+
+A dict mapping integer IDs to dicts, where each dict has the fields:
+
+kingdom,phylum,subphylum,superclass,class,subclass,infraclass,superorder,order,suborder,infraorder,superfamily,family,subfamily,tribe,genus,subgenus,species,subspecies,variety
+
+The value of each of these fields in each row is either a string or None.
+
+## images (list)
+
+A list of images, where each image is a dict with the following fields:
+
+### dataset (int)
+
+The integer ID corresponding to the dataset_name column for this image
+
+### path (str)
+
+The suffix for this image's URL, which should be the same across the three URL columns.
+
+### seq (int)
+
+The integer ID corresponding to the sequence_id column for this image
+
+### loc (int)
+
+The integer ID corresponding to the location_id column for this image
+
+### frame_num
+
+The value of the frame_num column for this image, unless the original value was -1,
+in which case this is omitted.
+
+### original_label
+
+The value of the original_label column for this image
+
+### common_name
+
+The value of the common_name column for this image, if not empty
+
+### datetime
+
+The value of the datetime column for this image
+
+### ann_level
+
+The value of the annotation_level column for this image
+
+### taxon
+
+The integer ID corresponding to the taxonomic identifier columns for this image
+
+--
+
+The original .csv file is large (~15GB); this may impact the implementation of the .json conversion. Speed of
+conversion is not a priority.
+
+"""
+
+print('Converting to JSON...')
+
+output_json_file = output_file.replace('.csv', '.json')
+
+json_data = {}
+
+# Create mappings for datasets, sequences, and locations
+dataset_to_id = {}
+sequence_to_id = {}
+location_to_id = {}
+taxa_to_id = {}
+
+next_dataset_id = 0
+next_sequence_id = 0
+next_location_id = 0
+next_taxa_id = 0
+
+json_data['datasets'] = {}
+json_data['sequences'] = {}
+json_data['locations'] = {}
+json_data['taxa'] = {}
+
+json_data['base_urls'] = {
+    "gcp": "https://storage.googleapis.com/public-datasets-lila/",
+    "aws": "http://us-west-2.opendata.source.coop.s3.amazonaws.com/agentmorris/lila-wildlife/",
+    "azure": "https://lilawildlife.blob.core.windows.net/lila-wildlife/",
+}
+
+json_data['images'] = []
+
+debug_max_json_conversion_rows = None
+
+print('Counting rows in .csv file...')
+
+# Get total number of lines for progress bar (optional, but helpful for large files)
+def _count_lines(filename):
+    with open(filename, 'r', encoding='utf-8') as f:
+        return sum(1 for line in f) - 1
+
+total_rows = _count_lines(output_file)
+print('Total rows to process: {}'.format(total_rows))
+
+# Read CSV file line by line
+with open(output_file, 'r', encoding='utf-8') as csvfile:
+
+    reader = csv.DictReader(csvfile)
+
+    # Process each row
+    for i_row, row in enumerate(tqdm(reader, total=total_rows, desc="Processing rows")):
+
+        if (debug_max_json_conversion_rows is not None) and (i_row >= debug_max_json_conversion_rows):
+            break
+
+        # Datasets
+        dataset_name = row['dataset_name']
+        if dataset_name not in dataset_to_id:
+            dataset_to_id[dataset_name] = next_dataset_id
+            json_data['datasets'][str(next_dataset_id)] = dataset_name
+            next_dataset_id += 1
+        dataset_id = dataset_to_id[dataset_name]
+
+        # Sequences
+        sequence_id_str = row['sequence_id']
+        assert sequence_id_str.startswith(dataset_name + ' : ')
+        if sequence_id_str not in sequence_to_id:
+            sequence_to_id[sequence_id_str] = next_sequence_id
+            json_data['sequences'][str(next_sequence_id)] = sequence_id_str
+            next_sequence_id += 1
+        sequence_id = sequence_to_id[sequence_id_str]
+
+        # Locations
+        location_id_str = row['location_id']
+        assert location_id_str.startswith(dataset_name) # + ' : ')
+        if location_id_str not in location_to_id:
+            location_to_id[location_id_str] = next_location_id
+            json_data['locations'][str(next_location_id)] = location_id_str
+            next_location_id += 1
+        location_id = location_to_id[location_id_str]
+
+        # Taxa
+        taxa_data = {level: _clearnan(row[level]) for level in taxonomy_levels_to_include}
+        taxa_tuple = tuple(taxa_data.items()) # use tuple for hashable key
+        if taxa_tuple not in taxa_to_id:
+            taxa_to_id[taxa_tuple] = next_taxa_id
+            json_data['taxa'][str(next_taxa_id)] = taxa_data
+            next_taxa_id += 1
+        taxa_id = taxa_to_id[taxa_tuple]
+
+        # Image path
+        url_gcp = row['url_gcp']
+        assert url_gcp.startswith(json_data['base_urls']['gcp'])
+        path = url_gcp.replace(json_data['base_urls']['gcp'], '')
+
+        common_name = _clearnan(row['common_name'])
+
+        frame_num = int(row['frame_num'])
+
+        # Image data
+        image_entry = {
+            'dataset': dataset_id,
+            'path': path,
+            'seq': sequence_id,
+            'loc': location_id,
+            'ann_level': row['annotation_level'],
+            'original_label': row['original_label'],
+            'datetime': row['datetime'],
+            'taxon': taxa_id
+        }
+
+        if frame_num >= 0:
+            image_entry['frame_num'] = frame_num
+
+        if len(common_name) > 0:
+            image_entry['common_name'] = common_name
+
+        json_data['images'].append(image_entry)
+
+    # ...for each line
+
+# ...with open(...)
+
+# Save the JSON data
+print('Saving JSON file...')
+with open(output_json_file, 'w', encoding='utf-8') as f:
+    json.dump(json_data, f, indent=1)
+
+print(f'Converted to JSON and saved to {output_json_file}')
+print(f'JSON file size: {os.path.getsize(output_json_file)/(1024*1024*1024):.2f} GB')
+
+# Print summary statistics
+print(f'Total datasets: {len(json_data["datasets"])}')
+print(f'Total sequences: {len(json_data["sequences"])}')
+print(f'Total locations: {len(json_data["locations"])}')
+print(f'Total taxa: {len(json_data["taxa"])}')
+print(f'Total images: {len(json_data["images"])}')