megadetector 5.0.29__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/prepare_classification_script_mc.py +3 -3
- megadetector/data_management/annotations/annotation_constants.py +0 -1
- megadetector/data_management/camtrap_dp_to_coco.py +34 -1
- megadetector/data_management/cct_json_utils.py +2 -2
- megadetector/data_management/coco_to_yolo.py +22 -5
- megadetector/data_management/databases/add_width_and_height_to_db.py +85 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +2 -2
- megadetector/data_management/databases/integrity_check_json_db.py +29 -15
- megadetector/data_management/generate_crops_from_cct.py +50 -1
- megadetector/data_management/labelme_to_coco.py +4 -2
- megadetector/data_management/labelme_to_yolo.py +82 -2
- megadetector/data_management/lila/generate_lila_per_image_labels.py +276 -18
- megadetector/data_management/lila/get_lila_annotation_counts.py +5 -3
- megadetector/data_management/lila/lila_common.py +3 -0
- megadetector/data_management/lila/test_lila_metadata_urls.py +15 -5
- megadetector/data_management/mewc_to_md.py +5 -0
- megadetector/data_management/ocr_tools.py +4 -3
- megadetector/data_management/read_exif.py +20 -5
- megadetector/data_management/remap_coco_categories.py +66 -4
- megadetector/data_management/remove_exif.py +50 -1
- megadetector/data_management/rename_images.py +3 -3
- megadetector/data_management/resize_coco_dataset.py +563 -95
- megadetector/data_management/yolo_output_to_md_output.py +131 -2
- megadetector/data_management/yolo_to_coco.py +140 -5
- megadetector/detection/change_detection.py +4 -3
- megadetector/detection/pytorch_detector.py +60 -22
- megadetector/detection/run_detector.py +225 -25
- megadetector/detection/run_detector_batch.py +42 -16
- megadetector/detection/run_inference_with_yolov5_val.py +12 -2
- megadetector/detection/run_tiled_inference.py +1 -0
- megadetector/detection/video_utils.py +53 -24
- megadetector/postprocessing/add_max_conf.py +4 -0
- megadetector/postprocessing/categorize_detections_by_size.py +1 -1
- megadetector/postprocessing/classification_postprocessing.py +55 -20
- megadetector/postprocessing/combine_batch_outputs.py +3 -2
- megadetector/postprocessing/compare_batch_results.py +64 -10
- megadetector/postprocessing/convert_output_format.py +12 -8
- megadetector/postprocessing/create_crop_folder.py +137 -10
- megadetector/postprocessing/load_api_results.py +26 -8
- megadetector/postprocessing/md_to_coco.py +4 -4
- megadetector/postprocessing/md_to_labelme.py +18 -7
- megadetector/postprocessing/merge_detections.py +5 -0
- megadetector/postprocessing/postprocess_batch_results.py +6 -3
- megadetector/postprocessing/remap_detection_categories.py +55 -2
- megadetector/postprocessing/render_detection_confusion_matrix.py +9 -6
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +3 -4
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +40 -19
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +1 -1
- megadetector/taxonomy_mapping/species_lookup.py +123 -41
- megadetector/utils/ct_utils.py +133 -113
- megadetector/utils/md_tests.py +93 -13
- megadetector/utils/path_utils.py +137 -107
- megadetector/utils/split_locations_into_train_val.py +2 -2
- megadetector/utils/string_utils.py +7 -7
- megadetector/utils/url_utils.py +81 -58
- megadetector/utils/wi_utils.py +46 -17
- megadetector/visualization/plot_utils.py +13 -9
- megadetector/visualization/render_images_with_thumbnails.py +2 -1
- megadetector/visualization/visualization_utils.py +94 -46
- megadetector/visualization/visualize_db.py +36 -9
- megadetector/visualization/visualize_detector_output.py +4 -4
- {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/METADATA +135 -135
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -438
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -109
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -513
- megadetector-5.0.29.dist-info/RECORD +0 -163
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/WHEEL +0 -0
megadetector/taxonomy_mapping/species_lookup.py
@@ -32,27 +32,18 @@ taxonomy_download_dir = os.path.expanduser('~/taxonomy')
 
 taxonomy_urls = {
     'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
-    'iNaturalist': 'https://www.inaturalist.org/
+    'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
 }
 
 files_to_unzip = {
-    # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
-    # 12.2023, this is no longer the case.
-    # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
     'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
-    'iNaturalist': ['taxa.csv']
+    'iNaturalist': ['taxa.csv','VernacularNames-english.csv']
 }
 
-# As of
+# As of 2025.06.24:
 #
-# GBIF:
-# iNat:
-
-# As of 2023.12.29:
-#
-# GBIF: ~948MB zipped, ~2.2GB taxonomy
-# iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
-
+# GBIF: 950MB zipped, 2.3GB of relevant content unzipped
+# iNat: 71MB zipped, 415MB of relevant content unzipped
 
 os.makedirs(taxonomy_download_dir, exist_ok=True)
 for taxonomy_name in taxonomy_urls:
@@ -83,7 +74,7 @@ gbif_scientific_to_taxon_id = None # : Dict[str, np.int64]
 
 # Initialization function
 
-def initialize_taxonomy_lookup(force_init=False) -> None:
+def initialize_taxonomy_lookup(force_init=False):
     """
     Initialize this module by doing the following:
 
@@ -92,8 +83,14 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     * Builds a bunch of dictionaries and tables to facilitate lookup
     * Serializes those tables via pickle
     * Skips all of the above if the serialized pickle file already exists
+
+    Args:
+        force_init (bool, optional): force re-download and parsing of the source .zip files,
+            even if the cached .p file already exists
     """
 
+    #%%
+
     global inat_taxonomy,\
         gbif_taxonomy,\
         gbif_common_mapping,\
@@ -109,7 +106,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_scientific_to_taxon_id
 
 
-
+    #%% Load serialized taxonomy info if we've already saved it
 
     if (not force_init) and (inat_taxonomy is not None):
         print('Skipping taxonomy re-init')
@@ -139,9 +136,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         return
 
 
-
+    #%% Download and unzip taxonomy files
 
-    # Download and unzip taxonomy files
     # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
     for taxonomy_name, zip_url in taxonomy_urls.items():
 
@@ -189,21 +185,44 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
         # ...for each file that we need from this zipfile
 
-        # Remove the zipfile
-        # os.remove(zipfile_path)
-
     # ...for each taxonomy
 
 
-
-    # name file
+    #%% Create dataframes from each of the taxonomy/vernacular files
 
     # Load iNat taxonomy
     inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
     print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
     inat_taxonomy = pd.read_csv(inat_taxonomy_file)
     inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
-
+
+    # Delete columns we won't use. The "taxonID" column is a non-int version of "ID"
+    inat_taxonomy = inat_taxonomy.drop(['identifier', 'taxonID', 'modified', 'references'], axis=1)
+
+    # The "parentNameUsageID" column in inat_taxonomy is a URL, like:
+    #
+    # https://www.inaturalist.org/taxa/71262
+    #
+    # Convert this column to be integer-valued, using only the last token of the URL
+    inat_taxonomy['parentNameUsageID'] = \
+        inat_taxonomy['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)
+
+    # Rename the "id" column to "taxonID"
+    inat_taxonomy = inat_taxonomy.rename(columns={'id': 'taxonID'})
+
+    assert 'id' not in inat_taxonomy.columns
+    assert 'taxonID' in inat_taxonomy.columns
+
+    # Load iNat common name mapping
+    inat_common_mapping_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'VernacularNames-english.csv')
+    inat_common_mapping = pd.read_csv(inat_common_mapping_file)
+    inat_common_mapping['vernacularName'] = inat_common_mapping['vernacularName'].fillna('').str.strip()
+
+    inat_common_mapping = inat_common_mapping.drop(['language','locality','countryCode',
+                                                    'source','lexicon','contributor','created'], axis=1)
+    assert 'id' in inat_common_mapping.columns
+    assert 'taxonID' not in inat_common_mapping.columns
+    assert 'vernacularName' in inat_common_mapping.columns
 
     # Load GBIF taxonomy
     gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
@@ -211,12 +230,20 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t', encoding='utf-8',on_bad_lines='warn')
     gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
     gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
+    gbif_taxonomy['parentNameUsageID'] = gbif_taxonomy['parentNameUsageID'].fillna(-1).astype(int)
 
     # Remove questionable rows from the GBIF taxonomy
     gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
     gbif_taxonomy = gbif_taxonomy.reset_index()
 
-
+    gbif_taxonomy = gbif_taxonomy.drop(['datasetID','acceptedNameUsageID','originalNameUsageID',
+                                        'scientificNameAuthorship','nameAccordingTo','namePublishedIn',
+                                        'taxonomicStatus','nomenclaturalStatus','taxonRemarks'], axis=1)
+
+    assert 'taxonID' in gbif_taxonomy.columns
+    assert 'scientificName' in gbif_taxonomy.columns
+
+    # Load GBIF common name mapping
     gbif_common_mapping = pd.read_csv(os.path.join(
         taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
     gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
@@ -225,6 +252,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
     gbif_common_mapping = gbif_common_mapping.reset_index()
 
+    gbif_common_mapping = gbif_common_mapping.drop(['language','country','countryCode','sex',
+                                                    'lifeStage','source'],axis=1)
+
+    assert 'taxonID' in gbif_common_mapping.columns
+    assert 'vernacularName' in gbif_common_mapping.columns
+
 
     # Convert everything to lowercase
 
@@ -235,23 +268,28 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
     gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
     gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
+    inat_common_mapping = convert_df_to_lowercase(inat_common_mapping)
 
 
-
+    ##%% For each taxonomy table, create a mapping from taxon IDs to rows
 
     inat_taxon_id_to_row = {}
     gbif_taxon_id_to_row = {}
 
     print('Building iNat taxonID --> row table')
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
-        inat_taxon_id_to_row[row['taxonID']] = i_row
+        taxon_id = row['taxonID']
+        assert isinstance(taxon_id, int)
+        inat_taxon_id_to_row[taxon_id] = i_row
 
     print('Building GBIF taxonID --> row table')
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
-        gbif_taxon_id_to_row[row['taxonID']] = i_row
+        taxon_id = row['taxonID']
+        assert isinstance(taxon_id, int)
+        gbif_taxon_id_to_row[taxon_id] = i_row
 
 
-
+    ##%% Create name mapping dictionaries
 
     inat_taxon_id_to_vernacular = defaultdict(set)
     inat_vernacular_to_taxon_id = defaultdict(set)
@@ -268,31 +306,60 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
     print('Building lookup dictionaries for iNat taxonomy')
 
+    # iNat Scientific name mapping
+
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
 
         taxon_id = row['taxonID']
-        vernacular_name = row['vernacularName']
-        scientific_name = row['scientificName']
-
-        if len(vernacular_name) > 0:
-            inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
-            inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+        assert isinstance(taxon_id,int)
 
+        scientific_name = row['scientificName']
         assert len(scientific_name) > 0
+
         inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
         inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+    # iNat common name mapping
+
+    inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
+    for i_row, row in tqdm(inat_common_mapping.iterrows(), total=len(inat_common_mapping)):
 
-
+        taxon_id = row['id']
+        assert isinstance(taxon_id,int)
+
+        # This should never happen; we will assert() this at the end of the loop
+        if taxon_id not in inat_taxon_id_to_scientific:
+            inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
+            continue
+
+        vernacular_name = row['vernacularName']
+
+        assert len(vernacular_name) > 0
+        inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+        inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+    assert len(inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file) == 0
+
+
+    ##%% Build GBIF dictionaries
 
     print('Building lookup dictionaries for GBIF taxonomy')
 
+    # GBIF scientific name mapping
+
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
 
         taxon_id = row['taxonID']
+        assert isinstance(taxon_id,int)
 
-        # The canonical name is the Latin name; the "scientific name"
-        #
+        # The "canonical name" is the Latin name; the "scientific name"
+        # column includes other information. For example:
+        #
+        # "scientificName": Schizophoria impressa (Hall, 1843)
+        # "canonicalName": Schizophoria impressa
+        #
+        # Also see:
         #
         # http://globalnames.org/docs/glossary/
 
@@ -307,12 +374,18 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
         gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+    # GBIF common name mapping
+
+    gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
     for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
 
         taxon_id = row['taxonID']
+        assert isinstance(taxon_id,int)
 
         # Don't include taxon IDs that were removed from the master table
         if taxon_id not in gbif_taxon_id_to_scientific:
+            gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
             continue
 
         vernacular_name = row['vernacularName']
@@ -321,8 +394,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
         gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
 
+    print('Finished GBIF common --> scientific mapping, failed to map {} of {} taxon IDs'.format(
+        len(gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file),
+        len(gbif_common_mapping)
+        ))
+
 
-
+    ##%% Save everything to file
 
     structures_to_serialize = [
         inat_taxonomy,
@@ -344,7 +422,10 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     if not os.path.isfile(serialized_structures_file):
         with open(serialized_structures_file, 'wb') as p:
            pickle.dump(structures_to_serialize, p)
-            print('
+            print('...done')
+
+
+    #%%
 
     # ...def initialize_taxonomy_lookup(...)
 
@@ -412,7 +493,8 @@ def traverse_taxonomy(matching_rownums: Sequence[int],
     while True:
 
         taxon_id = current_row['taxonID']
-        vernacular_names = taxon_id_to_vernacular[taxon_id]
+        # sort for determinism
+        vernacular_names = sorted(taxon_id_to_vernacular[taxon_id])
         match_details.append((taxon_id, current_row['taxonRank'],
                               get_scientific_name_from_row(current_row),
                               vernacular_names))