megadetector 10.0.6__py3-none-any.whl → 10.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- megadetector/data_management/cct_json_utils.py +16 -6
- megadetector/data_management/databases/subset_json_db.py +57 -2
- megadetector/detection/pytorch_detector.py +29 -15
- megadetector/detection/run_inference_with_yolov5_val.py +3 -1
- megadetector/detection/run_tiled_inference.py +5 -2
- megadetector/detection/video_utils.py +23 -7
- megadetector/postprocessing/classification_postprocessing.py +218 -69
- megadetector/postprocessing/convert_output_format.py +81 -87
- megadetector/postprocessing/subset_json_detector_output.py +3 -0
- megadetector/utils/directory_listing.py +19 -13
- megadetector/utils/path_utils.py +58 -8
- megadetector/utils/url_utils.py +91 -1
- megadetector/utils/wi_taxonomy_utils.py +44 -26
- megadetector/visualization/visualize_video_output.py +16 -6
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/METADATA +134 -134
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/RECORD +19 -19
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/licenses/LICENSE +0 -0
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/top_level.txt +0 -0
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/WHEEL +0 -0
|
@@ -13,14 +13,15 @@ Functions for postprocessing species classification results, particularly:
|
|
|
13
13
|
|
|
14
14
|
#%% Constants and imports
|
|
15
15
|
|
|
16
|
-
import os
|
|
17
16
|
import json
|
|
18
17
|
import copy
|
|
18
|
+
import pandas as pd
|
|
19
19
|
|
|
20
20
|
from collections import defaultdict
|
|
21
21
|
from tqdm import tqdm
|
|
22
22
|
|
|
23
23
|
from megadetector.utils.ct_utils import is_list_sorted
|
|
24
|
+
from megadetector.utils.ct_utils import is_empty
|
|
24
25
|
from megadetector.utils.ct_utils import sort_dictionary_by_value
|
|
25
26
|
from megadetector.utils.ct_utils import sort_dictionary_by_key
|
|
26
27
|
from megadetector.utils.ct_utils import invert_dictionary
|
|
@@ -29,9 +30,9 @@ from megadetector.utils.wi_taxonomy_utils import clean_taxonomy_string
|
|
|
29
30
|
from megadetector.utils.wi_taxonomy_utils import taxonomy_level_index
|
|
30
31
|
from megadetector.utils.wi_taxonomy_utils import taxonomy_level_string_to_index
|
|
31
32
|
|
|
32
|
-
from megadetector.utils.wi_taxonomy_utils import non_taxonomic_prediction_strings
|
|
33
33
|
from megadetector.utils.wi_taxonomy_utils import human_prediction_string
|
|
34
34
|
from megadetector.utils.wi_taxonomy_utils import animal_prediction_string
|
|
35
|
+
from megadetector.utils.wi_taxonomy_utils import is_taxonomic_prediction_string
|
|
35
36
|
from megadetector.utils.wi_taxonomy_utils import blank_prediction_string # noqa
|
|
36
37
|
|
|
37
38
|
|
|
@@ -129,7 +130,7 @@ class ClassificationSmoothingOptions:
|
|
|
129
130
|
|
|
130
131
|
## Populated internally
|
|
131
132
|
|
|
132
|
-
#:
|
|
133
|
+
#: Only include these categories in the smoothing process (None to use all categories)
|
|
133
134
|
self._detection_category_ids_to_smooth = None
|
|
134
135
|
|
|
135
136
|
|
|
@@ -1014,6 +1015,10 @@ def smooth_classification_results_sequence_level(input_file,
|
|
|
1014
1015
|
|
|
1015
1016
|
detections_this_sequence = []
|
|
1016
1017
|
for image_filename in image_filenames_this_sequence:
|
|
1018
|
+
if image_filename not in image_fn_to_classification_results:
|
|
1019
|
+
print('Warning: {} in sequence list but not in results'.format(
|
|
1020
|
+
image_filename))
|
|
1021
|
+
continue
|
|
1017
1022
|
im = image_fn_to_classification_results[image_filename]
|
|
1018
1023
|
if 'detections' not in im or im['detections'] is None:
|
|
1019
1024
|
continue
|
|
@@ -1101,16 +1106,16 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1101
1106
|
output_file,
|
|
1102
1107
|
allow_walk_down=False,
|
|
1103
1108
|
add_pre_filtering_description=True,
|
|
1104
|
-
allow_redundant_latin_names=
|
|
1109
|
+
allow_redundant_latin_names=True,
|
|
1110
|
+
protected_common_names=None,
|
|
1111
|
+
use_original_common_names_if_available=True,
|
|
1112
|
+
verbose=True):
|
|
1105
1113
|
"""
|
|
1106
1114
|
Given a prediction file in MD .json format, likely without having had
|
|
1107
1115
|
a geofence applied, apply a custom taxa list.
|
|
1108
1116
|
|
|
1109
1117
|
Args:
|
|
1110
|
-
taxa_list (str
|
|
1111
|
-
a list of latin names. Optionally may contain a second (comma-delimited)
|
|
1112
|
-
column containing common names, used only for debugging. Latin names
|
|
1113
|
-
must exist in the SpeciesNet taxonomy.
|
|
1118
|
+
taxa_list (str): .csv file with at least the columns "latin" and "common".
|
|
1114
1119
|
speciesnet_taxonomy_file (str): taxonomy filename, in the same format used for
|
|
1115
1120
|
model release (with 7-token taxonomy entries)
|
|
1116
1121
|
input_file (str): .json file to read, in MD format. This can be None, in which
|
|
@@ -1128,45 +1133,73 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1128
1133
|
if the same latin name appears twice in the taxonomy list; if True, we'll
|
|
1129
1134
|
just print a warning and ignore all entries other than the first for this
|
|
1130
1135
|
latin name
|
|
1136
|
+
protected_common_names (list, optional): these categories should be
|
|
1137
|
+
unmodified, even if they aren't used, or have the same taxonomic
|
|
1138
|
+
description as other categories
|
|
1139
|
+
use_original_common_names_if_available (bool, optional): if an "original_common"
|
|
1140
|
+
column is present in [taxa_list], use those common names instead of the ones
|
|
1141
|
+
in the taxonomy file
|
|
1142
|
+
verbose (bool, optional): enable additional debug output
|
|
1131
1143
|
"""
|
|
1132
1144
|
|
|
1133
1145
|
##%% Read target taxa list
|
|
1134
1146
|
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1147
|
+
taxa_list_df = pd.read_csv(taxa_list)
|
|
1148
|
+
|
|
1149
|
+
required_columns = ('latin','common')
|
|
1150
|
+
for s in required_columns:
|
|
1151
|
+
assert s in taxa_list_df.columns, \
|
|
1152
|
+
'Required column {} missing from taxonomy list file {}'.format(
|
|
1153
|
+
s,taxa_list)
|
|
1154
|
+
|
|
1155
|
+
# Convert the "latin" and "common" columns in taxa_list_df to lowercase
|
|
1156
|
+
taxa_list_df['latin'] = taxa_list_df['latin'].str.lower()
|
|
1157
|
+
taxa_list_df['common'] = taxa_list_df['common'].str.lower()
|
|
1140
1158
|
|
|
1141
|
-
|
|
1142
|
-
|
|
1159
|
+
# Remove rows from taxa_list_df where the "latin" column is nan,
|
|
1160
|
+
# printing a warning for each row (with a string representation of the whole row)
|
|
1161
|
+
for i_row,row in taxa_list_df.iterrows():
|
|
1162
|
+
if pd.isna(row['latin']):
|
|
1163
|
+
if verbose:
|
|
1164
|
+
print('Warning: Skipping row with empty "latin" column in {}:\n{}\n'.format(
|
|
1165
|
+
taxa_list,str(row.to_dict())))
|
|
1166
|
+
taxa_list_df.drop(index=i_row, inplace=True)
|
|
1143
1167
|
|
|
1168
|
+
# Convert all NaN values in the "common" column to empty strings
|
|
1169
|
+
taxa_list_df['common'] = taxa_list_df['common'].fillna('')
|
|
1170
|
+
|
|
1171
|
+
# Create a dictionary mapping latin names to common names
|
|
1144
1172
|
target_latin_to_common = {}
|
|
1145
1173
|
|
|
1146
|
-
for
|
|
1174
|
+
for i_row,row in taxa_list_df.iterrows():
|
|
1147
1175
|
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1176
|
+
latin = row['latin']
|
|
1177
|
+
common = row['common']
|
|
1178
|
+
|
|
1179
|
+
if use_original_common_names_if_available and \
|
|
1180
|
+
('original_common' in row) and \
|
|
1181
|
+
(not is_empty(row['original_common'])):
|
|
1182
|
+
common = row['original_common'].strip().lower()
|
|
1183
|
+
|
|
1184
|
+
# Valid latin names have either one token (e.g. "canidae"),
|
|
1185
|
+
# two tokens (e.g. "bos taurus"), or three tokens (e.g. "canis lupus familiaris")
|
|
1186
|
+
assert len(latin.split(' ')) in (1,2,3), \
|
|
1187
|
+
'Illegal binomial name {} in taxaonomy list {}'.format(
|
|
1188
|
+
latin,taxa_list)
|
|
1189
|
+
|
|
1190
|
+
if latin in target_latin_to_common:
|
|
1191
|
+
error_string = \
|
|
1192
|
+
'scientific name {} appears multiple times in the taxonomy list'.format(
|
|
1193
|
+
latin)
|
|
1163
1194
|
if allow_redundant_latin_names:
|
|
1164
|
-
|
|
1195
|
+
if verbose:
|
|
1196
|
+
print('Warning: {}'.format(error_string))
|
|
1165
1197
|
else:
|
|
1166
1198
|
raise ValueError(error_string)
|
|
1167
|
-
target_latin_to_common[binomial_name] = common_name
|
|
1168
1199
|
|
|
1169
|
-
|
|
1200
|
+
target_latin_to_common[latin] = common
|
|
1201
|
+
|
|
1202
|
+
# ...for each row in the custom taxonomy list
|
|
1170
1203
|
|
|
1171
1204
|
|
|
1172
1205
|
##%% Read taxonomy file
|
|
@@ -1185,7 +1218,7 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1185
1218
|
def _insert_taxonomy_string(s):
|
|
1186
1219
|
|
|
1187
1220
|
tokens = s.split(';')
|
|
1188
|
-
assert len(tokens) == 7
|
|
1221
|
+
assert len(tokens) == 7, 'Illegal taxonomy string {}'.format(s)
|
|
1189
1222
|
|
|
1190
1223
|
guid = tokens[0] # noqa
|
|
1191
1224
|
class_name = tokens[1]
|
|
@@ -1196,20 +1229,24 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1196
1229
|
common_name = tokens[6]
|
|
1197
1230
|
|
|
1198
1231
|
if len(class_name) == 0:
|
|
1199
|
-
assert common_name in ('animal','vehicle','blank')
|
|
1232
|
+
assert common_name in ('animal','vehicle','blank'), \
|
|
1233
|
+
'Illegal common name {}'.format(common_name)
|
|
1200
1234
|
return
|
|
1201
1235
|
|
|
1202
1236
|
if len(species) > 0:
|
|
1203
|
-
assert all([len(s) > 0 for s in [genus,family,order]])
|
|
1237
|
+
assert all([len(s) > 0 for s in [genus,family,order]]), \
|
|
1238
|
+
'Higher-level taxa missing for {}: {},{},{}'.format(s,genus,family,order)
|
|
1204
1239
|
binomial_name = genus + ' ' + species
|
|
1205
1240
|
if binomial_name not in speciesnet_latin_name_to_taxon_string:
|
|
1206
1241
|
speciesnet_latin_name_to_taxon_string[binomial_name] = s
|
|
1207
1242
|
elif len(genus) > 0:
|
|
1208
|
-
assert all([len(s) > 0 for s in [family,order]])
|
|
1243
|
+
assert all([len(s) > 0 for s in [family,order]]), \
|
|
1244
|
+
'Higher-level taxa missing for {}: {},{}'.format(s,family,order)
|
|
1209
1245
|
if genus not in speciesnet_latin_name_to_taxon_string:
|
|
1210
1246
|
speciesnet_latin_name_to_taxon_string[genus] = s
|
|
1211
1247
|
elif len(family) > 0:
|
|
1212
|
-
assert len(order) > 0
|
|
1248
|
+
assert len(order) > 0, \
|
|
1249
|
+
'Higher-level taxa missing for {}: {}'.format(s,order)
|
|
1213
1250
|
if family not in speciesnet_latin_name_to_taxon_string:
|
|
1214
1251
|
speciesnet_latin_name_to_taxon_string[family] = s
|
|
1215
1252
|
elif len(order) > 0:
|
|
@@ -1232,12 +1269,19 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1232
1269
|
|
|
1233
1270
|
# In theory any taxon that appears as the parent of another taxon should
|
|
1234
1271
|
# also be in the taxonomy, but this isn't always true, so we fix it here.
|
|
1235
|
-
|
|
1236
1272
|
new_taxon_string_to_missing_tokens = defaultdict(list)
|
|
1237
1273
|
|
|
1274
|
+
# While we're making this loop, also see whether we need to store any custom
|
|
1275
|
+
# common name mappings based on the taxonomy list.
|
|
1276
|
+
speciesnet_latin_name_to_output_common_name = {}
|
|
1277
|
+
|
|
1238
1278
|
# latin_name = next(iter(speciesnet_latin_name_to_taxon_string.keys()))
|
|
1239
1279
|
for latin_name in speciesnet_latin_name_to_taxon_string.keys():
|
|
1240
1280
|
|
|
1281
|
+
if latin_name in target_latin_to_common:
|
|
1282
|
+
speciesnet_latin_name_to_output_common_name[latin_name] = \
|
|
1283
|
+
target_latin_to_common[latin_name]
|
|
1284
|
+
|
|
1241
1285
|
if 'no cv result' in latin_name:
|
|
1242
1286
|
continue
|
|
1243
1287
|
|
|
@@ -1260,7 +1304,8 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1260
1304
|
for i_copy_token in range(1,i_token+1):
|
|
1261
1305
|
new_tokens[i_copy_token] = tokens[i_copy_token]
|
|
1262
1306
|
new_tokens[-1] = test_token + ' species'
|
|
1263
|
-
assert new_tokens[-2] == ''
|
|
1307
|
+
assert new_tokens[-2] == '', \
|
|
1308
|
+
'Illegal taxonomy string {}'.format(taxon_string)
|
|
1264
1309
|
new_taxon_string = ';'.join(new_tokens)
|
|
1265
1310
|
# assert new_taxon_string not in new_taxon_strings
|
|
1266
1311
|
new_taxon_string_to_missing_tokens[new_taxon_string].append(test_token)
|
|
@@ -1269,14 +1314,19 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1269
1314
|
|
|
1270
1315
|
# ...for each taxon
|
|
1271
1316
|
|
|
1272
|
-
print('Found {} taxa that need to be inserted to make the taxonomy valid:\n'.format(
|
|
1273
|
-
len(new_taxon_string_to_missing_tokens)))
|
|
1274
|
-
|
|
1275
1317
|
new_taxon_string_to_missing_tokens = \
|
|
1276
1318
|
sort_dictionary_by_key(new_taxon_string_to_missing_tokens)
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1319
|
+
|
|
1320
|
+
if verbose:
|
|
1321
|
+
|
|
1322
|
+
print(f'Found {len(new_taxon_string_to_missing_tokens)} taxa that need to be inserted to ' + \
|
|
1323
|
+
'make the taxonomy valid, showing only mammals and birds here:\n')
|
|
1324
|
+
|
|
1325
|
+
for taxon_string in new_taxon_string_to_missing_tokens:
|
|
1326
|
+
if 'mammalia' not in taxon_string and 'aves' not in taxon_string:
|
|
1327
|
+
continue
|
|
1328
|
+
missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
|
|
1329
|
+
print('{} ({})'.format(taxon_string,missing_taxa))
|
|
1280
1330
|
|
|
1281
1331
|
for new_taxon_string in new_taxon_string_to_missing_tokens:
|
|
1282
1332
|
_insert_taxonomy_string(new_taxon_string)
|
|
@@ -1298,7 +1348,7 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1298
1348
|
n_failed_mappings += 1
|
|
1299
1349
|
|
|
1300
1350
|
if n_failed_mappings > 0:
|
|
1301
|
-
raise ValueError('Cannot continue with
|
|
1351
|
+
raise ValueError('Cannot continue with taxonomic restriction')
|
|
1302
1352
|
|
|
1303
1353
|
|
|
1304
1354
|
##%% For the allow-list, map each parent taxon to a set of allowable child taxa
|
|
@@ -1312,7 +1362,8 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1312
1362
|
|
|
1313
1363
|
taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
|
|
1314
1364
|
tokens = taxon_string.split(';')
|
|
1315
|
-
assert len(tokens) == 7
|
|
1365
|
+
assert len(tokens) == 7, \
|
|
1366
|
+
'Illegal taxonomy string {}'.format(taxon_string)
|
|
1316
1367
|
|
|
1317
1368
|
# Remove GUID and common name
|
|
1318
1369
|
#
|
|
@@ -1324,25 +1375,85 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1324
1375
|
# If this is a species
|
|
1325
1376
|
if len(tokens[-1]) > 0:
|
|
1326
1377
|
binomial_name = tokens[-2] + ' ' + tokens[-1]
|
|
1327
|
-
assert binomial_name == latin_name
|
|
1378
|
+
assert binomial_name == latin_name, \
|
|
1379
|
+
'Binomial/latin mismatch: {} vs {}'.format(binomial_name,latin_name)
|
|
1380
|
+
# If this already exists, it should only allow "None"
|
|
1381
|
+
if binomial_name in allowed_parent_taxon_to_child_taxa:
|
|
1382
|
+
assert len(allowed_parent_taxon_to_child_taxa[binomial_name]) == 1, \
|
|
1383
|
+
'Species-level entry {} has multiple children'.format(binomial_name)
|
|
1384
|
+
assert None in allowed_parent_taxon_to_child_taxa[binomial_name], \
|
|
1385
|
+
'Species-level entry {} has non-None children'.format(binomial_name)
|
|
1328
1386
|
allowed_parent_taxon_to_child_taxa[binomial_name].add(None)
|
|
1329
1387
|
child_taxon = binomial_name
|
|
1330
1388
|
|
|
1331
|
-
# The first
|
|
1389
|
+
# The first level that can ever be a parent taxon is the genus level
|
|
1332
1390
|
parent_token_index = len(tokens) - 2
|
|
1333
1391
|
|
|
1392
|
+
# Walk up from the genus level to the top of the taxonomy string (token index 0)
|
|
1334
1393
|
while(parent_token_index >= 0):
|
|
1335
1394
|
|
|
1395
|
+
# "None" is our leaf node marker, we should never have ''
|
|
1396
|
+
if child_taxon is not None:
|
|
1397
|
+
assert len(child_taxon) > 0
|
|
1398
|
+
|
|
1336
1399
|
parent_taxon = tokens[parent_token_index]
|
|
1337
|
-
|
|
1338
|
-
|
|
1400
|
+
|
|
1401
|
+
# Don't create entries for blank taxa
|
|
1402
|
+
if (len(parent_taxon) > 0):
|
|
1403
|
+
|
|
1404
|
+
create_child = True
|
|
1405
|
+
|
|
1406
|
+
# This is the lowest-level taxon in this entry
|
|
1407
|
+
if (child_taxon is None):
|
|
1408
|
+
|
|
1409
|
+
# ...but we don't want to remove existing children from any parents
|
|
1410
|
+
if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
|
|
1411
|
+
(len(allowed_parent_taxon_to_child_taxa[parent_taxon]) > 0):
|
|
1412
|
+
if verbose:
|
|
1413
|
+
existing_children_string = str(allowed_parent_taxon_to_child_taxa[parent_taxon])
|
|
1414
|
+
print('Not creating empty child for parent {} (already has children {})'.format(
|
|
1415
|
+
parent_taxon,existing_children_string))
|
|
1416
|
+
create_child = False
|
|
1417
|
+
|
|
1418
|
+
# If we're adding a new child entry, clear out any leaf node markers
|
|
1419
|
+
else:
|
|
1420
|
+
|
|
1421
|
+
if (parent_taxon in allowed_parent_taxon_to_child_taxa) and \
|
|
1422
|
+
(None in allowed_parent_taxon_to_child_taxa[parent_taxon]):
|
|
1423
|
+
|
|
1424
|
+
assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
|
|
1425
|
+
'Illlegal parent/child configuration'
|
|
1426
|
+
|
|
1427
|
+
if verbose:
|
|
1428
|
+
print('Un-marking parent {} as a leaf node because of child {}'.format(
|
|
1429
|
+
parent_taxon,child_taxon))
|
|
1430
|
+
|
|
1431
|
+
allowed_parent_taxon_to_child_taxa[parent_taxon] = set()
|
|
1432
|
+
|
|
1433
|
+
if create_child:
|
|
1434
|
+
allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)
|
|
1435
|
+
|
|
1436
|
+
# If we haven't hit a non-empty taxon yet, don't update "child_taxon"
|
|
1437
|
+
assert len(parent_taxon) > 0
|
|
1438
|
+
child_taxon = parent_taxon
|
|
1439
|
+
|
|
1440
|
+
# ...if we have a non-empty taxon
|
|
1441
|
+
|
|
1339
1442
|
parent_token_index -= 1
|
|
1340
1443
|
|
|
1444
|
+
# ...for each taxonomic level
|
|
1445
|
+
|
|
1341
1446
|
# ...for each allowed latin name
|
|
1342
1447
|
|
|
1343
1448
|
allowed_parent_taxon_to_child_taxa = \
|
|
1344
1449
|
sort_dictionary_by_key(allowed_parent_taxon_to_child_taxa)
|
|
1345
1450
|
|
|
1451
|
+
for parent_taxon in allowed_parent_taxon_to_child_taxa:
|
|
1452
|
+
# "None" should only ever appear alone; this marks a leaf node with no children
|
|
1453
|
+
if None in allowed_parent_taxon_to_child_taxa[parent_taxon]:
|
|
1454
|
+
assert len(allowed_parent_taxon_to_child_taxa[parent_taxon]) == 1, \
|
|
1455
|
+
'"None" should only appear alone in a child taxon list'
|
|
1456
|
+
|
|
1346
1457
|
|
|
1347
1458
|
##%% If we were just validating the custom taxa file, we're done
|
|
1348
1459
|
|
|
@@ -1369,11 +1480,25 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1369
1480
|
|
|
1370
1481
|
input_taxon_string = input_category_id_to_taxonomy_string[input_category_id]
|
|
1371
1482
|
input_taxon_tokens = input_taxon_string.split(';')
|
|
1372
|
-
assert len(input_taxon_tokens) == 7
|
|
1483
|
+
assert len(input_taxon_tokens) == 7, \
|
|
1484
|
+
'Illegal taxonomy string: {}'.format(input_taxon_string)
|
|
1373
1485
|
|
|
1374
|
-
# Don't mess with blank/no-cv-result/animal
|
|
1375
|
-
if (input_taxon_string
|
|
1486
|
+
# Don't mess with blank/no-cv-result/human (or "animal", which is really "unknown")
|
|
1487
|
+
if (not is_taxonomic_prediction_string(input_taxon_string)) or \
|
|
1376
1488
|
(input_taxon_string == human_prediction_string):
|
|
1489
|
+
if verbose:
|
|
1490
|
+
print('Not messing with non-taxonomic category {}'.format(input_taxon_string))
|
|
1491
|
+
input_category_id_to_output_taxon_string[input_category_id] = \
|
|
1492
|
+
input_taxon_string
|
|
1493
|
+
continue
|
|
1494
|
+
|
|
1495
|
+
# Don't mess with protected categories
|
|
1496
|
+
common_name = input_taxon_tokens[-1]
|
|
1497
|
+
|
|
1498
|
+
if (protected_common_names is not None) and \
|
|
1499
|
+
(common_name in protected_common_names):
|
|
1500
|
+
if verbose:
|
|
1501
|
+
print('Not messing with protected category {}'.format(common_name))
|
|
1377
1502
|
input_category_id_to_output_taxon_string[input_category_id] = \
|
|
1378
1503
|
input_taxon_string
|
|
1379
1504
|
continue
|
|
@@ -1403,19 +1528,23 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1403
1528
|
test_index -= 1
|
|
1404
1529
|
continue
|
|
1405
1530
|
|
|
1406
|
-
assert test_taxon_name in speciesnet_latin_name_to_taxon_string
|
|
1531
|
+
assert test_taxon_name in speciesnet_latin_name_to_taxon_string, \
|
|
1532
|
+
'{} should be a substring of {}'.format(test_taxon_name,
|
|
1533
|
+
speciesnet_latin_name_to_taxon_string)
|
|
1407
1534
|
|
|
1408
1535
|
# Is this taxon allowed according to the custom species list?
|
|
1409
1536
|
if test_taxon_name in allowed_parent_taxon_to_child_taxa:
|
|
1410
1537
|
|
|
1411
1538
|
allowed_child_taxa = allowed_parent_taxon_to_child_taxa[test_taxon_name]
|
|
1412
|
-
assert allowed_child_taxa is not None
|
|
1539
|
+
assert allowed_child_taxa is not None, \
|
|
1540
|
+
'allowed_child_taxa should not be None: {}'.format(test_taxon_name)
|
|
1413
1541
|
|
|
1414
1542
|
# If this is the lowest-level allowable token or there is not a
|
|
1415
1543
|
# unique child, don't walk any further, even if walking down
|
|
1416
1544
|
# is enabled.
|
|
1417
|
-
if
|
|
1418
|
-
assert len(allowed_child_taxa) == 1
|
|
1545
|
+
if None in allowed_child_taxa:
|
|
1546
|
+
assert len(allowed_child_taxa) == 1, \
|
|
1547
|
+
'"None" should not be listed as a child taxa with other child taxa'
|
|
1419
1548
|
|
|
1420
1549
|
if (None in allowed_child_taxa) or (len(allowed_child_taxa) > 1):
|
|
1421
1550
|
target_taxon = test_taxon_name
|
|
@@ -1427,8 +1556,12 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1427
1556
|
while ((next(iter(allowed_child_taxa)) is not None) and \
|
|
1428
1557
|
(len(allowed_child_taxa) == 1)):
|
|
1429
1558
|
candidate_taxon = next(iter(allowed_child_taxa))
|
|
1430
|
-
assert candidate_taxon in allowed_parent_taxon_to_child_taxa
|
|
1431
|
-
|
|
1559
|
+
assert candidate_taxon in allowed_parent_taxon_to_child_taxa, \
|
|
1560
|
+
'{} should be a subset of {}'.format(
|
|
1561
|
+
candidate_taxon,allowed_parent_taxon_to_child_taxa)
|
|
1562
|
+
assert candidate_taxon in speciesnet_latin_name_to_taxon_string, \
|
|
1563
|
+
'{} should be a subset of {}'.format(
|
|
1564
|
+
candidate_taxon,speciesnet_latin_name_to_taxon_string)
|
|
1432
1565
|
allowed_child_taxa = \
|
|
1433
1566
|
allowed_parent_taxon_to_child_taxa[candidate_taxon]
|
|
1434
1567
|
target_taxon = candidate_taxon
|
|
@@ -1450,21 +1583,30 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1450
1583
|
|
|
1451
1584
|
##%% Build the new tables
|
|
1452
1585
|
|
|
1586
|
+
speciesnet_taxon_string_to_latin_name = invert_dictionary(speciesnet_latin_name_to_taxon_string)
|
|
1587
|
+
|
|
1453
1588
|
input_category_id_to_output_category_id = {}
|
|
1454
1589
|
output_taxon_string_to_category_id = {}
|
|
1455
1590
|
output_category_id_to_common_name = {}
|
|
1456
1591
|
|
|
1457
1592
|
for input_category_id in input_category_id_to_output_taxon_string:
|
|
1458
1593
|
|
|
1459
|
-
original_common_name = \
|
|
1460
|
-
input_category_id_to_common_name[input_category_id]
|
|
1461
|
-
original_taxon_string = \
|
|
1462
|
-
input_category_id_to_taxonomy_string[input_category_id]
|
|
1463
1594
|
output_taxon_string = \
|
|
1464
1595
|
input_category_id_to_output_taxon_string[input_category_id]
|
|
1465
1596
|
|
|
1466
1597
|
output_common_name = output_taxon_string.split(';')[-1]
|
|
1467
1598
|
|
|
1599
|
+
# Possibly substitute a custom common name
|
|
1600
|
+
if output_taxon_string in speciesnet_taxon_string_to_latin_name:
|
|
1601
|
+
|
|
1602
|
+
speciesnet_latin_name = speciesnet_taxon_string_to_latin_name[output_taxon_string]
|
|
1603
|
+
|
|
1604
|
+
if speciesnet_latin_name in speciesnet_latin_name_to_output_common_name:
|
|
1605
|
+
custom_common_name = speciesnet_latin_name_to_output_common_name[speciesnet_latin_name]
|
|
1606
|
+
if custom_common_name != output_common_name:
|
|
1607
|
+
print('Substituting common name {} for {}'.format(custom_common_name,output_common_name))
|
|
1608
|
+
output_common_name = custom_common_name
|
|
1609
|
+
|
|
1468
1610
|
# Do we need to create a new output category?
|
|
1469
1611
|
if output_taxon_string not in output_taxon_string_to_category_id:
|
|
1470
1612
|
output_category_id = str(len(output_taxon_string_to_category_id))
|
|
@@ -1479,21 +1621,28 @@ def restrict_to_taxa_list(taxa_list,
|
|
|
1479
1621
|
input_category_id_to_output_category_id[input_category_id] = \
|
|
1480
1622
|
output_category_id
|
|
1481
1623
|
|
|
1624
|
+
# Sometimes-useful debug printouts
|
|
1482
1625
|
if False:
|
|
1626
|
+
original_common_name = \
|
|
1627
|
+
input_category_id_to_common_name[input_category_id]
|
|
1628
|
+
|
|
1629
|
+
original_taxon_string = \
|
|
1630
|
+
input_category_id_to_taxonomy_string[input_category_id]
|
|
1631
|
+
|
|
1483
1632
|
print('Mapping {} ({}) to:\n{} ({})\n'.format(
|
|
1484
1633
|
original_common_name,original_taxon_string,
|
|
1485
1634
|
output_common_name,output_taxon_string))
|
|
1486
|
-
if False:
|
|
1487
1635
|
print('Mapping {} to {}'.format(
|
|
1488
1636
|
original_common_name,output_common_name,))
|
|
1489
1637
|
|
|
1490
1638
|
# ...for each category
|
|
1491
1639
|
|
|
1492
1640
|
|
|
1493
|
-
|
|
1641
|
+
#%% Remap all category labels
|
|
1494
1642
|
|
|
1495
1643
|
assert len(set(output_taxon_string_to_category_id.keys())) == \
|
|
1496
|
-
len(set(output_taxon_string_to_category_id.values()))
|
|
1644
|
+
len(set(output_taxon_string_to_category_id.values())), \
|
|
1645
|
+
'Category ID/value non-uniqueness error'
|
|
1497
1646
|
|
|
1498
1647
|
output_category_id_to_taxon_string = \
|
|
1499
1648
|
invert_dictionary(output_taxon_string_to_category_id)
|