megadetector 10.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- megadetector/__init__.py +0 -0
- megadetector/api/__init__.py +0 -0
- megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
- megadetector/classification/__init__.py +0 -0
- megadetector/classification/aggregate_classifier_probs.py +108 -0
- megadetector/classification/analyze_failed_images.py +227 -0
- megadetector/classification/cache_batchapi_outputs.py +198 -0
- megadetector/classification/create_classification_dataset.py +626 -0
- megadetector/classification/crop_detections.py +516 -0
- megadetector/classification/csv_to_json.py +226 -0
- megadetector/classification/detect_and_crop.py +853 -0
- megadetector/classification/efficientnet/__init__.py +9 -0
- megadetector/classification/efficientnet/model.py +415 -0
- megadetector/classification/efficientnet/utils.py +608 -0
- megadetector/classification/evaluate_model.py +520 -0
- megadetector/classification/identify_mislabeled_candidates.py +152 -0
- megadetector/classification/json_to_azcopy_list.py +63 -0
- megadetector/classification/json_validator.py +696 -0
- megadetector/classification/map_classification_categories.py +276 -0
- megadetector/classification/merge_classification_detection_output.py +509 -0
- megadetector/classification/prepare_classification_script.py +194 -0
- megadetector/classification/prepare_classification_script_mc.py +228 -0
- megadetector/classification/run_classifier.py +287 -0
- megadetector/classification/save_mislabeled.py +110 -0
- megadetector/classification/train_classifier.py +827 -0
- megadetector/classification/train_classifier_tf.py +725 -0
- megadetector/classification/train_utils.py +323 -0
- megadetector/data_management/__init__.py +0 -0
- megadetector/data_management/animl_to_md.py +161 -0
- megadetector/data_management/annotations/__init__.py +0 -0
- megadetector/data_management/annotations/annotation_constants.py +33 -0
- megadetector/data_management/camtrap_dp_to_coco.py +270 -0
- megadetector/data_management/cct_json_utils.py +566 -0
- megadetector/data_management/cct_to_md.py +184 -0
- megadetector/data_management/cct_to_wi.py +293 -0
- megadetector/data_management/coco_to_labelme.py +284 -0
- megadetector/data_management/coco_to_yolo.py +702 -0
- megadetector/data_management/databases/__init__.py +0 -0
- megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
- megadetector/data_management/databases/integrity_check_json_db.py +528 -0
- megadetector/data_management/databases/subset_json_db.py +195 -0
- megadetector/data_management/generate_crops_from_cct.py +200 -0
- megadetector/data_management/get_image_sizes.py +164 -0
- megadetector/data_management/labelme_to_coco.py +559 -0
- megadetector/data_management/labelme_to_yolo.py +349 -0
- megadetector/data_management/lila/__init__.py +0 -0
- megadetector/data_management/lila/create_lila_blank_set.py +556 -0
- megadetector/data_management/lila/create_lila_test_set.py +187 -0
- megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
- megadetector/data_management/lila/download_lila_subset.py +182 -0
- megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
- megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
- megadetector/data_management/lila/get_lila_image_counts.py +112 -0
- megadetector/data_management/lila/lila_common.py +319 -0
- megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
- megadetector/data_management/mewc_to_md.py +344 -0
- megadetector/data_management/ocr_tools.py +873 -0
- megadetector/data_management/read_exif.py +964 -0
- megadetector/data_management/remap_coco_categories.py +195 -0
- megadetector/data_management/remove_exif.py +156 -0
- megadetector/data_management/rename_images.py +194 -0
- megadetector/data_management/resize_coco_dataset.py +663 -0
- megadetector/data_management/speciesnet_to_md.py +41 -0
- megadetector/data_management/wi_download_csv_to_coco.py +247 -0
- megadetector/data_management/yolo_output_to_md_output.py +594 -0
- megadetector/data_management/yolo_to_coco.py +876 -0
- megadetector/data_management/zamba_to_md.py +188 -0
- megadetector/detection/__init__.py +0 -0
- megadetector/detection/change_detection.py +840 -0
- megadetector/detection/process_video.py +479 -0
- megadetector/detection/pytorch_detector.py +1451 -0
- megadetector/detection/run_detector.py +1267 -0
- megadetector/detection/run_detector_batch.py +2159 -0
- megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
- megadetector/detection/run_md_and_speciesnet.py +1494 -0
- megadetector/detection/run_tiled_inference.py +1038 -0
- megadetector/detection/tf_detector.py +209 -0
- megadetector/detection/video_utils.py +1379 -0
- megadetector/postprocessing/__init__.py +0 -0
- megadetector/postprocessing/add_max_conf.py +72 -0
- megadetector/postprocessing/categorize_detections_by_size.py +166 -0
- megadetector/postprocessing/classification_postprocessing.py +1752 -0
- megadetector/postprocessing/combine_batch_outputs.py +249 -0
- megadetector/postprocessing/compare_batch_results.py +2110 -0
- megadetector/postprocessing/convert_output_format.py +403 -0
- megadetector/postprocessing/create_crop_folder.py +629 -0
- megadetector/postprocessing/detector_calibration.py +570 -0
- megadetector/postprocessing/generate_csv_report.py +522 -0
- megadetector/postprocessing/load_api_results.py +223 -0
- megadetector/postprocessing/md_to_coco.py +428 -0
- megadetector/postprocessing/md_to_labelme.py +351 -0
- megadetector/postprocessing/md_to_wi.py +41 -0
- megadetector/postprocessing/merge_detections.py +392 -0
- megadetector/postprocessing/postprocess_batch_results.py +2077 -0
- megadetector/postprocessing/remap_detection_categories.py +226 -0
- megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
- megadetector/postprocessing/separate_detections_into_folders.py +795 -0
- megadetector/postprocessing/subset_json_detector_output.py +964 -0
- megadetector/postprocessing/top_folders_to_bottom.py +238 -0
- megadetector/postprocessing/validate_batch_results.py +332 -0
- megadetector/taxonomy_mapping/__init__.py +0 -0
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +213 -0
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
- megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
- megadetector/taxonomy_mapping/simple_image_download.py +224 -0
- megadetector/taxonomy_mapping/species_lookup.py +1008 -0
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
- megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
- megadetector/tests/__init__.py +0 -0
- megadetector/tests/test_nms_synthetic.py +335 -0
- megadetector/utils/__init__.py +0 -0
- megadetector/utils/ct_utils.py +1857 -0
- megadetector/utils/directory_listing.py +199 -0
- megadetector/utils/extract_frames_from_video.py +307 -0
- megadetector/utils/gpu_test.py +125 -0
- megadetector/utils/md_tests.py +2072 -0
- megadetector/utils/path_utils.py +2832 -0
- megadetector/utils/process_utils.py +172 -0
- megadetector/utils/split_locations_into_train_val.py +237 -0
- megadetector/utils/string_utils.py +234 -0
- megadetector/utils/url_utils.py +825 -0
- megadetector/utils/wi_platform_utils.py +968 -0
- megadetector/utils/wi_taxonomy_utils.py +1759 -0
- megadetector/utils/write_html_image_list.py +239 -0
- megadetector/visualization/__init__.py +0 -0
- megadetector/visualization/plot_utils.py +309 -0
- megadetector/visualization/render_images_with_thumbnails.py +243 -0
- megadetector/visualization/visualization_utils.py +1940 -0
- megadetector/visualization/visualize_db.py +630 -0
- megadetector/visualization/visualize_detector_output.py +479 -0
- megadetector/visualization/visualize_video_output.py +705 -0
- megadetector-10.0.13.dist-info/METADATA +134 -0
- megadetector-10.0.13.dist-info/RECORD +147 -0
- megadetector-10.0.13.dist-info/WHEEL +5 -0
- megadetector-10.0.13.dist-info/licenses/LICENSE +19 -0
- megadetector-10.0.13.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""
|
|
2
|
+
|
|
3
|
+
taxonomy_csv_checker.py
|
|
4
|
+
|
|
5
|
+
Checks the taxonomy CSV file to make sure that for each row:
|
|
6
|
+
|
|
7
|
+
1) The 'taxonomy_level' column matches the lowest-level taxon level in the
|
|
8
|
+
'taxonomy_string' column.
|
|
9
|
+
|
|
10
|
+
2) The 'scientific_name' column matches the scientific name from the
|
|
11
|
+
lowest-level taxon level in the 'taxonomy_string' column.
|
|
12
|
+
|
|
13
|
+
Prints out any mismatches.
|
|
14
|
+
|
|
15
|
+
Also prints out nodes that have 2 ambiguous parents. See "CASE 2" from the
|
|
16
|
+
module docstring of taxonomy_graph.py.
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
#%% Imports
|
|
21
|
+
|
|
22
|
+
import sys
|
|
23
|
+
import argparse
|
|
24
|
+
|
|
25
|
+
import networkx as nx
|
|
26
|
+
import pandas as pd
|
|
27
|
+
|
|
28
|
+
from typing import Optional
|
|
29
|
+
|
|
30
|
+
from megadetector.taxonomy_mapping.taxonomy_graph import TaxonNode, dag_to_tree
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
#%% Taxonomy checking
|
|
34
|
+
|
|
35
|
+
def check_taxonomy_csv(csv_path: str) -> None:
    """
    Validates a taxonomy CSV file and prints every problem found.

    For each row that has a 'taxonomy_string', the first entry of that string
    is compared against the row's 'taxonomy_level' and 'scientific_name'
    columns, and each mismatch is printed. Afterwards, every node with two
    parents where neither parent is an ancestor of the other is printed, and
    dag_to_tree() is run to report whether all such nodes can be resolved.

    Args:
        csv_path: str, path to the taxonomy CSV file
    """

    taxonomy_df = pd.read_csv(csv_path)

    graph = nx.DiGraph()

    # maps (taxon_level, taxon_name) to a TaxonNode
    taxon_to_node = {}

    level_error_count = 0
    name_error_count = 0

    # This used to represent the source of the mapping: iNat, gbif, or manual.
    # It is no longer tracked, so every ID is registered under a placeholder.
    id_source = 0

    for i_row, row in taxonomy_df.iterrows():

        dataset = row['dataset_name']
        dataset_label = row['query']
        expected_name = row['scientific_name']
        expected_level = row['taxonomy_level']

        raw_ancestry = row['taxonomy_string']

        # taxonomy CSV rows without 'taxonomy_string' entries are excluded
        # from the taxonomy graph, but can be included in a classification
        # label specification JSON via the 'dataset_labels' key
        if pd.isna(raw_ancestry):
            continue

        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains; only run this on taxonomy files from a trusted source.
        ancestry = eval(raw_ancestry) # pylint: disable=eval-used

        # The node created for the previous (lower) entry of the ancestry
        previous_node: Optional[TaxonNode] = None

        for depth, (taxon_id, taxon_level, taxon_name, _) in enumerate(ancestry):

            key = (taxon_level, taxon_name)
            node = taxon_to_node.get(key)
            if node is None:
                node = TaxonNode(level=taxon_level, name=taxon_name,
                                 graph=graph)
                taxon_to_node[key] = node

            if previous_node is not None:
                node.add_child(previous_node)

            node.add_id(id_source, int(taxon_id)) # np.int64 -> int

            # Only the first entry is compared against the row's own columns
            if depth == 0:

                if expected_level != taxon_level:
                    print(f'row: {i_row}, {dataset}, {dataset_label}')
                    print(f'- taxonomy_level column: {expected_level}, '
                          f'level from taxonomy_string: {taxon_level}')
                    print()
                    level_error_count += 1

                if expected_name != taxon_name:
                    print(f'row: {i_row}, {dataset}, {dataset_label}')
                    print(f'- scientific_name column: {expected_name}, '
                          f'name from taxonomy_string: {taxon_name}')
                    print()
                    name_error_count += 1

            previous_node = node

        # ...for each taxon in this row's ancestry

    # ...for each row in the taxonomy file

    assert nx.is_directed_acyclic_graph(graph)

    for node in graph.nodes:

        node_parents = node.parents
        assert len(node_parents) <= 2
        if len(node_parents) != 2:
            continue

        p0, p1 = node_parents
        assert p0 is not p1

        p0_is_ancestor_of_p1 = p1 in nx.descendants(graph, p0)
        p1_is_ancestor_of_p0 = p0 in nx.descendants(graph, p1)

        # Neither parent sits above the other, so the "keep the lower parent"
        # rule cannot choose between them
        if not (p0_is_ancestor_of_p1 or p1_is_ancestor_of_p0):
            print('Node with two ambiguous parents:', node)
            print('\t', p0)
            print('\t\t', p0.parents)
            print('\t', p1)
            print('\t\t', p1.parents)

    try:
        dag_to_tree(graph, taxon_to_node)
    except AssertionError as e:
        print(f'At least one node has unresolved ambiguous parents: {e}')
    else:
        print('All ambiguous parents have hard-coded resolution in '
              'dag_to_tree().')

    print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))

    print('num taxon level errors:', level_error_count)
    print('num scientific name errors:', name_error_count)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
#%% Command-line driver
|
|
133
|
+
|
|
134
|
+
if __name__ == '__main__':

    # Single positional argument: the CSV file to check
    parser = argparse.ArgumentParser()
    parser.add_argument('taxonomy_csv_path', help='path to taxonomy CSV file')

    # With no arguments at all, show the help text instead of an argparse error
    if len(sys.argv) <= 1:
        parser.print_help()
        parser.exit()

    args = parser.parse_args()
    check_taxonomy_csv(args.taxonomy_csv_path)


#%% Interactive driver

# Never runs on import or as a script; the cell below is for interactive use
if False:

    #%%

    import os
    csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
    check_taxonomy_csv(csv_path)
|
|
159
|
+
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
|
|
3
|
+
taxonomy_graph.py
|
|
4
|
+
|
|
5
|
+
Methods for transforming taxonomy CSV into a graph structure backed by
|
|
6
|
+
NetworkX.
|
|
7
|
+
|
|
8
|
+
We treat each taxon in the taxonomy as a node in a graph, represented by the
|
|
9
|
+
TaxonNode class. We use a NetworkX directed graph (nx.DiGraph) to keep track of
|
|
10
|
+
the edges (parent-child relationships) between the nodes.
|
|
11
|
+
|
|
12
|
+
In theory, the true biological taxonomy graph should be a tree, where every
|
|
13
|
+
taxon node has exactly 1 parent. However, because we use both GBIF and INAT
|
|
14
|
+
taxonomies, there are 2 situations where a taxon node ends up with two parents.
|
|
15
|
+
Thus, the graph is actually a "directed acyclic graph" (DAG) instead of a tree.
|
|
16
|
+
|
|
17
|
+
The two situations are explained in detail below. This module includes a
|
|
18
|
+
function dag_to_tree() which converts a DAG to a tree by heuristically removing
|
|
19
|
+
edges from the DAG so that each node only has 1 parent.
|
|
20
|
+
|
|
21
|
+
CASE 1: INAT and GBIF have different granularity in their taxonomy levels
|
|
22
|
+
======
|
|
23
|
+
An example is shown below. In dag_to_tree(), the lower parent is kept, while
|
|
24
|
+
the higher-up parent is discarded. In this example, the "sciurini -> sciurus"
|
|
25
|
+
edge would be kept, while "sciuridae -> sciurus" would be removed.
|
|
26
|
+
|
|
27
|
+
"eastern gray squirrel" (inat) "squirrel" (gbif)
|
|
28
|
+
------------------------------ -----------------
|
|
29
|
+
family: sciuridae
|
|
30
|
+
/ \
|
|
31
|
+
subfamily: sciurinae | # skips subfamily
|
|
32
|
+
| |
|
|
33
|
+
tribe: sciurini | # skips tribe
|
|
34
|
+
\ /
|
|
35
|
+
genus: sciurus
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
CASE 2: INAT and GBIF have different taxonomies
|
|
39
|
+
======
|
|
40
|
+
An example is shown below. In dag_to_tree(), the resolutions to these
|
|
41
|
+
discrepancies are hard-coded.
|
|
42
|
+
|
|
43
|
+
order: cathartiformes (inat) accipitriformes (gbif)
|
|
44
|
+
\ /
|
|
45
|
+
family: cathartidae
|
|
46
|
+
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
#%% Imports and constants
|
|
50
|
+
|
|
51
|
+
# allow forward references in typing annotations
|
|
52
|
+
from __future__ import annotations
|
|
53
|
+
|
|
54
|
+
from typing import (ClassVar, Container, Dict, Iterable, List, Optional, Set,
|
|
55
|
+
Tuple)
|
|
56
|
+
|
|
57
|
+
import networkx as nx
|
|
58
|
+
import pandas as pd
|
|
59
|
+
|
|
60
|
+
default_source = 'inat'
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
#%% Classes
|
|
64
|
+
|
|
65
|
+
class TaxonNode:
    """
    A node in a taxonomy graph (DAG), associated with a set of dataset labels.

    Parent/child relationships are not stored on the node itself; they are the
    edges of the graph the node is attached to (parent -> child), so the
    parents/children accessors require a graph.

    By default, we support multiple parents for each TaxonNode. See discussion
    in module docstring above.
    """

    # class variables

    # if True, add_parent() refuses to give a node a second, different parent
    single_parent_only: ClassVar[bool] = False

    # instance variables
    level: str
    name: str
    ids: Set[Tuple[str, int]]
    graph: Optional[nx.DiGraph]
    dataset_labels: Set[Tuple[str, str]]

    def __init__(self, level: str, name: str,
                 graph: Optional[nx.DiGraph] = None):
        """
        Args:
            level: str, taxonomic level of this node
            name: str, taxon name of this node
            graph: optional nx.DiGraph that holds this node's edges
        """

        self.level = level
        self.name = name
        self.graph = graph
        self.ids = set()
        self.dataset_labels = set()

    def __repr__(self):
        id_str = ', '.join(
            f'{source}={taxon_id}' for source, taxon_id in self.ids)
        return f'TaxonNode({id_str}, level={self.level}, name={self.name})'

    @property
    def parents(self) -> List[TaxonNode]:
        """
        Returns a new list of this node's predecessors in the graph.
        """

        assert self.graph is not None
        return list(self.graph.predecessors(self))

    @parents.setter
    def parents(self, parents: Iterable[TaxonNode]) -> None:
        """
        Replaces every incoming edge of this node with edges from <parents>.
        """

        assert self.graph is not None
        for p in self.parents:
            self.graph.remove_edge(p, self)
        for p in parents:
            self.graph.add_edge(p, self)

    @property
    def children(self) -> List[TaxonNode]:
        """
        Returns a new list of this node's successors in the graph.
        """

        assert self.graph is not None
        return list(self.graph.successors(self))

    @children.setter
    def children(self, children: Iterable[TaxonNode]) -> None:
        """
        Replaces every outgoing edge of this node with edges to <children>.
        """

        assert self.graph is not None
        for c in self.children:
            self.graph.remove_edge(self, c)
        for c in children:
            self.graph.add_edge(self, c)

    def add_id(self, source: str, taxon_id: int) -> None:
        """
        Records a (source, taxon_id) pair for this node.

        Args:
            source: identifier of the taxonomy the ID came from
            taxon_id: int, ID of this taxon in that taxonomy
        """

        # assert source in ['gbif', 'inat', 'manual']
        self.ids.add((source, taxon_id))

    def add_parent(self, parent: TaxonNode) -> None:
        """
        Adds a TaxonNode to the list of parents of the current TaxonNode.
        Requires this TaxonNode to be associated with a Graph.

        When TaxonNode.single_parent_only is set and this node already has a
        parent, no edge is added; the existing parent must be <parent>.

        Args:
            parent: TaxonNode, must be higher in the taxonomical hierarchy
        """

        assert self.graph is not None
        parents = self.parents
        if TaxonNode.single_parent_only and len(parents) > 0:
            assert len(parents) == 1
            assert parents[0] is parent, (
                f'self.parents: {parents}, new parent: {parent}')
            return
        if parent not in parents:
            self.graph.add_edge(parent, self)

    def add_child(self, child: TaxonNode) -> None:
        """
        Adds a TaxonNode to the list of children of the current TaxonNode.
        Requires this TaxonNode to be associated with a Graph.

        Args:
            child: TaxonNode, must be lower in the taxonomical hierarchy
        """

        assert self.graph is not None
        self.graph.add_edge(self, child)

    def add_dataset_label(self, ds: str, ds_label: str) -> None:
        """
        Associates a dataset label with this node.

        Args:
            ds: str, name of dataset
            ds_label: str, name of label used by that dataset
        """

        self.dataset_labels.add((ds, ds_label))

    def get_dataset_labels(self,
                           include_datasets: Optional[Container[str]] = None
                           ) -> Set[Tuple[str, str]]:
        """
        Returns a set of all (ds, ds_label) tuples that belong to this taxon
        node or its descendants.

        The returned set is always a new object, so neither building it nor
        modifying it afterwards changes this node's own dataset_labels.

        Args:
            include_datasets: list of str, names of datasets to include
                if None, then all datasets are included

        Returns: set of (ds, ds_label) tuples
        """

        # Start from a copy: the in-place union below must not write the
        # descendants' labels into self.dataset_labels
        if include_datasets is None:
            result = set(self.dataset_labels)
        else:
            result = {tup for tup in self.dataset_labels
                      if tup[0] in include_datasets}

        for child in self.children:
            result |= child.get_dataset_labels(include_datasets)
        return result

    @classmethod
    def lowest_common_ancestor(cls, nodes: Iterable[TaxonNode]
                               ) -> Optional[TaxonNode]:
        """
        Returns the lowest common ancestor (LCA) of a list or set of nodes.

        For each node in <nodes>, get the set of nodes on the path to the root.
        The LCA of <nodes> is certainly in the intersection of these sets.
        Iterate through the nodes in this set intersection, looking for a node
        such that none of its children is in this intersection. Given n nodes
        from a k-ary tree of height h, the algorithm runs in O((n + k)h).

        Args:
            nodes: iterable of TaxonNode; may be empty

        Returns: TaxonNode, the LCA if it exists, or None if no LCA exists
            (including when <nodes> is empty)
        """

        paths = []
        for node in nodes:
            # the node itself plus every ancestor reachable from it
            path = {node}
            # node.parents is already a fresh list, so it is safe to consume
            remaining = node.parents
            while remaining:
                x = remaining.pop()
                if x not in path:
                    path.add(x)
                    remaining += x.parents
            paths.append(path)

        # set.intersection() needs at least one set; no nodes means no LCA
        if not paths:
            return None

        intersect = set.intersection(*paths)

        for node in intersect:
            if intersect.isdisjoint(node.children):
                return node
        return None
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
#%% Module functions
|
|
223
|
+
|
|
224
|
+
def build_taxonomy_graph(taxonomy_df: pd.DataFrame
                         ) -> Tuple[
                             nx.DiGraph,
                             Dict[Tuple[str, str], TaxonNode],
                             Dict[Tuple[str, str], TaxonNode]
                         ]:
    """
    Builds the taxonomy graph from the taxonomy CSV, along with lookup tables
    from taxa and from dataset labels to the nodes of that graph.

    Rows without a 'taxonomy_string' are skipped. For every other row, one
    node is created (or reused) per entry of the taxonomy string, each entry
    becomes the parent of the entry before it, and the row's dataset label is
    attached to the node of the first entry.

    Args:
        taxonomy_df: pd.DataFrame, the taxonomy CSV

    Returns:
        graph: nx.DiGraph
        taxon_to_node: dict, maps (taxon_level, taxon_name) to a TaxonNode,
            keys are all lowercase
        label_to_node: dict, maps (dataset_name, dataset_label) to the lowest
            TaxonNode node in the tree that contains the label,
            keys are all lowercase
    """

    graph = nx.DiGraph()

    # maps (taxon_level, taxon_name) to a TaxonNode
    taxon_to_node = {}

    # maps (dataset_name, dataset_label) to a TaxonNode
    label_to_node = {}

    for _, row in taxonomy_df.iterrows():

        label_key = (row['dataset_name'].lower(), row['query'].lower())

        # Fall back to the module default when the CSV has no 'source' column
        id_source = row['source'] if 'source' in row else default_source

        raw_ancestry = row['taxonomy_string']

        # taxonomy CSV rows without 'taxonomy_string' entries are excluded
        # from the taxonomy graph, but can be included in a classification
        # label specification JSON via the 'dataset_labels' key
        if pd.isna(raw_ancestry):
            continue

        # NOTE(review): eval() executes whatever expression the CSV cell
        # contains; only run this on taxonomy files from a trusted source.
        ancestry = eval(raw_ancestry) # pylint: disable=eval-used

        # The node created for the previous (lower) entry of the ancestry
        previous_node: Optional[TaxonNode] = None

        for depth, (taxon_id, raw_level, raw_name, _) in enumerate(ancestry):

            taxon_level = raw_level.lower()
            taxon_name = raw_name.lower()

            key = (taxon_level, taxon_name)
            node = taxon_to_node.get(key)
            if node is None:
                node = TaxonNode(level=taxon_level, name=taxon_name,
                                 graph=graph)
                taxon_to_node[key] = node

            if previous_node is not None:
                node.add_child(previous_node)

            node.add_id(id_source, int(taxon_id)) # np.int64 -> int

            # The first entry is the taxon the row itself describes
            if depth == 0:
                assert row['taxonomy_level'] == taxon_level, (
                    f'taxonomy CSV level: {row["taxonomy_level"]}, '
                    f'level from taxonomy_string: {taxon_level}')
                assert row['scientific_name'] == taxon_name
                node.add_dataset_label(*label_key)
                label_to_node[label_key] = node

            previous_node = node

        # ...for each taxon in this row's ancestry

    # ...for each row in the taxonomy file

    assert nx.is_directed_acyclic_graph(graph)
    return graph, taxon_to_node, label_to_node
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def dag_to_tree(graph: nx.DiGraph,
                taxon_to_node: Dict[Tuple[str, str], TaxonNode]) -> nx.DiGraph:
    """
    Converts the taxonomy graph from a DAG to a tree. See module docstring
    for more information.

    Every node of <graph> is copied into the tree. A node with one parent
    keeps it. A node with two parents keeps the lower one when one parent is
    a descendant of the other, and otherwise keeps the parent named in a
    hard-coded table. Nodes with any other number of parents get no incoming
    edge. On success, every node's .graph attribute is pointed at the tree.

    NOTE: nx.is_tree() on the output of this function might fail because the
    tree may have disconnected components. Instead, check nx.is_tree() on each
    component separately.

    Args:
        graph: nx.DiGraph, DAG representation of taxonomy hierarchy
        taxon_to_node: dict, maps (taxon_level, taxon_name) to a TaxonNode

    Returns: nx.DiGraph, a tree-structured graph

    Raises:
        AssertionError: if a node has two unrelated parents and no hard-coded
            resolution, or if the hard-coded parent is not one of the two
        KeyError: if a hard-coded parent is missing from <taxon_to_node>
    """

    # Resolutions for nodes whose two parents come from conflicting
    # taxonomies: node name -> (level, name) key of the parent to keep
    hard_coded_parents = {
        'cathartidae': ('order', 'accipitriformes'),
        'soricidae': ('order', 'eulipotyphla'),
        'nyctanassa violacea': ('genus', 'nyctanassa'),
        # this one is controversial
        'trochilidae': ('order', 'caprimulgiformes'),
    }

    tree = nx.DiGraph()
    for node in graph.nodes:
        tree.add_node(node)

        parents = node.parents

        if len(parents) == 1:
            tree.add_edge(parents[0], node)

        elif len(parents) == 2:
            p0, p1 = parents

            # use the lower parent
            if p1 in nx.descendants(graph, p0):
                tree.add_edge(p1, node)
            elif p0 in nx.descendants(graph, p1):
                tree.add_edge(p0, node)
            else:
                # special cases
                #
                # Raised explicitly rather than via assert, so the checks
                # still run under "python -O" and the error names the node.
                if node.name not in hard_coded_parents:
                    raise AssertionError(
                        f'no hard-coded parent resolution for {node} '
                        f'(candidate parents: {p0}, {p1})')

                p = taxon_to_node[hard_coded_parents[node.name]]
                if (p is not p0) and (p is not p1):
                    raise AssertionError(
                        f'hard-coded parent {p} of {node} is not one of its '
                        f'parents ({p0}, {p1})')
                tree.add_edge(p, node)

    # Re-home the nodes so their parents/children now come from the tree
    for node in tree.nodes:
        node.graph = tree
    return tree
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""
|
|
2
|
+
|
|
3
|
+
validate_lila_category_mappings.py
|
|
4
|
+
|
|
5
|
+
Confirm that all category names on LILA have mappings in the taxonomy file.
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
#%% Constants and imports
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
from megadetector.data_management.lila.lila_common import read_lila_taxonomy_mapping
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
#%% Prevent execution during infrastructural imports
|
|
18
|
+
|
|
19
|
+
# Everything below is written as interactive cells; the guard keeps it from
# running when the module is imported.
if False:

    #%% Constants

    lila_local_base = os.path.expanduser('~/lila')

    metadata_dir = os.path.join(lila_local_base,'metadata')
    os.makedirs(metadata_dir,exist_ok=True)

    # Created by get_lila_category_list.py... contains counts for each category
    category_list_dir = os.path.join(lila_local_base,'lila_categories_list')
    lila_dataset_to_categories_file = os.path.join(category_list_dir,'lila_dataset_to_categories.json')

    assert os.path.isfile(lila_dataset_to_categories_file)


    #%% Load category and taxonomy files

    with open(lila_dataset_to_categories_file,'r') as f:
        lila_dataset_to_categories = json.load(f)

    taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)


    #%% Map dataset names and category names to scientific names

    # Maps lowercase "dataset_name:query" to a scientific name, or to
    # 'unmapped' when the taxonomy row has no scientific name
    ds_query_to_scientific_name = {}

    unmapped_queries = set()

    # i_row = 1; row = taxonomy_df.iloc[i_row]; row
    for i_row,row in taxonomy_df.iterrows():

        ds_query = row['dataset_name'] + ':' + row['query']
        ds_query = ds_query.lower()

        # A non-string scientific name (e.g. a missing value) means this
        # query has a row in the file but no mapping
        if not isinstance(row['scientific_name'],str):
            unmapped_queries.add(ds_query)
            ds_query_to_scientific_name[ds_query] = 'unmapped'
            continue

        ds_query_to_scientific_name[ds_query] = row['scientific_name']


    #%% For each dataset, make sure we can map every category to the taxonomy

    # dataset_name = list(lila_dataset_to_categories.keys())[0]
    for _dataset_name in lila_dataset_to_categories.keys():

        # The name without '_bbox' is only used to build the taxonomy query
        if '_bbox' in _dataset_name:
            dataset_name = _dataset_name.replace('_bbox','')
        else:
            dataset_name = _dataset_name

        # Look the categories up under the key being iterated; indexing with
        # the stripped name would skip the '_bbox' dataset's own categories
        # (or raise KeyError when no non-bbox entry exists)
        categories = lila_dataset_to_categories[_dataset_name]

        # c = categories[0]
        for c in categories:
            ds_query = dataset_name + ':' + c['name']
            ds_query = ds_query.lower()

            if ds_query not in ds_query_to_scientific_name:
                print('Could not find mapping for {}'.format(ds_query))
            else:
                scientific_name = ds_query_to_scientific_name[ds_query]
|
|
File without changes
|