megadetector-10.0.13-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (147)
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +702 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +528 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +187 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +663 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +876 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2159 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1494 -0
  81. megadetector/detection/run_tiled_inference.py +1038 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1752 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2077 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +213 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +224 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2832 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1759 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1940 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +479 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.13.dist-info/METADATA +134 -0
  144. megadetector-10.0.13.dist-info/RECORD +147 -0
  145. megadetector-10.0.13.dist-info/WHEEL +5 -0
  146. megadetector-10.0.13.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.13.dist-info/top_level.txt +1 -0
megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py
@@ -0,0 +1,165 @@
+ """
+
+ prepare_lila_taxonomy_release.py
+
+ Given the private intermediate taxonomy mapping (produced by map_new_lila_datasets.py),
+ prepare the public (release) taxonomy mapping file.
+
+ """
+
+ #%% Imports and constants
+
+ import os
+ import json
+ import pandas as pd
+
+
+ #%% Prevent execution during infrastructural imports
+
+ if False:
+
+     #%% Filenames
+
+     lila_taxonomy_file = 'c:/git/agentmorrisprivate/lila-taxonomy/lila-taxonomy-mapping.csv'
+     release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
+     # import clipboard; clipboard.copy(release_taxonomy_file)
+
+     # Created by get_lila_annotation_counts.py... contains counts for each category
+     lila_dataset_to_categories_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
+
+     assert os.path.isfile(lila_dataset_to_categories_file)
+     assert os.path.isfile(lila_taxonomy_file)
+
+
+     #%% Find out which categories are actually used
+
+     df = pd.read_csv(lila_taxonomy_file)
+
+     with open(lila_dataset_to_categories_file,'r') as f:
+         lila_dataset_to_categories = json.load(f)
+
+     used_category_mappings = []
+
+     # dataset_name = datasets_to_map[0]
+     for dataset_name in lila_dataset_to_categories.keys():
+
+         ds_categories = lila_dataset_to_categories[dataset_name]
+         for category in ds_categories:
+             category_name = category['name'].lower()
+             assert ':' not in category_name
+             mapping_name = dataset_name + ':' + category_name
+             used_category_mappings.append(mapping_name)
+
+     df['used'] = False
+
+     n_dropped = 0
+
+     # i_row = 0; row = df.iloc[i_row]; row
+     for i_row,row in df.iterrows():
+         ds_name = row['dataset_name']
+         query = row['query']
+         mapping_name = ds_name + ':' + query
+         if mapping_name in used_category_mappings:
+             df.loc[i_row,'used'] = True
+         else:
+             n_dropped += 1
+             print('Dropping unused mapping {}'.format(mapping_name))
+
+     print('Dropping {} of {} mappings'.format(n_dropped,len(df)))
+
+     df = df[df.used]
+     df = df.drop('used',axis=1)
+
+
+     #%% Generate the final output file
+
+     assert not os.path.isfile(release_taxonomy_file), \
+         'File {} exists, delete it manually before proceeding'.format(release_taxonomy_file)
+
+     levels_to_include = ['kingdom',
+                          'phylum',
+                          'subphylum',
+                          'superclass',
+                          'class',
+                          'subclass',
+                          'infraclass',
+                          'superorder',
+                          'order',
+                          'suborder',
+                          'infraorder',
+                          'superfamily',
+                          'family',
+                          'subfamily',
+                          'tribe',
+                          'genus',
+                          'subgenus',
+                          'species',
+                          'subspecies',
+                          'variety']
+
+     levels_to_exclude = ['stateofmatter',
+                          'zoosection',
+                          'parvorder',
+                          'complex',
+                          'epifamily']
+
+     for x in [levels_to_include,levels_to_exclude]:
+         assert len(x) == len(set(x))
+
+     for s in levels_to_exclude:
+         assert s not in levels_to_include
+
+     known_levels = levels_to_include + levels_to_exclude
+
+     levels_used = set()
+
+     # i_row = 0; row = df.iloc[i_row]; row
+     for i_row,row in df.iterrows():
+
+         if not isinstance(row['scientific_name'],str):
+             assert not isinstance(row['taxonomy_string'],str)
+             continue
+
+         # This is a list of length-4 tuples that each look like:
+         #
+         # (41789, 'species', 'taxidea taxus', ['american badger'])
+         taxonomic_match = eval(row['taxonomy_string'])
+
+         # match_at_level = taxonomic_match[0]
+         for match_at_level in taxonomic_match:
+             assert len(match_at_level) == 4
+             # E.g. "species"
+             levels_used.add(match_at_level[1])
+
+     levels_used = [s for s in levels_used if isinstance(s,str)]
+
+     for s in levels_used:
+         assert s in known_levels, 'Unrecognized level {}'.format(s)
+
+     for s in levels_to_include:
+         assert s in levels_used
+
+     for s in levels_to_include:
+         df[s] = ''
+
+     # i_row = 0; row = df.iloc[i_row]; row
+     for i_row,row in df.iterrows():
+
+         if not isinstance(row['scientific_name'],str):
+             assert not isinstance(row['taxonomy_string'],str)
+             continue
+
+         # E.g.: (43117, 'genus', 'lepus', ['hares and jackrabbits'])
+         taxonomic_match = eval(row['taxonomy_string'])
+
+         for match_at_level in taxonomic_match:
+             level = match_at_level[1]
+             if level in levels_to_include:
+                 df.loc[i_row,level] = match_at_level[2]
+
+     df = df.drop('source',axis=1)
+     df.to_csv(release_taxonomy_file,header=True,index=False)
+
+     print('Wrote final output to {}'.format(release_taxonomy_file))
+
+
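Note: this script (and preview_lila_taxonomy.py below) parses the taxonomy_string column with eval(). Since these values are plain Python literals (a list of length-4 tuples, per the comments in the code), the standard library's ast.literal_eval is a safer way to parse them; a minimal sketch, using the sample value quoted in the script's comments:

import ast

# Sample value copied from the comments above; the overall column format
# is assumed to match that example.
taxonomy_string = "[(41789, 'species', 'taxidea taxus', ['american badger'])]"

# literal_eval only accepts Python literals, so unlike eval() it cannot
# execute arbitrary code embedded in the CSV.
taxonomic_match = ast.literal_eval(taxonomy_string)

for match_at_level in taxonomic_match:
    assert len(match_at_level) == 4
    print(match_at_level[1], match_at_level[2])  # prints: species taxidea taxus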
megadetector/taxonomy_mapping/preview_lila_taxonomy.py
@@ -0,0 +1,543 @@
+ """
+
+ preview_lila_taxonomy.py
+
+ Does some consistency-checking on the LILA taxonomy file, and generates
+ an HTML preview page that we can use to determine whether the mappings
+ make sense.
+
+ """
+
+ #%% Imports and constants
+
+ from tqdm import tqdm
+
+ import os
+ import pandas as pd
+
+ # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
+ lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2025.10.07.csv')
+
+ preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
+ os.makedirs(preview_base,exist_ok=True)
+ html_output_file = os.path.join(preview_base,'index.html')
+
+
+ #%% Support functions
+
+ def parse_taxonomy_string(taxonomy_string):
+
+     taxonomic_match = eval(taxonomy_string)
+     matched_entity = taxonomic_match[0]
+     assert len(matched_entity) == 4
+
+     level = matched_entity[1]
+
+     scientific_name = matched_entity[2]
+
+     common_names = matched_entity[3]
+     if len(common_names) == 1:
+         common_name = common_names[0]
+     else:
+         common_name = str(common_names)
+
+     return scientific_name,common_name,level,taxonomic_match
+
+ def taxonomy_string_to_common_name(taxonomy_string):
+     _,cn,_,_ = parse_taxonomy_string(taxonomy_string)
+     return cn
+
+ def taxonomy_string_to_scientific(taxonomy_string):
+     sn,_,_,_ = parse_taxonomy_string(taxonomy_string)
+     return sn
+
+ def taxonomy_string_to_level(taxonomy_string):
+     _,_,level,_ = parse_taxonomy_string(taxonomy_string)
+     return level
+
+
+ #%% Prepare taxonomy lookup
+
+ from megadetector.taxonomy_mapping.species_lookup import \
+     initialize_taxonomy_lookup, get_preferred_taxonomic_match
+
+ initialize_taxonomy_lookup()
+
+
+ #%% Check for mappings that disagree with the taxonomy string
+
+ # For example, cases where the "level" column says "species", but the taxonomy string says it's a genus.
+
+ df = pd.read_csv(lila_taxonomy_file)
+
+ n_taxonomy_changes = 0
+
+ # Look for internal inconsistency
+ for i_row,row in df.iterrows():
+
+     sn = row['scientific_name']
+     if not isinstance(sn,str):
+         continue
+
+     ts = row['taxonomy_string']
+     assert sn == taxonomy_string_to_scientific(ts)
+
+     assert row['taxonomy_level'] == taxonomy_string_to_level(ts)
+
+ # Look for outdated mappings
+ taxonomy_preference = 'inat'
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in tqdm(df.iterrows(),total=len(df)):
+
+     try:
+
+         sn = row['scientific_name']
+         if not isinstance(sn,str):
+             continue
+
+         m = get_preferred_taxonomic_match(sn,taxonomy_preference)
+         assert m.scientific_name == sn
+
+         ts = row['taxonomy_string']
+         assert m.taxonomy_string[0:50] == ts[0:50], 'Mismatch for {}:\n\n{}\n\n{}\n'.format(
+             row['dataset_name'],ts,m.taxonomy_string)
+
+         if ts != m.taxonomy_string:
+             n_taxonomy_changes += 1
+             df.loc[i_row,'taxonomy_string'] = m.taxonomy_string
+
+     except Exception as e:
+
+         print('Error at row {}: {}'.format(i_row,str(e)))
+         raise
+
+ # ...for each row
+
+ print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))
+
+ # Optionally re-write
+ if False:
+     df.to_csv(lila_taxonomy_file,header=True,index=False)
+
+
+ #%% List null mappings
+
+ # These should all be things like "empty", "unidentified", "fire", "car", etc.
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+     if (not isinstance(row['taxonomy_string'],str)) or (len(row['taxonomy_string']) == 0):
+         print('No mapping for {}:{}'.format(row['dataset_name'],row['query']))
+
+
+ #%% List mappings with scientific names but no common names
+
+ for i_row,row in df.iterrows():
+     cn = row['common_name']
+     sn = row['scientific_name']
+     ts = row['taxonomy_string']
+     if (isinstance(ts,str)) and (len(ts) > 0):
+         if (not isinstance(cn,str)) or (len(cn) == 0):
+             print('No mapping for {}:{}:{}'.format(row['dataset_name'],row['query'],row['scientific_name']))
+
+
+ #%% List mappings that map to different things in different data sets
+
+ import numpy as np
+ def isnan(x):
+     if not isinstance(x,float):
+         return False
+     return np.isnan(x)
+
+ from collections import defaultdict
+ query_to_rows = defaultdict(list)
+
+ queries_with_multiple_mappings = set()
+
+ n_suppressed = 0
+
+ suppress_multiple_matches = [
+     ['porcupine','Snapshot Camdeboo','Idaho Camera Traps'],
+     ['porcupine','Snapshot Enonkishu','Idaho Camera Traps'],
+     ['porcupine','Snapshot Karoo','Idaho Camera Traps'],
+     ['porcupine','Snapshot Kgalagadi','Idaho Camera Traps'],
+     ['porcupine','Snapshot Kruger','Idaho Camera Traps'],
+     ['porcupine','Snapshot Mountain Zebra','Idaho Camera Traps'],
+     ['porcupine','Snapshot Serengeti','Idaho Camera Traps'],
+
+     ['porcupine','Snapshot Serengeti','Snapshot Mountain Zebra'],
+     ['porcupine','Snapshot Serengeti','Snapshot Kruger'],
+     ['porcupine','Snapshot Serengeti','Snapshot Kgalagadi'],
+     ['porcupine','Snapshot Serengeti','Snapshot Karoo'],
+     ['porcupine','Snapshot Serengeti','Snapshot Camdeboo'],
+
+     ['porcupine','Snapshot Enonkishu','Snapshot Camdeboo'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Mountain Zebra'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Kruger'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Kgalagadi'],
+     ['porcupine','Snapshot Enonkishu','Snapshot Karoo'],
+
+     ['kudu','Snapshot Serengeti','Snapshot Mountain Zebra'],
+     ['kudu','Snapshot Serengeti','Snapshot Kruger'],
+     ['kudu','Snapshot Serengeti','Snapshot Kgalagadi'],
+     ['kudu','Snapshot Serengeti','Snapshot Karoo'],
+     ['kudu','Snapshot Serengeti','Snapshot Camdeboo'],
+
+     ['fox','Caltech Camera Traps','Channel Islands Camera Traps'],
+     ['fox','Idaho Camera Traps','Channel Islands Camera Traps'],
+     ['fox','Idaho Camera Traps','Caltech Camera Traps'],
+
+     ['pangolin','Snapshot Serengeti','SWG Camera Traps'],
+
+     ['deer', 'Wellington Camera Traps', 'Idaho Camera Traps'],
+     ['deer', 'Wellington Camera Traps', 'Caltech Camera Traps'],
+
+     ['unknown cervid', 'WCS Camera Traps', 'Idaho Camera Traps']
+
+ ]
+
+ for i_row,row in df.iterrows():
+
+     query = row['query']
+     taxonomy_string = row['taxonomy_string']
+
+     for previous_i_row in query_to_rows[query]:
+
+         previous_row = df.iloc[previous_i_row]
+         assert previous_row['query'] == query
+         query_match = False
+         if isnan(row['taxonomy_string']):
+             query_match = isnan(previous_row['taxonomy_string'])
+         elif isnan(previous_row['taxonomy_string']):
+             query_match = isnan(row['taxonomy_string'])
+         else:
+             query_match = previous_row['taxonomy_string'][0:10] == taxonomy_string[0:10]
+
+         if not query_match:
+
+             suppress = False
+
+             # x = suppress_multiple_matches[-1]
+             for x in suppress_multiple_matches:
+                 if x[0] == query and \
+                    ( \
+                      (x[1] == row['dataset_name'] and x[2] == previous_row['dataset_name']) \
+                      or \
+                      (x[2] == row['dataset_name'] and x[1] == previous_row['dataset_name']) \
+                    ):
+                     suppress = True
+                     n_suppressed += 1
+                     break
+
+             if not suppress:
+                 print('Query {} in {} and {}:\n\n{}\n\n{}\n'.format(
+                     query, row['dataset_name'], previous_row['dataset_name'],
+                     taxonomy_string, previous_row['taxonomy_string']))
+
+                 queries_with_multiple_mappings.add(query)
+
+     # ...for each row where we saw this query
+
+     query_to_rows[query].append(i_row)
+
+ # ...for each row
+
+ print('Found {} queries with multiple mappings ({} occurrences suppressed)'.format(
+     len(queries_with_multiple_mappings),n_suppressed))
+
+
+ #%% Verify that nothing "unidentified" maps to a species or subspecies
+
+ # E.g., "unidentified skunk" should never map to a specific species of skunk
+
+ allowable_unknown_species = [
+     'unknown_tayra' # AFAIK this is a unique species, I'm not sure what's implied here
+ ]
+
+ unk_queries = ['skunk']
+ for i_row,row in df.iterrows():
+
+     query = row['query']
+     level = row['taxonomy_level']
+
+     if not isinstance(level,str):
+         assert not isinstance(row['taxonomy_string'],str)
+         continue
+
+     if ( \
+         'unidentified' in query or \
+         ('unk' in query and ('skunk' not in query and 'chipmunk' not in query)) \
+        ) \
+        and \
+        ('species' in level):
+
+         if query not in allowable_unknown_species:
+
+             print('Warning: query {}:{} maps to {} {}'.format(
+                 row['dataset_name'],
+                 row['query'],
+                 row['taxonomy_level'],
+                 row['scientific_name']
+                 ))
+
+
+ #%% Make sure there are valid source and level values for everything with a mapping
+
+ for i_row,row in df.iterrows():
+     if isinstance(row['scientific_name'],str):
+         if 'source' in row:
+             assert isinstance(row['source'],str)
+         assert isinstance(row['taxonomy_level'],str)
+
+
+ #%% Find WCS mappings that aren't species or aren't the same as the input
+
+ # WCS used scientific names, so these remappings are slightly more controversial
+ # than the standard remappings.
+
+ # row = df.iloc[-500]
+ for i_row,row in df.iterrows():
+
+     if not isinstance(row['scientific_name'],str):
+         continue
+     if 'WCS' not in row['dataset_name']:
+         continue
+
+     query = row['query']
+     scientific_name = row['scientific_name']
+     common_name = row['common_name']
+     level = row['taxonomy_level']
+     taxonomy_string = row['taxonomy_string']
+
+     common_name_from_taxonomy = taxonomy_string_to_common_name(taxonomy_string)
+     query_string = query.replace(' sp','')
+     query_string = query_string.replace('unknown ','')
+
+     # Anything marked "species" or "unknown" by definition doesn't map to a species,
+     # so ignore these.
+     if (' sp' not in query) and ('unknown' not in query) and \
+         (level != 'species') and (level != 'subspecies'):
+         print('WCS query {} ({}) remapped to {} {} ({})'.format(
+             query,common_name,level,scientific_name,common_name_from_taxonomy))
+
+     if query_string != scientific_name:
+         pass
+         # print('WCS query {} ({}) remapped to {} ({})'.format(
+         #     query,common_name,scientific_name,common_names_from_taxonomy))
+
+
+ #%% Download sample images for all scientific names
+
+ # You might have to do this:
+ #
+ # pip install python-magic
+ # pip install python-magic-bin
+
+ # Takes ~1 minute per 10 rows
+
+ remapped_queries = {'papio':'papio+baboon',
+                     'damaliscus lunatus jimela':'damaliscus lunatus',
+                     'mazama':'genus+mazama',
+                     'mirafra':'genus+mirafra'}
+
+ import os
+ from megadetector.taxonomy_mapping import retrieve_sample_image
+
+ scientific_name_to_paths = {}
+ image_base = os.path.join(preview_base,'images')
+ images_per_query = 15
+ min_valid_images_per_query = 3
+ min_valid_image_size = 3000
+
+ # TODO: parallelize this loop
+ #
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+
+     s = row['scientific_name']
+
+     if (not isinstance(s,str)) or (len(s)==0):
+         continue
+
+     query = s.replace(' ','+')
+
+     if query in remapped_queries:
+         query = remapped_queries[query]
+
+     query_folder = os.path.join(image_base,query)
+     os.makedirs(query_folder,exist_ok=True)
+
+     # Check whether we already have enough images for this query
+     image_files = os.listdir(query_folder)
+     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+     sizes = [os.path.getsize(p) for p in image_fullpaths]
+     sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
+     if len(sizes_above_threshold) > min_valid_images_per_query:
+         print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
+         continue
+
+     # Check whether we've already run this query for a previous row
+     if query in scientific_name_to_paths:
+         continue
+
+     print('Processing query {} of {} ({})'.format(i_row,len(df),query))
+     paths = retrieve_sample_image.download_images(query=query,
+                                                   output_directory=image_base,
+                                                   limit=images_per_query,
+                                                   verbose=True)
+     print('Downloaded {} images for {}'.format(len(paths),query))
+     scientific_name_to_paths[query] = paths
+
+ # ...for each row in the mapping table
+
+
+ #%% Rename .jpeg to .jpg
+
+ from megadetector.utils import path_utils
+ all_images = path_utils.recursive_file_list(image_base,False)
+
+ for fn in tqdm(all_images):
+     if fn.lower().endswith('.jpeg'):
+         new_fn = fn[0:-5] + '.jpg'
+         os.rename(fn, new_fn)
+
+
+ #%% Choose representative images for each scientific name
+
+ # Specifically, sort by size, and take the largest unique sizes. Very small files tend
+ # to be bogus thumbnails, etc.
+
+ max_images_per_query = 4
+ scientific_name_to_preferred_images = {}
+
+ # s = list(scientific_name_to_paths.keys())[0]
+ for s in list(df.scientific_name):
+
+     if not isinstance(s,str):
+         continue
+
+     query = s.replace(' ','+')
+
+     if query in remapped_queries:
+         query = remapped_queries[query]
+
+     query_folder = os.path.join(image_base,query)
+     assert os.path.isdir(query_folder)
+     image_files = os.listdir(query_folder)
+     image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+     sizes = [os.path.getsize(p) for p in image_fullpaths]
+     path_to_size = {}
+     for i_fp,fp in enumerate(image_fullpaths):
+         path_to_size[fp] = sizes[i_fp]
+     paths_by_size = [x for _, x in sorted(zip(sizes, image_fullpaths),reverse=True)]
+
+     # Be suspicious of duplicate sizes
+     b_duplicate_sizes = [False] * len(paths_by_size)
+
+     for i_path,p in enumerate(paths_by_size):
+         if i_path == len(paths_by_size) - 1:
+             continue
+         if path_to_size[p] == path_to_size[paths_by_size[i_path+1]]:
+             b_duplicate_sizes[i_path] = True
+
+     paths_by_size_non_dup = [i for (i, v) in zip(paths_by_size, b_duplicate_sizes) if not v]
+
+     preferred_paths = paths_by_size_non_dup[:max_images_per_query]
+     scientific_name_to_preferred_images[s] = preferred_paths
+
+ # ...for each scientific name
+
+
+ #%% Delete unused images
+
+ used_images = []
+ for images in scientific_name_to_preferred_images.values():
+     used_images.extend(images)
+
+ print('Using a total of {} images'.format(len(used_images)))
+ used_images_set = set(used_images)
+
+ from megadetector.utils import path_utils
+ all_images = path_utils.recursive_file_list(image_base,False)
+
+ unused_images = []
+ for fn in all_images:
+     if fn not in used_images_set:
+         unused_images.append(fn)
+
+ print('{} of {} files unused (diff {})'.format(len(unused_images),len(all_images),
+     len(all_images) - len(unused_images)))
+
+ for fn in tqdm(unused_images):
+     os.remove(fn)
+
+
+ #%% Produce HTML preview
+
+ with open(html_output_file, 'w', encoding='utf-8') as f:
+
+     f.write('<html><head></head><body>\n')
+
+     names = scientific_name_to_preferred_images.keys()
+     names = sorted(names)
+
+     f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">'
+             'dataset_name: <b><u>category</u></b> mapped to taxonomy_level scientific_name '
+             '(taxonomic_common_name) (manual_common_name)</p>\n')
+
+     # i_row = 2; row = df.iloc[i_row]
+     for i_row, row in tqdm(df.iterrows(), total=len(df)):
+
+         s = row['scientific_name']
+
+         taxonomy_string = row['taxonomy_string']
+         if isinstance(taxonomy_string,str):
+             taxonomic_match = eval(taxonomy_string)
+             matched_entity = taxonomic_match[0]
+             assert len(matched_entity) == 4
+             common_names = matched_entity[3]
+             if len(common_names) == 1:
+                 common_name_string = common_names[0]
+             else:
+                 common_name_string = str(common_names)
+         else:
+             common_name_string = ''
+
+         f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">')
+
+         if isinstance(row.scientific_name,str):
+             output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
+                 row.dataset_name, row.query,
+                 row.taxonomy_level, row.scientific_name, common_name_string,
+                 row.common_name)
+             f.write(output_string)
+         else:
+             f.write('{}: <b><u>{}</u></b> unmapped</p>\n'.format(row.dataset_name,row.query))
+
+         if s is None or s not in names:
+             f.write('<p class="content_p">no images available</p>')
+         else:
+             image_paths = scientific_name_to_preferred_images[s]
+             basedir = os.path.dirname(html_output_file)
+             relative_paths = [os.path.relpath(p,basedir) for p in image_paths]
+             image_paths = [s.replace('\\','/') for s in relative_paths]
+             n_images = len(image_paths)
+             # image_paths = [os.path.relpath(p, output_base) for p in image_paths]
+             image_width_percent = round(100 / n_images)
+             f.write('<table class="image_table"><tr>\n')
+             for image_path in image_paths:
+                 f.write('<td style="vertical-align:top;" width="{}%">'
+                         '<img src="{}" style="display:block; width:100%; vertical-align:top; height:auto;">'
+                         '</td>\n'.format(image_width_percent, image_path))
+             f.write('</tr></table>\n')
+
+     # ...for each row
+
+     f.write('</body></html>\n')
+
+
+ #%% Open HTML preview
+
+ from megadetector.utils.path_utils import open_file
+ open_file(html_output_file)
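
For reference, a minimal usage sketch for the parse_taxonomy_string helper defined near the top of preview_lila_taxonomy.py, using a sample value quoted in the comments above (the exact column format is assumed from those comments):

# Hypothetical input; the tuple layout (id, level, scientific name,
# common names) follows the examples in the script's comments.
ts = "[(43117, 'genus', 'lepus', ['hares and jackrabbits'])]"

sn, cn, level, full_match = parse_taxonomy_string(ts)

# A single common name is unwrapped from its containing list
assert (sn, cn, level) == ('lepus', 'hares and jackrabbits', 'genus')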