megadetector 10.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. megadetector/__init__.py +0 -0
  2. megadetector/api/__init__.py +0 -0
  3. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  7. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  8. megadetector/classification/__init__.py +0 -0
  9. megadetector/classification/aggregate_classifier_probs.py +108 -0
  10. megadetector/classification/analyze_failed_images.py +227 -0
  11. megadetector/classification/cache_batchapi_outputs.py +198 -0
  12. megadetector/classification/create_classification_dataset.py +626 -0
  13. megadetector/classification/crop_detections.py +516 -0
  14. megadetector/classification/csv_to_json.py +226 -0
  15. megadetector/classification/detect_and_crop.py +853 -0
  16. megadetector/classification/efficientnet/__init__.py +9 -0
  17. megadetector/classification/efficientnet/model.py +415 -0
  18. megadetector/classification/efficientnet/utils.py +608 -0
  19. megadetector/classification/evaluate_model.py +520 -0
  20. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  21. megadetector/classification/json_to_azcopy_list.py +63 -0
  22. megadetector/classification/json_validator.py +696 -0
  23. megadetector/classification/map_classification_categories.py +276 -0
  24. megadetector/classification/merge_classification_detection_output.py +509 -0
  25. megadetector/classification/prepare_classification_script.py +194 -0
  26. megadetector/classification/prepare_classification_script_mc.py +228 -0
  27. megadetector/classification/run_classifier.py +287 -0
  28. megadetector/classification/save_mislabeled.py +110 -0
  29. megadetector/classification/train_classifier.py +827 -0
  30. megadetector/classification/train_classifier_tf.py +725 -0
  31. megadetector/classification/train_utils.py +323 -0
  32. megadetector/data_management/__init__.py +0 -0
  33. megadetector/data_management/animl_to_md.py +161 -0
  34. megadetector/data_management/annotations/__init__.py +0 -0
  35. megadetector/data_management/annotations/annotation_constants.py +33 -0
  36. megadetector/data_management/camtrap_dp_to_coco.py +270 -0
  37. megadetector/data_management/cct_json_utils.py +566 -0
  38. megadetector/data_management/cct_to_md.py +184 -0
  39. megadetector/data_management/cct_to_wi.py +293 -0
  40. megadetector/data_management/coco_to_labelme.py +284 -0
  41. megadetector/data_management/coco_to_yolo.py +701 -0
  42. megadetector/data_management/databases/__init__.py +0 -0
  43. megadetector/data_management/databases/add_width_and_height_to_db.py +107 -0
  44. megadetector/data_management/databases/combine_coco_camera_traps_files.py +210 -0
  45. megadetector/data_management/databases/integrity_check_json_db.py +563 -0
  46. megadetector/data_management/databases/subset_json_db.py +195 -0
  47. megadetector/data_management/generate_crops_from_cct.py +200 -0
  48. megadetector/data_management/get_image_sizes.py +164 -0
  49. megadetector/data_management/labelme_to_coco.py +559 -0
  50. megadetector/data_management/labelme_to_yolo.py +349 -0
  51. megadetector/data_management/lila/__init__.py +0 -0
  52. megadetector/data_management/lila/create_lila_blank_set.py +556 -0
  53. megadetector/data_management/lila/create_lila_test_set.py +192 -0
  54. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  55. megadetector/data_management/lila/download_lila_subset.py +182 -0
  56. megadetector/data_management/lila/generate_lila_per_image_labels.py +777 -0
  57. megadetector/data_management/lila/get_lila_annotation_counts.py +174 -0
  58. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  59. megadetector/data_management/lila/lila_common.py +319 -0
  60. megadetector/data_management/lila/test_lila_metadata_urls.py +164 -0
  61. megadetector/data_management/mewc_to_md.py +344 -0
  62. megadetector/data_management/ocr_tools.py +873 -0
  63. megadetector/data_management/read_exif.py +964 -0
  64. megadetector/data_management/remap_coco_categories.py +195 -0
  65. megadetector/data_management/remove_exif.py +156 -0
  66. megadetector/data_management/rename_images.py +194 -0
  67. megadetector/data_management/resize_coco_dataset.py +665 -0
  68. megadetector/data_management/speciesnet_to_md.py +41 -0
  69. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  70. megadetector/data_management/yolo_output_to_md_output.py +594 -0
  71. megadetector/data_management/yolo_to_coco.py +984 -0
  72. megadetector/data_management/zamba_to_md.py +188 -0
  73. megadetector/detection/__init__.py +0 -0
  74. megadetector/detection/change_detection.py +840 -0
  75. megadetector/detection/process_video.py +479 -0
  76. megadetector/detection/pytorch_detector.py +1451 -0
  77. megadetector/detection/run_detector.py +1267 -0
  78. megadetector/detection/run_detector_batch.py +2172 -0
  79. megadetector/detection/run_inference_with_yolov5_val.py +1314 -0
  80. megadetector/detection/run_md_and_speciesnet.py +1604 -0
  81. megadetector/detection/run_tiled_inference.py +1044 -0
  82. megadetector/detection/tf_detector.py +209 -0
  83. megadetector/detection/video_utils.py +1379 -0
  84. megadetector/postprocessing/__init__.py +0 -0
  85. megadetector/postprocessing/add_max_conf.py +72 -0
  86. megadetector/postprocessing/categorize_detections_by_size.py +166 -0
  87. megadetector/postprocessing/classification_postprocessing.py +1943 -0
  88. megadetector/postprocessing/combine_batch_outputs.py +249 -0
  89. megadetector/postprocessing/compare_batch_results.py +2110 -0
  90. megadetector/postprocessing/convert_output_format.py +403 -0
  91. megadetector/postprocessing/create_crop_folder.py +629 -0
  92. megadetector/postprocessing/detector_calibration.py +570 -0
  93. megadetector/postprocessing/generate_csv_report.py +522 -0
  94. megadetector/postprocessing/load_api_results.py +223 -0
  95. megadetector/postprocessing/md_to_coco.py +428 -0
  96. megadetector/postprocessing/md_to_labelme.py +351 -0
  97. megadetector/postprocessing/md_to_wi.py +41 -0
  98. megadetector/postprocessing/merge_detections.py +392 -0
  99. megadetector/postprocessing/postprocess_batch_results.py +2140 -0
  100. megadetector/postprocessing/remap_detection_categories.py +226 -0
  101. megadetector/postprocessing/render_detection_confusion_matrix.py +677 -0
  102. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +206 -0
  103. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +82 -0
  104. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1665 -0
  105. megadetector/postprocessing/separate_detections_into_folders.py +795 -0
  106. megadetector/postprocessing/subset_json_detector_output.py +964 -0
  107. megadetector/postprocessing/top_folders_to_bottom.py +238 -0
  108. megadetector/postprocessing/validate_batch_results.py +332 -0
  109. megadetector/taxonomy_mapping/__init__.py +0 -0
  110. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  111. megadetector/taxonomy_mapping/map_new_lila_datasets.py +211 -0
  112. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +165 -0
  113. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +543 -0
  114. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  115. megadetector/taxonomy_mapping/simple_image_download.py +231 -0
  116. megadetector/taxonomy_mapping/species_lookup.py +1008 -0
  117. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  118. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  119. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  120. megadetector/tests/__init__.py +0 -0
  121. megadetector/tests/test_nms_synthetic.py +335 -0
  122. megadetector/utils/__init__.py +0 -0
  123. megadetector/utils/ct_utils.py +1857 -0
  124. megadetector/utils/directory_listing.py +199 -0
  125. megadetector/utils/extract_frames_from_video.py +307 -0
  126. megadetector/utils/gpu_test.py +125 -0
  127. megadetector/utils/md_tests.py +2072 -0
  128. megadetector/utils/path_utils.py +2872 -0
  129. megadetector/utils/process_utils.py +172 -0
  130. megadetector/utils/split_locations_into_train_val.py +237 -0
  131. megadetector/utils/string_utils.py +234 -0
  132. megadetector/utils/url_utils.py +825 -0
  133. megadetector/utils/wi_platform_utils.py +968 -0
  134. megadetector/utils/wi_taxonomy_utils.py +1766 -0
  135. megadetector/utils/write_html_image_list.py +239 -0
  136. megadetector/visualization/__init__.py +0 -0
  137. megadetector/visualization/plot_utils.py +309 -0
  138. megadetector/visualization/render_images_with_thumbnails.py +243 -0
  139. megadetector/visualization/visualization_utils.py +1973 -0
  140. megadetector/visualization/visualize_db.py +630 -0
  141. megadetector/visualization/visualize_detector_output.py +498 -0
  142. megadetector/visualization/visualize_video_output.py +705 -0
  143. megadetector-10.0.15.dist-info/METADATA +115 -0
  144. megadetector-10.0.15.dist-info/RECORD +147 -0
  145. megadetector-10.0.15.dist-info/WHEEL +5 -0
  146. megadetector-10.0.15.dist-info/licenses/LICENSE +19 -0
  147. megadetector-10.0.15.dist-info/top_level.txt +1 -0
@@ -0,0 +1,491 @@
1
+ """
2
+
3
+ map_lila_taxonomy_to_wi_taxonomy.py
4
+
5
+ Loads the LILA category mapping (in which taxonomy information comes from an
6
+ iNat taxonomy snapshot) and tries to map each class to the Wildlife Insights taxonomy.
7
+
8
+ """
9
+
10
+ #%% Constants and imports
11
+
12
+ import numpy as np
13
+ import json
14
+ import os
15
+
16
+ from tqdm import tqdm
17
+
18
+ from megadetector.data_management.lila.lila_common import \
19
+ read_lila_taxonomy_mapping, read_wildlife_insights_taxonomy_mapping
20
+
21
+
22
+ #%% Prevent execution during infrastructural imports
23
+
24
+ if False:
25
+
26
+ #%%
27
+
28
+ lila_local_base = os.path.expanduser('~/lila')
29
+
30
+ metadata_dir = os.path.join(lila_local_base, 'metadata')
31
+ os.makedirs(metadata_dir, exist_ok=True)
32
+
33
+ # Created by get_lila_category_list.py... contains counts for each category
34
+ category_list_dir = os.path.join(lila_local_base, 'lila_categories_list')
35
+ lila_dataset_to_categories_file = os.path.join(
36
+ category_list_dir, 'lila_dataset_to_categories.json')
37
+
38
+ # This is a manually-curated file used to store mappings that had to be made manually
39
+ lila_to_wi_supplementary_mapping_file = os.path.expanduser(
40
+ '~/git/MegaDetector/taxonomy_mapping/lila_to_wi_supplementary_mapping_file.csv')
41
+
42
+ assert os.path.isfile(lila_dataset_to_categories_file)
43
+
44
+ # This is the main output file from this whole process
45
+ wi_mapping_table_file = os.path.join(lila_local_base,'lila_wi_mapping_table.csv')
46
+
47
+ id_column = 'uniqueIdentifier' # 'id'
48
+
49
+
50
+ #%% Load category and taxonomy files
51
+
52
+ with open(lila_dataset_to_categories_file, 'r') as f:
53
+ lila_dataset_to_categories = json.load(f)
54
+
55
+ lila_taxonomy_df = read_lila_taxonomy_mapping(metadata_dir)
56
+
57
+ wi_taxonomy_df = read_wildlife_insights_taxonomy_mapping(metadata_dir)
58
+
59
+
60
+ #%% Pull everything out of pandas
61
+
62
+ lila_taxonomy = lila_taxonomy_df.to_dict('records')
63
+ wi_taxonomy = wi_taxonomy_df.to_dict('records')
64
+
65
+
66
+ #%% Cache WI taxonomy lookups
67
+
68
+ def _is_empty_wi_item(v):
69
+ if isinstance(v, str):
70
+ return len(v) == 0
71
+ elif v is None:
72
+ return True
73
+ else:
74
+ assert isinstance(v, float) and np.isnan(v), 'Invalid item: {}'.format(str(v))
75
+ return True
76
+
77
+
78
+ def _taxonomy_items_equal(a, b):
79
+ if isinstance(a, str) and (not isinstance(b, str)):
80
+ return False
81
+ if isinstance(b, str) and (not isinstance(a, str)):
82
+ return False
83
+ if (not isinstance(a, str)) or (not isinstance(b, str)):
84
+ assert (a is None and b is None) or (isinstance(a, float) and isinstance(b, float))
85
+ return True
86
+ return a == b
87
+
88
+
89
+ for taxon in wi_taxonomy:
90
+ taxon['taxon_name'] = None
91
+
92
+ from collections import defaultdict
93
+ wi_taxon_name_to_taxa = defaultdict(list)
94
+
95
+ # This is just a handy lookup table that we'll use to debug mismatches
96
+ wi_common_name_to_taxon = {}
97
+
98
+ blank_taxon_name = 'blank'
99
+ blank_taxon = None
100
+
101
+ animal_taxon_name = 'animal'
102
+ animal_taxon = None
103
+
104
+ unknown_taxon_name = 'unknown'
105
+ unknown_taxon = None
106
+
107
+ ignore_taxa = set(['No CV Result', 'CV Needed', 'CV Failed'])
108
+
109
+ known_problematic_taxon_ids = ['f94e6d97-59cf-4d38-a05a-a75efdd2863b']
110
+
111
+ human_taxa = []
112
+
113
+ # taxon = wi_taxonomy[21653]; print(taxon)
114
+ for taxon in tqdm(wi_taxonomy):
115
+
116
+ taxon_name = None
117
+
118
+ assert taxon['taxonomyType'] == 'object' or taxon['taxonomyType'] == 'biological'
119
+
120
+ for k in taxon.keys():
121
+ v = taxon[k]
122
+ if isinstance(v,str):
123
+ taxon[k] = v.strip()
124
+
125
+ if taxon['commonNameEnglish'] in ignore_taxa:
126
+ continue
127
+
128
+ if isinstance(taxon['commonNameEnglish'], str):
129
+
130
+ wi_common_name_to_taxon[taxon['commonNameEnglish'].strip(
131
+ ).lower()] = taxon
132
+
133
+ special_taxon = False
134
+
135
+ # Look for keywords that don't refer to specific taxa: blank/animal/unknown
136
+ if taxon['commonNameEnglish'].strip().lower() == blank_taxon_name:
137
+ blank_taxon = taxon
138
+ special_taxon = True
139
+
140
+ elif taxon['commonNameEnglish'].strip().lower() == animal_taxon_name:
141
+ animal_taxon = taxon
142
+ special_taxon = True
143
+
144
+ elif taxon['commonNameEnglish'].strip().lower() == unknown_taxon_name:
145
+ unknown_taxon = taxon
146
+ special_taxon = True
147
+
148
+ if special_taxon:
149
+ taxon_name = taxon['commonNameEnglish'].strip().lower()
150
+ taxon['taxon_name'] = taxon_name
151
+ wi_taxon_name_to_taxa[taxon_name].append(taxon)
152
+ continue
153
+
154
+ # Do we have a species name?
155
+ if not _is_empty_wi_item(taxon['species']):
156
+
157
+ # If 'species' is populated, 'genus' should always be populated; one item currently breaks
158
+ # this rule.
159
+ assert not _is_empty_wi_item(taxon['genus'])
160
+
161
+ taxon_name = (taxon['genus'].strip() + ' ' +
162
+ taxon['species'].strip()).strip().lower()
163
+ assert not _is_empty_wi_item(taxon['class']) and \
164
+ not _is_empty_wi_item(taxon['order']) and \
165
+ not _is_empty_wi_item(taxon['family'])
166
+
167
+ elif not _is_empty_wi_item(taxon['genus']):
168
+
169
+ assert not _is_empty_wi_item(taxon['class']) and \
170
+ not _is_empty_wi_item(taxon['order']) and \
171
+ not _is_empty_wi_item(taxon['family'])
172
+ taxon_name = taxon['genus'].strip().lower()
173
+
174
+ elif not _is_empty_wi_item(taxon['family']):
175
+
176
+ assert not _is_empty_wi_item(taxon['class']) and \
177
+ not _is_empty_wi_item(taxon['order'])
178
+ taxon_name = taxon['family'].strip().lower()
179
+
180
+ elif not _is_empty_wi_item(taxon['order']):
181
+
182
+ assert not _is_empty_wi_item(taxon['class'])
183
+ taxon_name = taxon['order'].strip().lower()
184
+
185
+ elif not _is_empty_wi_item(taxon['class']):
186
+
187
+ taxon_name = taxon['class'].strip().lower()
188
+
189
+ if taxon_name is not None:
190
+ assert taxon['taxonomyType'] == 'biological'
191
+ else:
192
+ assert taxon['taxonomyType'] == 'object'
193
+ taxon_name = taxon['commonNameEnglish'].strip().lower()
194
+
195
+ if taxon_name in wi_taxon_name_to_taxa:
196
+ if taxon[id_column] in known_problematic_taxon_ids:
197
+ print('Skipping problematic taxon ID {}'.format(taxon[id_column]))
198
+ else:
199
+ previous_taxa = wi_taxon_name_to_taxa[taxon_name]
200
+ for previous_taxon in previous_taxa:
201
+ for level in ['class', 'order', 'family', 'genus', 'species']:
202
+ error_string = 'Error: taxon {} appeared previously in {} {} (as {}), now in {} {}'.format(
203
+ taxon_name,
204
+ level,previous_taxon[level],
205
+ previous_taxon['taxon_name'],
206
+ level,taxon[level])
207
+ assert _taxonomy_items_equal(previous_taxon[level], taxon[level]), error_string
208
+
209
+ taxon['taxon_name'] = taxon_name
210
+ if taxon_name == 'homo sapiens':
211
+ human_taxa.append(taxon)
212
+ wi_taxon_name_to_taxa[taxon_name].append(taxon)
213
+
214
+ # ...for each taxon
215
+
216
+ assert unknown_taxon is not None
217
+ assert animal_taxon is not None
218
+ assert blank_taxon is not None
219
+
220
+
221
+ #%% Find redundant taxa
222
+
223
# Collect every WI taxon name that is currently backed by more than one taxonomy row
taxon_names_with_multiple_entries = [
    wi_taxon_name
    for wi_taxon_name, taxa_for_name in wi_taxon_name_to_taxa.items()
    if len(taxa_for_name) > 1
]

# The colon terminates the heading line (it used to be printed alone on the next line)
print('{} names have multiple entries:\n'.format(len(taxon_names_with_multiple_entries)))

for s in taxon_names_with_multiple_entries:
    print(s)
232
+
233
+ if False:
234
+ pass
235
+
236
+ #%% Manual review of redundant taxa
237
+
238
+ s = taxon_names_with_multiple_entries[15]
239
+ taxa = wi_taxon_name_to_taxa[s]
240
+ for t in taxa:
241
+ for k in t.keys():
242
+ print('{}: {}'.format(k,t[k]))
243
+ print()
244
+ # print(t,end='\n\n')
245
+
246
+
247
+ #%% Clean up redundant taxa
248
+
249
# For each WI taxon name that appears in more than one taxonomy row, the ID of the
# single row we want to keep.  The comment above each entry describes the competing
# rows.
taxon_name_to_preferred_taxon_id = {

    # "helmeted guineafowl" vs "domestic guineafowl"
    'numida meleagris': '83133617-8358-4910-82ee-4c23e40ba3dc', # 2005826

    # "domestic turkey" vs. "wild turkey"
    'meleagris gallopavo': 'c10547c3-1748-48bf-a451-8066c820f22f', # 2021598

    # multiple sensible human entries
    'homo sapiens': '990ae9dd-7a59-4344-afcb-1b7b21368000', # 2002045

    # "domestic dog" and "dog-on-leash"
    'canis familiaris': '3d80f1d6-b1df-4966-9ff4-94053c7a902a', # 2021548

    # "small mammal" vs. "mammal"
    'mammalia': 'f2d233e3-80e3-433d-9687-e29ecc7a467a', # 2021108

    # "Hispaniolan Mango" vs. NaN
    'anthracothorax dominicus': 'f94e6d97-59cf-4d38-a05a-a75efdd2863b',

    # "millipedes" vs. "Millipede"
    'diplopoda': '065884eb-4e64-4233-84dc-de25bd06ffd2', # 2021760

    # Different suborders: Squamata vs. Lacertilia
    'squamata': '710c4066-bd5d-4313-bcf4-0217c4c84da7', # 2021703

    # Redundancy (both "beautiful firetail")
    'stagonopleura bella': '7fec8e7e-fd3b-4d7f-99fd-3ade6f3bbaa5', # 2021939

    # "yellow wagtail" vs. "yellow crowned-wagtail"
    'motacilla flava': 'ac6669bc-9f9e-4473-b609-b9082f9bf50c', # 2016194

    # "dremomys species" vs. "dremomys genus"
    'dremomys': '1507d153-af11-46f1-bfb8-77918d035ab3', # 2019370

    # "elk" vs. "domestic elk"
    'cervus canadensis': 'c5ce946f-8f0d-4379-992b-cc0982381f5e',

    # "American bison" vs. "domestic bison"
    'bison bison': '539ebd55-081b-429a-9ae6-5a6a0f6999d4', # 2021593

    # "woodrat or rat or mouse species" vs. "mouse species"
    'muridae': 'e7503287-468c-45af-a1bd-a17821bb62f2', # 2021642

    # both "southern sand frog"
    'tomopterna adiastola': 'a5dc63cb-41be-4090-84a7-b944b16dcee4', # 2021834

    # sericornis species vs. scrubwren species
    'sericornis': 'ad82c0ac-df48-4028-bf71-d2b2f4bc4129', # 2021776
}
298
+
299
+
300
+ # taxon_name = list(taxon_name_to_preferred_taxon_id.keys())[0]
301
for taxon_name, preferred_taxon_id in taxon_name_to_preferred_taxon_id.items():

    candidates = wi_taxon_name_to_taxa[taxon_name]

    # Every name in the preference table is expected to still be ambiguous.
    #
    # If an ambiguity gets resolved upstream, this assertion fires, and the entry
    # should be removed from taxon_name_to_preferred_taxon_id.
    assert len(candidates) > 1, 'Only one taxon available for {}'.format(taxon_name)

    # Keep only the row whose ID is the preferred one
    matching_taxa = [t for t in candidates if t[id_column] == preferred_taxon_id]
    assert len(matching_taxa) == 1
    wi_taxon_name_to_taxa[taxon_name] = matching_taxa

# Every name should now map to exactly one taxon, so flatten name --> [taxon]
# into name --> taxon
wi_taxon_name_to_taxon = {}

for taxon_name, taxa_for_name in wi_taxon_name_to_taxa.items():
    assert len(taxa_for_name) == 1
    wi_taxon_name_to_taxon[taxon_name] = taxa_for_name[0]
323
+
324
+
325
+ #%% Read supplementary mappings
326
+
327
import csv

# Use the csv module rather than splitting on ',' so that quoted fields (e.g. a notes
# column that contains commas) are parsed correctly.
with open(lila_to_wi_supplementary_mapping_file, 'r', newline='') as f:
    supplementary_rows = list(csv.reader(f))

# Maps a lower-cased LILA query to a lower-cased WI taxon name
supplementary_lila_query_to_wi_query = {}

for i_row, tokens in enumerate(supplementary_rows):

    # Tolerate blank lines in this manually-edited file
    if not any(token.strip() for token in tokens):
        continue

    # Each row is [lila query],[WI taxon name],[notes]
    assert len(tokens) == 3, \
        'Expected 3 columns on row {} of {}, found {}: {}'.format(
            i_row + 1, lila_to_wi_supplementary_mapping_file, len(tokens), tokens)

    lila_query = tokens[0].strip().lower()
    wi_taxon_name = tokens[1].strip().lower()

    # The target has to be a name we know from the WI taxonomy
    assert wi_taxon_name in wi_taxon_name_to_taxa, \
        'Unknown WI taxon name "{}" on row {} of {}'.format(
            wi_taxon_name, i_row + 1, lila_to_wi_supplementary_mapping_file)

    supplementary_lila_query_to_wi_query[lila_query] = wi_taxon_name
340
+
341
+
342
+ #%% Map LILA categories to WI categories
343
+
344
+ mismatches = set()
345
+ mismatches_with_common_mappings = set()
346
+ supplementary_mappings = set()
347
+
348
+ all_searches = set()
349
+
350
+ # Must be ordered from kingdom --> species
351
+ lila_taxonomy_levels = ['kingdom', 'phylum', 'subphylum', 'superclass', 'class', 'subclass',
352
+ 'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
353
+ 'superfamily', 'family', 'subfamily', 'tribe', 'genus', 'species']
354
+
355
+ unknown_queries = set(
356
+ ['unidentifiable', 'other', 'unidentified', 'unknown', 'unclassifiable'])
357
+ blank_queries = set(['empty'])
358
+ animal_queries = set(['animalia'])
359
+
360
+ lila_dataset_category_to_wi_taxon = {}
361
+
362
+ # i_taxon = 0; taxon = lila_taxonomy[i_taxon]; print(taxon)
363
+ for i_taxon, lila_taxon in enumerate(lila_taxonomy):
364
+
365
+ query = None
366
+
367
+ lila_dataset_category = lila_taxon['dataset_name'] + ':' + lila_taxon['query']
368
+
369
+ # Go from kingdom --> species, choosing the lowest-level description as the query
370
+ for level in lila_taxonomy_levels:
371
+ if isinstance(lila_taxon[level], str):
372
+ query = lila_taxon[level]
373
+ all_searches.add(query)
374
+
375
+ if query is None:
376
+ # E.g., 'car'
377
+ query = lila_taxon['query']
378
+
379
+ wi_taxon = None
380
+
381
+ if query in unknown_queries:
382
+
383
+ wi_taxon = unknown_taxon
384
+
385
+ elif query in blank_queries:
386
+
387
+ wi_taxon = blank_taxon
388
+
389
+ elif query in animal_queries:
390
+
391
+ wi_taxon = animal_taxon
392
+
393
+ elif query in wi_taxon_name_to_taxon:
394
+
395
+ wi_taxon = wi_taxon_name_to_taxon[query]
396
+
397
+ elif query in supplementary_lila_query_to_wi_query:
398
+
399
+ wi_taxon = wi_taxon_name_to_taxon[supplementary_lila_query_to_wi_query[query]]
400
+ supplementary_mappings.add(query)
401
+ # print('Made a supplementary mapping from {} to {}'.format(query,wi_taxon['taxon_name']))
402
+
403
+ else:
404
+
405
+ # print('No match for {}'.format(query))
406
+ lila_common_name = lila_taxon['common_name']
407
+
408
+ if lila_common_name in wi_common_name_to_taxon:
409
+ wi_taxon = wi_common_name_to_taxon[lila_common_name]
410
+ wi_common_name = wi_taxon['commonNameEnglish']
411
+ wi_taxon_name = wi_taxon['taxon_name']
412
+ if False:
413
+ print('LILA common name {} maps to WI taxon {} ({})'.format(lila_common_name,
414
+ wi_taxon_name,
415
+ wi_common_name))
416
+ mismatches_with_common_mappings.add(query)
417
+
418
+ else:
419
+
420
+ mismatches.add(query)
421
+
422
+ lila_dataset_category_to_wi_taxon[lila_dataset_category] = wi_taxon
423
+
424
+ # ...for each LILA taxon
425
+
426
+ print('Of {} entities, there are {} mismatches ({} mapped by common name) ({} mapped by supplementary mapping file)'.format(
427
+ len(all_searches), len(mismatches), len(mismatches_with_common_mappings), len(supplementary_mappings)))
428
+
429
+ assert len(mismatches) == 0
430
+
431
+
432
+ #%% Manual mapping
433
+
434
# Create a template for the manually-curated mapping file, one row per unmatched
# query; never overwrite an existing file, since it contains manual edits.
if not os.path.isfile(lila_to_wi_supplementary_mapping_file):
    print('Creating mapping file {}'.format(
        lila_to_wi_supplementary_mapping_file))
    with open(lila_to_wi_supplementary_mapping_file, 'w') as f:
        # Write all three columns ([lila query],[WI taxon name],[notes]) so that each
        # row already has the shape that the code reading this file asserts on; the
        # last two columns are left empty to be filled in by hand.
        #
        # Sorting makes the output deterministic (sets have no stable order).
        for query in sorted(mismatches):
            f.write(query + ',,' + '\n')
else:
    print('{} exists, not re-writing'.format(lila_to_wi_supplementary_mapping_file))
442
+
443
+
444
+ #%% Build a dictionary from LILA dataset names and categories to LILA taxa
445
+
446
# Maps "[dataset name]:[query]" to the corresponding LILA taxonomy record
lila_dataset_category_to_lila_taxon = {}

# d = lila_taxonomy[0]
for d in lila_taxonomy:
    lila_dataset_category = ':'.join([d['dataset_name'], d['query']])
    # Each dataset/category pair should appear only once in the LILA taxonomy
    assert lila_dataset_category not in lila_dataset_category_to_lila_taxon
    lila_dataset_category_to_lila_taxon[lila_dataset_category] = d
453
+
454
+
455
+ #%% Map LILA datasets to WI taxa, and count the number of each taxon available in each dataset
456
+
457
import csv

with open(wi_mapping_table_file,'w') as f:

    # Let the csv module do the quoting, so names that contain commas or quotes don't
    # corrupt the table.  lineterminator is '\n' so that rows are terminated the same
    # way as the hand-built lines this replaces.
    mapping_writer = csv.writer(f, lineterminator='\n')

    mapping_writer.writerow(
        ['lila_dataset_name','lila_category_name','wi_guid','wi_taxon_name','wi_common','count'])

    # dataset_name = list(lila_dataset_to_categories.keys())[0]
    for dataset_name, dataset_categories in lila_dataset_to_categories.items():

        # Skip the bounding-box variants of each dataset
        if '_bbox' in dataset_name:
            continue

        # category = dataset_categories[0]
        for category in dataset_categories:

            lila_dataset_category = dataset_name + ':' + category['name'].strip().lower()

            # Categories containing '#' are not mapped
            if '#' in lila_dataset_category:
                continue

            assert lila_dataset_category in lila_dataset_category_to_lila_taxon
            assert lila_dataset_category in lila_dataset_category_to_wi_taxon
            assert 'count' in category

            wi_taxon = lila_dataset_category_to_wi_taxon[lila_dataset_category]

            # Write out the dataset name, category name, WI GUID, WI scientific name,
            # WI common name, and count
            mapping_writer.writerow([
                dataset_name,
                category['name'],
                wi_taxon['uniqueIdentifier'],
                wi_taxon['taxon_name'],
                wi_taxon['commonNameEnglish'],
                category['count']
            ])

        # ...for each category in this dataset

    # ...for each dataset

# ...with open()
@@ -0,0 +1,211 @@
1
+ """
2
+
3
+ map_new_lila_datasets.py
4
+
5
+ Given a subset of LILA datasets, find all the categories, and start the taxonomy
6
+ mapping process.
7
+
8
+ """
9
+
10
+ #%% Constants and imports
11
+
12
+ import os
13
+ import json
14
+
15
+ # Created by get_lila_category_list.py
16
+ input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
17
+
18
+ output_file = os.path.expanduser('~/lila/lila_additions_2025.11.17.csv')
19
+
20
+ datasets_to_map = [
21
+ 'Biome Health Project Maasai Mara 2018'
22
+ ]
23
+
24
+
25
+ #%% Initialize taxonomic lookup
26
+
27
+ # Takes ~2 mins
28
+
29
+ from megadetector.taxonomy_mapping.species_lookup import \
30
+ initialize_taxonomy_lookup, get_preferred_taxonomic_match
31
+
32
+ initialize_taxonomy_lookup(force_init=False)
33
+
34
+
35
+ #%% Read the list of datasets
36
+
37
with open(input_lila_category_list_file,'r') as f:
    input_lila_categories = json.load(f)

# The script that generates this dictionary creates a separate entry for bounding box
# metadata files, but those don't represent new dataset names, so we fold them into
# the base dataset name here.
lila_datasets = {name.replace('_bbox','') for name in input_lila_categories.keys()}

# Every dataset we've been asked to map has to exist in the category list
for s in datasets_to_map:
    assert s in lila_datasets
49
+
50
+
51
+ #%% Find all categories
52
+
53
# Strings of the form "[dataset name]:[category name]", one per category to be mapped
category_mappings = []

# dataset_name = datasets_to_map[0]
for dataset_name in datasets_to_map:

    for category in input_lila_categories[dataset_name]:

        category_name = category['name']

        # ':' is the separator in the mapping string, so it can't appear in the name
        assert ':' not in category_name
        category_mappings.append(dataset_name + ':' + category_name)

print('Need to create {} mappings'.format(len(category_mappings)))
66
+
67
+
68
+ #%% Match every query against our taxonomies
69
+
70
output_rows = []

taxonomy_preference = 'inat'

allow_non_preferred_matches = True

# mapping_string = category_mappings[1]; print(mapping_string)
for mapping_string in category_mappings:

    tokens = mapping_string.split(':')
    assert len(tokens) == 2

    dataset_name, query = tokens

    taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)

    # Start from an unmapped row...
    output_row = {
        'dataset_name': dataset_name,
        'query': query,
        'source': '',
        'taxonomy_level': '',
        'scientific_name': '',
        'common_name': '',
        'taxonomy_string': ''
    }

    # ...and fill in the taxonomy columns if the match came from the preferred
    # source, or if we are accepting matches from any source
    use_match = allow_non_preferred_matches or \
        (taxonomic_match.source == taxonomy_preference)

    if use_match:
        output_row['source'] = taxonomic_match.source
        output_row['taxonomy_level'] = taxonomic_match.taxonomic_level
        output_row['scientific_name'] = taxonomic_match.scientific_name
        output_row['common_name'] = taxonomic_match.common_name
        output_row['taxonomy_string'] = taxonomic_match.taxonomy_string

    output_rows.append(output_row)

# ...for each mapping
114
+
115
+
116
+ #%% Write output rows
117
+
118
import os
import pandas as pd

# Refuse to clobber a previous run; that file may have been edited by hand
assert not os.path.isfile(output_file), 'Delete the output file before re-generating'

# Fix the column order explicitly rather than relying on dict ordering
output_columns = [
    'dataset_name',
    'query',
    'source',
    'taxonomy_level',
    'scientific_name',
    'common_name',
    'taxonomy_string'
]

output_df = pd.DataFrame(data=output_rows, columns=output_columns)
output_df.to_csv(output_file, index=None, header=True)
127
+
128
+ # from megadetector.utils.path_utils import open_file; open_file(output_file)
129
+
130
+
131
+ #%% Remap missing entries in the .csv file
132
+
133
+ # ...typically because I made a change to the mapping code.
134
+
135
from megadetector.utils.path_utils import insert_before_extension
from megadetector.utils.ct_utils import is_empty

remapped_file = insert_before_extension(output_file,'remapped')

df = pd.read_csv(output_file)

# (column in the .csv file, attribute on the match object); note that the level column
# is called "taxonomy_level", but the attribute is called "taxonomic_level"
column_and_attribute_names = (
    ('source', 'source'),
    ('taxonomy_level', 'taxonomic_level'),
    ('scientific_name', 'scientific_name'),
    ('common_name', 'common_name'),
    ('taxonomy_string', 'taxonomy_string')
)

for i_row,row in df.iterrows():

    # Rows that already have a source don't need to be mapped
    if not is_empty(row['source']):
        continue

    query = row['query']
    print('Mapping {}'.format(query))

    taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)

    # Only matches from the preferred taxonomy are written back
    if taxonomic_match.source != taxonomy_preference:
        continue

    # Read everything off the match first, then write the values to the corresponding
    # columns in the current row in df
    new_values = [getattr(taxonomic_match, attribute_name)
                  for _, attribute_name in column_and_attribute_names]

    for (column_name, _), new_value in zip(column_and_attribute_names, new_values):
        df.loc[i_row, column_name] = new_value

# ...for each row

df.to_csv(remapped_file, index=None, header=True)
175
+
176
+
177
+ #%% Manual lookup
178
+
179
+ if False:
180
+
181
+ #%% You probably want to open the .csv file first
182
+
183
+ from megadetector.utils.path_utils import open_file
184
+ open_file(output_file)
185
+
186
+
187
+ #%%
188
+
189
+ from megadetector.taxonomy_mapping.species_lookup import pop_levels
190
+
191
+ # Use this when an iNat match includes an empty subgenus with the same name as the genus
192
+ n_levels_to_pop = 0
193
+ q = 'hirundinidae'
194
+
195
+ taxonomy_preference = 'inat'
196
+ m = get_preferred_taxonomic_match(q,taxonomy_preference)
197
+ if n_levels_to_pop > 0:
198
+ m = pop_levels(m,n_levels_to_pop)
199
+
200
+ # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
201
+ # common_name = eval(m.__dict__['taxonomy_string'])[0][-1][0]; print(common_name); clipboard.copy(common_name)
202
+
203
+ if (m is None) or (len(m.taxonomy_string) == 0):
204
+ print('No match')
205
+ else:
206
+ if m.source != taxonomy_preference:
207
+ print('\n*** non-preferred match ***\n')
208
+ # raise ValueError('')
209
+ print(m.source)
210
+ print(m.taxonomy_string)
211
+ import clipboard; clipboard.copy(m.taxonomy_string)