megadetector-5.0.8-py3-none-any.whl → megadetector-5.0.10-py3-none-any.whl

This diff represents the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of megadetector might be problematic.
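
For reference, a diff like this one can be reproduced locally: wheels are ordinary zip archives, so both releases can be downloaded, unpacked, and compared directly. A minimal sketch (directory names are arbitrary; assumes pip is on the PATH):

import subprocess, zipfile, pathlib, filecmp

work = pathlib.Path('md_diff')
for version in ('5.0.8', '5.0.10'):
    dest = work / version
    dest.mkdir(parents=True, exist_ok=True)
    # 'pip download --no-deps' fetches just the megadetector wheel
    subprocess.run(['pip', 'download', 'megadetector==' + version,
                    '--no-deps', '-d', str(dest)], check=True)
    # A .whl file is a zip archive; unpack it for comparison
    with zipfile.ZipFile(next(dest.glob('*.whl'))) as z:
        z.extractall(dest / 'unpacked')

# Recursively report files that differ or exist on only one side
filecmp.dircmp(str(work / '5.0.8' / 'unpacked'),
               str(work / '5.0.10' / 'unpacked')).report_full_closure()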

Files changed (190)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +65 -65
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
  20. api/batch_processing/postprocessing/compare_batch_results.py +113 -43
  21. api/batch_processing/postprocessing/convert_output_format.py +41 -16
  22. api/batch_processing/postprocessing/load_api_results.py +16 -17
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +52 -22
  25. api/batch_processing/postprocessing/merge_detections.py +14 -14
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
  27. api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +102 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -263
  71. data_management/coco_to_yolo.py +79 -58
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +62 -24
  76. data_management/databases/subset_json_db.py +24 -15
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -162
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -158
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +7 -7
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +65 -24
  120. data_management/labelme_to_yolo.py +8 -8
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +13 -13
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +44 -110
  128. data_management/lila/generate_lila_per_image_labels.py +55 -42
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +96 -33
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +110 -97
  135. data_management/remap_coco_categories.py +83 -83
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +30 -23
  138. data_management/wi_download_csv_to_coco.py +246 -239
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +300 -60
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +179 -113
  147. detection/run_inference_with_yolov5_val.py +108 -48
  148. detection/run_tiled_inference.py +111 -40
  149. detection/tf_detector.py +51 -29
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +228 -68
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -871
  157. md_utils/path_utils.py +460 -134
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +176 -60
  163. md_utils/write_html_image_list.py +40 -33
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +597 -291
  168. md_visualization/visualize_db.py +76 -48
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/METADATA +13 -7
  171. megadetector-5.0.10.dist-info/RECORD +224 -0
  172. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/top_level.txt +1 -0
  173. taxonomy_mapping/__init__.py +0 -0
  174. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  175. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  176. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  177. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  178. taxonomy_mapping/retrieve_sample_image.py +12 -12
  179. taxonomy_mapping/simple_image_download.py +11 -11
  180. taxonomy_mapping/species_lookup.py +10 -10
  181. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  182. taxonomy_mapping/taxonomy_graph.py +47 -47
  183. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  184. data_management/cct_json_to_filename_json.py +0 -89
  185. data_management/cct_to_csv.py +0 -140
  186. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  187. detection/detector_training/copy_checkpoints.py +0 -43
  188. megadetector-5.0.8.dist-info/RECORD +0 -205
  189. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/LICENSE +0 -0
  190. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/WHEEL +0 -0
taxonomy_mapping/map_new_lila_datasets.py
@@ -1,154 +1,154 @@
-########
-#
-# map_new_lila_datasets.py
-#
-# Given a subset of LILA datasets, find all the categories, and start the taxonomy
-# mapping process.
-#
-########
-
-#%% Constants and imports
-
-import os
-import json
-
-# Created by get_lila_category_list.py
-input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
-
-output_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
-
-datasets_to_map = [
-    'Trail Camera Images of New Zealand Animals'
-    ]
-
-
-#%% Initialize taxonomic lookup
-
-from taxonomy_mapping.species_lookup import (
-    initialize_taxonomy_lookup,
-    get_preferred_taxonomic_match)
-
-# from taxonomy_mapping.species_lookup import (
-#     get_taxonomic_info, print_taxonomy_matche)
-
-initialize_taxonomy_lookup(force_init=False)
-
-
-#%% Read the list of datasets
-
-with open(input_lila_category_list_file,'r') as f:
-    input_lila_categories = json.load(f)
-
-lila_datasets = set()
-
-for dataset_name in input_lila_categories.keys():
-    # The script that generates this dictionary creates a separate entry for bounding box
-    # metadata files, but those don't represent new dataset names
-    lila_datasets.add(dataset_name.replace('_bbox',''))
-
-for s in datasets_to_map:
-    assert s in lila_datasets
-
-
-#%% Find all categories
-
-category_mappings = []
-
-# dataset_name = datasets_to_map[0]
-for dataset_name in datasets_to_map:
-
-    ds_categories = input_lila_categories[dataset_name]
-    for category in ds_categories:
-        category_name = category['name']
-        assert ':' not in category_name
-        mapping_name = dataset_name + ':' + category_name
-        category_mappings.append(mapping_name)
-
-print('Need to create {} mappings'.format(len(category_mappings)))
-
-
-#%% Match every query against our taxonomies
-
-output_rows = []
-
-taxonomy_preference = 'inat'
-
-allow_non_preferred_matches = True
-
-# mapping_string = category_mappings[1]; print(mapping_string)
-for mapping_string in category_mappings:
-
-    tokens = mapping_string.split(':')
-    assert len(tokens) == 2
-
-    dataset_name = tokens[0]
-    query = tokens[1]
-
-    taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)
-
-    if (taxonomic_match.source == taxonomy_preference) or allow_non_preferred_matches:
-
-        output_row = {
-            'dataset_name': dataset_name,
-            'query': query,
-            'source': taxonomic_match.source,
-            'taxonomy_level': taxonomic_match.taxonomic_level,
-            'scientific_name': taxonomic_match.scientific_name,
-            'common_name': taxonomic_match.common_name,
-            'taxonomy_string': taxonomic_match.taxonomy_string
-        }
-
-    else:
-
-        output_row = {
-            'dataset_name': dataset_name,
-            'query': query,
-            'source': '',
-            'taxonomy_level': '',
-            'scientific_name': '',
-            'common_name': '',
-            'taxonomy_string': ''
-        }
-
-    output_rows.append(output_row)
-
-# ...for each mapping
-
-
-#%% Write output rows
-
-import os
-import pandas as pd
-
-assert not os.path.isfile(output_file), 'Delete the output file before re-generating'
-
-output_df = pd.DataFrame(data=output_rows, columns=[
-    'dataset_name', 'query', 'source', 'taxonomy_level',
-    'scientific_name', 'common_name', 'taxonomy_string'])
-output_df.to_csv(output_file, index=None, header=True)
-
-
-#%% Manual lookup
-
-if False:
-
-    #%%
-
-    # q = 'white-throated monkey'
-    # q = 'cingulata'
-    # q = 'notamacropus'
-    q = 'porzana'
-    taxonomy_preference = 'inat'
-    m = get_preferred_taxonomic_match(q,taxonomy_preference)
-    # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
-
-    if m is None:
-        print('No match')
-    else:
-        if m.source != taxonomy_preference:
-            print('\n*** non-preferred match ***\n')
-            # raise ValueError('')
-        print(m.source)
-        print(m.taxonomy_string)
-        # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
-        import clipboard; clipboard.copy(m.taxonomy_string)
+"""
+
+map_new_lila_datasets.py
+
+Given a subset of LILA datasets, find all the categories, and start the taxonomy
+mapping process.
+
+"""
+
+#%% Constants and imports
+
+import os
+import json
+
+# Created by get_lila_category_list.py
+input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
+
+output_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')
+
+datasets_to_map = [
+    'Trail Camera Images of New Zealand Animals'
+    ]
+
+
+#%% Initialize taxonomic lookup
+
+from taxonomy_mapping.species_lookup import (
+    initialize_taxonomy_lookup,
+    get_preferred_taxonomic_match)
+
+# from taxonomy_mapping.species_lookup import (
+#     get_taxonomic_info, print_taxonomy_matche)
+
+initialize_taxonomy_lookup(force_init=False)
+
+
+#%% Read the list of datasets
+
+with open(input_lila_category_list_file,'r') as f:
+    input_lila_categories = json.load(f)
+
+lila_datasets = set()
+
+for dataset_name in input_lila_categories.keys():
+    # The script that generates this dictionary creates a separate entry for bounding box
+    # metadata files, but those don't represent new dataset names
+    lila_datasets.add(dataset_name.replace('_bbox',''))
+
+for s in datasets_to_map:
+    assert s in lila_datasets
+
+
+#%% Find all categories
+
+category_mappings = []
+
+# dataset_name = datasets_to_map[0]
+for dataset_name in datasets_to_map:
+
+    ds_categories = input_lila_categories[dataset_name]
+    for category in ds_categories:
+        category_name = category['name']
+        assert ':' not in category_name
+        mapping_name = dataset_name + ':' + category_name
+        category_mappings.append(mapping_name)
+
+print('Need to create {} mappings'.format(len(category_mappings)))
+
+
+#%% Match every query against our taxonomies
+
+output_rows = []
+
+taxonomy_preference = 'inat'
+
+allow_non_preferred_matches = True
+
+# mapping_string = category_mappings[1]; print(mapping_string)
+for mapping_string in category_mappings:
+
+    tokens = mapping_string.split(':')
+    assert len(tokens) == 2
+
+    dataset_name = tokens[0]
+    query = tokens[1]
+
+    taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)
+
+    if (taxonomic_match.source == taxonomy_preference) or allow_non_preferred_matches:
+
+        output_row = {
+            'dataset_name': dataset_name,
+            'query': query,
+            'source': taxonomic_match.source,
+            'taxonomy_level': taxonomic_match.taxonomic_level,
+            'scientific_name': taxonomic_match.scientific_name,
+            'common_name': taxonomic_match.common_name,
+            'taxonomy_string': taxonomic_match.taxonomy_string
+        }
+
+    else:
+
+        output_row = {
+            'dataset_name': dataset_name,
+            'query': query,
+            'source': '',
+            'taxonomy_level': '',
+            'scientific_name': '',
+            'common_name': '',
+            'taxonomy_string': ''
+        }
+
+    output_rows.append(output_row)
+
+# ...for each mapping
+
+
+#%% Write output rows
+
+import os
+import pandas as pd
+
+assert not os.path.isfile(output_file), 'Delete the output file before re-generating'
+
+output_df = pd.DataFrame(data=output_rows, columns=[
+    'dataset_name', 'query', 'source', 'taxonomy_level',
+    'scientific_name', 'common_name', 'taxonomy_string'])
+output_df.to_csv(output_file, index=None, header=True)
+
+
+#%% Manual lookup
+
+if False:
+
+    #%%
+
+    # q = 'white-throated monkey'
+    # q = 'cingulata'
+    # q = 'notamacropus'
+    q = 'porzana'
+    taxonomy_preference = 'inat'
+    m = get_preferred_taxonomic_match(q,taxonomy_preference)
+    # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+
+    if m is None:
+        print('No match')
+    else:
+        if m.source != taxonomy_preference:
+            print('\n*** non-preferred match ***\n')
+            # raise ValueError('')
+        print(m.source)
+        print(m.taxonomy_string)
+        # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+        import clipboard; clipboard.copy(m.taxonomy_string)
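
Most of the file-level churn in this release follows the pattern visible above: the old '####'-delimited comment headers become module docstrings. Unlike a comment block, a docstring survives parsing as the module's __doc__, where help(), pydoc, and the Sphinx configuration added in this release (docs/source/conf.py) can find it. A minimal sketch of the difference; the header strings below are illustrative, not taken from the package:

import ast

old_header = (
    '########\n'
    '#\n'
    '# my_script.py\n'
    '#\n'
    '########\n'
    '\n'
    'x = 1\n')

new_header = (
    '"""\n'
    '\n'
    'my_script.py\n'
    '\n'
    '"""\n'
    '\n'
    'x = 1\n')

# The parser discards comments, but records a leading string literal as the
# module docstring
print(ast.get_docstring(ast.parse(old_header)))   # None
print(ast.get_docstring(ast.parse(new_header)))   # 'my_script.py'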
taxonomy_mapping/prepare_lila_taxonomy_release.py
@@ -1,134 +1,142 @@
-########
-#
-# prepare_lila_taxonomy_release.py
-#
-# Given the private intermediate taxonomy mapping (produced by map_new_lila_datasets.py),
-# prepare the public (release) taxonomy mapping file.
-#
-########
-
-#%% Imports and constants
-
-import os
-import json
-import pandas as pd
-
-lila_taxonomy_file = 'c:/git/agentmorrisprivate/lila-taxonomy/lila-taxonomy-mapping.csv'
-release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
-# import clipboard; clipboard.copy(release_taxonomy_file)
-
-# Created by get_lila_category_list.py... contains counts for each category
-lila_dataset_to_categories_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
-
-assert os.path.isfile(lila_dataset_to_categories_file)
-assert os.path.isfile(lila_taxonomy_file)
-
-
-#%% Find out which categories are actually used
-
-df = pd.read_csv(lila_taxonomy_file)
-
-with open(lila_dataset_to_categories_file,'r') as f:
-    lila_dataset_to_categories = json.load(f)
-
-used_category_mappings = []
-
-# dataset_name = datasets_to_map[0]
-for dataset_name in lila_dataset_to_categories.keys():
-
-    ds_categories = lila_dataset_to_categories[dataset_name]
-    for category in ds_categories:
-        category_name = category['name'].lower()
-        assert ':' not in category_name
-        mapping_name = dataset_name + ':' + category_name
-        used_category_mappings.append(mapping_name)
-
-df['used'] = False
-
-# i_row = 0; row = df.iloc[i_row]; row
-for i_row,row in df.iterrows():
-    ds_name = row['dataset_name']
-    query = row['query']
-    mapping_name = ds_name + ':' + query
-    if mapping_name in used_category_mappings:
-        df.loc[i_row,'used'] = True
-    else:
-        print('Dropping unused mapping {}'.format(mapping_name))
-
-df = df[df.used]
-df = df.drop('used',axis=1)
-
-
-#%% Generate the final output file
-
-assert not os.path.isfile(release_taxonomy_file)
-
-known_levels = ['stateofmatter',
-                'kingdom',
-                'phylum','subphylum',
-                'superclass','class','subclass','infraclass',
-                'superorder','order','parvorder','suborder','infraorder',
-                'zoosection',
-                'superfamily','family','subfamily','tribe',
-                'genus',
-                'species','subspecies','variety']
-
-levels_to_include = ['kingdom',
-                     'phylum','subphylum',
-                     'superclass','class','subclass','infraclass',
-                     'superorder','order','suborder','infraorder',
-                     'superfamily','family','subfamily','tribe',
-                     'genus',
-                     'species','subspecies','variety']
-
-levels_to_exclude = ['stateofmatter','zoosection','parvorder']
-
-for s in levels_to_exclude:
-    assert s not in levels_to_include
-
-levels_used = set()
-
-# i_row = 0; row = df.iloc[i_row]; row
-for i_row,row in df.iterrows():
-
-    if not isinstance(row['scientific_name'],str):
-        assert not isinstance(row['taxonomy_string'],str)
-        continue
-
-    taxonomic_match = eval(row['taxonomy_string'])
-
-    # match_at_level = taxonomic_match[0]
-    for match_at_level in taxonomic_match:
-        assert len(match_at_level) == 4
-        levels_used.add(match_at_level[1])
-
-levels_used = [s for s in levels_used if isinstance(s,str)]
-
-for s in levels_used:
-    assert s in levels_to_exclude or s in levels_to_include, 'Unrecognized level {}'.format(s)
-
-for s in levels_to_include:
-    assert s in levels_used
-
-for s in levels_to_include:
-    df[s] = ''
-
-# i_row = 0; row = df.iloc[i_row]; row
-for i_row,row in df.iterrows():
-
-    if not isinstance(row['scientific_name'],str):
-        assert not isinstance(row['taxonomy_string'],str)
-        continue
-
-    # E.g.: (43117, 'genus', 'lepus', ['hares and jackrabbits']
-    taxonomic_match = eval(row['taxonomy_string'])
-
-    for match_at_level in taxonomic_match:
-        level = match_at_level[1]
-        if level in levels_to_include:
-            df.loc[i_row,level] = match_at_level[2]
-
-df = df.drop('source',axis=1)
-df.to_csv(release_taxonomy_file,header=True,index=False)
-
-print('Wrote final output to {}'.format(release_taxonomy_file))
+"""
+
+prepare_lila_taxonomy_release.py
+
+Given the private intermediate taxonomy mapping (produced by map_new_lila_datasets.py),
+prepare the public (release) taxonomy mapping file.
+
+"""
+
+#%% Imports and constants
+
+import os
+import json
+import pandas as pd
+
+
+#%% Prevent execution during infrastructural imports
+
+if False:
+
+    #%% Filenames
+
+    lila_taxonomy_file = 'c:/git/agentmorrisprivate/lila-taxonomy/lila-taxonomy-mapping.csv'
+    release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
+    # import clipboard; clipboard.copy(release_taxonomy_file)
+
+    # Created by get_lila_category_list.py... contains counts for each category
+    lila_dataset_to_categories_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
+
+    assert os.path.isfile(lila_dataset_to_categories_file)
+    assert os.path.isfile(lila_taxonomy_file)
+
+
+    #%% Find out which categories are actually used
+
+    df = pd.read_csv(lila_taxonomy_file)
+
+    with open(lila_dataset_to_categories_file,'r') as f:
+        lila_dataset_to_categories = json.load(f)
+
+    used_category_mappings = []
+
+    # dataset_name = datasets_to_map[0]
+    for dataset_name in lila_dataset_to_categories.keys():
+
+        ds_categories = lila_dataset_to_categories[dataset_name]
+        for category in ds_categories:
+            category_name = category['name'].lower()
+            assert ':' not in category_name
+            mapping_name = dataset_name + ':' + category_name
+            used_category_mappings.append(mapping_name)
+
+    df['used'] = False
+
+    # i_row = 0; row = df.iloc[i_row]; row
+    for i_row,row in df.iterrows():
+        ds_name = row['dataset_name']
+        query = row['query']
+        mapping_name = ds_name + ':' + query
+        if mapping_name in used_category_mappings:
+            df.loc[i_row,'used'] = True
+        else:
+            print('Dropping unused mapping {}'.format(mapping_name))
+
+    df = df[df.used]
+    df = df.drop('used',axis=1)
+
+
+    #%% Generate the final output file
+
+    assert not os.path.isfile(release_taxonomy_file)
+
+    known_levels = ['stateofmatter', #noqa
+                    'kingdom',
+                    'phylum','subphylum',
+                    'superclass','class','subclass','infraclass',
+                    'superorder','order','parvorder','suborder','infraorder',
+                    'zoosection',
+                    'superfamily','family','subfamily','tribe',
+                    'genus',
+                    'species','subspecies','variety']
+
+    levels_to_include = ['kingdom',
+                         'phylum','subphylum',
+                         'superclass','class','subclass','infraclass',
+                         'superorder','order','suborder','infraorder',
+                         'superfamily','family','subfamily','tribe',
+                         'genus',
+                         'species','subspecies','variety']
+
+    levels_to_exclude = ['stateofmatter','zoosection','parvorder']
+
+    for s in levels_to_exclude:
+        assert s not in levels_to_include
+
+    levels_used = set()
+
+    # i_row = 0; row = df.iloc[i_row]; row
+    for i_row,row in df.iterrows():
+
+        if not isinstance(row['scientific_name'],str):
+            assert not isinstance(row['taxonomy_string'],str)
+            continue
+
+        taxonomic_match = eval(row['taxonomy_string'])
+
+        # match_at_level = taxonomic_match[0]
+        for match_at_level in taxonomic_match:
+            assert len(match_at_level) == 4
+            levels_used.add(match_at_level[1])
+
+    levels_used = [s for s in levels_used if isinstance(s,str)]
+
+    for s in levels_used:
+        assert s in levels_to_exclude or s in levels_to_include, 'Unrecognized level {}'.format(s)
+
+    for s in levels_to_include:
+        assert s in levels_used
+
+    for s in levels_to_include:
+        df[s] = ''
+
+    # i_row = 0; row = df.iloc[i_row]; row
+    for i_row,row in df.iterrows():
+
+        if not isinstance(row['scientific_name'],str):
+            assert not isinstance(row['taxonomy_string'],str)
+            continue
+
+        # E.g.: (43117, 'genus', 'lepus', ['hares and jackrabbits']
+        taxonomic_match = eval(row['taxonomy_string'])
+
+        for match_at_level in taxonomic_match:
+            level = match_at_level[1]
+            if level in levels_to_include:
+                df.loc[i_row,level] = match_at_level[2]
+
+    df = df.drop('source',axis=1)
+    df.to_csv(release_taxonomy_file,header=True,index=False)
+
+    print('Wrote final output to {}'.format(release_taxonomy_file))
+
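
The other substantive change in prepare_lila_taxonomy_release.py is the new '#%% Prevent execution during infrastructural imports' cell: everything below the imports now lives under 'if False:'. These scripts are written as '#%%'-delimited cells meant to be run interactively, one cell at a time, so the guard ensures that merely importing the module (for example, while packaging the wheel or building the Sphinx docs that are new in this release) executes only the imports, never the filename asserts or the CSV processing. A minimal sketch of the pattern, with hypothetical file and path names:

"""

demo_script.py

A cell-based script that should do nothing when imported.

"""

#%% Imports

import os


#%% Prevent execution during infrastructural imports

if False:

    #%% Filenames

    # Hypothetical input path; edit before running this cell
    input_file = os.path.expanduser('~/data/input.csv')
    assert os.path.isfile(input_file)


    #%% Do the work

    print('Processing {}'.format(input_file))

# 'import demo_script' runs only 'import os'; the guarded cells are executed
# by hand (e.g., as a cell or selection in an IDE), which bypasses the
# 'if False:' guard.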