megadetector 5.0.11-py3-none-any.whl → 5.0.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (203)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +97 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +149 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +88 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +125 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +263 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +607 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +237 -0
  58. megadetector/data_management/cct_json_utils.py +404 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +283 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +493 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +793 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +870 -0
  129. megadetector/data_management/read_exif.py +809 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/rename_images.py +187 -0
  133. megadetector/data_management/resize_coco_dataset.py +189 -0
  134. megadetector/data_management/wi_download_csv_to_coco.py +247 -0
  135. megadetector/data_management/yolo_output_to_md_output.py +446 -0
  136. megadetector/data_management/yolo_to_coco.py +676 -0
  137. megadetector/detection/__init__.py +0 -0
  138. megadetector/detection/detector_training/__init__.py +0 -0
  139. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  140. megadetector/detection/process_video.py +846 -0
  141. megadetector/detection/pytorch_detector.py +355 -0
  142. megadetector/detection/run_detector.py +779 -0
  143. megadetector/detection/run_detector_batch.py +1219 -0
  144. megadetector/detection/run_inference_with_yolov5_val.py +1087 -0
  145. megadetector/detection/run_tiled_inference.py +934 -0
  146. megadetector/detection/tf_detector.py +192 -0
  147. megadetector/detection/video_utils.py +698 -0
  148. megadetector/postprocessing/__init__.py +0 -0
  149. megadetector/postprocessing/add_max_conf.py +64 -0
  150. megadetector/postprocessing/categorize_detections_by_size.py +165 -0
  151. megadetector/postprocessing/classification_postprocessing.py +716 -0
  152. megadetector/postprocessing/combine_api_outputs.py +249 -0
  153. megadetector/postprocessing/compare_batch_results.py +966 -0
  154. megadetector/postprocessing/convert_output_format.py +396 -0
  155. megadetector/postprocessing/load_api_results.py +195 -0
  156. megadetector/postprocessing/md_to_coco.py +310 -0
  157. megadetector/postprocessing/md_to_labelme.py +330 -0
  158. megadetector/postprocessing/merge_detections.py +412 -0
  159. megadetector/postprocessing/postprocess_batch_results.py +1908 -0
  160. megadetector/postprocessing/remap_detection_categories.py +170 -0
  161. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  162. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  163. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  164. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1635 -0
  165. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  166. megadetector/postprocessing/subset_json_detector_output.py +700 -0
  167. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  168. megadetector/taxonomy_mapping/__init__.py +0 -0
  169. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  170. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  171. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  172. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +588 -0
  173. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  174. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  175. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  176. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  177. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  178. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  179. megadetector/utils/__init__.py +0 -0
  180. megadetector/utils/azure_utils.py +178 -0
  181. megadetector/utils/ct_utils.py +613 -0
  182. megadetector/utils/directory_listing.py +246 -0
  183. megadetector/utils/md_tests.py +1164 -0
  184. megadetector/utils/path_utils.py +1045 -0
  185. megadetector/utils/process_utils.py +160 -0
  186. megadetector/utils/sas_blob_utils.py +509 -0
  187. megadetector/utils/split_locations_into_train_val.py +228 -0
  188. megadetector/utils/string_utils.py +92 -0
  189. megadetector/utils/url_utils.py +323 -0
  190. megadetector/utils/write_html_image_list.py +225 -0
  191. megadetector/visualization/__init__.py +0 -0
  192. megadetector/visualization/plot_utils.py +293 -0
  193. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  194. megadetector/visualization/visualization_utils.py +1536 -0
  195. megadetector/visualization/visualize_db.py +552 -0
  196. megadetector/visualization/visualize_detector_output.py +405 -0
  197. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/LICENSE +0 -0
  198. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/METADATA +2 -2
  199. megadetector-5.0.13.dist-info/RECORD +201 -0
  200. megadetector-5.0.13.dist-info/top_level.txt +1 -0
  201. megadetector-5.0.11.dist-info/RECORD +0 -5
  202. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  203. {megadetector-5.0.11.dist-info → megadetector-5.0.13.dist-info}/WHEEL +0 -0
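The table above lists per-file line deltas only. Since a wheel is just a zip archive, a rough file-level comparison of the two releases can be reproduced locally; the sketch below is not part of the package and assumes both .whl files have already been downloaded (for example with pip download megadetector==5.0.11 --no-deps, and the same for 5.0.13) under their standard wheel filenames.

import zipfile

def wheel_file_sizes(path):
    # A wheel is a zip archive; map each member name to its uncompressed size
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.file_size for info in zf.infolist()}

old_files = wheel_file_sizes('megadetector-5.0.11-py3-none-any.whl')
new_files = wheel_file_sizes('megadetector-5.0.13-py3-none-any.whl')

for name in sorted(set(old_files) | set(new_files)):
    if name not in old_files:
        print('added   {}'.format(name))
    elif name not in new_files:
        print('removed {}'.format(name))
    elif old_files[name] != new_files[name]:
        print('changed {}'.format(name))

This only flags added, removed, and size-changed members; the per-file line counts in the table come from the registry's own diff of each member's contents.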
megadetector/data_management/importers/snapshot_serengeti_lila.py
@@ -0,0 +1,1067 @@
1
+ """
2
+
3
+ snapshot_serengeti_lila.py
4
+
5
+ Create zipfiles of Snapshot Serengeti S1-S11.
6
+
7
+ Create a combined metadata file for the public seasons (S1-S10), plus a
8
+ combined S1-S11 file and per-season metadata files. At the time this code was written, S11 was under embargo.
9
+
10
+ Create zip archives of each season without humans.
11
+
12
+ Create a human zip archive.
13
+
14
+ """
15
+
16
+ #%% Constants and imports
17
+
18
+ import pandas as pd
19
+ import json
20
+ import os
21
+ import uuid
22
+ import pickle
23
+ import humanfriendly
24
+ import time
25
+ import pprint
26
+ import numpy as np
27
+ import re
28
+ import glob
29
+
30
+ from PIL import Image
31
+ from multiprocessing.pool import ThreadPool
32
+ from tqdm import tqdm
33
+ from zipfile import ZipFile
34
+ import zipfile
35
+
36
+ from megadetector.utils import path_utils
37
+
38
+ metadata_base = r'e:\snapshot-serengeti\MetaData\SER'
39
+ image_base = r'e:\snapshot-serengeti\SER'
40
+
41
+ bbox_file = r'e:\snapshot_serengeti_bboxes_20190409.json'
42
+ old_json_file = r'e:\SnapshotSerengeti.json'
43
+
44
+ temp_base = r'e:\snapshot_temp'
45
+ output_base = r'e:\snapshot_out'
46
+ output_zip_base = r'e:\snapshot_out'
47
+
48
+ os.makedirs(temp_base,exist_ok=True)
49
+ os.makedirs(output_base,exist_ok=True)
50
+
51
+ # assert(os.path.isdir(metadata_base))
52
+ assert(os.path.isdir(image_base))
53
+ assert(os.path.isfile(bbox_file))
54
+ assert(os.path.isfile(old_json_file))
55
+
56
+ nSeasons = 11
57
+ non_public_seasons = set()
58
+
59
+ initial_dictionary_cache_file = os.path.join(temp_base,'initial_dictionaries.p')
60
+ resized_images_dictionary_cache_file = os.path.join(temp_base,'resized_image_dictionaries.p')
61
+ revised_annotations_dictionary_cache_file = os.path.join(temp_base,'revised_annotations.p')
62
+ final_dictionary_cache_file = os.path.join(temp_base,'final_annotations.p')
63
+
64
+ # There are two redundant categories, and we re-map "blank" to "empty" as per CCT convention
65
+ category_mappings = {'blank':'empty','birdother':'otherbird','vervetmonkey':'monkeyvervet'}
66
+
67
+ process_images_n_threads = 20
68
+
69
+
70
+ #%% Load metadata files, concatenate into a single table
71
+
72
+ per_season_image_tables = []
73
+ per_season_annotation_tables = []
74
+
75
+ print('Reading tables...')
76
+
77
+ # iSeason = 1
78
+ for iSeason in tqdm(range(1,nSeasons+1)):
79
+ image_table_fn = os.path.join(metadata_base,'SER_S' + str(iSeason) + '_report_lila_image_inventory.csv')
80
+ annotation_table_fn = os.path.join(metadata_base,'SER_S' + str(iSeason) + '_report_lila.csv')
81
+ assert os.path.isfile(image_table_fn) and os.path.isfile(annotation_table_fn)
82
+ image_table = pd.read_csv(image_table_fn)
83
+ annotation_table = pd.read_csv(annotation_table_fn)
84
+ per_season_image_tables.append(image_table)
85
+ per_season_annotation_tables.append(annotation_table)
86
+
87
+ print('Finished reading tables, concatenating...')
88
+
89
+ image_table = pd.concat(per_season_image_tables)
90
+ annotation_table = pd.concat(per_season_annotation_tables)
91
+
92
+ print('Finished concatenating {} images and {} annotations'.format(len(image_table),len(annotation_table)))
93
+
94
+
95
+ #%% Convert to dictionaries (prep)
96
+
97
+ im_id_to_image = {}
98
+ images = []
99
+ seq_id_to_images = {}
100
+ seq_id_to_annotations = {}
101
+
102
+ annotations = []
103
+ categories = []
104
+
105
+ species_to_category = {}
106
+
107
+ empty_category_id = 0
108
+ empty_category_name = 'empty'
109
+
110
+ empty_cat = {}
111
+ empty_cat['id'] = empty_category_id
112
+ empty_cat['name'] = empty_category_name
113
+ empty_cat['count'] = 0
114
+ species_to_category['empty'] = empty_cat
115
+ categories.append(empty_cat)
116
+
117
+ next_category_id = empty_category_id + 1
118
+
119
+
120
+ #%% Convert to dictionaries (loops)
121
+
122
+ # iterrows() is a terrible way to do this, but this is one of those days
123
+ # where I want to get this done, not get better at Python.
124
+
125
+ print('Processing image table')
126
+
127
+ start_time = time.time()
128
+
129
+ # irow = 0; row = image_table.iloc[0]
130
+ for iRow,row in tqdm(image_table.iterrows()):
131
+
132
+ # Loaded as an int64, converting to int here
133
+ frame_num = int(row['image_rank_in_capture'])
134
+ assert frame_num > 0
135
+ sequence_id = row['capture_id']
136
+ frame_num = int(frame_num)
137
+ filename = row['image_path_rel']
138
+ tokens = filename.split('.')
139
+ assert(len(tokens)==2)
140
+ assert(tokens[1] == 'JPG')
141
+ id = tokens[0]
142
+
143
+ im = {}
144
+ im['id'] = id
145
+ im['file_name'] = filename
146
+ im['frame_num'] = frame_num
147
+ im['seq_id'] = sequence_id
148
+
149
+ assert id not in im_id_to_image
150
+ im_id_to_image[id] = im
151
+ seq_id_to_images.setdefault(sequence_id,[]).append(im)
152
+
153
+ images.append(im)
154
+
155
+ # ...for each row in the image table
156
+
157
+ # Make sure image IDs are what we think they are
158
+ for im in tqdm(images):
159
+ assert im['id'] == im['file_name'].replace('.JPG','')
160
+
161
+ print('Processing annotation table')
162
+
163
+ for iRow,row in tqdm(annotation_table.iterrows()):
164
+
165
+ sequence_id = row['capture_id']
166
+
167
+ species = row['question__species'].lower()
168
+ if species in category_mappings:
169
+ species = category_mappings[species]
170
+
171
+ category = None
172
+
173
+ if species not in species_to_category:
174
+ category = {}
175
+ category['id'] = next_category_id
176
+ next_category_id = next_category_id + 1
177
+ category['name'] = species
178
+ category['count'] = 1
179
+ categories.append(category)
180
+ species_to_category[species] = category
181
+ else:
182
+ category = species_to_category[species]
183
+ category['count'] += 1
184
+
185
+ ann = {}
186
+ ann['sequence_level_annotation'] = True
187
+ ann['id'] = str(uuid.uuid1())
188
+ ann['category_id'] = category['id']
189
+ ann['seq_id'] = sequence_id
190
+
191
+ ann['season'] = row['season']
192
+ ann['site'] = row['site']
193
+ ann['datetime'] = row['capture_date_local'] + ' ' + row['capture_time_local']
194
+ ann['subject_id'] = row['subject_id']
195
+ ann['count'] = row['question__count_median']
196
+ ann['standing'] = row['question__standing']
197
+ ann['resting'] = row['question__resting']
198
+ ann['moving'] = row['question__moving']
199
+ ann['interacting'] = row['question__interacting']
200
+ ann['young_present'] = row['question__young_present']
201
+
202
+ seq_id_to_annotations.setdefault(sequence_id,[]).append(ann)
203
+
204
+ annotations.append(ann)
205
+
206
+ # ...for each row in the annotation table
207
+
208
+ elapsed = time.time() - start_time
209
+ print('Done converting tables to dictionaries in {}'.format(humanfriendly.format_timespan(elapsed)))
210
+
211
+ print('Saving dictionaries to {}'.format(initial_dictionary_cache_file))
212
+ cct_cache = [im_id_to_image, images, seq_id_to_images,
213
+ seq_id_to_annotations, annotations, categories, species_to_category]
214
+ with open(initial_dictionary_cache_file, 'wb') as f:
215
+ pickle.dump(cct_cache, f, protocol=pickle.HIGHEST_PROTOCOL)
216
+
217
+
218
+ #%% Load previously-saved dictionaries when re-starting mid-script
219
+
220
+ if False:
221
+
222
+ #%%
223
+
224
+ print('Loading dictionaries from {}'.format(initial_dictionary_cache_file))
225
+ with open(initial_dictionary_cache_file, 'rb') as f:
226
+ cct_cache = pickle.load(f)
227
+ im_id_to_image,images,seq_id_to_images, \
228
+ seq_id_to_annotations,annotations,categories,species_to_category = cct_cache
229
+
230
+
231
+ #%% Take a look at categories
232
+
233
+ if False:
234
+
235
+ #%%
236
+
237
+ assert(len(im_id_to_image)==len(images))
238
+ print('Loaded metadata about {} images and {} sequences'.format(len(images),len(seq_id_to_annotations)))
239
+
240
+ categories_by_species = sorted(categories, key = lambda i: i['name'])
241
+ categories_by_count = sorted(categories, key = lambda i: i['count'])
242
+
243
+ pp = pprint.PrettyPrinter(depth=6)
244
+ pp.pprint(categories_by_species)
245
+ pp.pprint(categories_by_count)
246
+
247
+
248
+ #%% Fill in some image fields we didn't have when we created the image table
249
+
250
+ # width, height, corrupt, seq_num_frames, location, datetime
251
+
252
+ def process_image(im):
253
+
254
+ im['width'] = -1
255
+ im['height'] = -1
256
+ im['corrupt'] = False
257
+ im['location'] = 'unknown'
258
+ im['seq_num_frames'] = -1
259
+ im['datetime'] = 'unknown'
260
+ im['status'] = ''
261
+
262
+ if im['seq_id'] not in seq_id_to_annotations:
263
+ im['status'] = 'no_annotation'
264
+ return im
265
+
266
+ seq_annotations = seq_id_to_annotations[im['seq_id']]
267
+
268
+ # Every annotation in this list should have the same sequence ID
269
+ assert all(ann['seq_id'] == im['seq_id'] for ann in seq_annotations) , 'Error on image {}'.format(im['id'])
270
+
271
+ # Figure out "seq_num_frames", which really should be done in a separate lopp;
272
+ # there's no reason to do this redundantly for every image
273
+ images_in_sequence = seq_id_to_images[im['seq_id']]
274
+
275
+ # Every image in this sequence should point back to the same equence
276
+ assert all(seqim['seq_id'] == im['seq_id'] for seqim in images_in_sequence), 'Error on image {}'.format(im['id'])
277
+
278
+ frame_nums = [seqim['frame_num'] for seqim in images_in_sequence]
279
+ seq_num_frames = max(frame_nums)
280
+ im['seq_num_frames'] = seq_num_frames
281
+
282
+ im['location'] = seq_annotations[0]['site']
283
+
284
+ # Every annotation in this list should have the same location
285
+ assert all(ann['site'] == im['location'] for ann in seq_annotations), 'Error on image {}'.format(im['id'])
286
+
287
+ im['datetime'] = seq_annotations[0]['datetime']
288
+
289
+ # Every annotation in this list should have the same datetime
290
+ assert all(ann['datetime'] == im['datetime'] for ann in seq_annotations), 'Error on image {}'.format(im['id'])
291
+
292
+ # Is this image on disk?
293
+ fullpath = os.path.join(image_base,im['file_name'])
294
+ if not os.path.isfile(fullpath):
295
+ im['status'] = 'not_on_disk'
296
+ return im
297
+
298
+ try:
299
+
300
+ pil_im = Image.open(fullpath)
301
+ im['height'] = pil_im.height
302
+ im['width'] = pil_im.width
303
+
304
+ except:
305
+
306
+ im['corrupt'] = True
307
+
308
+ return im
309
+
310
+
311
+ if process_images_n_threads <= 1:
312
+
313
+ # iImage = 0; im = images[0]
314
+ for iImage,im in tqdm(enumerate(images),total=len(images)):
315
+ process_image(im)
316
+ # ...for each image
317
+
318
+ else:
319
+
320
+ pool = ThreadPool(process_images_n_threads)
321
+
322
+ # images_processed = pool.map(process_image, images)
323
+ # images_processed = list(tqdm(pool.imap_unordered(process_image, images), total=len(images)))
324
+ images_processed = list(tqdm(pool.imap(process_image, images), total=len(images)))
325
+
326
+ print('Saving size-checked dictionaries to {}'.format(resized_images_dictionary_cache_file))
327
+ cct_cache = [im_id_to_image, images, seq_id_to_images,
328
+ seq_id_to_annotations, annotations, categories, species_to_category]
329
+ with open(resized_images_dictionary_cache_file, 'wb') as f:
330
+ pickle.dump(cct_cache, f, protocol=pickle.HIGHEST_PROTOCOL)
331
+
332
+
333
+ if False:
334
+
335
+ #%%
336
+
337
+ print('Loading dictionaries with size information from {}'.format(resized_images_dictionary_cache_file))
338
+ with open(resized_images_dictionary_cache_file, 'rb') as f:
339
+ cct_cache = pickle.load(f)
340
+ im_id_to_image,images,seq_id_to_images, \
341
+ seq_id_to_annotations,annotations,categories,species_to_category = cct_cache
342
+
343
+
344
+ #%% Count missing/corrupted images
345
+
346
+ n_missing = 0
347
+ n_corrupt = 0
348
+ n_no_annotation = 0
349
+
350
+ for im in tqdm(images):
351
+
352
+ if im['corrupt']:
353
+ n_corrupt += 1
354
+
355
+ if im['status'] == '':
356
+ continue
357
+ elif im['status'] == 'not_on_disk':
358
+ n_missing += 1
359
+ elif im['status'] == 'no_annotation':
360
+ n_no_annotation += 1
361
+ else:
362
+ raise ValueError('Unrecognized status {}'.format(im['status']))
363
+
364
+ print('\nOf {} images: {} missing, {} corrupt, {} no annotation'.format(len(images),
365
+ n_missing, n_corrupt, n_no_annotation))
366
+
367
+
368
+ #%% Print distribution of sequence lengths
369
+
370
+ seq_id_to_sequence_length = {}
371
+
372
+ for im in tqdm(images):
373
+
374
+ seq_id = im['seq_id']
375
+ seq_num_frames = im['seq_num_frames']
376
+ if seq_id not in seq_id_to_sequence_length:
377
+ seq_id_to_sequence_length[seq_id] = seq_num_frames
378
+
379
+ sequence_lengths = list(seq_id_to_sequence_length.values())
380
+
381
+ print('\nMean/min/max sequence length is {}/{}/{}'.format(np.mean(sequence_lengths),min(sequence_lengths),max(sequence_lengths)))
382
+
383
+
384
+ #%% Replicate annotations across images
385
+
386
+ annotations_replicated = []
387
+
388
+ # iAnn = 0; ann = annotations[iAnn]
389
+ for iAnn,ann in tqdm(enumerate(annotations), total=len(annotations)):
390
+
391
+ associated_images = seq_id_to_images[ann['seq_id']]
392
+ assert len(associated_images) > 0
393
+ for associated_image in associated_images:
394
+ new_ann = ann.copy()
395
+ new_ann['image_id'] = associated_image['id']
396
+ new_ann['id'] = str(uuid.uuid1())
397
+ annotations_replicated.append(new_ann)
398
+
399
+ print('\nCreated {} replicated annotations from {} original annotations'.format(len(annotations_replicated),
400
+ len(annotations)))
401
+
402
+ annotations = annotations_replicated
403
+
404
+
405
+ #%% See what files are on disk but not annotated (~15 mins)
406
+
407
+ print('Listing images from disk...')
408
+ start_time = time.time()
409
+ all_files = path_utils.recursive_file_list(image_base)
410
+ elapsed = time.time() - start_time
411
+ print('Finished listing {} files in {}'.format(len(all_files),humanfriendly.format_timespan(elapsed)))
412
+
413
+ files_not_in_db = []
414
+
415
+ for fn in tqdm(all_files):
416
+ id = os.path.relpath(fn,image_base).replace('\\','/').replace('.JPG','')
417
+ if id not in im_id_to_image:
418
+ files_not_in_db.append(fn)
419
+
420
+ print('{} files not in the database (of {})'.format(len(files_not_in_db),len(all_files)))
421
+
422
+ # 247370 files not in the database (of 7425810)
423
+
424
+
425
+ #%% Load old image database
426
+
427
+ print('Loading old .json file...',end='')
428
+ with open(old_json_file,'r') as f:
429
+ cct_old = json.load(f)
430
+ print('done')
431
+
432
+
433
+ #%% Look for old images not in the new DB and vice-versa
434
+
435
+ # At the time this was written, "old" was S1-S6
436
+
437
+ # old_im = cct_old['images'][0]
438
+ old_im_id_to_im = {}
439
+ old_images_not_in_new_db = []
440
+ new_images_not_in_old_db = []
441
+ size_mismatches = []
442
+ for old_im in tqdm(cct_old['images']):
443
+ old_im_id_to_im[old_im['id']] = old_im
444
+ if old_im['id'] not in im_id_to_image:
445
+ old_images_not_in_new_db.append(old_im)
446
+ else:
447
+ new_im = im_id_to_image[old_im['id']]
448
+ if (old_im['width'] != new_im['width']) or (old_im['height'] != new_im['height']):
449
+ size_mismatches.append(old_im)
450
+
451
+ # new_im = images[0]
452
+ for new_im in tqdm(images):
453
+ new_id = new_im['id']
454
+ if new_id.startswith('SER_S11'):
455
+ continue
456
+ m = re.search(r'^S(\d+)/',new_id)
457
+ if m is None:
458
+ print('Failed to match season number in {}'.format(new_id))
459
+ continue
460
+ season = int(m.group(1))
461
+ if season > 6:
462
+ continue
463
+ if new_id not in old_im_id_to_im:
464
+ new_images_not_in_old_db.append(new_im)
465
+
466
+ print('{} old images not in new db'.format(len(old_images_not_in_new_db)))
467
+ print('{} new images not in old db'.format(len(new_images_not_in_old_db)))
468
+ print('{} size mismatches'.format(len(size_mismatches)))
469
+
470
+ # 4 old images not in new db
471
+ # 12 new images not in old db
472
+
473
+
474
+ #%% Save our work
475
+
476
+ print('Saving revised-annotation dictionaries to {}'.format(revised_annotations_dictionary_cache_file))
477
+ cct_cache = [im_id_to_image, images, seq_id_to_images,
478
+ seq_id_to_annotations, annotations, categories, species_to_category, all_files]
479
+ with open(revised_annotations_dictionary_cache_file, 'wb') as f:
480
+ pickle.dump(cct_cache, f, protocol=pickle.HIGHEST_PROTOCOL)
481
+
482
+
483
+ #%% Load our work
484
+
485
+ if False:
486
+
487
+ #%%
488
+ print('Loading dictionaries from {}'.format(revised_annotations_dictionary_cache_file))
489
+ with open(revised_annotations_dictionary_cache_file, 'rb') as f:
490
+ cct_cache = pickle.load(f)
491
+ im_id_to_image, images, seq_id_to_images, \
492
+ seq_id_to_annotations, annotations, categories, species_to_category, all_files = cct_cache
493
+
494
+
495
+ #%% Examine size mismatches
496
+
497
+ # i_mismatch = -1; old_im = size_mismatches[i_mismatch]
498
+ for i_mismatch,old_im in enumerate(size_mismatches):
499
+ new_im = im_id_to_image[old_im['id']]
500
+
501
+ seasons = list(range(1,7))
502
+ mismatches_by_season = []
503
+ for season in seasons:
504
+ season_mismatches = [x for x in size_mismatches if x['id'].startswith('S' + str(season))]
505
+ mismatches_by_season.append(season_mismatches)
506
+
507
+ for iSeason,season_mismatches in enumerate(mismatches_by_season):
508
+ print('Size mismatches in season {}: {}'.format(iSeason+1,len(mismatches_by_season[iSeason])))
509
+
510
+
511
+ #%% Validate image and annotation uniqueness
512
+
513
+ tmp_img_ids = set()
514
+ tmp_ann_ids = set()
515
+
516
+ for im in tqdm(images):
517
+ assert im['id'] not in tmp_img_ids
518
+ tmp_img_ids.add(im['id'])
519
+
520
+ for ann in tqdm(annotations):
521
+ assert ann['id'] not in tmp_ann_ids
522
+ tmp_ann_ids.add(ann['id'])
523
+
524
+
525
+ #%% Split data by seasons, create master list for public seasons
526
+
527
+ annotations_by_season = [[] for i in range(nSeasons)]
528
+ annotations_public = []
529
+
530
+ image_ids_by_season = [set() for i in range(nSeasons)]
531
+ image_ids_public = set()
532
+
533
+ # ann = annotations[0]
534
+ for ann in tqdm(annotations):
535
+ season_id = ann['image_id'].split('/')[0]
536
+ assert season_id is not None and season_id.startswith('S')
537
+ season = int(season_id.replace('SER_','S').replace('S',''))
538
+ assert season >=1 and season <= nSeasons
539
+ i_season = season - 1
540
+ annotations_by_season[i_season].append(ann)
541
+ im = im_id_to_image[ann['image_id']]
542
+ image_ids_by_season[i_season].add(im['id'])
543
+
544
+ if season not in non_public_seasons:
545
+ annotations_public.append(ann)
546
+ image_ids_public.add(im['id'])
547
+
548
+ images_by_season = []
549
+ for id_list in image_ids_by_season:
550
+ season_images = []
551
+ for id in id_list:
552
+ season_images.append(im_id_to_image[id])
553
+ images_by_season.append(season_images)
554
+
555
+ images_public = []
556
+ for id in image_ids_public:
557
+ images_public.append(im_id_to_image[id])
558
+
559
+ for season in range(1,nSeasons+1):
560
+ i_season = season - 1
561
+ print('Season {}: {} images, {} annotations'.format(season,len(images_by_season[i_season]),
562
+ len(annotations_by_season[i_season])))
563
+
564
+ print('Public total: {} images, {} annotations'.format(len(images_public),
565
+ len(annotations_public)))
566
+
567
+
568
+ #%% Minor updates to fields
569
+
570
+ for ann in tqdm(annotations):
571
+ ann['location'] = ann['site']
572
+ del ann['site']
573
+ try:
574
+ icount = ann['count']
575
+ except:
576
+ icount = -1
577
+ ann['count'] = icount
578
+
579
+ for im in tqdm(images):
580
+ del im['status']
581
+
582
+ for c in categories:
583
+ del c['count']
584
+
585
+
586
+ #%% Write master .json out for S1-10, write individual season .jsons (including S11)
587
+
588
+ info = {}
589
+ info['version'] = '2.0'
590
+ info['description'] = 'Camera trap data from the Snapshot Serengeti program'
591
+ info['date_created'] = '2019'
592
+ info['contributor'] = 'University of Minnesota Lion Center'
593
+
594
+ # Loop over all seasons, plus one iteration for the "all public data" iteration, and
595
+ # one for the "all data" iteration
596
+ for season in range(1,nSeasons+3):
597
+ i_season = season - 1
598
+ data = {}
599
+ data['info'] = info.copy()
600
+ data['categories'] = categories
601
+ if i_season == nSeasons + 1:
602
+ data['info']['version'] = '2.1'
603
+ data['info']['description'] = data['info']['description'] + ', seasons 1-11'
604
+ data['images'] = images
605
+ data['annotations'] = annotations
606
+ fn = os.path.join(output_base,'SnapshotSerengeti_v2.1.json')
607
+ elif i_season == nSeasons:
608
+ data['info']['description'] = data['info']['description'] + ', seasons 1-10'
609
+ data['images'] = images_public
610
+ data['annotations'] = annotations_public
611
+ fn = os.path.join(output_base,'SnapshotSerengeti_v2.0.json')
612
+ else:
613
+ data['info']['description'] = data['info']['description'] + ', season {}'.format(season)
614
+ data['images'] = images_by_season[i_season]
615
+ data['annotations'] = annotations_by_season[i_season]
616
+ fn = os.path.join(output_base,'SnapshotSerengetiS{:0>2d}.json'.format(season))
617
+
618
+ print('Writing data for season {} to {}'.format(season,fn))
619
+
620
+ s = json.dumps(data,indent=1)
621
+ with open(fn, "w+") as f:
622
+ f.write(s)
623
+
624
+
625
+ #%% Find categories that only exist in S11
626
+
627
+ # List of categories in each season
628
+ categories_by_season = [set() for i in range(nSeasons)]
629
+
630
+ for ann in tqdm(annotations):
631
+ season_id = ann['image_id'].split('/')[0]
632
+ assert season_id is not None and season_id.startswith('S')
633
+ season = int(season_id.replace('SER_','S').replace('S',''))
634
+ assert season >=1 and season <= nSeasons
635
+ i_season = season - 1
636
+ categories_by_season[i_season].add(ann['category_id'])
637
+
638
+ cat_id_to_cat = {}
639
+ for c in categories:
640
+ cat_id_to_cat[c['id']] = c
641
+
642
+ category_counts_by_season = [len(c) for c in categories_by_season]
643
+ target_season_idx = 10
644
+
645
+ for id in categories_by_season[target_season_idx]:
646
+ b_found = False
647
+ for i_season,season_categories in enumerate(categories_by_season):
648
+ if i_season == target_season_idx:
649
+ continue
650
+ if id in season_categories:
651
+ b_found = True
652
+ break
653
+ if not b_found:
654
+ print('Category {} ({}) only in S{}'.format(id,cat_id_to_cat[id]['name'],target_season_idx+1))
655
+
656
+ for cat in categories:
657
+ if cat['id'] not in categories_by_season[target_season_idx]:
658
+ print('Category {} ({}) not in S{}'.format(cat['id'],cat['name'],target_season_idx+1))
659
+
660
+ # Category 55 (fire) only in S11
661
+ # Category 56 (hyenabrown) only in S11
662
+ # Category 57 (wilddog) only in S11
663
+ # Category 58 (kudu) only in S11
664
+ # Category 59 (pangolin) only in S11
665
+ # Category 60 (lioncub) only in S11
666
+
667
+
668
+ #%% Prepare season-specific .csv files
669
+
670
+ per_season_image_tables = []
671
+ per_season_annotation_tables = []
672
+
673
+ print('Reading tables...')
674
+
675
+ # iSeason = 1
676
+ for iSeason in tqdm(range(1,nSeasons+1)):
677
+ image_table_fn = os.path.join(metadata_base,'SER_S' + str(iSeason) + '_report_lila_image_inventory.csv')
678
+ annotation_table_fn = os.path.join(metadata_base,'SER_S' + str(iSeason) + '_report_lila.csv')
679
+ assert os.path.isfile(image_table_fn) and os.path.isfile(annotation_table_fn)
680
+ image_table = pd.read_csv(image_table_fn)
681
+ annotation_table = pd.read_csv(annotation_table_fn)
682
+ per_season_image_tables.append(image_table)
683
+ per_season_annotation_tables.append(annotation_table)
684
+
685
+ print('Finished reading tables, concatenating...')
686
+
687
+ image_table_public = pd.concat(per_season_image_tables[0:-1],sort=False)
688
+ annotation_table_public = pd.concat(per_season_annotation_tables[0:-1],sort=False)
689
+
690
+ image_table_all = pd.concat(per_season_image_tables,sort=False)
691
+ annotation_table_all = pd.concat(per_season_annotation_tables,sort=False)
692
+
693
+ print('Finished concatenating {} images and {} annotations'.format(len(image_table_all),len(annotation_table_all)))
694
+
695
+ fn_image_csv_public = os.path.join(output_base,'SnapshotSerengeti_v2_0_images.csv')
696
+ fn_annotation_csv_public = os.path.join(output_base,'SnapshotSerengeti_v2_0_annotations.csv')
697
+ fn_image_csv_all = os.path.join(output_base,'SnapshotSerengeti_v2_1_images.csv')
698
+ fn_annotation_csv_all = os.path.join(output_base,'SnapshotSerengeti_v2_1_annotations.csv')
699
+
700
+ image_table_public.to_csv(fn_image_csv_public)
701
+ annotation_table_public.to_csv(fn_annotation_csv_public)
702
+
703
+ image_table_all.to_csv(fn_image_csv_all)
704
+ annotation_table_all.to_csv(fn_annotation_csv_all)
705
+
706
+
707
+ #%% Create a list of human files
708
+
709
+ human_image_ids = set()
710
+ human_id = species_to_category['human']['id']
711
+
712
+ # ann = annotations[0]
713
+ for ann in tqdm(annotations):
714
+ if ann['category_id'] == human_id:
715
+ human_image_ids.add(ann['image_id'])
716
+
717
+ print('Found {} images with humans'.format(len(human_image_ids)))
718
+
719
+
720
+ #%% Save our work
721
+
722
+ print('Saving final dictionaries to {}'.format(final_dictionary_cache_file))
723
+ cct_cache = [im_id_to_image, images, seq_id_to_images,
724
+ seq_id_to_annotations, annotations, categories, species_to_category, all_files,
725
+ human_id, human_image_ids]
726
+ with open(final_dictionary_cache_file, 'wb') as f:
727
+ pickle.dump(cct_cache, f, protocol=pickle.HIGHEST_PROTOCOL)
728
+
729
+
730
+ #%% Load our work
731
+
732
+ if False:
733
+
734
+ #%%
735
+ print('Loading dictionaries from {}'.format(final_dictionary_cache_file))
736
+ with open(final_dictionary_cache_file, 'rb') as f:
737
+ cct_cache = pickle.load(f)
738
+ im_id_to_image, images, seq_id_to_images, \
739
+ seq_id_to_annotations, annotations, categories, species_to_category, all_files, \
740
+ human_id, human_image_ids = cct_cache
741
+
742
+
743
+ #%% Create archives (human, per-season) (prep)
744
+
745
+ human_zipfile = os.path.join(output_zip_base,'SnapshotSerengeti_humans_v2.0.zip')
746
+ os.makedirs(output_zip_base,exist_ok=True)
747
+
748
+ debug_max_files = -1
749
+ n_dot = 1000
750
+ n_print = 10000
751
+ max_files_per_archive = 500000
752
+
753
+ def create_human_archive():
754
+
755
+ n_images_added = 0
756
+ with ZipFile(human_zipfile,'w') as zip:
757
+
758
+ print('Creating archive {}'.format(human_zipfile))
759
+
760
+ # im = images[0]
761
+ for iImage,im in enumerate(images):
762
+ if im['id'] in human_image_ids:
763
+ n_images_added += 1
764
+ if debug_max_files > 0 and n_images_added > debug_max_files:
765
+ break
766
+ if (n_images_added % n_dot)==0:
767
+ print('.',end='')
768
+ if (n_images_added % n_print)==0:
769
+ print('{} images added to {}'.format(n_images_added,human_zipfile))
770
+ source_file = os.path.join(image_base,im['file_name'])
771
+ dest_file = im['file_name']
772
+ zip.write(source_file,dest_file,zipfile.ZIP_STORED)
773
+
774
+ print('\nFinished writing {}, added {} files'.format(human_zipfile,n_images_added))
775
+
776
+ return n_images_added
777
+
778
+ def create_season_archive(i_season):
779
+
780
+ season = i_season + 1
781
+ zipfilename = os.path.join(output_zip_base,'SnapshotSerengeti_S{:>02d}_v2_0.zip'.format(season))
782
+
783
+ n_images_added = 0
784
+ zip = ZipFile(zipfilename,'w')
785
+
786
+ print('Creating archive {}'.format(zipfilename))
787
+
788
+ # im = images[0]
789
+ for iImage,im in enumerate(images):
790
+
791
+ # Don't include humans
792
+ if im['id'] in human_image_ids:
793
+ continue
794
+
795
+ # Only include files from this season
796
+ season_id = im['id'].split('/')[0]
797
+ assert season_id is not None and season_id.startswith('S')
798
+ season = int(season_id.replace('SER_','S').replace('S',''))
799
+ assert season >=1 and season <= nSeasons
800
+
801
+ if (season != i_season + 1):
802
+ continue
803
+
804
+ n_images_added += 1
805
+
806
+ # Possibly start a new archive
807
+ if n_images_added >= max_files_per_archive:
808
+ zip.close()
809
+ zipfilename = zipfilename.replace('.zip','.{}.zip'.format(n_images_added))
810
+ print('Starting new archive for season {}: {}'.format(i_season+1,zipfilename))
811
+ zip = ZipFile(zipfilename,'w')
812
+ n_images_added = 0
813
+
814
+ if (n_images_added % n_dot)==0:
815
+ print('.',end='')
816
+ if (n_images_added % n_print)==0:
817
+ print('{} images added to {}'.format(n_images_added,zipfilename))
818
+ if debug_max_files > 0 and n_images_added > debug_max_files:
819
+ break
820
+
821
+ source_file = os.path.join(image_base,im['file_name'])
822
+ dest_file = im['file_name']
823
+ zip.write(source_file,dest_file,zipfile.ZIP_STORED)
824
+
825
+ # ...for each image
826
+
827
+ zip.close()
828
+ print('\nFinished writing {}, added {} files'.format(zipfilename,n_images_added))
829
+
830
+ return n_images_added
831
+
832
+ # i_season = 0
833
+ # for i_season in range(0,nSeasons):
834
+ # create_season_archive(i_season)
835
+
836
+ def create_archive(i_season):
837
+ if i_season == -1:
838
+ return create_human_archive()
839
+ else:
840
+ return create_season_archive(i_season)
841
+
842
+
843
+ #%% Create archives (loop)
844
+
845
+ # pool = ThreadPool(nSeasons+1)
846
+ # n_images = pool.map(create_archive, range(-1,nSeasons))
847
+ # seasons_to_zip = range(-1,nSeasons)
848
+ seasons_to_zip = [ 4,6 ]
849
+ for i_season in seasons_to_zip:
850
+ create_archive(i_season)
851
+
852
+ # ...for each season
853
+
854
+
855
+ #%% Validate .json files
856
+
857
+ # %logstart -o r'E:\snapshot_temp\python.txt'
858
+
859
+ from megadetector.data_management.databases import integrity_check_json_db
860
+
861
+ files_to_check = glob.glob(os.path.join(output_base,'*.json'))
862
+
863
+ options = integrity_check_json_db.IntegrityCheckOptions()
864
+ options.baseDir = image_base
865
+ options.bCheckImageSizes = False
866
+ options.bCheckImageExistence = True
867
+ options.bFindUnusedImages = False
868
+
869
+ for fn in files_to_check:
870
+ sortedCategories, data = integrity_check_json_db.integrity_check_json_db(fn,options)
871
+
872
+
873
+ #%% Zip up .json and .csv files
874
+
875
+ def zip_single_file(fn):
876
+
877
+ zipfilename = fn + '.zip'
878
+ print('Zipping {} to {}'.format(fn,zipfilename))
879
+ with ZipFile(zipfilename,'w') as zip:
880
+ source_file = fn
881
+ dest_file = os.path.basename(fn)
882
+ zip.write(source_file,dest_file,zipfile.ZIP_DEFLATED)
883
+
884
+ files_to_zip = []
885
+ files_to_zip.extend(glob.glob(os.path.join(output_base,'*.csv')))
886
+ files_to_zip.extend(glob.glob(os.path.join(output_base,'*.json')))
887
+
888
+ # pool = ThreadPool(len(files_to_zip))
889
+ # pool.map(zip_single_file, files_to_zip)
890
+ for fn in tqdm(files_to_zip):
891
+ if os.path.isfile(fn + '.zip'):
892
+ print('Skipping {}'.format(fn))
893
+ continue
894
+ zip_single_file(fn)
895
+
896
+
897
+ #%% Validate that S11 info isn't leaking
898
+
899
+ files_to_check = glob.glob(os.path.join(output_base,'*.json'))
900
+
901
+ for jsonFn in files_to_check:
902
+
903
+ if '11' in jsonFn or '2_1' in jsonFn or '2.1' in jsonFn:
904
+ print('Skipping file {}'.format(jsonFn))
905
+ continue
906
+
907
+ print('Processing file {}'.format(jsonFn))
908
+
909
+ with open(jsonFn,'r') as f:
910
+ data_public = json.load(f)
911
+
912
+ # im = data_public['images'][0]
913
+ for im in tqdm(data_public['images']):
914
+ assert (not im['id'].startswith('S11')) and (not im['id'].startswith('SER11'))
915
+ assert (not im['file_name'].startswith('S11')) and (not im['file_name'].startswith('SER11'))
916
+ sequence_tokens = im['seq_id'].split('#')
917
+ assert '11' not in sequence_tokens[0]
918
+
919
+ # ann = data_public['annotations'][0]
920
+ for ann in tqdm(data_public['annotations']):
921
+ assert (not ann['image_id'].startswith('S11')) and (not ann['image_id'].startswith('SER11'))
922
+ sequence_tokens = ann['seq_id'].split('#')
923
+ assert '11' not in sequence_tokens[0]
924
+
925
+ print('Done checking .json files')
926
+
927
+ annotation_csv = r"E:\snapshot_out\SnapshotSerengeti_v2_0_annotations.csv"
928
+ image_csv = r"E:\snapshot_out\SnapshotSerengeti_v2_0_images.csv"
929
+
930
+ annotation_df = pd.read_csv(annotation_csv)
931
+ image_df = pd.read_csv(image_csv)
932
+
933
+ # iRow = 0; row = annotation_df.iloc[iRow]
934
+ for iRow,row in tqdm(annotation_df.iterrows(),total=len(annotation_df)):
935
+ sequence_tokens = row['capture_id'].split('#')
936
+ assert '11' not in sequence_tokens[0]
937
+ assert '11' not in row['season']
938
+
939
+ # iRow = 0; row = image_df.iloc[iRow]
940
+ for iRow,row in tqdm(image_df.iterrows(),total=len(image_df)):
941
+ sequence_tokens = row['capture_id'].split('#')
942
+ assert '11' not in sequence_tokens[0]
943
+ fn = row['image_path_rel']
944
+ assert (not fn.startswith('S11')) and (not fn.startswith('SER11'))
945
+
946
+ print('Done checking .csv files')
947
+
948
+
949
+ #%% Create bounding box archive
950
+
951
+ bbox_json_fn = r"E:\snapshot_serengeti_bboxes_20190409.json"
952
+
953
+ with open(bbox_json_fn,'r') as f:
954
+ bbox_data = json.load(f)
955
+
956
+ json_fn = r"E:\snapshot_out\SnapshotSeregeti_v2.0.json"
957
+
958
+ with open(json_fn,'r') as f:
959
+ data = json.load(f)
960
+
961
+ print('Finished reading annotations and bounding boxes')
962
+
963
+ available_images = set()
964
+
965
+ # i_image = 0; im = data['images'][0]
966
+ for i_image,im in enumerate(data['images']):
967
+ available_images.add(im['id'])
968
+
969
+ print('{} images available'.format(len(available_images)))
970
+
971
+ missing_images = []
972
+ found_images = []
973
+ # i_box = 0; boxann = bbox_data['annotations'][0]
974
+ for i_ann,ann in enumerate(bbox_data['annotations']):
975
+ id = ann['image_id']
976
+ if id not in available_images:
977
+ missing_images.append(id)
978
+ else:
979
+ found_images.append(id)
980
+
981
+ print('{} missing images in {} bounding boxes ({} found)'.format(len(missing_images), len(bbox_data['annotations']), len(found_images)))
982
+
983
+
984
+ #%% Integrity-check a few files to make sure bounding boxes are still sensible
985
+
986
+ # import sys; sys.path.append(r'C:\git\MegaDetector')
987
+ from megadetector.visualization import visualize_db
988
+ output_base = r'E:\snapshot_temp'
989
+
990
+ viz_options = visualize_db.DbVizOptions()
991
+ viz_options.num_to_visualize = 500
992
+ viz_options.trim_to_images_with_bboxes = True
993
+ viz_options.add_search_links = True
994
+ viz_options.sort_by_filename = False
995
+ html_output_file,bbox_db = visualize_db.visualize_db(bbox_json_fn,os.path.join(output_base,'preview2'),image_base,viz_options)
996
+ os.startfile(html_output_file)
997
+
998
+
999
+ #%% Check categories
1000
+
1001
+ json_fn_all = r"E:\snapshot_out\SnapshotSerengeti_v2.0.json"
1002
+
1003
+ with open(json_fn_all,'r') as f:
1004
+ data_all = json.load(f)
1005
+
1006
+ data_by_season = []
1007
+ data_10 = None
1008
+ i_season = 9
1009
+ fn = r'e:\snapshot_out\SnapshotSerengetiS{:0>2d}.json'.format(i_season+1)
1010
+ with open(fn,'r') as f:
1011
+ data_10 = json.load(f)
1012
+
1013
+ n_categories_all = len(data_all['categories'])
1014
+ n_categories_s10 = len(data_10['categories'])
1015
+
1016
+
1017
+ #%% Summary prep for LILA
1018
+
1019
+ import os
1020
+ import json
1021
+ from tqdm import tqdm
1022
+
1023
+ json_fn = r"D:\temp\SnapshotSeregeti_v2.0.json"
1024
+ with open(json_fn,'r') as f:
1025
+ data = json.load(f)
1026
+
1027
+ categories = data['categories']
1028
+ annotations = data['annotations']
1029
+ images = data['images']
1030
+ output_base = r'd:\temp'
1031
+
1032
+ n_empty = 0
1033
+ n_species = len(categories)
1034
+ n_images = len(images)
1035
+
1036
+ sequences = set()
1037
+ for im in tqdm(images):
1038
+ sequences.add(im['seq_id'])
1039
+
1040
+ category_id_to_count = {}
1041
+ for ann in tqdm(annotations):
1042
+ if ann['category_id'] == 0:
1043
+ n_empty += 1
1044
+ if ann['category_id'] in category_id_to_count:
1045
+ category_id_to_count[ann['category_id']] += 1
1046
+ else:
1047
+ category_id_to_count[ann['category_id']] = 1
1048
+
1049
+ empty_categories = []
1050
+ for c in categories:
1051
+ if c['id'] in category_id_to_count:
1052
+ c['count'] = category_id_to_count[c['id']]
1053
+ else:
1054
+ empty_categories.append(c)
1055
+ c['count'] = 0
1056
+
1057
+ categories = [c for c in categories if c['count'] > 0]
1058
+ sorted_categories = sorted(categories, key=lambda k: k['count'], reverse=True)
1059
+
1060
+ fn = os.path.join(output_base,'ss_specieslist.csv')
1061
+ with open(fn,'w') as f:
1062
+ for c in sorted_categories:
1063
+ f.write(c['name'] + ',' + str(c['count']) + '\n')
1064
+
1065
+ print('Found {} images ({} empty, {}%) in {} sequences, in {} categories'.format(
1066
+ n_images,n_empty,100*n_empty/n_images,len(sequences),len(categories)))
1067
+