megadetector 5.0.8-py3-none-any.whl → 5.0.10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of megadetector has been flagged as potentially problematic.

Files changed (190)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +65 -65
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +68 -54
  20. api/batch_processing/postprocessing/compare_batch_results.py +113 -43
  21. api/batch_processing/postprocessing/convert_output_format.py +41 -16
  22. api/batch_processing/postprocessing/load_api_results.py +16 -17
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +52 -22
  25. api/batch_processing/postprocessing/merge_detections.py +14 -14
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +246 -174
  27. api/batch_processing/postprocessing/remap_detection_categories.py +32 -25
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +60 -27
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +242 -158
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +102 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -263
  71. data_management/coco_to_yolo.py +79 -58
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +62 -24
  76. data_management/databases/subset_json_db.py +24 -15
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -162
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -158
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +7 -7
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +65 -24
  120. data_management/labelme_to_yolo.py +8 -8
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +13 -13
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +44 -110
  128. data_management/lila/generate_lila_per_image_labels.py +55 -42
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +96 -33
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +110 -97
  135. data_management/remap_coco_categories.py +83 -83
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +30 -23
  138. data_management/wi_download_csv_to_coco.py +246 -239
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +300 -60
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +179 -113
  147. detection/run_inference_with_yolov5_val.py +108 -48
  148. detection/run_tiled_inference.py +111 -40
  149. detection/tf_detector.py +51 -29
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +228 -68
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -871
  157. md_utils/path_utils.py +460 -134
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +176 -60
  163. md_utils/write_html_image_list.py +40 -33
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +597 -291
  168. md_visualization/visualize_db.py +76 -48
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/METADATA +13 -7
  171. megadetector-5.0.10.dist-info/RECORD +224 -0
  172. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/top_level.txt +1 -0
  173. taxonomy_mapping/__init__.py +0 -0
  174. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  175. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  176. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  177. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  178. taxonomy_mapping/retrieve_sample_image.py +12 -12
  179. taxonomy_mapping/simple_image_download.py +11 -11
  180. taxonomy_mapping/species_lookup.py +10 -10
  181. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  182. taxonomy_mapping/taxonomy_graph.py +47 -47
  183. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  184. data_management/cct_json_to_filename_json.py +0 -89
  185. data_management/cct_to_csv.py +0 -140
  186. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  187. detection/detector_training/copy_checkpoints.py +0 -43
  188. megadetector-5.0.8.dist-info/RECORD +0 -205
  189. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/LICENSE +0 -0
  190. {megadetector-5.0.8.dist-info → megadetector-5.0.10.dist-info}/WHEEL +0 -0
@@ -1,359 +1,359 @@
- ########
- #
- # Prepare a LILA-ready .json file for the NOAA Puget Sound Nearshore Fish dataset.
- #
- ########
-
- #%% Constants and imports
-
- import os
- import json
- import uuid
- import pandas as pd
-
- from md_utils.path_utils import open_file
-
- base_folder = r'G:\temp\noaa'
- output_json_fn = os.path.join(base_folder,'noaa_estuary_fish.json')
- edited_image_folders = ['edited_clip_2017','edited_clip_2018']
- jpeg_image_folder = 'JPEGImages'
- metadata_file = 'MasterDataForMicrosoft.xlsx'
-
-
- #%% Enumerate files
-
- edited_image_files = []
-
- # edited_image_folder = edited_image_folders[0]
- for edited_image_folder in edited_image_folders:
-     folder_path = os.path.join(base_folder,edited_image_folder)
-     image_files = os.listdir(folder_path)
-     assert all([fn.endswith('.jpg') for fn in image_files])
-     edited_image_files.extend([os.path.join(folder_path,fn) for fn in image_files])
-
- jpeg_image_folder_files = os.listdir(os.path.join(base_folder,jpeg_image_folder))
- assert all([fn.endswith('.jpg') for fn in jpeg_image_folder_files])
-
- relative_edited_image_files_set = set()
-
- # fn = edited_image_files[0]
- for fn in edited_image_files:
-     bn = os.path.basename(fn)
-     assert bn not in relative_edited_image_files_set
-     relative_edited_image_files_set.add(bn)
-
- jpeg_image_folder_files_set = set(jpeg_image_folder_files)
-
- assert len(jpeg_image_folder_files_set) == len(relative_edited_image_files_set)
-
- assert jpeg_image_folder_files_set == relative_edited_image_files_set
-
-
- #%% Read metadata and capture location information
-
- df = pd.read_excel(os.path.join(base_folder,metadata_file))
-
- print('Read {} rows from metadata file'.format(len(df)))
-
- id_string_to_site = {}
-
- # i_row = 0; row = df.iloc[i_row]
- for i_row,row in df.iterrows():
-
-     assert row['sd'].lower().startswith('sd')
-     assert isinstance(row['id'],int) and row['id'] > 0 and row['id'] < 10000
-     date_string = row['date']
-     date_tokens = date_string.split('_')
-
-     # Sometimes '2017' was just '17' in the date column
-     if len(date_tokens[2]) != 4:
-         assert len(date_tokens[2]) == 2
-         date_tokens[2] = '20' + date_tokens[2]
-         date_string = '_'.join(date_tokens)
-     else:
-         assert date_tokens[2].startswith('201')
-
-     id_string = row['sd'].upper() + '_' + str(row['id']) + '_' + date_string
-     id_string_to_site[id_string] = row['site']
-
- print('Found {} unique locations'.format(len(pd.unique(df['site']))))
-
-
- #%% Read the .json files and build output dictionaries
-
- json_files = [fn for fn in os.listdir(base_folder) if (fn.endswith('.json') and (fn != os.path.basename(output_json_fn)))]
- json_files = [os.path.join(base_folder,fn) for fn in json_files]
-
- fn_to_image = {}
- annotations = []
-
- CATEGORY_ID_EMPTY = 0
- CATEGORY_ID_FISH = 1
-
- categories = [{'id':CATEGORY_ID_EMPTY,'name':'empty'},{'id':CATEGORY_ID_FISH,'name':'animal'}]
-
- empty_images = set()
- non_empty_images = set()
-
- n_matched_locations = 0
- images_with_unmatched_locations = []
-
- import random
- random.seed(1)
-
- site_to_location_id = {}
-
- # json_fn = json_files[0]
- for json_fn in json_files:
-
-     # if 'partial' in json_fn:
-     #     continue
-
-     with open(json_fn,'r') as f:
-
-         lines = f.readlines()
-
-         # line = lines[0]
-         for line in lines:
-
-             d = json.loads(line)
-             image_fn = d['image']
-
-             # if image_fn == 'SD1_238_6_26_17_16_76.73.jpg':
-             #     asdfad
-
-             # SD29_079_5_14_2018_17_52.85.jpg
-
-             tokens = image_fn.split('_')
-             assert len(tokens) == 7
-             assert tokens[0].startswith('SD')
-
-             # Re-write two-digit years as four-digit years
-             if len(tokens[4]) != 4:
-                 assert len(tokens[4]) == 2
-                 tokens[4] = '20' + tokens[4]
-             else:
-                 assert tokens[4].startswith('201')
-
-             # Sometimes the year was written with two digits instead of 4
-             # assert len(tokens[4]) == 4 and tokens[4].startswith('20')
-
-             while tokens[1].startswith('0'):
-                 tokens[1] = tokens[1][1:]
-             assert not tokens[1].startswith('0')
-             assert len(tokens[1]) > 0
-
-             id_string = '_'.join(tokens[0:5])
-
-             location_id = 'unknown'
-
-             if id_string in id_string_to_site:
-
-                 site_id = id_string_to_site[id_string]
-
-                 # Have we seen this location already?
-                 if site_id in site_to_location_id:
-                     location_id = site_to_location_id[site_id]
-                 else:
-                     location_id = 'loc_' + str(uuid.uuid1())
-                     site_to_location_id[site_id] = location_id
-                     print('Adding new location ID {} for site {}'.format(
-                         location_id,site_id))
-                 n_matched_locations += 1
-
-             else:
-                 raise ValueError('Could not match location ID')
-                 images_with_unmatched_locations.append(image_fn)
-
-             assert image_fn in jpeg_image_folder_files_set
-             assert d['type'] == 'image/jpg'
-             input_ann = d['annotations']
-             assert len(input_ann) == 1 and len(input_ann.keys()) == 1 and 'object' in input_ann
-             input_ann = input_ann['object']
-             assert input_ann['metainfo']['image']['height'] == 1080
-             assert input_ann['metainfo']['image']['width'] == 1920
-
-             im = {}
-
-             img_h = input_ann['metainfo']['image']['height']
-             img_w = input_ann['metainfo']['image']['width']
-
-             im['width'] = img_w
-             im['height'] = img_h
-             im['file_name'] = image_fn
-
-             if image_fn in fn_to_image:
-                 assert fn_to_image[image_fn]['file_name'] == image_fn
-                 assert fn_to_image[image_fn]['width'] == img_w
-                 assert fn_to_image[image_fn]['height'] == img_h
-                 im = fn_to_image[image_fn]
-             else:
-                 fn_to_image[image_fn] = im
-                 im['location'] = location_id
-                 im['id'] = image_fn # str(uuid.uuid1())
-
-             # Not a typo, it's actually "formateddata"
-             formatted_data = input_ann['formateddata']
-             if len(formatted_data) == 0:
-
-                 # An image shouldn't be annotated as both empty and non-empty
-                 assert image_fn not in non_empty_images
-                 empty_images.add(image_fn)
-                 ann = {}
-                 ann['id'] = str(uuid.uuid1())
-                 ann['image_id'] = im['id']
-                 ann['category_id'] = CATEGORY_ID_EMPTY
-                 ann['sequence_level_annotation'] = False
-                 annotations.append(ann)
-
-             else:
-
-                 # An image shouldn't be annotated as both empty and non-empty
-                 assert image_fn not in empty_images
-                 non_empty_images.add(image_fn)
-
-                 n_boxes = len(formatted_data)
-
-                 # box = formatteddata[0]
-                 for box in formatted_data:
-
-                     attributes = box['attribute']
-                     assert len(attributes) == 2 and 'occluded' in attributes and 'truncated' in attributes
-                     coordinates = box['coordinates']
-                     assert box['object_type'] == 'bbox'
-                     assert box['class']['type'] == 'Fish'
-                     assert len(coordinates) == 4
-                     for coord in coordinates:
-                         assert len(coord) == 2 and 'x' in coord and 'y' in coord
-                     assert coordinates[0]['y'] == coordinates[1]['y']
-                     assert coordinates[2]['y'] == coordinates[3]['y']
-                     assert coordinates[0]['x'] == coordinates[3]['x']
-                     assert coordinates[1]['x'] == coordinates[2]['x']
-
-                     assert coordinates[0]['x'] < coordinates[1]['x']
-                     assert coordinates[0]['y'] < coordinates[3]['y']
-
-                     if False:
-                         x = coordinates[0]['x'] / img_w
-                         y = coordinates[0]['y'] / img_h
-                         box_w = (coordinates[1]['x'] - coordinates[0]['x']) / img_w
-                         box_h = (coordinates[3]['y'] - coordinates[0]['y']) / img_h
-                     else:
-                         x = coordinates[0]['x']
-                         y = coordinates[0]['y']
-                         box_w = (coordinates[1]['x'] - coordinates[0]['x'])
-                         box_h = (coordinates[3]['y'] - coordinates[0]['y'])
-
-                     bbox = [x,y,box_w,box_h]
-
-                     ann = {}
-                     ann['id'] = str(uuid.uuid1())
-                     ann['image_id'] = im['id']
-                     ann['category_id'] = CATEGORY_ID_FISH
-                     ann['sequence_level_annotation'] = False
-                     ann['bbox'] = bbox
-
-                     annotations.append(ann)
-
-                     # open_file(os.path.join(base_folder,jpeg_image_folder,image_fn))
-
-                 # ...for each box
-
-             # ...if there are boxes on this image
-
-         # ...for each line
-
-     # ...with open()
-
- # ...for each json file
-
- print('Found annotations for {} images (of {})'.format(len(fn_to_image),
-     len(jpeg_image_folder_files_set)))
-
-
- print('Matched locations for {} images (failed to match {})'.format(
-     n_matched_locations,len(images_with_unmatched_locations)))
-
- images = list(fn_to_image.values())
-
-
- #%% Prepare the output .json
-
- info = {}
- info['version'] = '2022.07.31.00'
- info['description'] = 'NOAA Estuary Fish 2022'
- info['year'] = 2022
- info['contributor'] = 'NOAA Fisheries'
-
- d = {}
- d['info'] = info
- d['annotations'] = annotations
- d['images'] = images
- d['categories'] = categories
-
- with open(output_json_fn,'w') as f:
-     json.dump(d,f,indent=1)
-
-
- #%% Check DB integrity
-
- from data_management.databases import integrity_check_json_db
-
- options = integrity_check_json_db.IntegrityCheckOptions()
- options.baseDir = os.path.join(base_folder,jpeg_image_folder)
- options.bCheckImageSizes = False
- options.bCheckImageExistence = True
- options.bFindUnusedImages = True
-
- _, _, _ = integrity_check_json_db.integrity_check_json_db(output_json_fn, options)
-
-
- #%% Print unique locations
-
- from collections import defaultdict
- location_to_count = defaultdict(int)
- for im in d['images']:
-     location_to_count[im['location']] += 1
- for loc in location_to_count.keys():
-     print(loc + ': ' + str(location_to_count[loc]))
-
- print('{} unique locations'.format(len(location_to_count)))
- assert 'unknown' not in location_to_count.keys()
-
- # SD12_202_6_23_2017_1_31.85.jpg
-
-
- #%% Preview some images
-
- from md_visualization import visualize_db
-
- viz_options = visualize_db.DbVizOptions()
- viz_options.num_to_visualize = 10000
- viz_options.trim_to_images_with_bboxes = False
- viz_options.add_search_links = False
- viz_options.sort_by_filename = False
- viz_options.parallelize_rendering = True
- viz_options.include_filename_links = True
-
- html_output_file, _ = visualize_db.visualize_db(db_path=output_json_fn,
-     output_dir=os.path.join(base_folder,'preview'),
-     image_base_dir=os.path.join(base_folder,jpeg_image_folder),
-     options=viz_options)
- open_file(html_output_file)
-
-
- #%% Statistics
-
- print('Empty: {}'.format(len(empty_images)))
- print('Non-empty: {}'.format(len(non_empty_images)))
-
- images_with_no_boxes = 0
- n_boxes = 0
- for ann in annotations:
-     if 'bbox' not in ann:
-         images_with_no_boxes += 1
-     else:
-         assert len(bbox) == 4
-         n_boxes += 1
-
- print('N boxes: {}'.format(n_boxes))
+ """
+
+ Prepare a LILA-ready .json file for the NOAA Puget Sound Nearshore Fish dataset.
+
+ """
+
+ #%% Constants and imports
+
+ import os
+ import json
+ import uuid
+ import pandas as pd
+
+ from md_utils.path_utils import open_file
+
+ base_folder = r'G:\temp\noaa'
+ output_json_fn = os.path.join(base_folder,'noaa_estuary_fish.json')
+ edited_image_folders = ['edited_clip_2017','edited_clip_2018']
+ jpeg_image_folder = 'JPEGImages'
+ metadata_file = 'MasterDataForMicrosoft.xlsx'
+
+
+ #%% Enumerate files
+
+ edited_image_files = []
+
+ # edited_image_folder = edited_image_folders[0]
+ for edited_image_folder in edited_image_folders:
+     folder_path = os.path.join(base_folder,edited_image_folder)
+     image_files = os.listdir(folder_path)
+     assert all([fn.endswith('.jpg') for fn in image_files])
+     edited_image_files.extend([os.path.join(folder_path,fn) for fn in image_files])
+
+ jpeg_image_folder_files = os.listdir(os.path.join(base_folder,jpeg_image_folder))
+ assert all([fn.endswith('.jpg') for fn in jpeg_image_folder_files])
+
+ relative_edited_image_files_set = set()
+
+ # fn = edited_image_files[0]
+ for fn in edited_image_files:
+     bn = os.path.basename(fn)
+     assert bn not in relative_edited_image_files_set
+     relative_edited_image_files_set.add(bn)
+
+ jpeg_image_folder_files_set = set(jpeg_image_folder_files)
+
+ assert len(jpeg_image_folder_files_set) == len(relative_edited_image_files_set)
+
+ assert jpeg_image_folder_files_set == relative_edited_image_files_set
+
+
+ #%% Read metadata and capture location information
+
+ df = pd.read_excel(os.path.join(base_folder,metadata_file))
+
+ print('Read {} rows from metadata file'.format(len(df)))
+
+ id_string_to_site = {}
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+
+     assert row['sd'].lower().startswith('sd')
+     assert isinstance(row['id'],int) and row['id'] > 0 and row['id'] < 10000
+     date_string = row['date']
+     date_tokens = date_string.split('_')
+
+     # Sometimes '2017' was just '17' in the date column
+     if len(date_tokens[2]) != 4:
+         assert len(date_tokens[2]) == 2
+         date_tokens[2] = '20' + date_tokens[2]
+         date_string = '_'.join(date_tokens)
+     else:
+         assert date_tokens[2].startswith('201')
+
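+     # e.g. 'SD29_79_5_14_2018'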
+     id_string = row['sd'].upper() + '_' + str(row['id']) + '_' + date_string
+     id_string_to_site[id_string] = row['site']
+
+ print('Found {} unique locations'.format(len(pd.unique(df['site']))))
+
+
+ #%% Read the .json files and build output dictionaries
+
+ json_files = [fn for fn in os.listdir(base_folder) if (fn.endswith('.json') and (fn != os.path.basename(output_json_fn)))]
+ json_files = [os.path.join(base_folder,fn) for fn in json_files]
+
+ fn_to_image = {}
+ annotations = []
+
+ CATEGORY_ID_EMPTY = 0
+ CATEGORY_ID_FISH = 1
+
+ categories = [{'id':CATEGORY_ID_EMPTY,'name':'empty'},{'id':CATEGORY_ID_FISH,'name':'animal'}]
+
+ empty_images = set()
+ non_empty_images = set()
+
+ n_matched_locations = 0
+ images_with_unmatched_locations = []
+
+ import random
+ random.seed(1)
+
+ site_to_location_id = {}
+
+ # json_fn = json_files[0]
+ for json_fn in json_files:
+
+     # if 'partial' in json_fn:
+     #     continue
+
+     with open(json_fn,'r') as f:
+
+         lines = f.readlines()
+
+         # line = lines[0]
+         for line in lines:
+
+             d = json.loads(line)
+             image_fn = d['image']
+
+             # if image_fn == 'SD1_238_6_26_17_16_76.73.jpg':
+             #     asdfad
+
+             # SD29_079_5_14_2018_17_52.85.jpg
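+             # The first five tokens (card, id, month, day, year) form the key used
+             # to look up this image's site below; the trailing tokens are not used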
+
+             tokens = image_fn.split('_')
+             assert len(tokens) == 7
+             assert tokens[0].startswith('SD')
+
+             # Re-write two-digit years as four-digit years
+             if len(tokens[4]) != 4:
+                 assert len(tokens[4]) == 2
+                 tokens[4] = '20' + tokens[4]
+             else:
+                 assert tokens[4].startswith('201')
+
+             # Sometimes the year was written with two digits instead of 4
+             # assert len(tokens[4]) == 4 and tokens[4].startswith('20')
+
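+             # Strip leading zeros (e.g. '079' -> '79') so this token matches the
+             # integer 'id' column from the metadata table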
+             while tokens[1].startswith('0'):
+                 tokens[1] = tokens[1][1:]
+             assert not tokens[1].startswith('0')
+             assert len(tokens[1]) > 0
+
+             id_string = '_'.join(tokens[0:5])
+
+             location_id = 'unknown'
+
+             if id_string in id_string_to_site:
+
+                 site_id = id_string_to_site[id_string]
+
+                 # Have we seen this location already?
+                 if site_id in site_to_location_id:
+                     location_id = site_to_location_id[site_id]
+                 else:
+                     location_id = 'loc_' + str(uuid.uuid1())
+                     site_to_location_id[site_id] = location_id
+                     print('Adding new location ID {} for site {}'.format(
+                         location_id,site_id))
+                 n_matched_locations += 1
+
+             else:
+                 # Record the unmatched file before raising, so it appears in the
+                 # failure list
+                 images_with_unmatched_locations.append(image_fn)
+                 raise ValueError('Could not match location ID')
+
+             assert image_fn in jpeg_image_folder_files_set
+             assert d['type'] == 'image/jpg'
+             input_ann = d['annotations']
+             assert len(input_ann) == 1 and len(input_ann.keys()) == 1 and 'object' in input_ann
+             input_ann = input_ann['object']
+             assert input_ann['metainfo']['image']['height'] == 1080
+             assert input_ann['metainfo']['image']['width'] == 1920
+
+             im = {}
+
+             img_h = input_ann['metainfo']['image']['height']
+             img_w = input_ann['metainfo']['image']['width']
+
+             im['width'] = img_w
+             im['height'] = img_h
+             im['file_name'] = image_fn
+
+             if image_fn in fn_to_image:
+                 assert fn_to_image[image_fn]['file_name'] == image_fn
+                 assert fn_to_image[image_fn]['width'] == img_w
+                 assert fn_to_image[image_fn]['height'] == img_h
+                 im = fn_to_image[image_fn]
+             else:
+                 fn_to_image[image_fn] = im
+                 im['location'] = location_id
+                 im['id'] = image_fn # str(uuid.uuid1())
+
+             # Not a typo, it's actually "formateddata"
+             formatted_data = input_ann['formateddata']
+             if len(formatted_data) == 0:
+
+                 # An image shouldn't be annotated as both empty and non-empty
+                 assert image_fn not in non_empty_images
+                 empty_images.add(image_fn)
+                 ann = {}
+                 ann['id'] = str(uuid.uuid1())
+                 ann['image_id'] = im['id']
+                 ann['category_id'] = CATEGORY_ID_EMPTY
+                 ann['sequence_level_annotation'] = False
+                 annotations.append(ann)
+
+             else:
+
+                 # An image shouldn't be annotated as both empty and non-empty
+                 assert image_fn not in empty_images
+                 non_empty_images.add(image_fn)
+
+                 n_boxes = len(formatted_data)
+
+                 # box = formatted_data[0]
+                 for box in formatted_data:
+
+                     attributes = box['attribute']
+                     assert len(attributes) == 2 and 'occluded' in attributes and 'truncated' in attributes
+                     coordinates = box['coordinates']
+                     assert box['object_type'] == 'bbox'
+                     assert box['class']['type'] == 'Fish'
+                     assert len(coordinates) == 4
+                     for coord in coordinates:
+                         assert len(coord) == 2 and 'x' in coord and 'y' in coord
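+                     # Corners are expected in [top-left, top-right, bottom-right,
+                     # bottom-left] order; these asserts verify an axis-aligned box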
+                     assert coordinates[0]['y'] == coordinates[1]['y']
+                     assert coordinates[2]['y'] == coordinates[3]['y']
+                     assert coordinates[0]['x'] == coordinates[3]['x']
+                     assert coordinates[1]['x'] == coordinates[2]['x']
+
+                     assert coordinates[0]['x'] < coordinates[1]['x']
+                     assert coordinates[0]['y'] < coordinates[3]['y']
+
+                     if False:
+                         x = coordinates[0]['x'] / img_w
+                         y = coordinates[0]['y'] / img_h
+                         box_w = (coordinates[1]['x'] - coordinates[0]['x']) / img_w
+                         box_h = (coordinates[3]['y'] - coordinates[0]['y']) / img_h
+                     else:
+                         x = coordinates[0]['x']
+                         y = coordinates[0]['y']
+                         box_w = (coordinates[1]['x'] - coordinates[0]['x'])
+                         box_h = (coordinates[3]['y'] - coordinates[0]['y'])
+
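+                     # COCO-style [x,y,width,height], in absolute pixel coordinates
+                     # (the disabled branch above would normalize by image size)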
+                     bbox = [x,y,box_w,box_h]
+
+                     ann = {}
+                     ann['id'] = str(uuid.uuid1())
+                     ann['image_id'] = im['id']
+                     ann['category_id'] = CATEGORY_ID_FISH
+                     ann['sequence_level_annotation'] = False
+                     ann['bbox'] = bbox
+
+                     annotations.append(ann)
+
+                     # open_file(os.path.join(base_folder,jpeg_image_folder,image_fn))
+
+                 # ...for each box
+
+             # ...if there are boxes on this image
+
+         # ...for each line
+
+     # ...with open()
+
+ # ...for each json file
+
+ print('Found annotations for {} images (of {})'.format(len(fn_to_image),
+     len(jpeg_image_folder_files_set)))
+
+
+ print('Matched locations for {} images (failed to match {})'.format(
+     n_matched_locations,len(images_with_unmatched_locations)))
+
+ images = list(fn_to_image.values())
+
+
+ #%% Prepare the output .json
+
+ info = {}
+ info['version'] = '2022.07.31.00'
+ info['description'] = 'NOAA Estuary Fish 2022'
+ info['year'] = 2022
+ info['contributor'] = 'NOAA Fisheries'
+
+ d = {}
+ d['info'] = info
+ d['annotations'] = annotations
+ d['images'] = images
+ d['categories'] = categories
+
+ with open(output_json_fn,'w') as f:
+     json.dump(d,f,indent=1)
+
+
+ #%% Check DB integrity
+
+ from data_management.databases import integrity_check_json_db
+
+ options = integrity_check_json_db.IntegrityCheckOptions()
+ options.baseDir = os.path.join(base_folder,jpeg_image_folder)
+ options.bCheckImageSizes = False
+ options.bCheckImageExistence = True
+ options.bFindUnusedImages = True
+
+ _, _, _ = integrity_check_json_db.integrity_check_json_db(output_json_fn, options)
+
+
+ #%% Print unique locations
+
+ from collections import defaultdict
+ location_to_count = defaultdict(int)
+ for im in d['images']:
+     location_to_count[im['location']] += 1
+ for loc in location_to_count.keys():
+     print(loc + ': ' + str(location_to_count[loc]))
+
+ print('{} unique locations'.format(len(location_to_count)))
+ assert 'unknown' not in location_to_count.keys()
+
+ # SD12_202_6_23_2017_1_31.85.jpg
+
+
+ #%% Preview some images
+
+ from md_visualization import visualize_db
+
+ viz_options = visualize_db.DbVizOptions()
+ viz_options.num_to_visualize = 10000
+ viz_options.trim_to_images_with_bboxes = False
+ viz_options.add_search_links = False
+ viz_options.sort_by_filename = False
+ viz_options.parallelize_rendering = True
+ viz_options.include_filename_links = True
+
+ html_output_file, _ = visualize_db.visualize_db(db_path=output_json_fn,
+     output_dir=os.path.join(base_folder,'preview'),
+     image_base_dir=os.path.join(base_folder,jpeg_image_folder),
+     options=viz_options)
+ open_file(html_output_file)
+
+
+ #%% Statistics
+
+ print('Empty: {}'.format(len(empty_images)))
+ print('Non-empty: {}'.format(len(non_empty_images)))
+
+ images_with_no_boxes = 0
+ n_boxes = 0
+ for ann in annotations:
+     if 'bbox' not in ann:
+         images_with_no_boxes += 1
+     else:
+         # This annotation's box should be [x,y,w,h]
+         assert len(ann['bbox']) == 4
+         n_boxes += 1
+
+ print('N boxes: {}'.format(n_boxes))
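
For illustration only, a minimal sketch of how the .json written by this script could be sanity-checked after a run; it assumes only the output path defined above and the fields the script writes, and is not part of the package:

#%% Sanity-check the output file

import json

with open(r'G:\temp\noaa\noaa_estuary_fish.json','r') as f:
    db = json.load(f)

image_ids = set(im['id'] for im in db['images'])
category_ids = set(c['id'] for c in db['categories'])

for ann in db['annotations']:
    # Every annotation must reference a valid image and category
    assert ann['image_id'] in image_ids
    assert ann['category_id'] in category_ids
    # Box annotations are [x,y,width,height]; empty annotations carry no box
    if 'bbox' in ann:
        assert len(ann['bbox']) == 4

print('Validated {} annotations on {} images'.format(
    len(db['annotations']),len(db['images'])))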