megadetector 5.0.10__py3-none-any.whl → 5.0.12__py3-none-any.whl

Potentially problematic release.
Files changed (226)
  1. {api → megadetector/api}/batch_processing/api_core/batch_service/score.py +2 -2
  2. {api → megadetector/api}/synchronous/api_core/animal_detection_api/api_backend.py +1 -1
  3. {api → megadetector/api}/synchronous/api_core/animal_detection_api/api_frontend.py +1 -1
  4. {classification → megadetector/classification}/analyze_failed_images.py +3 -3
  5. {classification → megadetector/classification}/cache_batchapi_outputs.py +1 -1
  6. {classification → megadetector/classification}/create_classification_dataset.py +1 -1
  7. {classification → megadetector/classification}/crop_detections.py +1 -1
  8. {classification → megadetector/classification}/detect_and_crop.py +5 -5
  9. {classification → megadetector/classification}/evaluate_model.py +1 -1
  10. {classification → megadetector/classification}/json_to_azcopy_list.py +2 -2
  11. {classification → megadetector/classification}/json_validator.py +13 -9
  12. {classification → megadetector/classification}/map_classification_categories.py +1 -1
  13. {classification → megadetector/classification}/merge_classification_detection_output.py +1 -1
  14. {classification → megadetector/classification}/run_classifier.py +2 -1
  15. {classification → megadetector/classification}/train_classifier.py +8 -6
  16. {classification → megadetector/classification}/train_classifier_tf.py +10 -9
  17. {classification → megadetector/classification}/train_utils.py +3 -2
  18. {data_management → megadetector/data_management}/camtrap_dp_to_coco.py +4 -3
  19. {data_management → megadetector/data_management}/cct_json_utils.py +2 -2
  20. {data_management → megadetector/data_management}/cct_to_md.py +1 -1
  21. {data_management → megadetector/data_management}/coco_to_labelme.py +1 -1
  22. {data_management → megadetector/data_management}/coco_to_yolo.py +1 -1
  23. {data_management → megadetector/data_management}/databases/integrity_check_json_db.py +2 -2
  24. {data_management → megadetector/data_management}/get_image_sizes.py +4 -3
  25. {data_management → megadetector/data_management}/importers/auckland_doc_test_to_json.py +6 -5
  26. {data_management → megadetector/data_management}/importers/auckland_doc_to_json.py +4 -3
  27. {data_management → megadetector/data_management}/importers/awc_to_json.py +6 -4
  28. {data_management → megadetector/data_management}/importers/bellevue_to_json.py +3 -3
  29. {data_management → megadetector/data_management}/importers/cacophony-thermal-importer.py +4 -4
  30. {data_management → megadetector/data_management}/importers/carrizo_shrubfree_2018.py +5 -4
  31. {data_management → megadetector/data_management}/importers/carrizo_trail_cam_2017.py +8 -6
  32. {data_management → megadetector/data_management}/importers/cct_field_adjustments.py +2 -1
  33. {data_management → megadetector/data_management}/importers/channel_islands_to_cct.py +2 -2
  34. {data_management → megadetector/data_management}/importers/ena24_to_json.py +6 -5
  35. {data_management → megadetector/data_management}/importers/filenames_to_json.py +2 -1
  36. {data_management → megadetector/data_management}/importers/helena_to_cct.py +7 -6
  37. {data_management → megadetector/data_management}/importers/idaho-camera-traps.py +6 -6
  38. {data_management → megadetector/data_management}/importers/idfg_iwildcam_lila_prep.py +4 -4
  39. {data_management → megadetector/data_management}/importers/jb_csv_to_json.py +1 -1
  40. {data_management → megadetector/data_management}/importers/missouri_to_json.py +4 -3
  41. {data_management → megadetector/data_management}/importers/noaa_seals_2019.py +2 -2
  42. {data_management → megadetector/data_management}/importers/pc_to_json.py +5 -5
  43. {data_management → megadetector/data_management}/importers/prepare-noaa-fish-data-for-lila.py +3 -3
  44. {data_management → megadetector/data_management}/importers/prepare_zsl_imerit.py +3 -3
  45. {data_management → megadetector/data_management}/importers/rspb_to_json.py +2 -2
  46. {data_management → megadetector/data_management}/importers/save_the_elephants_survey_A.py +4 -4
  47. {data_management → megadetector/data_management}/importers/save_the_elephants_survey_B.py +6 -9
  48. {data_management → megadetector/data_management}/importers/snapshot_safari_importer.py +4 -4
  49. {data_management → megadetector/data_management}/importers/snapshot_safari_importer_reprise.py +2 -2
  50. {data_management → megadetector/data_management}/importers/snapshot_serengeti_lila.py +4 -4
  51. {data_management → megadetector/data_management}/importers/timelapse_csv_set_to_json.py +3 -3
  52. {data_management → megadetector/data_management}/importers/ubc_to_json.py +3 -3
  53. {data_management → megadetector/data_management}/importers/umn_to_json.py +2 -2
  54. {data_management → megadetector/data_management}/importers/wellington_to_json.py +3 -3
  55. {data_management → megadetector/data_management}/importers/wi_to_json.py +3 -2
  56. {data_management → megadetector/data_management}/labelme_to_coco.py +6 -7
  57. {data_management → megadetector/data_management}/labelme_to_yolo.py +2 -2
  58. {data_management → megadetector/data_management}/lila/add_locations_to_island_camera_traps.py +4 -4
  59. {data_management → megadetector/data_management}/lila/create_lila_blank_set.py +10 -9
  60. {data_management → megadetector/data_management}/lila/create_lila_test_set.py +3 -2
  61. {data_management → megadetector/data_management}/lila/create_links_to_md_results_files.py +1 -1
  62. {data_management → megadetector/data_management}/lila/download_lila_subset.py +5 -4
  63. {data_management → megadetector/data_management}/lila/generate_lila_per_image_labels.py +6 -5
  64. {data_management → megadetector/data_management}/lila/get_lila_annotation_counts.py +2 -2
  65. {data_management → megadetector/data_management}/lila/get_lila_image_counts.py +2 -1
  66. {data_management → megadetector/data_management}/lila/lila_common.py +5 -5
  67. {data_management → megadetector/data_management}/lila/test_lila_metadata_urls.py +2 -2
  68. {data_management → megadetector/data_management}/ocr_tools.py +6 -6
  69. {data_management → megadetector/data_management}/read_exif.py +2 -2
  70. {data_management → megadetector/data_management}/remap_coco_categories.py +1 -1
  71. {data_management → megadetector/data_management}/remove_exif.py +1 -1
  72. {data_management → megadetector/data_management}/resize_coco_dataset.py +4 -4
  73. {data_management → megadetector/data_management}/wi_download_csv_to_coco.py +3 -3
  74. {data_management → megadetector/data_management}/yolo_output_to_md_output.py +5 -5
  75. {data_management → megadetector/data_management}/yolo_to_coco.py +9 -9
  76. {detection → megadetector/detection}/process_video.py +9 -10
  77. {detection → megadetector/detection}/pytorch_detector.py +12 -8
  78. {detection → megadetector/detection}/run_detector.py +6 -6
  79. {detection → megadetector/detection}/run_detector_batch.py +12 -12
  80. {detection → megadetector/detection}/run_inference_with_yolov5_val.py +12 -12
  81. {detection → megadetector/detection}/run_tiled_inference.py +8 -9
  82. {detection → megadetector/detection}/tf_detector.py +3 -2
  83. {detection → megadetector/detection}/video_utils.py +2 -2
  84. {api/batch_processing → megadetector}/postprocessing/add_max_conf.py +1 -1
  85. {api/batch_processing → megadetector}/postprocessing/categorize_detections_by_size.py +1 -1
  86. {api/batch_processing → megadetector}/postprocessing/combine_api_outputs.py +1 -1
  87. {api/batch_processing → megadetector}/postprocessing/compare_batch_results.py +5 -5
  88. {api/batch_processing → megadetector}/postprocessing/convert_output_format.py +4 -5
  89. {api/batch_processing → megadetector}/postprocessing/load_api_results.py +1 -1
  90. {api/batch_processing → megadetector}/postprocessing/md_to_coco.py +3 -3
  91. {api/batch_processing → megadetector}/postprocessing/md_to_labelme.py +3 -3
  92. {api/batch_processing → megadetector}/postprocessing/merge_detections.py +1 -1
  93. {api/batch_processing → megadetector}/postprocessing/postprocess_batch_results.py +19 -21
  94. {api/batch_processing → megadetector}/postprocessing/remap_detection_categories.py +1 -1
  95. {api/batch_processing → megadetector}/postprocessing/render_detection_confusion_matrix.py +5 -6
  96. {api/batch_processing → megadetector}/postprocessing/repeat_detection_elimination/find_repeat_detections.py +3 -3
  97. {api/batch_processing → megadetector}/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +3 -2
  98. {api/batch_processing → megadetector}/postprocessing/repeat_detection_elimination/repeat_detections_core.py +11 -11
  99. {api/batch_processing → megadetector}/postprocessing/separate_detections_into_folders.py +3 -4
  100. {api/batch_processing → megadetector}/postprocessing/subset_json_detector_output.py +2 -2
  101. {api/batch_processing → megadetector}/postprocessing/top_folders_to_bottom.py +1 -1
  102. {taxonomy_mapping → megadetector/taxonomy_mapping}/map_lila_taxonomy_to_wi_taxonomy.py +2 -2
  103. {taxonomy_mapping → megadetector/taxonomy_mapping}/map_new_lila_datasets.py +2 -6
  104. {taxonomy_mapping → megadetector/taxonomy_mapping}/preview_lila_taxonomy.py +6 -7
  105. {taxonomy_mapping → megadetector/taxonomy_mapping}/retrieve_sample_image.py +1 -1
  106. {taxonomy_mapping → megadetector/taxonomy_mapping}/simple_image_download.py +2 -1
  107. {taxonomy_mapping → megadetector/taxonomy_mapping}/species_lookup.py +1 -1
  108. {taxonomy_mapping → megadetector/taxonomy_mapping}/taxonomy_csv_checker.py +1 -1
  109. {taxonomy_mapping → megadetector/taxonomy_mapping}/validate_lila_category_mappings.py +1 -1
  110. {md_utils → megadetector/utils}/azure_utils.py +7 -3
  111. {md_utils → megadetector/utils}/directory_listing.py +1 -1
  112. {md_utils → megadetector/utils}/md_tests.py +29 -29
  113. {md_utils → megadetector/utils}/split_locations_into_train_val.py +1 -1
  114. {md_utils → megadetector/utils}/write_html_image_list.py +1 -1
  115. {md_visualization → megadetector/visualization}/render_images_with_thumbnails.py +3 -3
  116. {md_visualization → megadetector/visualization}/visualization_utils.py +6 -7
  117. {md_visualization → megadetector/visualization}/visualize_db.py +3 -4
  118. {md_visualization → megadetector/visualization}/visualize_detector_output.py +9 -10
  119. {megadetector-5.0.10.dist-info → megadetector-5.0.12.dist-info}/LICENSE +0 -0
  120. {megadetector-5.0.10.dist-info → megadetector-5.0.12.dist-info}/METADATA +12 -11
  121. megadetector-5.0.12.dist-info/RECORD +199 -0
  122. megadetector-5.0.12.dist-info/top_level.txt +1 -0
  123. api/batch_processing/data_preparation/manage_local_batch.py +0 -2391
  124. api/batch_processing/data_preparation/manage_video_batch.py +0 -327
  125. api/synchronous/api_core/animal_detection_api/data_management/annotations/annotation_constants.py +0 -47
  126. api/synchronous/api_core/animal_detection_api/detection/detector_training/copy_checkpoints.py +0 -43
  127. api/synchronous/api_core/animal_detection_api/detection/process_video.py +0 -543
  128. api/synchronous/api_core/animal_detection_api/detection/pytorch_detector.py +0 -304
  129. api/synchronous/api_core/animal_detection_api/detection/run_detector.py +0 -627
  130. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +0 -1029
  131. api/synchronous/api_core/animal_detection_api/detection/run_inference_with_yolov5_val.py +0 -581
  132. api/synchronous/api_core/animal_detection_api/detection/run_tiled_inference.py +0 -754
  133. api/synchronous/api_core/animal_detection_api/detection/tf_detector.py +0 -165
  134. api/synchronous/api_core/animal_detection_api/detection/video_utils.py +0 -495
  135. api/synchronous/api_core/animal_detection_api/md_utils/azure_utils.py +0 -174
  136. api/synchronous/api_core/animal_detection_api/md_utils/ct_utils.py +0 -262
  137. api/synchronous/api_core/animal_detection_api/md_utils/directory_listing.py +0 -251
  138. api/synchronous/api_core/animal_detection_api/md_utils/matlab_porting_tools.py +0 -97
  139. api/synchronous/api_core/animal_detection_api/md_utils/path_utils.py +0 -416
  140. api/synchronous/api_core/animal_detection_api/md_utils/process_utils.py +0 -110
  141. api/synchronous/api_core/animal_detection_api/md_utils/sas_blob_utils.py +0 -509
  142. api/synchronous/api_core/animal_detection_api/md_utils/string_utils.py +0 -59
  143. api/synchronous/api_core/animal_detection_api/md_utils/url_utils.py +0 -144
  144. api/synchronous/api_core/animal_detection_api/md_utils/write_html_image_list.py +0 -226
  145. api/synchronous/api_core/animal_detection_api/md_visualization/visualization_utils.py +0 -841
  146. detection/detector_training/model_main_tf2.py +0 -114
  147. docs/source/conf.py +0 -43
  148. megadetector-5.0.10.dist-info/RECORD +0 -224
  149. megadetector-5.0.10.dist-info/top_level.txt +0 -8
  150. {api → megadetector/api}/__init__.py +0 -0
  151. {api → megadetector/api}/batch_processing/__init__.py +0 -0
  152. {api → megadetector/api}/batch_processing/api_core/__init__.py +0 -0
  153. {api → megadetector/api}/batch_processing/api_core/batch_service/__init__.py +0 -0
  154. {api → megadetector/api}/batch_processing/api_core/server.py +0 -0
  155. {api → megadetector/api}/batch_processing/api_core/server_api_config.py +0 -0
  156. {api → megadetector/api}/batch_processing/api_core/server_app_config.py +0 -0
  157. {api → megadetector/api}/batch_processing/api_core/server_batch_job_manager.py +0 -0
  158. {api → megadetector/api}/batch_processing/api_core/server_job_status_table.py +0 -0
  159. {api → megadetector/api}/batch_processing/api_core/server_orchestration.py +0 -0
  160. {api → megadetector/api}/batch_processing/api_core/server_utils.py +0 -0
  161. {api → megadetector/api}/batch_processing/api_core_support/__init__.py +0 -0
  162. {api → megadetector/api}/batch_processing/api_core_support/aggregate_results_manually.py +0 -0
  163. {api → megadetector/api}/batch_processing/api_support/__init__.py +0 -0
  164. {api → megadetector/api}/batch_processing/api_support/summarize_daily_activity.py +0 -0
  165. {api → megadetector/api}/batch_processing/data_preparation/__init__.py +0 -0
  166. {api → megadetector/api}/batch_processing/integration/digiKam/setup.py +0 -0
  167. {api → megadetector/api}/batch_processing/integration/digiKam/xmp_integration.py +0 -0
  168. {api → megadetector/api}/batch_processing/integration/eMammal/test_scripts/config_template.py +0 -0
  169. {api → megadetector/api}/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -0
  170. {api → megadetector/api}/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +0 -0
  171. {api/batch_processing/postprocessing → megadetector/api/synchronous}/__init__.py +0 -0
  172. {api/synchronous → megadetector/api/synchronous/api_core/animal_detection_api}/__init__.py +0 -0
  173. {api → megadetector/api}/synchronous/api_core/animal_detection_api/config.py +0 -0
  174. {api/synchronous/api_core/animal_detection_api → megadetector/api/synchronous/api_core/tests}/__init__.py +0 -0
  175. {api → megadetector/api}/synchronous/api_core/tests/load_test.py +0 -0
  176. {api/synchronous/api_core/tests → megadetector/classification}/__init__.py +0 -0
  177. {classification → megadetector/classification}/aggregate_classifier_probs.py +0 -0
  178. {classification → megadetector/classification}/csv_to_json.py +0 -0
  179. {classification → megadetector/classification}/efficientnet/__init__.py +0 -0
  180. {classification → megadetector/classification}/efficientnet/model.py +0 -0
  181. {classification → megadetector/classification}/efficientnet/utils.py +0 -0
  182. {classification → megadetector/classification}/identify_mislabeled_candidates.py +0 -0
  183. {classification → megadetector/classification}/prepare_classification_script.py +0 -0
  184. {classification → megadetector/classification}/prepare_classification_script_mc.py +0 -0
  185. {classification → megadetector/classification}/save_mislabeled.py +0 -0
  186. {classification → megadetector/data_management}/__init__.py +0 -0
  187. {data_management → megadetector/data_management/annotations}/__init__.py +0 -0
  188. {data_management → megadetector/data_management}/annotations/annotation_constants.py +0 -0
  189. {data_management → megadetector/data_management}/cct_to_wi.py +0 -0
  190. {data_management/annotations → megadetector/data_management/databases}/__init__.py +0 -0
  191. {data_management → megadetector/data_management}/databases/add_width_and_height_to_db.py +0 -0
  192. {data_management → megadetector/data_management}/databases/combine_coco_camera_traps_files.py +0 -0
  193. {data_management → megadetector/data_management}/databases/subset_json_db.py +0 -0
  194. {data_management → megadetector/data_management}/generate_crops_from_cct.py +0 -0
  195. {data_management → megadetector/data_management}/importers/add_nacti_sizes.py +0 -0
  196. {data_management → megadetector/data_management}/importers/add_timestamps_to_icct.py +0 -0
  197. {data_management → megadetector/data_management}/importers/animl_results_to_md_results.py +0 -0
  198. {data_management → megadetector/data_management}/importers/eMammal/copy_and_unzip_emammal.py +0 -0
  199. {data_management → megadetector/data_management}/importers/eMammal/eMammal_helpers.py +0 -0
  200. {data_management → megadetector/data_management}/importers/eMammal/make_eMammal_json.py +0 -0
  201. {data_management → megadetector/data_management}/importers/mcgill_to_json.py +0 -0
  202. {data_management → megadetector/data_management}/importers/nacti_fieldname_adjustments.py +0 -0
  203. {data_management → megadetector/data_management}/importers/plot_wni_giraffes.py +0 -0
  204. {data_management → megadetector/data_management}/importers/snapshotserengeti/make_full_SS_json.py +0 -0
  205. {data_management → megadetector/data_management}/importers/snapshotserengeti/make_per_season_SS_json.py +0 -0
  206. {data_management → megadetector/data_management}/importers/sulross_get_exif.py +0 -0
  207. {data_management → megadetector/data_management}/importers/zamba_results_to_md_results.py +0 -0
  208. {data_management/databases → megadetector/data_management/lila}/__init__.py +0 -0
  209. {data_management → megadetector/data_management}/lila/add_locations_to_nacti.py +0 -0
  210. {data_management/lila → megadetector/detection}/__init__.py +0 -0
  211. {detection → megadetector/detection/detector_training}/__init__.py +0 -0
  212. {api/synchronous/api_core/animal_detection_api → megadetector}/detection/detector_training/model_main_tf2.py +0 -0
  213. {detection/detector_training → megadetector/postprocessing}/__init__.py +0 -0
  214. {md_utils → megadetector/taxonomy_mapping}/__init__.py +0 -0
  215. {taxonomy_mapping → megadetector/taxonomy_mapping}/prepare_lila_taxonomy_release.py +0 -0
  216. {taxonomy_mapping → megadetector/taxonomy_mapping}/taxonomy_graph.py +0 -0
  217. {md_visualization → megadetector/utils}/__init__.py +0 -0
  218. {md_utils → megadetector/utils}/ct_utils.py +0 -0
  219. {md_utils → megadetector/utils}/path_utils.py +0 -0
  220. {md_utils → megadetector/utils}/process_utils.py +0 -0
  221. {md_utils → megadetector/utils}/sas_blob_utils.py +0 -0
  222. {md_utils → megadetector/utils}/string_utils.py +0 -0
  223. {md_utils → megadetector/utils}/url_utils.py +0 -0
  224. {taxonomy_mapping → megadetector/visualization}/__init__.py +0 -0
  225. {md_visualization → megadetector/visualization}/plot_utils.py +0 -0
  226. {megadetector-5.0.10.dist-info → megadetector-5.0.12.dist-info}/WHEEL +0 -0
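
Aside from version metadata, the substance of this release is a packaging refactor: the top-level packages from 5.0.10 (api, classification, data_management, detection, taxonomy_mapping, md_utils, md_visualization) move under a single megadetector namespace, with md_utils and md_visualization renamed to megadetector.utils and megadetector.visualization, and api/batch_processing/postprocessing promoted to megadetector.postprocessing. As a minimal sketch of what the rename table above means for calling code (the old imports appear verbatim in the deleted script below; the new paths are inferred from the renames, so treat them as illustrative rather than a documented API listing):

    # 5.0.10
    from md_utils import path_utils
    from detection.run_detector_batch import load_and_run_detector_batch
    from api.batch_processing.postprocessing.postprocess_batch_results import (
        PostProcessingOptions, process_batch_results)

    # 5.0.12
    from megadetector.utils import path_utils
    from megadetector.detection.run_detector_batch import load_and_run_detector_batch
    from megadetector.postprocessing.postprocess_batch_results import (
        PostProcessingOptions, process_batch_results)

The largest single deletion, shown below, is api/batch_processing/data_preparation/manage_local_batch.py (entry 123 above).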
@@ -1,2391 +0,0 @@
- """
-
- manage_local_batch.py
-
- Semi-automated process for managing a local MegaDetector job, including
- standard postprocessing steps.
-
- This script is not intended to be run from top to bottom like a typical Python script;
- it's a notebook disguised with a .py extension. It's the Bestest Most Awesome way to
- run MegaDetector, but it's also pretty subtle; if you want to play with this, you might
- want to check in with cameratraps@lila.science for some tips. Otherwise... YMMV.
-
- Some general notes on using this script, which I do in Spyder, though everything will be
- the same if you are reading this in Jupyter Notebook (using the .ipynb version of the
- script):
-
- * Typically when I have a MegaDetector job to run, I make a copy of this script. Let's
-   say I'm running a job for an organization called "bibblebop"; I have a big folder of
-   job-specific copies of this script, and I might save a new one called "bibblebop-2023-07-26.py"
-   (the filename doesn't matter, it just helps me keep these organized).
-
- * There are three variables you need to set in this script before you start running code:
-   "input_path", "organization_name_short", and "job_date". You will get a sensible error if you forget
-   to set any of these. In this case I might set those to "/data/bibblebopcamerastuff",
-   "bibblebop", and "2023-07-26", respectively.
-
- * The defaults assume you want to split the job into two tasks (this is the default because I have
-   two GPUs). Nothing bad will happen if you do this on a zero-GPU or single-GPU machine, but if you
-   want everything to run in one logical task, change "n_gpus" and "n_jobs" to 1 (instead of 2).
-
- * After setting the required variables, I run the first few cells - up to and including the one
-   called "Generate commands" - which collectively take basically zero seconds. After you run the
-   "Generate commands" cell, you will have a folder that looks something like:
-
-   ~/postprocessing/bibblebop/bibblebop-2023-07-26-mdv5a/
-
-   On Windows, this means:
-
-   c:\users\[username]\postprocessing\bibblebop\bibblebop-2023-07-26-mdv5a\
-
-   Everything related to this job - scripts, outputs, intermediate stuff - will be in this folder.
-   Specifically, after the "Generate commands" cell, you'll have scripts in that folder called something
-   like:
-
-   run_chunk_000_gpu_00.sh (or .bat on Windows)
-
-   Personally, I like to run that script directly in a command prompt (I just leave Spyder open, though
-   it's OK if Spyder gets shut down while MD is running).
-
-   At this point, once you get the hang of it, you've invested about zero seconds of human time,
-   but possibly several days of unattended compute time, depending on the size of your job.
-
- * Then when the jobs are done, back to the interactive environment! I run the next few cells,
-   which make sure the job finished OK, and the cell called "Post-processing (pre-RDE)", which
-   generates an HTML preview of the results. You are very plausibly done at this point, and can ignore
-   all the remaining cells. If you want to do things like repeat detection elimination, or running
-   a classifier, or splitting your results file up in specialized ways, there are cells for all of those
-   things, but now you're in power-user territory, so I'm going to leave this guide here. Email
-   cameratraps@lila.science with questions about the fancy stuff.
-
- """
-
- #%% Imports and constants
-
- import json
- import os
- import stat
- import time
- import re
-
- import humanfriendly
-
- from tqdm import tqdm
- from collections import defaultdict
-
- from md_utils import path_utils
- from md_utils.ct_utils import is_list_sorted
- from md_utils.ct_utils import split_list_into_n_chunks
-
- from detection.run_detector_batch import load_and_run_detector_batch, write_results_to_file
- from detection.run_detector import DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
- from detection.run_detector import estimate_md_images_per_second
-
- from api.batch_processing.postprocessing.postprocess_batch_results import (
-     PostProcessingOptions, process_batch_results)
- from detection.run_detector import get_detector_version_from_filename
- from md_utils.ct_utils import image_file_to_camera_folder
-
- ## Inference options
-
- # To specify a non-default confidence threshold for including detections in the .json file
- json_threshold = None
-
- # Turn warnings into errors if more than this many images are missing
- max_tolerable_failed_images = 100
-
- # Should we supply the --image_queue_option to run_detector_batch.py? I only set this
- # when I have a very slow drive and a comparably fast GPU. When this is enabled, checkpointing
- # is not supported within a job, so I set n_jobs to a large number (typically 100).
- use_image_queue = False
-
- # Only relevant when we're using a single GPU
- default_gpu_number = 0
-
- # Should we supply --quiet to run_detector_batch.py?
- quiet_mode = True
-
- # Specify a target image size when running MD... strongly recommended to leave this at "None"
- #
- # When using augmented inference, if you leave this at "None", run_inference_with_yolov5_val.py
- # will use its default size, which is 1280 * 1.3, which is almost always what you want.
- image_size = None
-
- # Should we include image size, timestamp, and/or EXIF data in MD output?
- include_image_size = False
- include_image_timestamp = False
- include_exif_data = False
-
- # Only relevant when running on CPU
- ncores = 1
-
- # OS-specific script line continuation character (modified later if we're running on Windows)
- slcc = '\\'
-
- # OS-specific script comment character (modified later if we're running on Windows)
- scc = '#'
-
- # OS-specific script extension (modified later if we're running on Windows)
- script_extension = '.sh'
-
- # If False, we'll load chunk files with file lists if they exist
- force_enumeration = False
-
- # Prefer threads on Windows, processes on Linux
- parallelization_defaults_to_threads = False
-
- # This is for things like image rendering, not for MegaDetector
- default_workers_for_parallel_tasks = 30
-
- overwrite_handling = 'skip' # 'skip', 'error', or 'overwrite'
-
- # Only relevant to repeat detection elimination; try to identify EK113/RCNX101-style
- # overflow folders and treat them as the same camera
- overflow_folder_handling_enabled = True
-
- # The function used to get camera names from image paths; can also replace this
- # with a custom function.
- relative_path_to_location = image_file_to_camera_folder
-
- # This will be the .json results file after RDE; if this is still None when
- # we get to classification stuff, that will indicate that we didn't do RDE.
- filtered_output_filename = None
-
- if os.name == 'nt':
-
-     slcc = '^'
-     scc = 'REM'
-     script_extension = '.bat'
-
-     # My experience has been that Python multiprocessing is flaky on Windows, so
-     # default to threads on Windows
-     parallelization_defaults_to_threads = True
-     default_workers_for_parallel_tasks = 10
-
-
- ## Constants related to using YOLOv5's val.py
-
- # Should we use YOLOv5's val.py instead of run_detector_batch.py?
- use_yolo_inference_scripts = False
-
- # Directory in which to run val.py (relevant for YOLOv5, not for YOLOv8)
- yolo_working_dir = os.path.expanduser('~/git/yolov5')
-
- # Only used for loading the mapping from class indices to names
- yolo_dataset_file = None
-
- # 'yolov5' or 'yolov8'; assumes YOLOv5 if this is None
- yolo_model_type = None
-
- # inference batch size
- yolo_batch_size = 1
-
- # Should we remove intermediate files used for running YOLOv5's val.py?
- #
- # Only relevant if use_yolo_inference_scripts is True.
- remove_yolo_intermediate_results = True
- remove_yolo_symlink_folder = True
- use_symlinks_for_yolo_inference = True
- write_yolo_debug_output = False
-
- # Should we apply YOLOv5's test-time augmentation?
- augment = False
-
-
- ## Constants related to tiled inference
-
- use_tiled_inference = False
-
- # Should we delete tiles after each job? Only set this to False for debugging;
- # large jobs will take up a lot of space if you keep tiles around after each task.
- remove_tiles = True
- tile_size = (1280,1280)
- tile_overlap = 0.2
-
-
- #%% Constants I set per script
-
- input_path = '/drive/organization'
-
- assert not (input_path.endswith('/') or input_path.endswith('\\'))
- assert os.path.isdir(input_path), 'Could not find input folder {}'.format(input_path)
- input_path = input_path.replace('\\','/')
-
- organization_name_short = 'organization'
- job_date = None # '2024-01-01'
- assert job_date is not None and organization_name_short != 'organization'
-
- # Optional descriptor
- job_tag = None
-
- if job_tag is None:
-     job_description_string = ''
- else:
-     job_description_string = '-' + job_tag
-
- model_file = 'MDV5A' # 'MDV5A', 'MDV5B', 'MDV4'
-
- postprocessing_base = os.path.expanduser('~/postprocessing')
-
- # Number of jobs to split data into, typically equal to the number of available GPUs, though
- # when using augmentation or an image queue (and thus not using checkpoints), I typically
- # use ~100 jobs per GPU; those serve as de facto checkpoints.
- n_jobs = 2
- n_gpus = 2
-
- # Set to "None" when using augmentation or an image queue, which don't currently support
- # checkpointing. Don't worry, this will be assert()'d in the next cell.
- checkpoint_frequency = 10000
-
- # Estimate inference speed for the current GPU
- approx_images_per_second = estimate_md_images_per_second(model_file)
-
- # Rough estimate for the inference time cost of augmentation
- if augment and (approx_images_per_second is not None):
-     approx_images_per_second = approx_images_per_second * 0.7
-
- base_task_name = organization_name_short + '-' + job_date + job_description_string + '-' + \
-     get_detector_version_from_filename(model_file)
- base_output_folder_name = os.path.join(postprocessing_base,organization_name_short)
- os.makedirs(base_output_folder_name,exist_ok=True)
-
-
- #%% Derived variables, constant validation, path setup
-
- if use_image_queue:
-     assert checkpoint_frequency is None,\
-         'Checkpointing is not supported when using an image queue'
-
- if augment:
-     assert checkpoint_frequency is None,\
-         'Checkpointing is not supported when using augmentation'
-
-     assert use_yolo_inference_scripts,\
-         'Augmentation is only supported when running with the YOLO inference scripts'
-
- if use_tiled_inference:
-     assert not augment, \
-         'Augmentation is not supported when using tiled inference'
-     assert not use_yolo_inference_scripts, \
-         'Using the YOLO inference script is not supported when using tiled inference'
-     assert checkpoint_frequency is None, \
-         'Checkpointing is not supported when using tiled inference'
-
- filename_base = os.path.join(base_output_folder_name, base_task_name)
- combined_api_output_folder = os.path.join(filename_base, 'combined_api_outputs')
- postprocessing_output_folder = os.path.join(filename_base, 'preview')
-
- combined_api_output_file = os.path.join(
-     combined_api_output_folder,
-     '{}_detections.json'.format(base_task_name))
-
- os.makedirs(filename_base, exist_ok=True)
- os.makedirs(combined_api_output_folder, exist_ok=True)
- os.makedirs(postprocessing_output_folder, exist_ok=True)
-
- if input_path.endswith('/'):
-     input_path = input_path[0:-1]
-
- print('Output folder:\n{}'.format(filename_base))
-
-
- #%% Enumerate files
-
- # Have we already listed files for this job?
- chunk_files = os.listdir(filename_base)
- pattern = re.compile(r'chunk\d+\.json')
- chunk_files = [fn for fn in chunk_files if pattern.match(fn)]
-
- if (not force_enumeration) and (len(chunk_files) > 0):
-
-     print('Found {} chunk files in folder {}, bypassing enumeration'.format(
-         len(chunk_files),
-         filename_base))
-
-     all_images = []
-     for fn in chunk_files:
-         with open(os.path.join(filename_base,fn),'r') as f:
-             chunk = json.load(f)
-             assert isinstance(chunk,list)
-             all_images.extend(chunk)
-     all_images = sorted(all_images)
-
-     print('Loaded {} image files from {} chunks in {}'.format(
-         len(all_images),len(chunk_files),filename_base))
-
- else:
-
-     print('Enumerating image files in {}'.format(input_path))
-
-     all_images = sorted(path_utils.find_images(input_path,recursive=True,convert_slashes=True))
-
-     # It's common to run this notebook on an external drive with the main folders in the drive root
-     all_images = [fn for fn in all_images if not \
-         (fn.startswith('$RECYCLE') or fn.startswith('System Volume Information'))]
-
-     print('')
-
-     print('Enumerated {} image files in {}'.format(len(all_images),input_path))
-
-
- #%% Divide images into chunks
-
- folder_chunks = split_list_into_n_chunks(all_images,n_jobs)
-
-
- #%% Estimate total time
-
- if approx_images_per_second is None:
-
-     print("Can't estimate inference time for the current environment")
-
- else:
-
-     n_images = len(all_images)
-     execution_seconds = n_images / approx_images_per_second
-     wallclock_seconds = execution_seconds / n_gpus
-     print('Expected time: {}'.format(humanfriendly.format_timespan(wallclock_seconds)))
-
-     seconds_per_chunk = len(folder_chunks[0]) / approx_images_per_second
-     print('Expected time per chunk: {}'.format(humanfriendly.format_timespan(seconds_per_chunk)))
-
-
- #%% Write file lists
-
- task_info = []
-
- for i_chunk,chunk_list in enumerate(folder_chunks):
-
-     chunk_fn = os.path.join(filename_base,'chunk{}.json'.format(str(i_chunk).zfill(3)))
-     task_info.append({'id':i_chunk,'input_file':chunk_fn})
-     path_utils.write_list_to_file(chunk_fn, chunk_list)
-
-
- #%% Generate commands
-
- # A list of the scripts tied to each GPU, as absolute paths. We'll write this out at
- # the end so each GPU's list of commands can be run at once
- gpu_to_scripts = defaultdict(list)
-
- # i_task = 0; task = task_info[i_task]
- for i_task,task in enumerate(task_info):
-
-     chunk_file = task['input_file']
-     checkpoint_filename = chunk_file.replace('.json','_checkpoint.json')
-
-     output_fn = chunk_file.replace('.json','_results.json')
-
-     task['output_file'] = output_fn
-
-     if n_gpus > 1:
-         gpu_number = i_task % n_gpus
-     else:
-         gpu_number = default_gpu_number
-
-     image_size_string = ''
-     if image_size is not None:
-         image_size_string = '--image_size {}'.format(image_size)
-
-     # Generate the script to run MD
-
-     if use_yolo_inference_scripts:
-
-         augment_string = ''
-         if augment:
-             augment_string = '--augment_enabled 1'
-         else:
-             augment_string = '--augment_enabled 0'
-
-         batch_string = '--batch_size {}'.format(yolo_batch_size)
-
-         symlink_folder = os.path.join(filename_base,'symlinks','symlinks_{}'.format(
-             str(i_task).zfill(3)))
-         yolo_results_folder = os.path.join(filename_base,'yolo_results','yolo_results_{}'.format(
-             str(i_task).zfill(3)))
-
-         symlink_folder_string = '--symlink_folder "{}"'.format(symlink_folder)
-         yolo_results_folder_string = '--yolo_results_folder "{}"'.format(yolo_results_folder)
-
-         remove_symlink_folder_string = ''
-         if not remove_yolo_symlink_folder:
-             remove_symlink_folder_string = '--no_remove_symlink_folder'
-
-         write_yolo_debug_output_string = ''
-         if write_yolo_debug_output:
-             write_yolo_debug_output_string = '--write_yolo_debug_output'
-
-         remove_yolo_results_string = ''
-         if not remove_yolo_intermediate_results:
-             remove_yolo_results_string = '--no_remove_yolo_results_folder'
-
-         confidence_threshold_string = ''
-         if json_threshold is not None:
-             confidence_threshold_string = '--conf_thres {}'.format(json_threshold)
-         else:
-             confidence_threshold_string = '--conf_thres {}'.format(DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD)
-
-         cmd = ''
-
-         device_string = '--device {}'.format(gpu_number)
-
-         overwrite_handling_string = '--overwrite_handling {}'.format(overwrite_handling)
-
-         cmd += f'python run_inference_with_yolov5_val.py "{model_file}" "{chunk_file}" "{output_fn}" '
-         cmd += f'{image_size_string} {augment_string} '
-         cmd += f'{symlink_folder_string} {yolo_results_folder_string} {remove_yolo_results_string} '
-         cmd += f'{remove_symlink_folder_string} {confidence_threshold_string} {device_string} '
-         cmd += f'{overwrite_handling_string} {batch_string} {write_yolo_debug_output_string}'
-
-         if yolo_working_dir is not None:
-             cmd += f' --yolo_working_folder "{yolo_working_dir}"'
-         if yolo_dataset_file is not None:
-             cmd += ' --yolo_dataset_file "{}"'.format(yolo_dataset_file)
-         if yolo_model_type is not None:
-             cmd += ' --model_type {}'.format(yolo_model_type)
-
-         if not use_symlinks_for_yolo_inference:
-             cmd += ' --no_use_symlinks'
-
-         cmd += '\n'
-
-     elif use_tiled_inference:
-
-         tiling_folder = os.path.join(filename_base,'tile_cache','tile_cache_{}'.format(
-             str(i_task).zfill(3)))
-
-         if os.name == 'nt':
-             cuda_string = f'set CUDA_VISIBLE_DEVICES={gpu_number} & '
-         else:
-             cuda_string = f'CUDA_VISIBLE_DEVICES={gpu_number} '
-
-         cmd = f'{cuda_string} python run_tiled_inference.py "{model_file}" "{input_path}" "{tiling_folder}" "{output_fn}"'
-
-         cmd += f' --image_list "{chunk_file}"'
-         cmd += f' --overwrite_handling {overwrite_handling}'
-
-         if not remove_tiles:
-             cmd += ' --no_remove_tiles'
-
-         # If we're using non-default tile sizes
-         if tile_size is not None and (tile_size[0] > 0 or tile_size[1] > 0):
-             cmd += ' --tile_size_x {} --tile_size_y {}'.format(tile_size[0],tile_size[1])
-
-         if tile_overlap is not None:
-             cmd += f' --tile_overlap {tile_overlap}'
-
-     else:
-
-         if os.name == 'nt':
-             cuda_string = f'set CUDA_VISIBLE_DEVICES={gpu_number} & '
-         else:
-             cuda_string = f'CUDA_VISIBLE_DEVICES={gpu_number} '
-
-         checkpoint_frequency_string = ''
-         checkpoint_path_string = ''
-
-         if checkpoint_frequency is not None and checkpoint_frequency > 0:
-             checkpoint_frequency_string = f'--checkpoint_frequency {checkpoint_frequency}'
-             checkpoint_path_string = '--checkpoint_path "{}"'.format(checkpoint_filename)
-
-         use_image_queue_string = ''
-         if (use_image_queue):
-             use_image_queue_string = '--use_image_queue'
-
-         ncores_string = ''
-         if (ncores > 1):
-             ncores_string = '--ncores {}'.format(ncores)
-
-         quiet_string = ''
-         if quiet_mode:
-             quiet_string = '--quiet'
-
-         confidence_threshold_string = ''
-         if json_threshold is not None:
-             confidence_threshold_string = '--threshold {}'.format(json_threshold)
-
-         overwrite_handling_string = '--overwrite_handling {}'.format(overwrite_handling)
-         cmd = f'{cuda_string} python run_detector_batch.py "{model_file}" "{chunk_file}" "{output_fn}" {checkpoint_frequency_string} {checkpoint_path_string} {use_image_queue_string} {ncores_string} {quiet_string} {image_size_string} {confidence_threshold_string} {overwrite_handling_string}'
-
-         if include_image_size:
-             cmd += ' --include_image_size'
-         if include_image_timestamp:
-             cmd += ' --include_image_timestamp'
-         if include_exif_data:
-             cmd += ' --include_exif_data'
-
-     cmd_file = os.path.join(filename_base,'run_chunk_{}_gpu_{}{}'.format(str(i_task).zfill(3),
-         str(gpu_number).zfill(2),script_extension))
-
-     with open(cmd_file,'w') as f:
-         f.write(cmd + '\n')
-
-     st = os.stat(cmd_file)
-     os.chmod(cmd_file, st.st_mode | stat.S_IEXEC)
-
-     task['command'] = cmd
-     task['command_file'] = cmd_file
-
-     # Generate the script to resume from the checkpoint (only supported with MD inference code)
-
-     gpu_to_scripts[gpu_number].append(cmd_file)
-
-     if checkpoint_frequency is not None:
-
-         resume_string = ' --resume_from_checkpoint "{}"'.format(checkpoint_filename)
-         resume_cmd = cmd + resume_string
-
-         resume_cmd_file = os.path.join(filename_base,
-             'resume_chunk_{}_gpu_{}{}'.format(str(i_task).zfill(3),
-                 str(gpu_number).zfill(2),script_extension))
-
-         with open(resume_cmd_file,'w') as f:
-             f.write(resume_cmd + '\n')
-
-         st = os.stat(resume_cmd_file)
-         os.chmod(resume_cmd_file, st.st_mode | stat.S_IEXEC)
-
-         task['resume_command'] = resume_cmd
-         task['resume_command_file'] = resume_cmd_file
-
- # ...for each task
-
- # Write out a script for each GPU that runs all of the commands associated with
- # that GPU. Typically only used when running lots of little scripts in lieu
- # of checkpointing.
- for gpu_number in gpu_to_scripts:
-
-     gpu_script_file = os.path.join(filename_base,'run_all_for_gpu_{}{}'.format(
-         str(gpu_number).zfill(2),script_extension))
-     with open(gpu_script_file,'w') as f:
-         for script_name in gpu_to_scripts[gpu_number]:
-             s = script_name
-             # When calling a series of batch files on Windows from within a batch file, you need to
-             # use "call", or only the first will be executed. No, it doesn't make sense.
-             if os.name == 'nt':
-                 s = 'call ' + s
-             f.write(s + '\n')
-         f.write('echo "Finished all commands for GPU {}"'.format(gpu_number))
-     st = os.stat(gpu_script_file)
-     os.chmod(gpu_script_file, st.st_mode | stat.S_IEXEC)
-
- # ...for each GPU
-
-
- #%% Run the tasks
-
- r"""
- The cells we've run so far wrote out some shell scripts (.bat files on Windows,
- .sh files on Linux/Mac) that will run MegaDetector. I like to leave the interactive
- environment at this point and run those scripts at the command line. So, for example,
- if you're on Windows, and you've basically used the default values above, there will be
- batch files called, e.g.:
-
- c:\users\[username]\postprocessing\[organization]\[job_name]\run_chunk_000_gpu_00.bat
- c:\users\[username]\postprocessing\[organization]\[job_name]\run_chunk_001_gpu_01.bat
-
- Those batch files expect to be run from the "detection" folder of the MegaDetector repo,
- typically:
-
- c:\git\MegaDetector\detection
-
- All of that said, you don't *have* to do this at the command line. The following cell
- runs these scripts programmatically, so if you just run the "run the tasks (commented out)"
- cell, you should be running MegaDetector.
-
- One downside of the programmatic approach is that this cell doesn't yet parallelize over
- multiple processes, so the tasks will run serially. This only matters if you have multiple
- GPUs.
- """
-
- run_tasks_in_notebook = False
-
- if run_tasks_in_notebook:
-
-     assert not use_yolo_inference_scripts, \
-         'If you want to use the YOLOv5 inference scripts, you can\'t run the model interactively (yet)'
-
-     # i_task = 0; task = task_info[i_task]
-     for i_task,task in enumerate(task_info):
-
-         chunk_file = task['input_file']
-         output_fn = task['output_file']
-
-         checkpoint_filename = chunk_file.replace('.json','_checkpoint.json')
-
-         if json_threshold is not None:
-             confidence_threshold = json_threshold
-         else:
-             confidence_threshold = DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
-
-         if checkpoint_frequency is not None and checkpoint_frequency > 0:
-             cp_freq_arg = checkpoint_frequency
-         else:
-             cp_freq_arg = -1
-
-         start_time = time.time()
-         results = load_and_run_detector_batch(model_file=model_file,
-                                               image_file_names=chunk_file,
-                                               checkpoint_path=checkpoint_filename,
-                                               confidence_threshold=confidence_threshold,
-                                               checkpoint_frequency=cp_freq_arg,
-                                               results=None,
-                                               n_cores=ncores,
-                                               use_image_queue=use_image_queue,
-                                               quiet=quiet_mode,
-                                               image_size=image_size)
-         elapsed = time.time() - start_time
-
-         print('Task {}: finished inference for {} images in {}'.format(
-             i_task, len(results),humanfriendly.format_timespan(elapsed)))
-
-         # This will write absolute paths to the file, we'll fix this later
-         write_results_to_file(results, output_fn, detector_file=model_file)
-
-         if checkpoint_frequency is not None and checkpoint_frequency > 0:
-             if os.path.isfile(checkpoint_filename):
-                 os.remove(checkpoint_filename)
-                 print('Deleted checkpoint file {}'.format(checkpoint_filename))
-
-     # ...for each chunk
-
- # ...if we're running tasks in this notebook
-
-
- #%% Load results, look for failed or missing images in each task
-
- # Check that all task output files exist
-
- missing_output_files = []
-
- # i_task = 0; task = task_info[i_task]
- for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):
-     output_file = task['output_file']
-     if not os.path.isfile(output_file):
-         missing_output_files.append(output_file)
-
- if len(missing_output_files) > 0:
-     print('Missing {} output files:'.format(len(missing_output_files)))
-     for s in missing_output_files:
-         print(s)
-     raise Exception('Missing output files')
-
-
- n_total_failures = 0
-
- # i_task = 0; task = task_info[i_task]
- for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):
-
-     chunk_file = task['input_file']
-     output_file = task['output_file']
-
-     with open(chunk_file,'r') as f:
-         task_images = json.load(f)
-     with open(output_file,'r') as f:
-         task_results = json.load(f)
-
-     task_images_set = set(task_images)
-     filename_to_results = {}
-
-     n_task_failures = 0
-
-     # im = task_results['images'][0]
-     for im in task_results['images']:
-
-         # Most of the time, inference result files use absolute paths, but it's
-         # getting annoying to make sure that's *always* true, so handle both here.
-         # E.g., when using tiled inference, paths will be relative.
-         if not os.path.isabs(im['file']):
-             fn = os.path.join(input_path,im['file']).replace('\\','/')
-             im['file'] = fn
-         assert im['file'].startswith(input_path)
-         assert im['file'] in task_images_set
-         filename_to_results[im['file']] = im
-         if 'failure' in im:
-             assert im['failure'] is not None
-             n_task_failures += 1
-
-     task['n_failures'] = n_task_failures
-     task['results'] = task_results
-
-     for fn in task_images:
-         assert fn in filename_to_results, \
-             'File {} not found in results for task {}'.format(fn,i_task)
-
-     n_total_failures += n_task_failures
-
- # ...for each task
-
- assert n_total_failures < max_tolerable_failed_images,\
-     '{} failures (max tolerable set to {})'.format(n_total_failures,
-         max_tolerable_failed_images)
-
- print('Processed all {} images with {} failures'.format(
-     len(all_images),n_total_failures))
-
-
- ##%% Merge results files and make filenames relative
-
- combined_results = {}
- combined_results['images'] = []
- images_processed = set()
-
- for i_task,task in tqdm(enumerate(task_info),total=len(task_info)):
-
-     task_results = task['results']
-
-     if i_task == 0:
-         combined_results['info'] = task_results['info']
-         combined_results['detection_categories'] = task_results['detection_categories']
-     else:
-         assert task_results['info']['format_version'] == combined_results['info']['format_version']
-         assert task_results['detection_categories'] == combined_results['detection_categories']
-
-     # Make sure we didn't see this image in another chunk
-     for im in task_results['images']:
-         assert im['file'] not in images_processed
-         images_processed.add(im['file'])
-
-     combined_results['images'].extend(task_results['images'])
-
- # Check that we ended up with the right number of images
- assert len(combined_results['images']) == len(all_images), \
-     'Expected {} images in combined results, found {}'.format(
-         len(all_images),len(combined_results['images']))
-
- # Check uniqueness
- result_filenames = [im['file'] for im in combined_results['images']]
- assert len(combined_results['images']) == len(set(result_filenames))
-
- # Convert to relative paths, preserving '/' as the path separator, regardless of OS
- for im in combined_results['images']:
-     assert '\\' not in im['file']
-     assert im['file'].startswith(input_path)
-     if input_path.endswith(':'):
-         im['file'] = im['file'].replace(input_path,'',1)
-     else:
-         im['file'] = im['file'].replace(input_path + '/','',1)
-
- with open(combined_api_output_file,'w') as f:
-     json.dump(combined_results,f,indent=1)
-
- print('Wrote results to {}'.format(combined_api_output_file))
-
-
774
- #%% Post-processing (pre-RDE)
775
-
776
- render_animals_only = False
777
-
778
- options = PostProcessingOptions()
779
- options.image_base_dir = input_path
780
- options.include_almost_detections = True
781
- options.num_images_to_sample = 7500
782
- options.confidence_threshold = 0.2
783
- options.almost_detection_confidence_threshold = options.confidence_threshold - 0.05
784
- options.ground_truth_json_file = None
785
- options.separate_detections_by_category = True
786
- options.sample_seed = 0
787
- options.max_figures_per_html_file = 2500
788
-
789
- options.parallelize_rendering = True
790
- options.parallelize_rendering_n_cores = default_workers_for_parallel_tasks
791
- options.parallelize_rendering_with_threads = parallelization_defaults_to_threads
792
-
793
- if render_animals_only:
794
- # Omit some pages from the output, useful when animals are rare
795
- options.rendering_bypass_sets = ['detections_person','detections_vehicle',
796
- 'detections_person_vehicle','non_detections']
797
-
798
- output_base = os.path.join(postprocessing_output_folder,
799
- base_task_name + '_{:.3f}'.format(options.confidence_threshold))
800
- if render_animals_only:
801
- output_base = output_base + '_animals_only'
802
-
803
- os.makedirs(output_base, exist_ok=True)
804
- print('Processing to {}'.format(output_base))
805
-
806
- options.md_results_file = combined_api_output_file
807
- options.output_dir = output_base
808
- ppresults = process_batch_results(options)
809
- html_output_file = ppresults.output_html_file
810
- path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
811
- # import clipboard; clipboard.copy(html_output_file)
812
-
813
-
814
- #%% Repeat detection elimination, phase 1
815
-
816
- # Deliberately leaving these imports here, rather than at the top, because this
817
- # cell is not typically executed
818
- from api.batch_processing.postprocessing.repeat_detection_elimination import repeat_detections_core
819
- task_index = 0
820
-
821
- options = repeat_detections_core.RepeatDetectionOptions()
822
-
823
- options.confidenceMin = 0.1
824
- options.confidenceMax = 1.01
825
- options.iouThreshold = 0.85
826
- options.occurrenceThreshold = 15
827
- options.maxSuspiciousDetectionSize = 0.2
828
- # options.minSuspiciousDetectionSize = 0.05
829
-
830
- options.parallelizationUsesThreads = parallelization_defaults_to_threads
831
- options.nWorkers = default_workers_for_parallel_tasks
832
-
833
- # This will cause a very light gray box to get drawn around all the detections
834
- # we're *not* considering as suspicious.
835
- options.bRenderOtherDetections = True
836
- options.otherDetectionsThreshold = options.confidenceMin
837
-
838
- options.bRenderDetectionTiles = True
839
- options.maxOutputImageWidth = 2000
840
- options.detectionTilesMaxCrops = 250
841
-
842
- # options.lineThickness = 5
843
- # options.boxExpansion = 8
844
-
845
- # To invoke custom collapsing of folders for a particular manufacturer's naming scheme
846
- options.customDirNameFunction = relative_path_to_location
847
-
848
- options.bRenderHtml = False
849
- options.imageBase = input_path
850
- rde_string = 'rde_{:.3f}_{:.3f}_{}_{:.3f}'.format(
851
- options.confidenceMin, options.iouThreshold,
852
- options.occurrenceThreshold, options.maxSuspiciousDetectionSize)
853
- options.outputBase = os.path.join(filename_base, rde_string + '_task_{}'.format(task_index))
854
- options.filenameReplacements = None # {'':''}
855
-
856
- # Exclude people and vehicles from RDE
857
- # options.excludeClasses = [2,3]
858
-
859
- # options.maxImagesPerFolder = 50000
860
- # options.includeFolders = ['a/b/c']
861
- # options.excludeFolder = ['a/b/c']
862
-
863
- options.debugMaxDir = -1
864
- options.debugMaxRenderDir = -1
865
- options.debugMaxRenderDetection = -1
866
- options.debugMaxRenderInstance = -1
867
-
868
- # Can be None, 'xsort', or 'clustersort'
869
- options.smartSort = 'xsort'
870
-
871
- suspicious_detection_results = repeat_detections_core.find_repeat_detections(combined_api_output_file,
872
- outputFilename=None,
873
- options=options)
874
-
875
-
876
- #%% Manual RDE step
877
-
878
- ## DELETE THE VALID DETECTIONS ##
879
-
880
- # If you run this line, it will open the folder up in your file browser
881
- path_utils.open_file(os.path.dirname(suspicious_detection_results.filterFile),
882
- attempt_to_open_in_wsl_host=True)
883
-
884
- #
885
- # If you ran the previous cell, but then you change your mind and you don't want to do
886
- # the RDE step, that's fine, but don't just blast through this cell once you've run the
887
- # previous cell. If you do that, you're implicitly telling the notebook that you looked
888
- # at everything in that folder, and confirmed there were no red boxes on animals.
889
- #
890
- # Instead, either change "filtered_output_filename" below to "combined_api_output_file",
891
- # or delete *all* the images in the filtering folder.
892
- #
893
-
894
-
895
- #%% Re-filtering
896
-
897
- from api.batch_processing.postprocessing.repeat_detection_elimination import remove_repeat_detections
898
-
899
- filtered_output_filename = path_utils.insert_before_extension(combined_api_output_file,
900
- 'filtered_{}'.format(rde_string))
901
-
902
- remove_repeat_detections.remove_repeat_detections(
903
- inputFile=combined_api_output_file,
904
- outputFile=filtered_output_filename,
905
- filteringDir=os.path.dirname(suspicious_detection_results.filterFile)
906
- )
907
-
908
-
909
- #%% Post-processing (post-RDE)
-
- render_animals_only = False
-
- options = PostProcessingOptions()
- options.image_base_dir = input_path
- options.include_almost_detections = True
- options.num_images_to_sample = 7500
- options.confidence_threshold = 0.2
- options.almost_detection_confidence_threshold = options.confidence_threshold - 0.05
- options.ground_truth_json_file = None
- options.separate_detections_by_category = True
- options.sample_seed = 0
- options.max_figures_per_html_file = 5000
-
- options.parallelize_rendering = True
- options.parallelize_rendering_n_cores = default_workers_for_parallel_tasks
- options.parallelize_rendering_with_threads = parallelization_defaults_to_threads
-
- if render_animals_only:
-     # Omit some pages from the output, useful when animals are rare
-     options.rendering_bypass_sets = ['detections_person','detections_vehicle',
-                                      'detections_person_vehicle','non_detections']
-
- output_base = os.path.join(postprocessing_output_folder,
-                            base_task_name + '_{}_{:.3f}'.format(rde_string, options.confidence_threshold))
-
- if render_animals_only:
-     output_base = output_base + '_render_animals_only'
- os.makedirs(output_base, exist_ok=True)
-
- print('Processing post-RDE to {}'.format(output_base))
-
- options.md_results_file = filtered_output_filename
- options.output_dir = output_base
- ppresults = process_batch_results(options)
- html_output_file = ppresults.output_html_file
-
- path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
- # import clipboard; clipboard.copy(html_output_file)
-
-
- #%% Run MegaClassifier (actually, write out a script that runs MegaClassifier)
-
- # Variables that will indicate which classifiers we ran
- final_output_path_mc = None
- final_output_path_ic = None
-
- # If we didn't do RDE
- if filtered_output_filename is None:
-     print("Warning: it looks like you didn't do RDE, using the raw output file")
-     filtered_output_filename = combined_api_output_file
-
- classifier_name_short = 'megaclassifier'
- threshold_str = '0.15' # 0.6
- classifier_name = 'megaclassifier_v0.1_efficientnet-b3'
-
- organization_name = organization_name_short
- job_name = base_task_name
- input_filename = filtered_output_filename # combined_api_output_file
- input_files = [input_filename]
- image_base = input_path
- crop_path = os.path.join(os.path.expanduser('~/crops'),job_name + '_crops')
- output_base = combined_api_output_folder
- device_id = 0
-
- output_file = os.path.join(filename_base,'run_{}_'.format(classifier_name_short) + job_name + script_extension)
-
- classifier_base = os.path.expanduser('~/models/camera_traps/megaclassifier/v0.1/')
- assert os.path.isdir(classifier_base)
-
- checkpoint_path = os.path.join(classifier_base,'v0.1_efficientnet-b3_compiled.pt')
- assert os.path.isfile(checkpoint_path)
-
- classifier_categories_path = os.path.join(classifier_base,'v0.1_index_to_name.json')
- assert os.path.isfile(classifier_categories_path)
-
- target_mapping_path = os.path.join(classifier_base,'idfg_to_megaclassifier_labels.json')
- assert os.path.isfile(target_mapping_path)
-
- classifier_output_suffix = '_megaclassifier_output.csv.gz'
- final_output_suffix = '_megaclassifier.json'
-
- n_threads_str = str(default_workers_for_parallel_tasks)
- image_size_str = '300'
- batch_size_str = '64'
- num_workers_str = str(default_workers_for_parallel_tasks)
- classification_threshold_str = '0.05'
-
- logdir = filename_base
-
- # This is just passed along to the metadata in the output file; it has no impact
- # on how the classification scripts run.
- typical_classification_threshold_str = '0.75'
-
- ##%% Set up environment
-
- commands = []
- # commands.append('cd MegaDetector/classification\n')
- # commands.append('conda activate cameratraps-classifier\n')
-
- ##%% Crop images
-
- commands.append('\n' + scc + ' Cropping ' + scc + '\n')
-
- # fn = input_files[0]
- for fn in input_files:
-
-     input_file_path = fn
-     crop_cmd = ''
-
-     crop_comment = '\n' + scc + ' Cropping {}\n'.format(fn)
-     crop_cmd += crop_comment
-
-     crop_cmd += "python crop_detections.py " + slcc + "\n" + \
-         ' "' + input_file_path + '" ' + slcc + '\n' + \
-         ' "' + crop_path + '" ' + slcc + '\n' + \
-         ' ' + '--images-dir "' + image_base + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--threshold "' + threshold_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--square-crops ' + ' ' + slcc + '\n' + \
-         ' ' + '--threads "' + n_threads_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--logdir "' + logdir + '"' + '\n' + \
-         ' ' + '\n'
-     crop_cmd = '{}'.format(crop_cmd)
-     commands.append(crop_cmd)
-
- ##%% Run classifier
-
- commands.append('\n' + scc + ' Classifying ' + scc + '\n')
-
- # fn = input_files[0]
- for fn in input_files:
-
-     input_file_path = fn
-     classifier_output_path = crop_path + classifier_output_suffix
-
-     classify_cmd = ''
-
-     classify_comment = '\n' + scc + ' Classifying {}\n'.format(fn)
-     classify_cmd += classify_comment
-
-     classify_cmd += "python run_classifier.py " + slcc + "\n" + \
-         ' "' + checkpoint_path + '" ' + slcc + '\n' + \
-         ' "' + crop_path + '" ' + slcc + '\n' + \
-         ' "' + classifier_output_path + '" ' + slcc + '\n' + \
-         ' ' + '--detections-json "' + input_file_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--classifier-categories "' + classifier_categories_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--image-size "' + image_size_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--batch-size "' + batch_size_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--num-workers "' + num_workers_str + '"' + ' ' + slcc + '\n'
-
-     if device_id is not None:
-         classify_cmd += ' ' + '--device {}'.format(device_id)
-
-     classify_cmd += '\n\n'
-     classify_cmd = '{}'.format(classify_cmd)
-     commands.append(classify_cmd)
-
- ##%% Remap classifier outputs
-
- commands.append('\n' + scc + ' Remapping ' + scc + '\n')
-
- # fn = input_files[0]
- for fn in input_files:
-
-     input_file_path = fn
-     classifier_output_path = crop_path + classifier_output_suffix
-     classifier_output_path_remapped = \
-         classifier_output_path.replace(".csv.gz","_remapped.csv.gz")
-     assert not (classifier_output_path == classifier_output_path_remapped)
-
-     output_label_index = classifier_output_path_remapped.replace(
-         "_remapped.csv.gz","_label_index_remapped.json")
-
-     remap_cmd = ''
-
-     remap_comment = '\n' + scc + ' Remapping {}\n'.format(fn)
-     remap_cmd += remap_comment
-
-     remap_cmd += "python aggregate_classifier_probs.py " + slcc + "\n" + \
-         ' "' + classifier_output_path + '" ' + slcc + '\n' + \
-         ' ' + '--target-mapping "' + target_mapping_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--output-csv "' + classifier_output_path_remapped + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--output-label-index "' + output_label_index + '"' \
-         '\n'
-
-     remap_cmd = '{}'.format(remap_cmd)
-     commands.append(remap_cmd)
-
- ##%% Merge classification and detection outputs
-
- commands.append('\n' + scc + ' Merging ' + scc + '\n')
-
- # fn = input_files[0]
- for fn in input_files:
-
-     input_file_path = fn
-     classifier_output_path = crop_path + classifier_output_suffix
-
-     classifier_output_path_remapped = \
-         classifier_output_path.replace(".csv.gz","_remapped.csv.gz")
-
-     output_label_index = classifier_output_path_remapped.replace(
-         "_remapped.csv.gz","_label_index_remapped.json")
-
-     final_output_path = os.path.join(output_base,
-                                      os.path.basename(classifier_output_path)).\
-                                      replace(classifier_output_suffix,
-                                              final_output_suffix)
-     final_output_path = final_output_path.replace('_detections','')
-     final_output_path = final_output_path.replace('_crops','')
-     final_output_path_mc = final_output_path
-
-     merge_cmd = ''
-
-     merge_comment = '\n' + scc + ' Merging {}\n'.format(fn)
-     merge_cmd += merge_comment
-
-     merge_cmd += "python merge_classification_detection_output.py " + slcc + "\n" + \
-         ' "' + classifier_output_path_remapped + '" ' + slcc + '\n' + \
-         ' "' + output_label_index + '" ' + slcc + '\n' + \
-         ' ' + '--output-json "' + final_output_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--detection-json "' + input_file_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--classifier-name "' + classifier_name + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--threshold "' + classification_threshold_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--typical-confidence-threshold "' + typical_classification_threshold_str + '"' + '\n' + \
-         '\n'
-     merge_cmd = '{}'.format(merge_cmd)
-     commands.append(merge_cmd)
-
- ##%% Write out classification script
-
- import stat
-
- with open(output_file,'w') as f:
-     for s in commands:
-         f.write('{}'.format(s))
-
- st = os.stat(output_file)
- os.chmod(output_file, st.st_mode | stat.S_IEXEC)
-
-
- #%% Run a non-MegaClassifier classifier (i.e., a classifier with no output mapping)
-
- classifier_name_short = 'idfgclassifier'
- threshold_str = '0.15' # 0.6
- classifier_name = 'idfg_classifier_ckpt_14_compiled'
-
- organization_name = organization_name_short
- job_name = base_task_name
- input_filename = filtered_output_filename # combined_api_output_file
- input_files = [input_filename]
- image_base = input_path
- crop_path = os.path.join(os.path.expanduser('~/crops'),job_name + '_crops')
- output_base = combined_api_output_folder
- device_id = 1
-
- output_file = os.path.join(filename_base,'run_{}_'.format(classifier_name_short) + job_name + script_extension)
-
- classifier_base = os.path.expanduser('~/models/camera_traps/idfg_classifier/idfg_classifier_20200905_042558')
- assert os.path.isdir(classifier_base)
-
- checkpoint_path = os.path.join(classifier_base,'idfg_classifier_ckpt_14_compiled.pt')
- assert os.path.isfile(checkpoint_path)
-
- classifier_categories_path = os.path.join(classifier_base,'label_index.json')
- assert os.path.isfile(classifier_categories_path)
-
- classifier_output_suffix = '_{}_output.csv.gz'.format(classifier_name_short)
- final_output_suffix = '_{}.json'.format(classifier_name_short)
-
- threshold_str = '0.65'
- n_threads_str = str(default_workers_for_parallel_tasks)
- image_size_str = '300'
- batch_size_str = '64'
- num_workers_str = str(default_workers_for_parallel_tasks)
- logdir = filename_base
-
- classification_threshold_str = '0.05'
-
- # This is just passed along to the metadata in the output file; it has no impact
- # on how the classification scripts run.
- typical_classification_threshold_str = '0.75'
-
-
- ##%% Set up environment
-
- commands = []
-
-
- ##%% Crop images
-
- commands.append('\n' + scc + ' Cropping ' + scc + '\n')
-
- # fn = input_files[0]
- for fn in input_files:
-
-     input_file_path = fn
-     crop_cmd = ''
-
-     crop_comment = '\n' + scc + ' Cropping {}\n'.format(fn)
-     crop_cmd += crop_comment
-
-     crop_cmd += "python crop_detections.py " + slcc + "\n" + \
-         ' "' + input_file_path + '" ' + slcc + '\n' + \
-         ' "' + crop_path + '" ' + slcc + '\n' + \
-         ' ' + '--images-dir "' + image_base + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--threshold "' + threshold_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--square-crops ' + ' ' + slcc + '\n' + \
-         ' ' + '--threads "' + n_threads_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--logdir "' + logdir + '"' + '\n' + \
-         '\n'
-     crop_cmd = '{}'.format(crop_cmd)
-     commands.append(crop_cmd)
-
-
- ##%% Run classifier
-
- commands.append('\n' + scc + ' Classifying ' + scc + '\n')
-
- # fn = input_files[0]
- for fn in input_files:
-
-     input_file_path = fn
-     classifier_output_path = crop_path + classifier_output_suffix
-
-     classify_cmd = ''
-
-     classify_comment = '\n' + scc + ' Classifying {}\n'.format(fn)
-     classify_cmd += classify_comment
-
-     classify_cmd += "python run_classifier.py " + slcc + "\n" + \
-         ' "' + checkpoint_path + '" ' + slcc + '\n' + \
-         ' "' + crop_path + '" ' + slcc + '\n' + \
-         ' "' + classifier_output_path + '" ' + slcc + '\n' + \
-         ' ' + '--detections-json "' + input_file_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--classifier-categories "' + classifier_categories_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--image-size "' + image_size_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--batch-size "' + batch_size_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--num-workers "' + num_workers_str + '"' + ' ' + slcc + '\n'
-
-     if device_id is not None:
-         classify_cmd += ' ' + '--device {}'.format(device_id)
-
-     classify_cmd += '\n\n'
-     classify_cmd = '{}'.format(classify_cmd)
-     commands.append(classify_cmd)
-
-
- ##%% Merge classification and detection outputs
-
- commands.append('\n' + scc + ' Merging ' + scc + '\n')
-
- # fn = input_files[0]
- for fn in input_files:
-
-     input_file_path = fn
-     classifier_output_path = crop_path + classifier_output_suffix
-     final_output_path = os.path.join(output_base,
-                                      os.path.basename(classifier_output_path)).\
-                                      replace(classifier_output_suffix,
-                                              final_output_suffix)
-     final_output_path = final_output_path.replace('_detections','')
-     final_output_path = final_output_path.replace('_crops','')
-     final_output_path_ic = final_output_path
-
-     merge_cmd = ''
-
-     merge_comment = '\n' + scc + ' Merging {}\n'.format(fn)
-     merge_cmd += merge_comment
-
-     merge_cmd += "python merge_classification_detection_output.py " + slcc + "\n" + \
-         ' "' + classifier_output_path + '" ' + slcc + '\n' + \
-         ' "' + classifier_categories_path + '" ' + slcc + '\n' + \
-         ' ' + '--output-json "' + final_output_path_ic + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--detection-json "' + input_file_path + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--classifier-name "' + classifier_name + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--threshold "' + classification_threshold_str + '"' + ' ' + slcc + '\n' + \
-         ' ' + '--typical-confidence-threshold "' + typical_classification_threshold_str + '"' + '\n' + \
-         '\n'
-     merge_cmd = '{}'.format(merge_cmd)
-     commands.append(merge_cmd)
-
-
- ##%% Write everything out
-
- with open(output_file,'w') as f:
-     for s in commands:
-         f.write('{}'.format(s))
-
- import stat
- st = os.stat(output_file)
- os.chmod(output_file, st.st_mode | stat.S_IEXEC)
-
-
- #%% Run the classifier(s) via the .sh script(s) or batch file(s) we just wrote
-
- # I do this manually, primarily because this requires a different mamba environment
- # (cameratraps-classifier) from MegaDetector's environment (cameratraps-detector).
- #
- # The next few pseudo-cells (#%) in this script are basically always run all at once, getting us
- # all the way from running the classifier to classification previews and zipped .json files that
- # are ready to upload.
-
-
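- # If you'd rather launch the generated script from Python than from a terminal, a
- # minimal sketch (an illustration, not part of the original workflow; it assumes a
- # Linux/WSL shell, that the 'cameratraps-classifier' environment is already active,
- # and that 'output_file' still points at the script we just wrote):
- #
- # import subprocess
- # subprocess.run(['bash', output_file], check=True)
-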
- #%% Within-image classification smoothing
-
- #
- # Only count detections with a classification confidence above
- # *classification_confidence_threshold*, which in practice means we're only
- # looking at one category per detection.
- #
- # If an image has at least *min_detections_above_threshold* such detections
- # in the most common category, and fewer than *max_detections_secondary_class*
- # in the second-most-common category, flip all detections to the most common
- # category.
- #
- # Optionally treat some classes as particularly unreliable; this is typically
- # used to overwrite an "other" class.
- #
- # This cell also removes everything but the top (highest-confidence) classification
- # for each detection.
- #
-
- # How many detections do we need above the classification threshold to determine a dominant category
- # for an image?
- min_detections_above_threshold = 4
-
- # Even if we have a dominant class, if a non-dominant class has at least this many classifications
- # in an image, leave them alone.
- max_detections_secondary_class = 3
-
- # If the dominant class has at least this many classifications, overwrite "other" classifications
- min_detections_to_overwrite_other = 2
- other_category_names = ['other']
-
- # What confidence threshold should we use for assessing the dominant category in an image?
- classification_confidence_threshold = 0.6
-
- # Which classifications should we even bother over-writing?
- classification_overwrite_threshold = 0.3
-
- # Detection confidence threshold for things we count when determining a dominant class
- detection_confidence_threshold = 0.2
-
- # Which detections should we even bother over-writing?
- detection_overwrite_threshold = 0.05
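-
- # A quick worked example of the rules above, using these defaults: an image with
- # five above-threshold 'deer' classifications and one above-threshold 'elk' has a
- # dominant-class count of 5 (>= min_detections_above_threshold) and a secondary
- # count of 1 (< max_detections_secondary_class), so the 'elk' detection gets
- # flipped to 'deer'.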
-
1358
- classification_detection_files = []
1359
-
1360
- # Did we run MegaClassifier
1361
- if final_output_path_mc is not None:
1362
- classification_detection_files.append(final_output_path_mc)
1363
-
1364
- # Did we run the IDFG classifier?
1365
- if final_output_path_ic is not None:
1366
- classification_detection_files.append(final_output_path_ic)
1367
-
1368
- assert all([os.path.isfile(fn) for fn in classification_detection_files])
1369
-
1370
- smoothed_classification_files = []
1371
-
1372
- for final_output_path in classification_detection_files:
1373
-
1374
- classifier_output_path = final_output_path
1375
- classifier_output_path_within_image_smoothing = classifier_output_path.replace(
1376
- '.json','_within_image_smoothing.json')
1377
-
1378
- with open(classifier_output_path,'r') as f:
1379
- d = json.load(f)
1380
-
1381
- category_name_to_id = {d['classification_categories'][k]:k for k in d['classification_categories']}
1382
- other_category_ids = []
1383
- for s in other_category_names:
1384
- if s in category_name_to_id:
1385
- other_category_ids.append(category_name_to_id[s])
1386
- else:
1387
- print('Warning: "other" category {} not present in file {}'.format(
1388
- s,classifier_output_path))
1389
-
1390
- n_other_classifications_changed = 0
1391
- n_other_images_changed = 0
1392
-
1393
- n_detections_flipped = 0
1394
- n_images_changed = 0
1395
-
1396
- # Before we do anything else, get rid of everything but the top classification
1397
- # for each detection.
1398
- for im in tqdm(d['images']):
1399
-
1400
- if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
1401
- continue
1402
-
1403
- detections = im['detections']
1404
-
1405
- for det in detections:
1406
-
1407
- if 'classifications' not in det or len(det['classifications']) == 0:
1408
- continue
1409
-
1410
- classification_confidence_values = [c[1] for c in det['classifications']]
1411
- assert is_list_sorted(classification_confidence_values,reverse=True)
1412
- det['classifications'] = [det['classifications'][0]]
1413
-
1414
- # ...for each detection in this image
1415
-
1416
- # ...for each image
1417
-
1418
- # im = d['images'][0]
1419
- for im in tqdm(d['images']):
1420
-
1421
- if 'detections' not in im or im['detections'] is None or len(im['detections']) == 0:
1422
- continue
1423
-
1424
- detections = im['detections']
1425
-
1426
- category_to_count = defaultdict(int)
1427
- for det in detections:
1428
- if ('classifications' in det) and (det['conf'] >= detection_confidence_threshold):
1429
- for c in det['classifications']:
1430
- if c[1] >= classification_confidence_threshold:
1431
- category_to_count[c[0]] += 1
1432
- # ...for each classification
1433
- # ...if there are classifications for this detection
1434
- # ...for each detection
1435
-
1436
- if len(category_to_count) <= 1:
1437
- continue
1438
-
1439
- category_to_count = {k: v for k, v in sorted(category_to_count.items(),
1440
- key=lambda item: item[1],
1441
- reverse=True)}
1442
-
1443
- keys = list(category_to_count.keys())
1444
-
1445
- # Handle a quirky special case: if the most common category is "other" and
1446
- # it's "tied" with the second-most-common category, swap them
1447
- if (len(keys) > 1) and \
1448
- (keys[0] in other_category_ids) and \
1449
- (keys[1] not in other_category_ids) and \
1450
- (category_to_count[keys[0]] == category_to_count[keys[1]]):
1451
- keys[1], keys[0] = keys[0], keys[1]
1452
-
1453
- max_count = category_to_count[keys[0]]
1454
- # secondary_count = category_to_count[keys[1]]
1455
- # The 'secondary count' is the most common non-other class
1456
- secondary_count = 0
1457
- for i_key in range(1,len(keys)):
1458
- if keys[i_key] not in other_category_ids:
1459
- secondary_count = category_to_count[keys[i_key]]
1460
- break
1461
-
1462
- most_common_category = keys[0]
1463
-
1464
- assert max_count >= secondary_count
1465
-
1466
- # If we have at least *min_detections_to_overwrite_other* in a category that isn't
1467
- # "other", change all "other" classifications to that category
1468
- if max_count >= min_detections_to_overwrite_other and \
1469
- most_common_category not in other_category_ids:
1470
-
1471
- other_change_made = False
1472
-
1473
- for det in detections:
1474
-
1475
- if ('classifications' in det) and (det['conf'] >= detection_overwrite_threshold):
1476
-
1477
- for c in det['classifications']:
1478
-
1479
- if c[1] >= classification_overwrite_threshold and \
1480
- c[0] in other_category_ids:
1481
-
1482
- n_other_classifications_changed += 1
1483
- other_change_made = True
1484
- c[0] = most_common_category
1485
-
1486
- # ...for each classification
1487
-
1488
- # ...if there are classifications for this detection
1489
-
1490
- # ...for each detection
1491
-
1492
- if other_change_made:
1493
- n_other_images_changed += 1
1494
-
1495
- # ...if we should overwrite all "other" classifications
1496
-
1497
- if max_count < min_detections_above_threshold:
1498
- continue
1499
-
1500
- if secondary_count >= max_detections_secondary_class:
1501
- continue
1502
-
1503
- # At this point, we know we have a dominant category; change all other above-threshold
1504
- # classifications to that category. That category may have been "other", in which
1505
- # case we may have already made the relevant changes.
1506
-
1507
- n_detections_flipped_this_image = 0
1508
-
1509
- # det = detections[0]
1510
- for det in detections:
1511
-
1512
- if ('classifications' in det) and (det['conf'] >= detection_overwrite_threshold):
1513
-
1514
- for c in det['classifications']:
1515
- if c[1] >= classification_overwrite_threshold and \
1516
- c[0] != most_common_category:
1517
-
1518
- c[0] = most_common_category
1519
- n_detections_flipped += 1
1520
- n_detections_flipped_this_image += 1
1521
-
1522
- # ...for each classification
1523
-
1524
- # ...if there are classifications for this detection
1525
-
1526
- # ...for each detection
1527
-
1528
- if n_detections_flipped_this_image > 0:
1529
- n_images_changed += 1
1530
-
1531
- # ...for each image
1532
-
1533
- print('Classification smoothing: changed {} detections on {} images'.format(
1534
- n_detections_flipped,n_images_changed))
1535
-
1536
- print('"Other" smoothing: changed {} detections on {} images'.format(
1537
- n_other_classifications_changed,n_other_images_changed))
1538
-
1539
- with open(classifier_output_path_within_image_smoothing,'w') as f:
1540
- json.dump(d,f,indent=1)
1541
-
1542
- print('Wrote results to:\n{}'.format(classifier_output_path_within_image_smoothing))
1543
- smoothed_classification_files.append(classifier_output_path_within_image_smoothing)
1544
-
1545
- # ...for each file we want to smooth
1546
-
1547
-
1548
- #% Read EXIF data from all images
-
- from data_management import read_exif
- exif_options = read_exif.ReadExifOptions()
-
- exif_options.verbose = False
- exif_options.n_workers = default_workers_for_parallel_tasks
- exif_options.use_threads = parallelization_defaults_to_threads
- exif_options.processing_library = 'pil'
- exif_options.byte_handling = 'delete'
-
- exif_results_file = os.path.join(filename_base,'exif_data.json')
-
- if os.path.isfile(exif_results_file):
-     print('Reading EXIF results from {}'.format(exif_results_file))
-     with open(exif_results_file,'r') as f:
-         exif_results = json.load(f)
- else:
-     exif_results = read_exif.read_exif_from_folder(input_path,
-                                                    output_file=exif_results_file,
-                                                    options=exif_options)
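-
- # Note that the block above caches EXIF results to exif_data.json, so deleting
- # that file forces a fresh read on the next run.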
-
-
- #% Prepare COCO-camera-traps-compatible image objects for EXIF results
-
- import datetime
- from data_management.read_exif import parse_exif_datetime_string
-
- min_valid_timestamp_year = 2001
-
- now = datetime.datetime.now()
-
- image_info = []
-
- images_without_datetime = []
- images_with_invalid_datetime = []
-
- exif_datetime_tag = 'DateTimeOriginal'
-
- # exif_result = exif_results[0]
- for exif_result in tqdm(exif_results):
-
-     im = {}
-
-     # By default we assume that each leaf-node folder is a location
-     if overflow_folder_handling_enabled:
-         im['location'] = relative_path_to_location(os.path.dirname(exif_result['file_name']))
-     else:
-         im['location'] = os.path.dirname(exif_result['file_name'])
-
-     im['file_name'] = exif_result['file_name']
-     im['id'] = im['file_name']
-
-     if ('exif_tags' not in exif_result) or (exif_result['exif_tags'] is None) or \
-        (exif_datetime_tag not in exif_result['exif_tags']):
-         exif_dt = None
-     else:
-         exif_dt = exif_result['exif_tags'][exif_datetime_tag]
-         exif_dt = parse_exif_datetime_string(exif_dt)
-     if exif_dt is None:
-         im['datetime'] = None
-         images_without_datetime.append(im['file_name'])
-     else:
-         dt = exif_dt
-
-         # An image from the future (or within the last 24 hours) is invalid
-         if (now - dt).total_seconds() <= 1*24*60*60:
-             print('Warning: datetime for {} is {}'.format(
-                 im['file_name'],dt))
-             im['datetime'] = None
-             images_with_invalid_datetime.append(im['file_name'])
-
-         # An image from before the dawn of time is also invalid
-         elif dt.year < min_valid_timestamp_year:
-             print('Warning: datetime for {} is {}'.format(
-                 im['file_name'],dt))
-             im['datetime'] = None
-             images_with_invalid_datetime.append(im['file_name'])
-
-         else:
-             im['datetime'] = dt
-
-     image_info.append(im)
-
- # ...for each exif image result
-
- print('Parsed EXIF datetime information; unable to parse EXIF dates from {} of {} images'.format(
-     len(images_without_datetime),len(exif_results)))
-
-
- #% Assemble into sequences
-
- from collections import defaultdict
- from data_management import cct_json_utils
-
- print('Assembling images into sequences')
-
- cct_json_utils.create_sequences(image_info)
-
- # Make a list of the images in each sequence
- sequence_to_images = defaultdict(list)
-
- # im = image_info[0]
- for im in tqdm(image_info):
-     sequence_to_images[im['seq_id']].append(im)
-
- all_sequences = list(sorted(sequence_to_images.keys()))
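-
- # After create_sequences(), each image dict carries a 'seq_id' field;
- # sequence_to_images just inverts that mapping so the smoothing loop below can
- # iterate over sequences rather than individual images.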
-
-
- #% Load classification results
-
- sequence_level_smoothing_input_file = smoothed_classification_files[0]
-
- with open(sequence_level_smoothing_input_file,'r') as f:
-     d = json.load(f)
-
- # Map each filename to the classification results for that file
- filename_to_results = {}
-
- for im in tqdm(d['images']):
-     filename_to_results[im['file'].replace('\\','/')] = im
-
-
- #% Smooth classification results over sequences (prep)
-
- from md_utils.ct_utils import is_list_sorted
-
- classification_category_id_to_name = d['classification_categories']
- classification_category_name_to_id = {v: k for k, v in classification_category_id_to_name.items()}
-
- class_names = list(classification_category_id_to_name.values())
-
- animal_detection_category = '1'
- assert d['detection_categories'][animal_detection_category] == 'animal'
-
- other_category_names = set(['other'])
- other_category_ids = set([classification_category_name_to_id[s] for s in other_category_names])
-
- # These are the only classes to which we're going to switch other classifications
- category_names_to_smooth_to = set(['deer','elk','cow','canid','cat','bird','bear'])
- category_ids_to_smooth_to = set([classification_category_name_to_id[s] for s in category_names_to_smooth_to])
- assert all([s in class_names for s in category_names_to_smooth_to])
-
- # Only switch classifications to the dominant class if we see the dominant class at least
- # this many times
- min_dominant_class_classifications_above_threshold_for_class_smoothing = 5 # 2
-
- # If we see more than this many of a class that are above threshold, don't switch those
- # classifications to the dominant class.
- max_secondary_class_classifications_above_threshold_for_class_smoothing = 5
-
- # If the ratio between a dominant class and a secondary class count is greater than this,
- # regardless of the secondary class count, switch those classifications (i.e., ignore
- # max_secondary_class_classifications_above_threshold_for_class_smoothing).
- #
- # This may be different for different dominant classes; e.g., if we see lots of cows, they really
- # tend to be cows. Less so for canids, so we set a higher "override ratio" for canids.
- min_dominant_class_ratio_for_secondary_override_table = {classification_category_name_to_id['cow']:2,None:3}
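-
- # A sketch of how this table is meant to be read (the main loop below does the
- # equivalent with an explicit membership check): use the dominant class's custom
- # ratio when present, otherwise fall back to the default stored under None.
- #
- # ratio_for_override = min_dominant_class_ratio_for_secondary_override_table.get(
- #     dominant_category_id, min_dominant_class_ratio_for_secondary_override_table[None])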
-
- # If there are at least this many classifications for the dominant class in a sequence,
- # regardless of what that class is, convert all 'other' classifications (regardless of
- # confidence) to that class.
- min_dominant_class_classifications_above_threshold_for_other_smoothing = 3 # 2
-
- # If there are at least this many classifications for the dominant class in a sequence,
- # regardless of what that class is, classify all previously-unclassified detections
- # as that class.
- min_dominant_class_classifications_above_threshold_for_unclassified_smoothing = 3 # 2
-
- # Only count classifications above this confidence level when determining the dominant
- # class, and when deciding whether to switch other classifications.
- classification_confidence_threshold = 0.6
-
- # Confidence values to use when we change a detection's classification (the
- # original confidence value is irrelevant at that point)
- flipped_other_confidence_value = 0.6
- flipped_class_confidence_value = 0.6
- flipped_unclassified_confidence_value = 0.6
-
- min_detection_confidence_for_unclassified_flipping = 0.15
-
-
- #% Smooth classification results over sequences (supporting functions)
-
- def results_for_sequence(images_this_sequence):
-     """
-     Fetch MD results for every image in this sequence, based on the 'file_name' field
-     """
-
-     results_this_sequence = []
-     for im in images_this_sequence:
-         fn = im['file_name']
-         results_this_image = filename_to_results[fn]
-         assert isinstance(results_this_image,dict)
-         results_this_sequence.append(results_this_image)
-
-     return results_this_sequence
-
-
- def top_classifications_for_sequence(images_this_sequence):
-     """
-     Return all top-1 animal classifications for every detection in this
-     sequence, regardless of confidence
-
-     May modify [images_this_sequence] (removing non-top-1 classifications)
-     """
-
-     classifications_this_sequence = []
-
-     # im = images_this_sequence[0]
-     for im in images_this_sequence:
-
-         fn = im['file_name']
-         results_this_image = filename_to_results[fn]
-
-         if results_this_image['detections'] is None:
-             continue
-
-         # det = results_this_image['detections'][0]
-         for det in results_this_image['detections']:
-
-             # Only process animal detections
-             if det['category'] != animal_detection_category:
-                 continue
-
-             # Only process detections with classification information
-             if 'classifications' not in det:
-                 continue
-
-             # We only care about top-1 classifications, remove everything else
-             if len(det['classifications']) > 1:
-
-                 # Make sure the list of classifications is already sorted by confidence
-                 classification_confidence_values = [c[1] for c in det['classifications']]
-                 assert is_list_sorted(classification_confidence_values,reverse=True)
-
-                 # ...and just keep the first one
-                 det['classifications'] = [det['classifications'][0]]
-
-             # There is now exactly one classification for this detection
-             top_classification = det['classifications'][0]
-
-             classifications_this_sequence.append(top_classification)
-
-         # ...for each detection in this image
-
-     # ...for each image in this sequence
-
-     return classifications_this_sequence
-
- # ...top_classifications_for_sequence()
-
-
- def count_above_threshold_classifications(classifications_this_sequence):
-     """
-     Given a list of classification objects (tuples), return a dict mapping
-     category IDs to the count of above-threshold classifications.
-
-     This dict's keys will be sorted in descending order by frequency.
-     """
-
-     # Count above-threshold classifications in this sequence
-     category_to_count = defaultdict(int)
-     for c in classifications_this_sequence:
-         if c[1] >= classification_confidence_threshold:
-             category_to_count[c[0]] += 1
-
-     # Sort the dictionary in descending order by count
-     category_to_count = {k: v for k, v in sorted(category_to_count.items(),
-                                                  key=lambda item: item[1],
-                                                  reverse=True)}
-
-     keys_sorted_by_frequency = list(category_to_count.keys())
-
-     # Handle a quirky special case: if the most common category is "other" and
-     # it's "tied" with the second-most-common category, swap them.
-     if len(other_category_ids) > 0:
-         if (len(keys_sorted_by_frequency) > 1) and \
-            (keys_sorted_by_frequency[0] in other_category_ids) and \
-            (keys_sorted_by_frequency[1] not in other_category_ids) and \
-            (category_to_count[keys_sorted_by_frequency[0]] == \
-             category_to_count[keys_sorted_by_frequency[1]]):
-             keys_sorted_by_frequency[1], keys_sorted_by_frequency[0] = \
-                 keys_sorted_by_frequency[0], keys_sorted_by_frequency[1]
-
-     sorted_category_to_count = {}
-     for k in keys_sorted_by_frequency:
-         sorted_category_to_count[k] = category_to_count[k]
-
-     return sorted_category_to_count
-
- # ...def count_above_threshold_classifications()
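-
- # For example, with hypothetical category IDs and a threshold of 0.6, the
- # classification list [['1',0.9], ['1',0.7], ['2',0.8], ['2',0.3]] yields
- # {'1': 2, '2': 1}.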
-
- def sort_images_by_time(images):
-     """
-     Returns a copy of [images], sorted by the 'datetime' field (ascending).
-     """
-     return sorted(images, key = lambda im: im['datetime'])
-
-
- def get_first_key_from_sorted_dictionary(di):
-     if len(di) == 0:
-         return None
-     return next(iter(di.items()))[0]
-
-
- def get_first_value_from_sorted_dictionary(di):
-     if len(di) == 0:
-         return None
-     return next(iter(di.items()))[1]
-
-
- #% Smooth classifications at the sequence level (main loop)
-
- n_other_flips = 0
- n_classification_flips = 0
- n_unclassified_flips = 0
-
- # Raise an error if this token is contained in a filename (set to None for normal operation)
- debug_fn = None
-
- # i_sequence = 0; seq_id = all_sequences[i_sequence]
- for i_sequence,seq_id in tqdm(enumerate(all_sequences),total=len(all_sequences)):
-
-     images_this_sequence = sequence_to_images[seq_id]
-
-     # Count top-1 classifications in this sequence (regardless of confidence)
-     classifications_this_sequence = top_classifications_for_sequence(images_this_sequence)
-
-     # Handy debugging code for looking at the numbers for a particular sequence
-     for im in images_this_sequence:
-         if debug_fn is not None and debug_fn in im['file_name']:
-             raise ValueError('')
-
-     if len(classifications_this_sequence) == 0:
-         continue
-
-     # Count above-threshold classifications for each category
-     sorted_category_to_count = count_above_threshold_classifications(classifications_this_sequence)
-
-     if len(sorted_category_to_count) == 0:
-         continue
-
-     max_count = get_first_value_from_sorted_dictionary(sorted_category_to_count)
-     dominant_category_id = get_first_key_from_sorted_dictionary(sorted_category_to_count)
-
-     # If our dominant category ID isn't something we want to smooth to, don't mess around with this sequence
-     if dominant_category_id not in category_ids_to_smooth_to:
-         continue
-
-
-     ## Smooth "other" classifications ##
-
-     if max_count >= min_dominant_class_classifications_above_threshold_for_other_smoothing:
-         for c in classifications_this_sequence:
-             if c[0] in other_category_ids:
-                 n_other_flips += 1
-                 c[0] = dominant_category_id
-                 c[1] = flipped_other_confidence_value
-
-
-     # By not re-computing "max_count" here, we are making a decision that the count used
-     # to decide whether a class should overwrite another class does not include any "other"
-     # classifications we changed to be the dominant class. If we wanted to include those...
-     #
-     # sorted_category_to_count = count_above_threshold_classifications(classifications_this_sequence)
-     # max_count = get_first_value_from_sorted_dictionary(sorted_category_to_count)
-     # assert dominant_category_id == get_first_key_from_sorted_dictionary(sorted_category_to_count)
-
-
-     ## Smooth non-dominant classes ##
-
-     if max_count >= min_dominant_class_classifications_above_threshold_for_class_smoothing:
-
-         # Don't flip classes to the dominant class if they have a large number of classifications
-         category_ids_not_to_flip = set()
-
-         for category_id in sorted_category_to_count.keys():
-             secondary_class_count = sorted_category_to_count[category_id]
-             dominant_to_secondary_ratio = max_count / secondary_class_count
-
-             # Don't smooth over this class if there are a bunch of them, and the ratio
-             # of primary to secondary class count isn't too large
-
-             # Default ratio
-             ratio_for_override = min_dominant_class_ratio_for_secondary_override_table[None]
-
-             # Does this dominant class have a custom ratio?
-             if dominant_category_id in min_dominant_class_ratio_for_secondary_override_table:
-                 ratio_for_override = \
-                     min_dominant_class_ratio_for_secondary_override_table[dominant_category_id]
-
-             if (dominant_to_secondary_ratio < ratio_for_override) and \
-                (secondary_class_count > \
-                 max_secondary_class_classifications_above_threshold_for_class_smoothing):
-                 category_ids_not_to_flip.add(category_id)
-
-         for c in classifications_this_sequence:
-             if c[0] not in category_ids_not_to_flip and c[0] != dominant_category_id:
-                 c[0] = dominant_category_id
-                 c[1] = flipped_class_confidence_value
-                 n_classification_flips += 1
-
-
-     ## Smooth unclassified detections ##
-
-     if max_count >= min_dominant_class_classifications_above_threshold_for_unclassified_smoothing:
-
-         results_this_sequence = results_for_sequence(images_this_sequence)
-         detections_this_sequence = []
-         for r in results_this_sequence:
-             if r['detections'] is not None:
-                 detections_this_sequence.extend(r['detections'])
-         for det in detections_this_sequence:
-             if 'classifications' in det and len(det['classifications']) > 0:
-                 continue
-             if det['category'] != animal_detection_category:
-                 continue
-             if det['conf'] < min_detection_confidence_for_unclassified_flipping:
-                 continue
-             det['classifications'] = [[dominant_category_id,flipped_unclassified_confidence_value]]
-             n_unclassified_flips += 1
-
- # ...for each sequence
-
- print('\nFinished sequence smoothing\n')
- print('Flipped {} "other" classifications'.format(n_other_flips))
- print('Flipped {} species classifications'.format(n_classification_flips))
- print('Flipped {} unclassified detections'.format(n_unclassified_flips))
-
-
- #% Write smoothed classification results
-
- sequence_smoothed_classification_file = sequence_level_smoothing_input_file.replace(
-     '.json','_seqsmoothing.json')
-
- print('Writing sequence-smoothed classification results to {}'.format(
-     sequence_smoothed_classification_file))
-
- with open(sequence_smoothed_classification_file,'w') as f:
-     json.dump(d,f,indent=1)
-
-
- #% Post-processing (post-classification, post-within-image-and-within-sequence-smoothing)
-
- options = PostProcessingOptions()
- options.image_base_dir = input_path
- options.include_almost_detections = True
- options.num_images_to_sample = 10000
- options.confidence_threshold = 0.2
- options.classification_confidence_threshold = 0.7
- options.almost_detection_confidence_threshold = options.confidence_threshold - 0.05
- options.ground_truth_json_file = None
- options.separate_detections_by_category = True
- options.max_figures_per_html_file = 2500
-
- options.parallelize_rendering = True
- options.parallelize_rendering_n_cores = default_workers_for_parallel_tasks
- options.parallelize_rendering_with_threads = parallelization_defaults_to_threads
-
- folder_token = sequence_smoothed_classification_file.split(os.path.sep)[-1].replace(
-     '_within_image_smoothing_seqsmoothing','')
- folder_token = folder_token.replace('.json','_seqsmoothing')
-
- output_base = os.path.join(postprocessing_output_folder, folder_token + \
-                            base_task_name + '_{:.3f}'.format(options.confidence_threshold))
- os.makedirs(output_base, exist_ok=True)
- print('Processing {} to {}'.format(base_task_name, output_base))
-
- options.md_results_file = sequence_smoothed_classification_file
- options.output_dir = output_base
- ppresults = process_batch_results(options)
- path_utils.open_file(ppresults.output_html_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
- # import clipboard; clipboard.copy(ppresults.output_html_file)
-
- #% Zip .json files
-
- from md_utils.path_utils import parallel_zip_files
-
- json_files = os.listdir(combined_api_output_folder)
- json_files = [fn for fn in json_files if fn.endswith('.json')]
- json_files = [os.path.join(combined_api_output_folder,fn) for fn in json_files]
-
- parallel_zip_files(json_files)
-
-
- #%% 99.9% of jobs end here
-
- # Everything after this is run ad hoc and/or requires some manual editing.
-
-
- #%% Compare results files for different model versions (or before/after RDE)
-
- import itertools
-
- from api.batch_processing.postprocessing.compare_batch_results import (
-     BatchComparisonOptions,PairwiseBatchComparisonOptions,compare_batch_results)
-
- options = BatchComparisonOptions()
-
- options.job_name = organization_name_short
- options.output_folder = os.path.join(postprocessing_output_folder,'model_comparison')
- options.image_folder = input_path
-
- options.pairwise_options = []
-
- filenames = [
-     '/postprocessing/organization/mdv4_results.json',
-     '/postprocessing/organization/mdv5a_results.json',
-     '/postprocessing/organization/mdv5b_results.json'
- ]
-
- detection_thresholds = [0.7,0.15,0.15]
-
- assert len(detection_thresholds) == len(filenames)
-
- rendering_thresholds = [(x*0.6666) for x in detection_thresholds]
-
- # Choose all pairwise combinations of the files in [filenames]
- for i, j in itertools.combinations(list(range(0,len(filenames))),2):
-
-     pairwise_options = PairwiseBatchComparisonOptions()
-
-     pairwise_options.results_filename_a = filenames[i]
-     pairwise_options.results_filename_b = filenames[j]
-
-     pairwise_options.rendering_confidence_threshold_a = rendering_thresholds[i]
-     pairwise_options.rendering_confidence_threshold_b = rendering_thresholds[j]
-
-     pairwise_options.detection_thresholds_a = {'animal':detection_thresholds[i],
-                                                'person':detection_thresholds[i],
-                                                'vehicle':detection_thresholds[i]}
-     pairwise_options.detection_thresholds_b = {'animal':detection_thresholds[j],
-                                                'person':detection_thresholds[j],
-                                                'vehicle':detection_thresholds[j]}
-     options.pairwise_options.append(pairwise_options)
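-
- # The loop above chose all C(3,2) = 3 pairwise combinations of the three example
- # files: (mdv4, mdv5a), (mdv4, mdv5b), and (mdv5a, mdv5b).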
-
- results = compare_batch_results(options)
-
- from md_utils.path_utils import open_file
- open_file(results.html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
-
-
- #%% Merge in high-confidence detections from another results file
-
- from api.batch_processing.postprocessing.merge_detections import MergeDetectionsOptions,merge_detections
-
- source_files = ['']
- target_file = ''
- output_file = target_file.replace('.json','_merged.json')
-
- options = MergeDetectionsOptions()
- options.max_detection_size = 1.0
- options.target_confidence_threshold = 0.25
- options.categories_to_include = [1]
- options.source_confidence_thresholds = [0.2]
- merge_detections(source_files, target_file, output_file, options)
-
- merged_detections_file = output_file
-
-
- #%% Create a new category for large boxes
-
- from api.batch_processing.postprocessing import categorize_detections_by_size
-
- size_options = categorize_detections_by_size.SizeCategorizationOptions()
-
- size_options.size_thresholds = [0.9]
- size_options.size_category_names = ['large_detections']
-
- size_options.categories_to_separate = [1]
- size_options.measurement = 'size' # 'width'
-
- threshold_string = '-'.join([str(x) for x in size_options.size_thresholds])
-
- input_file = filtered_output_filename
- size_separated_file = input_file.replace('.json','-size-separated-{}.json'.format(
-     threshold_string))
- d = categorize_detections_by_size.categorize_detections_by_size(input_file,size_separated_file,
-                                                                 size_options)
-
-
- #%% Preview large boxes
-
- output_base_large_boxes = os.path.join(postprocessing_output_folder,
-     base_task_name + '_{}_{:.3f}_size_separated_boxes'.format(rde_string, options.confidence_threshold))
- os.makedirs(output_base_large_boxes, exist_ok=True)
- print('Processing post-RDE, post-size-separation to {}'.format(output_base_large_boxes))
-
- options.md_results_file = size_separated_file
- options.output_dir = output_base_large_boxes
-
- ppresults = process_batch_results(options)
- html_output_file = ppresults.output_html_file
- path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
-
-
- #%% .json splitting
-
- data = None
-
- from api.batch_processing.postprocessing.subset_json_detector_output import (
-     subset_json_detector_output, SubsetJsonDetectorOutputOptions)
-
- input_filename = filtered_output_filename
- output_base = os.path.join(combined_api_output_folder,base_task_name + '_json_subsets')
-
- print('Processing file {} to {}'.format(input_filename,output_base))
-
- options = SubsetJsonDetectorOutputOptions()
- # options.query = None
- # options.replacement = None
-
- options.split_folders = True
- options.make_folder_relative = True
-
- # Reminder: 'n_from_bottom' with a parameter of zero is the same as 'bottom'
- options.split_folder_mode = 'bottom' # 'top', 'n_from_top', 'n_from_bottom'
- options.split_folder_param = 0
- options.overwrite_json_files = False
- options.confidence_threshold = 0.01
-
- subset_data = subset_json_detector_output(input_filename, output_base, options, data)
-
- # Zip the subsets folder
- from md_utils.path_utils import zip_folder
- zip_folder(output_base,verbose=True)
-
-
- #%% Custom splitting/subsetting
-
- data = None
-
- from api.batch_processing.postprocessing.subset_json_detector_output import (
-     subset_json_detector_output, SubsetJsonDetectorOutputOptions)
-
- input_filename = filtered_output_filename
- output_base = os.path.join(filename_base,'json_subsets')
-
- folders = os.listdir(input_path)
-
- if data is None:
-     with open(input_filename) as f:
-         data = json.load(f)
-
- print('Data set contains {} images'.format(len(data['images'])))
-
- # i_folder = 0; folder_name = folders[i_folder]
- for i_folder, folder_name in enumerate(folders):
-
-     output_filename = os.path.join(output_base, folder_name + '.json')
-     print('Processing folder {} of {} ({}) to {}'.format(i_folder, len(folders), folder_name,
-                                                          output_filename))
-
-     options = SubsetJsonDetectorOutputOptions()
-     options.confidence_threshold = 0.01
-     options.overwrite_json_files = True
-     options.query = folder_name + '/'
-
-     # This doesn't do anything in this case, since we're not splitting folders
-     # options.make_folder_relative = True
-
-     subset_data = subset_json_detector_output(input_filename, output_filename, options, data)
-
-
- #%% String replacement
-
- data = None
-
- from api.batch_processing.postprocessing.subset_json_detector_output import (
-     subset_json_detector_output, SubsetJsonDetectorOutputOptions)
-
- input_filename = filtered_output_filename
- output_filename = input_filename.replace('.json','_replaced.json')
-
- options = SubsetJsonDetectorOutputOptions()
- options.query = folder_name + '/'
- options.replacement = ''
- subset_json_detector_output(input_filename,output_filename,options)
-
-
- #%% Splitting images into folders
-
- from api.batch_processing.postprocessing.separate_detections_into_folders import (
-     separate_detections_into_folders, SeparateDetectionsIntoFoldersOptions)
-
- default_threshold = 0.2
- base_output_folder = os.path.expanduser('~/data/{}-{}-separated'.format(base_task_name,default_threshold))
-
- options = SeparateDetectionsIntoFoldersOptions(default_threshold)
-
- options.results_file = filtered_output_filename
- options.base_input_folder = input_path
- options.base_output_folder = os.path.join(base_output_folder,folder_name)
- options.n_threads = default_workers_for_parallel_tasks
- options.allow_existing_directory = False
-
- separate_detections_into_folders(options)
-
-
- #%% Convert frame-level results to video-level results
-
- # This cell is only useful if the files submitted to this job were generated via
- # video_folder_to_frames().
-
- from detection.video_utils import frame_results_to_video_results
-
- video_output_filename = filtered_output_filename.replace('.json','_aggregated.json')
- frame_results_to_video_results(filtered_output_filename,video_output_filename)
-
-
- #%% Sample custom path replacement function
-
- def custom_relative_path_to_location(relative_path):
-
-     relative_path = relative_path.replace('\\','/')
-     tokens = relative_path.split('/')
-     location_name = '/'.join(tokens[0:2])
-     return location_name
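-
- # For example (hypothetical path), this maps 'site-a/cam01/2023/IMG0001.JPG'
- # to the location 'site-a/cam01'.
-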
-
- #%% Test relative_path_to_location on the current dataset
-
- with open(combined_api_output_file,'r') as f:
-     d = json.load(f)
- image_filenames = [im['file'] for im in d['images']]
-
- location_names = set()
-
- # relative_path = image_filenames[0]
- for relative_path in tqdm(image_filenames):
-     location_name = relative_path_to_location(relative_path)
-     location_names.add(location_name)
-
- location_names = list(location_names)
- location_names.sort()
-
- for s in location_names:
-     print(s)
-
-
- #%% End notebook: turn this script into a notebook (how meta!)
-
- import os
- import nbformat as nbf
-
- if os.name == 'nt':
-     git_base = r'c:\git'
- else:
-     git_base = os.path.expanduser('~/git')
-
- input_py_file = git_base + '/MegaDetector/api/batch_processing/data_preparation/manage_local_batch.py'
- assert os.path.isfile(input_py_file)
- output_ipynb_file = input_py_file.replace('.py','.ipynb')
-
- nb_header = '# Managing a local MegaDetector batch'
-
- nb_header += '\n'
-
- nb_header += \
- """
- This notebook represents an interactive process for running MegaDetector on large batches of images, including typical and optional postprocessing steps. Everything after "Merge results..." is basically optional, and we typically do a mix of these optional steps, depending on the job.
-
- This notebook is auto-generated from manage_local_batch.py (a cell-delimited .py file that is used the same way, typically in Spyder or VS Code).
-
- """
-
- with open(input_py_file,'r') as f:
-     lines = f.readlines()
-
- i_line = 0
-
- header_comment = ''
-
- # Delete a few lines from the top that don't belong in the NB version, e.g. the name
- # of the .py file
- lines_to_ignore = 7
- expected_first_token = '# This script'
- found_first_token = False
-
- # Everything before the first cell is the header comment
- while not lines[i_line].startswith('#%%'):
-
-     if i_line < lines_to_ignore:
-         i_line += 1
-         continue
-
-     if not found_first_token:
-         assert lines[i_line].startswith(expected_first_token)
-         found_first_token = True
-
-     s = lines[i_line].replace('#','').strip()
-     if len(s) == 0:
-         header_comment += '\n\n'
-     else:
-         header_comment += ' ' + s
-     i_line += 1
-
- nb_header += header_comment
- nb = nbf.v4.new_notebook()
- nb['cells'].append(nbf.v4.new_markdown_cell(nb_header))
-
- current_cell = []
-
- def write_code_cell(c):
-
-     first_non_empty_line = None
-     last_non_empty_line = None
-
-     for i_code_line,code_line in enumerate(c):
-         if len(code_line.strip()) > 0:
-             if first_non_empty_line is None:
-                 first_non_empty_line = i_code_line
-             last_non_empty_line = i_code_line
-
-     # Trim leading and trailing empty lines
-     c = c[first_non_empty_line:]
-     last_non_empty_line -= first_non_empty_line
-     c = c[:last_non_empty_line+1]
-
-     nb['cells'].append(nbf.v4.new_code_cell('\n'.join(c)))
-
- while True:
-
-     line = lines[i_line].rstrip()
-
-     if 'end notebook' in line.lower():
-         break
-
-     if lines[i_line].startswith('#%% '):
-         if len(current_cell) > 0:
-             write_code_cell(current_cell)
-             current_cell = []
-         markdown_content = line.replace('#%%','##')
-         nb['cells'].append(nbf.v4.new_markdown_cell(markdown_content))
-     else:
-         current_cell.append(line)
-
-     i_line += 1
-
- # Add the last cell
- write_code_cell(current_cell)
-
- nbf.write(nb,output_ipynb_file)
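-
- # The resulting .ipynb can be opened like any other notebook; e.g., assuming
- # Jupyter is installed:
- #
- # jupyter notebook manage_local_batch.ipynb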