megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (191)
  1. api/__init__.py +0 -0
  2. api/batch_processing/__init__.py +0 -0
  3. api/batch_processing/api_core/__init__.py +0 -0
  4. api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. api/batch_processing/api_core/batch_service/score.py +0 -1
  6. api/batch_processing/api_core/server_job_status_table.py +0 -1
  7. api/batch_processing/api_core_support/__init__.py +0 -0
  8. api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
  9. api/batch_processing/api_support/__init__.py +0 -0
  10. api/batch_processing/api_support/summarize_daily_activity.py +0 -1
  11. api/batch_processing/data_preparation/__init__.py +0 -0
  12. api/batch_processing/data_preparation/manage_local_batch.py +93 -79
  13. api/batch_processing/data_preparation/manage_video_batch.py +8 -8
  14. api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
  15. api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
  16. api/batch_processing/postprocessing/__init__.py +0 -0
  17. api/batch_processing/postprocessing/add_max_conf.py +12 -12
  18. api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
  19. api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
  20. api/batch_processing/postprocessing/compare_batch_results.py +114 -44
  21. api/batch_processing/postprocessing/convert_output_format.py +62 -19
  22. api/batch_processing/postprocessing/load_api_results.py +17 -20
  23. api/batch_processing/postprocessing/md_to_coco.py +31 -21
  24. api/batch_processing/postprocessing/md_to_labelme.py +165 -68
  25. api/batch_processing/postprocessing/merge_detections.py +40 -15
  26. api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
  27. api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
  28. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
  29. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
  30. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
  31. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
  32. api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
  33. api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
  34. api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
  35. api/synchronous/__init__.py +0 -0
  36. api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  37. api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
  38. api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
  39. api/synchronous/api_core/animal_detection_api/config.py +35 -35
  40. api/synchronous/api_core/tests/__init__.py +0 -0
  41. api/synchronous/api_core/tests/load_test.py +109 -109
  42. classification/__init__.py +0 -0
  43. classification/aggregate_classifier_probs.py +21 -24
  44. classification/analyze_failed_images.py +11 -13
  45. classification/cache_batchapi_outputs.py +51 -51
  46. classification/create_classification_dataset.py +69 -68
  47. classification/crop_detections.py +54 -53
  48. classification/csv_to_json.py +97 -100
  49. classification/detect_and_crop.py +105 -105
  50. classification/evaluate_model.py +43 -42
  51. classification/identify_mislabeled_candidates.py +47 -46
  52. classification/json_to_azcopy_list.py +10 -10
  53. classification/json_validator.py +72 -71
  54. classification/map_classification_categories.py +44 -43
  55. classification/merge_classification_detection_output.py +68 -68
  56. classification/prepare_classification_script.py +157 -154
  57. classification/prepare_classification_script_mc.py +228 -228
  58. classification/run_classifier.py +27 -26
  59. classification/save_mislabeled.py +30 -30
  60. classification/train_classifier.py +20 -20
  61. classification/train_classifier_tf.py +21 -22
  62. classification/train_utils.py +10 -10
  63. data_management/__init__.py +0 -0
  64. data_management/annotations/__init__.py +0 -0
  65. data_management/annotations/annotation_constants.py +18 -31
  66. data_management/camtrap_dp_to_coco.py +238 -0
  67. data_management/cct_json_utils.py +107 -59
  68. data_management/cct_to_md.py +176 -158
  69. data_management/cct_to_wi.py +247 -219
  70. data_management/coco_to_labelme.py +272 -0
  71. data_management/coco_to_yolo.py +86 -62
  72. data_management/databases/__init__.py +0 -0
  73. data_management/databases/add_width_and_height_to_db.py +20 -16
  74. data_management/databases/combine_coco_camera_traps_files.py +35 -31
  75. data_management/databases/integrity_check_json_db.py +130 -83
  76. data_management/databases/subset_json_db.py +25 -16
  77. data_management/generate_crops_from_cct.py +27 -45
  78. data_management/get_image_sizes.py +188 -144
  79. data_management/importers/add_nacti_sizes.py +8 -8
  80. data_management/importers/add_timestamps_to_icct.py +78 -78
  81. data_management/importers/animl_results_to_md_results.py +158 -160
  82. data_management/importers/auckland_doc_test_to_json.py +9 -9
  83. data_management/importers/auckland_doc_to_json.py +8 -8
  84. data_management/importers/awc_to_json.py +7 -7
  85. data_management/importers/bellevue_to_json.py +15 -15
  86. data_management/importers/cacophony-thermal-importer.py +13 -13
  87. data_management/importers/carrizo_shrubfree_2018.py +8 -8
  88. data_management/importers/carrizo_trail_cam_2017.py +8 -8
  89. data_management/importers/cct_field_adjustments.py +9 -9
  90. data_management/importers/channel_islands_to_cct.py +10 -10
  91. data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
  92. data_management/importers/ena24_to_json.py +7 -7
  93. data_management/importers/filenames_to_json.py +8 -8
  94. data_management/importers/helena_to_cct.py +7 -7
  95. data_management/importers/idaho-camera-traps.py +7 -7
  96. data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
  97. data_management/importers/jb_csv_to_json.py +9 -9
  98. data_management/importers/mcgill_to_json.py +8 -8
  99. data_management/importers/missouri_to_json.py +18 -18
  100. data_management/importers/nacti_fieldname_adjustments.py +10 -10
  101. data_management/importers/noaa_seals_2019.py +8 -8
  102. data_management/importers/pc_to_json.py +7 -7
  103. data_management/importers/plot_wni_giraffes.py +7 -7
  104. data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
  105. data_management/importers/prepare_zsl_imerit.py +7 -7
  106. data_management/importers/rspb_to_json.py +8 -8
  107. data_management/importers/save_the_elephants_survey_A.py +8 -8
  108. data_management/importers/save_the_elephants_survey_B.py +9 -9
  109. data_management/importers/snapshot_safari_importer.py +26 -26
  110. data_management/importers/snapshot_safari_importer_reprise.py +665 -665
  111. data_management/importers/snapshot_serengeti_lila.py +14 -14
  112. data_management/importers/sulross_get_exif.py +8 -9
  113. data_management/importers/timelapse_csv_set_to_json.py +11 -11
  114. data_management/importers/ubc_to_json.py +13 -13
  115. data_management/importers/umn_to_json.py +7 -7
  116. data_management/importers/wellington_to_json.py +8 -8
  117. data_management/importers/wi_to_json.py +9 -9
  118. data_management/importers/zamba_results_to_md_results.py +181 -181
  119. data_management/labelme_to_coco.py +309 -159
  120. data_management/labelme_to_yolo.py +103 -60
  121. data_management/lila/__init__.py +0 -0
  122. data_management/lila/add_locations_to_island_camera_traps.py +9 -9
  123. data_management/lila/add_locations_to_nacti.py +147 -147
  124. data_management/lila/create_lila_blank_set.py +114 -31
  125. data_management/lila/create_lila_test_set.py +8 -8
  126. data_management/lila/create_links_to_md_results_files.py +106 -106
  127. data_management/lila/download_lila_subset.py +92 -90
  128. data_management/lila/generate_lila_per_image_labels.py +56 -43
  129. data_management/lila/get_lila_annotation_counts.py +18 -15
  130. data_management/lila/get_lila_image_counts.py +11 -11
  131. data_management/lila/lila_common.py +103 -70
  132. data_management/lila/test_lila_metadata_urls.py +132 -116
  133. data_management/ocr_tools.py +173 -128
  134. data_management/read_exif.py +161 -99
  135. data_management/remap_coco_categories.py +84 -0
  136. data_management/remove_exif.py +58 -62
  137. data_management/resize_coco_dataset.py +32 -44
  138. data_management/wi_download_csv_to_coco.py +246 -0
  139. data_management/yolo_output_to_md_output.py +86 -73
  140. data_management/yolo_to_coco.py +535 -95
  141. detection/__init__.py +0 -0
  142. detection/detector_training/__init__.py +0 -0
  143. detection/process_video.py +85 -33
  144. detection/pytorch_detector.py +43 -25
  145. detection/run_detector.py +157 -72
  146. detection/run_detector_batch.py +189 -114
  147. detection/run_inference_with_yolov5_val.py +118 -51
  148. detection/run_tiled_inference.py +113 -42
  149. detection/tf_detector.py +51 -28
  150. detection/video_utils.py +606 -521
  151. docs/source/conf.py +43 -0
  152. md_utils/__init__.py +0 -0
  153. md_utils/azure_utils.py +9 -9
  154. md_utils/ct_utils.py +249 -70
  155. md_utils/directory_listing.py +59 -64
  156. md_utils/md_tests.py +968 -862
  157. md_utils/path_utils.py +655 -155
  158. md_utils/process_utils.py +157 -133
  159. md_utils/sas_blob_utils.py +20 -20
  160. md_utils/split_locations_into_train_val.py +45 -32
  161. md_utils/string_utils.py +33 -10
  162. md_utils/url_utils.py +208 -27
  163. md_utils/write_html_image_list.py +51 -35
  164. md_visualization/__init__.py +0 -0
  165. md_visualization/plot_utils.py +102 -109
  166. md_visualization/render_images_with_thumbnails.py +34 -34
  167. md_visualization/visualization_utils.py +908 -311
  168. md_visualization/visualize_db.py +109 -58
  169. md_visualization/visualize_detector_output.py +61 -42
  170. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
  171. megadetector-5.0.9.dist-info/RECORD +224 -0
  172. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
  173. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
  174. taxonomy_mapping/__init__.py +0 -0
  175. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
  176. taxonomy_mapping/map_new_lila_datasets.py +154 -154
  177. taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
  178. taxonomy_mapping/preview_lila_taxonomy.py +591 -591
  179. taxonomy_mapping/retrieve_sample_image.py +12 -12
  180. taxonomy_mapping/simple_image_download.py +11 -11
  181. taxonomy_mapping/species_lookup.py +10 -10
  182. taxonomy_mapping/taxonomy_csv_checker.py +18 -18
  183. taxonomy_mapping/taxonomy_graph.py +47 -47
  184. taxonomy_mapping/validate_lila_category_mappings.py +83 -76
  185. data_management/cct_json_to_filename_json.py +0 -89
  186. data_management/cct_to_csv.py +0 -140
  187. data_management/databases/remove_corrupted_images_from_db.py +0 -191
  188. detection/detector_training/copy_checkpoints.py +0 -43
  189. md_visualization/visualize_megadb.py +0 -183
  190. megadetector-5.0.7.dist-info/RECORD +0 -202
  191. {megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0
data_management/lila/create_lila_blank_set.py

@@ -1,12 +1,16 @@
- ########
- #
- # create_lila_blank_set.py
- #
- # Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
- # locations will be oversampled relative to more common locations. We'll also run MegaDetector
- # to minimize the chance that incorrectly-labeled non-empty images sneak into our blank set.
- #
- ########
+ """
+
+ create_lila_blank_set.py
+
+ Create a folder of blank images sampled from LILA. We'll aim for diversity, so less-common
+ locations will be oversampled relative to more common locations. We'll also run MegaDetector
+ (with manual review) to remove some incorrectly-labeled, not-actually-empty images from our
+ blank set.
+
+ We'll store location information for each image in a .json file, so we can split locations
+ into train/val in downstream tasks.
+
+ """

  #%% Constants and imports

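The new docstring promises location diversity: less-common locations are oversampled relative to common ones. The sampling code itself is not part of this hunk, so the following is only a hedged sketch of one way to get that behavior (the function and variable names here are illustrative, not the package's):

# Illustrative sketch of diversity-oriented sampling; NOT the package's actual code.
# Weighting each location by count**-0.5 oversamples rare locations.
import random

def sample_blank_urls(location_to_blank_image_urls, n_samples, seed=0):
    rng = random.Random(seed)
    locations = list(location_to_blank_image_urls.keys())
    weights = [len(location_to_blank_image_urls[loc]) ** -0.5 for loc in locations]
    sampled_urls = []
    for _ in range(n_samples):
        loc = rng.choices(locations, weights=weights, k=1)[0]
        sampled_urls.append(rng.choice(location_to_blank_image_urls[loc]))
    return sampled_urls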
@@ -14,7 +18,6 @@ import os
  import random
  import math
  import json
- import shutil

  import numpy as np
  from tqdm import tqdm
@@ -22,8 +25,7 @@ from multiprocessing.pool import ThreadPool
  from urllib.parse import urlparse
  from collections import defaultdict

- from data_management.lila.lila_common import \
-     read_lila_all_images_file, azure_url_to_gcp_http_url
+ from data_management.lila.lila_common import read_lila_all_images_file
  from md_utils.url_utils import download_url
  from md_visualization import visualization_utils as vis_utils
  from md_utils.path_utils import recursive_file_list
@@ -45,6 +47,14 @@ os.makedirs(confirmed_blanks_base,exist_ok=True)
  md_possible_non_blanks_folder = os.path.join(project_base,'candidate_non_blanks')
  os.makedirs(md_possible_non_blanks_folder,exist_ok=True)

+ location_to_blank_image_urls_cache_file = os.path.join(project_base,
+     'location_to_blank_image_urls.json')
+
+ md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
+
+ all_fn_relative_to_location_file = os.path.join(project_base,'all_fn_relative_to_location.json')
+ confirmed_fn_relative_to_location_file = os.path.join(project_base,'confirmed_fn_relative_to_location.json')
+
  preferred_image_download_source = 'gcp'

  # Number of concurrent download threads
@@ -171,9 +181,6 @@ for s in original_labels_with_nan_common_names:

  #%% Map locations to blank images

- location_to_blank_image_urls_cache_file = os.path.join(project_base,
-     'location_to_blank_image_urls.json')
-
  force_map_locations = False

  # Load from .json if available
@@ -275,7 +282,7 @@ print('Max samples per location: {}'.format(max_blanks_per_location))

  #%% Download those image files (prep)

- container_to_url_base = {
+ container_to_url_base = {
      'lilablobssc.blob.core.windows.net':'/',
      'storage.googleapis.com':'/public-datasets-lila/'
  }
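container_to_url_base maps each storage hostname to the path prefix that sits between the hostname and the dataset-relative filename. The download helper that consumes this dict isn't shown in this hunk, so the following is a sketch under that assumption (names illustrative): stripping the prefix from a parsed URL path yields the same relative filename for either mirror.

# Sketch of how such a mapping can normalize URLs to relative filenames;
# the script's actual download helper is not part of this hunk.
from urllib.parse import urlparse

container_to_url_base = {
    'lilablobssc.blob.core.windows.net': '/',
    'storage.googleapis.com': '/public-datasets-lila/'
}

def url_to_relative_filename(url):
    # Strip the host-specific prefix so Azure and GCP URLs for the same
    # image map to the same relative path.
    p = urlparse(url)
    url_base = container_to_url_base[p.netloc]
    assert p.path.startswith(url_base)
    return p.path[len(url_base):]

# Both mirrors resolve to 'caltech-unzipped/cct_images/x.jpg':
print(url_to_relative_filename(
    'https://lilablobssc.blob.core.windows.net/caltech-unzipped/cct_images/x.jpg'))
print(url_to_relative_filename(
    'https://storage.googleapis.com/public-datasets-lila/caltech-unzipped/cct_images/x.jpg'))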
@@ -318,6 +325,21 @@ def download_relative_filename(url, output_base, verbose=False, url_base=None, o
      result['status'] = 'success'
      return result

+ def azure_url_to_gcp_http_url(url,error_if_not_azure_url=True):
+     """
+     Most URLs point to Azure by default, but most files are available on both Azure and GCP.
+     This function converts an Azure URL to the corresponding GCP http:// url.
+     """
+
+     lila_azure_storage_account = 'https://lilablobssc.blob.core.windows.net'
+     gcp_bucket_api_url = 'https://storage.googleapis.com/public-datasets-lila'
+     error_if_not_azure_url = False
+
+     if error_if_not_azure_url:
+         assert url.startswith(lila_azure_storage_account)
+     gcp_url = url.replace(lila_azure_storage_account,gcp_bucket_api_url,1)
+     return gcp_url
+
  # Convert Azure URLs to GCP URLs if necessary
  if preferred_image_download_source != 'azure':
      assert preferred_image_download_source == 'gcp'
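The new local azure_url_to_gcp_http_url (previously imported from lila_common, per the import hunk above) is a plain prefix rewrite. Note that the hard-coded error_if_not_azure_url = False inside the body overrides the parameter, so the startswith assertion never fires in this version. The conversion itself, shown standalone:

# The rewrite performed by azure_url_to_gcp_http_url, using the constants
# from the function body above:
azure_url = ('https://lilablobssc.blob.core.windows.net/'
             'caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg')
gcp_url = azure_url.replace('https://lilablobssc.blob.core.windows.net',
                            'https://storage.googleapis.com/public-datasets-lila', 1)
print(gcp_url)
# https://storage.googleapis.com/public-datasets-lila/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg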
@@ -358,8 +380,6 @@ print('Errors on {} of {} downloads'.format(len(error_urls),len(results)))

  #%% Run MegaDetector on the folder

- md_results_file = os.path.join(project_base,'lila_blanks_md_results.json')
-
  cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
      candidate_blanks_base,md_results_file)
  cmd += ' --recursive --output_relative_filenames'
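With the constants defined earlier in the script, the assembled command line looks like the following; the paths shown here are hypothetical, since project_base is machine-specific and not part of this diff.

# Hypothetical paths; project_base is machine-specific.
candidate_blanks_base = '/data/lila-blanks/candidate_blanks'
md_results_file = '/data/lila-blanks/lila_blanks_md_results.json'

cmd = 'python run_detector_batch.py MDV5A "{}" "{}"'.format(
    candidate_blanks_base, md_results_file)
cmd += ' --recursive --output_relative_filenames'
print(cmd)
# Expected output (one line, wrapped here for readability):
# python run_detector_batch.py MDV5A "/data/lila-blanks/candidate_blanks"
#   "/data/lila-blanks/lila_blanks_md_results.json" --recursive --output_relative_filenames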
@@ -419,6 +439,7 @@ for i_fn,source_file_relative in tqdm(enumerate(images_to_review_to_detections),
                                        confidence_threshold=min_threshold,
                                        target_size=(1280,-1))

+ # This is a temporary file I just used during debugging
  with open(os.path.join(project_base,'output_file_to_source_file.json'),'w') as f:
      json.dump(output_file_to_source_file,f,indent=1)

@@ -442,33 +463,95 @@ for output_file in tqdm(output_file_to_source_file.keys()):
      source_file_relative = output_file_to_source_file[output_file]
      removed_blank_images_relative.append(source_file_relative)

+ removed_blank_images_relative_set = set(removed_blank_images_relative)
  assert len(removed_blank_images_relative) + len(remaining_images) == len(output_file_to_source_file)


- #%% Copy all the confirmed blanks to the confirmed folder
+ #%% Copy only the confirmed blanks to the confirmed folder
+
+ from md_utils.path_utils import is_image_file

  all_candidate_blanks = recursive_file_list(candidate_blanks_base,return_relative_paths=True)
  print('Found {} candidate blanks'.format(len(all_candidate_blanks)))

+ skipped_images_relative = []
+ skipped_non_images = []
+
  for source_fn_relative in tqdm(all_candidate_blanks):
+
+     # Skip anything we removed from the "candidate non-blanks" folder; these weren't really
+     # blank.
+     if source_fn_relative in removed_blank_images_relative_set:
+         skipped_images_relative.append(source_fn_relative)
+         continue
+
+     if not is_image_file(source_fn_relative):
+         # Not a typo; "skipped images" really means "skipped files"
+         skipped_images_relative.append(source_fn_relative)
+         skipped_non_images.append(source_fn_relative)
+
+
      source_fn_abs = os.path.join(candidate_blanks_base,source_fn_relative)
      assert os.path.isfile(source_fn_abs)
      target_fn_abs = os.path.join(confirmed_blanks_base,source_fn_relative)
      os.makedirs(os.path.dirname(target_fn_abs),exist_ok=True)
-     shutil.copyfile(source_fn_abs,target_fn_abs)
+     # shutil.copyfile(source_fn_abs,target_fn_abs)

+ print('Skipped {} files ({} non-image files)'.format(len(skipped_images_relative),
+                                                      len(skipped_non_images)))

- #%% Record location information for each file

- fn_relative_to_location = {}
- for location in location_to_blank_image_urls:
-     urls_this_location = location_to_blank_image_urls[location]
-     for url in urls_this_location:
-         fn_relative = url.split('//')[1]
-         fn_relative_to_location[fn_relative] = location
-
- all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+ #%% Validate the folder of confirmed blanks
+
+ from md_utils.path_utils import find_images
+ # all_confirmed_blanks = recursive_file_list(confirmed_blanks_base,return_relative_paths=True)
+ all_confirmed_blanks = find_images(confirmed_blanks_base,return_relative_paths=True,recursive=True)
+ assert len(all_confirmed_blanks) < len(all_candidate_blanks)
  print('Found {} confirmed blanks'.format(len(all_confirmed_blanks)))

- for fn_relative in all_confirmed_blanks:
-     assert fn_relative in fn_relative_to_location
+
+ #%% Manually review a few of the images we skipped
+
+ # ...to make sure they're non-blank
+ i_image = random.randint(0, len(skipped_images_relative))
+ fn_relative = skipped_images_relative[i_image]
+ fn_abs = os.path.join(candidate_blanks_base,fn_relative)
+ assert os.path.isfile(fn_abs)
+ import clipboard
+ clipboard.copy('feh --scale-down "{}"'.format(fn_abs))
+
+
+ #%% Record location information for each confirmed file
+
+ # Map every URL's path to the corresponding location
+ #
+ # This is *all empty URLs*, not just the ones we downloaded
+ all_fn_relative_to_location = {}
+
+ # location = next(iter(location_to_blank_image_urls.keys()))
+ for location in tqdm(location_to_blank_image_urls):
+     urls_this_location = location_to_blank_image_urls[location]
+
+     # url = urls_this_location[0]
+     for url in urls_this_location:
+         # Turn:
+         #
+         # https://lilablobssc.blob.core.windows.net/caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg
+         #
+         # ...into:
+         #
+         # caltech-unzipped/cct_images/5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg
+         p = urlparse(url)
+         fn_relative = str(p.path)[1:]
+         all_fn_relative_to_location[fn_relative] = location
+
+ # Build a much smaller mapping of just the confirmed blanks
+ confirmed_fn_relative_to_location = {}
+ for i_fn,fn_relative in tqdm(enumerate(all_confirmed_blanks),total=len(all_confirmed_blanks)):
+     confirmed_fn_relative_to_location[fn_relative] = all_fn_relative_to_location[fn_relative]
+
+ with open(all_fn_relative_to_location_file,'w') as f:
+     json.dump(all_fn_relative_to_location,f,indent=1)
+
+ with open(confirmed_fn_relative_to_location_file,'w') as f:
+     json.dump(confirmed_fn_relative_to_location,f,indent=1)
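The two .json files written above exist so that locations, not individual images, can be split between train and val downstream; the package also ships md_utils/split_locations_into_train_val.py for this. A minimal sketch of such a split, assuming only the confirmed mapping written above, and not the actual API of that module:

# Hedged sketch of a downstream location-based split; NOT the actual API of
# md_utils/split_locations_into_train_val.py.
import json
import random

with open('confirmed_fn_relative_to_location.json','r') as f:
    fn_relative_to_location = json.load(f)

locations = sorted(set(fn_relative_to_location.values()))
random.Random(0).shuffle(locations)
val_locations = set(locations[:round(0.15 * len(locations))])

# Every image from a given location lands on the same side of the split, so
# near-duplicate frames from one camera can't leak between train and val.
val_files = [fn for fn, loc in fn_relative_to_location.items() if loc in val_locations]
train_files = [fn for fn, loc in fn_relative_to_location.items() if loc not in val_locations]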
data_management/lila/create_lila_test_set.py

@@ -1,11 +1,11 @@
- ########
- #
- # create_lila_test_set.py
- #
- # Create a test set of camera trap images, containing N empty and N non-empty
- # images from each LILA data set.
- #
- ########
+ """
+
+ create_lila_test_set.py
+
+ Create a test set of camera trap images, containing N empty and N non-empty
+ images from each LILA data set.
+
+ """

  #%% Constants and imports

data_management/lila/create_links_to_md_results_files.py

@@ -1,106 +1,106 @@
- ########
- #
- # create_links_to_md_results_files.py
- #
- # One-off script to populate the columns in the camera trap data .csv file that point to MD results.
- #
- ########
-
- #%% Imports and constants
-
- import os
-
- import pandas as pd
-
- input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
- output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
-
- md_results_local_folder = r'g:\temp\lila-md-results'
- md_base_url = 'https://lila.science/public/lila-md-results/'
- assert md_base_url.endswith('/')
-
- # No RDE files for datasets with no location information
- datasets_without_location_info = ('ena24','missouri-camera-traps')
-
- md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
-
- validate_urls = False
-
-
- #%% Read input data
-
- df = pd.read_csv(input_csv_file)
- for s in md_results_column_names:
-     df[s] = ''
-
-
- #%% Find matching files locally, and create URLs
-
- local_files = os.listdir(md_results_local_folder)
- local_files = [fn for fn in local_files if fn.endswith('.zip')]
-
- # i_row = 0; row = df.iloc[i_row]
- for i_row,row in df.iterrows():
-
-     if not isinstance(row['name'],str):
-         continue
-
-     dataset_shortname = row['short_name']
-     matching_files = [fn for fn in local_files if dataset_shortname in fn]
-
-     # No RDE files for datasets with no location information
-     if dataset_shortname in datasets_without_location_info:
-         assert len(matching_files) == 2
-         mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
-         mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
-         assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
-         df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
-         df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
-     else:
-         # Exclude single-season files for snapshot-serengeti
-         if dataset_shortname == 'snapshot-serengeti':
-             matching_files = [fn for fn in matching_files if '_S' not in fn]
-             assert len(matching_files) == 2
-             assert all(['mdv4' in fn for fn in matching_files])
-             rde_files = [fn for fn in matching_files if 'rde' in fn]
-             raw_files = [fn for fn in matching_files if 'rde' not in fn]
-             assert len(rde_files) == 1 and len(raw_files) == 1
-             df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
-             df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
-         else:
-             assert len(matching_files) == 3
-             mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
-             mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
-             rde_files = [fn for fn in matching_files if 'rde' in fn]
-             assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
-             df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
-             df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
-             df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
-
-     print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
-
- # ...for each row
-
-
- #%% Validate URLs
-
- if validate_urls:
-
-     from md_utils.url_utils import test_urls
-
-     urls = set()
-
-     for i_row,row in df.iterrows():
-         for column_name in md_results_column_names:
-             if len(row[column_name]) > 0:
-                 assert row[column_name] not in urls
-                 urls.add(row[column_name])
-
-     test_urls(urls,error_on_failure=True)
-
-     print('Validated {} URLs'.format(len(urls)))
-
-
- #%% Write new .csv file
-
- df.to_csv(output_csv_file,header=True,index=False)
+ """
+
+ create_links_to_md_results_files.py
+
+ One-off script to populate the columns in the camera trap data .csv file that point to MD results.
+
+ """
+
+ #%% Imports and constants
+
+ import os
+
+ import pandas as pd
+
+ input_csv_file = r'g:\temp\lila_camera_trap_datasets_no_md_results.csv'
+ output_csv_file = r'g:\temp\lila_camera_trap_datasets.csv'
+
+ md_results_local_folder = r'g:\temp\lila-md-results'
+ md_base_url = 'https://lila.science/public/lila-md-results/'
+ assert md_base_url.endswith('/')
+
+ # No RDE files for datasets with no location information
+ datasets_without_location_info = ('ena24','missouri-camera-traps')
+
+ md_results_column_names = ['mdv4_results_raw','mdv5a_results_raw','mdv5b_results_raw','md_results_with_rde']
+
+ validate_urls = False
+
+
+ #%% Read input data
+
+ df = pd.read_csv(input_csv_file)
+ for s in md_results_column_names:
+     df[s] = ''
+
+
+ #%% Find matching files locally, and create URLs
+
+ local_files = os.listdir(md_results_local_folder)
+ local_files = [fn for fn in local_files if fn.endswith('.zip')]
+
+ # i_row = 0; row = df.iloc[i_row]
+ for i_row,row in df.iterrows():
+
+     if not isinstance(row['name'],str):
+         continue
+
+     dataset_shortname = row['short_name']
+     matching_files = [fn for fn in local_files if dataset_shortname in fn]
+
+     # No RDE files for datasets with no location information
+     if dataset_shortname in datasets_without_location_info:
+         assert len(matching_files) == 2
+         mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn]
+         mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn]
+         assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1
+         df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+         df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+     else:
+         # Exclude single-season files for snapshot-serengeti
+         if dataset_shortname == 'snapshot-serengeti':
+             matching_files = [fn for fn in matching_files if '_S' not in fn]
+             assert len(matching_files) == 2
+             assert all(['mdv4' in fn for fn in matching_files])
+             rde_files = [fn for fn in matching_files if 'rde' in fn]
+             raw_files = [fn for fn in matching_files if 'rde' not in fn]
+             assert len(rde_files) == 1 and len(raw_files) == 1
+             df.loc[i_row,'mdv4_results_raw'] = md_base_url + raw_files[0]
+             df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+         else:
+             assert len(matching_files) == 3
+             mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
+             mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
+             rde_files = [fn for fn in matching_files if 'rde' in fn]
+             assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1
+             df.loc[i_row,'mdv5a_results_raw'] = md_base_url + mdv5a_files[0]
+             df.loc[i_row,'mdv5b_results_raw'] = md_base_url + mdv5b_files[0]
+             df.loc[i_row,'md_results_with_rde'] = md_base_url + rde_files[0]
+
+     print('Found {} matching files for {}'.format(len(matching_files),dataset_shortname))
+
+ # ...for each row
+
+
+ #%% Validate URLs
+
+ if validate_urls:
+
+     from md_utils.url_utils import test_urls
+
+     urls = set()
+
+     for i_row,row in df.iterrows():
+         for column_name in md_results_column_names:
+             if len(row[column_name]) > 0:
+                 assert row[column_name] not in urls
+                 urls.add(row[column_name])
+
+     test_urls(urls,error_on_failure=True)
+
+     print('Validated {} URLs'.format(len(urls)))
+
+
+ #%% Write new .csv file
+
+ df.to_csv(output_csv_file,header=True,index=False)
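To make the filename-matching rules above concrete: the actual contents of md_results_local_folder aren't part of this diff, but with hypothetical zip names following the pattern the script expects, a dataset with location information (the final else branch) resolves as follows:

# Hypothetical filenames; the real contents of g:\temp\lila-md-results
# are not shown in this diff.
local_files = ['idaho-camera-traps.mdv5a.json.zip',
               'idaho-camera-traps.mdv5b.json.zip',
               'idaho-camera-traps.mdv5a.rde.json.zip']

dataset_shortname = 'idaho-camera-traps'
matching_files = [fn for fn in local_files if dataset_shortname in fn]

# Same selection logic as the script's final else branch: one raw MDv5a file,
# one raw MDv5b file, and one repeat-detection-elimination (rde) file.
mdv5a_files = [fn for fn in matching_files if 'mdv5a' in fn and 'rde' not in fn]
mdv5b_files = [fn for fn in matching_files if 'mdv5b' in fn and 'rde' not in fn]
rde_files = [fn for fn in matching_files if 'rde' in fn]
assert len(mdv5a_files) == 1 and len(mdv5b_files) == 1 and len(rde_files) == 1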