megadetector 5.0.29__py3-none-any.whl → 10.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.
Files changed (95)
  1. megadetector/classification/efficientnet/model.py +8 -8
  2. megadetector/classification/efficientnet/utils.py +6 -5
  3. megadetector/classification/prepare_classification_script_mc.py +3 -3
  4. megadetector/data_management/annotations/annotation_constants.py +0 -1
  5. megadetector/data_management/camtrap_dp_to_coco.py +34 -1
  6. megadetector/data_management/cct_json_utils.py +2 -2
  7. megadetector/data_management/coco_to_yolo.py +22 -5
  8. megadetector/data_management/databases/add_width_and_height_to_db.py +85 -12
  9. megadetector/data_management/databases/combine_coco_camera_traps_files.py +2 -2
  10. megadetector/data_management/databases/integrity_check_json_db.py +29 -15
  11. megadetector/data_management/generate_crops_from_cct.py +50 -1
  12. megadetector/data_management/labelme_to_coco.py +4 -2
  13. megadetector/data_management/labelme_to_yolo.py +82 -2
  14. megadetector/data_management/lila/generate_lila_per_image_labels.py +276 -18
  15. megadetector/data_management/lila/get_lila_annotation_counts.py +5 -3
  16. megadetector/data_management/lila/lila_common.py +3 -0
  17. megadetector/data_management/lila/test_lila_metadata_urls.py +15 -5
  18. megadetector/data_management/mewc_to_md.py +5 -0
  19. megadetector/data_management/ocr_tools.py +4 -3
  20. megadetector/data_management/read_exif.py +20 -5
  21. megadetector/data_management/remap_coco_categories.py +66 -4
  22. megadetector/data_management/remove_exif.py +50 -1
  23. megadetector/data_management/rename_images.py +3 -3
  24. megadetector/data_management/resize_coco_dataset.py +563 -95
  25. megadetector/data_management/yolo_output_to_md_output.py +131 -2
  26. megadetector/data_management/yolo_to_coco.py +140 -5
  27. megadetector/detection/change_detection.py +4 -3
  28. megadetector/detection/pytorch_detector.py +60 -22
  29. megadetector/detection/run_detector.py +225 -25
  30. megadetector/detection/run_detector_batch.py +42 -16
  31. megadetector/detection/run_inference_with_yolov5_val.py +12 -2
  32. megadetector/detection/run_tiled_inference.py +1 -0
  33. megadetector/detection/video_utils.py +53 -24
  34. megadetector/postprocessing/add_max_conf.py +4 -0
  35. megadetector/postprocessing/categorize_detections_by_size.py +1 -1
  36. megadetector/postprocessing/classification_postprocessing.py +55 -20
  37. megadetector/postprocessing/combine_batch_outputs.py +3 -2
  38. megadetector/postprocessing/compare_batch_results.py +64 -10
  39. megadetector/postprocessing/convert_output_format.py +12 -8
  40. megadetector/postprocessing/create_crop_folder.py +137 -10
  41. megadetector/postprocessing/load_api_results.py +26 -8
  42. megadetector/postprocessing/md_to_coco.py +4 -4
  43. megadetector/postprocessing/md_to_labelme.py +18 -7
  44. megadetector/postprocessing/merge_detections.py +5 -0
  45. megadetector/postprocessing/postprocess_batch_results.py +6 -3
  46. megadetector/postprocessing/remap_detection_categories.py +55 -2
  47. megadetector/postprocessing/render_detection_confusion_matrix.py +9 -6
  48. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +2 -2
  49. megadetector/taxonomy_mapping/map_new_lila_datasets.py +3 -4
  50. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +40 -19
  51. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +1 -1
  52. megadetector/taxonomy_mapping/species_lookup.py +123 -41
  53. megadetector/utils/ct_utils.py +133 -113
  54. megadetector/utils/md_tests.py +93 -13
  55. megadetector/utils/path_utils.py +137 -107
  56. megadetector/utils/split_locations_into_train_val.py +2 -2
  57. megadetector/utils/string_utils.py +7 -7
  58. megadetector/utils/url_utils.py +81 -58
  59. megadetector/utils/wi_utils.py +46 -17
  60. megadetector/visualization/plot_utils.py +13 -9
  61. megadetector/visualization/render_images_with_thumbnails.py +2 -1
  62. megadetector/visualization/visualization_utils.py +94 -46
  63. megadetector/visualization/visualize_db.py +36 -9
  64. megadetector/visualization/visualize_detector_output.py +4 -4
  65. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/METADATA +135 -135
  66. megadetector-10.0.0.dist-info/RECORD +139 -0
  67. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
  68. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
  69. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  70. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  71. megadetector/api/batch_processing/api_core/batch_service/score.py +0 -438
  72. megadetector/api/batch_processing/api_core/server.py +0 -294
  73. megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
  74. megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
  75. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
  76. megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
  77. megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
  78. megadetector/api/batch_processing/api_core/server_utils.py +0 -88
  79. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  80. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
  81. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  82. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
  83. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  84. megadetector/api/synchronous/__init__.py +0 -0
  85. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  86. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
  87. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
  88. megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
  89. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  90. megadetector/api/synchronous/api_core/tests/load_test.py +0 -109
  91. megadetector/utils/azure_utils.py +0 -178
  92. megadetector/utils/sas_blob_utils.py +0 -513
  93. megadetector-5.0.29.dist-info/RECORD +0 -163
  94. /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
  95. {megadetector-5.0.29.dist-info → megadetector-10.0.0.dist-info}/WHEEL +0 -0
@@ -32,27 +32,18 @@ taxonomy_download_dir = os.path.expanduser('~/taxonomy')
 
 taxonomy_urls = {
     'GBIF': 'https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip',
-    'iNaturalist': 'https://www.inaturalist.org/observations/inaturalist-dwca-with-taxa.zip' # pylint: disable=line-too-long
+    'iNaturalist': 'https://www.inaturalist.org/taxa/inaturalist-taxonomy.dwca.zip'
 }
 
 files_to_unzip = {
-    # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
-    # 12.2023, this is no longer the case.
-    # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
     'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
-    'iNaturalist': ['taxa.csv']
+    'iNaturalist': ['taxa.csv','VernacularNames-english.csv']
 }
 
-# As of 2020.05.12:
+# As of 2025.06.24:
 #
-# GBIF: ~777MB zipped, ~1.6GB taxonomy
-# iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
-
-# As of 2023.12.29:
-#
-# GBIF: ~948MB zipped, ~2.2GB taxonomy
-# iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
-
+# GBIF: 950MB zipped, 2.3GB of relevant content unzipped
+# iNat: 71MB zipped, 415MB of relevant content unzipped
 
 os.makedirs(taxonomy_download_dir, exist_ok=True)
 for taxonomy_name in taxonomy_urls:
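
For illustration (outside the diff): a minimal sketch of how a files_to_unzip-style mapping can be consumed, downloading each archive once and extracting only the listed members. The helper name and directory layout below are assumptions for this sketch, not the module's actual code.

    import os
    import zipfile
    import urllib.request

    def download_and_extract(taxonomy_name, zip_url, members, target_dir):
        # Download zip_url (if not already present) and extract only 'members'
        # into target_dir/taxonomy_name
        os.makedirs(target_dir, exist_ok=True)
        zipfile_path = os.path.join(target_dir, taxonomy_name + '.zip')
        if not os.path.isfile(zipfile_path):
            urllib.request.urlretrieve(zip_url, zipfile_path)
        extract_dir = os.path.join(target_dir, taxonomy_name)
        with zipfile.ZipFile(zipfile_path) as zf:
            zf.extractall(path=extract_dir, members=members)
        return extract_dir

With the dictionaries above, this would be called per taxonomy, e.g. download_and_extract('iNaturalist', taxonomy_urls['iNaturalist'], files_to_unzip['iNaturalist'], taxonomy_download_dir).
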
@@ -83,7 +74,7 @@ gbif_scientific_to_taxon_id = None # : Dict[str, np.int64]
 
 # Initialization function
 
-def initialize_taxonomy_lookup(force_init=False) -> None:
+def initialize_taxonomy_lookup(force_init=False):
     """
     Initialize this module by doing the following:
 
@@ -92,8 +83,14 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     * Builds a bunch of dictionaries and tables to facilitate lookup
     * Serializes those tables via pickle
     * Skips all of the above if the serialized pickle file already exists
+
+    Args:
+        force_init (bool, optional): force re-download and parsing of the source .zip files,
+            even if the cached .p file already exists
     """
 
+    #%%
+
     global inat_taxonomy,\
         gbif_taxonomy,\
         gbif_common_mapping,\
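
For illustration (outside the diff): a minimal usage sketch for the function being modified, based only on the module path and the signature shown above.

    from megadetector.taxonomy_mapping import species_lookup

    # First call downloads/parses the taxonomy files, or loads the cached pickle
    species_lookup.initialize_taxonomy_lookup()

    # Re-download and re-parse the source .zip files, ignoring the cached .p file
    species_lookup.initialize_taxonomy_lookup(force_init=True)
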
@@ -109,7 +106,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_scientific_to_taxon_id
 
 
-    ## Load serialized taxonomy info if we've already saved it
+    #%% Load serialized taxonomy info if we've already saved it
 
     if (not force_init) and (inat_taxonomy is not None):
         print('Skipping taxonomy re-init')
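
For illustration (outside the diff): the early return above relies on module-level globals acting as a cache. A self-contained sketch of that pattern, with illustrative names:

    _cache = None

    def initialize(force_init=False):
        global _cache
        # Skip the expensive work unless the caller explicitly forces a rebuild
        if (not force_init) and (_cache is not None):
            print('Skipping re-init')
            return
        # ...expensive download/parsing work would happen here...
        _cache = {'initialized': True}
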
@@ -139,9 +136,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         return
 
 
-    ## If we don't have serialized taxonomy info, create it from scratch.
+    #%% Download and unzip taxonomy files
 
-    # Download and unzip taxonomy files
     # taxonomy_name = list(taxonomy_urls.items())[0][0]; zip_url = list(taxonomy_urls.items())[0][1]
     for taxonomy_name, zip_url in taxonomy_urls.items():
 
@@ -189,21 +185,44 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
         # ...for each file that we need from this zipfile
 
-        # Remove the zipfile
-        # os.remove(zipfile_path)
-
     # ...for each taxonomy
 
 
-    # Create dataframes from each of the taxonomy files, and the GBIF common
-    # name file
+    #%% Create dataframes from each of the taxonomy/vernacular files
 
     # Load iNat taxonomy
     inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
     print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
     inat_taxonomy = pd.read_csv(inat_taxonomy_file)
     inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
-    inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()
+
+    # Delete columns we won't use. The "taxonID" column is a non-int version of "ID"
+    inat_taxonomy = inat_taxonomy.drop(['identifier', 'taxonID', 'modified', 'references'], axis=1)
+
+    # The "parentNameUsageID" column in inat_taxonomy is a URL, like:
+    #
+    # https://www.inaturalist.org/taxa/71262
+    #
+    # Convert this column to be integer-valued, using only the last token of the URL
+    inat_taxonomy['parentNameUsageID'] = \
+        inat_taxonomy['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)
+
+    # Rename the "id" column to "taxonID"
+    inat_taxonomy = inat_taxonomy.rename(columns={'id': 'taxonID'})
+
+    assert 'id' not in inat_taxonomy.columns
+    assert 'taxonID' in inat_taxonomy.columns
+
+    # Load iNat common name mapping
+    inat_common_mapping_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'VernacularNames-english.csv')
+    inat_common_mapping = pd.read_csv(inat_common_mapping_file)
+    inat_common_mapping['vernacularName'] = inat_common_mapping['vernacularName'].fillna('').str.strip()
+
+    inat_common_mapping = inat_common_mapping.drop(['language','locality','countryCode',
+                                                    'source','lexicon','contributor','created'], axis=1)
+    assert 'id' in inat_common_mapping.columns
+    assert 'taxonID' not in inat_common_mapping.columns
+    assert 'vernacularName' in inat_common_mapping.columns
 
     # Load GBIF taxonomy
     gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
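
For illustration (outside the diff): a toy, self-contained version of the parentNameUsageID conversion added above, using synthetic data.

    import pandas as pd

    df = pd.DataFrame({
        'id': [1, 2, 3],
        'parentNameUsageID': ['https://www.inaturalist.org/taxa/71262',
                              'https://www.inaturalist.org/taxa/5',
                              None]  # root taxa have no parent
    })

    # Keep only the last token of the URL; missing parents become 0
    df['parentNameUsageID'] = \
        df['parentNameUsageID'].str.split('/').str[-1].fillna(0).astype(int)

    print(df['parentNameUsageID'].tolist())  # [71262, 5, 0]
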
@@ -211,12 +230,20 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t', encoding='utf-8',on_bad_lines='warn')
     gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
     gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()
+    gbif_taxonomy['parentNameUsageID'] = gbif_taxonomy['parentNameUsageID'].fillna(-1).astype(int)
 
     # Remove questionable rows from the GBIF taxonomy
     gbif_taxonomy = gbif_taxonomy[~gbif_taxonomy['taxonomicStatus'].isin(['doubtful', 'misapplied'])]
     gbif_taxonomy = gbif_taxonomy.reset_index()
 
-    # Load GBIF vernacular name mapping
+    gbif_taxonomy = gbif_taxonomy.drop(['datasetID','acceptedNameUsageID','originalNameUsageID',
+                                        'scientificNameAuthorship','nameAccordingTo','namePublishedIn',
+                                        'taxonomicStatus','nomenclaturalStatus','taxonRemarks'], axis=1)
+
+    assert 'taxonID' in gbif_taxonomy.columns
+    assert 'scientificName' in gbif_taxonomy.columns
+
+    # Load GBIF common name mapping
     gbif_common_mapping = pd.read_csv(os.path.join(
         taxonomy_download_dir, 'GBIF', 'VernacularName.tsv'), sep='\t')
     gbif_common_mapping['vernacularName'] = gbif_common_mapping['vernacularName'].fillna('').str.strip()
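
For illustration (outside the diff): a toy example (synthetic data; on_bad_lines requires pandas 1.3 or later) of why the fillna(-1).astype(int) step added above helps: a column with missing values is read as float, and the -1 sentinel restores an integer dtype.

    import io
    import pandas as pd

    tsv = 'taxonID\tparentNameUsageID\tscientificName\n1\t\tAnimalia\n2\t1\tChordata\n'
    df = pd.read_csv(io.StringIO(tsv), sep='\t', on_bad_lines='warn')

    print(df['parentNameUsageID'].dtype)  # float64, because of the missing value
    df['parentNameUsageID'] = df['parentNameUsageID'].fillna(-1).astype(int)
    print(df['parentNameUsageID'].tolist())  # [-1, 1]
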
@@ -225,6 +252,12 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     gbif_common_mapping = gbif_common_mapping.loc[gbif_common_mapping['language'] == 'en']
     gbif_common_mapping = gbif_common_mapping.reset_index()
 
+    gbif_common_mapping = gbif_common_mapping.drop(['language','country','countryCode','sex',
+                                                    'lifeStage','source'],axis=1)
+
+    assert 'taxonID' in gbif_common_mapping.columns
+    assert 'vernacularName' in gbif_common_mapping.columns
+
 
     # Convert everything to lowercase
 
@@ -235,23 +268,28 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     inat_taxonomy = convert_df_to_lowercase(inat_taxonomy)
     gbif_taxonomy = convert_df_to_lowercase(gbif_taxonomy)
     gbif_common_mapping = convert_df_to_lowercase(gbif_common_mapping)
+    inat_common_mapping = convert_df_to_lowercase(inat_common_mapping)
 
 
-    # For each taxonomy table, create a mapping from taxon IDs to rows
+    ##%% For each taxonomy table, create a mapping from taxon IDs to rows
 
     inat_taxon_id_to_row = {}
     gbif_taxon_id_to_row = {}
 
     print('Building iNat taxonID --> row table')
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
-        inat_taxon_id_to_row[row['taxonID']] = i_row
+        taxon_id = row['taxonID']
+        assert isinstance(taxon_id, int)
+        inat_taxon_id_to_row[taxon_id] = i_row
 
     print('Building GBIF taxonID --> row table')
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
-        gbif_taxon_id_to_row[row['taxonID']] = i_row
+        taxon_id = row['taxonID']
+        assert isinstance(taxon_id, int)
+        gbif_taxon_id_to_row[taxon_id] = i_row
 
 
-    # Create name mapping dictionaries
+    ##%% Create name mapping dictionaries
 
     inat_taxon_id_to_vernacular = defaultdict(set)
     inat_vernacular_to_taxon_id = defaultdict(set)
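
For illustration (outside the diff): a toy version of the taxonID --> row-index tables built above, on synthetic data.

    import pandas as pd

    taxonomy = pd.DataFrame({'taxonID': [10, 20, 30],
                             'scientificName': ['canis lupus', 'vulpes vulpes', 'lynx lynx']})

    taxon_id_to_row = {}
    for i_row, row in taxonomy.iterrows():
        taxon_id = row['taxonID']
        assert isinstance(taxon_id, int)
        taxon_id_to_row[taxon_id] = i_row

    # An equivalent construction without the explicit loop:
    # taxon_id_to_row = dict(zip(taxonomy['taxonID'], taxonomy.index))

    print(taxon_id_to_row[20])  # 1
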
@@ -268,31 +306,60 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
 
     print('Building lookup dictionaries for iNat taxonomy')
 
+    # iNat Scientific name mapping
+
     for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):
 
         taxon_id = row['taxonID']
-        vernacular_name = row['vernacularName']
-        scientific_name = row['scientificName']
-
-        if len(vernacular_name) > 0:
-            inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
-            inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+        assert isinstance(taxon_id,int)
 
+        scientific_name = row['scientificName']
         assert len(scientific_name) > 0
+
         inat_taxon_id_to_scientific[taxon_id].add(scientific_name)
         inat_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+    # iNat common name mapping
+
+    inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
+    for i_row, row in tqdm(inat_common_mapping.iterrows(), total=len(inat_common_mapping)):
 
-    # Build GBIF dictionaries
+        taxon_id = row['id']
+        assert isinstance(taxon_id,int)
+
+        # This should never happen; we will assert() this at the end of the loop
+        if taxon_id not in inat_taxon_id_to_scientific:
+            inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
+            continue
+
+        vernacular_name = row['vernacularName']
+
+        assert len(vernacular_name) > 0
+        inat_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
+        inat_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
+
+    assert len(inat_taxon_ids_in_vernacular_file_but_not_in_taxa_file) == 0
+
+
+    ##%% Build GBIF dictionaries
 
     print('Building lookup dictionaries for GBIF taxonomy')
 
+    # GBIF scientific name mapping
+
     for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):
 
         taxon_id = row['taxonID']
+        assert isinstance(taxon_id,int)
 
-        # The canonical name is the Latin name; the "scientific name"
-        # include the taxonomy name.
+        # The "canonical name" is the Latin name; the "scientific name"
+        # column includes other information. For example:
+        #
+        # "scientificName": Schizophoria impressa (Hall, 1843)
+        # "canonicalName": Schizophoria impressa
+        #
+        # Also see:
         #
         # http://globalnames.org/docs/glossary/
 
@@ -307,12 +374,18 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_taxon_id_to_scientific[taxon_id].add(scientific_name)
         gbif_scientific_to_taxon_id[scientific_name].add(taxon_id)
 
+    # GBIF common name mapping
+
+    gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file = set()
+
     for i_row, row in tqdm(gbif_common_mapping.iterrows(), total=len(gbif_common_mapping)):
 
         taxon_id = row['taxonID']
+        assert isinstance(taxon_id,int)
 
         # Don't include taxon IDs that were removed from the master table
         if taxon_id not in gbif_taxon_id_to_scientific:
+            gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file.add(taxon_id)
             continue
 
         vernacular_name = row['vernacularName']
@@ -321,8 +394,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
         gbif_taxon_id_to_vernacular[taxon_id].add(vernacular_name)
         gbif_vernacular_to_taxon_id[vernacular_name].add(taxon_id)
 
+    print('Finished GBIF common --> scientific mapping, failed to map {} of {} taxon IDs'.format(
+        len(gbif_taxon_ids_in_vernacular_file_but_not_in_taxa_file),
+        len(gbif_common_mapping)
+    ))
+
 
-    # Save everything to file
+    ##%% Save everything to file
 
     structures_to_serialize = [
         inat_taxonomy,
@@ -344,7 +422,10 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
     if not os.path.isfile(serialized_structures_file):
         with open(serialized_structures_file, 'wb') as p:
             pickle.dump(structures_to_serialize, p)
-            print(' done')
+            print('...done')
+
+
+    #%%
 
     # ...def initialize_taxonomy_lookup(...)
 
@@ -412,7 +493,8 @@ def traverse_taxonomy(matching_rownums: Sequence[int],
     while True:
 
         taxon_id = current_row['taxonID']
-        vernacular_names = sorted(taxon_id_to_vernacular[taxon_id]) # sort for determinism, pylint: disable=line-too-long
+        # sort for determinism
+        vernacular_names = sorted(taxon_id_to_vernacular[taxon_id])
         match_details.append((taxon_id, current_row['taxonRank'],
                               get_scientific_name_from_row(current_row),
                               vernacular_names))
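
For illustration (outside the diff): the sort retained above matters because iteration order over a set of strings can differ between Python runs (hash randomization), so sorting keeps the reported vernacular names reproducible.

    names = {'gray wolf', 'timber wolf', 'arctic wolf'}
    print(sorted(names))  # always ['arctic wolf', 'gray wolf', 'timber wolf']
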