megadetector 5.0.6__py3-none-any.whl → 5.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (62)
  1. api/batch_processing/data_preparation/manage_local_batch.py +278 -197
  2. api/batch_processing/data_preparation/manage_video_batch.py +7 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  5. api/batch_processing/postprocessing/load_api_results.py +55 -69
  6. api/batch_processing/postprocessing/md_to_labelme.py +1 -0
  7. api/batch_processing/postprocessing/postprocess_batch_results.py +158 -50
  8. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  9. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  10. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  11. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +222 -74
  12. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  13. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  14. classification/prepare_classification_script.py +191 -191
  15. data_management/coco_to_yolo.py +65 -44
  16. data_management/databases/integrity_check_json_db.py +7 -5
  17. data_management/generate_crops_from_cct.py +1 -1
  18. data_management/importers/animl_results_to_md_results.py +2 -2
  19. data_management/importers/noaa_seals_2019.py +1 -1
  20. data_management/importers/zamba_results_to_md_results.py +2 -2
  21. data_management/labelme_to_coco.py +34 -6
  22. data_management/labelme_to_yolo.py +1 -1
  23. data_management/lila/create_lila_blank_set.py +474 -0
  24. data_management/lila/create_lila_test_set.py +2 -1
  25. data_management/lila/create_links_to_md_results_files.py +1 -1
  26. data_management/lila/download_lila_subset.py +46 -21
  27. data_management/lila/generate_lila_per_image_labels.py +23 -14
  28. data_management/lila/get_lila_annotation_counts.py +16 -10
  29. data_management/lila/lila_common.py +14 -11
  30. data_management/lila/test_lila_metadata_urls.py +116 -0
  31. data_management/resize_coco_dataset.py +12 -10
  32. data_management/yolo_output_to_md_output.py +40 -13
  33. data_management/yolo_to_coco.py +34 -21
  34. detection/process_video.py +36 -14
  35. detection/pytorch_detector.py +1 -1
  36. detection/run_detector.py +73 -18
  37. detection/run_detector_batch.py +104 -24
  38. detection/run_inference_with_yolov5_val.py +127 -26
  39. detection/run_tiled_inference.py +153 -43
  40. detection/video_utils.py +3 -1
  41. md_utils/ct_utils.py +79 -3
  42. md_utils/md_tests.py +253 -15
  43. md_utils/path_utils.py +129 -24
  44. md_utils/process_utils.py +26 -7
  45. md_utils/split_locations_into_train_val.py +215 -0
  46. md_utils/string_utils.py +10 -0
  47. md_utils/url_utils.py +0 -2
  48. md_utils/write_html_image_list.py +1 -0
  49. md_visualization/visualization_utils.py +17 -2
  50. md_visualization/visualize_db.py +8 -0
  51. md_visualization/visualize_detector_output.py +185 -104
  52. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/METADATA +2 -2
  53. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/RECORD +62 -58
  54. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  55. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  56. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  57. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  58. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  59. taxonomy_mapping/species_lookup.py +33 -13
  60. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  61. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  62. {megadetector-5.0.6.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
taxonomy_mapping/map_new_lila_datasets.py

@@ -15,15 +15,25 @@ import json
  # Created by get_lila_category_list.py
  input_lila_category_list_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')

- output_file = os.path.expanduser('~/lila/lila_additions_2022.08.22.csv')
+ output_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')

  datasets_to_map = [
- # 'NACTI'
- # 'Channel Islands Camera Traps'
- 'ENA24'
+ 'Trail Camera Images of New Zealand Animals'
  ]


+ #%% Initialize taxonomic lookup
+
+ from taxonomy_mapping.species_lookup import (
+ initialize_taxonomy_lookup,
+ get_preferred_taxonomic_match)
+
+ # from taxonomy_mapping.species_lookup import (
+ # get_taxonomic_info, print_taxonomy_matche)
+
+ initialize_taxonomy_lookup(force_init=False)
+
+
  #%% Read the list of datasets

  with open(input_lila_category_list_file,'r') as f:
@@ -57,46 +67,14 @@ for dataset_name in datasets_to_map:
  print('Need to create {} mappings'.format(len(category_mappings)))


- #%% Initialize taxonomic lookup
-
- from taxonomy_mapping.species_lookup import (
- initialize_taxonomy_lookup,
- get_preferred_taxonomic_match)
-
- # from taxonomy_mapping.species_lookup import (
- # get_taxonomic_info, print_taxonomy_matche)
-
- initialize_taxonomy_lookup()
-
-
- #%% Manual lookup
-
- if False:
-
- #%%
-
- # q = 'white-throated monkey'
- q = 'cingulata'
- taxonomy_preference = 'inat'
- m = get_preferred_taxonomic_match(q,taxonomy_preference)
-
- if m is None:
- print('No match')
- else:
- if m.source != taxonomy_preference:
- print('\n*** non-preferred match ***\n')
- # raise ValueError('')
- print(m.source)
- print(m.taxonomy_string)
- import clipboard; clipboard.copy(m.taxonomy_string)
-
-
  #%% Match every query against our taxonomies

  output_rows = []

  taxonomy_preference = 'inat'

+ allow_non_preferred_matches = True
+
  # mapping_string = category_mappings[1]; print(mapping_string)
  for mapping_string in category_mappings:

@@ -108,7 +86,7 @@ for mapping_string in category_mappings:

  taxonomic_match = get_preferred_taxonomic_match(query,taxonomy_preference=taxonomy_preference)

- if taxonomic_match.source == taxonomy_preference:
+ if (taxonomic_match.source == taxonomy_preference) or allow_non_preferred_matches:

  output_row = {
  'dataset_name': dataset_name,
@@ -148,3 +126,29 @@ output_df = pd.DataFrame(data=output_rows, columns=[
  'dataset_name', 'query', 'source', 'taxonomy_level',
  'scientific_name', 'common_name', 'taxonomy_string'])
  output_df.to_csv(output_file, index=None, header=True)
+
+
+ #%% Manual lookup
+
+ if False:
+
+ #%%
+
+ # q = 'white-throated monkey'
+ # q = 'cingulata'
+ # q = 'notamacropus'
+ q = 'porzana'
+ taxonomy_preference = 'inat'
+ m = get_preferred_taxonomic_match(q,taxonomy_preference)
+ # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+
+ if m is None:
+ print('No match')
+ else:
+ if m.source != taxonomy_preference:
+ print('\n*** non-preferred match ***\n')
+ # raise ValueError('')
+ print(m.source)
+ print(m.taxonomy_string)
+ # print(m.scientific_name); import clipboard; clipboard.copy(m.scientific_name)
+ import clipboard; clipboard.copy(m.taxonomy_string)
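
The net effect of these changes is that the taxonomy lookup is now initialized once up front, and non-preferred matches can optionally be kept. The following is a minimal sketch of that flow, using only the calls visible in the hunks above (it is not code from the package itself):

    from taxonomy_mapping.species_lookup import (
        initialize_taxonomy_lookup,
        get_preferred_taxonomic_match)

    # Build (or load cached) lookup tables once, before the per-category loop
    initialize_taxonomy_lookup(force_init=False)

    taxonomy_preference = 'inat'
    allow_non_preferred_matches = True

    # Match a single query string against the preferred taxonomy
    m = get_preferred_taxonomic_match('porzana', taxonomy_preference=taxonomy_preference)

    if m is None:
        print('No match')
    elif (m.source == taxonomy_preference) or allow_non_preferred_matches:
        print(m.source)
        print(m.taxonomy_string)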
taxonomy_mapping/prepare_lila_taxonomy_release.py

@@ -13,8 +13,9 @@ import os
  import json
  import pandas as pd

- lila_taxonomy_file = os.path.expanduser('~/git/agentmorrisprivate/lila-taxonomy/lila-taxonomy-mapping.csv')
- release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.22.08.22.0000.csv')
+ lila_taxonomy_file = 'c:/git/agentmorrisprivate/lila-taxonomy/lila-taxonomy-mapping.csv'
+ release_taxonomy_file = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
+ # import clipboard; clipboard.copy(release_taxonomy_file)

  # Created by get_lila_category_list.py... contains counts for each category
  lila_dataset_to_categories_file = os.path.expanduser('~/lila/lila_categories_list/lila_dataset_to_categories.json')
@@ -129,3 +130,5 @@ for i_row,row in df.iterrows():

  df = df.drop('source',axis=1)
  df.to_csv(release_taxonomy_file,header=True,index=False)
+
+ print('Wrote final output to {}'.format(release_taxonomy_file))
taxonomy_mapping/preview_lila_taxonomy.py

@@ -15,11 +15,10 @@ from tqdm import tqdm
  import os
  import pandas as pd

- # lila_taxonomy_file = r"G:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
- lila_taxonomy_file = r"G:\temp\lila\lila-taxonomy-mapping_release.22.07.03.1608.csv"
- # lila_taxonomy_file = r"G:\temp\lila\lila_additions_2022.06.29.csv"
+ # lila_taxonomy_file = r"c:\git\agentmorrisprivate\lila-taxonomy\lila-taxonomy-mapping.csv"
+ lila_taxonomy_file = os.path.expanduser('~/lila/lila_additions_2023.12.29.csv')

- preview_base = r'g:\temp\lila\lila_taxonomy_preview'
+ preview_base = os.path.expanduser('~/lila/lila_taxonomy_preview')
  os.makedirs(preview_base,exist_ok=True)
  html_output_file = os.path.join(preview_base,'index.html')

@@ -172,15 +171,14 @@ for i_row,row in tqdm(df.iterrows(),total=len(df)):

  print('\nMade {} taxonomy changes'.format(n_taxonomy_changes))

+ # Optionally re-write
  if False:
  df.to_csv(lila_taxonomy_file,header=True,index=False)


  #%% List null mappings

- #
- # These should all be things like "unidentified" and "fire"
- #
+ # These should all be things like "empty", "unidentified", "fire", "car", etc.

  # i_row = 0; row = df.iloc[i_row]
  for i_row,row in df.iterrows():
@@ -393,20 +391,20 @@ remapped_queries = {'papio':'papio+baboon',

  import os
  from taxonomy_mapping import retrieve_sample_image
+
  scientific_name_to_paths = {}
  image_base = os.path.join(preview_base,'images')
  images_per_query = 15
  min_valid_images_per_query = 3
  min_valid_image_size = 3000

+ # TODO: trivially prallelizable
+ #
  # i_row = 0; row = df.iloc[i_row]
  for i_row,row in df.iterrows():

  s = row['scientific_name']

- # if s != 'mirafra':
- # continue
-
  if (not isinstance(s,str)) or (len(s)==0):
  continue

@@ -416,17 +414,17 @@ for i_row,row in df.iterrows():
  query = remapped_queries[query]

  query_folder = os.path.join(image_base,query)
+ os.makedirs(query_folder,exist_ok=True)

  # Check whether we already have enough images for this query
- if os.path.isdir(query_folder):
- image_files = os.listdir(query_folder)
- image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
- sizes = [os.path.getsize(p) for p in image_fullpaths]
- sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
- if len(sizes_above_threshold) > min_valid_images_per_query:
- # print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
- continue
-
+ image_files = os.listdir(query_folder)
+ image_fullpaths = [os.path.join(query_folder,fn) for fn in image_files]
+ sizes = [os.path.getsize(p) for p in image_fullpaths]
+ sizes_above_threshold = [x for x in sizes if x > min_valid_image_size]
+ if len(sizes_above_threshold) > min_valid_images_per_query:
+ print('Skipping query {}, already have {} images'.format(s,len(sizes_above_threshold)))
+ continue
+
  # Check whether we've already run this query for a previous row
  if query in scientific_name_to_paths:
  continue
@@ -448,14 +446,16 @@ from md_utils import path_utils
  all_images = path_utils.recursive_file_list(image_base,False)

  for fn in tqdm(all_images):
- if fn.endswith('.jpeg'):
+ if fn.lower().endswith('.jpeg'):
  new_fn = fn[0:-5] + '.jpg'
- # print('Renaming {} to {}'.format(fn,new_fn))
  os.rename(fn, new_fn)


  #%% Choose representative images for each scientific name

+ # Specifically, sort by size, and take the largest unique sizes. Very small files tend
+ # to be bogus thumbnails, etc.
+
  max_images_per_query = 4
  scientific_name_to_preferred_images = {}

@@ -506,7 +506,7 @@ for images in scientific_name_to_preferred_images.values():
  print('Using a total of {} images'.format(len(used_images)))
  used_images_set = set(used_images)

- import path_utils
+ from md_utils import path_utils
  all_images = path_utils.recursive_file_list(image_base,False)

  unused_images = []
@@ -523,7 +523,7 @@ for fn in tqdm(unused_images):

  #%% Produce HTML preview

- with open(html_output_file, 'w') as f:
+ with open(html_output_file, 'w', encoding='utf-8') as f:

  f.write('<html><head></head><body>\n')

@@ -555,10 +555,11 @@ with open(html_output_file, 'w') as f:
  f.write('<p class="speciesinfo_p" style="font-weight:bold;font-size:130%">')

  if isinstance(row.scientific_name,str):
- f.write('{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
+ output_string = '{}: <b><u>{}</u></b> mapped to {} {} ({}) ({})</p>\n'.format(
  row.dataset_name, row.query,
  row.taxonomy_level, row.scientific_name, common_name_string,
- row.common_name))
+ row.common_name)
+ f.write(output_string)
  else:
  f.write('{}: <b><u>{}</u></b> unmapped'.format(row.dataset_name,row.query))

@@ -586,6 +587,5 @@ with open(html_output_file, 'w') as f:

  #%% Open HTML preview

- from md_utils.path_utils import open_file # from ai4eutils
+ from md_utils.path_utils import open_file
  open_file(html_output_file)
-
taxonomy_mapping/species_lookup.py

@@ -36,14 +36,23 @@ taxonomy_urls = {
  }

  files_to_unzip = {
- 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
+ # GBIF used to put everything in a "backbone" folder within the zipfile, but as of
+ # 12.2023, this is no longer the case.
+ # 'GBIF': ['backbone/Taxon.tsv', 'backbone/VernacularName.tsv'],
+ 'GBIF': ['Taxon.tsv', 'VernacularName.tsv'],
  'iNaturalist': ['taxa.csv']
  }

  # As of 2020.05.12:
  #
  # GBIF: ~777MB zipped, ~1.6GB taxonomy
- # iNat: ~2.2GB zipped, ~51MB taxonomy
+ # iNat: ~2.2GB zipped, ~51MB taxonomy (most of the zipfile is observations)
+
+ # As of 2023.12.29:
+ #
+ # GBIF: ~948MB zipped, ~2.2GB taxonomy
+ # iNat: ~6.7GB zipped, ~62MB taxonomy (most of the zipfile is observations)
+

  os.makedirs(taxonomy_download_dir, exist_ok=True)
  for taxonomy_name in taxonomy_urls:
@@ -99,15 +108,16 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_taxon_id_to_scientific,\
  gbif_scientific_to_taxon_id

+
  ## Load serialized taxonomy info if we've already saved it

  if (not force_init) and (inat_taxonomy is not None):
  print('Skipping taxonomy re-init')
  return

- if os.path.isfile(serialized_structures_file):
+ if (not force_init) and (os.path.isfile(serialized_structures_file)):

- print(f'Reading taxonomy data from {serialized_structures_file}')
+ print(f'De-serializing taxonomy data from {serialized_structures_file}')

  with open(serialized_structures_file, 'rb') as f:
  structures_to_serialize = pickle.load(f)
@@ -125,6 +135,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_vernacular_to_taxon_id,\
  gbif_taxon_id_to_scientific,\
  gbif_scientific_to_taxon_id = structures_to_serialize
+
  return


@@ -135,6 +146,9 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  for taxonomy_name, zip_url in taxonomy_urls.items():

  need_to_download = False
+
+ if force_init:
+ need_to_download = True

  # Don't download the zipfile if we've already unzipped what we need
  for fn in files_to_unzip[taxonomy_name]:
@@ -150,11 +164,11 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  zipfile_path = os.path.join(
  taxonomy_download_dir, zip_url.split('/')[-1])

- # Bypasses download if the file exists already
+ # Bypasses download if the file exists already (unless force_init is set)
  url_utils.download_url(
  zip_url, os.path.join(zipfile_path),
  progress_updater=url_utils.DownloadProgressBar(),
- verbose=True)
+ verbose=True,force_download=force_init)

  # Unzip the files we need
  files_we_need = files_to_unzip[taxonomy_name]
@@ -166,7 +180,7 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  target_file = os.path.join(
  taxonomy_download_dir, taxonomy_name, os.path.basename(fn))

- if os.path.isfile(target_file):
+ if (not force_init) and (os.path.isfile(target_file)):
  print(f'Bypassing unzip of {target_file}, file exists')
  else:
  os.makedirs(os.path.basename(target_file),exist_ok=True)
@@ -185,13 +199,16 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  # name file

  # Load iNat taxonomy
- inat_taxonomy = pd.read_csv(os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv'))
+ inat_taxonomy_file = os.path.join(taxonomy_download_dir, 'iNaturalist', 'taxa.csv')
+ print('Loading iNat taxonomy from {}'.format(inat_taxonomy_file))
+ inat_taxonomy = pd.read_csv(inat_taxonomy_file)
  inat_taxonomy['scientificName'] = inat_taxonomy['scientificName'].fillna('').str.strip()
  inat_taxonomy['vernacularName'] = inat_taxonomy['vernacularName'].fillna('').str.strip()

  # Load GBIF taxonomy
- gbif_taxonomy = pd.read_csv(os.path.join(
- taxonomy_download_dir, 'GBIF', 'Taxon.tsv'), sep='\t')
+ gbif_taxonomy_file = os.path.join(taxonomy_download_dir, 'GBIF', 'Taxon.tsv')
+ print('Loading GBIF taxonomy from {}'.format(gbif_taxonomy_file))
+ gbif_taxonomy = pd.read_csv(gbif_taxonomy_file, sep='\t')
  gbif_taxonomy['scientificName'] = gbif_taxonomy['scientificName'].fillna('').str.strip()
  gbif_taxonomy['canonicalName'] = gbif_taxonomy['canonicalName'].fillna('').str.strip()

@@ -249,7 +266,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:

  # Build iNat dictionaries

- # row = inat_taxonomy.iloc[0]
+ print('Building lookup dictionaries for iNat taxonomy')
+
  for i_row, row in tqdm(inat_taxonomy.iterrows(), total=len(inat_taxonomy)):

  taxon_id = row['taxonID']
@@ -267,6 +285,8 @@ def initialize_taxonomy_lookup(force_init=False) -> None:

  # Build GBIF dictionaries

+ print('Building lookup dictionaries for GBIF taxonomy')
+
  for i_row, row in tqdm(gbif_taxonomy.iterrows(), total=len(gbif_taxonomy)):

  taxon_id = row['taxonID']
@@ -320,13 +340,13 @@ def initialize_taxonomy_lookup(force_init=False) -> None:
  gbif_scientific_to_taxon_id
  ]

- print('Serializing...', end='')
+ print('Serializing to {}...'.format(serialized_structures_file), end='')
  if not os.path.isfile(serialized_structures_file):
  with open(serialized_structures_file, 'wb') as p:
  pickle.dump(structures_to_serialize, p)
  print(' done')

- # ...def initialize_taxonomy_lookup()
+ # ...def initialize_taxonomy_lookup(...)


  def get_scientific_name_from_row(r):
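
Taken together, the force_init changes above let a caller choose between reusing the cached taxonomy data and forcing a full rebuild. The following is a hypothetical usage sketch based only on the behavior shown in these hunks, not code from the diff itself:

    from taxonomy_mapping.species_lookup import initialize_taxonomy_lookup

    # Fast path: reuse the serialized lookup tables and any previously
    # downloaded/unzipped taxonomy files
    initialize_taxonomy_lookup(force_init=False)

    # Full rebuild: re-download the zipfiles (force_download=force_init is passed
    # to url_utils.download_url), re-unzip, and rebuild the lookup dictionaries
    initialize_taxonomy_lookup(force_init=True)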
taxonomy_mapping/taxonomy_csv_checker.py

@@ -45,7 +45,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
  num_taxon_level_errors = 0
  num_scientific_name_errors = 0

- for i, row in taxonomy_df.iterrows():
+ for i_row, row in taxonomy_df.iterrows():

  ds = row['dataset_name']
  ds_label = row['query']
@@ -81,14 +81,14 @@ def check_taxonomy_csv(csv_path: str) -> None:
  node.add_id(id_source, int(taxon_id)) # np.int64 -> int
  if j == 0:
  if level != taxon_level:
- print(f'row: {i}, {ds}, {ds_label}')
+ print(f'row: {i_row}, {ds}, {ds_label}')
  print(f'- taxonomy_level column: {level}, '
  f'level from taxonomy_string: {taxon_level}')
  print()
  num_taxon_level_errors += 1

  if scientific_name != taxon_name:
- print(f'row: {i}, {ds}, {ds_label}')
+ print(f'row: {i_row}, {ds}, {ds_label}')
  print(f'- scientific_name column: {scientific_name}, '
  f'name from taxonomy_string: {taxon_name}')
  print()
@@ -97,7 +97,7 @@ def check_taxonomy_csv(csv_path: str) -> None:
  taxon_child = node

  # ...for each row in the taxonomy file
-
+
  assert nx.is_directed_acyclic_graph(graph)

  for node in graph.nodes:
@@ -123,6 +123,8 @@ def check_taxonomy_csv(csv_path: str) -> None:
  except AssertionError as e:
  print(f'At least one node has unresolved ambiguous parents: {e}')

+ print('Processed {} rows from {}'.format(len(taxonomy_df),csv_path))
+
  print('num taxon level errors:', num_taxon_level_errors)
  print('num scientific name errors:', num_scientific_name_errors)

@@ -154,4 +156,4 @@ if False:
  import os
  csv_path = os.path.expanduser('~/lila/lila-taxonomy-mapping_release.csv')
  check_taxonomy_csv(csv_path)
-
+