megadetector 5.0.23__py3-none-any.whl → 5.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of megadetector might be problematic.

Files changed (42)
  1. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +2 -3
  2. megadetector/classification/merge_classification_detection_output.py +2 -2
  3. megadetector/data_management/coco_to_labelme.py +2 -1
  4. megadetector/data_management/databases/integrity_check_json_db.py +15 -14
  5. megadetector/data_management/databases/subset_json_db.py +49 -21
  6. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +73 -69
  7. megadetector/data_management/lila/add_locations_to_nacti.py +114 -110
  8. megadetector/data_management/mewc_to_md.py +340 -0
  9. megadetector/data_management/speciesnet_to_md.py +41 -0
  10. megadetector/data_management/yolo_output_to_md_output.py +15 -8
  11. megadetector/detection/process_video.py +24 -7
  12. megadetector/detection/pytorch_detector.py +841 -160
  13. megadetector/detection/run_detector.py +341 -146
  14. megadetector/detection/run_detector_batch.py +307 -70
  15. megadetector/detection/run_inference_with_yolov5_val.py +61 -4
  16. megadetector/detection/tf_detector.py +6 -1
  17. megadetector/postprocessing/{combine_api_outputs.py → combine_batch_outputs.py} +10 -13
  18. megadetector/postprocessing/compare_batch_results.py +236 -7
  19. megadetector/postprocessing/create_crop_folder.py +358 -0
  20. megadetector/postprocessing/md_to_labelme.py +7 -7
  21. megadetector/postprocessing/md_to_wi.py +40 -0
  22. megadetector/postprocessing/merge_detections.py +1 -1
  23. megadetector/postprocessing/postprocess_batch_results.py +12 -5
  24. megadetector/postprocessing/separate_detections_into_folders.py +32 -4
  25. megadetector/postprocessing/validate_batch_results.py +9 -4
  26. megadetector/utils/ct_utils.py +236 -45
  27. megadetector/utils/directory_listing.py +3 -3
  28. megadetector/utils/gpu_test.py +125 -0
  29. megadetector/utils/md_tests.py +455 -116
  30. megadetector/utils/path_utils.py +43 -2
  31. megadetector/utils/wi_utils.py +2691 -0
  32. megadetector/visualization/visualization_utils.py +95 -18
  33. megadetector/visualization/visualize_db.py +25 -7
  34. megadetector/visualization/visualize_detector_output.py +60 -13
  35. {megadetector-5.0.23.dist-info → megadetector-5.0.25.dist-info}/METADATA +11 -23
  36. {megadetector-5.0.23.dist-info → megadetector-5.0.25.dist-info}/RECORD +39 -36
  37. {megadetector-5.0.23.dist-info → megadetector-5.0.25.dist-info}/WHEEL +1 -1
  38. megadetector/detection/detector_training/__init__.py +0 -0
  39. megadetector/detection/detector_training/model_main_tf2.py +0 -114
  40. megadetector/utils/torch_test.py +0 -32
  41. {megadetector-5.0.23.dist-info → megadetector-5.0.25.dist-info}/LICENSE +0 -0
  42. {megadetector-5.0.23.dist-info → megadetector-5.0.25.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2691 @@
"""

wi_utils.py

Functions related to working with the WI Insights platform, specifically for:

* Retrieving images based on .csv downloads
* Pushing results to the ProcessCVResponse() API (requires an API key)
* Working with WI taxonomy records and geofencing data

"""

#%% Imports and constants

import os
import requests
import json

import numpy as np
import pandas as pd

from copy import deepcopy
from collections import defaultdict
from multiprocessing.pool import Pool, ThreadPool
from functools import partial
from tqdm import tqdm

from megadetector.utils.path_utils import insert_before_extension
from megadetector.utils.ct_utils import split_list_into_n_chunks
from megadetector.utils.ct_utils import round_floats_in_nested_dict
from megadetector.utils.ct_utils import is_list_sorted
from megadetector.utils.ct_utils import invert_dictionary
from megadetector.utils.ct_utils import sort_list_of_dicts_by_key
from megadetector.utils.ct_utils import sort_dictionary_by_value
from megadetector.utils.ct_utils import sort_dictionary_by_key
from megadetector.utils.path_utils import find_images
from megadetector.postprocessing.validate_batch_results import \
    validate_batch_results, ValidateBatchResultsOptions

md_category_id_to_name = {'1':'animal','2':'person','3':'vehicle'}
md_category_name_to_id = invert_dictionary(md_category_id_to_name)

# Only used when pushing results directly to the platform via the API; any detections we want
# to show in the UI should have at least this confidence value.
min_md_output_confidence = 0.25

# Fields expected to be present in a valid WI result
wi_result_fields = ['wi_taxon_id','class','order','family','genus','species','common_name']


#%% Miscellaneous WI support functions

def is_valid_prediction_string(s):
    """
    Determine whether [s] is a valid WI prediction string. Prediction strings look like:

        '90d950db-2106-4bd9-a4c1-777604c3eada;mammalia;rodentia;;;;rodent'

    Args:
        s (str): the string to be tested for validity

    Returns:
        bool: True if this looks more or less like a WI prediction string
    """

    # Note to self... don't get tempted to remove spaces here; spaces are used
    # to indicate subspecies.
    return isinstance(s,str) and (len(s.split(';')) == 7) and (s == s.lower())


def is_valid_taxonomy_string(s):
    """
    Determine whether [s] is a valid 5-token WI taxonomy string. Taxonomy strings look like:

        'mammalia;rodentia;;;'
        'mammalia;chordata;canidae;canis;lupus dingo'

    Args:
        s (str): the string to be tested for validity

    Returns:
        bool: True if this looks more or less like a WI taxonomy string
    """
    return isinstance(s,str) and (len(s.split(';')) == 5) and (s == s.lower())


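# [Editor's illustration, not part of the released file] A minimal sketch of how the two
# validators above behave; the example strings are hypothetical.
def _example_string_validation():
    assert is_valid_prediction_string(
        '90d950db-2106-4bd9-a4c1-777604c3eada;mammalia;rodentia;;;;rodent')
    # A prediction string has 7 tokens; a bare 5-token taxonomy string fails this check
    assert not is_valid_prediction_string('mammalia;rodentia;;;')
    assert is_valid_taxonomy_string('mammalia;rodentia;;;')

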
def wi_result_to_prediction_string(r):
    """
    Convert the dict [r] - typically loaded from a row in a downloaded .csv file - to
    a valid prediction string, e.g.:

        1f689929-883d-4dae-958c-3d57ab5b6c16;;;;;;animal
        90d950db-2106-4bd9-a4c1-777604c3eada;mammalia;rodentia;;;;rodent

    Args:
        r (dict): dict containing WI prediction information, with at least the fields
            specified in wi_result_fields.

    Returns:
        str: the result in [r], as a semicolon-delimited prediction string
    """

    values = []
    for field in wi_result_fields:
        if isinstance(r[field],str):
            values.append(r[field].lower())
        else:
            assert isinstance(r[field],float) and np.isnan(r[field])
            values.append('')
    s = ';'.join(values)
    assert is_valid_prediction_string(s)
    return s


def compare_values(v0,v1):
    """
    Utility function for comparing two values when we want to return True if both
    values are NaN.

    Args:
        v0 (object): the first value to compare
        v1 (object): the second value to compare

    Returns:
        bool: True if v0 == v1, or if both v0 and v1 are NaN
    """

    if isinstance(v0,float) and isinstance(v1,float) and np.isnan(v0) and np.isnan(v1):
        return True
    return v0 == v1


def record_is_unidentified(record):
    """
    A record is considered "unidentified" if the "identified_by" field is either NaN or "Computer vision".

    Args:
        record (dict): dict representing a WI result loaded from a .csv file, with at least the
            field "identified_by"

    Returns:
        bool: True if the "identified_by" field is either NaN or a string indicating that this
            record has not yet been human-reviewed.
    """

    identified_by = record['identified_by']
    assert isinstance(identified_by,float) or isinstance(identified_by,str)
    if isinstance(identified_by,float):
        assert np.isnan(identified_by)
        return True
    else:
        return identified_by == 'Computer vision'


def record_lists_are_identical(records_0,records_1,verbose=False):
    """
    Takes two lists of records in the form returned by read_images_from_download_bundle and
    determines whether they are the same.

    Args:
        records_0 (list of dict): the first list of records to compare
        records_1 (list of dict): the second list of records to compare
        verbose (bool, optional): enable additional debug output

    Returns:
        bool: True if the two lists are identical
    """

    if len(records_0) != len(records_1):
        return False

    # i_record = 0; record_0 = records_0[i_record]
    for i_record,record_0 in enumerate(records_0):
        record_1 = records_1[i_record]
        assert set(record_0.keys()) == set(record_1.keys())
        for k in record_0.keys():
            if not compare_values(record_0[k],record_1[k]):
                if verbose:
                    print('Image ID: {} ({})\nRecord 0/{}: {}\nRecord 1/{}: {}'.format(
                        record_0['image_id'],record_1['image_id'],
                        k,record_0[k],k,record_1[k]))
                return False

    return True


#%% Functions for managing WI downloads

def read_sequences_from_download_bundle(download_folder):
    """
    Reads sequences.csv from [download_folder], returning a list of dicts. This is a
    thin wrapper around pd.read_csv; it's just here for future-proofing.

    Args:
        download_folder (str): a folder containing exactly one file called sequences.csv, typically
            representing a Wildlife Insights download bundle.

    Returns:
        list of dict: a direct conversion of the .csv file to a list of dicts
    """

    print('Reading sequences from {}'.format(download_folder))

    sequence_list_files = os.listdir(download_folder)
    sequence_list_files = \
        [fn for fn in sequence_list_files if fn == 'sequences.csv']
    assert len(sequence_list_files) == 1, \
        'Could not find sequences.csv in {}'.format(download_folder)

    sequence_list_file = os.path.join(download_folder,sequence_list_files[0])

    df = pd.read_csv(sequence_list_file)
    sequence_records = df.to_dict('records')
    return sequence_records


def read_images_from_download_bundle(download_folder):
    """
    Reads all images.csv files from [download_folder], returns a dict mapping image IDs
    to a list of dicts that describe each image. It's a list of dicts rather than a single dict
    because images may appear more than once.

    Args:
        download_folder (str): a folder containing one or more images.csv files, typically
            representing a Wildlife Insights download bundle.

    Returns:
        dict: Maps image GUIDs to dicts with at least the following fields:

            * project_id (int)
            * deployment_id (str)
            * image_id (str, should match the key)
            * filename (str, the filename without path at the time of upload)
            * location (str, starting with gs://)

            May also contain classification fields: wi_taxon_id (str), species, etc.
    """

    print('Reading images from {}'.format(download_folder))

    ##%% Find lists of images

    image_list_files = os.listdir(download_folder)
    image_list_files = \
        [fn for fn in image_list_files if fn.startswith('images_') and fn.endswith('.csv')]
    image_list_files = \
        [os.path.join(download_folder,fn) for fn in image_list_files]
    print('Found {} image list files'.format(len(image_list_files)))


    ##%% Read lists of images by deployment

    image_id_to_image_records = defaultdict(list)

    # image_list_file = image_list_files[0]
    for image_list_file in image_list_files:

        print('Reading images from list file {}'.format(
            os.path.basename(image_list_file)))

        df = pd.read_csv(image_list_file)

        # i_row = 0; row = df.iloc[i_row]
        for i_row,row in tqdm(df.iterrows(),total=len(df)):

            row_dict = row.to_dict()
            image_id = row_dict['image_id']
            image_id_to_image_records[image_id].append(row_dict)

        # ...for each image

    # ...for each list file

    deployment_ids = set()
    for image_id in image_id_to_image_records:
        image_records = image_id_to_image_records[image_id]
        for image_record in image_records:
            deployment_ids.add(image_record['deployment_id'])

    print('Found {} rows in {} deployments'.format(
        len(image_id_to_image_records),
        len(deployment_ids)))

    return image_id_to_image_records


def find_images_in_identify_tab(download_folder_with_identify,download_folder_excluding_identify):
    """
    Based on extracted download packages with and without the "exclude images in 'identify' tab"
    checkbox checked, figure out which images are in the identify tab. Returns a list of dicts (one
    per image).

    Args:
        download_folder_with_identify (str): the folder containing the download bundle that
            includes images from the "identify" tab
        download_folder_excluding_identify (str): the folder containing the download bundle that
            excludes images from the "identify" tab

    Returns:
        list of dict: list of image records that are present in the identify tab
    """

    ##%% Read data (~30 seconds)

    image_id_to_image_records_with_identify = \
        read_images_from_download_bundle(download_folder_with_identify)
    image_id_to_image_records_excluding_identify = \
        read_images_from_download_bundle(download_folder_excluding_identify)


    ##%% Find images that have not been identified

    all_image_ids_with_identify = set(image_id_to_image_records_with_identify.keys())
    all_image_ids_excluding_identify = set(image_id_to_image_records_excluding_identify.keys())

    image_ids_in_identify_tab = all_image_ids_with_identify.difference(all_image_ids_excluding_identify)

    assert len(image_ids_in_identify_tab) == \
        len(all_image_ids_with_identify) - len(all_image_ids_excluding_identify)

    print('Found {} images with identify, {} in identify tab, {} excluding'.format(
        len(all_image_ids_with_identify),
        len(image_ids_in_identify_tab),
        len(all_image_ids_excluding_identify)))

    image_records_in_identify_tab = []
    deployment_ids_for_downloaded_images = set()

    for image_id in image_ids_in_identify_tab:
        image_records_this_image = image_id_to_image_records_with_identify[image_id]
        assert len(image_records_this_image) > 0
        image_records_in_identify_tab.extend(image_records_this_image)
        for image_record in image_records_this_image:
            deployment_ids_for_downloaded_images.add(image_record['deployment_id'])

    print('Found {} records for {} unique images in {} deployments'.format(
        len(image_records_in_identify_tab),
        len(image_ids_in_identify_tab),
        len(deployment_ids_for_downloaded_images)))

    return image_records_in_identify_tab

# ...def find_images_in_identify_tab(...)


def write_download_commands(image_records_to_download,
                            download_dir_base,
                            force_download=False,
                            n_download_workers=25,
                            download_command_file_base=None):
    """
    Given a list of dicts with at least the field 'location' (a gs:// URL), prepare a set of "gcloud
    storage" commands to download images, and write those to a series of .sh scripts, along with one
    .sh script that runs all the others and blocks.

    gcloud commands will use relative paths.

    image_records_to_download can also be a dict mapping IDs to lists of records.

    Args:
        image_records_to_download (list of dict): list of dicts with at least the field 'location'
        download_dir_base (str): local destination folder
        force_download (bool, optional): include gs commands even if the target file exists
        n_download_workers (int, optional): number of scripts to write (that's our hacky way
            of controlling parallelization)
        download_command_file_base (str, optional): path of the .sh script we should write, defaults
            to "download_wi_images.sh" in the destination folder
    """

    if isinstance(image_records_to_download,dict):

        all_image_records = []
        for k in image_records_to_download:
            records_this_image = image_records_to_download[k]
            all_image_records.extend(records_this_image)
        return write_download_commands(all_image_records,
                                       download_dir_base=download_dir_base,
                                       force_download=force_download,
                                       n_download_workers=n_download_workers,
                                       download_command_file_base=download_command_file_base)

    ##%% Make list of gcloud storage commands

    if download_command_file_base is None:
        download_command_file_base = os.path.join(download_dir_base,'download_wi_images.sh')

    commands = []
    skipped_urls = []
    downloaded_urls = set()

    # image_record = image_records_to_download[0]
    for image_record in tqdm(image_records_to_download):

        url = image_record['location']
        if url in downloaded_urls:
            continue

        assert url.startswith('gs://')

        relative_path = url.replace('gs://','')
        abs_path = os.path.join(download_dir_base,relative_path)

        # Skip files that already exist
        if (not force_download) and (os.path.isfile(abs_path)):
            skipped_urls.append(url)
            continue

        # command = 'gsutil cp "{}" "./{}"'.format(url,relative_path)
        command = 'gcloud storage cp --no-clobber "{}" "./{}"'.format(url,relative_path)
        commands.append(command)

    print('Generated {} commands for {} image records'.format(
        len(commands),len(image_records_to_download)))

    print('Skipped {} URLs'.format(len(skipped_urls)))


    ##%% Write those commands out to n .sh files

    commands_by_script = split_list_into_n_chunks(commands,n_download_workers)

    local_download_commands = []

    output_dir = os.path.dirname(download_command_file_base)
    os.makedirs(output_dir,exist_ok=True)

    # Write out the download script for each chunk
    # i_script = 0
    for i_script in range(0,n_download_workers):
        download_command_file = insert_before_extension(download_command_file_base,str(i_script).zfill(2))
        local_download_commands.append(os.path.basename(download_command_file))
        with open(download_command_file,'w',newline='\n') as f:
            for command in commands_by_script[i_script]:
                f.write(command + '\n')

    # Write out the main download script
    with open(download_command_file_base,'w',newline='\n') as f:
        for local_download_command in local_download_commands:
            f.write('./' + local_download_command + ' &\n')
        f.write('wait\n')
        f.write('echo done\n')

# ...def write_download_commands(...)


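# [Editor's illustration, not part of the released file] A sketch of the download workflow
# built from the functions above; the folder paths are hypothetical.
def _example_download_workflow():
    records = read_images_from_download_bundle('/data/wi_download_bundle')
    write_download_commands(records,
                            download_dir_base='/data/wi_images',
                            n_download_workers=8)
    # The resulting download_wi_images.sh launches the per-worker scripts in parallel and waits

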
#%% Functions and constants related to pushing results to the DB

# Sample payload for validation
sample_update_payload = {

    "predictions": [
        {
            "project_id": "1234",
            "ignore_data_file_checks": True,
            "prediction": "f1856211-cfb7-4a5b-9158-c0f72fd09ee6;;;;;;blank",
            "prediction_score": 0.81218224763870239,
            "classifications": {
                "classes": [
                    "f1856211-cfb7-4a5b-9158-c0f72fd09ee6;;;;;;blank",
                    "b1352069-a39c-4a84-a949-60044271c0c1;aves;;;;;bird",
                    "90d950db-2106-4bd9-a4c1-777604c3eada;mammalia;rodentia;;;;rodent",
                    "f2d233e3-80e3-433d-9687-e29ecc7a467a;mammalia;;;;;mammal",
                    "ac068717-6079-4aec-a5ab-99e8d14da40b;mammalia;rodentia;sciuridae;dremomys;rufigenis;red-cheeked squirrel"
                ],
                "scores": [
                    0.81218224763870239,
                    0.1096673980355263,
                    0.02707692421972752,
                    0.00771023565903306,
                    0.0049269795417785636
                ]
            },
            "detections": [
                {
                    "category": "1",
                    "label": "animal",
                    "conf": 0.181,
                    "bbox": [
                        0.02421,
                        0.35823999999999989,
                        0.051560000000000009,
                        0.070826666666666746
                    ]
                }
            ],
            "model_version": "3.1.2",
            "prediction_source": "manual_update",
            "data_file_id": "2ea1d2b2-7f84-43f9-af1f-8be0e69c7015"
        }
    ]
}

blank_prediction_string = 'f1856211-cfb7-4a5b-9158-c0f72fd09ee6;;;;;;blank'
no_cv_result_prediction_string = 'f2efdae9-efb8-48fb-8a91-eccf79ab4ffb;no cv result;no cv result;no cv result;no cv result;no cv result;no cv result'
animal_prediction_string = '1f689929-883d-4dae-958c-3d57ab5b6c16;;;;;;animal'
human_prediction_string = '990ae9dd-7a59-4344-afcb-1b7b21368000;mammalia;primates;hominidae;homo;sapiens;human'
vehicle_prediction_string = 'e2895ed5-780b-48f6-8a11-9e27cb594511;;;;;;vehicle'

non_taxonomic_prediction_strings = [blank_prediction_string,
                                    no_cv_result_prediction_string,
                                    animal_prediction_string,
                                    vehicle_prediction_string]

process_cv_response_url = 'https://placeholder'


def prepare_data_update_auth_headers(auth_token_file):
    """
    Read the authorization token from a text file and prepare http headers.

    Args:
        auth_token_file (str): a single-line text file containing a write-enabled
            API token.

    Returns:
        dict: http headers, with fields 'Authorization' and 'Content-Type'
    """

    with open(auth_token_file,'r') as f:
        auth_token = f.read()

    headers = {
        'Authorization': 'Bearer ' + auth_token,
        'Content-Type': 'application/json'
    }

    return headers


def push_results_for_images(payload,
                            headers,
                            url=process_cv_response_url,
                            verbose=False):
    """
    Push results for one or more images represented in [payload] to the
    process_cv_response API, to write to the WI DB.

    Args:
        payload (dict): payload to upload to the API
        headers (dict): authorization headers, see prepare_data_update_auth_headers
        url (str, optional): API URL
        verbose (bool, optional): enable additional debug output

    Returns:
        int: response status code
    """

    if verbose:
        print('Sending header {} to URL {}'.format(
            headers,url))

    response = requests.post(url, headers=headers, json=payload)

    # Check the response status code
    if response.status_code in (200,201):
        if verbose:
            print('Successfully pushed results for {} images'.format(len(payload['predictions'])))
            print(response.headers)
            print(str(response))
    else:
        print(f'Error: {response.status_code} {response.text}')

    return response.status_code


def parallel_push_results_for_images(payloads,
                                     headers,
                                     url=process_cv_response_url,
                                     verbose=False,
                                     pool_type='thread',
                                     n_workers=10):
    """
    Push results for the list of payloads in [payloads] to the process_cv_response API,
    parallelized over multiple workers.

    Args:
        payloads (list of dict): payloads to upload to the API
        headers (dict): authorization headers, see prepare_data_update_auth_headers
        url (str, optional): API URL
        verbose (bool, optional): enable additional debug output
        pool_type (str, optional): 'thread' or 'process'
        n_workers (int, optional): number of parallel workers

    Returns:
        list of int: list of http response codes, one per payload
    """

    if n_workers == 1:

        results = []
        for payload in payloads:
            results.append(push_results_for_images(payload,
                                                   headers=headers,
                                                   url=url,
                                                   verbose=verbose))
        return results

    else:

        assert pool_type in ('thread','process')

        if pool_type == 'thread':
            pool_string = 'thread'
            pool = ThreadPool(n_workers)
        else:
            pool_string = 'process'
            pool = Pool(n_workers)

        print('Created a {} pool of {} workers'.format(
            pool_string,n_workers))

        results = list(tqdm(pool.imap(
            partial(push_results_for_images,headers=headers,url=url,verbose=verbose),payloads),
            total=len(payloads)))

        assert len(results) == len(payloads)
        return results


def generate_payload_with_replacement_detections(wi_result,
                                                 detections,
                                                 prediction_score=0.9,
                                                 model_version='3.1.2',
                                                 prediction_source='manual_update'):
    """
    Generate a payload for a single image that keeps the classifications from
    [wi_result], but replaces the detections with the MD-formatted list [detections].

    Args:
        wi_result (dict): dict representing a WI prediction result, with at least the
            fields in the constant wi_result_fields
        detections (list): list of MD-formatted detection dicts (with fields 'conf' and 'category')
        prediction_score (float, optional): confidence value to use for the combined prediction
        model_version (str, optional): model version string to include in the payload
        prediction_source (str, optional): prediction source string to include in the payload

    Returns:
        dict: dictionary suitable for uploading via push_results_for_images
    """

    payload_detections = []

    # detection = detections[0]
    for detection in detections:
        detection_out = detection.copy()
        detection_out['label'] = md_category_id_to_name[detection['category']]
        if detection_out['conf'] < min_md_output_confidence:
            detection_out['conf'] = min_md_output_confidence
        payload_detections.append(detection_out)

    prediction_string = wi_result_to_prediction_string(wi_result)

    prediction = {}
    prediction['ignore_data_file_checks'] = True
    prediction['prediction'] = prediction_string
    prediction['prediction_score'] = prediction_score

    classifications = {}
    classifications['classes'] = [prediction_string]
    classifications['scores'] = [prediction_score]

    prediction['classifications'] = classifications
    prediction['detections'] = payload_detections
    prediction['model_version'] = model_version
    prediction['prediction_source'] = prediction_source
    prediction['data_file_id'] = wi_result['image_id']
    prediction['project_id'] = str(wi_result['project_id'])
    payload = {}
    payload['predictions'] = [prediction]

    return payload


def generate_blank_prediction_payload(data_file_id,
                                      project_id,
                                      blank_confidence=0.9,
                                      model_version='3.1.2',
                                      prediction_source='manual_update'):
    """
    Generate a payload that will set a single image to the blank classification, with
    no detections. Suitable for upload via push_results_for_images.

    Args:
        data_file_id (str): unique identifier for this image used in the WI DB
        project_id (int): WI project ID
        blank_confidence (float, optional): confidence value to associate with this
            prediction
        model_version (str, optional): model version string to include in the payload
        prediction_source (str, optional): prediction source string to include in the payload

    Returns:
        dict: dictionary suitable for uploading via push_results_for_images
    """

    prediction = {}
    prediction['ignore_data_file_checks'] = True
    prediction['prediction'] = blank_prediction_string
    prediction['prediction_score'] = blank_confidence
    prediction['classifications'] = {}
    prediction['classifications']['classes'] = [blank_prediction_string]
    prediction['classifications']['scores'] = [blank_confidence]
    prediction['detections'] = []
    prediction['model_version'] = model_version
    prediction['prediction_source'] = prediction_source
    prediction['data_file_id'] = data_file_id
    prediction['project_id'] = project_id
    payload = {}
    payload['predictions'] = [prediction]

    return payload


def generate_no_cv_result_payload(data_file_id,
                                  project_id,
                                  no_cv_confidence=0.9,
                                  model_version='3.1.2',
                                  prediction_source='manual_update'):
    """
    Generate a payload that will set a single image to the "no CV result" classification, with
    no detections. Suitable for uploading via push_results_for_images.

    Args:
        data_file_id (str): unique identifier for this image used in the WI DB
        project_id (int): WI project ID
        no_cv_confidence (float, optional): confidence value to associate with this
            prediction
        model_version (str, optional): model version string to include in the payload
        prediction_source (str, optional): prediction source string to include in the payload

    Returns:
        dict: dictionary suitable for uploading via push_results_for_images
    """

    prediction = {}
    prediction['ignore_data_file_checks'] = True
    prediction['prediction'] = no_cv_result_prediction_string
    prediction['prediction_score'] = no_cv_confidence
    prediction['classifications'] = {}
    prediction['classifications']['classes'] = [no_cv_result_prediction_string]
    prediction['classifications']['scores'] = [no_cv_confidence]
    prediction['detections'] = []
    prediction['model_version'] = model_version
    prediction['prediction_source'] = prediction_source
    prediction['data_file_id'] = data_file_id
    prediction['project_id'] = project_id
    payload = {}
    payload['predictions'] = [prediction]

    return payload


def generate_payload_for_prediction_string(data_file_id,
                                           project_id,
                                           prediction_string,
                                           prediction_confidence=0.8,
                                           detections=None,
                                           model_version='3.1.2',
                                           prediction_source='manual_update'):
    """
    Generate a payload that will set a single image to a particular prediction, optionally
    including detections. Suitable for uploading via push_results_for_images.

    Args:
        data_file_id (str): unique identifier for this image used in the WI DB
        project_id (int): WI project ID
        prediction_string (str): WI-formatted prediction string to include in the payload
        prediction_confidence (float, optional): confidence value to associate with this
            prediction
        detections (list, optional): list of MD-formatted detection dicts, with fields
            'category' and 'conf'
        model_version (str, optional): model version string to include in the payload
        prediction_source (str, optional): prediction source string to include in the payload

    Returns:
        dict: dictionary suitable for uploading via push_results_for_images
    """

    assert is_valid_prediction_string(prediction_string), \
        'Invalid prediction string: {}'.format(prediction_string)

    payload_detections = []

    if detections is not None:
        # detection = detections[0]
        for detection in detections:
            detection_out = detection.copy()
            detection_out['label'] = md_category_id_to_name[detection['category']]
            if detection_out['conf'] < min_md_output_confidence:
                detection_out['conf'] = min_md_output_confidence
            payload_detections.append(detection_out)

    prediction = {}
    prediction['ignore_data_file_checks'] = True
    prediction['prediction'] = prediction_string
    prediction['prediction_score'] = prediction_confidence
    prediction['classifications'] = {}
    prediction['classifications']['classes'] = [prediction_string]
    prediction['classifications']['scores'] = [prediction_confidence]
    prediction['detections'] = payload_detections
    prediction['model_version'] = model_version
    prediction['prediction_source'] = prediction_source
    prediction['data_file_id'] = data_file_id
    prediction['project_id'] = project_id

    payload = {}
    payload['predictions'] = [prediction]

    return payload


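# [Editor's illustration, not part of the released file] A sketch of generating, validating, and
# pushing a single-image update; the token file, data file ID, and project ID are hypothetical.
def _example_push_prediction():
    headers = prepare_data_update_auth_headers('auth_token.txt')
    payload = generate_payload_for_prediction_string(
        data_file_id='2ea1d2b2-7f84-43f9-af1f-8be0e69c7015',
        project_id='1234',
        prediction_string=animal_prediction_string)
    validate_payload(payload)
    return push_results_for_images(payload,headers=headers)

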
def validate_payload(payload):
    """
    Verifies that the dict [payload] is compatible with the ProcessCVResponse() API. Throws an
    error if [payload] is invalid.

    Args:
        payload (dict): payload in the format expected by push_results_for_images.

    Returns:
        bool: successful validation; this is just future-proofing, currently never returns False
    """

    assert isinstance(payload,dict)
    assert len(payload.keys()) == 1 and 'predictions' in payload

    # prediction = payload['predictions'][0]
    for prediction in payload['predictions']:

        assert 'project_id' in prediction
        if not isinstance(prediction['project_id'],int):
            _ = int(prediction['project_id'])
        assert 'ignore_data_file_checks' in prediction and \
            isinstance(prediction['ignore_data_file_checks'],bool)
        assert 'prediction' in prediction and \
            isinstance(prediction['prediction'],str) and \
            len(prediction['prediction'].split(';')) == 7
        assert 'prediction_score' in prediction and \
            isinstance(prediction['prediction_score'],float)
        assert 'model_version' in prediction and \
            isinstance(prediction['model_version'],str)
        assert 'data_file_id' in prediction and \
            isinstance(prediction['data_file_id'],str) and \
            len(prediction['data_file_id']) == 36
        assert 'classifications' in prediction and \
            isinstance(prediction['classifications'],dict)
        classifications = prediction['classifications']
        assert 'classes' in classifications and isinstance(classifications['classes'],list)
        assert 'scores' in classifications and isinstance(classifications['scores'],list)
        assert len(classifications['classes']) == len(classifications['scores'])
        for c in classifications['classes']:
            assert is_valid_prediction_string(c)
        for score in classifications['scores']:
            assert isinstance(score,float) and score >= 0 and score <= 1.0
        assert 'detections' in prediction and isinstance(prediction['detections'],list)

        for detection in prediction['detections']:

            assert isinstance(detection,dict)
            assert 'category' in detection and detection['category'] in ('1','2','3')
            assert 'label' in detection and detection['label'] in ('animal','person','vehicle')
            assert 'conf' in detection and \
                isinstance(detection['conf'],float) and \
                detection['conf'] >= 0 and detection['conf'] <= 1.0
            assert 'bbox' in detection and \
                isinstance(detection['bbox'],list) and \
                len(detection['bbox']) == 4

        # ...for each detection

    # ...for each prediction

    return True

# ...def validate_payload(...)


#%% Validate constants

# This is executed at the time this module gets imported.

blank_payload = generate_blank_prediction_payload('70ede9c6-d056-4dd1-9a0b-3098d8113e0e','1234')
validate_payload(sample_update_payload)
validate_payload(blank_payload)


#%% Functions and constants related to working with batch predictions

def get_kingdom(prediction_string):
    """
    Return the kingdom field from a WI prediction string

    Args:
        prediction_string (str): a string in the semicolon-delimited prediction string format

    Returns:
        str: the kingdom field from the input string
    """
    tokens = prediction_string.split(';')
    assert is_valid_prediction_string(prediction_string)
    return tokens[1]


def is_human_classification(prediction_string):
    """
    Determines whether the input string represents a human classification, which includes a variety
    of common names (hiker, person, etc.)

    Args:
        prediction_string (str): a string in the semicolon-delimited prediction string format

    Returns:
        bool: whether this string corresponds to a human category
    """
    return prediction_string == human_prediction_string or 'homo;sapiens' in prediction_string


def is_animal_classification(prediction_string):
    """
    Determines whether the input string represents an animal classification, which excludes, e.g.,
    humans, blanks, vehicles, unknowns

    Args:
        prediction_string (str): a string in the semicolon-delimited prediction string format

    Returns:
        bool: whether this string corresponds to an animal category
    """

    if prediction_string == animal_prediction_string:
        return True
    if prediction_string == human_prediction_string or 'homo;sapiens' in prediction_string:
        return False
    if prediction_string == blank_prediction_string:
        return False
    if prediction_string == no_cv_result_prediction_string:
        return False
    if len(get_kingdom(prediction_string)) == 0:
        return False
    return True


def generate_md_results_from_predictions_json(predictions_json_file,
                                              md_results_file,
                                              base_folder=None,
                                              max_decimals=5):
    """
    Generate an MD-formatted .json file from a predictions.json file. Typically,
    MD results files use relative paths, and predictions.json files use absolute paths, so
    this function optionally removes the leading string [base_folder] from all file names.

    Currently just applies the top classification category to every detection. If the top classification
    is "blank", writes an empty detection list.

    speciesnet_to_md.py is a command-line driver for this function.

    Args:
        predictions_json_file (str): path to a predictions.json file, or a dict
        md_results_file (str): path to which we should write an MD-formatted .json file
        base_folder (str, optional): leading string to remove from each path in the
            predictions.json file
        max_decimals (int, optional): number of decimal places to which we should round
            all values
    """

    # Read predictions file
    if isinstance(predictions_json_file,str):
        with open(predictions_json_file,'r') as f:
            predictions = json.load(f)
    else:
        assert isinstance(predictions_json_file,dict)
        predictions = predictions_json_file

    # Round floating-point values (confidence scores, coordinates) to a
    # reasonable number of decimal places
    if max_decimals is not None and max_decimals > 0:
        round_floats_in_nested_dict(predictions)

    predictions = predictions['predictions']
    assert isinstance(predictions,list)

    # Convert backslashes to forward slashes in both filenames and the base folder string
    for im in predictions:
        im['filepath'] = im['filepath'].replace('\\','/')
    if base_folder is not None:
        base_folder = base_folder.replace('\\','/')

    detection_category_id_to_name = {}
    classification_category_name_to_id = {}

    # Keep track of detections that don't have an assigned detection category; these
    # are fake detections we create for non-blank images with empty detection lists.
    # We need to go back later and give them a legitimate detection category ID.
    all_unknown_detections = []

    # Create the output images list
    images_out = []

    base_folder_replacements = 0

    # im_in = predictions[0]
    for im_in in predictions:

        # blank_prediction_string
        im_out = {}

        fn = im_in['filepath']
        if base_folder is not None:
            if fn.startswith(base_folder):
                base_folder_replacements += 1
                fn = fn.replace(base_folder,'',1)

        im_out['file'] = fn

        if 'failures' in im_in:

            im_out['failure'] = str(im_in['failures'])
            im_out['detections'] = None

        else:

            im_out['detections'] = []

            if 'detections' in im_in:

                if len(im_in['detections']) == 0:
                    im_out['detections'] = []
                else:
                    # det_in = im_in['detections'][0]
                    for det_in in im_in['detections']:
                        det_out = {}
                        if det_in['category'] in detection_category_id_to_name:
                            assert detection_category_id_to_name[det_in['category']] == det_in['label']
                        else:
                            detection_category_id_to_name[det_in['category']] = det_in['label']
                        det_out = {}
                        for s in ['category','conf','bbox']:
                            det_out[s] = det_in[s]
                        im_out['detections'].append(det_out)

            # ...if detections are present

            class_to_assign = None
            class_confidence = None

            if 'classifications' in im_in:

                classifications = im_in['classifications']
                assert len(classifications['scores']) == len(classifications['classes'])
                assert is_list_sorted(classifications['scores'],reverse=True)
                class_to_assign = classifications['classes'][0]
                class_confidence = classifications['scores'][0]

            if 'prediction' in im_in:

                class_to_assign = im_in['prediction']
                class_confidence = im_in['prediction_score']

            if class_to_assign is not None:

                if class_to_assign == blank_prediction_string:

                    # This is a scenario that's not captured well by the MD format: a blank prediction
                    # with detections present. But, for now, don't do anything special here, just making
                    # a note of this.
                    if len(im_out['detections']) > 0:
                        pass

                else:

                    assert not class_to_assign.endswith('blank')

                    # This is a scenario that's not captured well by the MD format: no detections present,
                    # but a non-blank prediction. For now, create a fake detection to handle this prediction.
                    if len(im_out['detections']) == 0:

                        print('Warning: creating fake detection for non-blank whole-image classification')
                        det_out = {}
                        all_unknown_detections.append(det_out)

                        # We will change this to a string-int later
                        det_out['category'] = 'unknown'
                        det_out['conf'] = class_confidence
                        det_out['bbox'] = [0,0,1,1]
                        im_out['detections'].append(det_out)

                # ...if this is/isn't a blank classification

                # Attach that classification to each detection

                # Create a new category ID if necessary
                if class_to_assign in classification_category_name_to_id:
                    classification_category_id = classification_category_name_to_id[class_to_assign]
                else:
                    classification_category_id = str(len(classification_category_name_to_id))
                    classification_category_name_to_id[class_to_assign] = classification_category_id

                for det in im_out['detections']:
                    det['classifications'] = []
                    det['classifications'].append([classification_category_id,class_confidence])

            # ...if we have some type of classification for this image

        # ...if this is/isn't a failure

        images_out.append(im_out)

    # ...for each image

    if base_folder is not None:
        if base_folder_replacements == 0:
            print('Warning: you supplied {} as the base folder, but I made zero replacements'.format(
                base_folder))

    # Fix the 'unknown' category

    if len(all_unknown_detections) > 0:

        max_detection_category_id = max([int(x) for x in detection_category_id_to_name.keys()])
        unknown_category_id = str(max_detection_category_id + 1)
        detection_category_id_to_name[unknown_category_id] = 'unknown'

        for det in all_unknown_detections:
            assert det['category'] == 'unknown'
            det['category'] = unknown_category_id


    # Sort by filename

    images_out = sort_list_of_dicts_by_key(images_out,'file')

    # Prepare friendly classification names

    classification_category_descriptions = \
        invert_dictionary(classification_category_name_to_id)
    classification_categories_out = {}
    for category_id in classification_category_descriptions.keys():
        category_name = classification_category_descriptions[category_id].split(';')[-1]
        classification_categories_out[category_id] = category_name

    # Prepare the output dict

    detection_categories_out = detection_category_id_to_name
    info = {}
    info['format_version'] = 1.4
    info['detector'] = 'converted_from_predictions_json'

    output_dict = {}
    output_dict['info'] = info
    output_dict['detection_categories'] = detection_categories_out
    output_dict['classification_categories'] = classification_categories_out
    output_dict['classification_category_descriptions'] = classification_category_descriptions
    output_dict['images'] = images_out

    with open(md_results_file,'w') as f:
        json.dump(output_dict,f,indent=1)

    validation_options = ValidateBatchResultsOptions()
    validation_options.raise_errors = True
    _ = validate_batch_results(md_results_file, options=validation_options)

# ...def generate_md_results_from_predictions_json(...)


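# [Editor's illustration, not part of the released file] A sketch of converting a SpeciesNet
# predictions.json file to MD format; the file and folder names are hypothetical.
def _example_convert_predictions_to_md():
    generate_md_results_from_predictions_json(
        predictions_json_file='/data/speciesnet_run/predictions.json',
        md_results_file='/data/speciesnet_run/md_results.json',
        base_folder='/data/camera_traps/')

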
def generate_predictions_json_from_md_results(md_results_file,
                                              predictions_json_file,
                                              base_folder=None):
    """
    Generate a predictions.json file from the MD-formatted .json file [md_results_file]. Typically,
    MD results files use relative paths, and predictions.json files use absolute paths, so
    this function optionally prepends [base_folder]. Does not handle classification results in
    MD format, since this is intended to prepare data for passing through the WI classifier.

    md_to_wi.py is a command-line driver for this function.

    Args:
        md_results_file (str): path to an MD-formatted .json file
        predictions_json_file (str): path to which we should write a predictions.json file
        base_folder (str, optional): folder name to prepend to each path in md_results_file,
            to convert relative paths to absolute paths.
    """

    # Validate the input file
    validation_options = ValidateBatchResultsOptions()
    validation_options.raise_errors = True
    validation_options.return_data = True
    md_results = validate_batch_results(md_results_file, options=validation_options)
    category_id_to_name = md_results['detection_categories']

    output_dict = {}
    output_dict['predictions'] = []

    # im = md_results['images'][0]
    for im in md_results['images']:

        prediction = {}
        fn = im['file']
        if base_folder is not None:
            fn = os.path.join(base_folder,fn)
        fn = fn.replace('\\','/')
        prediction['filepath'] = fn
        if 'failure' in im and im['failure'] is not None:
            prediction['failures'] = ['DETECTOR']
        else:
            assert 'detections' in im and im['detections'] is not None
            detections = []
            for det in im['detections']:
                output_det = deepcopy(det)
                output_det['label'] = category_id_to_name[det['category']]
                detections.append(output_det)

            # detections *must* be sorted in descending order by confidence
            detections = sort_list_of_dicts_by_key(detections,'conf', reverse=True)
            prediction['detections'] = detections

        assert len(prediction.keys()) >= 2
        output_dict['predictions'].append(prediction)

    # ...for each image

    os.makedirs(os.path.dirname(predictions_json_file),exist_ok=True)
    with open(predictions_json_file,'w') as f:
        json.dump(output_dict,f,indent=1)

# ...def generate_predictions_json_from_md_results(...)

default_tokens_to_ignore = ['$RECYCLE.BIN']

def generate_instances_json_from_folder(folder,
                                        country=None,
                                        admin1_region=None,
                                        lat=None,
                                        lon=None,
                                        output_file=None,
                                        filename_replacements=None,
                                        tokens_to_ignore=default_tokens_to_ignore):
    """
    Generate an instances.json record that contains all images in [folder], optionally
    including location information, in a format suitable for run_model.py. Optionally writes
    the results to [output_file].

    Args:
        folder (str): the folder to recursively search for images
        country (str, optional): a three-letter country code
        admin1_region (str, optional): a first-level administrative region (e.g. a state or
            province code) to associate with all images
        lat (float, optional): latitude to associate with all images
        lon (float, optional): longitude to associate with all images
        output_file (str, optional): .json file to which we should write instance records
        filename_replacements (dict, optional): str --> str dict indicating filename substrings
            that should be replaced with other strings. Replacement occurs *after* converting
            backslashes to forward slashes.
        tokens_to_ignore (list, optional): ignore any images with these tokens in their
            names, typically used to avoid $RECYCLE.BIN. Can be None.

    Returns:
        dict: dict with at least the field "instances"
    """

    assert os.path.isdir(folder)

    image_files_abs = find_images(folder,recursive=True,return_relative_paths=False)

    if tokens_to_ignore is not None:
        n_images_before_ignore_tokens = len(image_files_abs)
        for token in tokens_to_ignore:
            image_files_abs = [fn for fn in image_files_abs if token not in fn]
        print('After ignoring {} tokens, kept {} of {} images'.format(
            len(tokens_to_ignore),len(image_files_abs),n_images_before_ignore_tokens))

    instances = []

    # image_fn_abs = image_files_abs[0]
    for image_fn_abs in image_files_abs:
        instance = {}
        instance['filepath'] = image_fn_abs.replace('\\','/')
        if filename_replacements is not None:
            for s in filename_replacements:
                instance['filepath'] = instance['filepath'].replace(s,filename_replacements[s])
        if country is not None:
            instance['country'] = country
        if admin1_region is not None:
            instance['admin1_region'] = admin1_region
        if lat is not None:
            assert lon is not None, 'Latitude provided without longitude'
            instance['latitude'] = lat
        if lon is not None:
            assert lat is not None, 'Longitude provided without latitude'
            instance['longitude'] = lon
        instances.append(instance)

    to_return = {'instances':instances}

    if output_file is not None:
        os.makedirs(os.path.dirname(output_file),exist_ok=True)
        with open(output_file,'w') as f:
            json.dump(to_return,f,indent=1)

    return to_return

# ...def generate_instances_json_from_folder(...)


def split_instances_into_n_batches(instances_json,n_batches,output_files=None):
    """
    Given an instances.json file, split it into batches of equal size.

    Args:
        instances_json (str): input .json file in instances.json format
        n_batches (int): number of new files to generate
        output_files (list, optional): output .json files for each
            batch. If supplied, should have length [n_batches]. If not
            supplied, filenames will be generated based on [instances_json].

    Returns:
        list: list of output files that were written; identical to [output_files]
            if it was supplied as input.
    """

    with open(instances_json,'r') as f:
        instances = json.load(f)
    assert isinstance(instances,dict) and 'instances' in instances
    instances = instances['instances']

    if output_files is not None:
        assert len(output_files) == n_batches, \
            'Expected {} output files, received {}'.format(
                n_batches,len(output_files))
    else:
        output_files = []
        for i_batch in range(0,n_batches):
            batch_string = 'batch_{}'.format(str(i_batch).zfill(3))
            output_files.append(insert_before_extension(instances_json,batch_string))

    batches = split_list_into_n_chunks(instances, n_batches)

    for i_batch,batch in enumerate(batches):
        batch_dict = {'instances':batch}
        with open(output_files[i_batch],'w') as f:
            json.dump(batch_dict,f,indent=1)

    print('Wrote {} batches to file'.format(n_batches))

    return output_files


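# [Editor's illustration, not part of the released file] A sketch of preparing instances.json
# input for run_model.py and splitting it into batches; paths and the country code are hypothetical.
def _example_prepare_instances():
    generate_instances_json_from_folder('/data/camera_traps',
                                        country='USA',
                                        output_file='/data/speciesnet_run/instances.json')
    batch_files = split_instances_into_n_batches('/data/speciesnet_run/instances.json',
                                                 n_batches=4)
    return batch_files

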
1344
+ def merge_prediction_json_files(input_prediction_files,output_prediction_file):
1345
+ """
1346
+ Merge all predictions.json files in [files] into a single .json file.
1347
+
1348
+ Args:
1349
+ files (list): list of predictions.json files to merge
1350
+ output_file (str): output .json file
1351
+ """
1352
+
1353
+ predictions = []
1354
+ image_filenames_processed = set()
1355
+
1356
+ # input_json_fn = input_prediction_files[0]
1357
+ for input_json_fn in tqdm(input_prediction_files):
1358
+
1359
+ assert os.path.isfile(input_json_fn), \
1360
+ 'Could not find prediction file {}'.format(input_json_fn)
1361
+ with open(input_json_fn,'r') as f:
1362
+ results_this_file = json.load(f)
1363
+ assert isinstance(results_this_file,dict)
1364
+ predictions_this_file = results_this_file['predictions']
1365
+ for prediction in predictions_this_file:
1366
+ image_fn = prediction['filepath']
1367
+ assert image_fn not in image_filenames_processed
1368
+ predictions.extend(predictions_this_file)
1369
+
1370
+ output_dict = {'predictions':predictions}
1371
+
1372
+ os.makedirs(os.path.dirname(output_prediction_file),exist_ok=True)
1373
+ with open(output_prediction_file,'w') as f:
1374
+ json.dump(output_dict,f,indent=1)
1375
+
1376
+ # ...def merge_prediction_json_files(...)
1377
+
1378
+
1379
+ def validate_predictions_file(fn,instances=None,verbose=True):
1380
+ """
1381
+ Validate the predictions.json file [fn].
1382
+
1383
+ Args:
1384
+ fn (str): a .json file in predictions.json (SpeciesNet) format
1385
+ instances (str or list, optional): a folder, instances.json file,
1386
+ or dict loaded from an instances.json file. If supplied, this
1387
+ function will verify that [fn] contains the same number of
1388
+ images as [instances].
1389
+ verbose (bool, optional): enable additional debug output
1390
+
1391
+ Returns:
1392
+ dict: the contents of [fn]
1393
+ """
1394
+
1395
+ with open(fn,'r') as f:
1396
+ d = json.load(f)
1397
+ predictions = d['predictions']
1398
+
1399
+ failures = []
1400
+
1401
+ for im in predictions:
1402
+ if 'failures' in im:
1403
+ failures.append(im)
1404
+
1405
+ if verbose:
1406
+ print('Read detector results for {} images, with {} failure(s)'.format(
1407
+ len(d['predictions']),len(failures)))
1408
+
1409
+ if instances is not None:
1410
+
1411
+ if isinstance(instances,str):
1412
+ if os.path.isdir(instances):
1413
+ instances = generate_instances_json_from_folder(folder=instances)
1414
+ elif os.path.isfile(instances):
1415
+ with open(instances,'r') as f:
1416
+ instances = json.load(f)
1417
+ else:
1418
+ raise ValueError('Could not find instances file/folder {}'.format(
1419
+ instances))
1420
+ assert isinstance(instances,dict)
1421
+ assert 'instances' in instances
1422
+ instances = instances['instances']
1423
+ if verbose:
1424
+ print('Expected results for {} files'.format(len(instances)))
1425
+ assert len(instances) == len(predictions), \
1426
+ '{} instances expected, {} found'.format(
1427
+ len(instances),len(predictions))
1428
+
1429
+ expected_files = set([instance['filepath'] for instance in instances])
1430
+ found_files = set([prediction['filepath'] for prediction in predictions])
1431
+ assert expected_files == found_files
1432
+
1433
+ # ...if a list of instances was supplied
1434
+
1435
+ return d
1436
+
1437
+ # ...def validate_predictions_file(...)
1438
+
1439
+
1440
+ def find_geofence_adjustments(ensemble_json_file,use_latin_names=False):
1441
+ """
1442
+ Count the number of instances of each unique change made by the geofence.
1443
+
1444
+ Args:
1445
+ ensemble_json_file (str): SpeciesNet-formatted .json file produced
1446
+ by the full ensemble.
1447
+ use_latin_names (bool, optional): return a mapping using binomial names
1448
+ rather than common names.
1449
+
1450
+ Returns:
1451
+ dict: maps strings that look like "puma,felidae family" to integers,
1452
+ where that entry would indicate the number of times that "puma" was
1453
+ predicted, but mapped to family level by the geofence. Sorted in
1454
+ descending order by count.
1455
+ """
1456
+
1457
+ ensemble_results = validate_predictions_file(ensemble_json_file)
1458
+
1459
+ assert isinstance(ensemble_results,dict)
1460
+ predictions = ensemble_results['predictions']
1461
+
1462
+ # Maps comma-separated pairs of common names (or binomial names) to
1463
+ # the number of times that transition (first --> second) happened
1464
+ rollup_pair_to_count = defaultdict(int)
1465
+
1466
+ # prediction = predictions[0]
1467
+ for prediction in tqdm(predictions):
1468
+
1469
+ if 'failures' in prediction and \
1470
+ prediction['failures'] is not None and \
1471
+ len(prediction['failures']) > 0:
1472
+ continue
1473
+
1474
+ assert 'prediction_source' in prediction, \
1475
+ 'Prediction present without [prediction_source] field, are you sure this ' + \
1476
+ 'is an ensemble output file?'
1477
+
1478
+ if 'geofence' in prediction['prediction_source']:
1479
+
1480
+ classification_taxonomy_string = \
1481
+ prediction['classifications']['classes'][0]
1482
+ prediction_taxonomy_string = prediction['prediction']
1483
+ assert is_valid_prediction_string(classification_taxonomy_string)
1484
+ assert is_valid_prediction_string(prediction_taxonomy_string)
1485
+
1486
+ # Typical examples:
1487
+ # '86f5b978-4f30-40cc-bd08-be9e3fba27a0;mammalia;rodentia;sciuridae;sciurus;carolinensis;eastern gray squirrel'
1488
+ # 'e4d1e892-0e4b-475a-a8ac-b5c3502e0d55;mammalia;rodentia;sciuridae;;;sciuridae family'
1489
+ classification_common_name = classification_taxonomy_string.split(';')[-1]
1490
+ prediction_common_name = prediction_taxonomy_string.split(';')[-1]
1491
+ classification_binomial_name = classification_taxonomy_string.split(';')[-2]
1492
+ prediction_binomial_name = prediction_taxonomy_string.split(';')[-2]
1493
+
1494
+ input_name = classification_binomial_name if use_latin_names else \
1495
+ classification_common_name
1496
+ output_name = prediction_binomial_name if use_latin_names else \
1497
+ prediction_common_name
1498
+
1499
+ rollup_pair = input_name.strip() + ',' + output_name.strip()
1500
+ rollup_pair_to_count[rollup_pair] += 1
1501
+
1502
+ # ...if we made a geofencing change
1503
+
1504
+ # ...for each prediction
1505
+
1506
+ rollup_pair_to_count = sort_dictionary_by_value(rollup_pair_to_count,reverse=True)
1507
+
1508
+ return rollup_pair_to_count
1509
+
1510
+ # ...def find_geofence_adjustments(...)
1511
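+
+ # Usage sketch (hypothetical filename); keys in the returned dict look like
+ # "puma,felidae family" and values are counts, sorted in descending order:
+ #
+ # rollup_pair_to_count = find_geofence_adjustments('ensemble-output.json')
+ # for pair,count in list(rollup_pair_to_count.items())[0:10]:
+ #     print('{}: {}'.format(pair,count))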
+
1512
+
1513
+ #%% Module-level globals related to taxonomy mapping and geofencing
1514
+
1515
+ # This maps a taxonomy string (e.g. mammalia;cetartiodactyla;cervidae;odocoileus;virginianus) to
1516
+ # a dict with keys taxon_id, common_name, kingdom, phylum, class, order, family, genus, species
1517
+ taxonomy_string_to_taxonomy_info = None
1518
+
1519
+ # Maps a binomial name (possibly three tokens, if it's a subspecies) to the same dict
1520
+ # described above.
1521
+ binomial_name_to_taxonomy_info = None
1522
+
1523
+ # Maps a common name to the same dict described above
1524
+ common_name_to_taxonomy_info = None
1525
+
1526
+ # Dict mapping 5-token semicolon-delimited taxonomy strings to geofencing rules
1527
+ taxonomy_string_to_geofencing_rules = None
1528
+
1529
+ # Maps lower-case country names to upper-case country codes
1530
+ country_to_country_code = None
1531
+
1532
+ # Maps upper-case country codes to lower-case country names
1533
+ country_code_to_country = None
1534
+
1535
+
1536
+ #%% Functions related to geofencing and taxonomy mapping
1537
+
1538
+ def taxonomy_info_to_taxonomy_string(taxonomy_info):
1539
+ """
1540
+ Convert a taxonomy record in dict format to a semicolon-delimited string
1541
+
1542
+ Args:
1543
+ taxonomy_info (dict): dict in the format stored in, e.g., taxonomy_string_to_taxonomy_info
1544
+
1545
+ Returns:
1546
+ str: string in the format used as keys in, e.g., taxonomy_string_to_taxonomy_info
1547
+ """
1548
+ return taxonomy_info['class'] + ';' + \
1549
+ taxonomy_info['order'] + ';' + \
1550
+ taxonomy_info['family'] + ';' + \
1551
+ taxonomy_info['genus'] + ';' + \
1552
+ taxonomy_info['species']
1553
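+
+ # Behavior sketch, using the example taxon referenced elsewhere in this module;
+ # the dict below is illustrative and omits fields the function doesn't use:
+ #
+ # info = {'class':'mammalia','order':'cetartiodactyla','family':'cervidae',
+ #         'genus':'odocoileus','species':'virginianus'}
+ # s = taxonomy_info_to_taxonomy_string(info)
+ # assert s == 'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus'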
+
1554
+
1555
+ def initialize_taxonomy_info(taxonomy_file,force_init=False,encoding='cp1252'):
1556
+ """
1557
+ Load WI taxonomy information from a .json file. Stores information in the global
1558
+ dicts [taxonomy_string_to_taxonomy_info], [binomial_name_to_taxonomy_info], and
1559
+ [common_name_to_taxonomy_info].
1560
+
1561
+ Args:
1562
+ taxonomy_file (str): .json file containing mappings from the short taxonomy strings
1563
+ to the longer strings with GUID and common name, see example below.
1564
+ force_init (bool, optional): if the output dicts already exist, should we
1565
+ re-initialize anyway?
1566
+ encoding (str, optional): character encoding to use when opening the .json file
1567
+ """
1568
+
1569
+ if encoding is None:
1570
+ encoding = 'cp1252'
1571
+
1572
+ global taxonomy_string_to_taxonomy_info
1573
+ global binomial_name_to_taxonomy_info
1574
+ global common_name_to_taxonomy_info
1575
+
1576
+ if (taxonomy_string_to_taxonomy_info is not None) and (not force_init):
1577
+ return
1578
+
1579
+ """
1580
+ Taxonomy keys are taxonomy strings, e.g.:
1581
+
1582
+ 'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus'
1583
+
1584
+ Taxonomy values are extended strings w/Taxon IDs and common names, e.g.:
1585
+
1586
+ '5c7ce479-8a45-40b3-ae21-7c97dfae22f5;mammalia;cetartiodactyla;cervidae;odocoileus;virginianus;white-tailed deer'
1587
+ """
1588
+
1589
+ with open(taxonomy_file,encoding=encoding,errors='ignore') as f:
1590
+ taxonomy_table = json.load(f,strict=False)
1591
+
1592
+ # Right now I'm punting on some unusual-character issues, but here is some scrap that
1593
+ # might help address this in the future
1594
+ if False:
1595
+ import codecs
1596
+ with codecs.open(taxonomy_file,'r',encoding=encoding,errors='ignore') as f:
1597
+ s = f.read()
1598
+ import unicodedata
1599
+ s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
1600
+ taxonomy_table = json.loads(s,strict=False)
1601
+
1602
+ taxonomy_string_to_taxonomy_info = {}
1603
+ binomial_name_to_taxonomy_info = {}
1604
+ common_name_to_taxonomy_info = {}
1605
+
1606
+ # taxonomy_string = next(iter(taxonomy_table.keys()))
1607
+ for taxonomy_string in taxonomy_table.keys():
1608
+
1609
+ taxonomy_string = taxonomy_string.lower()
1610
+
1611
+ taxon_info = {}
1612
+ extended_string = taxonomy_table[taxonomy_string]
1613
+ tokens = extended_string.split(';')
1614
+ assert len(tokens) == 7
1615
+ taxon_info['taxon_id'] = tokens[0]
1616
+ assert len(taxon_info['taxon_id']) == 36
1617
+ taxon_info['kingdom'] = 'animal'
1618
+ taxon_info['phylum'] = 'chordata'
1619
+ taxon_info['class'] = tokens[1]
1620
+ taxon_info['order'] = tokens[2]
1621
+ taxon_info['family'] = tokens[3]
1622
+ taxon_info['genus'] = tokens[4]
1623
+ taxon_info['species'] = tokens[5]
1624
+ taxon_info['common_name'] = tokens[6]
1625
+
1626
+ if taxon_info['common_name'] != '':
1627
+ common_name_to_taxonomy_info[taxon_info['common_name']] = taxon_info
1628
+
1629
+ taxonomy_string_to_taxonomy_info[taxonomy_string] = taxon_info
1630
+ if tokens[4] == '' or tokens[5] == '':
1631
+ # print('Warning: no binomial name for {}'.format(taxonomy_string))
1632
+ pass
1633
+ else:
1634
+ # strip(), but don't remove spaces from the species name;
1635
+ # subspecies are separated with a space, e.g. canis;lupus dingo
1636
+ binomial_name = tokens[4].strip() + ' ' + tokens[5].strip()
1637
+ binomial_name_to_taxonomy_info[binomial_name] = taxon_info
1638
+
1639
+ print('Created {} records in taxonomy_string_to_taxonomy_info'.format(
1640
+ len(taxonomy_string_to_taxonomy_info)))
1641
+
1642
+ # ...def initialize_taxonomy_info(...)
1643
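+
+ # Initialization sketch (the taxonomy filename is hypothetical); after
+ # initialization, the module-level lookup tables can be used directly:
+ #
+ # from megadetector.utils import wi_utils
+ # wi_utils.initialize_taxonomy_info('taxonomy_mapping.json')
+ # info = wi_utils.common_name_to_taxonomy_info['white-tailed deer']
+ # print(info['genus'],info['species'])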
+
1644
+
1645
+ def _parse_code_list(codes):
1646
+ """
1647
+ Turn a list of country or state codes in string, delimited string, or list format
1648
+ into a list. Also does basic validity checking.
1649
+ """
1650
+
1651
+ if not isinstance(codes,list):
1652
+
1653
+ assert isinstance(codes,str)
1654
+
1655
+ codes = codes.strip()
1656
+
1657
+ # This is just a single code
1658
+ if ',' not in codes:
1659
+ codes = [codes]
1660
+ else:
1661
+ codes = codes.split(',')
1662
+ codes = [c.strip() for c in codes]
1663
+
1664
+ assert isinstance(codes,list)
1665
+
1666
+ codes = [c.upper().strip() for c in codes]
1667
+
1668
+ for c in codes:
1669
+ assert len(c) in (2,3)
1670
+
1671
+ return codes
1672
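+
+ # Behavior sketch: each of these calls should return ['AUS','PNG','THA']:
+ #
+ # _parse_code_list('AUS, PNG, THA')
+ # _parse_code_list('aus,png,tha')
+ # _parse_code_list(['AUS','PNG','THA'])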
+
1673
+
1674
+ def _generate_csv_rows_to_block_all_countries_except(
1675
+ species_string,
1676
+ block_except_list):
1677
+ """
1678
+ Generate rows in the format expected by geofence_fixes.csv, representing a list of
1679
+ allow and block rules to block all countries currently allowed for this species
1680
+ except those in [block_except_list], and to add allow rules for those countries.
1681
+ """
1682
+
1683
+ assert is_valid_taxonomy_string(species_string), \
1684
+ '{} is not a valid taxonomy string'.format(species_string)
1685
+
1686
+ global taxonomy_string_to_taxonomy_info
1687
+ global binomial_name_to_taxonomy_info
1688
+ global common_name_to_taxonomy_info
1689
+
1690
+ assert taxonomy_string_to_geofencing_rules is not None, \
1691
+ 'Initialize geofencing prior to species lookup'
1692
+ assert taxonomy_string_to_taxonomy_info is not None, \
1693
+ 'Initialize taxonomy lookup prior to species lookup'
1694
+
1695
+ geofencing_rules_this_species = \
1696
+ taxonomy_string_to_geofencing_rules[species_string]
1697
+
1698
+ allowed_countries = []
1699
+ if 'allow' in geofencing_rules_this_species:
1700
+ allowed_countries.extend(geofencing_rules_this_species['allow'])
1701
+
1702
+ blocked_countries = []
1703
+ if 'block' in geofencing_rules_this_species:
1704
+ blocked_countries.extend(geofencing_rules_this_species['block'])
1705
+
1706
+ block_except_list = _parse_code_list(block_except_list)
1707
+
1708
+ countries_to_block = []
1709
+ countries_to_allow = []
1710
+
1711
+ # country = allowed_countries[0]
1712
+ for country in allowed_countries:
1713
+ if country not in block_except_list and country not in blocked_countries:
1714
+ countries_to_block.append(country)
1715
+
1716
+ for country in block_except_list:
1717
+ if country in blocked_countries:
1718
+ raise ValueError("I can't allow a country that has already been blocked")
1719
+ if country not in allowed_countries:
1720
+ countries_to_allow.append(country)
1721
+
1722
+ rows = generate_csv_rows_for_species(species_string,
1723
+ allow_countries=countries_to_allow,
1724
+ block_countries=countries_to_block)
1725
+
1726
+ return rows
1727
+
1728
+ # ...def _generate_csv_rows_to_block_all_countries_except(...)
1729
+
1730
+
1731
+ def generate_csv_rows_for_species(species_string,
1732
+ allow_countries=None,
1733
+ block_countries=None,
1734
+ allow_states=None,
1735
+ block_states=None,
1736
+ blockexcept_countries=None):
1737
+ """
1738
+ Generate rows in the format expected by geofence_fixes.csv, representing a list of
1739
+ allow and/or block rules for the specified species and countries/states. Does not check
1740
+ that the rules make sense; e.g. nothing will stop you in this function from both allowing
1741
+ and blocking a country.
1742
+
1743
+ Args:
1744
+ species_string (str): string in semicolon-delimited WI taxonomy format
1745
+ allow_countries (optional, list or str): three-letter country codes, list of
1746
+ country codes, or comma-separated list of country codes to allow
1747
+ block_countries (optional, list or str): three-letter country codes, list of
1748
+ country codes, or comma-separated list of country codes to block
1749
+ allow_states (optional, list or str): two-letter state codes, list of
1750
+ state codes, or comma-separated list of state codes to allow
1751
+ block_states (optional, list or str): two-letter state codes, list of
1752
+ state codes, or comma-separated list of state codes to block
+ blockexcept_countries (optional, list or str): not currently used by this function
1753
+
1754
+ Returns:
1755
+ list of str: lines ready to be pasted into geofence_fixes.csv
1756
+ """
1757
+
1758
+ assert is_valid_taxonomy_string(species_string), \
1759
+ '{} is not a valid taxonomy string'.format(species_string)
1760
+
1761
+ lines = []
1762
+
1763
+ if allow_countries is not None:
1764
+ allow_countries = _parse_code_list(allow_countries)
1765
+ for country in allow_countries:
1766
+ lines.append(species_string + ',allow,' + country + ',')
1767
+
1768
+ if block_countries is not None:
1769
+ block_countries = _parse_code_list(block_countries)
1770
+ for country in block_countries:
1771
+ lines.append(species_string + ',block,' + country + ',')
1772
+
1773
+ if allow_states is not None:
1774
+ allow_states = _parse_code_list(allow_states)
1775
+ for state in allow_states:
1776
+ lines.append(species_string + ',allow,USA,' + state)
1777
+
1778
+ if block_states is not None:
1779
+ block_states = _parse_code_list(block_states)
1780
+ for state in block_states:
1781
+ lines.append(species_string + ',block,USA,' + state)
1782
+
1783
+ return lines
1784
+
1785
+ # ...def generate_csv_rows_for_species(...)
1786
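+
+ # Usage sketch; the taxonomy string is the example used elsewhere in this module,
+ # and the country/state codes are illustrative:
+ #
+ # rows = generate_csv_rows_for_species(
+ #     'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus',
+ #     allow_countries='USA,CAN',
+ #     block_states='HI')
+ # print('\n'.join(rows))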
+
1787
+
1788
+ def initialize_geofencing(geofencing_file,country_code_file,force_init=False):
1789
+ """
1790
+ Load geofencing information from a .json file, and country code mappings from
1791
+ a .csv file. Stores results in the global tables [taxonomy_string_to_geofencing_rules],
1792
+ [country_to_country_code], and [country_code_to_country].
1793
+
1794
+ Args:
1795
+ geofencing_file (str): .json file with geofencing rules
1796
+ country_code_file (str): .csv file with country code mappings, in columns
1797
+ called "name" and "alpha-3", e.g. from
1798
+ https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
1799
+ force_init (bool, optional): if the output dicts already exist, should we
1800
+ re-initialize anyway?
1801
+ """
1802
+
1803
+ global taxonomy_string_to_geofencing_rules
1804
+ global country_to_country_code
1805
+ global country_code_to_country
1806
+
1807
+ if (country_to_country_code is not None) and \
1808
+ (country_code_to_country is not None) and \
1809
+ (taxonomy_string_to_geofencing_rules is not None) and \
1810
+ (not force_init):
1811
+ return
1812
+
1813
+ # Read country code information
1814
+ country_code_df = pd.read_csv(country_code_file)
1815
+ country_to_country_code = {}
1816
+ country_code_to_country = {}
1817
+ for i_row,row in country_code_df.iterrows():
1818
+ country_to_country_code[row['name'].lower()] = row['alpha-3'].upper()
1819
+ country_code_to_country[row['alpha-3'].upper()] = row['name'].lower()
1820
+
1821
+ # Read geofencing information
1822
+ with open(geofencing_file,'r',encoding='utf-8') as f:
1823
+ taxonomy_string_to_geofencing_rules = json.load(f)
1824
+
1825
+ """
1826
+ Geofencing keys are taxonomy strings, e.g.:
1827
+
1828
+ 'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus'
1829
+
1830
+ Geofencing values are tables mapping allow/block to country codes, optionally including region/state codes, e.g.:
1831
+
1832
+ {'allow': {
1833
+ 'ALA': [],
1834
+ 'ARG': [],
1835
+ ...
1836
+ 'SUR': [],
1837
+ 'TTO': [],
1838
+ 'USA': ['AL',
1839
+ 'AR',
1840
+ 'AZ',
1841
+ ...
1842
+ }
1843
+ """
1844
+
1845
+ # Validate
1846
+
1847
+ # species_string = next(iter(taxonomy_string_to_geofencing_rules.keys()))
1848
+ for species_string in taxonomy_string_to_geofencing_rules.keys():
1849
+
1850
+ species_rules = taxonomy_string_to_geofencing_rules[species_string]
1851
+
1852
+ # Every country should *either* have allow rules or block rules, no countries
1853
+ # currently have both
1854
+ assert len(species_rules.keys()) == 1
1855
+ rule_type = list(species_rules.keys())[0]
1856
+ assert rule_type in ('allow','block')
1857
+
1858
+ all_country_rules_this_species = species_rules[rule_type]
1859
+ for country_code in all_country_rules_this_species.keys():
1860
+
1861
+ assert country_code in country_code_to_country
1862
+
1863
+ region_rules = all_country_rules_this_species[country_code]
1864
+
1865
+ # Right now we only have regional rules for the USA; these may be part of
1866
+ # allow or block rules.
1867
+ if len(region_rules) > 0:
1868
+ assert country_code == 'USA'
1869
+
1870
+ # ...for each species
1871
+
1872
+ # ...def initialize_geofencing(...)
1873
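+
+ # Initialization sketch (filenames are hypothetical; the country code .csv is the
+ # ISO 3166 table referenced in the docstring above):
+ #
+ # initialize_geofencing('geofence_base.json','country-codes.csv')
+ # print('Loaded geofencing rules for {} taxa'.format(
+ #     len(taxonomy_string_to_geofencing_rules)))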
+
1874
+
1875
+ def _species_string_to_canonical_species_string(species):
1876
+ """
1877
+ Convert a string that may be a 5-token species string, a binomial name,
1878
+ or a common name into a 5-token species string.
1879
+ """
1880
+
1881
+ global taxonomy_string_to_taxonomy_info
1882
+ global binomial_name_to_taxonomy_info
1883
+ global common_name_to_taxonomy_info
1884
+
1885
+ assert taxonomy_string_to_geofencing_rules is not None, \
1886
+ 'Initialize geofencing prior to species lookup'
1887
+ assert taxonomy_string_to_taxonomy_info is not None, \
1888
+ 'Initialize taxonomy lookup prior to species lookup'
1889
+
1890
+ species = species.lower()
1891
+
1892
+ # Turn "species" into a taxonomy string
1893
+
1894
+ # If this is already a taxonomy string...
1895
+ if len(species.split(';')) == 5:
1896
+ pass
1897
+ # If this is a binomial name (which may include a subspecies)...
1898
+ elif (len(species.split(' ')) in (2,3)) and (species in binomial_name_to_taxonomy_info):
1899
+ taxonomy_info = binomial_name_to_taxonomy_info[species]
1900
+ taxonomy_string = taxonomy_info_to_taxonomy_string(taxonomy_info)
1901
+ # If this is a common name...
1902
+ elif species in common_name_to_taxonomy_info:
1903
+ taxonomy_info = common_name_to_taxonomy_info[species]
1904
+ taxonomy_string = taxonomy_info_to_taxonomy_string(taxonomy_info)
1905
+ else:
1906
+ raise ValueError('Could not find taxonomic information for {}'.format(species))
1907
+
1908
+ return taxonomy_string
1909
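+
+ # Behavior sketch, assuming taxonomy and geofencing data have been initialized;
+ # each of these calls should return the same five-token taxonomy string:
+ #
+ # _species_string_to_canonical_species_string('odocoileus virginianus')
+ # _species_string_to_canonical_species_string('white-tailed deer')
+ # _species_string_to_canonical_species_string(
+ #     'mammalia;cetartiodactyla;cervidae;odocoileus;virginianus')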
+
1910
+
1911
+ def species_allowed_in_country(species,country,state=None,return_status=False):
1912
+ """
1913
+ Determines whether [species] is allowed in [country], according to
1914
+ already-initialized geofencing rules.
1915
+
1916
+ Args:
1917
+ species (str): can be a common name, a binomial name, or a species string
1918
+ country (str): country name or three-letter code
1919
+ state (str, optional): two-letter US state code
1920
+ return_status (bool, optional): by default, this function returns a bool;
1921
+ if you want to know *why* [species] is allowed/not allowed, setting
1922
+ return_status to True will return additional information.
1923
+
1924
+ Returns:
1925
+ bool or str: typically returns True if [species] is allowed in [country], else
1926
+ False. Returns a more detailed string if return_status is set.
1927
+ """
1928
+
1929
+ global taxonomy_string_to_taxonomy_info
1930
+ global binomial_name_to_taxonomy_info
1931
+ global common_name_to_taxonomy_info
1932
+
1933
+ assert taxonomy_string_to_geofencing_rules is not None, \
1934
+ 'Initialize geofencing prior to species lookup'
1935
+ assert taxonomy_string_to_taxonomy_info is not None, \
1936
+ 'Initialize taxonomy lookup prior to species lookup'
1937
+
1938
+ taxonomy_string = _species_string_to_canonical_species_string(species)
1939
+
1940
+ # Normalize [state]
1941
+
1942
+ if state is not None:
1943
+ state = state.upper()
1944
+ assert len(state) == 2
1945
+
1946
+ # Turn "country" into a country code
1947
+
1948
+ if len(country) == 3:
1949
+ assert country.upper() in country_code_to_country
1950
+ country = country.upper()
1951
+ else:
1952
+ assert country.lower() in country_to_country_code
1953
+ country = country_to_country_code[country.lower()]
1954
+
1955
+ country_code = country.upper()
1956
+
1957
+ # Species with no rules are allowed everywhere
1958
+ if taxonomy_string not in taxonomy_string_to_geofencing_rules:
1959
+ status = 'allow_by_default'
1960
+ if return_status:
1961
+ return status
1962
+ else:
1963
+ return True
1964
+
1965
+ geofencing_rules_this_species = taxonomy_string_to_geofencing_rules[taxonomy_string]
1966
+ allowed_countries = []
1967
+ blocked_countries = []
1968
+
1969
+ assert len(geofencing_rules_this_species.keys()) == 1
1970
+ rule_type = list(geofencing_rules_this_species.keys())[0]
1971
+ assert rule_type in ('allow','block')
1972
+
1973
+ if rule_type == 'allow':
1974
+ allowed_countries = list(geofencing_rules_this_species['allow'])
1975
+ else:
1976
+ assert rule_type == 'block'
1977
+ blocked_countries = list(geofencing_rules_this_species['block'])
1978
+
1979
+ status = None
1980
+
1981
+ # The convention is that block rules win over allow rules
1982
+ if country_code in blocked_countries:
1983
+ status = 'blocked'
1984
+ elif country_code in allowed_countries:
1985
+ status = 'allowed'
1986
+ else:
1987
+ # The convention is that if allow rules exist, any country not on that list
1988
+ # is blocked.
1989
+ assert len(allowed_countries) > 0
1990
+ status = 'not_on_country_allow_list'
+ if return_status:
+ return status
+ else:
+ return False
1991
+
1992
+ # Now let's see whether we have to deal with any regional rules
1993
+ if state is None:
1994
+
1995
+ # If state rules are provided, we need to have a state
1996
+ if country_code == 'USA':
1997
+ state_list = geofencing_rules_this_species[rule_type][country_code]
1998
+ if len(state_list) > 0:
1999
+ raise ValueError('Cannot determine status for a species with state-level rules with no state information')
2000
+
2001
+ else:
2002
+
2003
+ # Right now state-level rules only exist for the US
2004
+ assert country_code == 'USA'
2005
+ state_list = geofencing_rules_this_species[rule_type][country_code]
2006
+
2007
+ if state in state_list:
2008
+ # If the state is on the list, do what the list says
2009
+ if rule_type == 'allow':
2010
+ status = 'allow_on_state_allow_list'
2011
+ else:
2012
+ status = 'block_on_state_block_list'
2013
+ else:
2014
+ # If the state is not on the list, do the opposite of what the list says
2015
+ if rule_type == 'allow':
2016
+ status = 'block_not_on_state_allow_list'
2017
+ else:
2018
+ status = 'allow_not_on_state_block_list'
2019
+
2020
+ if return_status:
2021
+ return status
2022
+ else:
2023
+ if status.startswith('allow'):
2024
+ return True
2025
+ else:
2026
+ assert status.startswith('block')
2027
+ return False
2028
+
2029
+ # ...def species_allowed_in_country(...)
2030
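+
+ # Usage sketch, assuming initialize_taxonomy_info() and initialize_geofencing()
+ # have already been called (see the interactive drivers at the end of this file):
+ #
+ # species_allowed_in_country('puma concolor','guatemala')
+ # species_allowed_in_country('puma concolor','USA',state='WA',return_status=True)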
+
2031
+
2032
+ def restrict_to_taxa_list(taxa_list,
2033
+ speciesnet_taxonomy_file,
2034
+ input_file,
2035
+ output_file,
2036
+ allow_walk_down=False):
2037
+ """
2038
+ Given a prediction file in MD .json format, likely without having had
2039
+ a geofence applied, apply a custom taxa list.
2040
+
2041
+ Args:
2042
+ taxa_list (str or list): list of latin names, or a text file containing
2043
+ a list of latin names. Optionally may contain a second (comma-delimited)
2044
+ column containing common names, used only for debugging. Latin names
2045
+ must exist in the SpeciesNet taxonomy.
2046
+ speciesnet_taxonomy_file (str): taxonomy filename, in the same format used for model
2047
+ release (with 7-token taxonomy entries)
2048
+ input_file (str): input predictions .json file, in MD format
+ output_file (str): .json file to write, in MD format
2049
+ allow_walk_down (bool, optional): should we walk down the taxonomy tree
2050
+ when making mappings if a parent has only a single allowable child?
2051
+ For example, if only a single felid species is allowed, should other
2052
+ felid predictions be mapped to that species, as opposed to being mapped
2053
+ to the family?
2054
+ """
2055
+
2056
+ ##%% Read target taxa list
2057
+
2058
+ if isinstance(taxa_list,str):
2059
+ assert os.path.isfile(taxa_list), \
2060
+ 'Could not find taxa list file {}'.format(taxa_list)
2061
+ with open(taxa_list,'r') as f:
2062
+ taxa_list = f.readlines()
2063
+
2064
+ taxa_list = [s.strip().lower() for s in taxa_list]
2065
+ taxa_list = [s for s in taxa_list if len(s) > 0]
2066
+
2067
+ target_latin_to_common = {}
2068
+ for s in taxa_list:
2069
+ if s.strip().startswith('#'):
2070
+ continue
2071
+ tokens = s.split(',')
2072
+ assert len(tokens) <= 2
2073
+ binomial_name = tokens[0]
2074
+ assert len(binomial_name.split(' ')) in (1,2,3), \
2075
+ 'Illegal binomial name in species list: {}'.format(binomial_name)
2076
+ if len(tokens) > 1:
2077
+ common_name = tokens[1].strip().lower()
2078
+ else:
2079
+ common_name = None
2080
+ assert binomial_name not in target_latin_to_common
2081
+ target_latin_to_common[binomial_name] = common_name
2082
+
2083
+
2084
+ ##%% Read taxonomy file
2085
+
2086
+ with open(speciesnet_taxonomy_file,'r') as f:
2087
+ speciesnet_taxonomy_list = f.readlines()
2088
+ speciesnet_taxonomy_list = [s.strip() for s in \
2089
+ speciesnet_taxonomy_list if len(s.strip()) > 0]
2090
+
2091
+ # Maps the latin name of every taxon to the corresponding full taxon string
2092
+ #
2093
+ # For species, the key is a binomial name
2094
+ speciesnet_latin_name_to_taxon_string = {}
2095
+ speciesnet_common_name_to_taxon_string = {}
2096
+
2097
+ def _insert_taxonomy_string(s):
2098
+
2099
+ tokens = s.split(';')
2100
+ assert len(tokens) == 7
2101
+
2102
+ guid = tokens[0] # noqa
2103
+ class_name = tokens[1]
2104
+ order = tokens[2]
2105
+ family = tokens[3]
2106
+ genus = tokens[4]
2107
+ species = tokens[5]
2108
+ common_name = tokens[6]
2109
+
2110
+ if len(class_name) == 0:
2111
+ assert common_name in ('animal','vehicle','blank')
2112
+ return
2113
+
2114
+ if len(species) > 0:
2115
+ assert all([len(s) > 0 for s in [genus,family,order]])
2116
+ binomial_name = genus + ' ' + species
2117
+ if binomial_name not in speciesnet_latin_name_to_taxon_string:
2118
+ speciesnet_latin_name_to_taxon_string[binomial_name] = s
2119
+ elif len(genus) > 0:
2120
+ assert all([len(s) > 0 for s in [family,order]])
2121
+ if genus not in speciesnet_latin_name_to_taxon_string:
2122
+ speciesnet_latin_name_to_taxon_string[genus] = s
2123
+ elif len(family) > 0:
2124
+ assert len(order) > 0
2125
+ if family not in speciesnet_latin_name_to_taxon_string:
2126
+ speciesnet_latin_name_to_taxon_string[family] = s
2127
+ elif len(order) > 0:
2128
+ if order not in speciesnet_latin_name_to_taxon_string:
2129
+ speciesnet_latin_name_to_taxon_string[order] = s
2130
+ else:
2131
+ if class_name not in speciesnet_latin_name_to_taxon_string:
2132
+ speciesnet_latin_name_to_taxon_string[class_name] = s
2133
+
2134
+ if len(common_name) > 0:
2135
+ if common_name not in speciesnet_common_name_to_taxon_string:
2136
+ speciesnet_common_name_to_taxon_string[common_name] = s
2137
+
2138
+ for s in speciesnet_taxonomy_list:
2139
+
2140
+ _insert_taxonomy_string(s)
2141
+
2142
+
2143
+ ##%% Make sure all parent taxa are represented in the taxonomy
2144
+
2145
+ # In theory any taxon that appears as the parent of another taxon should
2146
+ # also be in the taxonomy, but this isn't always true, so we fix it here.
2147
+
2148
+ new_taxon_string_to_missing_tokens = defaultdict(list)
2149
+
2150
+ # latin_name = next(iter(speciesnet_latin_name_to_taxon_string.keys()))
2151
+ for latin_name in speciesnet_latin_name_to_taxon_string.keys():
2152
+
2153
+ if 'no cv result' in latin_name:
2154
+ continue
2155
+
2156
+ taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
2157
+ tokens = taxon_string.split(';')
2158
+
2159
+ # Don't process GUID, species, or common name
2160
+ # i_token = 6
2161
+ for i_token in range(1,len(tokens)-2):
2162
+
2163
+ test_token = tokens[i_token]
2164
+ if len(test_token) == 0:
2165
+ continue
2166
+
2167
+ # Do we need to make up a taxon for this token?
2168
+ if test_token not in speciesnet_latin_name_to_taxon_string:
2169
+
2170
+ new_tokens = [''] * 7
2171
+ new_tokens[0] = 'fake_guid'
2172
+ for i_copy_token in range(1,i_token+1):
2173
+ new_tokens[i_copy_token] = tokens[i_copy_token]
2174
+ new_tokens[-1] = test_token + ' species'
2175
+ assert new_tokens[-2] == ''
2176
+ new_taxon_string = ';'.join(new_tokens)
2177
+ # assert new_taxon_string not in new_taxon_strings
2178
+ new_taxon_string_to_missing_tokens[new_taxon_string].append(test_token)
2179
+
2180
+ # ...for each token
2181
+
2182
+ # ...for each taxon
2183
+
2184
+ print('Found {} taxa that need to be inserted to make the taxonomy valid:\n'.format(
2185
+ len(new_taxon_string_to_missing_tokens)))
2186
+
2187
+ new_taxon_string_to_missing_tokens = \
2188
+ sort_dictionary_by_key(new_taxon_string_to_missing_tokens)
2189
+ for taxon_string in new_taxon_string_to_missing_tokens:
2190
+ missing_taxa = ','.join(new_taxon_string_to_missing_tokens[taxon_string])
2191
+ print('{} ({})'.format(taxon_string,missing_taxa))
2192
+
2193
+ for new_taxon_string in new_taxon_string_to_missing_tokens:
2194
+ _insert_taxonomy_string(new_taxon_string)
2195
+
2196
+
2197
+ ##%% Make sure all species on the allow-list are in the taxonomy
2198
+
2199
+ n_failed_mappings = 0
2200
+
2201
+ for target_taxon_latin_name in target_latin_to_common.keys():
2202
+ if target_taxon_latin_name not in speciesnet_latin_name_to_taxon_string:
2203
+ common_name = target_latin_to_common[target_taxon_latin_name]
2204
+ s = '{} ({}) not in speciesnet taxonomy'.format(
2205
+ target_taxon_latin_name,common_name)
2206
+ if common_name in speciesnet_common_name_to_taxon_string:
2207
+ s += ' (common name maps to {})'.format(
2208
+ speciesnet_common_name_to_taxon_string[common_name])
2209
+ print(s)
2210
+ n_failed_mappings += 1
2211
+
2212
+ if n_failed_mappings > 0:
2213
+ raise ValueError('Cannot continue with geofence generation')
2214
+
2215
+
2216
+ ##%% For the allow-list, map each parent taxon to a set of allowable child taxa
2217
+
2218
+ # Maps parent names to all allowed child names, or None if this is the
2219
+ # lowest-level allowable taxon on this path
2220
+ allowed_parent_taxon_to_child_taxa = defaultdict(set)
2221
+
2222
+ # latin_name = next(iter(target_latin_to_common.keys()))
2223
+ for latin_name in target_latin_to_common:
2224
+
2225
+ taxon_string = speciesnet_latin_name_to_taxon_string[latin_name]
2226
+ tokens = taxon_string.split(';')
2227
+ assert len(tokens) == 7
2228
+
2229
+ # Remove GUID and common name
2230
+ #
2231
+ # This is now always class/order/family/genus/species
2232
+ tokens = tokens[1:-1]
2233
+
2234
+ child_taxon = None
2235
+
2236
+ # If this is a species
2237
+ if len(tokens[-1]) > 0:
2238
+ binomial_name = tokens[-2] + ' ' + tokens[-1]
2239
+ assert binomial_name == latin_name
2240
+ allowed_parent_taxon_to_child_taxa[binomial_name].add(None)
2241
+ child_taxon = binomial_name
2242
+
2243
+ # The first candidate parent is the genus
2244
+ parent_token_index = len(tokens) - 2
2245
+
2246
+ while(parent_token_index >= 0):
2247
+
2248
+ parent_taxon = tokens[parent_token_index]
2249
+ allowed_parent_taxon_to_child_taxa[parent_taxon].add(child_taxon)
2250
+ child_taxon = parent_taxon
2251
+ parent_token_index -= 1
2252
+
2253
+ # ...for each allowed latin name
2254
+
2255
+ allowed_parent_taxon_to_child_taxa = \
2256
+ sort_dictionary_by_key(allowed_parent_taxon_to_child_taxa)
2257
+
2258
+
2259
+ ##%% Map all predictions that exist in this dataset...
2260
+
2261
+ # ...to the prediction we should generate.
2262
+
2263
+ with open(input_file,'r') as f:
2264
+ input_data = json.load(f)
2265
+
2266
+ input_category_id_to_common_name = input_data['classification_categories'] #noqa
2267
+ input_category_id_to_taxonomy_string = \
2268
+ input_data['classification_category_descriptions']
2269
+
2270
+ input_category_id_to_output_taxon_string = {}
2271
+
2272
+ # input_category_id = next(iter(input_category_id_to_taxonomy_string.keys()))
2273
+ for input_category_id in input_category_id_to_taxonomy_string.keys():
2274
+
2275
+ input_taxon_string = input_category_id_to_taxonomy_string[input_category_id]
2276
+ input_taxon_tokens = input_taxon_string.split(';')
2277
+ assert len(input_taxon_tokens) == 7
2278
+
2279
+ # Don't mess with blank/no-cv-result/animal/human
2280
+ if (input_taxon_string in non_taxonomic_prediction_strings) or \
2281
+ (input_taxon_string == human_prediction_string):
2282
+ input_category_id_to_output_taxon_string[input_category_id] = \
2283
+ input_taxon_string
2284
+ continue
2285
+
2286
+ # Remove GUID and common name
2287
+ #
2288
+ # This is now always class/order/family/genus/species
2289
+ input_taxon_tokens = input_taxon_tokens[1:-1]
2290
+
2291
+ test_index = len(input_taxon_tokens) - 1
2292
+ target_taxon = None
2293
+
2294
+ # Start at the species level, and see whether each taxon is allowed
2295
+ while((test_index >= 0) and (target_taxon is None)):
2296
+
2297
+ # Species are represented as binomial names
2298
+ if (test_index == (len(input_taxon_tokens) - 1)) and \
2299
+ (len(input_taxon_tokens[-1]) > 0):
2300
+ test_taxon_name = \
2301
+ input_taxon_tokens[-2] + ' ' + input_taxon_tokens[-1]
2302
+ else:
2303
+ test_taxon_name = input_taxon_tokens[test_index]
2304
+
2305
+ # If we haven't yet found the level at which this taxon is non-empty,
2306
+ # keep going up
2307
+ if len(test_taxon_name) == 0:
2308
+ test_index -= 1
2309
+ continue
2310
+
2311
+ assert test_taxon_name in speciesnet_latin_name_to_taxon_string
2312
+
2313
+ # Is this taxon allowed according to the custom species list?
2314
+ if test_taxon_name in allowed_parent_taxon_to_child_taxa:
2315
+
2316
+ allowed_child_taxa = allowed_parent_taxon_to_child_taxa[test_taxon_name]
2317
+ assert allowed_child_taxa is not None
2318
+
2319
+ # If this is the lowest-level allowable token or there is not a
2320
+ # unique child, don't walk any further, even if walking down
2321
+ # is enabled.
2322
+ if (None in allowed_child_taxa):
2323
+ assert len(allowed_child_taxa) == 1
2324
+
2325
+ if (None in allowed_child_taxa) or (len(allowed_child_taxa) > 1):
2326
+ target_taxon = test_taxon_name
2327
+ elif not allow_walk_down:
2328
+ target_taxon = test_taxon_name
2329
+ else:
2330
+ # If there's a unique child, walk back *down* the allowable
2331
+ # taxa until we run out of unique children
2332
+ while ((next(iter(allowed_child_taxa)) is not None) and \
2333
+ (len(allowed_child_taxa) == 1)):
2334
+ candidate_taxon = next(iter(allowed_child_taxa))
2335
+ assert candidate_taxon in allowed_parent_taxon_to_child_taxa
2336
+ assert candidate_taxon in speciesnet_latin_name_to_taxon_string
2337
+ allowed_child_taxa = \
2338
+ allowed_parent_taxon_to_child_taxa[candidate_taxon]
2339
+ target_taxon = candidate_taxon
2340
+
2341
+ # ...if this is an allowed taxon
2342
+
2343
+ test_index -= 1
2344
+
2345
+ # ...for each token
2346
+
2347
+ if target_taxon is None:
2348
+ output_taxon_string = animal_prediction_string
2349
+ else:
2350
+ output_taxon_string = speciesnet_latin_name_to_taxon_string[target_taxon]
2351
+ input_category_id_to_output_taxon_string[input_category_id] = output_taxon_string
2352
+
2353
+ # ...for each category
2354
+
2355
+
2356
+ ##%% Build the new tables
2357
+
2358
+ input_category_id_to_output_category_id = {}
2359
+ output_taxon_string_to_category_id = {}
2360
+ output_category_id_to_common_name = {}
2361
+
2362
+ for input_category_id in input_category_id_to_output_taxon_string:
2363
+
2364
+ original_common_name = \
2365
+ input_category_id_to_common_name[input_category_id]
2366
+ original_taxon_string = \
2367
+ input_category_id_to_taxonomy_string[input_category_id]
2368
+ output_taxon_string = \
2369
+ input_category_id_to_output_taxon_string[input_category_id]
2370
+
2371
+ output_common_name = output_taxon_string.split(';')[-1]
2372
+
2373
+ # Do we need to create a new output category?
2374
+ if output_taxon_string not in output_taxon_string_to_category_id:
2375
+ output_category_id = str(len(output_taxon_string_to_category_id))
2376
+ output_taxon_string_to_category_id[output_taxon_string] = \
2377
+ output_category_id
2378
+ output_category_id_to_common_name[output_category_id] = \
2379
+ output_common_name
2380
+ else:
2381
+ output_category_id = \
2382
+ output_taxon_string_to_category_id[output_taxon_string]
2383
+
2384
+ input_category_id_to_output_category_id[input_category_id] = \
2385
+ output_category_id
2386
+
2387
+ if False:
2388
+ print('Mapping {} ({}) to:\n{} ({})\n'.format(
2389
+ original_common_name,original_taxon_string,
2390
+ output_common_name,output_taxon_string))
2391
+ if False:
2392
+ print('Mapping {} to {}'.format(
2393
+ original_common_name,output_common_name,))
2394
+
2395
+ # ...for each category
2396
+
2397
+
2398
+ ##%% Remap all category labels
2399
+
2400
+ assert len(set(output_taxon_string_to_category_id.keys())) == \
2401
+ len(set(output_taxon_string_to_category_id.values()))
2402
+
2403
+ output_category_id_to_taxon_string = \
2404
+ invert_dictionary(output_taxon_string_to_category_id)
2405
+
2406
+ with open(input_file,'r') as f:
2407
+ output_data = json.load(f)
2408
+
2409
+ for im in tqdm(output_data['images']):
2410
+ if 'detections' in im and im['detections'] is not None:
2411
+ for det in im['detections']:
2412
+ if 'classifications' in det:
2413
+ for classification in det['classifications']:
2414
+ classification[0] = \
2415
+ input_category_id_to_output_category_id[classification[0]]
2416
+
2417
+ output_data['classification_categories'] = output_category_id_to_common_name
2418
+ output_data['classification_category_descriptions'] = \
2419
+ output_category_id_to_taxon_string
2420
+
2421
+
2422
+ ##%% Write output
2423
+
2424
+ with open(output_file,'w') as f:
2425
+ json.dump(output_data,f,indent=1)
2426
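+
+ # ...def restrict_to_taxa_list(...)
+
+ # Usage sketch (filenames and the taxa list are hypothetical):
+ #
+ # restrict_to_taxa_list(taxa_list='target_taxa.txt',
+ #                       speciesnet_taxonomy_file='taxonomy_release.txt',
+ #                       input_file='ensemble-output.md_format.json',
+ #                       output_file='ensemble-output.custom_taxa.json')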
+
2427
+
2428
+ #%% Interactive driver(s)
2429
+
2430
+ if False:
2431
+
2432
+ pass
2433
+
2434
+ #%% Shared cell to initialize geofencing and taxonomy information
2435
+
2436
+ from megadetector.utils.wi_utils import species_allowed_in_country # noqa
2437
+ from megadetector.utils.wi_utils import initialize_geofencing, initialize_taxonomy_info # noqa
2438
+ from megadetector.utils.wi_utils import _species_string_to_canonical_species_string # noqa
2439
+ from megadetector.utils.wi_utils import generate_csv_rows_for_species # noqa
2440
+ from megadetector.utils.wi_utils import _generate_csv_rows_to_block_all_countries_except # noqa
2441
+
2442
+ from megadetector.utils.wi_utils import taxonomy_string_to_geofencing_rules # noqa
2443
+ from megadetector.utils.wi_utils import taxonomy_string_to_taxonomy_info # noqa
2444
+
2445
+ geofencing_file = r'c:\git\cameratrapai\data\geofence_base.json'
2446
+
2447
+ country_code_file = r'g:\temp\country-codes.csv'
2448
+ # encoding = 'cp1252'; taxonomy_file = r'g:\temp\taxonomy_mapping-' + encoding + '.json'
2449
+ encoding = None; taxonomy_file = r'g:\temp\taxonomy_mapping.json'
2450
+
2451
+ initialize_geofencing(geofencing_file, country_code_file, force_init=True)
2452
+ initialize_taxonomy_info(taxonomy_file, force_init=True, encoding=encoding)
2453
+
2454
+
2455
+ #%% Test driver for geofence_fixes.csv function
2456
+
2457
+ block_except_list = 'AUS, PNG, THA, IDN, MYS'
2458
+ species = 'dingo'
2459
+ species_string = _species_string_to_canonical_species_string(species)
2460
+ rows = _generate_csv_rows_to_block_all_countries_except(species_string,block_except_list)
2461
+
2462
+ import clipboard; clipboard.copy('\n'.join(rows))
2463
+
2464
+
2465
+ #%%
2466
+
2467
+ generate_csv_rows_for_species(species_string=species_string,
2468
+ allow_countries=None,
2469
+ block_countries=None,
2470
+ allow_states=None,
2471
+ block_states=None,
2472
+ blockexcept_countries=None)
2473
+
2474
+
2475
+ _generate_csv_rows_to_block_all_countries_except(species_string,'AUS')
2476
+
2477
+
2478
+ #%% Test the effects of geofence changes
2479
+
2480
+ species = 'canis lupus dingo'
2481
+ country = 'guatemala'
2482
+ species_allowed_in_country(species,country,state=None,return_status=False)
2483
+
2484
+
2485
+ #%% instances.json generation test
2486
+
2487
+ from megadetector.utils.wi_utils import generate_instances_json_from_folder # noqa
2488
+
2489
+ instances_file = r'g:\temp\water-hole\instances.json'
2490
+
2491
+ _ = generate_instances_json_from_folder(folder=r'g:\temp\water-hole',
2492
+ country='NAM',
2493
+ lat=None,
2494
+ lon=None,
2495
+ output_file=instances_file,
2496
+ filename_replacements={'g:/temp':'/mnt/g/temp'})
2497
+
2498
+ # from megadetector.utils.path_utils import open_file; open_file(instances_file)
2499
+
2500
+
2501
+ #%% MD --> prediction conversion test
2502
+
2503
+ from megadetector.utils.wi_utils import generate_predictions_json_from_md_results # noqa
2504
+ md_results_file = r'G:\temp\md-test-images\mdv5a.relpaths.json'
2505
+ predictions_json_file = r'\\wsl$\Ubuntu\home\dmorris\tmp\speciesnet-tests\mdv5a.abspaths.predictions-format.json'
2506
+ generate_predictions_json_from_md_results(md_results_file,predictions_json_file,base_folder=
2507
+ '/home/dmorris/tmp/md-test-images/')
2508
+
2509
+ from megadetector.utils.wi_utils import generate_predictions_json_from_md_results # noqa
2510
+ md_results_file = r"G:\temp\water-hole\md_results.json"
2511
+ predictions_json_file = r"G:\temp\water-hole\md_results-prediction_format.json"
2512
+ generate_predictions_json_from_md_results(md_results_file,predictions_json_file,base_folder=
2513
+ '/mnt/g/temp/water-hole')
2514
+
2515
+
2516
+ #%% Geofencing tests
2517
+
2518
+ species = 'didelphis marsupialis'
2519
+ print(binomial_name_to_taxonomy_info[species])
2520
+ country = 'Guatemala'
2521
+ assert species_allowed_in_country(species, country)
2522
+
2523
+ species = 'virginia opossum'
2524
+ print(common_name_to_taxonomy_info[species])
2525
+ country = 'USA'
2526
+ assert species_allowed_in_country(species, country)
2527
+
2528
+
2529
+ #%% Test several species
2530
+
2531
+ if True:
2532
+
2533
+ # Make sure some Guatemalan species are allowed in Guatemala
2534
+ all_species = [
2535
+ 'didelphis marsupialis',
2536
+ 'didelphis virginiana',
2537
+ 'dasypus novemcinctus',
2538
+ 'urocyon cinereoargenteus',
2539
+ 'nasua narica',
2540
+ 'eira barbara',
2541
+ 'conepatus semistriatus',
2542
+ 'leopardus wiedii',
2543
+ 'leopardus pardalis',
2544
+ 'puma concolor',
2545
+ 'panthera onca',
2546
+ 'tapirus bairdii',
2547
+ 'pecari tajacu',
2548
+ 'tayassu pecari',
2549
+ 'mazama temama',
2550
+ 'mazama pandora',
2551
+ 'odocoileus virginianus',
2552
+ 'dasyprocta punctata',
2553
+ 'tinamus major',
2554
+ 'crax rubra',
2555
+ 'meleagris ocellata',
2556
+ 'gulo gulo' # Consistency check; this species should be blocked
2557
+ ]
2558
+
2559
+ country = 'guatemala'
2560
+ state = None
2561
+
2562
+ if True:
2563
+
2564
+ # Make sure some PNW species are allowed in the right states
2565
+ all_species = \
2566
+ ['Taxidea taxus',
2567
+ 'Martes americana',
2568
+ 'Ovis canadensis',
2569
+ 'Ursus americanus',
2570
+ 'Lynx rufus',
2571
+ 'Lynx canadensis',
2572
+ 'Puma concolor',
2573
+ 'Canis latrans',
2574
+ 'Cervus canadensis',
2575
+ 'Canis lupus',
2576
+ 'Ursus arctos',
2577
+ 'Marmota caligata',
2578
+ 'Alces alces',
2579
+ 'Oreamnos americanus',
2580
+ 'Odocoileus hemionus',
2581
+ 'Vulpes vulpes',
2582
+ 'Lepus americanus',
2583
+ 'Mephitis mephitis',
2584
+ 'Odocoileus virginianus',
2585
+ 'Marmota flaviventris',
2586
+ 'tapirus bairdii' # Consistency check; this species should be blocked
2587
+ ]
2588
+
2589
+ all_species = [s.lower() for s in all_species]
2590
+
2591
+ country = 'USA'
2592
+ state = 'WA'
2593
+ # state = 'MT'
2594
+
2595
+ if True:
2596
+
2597
+ all_species = ['ammospermophilus harrisii']
2598
+ country = 'USA'
2599
+ state = 'CA'
2600
+
2601
+ for species in all_species:
2602
+
2603
+ taxonomy_info = binomial_name_to_taxonomy_info[species]
2604
+ allowed = species_allowed_in_country(species, country, state=state, return_status=True)
2605
+ state_string = ''
2606
+ if state is not None:
2607
+ state_string = ' ({})'.format(state)
2608
+ print('{} ({}) for {}{}: {}'.format(taxonomy_info['common_name'],species,country,state_string,allowed))
2609
+
2610
+
2611
+ #%% Test conversion from predictions.json to MD format
2612
+
2613
+ import os # noqa
2614
+ from megadetector.utils.wi_utils import generate_md_results_from_predictions_json # noqa
2615
+
2616
+ # detector_source = 'speciesnet'
2617
+ detector_source = 'md'
2618
+
2619
+ if False:
2620
+ image_folder = r'g:\temp\md-test-images'
2621
+ base_folder = '/home/dmorris/tmp/md-test-images/'
2622
+ if detector_source == 'speciesnet':
2623
+ predictions_json_file = r"\\wsl$\Ubuntu\home\dmorris\tmp\speciesnet-tests\ensemble-output.json"
2624
+ md_results_file = r"\\wsl$\Ubuntu\home\dmorris\tmp\speciesnet-tests\ensemble-output-md-format.json"
2625
+ else:
2626
+ assert detector_source == 'md'
2627
+ predictions_json_file = r"\\wsl$\Ubuntu\home\dmorris\tmp\speciesnet-tests\ensemble-output-from-md-results.json"
2628
+ md_results_file = r"\\wsl$\Ubuntu\home\dmorris\tmp\speciesnet-tests\ensemble-output-md-format-from-md-results.json"
2629
+ else:
2630
+ image_folder = r'g:\temp\water-hole'
2631
+ base_folder = '/mnt/g/temp/water-hole/'
2632
+ if detector_source == 'speciesnet':
2633
+ predictions_json_file = r'g:\temp\water-hole\ensemble-output.json'
2634
+ md_results_file = r'g:\temp\water-hole\ensemble-output.md_format.json'
2635
+ else:
2636
+ assert detector_source == 'md'
2637
+ predictions_json_file = r'g:\temp\water-hole\ensemble-output-md.json'
2638
+ md_results_file = r'g:\temp\water-hole\ensemble-output-md.md_format.json'
2639
+
2640
+ generate_md_results_from_predictions_json(predictions_json_file=predictions_json_file,
2641
+ md_results_file=md_results_file,
2642
+ base_folder=base_folder)
2643
+
2644
+ # from megadetector.utils.path_utils import open_file; open_file(md_results_file)
2645
+
2646
+ assert os.path.isdir(image_folder)
2647
+
2648
+
2649
+ #%% Preview
2650
+
2651
+ from megadetector.postprocessing.postprocess_batch_results import \
2652
+ PostProcessingOptions, process_batch_results
2653
+ from megadetector.utils import path_utils
2654
+
2655
+ render_animals_only = False
2656
+
2657
+ options = PostProcessingOptions()
2658
+ options.image_base_dir = image_folder
2659
+ options.include_almost_detections = True
2660
+ options.num_images_to_sample = None
2661
+ options.confidence_threshold = 0.2
2662
+ options.almost_detection_confidence_threshold = options.confidence_threshold - 0.05
2663
+ options.ground_truth_json_file = None
2664
+ options.separate_detections_by_category = True
2665
+ options.sample_seed = 0
2666
+ options.max_figures_per_html_file = 5000
2667
+
2668
+ options.parallelize_rendering = True
2669
+ options.parallelize_rendering_n_cores = 10
2670
+ options.parallelize_rendering_with_threads = True
2671
+ options.sort_classification_results_by_count = True
2672
+
2673
+ if render_animals_only:
2674
+ # Omit some pages from the output, useful when animals are rare
2675
+ options.rendering_bypass_sets = ['detections_person','detections_vehicle',
2676
+ 'detections_person_vehicle','non_detections']
2677
+
2678
+ output_base = r'g:\temp\preview' + '_' + detector_source
2679
+ if render_animals_only:
2680
+ output_base = output_base + '_render_animals_only'
2681
+ os.makedirs(output_base, exist_ok=True)
2682
+
2683
+ print('Writing preview to {}'.format(output_base))
2684
+
2685
+ options.md_results_file = md_results_file
2686
+ options.output_dir = output_base
2687
+ ppresults = process_batch_results(options)
2688
+ html_output_file = ppresults.output_html_file
2689
+
2690
+ path_utils.open_file(html_output_file,attempt_to_open_in_wsl_host=True,browser_name='chrome')
2691
+ # import clipboard; clipboard.copy(html_output_file)