megadetector 5.0.6__py3-none-any.whl → 5.0.8__py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- api/batch_processing/data_preparation/manage_local_batch.py +297 -202
- api/batch_processing/data_preparation/manage_video_batch.py +7 -2
- api/batch_processing/postprocessing/add_max_conf.py +1 -0
- api/batch_processing/postprocessing/combine_api_outputs.py +2 -2
- api/batch_processing/postprocessing/compare_batch_results.py +111 -61
- api/batch_processing/postprocessing/convert_output_format.py +24 -6
- api/batch_processing/postprocessing/load_api_results.py +56 -72
- api/batch_processing/postprocessing/md_to_labelme.py +119 -51
- api/batch_processing/postprocessing/merge_detections.py +30 -5
- api/batch_processing/postprocessing/postprocess_batch_results.py +175 -55
- api/batch_processing/postprocessing/remap_detection_categories.py +163 -0
- api/batch_processing/postprocessing/render_detection_confusion_matrix.py +628 -0
- api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
- api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
- api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +224 -76
- api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
- api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
- classification/prepare_classification_script.py +191 -191
- data_management/cct_json_utils.py +7 -2
- data_management/coco_to_labelme.py +263 -0
- data_management/coco_to_yolo.py +72 -48
- data_management/databases/integrity_check_json_db.py +75 -64
- data_management/databases/subset_json_db.py +1 -1
- data_management/generate_crops_from_cct.py +1 -1
- data_management/get_image_sizes.py +44 -26
- data_management/importers/animl_results_to_md_results.py +3 -5
- data_management/importers/noaa_seals_2019.py +2 -2
- data_management/importers/zamba_results_to_md_results.py +2 -2
- data_management/labelme_to_coco.py +264 -127
- data_management/labelme_to_yolo.py +96 -53
- data_management/lila/create_lila_blank_set.py +557 -0
- data_management/lila/create_lila_test_set.py +2 -1
- data_management/lila/create_links_to_md_results_files.py +1 -1
- data_management/lila/download_lila_subset.py +138 -45
- data_management/lila/generate_lila_per_image_labels.py +23 -14
- data_management/lila/get_lila_annotation_counts.py +16 -10
- data_management/lila/lila_common.py +15 -42
- data_management/lila/test_lila_metadata_urls.py +116 -0
- data_management/read_exif.py +65 -16
- data_management/remap_coco_categories.py +84 -0
- data_management/resize_coco_dataset.py +14 -31
- data_management/wi_download_csv_to_coco.py +239 -0
- data_management/yolo_output_to_md_output.py +40 -13
- data_management/yolo_to_coco.py +313 -100
- detection/process_video.py +36 -14
- detection/pytorch_detector.py +1 -1
- detection/run_detector.py +73 -18
- detection/run_detector_batch.py +116 -27
- detection/run_inference_with_yolov5_val.py +135 -27
- detection/run_tiled_inference.py +153 -43
- detection/tf_detector.py +2 -1
- detection/video_utils.py +4 -2
- md_utils/ct_utils.py +101 -6
- md_utils/md_tests.py +264 -17
- md_utils/path_utils.py +326 -47
- md_utils/process_utils.py +26 -7
- md_utils/split_locations_into_train_val.py +215 -0
- md_utils/string_utils.py +10 -0
- md_utils/url_utils.py +66 -3
- md_utils/write_html_image_list.py +12 -2
- md_visualization/visualization_utils.py +380 -74
- md_visualization/visualize_db.py +41 -10
- md_visualization/visualize_detector_output.py +185 -104
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/METADATA +11 -13
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/RECORD +74 -67
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/WHEEL +1 -1
- taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
- taxonomy_mapping/map_new_lila_datasets.py +43 -39
- taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
- taxonomy_mapping/preview_lila_taxonomy.py +27 -27
- taxonomy_mapping/species_lookup.py +33 -13
- taxonomy_mapping/taxonomy_csv_checker.py +7 -5
- md_visualization/visualize_megadb.py +0 -183
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/LICENSE +0 -0
- {megadetector-5.0.6.dist-info → megadetector-5.0.8.dist-info}/top_level.txt +0 -0
api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py

@@ -14,7 +14,10 @@ import warnings
 import sklearn.cluster
 import numpy as np
 import jsonpickle
+import traceback
 import pandas as pd
+import json
+import shutil
 
 from tqdm import tqdm
 from operator import attrgetter
@@ -35,6 +38,8 @@ from api.batch_processing.postprocessing.postprocess_batch_results import relati
 from md_visualization.visualization_utils import open_image, render_detection_bounding_boxes
 from md_visualization import render_images_with_thumbnails
 from md_visualization import visualization_utils as vis_utils
+from md_utils.path_utils import flatten_path
+from md_utils.ct_utils import invert_dictionary
 
 # "PIL cannot read EXIF metainfo for the images"
 warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
@@ -42,10 +47,12 @@ warnings.filterwarnings('ignore', '(Possibly )?corrupt EXIF data', UserWarning)
 # "Metadata Warning, tag 256 had too many entries: 42, expected 1"
 warnings.filterwarnings('ignore', 'Metadata warning', UserWarning)
 
+jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)
+
 
 #%% Constants
 
-
+detection_index_file_name_base = 'detectionIndex.json'
 
 
 #%% Classes
@@ -75,25 +82,31 @@ class RepeatDetectionOptions:
     # are required before we declare it suspicious?
     occurrenceThreshold = 20
 
+    # Ignore "suspicious" detections smaller than some size
+    minSuspiciousDetectionSize = 0.0
+
     # Ignore "suspicious" detections larger than some size; these are often animals
     # taking up the whole image. This is expressed as a fraction of the image size.
     maxSuspiciousDetectionSize = 0.2
 
-    # Ignore "suspicious" detections smaller than some size
-    minSuspiciousDetectionSize = 0.0
-
     # Ignore folders with more than this many images in them
     maxImagesPerFolder = None
 
     # A list of classes we don't want to treat as suspicious. Each element is an int.
     excludeClasses = [] # [annotation_constants.detector_bbox_category_name_to_id['person']]
 
+    # For very large sets of results, passing chunks of results to and from workers as
+    # parameters ('memory') can be memory-intensive, so we can serialize to intermediate
+    # files instead ('file').
+    #
+    # The use of 'file' here is still experimental.
+    pass_detections_to_processes_method = 'memory'
+
     nWorkers = 10
 
+    # Should we use threads or processes for parallelization?
     parallelizationUsesThreads = True
 
-    viz_target_width = 800
-
     # Load detections from a filter file rather than finding them from the detector output
 
     # .json file containing detections, generally this is the detectionIndex.json file in
@@ -168,7 +181,7 @@ class RepeatDetectionOptions:
 
     # Optionally show a grid that includes a sample image for the detection, plus
     # the top N additional detections
-    bRenderDetectionTiles =
+    bRenderDetectionTiles = True
 
     # If this is None, we'll render at the width of the original image
     detectionTilesPrimaryImageWidth = None
@@ -180,7 +193,7 @@ class RepeatDetectionOptions:
     # of luck.
     detectionTilesCroppedGridWidth = 0.6
     detectionTilesPrimaryImageLocation='right'
-    detectionTilesMaxCrops =
+    detectionTilesMaxCrops = 250
 
     # If bRenderOtherDetections is True, what color should we use to render the
     # (hopefully pretty subtle) non-target detections?
@@ -213,7 +226,7 @@ class RepeatDetectionResults:
     """
 
     # The data table (Pandas DataFrame), as loaded from the input json file via
-    # load_api_results()
+    # load_api_results(). Has columns ['file', 'detections','failure'].
    detectionResults = None
 
     # The other fields in the input json file, loaded via load_api_results()
@@ -313,7 +326,7 @@ class DetectionLocation:
         return detection
 
 
-#%%
+#%% Support functions
 
 def enumerate_images(dirName,outputFileName=None):
     """
@@ -347,7 +360,7 @@ def render_bounding_box(detection, inputFileName, outputFileName, lineWidth=5,
 
 
 def detection_rect_to_rtree_rect(detection_rect):
-    # We store
+    # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
     l = detection_rect[0]
     b = detection_rect[1]
     r = detection_rect[0] + detection_rect[2]
@@ -356,7 +369,7 @@ def detection_rect_to_rtree_rect(detection_rect):
 
 
 def rtree_rect_to_detection_rect(rtree_rect):
-    # We store
+    # We store detections as x/y/w/h, rtree and pyqtree use l/b/r/t
     x = rtree_rect[0]
     y = rtree_rect[1]
     w = rtree_rect[2] - rtree_rect[0]
@@ -364,12 +377,11 @@ def rtree_rect_to_detection_rect(rtree_rect):
     return (x,y,w,h)
 
 
-#%% Sort a list of candidate detections to make them visually easier to review
-
 def sort_detections_for_directory(candidateDetections,options):
     """
     candidateDetections is a list of DetectionLocation objects. Sorts them to
-    put nearby detections next to each other, for easier visual review.
+    put nearby detections next to each other, for easier visual review. Returns
+    a sorted copy of candidateDetections, does not sort in-place.
     """
 
     if len(candidateDetections) <= 1 or options.smartSort is None:
@@ -462,13 +474,24 @@ def sort_detections_for_directory(candidateDetections,options):
         raise ValueError('Unrecognized sort method {}'.format(
             options.smartSort))
 
-
-
+# ...def sort_detections_for_directory(...)
+
 
 def find_matches_in_directory(dirNameAndRows, options):
     """
     dirNameAndRows is a tuple of (name,rows).
 
+    "name" is a location name, typically a folder name.
+
+    "rows" is a Pandas dataframe with one row per image in this location, with columns:
+
+    * 'file': relative file name
+    * 'detections': a list of MD detection objects, i.e. dicts with keys ['category','conf','bbox']
+    * 'max_detection_conf': maximum confidence of any detection, in any category
+
+    "rows" can also point to a .csv file, in which case the detection table will be read from that
+    .csv file, and results will be written to a .csv file rather than being returned.
+
     Find all unique detections in this directory.
 
     Returns a list of DetectionLocation objects.
@@ -480,11 +503,21 @@ def find_matches_in_directory(dirNameAndRows, options):
     # Create a tree to store candidate detections
     candidateDetectionsIndex = pyqtree.Index(bbox=(-0.1,-0.1,1.1,1.1))
 
-    assert len(dirNameAndRows) == 2
-    assert isinstance(dirNameAndRows[0],str)
-    dirName = dirNameAndRows[0]
+    assert len(dirNameAndRows) == 2, 'find_matches_in_directory: invalid input'
+    assert isinstance(dirNameAndRows[0],str), 'find_matches_in_directory: invalid location name'
+    dirName = dirNameAndRows[0]
     rows = dirNameAndRows[1]
-
+
+    detections_loaded_from_csv_file = None
+
+    if isinstance(rows,str):
+        detections_loaded_from_csv_file = rows
+        print('Loading results for location {} from {}'.format(
+            dirName,detections_loaded_from_csv_file))
+        rows = pd.read_csv(detections_loaded_from_csv_file)
+        # Pandas writes out detections out as strings, convert them back to lists
+        rows['detections'] = rows['detections'].apply(lambda s: json.loads(s.replace('\'','"')))
+
     if options.maxImagesPerFolder is not None and len(rows) > options.maxImagesPerFolder:
         print('Ignoring directory {} because it has {} images (limit set to {})'.format(
             dirName,len(rows),options.maxImagesPerFolder))
@@ -539,7 +572,7 @@ def find_matches_in_directory(dirNameAndRows, options):
         # }
         detections = row['detections']
         if isinstance(detections,float):
-            assert isinstance(row['failure'],str)
+            assert isinstance(row['failure'],str), 'Expected failure indicator'
             print('Skipping failed image {} ({})'.format(filename,row['failure']))
             continue
 
@@ -554,8 +587,9 @@ def find_matches_in_directory(dirNameAndRows, options):
                 print('Skipping detection {}'.format(iDetection))
                 continue
 
-            assert 'category' in detection and
-                '
+            assert 'category' in detection and \
+                   'conf' in detection and \
+                   'bbox' in detection, 'Illegal detection'
 
             confidence = detection['conf']
 
@@ -572,7 +606,7 @@ def find_matches_in_directory(dirNameAndRows, options):
                 continue
 
             # Optionally exclude some classes from consideration as suspicious
-            if len(options.excludeClasses) > 0:
+            if (options.excludeClasses is not None) and (len(options.excludeClasses) > 0):
                 iClass = int(detection['category'])
                 if iClass in options.excludeClasses:
                     continue
@@ -588,8 +622,12 @@ def find_matches_in_directory(dirNameAndRows, options):
 
             area = h * w
 
+            if area < 0:
+                print('Warning: negative-area bounding box for file {}'.format(filename))
+                area = abs(area); h = abs(h); w = abs(w)
+
             assert area >= 0.0 and area <= 1.0, \
-                'Illegal bounding box area {}'.format(area)
+                'Illegal bounding box area {} in image {}'.format(area,filename)
 
             if area < options.minSuspiciousDetectionSize:
                 continue
@@ -653,9 +691,7 @@ def find_matches_in_directory(dirNameAndRows, options):
                 candidate = DetectionLocation(instance=instance,
                     detection=detection, relativeDir=dirName,
                     category=category, id=i_iteration)
-
-                # candidateDetections.append(candidate)
-
+
                 # pyqtree
                 candidateDetectionsIndex.insert(item=candidate,bbox=rtree_rect)
 
@@ -673,20 +709,45 @@ def find_matches_in_directory(dirNameAndRows, options):
     candidateDetections.sort(
         key=lambda x: x.id, reverse=False)
 
-
-
-
+    if detections_loaded_from_csv_file is not None:
+        location_results_file = \
+            os.path.splitext(detections_loaded_from_csv_file)[0] + \
+            '_results.json'
+        print('Writing results for location {} to {}'.format(
+            dirName,location_results_file))
+        s = jsonpickle.encode(candidateDetections,make_refs=False)
+        with open(location_results_file,'w') as f:
+            f.write(s)
+            # json.dump(candidateDetections,f,indent=1)
+        return location_results_file
+    else:
+        return candidateDetections
 
+# ...def find_matches_in_directory(...)
 
-#%% Update the detection table based on suspicious results, write .csv output
 
-def update_detection_table(
+def update_detection_table(repeatDetectionResults, options, outputFilename=None):
+    """
+    Changes confidence values in repeatDetectionResults.detectionResults so that detections
+    deemed to be possible false positives are given negative confidence values.
+
+    repeatDetectionResults is an object of type RepeatDetectionResults, with a pandas
+    dataframe (detectionResults) containing all the detections loaded from the .json file,
+    and a list of detections for each location (suspiciousDetections) that are deemed to
+    be suspicious.
+
+    returns the modified pandas dataframe (repeatDetectionResults.detectionResults), but
+    also modifies it in place.
+    """
 
-
+    # This is the pandas dataframe that contains actual detection results.
+    #
+    # Has fields ['file', 'detections','failure'].
+    detectionResults = repeatDetectionResults.detectionResults
 
     # An array of length nDirs, where each element is a list of DetectionLocation
     # objects for that directory that have been flagged as suspicious
-    suspiciousDetectionsByDirectory =
+    suspiciousDetectionsByDirectory = repeatDetectionResults.suspiciousDetections
 
     nBboxChanges = 0
 
@@ -715,8 +776,8 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)
                 # if iou < options.iouThreshold:
                 #     print('IOU warning: {},{}'.format(iou,options.iouThreshold))
 
-                assert instance.filename in
-                iRow =
+                assert instance.filename in repeatDetectionResults.filenameToRow
+                iRow = repeatDetectionResults.filenameToRow[instance.filename]
                 row = detectionResults.iloc[iRow]
                 rowDetections = row['detections']
                 detectionToModify = rowDetections[instance.iDetection]
@@ -800,7 +861,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)
 
     # If we're also writing output...
     if outputFilename is not None and len(outputFilename) > 0:
-        write_api_results(detectionResults,
+        write_api_results(detectionResults, repeatDetectionResults.otherFields,
            outputFilename)
 
     print(
@@ -809,7 +870,7 @@ def update_detection_table(RepeatDetectionResults, options, outputFilename=None)
 
     return detectionResults
 
-# ...def update_detection_table(
+# ...def update_detection_table(...)
 
 
 def render_sample_image_for_detection(detection,filteringDir,options):
@@ -845,12 +906,12 @@ def render_sample_image_for_detection(detection,filteringDir,options):
 
     try:
 
+        im = open_image(inputFullPath)
+
        # Should we render (typically in a very light color) detections
        # *other* than the one we're highlighting here?
        if options.bRenderOtherDetections:
-
-            im = open_image(inputFullPath)
-
+
            # Optionally resize the output image
            if (options.maxOutputImageWidth is not None) and \
                (im.size[0] > options.maxOutputImageWidth):
@@ -896,6 +957,10 @@ def render_sample_image_for_detection(detection,filteringDir,options):
            render_bounding_box(detection, inputFullPath, outputFullPath,
                lineWidth=options.lineThickness, expansion=options.boxExpansion)
 
+        # ...if we are/aren't rendering other bounding boxes
+
+        # If we're rendering detection tiles, we'll re-load and re-write the image we
+        # just wrote to outputFullPath
        if options.bRenderDetectionTiles:
 
            assert not is_sas_url(options.imageBase), "Can't render detection tiles from SAS URLs"
@@ -903,6 +968,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
            if options.detectionTilesPrimaryImageWidth is not None:
                primaryImageWidth = options.detectionTilesPrimaryImageWidth
            else:
+                # "im" may be a resized version of the original image, if we've already run
+                # the code to render other bounding boxes.
                primaryImageWidth = im.size[0]
 
            if options.detectionTilesCroppedGridWidth <= 1.0:
@@ -926,7 +993,8 @@ def render_sample_image_for_detection(detection,filteringDir,options):
                secondaryImageFilenameList[0:options.detectionTilesMaxCrops]
            secondaryImageBoundingBoxList = \
                secondaryImageBoundingBoxList[0:options.detectionTilesMaxCrops]
-
+
+            # This will over-write the image we've already written to outputFullPath
            render_images_with_thumbnails.render_images_with_thumbnails(
                primary_image_filename=outputFullPath,
                primary_image_width=primaryImageWidth,
@@ -940,16 +1008,20 @@ def render_sample_image_for_detection(detection,filteringDir,options):
            # bDetectionTilesCroppedGridWidth = 0.6
            # bDetectionTilesPrimaryImageLocation='right'
 
-        # ...if we are/aren't rendering
+        # ...if we are/aren't rendering detection tiles
 
    except Exception as e:
-
-
+
+        stack_trace = traceback.format_exc()
+        print('Warning: error rendering bounding box from {} to {}: {} ({})'.format(
+            inputFullPath,outputFullPath,e,stack_trace))
        if options.bFailOnRenderError:
            raise
-
 
-
+# ...def render_sample_image_for_detection(...)
+
+
+#%% Main entry point
 
 def find_repeat_detections(inputFilename, outputFilename=None, options=None):
 
@@ -1002,9 +1074,9 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
 
    # Load file to a pandas dataframe. Also populates 'max_detection_conf', even if it's
    # not present in the .json file.
-
    detectionResults, otherFields = load_api_results(inputFilename, normalize_paths=True,
-        filename_replacements=options.filenameReplacements
+        filename_replacements=options.filenameReplacements,
+        force_forward_slashes=True)
    toReturn.detectionResults = detectionResults
    toReturn.otherFields = otherFields
 
@@ -1028,7 +1100,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
        assert os.path.isfile(absolutePath), 'Could not find file {}'.format(absolutePath)
 
 
-    ##%% Separate files into
+    ##%% Separate files into locations
 
    # This will be a map from a directory name to smaller data frames
    rowsByDirectory = {}
@@ -1036,12 +1108,12 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
    # This is a mapping back into the rows of the original table
    filenameToRow = {}
 
-    print('Separating
+    print('Separating images into locations...')
 
    nCustomDirReplacements = 0
 
    # iRow = 0; row = detectionResults.iloc[0]
-    for iRow, row in detectionResults.iterrows():
+    for iRow, row in tqdm(detectionResults.iterrows(),total=len(detectionResults)):
 
        relativePath = row['file']
 
@@ -1079,7 +1151,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
    if options.customDirNameFunction is not None:
        print('Custom dir name function made {} replacements (of {} images)'.format(
            nCustomDirReplacements,len(detectionResults)))
-
+
    # Convert lists of rows to proper DataFrames
    dirs = list(rowsByDirectory.keys())
    for d in dirs:
@@ -1088,11 +1160,10 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
    toReturn.rowsByDirectory = rowsByDirectory
    toReturn.filenameToRow = filenameToRow
 
-    print('Finished separating {} files into {}
-
-
+    print('Finished separating {} files into {} locations'.format(len(detectionResults),
+        len(rowsByDirectory)))
 
-    ##% Look for
+    ##% Look for repeat detections (or load them from file)
 
    dirsToSearch = list(rowsByDirectory.keys())
    if options.debugMaxDir > 0:
@@ -1119,6 +1190,11 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
 
    allCandidateDetections = [None] * len(dirsToSearch)
 
+    # If we serialize results to intermediate files, we need to remove slashes from
+    # location names; we store mappings here.
+    normalized_location_name_to_location_name = None
+    location_name_to_normalized_location_name = None
+
    if not options.bParallelizeComparisons:
 
        options.pbar = None
@@ -1136,7 +1212,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
            print('Pool of {} requested, but only {} folders available, reducing pool to {}'.\
                format(n_workers,len(dirNameAndRows),len(dirNameAndRows)))
            n_workers = len(dirNameAndRows)
-
+
        if options.parallelizationUsesThreads:
            pool = ThreadPool(n_workers); poolstring = 'threads'
        else:
@@ -1144,24 +1220,96 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
 
        print('Starting comparison pool with {} {}'.format(n_workers,poolstring))
 
-
-
-
-
-
-
-
-
+        assert options.pass_detections_to_processes_method in ('file','memory'), \
+            'Unrecognized IPC mechanism: {}'.format(options.pass_detections_to_processes_method)
+
+        # ** Experimental **
+        #
+        # Rather than passing detections and results around in memory, write detections and
+        # results for each worker to intermediate files. May improve performance for very large
+        # results sets that exceed working memory.
+        if options.pass_detections_to_processes_method == 'file':
+
+            ##%% Convert location names to normalized names we can write to files
+
+            normalized_location_name_to_location_name = {}
+            for dir_name in dirsToSearch:
+                normalized_location_name = flatten_path(dir_name)
+                assert normalized_location_name not in normalized_location_name_to_location_name, \
+                    'Redundant location name {}, can\'t serialize to intermediate files'.format(
+                        dir_name)
+                normalized_location_name_to_location_name[normalized_location_name] = dir_name
+
+            location_name_to_normalized_location_name = \
+                invert_dictionary(normalized_location_name_to_location_name)
+
+
+            ##%% Write results to files for each location
+
+            print('Writing results to intermediate files')
+
+            intermediate_json_file_folder = os.path.join(options.outputBase,'intermediate_results')
+            os.makedirs(intermediate_json_file_folder,exist_ok=True)
+
+            # i_location = 0; location_info = dirNameAndRows[0]
+            dirNameAndIntermediateFile = []
+
+            # i_location = 0; location_info = dirNameAndRows[i_location]
+            for i_location, location_info in tqdm(enumerate(dirNameAndRows)):
+
+                location_name = location_info[0]
+                assert location_name in location_name_to_normalized_location_name
+                normalized_location_name = location_name_to_normalized_location_name[location_name]
+                intermediate_results_file = os.path.join(intermediate_json_file_folder,
+                    normalized_location_name + '.csv')
+                detections_table_this_location = location_info[1]
+                detections_table_this_location.to_csv(intermediate_results_file,header=True,index=False)
+                dirNameAndIntermediateFile.append((location_name,intermediate_results_file))
+
+
+            ##%% Find detections in each directory
+
            options.pbar = None
-
-            partial(find_matches_in_directory,options=options),
+            allCandidateDetectionFiles = list(pool.imap(
+                partial(find_matches_in_directory,options=options), dirNameAndIntermediateFile))
+
+
+            ##%% Load into a combined list of candidate detections
+
+            allCandidateDetections = []
+
+            # candidate_detection_file = allCandidateDetectionFiles[0]
+            for candidate_detection_file in allCandidateDetectionFiles:
+                s = open(candidate_detection_file, 'r').read()
+                candidate_detections_this_file = jsonpickle.decode(s)
+                allCandidateDetections.append(candidate_detections_this_file)
+
+
+            ##%% Clean up intermediate files
+
+            shutil.rmtree(intermediate_json_file_folder)
+
+        # If we're passing things around in memory, rather than via intermediate files
+        else:
+
+            # We get slightly nicer progress bar behavior using threads, by passing a pbar
+            # object and letting it get updated. We can't serialize this object across
+            # processes.
+            if options.parallelizationUsesThreads:
+                options.pbar = tqdm(total=len(dirNameAndRows))
+                allCandidateDetections = list(pool.imap(
+                    partial(find_matches_in_directory,options=options), dirNameAndRows))
+            else:
+                options.pbar = None
+                allCandidateDetections = list(tqdm(pool.imap(
+                    partial(find_matches_in_directory,options=options), dirNameAndRows)))
 
    print('\nFinished looking for similar detections')
 
 
-    ##%%
+    ##%% Mark suspicious locations based on match results
 
-    print('
+    print('Marking repeat detections...')
 
    nImagesWithSuspiciousDetections = 0
    nSuspiciousDetections = 0
@@ -1202,7 +1350,8 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
 
    # ...for each directory
 
-    print('Finished
+    print('Finished marking repeat detections')
+
    print('Found {} unique detections on {} images that are suspicious'.format(
        nSuspiciousDetections, nImagesWithSuspiciousDetections))
 
@@ -1371,8 +1520,7 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
            detection.sampleImageDetections = None
 
    # Write out the detection index
-    detectionIndexFileName = os.path.join(filteringDir,
-    jsonpickle.set_encoder_options('json', sort_keys=True, indent=1)
+    detectionIndexFileName = os.path.join(filteringDir, detection_index_file_name_base)
 
    # Prepare the data we're going to write to the detection index file
    detectionInfo = {}
@@ -1396,4 +1544,4 @@ def find_repeat_detections(inputFilename, outputFilename=None, options=None):
 
    return toReturn
 
-# ...find_repeat_detections()
+# ...def find_repeat_detections()