megadetector 5.0.27__py3-none-any.whl → 5.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +23 -23
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -1
- megadetector/data_management/camtrap_dp_to_coco.py +45 -45
- megadetector/data_management/cct_json_utils.py +101 -101
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +189 -189
- megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
- megadetector/data_management/databases/integrity_check_json_db.py +202 -188
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +38 -38
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +130 -124
- megadetector/data_management/labelme_to_yolo.py +78 -72
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
- megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +70 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
- megadetector/data_management/mewc_to_md.py +339 -340
- megadetector/data_management/ocr_tools.py +258 -252
- megadetector/data_management/read_exif.py +232 -223
- megadetector/data_management/remap_coco_categories.py +26 -26
- megadetector/data_management/remove_exif.py +31 -20
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +41 -41
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +117 -120
- megadetector/data_management/yolo_to_coco.py +195 -188
- megadetector/detection/change_detection.py +831 -0
- megadetector/detection/process_video.py +341 -338
- megadetector/detection/pytorch_detector.py +308 -266
- megadetector/detection/run_detector.py +186 -166
- megadetector/detection/run_detector_batch.py +366 -364
- megadetector/detection/run_inference_with_yolov5_val.py +328 -325
- megadetector/detection/run_tiled_inference.py +312 -253
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +291 -283
- megadetector/postprocessing/add_max_conf.py +15 -11
- megadetector/postprocessing/categorize_detections_by_size.py +44 -44
- megadetector/postprocessing/classification_postprocessing.py +808 -311
- megadetector/postprocessing/combine_batch_outputs.py +20 -21
- megadetector/postprocessing/compare_batch_results.py +528 -517
- megadetector/postprocessing/convert_output_format.py +97 -97
- megadetector/postprocessing/create_crop_folder.py +220 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -0
- megadetector/postprocessing/load_api_results.py +25 -22
- megadetector/postprocessing/md_to_coco.py +129 -98
- megadetector/postprocessing/md_to_labelme.py +89 -83
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +87 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -302
- megadetector/postprocessing/remap_detection_categories.py +36 -36
- megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -69
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +33 -33
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +11 -11
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/azure_utils.py +22 -22
- megadetector/utils/ct_utils.py +1019 -200
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +541 -518
- megadetector/utils/path_utils.py +1511 -406
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/sas_blob_utils.py +53 -49
- megadetector/utils/split_locations_into_train_val.py +73 -60
- megadetector/utils/string_utils.py +147 -26
- megadetector/utils/url_utils.py +463 -173
- megadetector/utils/wi_utils.py +2629 -2868
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +21 -21
- megadetector/visualization/render_images_with_thumbnails.py +37 -73
- megadetector/visualization/visualization_utils.py +424 -404
- megadetector/visualization/visualize_db.py +197 -190
- megadetector/visualization/visualize_detector_output.py +126 -98
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/METADATA +6 -3
- megadetector-5.0.29.dist-info/RECORD +163 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector-5.0.27.dist-info/RECORD +0 -208
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
megadetector/data_management/ocr_tools.py (expanded diff)

The first ten hunks of this file are whitespace-only: trailing whitespace is stripped from blank and comment lines in the module docstring, the imports, the DatetimeExtractionOptions class, make_rough_crops, crop_to_solid_region, and find_text_in_crops. No code in these hunks changes visibly:

@@ -22,11 +22,11 @@ Prior to using this module:
@@ -34,9 +34,9 @@ Known limitations:
@@ -56,7 +56,7 @@ from tqdm import tqdm
@@ -73,40 +73,40 @@ class DatetimeExtractionOptions:
@@ -115,14 +115,14 @@ class DatetimeExtractionOptions:
@@ -136,115 +136,115 @@ class DatetimeExtractionOptions:
@@ -256,20 +256,20 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
@@ -277,7 +277,7 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
@@ -286,39 +286,39 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
@@ -326,51 +326,51 @@ def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
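The functions named in these hunks form the module's datetime-extraction pipeline. For orientation, a minimal sketch of that pipeline, using only the names, fields, and return shapes visible in this diff; the image path is a placeholder, and megadetector 5.0.29, Tesseract, and pytesseract are assumed to be installed (see the module docstring above):

    # Sketch of the ocr_tools pipeline: rough-crop the image borders, then OCR them.
    from megadetector.data_management.ocr_tools import (
        DatetimeExtractionOptions, make_rough_crops, find_text_in_crops)

    options = DatetimeExtractionOptions()
    options.image_crop_fraction = [0.08, 0.08]   # take a taller rough crop than the 4.5% default

    # Crop the top and bottom strips, where camera-trap metadata overlays usually live
    rough_crops = make_rough_crops('example_image.JPG', options=options)   # {'top': Image, 'bottom': Image}

    # Run Tesseract on each strip (internally this first tightens the crop to the solid metadata bar)
    ocr_results = find_text_in_crops(rough_crops, options=options)
    for location in ('top', 'bottom'):
        print(location, ocr_results[location]['text'])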
@@ -386,155 +386,155 @@ def _get_datetime_from_strings(strings,options=None):

In this hunk, the regular-expression literals in _get_datetime_from_strings become raw strings (an r prefix is added); the patterns themselves are unchanged. Every other line in the hunk, including all of get_datetime_from_image, changes only in trailing whitespace.

     ### AM/PM

     # 2013-10-02 11:40:50 AM
-    m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)

     # 04/01/2017 08:54:00AM
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)

     # 2017/04/01 08:54:00AM
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)

     # 04/01/2017 08:54AM
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)

     # 2017/04/01 08:54AM
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)

     ### No AM/PM

     # 2013-07-27 04:56:35
-    m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)

     # 07-27-2013 04:56:35
-    m = re.search('(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    m = re.search(r'(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)

     # 2013/07/27 04:56:35
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)

     # 07/27/2013 04:56:35
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
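The r prefix matters because sequences such as \d and \s are not valid Python string escapes; in non-raw literals they trigger "invalid escape sequence" warnings on recent Python versions, which is presumably what this change addresses. A quick check of the first pattern against the example timestamp from the source comment (the test string itself is only illustrative):

    # Raw-string pattern taken from the hunk above; matching behavior is unchanged,
    # but the literal no longer relies on unrecognized escape sequences.
    import re

    s = '2013-10-02 11:40:50 am'
    pattern = r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)'
    m = re.search(pattern, s)
    print(m.group(0))   # -> '2013-10-02 11:40:50 am'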
@@ -544,34 +544,34 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):

This hunk (the try_get_datetime_from_image docstring and body) is whitespace-only.

@@ -580,79 +580,85 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):

The tail of try_get_datetime_from_image and the signature, docstring, and serial path of get_datetimes_for_folder change only in whitespace. In the parallel branch, sixteen lines (old lines 634-649) are replaced by a worker pool that is explicitly closed and joined in a try/finally block; the excerpt below shows the new code with its surrounding context:

     if n_workers <= 1:

         all_results = []
         for fn_abs in tqdm(image_file_names):
             all_results.append(try_get_datetime_from_image(fn_abs,options=options))

     else:

         # Don't spawn more than one worker per image
         if n_workers > len(image_file_names):
             n_workers = len(image_file_names)

+        pool = None
+        try:
+            if use_threads:
+                from multiprocessing.pool import ThreadPool
+                pool = ThreadPool(n_workers)
+                worker_string = 'threads'
+            else:
+                from multiprocessing.pool import Pool
+                pool = Pool(n_workers)
+                worker_string = 'processes'
+
+            print('Starting a pool of {} {}'.format(n_workers,worker_string))
+
+            all_results = list(tqdm(pool.imap(
+                partial(try_get_datetime_from_image,options=options),image_file_names),
+                total=len(image_file_names)))
+        finally:
+            pool.close()
+            pool.join()
+            print("Pool closed and joined for datetime extraction")

     filename_to_results = {}

     # fn_relative = image_file_names[0]
     for i_file,fn_abs in enumerate(image_file_names):
         filename_to_results[fn_abs] = all_results[i_file]

     if output_file is not None:
         with open(output_file,'w') as f:
             json.dump(filename_to_results,f,indent=1,default=str)
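For context, a minimal usage sketch of the function whose pool handling changed above, exercising the parallel branch. Argument names follow the signature shown in this hunk; the paths are placeholders, and megadetector 5.0.29, Tesseract, and pytesseract are assumed to be installed:

    # Recursively OCR datetime overlays for every image in a folder.
    from megadetector.data_management.ocr_tools import (
        DatetimeExtractionOptions, get_datetimes_for_folder)

    options = DatetimeExtractionOptions()

    filename_to_results = get_datetimes_for_folder(
        'path/to/camera_trap_images',       # processed recursively
        output_file='ocr_results.json',     # per-image results also written here as JSON
        options=options,
        n_workers=8,                        # <= 1 disables parallelization
        use_threads=True)                   # threads rather than processes

    for filename, result in filename_to_results.items():
        print(filename, result.get('datetime'))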
The remaining hunks cover the interactive driver ("if False:" cells) and the notes on other approaches to parsing dates from strings. They are also whitespace-only; their shifted line numbers reflect the six lines added to get_datetimes_for_folder above:

@@ -663,9 +669,9 @@ def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options
@@ -681,60 +687,60 @@ if False:
@@ -745,19 +751,19 @@ if False:
@@ -765,18 +771,18 @@ if False:
@@ -784,64 +790,64 @@ if False:
@@ -853,7 +859,7 @@ if False:
@@ -864,7 +870,7 @@ if False:
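Mirroring the scrap cell in the interactive driver above, a per-image sketch that tries two option sets in order until one yields a datetime; the path is a placeholder and the same installation assumptions apply:

    # Per-image extraction with a fallback option set that uses a taller rough crop.
    from megadetector.data_management.ocr_tools import (
        DatetimeExtractionOptions, try_get_datetime_from_image)

    options_a = DatetimeExtractionOptions()
    options_b = DatetimeExtractionOptions()
    options_b.image_crop_fraction = [0.08, 0.08]   # second pass looks at a larger border region

    result = try_get_datetime_from_image('path/to/image.JPG',
                                         options=[options_a, options_b])
    print(result.get('datetime'))          # Python datetime, or None if extraction failed
    if result.get('error') is not None:
        print('OCR failed:', result['error'])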