megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/efficientnet/model.py +8 -8
- megadetector/classification/efficientnet/utils.py +6 -5
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +26 -26
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -2
- megadetector/data_management/camtrap_dp_to_coco.py +79 -46
- megadetector/data_management/cct_json_utils.py +103 -103
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +210 -193
- megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
- megadetector/data_management/databases/integrity_check_json_db.py +228 -200
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +88 -39
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +133 -125
- megadetector/data_management/labelme_to_yolo.py +159 -73
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
- megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +73 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
- megadetector/data_management/mewc_to_md.py +344 -340
- megadetector/data_management/ocr_tools.py +262 -255
- megadetector/data_management/read_exif.py +249 -227
- megadetector/data_management/remap_coco_categories.py +90 -28
- megadetector/data_management/remove_exif.py +81 -21
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +588 -120
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +248 -122
- megadetector/data_management/yolo_to_coco.py +333 -191
- megadetector/detection/change_detection.py +832 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +358 -278
- megadetector/detection/run_detector.py +399 -186
- megadetector/detection/run_detector_batch.py +404 -377
- megadetector/detection/run_inference_with_yolov5_val.py +340 -327
- megadetector/detection/run_tiled_inference.py +257 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +332 -295
- megadetector/postprocessing/add_max_conf.py +19 -11
- megadetector/postprocessing/categorize_detections_by_size.py +45 -45
- megadetector/postprocessing/classification_postprocessing.py +468 -433
- megadetector/postprocessing/combine_batch_outputs.py +23 -23
- megadetector/postprocessing/compare_batch_results.py +590 -525
- megadetector/postprocessing/convert_output_format.py +106 -102
- megadetector/postprocessing/create_crop_folder.py +347 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +48 -27
- megadetector/postprocessing/md_to_coco.py +133 -102
- megadetector/postprocessing/md_to_labelme.py +107 -90
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +92 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -301
- megadetector/postprocessing/remap_detection_categories.py +91 -38
- megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +156 -74
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/ct_utils.py +1049 -211
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +632 -529
- megadetector/utils/path_utils.py +1520 -431
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/split_locations_into_train_val.py +62 -62
- megadetector/utils/string_utils.py +148 -27
- megadetector/utils/url_utils.py +489 -176
- megadetector/utils/wi_utils.py +2658 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +34 -30
- megadetector/visualization/render_images_with_thumbnails.py +39 -74
- megadetector/visualization/visualization_utils.py +487 -435
- megadetector/visualization/visualize_db.py +232 -198
- megadetector/visualization/visualize_detector_output.py +82 -76
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
- megadetector-10.0.0.dist-info/RECORD +139 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
- megadetector/api/batch_processing/api_core/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
- megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
- megadetector/api/batch_processing/api_core/server.py +0 -294
- megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
- megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
- megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
- megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
- megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
- megadetector/api/batch_processing/api_core/server_utils.py +0 -88
- megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
- megadetector/api/batch_processing/api_support/__init__.py +0 -0
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
- megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
- megadetector/api/synchronous/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
- megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
- megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
- megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
- megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
- megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector/utils/azure_utils.py +0 -178
- megadetector/utils/sas_blob_utils.py +0 -509
- megadetector-5.0.28.dist-info/RECORD +0 -209
- /megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0
|
@@ -22,11 +22,11 @@ Prior to using this module:
|
|
|
22
22
|
* Install Tesseract from https://tesseract-ocr.github.io/tessdoc/Installation.html
|
|
23
23
|
|
|
24
24
|
* pip install pytesseract
|
|
25
|
-
|
|
25
|
+
|
|
26
26
|
Known limitations:
|
|
27
27
|
|
|
28
28
|
* Semi-transparent overlays (which I've only seen on consumer cameras) usually fail.
|
|
29
|
-
|
|
29
|
+
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
32
|
#%% Notes to self
|
|
@@ -34,11 +34,12 @@ Known limitations:
|
|
|
34
34
|
"""
|
|
35
35
|
|
|
36
36
|
* To use the legacy engine (--oem 0), I had to download an updated eng.traineddata file from:
|
|
37
|
-
|
|
37
|
+
|
|
38
38
|
https://github.com/tesseract-ocr/tessdata
|
|
39
|
-
|
|
39
|
+
|
|
40
40
|
"""
|
|
41
41
|
|
|
42
|
+
|
|
42
43
|
#%% Constants and imports
|
|
43
44
|
|
|
44
45
|
import os
|
|
@@ -56,7 +57,7 @@ from tqdm import tqdm
|
|
|
56
57
|
|
|
57
58
|
from megadetector.utils.path_utils import find_images
|
|
58
59
|
from megadetector.utils.path_utils import open_file
|
|
59
|
-
from megadetector.utils import write_html_image_list
|
|
60
|
+
from megadetector.utils import write_html_image_list
|
|
60
61
|
from megadetector.utils.ct_utils import is_iterable
|
|
61
62
|
from megadetector.visualization import visualization_utils as vis_utils
|
|
62
63
|
|
|
@@ -64,7 +65,7 @@ from megadetector.visualization import visualization_utils as vis_utils
|
|
|
64
65
|
#
|
|
65
66
|
# Also install tesseract from: https://github.com/UB-Mannheim/tesseract/wiki, and add
|
|
66
67
|
# the installation dir to your path (on Windows, typically C:\Program Files (x86)\Tesseract-OCR)
|
|
67
|
-
import pytesseract
|
|
68
|
+
import pytesseract # type: ignore
|
|
68
69
|
|
|
69
70
|
|
|
70
71
|
#%% Extraction options
|
|
@@ -73,40 +74,40 @@ class DatetimeExtractionOptions:
|
|
|
73
74
|
"""
|
|
74
75
|
Options used to parameterize datetime extraction in most functions in this module.
|
|
75
76
|
"""
|
|
76
|
-
|
|
77
|
+
|
|
77
78
|
def __init__(self):
|
|
78
|
-
|
|
79
|
-
#: Using a semi-arbitrary metric of how much it feels like we found the
|
|
79
|
+
|
|
80
|
+
#: Using a semi-arbitrary metric of how much it feels like we found the
|
|
80
81
|
#: text-containing region, discard regions that appear to be extraction failures
|
|
81
82
|
self.p_crop_success_threshold = 0.5
|
|
82
|
-
|
|
83
|
+
|
|
83
84
|
#: Pad each crop with a few pixels to make tesseract happy
|
|
84
|
-
self.crop_padding = 10
|
|
85
|
-
|
|
85
|
+
self.crop_padding = 10
|
|
86
|
+
|
|
86
87
|
#: Discard short text, typically text from the top of the image
|
|
87
88
|
self.min_text_length = 4
|
|
88
|
-
|
|
89
|
-
#: When we're looking for pixels that match the background color, allow some
|
|
89
|
+
|
|
90
|
+
#: When we're looking for pixels that match the background color, allow some
|
|
90
91
|
#: tolerance around the dominant color
|
|
91
92
|
self.background_tolerance = 2
|
|
92
|
-
|
|
93
|
-
#: We need to see a consistent color in at least this fraction of pixels in our rough
|
|
93
|
+
|
|
94
|
+
#: We need to see a consistent color in at least this fraction of pixels in our rough
|
|
94
95
|
#: crop to believe that we actually found a candidate metadata region.
|
|
95
96
|
self.min_background_fraction = 0.3
|
|
96
|
-
|
|
97
|
+
|
|
97
98
|
#: What fraction of the [top,bottom] of the image should we use for our rough crop?
|
|
98
99
|
self.image_crop_fraction = [0.045 , 0.045]
|
|
99
100
|
# self.image_crop_fraction = [0.08 , 0.08]
|
|
100
|
-
|
|
101
|
+
|
|
101
102
|
#: Within that rough crop, how much should we use for determining the background color?
|
|
102
103
|
self.background_crop_fraction_of_rough_crop = 0.5
|
|
103
|
-
|
|
104
|
+
|
|
104
105
|
#: A row is considered a probable metadata row if it contains at least this fraction
|
|
105
|
-
#: of the background color. This is used only to find the top and bottom of the crop area,
|
|
106
|
+
#: of the background color. This is used only to find the top and bottom of the crop area,
|
|
106
107
|
#: so it's not that *every* row needs to hit this criteria, only the rows that are generally
|
|
107
108
|
#: above and below the text.
|
|
108
109
|
self.min_background_fraction_for_background_row = 0.5
|
|
109
|
-
|
|
110
|
+
|
|
110
111
|
#: psm 6: "assume a single uniform block of text"
|
|
111
112
|
#: psm 13: raw line
|
|
112
113
|
#: oem: 0 == legacy, 1 == lstm
|
|
@@ -115,14 +116,14 @@ class DatetimeExtractionOptions:
|
|
|
115
116
|
#: Try these configuration strings in order until we find a valid datetime
|
|
116
117
|
self.tesseract_config_strings = ['--oem 1 --psm 13','--oem 0 --psm 13',
|
|
117
118
|
'--oem 1 --psm 6','--oem 0 --psm 6']
|
|
118
|
-
|
|
119
|
+
|
|
119
120
|
#: If this is False, and one set of options appears to succeed for an image, we'll
|
|
120
121
|
#: stop there. If this is True, we always run all option sets on every image.
|
|
121
122
|
self.force_all_ocr_options = False
|
|
122
|
-
|
|
123
|
+
|
|
123
124
|
#: Whether to apply PIL's ImageFilter.SHARPEN prior to OCR
|
|
124
125
|
self.apply_sharpening_filter = True
|
|
125
|
-
|
|
126
|
+
|
|
126
127
|
#: Tesseract should be on your system path, but you can also specify the
|
|
127
128
|
#: path explicitly, e.g. you can do either of these:
|
|
128
129
|
#:
|
|
@@ -136,115 +137,115 @@ class DatetimeExtractionOptions:
|
|
|
136
137
|
def make_rough_crops(image,options=None):
|
|
137
138
|
"""
|
|
138
139
|
Crops the top and bottom regions out of an image.
|
|
139
|
-
|
|
140
|
+
|
|
140
141
|
Args:
|
|
141
142
|
image (Image or str): a PIL Image or file name
|
|
142
143
|
options (DatetimeExtractionOptions, optional): OCR parameters
|
|
143
|
-
|
|
144
|
+
|
|
144
145
|
Returns:
|
|
145
|
-
dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
|
|
146
|
+
dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
|
|
146
147
|
"""
|
|
147
|
-
|
|
148
|
+
|
|
148
149
|
if options is None:
|
|
149
150
|
options = DatetimeExtractionOptions()
|
|
150
|
-
|
|
151
|
+
|
|
151
152
|
if isinstance(image,str):
|
|
152
153
|
image = vis_utils.open_image(image)
|
|
153
|
-
|
|
154
|
+
|
|
154
155
|
w = image.width
|
|
155
156
|
h = image.height
|
|
156
|
-
|
|
157
|
+
|
|
157
158
|
crop_height_top = round(options.image_crop_fraction[0] * h)
|
|
158
159
|
crop_height_bottom = round(options.image_crop_fraction[1] * h)
|
|
159
|
-
|
|
160
|
+
|
|
160
161
|
# l,t,r,b
|
|
161
162
|
#
|
|
162
163
|
# 0,0 is upper-left
|
|
163
164
|
top_crop = image.crop([0,0,w,crop_height_top])
|
|
164
165
|
bottom_crop = image.crop([0,h-crop_height_bottom,w,h])
|
|
165
166
|
return {'top':top_crop,'bottom':bottom_crop}
|
|
166
|
-
|
|
167
|
+
|
|
167
168
|
# ...def make_rough_crops(...)
|
|
168
169
|
|
|
169
170
|
|
|
170
171
|
def crop_to_solid_region(rough_crop,crop_location,options=None):
|
|
171
|
-
"""
|
|
172
|
+
"""
|
|
172
173
|
Given a rough crop from the top or bottom of an image, finds the background color
|
|
173
174
|
and crops to the metadata region.
|
|
174
|
-
|
|
175
|
-
Within a region of an image (typically a crop from the top-ish or bottom-ish part of
|
|
175
|
+
|
|
176
|
+
Within a region of an image (typically a crop from the top-ish or bottom-ish part of
|
|
176
177
|
an image), tightly crop to the solid portion (typically a region with a black background).
|
|
177
178
|
|
|
178
179
|
The success metric is just a binary indicator right now: 1.0 if we found a region we believe
|
|
179
180
|
contains a solid background, 0.0 otherwise.
|
|
180
|
-
|
|
181
|
+
|
|
181
182
|
Args:
|
|
182
183
|
rough_crop (Image): the PIL Image to crop
|
|
183
184
|
crop_location (str): 'top' or 'bottom'
|
|
184
185
|
options (DatetimeExtractionOptions, optional): OCR parameters
|
|
185
|
-
|
|
186
|
+
|
|
186
187
|
Returns:
|
|
187
188
|
tuple: a tuple containing (a cropped_image (Image), p_success (float), padded_image (Image))
|
|
188
189
|
"""
|
|
189
|
-
|
|
190
|
+
|
|
190
191
|
if options is None:
|
|
191
|
-
options = DatetimeExtractionOptions()
|
|
192
|
+
options = DatetimeExtractionOptions()
|
|
192
193
|
|
|
193
194
|
crop_to_solid_region_result = {}
|
|
194
195
|
crop_to_solid_region_result['crop_pil'] = None
|
|
195
196
|
crop_to_solid_region_result['padded_crop_pil'] = None
|
|
196
197
|
crop_to_solid_region_result['p_success'] = 0.0
|
|
197
|
-
|
|
198
|
-
# pil --> cv2
|
|
199
|
-
rough_crop_np = np.array(rough_crop)
|
|
200
|
-
rough_crop_np = rough_crop_np[:, :, ::-1].copy()
|
|
201
|
-
|
|
198
|
+
|
|
199
|
+
# pil --> cv2
|
|
200
|
+
rough_crop_np = np.array(rough_crop)
|
|
201
|
+
rough_crop_np = rough_crop_np[:, :, ::-1].copy()
|
|
202
|
+
|
|
202
203
|
# Search *part* of the crop for the background value (the part closest to the top or bottom
|
|
203
204
|
# of the image)
|
|
204
205
|
rows_to_use_for_background_search = int(rough_crop_np.shape[0] * \
|
|
205
206
|
options.background_crop_fraction_of_rough_crop)
|
|
206
|
-
|
|
207
|
+
|
|
207
208
|
if crop_location == 'top':
|
|
208
209
|
background_search_image = rough_crop_np[0:rows_to_use_for_background_search,:,:]
|
|
209
210
|
elif crop_location == 'bottom':
|
|
210
211
|
background_search_image = rough_crop_np[-rows_to_use_for_background_search:,:,:]
|
|
211
212
|
else:
|
|
212
213
|
raise ValueError('Unrecognized crop location: {}'.format(crop_location))
|
|
213
|
-
|
|
214
|
+
|
|
214
215
|
background_search_image = cv2.cvtColor(background_search_image, cv2.COLOR_BGR2GRAY)
|
|
215
|
-
background_search_image = background_search_image.astype('uint8')
|
|
216
|
-
background_search_image = cv2.medianBlur(background_search_image,3)
|
|
216
|
+
background_search_image = background_search_image.astype('uint8')
|
|
217
|
+
background_search_image = cv2.medianBlur(background_search_image,3)
|
|
217
218
|
pixel_values = background_search_image.flatten()
|
|
218
219
|
counts = np.bincount(pixel_values)
|
|
219
220
|
background_value = int(np.argmax(counts))
|
|
220
|
-
|
|
221
|
+
|
|
221
222
|
# Did we find a sensible mode that looks like a background value?
|
|
222
223
|
background_value_count = int(np.max(counts))
|
|
223
224
|
p_background_value = background_value_count / np.sum(counts)
|
|
224
|
-
|
|
225
|
+
|
|
225
226
|
if (p_background_value < options.min_background_fraction):
|
|
226
227
|
return crop_to_solid_region_result
|
|
227
228
|
else:
|
|
228
229
|
p_success = 1.0
|
|
229
|
-
|
|
230
|
+
|
|
230
231
|
analysis_image = cv2.cvtColor(rough_crop_np, cv2.COLOR_BGR2GRAY)
|
|
231
|
-
analysis_image = analysis_image.astype('uint8')
|
|
232
|
-
analysis_image = cv2.medianBlur(analysis_image,3)
|
|
233
|
-
|
|
232
|
+
analysis_image = analysis_image.astype('uint8')
|
|
233
|
+
analysis_image = cv2.medianBlur(analysis_image,3)
|
|
234
|
+
|
|
234
235
|
# This will now be a binary image indicating which pixels are background
|
|
235
236
|
analysis_image = cv2.inRange(analysis_image,
|
|
236
237
|
background_value-options.background_tolerance,
|
|
237
238
|
background_value+options.background_tolerance)
|
|
238
|
-
|
|
239
|
-
# Use row heuristics to refine the crop
|
|
239
|
+
|
|
240
|
+
# Use row heuristics to refine the crop
|
|
240
241
|
h = analysis_image.shape[0]
|
|
241
242
|
w = analysis_image.shape[1]
|
|
242
|
-
|
|
243
|
+
|
|
243
244
|
min_x = 0
|
|
244
245
|
min_y = -1
|
|
245
246
|
max_x = w
|
|
246
247
|
max_y = -1
|
|
247
|
-
|
|
248
|
+
|
|
248
249
|
# Find the first and last row that are mostly the background color
|
|
249
250
|
for y in range(h):
|
|
250
251
|
row_count = 0
|
|
@@ -256,20 +257,20 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
|
|
|
256
257
|
if min_y == -1:
|
|
257
258
|
min_y = y
|
|
258
259
|
max_y = y
|
|
259
|
-
|
|
260
|
+
|
|
260
261
|
assert (min_y == -1 and max_y == -1) or (min_y != -1 and max_y != -1)
|
|
261
|
-
|
|
262
|
+
|
|
262
263
|
if min_y == -1:
|
|
263
264
|
return crop_to_solid_region_result
|
|
264
|
-
|
|
265
|
+
|
|
265
266
|
if max_y == min_y:
|
|
266
267
|
return crop_to_solid_region_result
|
|
267
|
-
|
|
268
|
+
|
|
268
269
|
x = min_x
|
|
269
270
|
y = min_y
|
|
270
271
|
w = max_x-min_x
|
|
271
272
|
h = max_y-min_y
|
|
272
|
-
|
|
273
|
+
|
|
273
274
|
x = min_x
|
|
274
275
|
y = min_y
|
|
275
276
|
w = max_x-min_x
|
|
@@ -277,7 +278,7 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
|
|
|
277
278
|
|
|
278
279
|
# Crop the image
|
|
279
280
|
crop_np = rough_crop_np[y:y+h,x:x+w]
|
|
280
|
-
|
|
281
|
+
|
|
281
282
|
# Tesseract doesn't like characters really close to the edge, so pad a little.
|
|
282
283
|
crop_padding = options.crop_padding
|
|
283
284
|
padded_crop_np = cv2.copyMakeBorder(crop_np,crop_padding,crop_padding,crop_padding,crop_padding,
|
|
@@ -286,39 +287,39 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
|
|
|
286
287
|
|
|
287
288
|
crop_pil = Image.fromarray(crop_np)
|
|
288
289
|
padded_crop_pil = Image.fromarray(padded_crop_np)
|
|
289
|
-
|
|
290
|
+
|
|
290
291
|
crop_to_solid_region_result['crop_pil'] = crop_pil
|
|
291
292
|
crop_to_solid_region_result['padded_crop_pil'] = padded_crop_pil
|
|
292
293
|
crop_to_solid_region_result['p_success'] = p_success
|
|
293
|
-
|
|
294
|
+
|
|
294
295
|
return crop_to_solid_region_result
|
|
295
|
-
|
|
296
|
-
# ...crop_to_solid_region(...)
|
|
296
|
+
|
|
297
|
+
# ...crop_to_solid_region(...)
|
|
297
298
|
|
|
298
299
|
|
|
299
300
|
def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
|
|
300
301
|
"""
|
|
301
|
-
Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
|
|
302
|
+
Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
|
|
302
303
|
regions by the time they get to this function, roughly the top or bottom 20% of an image.
|
|
303
|
-
|
|
304
|
+
|
|
304
305
|
Args:
|
|
305
306
|
rough_crops (list): list of Image objects that have been cropped close to text
|
|
306
307
|
options (DatetimeExtractionOptions, optional): OCR parameters
|
|
307
308
|
tesseract_config_string (str, optional): optional CLI argument to pass to tesseract.exe
|
|
308
|
-
|
|
309
|
+
|
|
309
310
|
Returns:
|
|
310
311
|
dict: a dict with keys "top" and "bottom", where each value is a dict with keys
|
|
311
312
|
'text' (text found, if any) and 'crop_to_solid_region_results' (metadata about the OCR pass)
|
|
312
313
|
"""
|
|
313
|
-
|
|
314
|
+
|
|
314
315
|
if options is None:
|
|
315
316
|
options = DatetimeExtractionOptions()
|
|
316
|
-
|
|
317
|
+
|
|
317
318
|
if tesseract_config_string is None:
|
|
318
319
|
tesseract_config_string = options.tesseract_config_strings[0]
|
|
319
|
-
|
|
320
|
+
|
|
320
321
|
find_text_in_crops_results = {}
|
|
321
|
-
|
|
322
|
+
|
|
322
323
|
# crop_location = 'top'
|
|
323
324
|
# crop_location = 'bottom'
|
|
324
325
|
for crop_location in ('top','bottom'):
|
|
@@ -326,51 +327,51 @@ def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
|
|
|
326
327
|
find_text_in_crops_results[crop_location] = {}
|
|
327
328
|
find_text_in_crops_results[crop_location]['text'] = ''
|
|
328
329
|
find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = None
|
|
329
|
-
|
|
330
|
+
|
|
330
331
|
rough_crop = rough_crops[crop_location]
|
|
331
|
-
|
|
332
|
+
|
|
332
333
|
# Crop to the portion of the rough crop with a solid background color
|
|
333
334
|
crop_to_solid_region_results = crop_to_solid_region(rough_crop,crop_location,options)
|
|
334
|
-
|
|
335
|
+
|
|
335
336
|
find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = \
|
|
336
337
|
crop_to_solid_region_results
|
|
337
|
-
|
|
338
|
+
|
|
338
339
|
# Try cropping to a solid region; if that doesn't work, try running OCR on the whole
|
|
339
340
|
# rough crop.
|
|
340
341
|
if crop_to_solid_region_results['p_success'] >= options.p_crop_success_threshold:
|
|
341
342
|
padded_crop_pil = crop_to_solid_region_results['padded_crop_pil']
|
|
342
|
-
else:
|
|
343
|
+
else:
|
|
343
344
|
# continue
|
|
344
|
-
padded_crop_pil = rough_crop
|
|
345
|
-
|
|
345
|
+
padded_crop_pil = rough_crop
|
|
346
|
+
|
|
346
347
|
if options.apply_sharpening_filter:
|
|
347
348
|
padded_crop_pil = padded_crop_pil.filter(ImageFilter.SHARPEN)
|
|
348
|
-
|
|
349
|
+
|
|
349
350
|
# Find text in the padded crop
|
|
350
351
|
pytesseract.pytesseract.tesseract_cmd = options.tesseract_cmd
|
|
351
|
-
text = pytesseract.image_to_string(padded_crop_pil, lang='eng',
|
|
352
|
+
text = pytesseract.image_to_string(padded_crop_pil, lang='eng',
|
|
352
353
|
config=tesseract_config_string)
|
|
353
|
-
|
|
354
|
+
|
|
354
355
|
text = text.replace('\n', ' ').replace('\r', '').strip()
|
|
355
356
|
|
|
356
|
-
find_text_in_crops_results[crop_location]['text'] = text
|
|
357
|
-
|
|
357
|
+
find_text_in_crops_results[crop_location]['text'] = text
|
|
358
|
+
|
|
358
359
|
# ...for each cropped region
|
|
359
|
-
|
|
360
|
+
|
|
360
361
|
return find_text_in_crops_results
|
|
361
|
-
|
|
362
|
+
|
|
362
363
|
# ...def find_text_in_crops(...)
|
|
363
|
-
|
|
364
|
+
|
|
364
365
|
|
|
365
366
|
def _datetime_string_to_datetime(matched_string):
|
|
366
367
|
"""
|
|
367
368
|
Takes an OCR-matched datetime string, does a little cleanup, and parses a date
|
|
368
369
|
from it.
|
|
369
|
-
|
|
370
|
+
|
|
370
371
|
By the time a string gets to this function, it should be a proper date string, with
|
|
371
372
|
no extraneous characters other than spaces around colons or hyphens.
|
|
372
373
|
"""
|
|
373
|
-
|
|
374
|
+
|
|
374
375
|
matched_string = matched_string.replace(' -','-')
|
|
375
376
|
matched_string = matched_string.replace('- ','-')
|
|
376
377
|
matched_string = matched_string.replace(' :',':')
|
|
@@ -386,155 +387,155 @@ def _get_datetime_from_strings(strings,options=None):
|
|
|
386
387
|
"""
|
|
387
388
|
Given a string or list of strings, search for exactly one datetime in those strings.
|
|
388
389
|
using a series of regular expressions.
|
|
389
|
-
|
|
390
|
+
|
|
390
391
|
Strings are currently just concatenated before searching for a datetime.
|
|
391
392
|
"""
|
|
392
|
-
|
|
393
|
+
|
|
393
394
|
if options is None:
|
|
394
|
-
options = DatetimeExtractionOptions()
|
|
395
|
-
|
|
395
|
+
options = DatetimeExtractionOptions()
|
|
396
|
+
|
|
396
397
|
if isinstance(strings,str):
|
|
397
398
|
s = strings
|
|
398
399
|
else:
|
|
399
400
|
s = ' '.join(strings).lower()
|
|
400
|
-
s = s.replace('—','-')
|
|
401
|
+
s = s.replace('—','-')
|
|
401
402
|
s = ''.join(e for e in s if e.isalnum() or e in ':-/' or e.isspace())
|
|
402
|
-
|
|
403
|
+
|
|
403
404
|
### AM/PM
|
|
404
|
-
|
|
405
|
+
|
|
405
406
|
# 2013-10-02 11:40:50 AM
|
|
406
|
-
m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
407
|
-
if m is not None:
|
|
408
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
409
|
-
|
|
407
|
+
m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
408
|
+
if m is not None:
|
|
409
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
410
|
+
|
|
410
411
|
# 04/01/2017 08:54:00AM
|
|
411
|
-
m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
412
|
-
if m is not None:
|
|
413
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
414
|
-
|
|
412
|
+
m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
413
|
+
if m is not None:
|
|
414
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
415
|
+
|
|
415
416
|
# 2017/04/01 08:54:00AM
|
|
416
|
-
m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
417
|
-
if m is not None:
|
|
418
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
419
|
-
|
|
417
|
+
m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
418
|
+
if m is not None:
|
|
419
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
420
|
+
|
|
420
421
|
# 04/01/2017 08:54AM
|
|
421
|
-
m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
422
|
-
if m is not None:
|
|
423
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
424
|
-
|
|
422
|
+
m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
423
|
+
if m is not None:
|
|
424
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
425
|
+
|
|
425
426
|
# 2017/04/01 08:54AM
|
|
426
|
-
m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
427
|
-
if m is not None:
|
|
428
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
429
|
-
|
|
427
|
+
m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
|
|
428
|
+
if m is not None:
|
|
429
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
430
|
+
|
|
430
431
|
### No AM/PM
|
|
431
|
-
|
|
432
|
+
|
|
432
433
|
# 2013-07-27 04:56:35
|
|
433
|
-
m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
434
|
-
if m is not None:
|
|
435
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
436
|
-
|
|
434
|
+
m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
435
|
+
if m is not None:
|
|
436
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
437
|
+
|
|
437
438
|
# 07-27-2013 04:56:35
|
|
438
|
-
m = re.search('(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
439
|
-
if m is not None:
|
|
440
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
441
|
-
|
|
439
|
+
m = re.search(r'(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
440
|
+
if m is not None:
|
|
441
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
442
|
+
|
|
442
443
|
# 2013/07/27 04:56:35
|
|
443
|
-
m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
444
|
-
if m is not None:
|
|
445
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
446
|
-
|
|
444
|
+
m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
445
|
+
if m is not None:
|
|
446
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
447
|
+
|
|
447
448
|
# 07/27/2013 04:56:35
|
|
448
|
-
m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
449
|
-
if m is not None:
|
|
450
|
-
return _datetime_string_to_datetime(m.group(0))
|
|
451
|
-
|
|
449
|
+
m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
|
|
450
|
+
if m is not None:
|
|
451
|
+
return _datetime_string_to_datetime(m.group(0))
|
|
452
|
+
|
|
452
453
|
return None
|
|
453
|
-
|
|
454
|
+
|
|
454
455
|
# ...def _get_datetime_from_strings(...)
|
|
455
456
|
|
|
456
457
|
|
|
457
458
|
def get_datetime_from_image(image,include_crops=True,options=None):
|
|
458
459
|
"""
|
|
459
460
|
Tries to find the datetime string (if present) in an image.
|
|
460
|
-
|
|
461
|
+
|
|
461
462
|
Args:
|
|
462
463
|
image (Image or str): the PIL Image object or image filename in which we should look for
|
|
463
464
|
datetime information.
|
|
464
465
|
include_crops (bool, optional): whether to include cropped images in the return dict (set
|
|
465
466
|
this to False if you're worried about size and you're processing a zillion images)
|
|
466
|
-
options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
|
|
467
|
+
options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
|
|
467
468
|
DatetimeExtractionOptions object or a list of options to try
|
|
468
|
-
|
|
469
|
+
|
|
469
470
|
Returns:
|
|
470
471
|
dict: a dict with fields:
|
|
471
|
-
|
|
472
|
+
|
|
472
473
|
- datetime: Python datetime object, or None
|
|
473
474
|
- text_results: length-2 list of strings
|
|
474
|
-
- all_extracted_datetimes: if we ran multiple option sets, this will contain the
|
|
475
|
+
- all_extracted_datetimes: if we ran multiple option sets, this will contain the
|
|
475
476
|
datetimes extracted for each option set
|
|
476
477
|
- ocr_results: detailed results from the OCR process, including crops as PIL images;
|
|
477
478
|
only included if include_crops is True
|
|
478
479
|
"""
|
|
479
|
-
|
|
480
|
+
|
|
480
481
|
if options is None:
|
|
481
482
|
options = DatetimeExtractionOptions()
|
|
482
|
-
|
|
483
|
+
|
|
483
484
|
if isinstance(image,str):
|
|
484
485
|
image = vis_utils.open_image(image)
|
|
485
486
|
|
|
486
487
|
# Crop the top and bottom from the image
|
|
487
488
|
rough_crops = make_rough_crops(image,options)
|
|
488
489
|
assert len(rough_crops) == 2
|
|
489
|
-
|
|
490
|
+
|
|
490
491
|
all_extracted_datetimes = {}
|
|
491
492
|
all_text_results = []
|
|
492
493
|
all_ocr_results = []
|
|
493
|
-
|
|
494
|
+
|
|
494
495
|
extracted_datetime = None
|
|
495
|
-
|
|
496
|
+
|
|
496
497
|
# Find text, possibly trying all config strings
|
|
497
498
|
#
|
|
498
499
|
# tesseract_config_string = options.tesseract_config_strings[0]
|
|
499
500
|
for tesseract_config_string in options.tesseract_config_strings:
|
|
500
|
-
|
|
501
|
+
|
|
501
502
|
ocr_results = find_text_in_crops(rough_crops,options,tesseract_config_string)
|
|
502
503
|
all_ocr_results.append(ocr_results)
|
|
503
|
-
|
|
504
|
+
|
|
504
505
|
text_results = [v['text'] for v in ocr_results.values()]
|
|
505
506
|
assert len(text_results) == 2
|
|
506
507
|
all_text_results.append(text_results)
|
|
507
|
-
|
|
508
|
+
|
|
508
509
|
# Find datetime
|
|
509
510
|
extracted_datetime_this_option_set = _get_datetime_from_strings(text_results,options)
|
|
510
511
|
assert isinstance(extracted_datetime_this_option_set,datetime.datetime) or \
|
|
511
512
|
(extracted_datetime_this_option_set is None)
|
|
512
|
-
|
|
513
|
+
|
|
513
514
|
all_extracted_datetimes[tesseract_config_string] = \
|
|
514
515
|
extracted_datetime_this_option_set
|
|
515
|
-
|
|
516
|
+
|
|
516
517
|
if extracted_datetime_this_option_set is not None:
|
|
517
518
|
if extracted_datetime is None:
|
|
518
519
|
extracted_datetime = extracted_datetime_this_option_set
|
|
519
520
|
if not options.force_all_ocr_options:
|
|
520
|
-
break
|
|
521
|
-
|
|
521
|
+
break
|
|
522
|
+
|
|
522
523
|
# ...for each set of OCR options
|
|
523
|
-
|
|
524
|
-
if extracted_datetime is not None:
|
|
524
|
+
|
|
525
|
+
if extracted_datetime is not None:
|
|
525
526
|
assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
|
|
526
527
|
|
|
527
528
|
to_return = {}
|
|
528
529
|
to_return['datetime'] = extracted_datetime
|
|
529
|
-
|
|
530
|
+
|
|
530
531
|
to_return['text_results'] = all_text_results
|
|
531
532
|
to_return['all_extracted_datetimes'] = all_extracted_datetimes
|
|
532
|
-
|
|
533
|
+
|
|
533
534
|
if include_crops:
|
|
534
535
|
to_return['ocr_results'] = all_ocr_results
|
|
535
536
|
else:
|
|
536
537
|
to_return['ocr_results'] = None
|
|
537
|
-
|
|
538
|
+
|
|
538
539
|
return to_return
|
|
539
540
|
|
|
540
541
|
# ...def get_datetime_from_image(...)
|
|
@@ -544,34 +545,34 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
|
|
|
544
545
|
"""
|
|
545
546
|
Try/catch wrapper for get_datetime_from_image, optionally trying multiple option sets
|
|
546
547
|
until we find a datetime.
|
|
547
|
-
|
|
548
|
+
|
|
548
549
|
Args:
|
|
549
|
-
|
|
550
|
-
datetime information.
|
|
550
|
+
filename (Image or str): the PIL Image object or image filename in which we should look
|
|
551
|
+
for datetime information.
|
|
551
552
|
include_crops (bool, optional): whether to include cropped images in the return dict (set
|
|
552
553
|
this to False if you're worried about size and you're processing a zillion images)
|
|
553
|
-
options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
|
|
554
|
+
options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
|
|
554
555
|
DatetimeExtractionOptions object or a list of options to try
|
|
555
|
-
|
|
556
|
+
|
|
556
557
|
Returns:
|
|
557
558
|
dict: A dict with fields:
|
|
558
559
|
- datetime: Python datetime object, or None
|
|
559
560
|
- text_results: length-2 list of strings
|
|
560
|
-
- all_extracted_datetimes: if we ran multiple option sets, this will contain the
|
|
561
|
+
- all_extracted_datetimes: if we ran multiple option sets, this will contain the
|
|
561
562
|
datetimes extracted for each option set
|
|
562
563
|
- ocr_results: detailed results from the OCR process, including crops as PIL images;
|
|
563
564
|
only included if include_crops is True
|
|
564
565
|
"""
|
|
565
|
-
|
|
566
|
+
|
|
566
567
|
if options is None:
|
|
567
568
|
options = DatetimeExtractionOptions()
|
|
568
569
|
|
|
569
570
|
if not is_iterable(options):
|
|
570
571
|
options = [options]
|
|
571
|
-
|
|
572
|
+
|
|
572
573
|
result = {}
|
|
573
574
|
result['error'] = None
|
|
574
|
-
|
|
575
|
+
|
|
575
576
|
for i_option_set,current_options in enumerate(options):
|
|
576
577
|
try:
|
|
577
578
|
result = get_datetime_from_image(filename,include_crops=include_crops,options=current_options)
|
|
@@ -580,79 +581,85 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
|
|
|
580
581
|
break
|
|
581
582
|
except Exception as e:
|
|
582
583
|
result['error'] = str(e)
|
|
583
|
-
|
|
584
|
+
|
|
584
585
|
return result
|
|
585
586
|
|
|
586
587
|
|
|
587
588
|
def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options=None,
|
|
588
589
|
n_workers=16,use_threads=False):
|
|
589
590
|
"""
|
|
590
|
-
The main entry point for this module. Tries to retrieve metadata from pixels for every
|
|
591
|
+
The main entry point for this module. Tries to retrieve metadata from pixels for every
|
|
591
592
|
image in [folder_name], optionally the results to the .json file [output_file].
|
|
592
|
-
|
|
593
|
+
|
|
593
594
|
Args:
|
|
594
595
|
folder_name (str): the folder of images to process recursively
|
|
595
596
|
output_file (str, optional): the .json file to which we should write results; if None,
|
|
596
597
|
just returns the results
|
|
597
598
|
n_to_sample (int, optional): for debugging only, used to limit the number of images
|
|
598
599
|
we process
|
|
599
|
-
options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
|
|
600
|
+
options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
|
|
600
601
|
DatetimeExtractionOptions object or a list of options to try for each image
|
|
601
602
|
n_workers (int, optional): the number of parallel workers to use; set to <= 1 to disable
|
|
602
603
|
parallelization
|
|
603
604
|
use_threads (bool, optional): whether to use threads (True) or processes (False) for
|
|
604
605
|
parallelization; not relevant if n_workers <= 1
|
|
605
|
-
|
|
606
|
+
|
|
606
607
|
Returns:
|
|
607
608
|
dict: a dict mapping filenames to datetime extraction results, see try_get_datetime_from_images
|
|
608
609
|
for the format of each value in the dict.
|
|
609
610
|
"""
|
|
610
|
-
|
|
611
|
+
|
|
611
612
|
if options is None:
|
|
612
613
|
options = DatetimeExtractionOptions()
|
|
613
|
-
|
|
614
|
+
|
|
614
615
|
image_file_names = \
|
|
615
616
|
find_images(folder_name,convert_slashes=True,
|
|
616
617
|
return_relative_paths=False,recursive=True)
|
|
617
|
-
|
|
618
|
+
|
|
618
619
|
if n_to_sample > 0:
|
|
619
620
|
import random
|
|
620
621
|
random.seed(0)
|
|
621
622
|
image_file_names = random.sample(image_file_names,n_to_sample)
|
|
622
|
-
|
|
623
|
+
|
|
623
624
|
if n_workers <= 1:
|
|
624
|
-
|
|
625
|
+
|
|
625
626
|
all_results = []
|
|
626
627
|
for fn_abs in tqdm(image_file_names):
|
|
627
628
|
all_results.append(try_get_datetime_from_image(fn_abs,options=options))
|
|
628
|
-
|
|
629
|
-
else:
|
|
630
|
-
|
|
629
|
+
|
|
630
|
+
else:
|
|
631
|
+
|
|
631
632
|
# Don't spawn more than one worker per image
|
|
632
633
|
if n_workers > len(image_file_names):
|
|
633
634
|
n_workers = len(image_file_names)
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
635
|
+
|
|
636
|
+
pool = None
|
|
637
|
+
try:
|
|
638
|
+
if use_threads:
|
|
639
|
+
from multiprocessing.pool import ThreadPool
|
|
640
|
+
pool = ThreadPool(n_workers)
|
|
641
|
+
worker_string = 'threads'
|
|
642
|
+
else:
|
|
643
|
+
from multiprocessing.pool import Pool
|
|
644
|
+
pool = Pool(n_workers)
|
|
645
|
+
worker_string = 'processes'
|
|
646
|
+
|
|
647
|
+
print('Starting a pool of {} {}'.format(n_workers,worker_string))
|
|
648
|
+
|
|
649
|
+
all_results = list(tqdm(pool.imap(
|
|
650
|
+
partial(try_get_datetime_from_image,options=options),image_file_names),
|
|
651
|
+
total=len(image_file_names)))
|
|
652
|
+
finally:
|
|
653
|
+
pool.close()
|
|
654
|
+
pool.join()
|
|
655
|
+
print("Pool closed and joined for datetime extraction")
|
|
656
|
+
|
|
650
657
|
filename_to_results = {}
|
|
651
|
-
|
|
658
|
+
|
|
652
659
|
# fn_relative = image_file_names[0]
|
|
653
660
|
for i_file,fn_abs in enumerate(image_file_names):
|
|
654
661
|
filename_to_results[fn_abs] = all_results[i_file]
|
|
655
|
-
|
|
662
|
+
|
|
656
663
|
if output_file is not None:
|
|
657
664
|
with open(output_file,'w') as f:
|
|
658
665
|
json.dump(filename_to_results,f,indent=1,default=str)
|
|
@@ -663,9 +670,9 @@ def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options
|
|
|
663
670
|
#%% Interactive driver
|
|
664
671
|
|
|
665
672
|
if False:
|
|
666
|
-
|
|
673
|
+
|
|
667
674
|
#%% Process images
|
|
668
|
-
|
|
675
|
+
|
|
669
676
|
folder_name = r'g:\temp\island_conservation_camera_traps'
|
|
670
677
|
output_file = r'g:\temp\ocr_results.json'
|
|
671
678
|
from megadetector.utils.path_utils import insert_before_extension
|
|
@@ -681,60 +688,60 @@ if False:
|
|
|
681
688
|
all_options = [options_a]
|
|
682
689
|
filename_to_results = get_datetimes_for_folder(folder_name,output_file,
|
|
683
690
|
n_to_sample=n_to_sample,options=all_options)
|
|
684
|
-
|
|
691
|
+
|
|
685
692
|
|
|
686
693
|
#%% Load results
|
|
687
|
-
|
|
694
|
+
|
|
688
695
|
# output_file = r"G:\temp\ocr_results.2023.10.31.07.37.54.json"
|
|
689
696
|
with open(output_file,'r') as f:
|
|
690
697
|
filename_to_results = json.load(f)
|
|
691
698
|
filenames = sorted(list(filename_to_results.keys()))
|
|
692
699
|
print('Loaded results for {} files'.format(len(filename_to_results)))
|
|
693
|
-
|
|
694
|
-
|
|
700
|
+
|
|
701
|
+
|
|
695
702
|
#%% Scrap cell
|
|
696
|
-
|
|
703
|
+
|
|
697
704
|
fn = 'g:/camera_traps/camera_trap_images/2018.07.02/newcam/people/DSCF0273.JPG'
|
|
698
705
|
include_crops = False
|
|
699
706
|
options_a = DatetimeExtractionOptions()
|
|
700
707
|
options_b = DatetimeExtractionOptions()
|
|
701
708
|
options_b.image_crop_fraction = [0.08 , 0.08]
|
|
702
|
-
image = vis_utils.open_image(fn) # noqa
|
|
709
|
+
image = vis_utils.open_image(fn) # noqa
|
|
703
710
|
result = try_get_datetime_from_image(fn,options=[options_a,options_b]) # noqa
|
|
704
711
|
print(result)
|
|
705
|
-
|
|
712
|
+
|
|
706
713
|
# open_file(fn)
|
|
707
714
|
# rough_crops = make_rough_crops(image,options=options)
|
|
708
|
-
|
|
709
|
-
|
|
715
|
+
|
|
716
|
+
|
|
710
717
|
#%% Look for OCR or parsing failures
|
|
711
|
-
|
|
718
|
+
|
|
712
719
|
bad_tokens = ()
|
|
713
|
-
|
|
720
|
+
|
|
714
721
|
files_with_disagreements = set()
|
|
715
|
-
|
|
722
|
+
|
|
716
723
|
# i_fn = 0; fn = filenames[i_fn]
|
|
717
724
|
for i_fn,fn in enumerate(filenames):
|
|
718
|
-
|
|
725
|
+
|
|
719
726
|
image = fn
|
|
720
727
|
results = filename_to_results[fn]
|
|
721
|
-
|
|
728
|
+
|
|
722
729
|
if 'text_results' not in results:
|
|
723
730
|
raise Exception('no results available for {} ({})'.format(i_fn,fn))
|
|
724
731
|
print('Skipping {}, no results'.format(i_fn))
|
|
725
732
|
continue
|
|
726
|
-
|
|
733
|
+
|
|
727
734
|
s = ' '.join([x[0] for x in results['text_results']])
|
|
728
|
-
|
|
735
|
+
|
|
729
736
|
known_bad = False
|
|
730
737
|
for bad_token in bad_tokens:
|
|
731
738
|
if bad_token in s:
|
|
732
739
|
known_bad = True
|
|
733
|
-
if known_bad:
|
|
740
|
+
if known_bad:
|
|
734
741
|
continue
|
|
735
|
-
|
|
742
|
+
|
|
736
743
|
extracted_datetime = results['datetime']
|
|
737
|
-
|
|
744
|
+
|
|
738
745
|
# If we have a datetime, make sure all successful OCR results agree
|
|
739
746
|
if extracted_datetime is not None:
|
|
740
747
|
for config_string in results['all_extracted_datetimes']:
|
|
@@ -745,19 +752,19 @@ if False:
|
|
|
745
752
|
print('Falling back for {} ({})'.format(i_fn,fn))
|
|
746
753
|
ocr_results = get_datetime_from_image(fn)
|
|
747
754
|
extracted_datetime = ocr_results['datetime']
|
|
748
|
-
|
|
755
|
+
|
|
749
756
|
if extracted_datetime is None:
|
|
750
757
|
print('Failure at {}: {}'.format(i_fn,s))
|
|
751
|
-
|
|
758
|
+
|
|
752
759
|
# open_file(fn)
|
|
753
760
|
# get_datetime_from_image(fn)
|
|
754
|
-
|
|
755
|
-
|
|
761
|
+
|
|
762
|
+
|
|
756
763
|
#%% Write results to an HTML file for testing
|
|
757
|
-
|
|
764
|
+
|
|
758
765
|
n_to_sample = 5000
|
|
759
766
|
if (n_to_sample >= 0) and (len(filename_to_results) > n_to_sample):
|
|
760
|
-
filenames = sorted(list(filename_to_results.keys()))
|
|
767
|
+
filenames = sorted(list(filename_to_results.keys()))
|
|
761
768
|
import random
|
|
762
769
|
random.seed(0)
|
|
763
770
|
keys = random.sample(filenames,n_to_sample)
|
|
@@ -765,18 +772,18 @@ if False:
|
|
|
765
772
|
|
|
766
773
|
preview_dir = r'g:\temp\ocr-preview'
|
|
767
774
|
os.makedirs(preview_dir,exist_ok=True)
|
|
768
|
-
|
|
775
|
+
|
|
769
776
|
def resize_image_for_preview(fn_abs):
|
|
770
|
-
fn_relative = os.path.relpath(fn_abs,folder_name)
|
|
777
|
+
fn_relative = os.path.relpath(fn_abs,folder_name)
|
|
771
778
|
resized_image = vis_utils.resize_image(fn_abs,target_width=600)
|
|
772
779
|
resized_fn = os.path.join(preview_dir,fn_relative)
|
|
773
780
|
os.makedirs(os.path.dirname(resized_fn),exist_ok=True)
|
|
774
781
|
resized_image.save(resized_fn)
|
|
775
782
|
return resized_fn
|
|
776
|
-
|
|
783
|
+
|
|
777
784
|
# Resize images in parallel
|
|
778
785
|
n_rendering_workers = 16
|
|
779
|
-
|
|
786
|
+
|
|
780
787
|
if n_rendering_workers <= 1:
|
|
781
788
|
for fn_abs in tqdm(filename_to_results.keys()):
|
|
782
789
|
resize_image_for_preview(fn_abs)
|
|
@@ -784,64 +791,64 @@ if False:
|
|
|
784
791
|
# from multiprocessing.pool import Pool as RenderingPool; worker_string = 'processes'
|
|
785
792
|
from multiprocessing.pool import ThreadPool as RenderingPool; worker_string = 'threads'
|
|
786
793
|
pool = RenderingPool(n_rendering_workers)
|
|
787
|
-
|
|
794
|
+
|
|
788
795
|
print('Starting rendering pool with {} {}'.format(n_rendering_workers,worker_string))
|
|
789
|
-
|
|
796
|
+
|
|
790
797
|
_ = list(tqdm(pool.imap(resize_image_for_preview,filename_to_results.keys()),
|
|
791
798
|
total=len(filename_to_results)))
|
|
792
|
-
|
|
793
|
-
|
|
799
|
+
|
|
800
|
+
|
|
794
801
|
def make_datetime_preview_page(filenames,html_file):
|
|
795
|
-
|
|
802
|
+
|
|
796
803
|
html_image_list = []
|
|
797
804
|
html_options = write_html_image_list.write_html_image_list()
|
|
798
805
|
html_options['maxFiguresPerHtmlFile'] = 2500
|
|
799
806
|
html_options['defaultImageStyle'] = 'margin:0px;margin-top:5px;margin-bottom:30px;'
|
|
800
|
-
|
|
807
|
+
|
|
801
808
|
# fn_abs = filenames[0]
|
|
802
809
|
for fn_abs in filenames:
|
|
803
|
-
|
|
804
|
-
fn_relative = os.path.relpath(fn_abs,folder_name)
|
|
810
|
+
|
|
811
|
+
fn_relative = os.path.relpath(fn_abs,folder_name)
|
|
805
812
|
# resized_fn = os.path.join(preview_dir,fn_relative)
|
|
806
813
|
results_this_image = filename_to_results[fn_abs]
|
|
807
|
-
|
|
814
|
+
|
|
808
815
|
extracted_datetime = results_this_image['datetime']
|
|
809
816
|
title = 'Image: {}<br/>Extracted datetime: {}'.format(fn_relative,extracted_datetime)
|
|
810
817
|
html_image_list.append({'filename':fn_relative,'title':title})
|
|
811
|
-
|
|
818
|
+
|
|
812
819
|
# ...for each crop
|
|
813
|
-
|
|
820
|
+
|
|
814
821
|
# ...for each image
|
|
815
|
-
|
|
822
|
+
|
|
816
823
|
html_options['makeRelative'] = True
|
|
817
824
|
write_html_image_list.write_html_image_list(html_file,
|
|
818
825
|
html_image_list,
|
|
819
826
|
html_options)
|
|
820
827
|
open_file(html_file)
|
|
821
828
|
return html_image_list
|
|
822
|
-
|
|
829
|
+
|
|
823
830
|
failed_files = []
|
|
824
831
|
for fn_abs in filename_to_results:
|
|
825
832
|
results_this_image = filename_to_results[fn_abs]
|
|
826
833
|
if results_this_image['datetime'] is None:
|
|
827
834
|
failed_files.append(fn_abs)
|
|
828
|
-
|
|
835
|
+
|
|
829
836
|
print('Found {} failures'.format(len(failed_files)))
|
|
830
|
-
|
|
837
|
+
|
|
831
838
|
output_summary_file = os.path.join(preview_dir,'summary.html')
|
|
832
839
|
html_image_list = make_datetime_preview_page(sorted(list(filename_to_results.keys())),output_summary_file)
|
|
833
|
-
|
|
834
|
-
failure_summary_file = os.path.join(preview_dir,'failures.html')
|
|
840
|
+
|
|
841
|
+
failure_summary_file = os.path.join(preview_dir,'failures.html')
|
|
835
842
|
html_image_list_failures = make_datetime_preview_page(failed_files,failure_summary_file)
|
|
836
|
-
|
|
843
|
+
|
|
837
844
|
filenames = failed_files
|
|
838
845
|
html_file = failure_summary_file
|
|
839
846
|
|
|
840
|
-
|
|
847
|
+
|
|
841
848
|
#%% Other approaches to getting dates from strings
|
|
842
|
-
|
|
849
|
+
|
|
843
850
|
# ...that didn't really work out.
|
|
844
|
-
|
|
851
|
+
|
|
845
852
|
# pip install dateparser
|
|
846
853
|
import dateparser
|
|
847
854
|
|
|
@@ -853,7 +860,7 @@ if False:
|
|
|
853
860
|
dateparser_settings = {'PREFER_DATES_FROM':'past','STRICT_PARSING':True}
|
|
854
861
|
|
|
855
862
|
dateparser_result = dateparser.search.search_dates(s, settings=dateparser_settings)
|
|
856
|
-
|
|
863
|
+
|
|
857
864
|
if dateparser_result is not None:
|
|
858
865
|
assert len(dateparser_result) == 1
|
|
859
866
|
extracted_datetime = dateparser_result[0][1]
|
|
@@ -864,7 +871,7 @@ if False:
|
|
|
864
871
|
extracted_datetime = matches_list[0]
|
|
865
872
|
else:
|
|
866
873
|
extracted_datetime = None
|
|
867
|
-
|
|
868
|
-
if extracted_datetime is not None:
|
|
874
|
+
|
|
875
|
+
if extracted_datetime is not None:
|
|
869
876
|
assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
|
|
870
877
|
|