megadetector 5.0.11__py3-none-any.whl → 5.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (201)
  1. megadetector/api/__init__.py +0 -0
  2. megadetector/api/batch_processing/__init__.py +0 -0
  3. megadetector/api/batch_processing/api_core/__init__.py +0 -0
  4. megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
  5. megadetector/api/batch_processing/api_core/batch_service/score.py +439 -0
  6. megadetector/api/batch_processing/api_core/server.py +294 -0
  7. megadetector/api/batch_processing/api_core/server_api_config.py +98 -0
  8. megadetector/api/batch_processing/api_core/server_app_config.py +55 -0
  9. megadetector/api/batch_processing/api_core/server_batch_job_manager.py +220 -0
  10. megadetector/api/batch_processing/api_core/server_job_status_table.py +152 -0
  11. megadetector/api/batch_processing/api_core/server_orchestration.py +360 -0
  12. megadetector/api/batch_processing/api_core/server_utils.py +92 -0
  13. megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
  14. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +46 -0
  15. megadetector/api/batch_processing/api_support/__init__.py +0 -0
  16. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +152 -0
  17. megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
  18. megadetector/api/batch_processing/integration/digiKam/setup.py +6 -0
  19. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +465 -0
  20. megadetector/api/batch_processing/integration/eMammal/test_scripts/config_template.py +5 -0
  21. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +126 -0
  22. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +55 -0
  23. megadetector/api/synchronous/__init__.py +0 -0
  24. megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
  25. megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +152 -0
  26. megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -0
  27. megadetector/api/synchronous/api_core/animal_detection_api/config.py +35 -0
  28. megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
  29. megadetector/api/synchronous/api_core/tests/load_test.py +110 -0
  30. megadetector/classification/__init__.py +0 -0
  31. megadetector/classification/aggregate_classifier_probs.py +108 -0
  32. megadetector/classification/analyze_failed_images.py +227 -0
  33. megadetector/classification/cache_batchapi_outputs.py +198 -0
  34. megadetector/classification/create_classification_dataset.py +627 -0
  35. megadetector/classification/crop_detections.py +516 -0
  36. megadetector/classification/csv_to_json.py +226 -0
  37. megadetector/classification/detect_and_crop.py +855 -0
  38. megadetector/classification/efficientnet/__init__.py +9 -0
  39. megadetector/classification/efficientnet/model.py +415 -0
  40. megadetector/classification/efficientnet/utils.py +610 -0
  41. megadetector/classification/evaluate_model.py +520 -0
  42. megadetector/classification/identify_mislabeled_candidates.py +152 -0
  43. megadetector/classification/json_to_azcopy_list.py +63 -0
  44. megadetector/classification/json_validator.py +699 -0
  45. megadetector/classification/map_classification_categories.py +276 -0
  46. megadetector/classification/merge_classification_detection_output.py +506 -0
  47. megadetector/classification/prepare_classification_script.py +194 -0
  48. megadetector/classification/prepare_classification_script_mc.py +228 -0
  49. megadetector/classification/run_classifier.py +287 -0
  50. megadetector/classification/save_mislabeled.py +110 -0
  51. megadetector/classification/train_classifier.py +827 -0
  52. megadetector/classification/train_classifier_tf.py +725 -0
  53. megadetector/classification/train_utils.py +323 -0
  54. megadetector/data_management/__init__.py +0 -0
  55. megadetector/data_management/annotations/__init__.py +0 -0
  56. megadetector/data_management/annotations/annotation_constants.py +34 -0
  57. megadetector/data_management/camtrap_dp_to_coco.py +239 -0
  58. megadetector/data_management/cct_json_utils.py +395 -0
  59. megadetector/data_management/cct_to_md.py +176 -0
  60. megadetector/data_management/cct_to_wi.py +289 -0
  61. megadetector/data_management/coco_to_labelme.py +272 -0
  62. megadetector/data_management/coco_to_yolo.py +662 -0
  63. megadetector/data_management/databases/__init__.py +0 -0
  64. megadetector/data_management/databases/add_width_and_height_to_db.py +33 -0
  65. megadetector/data_management/databases/combine_coco_camera_traps_files.py +206 -0
  66. megadetector/data_management/databases/integrity_check_json_db.py +477 -0
  67. megadetector/data_management/databases/subset_json_db.py +115 -0
  68. megadetector/data_management/generate_crops_from_cct.py +149 -0
  69. megadetector/data_management/get_image_sizes.py +189 -0
  70. megadetector/data_management/importers/add_nacti_sizes.py +52 -0
  71. megadetector/data_management/importers/add_timestamps_to_icct.py +79 -0
  72. megadetector/data_management/importers/animl_results_to_md_results.py +158 -0
  73. megadetector/data_management/importers/auckland_doc_test_to_json.py +373 -0
  74. megadetector/data_management/importers/auckland_doc_to_json.py +201 -0
  75. megadetector/data_management/importers/awc_to_json.py +191 -0
  76. megadetector/data_management/importers/bellevue_to_json.py +273 -0
  77. megadetector/data_management/importers/cacophony-thermal-importer.py +796 -0
  78. megadetector/data_management/importers/carrizo_shrubfree_2018.py +269 -0
  79. megadetector/data_management/importers/carrizo_trail_cam_2017.py +289 -0
  80. megadetector/data_management/importers/cct_field_adjustments.py +58 -0
  81. megadetector/data_management/importers/channel_islands_to_cct.py +913 -0
  82. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +180 -0
  83. megadetector/data_management/importers/eMammal/eMammal_helpers.py +249 -0
  84. megadetector/data_management/importers/eMammal/make_eMammal_json.py +223 -0
  85. megadetector/data_management/importers/ena24_to_json.py +276 -0
  86. megadetector/data_management/importers/filenames_to_json.py +386 -0
  87. megadetector/data_management/importers/helena_to_cct.py +283 -0
  88. megadetector/data_management/importers/idaho-camera-traps.py +1407 -0
  89. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +294 -0
  90. megadetector/data_management/importers/jb_csv_to_json.py +150 -0
  91. megadetector/data_management/importers/mcgill_to_json.py +250 -0
  92. megadetector/data_management/importers/missouri_to_json.py +490 -0
  93. megadetector/data_management/importers/nacti_fieldname_adjustments.py +79 -0
  94. megadetector/data_management/importers/noaa_seals_2019.py +181 -0
  95. megadetector/data_management/importers/pc_to_json.py +365 -0
  96. megadetector/data_management/importers/plot_wni_giraffes.py +123 -0
  97. megadetector/data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -0
  98. megadetector/data_management/importers/prepare_zsl_imerit.py +131 -0
  99. megadetector/data_management/importers/rspb_to_json.py +356 -0
  100. megadetector/data_management/importers/save_the_elephants_survey_A.py +320 -0
  101. megadetector/data_management/importers/save_the_elephants_survey_B.py +329 -0
  102. megadetector/data_management/importers/snapshot_safari_importer.py +758 -0
  103. megadetector/data_management/importers/snapshot_safari_importer_reprise.py +665 -0
  104. megadetector/data_management/importers/snapshot_serengeti_lila.py +1067 -0
  105. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +150 -0
  106. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +153 -0
  107. megadetector/data_management/importers/sulross_get_exif.py +65 -0
  108. megadetector/data_management/importers/timelapse_csv_set_to_json.py +490 -0
  109. megadetector/data_management/importers/ubc_to_json.py +399 -0
  110. megadetector/data_management/importers/umn_to_json.py +507 -0
  111. megadetector/data_management/importers/wellington_to_json.py +263 -0
  112. megadetector/data_management/importers/wi_to_json.py +442 -0
  113. megadetector/data_management/importers/zamba_results_to_md_results.py +181 -0
  114. megadetector/data_management/labelme_to_coco.py +547 -0
  115. megadetector/data_management/labelme_to_yolo.py +272 -0
  116. megadetector/data_management/lila/__init__.py +0 -0
  117. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +97 -0
  118. megadetector/data_management/lila/add_locations_to_nacti.py +147 -0
  119. megadetector/data_management/lila/create_lila_blank_set.py +558 -0
  120. megadetector/data_management/lila/create_lila_test_set.py +152 -0
  121. megadetector/data_management/lila/create_links_to_md_results_files.py +106 -0
  122. megadetector/data_management/lila/download_lila_subset.py +178 -0
  123. megadetector/data_management/lila/generate_lila_per_image_labels.py +516 -0
  124. megadetector/data_management/lila/get_lila_annotation_counts.py +170 -0
  125. megadetector/data_management/lila/get_lila_image_counts.py +112 -0
  126. megadetector/data_management/lila/lila_common.py +300 -0
  127. megadetector/data_management/lila/test_lila_metadata_urls.py +132 -0
  128. megadetector/data_management/ocr_tools.py +874 -0
  129. megadetector/data_management/read_exif.py +681 -0
  130. megadetector/data_management/remap_coco_categories.py +84 -0
  131. megadetector/data_management/remove_exif.py +66 -0
  132. megadetector/data_management/resize_coco_dataset.py +189 -0
  133. megadetector/data_management/wi_download_csv_to_coco.py +246 -0
  134. megadetector/data_management/yolo_output_to_md_output.py +441 -0
  135. megadetector/data_management/yolo_to_coco.py +676 -0
  136. megadetector/detection/__init__.py +0 -0
  137. megadetector/detection/detector_training/__init__.py +0 -0
  138. megadetector/detection/detector_training/model_main_tf2.py +114 -0
  139. megadetector/detection/process_video.py +702 -0
  140. megadetector/detection/pytorch_detector.py +341 -0
  141. megadetector/detection/run_detector.py +779 -0
  142. megadetector/detection/run_detector_batch.py +1219 -0
  143. megadetector/detection/run_inference_with_yolov5_val.py +917 -0
  144. megadetector/detection/run_tiled_inference.py +934 -0
  145. megadetector/detection/tf_detector.py +189 -0
  146. megadetector/detection/video_utils.py +606 -0
  147. megadetector/postprocessing/__init__.py +0 -0
  148. megadetector/postprocessing/add_max_conf.py +64 -0
  149. megadetector/postprocessing/categorize_detections_by_size.py +163 -0
  150. megadetector/postprocessing/combine_api_outputs.py +249 -0
  151. megadetector/postprocessing/compare_batch_results.py +958 -0
  152. megadetector/postprocessing/convert_output_format.py +396 -0
  153. megadetector/postprocessing/load_api_results.py +195 -0
  154. megadetector/postprocessing/md_to_coco.py +310 -0
  155. megadetector/postprocessing/md_to_labelme.py +330 -0
  156. megadetector/postprocessing/merge_detections.py +401 -0
  157. megadetector/postprocessing/postprocess_batch_results.py +1902 -0
  158. megadetector/postprocessing/remap_detection_categories.py +170 -0
  159. megadetector/postprocessing/render_detection_confusion_matrix.py +660 -0
  160. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +211 -0
  161. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +83 -0
  162. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +1631 -0
  163. megadetector/postprocessing/separate_detections_into_folders.py +730 -0
  164. megadetector/postprocessing/subset_json_detector_output.py +696 -0
  165. megadetector/postprocessing/top_folders_to_bottom.py +223 -0
  166. megadetector/taxonomy_mapping/__init__.py +0 -0
  167. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +491 -0
  168. megadetector/taxonomy_mapping/map_new_lila_datasets.py +150 -0
  169. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -0
  170. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +590 -0
  171. megadetector/taxonomy_mapping/retrieve_sample_image.py +71 -0
  172. megadetector/taxonomy_mapping/simple_image_download.py +219 -0
  173. megadetector/taxonomy_mapping/species_lookup.py +834 -0
  174. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +159 -0
  175. megadetector/taxonomy_mapping/taxonomy_graph.py +346 -0
  176. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +83 -0
  177. megadetector/utils/__init__.py +0 -0
  178. megadetector/utils/azure_utils.py +178 -0
  179. megadetector/utils/ct_utils.py +612 -0
  180. megadetector/utils/directory_listing.py +246 -0
  181. megadetector/utils/md_tests.py +968 -0
  182. megadetector/utils/path_utils.py +1044 -0
  183. megadetector/utils/process_utils.py +157 -0
  184. megadetector/utils/sas_blob_utils.py +509 -0
  185. megadetector/utils/split_locations_into_train_val.py +228 -0
  186. megadetector/utils/string_utils.py +92 -0
  187. megadetector/utils/url_utils.py +323 -0
  188. megadetector/utils/write_html_image_list.py +225 -0
  189. megadetector/visualization/__init__.py +0 -0
  190. megadetector/visualization/plot_utils.py +293 -0
  191. megadetector/visualization/render_images_with_thumbnails.py +275 -0
  192. megadetector/visualization/visualization_utils.py +1536 -0
  193. megadetector/visualization/visualize_db.py +550 -0
  194. megadetector/visualization/visualize_detector_output.py +405 -0
  195. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/METADATA +1 -1
  196. megadetector-5.0.12.dist-info/RECORD +199 -0
  197. megadetector-5.0.12.dist-info/top_level.txt +1 -0
  198. megadetector-5.0.11.dist-info/RECORD +0 -5
  199. megadetector-5.0.11.dist-info/top_level.txt +0 -1
  200. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/LICENSE +0 -0
  201. {megadetector-5.0.11.dist-info → megadetector-5.0.12.dist-info}/WHEEL +0 -0
megadetector/data_management/ocr_tools.py
@@ -0,0 +1,874 @@
+ """
+
+ ocr_tools.py
+
+ Use OCR (via the Tesseract package) to pull metadata (particularly times and
+ dates) from camera trap images.
+
+ The general approach is:
+
+ * Crop a fixed percentage from the top and bottom of an image, slightly larger
+   than the largest examples we've seen of how much space is used for metadata.
+
+ * Define the background color as the most common pixel value, and find rows that are
+   mostly that color to refine the crop.
+
+ * Crop to the refined crop, then run pytesseract to extract text.
+
+ * Use regular expressions to find the time and date.
+
+ Prior to using this module:
+
+ * Install Tesseract from https://tesseract-ocr.github.io/tessdoc/Installation.html
+
+ * pip install pytesseract
+
+ Known limitations:
+
+ * Semi-transparent overlays (which I've only seen on consumer cameras) usually fail.
+
+ """
+
+ #%% Notes to self
+
+ """
+
+ * To use the legacy engine (--oem 0), I had to download an updated eng.traineddata file from:
+
+   https://github.com/tesseract-ocr/tessdata
+
+ """
+
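As a quick sanity check on the setup steps above, here's a minimal smoke test (an editor's sketch, not part of this release; the filename is a placeholder) that verifies Tesseract and pytesseract are wired up:

    import pytesseract
    from PIL import Image

    # Prints a version string (e.g. 5.x) if the Tesseract binary is on the path
    print(pytesseract.get_tesseract_version())

    # Raw OCR on one image, no cropping or parsing
    print(pytesseract.image_to_string(Image.open('camera_trap_image.jpg'), lang='eng'))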
+ #%% Constants and imports
+
+ import os
+ import json
+ import numpy as np
+ import datetime
+ import re
+
+ from functools import partial
+ from dateutil.parser import parse as dateparse
+
+ import cv2
+ from PIL import Image, ImageFilter
+ from tqdm import tqdm
+
+ from megadetector.utils.path_utils import find_images
+ from megadetector.utils.path_utils import open_file
+ from megadetector.utils import write_html_image_list
+ from megadetector.utils.ct_utils import is_iterable
+ from megadetector.visualization import visualization_utils as vis_utils
+
+ # pip install pytesseract
+ #
+ # Also install tesseract from: https://github.com/UB-Mannheim/tesseract/wiki, and add
+ # the installation dir to your path (on Windows, typically C:\Program Files (x86)\Tesseract-OCR)
+ import pytesseract
+
+
+ #%% Extraction options
+
+ class DatetimeExtractionOptions:
+     """
+     Options used to parameterize datetime extraction in most functions in this module.
+     """
+
+     def __init__(self):
+
+         #: Using a semi-arbitrary metric of how much it feels like we found the
+         #: text-containing region, discard regions that appear to be extraction failures
+         self.p_crop_success_threshold = 0.5
+
+         #: Pad each crop with a few pixels to make tesseract happy
+         self.crop_padding = 10
+
+         #: Discard short text, typically text from the top of the image
+         self.min_text_length = 4
+
+         #: When we're looking for pixels that match the background color, allow some
+         #: tolerance around the dominant color
+         self.background_tolerance = 2
+
+         #: We need to see a consistent color in at least this fraction of pixels in our rough
+         #: crop to believe that we actually found a candidate metadata region.
+         self.min_background_fraction = 0.3
+
+         #: What fraction of the [top,bottom] of the image should we use for our rough crop?
+         self.image_crop_fraction = [0.045 , 0.045]
+         # self.image_crop_fraction = [0.08 , 0.08]
+
+         #: Within that rough crop, how much should we use for determining the background color?
+         self.background_crop_fraction_of_rough_crop = 0.5
+
+         #: A row is considered a probable metadata row if it contains at least this fraction
+         #: of the background color. This is used only to find the top and bottom of the crop area,
+         #: so it's not that *every* row needs to hit this criterion, only the rows that are generally
+         #: above and below the text.
+         self.min_background_fraction_for_background_row = 0.5
+
+         #: psm 6: "assume a single uniform block of text"
+         #: psm 13: raw line
+         #: oem: 0 == legacy, 1 == lstm
+         #: tesseract_config_string = '--oem 0 --psm 6'
+         #:
+         #: Try these configuration strings in order until we find a valid datetime
+         self.tesseract_config_strings = ['--oem 1 --psm 13','--oem 0 --psm 13',
+                                          '--oem 1 --psm 6','--oem 0 --psm 6']
+
+         #: If this is False, and one set of options appears to succeed for an image, we'll
+         #: stop there. If this is True, we always run all option sets on every image.
+         self.force_all_ocr_options = False
+
+         #: Whether to apply PIL's ImageFilter.SHARPEN prior to OCR
+         self.apply_sharpening_filter = True
+
+         #: Tesseract should be on your system path, but you can also specify the
+         #: path explicitly, e.g. you can do either of these:
+         #:
+         #: * os.environ['PATH'] += r';C:\Program Files\Tesseract-OCR'
+         #: * self.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+         self.tesseract_cmd = 'tesseract.exe'
+
+
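These options are plain attributes on a default-constructed instance; a typical tweak (a sketch mirroring the interactive driver at the bottom of this file) looks like:

    options = DatetimeExtractionOptions()
    options.image_crop_fraction = [0.08, 0.08]  # use a larger rough crop
    options.force_all_ocr_options = True        # run every tesseract config, even after a hit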
+ #%% Support functions
+
+ def make_rough_crops(image,options=None):
+     """
+     Crops the top and bottom regions out of an image.
+
+     Args:
+         image (Image or str): a PIL Image or file name
+         options (DatetimeExtractionOptions, optional): OCR parameters
+
+     Returns:
+         dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
+     """
+
+     if options is None:
+         options = DatetimeExtractionOptions()
+
+     if isinstance(image,str):
+         image = vis_utils.open_image(image)
+
+     w = image.width
+     h = image.height
+
+     crop_height_top = round(options.image_crop_fraction[0] * h)
+     crop_height_bottom = round(options.image_crop_fraction[1] * h)
+
+     # l,t,r,b
+     #
+     # 0,0 is upper-left
+     top_crop = image.crop([0,0,w,crop_height_top])
+     bottom_crop = image.crop([0,h-crop_height_bottom,w,h])
+     return {'top':top_crop,'bottom':bottom_crop}
+
+ # ...def make_rough_crops(...)
+
+
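A minimal sketch of calling make_rough_crops on its own ('image.jpg' is a placeholder path):

    rough_crops = make_rough_crops('image.jpg')
    # Each value is a PIL Image; with default options, each crop is 4.5% of the image height
    print(rough_crops['top'].size, rough_crops['bottom'].size)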
+ def crop_to_solid_region(rough_crop,crop_location,options=None):
+     """
+     Given a rough crop from the top or bottom of an image, finds the background color
+     and crops to the metadata region.
+
+     Within a region of an image (typically a crop from the top-ish or bottom-ish part of
+     an image), tightly crop to the solid portion (typically a region with a black background).
+
+     The success metric is just a binary indicator right now: 1.0 if we found a region we believe
+     contains a solid background, 0.0 otherwise.
+
+     Args:
+         rough_crop (Image): the PIL Image to crop
+         crop_location (str): 'top' or 'bottom'
+         options (DatetimeExtractionOptions, optional): OCR parameters
+
+     Returns:
+         dict: a dict with fields 'crop_pil' (the cropped Image), 'padded_crop_pil' (the
+         cropped Image with padding), and 'p_success' (float)
+     """
+
+     if options is None:
+         options = DatetimeExtractionOptions()
+
+     crop_to_solid_region_result = {}
+     crop_to_solid_region_result['crop_pil'] = None
+     crop_to_solid_region_result['padded_crop_pil'] = None
+     crop_to_solid_region_result['p_success'] = 0.0
+
+     # pil --> cv2
+     rough_crop_np = np.array(rough_crop)
+     rough_crop_np = rough_crop_np[:, :, ::-1].copy()
+
+     # Search *part* of the crop for the background value (the part closest to the top or bottom
+     # of the image)
+     rows_to_use_for_background_search = int(rough_crop_np.shape[0] * \
+         options.background_crop_fraction_of_rough_crop)
+
+     if crop_location == 'top':
+         background_search_image = rough_crop_np[0:rows_to_use_for_background_search,:,:]
+     elif crop_location == 'bottom':
+         background_search_image = rough_crop_np[-rows_to_use_for_background_search:,:,:]
+     else:
+         raise ValueError('Unrecognized crop location: {}'.format(crop_location))
+
+     background_search_image = cv2.cvtColor(background_search_image, cv2.COLOR_BGR2GRAY)
+     background_search_image = background_search_image.astype('uint8')
+     background_search_image = cv2.medianBlur(background_search_image,3)
+     pixel_values = background_search_image.flatten()
+     counts = np.bincount(pixel_values)
+     background_value = int(np.argmax(counts))
+
+     # Did we find a sensible mode that looks like a background value?
+     background_value_count = int(np.max(counts))
+     p_background_value = background_value_count / np.sum(counts)
+
+     if (p_background_value < options.min_background_fraction):
+         return crop_to_solid_region_result
+     else:
+         p_success = 1.0
+
+     analysis_image = cv2.cvtColor(rough_crop_np, cv2.COLOR_BGR2GRAY)
+     analysis_image = analysis_image.astype('uint8')
+     analysis_image = cv2.medianBlur(analysis_image,3)
+
+     # This will now be a binary image indicating which pixels are background
+     analysis_image = cv2.inRange(analysis_image,
+                                  background_value-options.background_tolerance,
+                                  background_value+options.background_tolerance)
+
+     # Use row heuristics to refine the crop
+     h = analysis_image.shape[0]
+     w = analysis_image.shape[1]
+
+     min_x = 0
+     min_y = -1
+     max_x = w
+     max_y = -1
+
+     # Find the first and last row that are mostly the background color
+     for y in range(h):
+         row_count = 0
+         for x in range(w):
+             if analysis_image[y][x] > 0:
+                 row_count += 1
+         row_fraction = row_count / w
+         if row_fraction > options.min_background_fraction_for_background_row:
+             if min_y == -1:
+                 min_y = y
+             max_y = y
+
+     assert (min_y == -1 and max_y == -1) or (min_y != -1 and max_y != -1)
+
+     if min_y == -1:
+         return crop_to_solid_region_result
+
+     if max_y == min_y:
+         return crop_to_solid_region_result
+
+     x = min_x
+     y = min_y
+     w = max_x-min_x
+     h = max_y-min_y
+
+     # Crop the image
+     crop_np = rough_crop_np[y:y+h,x:x+w]
+
+     # Tesseract doesn't like characters really close to the edge, so pad a little
+     crop_padding = options.crop_padding
+     padded_crop_np = cv2.copyMakeBorder(crop_np,crop_padding,crop_padding,crop_padding,crop_padding,
+                                         cv2.BORDER_CONSTANT,
+                                         value=[background_value,background_value,background_value])
+
+     crop_pil = Image.fromarray(crop_np)
+     padded_crop_pil = Image.fromarray(padded_crop_np)
+
+     crop_to_solid_region_result['crop_pil'] = crop_pil
+     crop_to_solid_region_result['padded_crop_pil'] = padded_crop_pil
+     crop_to_solid_region_result['p_success'] = p_success
+
+     return crop_to_solid_region_result
+
+ # ...crop_to_solid_region(...)
+
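The background-detection step above boils down to taking the mode of the grayscale histogram; here's that idea in isolation, on a toy array (an illustration, not code from this release):

    import numpy as np

    gray = np.array([[0, 0, 0, 255],
                     [0, 0, 0, 200]], dtype=np.uint8)  # toy grayscale crop
    counts = np.bincount(gray.flatten())
    background_value = int(np.argmax(counts))                        # 0, the most common value
    p_background_value = counts[background_value] / np.sum(counts)   # 0.75 here
    print(background_value, p_background_value)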
+
+ def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
+     """
+     Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
+     regions by the time they get to this function, typically just the top or bottom few percent
+     of an image.
+
+     Args:
+         rough_crops (dict): dict with keys 'top' and 'bottom', each a PIL Image that has been
+             cropped close to text (as returned by make_rough_crops)
+         options (DatetimeExtractionOptions, optional): OCR parameters
+         tesseract_config_string (str, optional): optional CLI argument to pass to tesseract.exe
+
+     Returns:
+         dict: a dict with keys "top" and "bottom", where each value is a dict with keys
+         'text' (text found, if any) and 'crop_to_solid_region_results' (metadata about the OCR pass)
+     """
+
+     if options is None:
+         options = DatetimeExtractionOptions()
+
+     if tesseract_config_string is None:
+         tesseract_config_string = options.tesseract_config_strings[0]
+
+     find_text_in_crops_results = {}
+
+     # crop_location = 'top'
+     # crop_location = 'bottom'
+     for crop_location in ('top','bottom'):
+
+         find_text_in_crops_results[crop_location] = {}
+         find_text_in_crops_results[crop_location]['text'] = ''
+         find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = None
+
+         rough_crop = rough_crops[crop_location]
+
+         # Crop to the portion of the rough crop with a solid background color
+         crop_to_solid_region_results = crop_to_solid_region(rough_crop,crop_location,options)
+
+         find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = \
+             crop_to_solid_region_results
+
+         # Try cropping to a solid region; if that doesn't work, try running OCR on the whole
+         # rough crop.
+         if crop_to_solid_region_results['p_success'] >= options.p_crop_success_threshold:
+             padded_crop_pil = crop_to_solid_region_results['padded_crop_pil']
+         else:
+             padded_crop_pil = rough_crop
+
+         if options.apply_sharpening_filter:
+             padded_crop_pil = padded_crop_pil.filter(ImageFilter.SHARPEN)
+
+         # Find text in the padded crop
+         pytesseract.pytesseract.tesseract_cmd = options.tesseract_cmd
+         text = pytesseract.image_to_string(padded_crop_pil, lang='eng',
+                                            config=tesseract_config_string)
+
+         text = text.replace('\n', ' ').replace('\r', '').strip()
+
+         find_text_in_crops_results[crop_location]['text'] = text
+
+     # ...for each cropped region
+
+     return find_text_in_crops_results
+
+ # ...def find_text_in_crops(...)
+
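Chaining the two functions above gives the crop-then-OCR pipeline; a hedged end-to-end sketch with a placeholder filename and one of the module's default config strings:

    rough_crops = make_rough_crops('image.jpg')
    ocr = find_text_in_crops(rough_crops, tesseract_config_string='--oem 1 --psm 13')
    print(ocr['top']['text'])
    print(ocr['bottom']['text'])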
+
+ def _datetime_string_to_datetime(matched_string):
+     """
+     Takes an OCR-matched datetime string, does a little cleanup, and parses a date
+     from it.
+
+     By the time a string gets to this function, it should be a proper date string, with
+     no extraneous characters other than spaces around colons or hyphens.
+     """
+
+     matched_string = matched_string.replace(' -','-')
+     matched_string = matched_string.replace('- ','-')
+     matched_string = matched_string.replace(' :',':')
+     matched_string = matched_string.replace(': ',':')
+     try:
+         extracted_datetime = dateparse(matched_string)
+     except Exception:
+         extracted_datetime = None
+     return extracted_datetime
+
+
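For example, the cleanup above turns a typical OCR artifact like '2013-10-02 11: 40: 50' into a string dateutil can parse (a sketch, not a test from this release):

    print(_datetime_string_to_datetime('2013-10-02 11: 40: 50'))
    # datetime.datetime(2013, 10, 2, 11, 40, 50)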
+ def _get_datetime_from_strings(strings,options=None):
+     """
+     Given a string or list of strings, search for exactly one datetime in those strings,
+     using a series of regular expressions.
+
+     Strings are currently just concatenated before searching for a datetime.
+     """
+
+     if options is None:
+         options = DatetimeExtractionOptions()
+
+     if isinstance(strings,str):
+         s = strings.lower()
+     else:
+         s = ' '.join(strings).lower()
+     s = s.replace('—','-')
+     s = ''.join(e for e in s if e.isalnum() or e in ':-/' or e.isspace())
+
+     ### AM/PM
+
+     # 2013-10-02 11:40:50 AM
+     m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([ap]m)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     # 04/01/2017 08:54:00AM
+     m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([ap]m)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     # 2017/04/01 08:54:00AM
+     m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([ap]m)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     # 04/01/2017 08:54AM
+     m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([ap]m)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     # 2017/04/01 08:54AM
+     m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([ap]m)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     ### No AM/PM
+
+     # 2013-07-27 04:56:35
+     m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     # 07-27-2013 04:56:35
+     m = re.search(r'(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     # 2013/07/27 04:56:35
+     m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     # 07/27/2013 04:56:35
+     m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+     if m is not None:
+         return _datetime_string_to_datetime(m.group(0))
+
+     return None
+
+ # ...def _get_datetime_from_strings(...)
+
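To make these patterns concrete: the first AM/PM regex matches strings like the one in its comment (standalone illustration):

    import re
    s = '2013-10-02 11:40:50 am'
    m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([ap]m)', s)
    print(m.group(0))  # '2013-10-02 11:40:50 am'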
+
+ def get_datetime_from_image(image,include_crops=True,options=None):
+     """
+     Tries to find the datetime string (if present) in an image.
+
+     Args:
+         image (Image or str): the PIL Image object or image filename in which we should look for
+             datetime information.
+         include_crops (bool, optional): whether to include cropped images in the return dict (set
+             this to False if you're worried about size and you're processing a zillion images)
+         options (DatetimeExtractionOptions, optional): OCR parameters
+
+     Returns:
+         dict: a dict with fields:
+
+         - datetime: Python datetime object, or None
+         - text_results: one length-2 list of strings per config string tried
+         - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+           datetimes extracted for each option set
+         - ocr_results: detailed results from the OCR process, including crops as PIL images;
+           only included if include_crops is True
+     """
+
+     if options is None:
+         options = DatetimeExtractionOptions()
+
+     if isinstance(image,str):
+         image = vis_utils.open_image(image)
+
+     # Crop the top and bottom from the image
+     rough_crops = make_rough_crops(image,options)
+     assert len(rough_crops) == 2
+
+     all_extracted_datetimes = {}
+     all_text_results = []
+     all_ocr_results = []
+
+     extracted_datetime = None
+
+     # Find text, possibly trying all config strings
+     #
+     # tesseract_config_string = options.tesseract_config_strings[0]
+     for tesseract_config_string in options.tesseract_config_strings:
+
+         ocr_results = find_text_in_crops(rough_crops,options,tesseract_config_string)
+         all_ocr_results.append(ocr_results)
+
+         text_results = [v['text'] for v in ocr_results.values()]
+         assert len(text_results) == 2
+         all_text_results.append(text_results)
+
+         # Find datetime
+         extracted_datetime_this_option_set = _get_datetime_from_strings(text_results,options)
+         assert isinstance(extracted_datetime_this_option_set,datetime.datetime) or \
+             (extracted_datetime_this_option_set is None)
+
+         all_extracted_datetimes[tesseract_config_string] = \
+             extracted_datetime_this_option_set
+
+         if extracted_datetime_this_option_set is not None:
+             if extracted_datetime is None:
+                 extracted_datetime = extracted_datetime_this_option_set
+             if not options.force_all_ocr_options:
+                 break
+
+     # ...for each set of OCR options
+
+     if extracted_datetime is not None:
+         assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
+
+     to_return = {}
+     to_return['datetime'] = extracted_datetime
+
+     to_return['text_results'] = all_text_results
+     to_return['all_extracted_datetimes'] = all_extracted_datetimes
+
+     if include_crops:
+         to_return['ocr_results'] = all_ocr_results
+     else:
+         to_return['ocr_results'] = None
+
+     return to_return
+
+ # ...def get_datetime_from_image(...)
+
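Typical single-image usage (placeholder path; this mirrors the scrap cell near the end of this file):

    result = get_datetime_from_image('image.jpg')
    print(result['datetime'])      # a datetime.datetime, or None
    print(result['text_results'])  # raw OCR text for each config string tried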
+
+ def try_get_datetime_from_image(filename,include_crops=False,options=None):
+     """
+     Try/catch wrapper for get_datetime_from_image, optionally trying multiple option sets
+     until we find a datetime.
+
+     Args:
+         filename (Image or str): the PIL Image object or image filename in which we should look
+             for datetime information.
+         include_crops (bool, optional): whether to include cropped images in the return dict (set
+             this to False if you're worried about size and you're processing a zillion images)
+         options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+             DatetimeExtractionOptions object or a list of options to try
+
+     Returns:
+         dict: a dict with fields:
+
+         - datetime: Python datetime object, or None
+         - text_results: one length-2 list of strings per config string tried
+         - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+           datetimes extracted for each option set
+         - ocr_results: detailed results from the OCR process, including crops as PIL images;
+           only included if include_crops is True
+     """
+
+     if options is None:
+         options = DatetimeExtractionOptions()
+
+     if not is_iterable(options):
+         options = [options]
+
+     result = {}
+     result['error'] = None
+
+     for i_option_set,current_options in enumerate(options):
+         try:
+             result = get_datetime_from_image(filename,include_crops=include_crops,options=current_options)
+             result['options_index'] = i_option_set
+             if 'datetime' in result and result['datetime'] is not None:
+                 break
+         except Exception as e:
+             result['error'] = str(e)
+
+     return result
+
+
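And the wrapper with multiple option sets, as in the scrap cell below; the second option set widens the rough crop if the first fails:

    options_a = DatetimeExtractionOptions()
    options_b = DatetimeExtractionOptions()
    options_b.image_crop_fraction = [0.08, 0.08]
    result = try_get_datetime_from_image('image.jpg', options=[options_a, options_b])
    print(result.get('datetime'), result.get('error'))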
+ def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options=None,
+                              n_workers=16,use_threads=False):
+     """
+     The main entry point for this module. Tries to retrieve metadata from pixels for every
+     image in [folder_name], optionally writing the results to the .json file [output_file].
+
+     Args:
+         folder_name (str): the folder of images to process recursively
+         output_file (str, optional): the .json file to which we should write results; if None,
+             just returns the results
+         n_to_sample (int, optional): for debugging only, used to limit the number of images
+             we process
+         options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+             DatetimeExtractionOptions object or a list of options to try for each image
+         n_workers (int, optional): the number of parallel workers to use; set to <= 1 to disable
+             parallelization
+         use_threads (bool, optional): whether to use threads (True) or processes (False) for
+             parallelization; not relevant if n_workers <= 1
+
+     Returns:
+         dict: a dict mapping filenames to datetime extraction results; see
+         try_get_datetime_from_image for the format of each value in the dict.
+     """
+
+     if options is None:
+         options = DatetimeExtractionOptions()
+
+     image_file_names = \
+         find_images(folder_name,convert_slashes=True,
+                     return_relative_paths=False,recursive=True)
+
+     if n_to_sample > 0:
+         import random
+         random.seed(0)
+         image_file_names = random.sample(image_file_names,n_to_sample)
+
+     if n_workers <= 1:
+
+         all_results = []
+         for fn_abs in tqdm(image_file_names):
+             all_results.append(try_get_datetime_from_image(fn_abs,options=options))
+
+     else:
+
+         # Don't spawn more than one worker per image
+         if n_workers > len(image_file_names):
+             n_workers = len(image_file_names)
+
+         if use_threads:
+             from multiprocessing.pool import ThreadPool
+             pool = ThreadPool(n_workers)
+             worker_string = 'threads'
+         else:
+             from multiprocessing.pool import Pool
+             pool = Pool(n_workers)
+             worker_string = 'processes'
+
+         print('Starting a pool of {} {}'.format(n_workers,worker_string))
+
+         all_results = list(tqdm(pool.imap(
+             partial(try_get_datetime_from_image,options=options),image_file_names),
+             total=len(image_file_names)))
+
+     filename_to_results = {}
+
+     # fn_abs = image_file_names[0]
+     for i_file,fn_abs in enumerate(image_file_names):
+         filename_to_results[fn_abs] = all_results[i_file]
+
+     if output_file is not None:
+         with open(output_file,'w') as f:
+             json.dump(filename_to_results,f,indent=1,default=str)
+
+     return filename_to_results
+
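Folder-level usage with hypothetical paths; with n_workers > 1 this fans out over processes, or over threads if use_threads=True:

    filename_to_results = get_datetimes_for_folder('/data/camera_traps',
                                                   output_file='ocr_results.json',
                                                   n_workers=8,
                                                   use_threads=True)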
+
+
+ #%% Interactive driver
+
+ if False:
+
+     #%% Process images
+
+     folder_name = r'g:\temp\island_conservation_camera_traps'
+     output_file = r'g:\temp\ocr_results.json'
+     from megadetector.utils.path_utils import insert_before_extension
+     output_file = insert_before_extension(output_file)
+     n_to_sample = -1
+     assert os.path.isdir(folder_name)
+     options_a = DatetimeExtractionOptions()
+     options_b = DatetimeExtractionOptions()
+     options_b.image_crop_fraction = [0.08 , 0.08]
+     options_a.force_all_ocr_options = False
+     options_b.force_all_ocr_options = False
+     # all_options = [options_a,options_b]
+     all_options = [options_a]
+     filename_to_results = get_datetimes_for_folder(folder_name,output_file,
+                                                    n_to_sample=n_to_sample,options=all_options)
+
+
+     #%% Load results
+
+     # output_file = r"G:\temp\ocr_results.2023.10.31.07.37.54.json"
+     with open(output_file,'r') as f:
+         filename_to_results = json.load(f)
+     filenames = sorted(list(filename_to_results.keys()))
+     print('Loaded results for {} files'.format(len(filename_to_results)))
+
+
+     #%% Scrap cell
+
+     fn = 'g:/camera_traps/camera_trap_images/2018.07.02/newcam/people/DSCF0273.JPG'
+     include_crops = False
+     options_a = DatetimeExtractionOptions()
+     options_b = DatetimeExtractionOptions()
+     options_b.image_crop_fraction = [0.08 , 0.08]
+     image = vis_utils.open_image(fn) # noqa
+     result = try_get_datetime_from_image(fn,options=[options_a,options_b]) # noqa
+     print(result)
+
+     # open_file(fn)
+     # rough_crops = make_rough_crops(image,options=options)
+
+
+     #%% Look for OCR or parsing failures
+
+     bad_tokens = ()
+
+     files_with_disagreements = set()
+
+     # i_fn = 0; fn = filenames[i_fn]
+     for i_fn,fn in enumerate(filenames):
+
+         image = fn
+         results = filename_to_results[fn]
+
+         if 'text_results' not in results:
+             print('Skipping {}, no results ({})'.format(i_fn,fn))
+             continue
+
+         s = ' '.join([x[0] for x in results['text_results']])
+
+         known_bad = False
+         for bad_token in bad_tokens:
+             if bad_token in s:
+                 known_bad = True
+         if known_bad:
+             continue
+
+         extracted_datetime = results['datetime']
+
+         # If we have a datetime, make sure all successful OCR results agree
+         if extracted_datetime is not None:
+             for config_string in results['all_extracted_datetimes']:
+                 if results['all_extracted_datetimes'][config_string] is not None:
+                     if results['all_extracted_datetimes'][config_string] != extracted_datetime:
+                         files_with_disagreements.add(fn)
+         else:
+             print('Falling back for {} ({})'.format(i_fn,fn))
+             ocr_results = get_datetime_from_image(fn)
+             extracted_datetime = ocr_results['datetime']
+
+         if extracted_datetime is None:
+             print('Failure at {}: {}'.format(i_fn,s))
+
+         # open_file(fn)
+         # get_datetime_from_image(fn)
+
+
+     #%% Write results to an HTML file for testing
+
+     n_to_sample = 5000
+     if (n_to_sample >= 0) and (len(filename_to_results) > n_to_sample):
+         filenames = sorted(list(filename_to_results.keys()))
+         import random
+         random.seed(0)
+         keys = random.sample(filenames,n_to_sample)
+         filename_to_results = {k: filename_to_results[k] for k in keys}
+
+     preview_dir = r'g:\temp\ocr-preview'
+     os.makedirs(preview_dir,exist_ok=True)
+
+     def resize_image_for_preview(fn_abs):
+         fn_relative = os.path.relpath(fn_abs,folder_name)
+         resized_image = vis_utils.resize_image(fn_abs,target_width=600)
+         resized_fn = os.path.join(preview_dir,fn_relative)
+         os.makedirs(os.path.dirname(resized_fn),exist_ok=True)
+         resized_image.save(resized_fn)
+         return resized_fn
+
+     # Resize images in parallel
+     n_rendering_workers = 16
+
+     if n_rendering_workers <= 1:
+         for fn_abs in tqdm(filename_to_results.keys()):
+             resize_image_for_preview(fn_abs)
+     else:
+         # from multiprocessing.pool import Pool as RenderingPool; worker_string = 'processes'
+         from multiprocessing.pool import ThreadPool as RenderingPool; worker_string = 'threads'
+         pool = RenderingPool(n_rendering_workers)
+
+         print('Starting rendering pool with {} {}'.format(n_rendering_workers,worker_string))
+
+         _ = list(tqdm(pool.imap(resize_image_for_preview,filename_to_results.keys()),
+                       total=len(filename_to_results)))
+
+     def make_datetime_preview_page(filenames,html_file):
+
+         html_image_list = []
+         html_options = write_html_image_list.write_html_image_list()
+         html_options['maxFiguresPerHtmlFile'] = 2500
+         html_options['defaultImageStyle'] = 'margin:0px;margin-top:5px;margin-bottom:30px;'
+
+         # fn_abs = filenames[0]
+         for fn_abs in filenames:
+
+             fn_relative = os.path.relpath(fn_abs,folder_name)
+             # resized_fn = os.path.join(preview_dir,fn_relative)
+             results_this_image = filename_to_results[fn_abs]
+
+             extracted_datetime = results_this_image['datetime']
+             title = 'Image: {}<br/>Extracted datetime: {}'.format(fn_relative,extracted_datetime)
+             html_image_list.append({'filename':fn_relative,'title':title})
+
+         # ...for each image
+
+         html_options['makeRelative'] = True
+         write_html_image_list.write_html_image_list(html_file,
+                                                     html_image_list,
+                                                     html_options)
+         open_file(html_file)
+         return html_image_list
+
+     failed_files = []
+     for fn_abs in filename_to_results:
+         results_this_image = filename_to_results[fn_abs]
+         if results_this_image['datetime'] is None:
+             failed_files.append(fn_abs)
+
+     print('Found {} failures'.format(len(failed_files)))
+
+     output_summary_file = os.path.join(preview_dir,'summary.html')
+     html_image_list = make_datetime_preview_page(sorted(list(filename_to_results.keys())),output_summary_file)
+
+     failure_summary_file = os.path.join(preview_dir,'failures.html')
+     html_image_list_failures = make_datetime_preview_page(failed_files,failure_summary_file)
+
+     filenames = failed_files
+     html_file = failure_summary_file
+
+
+     #%% Other approaches to getting dates from strings
+
+     # ...that didn't really work out.
+
+     # pip install dateparser
+     import dateparser
+
+     # pip install datefinder
+     import datefinder
+
+     from dateparser.search import search_dates # noqa
+
+     dateparser_settings = {'PREFER_DATES_FROM':'past','STRICT_PARSING':True}
+
+     dateparser_result = dateparser.search.search_dates(s, settings=dateparser_settings)
+
+     if dateparser_result is not None:
+         assert len(dateparser_result) == 1
+         extracted_datetime = dateparser_result[0][1]
+     else:
+         matches = datefinder.find_dates(s,strict=False)
+         matches_list = [m for m in matches]
+         if len(matches_list) == 1:
+             extracted_datetime = matches_list[0]
+         else:
+             extracted_datetime = None
+
+     if extracted_datetime is not None:
+         assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
+
+
+ #%% Command-line driver
+
+ # TODO