megadetector 5.0.5__py3-none-any.whl → 5.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of megadetector might be problematic. Click here for more details.

Files changed (132) hide show
  1. api/batch_processing/data_preparation/manage_local_batch.py +302 -263
  2. api/batch_processing/data_preparation/manage_video_batch.py +81 -2
  3. api/batch_processing/postprocessing/add_max_conf.py +1 -0
  4. api/batch_processing/postprocessing/categorize_detections_by_size.py +50 -19
  5. api/batch_processing/postprocessing/compare_batch_results.py +110 -60
  6. api/batch_processing/postprocessing/load_api_results.py +56 -70
  7. api/batch_processing/postprocessing/md_to_coco.py +1 -1
  8. api/batch_processing/postprocessing/md_to_labelme.py +2 -1
  9. api/batch_processing/postprocessing/postprocess_batch_results.py +240 -81
  10. api/batch_processing/postprocessing/render_detection_confusion_matrix.py +625 -0
  11. api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +71 -23
  12. api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +1 -1
  13. api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +227 -75
  14. api/batch_processing/postprocessing/subset_json_detector_output.py +132 -5
  15. api/batch_processing/postprocessing/top_folders_to_bottom.py +1 -1
  16. api/synchronous/api_core/animal_detection_api/detection/run_detector_batch.py +2 -2
  17. classification/prepare_classification_script.py +191 -191
  18. data_management/coco_to_yolo.py +68 -45
  19. data_management/databases/integrity_check_json_db.py +7 -5
  20. data_management/generate_crops_from_cct.py +3 -3
  21. data_management/get_image_sizes.py +8 -6
  22. data_management/importers/add_timestamps_to_icct.py +79 -0
  23. data_management/importers/animl_results_to_md_results.py +160 -0
  24. data_management/importers/auckland_doc_test_to_json.py +4 -4
  25. data_management/importers/auckland_doc_to_json.py +1 -1
  26. data_management/importers/awc_to_json.py +5 -5
  27. data_management/importers/bellevue_to_json.py +5 -5
  28. data_management/importers/carrizo_shrubfree_2018.py +5 -5
  29. data_management/importers/carrizo_trail_cam_2017.py +5 -5
  30. data_management/importers/cct_field_adjustments.py +2 -3
  31. data_management/importers/channel_islands_to_cct.py +4 -4
  32. data_management/importers/ena24_to_json.py +5 -5
  33. data_management/importers/helena_to_cct.py +10 -10
  34. data_management/importers/idaho-camera-traps.py +12 -12
  35. data_management/importers/idfg_iwildcam_lila_prep.py +8 -8
  36. data_management/importers/jb_csv_to_json.py +4 -4
  37. data_management/importers/missouri_to_json.py +1 -1
  38. data_management/importers/noaa_seals_2019.py +1 -1
  39. data_management/importers/pc_to_json.py +5 -5
  40. data_management/importers/prepare-noaa-fish-data-for-lila.py +4 -4
  41. data_management/importers/prepare_zsl_imerit.py +5 -5
  42. data_management/importers/rspb_to_json.py +4 -4
  43. data_management/importers/save_the_elephants_survey_A.py +5 -5
  44. data_management/importers/save_the_elephants_survey_B.py +6 -6
  45. data_management/importers/snapshot_safari_importer.py +9 -9
  46. data_management/importers/snapshot_serengeti_lila.py +9 -9
  47. data_management/importers/timelapse_csv_set_to_json.py +5 -7
  48. data_management/importers/ubc_to_json.py +4 -4
  49. data_management/importers/umn_to_json.py +4 -4
  50. data_management/importers/wellington_to_json.py +1 -1
  51. data_management/importers/wi_to_json.py +2 -2
  52. data_management/importers/zamba_results_to_md_results.py +181 -0
  53. data_management/labelme_to_coco.py +35 -7
  54. data_management/labelme_to_yolo.py +229 -0
  55. data_management/lila/add_locations_to_island_camera_traps.py +1 -1
  56. data_management/lila/add_locations_to_nacti.py +147 -0
  57. data_management/lila/create_lila_blank_set.py +474 -0
  58. data_management/lila/create_lila_test_set.py +2 -1
  59. data_management/lila/create_links_to_md_results_files.py +106 -0
  60. data_management/lila/download_lila_subset.py +46 -21
  61. data_management/lila/generate_lila_per_image_labels.py +23 -14
  62. data_management/lila/get_lila_annotation_counts.py +17 -11
  63. data_management/lila/lila_common.py +14 -11
  64. data_management/lila/test_lila_metadata_urls.py +116 -0
  65. data_management/ocr_tools.py +829 -0
  66. data_management/resize_coco_dataset.py +13 -11
  67. data_management/yolo_output_to_md_output.py +84 -12
  68. data_management/yolo_to_coco.py +38 -20
  69. detection/process_video.py +36 -14
  70. detection/pytorch_detector.py +23 -8
  71. detection/run_detector.py +76 -19
  72. detection/run_detector_batch.py +178 -63
  73. detection/run_inference_with_yolov5_val.py +326 -57
  74. detection/run_tiled_inference.py +153 -43
  75. detection/video_utils.py +34 -8
  76. md_utils/ct_utils.py +172 -1
  77. md_utils/md_tests.py +372 -51
  78. md_utils/path_utils.py +167 -39
  79. md_utils/process_utils.py +26 -7
  80. md_utils/split_locations_into_train_val.py +215 -0
  81. md_utils/string_utils.py +10 -0
  82. md_utils/url_utils.py +0 -2
  83. md_utils/write_html_image_list.py +9 -26
  84. md_visualization/plot_utils.py +12 -8
  85. md_visualization/visualization_utils.py +106 -7
  86. md_visualization/visualize_db.py +16 -8
  87. md_visualization/visualize_detector_output.py +208 -97
  88. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/METADATA +3 -6
  89. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/RECORD +98 -121
  90. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/WHEEL +1 -1
  91. taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +1 -1
  92. taxonomy_mapping/map_new_lila_datasets.py +43 -39
  93. taxonomy_mapping/prepare_lila_taxonomy_release.py +5 -2
  94. taxonomy_mapping/preview_lila_taxonomy.py +27 -27
  95. taxonomy_mapping/species_lookup.py +33 -13
  96. taxonomy_mapping/taxonomy_csv_checker.py +7 -5
  97. api/synchronous/api_core/yolov5/detect.py +0 -252
  98. api/synchronous/api_core/yolov5/export.py +0 -607
  99. api/synchronous/api_core/yolov5/hubconf.py +0 -146
  100. api/synchronous/api_core/yolov5/models/__init__.py +0 -0
  101. api/synchronous/api_core/yolov5/models/common.py +0 -738
  102. api/synchronous/api_core/yolov5/models/experimental.py +0 -104
  103. api/synchronous/api_core/yolov5/models/tf.py +0 -574
  104. api/synchronous/api_core/yolov5/models/yolo.py +0 -338
  105. api/synchronous/api_core/yolov5/train.py +0 -670
  106. api/synchronous/api_core/yolov5/utils/__init__.py +0 -36
  107. api/synchronous/api_core/yolov5/utils/activations.py +0 -103
  108. api/synchronous/api_core/yolov5/utils/augmentations.py +0 -284
  109. api/synchronous/api_core/yolov5/utils/autoanchor.py +0 -170
  110. api/synchronous/api_core/yolov5/utils/autobatch.py +0 -66
  111. api/synchronous/api_core/yolov5/utils/aws/__init__.py +0 -0
  112. api/synchronous/api_core/yolov5/utils/aws/resume.py +0 -40
  113. api/synchronous/api_core/yolov5/utils/benchmarks.py +0 -148
  114. api/synchronous/api_core/yolov5/utils/callbacks.py +0 -71
  115. api/synchronous/api_core/yolov5/utils/dataloaders.py +0 -1087
  116. api/synchronous/api_core/yolov5/utils/downloads.py +0 -178
  117. api/synchronous/api_core/yolov5/utils/flask_rest_api/example_request.py +0 -19
  118. api/synchronous/api_core/yolov5/utils/flask_rest_api/restapi.py +0 -46
  119. api/synchronous/api_core/yolov5/utils/general.py +0 -1018
  120. api/synchronous/api_core/yolov5/utils/loggers/__init__.py +0 -187
  121. api/synchronous/api_core/yolov5/utils/loggers/wandb/__init__.py +0 -0
  122. api/synchronous/api_core/yolov5/utils/loggers/wandb/log_dataset.py +0 -27
  123. api/synchronous/api_core/yolov5/utils/loggers/wandb/sweep.py +0 -41
  124. api/synchronous/api_core/yolov5/utils/loggers/wandb/wandb_utils.py +0 -577
  125. api/synchronous/api_core/yolov5/utils/loss.py +0 -234
  126. api/synchronous/api_core/yolov5/utils/metrics.py +0 -355
  127. api/synchronous/api_core/yolov5/utils/plots.py +0 -489
  128. api/synchronous/api_core/yolov5/utils/torch_utils.py +0 -314
  129. api/synchronous/api_core/yolov5/val.py +0 -394
  130. md_utils/matlab_porting_tools.py +0 -97
  131. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/LICENSE +0 -0
  132. {megadetector-5.0.5.dist-info → megadetector-5.0.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,829 @@
1
+ ########
2
+ #
3
+ # ocr_tools.py
4
+ #
5
+ # Use OCR (via the Tesseract package) to pull metadata (particularly times and
6
+ # dates) from camera trap images.
7
+ #
8
+ # The general approach is:
9
+ #
10
+ # * Crop a fixed percentage from the top and bottom of an image, slightly larger
11
+ # than the largest examples we've seen of how much space is used for metadata.
12
+ #
13
+ # * Define the background color as the median pixel value, and find rows that are
14
+ # mostly that color to refine the crop.
15
+ #
16
+ # * Crop to the refined crop, then run pytesseract to extract text.
17
+ #
18
+ # * Use regular expressions to find time and date.
19
+ #
20
+ # Prior to using this module:
21
+ #
22
+ # * Install Tesseract from https://tesseract-ocr.github.io/tessdoc/Installation.html
23
+ #
24
+ # * pip install pytesseract
25
+ #
26
+ # Known limitations:
27
+ #
28
+ # * Semi-transparent overlays (which I've only seen on consumer cameras) usually fail.
29
+ #
30
+ ########
31
+
32
+ #%% Notes to self
33
+
34
+ """
35
+
36
+ * To use the legacy engine (--oem 0), I had to download an updated eng.traineddata file from:
37
+
38
+ https://github.com/tesseract-ocr/tessdata
39
+
40
+ """
41
+
42
+ #%% Constants and imports
43
+
44
+ import os
45
+ import json
46
+ import numpy as np
47
+ import datetime
48
+ import re
49
+
50
+ from functools import partial
51
+ from dateutil.parser import parse as dateparse
52
+
53
+ import cv2
54
+ from PIL import Image, ImageFilter
55
+ from tqdm import tqdm
56
+
57
+ from md_utils.path_utils import find_images
58
+ from md_visualization import visualization_utils as vis_utils
59
+ from md_utils import write_html_image_list
60
+ from md_utils.path_utils import open_file
61
+
62
+ # pip install pytesseract
63
+ #
64
+ # Also install tesseract from: https://github.com/UB-Mannheim/tesseract/wiki, and add
65
+ # the installation dir to your path (on Windows, typically C:\Program Files (x86)\Tesseract-OCR)
66
+ import pytesseract
67
+
68
+
69
+ #%% Extraction options
70
+
71
class DatetimeExtractionOptions:
    """
    Options controlling how the functions in this module crop an image, run
    Tesseract on the crops, and search the resulting text for a datetime.

    All fields are plain attributes with defaults set in __init__; callers modify them
    after construction.
    """

    def __init__(self):

        # Using a semi-arbitrary metric of how much it feels like we found the
        # text-containing region, discard regions that appear to be extraction failures
        self.p_crop_success_threshold = 0.5

        # Pad each crop with a few pixels to make tesseract happy
        self.crop_padding = 10

        # Discard short text, typically text from the top of the image
        self.min_text_length = 4

        # When we're looking for pixels that match the background color, allow some
        # tolerance around the dominant color
        self.background_tolerance = 2

        # We need to see a consistent color in at least this fraction of pixels in our rough
        # crop to believe that we actually found a candidate metadata region.
        self.min_background_fraction = 0.3

        # What fraction of the [top,bottom] of the image should we use for our rough crop?
        self.image_crop_fraction = [0.045 , 0.045]
        # self.image_crop_fraction = [0.08 , 0.08]

        # Within that rough crop, how much should we use for determining the background color?
        self.background_crop_fraction_of_rough_crop = 0.5

        # A row is considered a probable metadata row if it contains at least this fraction
        # of the background color.  This is used only to find the top and bottom of the crop area,
        # so it's not that *every* row needs to hit this criteria, only the rows that are generally
        # above and below the text.
        self.min_background_fraction_for_background_row = 0.5

        # psm 6: "assume a single uniform block of text"
        # psm 13: raw line
        # oem: 0 == legacy, 1 == lstm
        # tesseract_config_string = '--oem 0 --psm 6'
        #
        # Try these configuration strings in order until we find a valid datetime
        self.tesseract_config_strings = ['--oem 1 --psm 13','--oem 0 --psm 13',
                                         '--oem 1 --psm 6','--oem 0 --psm 6']

        # If True, run every configuration string even after one of them has
        # produced a datetime.
        self.force_all_ocr_options = False

        # If True, apply a sharpening filter to each crop before running OCR.
        self.apply_sharpening_filter = True

        # Tesseract should be on your system path, but you can also specify the
        # path explicitly, e.g.:
        #
        # os.environ['PATH'] += r';C:\Program Files\Tesseract-OCR'
        # self.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
        #
        # The executable is only called "tesseract.exe" on Windows; everywhere else
        # the binary on the path is just "tesseract".
        self.tesseract_cmd = 'tesseract.exe' if os.name == 'nt' else 'tesseract'
125
+
126
+
127
+ #%% Support functions
128
+
129
def make_rough_crops(image,options=None):
    """
    Cut a horizontal strip off the top and off the bottom of [image].

    [image] can be a PIL image or a file name; file names are opened with
    vis_utils.open_image().  The height of each strip is
    options.image_crop_fraction ([top,bottom]) times the image height, rounded to
    the nearest pixel; both strips span the full image width.

    Returns a dict with keys 'top' and 'bottom', each mapping to the corresponding
    cropped image.
    """

    options = DatetimeExtractionOptions() if options is None else options

    if isinstance(image,str):
        image = vis_utils.open_image(image)

    image_width = image.width
    image_height = image.height

    fractions = options.image_crop_fraction
    n_rows_top = round(fractions[0] * image_height)
    n_rows_bottom = round(fractions[1] * image_height)

    # Boxes are [left,top,right,bottom], with 0,0 at the upper-left of the image
    boxes = {
        'top': [0,0,image_width,n_rows_top],
        'bottom': [0,image_height-n_rows_bottom,image_width,image_height]
    }

    return {location:image.crop(box) for location,box in boxes.items()}
155
+
156
+ # ...def make_rough_crops(...)
157
+
158
+
159
def crop_to_solid_region(rough_crop,crop_location,options=None):
    """
    Given a rough crop from the top or bottom of an image, find the background color
    and crop to the metadata region.

    rough_crop should be a PIL Image, crop_location should be 'top' or 'bottom'.

    Within a region of an image (typically a crop from the top-ish or bottom-ish part of
    an image), tightly crop to the solid portion (typically a region with a black background).

    The success metric is just a binary indicator right now: 1.0 if we found a region we believe
    contains a solid background, 0.0 otherwise.

    Returns a dict with fields:

    crop_pil: the tightly-cropped region as a PIL image, or None on failure

    padded_crop_pil: the same region with options.crop_padding pixels of background-colored
    border added on every side, or None on failure

    p_success: 1.0 on success, 0.0 on failure

    Raises ValueError if crop_location is not 'top' or 'bottom'.
    """

    if options is None:
        options = DatetimeExtractionOptions()

    # This is what we return on any failure path
    crop_to_solid_region_result = {
        'crop_pil':None,
        'padded_crop_pil':None,
        'p_success':0.0
    }

    # pil --> cv2 (i.e., reverse the channel order)
    rough_crop_np = np.array(rough_crop)
    rough_crop_np = rough_crop_np[:, :, ::-1].copy()

    if crop_location not in ('top','bottom'):
        raise ValueError('Unrecognized crop location: {}'.format(crop_location))

    # Nothing to analyze in an empty crop (e.g. a very small image)
    if rough_crop_np.shape[0] == 0 or rough_crop_np.shape[1] == 0:
        return crop_to_solid_region_result

    # Search *part* of the crop for the background value (the part closest to the top or bottom
    # of the image).
    #
    # Always use at least one row: zero rows would give an empty slice for 'top', and
    # [-0:] would silently select the *whole* crop for 'bottom'.
    rows_to_use_for_background_search = max(1,int(rough_crop_np.shape[0] * \
        options.background_crop_fraction_of_rough_crop))

    if crop_location == 'top':
        background_search_image = rough_crop_np[0:rows_to_use_for_background_search,:,:]
    else:
        background_search_image = rough_crop_np[-rows_to_use_for_background_search:,:,:]

    # The background value is the most common (lightly blurred) grayscale value
    background_search_image = cv2.cvtColor(background_search_image, cv2.COLOR_BGR2GRAY)
    background_search_image = background_search_image.astype('uint8')
    background_search_image = cv2.medianBlur(background_search_image,3)
    pixel_values = background_search_image.flatten()
    counts = np.bincount(pixel_values)
    background_value = int(np.argmax(counts))

    # Did we find a sensible mode that looks like a background value?
    background_value_count = int(np.max(counts))
    p_background_value = background_value_count / np.sum(counts)

    if (p_background_value < options.min_background_fraction):
        return crop_to_solid_region_result

    p_success = 1.0

    analysis_image = cv2.cvtColor(rough_crop_np, cv2.COLOR_BGR2GRAY)
    analysis_image = analysis_image.astype('uint8')
    analysis_image = cv2.medianBlur(analysis_image,3)

    # This will now be a binary image indicating which pixels are background
    analysis_image = cv2.inRange(analysis_image,
                                 background_value-options.background_tolerance,
                                 background_value+options.background_tolerance)

    # Use row heuristics to refine the crop: find the first and last rows that are
    # mostly the background color.
    w = analysis_image.shape[1]
    row_fractions = np.count_nonzero(analysis_image,axis=1) / w
    background_rows = np.flatnonzero(
        row_fractions > options.min_background_fraction_for_background_row)

    # No background rows at all
    if len(background_rows) == 0:
        return crop_to_solid_region_result

    min_y = int(background_rows[0])
    max_y = int(background_rows[-1])

    # A single background row doesn't define a region
    if max_y == min_y:
        return crop_to_solid_region_result

    # Crop the image; we keep the full width and refine only the vertical extent
    crop_np = rough_crop_np[min_y:max_y,:]

    # Tesseract doesn't like characters really close to the edge, so pad a little.
    crop_padding = options.crop_padding
    padded_crop_np = cv2.copyMakeBorder(crop_np,crop_padding,crop_padding,crop_padding,crop_padding,
                                        cv2.BORDER_CONSTANT,
                                        value=[background_value,background_value,background_value])

    # cv2 --> pil; undo the channel reversal we did on the way in, otherwise the
    # returned images have red and blue swapped
    crop_pil = Image.fromarray(np.ascontiguousarray(crop_np[:, :, ::-1]))
    padded_crop_pil = Image.fromarray(np.ascontiguousarray(padded_crop_np[:, :, ::-1]))

    crop_to_solid_region_result['crop_pil'] = crop_pil
    crop_to_solid_region_result['padded_crop_pil'] = padded_crop_pil
    crop_to_solid_region_result['p_success'] = p_success

    return crop_to_solid_region_result
280
+
281
+ # ...crop_to_solid_region(...)
282
+
283
+
284
def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
    """
    Run OCR on the 'top' and 'bottom' images in the dict [rough_crops], which should
    already be small strips from the top and bottom of an image.

    Each strip is first tightened with crop_to_solid_region(); if that fails, OCR runs
    on the whole strip.  If [tesseract_config_string] is None, the first entry in
    options.tesseract_config_strings is used.

    Returns a dict with keys 'top' and 'bottom', each mapping to a dict with fields:

    text: the OCR'd text, with newlines replaced by spaces

    crop_to_solid_region_results: the dict returned by crop_to_solid_region()
    """

    if options is None:
        options = DatetimeExtractionOptions()

    if tesseract_config_string is None:
        tesseract_config_string = options.tesseract_config_strings[0]

    results_by_location = {}

    for crop_location in ('top','bottom'):

        location_results = {'text':'','crop_to_solid_region_results':None}
        results_by_location[crop_location] = location_results

        rough_crop = rough_crops[crop_location]

        # Look for the solid-background portion of this strip
        solid_region = crop_to_solid_region(rough_crop,crop_location,options)
        location_results['crop_to_solid_region_results'] = solid_region

        # Prefer the tight, padded crop; fall back to the whole strip if we couldn't
        # find a solid region.
        found_solid_region = \
            (solid_region['p_success'] >= options.p_crop_success_threshold)
        ocr_input = solid_region['padded_crop_pil'] if found_solid_region else rough_crop

        if options.apply_sharpening_filter:
            ocr_input = ocr_input.filter(ImageFilter.SHARPEN)

        # Run tesseract on whatever image we ended up with
        pytesseract.pytesseract.tesseract_cmd = options.tesseract_cmd
        raw_text = pytesseract.image_to_string(ocr_input, lang='eng',
                                               config=tesseract_config_string)

        location_results['text'] = raw_text.replace('\n', ' ').replace('\r', '').strip()

    # ...for each cropped region

    return results_by_location
337
+
338
+ # ...def find_text_in_crops(...)
339
+
340
+
341
def datetime_string_to_datetime(matched_string):
    """
    Convert an OCR-matched datetime string to a datetime object.

    The only cleanup performed is removing a space immediately before or after a
    hyphen or colon; the string is otherwise expected to already be a well-formed
    date string.

    Returns the parsed datetime, or None if parsing fails.
    """

    # Collapse "12 : 30" --> "12:30" and "2013 - 07" --> "2013-07"
    separator_fixes = ((' -','-'),('- ','-'),(' :',':'),(': ',':'))
    for spaced_separator,tight_separator in separator_fixes:
        matched_string = matched_string.replace(spaced_separator,tight_separator)

    try:
        return dateparse(matched_string)
    except Exception:
        return None
359
+
360
+
361
# Datetime patterns, tried in order; patterns with an AM/PM suffix come first, so a
# trailing "pm" is never dropped by one of the suffix-less patterns.  All matching is
# done against lower-cased text.
_DATETIME_PATTERNS = tuple(re.compile(p) for p in (

    ### AM/PM

    # 2013-10-02 11:40:50 AM
    r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([ap]m)',

    # 04/01/2017 08:54:00AM
    r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([ap]m)',

    # 2017/04/01 08:54:00AM
    r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([ap]m)',

    # 04/01/2017 08:54AM
    r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([ap]m)',

    # 2017/04/01 08:54AM
    r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([ap]m)',

    ### No AM/PM

    # 2013-07-27 04:56:35
    r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',

    # 07-27-2013 04:56:35
    r'(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',

    # 2013/07/27 04:56:35
    r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',

    # 07/27/2013 04:56:35
    r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',
))


def get_datetime_from_strings(strings,options=None):
    """
    Given a string or list of strings, search for a datetime in those strings, using
    a series of regular expressions.

    Strings are currently just concatenated (with spaces) before searching for a datetime.
    The text is lower-cased, em dashes are converted to hyphens, and everything other
    than letters, digits, whitespace, and the characters ":-/" is removed before matching.

    [options] is currently unused; it is accepted for consistency with the other
    functions in this module.

    Returns the datetime parsed from the first pattern that matches (which is None if
    that match can't be parsed), or None if no pattern matches.
    """

    if isinstance(strings,str):
        s = strings
    else:
        s = ' '.join(strings)

    # Lower-case in both cases; the AM/PM patterns only match lower-case suffixes
    s = s.lower()
    s = s.replace('—','-')
    s = ''.join(e for e in s if e.isalnum() or e in ':-/' or e.isspace())

    for pattern in _DATETIME_PATTERNS:
        m = pattern.search(s)
        if m is not None:
            return datetime_string_to_datetime(m.group(0))

    return None
429
+
430
+ # ...def get_datetime_from_strings(...)
431
+
432
+
433
def get_datetime_from_image(image,include_crops=True,options=None):
    """
    Find the datetime string (if present) in [image], which can be a PIL image or a
    filename.

    The top and bottom of the image are cropped once, then OCR is run with each string in
    options.tesseract_config_strings, in order, stopping at the first one that yields a
    datetime unless options.force_all_ocr_options is True.

    [options] can be None or a single DatetimeExtractionOptions object; use
    try_get_datetime_from_image() to try a list of option sets.

    Returns a dict:

    datetime: Python datetime object (the first one successfully extracted), or None

    text_results: list with one element per OCR configuration that was run; each element
    is a length-2 list of strings ([top text, bottom text])

    all_extracted_datetimes: dict mapping each OCR configuration string that was run to the
    datetime (or None) extracted with that configuration

    ocr_results: list of detailed results from the OCR process (one per configuration that
    was run), including crops as PIL images; None if include_crops is False.

    Raises AssertionError if the extracted year is earlier than 1990 or later than the
    current year.
    """

    if options is None:
        options = DatetimeExtractionOptions()

    if isinstance(image,str):
        image = vis_utils.open_image(image)

    # Crop the top and bottom from the image
    rough_crops = make_rough_crops(image,options)
    assert len(rough_crops) == 2

    all_extracted_datetimes = {}
    all_text_results = []
    all_ocr_results = []

    extracted_datetime = None

    # Find text, possibly trying all config strings
    for tesseract_config_string in options.tesseract_config_strings:

        ocr_results = find_text_in_crops(rough_crops,options,tesseract_config_string)
        all_ocr_results.append(ocr_results)

        text_results = [v['text'] for v in ocr_results.values()]
        assert len(text_results) == 2
        all_text_results.append(text_results)

        # Find datetime
        extracted_datetime_this_option_set = get_datetime_from_strings(text_results,options)
        assert isinstance(extracted_datetime_this_option_set,datetime.datetime) or \
            (extracted_datetime_this_option_set is None)

        all_extracted_datetimes[tesseract_config_string] = \
            extracted_datetime_this_option_set

        if extracted_datetime_this_option_set is not None:
            # The first successful configuration determines the returned datetime
            if extracted_datetime is None:
                extracted_datetime = extracted_datetime_this_option_set
            if not options.force_all_ocr_options:
                break

    # ...for each set of OCR options

    # Sanity-check the year; the upper bound is the current year rather than a fixed
    # constant, so this check doesn't start rejecting valid images as time passes.
    if extracted_datetime is not None:
        latest_plausible_year = datetime.datetime.now().year
        assert 1990 <= extracted_datetime.year <= latest_plausible_year, \
            'Implausible year {} extracted from image'.format(extracted_datetime.year)

    to_return = {}
    to_return['datetime'] = extracted_datetime

    to_return['text_results'] = all_text_results
    to_return['all_extracted_datetimes'] = all_extracted_datetimes

    if include_crops:
        to_return['ocr_results'] = all_ocr_results
    else:
        to_return['ocr_results'] = None

    return to_return
511
+
512
+ # ...def get_datetime_from_image(...)
513
+
514
+
515
def is_iterable(x):
    """
    Returns True if iter(x) succeeds, i.e. if [x] can be used in a for loop, otherwise
    False.  Note that strings are iterable.
    """

    # iter() raises TypeError for non-iterable objects; catching only that avoids
    # swallowing KeyboardInterrupt/SystemExit the way a bare "except" would.
    try:
        iter(x)
    except TypeError:
        return False
    return True
521
+
522
+
523
def try_get_datetime_from_image(filename,include_crops=False,options=None):
    """
    Try/catch wrapper for get_datetime_from_image, optionally trying multiple option sets
    until we find a datetime.

    [options] can be None, a DatetimeExtractionOptions object, or an iterable of
    DatetimeExtractionOptions objects, which are tried in order until one yields a
    datetime.

    Returns the dict from the last call to get_datetime_from_image() that completed
    without raising, with two additional fields:

    options_index: index of the option set that produced this result

    error: None, or the message from the exception raised by the most recent option
    set that failed after this result was produced

    If every option set raises (or [options] is empty), the returned dict contains
    only 'error'.
    """

    if options is None:
        options = DatetimeExtractionOptions()

    if not is_iterable(options):
        options = [options]

    result = {}
    result['error'] = None

    for i_option_set,current_options in enumerate(options):
        try:
            result = get_datetime_from_image(filename,include_crops=include_crops,options=current_options)
        except Exception as e:
            result['error'] = str(e)
            continue

        result['options_index'] = i_option_set

        # Make sure 'error' is present on success too, so callers can always read it
        result['error'] = None

        if 'datetime' in result and result['datetime'] is not None:
            break

    return result
548
+
549
+
550
def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options=None,
                             n_workers=16,use_threads=False):
    """
    Retrieve metadata from every image in [folder_name] (searched recursively), and
    optionally write the results to the .json file [output_file].

    If [n_to_sample] is positive, only a random sample of (at most) that many images
    is processed; the sample is drawn with a fixed seed.

    [options] can be None, a DatetimeExtractionOptions object, or a list of
    DatetimeExtractionOptions objects to try for each image.

    [n_workers] is the maximum number of parallel workers; we never use more workers than
    images, and process everything in the calling process if that leaves one worker or
    fewer.  Workers are threads if [use_threads] is True, otherwise processes.

    Returns a dict mapping filenames to datetime extraction results.
    """

    if options is None:
        options = DatetimeExtractionOptions()

    image_file_names = \
        find_images(folder_name,convert_slashes=True,
                    return_relative_paths=False,recursive=True)

    if n_to_sample > 0:
        import random
        random.seed(0)
        # random.sample() raises if asked for more items than are available
        n_to_sample = min(n_to_sample,len(image_file_names))
        image_file_names = random.sample(image_file_names,n_to_sample)

    # Don't spawn more than one worker per image
    n_workers = min(n_workers,len(image_file_names))

    if n_workers <= 1:

        all_results = []
        for fn_abs in tqdm(image_file_names):
            all_results.append(try_get_datetime_from_image(fn_abs,options=options))

    else:

        if use_threads:
            from multiprocessing.pool import ThreadPool
            pool = ThreadPool(n_workers)
            worker_string = 'threads'
        else:
            from multiprocessing.pool import Pool
            pool = Pool(n_workers)
            worker_string = 'processes'

        print('Starting a pool of {} {}'.format(n_workers,worker_string))

        # Make sure workers are shut down even if processing fails
        try:
            all_results = list(tqdm(pool.imap(
                partial(try_get_datetime_from_image,options=options),image_file_names),
                total=len(image_file_names)))
        finally:
            pool.close()
            pool.join()

    # imap() preserves input order, so results line up with file names
    filename_to_results = dict(zip(image_file_names,all_results))

    if output_file is not None:
        with open(output_file,'w') as f:
            # default=str is what serializes datetime objects
            json.dump(filename_to_results,f,indent=1,default=str)

    return filename_to_results
615
+
616
+
617
#%% Interactive driver

# Notebook-style scratch cells for exercising the OCR datetime extraction
# functions defined earlier in this module.  Everything lives under
# "if False:" so that importing the module never runs any of it; the cells
# are meant to be executed one at a time in an IDE.  Cells share state
# through plain variables (folder_name, filename_to_results, s, ...), so
# they are expected to be run roughly in order.

if False:

    #%% Process images

    # Hard-coded local paths; edit before running
    folder_name = r'g:\temp\island_conservation_camera_traps'
    # folder_name = r'g:\camera_traps\camera_trap_images'
    output_file = r'g:\temp\ocr_results.json'
    from md_utils.path_utils import insert_before_extension
    output_file = insert_before_extension(output_file)
    n_to_sample = -1
    assert os.path.isdir(folder_name)

    # Two option sets that differ only in image_crop_fraction; only options_a
    # is actually passed to get_datetimes_for_folder() below.
    options_a = DatetimeExtractionOptions()
    options_b = DatetimeExtractionOptions()
    options_b.image_crop_fraction = [0.08 , 0.08]
    options_a.force_all_ocr_options = False
    options_b.force_all_ocr_options = False
    # all_options = [options_a,options_b]
    all_options = [options_a]
    filename_to_results = get_datetimes_for_folder(folder_name,output_file,
                                                   n_to_sample=n_to_sample,
                                                   options=all_options)


    #%% Load results

    # output_file = r"G:\temp\ocr_results.2023.10.31.07.37.54.json"
    with open(output_file,'r') as f:
        filename_to_results = json.load(f)
    filenames = sorted(filename_to_results)
    print('Loaded results for {} files'.format(len(filename_to_results)))


    #%% Scrap cell

    # Run datetime extraction on a single hand-picked image
    fn = 'g:/camera_traps/camera_trap_images/2018.07.02/newcam/people/DSCF0273.JPG'
    # fn = r'g:\camera_traps\camera_trap_images\2022.01.29\cam0\coyote\DSCF0057.JPG'
    # fn = 'g:/temp/island_conservation_camera_traps/chile/frances01/frances012013/chile_frances012013_02012013105658.jpg'
    # fn = 'g:/temp/island_conservation_camera_traps/dominicanrepublic/camara06/cam0618junio2016/dominicanrepublic_cam0618junio2016_20160614_114115_img_0013.jpg'
    # fn = os.path.join(folder_name,r'dominicanrepublic\camara22\cam228noviembre2015\dominicanrepublic_cam228noviembre2015_20151105_071226_img_0132.jpg')
    # fn = 'g:/camera_traps/camera_trap_images/2021.06.06/camera01/empty/DSCF0873.JPG'
    include_crops = False
    options_a = DatetimeExtractionOptions()
    options_b = DatetimeExtractionOptions()
    options_b.image_crop_fraction = [0.08 , 0.08]
    image = vis_utils.open_image(fn) # noqa
    result = try_get_datetime_from_image(fn,options=[options_a,options_b]) # noqa
    print(result)

    # open_file(fn)
    # rough_crops = make_rough_crops(image,options=options)


    #%% Look for OCR or parsing failures

    # Substrings that mark OCR text as a known-bad case to skip (none yet)
    bad_tokens = ()

    # Files where at least one non-None per-configuration datetime differs
    # from the final extracted datetime
    files_with_disagreements = set()

    # i_fn = 0; fn = filenames[i_fn]
    for i_fn,fn in enumerate(filenames):

        # Not used below; kept so the variable is set when stepping through
        # this loop interactively.
        image = fn
        results = filename_to_results[fn]

        # Every entry is expected to carry OCR text; treat a missing entry
        # as a hard error.
        if 'text_results' not in results:
            raise Exception('no results available for {} ({})'.format(i_fn,fn))

        # Each text result is indexed at [0] to get its string; join them
        # into one searchable string.  "s" is also read by the last cell.
        s = ' '.join([x[0] for x in results['text_results']])

        if any(bad_token in s for bad_token in bad_tokens):
            continue

        extracted_datetime = results['datetime']

        # If we have a datetime, make sure all successful OCR results agree
        if extracted_datetime is not None:
            for config_datetime in results['all_extracted_datetimes'].values():
                if (config_datetime is not None) and \
                   (config_datetime != extracted_datetime):
                    files_with_disagreements.add(fn)
        else:
            # No stored datetime: re-run extraction directly on the file
            print('Falling back for {} ({})'.format(i_fn,fn))
            ocr_results = get_datetime_from_image(fn)
            extracted_datetime = ocr_results['datetime']

        if extracted_datetime is None:
            print('Failure at {}: {}'.format(i_fn,s))

        # open_file(fn)
        # get_datetime_from_image(fn)

    # ...for each file


    #%% Write results to an HTML file for testing

    # Optionally downsample to a fixed-size, reproducible (seeded) subset
    n_to_sample = 5000
    if (n_to_sample >= 0) and (len(filename_to_results) > n_to_sample):
        filenames = sorted(filename_to_results)
        import random
        random.seed(0)
        keys = random.sample(filenames,n_to_sample)
        filename_to_results = {k: filename_to_results[k] for k in keys}

    preview_dir = r'g:\temp\ocr-preview'
    os.makedirs(preview_dir,exist_ok=True)

    def resize_image_for_preview(fn_abs):
        """
        Write a 600-pixel-wide copy of [fn_abs] under preview_dir, at the same
        path relative to preview_dir that [fn_abs] has relative to folder_name.

        Returns the filename of the resized copy.
        """

        fn_relative = os.path.relpath(fn_abs,folder_name)
        resized_image = vis_utils.resize_image(fn_abs,target_width=600)
        resized_fn = os.path.join(preview_dir,fn_relative)
        os.makedirs(os.path.dirname(resized_fn),exist_ok=True)
        resized_image.save(resized_fn)
        return resized_fn

    # Resize images in parallel
    n_rendering_workers = 16

    if n_rendering_workers <= 1:
        for fn_abs in tqdm(filename_to_results.keys()):
            resize_image_for_preview(fn_abs)
    else:
        # from multiprocessing.pool import Pool as RenderingPool; worker_string = 'processes'
        from multiprocessing.pool import ThreadPool as RenderingPool
        worker_string = 'threads'
        pool = RenderingPool(n_rendering_workers)

        print('Starting rendering pool with {} {}'.format(n_rendering_workers,worker_string))

        # Always release the workers, even if a resize raises
        try:
            _ = list(tqdm(pool.imap(resize_image_for_preview,filename_to_results.keys()),
                          total=len(filename_to_results)))
        finally:
            pool.close()
            pool.join()


    def make_datetime_preview_page(filenames,html_file):
        """
        Write an HTML page to [html_file] listing each image in [filenames]
        with its extracted datetime (read from filename_to_results), then
        open the page via open_file().

        Image paths are stored relative to folder_name.

        Returns the list of {'filename','title'} dicts used to build the page.
        """

        html_image_list = []

        # Called with no arguments to get an options dict to customize
        # (presumably the defaults; confirm against write_html_image_list)
        html_options = write_html_image_list.write_html_image_list()
        html_options['maxFiguresPerHtmlFile'] = 2500
        html_options['defaultImageStyle'] = 'margin:0px;margin-top:5px;margin-bottom:30px;'

        # fn_abs = filenames[0]
        for fn_abs in filenames:

            fn_relative = os.path.relpath(fn_abs,folder_name)
            # resized_fn = os.path.join(preview_dir,fn_relative)
            results_this_image = filename_to_results[fn_abs]

            extracted_datetime = results_this_image['datetime']
            title = 'Image: {}<br/>Extracted datetime: {}'.format(fn_relative,extracted_datetime)
            html_image_list.append({'filename':fn_relative,'title':title})

        # ...for each image

        html_options['makeRelative'] = True
        write_html_image_list.write_html_image_list(html_file,
                                                    html_image_list,
                                                    html_options)
        open_file(html_file)
        return html_image_list

    # Images for which no datetime could be extracted
    failed_files = [fn_abs for fn_abs in filename_to_results
                    if filename_to_results[fn_abs]['datetime'] is None]

    print('Found {} failures'.format(len(failed_files)))

    output_summary_file = os.path.join(preview_dir,'summary.html')
    html_image_list = make_datetime_preview_page(sorted(filename_to_results),
                                                 output_summary_file)

    failure_summary_file = os.path.join(preview_dir,'failures.html')
    html_image_list_failures = make_datetime_preview_page(failed_files,failure_summary_file)

    # Convenience assignments for stepping through make_datetime_preview_page
    # by hand on the failure set
    filenames = failed_files
    html_file = failure_summary_file


    #%% Other approaches to getting dates from strings

    # ...that didn't really work out.

    # This cell parses "s", the OCR text left over from the last iteration of
    # the failure-checking loop above.

    # pip install dateparser
    import dateparser

    # pip install datefinder
    import datefinder

    # Importing the submodule is what makes dateparser.search available below
    from dateparser.search import search_dates # noqa

    dateparser_settings = {'PREFER_DATES_FROM':'past','STRICT_PARSING':True}

    dateparser_result = dateparser.search.search_dates(s, settings=dateparser_settings)

    if dateparser_result is not None:
        # Each result is indexed at [1] to get the parsed datetime
        assert len(dateparser_result) == 1
        extracted_datetime = dateparser_result[0][1]
    else:
        # Fall back to datefinder; accept only an unambiguous single match
        matches_list = list(datefinder.find_dates(s,strict=False))
        if len(matches_list) == 1:
            extracted_datetime = matches_list[0]
        else:
            extracted_datetime = None

    if extracted_datetime is not None:
        assert 1990 <= extracted_datetime.year <= 2023