megadetector 5.0.28-py3-none-any.whl → 5.0.29-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (176)
  1. megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
  2. megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
  3. megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
  4. megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
  5. megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
  6. megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
  7. megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
  8. megadetector/classification/aggregate_classifier_probs.py +3 -3
  9. megadetector/classification/analyze_failed_images.py +5 -5
  10. megadetector/classification/cache_batchapi_outputs.py +5 -5
  11. megadetector/classification/create_classification_dataset.py +11 -12
  12. megadetector/classification/crop_detections.py +10 -10
  13. megadetector/classification/csv_to_json.py +8 -8
  14. megadetector/classification/detect_and_crop.py +13 -15
  15. megadetector/classification/evaluate_model.py +7 -7
  16. megadetector/classification/identify_mislabeled_candidates.py +6 -6
  17. megadetector/classification/json_to_azcopy_list.py +1 -1
  18. megadetector/classification/json_validator.py +29 -32
  19. megadetector/classification/map_classification_categories.py +9 -9
  20. megadetector/classification/merge_classification_detection_output.py +12 -9
  21. megadetector/classification/prepare_classification_script.py +19 -19
  22. megadetector/classification/prepare_classification_script_mc.py +23 -23
  23. megadetector/classification/run_classifier.py +4 -4
  24. megadetector/classification/save_mislabeled.py +6 -6
  25. megadetector/classification/train_classifier.py +1 -1
  26. megadetector/classification/train_classifier_tf.py +9 -9
  27. megadetector/classification/train_utils.py +10 -10
  28. megadetector/data_management/annotations/annotation_constants.py +1 -1
  29. megadetector/data_management/camtrap_dp_to_coco.py +45 -45
  30. megadetector/data_management/cct_json_utils.py +101 -101
  31. megadetector/data_management/cct_to_md.py +49 -49
  32. megadetector/data_management/cct_to_wi.py +33 -33
  33. megadetector/data_management/coco_to_labelme.py +75 -75
  34. megadetector/data_management/coco_to_yolo.py +189 -189
  35. megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
  36. megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
  37. megadetector/data_management/databases/integrity_check_json_db.py +202 -188
  38. megadetector/data_management/databases/subset_json_db.py +33 -33
  39. megadetector/data_management/generate_crops_from_cct.py +38 -38
  40. megadetector/data_management/get_image_sizes.py +54 -49
  41. megadetector/data_management/labelme_to_coco.py +130 -124
  42. megadetector/data_management/labelme_to_yolo.py +78 -72
  43. megadetector/data_management/lila/create_lila_blank_set.py +81 -83
  44. megadetector/data_management/lila/create_lila_test_set.py +32 -31
  45. megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
  46. megadetector/data_management/lila/download_lila_subset.py +21 -24
  47. megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
  48. megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
  49. megadetector/data_management/lila/get_lila_image_counts.py +22 -22
  50. megadetector/data_management/lila/lila_common.py +70 -70
  51. megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
  52. megadetector/data_management/mewc_to_md.py +339 -340
  53. megadetector/data_management/ocr_tools.py +258 -252
  54. megadetector/data_management/read_exif.py +231 -224
  55. megadetector/data_management/remap_coco_categories.py +26 -26
  56. megadetector/data_management/remove_exif.py +31 -20
  57. megadetector/data_management/rename_images.py +187 -187
  58. megadetector/data_management/resize_coco_dataset.py +41 -41
  59. megadetector/data_management/speciesnet_to_md.py +41 -41
  60. megadetector/data_management/wi_download_csv_to_coco.py +55 -55
  61. megadetector/data_management/yolo_output_to_md_output.py +117 -120
  62. megadetector/data_management/yolo_to_coco.py +195 -188
  63. megadetector/detection/change_detection.py +831 -0
  64. megadetector/detection/process_video.py +340 -337
  65. megadetector/detection/pytorch_detector.py +304 -262
  66. megadetector/detection/run_detector.py +177 -164
  67. megadetector/detection/run_detector_batch.py +364 -363
  68. megadetector/detection/run_inference_with_yolov5_val.py +328 -325
  69. megadetector/detection/run_tiled_inference.py +256 -249
  70. megadetector/detection/tf_detector.py +24 -24
  71. megadetector/detection/video_utils.py +290 -282
  72. megadetector/postprocessing/add_max_conf.py +15 -11
  73. megadetector/postprocessing/categorize_detections_by_size.py +44 -44
  74. megadetector/postprocessing/classification_postprocessing.py +415 -415
  75. megadetector/postprocessing/combine_batch_outputs.py +20 -21
  76. megadetector/postprocessing/compare_batch_results.py +528 -517
  77. megadetector/postprocessing/convert_output_format.py +97 -97
  78. megadetector/postprocessing/create_crop_folder.py +219 -146
  79. megadetector/postprocessing/detector_calibration.py +173 -168
  80. megadetector/postprocessing/generate_csv_report.py +508 -499
  81. megadetector/postprocessing/load_api_results.py +23 -20
  82. megadetector/postprocessing/md_to_coco.py +129 -98
  83. megadetector/postprocessing/md_to_labelme.py +89 -83
  84. megadetector/postprocessing/md_to_wi.py +40 -40
  85. megadetector/postprocessing/merge_detections.py +87 -114
  86. megadetector/postprocessing/postprocess_batch_results.py +313 -298
  87. megadetector/postprocessing/remap_detection_categories.py +36 -36
  88. megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
  89. megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
  90. megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
  91. megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
  92. megadetector/postprocessing/separate_detections_into_folders.py +226 -211
  93. megadetector/postprocessing/subset_json_detector_output.py +265 -262
  94. megadetector/postprocessing/top_folders_to_bottom.py +45 -45
  95. megadetector/postprocessing/validate_batch_results.py +70 -70
  96. megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
  97. megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
  98. megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
  99. megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -66
  100. megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
  101. megadetector/taxonomy_mapping/simple_image_download.py +8 -8
  102. megadetector/taxonomy_mapping/species_lookup.py +33 -33
  103. megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
  104. megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
  105. megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
  106. megadetector/utils/azure_utils.py +22 -22
  107. megadetector/utils/ct_utils.py +1018 -200
  108. megadetector/utils/directory_listing.py +21 -77
  109. megadetector/utils/gpu_test.py +22 -22
  110. megadetector/utils/md_tests.py +541 -518
  111. megadetector/utils/path_utils.py +1457 -398
  112. megadetector/utils/process_utils.py +41 -41
  113. megadetector/utils/sas_blob_utils.py +53 -49
  114. megadetector/utils/split_locations_into_train_val.py +61 -61
  115. megadetector/utils/string_utils.py +147 -26
  116. megadetector/utils/url_utils.py +463 -173
  117. megadetector/utils/wi_utils.py +2629 -2526
  118. megadetector/utils/write_html_image_list.py +137 -137
  119. megadetector/visualization/plot_utils.py +21 -21
  120. megadetector/visualization/render_images_with_thumbnails.py +37 -73
  121. megadetector/visualization/visualization_utils.py +401 -397
  122. megadetector/visualization/visualize_db.py +197 -190
  123. megadetector/visualization/visualize_detector_output.py +79 -73
  124. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/METADATA +135 -132
  125. megadetector-5.0.29.dist-info/RECORD +163 -0
  126. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
  127. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
  128. {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
  129. megadetector/data_management/importers/add_nacti_sizes.py +0 -52
  130. megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
  131. megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
  132. megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
  133. megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
  134. megadetector/data_management/importers/awc_to_json.py +0 -191
  135. megadetector/data_management/importers/bellevue_to_json.py +0 -272
  136. megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
  137. megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
  138. megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
  139. megadetector/data_management/importers/cct_field_adjustments.py +0 -58
  140. megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
  141. megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
  142. megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
  143. megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
  144. megadetector/data_management/importers/ena24_to_json.py +0 -276
  145. megadetector/data_management/importers/filenames_to_json.py +0 -386
  146. megadetector/data_management/importers/helena_to_cct.py +0 -283
  147. megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
  148. megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
  149. megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
  150. megadetector/data_management/importers/jb_csv_to_json.py +0 -150
  151. megadetector/data_management/importers/mcgill_to_json.py +0 -250
  152. megadetector/data_management/importers/missouri_to_json.py +0 -490
  153. megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
  154. megadetector/data_management/importers/noaa_seals_2019.py +0 -181
  155. megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
  156. megadetector/data_management/importers/pc_to_json.py +0 -365
  157. megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
  158. megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
  159. megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
  160. megadetector/data_management/importers/rspb_to_json.py +0 -356
  161. megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
  162. megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
  163. megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
  164. megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
  165. megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
  166. megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
  167. megadetector/data_management/importers/sulross_get_exif.py +0 -65
  168. megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
  169. megadetector/data_management/importers/ubc_to_json.py +0 -399
  170. megadetector/data_management/importers/umn_to_json.py +0 -507
  171. megadetector/data_management/importers/wellington_to_json.py +0 -263
  172. megadetector/data_management/importers/wi_to_json.py +0 -442
  173. megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
  174. megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
  175. megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
  176. megadetector-5.0.28.dist-info/RECORD +0 -209
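
The remainder of this page is the file-level diff for megadetector/data_management/ocr_tools.py (item 53 in the list above). As an aside, a diff like this can be reproduced locally; the sketch below is not part of the package, and the wheel filenames are assumptions (both wheels must already be downloaded, e.g. via "pip download megadetector==5.0.28 --no-deps"):

import difflib
import zipfile

def diff_wheel_member(old_wheel, new_wheel, member):
    # A .whl is just a zip archive, so read one member file out of each wheel
    with zipfile.ZipFile(old_wheel) as za, zipfile.ZipFile(new_wheel) as zb:
        old_lines = za.read(member).decode('utf-8').splitlines(keepends=True)
        new_lines = zb.read(member).decode('utf-8').splitlines(keepends=True)
    # Print a unified diff, the same shape as the hunks shown below
    for line in difflib.unified_diff(old_lines, new_lines, fromfile=member, tofile=member):
        print(line, end='')

diff_wheel_member('megadetector-5.0.28-py3-none-any.whl',
                  'megadetector-5.0.29-py3-none-any.whl',
                  'megadetector/data_management/ocr_tools.py')
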
megadetector/data_management/ocr_tools.py

@@ -22,11 +22,11 @@ Prior to using this module:
 * Install Tesseract from https://tesseract-ocr.github.io/tessdoc/Installation.html
 
 * pip install pytesseract
-
+
 Known limitations:
 
 * Semi-transparent overlays (which I've only seen on consumer cameras) usually fail.
-
+
 """
 
 #%% Notes to self
@@ -34,9 +34,9 @@ Known limitations:
 """
 
 * To use the legacy engine (--oem 0), I had to download an updated eng.traineddata file from:
-
+
   https://github.com/tesseract-ocr/tessdata
-
+
 """
 
 #%% Constants and imports
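
The two hunks above only touch trailing whitespace in the module docstring, but that docstring documents the module's prerequisites (a Tesseract install plus pytesseract). A quick sanity check of those prerequisites, as a sketch rather than package code:

import pytesseract

try:
    # Succeeds only if pytesseract can find the tesseract binary on the path
    print('Found tesseract version {}'.format(pytesseract.get_tesseract_version()))
except pytesseract.TesseractNotFoundError:
    # If tesseract is installed but not on the path, point pytesseract at it,
    # mirroring options.tesseract_cmd in the class below, e.g.:
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    print('tesseract binary not found')
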
@@ -56,7 +56,7 @@ from tqdm import tqdm
 
 from megadetector.utils.path_utils import find_images
 from megadetector.utils.path_utils import open_file
-from megadetector.utils import write_html_image_list
+from megadetector.utils import write_html_image_list
 from megadetector.utils.ct_utils import is_iterable
 from megadetector.visualization import visualization_utils as vis_utils
 
@@ -73,40 +73,40 @@ class DatetimeExtractionOptions:
     """
     Options used to parameterize datetime extraction in most functions in this module.
     """
-
+
     def __init__(self):
-
-        #: Using a semi-arbitrary metric of how much it feels like we found the
+
+        #: Using a semi-arbitrary metric of how much it feels like we found the
         #: text-containing region, discard regions that appear to be extraction failures
         self.p_crop_success_threshold = 0.5
-
+
         #: Pad each crop with a few pixels to make tesseract happy
-        self.crop_padding = 10
-
+        self.crop_padding = 10
+
         #: Discard short text, typically text from the top of the image
         self.min_text_length = 4
-
-        #: When we're looking for pixels that match the background color, allow some
+
+        #: When we're looking for pixels that match the background color, allow some
         #: tolerance around the dominant color
         self.background_tolerance = 2
-
-        #: We need to see a consistent color in at least this fraction of pixels in our rough
+
+        #: We need to see a consistent color in at least this fraction of pixels in our rough
         #: crop to believe that we actually found a candidate metadata region.
         self.min_background_fraction = 0.3
-
+
         #: What fraction of the [top,bottom] of the image should we use for our rough crop?
         self.image_crop_fraction = [0.045 , 0.045]
         # self.image_crop_fraction = [0.08 , 0.08]
-
+
         #: Within that rough crop, how much should we use for determining the background color?
         self.background_crop_fraction_of_rough_crop = 0.5
-
+
         #: A row is considered a probable metadata row if it contains at least this fraction
-        #: of the background color. This is used only to find the top and bottom of the crop area,
+        #: of the background color. This is used only to find the top and bottom of the crop area,
         #: so it's not that *every* row needs to hit this criteria, only the rows that are generally
         #: above and below the text.
         self.min_background_fraction_for_background_row = 0.5
-
+
         #: psm 6: "assume a single uniform block of text"
         #: psm 13: raw line
         #: oem: 0 == legacy, 1 == lstm
@@ -115,14 +115,14 @@ class DatetimeExtractionOptions:
         #: Try these configuration strings in order until we find a valid datetime
         self.tesseract_config_strings = ['--oem 1 --psm 13','--oem 0 --psm 13',
                                          '--oem 1 --psm 6','--oem 0 --psm 6']
-
+
         #: If this is False, and one set of options appears to succeed for an image, we'll
         #: stop there. If this is True, we always run all option sets on every image.
         self.force_all_ocr_options = False
-
+
         #: Whether to apply PIL's ImageFilter.SHARPEN prior to OCR
         self.apply_sharpening_filter = True
-
+
         #: Tesseract should be on your system path, but you can also specify the
         #: path explicitly, e.g. you can do either of these:
         #:
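
The two hunks above are whitespace-only; the option fields themselves are plain attributes, so callers tune extraction by overriding them after construction. A usage sketch built only from fields shown above:

from megadetector.data_management.ocr_tools import DatetimeExtractionOptions

options = DatetimeExtractionOptions()
options.image_crop_fraction = [0.08, 0.08]  # search a taller strip at the top and bottom
options.force_all_ocr_options = True        # run every tesseract config on every image
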
@@ -136,115 +136,115 @@ class DatetimeExtractionOptions:
 def make_rough_crops(image,options=None):
     """
     Crops the top and bottom regions out of an image.
-
+
     Args:
         image (Image or str): a PIL Image or file name
         options (DatetimeExtractionOptions, optional): OCR parameters
-
+
     Returns:
-        dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
+        dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
     """
-
+
     if options is None:
         options = DatetimeExtractionOptions()
-
+
     if isinstance(image,str):
         image = vis_utils.open_image(image)
-
+
     w = image.width
     h = image.height
-
+
     crop_height_top = round(options.image_crop_fraction[0] * h)
     crop_height_bottom = round(options.image_crop_fraction[1] * h)
-
+
     # l,t,r,b
     #
     # 0,0 is upper-left
     top_crop = image.crop([0,0,w,crop_height_top])
     bottom_crop = image.crop([0,h-crop_height_bottom,w,h])
     return {'top':top_crop,'bottom':bottom_crop}
-
+
 # ...def make_rough_crops(...)
 
 
 def crop_to_solid_region(rough_crop,crop_location,options=None):
-    """
+    """
     Given a rough crop from the top or bottom of an image, finds the background color
     and crops to the metadata region.
-
-    Within a region of an image (typically a crop from the top-ish or bottom-ish part of
+
+    Within a region of an image (typically a crop from the top-ish or bottom-ish part of
    an image), tightly crop to the solid portion (typically a region with a black background).
 
     The success metric is just a binary indicator right now: 1.0 if we found a region we believe
     contains a solid background, 0.0 otherwise.
-
+
     Args:
         rough_crop (Image): the PIL Image to crop
         crop_location (str): 'top' or 'bottom'
         options (DatetimeExtractionOptions, optional): OCR parameters
-
+
     Returns:
         tuple: a tuple containing (a cropped_image (Image), p_success (float), padded_image (Image))
     """
-
+
     if options is None:
-        options = DatetimeExtractionOptions()
+        options = DatetimeExtractionOptions()
 
     crop_to_solid_region_result = {}
     crop_to_solid_region_result['crop_pil'] = None
     crop_to_solid_region_result['padded_crop_pil'] = None
     crop_to_solid_region_result['p_success'] = 0.0
-
-    # pil --> cv2
-    rough_crop_np = np.array(rough_crop)
-    rough_crop_np = rough_crop_np[:, :, ::-1].copy()
-
+
+    # pil --> cv2
+    rough_crop_np = np.array(rough_crop)
+    rough_crop_np = rough_crop_np[:, :, ::-1].copy()
+
     # Search *part* of the crop for the background value (the part closest to the top or bottom
     # of the image)
     rows_to_use_for_background_search = int(rough_crop_np.shape[0] * \
         options.background_crop_fraction_of_rough_crop)
-
+
     if crop_location == 'top':
         background_search_image = rough_crop_np[0:rows_to_use_for_background_search,:,:]
     elif crop_location == 'bottom':
         background_search_image = rough_crop_np[-rows_to_use_for_background_search:,:,:]
     else:
         raise ValueError('Unrecognized crop location: {}'.format(crop_location))
-
+
     background_search_image = cv2.cvtColor(background_search_image, cv2.COLOR_BGR2GRAY)
-    background_search_image = background_search_image.astype('uint8')
-    background_search_image = cv2.medianBlur(background_search_image,3)
+    background_search_image = background_search_image.astype('uint8')
+    background_search_image = cv2.medianBlur(background_search_image,3)
     pixel_values = background_search_image.flatten()
     counts = np.bincount(pixel_values)
     background_value = int(np.argmax(counts))
-
+
     # Did we find a sensible mode that looks like a background value?
     background_value_count = int(np.max(counts))
     p_background_value = background_value_count / np.sum(counts)
-
+
     if (p_background_value < options.min_background_fraction):
         return crop_to_solid_region_result
     else:
         p_success = 1.0
-
+
     analysis_image = cv2.cvtColor(rough_crop_np, cv2.COLOR_BGR2GRAY)
-    analysis_image = analysis_image.astype('uint8')
-    analysis_image = cv2.medianBlur(analysis_image,3)
-
+    analysis_image = analysis_image.astype('uint8')
+    analysis_image = cv2.medianBlur(analysis_image,3)
+
     # This will now be a binary image indicating which pixels are background
     analysis_image = cv2.inRange(analysis_image,
                                  background_value-options.background_tolerance,
                                  background_value+options.background_tolerance)
-
-    # Use row heuristics to refine the crop
+
+    # Use row heuristics to refine the crop
     h = analysis_image.shape[0]
     w = analysis_image.shape[1]
-
+
     min_x = 0
     min_y = -1
     max_x = w
     max_y = -1
-
+
     # Find the first and last row that are mostly the background color
     for y in range(h):
         row_count = 0
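
The background-detection logic above reduces to: take the modal gray value of the candidate strip, then require that the mode covers at least min_background_fraction of its pixels. The same idea in isolation, on synthetic data (a sketch, not package code):

import numpy as np

# A synthetic 8-bit grayscale strip: mostly value 12 (a dark banner), plus some 'text' pixels
strip = np.full((20, 100), 12, dtype=np.uint8)
strip[5:8, 40:60] = 200

counts = np.bincount(strip.flatten())
background_value = int(np.argmax(counts))   # modal pixel value: 12
p_background = counts.max() / counts.sum()  # fraction of pixels at the mode

# Accept the region only when the mode dominates (min_background_fraction is 0.3 above)
print(background_value, p_background >= 0.3)
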
@@ -256,20 +256,20 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
             if min_y == -1:
                 min_y = y
             max_y = y
-
+
     assert (min_y == -1 and max_y == -1) or (min_y != -1 and max_y != -1)
-
+
     if min_y == -1:
         return crop_to_solid_region_result
-
+
     if max_y == min_y:
         return crop_to_solid_region_result
-
+
     x = min_x
     y = min_y
     w = max_x-min_x
     h = max_y-min_y
-
+
     x = min_x
     y = min_y
     w = max_x-min_x
@@ -277,7 +277,7 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
 
     # Crop the image
     crop_np = rough_crop_np[y:y+h,x:x+w]
-
+
     # Tesseract doesn't like characters really close to the edge, so pad a little.
     crop_padding = options.crop_padding
     padded_crop_np = cv2.copyMakeBorder(crop_np,crop_padding,crop_padding,crop_padding,crop_padding,
@@ -286,39 +286,39 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
 
     crop_pil = Image.fromarray(crop_np)
     padded_crop_pil = Image.fromarray(padded_crop_np)
-
+
     crop_to_solid_region_result['crop_pil'] = crop_pil
     crop_to_solid_region_result['padded_crop_pil'] = padded_crop_pil
     crop_to_solid_region_result['p_success'] = p_success
-
+
     return crop_to_solid_region_result
-
-# ...crop_to_solid_region(...)
+
+# ...crop_to_solid_region(...)
 
 
 def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
     """
-    Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
+    Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
     regions by the time they get to this function, roughly the top or bottom 20% of an image.
-
+
     Args:
         rough_crops (list): list of Image objects that have been cropped close to text
         options (DatetimeExtractionOptions, optional): OCR parameters
         tesseract_config_string (str, optional): optional CLI argument to pass to tesseract.exe
-
+
     Returns:
         dict: a dict with keys "top" and "bottom", where each value is a dict with keys
         'text' (text found, if any) and 'crop_to_solid_region_results' (metadata about the OCR pass)
     """
-
+
     if options is None:
         options = DatetimeExtractionOptions()
-
+
     if tesseract_config_string is None:
         tesseract_config_string = options.tesseract_config_strings[0]
-
+
     find_text_in_crops_results = {}
-
+
     # crop_location = 'top'
     # crop_location = 'bottom'
     for crop_location in ('top','bottom'):
@@ -326,51 +326,51 @@ def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
         find_text_in_crops_results[crop_location] = {}
         find_text_in_crops_results[crop_location]['text'] = ''
         find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = None
-
+
         rough_crop = rough_crops[crop_location]
-
+
         # Crop to the portion of the rough crop with a solid background color
         crop_to_solid_region_results = crop_to_solid_region(rough_crop,crop_location,options)
-
+
         find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = \
             crop_to_solid_region_results
-
+
         # Try cropping to a solid region; if that doesn't work, try running OCR on the whole
         # rough crop.
         if crop_to_solid_region_results['p_success'] >= options.p_crop_success_threshold:
             padded_crop_pil = crop_to_solid_region_results['padded_crop_pil']
-        else:
+        else:
             # continue
-            padded_crop_pil = rough_crop
-
+            padded_crop_pil = rough_crop
+
         if options.apply_sharpening_filter:
             padded_crop_pil = padded_crop_pil.filter(ImageFilter.SHARPEN)
-
+
         # Find text in the padded crop
         pytesseract.pytesseract.tesseract_cmd = options.tesseract_cmd
-        text = pytesseract.image_to_string(padded_crop_pil, lang='eng',
+        text = pytesseract.image_to_string(padded_crop_pil, lang='eng',
                                            config=tesseract_config_string)
-
+
         text = text.replace('\n', ' ').replace('\r', '').strip()
 
-        find_text_in_crops_results[crop_location]['text'] = text
-
+        find_text_in_crops_results[crop_location]['text'] = text
+
     # ...for each cropped region
-
+
     return find_text_in_crops_results
-
+
 # ...def find_text_in_crops(...)
-
+
 
 def _datetime_string_to_datetime(matched_string):
     """
     Takes an OCR-matched datetime string, does a little cleanup, and parses a date
     from it.
-
+
     By the time a string gets to this function, it should be a proper date string, with
     no extraneous characters other than spaces around colons or hyphens.
     """
-
+
     matched_string = matched_string.replace(' -','-')
     matched_string = matched_string.replace('- ','-')
     matched_string = matched_string.replace(' :',':')
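
find_text_in_crops above ultimately funnels each padded crop through a single pytesseract call, parameterized by one of the config strings defined in the options class. The call shape in isolation (a sketch; 'crop.png' is a placeholder path):

from PIL import Image, ImageFilter
import pytesseract

crop = Image.open('crop.png')             # placeholder: a padded metadata crop
crop = crop.filter(ImageFilter.SHARPEN)   # mirrors options.apply_sharpening_filter

# '--psm 13' treats the image as a raw text line; '--oem 1' selects the LSTM engine
text = pytesseract.image_to_string(crop, lang='eng', config='--oem 1 --psm 13')
text = text.replace('\n', ' ').replace('\r', '').strip()
print(text)
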
@@ -386,155 +386,155 @@ def _get_datetime_from_strings(strings,options=None):
     """
     Given a string or list of strings, search for exactly one datetime in those strings.
     using a series of regular expressions.
-
+
     Strings are currently just concatenated before searching for a datetime.
     """
-
+
     if options is None:
-        options = DatetimeExtractionOptions()
-
+        options = DatetimeExtractionOptions()
+
     if isinstance(strings,str):
         s = strings
     else:
         s = ' '.join(strings).lower()
-    s = s.replace('—','-')
+    s = s.replace('—','-')
     s = ''.join(e for e in s if e.isalnum() or e in ':-/' or e.isspace())
-
+
     ### AM/PM
-
+
     # 2013-10-02 11:40:50 AM
-    m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     # 04/01/2017 08:54:00AM
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     # 2017/04/01 08:54:00AM
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     # 04/01/2017 08:54AM
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     # 2017/04/01 08:54AM
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     ### No AM/PM
-
+
     # 2013-07-27 04:56:35
-    m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     # 07-27-2013 04:56:35
-    m = re.search('(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     # 2013/07/27 04:56:35
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     # 07/27/2013 04:56:35
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
-
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
+
     return None
-
+
 # ...def _get_datetime_from_strings(...)
 
 
 def get_datetime_from_image(image,include_crops=True,options=None):
     """
     Tries to find the datetime string (if present) in an image.
-
+
     Args:
         image (Image or str): the PIL Image object or image filename in which we should look for
             datetime information.
         include_crops (bool, optional): whether to include cropped images in the return dict (set
             this to False if you're worried about size and you're processing a zillion images)
-        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
             DatetimeExtractionOptions object or a list of options to try
-
+
     Returns:
         dict: a dict with fields:
-
+
         - datetime: Python datetime object, or None
         - text_results: length-2 list of strings
-        - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+        - all_extracted_datetimes: if we ran multiple option sets, this will contain the
           datetimes extracted for each option set
         - ocr_results: detailed results from the OCR process, including crops as PIL images;
           only included if include_crops is True
     """
-
+
     if options is None:
         options = DatetimeExtractionOptions()
-
+
     if isinstance(image,str):
         image = vis_utils.open_image(image)
 
     # Crop the top and bottom from the image
     rough_crops = make_rough_crops(image,options)
     assert len(rough_crops) == 2
-
+
     all_extracted_datetimes = {}
     all_text_results = []
     all_ocr_results = []
-
+
     extracted_datetime = None
-
+
     # Find text, possibly trying all config strings
     #
     # tesseract_config_string = options.tesseract_config_strings[0]
     for tesseract_config_string in options.tesseract_config_strings:
-
+
         ocr_results = find_text_in_crops(rough_crops,options,tesseract_config_string)
         all_ocr_results.append(ocr_results)
-
+
         text_results = [v['text'] for v in ocr_results.values()]
         assert len(text_results) == 2
         all_text_results.append(text_results)
-
+
         # Find datetime
         extracted_datetime_this_option_set = _get_datetime_from_strings(text_results,options)
         assert isinstance(extracted_datetime_this_option_set,datetime.datetime) or \
             (extracted_datetime_this_option_set is None)
-
+
         all_extracted_datetimes[tesseract_config_string] = \
             extracted_datetime_this_option_set
-
+
         if extracted_datetime_this_option_set is not None:
             if extracted_datetime is None:
                 extracted_datetime = extracted_datetime_this_option_set
             if not options.force_all_ocr_options:
-                break
-
+                break
+
     # ...for each set of OCR options
-
-    if extracted_datetime is not None:
+
+    if extracted_datetime is not None:
         assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
 
     to_return = {}
     to_return['datetime'] = extracted_datetime
-
+
     to_return['text_results'] = all_text_results
     to_return['all_extracted_datetimes'] = all_extracted_datetimes
-
+
     if include_crops:
         to_return['ocr_results'] = all_ocr_results
     else:
         to_return['ocr_results'] = None
-
+
     return to_return
 
 # ...def get_datetime_from_image(...)
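
The only substantive change in the hunk above is the r'' prefix on each pattern: sequences like '\d' and '\s' are invalid escape sequences in ordinary Python string literals (a DeprecationWarning that Python 3.12 upgrades to a SyntaxWarning), so raw strings are the correct spelling and leave the compiled regex unchanged. A standalone check of one of the patterns:

import re

s = '2013-10-02 11:40:50 am'
# Raw string: backslashes reach the regex engine untouched, with no escape warnings
pattern = r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)'
m = re.search(pattern, s)
print(m.group(0))  # '2013-10-02 11:40:50 am'
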
@@ -544,34 +544,34 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
     """
     Try/catch wrapper for get_datetime_from_image, optionally trying multiple option sets
     until we find a datetime.
-
+
     Args:
         image (Image or str): the PIL Image object or image filename in which we should look for
             datetime information.
         include_crops (bool, optional): whether to include cropped images in the return dict (set
             this to False if you're worried about size and you're processing a zillion images)
-        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
            DatetimeExtractionOptions object or a list of options to try
-
+
     Returns:
         dict: A dict with fields:
         - datetime: Python datetime object, or None
         - text_results: length-2 list of strings
-        - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+        - all_extracted_datetimes: if we ran multiple option sets, this will contain the
          datetimes extracted for each option set
         - ocr_results: detailed results from the OCR process, including crops as PIL images;
          only included if include_crops is True
     """
-
+
     if options is None:
         options = DatetimeExtractionOptions()
 
     if not is_iterable(options):
         options = [options]
-
+
     result = {}
     result['error'] = None
-
+
     for i_option_set,current_options in enumerate(options):
         try:
             result = get_datetime_from_image(filename,include_crops=include_crops,options=current_options)
@@ -580,79 +580,85 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
             break
         except Exception as e:
             result['error'] = str(e)
-
+
     return result
 
 
 def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options=None,
                              n_workers=16,use_threads=False):
     """
-    The main entry point for this module. Tries to retrieve metadata from pixels for every
+    The main entry point for this module. Tries to retrieve metadata from pixels for every
     image in [folder_name], optionally the results to the .json file [output_file].
-
+
     Args:
         folder_name (str): the folder of images to process recursively
         output_file (str, optional): the .json file to which we should write results; if None,
             just returns the results
         n_to_sample (int, optional): for debugging only, used to limit the number of images
            we process
-        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
            DatetimeExtractionOptions object or a list of options to try for each image
         n_workers (int, optional): the number of parallel workers to use; set to <= 1 to disable
            parallelization
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
            parallelization; not relevant if n_workers <= 1
-
+
     Returns:
         dict: a dict mapping filenames to datetime extraction results, see try_get_datetime_from_images
         for the format of each value in the dict.
     """
-
+
     if options is None:
         options = DatetimeExtractionOptions()
-
+
     image_file_names = \
         find_images(folder_name,convert_slashes=True,
                     return_relative_paths=False,recursive=True)
-
+
     if n_to_sample > 0:
         import random
         random.seed(0)
         image_file_names = random.sample(image_file_names,n_to_sample)
-
+
     if n_workers <= 1:
-
+
         all_results = []
         for fn_abs in tqdm(image_file_names):
             all_results.append(try_get_datetime_from_image(fn_abs,options=options))
-
-    else:
-
+
+    else:
+
         # Don't spawn more than one worker per image
         if n_workers > len(image_file_names):
             n_workers = len(image_file_names)
-
-        if use_threads:
-            from multiprocessing.pool import ThreadPool
-            pool = ThreadPool(n_workers)
-            worker_string = 'threads'
-        else:
-            from multiprocessing.pool import Pool
-            pool = Pool(n_workers)
-            worker_string = 'processes'
-
-        print('Starting a pool of {} {}'.format(n_workers,worker_string))
-
-        all_results = list(tqdm(pool.imap(
-            partial(try_get_datetime_from_image,options=options),image_file_names),
-            total=len(image_file_names)))
-
+
+        pool = None
+        try:
+            if use_threads:
+                from multiprocessing.pool import ThreadPool
+                pool = ThreadPool(n_workers)
+                worker_string = 'threads'
+            else:
+                from multiprocessing.pool import Pool
+                pool = Pool(n_workers)
+                worker_string = 'processes'
+
+            print('Starting a pool of {} {}'.format(n_workers,worker_string))
+
+            all_results = list(tqdm(pool.imap(
+                partial(try_get_datetime_from_image,options=options),image_file_names),
+                total=len(image_file_names)))
+        finally:
+            pool.close()
+            pool.join()
+            print("Pool closed and joined for datetime extraction")
+
     filename_to_results = {}
-
+
     # fn_relative = image_file_names[0]
     for i_file,fn_abs in enumerate(image_file_names):
         filename_to_results[fn_abs] = all_results[i_file]
-
+
     if output_file is not None:
         with open(output_file,'w') as f:
             json.dump(filename_to_results,f,indent=1,default=str)
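
The pool refactor above is the one behavioral change to this file in 5.0.29: the worker pool is now closed and joined in a finally block rather than leaked. Note that because the finally clause calls pool.close() unconditionally, a failure before pool construction (pool still None) would raise AttributeError from the cleanup itself; guarding on pool being non-None, or using the pool as a context manager, avoids that. A sketch of the context-manager form (not the package's code):

from multiprocessing.pool import ThreadPool

def process_all(items, worker_fn, n_workers=16):
    # Pool objects are context managers; __exit__ calls terminate(), so
    # workers are reclaimed even if worker_fn raises mid-iteration
    n = max(1, min(n_workers, len(items)))
    with ThreadPool(n) as pool:
        return list(pool.imap(worker_fn, items))
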
@@ -663,9 +669,9 @@ def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options
 #%% Interactive driver
 
 if False:
-
+
     #%% Process images
-
+
     folder_name = r'g:\temp\island_conservation_camera_traps'
     output_file = r'g:\temp\ocr_results.json'
     from megadetector.utils.path_utils import insert_before_extension
@@ -681,60 +687,60 @@ if False:
     all_options = [options_a]
     filename_to_results = get_datetimes_for_folder(folder_name,output_file,
                                                    n_to_sample=n_to_sample,options=all_options)
-
+
 
     #%% Load results
-
+
     # output_file = r"G:\temp\ocr_results.2023.10.31.07.37.54.json"
     with open(output_file,'r') as f:
         filename_to_results = json.load(f)
     filenames = sorted(list(filename_to_results.keys()))
     print('Loaded results for {} files'.format(len(filename_to_results)))
-
-
+
+
     #%% Scrap cell
-
+
     fn = 'g:/camera_traps/camera_trap_images/2018.07.02/newcam/people/DSCF0273.JPG'
     include_crops = False
     options_a = DatetimeExtractionOptions()
     options_b = DatetimeExtractionOptions()
     options_b.image_crop_fraction = [0.08 , 0.08]
-    image = vis_utils.open_image(fn) # noqa
+    image = vis_utils.open_image(fn) # noqa
     result = try_get_datetime_from_image(fn,options=[options_a,options_b]) # noqa
     print(result)
-
+
     # open_file(fn)
     # rough_crops = make_rough_crops(image,options=options)
-
-
+
+
     #%% Look for OCR or parsing failures
-
+
     bad_tokens = ()
-
+
     files_with_disagreements = set()
-
+
     # i_fn = 0; fn = filenames[i_fn]
     for i_fn,fn in enumerate(filenames):
-
+
         image = fn
         results = filename_to_results[fn]
-
+
         if 'text_results' not in results:
             raise Exception('no results available for {} ({})'.format(i_fn,fn))
             print('Skipping {}, no results'.format(i_fn))
             continue
-
+
         s = ' '.join([x[0] for x in results['text_results']])
-
+
         known_bad = False
         for bad_token in bad_tokens:
             if bad_token in s:
                 known_bad = True
-        if known_bad:
+        if known_bad:
             continue
-
+
         extracted_datetime = results['datetime']
-
+
         # If we have a datetime, make sure all successful OCR results agree
         if extracted_datetime is not None:
             for config_string in results['all_extracted_datetimes']:
@@ -745,19 +751,19 @@ if False:
                     print('Falling back for {} ({})'.format(i_fn,fn))
                     ocr_results = get_datetime_from_image(fn)
                     extracted_datetime = ocr_results['datetime']
-
+
         if extracted_datetime is None:
             print('Failure at {}: {}'.format(i_fn,s))
-
+
         # open_file(fn)
         # get_datetime_from_image(fn)
-
-
+
+
     #%% Write results to an HTML file for testing
-
+
     n_to_sample = 5000
     if (n_to_sample >= 0) and (len(filename_to_results) > n_to_sample):
-        filenames = sorted(list(filename_to_results.keys()))
+        filenames = sorted(list(filename_to_results.keys()))
         import random
         random.seed(0)
         keys = random.sample(filenames,n_to_sample)
@@ -765,18 +771,18 @@ if False:
 
     preview_dir = r'g:\temp\ocr-preview'
     os.makedirs(preview_dir,exist_ok=True)
-
+
     def resize_image_for_preview(fn_abs):
-        fn_relative = os.path.relpath(fn_abs,folder_name)
+        fn_relative = os.path.relpath(fn_abs,folder_name)
         resized_image = vis_utils.resize_image(fn_abs,target_width=600)
         resized_fn = os.path.join(preview_dir,fn_relative)
         os.makedirs(os.path.dirname(resized_fn),exist_ok=True)
         resized_image.save(resized_fn)
         return resized_fn
-
+
     # Resize images in parallel
     n_rendering_workers = 16
-
+
     if n_rendering_workers <= 1:
         for fn_abs in tqdm(filename_to_results.keys()):
             resize_image_for_preview(fn_abs)
@@ -784,64 +790,64 @@ if False:
         # from multiprocessing.pool import Pool as RenderingPool; worker_string = 'processes'
         from multiprocessing.pool import ThreadPool as RenderingPool; worker_string = 'threads'
         pool = RenderingPool(n_rendering_workers)
-
+
         print('Starting rendering pool with {} {}'.format(n_rendering_workers,worker_string))
-
+
         _ = list(tqdm(pool.imap(resize_image_for_preview,filename_to_results.keys()),
                       total=len(filename_to_results)))
-
-
+
+
     def make_datetime_preview_page(filenames,html_file):
-
+
         html_image_list = []
         html_options = write_html_image_list.write_html_image_list()
         html_options['maxFiguresPerHtmlFile'] = 2500
         html_options['defaultImageStyle'] = 'margin:0px;margin-top:5px;margin-bottom:30px;'
-
+
         # fn_abs = filenames[0]
         for fn_abs in filenames:
-
-            fn_relative = os.path.relpath(fn_abs,folder_name)
+
+            fn_relative = os.path.relpath(fn_abs,folder_name)
             # resized_fn = os.path.join(preview_dir,fn_relative)
             results_this_image = filename_to_results[fn_abs]
-
+
             extracted_datetime = results_this_image['datetime']
             title = 'Image: {}<br/>Extracted datetime: {}'.format(fn_relative,extracted_datetime)
             html_image_list.append({'filename':fn_relative,'title':title})
-
+
         # ...for each crop
-
+
         # ...for each image
-
+
         html_options['makeRelative'] = True
         write_html_image_list.write_html_image_list(html_file,
                                                     html_image_list,
                                                     html_options)
         open_file(html_file)
         return html_image_list
-
+
     failed_files = []
     for fn_abs in filename_to_results:
         results_this_image = filename_to_results[fn_abs]
         if results_this_image['datetime'] is None:
             failed_files.append(fn_abs)
-
+
     print('Found {} failures'.format(len(failed_files)))
-
+
     output_summary_file = os.path.join(preview_dir,'summary.html')
     html_image_list = make_datetime_preview_page(sorted(list(filename_to_results.keys())),output_summary_file)
-
-    failure_summary_file = os.path.join(preview_dir,'failures.html')
+
+    failure_summary_file = os.path.join(preview_dir,'failures.html')
     html_image_list_failures = make_datetime_preview_page(failed_files,failure_summary_file)
-
+
     filenames = failed_files
     html_file = failure_summary_file
 
-
+
     #%% Other approaches to getting dates from strings
-
+
     # ...that didn't really work out.
-
+
     # pip install dateparser
     import dateparser
 
@@ -853,7 +859,7 @@ if False:
     dateparser_settings = {'PREFER_DATES_FROM':'past','STRICT_PARSING':True}
 
     dateparser_result = dateparser.search.search_dates(s, settings=dateparser_settings)
-
+
     if dateparser_result is not None:
         assert len(dateparser_result) == 1
         extracted_datetime = dateparser_result[0][1]
@@ -864,7 +870,7 @@ if False:
         extracted_datetime = matches_list[0]
     else:
         extracted_datetime = None
-
-    if extracted_datetime is not None:
+
+    if extracted_datetime is not None:
         assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
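
For reference, the dateparser fallback exercised in the hunks above has this call shape (a sketch; dateparser is an optional dependency installed separately):

from dateparser.search import search_dates

s = 'camera 3 2013-07-27 04:56:35'
results = search_dates(s, settings={'PREFER_DATES_FROM': 'past', 'STRICT_PARSING': True})
if results is not None:
    # Each element is a (matched_substring, datetime) tuple
    for matched_text, dt in results:
        print(matched_text, dt)
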