PyPI - megadetector - Versions diffs - 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl - Mend

megadetector 5.0.28py3-none-any.whl → 10.0.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector might be problematic. Click here for more details.

Files changed (197) hide show

megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
megadetector/classification/aggregate_classifier_probs.py +3 -3
megadetector/classification/analyze_failed_images.py +5 -5
megadetector/classification/cache_batchapi_outputs.py +5 -5
megadetector/classification/create_classification_dataset.py +11 -12
megadetector/classification/crop_detections.py +10 -10
megadetector/classification/csv_to_json.py +8 -8
megadetector/classification/detect_and_crop.py +13 -15
megadetector/classification/efficientnet/model.py +8 -8
megadetector/classification/efficientnet/utils.py +6 -5
megadetector/classification/evaluate_model.py +7 -7
megadetector/classification/identify_mislabeled_candidates.py +6 -6
megadetector/classification/json_to_azcopy_list.py +1 -1
megadetector/classification/json_validator.py +29 -32
megadetector/classification/map_classification_categories.py +9 -9
megadetector/classification/merge_classification_detection_output.py +12 -9
megadetector/classification/prepare_classification_script.py +19 -19
megadetector/classification/prepare_classification_script_mc.py +26 -26
megadetector/classification/run_classifier.py +4 -4
megadetector/classification/save_mislabeled.py +6 -6
megadetector/classification/train_classifier.py +1 -1
megadetector/classification/train_classifier_tf.py +9 -9
megadetector/classification/train_utils.py +10 -10
megadetector/data_management/annotations/annotation_constants.py +1 -2
megadetector/data_management/camtrap_dp_to_coco.py +79 -46
megadetector/data_management/cct_json_utils.py +103 -103
megadetector/data_management/cct_to_md.py +49 -49
megadetector/data_management/cct_to_wi.py +33 -33
megadetector/data_management/coco_to_labelme.py +75 -75
megadetector/data_management/coco_to_yolo.py +210 -193
megadetector/data_management/databases/add_width_and_height_to_db.py +86 -12
megadetector/data_management/databases/combine_coco_camera_traps_files.py +40 -40
megadetector/data_management/databases/integrity_check_json_db.py +228 -200
megadetector/data_management/databases/subset_json_db.py +33 -33
megadetector/data_management/generate_crops_from_cct.py +88 -39
megadetector/data_management/get_image_sizes.py +54 -49
megadetector/data_management/labelme_to_coco.py +133 -125
megadetector/data_management/labelme_to_yolo.py +159 -73
megadetector/data_management/lila/create_lila_blank_set.py +81 -83
megadetector/data_management/lila/create_lila_test_set.py +32 -31
megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
megadetector/data_management/lila/download_lila_subset.py +21 -24
megadetector/data_management/lila/generate_lila_per_image_labels.py +365 -107
megadetector/data_management/lila/get_lila_annotation_counts.py +35 -33
megadetector/data_management/lila/get_lila_image_counts.py +22 -22
megadetector/data_management/lila/lila_common.py +73 -70
megadetector/data_management/lila/test_lila_metadata_urls.py +28 -19
megadetector/data_management/mewc_to_md.py +344 -340
megadetector/data_management/ocr_tools.py +262 -255
megadetector/data_management/read_exif.py +249 -227
megadetector/data_management/remap_coco_categories.py +90 -28
megadetector/data_management/remove_exif.py +81 -21
megadetector/data_management/rename_images.py +187 -187
megadetector/data_management/resize_coco_dataset.py +588 -120
megadetector/data_management/speciesnet_to_md.py +41 -41
megadetector/data_management/wi_download_csv_to_coco.py +55 -55
megadetector/data_management/yolo_output_to_md_output.py +248 -122
megadetector/data_management/yolo_to_coco.py +333 -191
megadetector/detection/change_detection.py +832 -0
megadetector/detection/process_video.py +340 -337
megadetector/detection/pytorch_detector.py +358 -278
megadetector/detection/run_detector.py +399 -186
megadetector/detection/run_detector_batch.py +404 -377
megadetector/detection/run_inference_with_yolov5_val.py +340 -327
megadetector/detection/run_tiled_inference.py +257 -249
megadetector/detection/tf_detector.py +24 -24
megadetector/detection/video_utils.py +332 -295
megadetector/postprocessing/add_max_conf.py +19 -11
megadetector/postprocessing/categorize_detections_by_size.py +45 -45
megadetector/postprocessing/classification_postprocessing.py +468 -433
megadetector/postprocessing/combine_batch_outputs.py +23 -23
megadetector/postprocessing/compare_batch_results.py +590 -525
megadetector/postprocessing/convert_output_format.py +106 -102
megadetector/postprocessing/create_crop_folder.py +347 -147
megadetector/postprocessing/detector_calibration.py +173 -168
megadetector/postprocessing/generate_csv_report.py +508 -499
megadetector/postprocessing/load_api_results.py +48 -27
megadetector/postprocessing/md_to_coco.py +133 -102
megadetector/postprocessing/md_to_labelme.py +107 -90
megadetector/postprocessing/md_to_wi.py +40 -40
megadetector/postprocessing/merge_detections.py +92 -114
megadetector/postprocessing/postprocess_batch_results.py +319 -301
megadetector/postprocessing/remap_detection_categories.py +91 -38
megadetector/postprocessing/render_detection_confusion_matrix.py +214 -205
megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +704 -679
megadetector/postprocessing/separate_detections_into_folders.py +226 -211
megadetector/postprocessing/subset_json_detector_output.py +265 -262
megadetector/postprocessing/top_folders_to_bottom.py +45 -45
megadetector/postprocessing/validate_batch_results.py +70 -70
megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
megadetector/taxonomy_mapping/map_new_lila_datasets.py +18 -19
megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +54 -33
megadetector/taxonomy_mapping/preview_lila_taxonomy.py +67 -67
megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
megadetector/taxonomy_mapping/simple_image_download.py +8 -8
megadetector/taxonomy_mapping/species_lookup.py +156 -74
megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
megadetector/utils/ct_utils.py +1049 -211
megadetector/utils/directory_listing.py +21 -77
megadetector/utils/gpu_test.py +22 -22
megadetector/utils/md_tests.py +632 -529
megadetector/utils/path_utils.py +1520 -431
megadetector/utils/process_utils.py +41 -41
megadetector/utils/split_locations_into_train_val.py +62 -62
megadetector/utils/string_utils.py +148 -27
megadetector/utils/url_utils.py +489 -176
megadetector/utils/wi_utils.py +2658 -2526
megadetector/utils/write_html_image_list.py +137 -137
megadetector/visualization/plot_utils.py +34 -30
megadetector/visualization/render_images_with_thumbnails.py +39 -74
megadetector/visualization/visualization_utils.py +487 -435
megadetector/visualization/visualize_db.py +232 -198
megadetector/visualization/visualize_detector_output.py +82 -76
{megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/METADATA +5 -2
megadetector-10.0.0.dist-info/RECORD +139 -0
{megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/WHEEL +1 -1
megadetector/api/batch_processing/api_core/__init__.py +0 -0
megadetector/api/batch_processing/api_core/batch_service/__init__.py +0 -0
megadetector/api/batch_processing/api_core/batch_service/score.py +0 -439
megadetector/api/batch_processing/api_core/server.py +0 -294
megadetector/api/batch_processing/api_core/server_api_config.py +0 -97
megadetector/api/batch_processing/api_core/server_app_config.py +0 -55
megadetector/api/batch_processing/api_core/server_batch_job_manager.py +0 -220
megadetector/api/batch_processing/api_core/server_job_status_table.py +0 -149
megadetector/api/batch_processing/api_core/server_orchestration.py +0 -360
megadetector/api/batch_processing/api_core/server_utils.py +0 -88
megadetector/api/batch_processing/api_core_support/__init__.py +0 -0
megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +0 -46
megadetector/api/batch_processing/api_support/__init__.py +0 -0
megadetector/api/batch_processing/api_support/summarize_daily_activity.py +0 -152
megadetector/api/batch_processing/data_preparation/__init__.py +0 -0
megadetector/api/synchronous/__init__.py +0 -0
megadetector/api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
megadetector/api/synchronous/api_core/animal_detection_api/api_backend.py +0 -151
megadetector/api/synchronous/api_core/animal_detection_api/api_frontend.py +0 -263
megadetector/api/synchronous/api_core/animal_detection_api/config.py +0 -35
megadetector/api/synchronous/api_core/tests/__init__.py +0 -0
megadetector/api/synchronous/api_core/tests/load_test.py +0 -110
megadetector/data_management/importers/add_nacti_sizes.py +0 -52
megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
megadetector/data_management/importers/awc_to_json.py +0 -191
megadetector/data_management/importers/bellevue_to_json.py +0 -272
megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
megadetector/data_management/importers/cct_field_adjustments.py +0 -58
megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
megadetector/data_management/importers/ena24_to_json.py +0 -276
megadetector/data_management/importers/filenames_to_json.py +0 -386
megadetector/data_management/importers/helena_to_cct.py +0 -283
megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
megadetector/data_management/importers/jb_csv_to_json.py +0 -150
megadetector/data_management/importers/mcgill_to_json.py +0 -250
megadetector/data_management/importers/missouri_to_json.py +0 -490
megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
megadetector/data_management/importers/noaa_seals_2019.py +0 -181
megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
megadetector/data_management/importers/pc_to_json.py +0 -365
megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
megadetector/data_management/importers/rspb_to_json.py +0 -356
megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
megadetector/data_management/importers/sulross_get_exif.py +0 -65
megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
megadetector/data_management/importers/ubc_to_json.py +0 -399
megadetector/data_management/importers/umn_to_json.py +0 -507
megadetector/data_management/importers/wellington_to_json.py +0 -263
megadetector/data_management/importers/wi_to_json.py +0 -442
megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
megadetector/utils/azure_utils.py +0 -178
megadetector/utils/sas_blob_utils.py +0 -509
megadetector-5.0.28.dist-info/RECORD +0 -209
/megadetector/{api/batch_processing/__init__.py → __init__.py} +0 -0
{megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/licenses/LICENSE +0 -0
{megadetector-5.0.28.dist-info → megadetector-10.0.0.dist-info}/top_level.txt +0 -0

megadetector/data_management/ocr_tools.py CHANGED Viewed

@@ -22,11 +22,11 @@ Prior to using this module:
 * Install Tesseract from https://tesseract-ocr.github.io/tessdoc/Installation.html
 * pip install pytesseract
 Known limitations:
 * Semi-transparent overlays (which I've only seen on consumer cameras) usually fail.
 """
 #%% Notes to self
@@ -34,11 +34,12 @@ Known limitations:
 """
 * To use the legacy engine (--oem 0), I had to download an updated eng.traineddata file from:
   https://github.com/tesseract-ocr/tessdata
 """
 #%% Constants and imports
 import os
@@ -56,7 +57,7 @@ from tqdm import tqdm
 from megadetector.utils.path_utils import find_images
 from megadetector.utils.path_utils import open_file
-from megadetector.utils import write_html_image_list
+from megadetector.utils import write_html_image_list
 from megadetector.utils.ct_utils import is_iterable
 from megadetector.visualization import visualization_utils as vis_utils
@@ -64,7 +65,7 @@ from megadetector.visualization import visualization_utils as vis_utils
 #
 # Also install tesseract from: https://github.com/UB-Mannheim/tesseract/wiki, and add
 # the installation dir to your path (on Windows, typically C:\Program Files (x86)\Tesseract-OCR)
-import pytesseract
+import pytesseract # type: ignore
 #%% Extraction options
@@ -73,40 +74,40 @@ class DatetimeExtractionOptions:
     """
     Options used to parameterize datetime extraction in most functions in this module.
     """
     def __init__(self):
-        #: Using a semi-arbitrary metric of how much it feels like we found the
+        #: Using a semi-arbitrary metric of how much it feels like we found the
         #: text-containing region, discard regions that appear to be extraction failures
         self.p_crop_success_threshold = 0.5
         #: Pad each crop with a few pixels to make tesseract happy
-        self.crop_padding = 10
+        self.crop_padding = 10
         #: Discard short text, typically text from the top of the image
         self.min_text_length = 4
-        #: When we're looking for pixels that match the background color, allow some
+        #: When we're looking for pixels that match the background color, allow some
         #: tolerance around the dominant color
         self.background_tolerance = 2
-        #: We need to see a consistent color in at least this fraction of pixels in our rough
+        #: We need to see a consistent color in at least this fraction of pixels in our rough
         #: crop to believe that we actually found a candidate metadata region.
         self.min_background_fraction = 0.3
         #: What fraction of the [top,bottom] of the image should we use for our rough crop?
         self.image_crop_fraction = [0.045 , 0.045]
         # self.image_crop_fraction = [0.08 , 0.08]
         #: Within that rough crop, how much should we use for determining the background color?
         self.background_crop_fraction_of_rough_crop = 0.5
         #: A row is considered a probable metadata row if it contains at least this fraction
-        #: of the background color.  This is used only to find the top and bottom of the crop area,
+        #: of the background color.  This is used only to find the top and bottom of the crop area,
         #: so it's not that *every* row needs to hit this criteria, only the rows that are generally
         #: above and below the text.
         self.min_background_fraction_for_background_row = 0.5
         #: psm 6: "assume a single uniform block of text"
         #: psm 13: raw line
         #: oem: 0 == legacy, 1 == lstm
@@ -115,14 +116,14 @@ class DatetimeExtractionOptions:
         #: Try these configuration strings in order until we find a valid datetime
         self.tesseract_config_strings = ['--oem 1 --psm 13','--oem 0 --psm 13',
                                          '--oem 1 --psm 6','--oem 0 --psm 6']
         #: If this is False, and one set of options appears to succeed for an image, we'll
         #: stop there.  If this is True, we always run all option sets on every image.
         self.force_all_ocr_options = False
         #: Whether to apply PIL's ImageFilter.SHARPEN prior to OCR
         self.apply_sharpening_filter = True
         #: Tesseract should be on your system path, but you can also specify the
         #: path explicitly, e.g. you can do either of these:
         #:
@@ -136,115 +137,115 @@ class DatetimeExtractionOptions:
 def make_rough_crops(image,options=None):
     """
     Crops the top and bottom regions out of an image.
     Args:
         image (Image or str): a PIL Image or file name
         options (DatetimeExtractionOptions, optional): OCR parameters
     Returns:
-        dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
+        dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
     """
     if options is None:
         options = DatetimeExtractionOptions()
     if isinstance(image,str):
         image = vis_utils.open_image(image)
     w = image.width
     h = image.height
     crop_height_top = round(options.image_crop_fraction[0] * h)
     crop_height_bottom = round(options.image_crop_fraction[1] * h)
     # l,t,r,b
     #
     # 0,0 is upper-left
     top_crop = image.crop([0,0,w,crop_height_top])
     bottom_crop = image.crop([0,h-crop_height_bottom,w,h])
     return {'top':top_crop,'bottom':bottom_crop}
 # ...def make_rough_crops(...)
 def crop_to_solid_region(rough_crop,crop_location,options=None):
-    """
+    """
     Given a rough crop from the top or bottom of an image, finds the background color
     and crops to the metadata region.
-    Within a region of an image (typically a crop from the top-ish or bottom-ish part of
+    Within a region of an image (typically a crop from the top-ish or bottom-ish part of
     an image), tightly crop to the solid portion (typically a region with a black background).
     The success metric is just a binary indicator right now: 1.0 if we found a region we believe
     contains a solid background, 0.0 otherwise.
     Args:
         rough_crop (Image): the PIL Image to crop
         crop_location (str): 'top' or 'bottom'
         options (DatetimeExtractionOptions, optional): OCR parameters
     Returns:
         tuple: a tuple containing (a cropped_image (Image), p_success (float), padded_image (Image))
     """
     if options is None:
-        options = DatetimeExtractionOptions()
+        options = DatetimeExtractionOptions()
     crop_to_solid_region_result = {}
     crop_to_solid_region_result['crop_pil'] = None
     crop_to_solid_region_result['padded_crop_pil'] = None
     crop_to_solid_region_result['p_success'] = 0.0
-    # pil --> cv2
-    rough_crop_np = np.array(rough_crop)
-    rough_crop_np = rough_crop_np[:, :, ::-1].copy()
+    # pil --> cv2
+    rough_crop_np = np.array(rough_crop)
+    rough_crop_np = rough_crop_np[:, :, ::-1].copy()
     # Search *part* of the crop for the background value (the part closest to the top or bottom
     # of the image)
     rows_to_use_for_background_search = int(rough_crop_np.shape[0] * \
                                             options.background_crop_fraction_of_rough_crop)
     if crop_location == 'top':
         background_search_image = rough_crop_np[0:rows_to_use_for_background_search,:,:]
     elif crop_location == 'bottom':
         background_search_image = rough_crop_np[-rows_to_use_for_background_search:,:,:]
     else:
         raise ValueError('Unrecognized crop location: {}'.format(crop_location))
     background_search_image = cv2.cvtColor(background_search_image, cv2.COLOR_BGR2GRAY)
-    background_search_image = background_search_image.astype('uint8')
-    background_search_image = cv2.medianBlur(background_search_image,3)
+    background_search_image = background_search_image.astype('uint8')
+    background_search_image = cv2.medianBlur(background_search_image,3)
     pixel_values = background_search_image.flatten()
     counts = np.bincount(pixel_values)
     background_value = int(np.argmax(counts))
     # Did we find a sensible mode that looks like a background value?
     background_value_count = int(np.max(counts))
     p_background_value = background_value_count / np.sum(counts)
     if (p_background_value < options.min_background_fraction):
         return crop_to_solid_region_result
     else:
         p_success = 1.0
     analysis_image = cv2.cvtColor(rough_crop_np, cv2.COLOR_BGR2GRAY)
-    analysis_image = analysis_image.astype('uint8')
-    analysis_image = cv2.medianBlur(analysis_image,3)
+    analysis_image = analysis_image.astype('uint8')
+    analysis_image = cv2.medianBlur(analysis_image,3)
     # This will now be a binary image indicating which pixels are background
     analysis_image = cv2.inRange(analysis_image,
                                 background_value-options.background_tolerance,
                                 background_value+options.background_tolerance)
-    # Use row heuristics to refine the crop
+    # Use row heuristics to refine the crop
     h = analysis_image.shape[0]
     w = analysis_image.shape[1]
     min_x = 0
     min_y = -1
     max_x = w
     max_y = -1
     # Find the first and last row that are mostly the background color
     for y in range(h):
         row_count = 0
@@ -256,20 +257,20 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
             if min_y == -1:
                 min_y = y
             max_y = y
     assert (min_y == -1 and max_y == -1) or (min_y != -1 and max_y != -1)
     if min_y == -1:
         return crop_to_solid_region_result
     if max_y == min_y:
         return crop_to_solid_region_result
     x = min_x
     y = min_y
     w = max_x-min_x
     h = max_y-min_y
     x = min_x
     y = min_y
     w = max_x-min_x
@@ -277,7 +278,7 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
     # Crop the image
     crop_np = rough_crop_np[y:y+h,x:x+w]
     # Tesseract doesn't like characters really close to the edge, so pad a little.
     crop_padding = options.crop_padding
     padded_crop_np = cv2.copyMakeBorder(crop_np,crop_padding,crop_padding,crop_padding,crop_padding,
@@ -286,39 +287,39 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
     crop_pil = Image.fromarray(crop_np)
     padded_crop_pil = Image.fromarray(padded_crop_np)
     crop_to_solid_region_result['crop_pil'] = crop_pil
     crop_to_solid_region_result['padded_crop_pil'] = padded_crop_pil
     crop_to_solid_region_result['p_success'] = p_success
     return crop_to_solid_region_result
-# ...crop_to_solid_region(...)
+# ...crop_to_solid_region(...)
 def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
     """
-    Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
+    Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
     regions by the time they get to this function, roughly the top or bottom 20% of an image.
     Args:
         rough_crops (list): list of Image objects that have been cropped close to text
         options (DatetimeExtractionOptions, optional): OCR parameters
         tesseract_config_string (str, optional): optional CLI argument to pass to tesseract.exe
     Returns:
         dict: a dict with keys "top" and "bottom", where each value is a dict with keys
         'text' (text found, if any) and 'crop_to_solid_region_results' (metadata about the OCR pass)
     """
     if options is None:
         options = DatetimeExtractionOptions()
     if tesseract_config_string is None:
         tesseract_config_string = options.tesseract_config_strings[0]
     find_text_in_crops_results = {}
     # crop_location = 'top'
     # crop_location = 'bottom'
     for crop_location in ('top','bottom'):
@@ -326,51 +327,51 @@ def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
         find_text_in_crops_results[crop_location] = {}
         find_text_in_crops_results[crop_location]['text'] = ''
         find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = None
         rough_crop = rough_crops[crop_location]
         # Crop to the portion of the rough crop with a solid background color
         crop_to_solid_region_results = crop_to_solid_region(rough_crop,crop_location,options)
         find_text_in_crops_results[crop_location]['crop_to_solid_region_results'] = \
             crop_to_solid_region_results
         # Try cropping to a solid region; if that doesn't work, try running OCR on the whole
         # rough crop.
         if crop_to_solid_region_results['p_success'] >= options.p_crop_success_threshold:
             padded_crop_pil = crop_to_solid_region_results['padded_crop_pil']
-        else:
+        else:
             # continue
-            padded_crop_pil = rough_crop
+            padded_crop_pil = rough_crop
         if options.apply_sharpening_filter:
             padded_crop_pil = padded_crop_pil.filter(ImageFilter.SHARPEN)
         # Find text in the padded crop
         pytesseract.pytesseract.tesseract_cmd = options.tesseract_cmd
-        text = pytesseract.image_to_string(padded_crop_pil, lang='eng',
+        text = pytesseract.image_to_string(padded_crop_pil, lang='eng',
                                            config=tesseract_config_string)
         text = text.replace('\n', ' ').replace('\r', '').strip()
-        find_text_in_crops_results[crop_location]['text'] = text
+        find_text_in_crops_results[crop_location]['text'] = text
     # ...for each cropped region
     return find_text_in_crops_results
 # ...def find_text_in_crops(...)
 def _datetime_string_to_datetime(matched_string):
     """
     Takes an OCR-matched datetime string, does a little cleanup, and parses a date
     from it.
     By the time a string gets to this function, it should be a proper date string, with
     no extraneous characters other than spaces around colons or hyphens.
     """
     matched_string = matched_string.replace(' -','-')
     matched_string = matched_string.replace('- ','-')
     matched_string = matched_string.replace(' :',':')
@@ -386,155 +387,155 @@ def _get_datetime_from_strings(strings,options=None):
     """
     Given a string or list of strings, search for exactly one datetime in those strings.
     using a series of regular expressions.
     Strings are currently just concatenated before searching for a datetime.
     """
     if options is None:
-        options = DatetimeExtractionOptions()
+        options = DatetimeExtractionOptions()
     if isinstance(strings,str):
         s = strings
     else:
         s = ' '.join(strings).lower()
-    s = s.replace('—','-')
+    s = s.replace('—','-')
     s = ''.join(e for e in s if e.isalnum() or e in ':-/' or e.isspace())
     ### AM/PM
     # 2013-10-02 11:40:50 AM
-    m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     # 04/01/2017 08:54:00AM
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     # 2017/04/01 08:54:00AM
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     # 04/01/2017 08:54AM
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     # 2017/04/01 08:54AM
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     ### No AM/PM
     # 2013-07-27 04:56:35
-    m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     # 07-27-2013 04:56:35
-    m = re.search('(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     # 2013/07/27 04:56:35
-    m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     # 07/27/2013 04:56:35
-    m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
-    if m is not None:
-        return _datetime_string_to_datetime(m.group(0))
+    m = re.search(r'(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
+    if m is not None:
+        return _datetime_string_to_datetime(m.group(0))
     return None
 # ...def _get_datetime_from_strings(...)
 def get_datetime_from_image(image,include_crops=True,options=None):
     """
     Tries to find the datetime string (if present) in an image.
     Args:
         image (Image or str): the PIL Image object or image filename in which we should look for
             datetime information.
         include_crops (bool, optional): whether to include cropped images in the return dict (set
             this to False if you're worried about size and you're processing a zillion images)
-        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
             DatetimeExtractionOptions object or a list of options to try
     Returns:
         dict: a dict with fields:
             - datetime: Python datetime object, or None
             - text_results: length-2 list of strings
-            - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+            - all_extracted_datetimes: if we ran multiple option sets, this will contain the
               datetimes extracted for each option set
             - ocr_results: detailed results from the OCR process, including crops as PIL images;
               only included if include_crops is True
     """
     if options is None:
         options = DatetimeExtractionOptions()
     if isinstance(image,str):
         image = vis_utils.open_image(image)
     # Crop the top and bottom from the image
     rough_crops = make_rough_crops(image,options)
     assert len(rough_crops) == 2
     all_extracted_datetimes = {}
     all_text_results = []
     all_ocr_results = []
     extracted_datetime = None
     # Find text, possibly trying all config strings
     #
     # tesseract_config_string = options.tesseract_config_strings[0]
     for tesseract_config_string in options.tesseract_config_strings:
         ocr_results = find_text_in_crops(rough_crops,options,tesseract_config_string)
         all_ocr_results.append(ocr_results)
         text_results = [v['text'] for v in ocr_results.values()]
         assert len(text_results) == 2
         all_text_results.append(text_results)
         # Find datetime
         extracted_datetime_this_option_set = _get_datetime_from_strings(text_results,options)
         assert isinstance(extracted_datetime_this_option_set,datetime.datetime) or \
             (extracted_datetime_this_option_set is None)
         all_extracted_datetimes[tesseract_config_string] = \
             extracted_datetime_this_option_set
         if extracted_datetime_this_option_set is not None:
             if extracted_datetime is None:
                 extracted_datetime = extracted_datetime_this_option_set
             if not options.force_all_ocr_options:
-                break
+                break
     # ...for each set of OCR options
-    if extracted_datetime is not None:
+    if extracted_datetime is not None:
         assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
     to_return = {}
     to_return['datetime'] = extracted_datetime
     to_return['text_results'] = all_text_results
     to_return['all_extracted_datetimes'] = all_extracted_datetimes
     if include_crops:
         to_return['ocr_results'] = all_ocr_results
     else:
         to_return['ocr_results'] = None
     return to_return
 # ...def get_datetime_from_image(...)
@@ -544,34 +545,34 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
     """
     Try/catch wrapper for get_datetime_from_image, optionally trying multiple option sets
     until we find a datetime.
     Args:
-        image (Image or str): the PIL Image object or image filename in which we should look for
-            datetime information.
+        filename (Image or str): the PIL Image object or image filename in which we should look
+            for datetime information.
         include_crops (bool, optional): whether to include cropped images in the return dict (set
             this to False if you're worried about size and you're processing a zillion images)
-        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
             DatetimeExtractionOptions object or a list of options to try
     Returns:
         dict: A dict with fields:
             - datetime: Python datetime object, or None
             - text_results: length-2 list of strings
-            - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+            - all_extracted_datetimes: if we ran multiple option sets, this will contain the
               datetimes extracted for each option set
             - ocr_results: detailed results from the OCR process, including crops as PIL images;
               only included if include_crops is True
     """
     if options is None:
         options = DatetimeExtractionOptions()
     if not is_iterable(options):
         options = [options]
     result = {}
     result['error'] = None
     for i_option_set,current_options in enumerate(options):
         try:
             result = get_datetime_from_image(filename,include_crops=include_crops,options=current_options)
@@ -580,79 +581,85 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
                 break
         except Exception as e:
             result['error'] = str(e)
     return result
 def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options=None,
                              n_workers=16,use_threads=False):
     """
-    The main entry point for this module.  Tries to retrieve metadata from pixels for every
+    The main entry point for this module.  Tries to retrieve metadata from pixels for every
     image in [folder_name], optionally the results to the .json file [output_file].
     Args:
         folder_name (str): the folder of images to process recursively
         output_file (str, optional): the .json file to which we should write results; if None,
             just returns the results
         n_to_sample (int, optional): for debugging only, used to limit the number of images
             we process
-        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
             DatetimeExtractionOptions object or a list of options to try for each image
         n_workers (int, optional): the number of parallel workers to use; set to <= 1 to disable
             parallelization
         use_threads (bool, optional): whether to use threads (True) or processes (False) for
             parallelization; not relevant if n_workers <= 1
     Returns:
         dict: a dict mapping filenames to datetime extraction results, see try_get_datetime_from_images
         for the format of each value in the dict.
     """
     if options is None:
         options = DatetimeExtractionOptions()
     image_file_names = \
         find_images(folder_name,convert_slashes=True,
                     return_relative_paths=False,recursive=True)
     if n_to_sample > 0:
         import random
         random.seed(0)
         image_file_names = random.sample(image_file_names,n_to_sample)
     if n_workers <= 1:
         all_results = []
         for fn_abs in tqdm(image_file_names):
             all_results.append(try_get_datetime_from_image(fn_abs,options=options))
-    else:
+    else:
         # Don't spawn more than one worker per image
         if n_workers > len(image_file_names):
             n_workers = len(image_file_names)
-        if use_threads:
-            from multiprocessing.pool import ThreadPool
-            pool = ThreadPool(n_workers)
-            worker_string = 'threads'
-        else:
-            from multiprocessing.pool import Pool
-            pool = Pool(n_workers)
-            worker_string = 'processes'
-        print('Starting a pool of {} {}'.format(n_workers,worker_string))
-        all_results = list(tqdm(pool.imap(
-            partial(try_get_datetime_from_image,options=options),image_file_names),
-            total=len(image_file_names)))
+        pool = None
+        try:
+            if use_threads:
+                from multiprocessing.pool import ThreadPool
+                pool = ThreadPool(n_workers)
+                worker_string = 'threads'
+            else:
+                from multiprocessing.pool import Pool
+                pool = Pool(n_workers)
+                worker_string = 'processes'
+            print('Starting a pool of {} {}'.format(n_workers,worker_string))
+            all_results = list(tqdm(pool.imap(
+                partial(try_get_datetime_from_image,options=options),image_file_names),
+                total=len(image_file_names)))
+        finally:
+            pool.close()
+            pool.join()
+            print("Pool closed and joined for datetime extraction")
     filename_to_results = {}
     # fn_relative = image_file_names[0]
     for i_file,fn_abs in enumerate(image_file_names):
         filename_to_results[fn_abs] = all_results[i_file]
     if output_file is not None:
         with open(output_file,'w') as f:
             json.dump(filename_to_results,f,indent=1,default=str)
@@ -663,9 +670,9 @@ def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options
 #%% Interactive driver
 if False:
     #%% Process images
     folder_name = r'g:\temp\island_conservation_camera_traps'
     output_file = r'g:\temp\ocr_results.json'
     from megadetector.utils.path_utils import insert_before_extension
@@ -681,60 +688,60 @@ if False:
     all_options = [options_a]
     filename_to_results = get_datetimes_for_folder(folder_name,output_file,
                                                    n_to_sample=n_to_sample,options=all_options)
     #%% Load results
     # output_file = r"G:\temp\ocr_results.2023.10.31.07.37.54.json"
     with open(output_file,'r') as f:
         filename_to_results = json.load(f)
     filenames = sorted(list(filename_to_results.keys()))
     print('Loaded results for {} files'.format(len(filename_to_results)))
     #%% Scrap cell
     fn = 'g:/camera_traps/camera_trap_images/2018.07.02/newcam/people/DSCF0273.JPG'
     include_crops = False
     options_a = DatetimeExtractionOptions()
     options_b = DatetimeExtractionOptions()
     options_b.image_crop_fraction = [0.08 , 0.08]
-    image = vis_utils.open_image(fn) # noqa
+    image = vis_utils.open_image(fn) # noqa
     result = try_get_datetime_from_image(fn,options=[options_a,options_b]) # noqa
     print(result)
     # open_file(fn)
     # rough_crops = make_rough_crops(image,options=options)
     #%% Look for OCR or parsing failures
     bad_tokens = ()
     files_with_disagreements = set()
     # i_fn = 0; fn = filenames[i_fn]
     for i_fn,fn in enumerate(filenames):
         image = fn
         results = filename_to_results[fn]
         if 'text_results' not in results:
             raise Exception('no results available for {} ({})'.format(i_fn,fn))
             print('Skipping {}, no results'.format(i_fn))
             continue
         s = ' '.join([x[0] for x in results['text_results']])
         known_bad = False
         for bad_token in bad_tokens:
             if bad_token in s:
                 known_bad = True
-        if known_bad:
+        if known_bad:
             continue
         extracted_datetime = results['datetime']
         # If we have a datetime, make sure all successful OCR results agree
         if extracted_datetime is not None:
             for config_string in results['all_extracted_datetimes']:
@@ -745,19 +752,19 @@ if False:
             print('Falling back for {} ({})'.format(i_fn,fn))
             ocr_results = get_datetime_from_image(fn)
             extracted_datetime = ocr_results['datetime']
         if extracted_datetime is None:
             print('Failure at {}: {}'.format(i_fn,s))
         # open_file(fn)
         # get_datetime_from_image(fn)
     #%% Write results to an HTML file for testing
     n_to_sample = 5000
     if (n_to_sample >= 0) and (len(filename_to_results) > n_to_sample):
-        filenames = sorted(list(filename_to_results.keys()))
+        filenames = sorted(list(filename_to_results.keys()))
         import random
         random.seed(0)
         keys = random.sample(filenames,n_to_sample)
@@ -765,18 +772,18 @@ if False:
     preview_dir = r'g:\temp\ocr-preview'
     os.makedirs(preview_dir,exist_ok=True)
     def resize_image_for_preview(fn_abs):
-        fn_relative = os.path.relpath(fn_abs,folder_name)
+        fn_relative = os.path.relpath(fn_abs,folder_name)
         resized_image = vis_utils.resize_image(fn_abs,target_width=600)
         resized_fn = os.path.join(preview_dir,fn_relative)
         os.makedirs(os.path.dirname(resized_fn),exist_ok=True)
         resized_image.save(resized_fn)
         return resized_fn
     # Resize images in parallel
     n_rendering_workers = 16
     if n_rendering_workers <= 1:
         for fn_abs in tqdm(filename_to_results.keys()):
             resize_image_for_preview(fn_abs)
@@ -784,64 +791,64 @@ if False:
         # from multiprocessing.pool import Pool as RenderingPool; worker_string = 'processes'
         from multiprocessing.pool import ThreadPool as RenderingPool; worker_string = 'threads'
         pool = RenderingPool(n_rendering_workers)
         print('Starting rendering pool with {} {}'.format(n_rendering_workers,worker_string))
         _ = list(tqdm(pool.imap(resize_image_for_preview,filename_to_results.keys()),
                       total=len(filename_to_results)))
     def make_datetime_preview_page(filenames,html_file):
         html_image_list = []
         html_options = write_html_image_list.write_html_image_list()
         html_options['maxFiguresPerHtmlFile'] = 2500
         html_options['defaultImageStyle'] = 'margin:0px;margin-top:5px;margin-bottom:30px;'
         # fn_abs = filenames[0]
         for fn_abs in filenames:
-            fn_relative = os.path.relpath(fn_abs,folder_name)
+            fn_relative = os.path.relpath(fn_abs,folder_name)
             # resized_fn = os.path.join(preview_dir,fn_relative)
             results_this_image = filename_to_results[fn_abs]
             extracted_datetime = results_this_image['datetime']
             title = 'Image: {}<br/>Extracted datetime: {}'.format(fn_relative,extracted_datetime)
             html_image_list.append({'filename':fn_relative,'title':title})
             # ...for each crop
         # ...for each image
         html_options['makeRelative'] = True
         write_html_image_list.write_html_image_list(html_file,
                                                     html_image_list,
                                                     html_options)
         open_file(html_file)
         return html_image_list
     failed_files = []
     for fn_abs in filename_to_results:
         results_this_image = filename_to_results[fn_abs]
         if results_this_image['datetime'] is None:
             failed_files.append(fn_abs)
     print('Found {} failures'.format(len(failed_files)))
     output_summary_file = os.path.join(preview_dir,'summary.html')
     html_image_list = make_datetime_preview_page(sorted(list(filename_to_results.keys())),output_summary_file)
-    failure_summary_file = os.path.join(preview_dir,'failures.html')
+    failure_summary_file = os.path.join(preview_dir,'failures.html')
     html_image_list_failures = make_datetime_preview_page(failed_files,failure_summary_file)
     filenames = failed_files
     html_file = failure_summary_file
     #%% Other approaches to getting dates from strings
     # ...that didn't really work out.
     # pip install dateparser
     import dateparser
@@ -853,7 +860,7 @@ if False:
     dateparser_settings = {'PREFER_DATES_FROM':'past','STRICT_PARSING':True}
     dateparser_result = dateparser.search.search_dates(s, settings=dateparser_settings)
     if dateparser_result is not None:
         assert len(dateparser_result) == 1
         extracted_datetime = dateparser_result[0][1]
@@ -864,7 +871,7 @@ if False:
             extracted_datetime = matches_list[0]
         else:
             extracted_datetime = None
-    if extracted_datetime is not None:
+    if extracted_datetime is not None:
         assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990

megadetector 5.0.28__py3-none-any.whl → 10.0.0__py3-none-any.whl

Potentially problematic release.

megadetector 5.0.28py3-none-any.whl → 10.0.0py3-none-any.whl