PyPI - megadetector - Versions diffs - 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl - Mend

megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of megadetector might be problematic. Click here for more details.

Files changed (191) hide show

api/__init__.py +0 -0
api/batch_processing/__init__.py +0 -0
api/batch_processing/api_core/__init__.py +0 -0
api/batch_processing/api_core/batch_service/__init__.py +0 -0
api/batch_processing/api_core/batch_service/score.py +0 -1
api/batch_processing/api_core/server_job_status_table.py +0 -1
api/batch_processing/api_core_support/__init__.py +0 -0
api/batch_processing/api_core_support/aggregate_results_manually.py +0 -1
api/batch_processing/api_support/__init__.py +0 -0
api/batch_processing/api_support/summarize_daily_activity.py +0 -1
api/batch_processing/data_preparation/__init__.py +0 -0
api/batch_processing/data_preparation/manage_local_batch.py +93 -79
api/batch_processing/data_preparation/manage_video_batch.py +8 -8
api/batch_processing/integration/digiKam/xmp_integration.py +0 -1
api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +0 -1
api/batch_processing/postprocessing/__init__.py +0 -0
api/batch_processing/postprocessing/add_max_conf.py +12 -12
api/batch_processing/postprocessing/categorize_detections_by_size.py +32 -14
api/batch_processing/postprocessing/combine_api_outputs.py +69 -55
api/batch_processing/postprocessing/compare_batch_results.py +114 -44
api/batch_processing/postprocessing/convert_output_format.py +62 -19
api/batch_processing/postprocessing/load_api_results.py +17 -20
api/batch_processing/postprocessing/md_to_coco.py +31 -21
api/batch_processing/postprocessing/md_to_labelme.py +165 -68
api/batch_processing/postprocessing/merge_detections.py +40 -15
api/batch_processing/postprocessing/postprocess_batch_results.py +270 -186
api/batch_processing/postprocessing/remap_detection_categories.py +170 -0
api/batch_processing/postprocessing/render_detection_confusion_matrix.py +75 -39
api/batch_processing/postprocessing/repeat_detection_elimination/find_repeat_detections.py +53 -44
api/batch_processing/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +25 -14
api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py +244 -160
api/batch_processing/postprocessing/separate_detections_into_folders.py +159 -114
api/batch_processing/postprocessing/subset_json_detector_output.py +146 -169
api/batch_processing/postprocessing/top_folders_to_bottom.py +77 -43
api/synchronous/__init__.py +0 -0
api/synchronous/api_core/animal_detection_api/__init__.py +0 -0
api/synchronous/api_core/animal_detection_api/api_backend.py +0 -2
api/synchronous/api_core/animal_detection_api/api_frontend.py +266 -268
api/synchronous/api_core/animal_detection_api/config.py +35 -35
api/synchronous/api_core/tests/__init__.py +0 -0
api/synchronous/api_core/tests/load_test.py +109 -109
classification/__init__.py +0 -0
classification/aggregate_classifier_probs.py +21 -24
classification/analyze_failed_images.py +11 -13
classification/cache_batchapi_outputs.py +51 -51
classification/create_classification_dataset.py +69 -68
classification/crop_detections.py +54 -53
classification/csv_to_json.py +97 -100
classification/detect_and_crop.py +105 -105
classification/evaluate_model.py +43 -42
classification/identify_mislabeled_candidates.py +47 -46
classification/json_to_azcopy_list.py +10 -10
classification/json_validator.py +72 -71
classification/map_classification_categories.py +44 -43
classification/merge_classification_detection_output.py +68 -68
classification/prepare_classification_script.py +157 -154
classification/prepare_classification_script_mc.py +228 -228
classification/run_classifier.py +27 -26
classification/save_mislabeled.py +30 -30
classification/train_classifier.py +20 -20
classification/train_classifier_tf.py +21 -22
classification/train_utils.py +10 -10
data_management/__init__.py +0 -0
data_management/annotations/__init__.py +0 -0
data_management/annotations/annotation_constants.py +18 -31
data_management/camtrap_dp_to_coco.py +238 -0
data_management/cct_json_utils.py +107 -59
data_management/cct_to_md.py +176 -158
data_management/cct_to_wi.py +247 -219
data_management/coco_to_labelme.py +272 -0
data_management/coco_to_yolo.py +86 -62
data_management/databases/__init__.py +0 -0
data_management/databases/add_width_and_height_to_db.py +20 -16
data_management/databases/combine_coco_camera_traps_files.py +35 -31
data_management/databases/integrity_check_json_db.py +130 -83
data_management/databases/subset_json_db.py +25 -16
data_management/generate_crops_from_cct.py +27 -45
data_management/get_image_sizes.py +188 -144
data_management/importers/add_nacti_sizes.py +8 -8
data_management/importers/add_timestamps_to_icct.py +78 -78
data_management/importers/animl_results_to_md_results.py +158 -160
data_management/importers/auckland_doc_test_to_json.py +9 -9
data_management/importers/auckland_doc_to_json.py +8 -8
data_management/importers/awc_to_json.py +7 -7
data_management/importers/bellevue_to_json.py +15 -15
data_management/importers/cacophony-thermal-importer.py +13 -13
data_management/importers/carrizo_shrubfree_2018.py +8 -8
data_management/importers/carrizo_trail_cam_2017.py +8 -8
data_management/importers/cct_field_adjustments.py +9 -9
data_management/importers/channel_islands_to_cct.py +10 -10
data_management/importers/eMammal/copy_and_unzip_emammal.py +1 -0
data_management/importers/ena24_to_json.py +7 -7
data_management/importers/filenames_to_json.py +8 -8
data_management/importers/helena_to_cct.py +7 -7
data_management/importers/idaho-camera-traps.py +7 -7
data_management/importers/idfg_iwildcam_lila_prep.py +10 -10
data_management/importers/jb_csv_to_json.py +9 -9
data_management/importers/mcgill_to_json.py +8 -8
data_management/importers/missouri_to_json.py +18 -18
data_management/importers/nacti_fieldname_adjustments.py +10 -10
data_management/importers/noaa_seals_2019.py +8 -8
data_management/importers/pc_to_json.py +7 -7
data_management/importers/plot_wni_giraffes.py +7 -7
data_management/importers/prepare-noaa-fish-data-for-lila.py +359 -359
data_management/importers/prepare_zsl_imerit.py +7 -7
data_management/importers/rspb_to_json.py +8 -8
data_management/importers/save_the_elephants_survey_A.py +8 -8
data_management/importers/save_the_elephants_survey_B.py +9 -9
data_management/importers/snapshot_safari_importer.py +26 -26
data_management/importers/snapshot_safari_importer_reprise.py +665 -665
data_management/importers/snapshot_serengeti_lila.py +14 -14
data_management/importers/sulross_get_exif.py +8 -9
data_management/importers/timelapse_csv_set_to_json.py +11 -11
data_management/importers/ubc_to_json.py +13 -13
data_management/importers/umn_to_json.py +7 -7
data_management/importers/wellington_to_json.py +8 -8
data_management/importers/wi_to_json.py +9 -9
data_management/importers/zamba_results_to_md_results.py +181 -181
data_management/labelme_to_coco.py +309 -159
data_management/labelme_to_yolo.py +103 -60
data_management/lila/__init__.py +0 -0
data_management/lila/add_locations_to_island_camera_traps.py +9 -9
data_management/lila/add_locations_to_nacti.py +147 -147
data_management/lila/create_lila_blank_set.py +114 -31
data_management/lila/create_lila_test_set.py +8 -8
data_management/lila/create_links_to_md_results_files.py +106 -106
data_management/lila/download_lila_subset.py +92 -90
data_management/lila/generate_lila_per_image_labels.py +56 -43
data_management/lila/get_lila_annotation_counts.py +18 -15
data_management/lila/get_lila_image_counts.py +11 -11
data_management/lila/lila_common.py +103 -70
data_management/lila/test_lila_metadata_urls.py +132 -116
data_management/ocr_tools.py +173 -128
data_management/read_exif.py +161 -99
data_management/remap_coco_categories.py +84 -0
data_management/remove_exif.py +58 -62
data_management/resize_coco_dataset.py +32 -44
data_management/wi_download_csv_to_coco.py +246 -0
data_management/yolo_output_to_md_output.py +86 -73
data_management/yolo_to_coco.py +535 -95
detection/__init__.py +0 -0
detection/detector_training/__init__.py +0 -0
detection/process_video.py +85 -33
detection/pytorch_detector.py +43 -25
detection/run_detector.py +157 -72
detection/run_detector_batch.py +189 -114
detection/run_inference_with_yolov5_val.py +118 -51
detection/run_tiled_inference.py +113 -42
detection/tf_detector.py +51 -28
detection/video_utils.py +606 -521
docs/source/conf.py +43 -0
md_utils/__init__.py +0 -0
md_utils/azure_utils.py +9 -9
md_utils/ct_utils.py +249 -70
md_utils/directory_listing.py +59 -64
md_utils/md_tests.py +968 -862
md_utils/path_utils.py +655 -155
md_utils/process_utils.py +157 -133
md_utils/sas_blob_utils.py +20 -20
md_utils/split_locations_into_train_val.py +45 -32
md_utils/string_utils.py +33 -10
md_utils/url_utils.py +208 -27
md_utils/write_html_image_list.py +51 -35
md_visualization/__init__.py +0 -0
md_visualization/plot_utils.py +102 -109
md_visualization/render_images_with_thumbnails.py +34 -34
md_visualization/visualization_utils.py +908 -311
md_visualization/visualize_db.py +109 -58
md_visualization/visualize_detector_output.py +61 -42
{megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/METADATA +21 -17
megadetector-5.0.9.dist-info/RECORD +224 -0
{megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/WHEEL +1 -1
{megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/top_level.txt +1 -0
taxonomy_mapping/__init__.py +0 -0
taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +342 -335
taxonomy_mapping/map_new_lila_datasets.py +154 -154
taxonomy_mapping/prepare_lila_taxonomy_release.py +142 -134
taxonomy_mapping/preview_lila_taxonomy.py +591 -591
taxonomy_mapping/retrieve_sample_image.py +12 -12
taxonomy_mapping/simple_image_download.py +11 -11
taxonomy_mapping/species_lookup.py +10 -10
taxonomy_mapping/taxonomy_csv_checker.py +18 -18
taxonomy_mapping/taxonomy_graph.py +47 -47
taxonomy_mapping/validate_lila_category_mappings.py +83 -76
data_management/cct_json_to_filename_json.py +0 -89
data_management/cct_to_csv.py +0 -140
data_management/databases/remove_corrupted_images_from_db.py +0 -191
detection/detector_training/copy_checkpoints.py +0 -43
md_visualization/visualize_megadb.py +0 -183
megadetector-5.0.7.dist-info/RECORD +0 -202
{megadetector-5.0.7.dist-info → megadetector-5.0.9.dist-info}/LICENSE +0 -0

data_management/ocr_tools.py CHANGED Viewed

@@ -1,33 +1,33 @@
-########
-#
-# ocr_tools.py
-#
-# Use OCR (via the Tesseract package) to pull metadata (particularly times and
-# dates from camera trap images).
-#
-# The general approach is:
-#
-# * Crop a fixed percentage from the top and bottom of an image, slightly larger
-#   than the largest examples we've seen of how much space is used for metadata.
-#
-# * Define the background color as the median pixel value, and find rows that are
-#   mostly that color to refine the crop.
-#
-# * Crop to the refined crop, then run pytesseract to extract text.
-#
-# * Use regular expressions to find time and date.
-#
-# Prior to using this module:
-#
-# * Install Tesseract from https://tesseract-ocr.github.io/tessdoc/Installation.html
-#
-# * pip install pytesseract
-#
-# Known limitations:
-#
-# * Semi-transparent overlays (which I've only seen on consumer cameras) usually fail.
-#
-########
+"""
+ocr_tools.py
+Use OCR (via the Tesseract package) to pull metadata (particularly times and
+dates) from camera trap images.
+The general approach is:
+* Crop a fixed percentage from the top and bottom of an image, slightly larger
+  than the largest examples we've seen of how much space is used for metadata.
+* Define the background color as the median pixel value, and find rows that are
+  mostly that color to refine the crop.
+* Crop to the refined crop, then run pytesseract to extract text.
+* Use regular expressions to find time and date.
+Prior to using this module:
+* Install Tesseract from https://tesseract-ocr.github.io/tessdoc/Installation.html
+* pip install pytesseract
+Known limitations:
+* Semi-transparent overlays (which I've only seen on consumer cameras) usually fail.
+"""
 #%% Notes to self
@@ -55,9 +55,10 @@ from PIL import Image, ImageFilter
 from tqdm import tqdm
 from md_utils.path_utils import find_images
-from md_visualization import visualization_utils as vis_utils
-from md_utils import write_html_image_list
 from md_utils.path_utils import open_file
+from md_utils import write_html_image_list
+from md_utils.ct_utils import is_iterable
+from md_visualization import visualization_utils as vis_utils
 # pip install pytesseract
 #
@@ -69,58 +70,64 @@ import pytesseract
 #%% Extraction options
 class DatetimeExtractionOptions:
+    """
+    Options used to parameterize datetime extraction in most functions in this module.
+    """
     def __init__(self):
-        # Using a semi-arbitrary metric of how much it feels like we found the
-        # text-containing region, discard regions that appear to be extraction failures
+        #: Using a semi-arbitrary metric of how much it feels like we found the
+        #: text-containing region, discard regions that appear to be extraction failures
         self.p_crop_success_threshold = 0.5
-        # Pad each crop with a few pixels to make tesseract happy
+        #: Pad each crop with a few pixels to make tesseract happy
         self.crop_padding = 10
-        # Discard short text, typically text from the top of the image
+        #: Discard short text, typically text from the top of the image
         self.min_text_length = 4
-        # When we're looking for pixels that match the background color, allow some
-        # tolerance around the dominant color
+        #: When we're looking for pixels that match the background color, allow some
+        #: tolerance around the dominant color
         self.background_tolerance = 2
-        # We need to see a consistent color in at least this fraction of pixels in our rough
-        # crop to believe that we actually found a candidate metadata region.
+        #: We need to see a consistent color in at least this fraction of pixels in our rough
+        #: crop to believe that we actually found a candidate metadata region.
         self.min_background_fraction = 0.3
-        # What fraction of the [top,bottom] of the image should we use for our rough crop?
+        #: What fraction of the [top,bottom] of the image should we use for our rough crop?
         self.image_crop_fraction = [0.045 , 0.045]
         # self.image_crop_fraction = [0.08 , 0.08]
-        # Within that rough crop, how much should we use for determining the background color?
+        #: Within that rough crop, how much should we use for determining the background color?
         self.background_crop_fraction_of_rough_crop = 0.5
-        # A row is considered a probable metadata row if it contains at least this fraction
-        # of the background color.  This is used only to find the top and bottom of the crop area,
-        # so it's not that *every* row needs to hit this criteria, only the rows that are generally
-        # above and below the text.
+        #: A row is considered a probable metadata row if it contains at least this fraction
+        #: of the background color.  This is used only to find the top and bottom of the crop area,
+        #: so it's not that *every* row needs to hit this criteria, only the rows that are generally
+        #: above and below the text.
         self.min_background_fraction_for_background_row = 0.5
-        # psm 6: "assume a single uniform block of text"
-        # psm 13: raw line
-        # oem: 0 == legacy, 1 == lstm
-        # tesseract_config_string = '--oem 0 --psm 6'
-        #
-        # Try these configuration strings in order until we find a valid datetime
+        #: psm 6: "assume a single uniform block of text"
+        #: psm 13: raw line
+        #: oem: 0 == legacy, 1 == lstm
+        #: tesseract_config_string = '--oem 0 --psm 6'
+        #:
+        #: Try these configuration strings in order until we find a valid datetime
         self.tesseract_config_strings = ['--oem 1 --psm 13','--oem 0 --psm 13',
                                          '--oem 1 --psm 6','--oem 0 --psm 6']
+        #: If this is False, and one set of options appears to succeed for an image, we'll
+        #: stop there.  If this is True, we always run all option sets on every image.
         self.force_all_ocr_options = False
+        #: Whether to apply PIL's ImageFilter.SHARPEN prior to OCR
         self.apply_sharpening_filter = True
-        # Tesseract should be on your system path, but you can also specify the
-        # path explicitly.
-        #
-        # os.environ['PATH'] += r';C:\Program Files\Tesseract-OCR'
-        # self.tesseract_cmd = 'r"C:\Program Files\Tesseract-OCR\tesseract.exe"'
+        #: Tesseract should be on your system path, but you can also specify the
+        #: path explicitly, e.g. you can do either of these:
+        #:
+        #: * os.environ['PATH'] += r';C:\Program Files\Tesseract-OCR'
+        #: * self.tesseract_cmd = 'r"C:\Program Files\Tesseract-OCR\tesseract.exe"'
         self.tesseract_cmd = 'tesseract.exe'
@@ -128,10 +135,14 @@ class DatetimeExtractionOptions:
 def make_rough_crops(image,options=None):
     """
-    Crops the top and bottom regions out of an image, returns a dict with fields
-    'top' and 'bottom', each pointing to a PIL image.
+    Crops the top and bottom regions out of an image.
-    [image] can be a PIL image or a file name.
+    Args:
+        image (Image or str): a PIL Image or file name
+        options (DatetimeExtractionOptions, optional): OCR parameters
+    Returns:
+        dict: a dict with fields 'top' and 'bottom', each pointing to a new PIL Image
     """
     if options is None:
@@ -158,10 +169,8 @@ def make_rough_crops(image,options=None):
 def crop_to_solid_region(rough_crop,crop_location,options=None):
     """
-    Given a rough crop from the top or bottom of an imaeg, find the background color
-    and crop to the metadata region.
-    rough_crop should be PIL Image, crop_location should be 'top' or 'bottom'.
+    Given a rough crop from the top or bottom of an image, finds the background color
+    and crops to the metadata region.
     Within a region of an image (typically a crop from the top-ish or bottom-ish part of
     an image), tightly crop to the solid portion (typically a region with a black background).
@@ -169,7 +178,13 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
     The success metric is just a binary indicator right now: 1.0 if we found a region we believe
     contains a solid background, 0.0 otherwise.
-    Returns cropped_image,p_success,padded_image
+    Args:
+        rough_crop (Image): the PIL Image to crop
+        crop_location (str): 'top' or 'bottom'
+        options (DatetimeExtractionOptions, optional): OCR parameters
+    Returns:
+        tuple: a tuple containing (cropped_image (Image), p_success (float), padded_image (Image))
     """
     if options is None:
@@ -283,8 +298,17 @@ def crop_to_solid_region(rough_crop,crop_location,options=None):
 def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
     """
-    Find all text in each Image in the dict [rough_crops]; those images should be pretty small
+    Finds all text in each Image in the dict [rough_crops]; those images should be pretty small
     regions by the time they get to this function, roughly the top or bottom 20% of an image.
+    Args:
+        rough_crops (dict): dict of Image objects that have been cropped close to text
+        options (DatetimeExtractionOptions, optional): OCR parameters
+        tesseract_config_string (str, optional): optional CLI argument to pass to tesseract.exe
+    Returns:
+        dict: a dict with keys "top" and "bottom", where each value is a dict with keys
+        'text' (text found, if any) and 'crop_to_solid_region_results' (metadata about the OCR pass)
     """
     if options is None:
@@ -338,7 +362,7 @@ def find_text_in_crops(rough_crops,options=None,tesseract_config_string=None):
 # ...def find_text_in_crops(...)
-def datetime_string_to_datetime(matched_string):
+def _datetime_string_to_datetime(matched_string):
     """
     Takes an OCR-matched datetime string, does a little cleanup, and parses a date
     from it.
@@ -358,7 +382,7 @@ def datetime_string_to_datetime(matched_string):
     return extracted_datetime
-def get_datetime_from_strings(strings,options=None):
+def _get_datetime_from_strings(strings,options=None):
     """
     Given a string or list of strings, search for exactly one datetime in those strings
     using a series of regular expressions.
@@ -381,72 +405,76 @@ def get_datetime_from_strings(strings,options=None):
     # 2013-10-02 11:40:50 AM
     m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s+(\d+)\s?:?\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     # 04/01/2017 08:54:00AM
     m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     # 2017/04/01 08:54:00AM
     m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s?:\s?(\d\d)\s*([a|p]m)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     # 04/01/2017 08:54AM
     m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     # 2017/04/01 08:54AM
     m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s+(\d+)\s?:\s?(\d\d)\s*([a|p]m)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     ### No AM/PM
     # 2013-07-27 04:56:35
     m = re.search('(\d\d\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     # 07-27-2013 04:56:35
     m = re.search('(\d\d)\s?-\s?(\d\d)\s?-\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     # 2013/07/27 04:56:35
     m = re.search('(\d\d\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     # 07/27/2013 04:56:35
     m = re.search('(\d\d)\s?/\s?(\d\d)\s?/\s?(\d\d\d\d)\s*(\d\d)\s?:\s?(\d\d)\s?:\s?(\d\d)',s)
     if m is not None:
-        return datetime_string_to_datetime(m.group(0))
+        return _datetime_string_to_datetime(m.group(0))
     return None
-# ...def get_datetime_from_strings(...)
+# ...def _get_datetime_from_strings(...)
 def get_datetime_from_image(image,include_crops=True,options=None):
     """
-    Find the datetime string (if present) in [image], which can be a PIL image or a
-    filename.  Returns a dict:
-    datetime: Python datetime object, or None
-    text_results: length-2 list of strings
-    all_extracted_datetimes: if we ran multiple option sets, this will contain the
-    datetimes extracted for each option set
-    ocr_results: detailed results from the OCR process, including crops as PIL images;
-    only included if include_crops is True.
-    [options] can be None, a DatetimeExtractionOptions object, or a list of
-    DatetimeExtractionOptions objects to try for each image.
+    Tries to find the datetime string (if present) in an image.
+    Args:
+        image (Image or str): the PIL Image object or image filename in which we should look for
+            datetime information.
+        include_crops (bool, optional): whether to include cropped images in the return dict (set
+            this to False if you're worried about size and you're processing a zillion images)
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+            DatetimeExtractionOptions object or a list of options to try
+    Returns:
+        dict: a dict with fields:
+            - datetime: Python datetime object, or None
+            - text_results: length-2 list of strings
+            - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+              datetimes extracted for each option set
+            - ocr_results: detailed results from the OCR process, including crops as PIL images;
+              only included if include_crops is True
     """
     if options is None:
@@ -478,7 +506,7 @@ def get_datetime_from_image(image,include_crops=True,options=None):
         all_text_results.append(text_results)
         # Find datetime
-        extracted_datetime_this_option_set = get_datetime_from_strings(text_results,options)
+        extracted_datetime_this_option_set = _get_datetime_from_strings(text_results,options)
         assert isinstance(extracted_datetime_this_option_set,datetime.datetime) or \
             (extracted_datetime_this_option_set is None)
@@ -512,18 +540,27 @@ def get_datetime_from_image(image,include_crops=True,options=None):
 # ...def get_datetime_from_image(...)
-def is_iterable(x):
-    try:
-        _ = iter(x)
-    except:
-       return False
-    return True
 def try_get_datetime_from_image(filename,include_crops=False,options=None):
     """
     Try/catch wrapper for get_datetime_from_image, optionally trying multiple option sets
     until we find a datetime.
+    Args:
+        filename (Image or str): the PIL Image object or image filename in which we should look for
+            datetime information.
+        include_crops (bool, optional): whether to include cropped images in the return dict (set
+            this to False if you're worried about size and you're processing a zillion images)
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+            DatetimeExtractionOptions object or a list of options to try
+    Returns:
+        dict: A dict with fields:
+            - datetime: Python datetime object, or None
+            - text_results: length-2 list of strings
+            - all_extracted_datetimes: if we ran multiple option sets, this will contain the
+              datetimes extracted for each option set
+            - ocr_results: detailed results from the OCR process, including crops as PIL images;
+              only included if include_crops is True
     """
     if options is None:
@@ -547,16 +584,28 @@ def try_get_datetime_from_image(filename,include_crops=False,options=None):
     return result
-def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options=None):
+def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options=None,
+                             n_workers=16,use_threads=False):
     """
-    Retrieve metadata from every image in [folder_name], and
-    write the results to the .json file [output_file].
-    [options] can be None, a DatetimeExtractionOptions object, or a list of
-    DatetimeExtractionOptions objects to try for each image.
-    Returns a dict mapping filenames to datetime extraction results.  Optionally writes
-    results to the .json file [output_file].
+    The main entry point for this module.  Tries to retrieve metadata from pixels for every
+    image in [folder_name], optionally writing the results to the .json file [output_file].
+    Args:
+        folder_name (str): the folder of images to process recursively
+        output_file (str, optional): the .json file to which we should write results; if None,
+            just returns the results
+        n_to_sample (int, optional): for debugging only, used to limit the number of images
+            we process
+        options (DatetimeExtractionOptions or list, optional): OCR parameters, either one
+            DatetimeExtractionOptions object or a list of options to try for each image
+        n_workers (int, optional): the number of parallel workers to use; set to <= 1 to disable
+            parallelization
+        use_threads (bool, optional): whether to use threads (True) or processes (False) for
+            parallelization; not relevant if n_workers <= 1
+    Returns:
+        dict: a dict mapping filenames to datetime extraction results, see try_get_datetime_from_image
+        for the format of each value in the dict.
     """
     if options is None:
@@ -570,11 +619,8 @@ def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options
         import random
         random.seed(0)
         image_file_names = random.sample(image_file_names,n_to_sample)
-    n_cores = 16
-    use_threads = False
-    if n_cores <= 1:
+    if n_workers <= 1:
         all_results = []
         for fn_abs in tqdm(image_file_names):
@@ -583,19 +629,19 @@ def get_datetimes_for_folder(folder_name,output_file=None,n_to_sample=-1,options
     else:
         # Don't spawn more than one worker per image
-        if n_cores > len(image_file_names):
-            n_cores = len(image_file_names)
+        if n_workers > len(image_file_names):
+            n_workers = len(image_file_names)
         if use_threads:
             from multiprocessing.pool import ThreadPool
-            pool = ThreadPool(n_cores)
+            pool = ThreadPool(n_workers)
             worker_string = 'threads'
         else:
             from multiprocessing.pool import Pool
-            pool = Pool(n_cores)
+            pool = Pool(n_workers)
             worker_string = 'processes'
-        print('Starting a pool of {} {}'.format(n_cores,worker_string))
+        print('Starting a pool of {} {}'.format(n_workers,worker_string))
         all_results = list(tqdm(pool.imap(
             partial(try_get_datetime_from_image,options=options),image_file_names),
@@ -621,7 +667,6 @@ if False:
     #%% Process images
     folder_name = r'g:\temp\island_conservation_camera_traps'
-    # folder_name = r'g:\camera_traps\camera_trap_images'
     output_file = r'g:\temp\ocr_results.json'
     from md_utils.path_utils import insert_before_extension
     output_file = insert_before_extension(output_file)
@@ -650,11 +695,6 @@ if False:
     #%% Scrap cell
     fn = 'g:/camera_traps/camera_trap_images/2018.07.02/newcam/people/DSCF0273.JPG'
-    # fn = r'g:\camera_traps\camera_trap_images\2022.01.29\cam0\coyote\DSCF0057.JPG'
-    # fn = 'g:/temp/island_conservation_camera_traps/chile/frances01/frances012013/chile_frances012013_02012013105658.jpg'
-    # fn = 'g:/temp/island_conservation_camera_traps/dominicanrepublic/camara06/cam0618junio2016/dominicanrepublic_cam0618junio2016_20160614_114115_img_0013.jpg'
-    # fn = os.path.join(folder_name,r'dominicanrepublic\camara22\cam228noviembre2015\dominicanrepublic_cam228noviembre2015_20151105_071226_img_0132.jpg')
-    # fn = 'g:/camera_traps/camera_trap_images/2021.06.06/camera01/empty/DSCF0873.JPG'
     include_crops = False
     options_a = DatetimeExtractionOptions()
     options_b = DatetimeExtractionOptions()
@@ -827,3 +867,8 @@ if False:
     if extracted_datetime is not None:
         assert extracted_datetime.year <= 2023 and extracted_datetime.year >= 1990
+#%% Command-line driver
+# TODO

megadetector 5.0.7__py3-none-any.whl → 5.0.9__py3-none-any.whl

Potentially problematic release.

megadetector 5.0.7py3-none-any.whl → 5.0.9py3-none-any.whl