megadetector 5.0.27__py3-none-any.whl → 5.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +23 -23
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -1
- megadetector/data_management/camtrap_dp_to_coco.py +45 -45
- megadetector/data_management/cct_json_utils.py +101 -101
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +189 -189
- megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
- megadetector/data_management/databases/integrity_check_json_db.py +202 -188
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +38 -38
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +130 -124
- megadetector/data_management/labelme_to_yolo.py +78 -72
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
- megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +70 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
- megadetector/data_management/mewc_to_md.py +339 -340
- megadetector/data_management/ocr_tools.py +258 -252
- megadetector/data_management/read_exif.py +232 -223
- megadetector/data_management/remap_coco_categories.py +26 -26
- megadetector/data_management/remove_exif.py +31 -20
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +41 -41
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +117 -120
- megadetector/data_management/yolo_to_coco.py +195 -188
- megadetector/detection/change_detection.py +831 -0
- megadetector/detection/process_video.py +341 -338
- megadetector/detection/pytorch_detector.py +308 -266
- megadetector/detection/run_detector.py +186 -166
- megadetector/detection/run_detector_batch.py +366 -364
- megadetector/detection/run_inference_with_yolov5_val.py +328 -325
- megadetector/detection/run_tiled_inference.py +312 -253
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +291 -283
- megadetector/postprocessing/add_max_conf.py +15 -11
- megadetector/postprocessing/categorize_detections_by_size.py +44 -44
- megadetector/postprocessing/classification_postprocessing.py +808 -311
- megadetector/postprocessing/combine_batch_outputs.py +20 -21
- megadetector/postprocessing/compare_batch_results.py +528 -517
- megadetector/postprocessing/convert_output_format.py +97 -97
- megadetector/postprocessing/create_crop_folder.py +220 -147
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -0
- megadetector/postprocessing/load_api_results.py +25 -22
- megadetector/postprocessing/md_to_coco.py +129 -98
- megadetector/postprocessing/md_to_labelme.py +89 -83
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +87 -114
- megadetector/postprocessing/postprocess_batch_results.py +319 -302
- megadetector/postprocessing/remap_detection_categories.py +36 -36
- megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -69
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +33 -33
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +11 -11
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/azure_utils.py +22 -22
- megadetector/utils/ct_utils.py +1019 -200
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +541 -518
- megadetector/utils/path_utils.py +1511 -406
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/sas_blob_utils.py +53 -49
- megadetector/utils/split_locations_into_train_val.py +73 -60
- megadetector/utils/string_utils.py +147 -26
- megadetector/utils/url_utils.py +463 -173
- megadetector/utils/wi_utils.py +2629 -2868
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +21 -21
- megadetector/visualization/render_images_with_thumbnails.py +37 -73
- megadetector/visualization/visualization_utils.py +424 -404
- megadetector/visualization/visualize_db.py +197 -190
- megadetector/visualization/visualize_detector_output.py +126 -98
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/METADATA +6 -3
- megadetector-5.0.29.dist-info/RECORD +163 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector-5.0.27.dist-info/RECORD +0 -208
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.27.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
|
@@ -4,8 +4,8 @@ split_locations_into_train_val.py
|
|
|
4
4
|
|
|
5
5
|
Splits a list of location IDs into training and validation, targeting a specific
|
|
6
6
|
train/val split for each category, but allowing some categories to be tighter or looser
|
|
7
|
-
than others. Does nothing particularly clever, just randomly splits locations into
|
|
8
|
-
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
7
|
+
than others. Does nothing particularly clever, just randomly splits locations into
|
|
8
|
+
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
9
9
|
specified constraints and minimizes weighted error, where "error" is defined as the
|
|
10
10
|
sum of each class's absolute divergence from the target val fraction.
|
|
11
11
|
|
|
@@ -26,60 +26,63 @@ from tqdm import tqdm
|
|
|
26
26
|
def split_locations_into_train_val(location_to_category_counts,
|
|
27
27
|
n_random_seeds=10000,
|
|
28
28
|
target_val_fraction=0.15,
|
|
29
|
-
category_to_max_allowable_error=None,
|
|
29
|
+
category_to_max_allowable_error=None,
|
|
30
30
|
category_to_error_weight=None,
|
|
31
|
-
default_max_allowable_error=0.1
|
|
31
|
+
default_max_allowable_error=0.1,
|
|
32
|
+
require_complete_coverage=True):
|
|
32
33
|
"""
|
|
33
34
|
Splits a list of location IDs into training and validation, targeting a specific
|
|
34
35
|
train/val split for each category, but allowing some categories to be tighter or looser
|
|
35
|
-
than others. Does nothing particularly clever, just randomly splits locations into
|
|
36
|
-
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
36
|
+
than others. Does nothing particularly clever, just randomly splits locations into
|
|
37
|
+
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
37
38
|
specified constraints and minimizes weighted error, where "error" is defined as the
|
|
38
|
-
sum of each class's absolute divergence from the target val fraction.
|
|
39
|
-
|
|
39
|
+
sum of each class's absolute divergence from the target val fraction.
|
|
40
|
+
|
|
40
41
|
Args:
|
|
41
42
|
location_to_category_counts (dict): a dict mapping location IDs to dicts,
|
|
42
|
-
with each dict mapping a category name to a count. Any categories not present
|
|
43
|
+
with each dict mapping a category name to a count. Any categories not present
|
|
43
44
|
in a particular dict are assumed to have a count of zero for that location.
|
|
44
|
-
|
|
45
|
+
|
|
45
46
|
For example:
|
|
46
|
-
|
|
47
|
+
|
|
47
48
|
.. code-block:: none
|
|
48
49
|
|
|
49
50
|
{'location-000': {'bear':4,'wolf':10},
|
|
50
51
|
'location-001': {'bear':12,'elk':20}}
|
|
51
|
-
|
|
52
|
+
|
|
52
53
|
n_random_seeds (int, optional): number of random seeds to try, always starting from zero
|
|
53
54
|
target_val_fraction (float, optional): fraction of images containing each species we'd
|
|
54
55
|
like to put in the val split
|
|
55
56
|
category_to_max_allowable_error (dict, optional): a dict mapping category names
|
|
56
57
|
to maximum allowable errors. These are hard constraints (i.e., we will error
|
|
57
|
-
if we can't meet them). Does not need to include all categories; categories not
|
|
58
|
+
if we can't meet them). Does not need to include all categories; categories not
|
|
58
59
|
included will be assigned a maximum error according to [default_max_allowable_error].
|
|
59
60
|
If this is None, no hard constraints are applied.
|
|
60
61
|
category_to_error_weight (dict, optional): a dict mapping category names to
|
|
61
62
|
error weights. You can specify a subset of categories; categories not included here
|
|
62
63
|
have a weight of 1.0. If None, all categories have the same weight.
|
|
63
|
-
default_max_allowable_error (float, optional): the maximum allowable error for categories not
|
|
64
|
-
present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
|
|
64
|
+
default_max_allowable_error (float, optional): the maximum allowable error for categories not
|
|
65
|
+
present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
|
|
65
66
|
constraints for categories not present in [category_to_max_allowable_error]
|
|
66
|
-
|
|
67
|
+
require_complete_coverage (bool, optional): require that every category appear in both train and
|
|
68
|
+
val
|
|
69
|
+
|
|
67
70
|
Returns:
|
|
68
71
|
tuple: A two-element tuple:
|
|
69
72
|
- list of location IDs in the val split
|
|
70
|
-
- a dict mapping category names to the fraction of images in the val split
|
|
73
|
+
- a dict mapping category names to the fraction of images in the val split
|
|
71
74
|
"""
|
|
72
|
-
|
|
75
|
+
|
|
73
76
|
location_ids = list(location_to_category_counts.keys())
|
|
74
|
-
|
|
77
|
+
|
|
75
78
|
n_val_locations = int(target_val_fraction*len(location_ids))
|
|
76
|
-
|
|
79
|
+
|
|
77
80
|
if category_to_max_allowable_error is None:
|
|
78
81
|
category_to_max_allowable_error = {}
|
|
79
|
-
|
|
82
|
+
|
|
80
83
|
if category_to_error_weight is None:
|
|
81
84
|
category_to_error_weight = {}
|
|
82
|
-
|
|
85
|
+
|
|
83
86
|
# category ID to total count; the total count is used only for printouts
|
|
84
87
|
category_id_to_count = {}
|
|
85
88
|
for location_id in location_to_category_counts:
|
|
@@ -88,28 +91,28 @@ def split_locations_into_train_val(location_to_category_counts,
|
|
|
88
91
|
category_id_to_count[category_id] = 0
|
|
89
92
|
category_id_to_count[category_id] += \
|
|
90
93
|
location_to_category_counts[location_id][category_id]
|
|
91
|
-
|
|
94
|
+
|
|
92
95
|
category_ids = set(category_id_to_count.keys())
|
|
93
|
-
|
|
96
|
+
|
|
94
97
|
print('Splitting {} categories over {} locations'.format(
|
|
95
98
|
len(category_ids),len(location_ids)))
|
|
96
|
-
|
|
99
|
+
|
|
97
100
|
# random_seed = 0
|
|
98
101
|
def compute_seed_errors(random_seed):
|
|
99
102
|
"""
|
|
100
103
|
Computes the per-category error for a specific random seed.
|
|
101
|
-
|
|
104
|
+
|
|
102
105
|
returns weighted_average_error,category_to_val_fraction
|
|
103
106
|
"""
|
|
104
|
-
|
|
107
|
+
|
|
105
108
|
# Randomly split into train/val
|
|
106
109
|
random.seed(random_seed)
|
|
107
110
|
val_locations = random.sample(location_ids,k=n_val_locations)
|
|
108
111
|
val_locations_set = set(val_locations)
|
|
109
|
-
|
|
112
|
+
|
|
110
113
|
# For each category, measure the % of images that went into the val set
|
|
111
114
|
category_to_val_fraction = defaultdict(float)
|
|
112
|
-
|
|
115
|
+
|
|
113
116
|
for category_id in category_ids:
|
|
114
117
|
category_val_count = 0
|
|
115
118
|
category_train_count = 0
|
|
@@ -124,42 +127,42 @@ def split_locations_into_train_val(location_to_category_counts,
|
|
|
124
127
|
category_train_count += location_category_count
|
|
125
128
|
category_val_fraction = category_val_count / (category_val_count + category_train_count)
|
|
126
129
|
category_to_val_fraction[category_id] = category_val_fraction
|
|
127
|
-
|
|
128
|
-
# Absolute deviation from the target val fraction for each
|
|
130
|
+
|
|
131
|
+
# Absolute deviation from the target val fraction for each category
|
|
129
132
|
category_errors = {}
|
|
130
133
|
weighted_category_errors = {}
|
|
131
|
-
|
|
134
|
+
|
|
132
135
|
# category = next(iter(category_to_val_fraction))
|
|
133
136
|
for category in category_to_val_fraction:
|
|
134
|
-
|
|
137
|
+
|
|
135
138
|
category_val_fraction = category_to_val_fraction[category]
|
|
136
|
-
|
|
139
|
+
|
|
137
140
|
category_error = abs(category_val_fraction-target_val_fraction)
|
|
138
141
|
category_errors[category] = category_error
|
|
139
|
-
|
|
142
|
+
|
|
140
143
|
category_weight = 1.0
|
|
141
144
|
if category in category_to_error_weight:
|
|
142
145
|
category_weight = category_to_error_weight[category]
|
|
143
146
|
weighted_category_error = category_error * category_weight
|
|
144
147
|
weighted_category_errors[category] = weighted_category_error
|
|
145
|
-
|
|
148
|
+
|
|
146
149
|
weighted_average_error = np.mean(list(weighted_category_errors.values()))
|
|
147
|
-
|
|
150
|
+
|
|
148
151
|
return weighted_average_error,weighted_category_errors,category_to_val_fraction
|
|
149
|
-
|
|
152
|
+
|
|
150
153
|
# ... def compute_seed_errors(...)
|
|
151
|
-
|
|
154
|
+
|
|
152
155
|
# This will only include random seeds that satisfy the hard constraints
|
|
153
156
|
random_seed_to_weighted_average_error = {}
|
|
154
|
-
|
|
157
|
+
|
|
155
158
|
# random_seed = 0
|
|
156
159
|
for random_seed in tqdm(range(0,n_random_seeds)):
|
|
157
|
-
|
|
160
|
+
|
|
158
161
|
weighted_average_error,weighted_category_errors,category_to_val_fraction = \
|
|
159
162
|
compute_seed_errors(random_seed)
|
|
160
|
-
|
|
163
|
+
|
|
161
164
|
seed_satisfies_hard_constraints = True
|
|
162
|
-
|
|
165
|
+
|
|
163
166
|
for category in category_to_val_fraction:
|
|
164
167
|
if category in category_to_max_allowable_error:
|
|
165
168
|
max_allowable_error = category_to_max_allowable_error[category]
|
|
@@ -168,61 +171,71 @@ def split_locations_into_train_val(location_to_category_counts,
|
|
|
168
171
|
continue
|
|
169
172
|
max_allowable_error = default_max_allowable_error
|
|
170
173
|
val_fraction = category_to_val_fraction[category]
|
|
174
|
+
|
|
175
|
+
# If necessary, verify that this category doesn't *only* appear in train or val
|
|
176
|
+
if require_complete_coverage:
|
|
177
|
+
if (val_fraction == 0.0) or (val_fraction == 1.0):
|
|
178
|
+
seed_satisfies_hard_constraints = False
|
|
179
|
+
break
|
|
180
|
+
|
|
181
|
+
# Check whether this category exceeds the hard maximum deviation
|
|
171
182
|
category_error = abs(val_fraction - target_val_fraction)
|
|
172
183
|
if category_error > max_allowable_error:
|
|
173
184
|
seed_satisfies_hard_constraints = False
|
|
174
185
|
break
|
|
175
|
-
|
|
176
|
-
|
|
186
|
+
|
|
187
|
+
# ...for each category
|
|
188
|
+
|
|
189
|
+
if seed_satisfies_hard_constraints:
|
|
177
190
|
random_seed_to_weighted_average_error[random_seed] = weighted_average_error
|
|
178
|
-
|
|
191
|
+
|
|
179
192
|
# ...for each random seed
|
|
180
|
-
|
|
193
|
+
|
|
181
194
|
assert len(random_seed_to_weighted_average_error) > 0, \
|
|
182
195
|
'No random seed met all the hard constraints'
|
|
183
|
-
|
|
196
|
+
|
|
184
197
|
print('\n{} of {} random seeds satisfied hard constraints'.format(
|
|
185
198
|
len(random_seed_to_weighted_average_error),n_random_seeds))
|
|
186
|
-
|
|
199
|
+
|
|
187
200
|
min_error = None
|
|
188
201
|
min_error_seed = None
|
|
189
|
-
|
|
202
|
+
|
|
190
203
|
for random_seed in random_seed_to_weighted_average_error.keys():
|
|
191
204
|
error_metric = random_seed_to_weighted_average_error[random_seed]
|
|
192
205
|
if min_error is None or error_metric < min_error:
|
|
193
206
|
min_error = error_metric
|
|
194
207
|
min_error_seed = random_seed
|
|
195
|
-
|
|
208
|
+
|
|
196
209
|
random.seed(min_error_seed)
|
|
197
210
|
val_locations = random.sample(location_ids,k=n_val_locations)
|
|
198
211
|
train_locations = []
|
|
199
212
|
for location_id in location_ids:
|
|
200
213
|
if location_id not in val_locations:
|
|
201
214
|
train_locations.append(location_id)
|
|
202
|
-
|
|
203
|
-
print('\nVal locations:\n')
|
|
215
|
+
|
|
216
|
+
print('\nVal locations:\n')
|
|
204
217
|
for loc in val_locations:
|
|
205
218
|
print('{}'.format(loc))
|
|
206
219
|
print('')
|
|
207
|
-
|
|
220
|
+
|
|
208
221
|
weighted_average_error,weighted_category_errors,category_to_val_fraction = \
|
|
209
222
|
compute_seed_errors(min_error_seed)
|
|
210
|
-
|
|
223
|
+
|
|
211
224
|
random_seed = min_error_seed
|
|
212
|
-
|
|
225
|
+
|
|
213
226
|
category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
|
|
214
227
|
category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
|
|
215
228
|
sort_values=category_id_to_count,
|
|
216
229
|
reverse=True)
|
|
217
|
-
|
|
218
|
-
|
|
230
|
+
|
|
231
|
+
|
|
219
232
|
print('Val fractions by category:\n')
|
|
220
|
-
|
|
233
|
+
|
|
221
234
|
for category in category_to_val_fraction:
|
|
222
235
|
print('{} ({}) {:.2f}'.format(
|
|
223
236
|
category,category_id_to_count[category],
|
|
224
237
|
category_to_val_fraction[category]))
|
|
225
|
-
|
|
238
|
+
|
|
226
239
|
return val_locations,category_to_val_fraction
|
|
227
240
|
|
|
228
241
|
# ...def split_locations_into_train_val(...)
|
|
@@ -14,15 +14,18 @@ import re
|
|
|
14
14
|
#%% Functions
|
|
15
15
|
|
|
16
16
|
def is_float(s):
|
|
17
|
-
"""
|
|
17
|
+
"""
|
|
18
18
|
Checks whether [s] is an object (typically a string) that can be cast to a float
|
|
19
|
-
|
|
19
|
+
|
|
20
20
|
Args:
|
|
21
21
|
s (object): object to evaluate
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
Returns:
|
|
24
24
|
bool: True if s successfully casts to a float, otherwise False
|
|
25
25
|
"""
|
|
26
|
+
|
|
27
|
+
if s is None:
|
|
28
|
+
return False
|
|
26
29
|
|
|
27
30
|
try:
|
|
28
31
|
_ = float(s)
|
|
@@ -36,57 +39,175 @@ def human_readable_to_bytes(size):
|
|
|
36
39
|
Given a human-readable byte string (e.g. 2G, 10GB, 30MB, 20KB),
|
|
37
40
|
returns the number of bytes. Will return 0 if the argument has
|
|
38
41
|
unexpected form.
|
|
39
|
-
|
|
42
|
+
|
|
40
43
|
https://gist.github.com/beugley/ccd69945346759eb6142272a6d69b4e0
|
|
41
|
-
|
|
44
|
+
|
|
42
45
|
Args:
|
|
43
46
|
size (str): string representing a size
|
|
44
|
-
|
|
47
|
+
|
|
45
48
|
Returns:
|
|
46
49
|
int: the corresponding size in bytes
|
|
47
50
|
"""
|
|
48
|
-
|
|
51
|
+
|
|
49
52
|
size = re.sub(r'\s+', '', size)
|
|
50
|
-
|
|
53
|
+
|
|
54
|
+
if not size: # Handle empty string case after stripping spaces
|
|
55
|
+
return 0
|
|
56
|
+
|
|
51
57
|
if (size[-1] == 'B'):
|
|
52
58
|
size = size[:-1]
|
|
53
|
-
|
|
59
|
+
|
|
60
|
+
if not size: # Handle case where size was just "B"
|
|
61
|
+
return 0
|
|
62
|
+
|
|
54
63
|
if (size.isdigit()):
|
|
55
|
-
|
|
64
|
+
bytes_val = int(size) # Renamed to avoid conflict with built-in 'bytes'
|
|
56
65
|
elif (is_float(size)):
|
|
57
|
-
|
|
66
|
+
bytes_val = float(size) # Renamed
|
|
58
67
|
else:
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
68
|
+
# Handle cases like "1KB" where size[:-1] might be "1K" before this block
|
|
69
|
+
# The original code would try to float("1K") which fails.
|
|
70
|
+
# Need to separate numeric part from unit more carefully.
|
|
71
|
+
numeric_part = ''
|
|
72
|
+
unit_part = ''
|
|
73
|
+
|
|
74
|
+
# Iterate from the end to find the unit (K, M, G, T)
|
|
75
|
+
# This handles cases like "10KB" or "2.5GB"
|
|
76
|
+
for i in range(len(size) -1, -1, -1):
|
|
77
|
+
if size[i].isalpha():
|
|
78
|
+
unit_part = size[i] + unit_part
|
|
79
|
+
else:
|
|
80
|
+
numeric_part = size[:i+1]
|
|
81
|
+
break
|
|
82
|
+
|
|
83
|
+
# If no unit found, or numeric part is empty after stripping unit
|
|
84
|
+
if not unit_part or not numeric_part:
|
|
85
|
+
return 0
|
|
86
|
+
|
|
87
|
+
try:
|
|
88
|
+
bytes_val = float(numeric_part)
|
|
89
|
+
unit = unit_part
|
|
63
90
|
if (unit == 'T'):
|
|
64
|
-
|
|
91
|
+
bytes_val *= 1024*1024*1024*1024
|
|
65
92
|
elif (unit == 'G'):
|
|
66
|
-
|
|
93
|
+
bytes_val *= 1024*1024*1024
|
|
67
94
|
elif (unit == 'M'):
|
|
68
|
-
|
|
95
|
+
bytes_val *= 1024*1024
|
|
69
96
|
elif (unit == 'K'):
|
|
70
|
-
|
|
97
|
+
bytes_val *= 1024
|
|
71
98
|
else:
|
|
72
|
-
|
|
99
|
+
# If it's a known unit (like 'B' already stripped) but not T/G/M/K,
|
|
100
|
+
# and it was floatable, it's just bytes. If it's an unknown unit, it's
|
|
101
|
+
# an error.
|
|
102
|
+
if unit not in ['B', '']: # 'B' was stripped, '' means just a number
|
|
103
|
+
bytes_val = 0
|
|
73
104
|
except ValueError:
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
return
|
|
105
|
+
bytes_val = 0
|
|
106
|
+
|
|
107
|
+
return bytes_val
|
|
77
108
|
|
|
78
109
|
|
|
79
110
|
def remove_ansi_codes(s):
|
|
80
111
|
"""
|
|
81
112
|
Removes ANSI escape codes from a string.
|
|
82
|
-
|
|
113
|
+
|
|
83
114
|
https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
|
|
84
|
-
|
|
115
|
+
|
|
85
116
|
Args:
|
|
86
117
|
s (str): the string to de-ANSI-i-fy
|
|
87
|
-
|
|
118
|
+
|
|
88
119
|
Returns:
|
|
89
120
|
str: A copy of [s] without ANSI codes
|
|
90
121
|
"""
|
|
122
|
+
|
|
91
123
|
ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
|
|
92
124
|
return ansi_escape.sub('', s)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
#%% Tests
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class TestStringUtils:
|
|
131
|
+
"""
|
|
132
|
+
Tests for string_utils.py
|
|
133
|
+
"""
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_is_float(self):
|
|
137
|
+
"""
|
|
138
|
+
Test the is_float function.
|
|
139
|
+
"""
|
|
140
|
+
|
|
141
|
+
assert is_float("1.23")
|
|
142
|
+
assert is_float("-0.5")
|
|
143
|
+
assert is_float("0")
|
|
144
|
+
assert is_float(1.23)
|
|
145
|
+
assert is_float(0)
|
|
146
|
+
assert not is_float("abc")
|
|
147
|
+
assert not is_float("1.2.3")
|
|
148
|
+
assert not is_float("")
|
|
149
|
+
assert not is_float(None)
|
|
150
|
+
assert not is_float("1,23")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def test_human_readable_to_bytes(self):
|
|
154
|
+
"""
|
|
155
|
+
Test the human_readable_to_bytes function.
|
|
156
|
+
"""
|
|
157
|
+
|
|
158
|
+
assert human_readable_to_bytes("10B") == 10
|
|
159
|
+
assert human_readable_to_bytes("10") == 10
|
|
160
|
+
assert human_readable_to_bytes("1K") == 1024
|
|
161
|
+
assert human_readable_to_bytes("1KB") == 1024
|
|
162
|
+
assert human_readable_to_bytes("1M") == 1024*1024
|
|
163
|
+
assert human_readable_to_bytes("1MB") == 1024*1024
|
|
164
|
+
assert human_readable_to_bytes("1G") == 1024*1024*1024
|
|
165
|
+
assert human_readable_to_bytes("1GB") == 1024*1024*1024
|
|
166
|
+
assert human_readable_to_bytes("1T") == 1024*1024*1024*1024
|
|
167
|
+
assert human_readable_to_bytes("1TB") == 1024*1024*1024*1024
|
|
168
|
+
|
|
169
|
+
assert human_readable_to_bytes("2.5K") == 2.5 * 1024
|
|
170
|
+
assert human_readable_to_bytes("0.5MB") == 0.5 * 1024 * 1024
|
|
171
|
+
|
|
172
|
+
# Test with spaces
|
|
173
|
+
assert human_readable_to_bytes(" 2 G ") == 2 * 1024*1024*1024
|
|
174
|
+
assert human_readable_to_bytes("500 KB") == 500 * 1024
|
|
175
|
+
|
|
176
|
+
# Invalid inputs
|
|
177
|
+
assert human_readable_to_bytes("abc") == 0
|
|
178
|
+
assert human_readable_to_bytes("1X") == 0
|
|
179
|
+
assert human_readable_to_bytes("1KBB") == 0
|
|
180
|
+
assert human_readable_to_bytes("K1") == 0
|
|
181
|
+
assert human_readable_to_bytes("") == 0
|
|
182
|
+
assert human_readable_to_bytes("1.2.3K") == 0
|
|
183
|
+
assert human_readable_to_bytes("B") == 0
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def test_remove_ansi_codes(self):
|
|
187
|
+
"""
|
|
188
|
+
Test the remove_ansi_codes function.
|
|
189
|
+
"""
|
|
190
|
+
|
|
191
|
+
assert remove_ansi_codes("text without codes") == "text without codes"
|
|
192
|
+
assert remove_ansi_codes("\x1b[31mRed text\x1b[0m") == "Red text"
|
|
193
|
+
assert remove_ansi_codes("\x1b[1m\x1b[4mBold and Underline\x1b[0m") == "Bold and Underline"
|
|
194
|
+
assert remove_ansi_codes("Mixed \x1b[32mgreen\x1b[0m and normal") == "Mixed green and normal"
|
|
195
|
+
assert remove_ansi_codes("") == ""
|
|
196
|
+
|
|
197
|
+
# More complex/varied ANSI codes
|
|
198
|
+
assert remove_ansi_codes("text\x1b[1Aup") == "textup"
|
|
199
|
+
assert remove_ansi_codes("\x1b[2Jclearscreen") == "clearscreen"
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def test_string_utils():
|
|
203
|
+
"""
|
|
204
|
+
Runs all tests in the TestStringUtils class.
|
|
205
|
+
"""
|
|
206
|
+
|
|
207
|
+
test_instance = TestStringUtils()
|
|
208
|
+
test_instance.test_is_float()
|
|
209
|
+
test_instance.test_human_readable_to_bytes()
|
|
210
|
+
test_instance.test_remove_ansi_codes()
|
|
211
|
+
|
|
212
|
+
# from IPython import embed; embed()
|
|
213
|
+
# test_string_utils()
|