megadetector 5.0.28__py3-none-any.whl → 5.0.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of megadetector might be problematic. Click here for more details.
- megadetector/api/batch_processing/api_core/batch_service/score.py +4 -5
- megadetector/api/batch_processing/api_core_support/aggregate_results_manually.py +1 -1
- megadetector/api/batch_processing/api_support/summarize_daily_activity.py +1 -1
- megadetector/api/batch_processing/integration/digiKam/xmp_integration.py +2 -2
- megadetector/api/batch_processing/integration/eMammal/test_scripts/push_annotations_to_emammal.py +1 -1
- megadetector/api/batch_processing/integration/eMammal/test_scripts/select_images_for_testing.py +1 -1
- megadetector/api/synchronous/api_core/tests/load_test.py +2 -3
- megadetector/classification/aggregate_classifier_probs.py +3 -3
- megadetector/classification/analyze_failed_images.py +5 -5
- megadetector/classification/cache_batchapi_outputs.py +5 -5
- megadetector/classification/create_classification_dataset.py +11 -12
- megadetector/classification/crop_detections.py +10 -10
- megadetector/classification/csv_to_json.py +8 -8
- megadetector/classification/detect_and_crop.py +13 -15
- megadetector/classification/evaluate_model.py +7 -7
- megadetector/classification/identify_mislabeled_candidates.py +6 -6
- megadetector/classification/json_to_azcopy_list.py +1 -1
- megadetector/classification/json_validator.py +29 -32
- megadetector/classification/map_classification_categories.py +9 -9
- megadetector/classification/merge_classification_detection_output.py +12 -9
- megadetector/classification/prepare_classification_script.py +19 -19
- megadetector/classification/prepare_classification_script_mc.py +23 -23
- megadetector/classification/run_classifier.py +4 -4
- megadetector/classification/save_mislabeled.py +6 -6
- megadetector/classification/train_classifier.py +1 -1
- megadetector/classification/train_classifier_tf.py +9 -9
- megadetector/classification/train_utils.py +10 -10
- megadetector/data_management/annotations/annotation_constants.py +1 -1
- megadetector/data_management/camtrap_dp_to_coco.py +45 -45
- megadetector/data_management/cct_json_utils.py +101 -101
- megadetector/data_management/cct_to_md.py +49 -49
- megadetector/data_management/cct_to_wi.py +33 -33
- megadetector/data_management/coco_to_labelme.py +75 -75
- megadetector/data_management/coco_to_yolo.py +189 -189
- megadetector/data_management/databases/add_width_and_height_to_db.py +3 -2
- megadetector/data_management/databases/combine_coco_camera_traps_files.py +38 -38
- megadetector/data_management/databases/integrity_check_json_db.py +202 -188
- megadetector/data_management/databases/subset_json_db.py +33 -33
- megadetector/data_management/generate_crops_from_cct.py +38 -38
- megadetector/data_management/get_image_sizes.py +54 -49
- megadetector/data_management/labelme_to_coco.py +130 -124
- megadetector/data_management/labelme_to_yolo.py +78 -72
- megadetector/data_management/lila/create_lila_blank_set.py +81 -83
- megadetector/data_management/lila/create_lila_test_set.py +32 -31
- megadetector/data_management/lila/create_links_to_md_results_files.py +18 -18
- megadetector/data_management/lila/download_lila_subset.py +21 -24
- megadetector/data_management/lila/generate_lila_per_image_labels.py +91 -91
- megadetector/data_management/lila/get_lila_annotation_counts.py +30 -30
- megadetector/data_management/lila/get_lila_image_counts.py +22 -22
- megadetector/data_management/lila/lila_common.py +70 -70
- megadetector/data_management/lila/test_lila_metadata_urls.py +13 -14
- megadetector/data_management/mewc_to_md.py +339 -340
- megadetector/data_management/ocr_tools.py +258 -252
- megadetector/data_management/read_exif.py +231 -224
- megadetector/data_management/remap_coco_categories.py +26 -26
- megadetector/data_management/remove_exif.py +31 -20
- megadetector/data_management/rename_images.py +187 -187
- megadetector/data_management/resize_coco_dataset.py +41 -41
- megadetector/data_management/speciesnet_to_md.py +41 -41
- megadetector/data_management/wi_download_csv_to_coco.py +55 -55
- megadetector/data_management/yolo_output_to_md_output.py +117 -120
- megadetector/data_management/yolo_to_coco.py +195 -188
- megadetector/detection/change_detection.py +831 -0
- megadetector/detection/process_video.py +340 -337
- megadetector/detection/pytorch_detector.py +304 -262
- megadetector/detection/run_detector.py +177 -164
- megadetector/detection/run_detector_batch.py +364 -363
- megadetector/detection/run_inference_with_yolov5_val.py +328 -325
- megadetector/detection/run_tiled_inference.py +256 -249
- megadetector/detection/tf_detector.py +24 -24
- megadetector/detection/video_utils.py +290 -282
- megadetector/postprocessing/add_max_conf.py +15 -11
- megadetector/postprocessing/categorize_detections_by_size.py +44 -44
- megadetector/postprocessing/classification_postprocessing.py +415 -415
- megadetector/postprocessing/combine_batch_outputs.py +20 -21
- megadetector/postprocessing/compare_batch_results.py +528 -517
- megadetector/postprocessing/convert_output_format.py +97 -97
- megadetector/postprocessing/create_crop_folder.py +219 -146
- megadetector/postprocessing/detector_calibration.py +173 -168
- megadetector/postprocessing/generate_csv_report.py +508 -499
- megadetector/postprocessing/load_api_results.py +23 -20
- megadetector/postprocessing/md_to_coco.py +129 -98
- megadetector/postprocessing/md_to_labelme.py +89 -83
- megadetector/postprocessing/md_to_wi.py +40 -40
- megadetector/postprocessing/merge_detections.py +87 -114
- megadetector/postprocessing/postprocess_batch_results.py +313 -298
- megadetector/postprocessing/remap_detection_categories.py +36 -36
- megadetector/postprocessing/render_detection_confusion_matrix.py +205 -199
- megadetector/postprocessing/repeat_detection_elimination/find_repeat_detections.py +57 -57
- megadetector/postprocessing/repeat_detection_elimination/remove_repeat_detections.py +27 -28
- megadetector/postprocessing/repeat_detection_elimination/repeat_detections_core.py +702 -677
- megadetector/postprocessing/separate_detections_into_folders.py +226 -211
- megadetector/postprocessing/subset_json_detector_output.py +265 -262
- megadetector/postprocessing/top_folders_to_bottom.py +45 -45
- megadetector/postprocessing/validate_batch_results.py +70 -70
- megadetector/taxonomy_mapping/map_lila_taxonomy_to_wi_taxonomy.py +52 -52
- megadetector/taxonomy_mapping/map_new_lila_datasets.py +15 -15
- megadetector/taxonomy_mapping/prepare_lila_taxonomy_release.py +14 -14
- megadetector/taxonomy_mapping/preview_lila_taxonomy.py +66 -66
- megadetector/taxonomy_mapping/retrieve_sample_image.py +16 -16
- megadetector/taxonomy_mapping/simple_image_download.py +8 -8
- megadetector/taxonomy_mapping/species_lookup.py +33 -33
- megadetector/taxonomy_mapping/taxonomy_csv_checker.py +14 -14
- megadetector/taxonomy_mapping/taxonomy_graph.py +10 -10
- megadetector/taxonomy_mapping/validate_lila_category_mappings.py +13 -13
- megadetector/utils/azure_utils.py +22 -22
- megadetector/utils/ct_utils.py +1018 -200
- megadetector/utils/directory_listing.py +21 -77
- megadetector/utils/gpu_test.py +22 -22
- megadetector/utils/md_tests.py +541 -518
- megadetector/utils/path_utils.py +1457 -398
- megadetector/utils/process_utils.py +41 -41
- megadetector/utils/sas_blob_utils.py +53 -49
- megadetector/utils/split_locations_into_train_val.py +61 -61
- megadetector/utils/string_utils.py +147 -26
- megadetector/utils/url_utils.py +463 -173
- megadetector/utils/wi_utils.py +2629 -2526
- megadetector/utils/write_html_image_list.py +137 -137
- megadetector/visualization/plot_utils.py +21 -21
- megadetector/visualization/render_images_with_thumbnails.py +37 -73
- megadetector/visualization/visualization_utils.py +401 -397
- megadetector/visualization/visualize_db.py +197 -190
- megadetector/visualization/visualize_detector_output.py +79 -73
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/METADATA +135 -132
- megadetector-5.0.29.dist-info/RECORD +163 -0
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/WHEEL +1 -1
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/licenses/LICENSE +0 -0
- {megadetector-5.0.28.dist-info → megadetector-5.0.29.dist-info}/top_level.txt +0 -0
- megadetector/data_management/importers/add_nacti_sizes.py +0 -52
- megadetector/data_management/importers/add_timestamps_to_icct.py +0 -79
- megadetector/data_management/importers/animl_results_to_md_results.py +0 -158
- megadetector/data_management/importers/auckland_doc_test_to_json.py +0 -373
- megadetector/data_management/importers/auckland_doc_to_json.py +0 -201
- megadetector/data_management/importers/awc_to_json.py +0 -191
- megadetector/data_management/importers/bellevue_to_json.py +0 -272
- megadetector/data_management/importers/cacophony-thermal-importer.py +0 -793
- megadetector/data_management/importers/carrizo_shrubfree_2018.py +0 -269
- megadetector/data_management/importers/carrizo_trail_cam_2017.py +0 -289
- megadetector/data_management/importers/cct_field_adjustments.py +0 -58
- megadetector/data_management/importers/channel_islands_to_cct.py +0 -913
- megadetector/data_management/importers/eMammal/copy_and_unzip_emammal.py +0 -180
- megadetector/data_management/importers/eMammal/eMammal_helpers.py +0 -249
- megadetector/data_management/importers/eMammal/make_eMammal_json.py +0 -223
- megadetector/data_management/importers/ena24_to_json.py +0 -276
- megadetector/data_management/importers/filenames_to_json.py +0 -386
- megadetector/data_management/importers/helena_to_cct.py +0 -283
- megadetector/data_management/importers/idaho-camera-traps.py +0 -1407
- megadetector/data_management/importers/idfg_iwildcam_lila_prep.py +0 -294
- megadetector/data_management/importers/import_desert_lion_conservation_camera_traps.py +0 -387
- megadetector/data_management/importers/jb_csv_to_json.py +0 -150
- megadetector/data_management/importers/mcgill_to_json.py +0 -250
- megadetector/data_management/importers/missouri_to_json.py +0 -490
- megadetector/data_management/importers/nacti_fieldname_adjustments.py +0 -79
- megadetector/data_management/importers/noaa_seals_2019.py +0 -181
- megadetector/data_management/importers/osu-small-animals-to-json.py +0 -364
- megadetector/data_management/importers/pc_to_json.py +0 -365
- megadetector/data_management/importers/plot_wni_giraffes.py +0 -123
- megadetector/data_management/importers/prepare_zsl_imerit.py +0 -131
- megadetector/data_management/importers/raic_csv_to_md_results.py +0 -416
- megadetector/data_management/importers/rspb_to_json.py +0 -356
- megadetector/data_management/importers/save_the_elephants_survey_A.py +0 -320
- megadetector/data_management/importers/save_the_elephants_survey_B.py +0 -329
- megadetector/data_management/importers/snapshot_safari_importer.py +0 -758
- megadetector/data_management/importers/snapshot_serengeti_lila.py +0 -1067
- megadetector/data_management/importers/snapshotserengeti/make_full_SS_json.py +0 -150
- megadetector/data_management/importers/snapshotserengeti/make_per_season_SS_json.py +0 -153
- megadetector/data_management/importers/sulross_get_exif.py +0 -65
- megadetector/data_management/importers/timelapse_csv_set_to_json.py +0 -490
- megadetector/data_management/importers/ubc_to_json.py +0 -399
- megadetector/data_management/importers/umn_to_json.py +0 -507
- megadetector/data_management/importers/wellington_to_json.py +0 -263
- megadetector/data_management/importers/wi_to_json.py +0 -442
- megadetector/data_management/importers/zamba_results_to_md_results.py +0 -180
- megadetector/data_management/lila/add_locations_to_island_camera_traps.py +0 -101
- megadetector/data_management/lila/add_locations_to_nacti.py +0 -151
- megadetector-5.0.28.dist-info/RECORD +0 -209
|
@@ -4,8 +4,8 @@ split_locations_into_train_val.py
|
|
|
4
4
|
|
|
5
5
|
Splits a list of location IDs into training and validation, targeting a specific
|
|
6
6
|
train/val split for each category, but allowing some categories to be tighter or looser
|
|
7
|
-
than others. Does nothing particularly clever, just randomly splits locations into
|
|
8
|
-
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
7
|
+
than others. Does nothing particularly clever, just randomly splits locations into
|
|
8
|
+
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
9
9
|
specified constraints and minimizes weighted error, where "error" is defined as the
|
|
10
10
|
sum of each class's absolute divergence from the target val fraction.
|
|
11
11
|
|
|
@@ -26,63 +26,63 @@ from tqdm import tqdm
|
|
|
26
26
|
def split_locations_into_train_val(location_to_category_counts,
|
|
27
27
|
n_random_seeds=10000,
|
|
28
28
|
target_val_fraction=0.15,
|
|
29
|
-
category_to_max_allowable_error=None,
|
|
29
|
+
category_to_max_allowable_error=None,
|
|
30
30
|
category_to_error_weight=None,
|
|
31
31
|
default_max_allowable_error=0.1,
|
|
32
32
|
require_complete_coverage=True):
|
|
33
33
|
"""
|
|
34
34
|
Splits a list of location IDs into training and validation, targeting a specific
|
|
35
35
|
train/val split for each category, but allowing some categories to be tighter or looser
|
|
36
|
-
than others. Does nothing particularly clever, just randomly splits locations into
|
|
37
|
-
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
36
|
+
than others. Does nothing particularly clever, just randomly splits locations into
|
|
37
|
+
train/val lots of times using the target val fraction, and picks the one that meets the
|
|
38
38
|
specified constraints and minimizes weighted error, where "error" is defined as the
|
|
39
|
-
sum of each class's absolute divergence from the target val fraction.
|
|
40
|
-
|
|
39
|
+
sum of each class's absolute divergence from the target val fraction.
|
|
40
|
+
|
|
41
41
|
Args:
|
|
42
42
|
location_to_category_counts (dict): a dict mapping location IDs to dicts,
|
|
43
|
-
with each dict mapping a category name to a count. Any categories not present
|
|
43
|
+
with each dict mapping a category name to a count. Any categories not present
|
|
44
44
|
in a particular dict are assumed to have a count of zero for that location.
|
|
45
|
-
|
|
45
|
+
|
|
46
46
|
For example:
|
|
47
|
-
|
|
47
|
+
|
|
48
48
|
.. code-block:: none
|
|
49
49
|
|
|
50
50
|
{'location-000': {'bear':4,'wolf':10},
|
|
51
51
|
'location-001': {'bear':12,'elk':20}}
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
n_random_seeds (int, optional): number of random seeds to try, always starting from zero
|
|
54
54
|
target_val_fraction (float, optional): fraction of images containing each species we'd
|
|
55
55
|
like to put in the val split
|
|
56
56
|
category_to_max_allowable_error (dict, optional): a dict mapping category names
|
|
57
57
|
to maximum allowable errors. These are hard constraints (i.e., we will error
|
|
58
|
-
if we can't meet them). Does not need to include all categories; categories not
|
|
58
|
+
if we can't meet them). Does not need to include all categories; categories not
|
|
59
59
|
included will be assigned a maximum error according to [default_max_allowable_error].
|
|
60
60
|
If this is None, no hard constraints are applied.
|
|
61
61
|
category_to_error_weight (dict, optional): a dict mapping category names to
|
|
62
62
|
error weights. You can specify a subset of categories; categories not included here
|
|
63
63
|
have a weight of 1.0. If None, all categories have the same weight.
|
|
64
|
-
default_max_allowable_error (float, optional): the maximum allowable error for categories not
|
|
65
|
-
present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
|
|
64
|
+
default_max_allowable_error (float, optional): the maximum allowable error for categories not
|
|
65
|
+
present in [category_to_max_allowable_error]. Set to None (or >= 1.0) to disable hard
|
|
66
66
|
constraints for categories not present in [category_to_max_allowable_error]
|
|
67
|
-
require_complete_coverage (bool, optional): require that every category appear in both train and
|
|
67
|
+
require_complete_coverage (bool, optional): require that every category appear in both train and
|
|
68
68
|
val
|
|
69
|
-
|
|
69
|
+
|
|
70
70
|
Returns:
|
|
71
71
|
tuple: A two-element tuple:
|
|
72
72
|
- list of location IDs in the val split
|
|
73
|
-
- a dict mapping category names to the fraction of images in the val split
|
|
73
|
+
- a dict mapping category names to the fraction of images in the val split
|
|
74
74
|
"""
|
|
75
|
-
|
|
75
|
+
|
|
76
76
|
location_ids = list(location_to_category_counts.keys())
|
|
77
|
-
|
|
77
|
+
|
|
78
78
|
n_val_locations = int(target_val_fraction*len(location_ids))
|
|
79
|
-
|
|
79
|
+
|
|
80
80
|
if category_to_max_allowable_error is None:
|
|
81
81
|
category_to_max_allowable_error = {}
|
|
82
|
-
|
|
82
|
+
|
|
83
83
|
if category_to_error_weight is None:
|
|
84
84
|
category_to_error_weight = {}
|
|
85
|
-
|
|
85
|
+
|
|
86
86
|
# category ID to total count; the total count is used only for printouts
|
|
87
87
|
category_id_to_count = {}
|
|
88
88
|
for location_id in location_to_category_counts:
|
|
@@ -91,28 +91,28 @@ def split_locations_into_train_val(location_to_category_counts,
|
|
|
91
91
|
category_id_to_count[category_id] = 0
|
|
92
92
|
category_id_to_count[category_id] += \
|
|
93
93
|
location_to_category_counts[location_id][category_id]
|
|
94
|
-
|
|
94
|
+
|
|
95
95
|
category_ids = set(category_id_to_count.keys())
|
|
96
|
-
|
|
96
|
+
|
|
97
97
|
print('Splitting {} categories over {} locations'.format(
|
|
98
98
|
len(category_ids),len(location_ids)))
|
|
99
|
-
|
|
99
|
+
|
|
100
100
|
# random_seed = 0
|
|
101
101
|
def compute_seed_errors(random_seed):
|
|
102
102
|
"""
|
|
103
103
|
Computes the per-category error for a specific random seed.
|
|
104
|
-
|
|
104
|
+
|
|
105
105
|
returns weighted_average_error,category_to_val_fraction
|
|
106
106
|
"""
|
|
107
|
-
|
|
107
|
+
|
|
108
108
|
# Randomly split into train/val
|
|
109
109
|
random.seed(random_seed)
|
|
110
110
|
val_locations = random.sample(location_ids,k=n_val_locations)
|
|
111
111
|
val_locations_set = set(val_locations)
|
|
112
|
-
|
|
112
|
+
|
|
113
113
|
# For each category, measure the % of images that went into the val set
|
|
114
114
|
category_to_val_fraction = defaultdict(float)
|
|
115
|
-
|
|
115
|
+
|
|
116
116
|
for category_id in category_ids:
|
|
117
117
|
category_val_count = 0
|
|
118
118
|
category_train_count = 0
|
|
@@ -127,44 +127,44 @@ def split_locations_into_train_val(location_to_category_counts,
|
|
|
127
127
|
category_train_count += location_category_count
|
|
128
128
|
category_val_fraction = category_val_count / (category_val_count + category_train_count)
|
|
129
129
|
category_to_val_fraction[category_id] = category_val_fraction
|
|
130
|
-
|
|
130
|
+
|
|
131
131
|
# Absolute deviation from the target val fraction for each category
|
|
132
132
|
category_errors = {}
|
|
133
133
|
weighted_category_errors = {}
|
|
134
|
-
|
|
134
|
+
|
|
135
135
|
# category = next(iter(category_to_val_fraction))
|
|
136
136
|
for category in category_to_val_fraction:
|
|
137
|
-
|
|
137
|
+
|
|
138
138
|
category_val_fraction = category_to_val_fraction[category]
|
|
139
|
-
|
|
139
|
+
|
|
140
140
|
category_error = abs(category_val_fraction-target_val_fraction)
|
|
141
141
|
category_errors[category] = category_error
|
|
142
|
-
|
|
142
|
+
|
|
143
143
|
category_weight = 1.0
|
|
144
144
|
if category in category_to_error_weight:
|
|
145
145
|
category_weight = category_to_error_weight[category]
|
|
146
146
|
weighted_category_error = category_error * category_weight
|
|
147
147
|
weighted_category_errors[category] = weighted_category_error
|
|
148
|
-
|
|
148
|
+
|
|
149
149
|
weighted_average_error = np.mean(list(weighted_category_errors.values()))
|
|
150
|
-
|
|
150
|
+
|
|
151
151
|
return weighted_average_error,weighted_category_errors,category_to_val_fraction
|
|
152
|
-
|
|
152
|
+
|
|
153
153
|
# ... def compute_seed_errors(...)
|
|
154
|
-
|
|
154
|
+
|
|
155
155
|
# This will only include random seeds that satisfy the hard constraints
|
|
156
156
|
random_seed_to_weighted_average_error = {}
|
|
157
|
-
|
|
157
|
+
|
|
158
158
|
# random_seed = 0
|
|
159
159
|
for random_seed in tqdm(range(0,n_random_seeds)):
|
|
160
|
-
|
|
160
|
+
|
|
161
161
|
weighted_average_error,weighted_category_errors,category_to_val_fraction = \
|
|
162
162
|
compute_seed_errors(random_seed)
|
|
163
|
-
|
|
163
|
+
|
|
164
164
|
seed_satisfies_hard_constraints = True
|
|
165
|
-
|
|
165
|
+
|
|
166
166
|
for category in category_to_val_fraction:
|
|
167
|
-
if category in category_to_max_allowable_error:
|
|
167
|
+
if category in category_to_max_allowable_error:
|
|
168
168
|
max_allowable_error = category_to_max_allowable_error[category]
|
|
169
169
|
else:
|
|
170
170
|
if default_max_allowable_error is None:
|
|
@@ -183,59 +183,59 @@ def split_locations_into_train_val(location_to_category_counts,
|
|
|
183
183
|
if category_error > max_allowable_error:
|
|
184
184
|
seed_satisfies_hard_constraints = False
|
|
185
185
|
break
|
|
186
|
-
|
|
186
|
+
|
|
187
187
|
# ...for each category
|
|
188
|
-
|
|
189
|
-
if seed_satisfies_hard_constraints:
|
|
188
|
+
|
|
189
|
+
if seed_satisfies_hard_constraints:
|
|
190
190
|
random_seed_to_weighted_average_error[random_seed] = weighted_average_error
|
|
191
|
-
|
|
191
|
+
|
|
192
192
|
# ...for each random seed
|
|
193
|
-
|
|
193
|
+
|
|
194
194
|
assert len(random_seed_to_weighted_average_error) > 0, \
|
|
195
195
|
'No random seed met all the hard constraints'
|
|
196
|
-
|
|
196
|
+
|
|
197
197
|
print('\n{} of {} random seeds satisfied hard constraints'.format(
|
|
198
198
|
len(random_seed_to_weighted_average_error),n_random_seeds))
|
|
199
|
-
|
|
199
|
+
|
|
200
200
|
min_error = None
|
|
201
201
|
min_error_seed = None
|
|
202
|
-
|
|
202
|
+
|
|
203
203
|
for random_seed in random_seed_to_weighted_average_error.keys():
|
|
204
204
|
error_metric = random_seed_to_weighted_average_error[random_seed]
|
|
205
205
|
if min_error is None or error_metric < min_error:
|
|
206
206
|
min_error = error_metric
|
|
207
207
|
min_error_seed = random_seed
|
|
208
|
-
|
|
208
|
+
|
|
209
209
|
random.seed(min_error_seed)
|
|
210
210
|
val_locations = random.sample(location_ids,k=n_val_locations)
|
|
211
211
|
train_locations = []
|
|
212
212
|
for location_id in location_ids:
|
|
213
213
|
if location_id not in val_locations:
|
|
214
214
|
train_locations.append(location_id)
|
|
215
|
-
|
|
216
|
-
print('\nVal locations:\n')
|
|
215
|
+
|
|
216
|
+
print('\nVal locations:\n')
|
|
217
217
|
for loc in val_locations:
|
|
218
218
|
print('{}'.format(loc))
|
|
219
219
|
print('')
|
|
220
|
-
|
|
220
|
+
|
|
221
221
|
weighted_average_error,weighted_category_errors,category_to_val_fraction = \
|
|
222
222
|
compute_seed_errors(min_error_seed)
|
|
223
|
-
|
|
223
|
+
|
|
224
224
|
random_seed = min_error_seed
|
|
225
|
-
|
|
225
|
+
|
|
226
226
|
category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,reverse=True)
|
|
227
227
|
category_to_val_fraction = sort_dictionary_by_value(category_to_val_fraction,
|
|
228
228
|
sort_values=category_id_to_count,
|
|
229
229
|
reverse=True)
|
|
230
|
-
|
|
231
|
-
|
|
230
|
+
|
|
231
|
+
|
|
232
232
|
print('Val fractions by category:\n')
|
|
233
|
-
|
|
233
|
+
|
|
234
234
|
for category in category_to_val_fraction:
|
|
235
235
|
print('{} ({}) {:.2f}'.format(
|
|
236
236
|
category,category_id_to_count[category],
|
|
237
237
|
category_to_val_fraction[category]))
|
|
238
|
-
|
|
238
|
+
|
|
239
239
|
return val_locations,category_to_val_fraction
|
|
240
240
|
|
|
241
241
|
# ...def split_locations_into_train_val(...)
|
|
@@ -14,15 +14,18 @@ import re
|
|
|
14
14
|
#%% Functions
|
|
15
15
|
|
|
16
16
|
def is_float(s):
|
|
17
|
-
"""
|
|
17
|
+
"""
|
|
18
18
|
Checks whether [s] is an object (typically a string) that can be cast to a float
|
|
19
|
-
|
|
19
|
+
|
|
20
20
|
Args:
|
|
21
21
|
s (object): object to evaluate
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
Returns:
|
|
24
24
|
bool: True if s successfully casts to a float, otherwise False
|
|
25
25
|
"""
|
|
26
|
+
|
|
27
|
+
if s is None:
|
|
28
|
+
return False
|
|
26
29
|
|
|
27
30
|
try:
|
|
28
31
|
_ = float(s)
|
|
@@ -36,57 +39,175 @@ def human_readable_to_bytes(size):
|
|
|
36
39
|
Given a human-readable byte string (e.g. 2G, 10GB, 30MB, 20KB),
|
|
37
40
|
returns the number of bytes. Will return 0 if the argument has
|
|
38
41
|
unexpected form.
|
|
39
|
-
|
|
42
|
+
|
|
40
43
|
https://gist.github.com/beugley/ccd69945346759eb6142272a6d69b4e0
|
|
41
|
-
|
|
44
|
+
|
|
42
45
|
Args:
|
|
43
46
|
size (str): string representing a size
|
|
44
|
-
|
|
47
|
+
|
|
45
48
|
Returns:
|
|
46
49
|
int: the corresponding size in bytes
|
|
47
50
|
"""
|
|
48
|
-
|
|
51
|
+
|
|
49
52
|
size = re.sub(r'\s+', '', size)
|
|
50
|
-
|
|
53
|
+
|
|
54
|
+
if not size: # Handle empty string case after stripping spaces
|
|
55
|
+
return 0
|
|
56
|
+
|
|
51
57
|
if (size[-1] == 'B'):
|
|
52
58
|
size = size[:-1]
|
|
53
|
-
|
|
59
|
+
|
|
60
|
+
if not size: # Handle case where size was just "B"
|
|
61
|
+
return 0
|
|
62
|
+
|
|
54
63
|
if (size.isdigit()):
|
|
55
|
-
|
|
64
|
+
bytes_val = int(size) # Renamed to avoid conflict with built-in 'bytes'
|
|
56
65
|
elif (is_float(size)):
|
|
57
|
-
|
|
66
|
+
bytes_val = float(size) # Renamed
|
|
58
67
|
else:
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
68
|
+
# Handle cases like "1KB" where size[:-1] might be "1K" before this block
|
|
69
|
+
# The original code would try to float("1K") which fails.
|
|
70
|
+
# Need to separate numeric part from unit more carefully.
|
|
71
|
+
numeric_part = ''
|
|
72
|
+
unit_part = ''
|
|
73
|
+
|
|
74
|
+
# Iterate from the end to find the unit (K, M, G, T)
|
|
75
|
+
# This handles cases like "10KB" or "2.5GB"
|
|
76
|
+
for i in range(len(size) -1, -1, -1):
|
|
77
|
+
if size[i].isalpha():
|
|
78
|
+
unit_part = size[i] + unit_part
|
|
79
|
+
else:
|
|
80
|
+
numeric_part = size[:i+1]
|
|
81
|
+
break
|
|
82
|
+
|
|
83
|
+
# If no unit found, or numeric part is empty after stripping unit
|
|
84
|
+
if not unit_part or not numeric_part:
|
|
85
|
+
return 0
|
|
86
|
+
|
|
87
|
+
try:
|
|
88
|
+
bytes_val = float(numeric_part)
|
|
89
|
+
unit = unit_part
|
|
63
90
|
if (unit == 'T'):
|
|
64
|
-
|
|
91
|
+
bytes_val *= 1024*1024*1024*1024
|
|
65
92
|
elif (unit == 'G'):
|
|
66
|
-
|
|
93
|
+
bytes_val *= 1024*1024*1024
|
|
67
94
|
elif (unit == 'M'):
|
|
68
|
-
|
|
95
|
+
bytes_val *= 1024*1024
|
|
69
96
|
elif (unit == 'K'):
|
|
70
|
-
|
|
97
|
+
bytes_val *= 1024
|
|
71
98
|
else:
|
|
72
|
-
|
|
99
|
+
# If it's a known unit (like 'B' already stripped) but not T/G/M/K,
|
|
100
|
+
# and it was floatable, it's just bytes. If it's an unknown unit, it's
|
|
101
|
+
# an error.
|
|
102
|
+
if unit not in ['B', '']: # 'B' was stripped, '' means just a number
|
|
103
|
+
bytes_val = 0
|
|
73
104
|
except ValueError:
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
return
|
|
105
|
+
bytes_val = 0
|
|
106
|
+
|
|
107
|
+
return bytes_val
|
|
77
108
|
|
|
78
109
|
|
|
79
110
|
def remove_ansi_codes(s):
|
|
80
111
|
"""
|
|
81
112
|
Removes ANSI escape codes from a string.
|
|
82
|
-
|
|
113
|
+
|
|
83
114
|
https://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python#14693789
|
|
84
|
-
|
|
115
|
+
|
|
85
116
|
Args:
|
|
86
117
|
s (str): the string to de-ANSI-i-fy
|
|
87
|
-
|
|
118
|
+
|
|
88
119
|
Returns:
|
|
89
120
|
str: A copy of [s] without ANSI codes
|
|
90
121
|
"""
|
|
122
|
+
|
|
91
123
|
ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
|
|
92
124
|
return ansi_escape.sub('', s)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
#%% Tests
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class TestStringUtils:
|
|
131
|
+
"""
|
|
132
|
+
Tests for string_utils.py
|
|
133
|
+
"""
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test_is_float(self):
|
|
137
|
+
"""
|
|
138
|
+
Test the is_float function.
|
|
139
|
+
"""
|
|
140
|
+
|
|
141
|
+
assert is_float("1.23")
|
|
142
|
+
assert is_float("-0.5")
|
|
143
|
+
assert is_float("0")
|
|
144
|
+
assert is_float(1.23)
|
|
145
|
+
assert is_float(0)
|
|
146
|
+
assert not is_float("abc")
|
|
147
|
+
assert not is_float("1.2.3")
|
|
148
|
+
assert not is_float("")
|
|
149
|
+
assert not is_float(None)
|
|
150
|
+
assert not is_float("1,23")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def test_human_readable_to_bytes(self):
|
|
154
|
+
"""
|
|
155
|
+
Test the human_readable_to_bytes function.
|
|
156
|
+
"""
|
|
157
|
+
|
|
158
|
+
assert human_readable_to_bytes("10B") == 10
|
|
159
|
+
assert human_readable_to_bytes("10") == 10
|
|
160
|
+
assert human_readable_to_bytes("1K") == 1024
|
|
161
|
+
assert human_readable_to_bytes("1KB") == 1024
|
|
162
|
+
assert human_readable_to_bytes("1M") == 1024*1024
|
|
163
|
+
assert human_readable_to_bytes("1MB") == 1024*1024
|
|
164
|
+
assert human_readable_to_bytes("1G") == 1024*1024*1024
|
|
165
|
+
assert human_readable_to_bytes("1GB") == 1024*1024*1024
|
|
166
|
+
assert human_readable_to_bytes("1T") == 1024*1024*1024*1024
|
|
167
|
+
assert human_readable_to_bytes("1TB") == 1024*1024*1024*1024
|
|
168
|
+
|
|
169
|
+
assert human_readable_to_bytes("2.5K") == 2.5 * 1024
|
|
170
|
+
assert human_readable_to_bytes("0.5MB") == 0.5 * 1024 * 1024
|
|
171
|
+
|
|
172
|
+
# Test with spaces
|
|
173
|
+
assert human_readable_to_bytes(" 2 G ") == 2 * 1024*1024*1024
|
|
174
|
+
assert human_readable_to_bytes("500 KB") == 500 * 1024
|
|
175
|
+
|
|
176
|
+
# Invalid inputs
|
|
177
|
+
assert human_readable_to_bytes("abc") == 0
|
|
178
|
+
assert human_readable_to_bytes("1X") == 0
|
|
179
|
+
assert human_readable_to_bytes("1KBB") == 0
|
|
180
|
+
assert human_readable_to_bytes("K1") == 0
|
|
181
|
+
assert human_readable_to_bytes("") == 0
|
|
182
|
+
assert human_readable_to_bytes("1.2.3K") == 0
|
|
183
|
+
assert human_readable_to_bytes("B") == 0
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def test_remove_ansi_codes(self):
|
|
187
|
+
"""
|
|
188
|
+
Test the remove_ansi_codes function.
|
|
189
|
+
"""
|
|
190
|
+
|
|
191
|
+
assert remove_ansi_codes("text without codes") == "text without codes"
|
|
192
|
+
assert remove_ansi_codes("\x1b[31mRed text\x1b[0m") == "Red text"
|
|
193
|
+
assert remove_ansi_codes("\x1b[1m\x1b[4mBold and Underline\x1b[0m") == "Bold and Underline"
|
|
194
|
+
assert remove_ansi_codes("Mixed \x1b[32mgreen\x1b[0m and normal") == "Mixed green and normal"
|
|
195
|
+
assert remove_ansi_codes("") == ""
|
|
196
|
+
|
|
197
|
+
# More complex/varied ANSI codes
|
|
198
|
+
assert remove_ansi_codes("text\x1b[1Aup") == "textup"
|
|
199
|
+
assert remove_ansi_codes("\x1b[2Jclearscreen") == "clearscreen"
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def test_string_utils():
|
|
203
|
+
"""
|
|
204
|
+
Runs all tests in the TestStringUtils class.
|
|
205
|
+
"""
|
|
206
|
+
|
|
207
|
+
test_instance = TestStringUtils()
|
|
208
|
+
test_instance.test_is_float()
|
|
209
|
+
test_instance.test_human_readable_to_bytes()
|
|
210
|
+
test_instance.test_remove_ansi_codes()
|
|
211
|
+
|
|
212
|
+
# from IPython import embed; embed()
|
|
213
|
+
# test_string_utils()
|