megadetector 10.0.6__py3-none-any.whl → 10.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megadetector/data_management/cct_json_utils.py +16 -6
- megadetector/data_management/databases/subset_json_db.py +57 -2
- megadetector/detection/pytorch_detector.py +29 -15
- megadetector/detection/run_inference_with_yolov5_val.py +3 -1
- megadetector/detection/run_tiled_inference.py +5 -2
- megadetector/detection/video_utils.py +23 -7
- megadetector/postprocessing/classification_postprocessing.py +218 -69
- megadetector/postprocessing/convert_output_format.py +81 -87
- megadetector/postprocessing/subset_json_detector_output.py +3 -0
- megadetector/utils/directory_listing.py +19 -13
- megadetector/utils/path_utils.py +58 -8
- megadetector/utils/url_utils.py +91 -1
- megadetector/utils/wi_taxonomy_utils.py +44 -26
- megadetector/visualization/visualize_video_output.py +16 -6
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/METADATA +134 -134
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/RECORD +19 -19
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/licenses/LICENSE +0 -0
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/top_level.txt +0 -0
- {megadetector-10.0.6.dist-info → megadetector-10.0.8.dist-info}/WHEEL +0 -0
megadetector/postprocessing/convert_output_format.py
CHANGED

@@ -2,12 +2,8 @@
 
 convert_output_format.py
 
-Converts between file [...]
-[...]
-conversion - including between hypothetical alternative .json versions - that we support
-in the future.
-
-The .csv format is largely obsolete, don't use it unless you're super-duper sure you need it.
+Converts between file .json and .csv representations of MD output. The .csv format is
+largely obsolete, don't use it unless you're super-duper sure you need it.
 
 """
 
@@ -15,13 +11,16 @@ The .csv format is largely obsolete, don't use it unless you're super-duper sure
 
 import argparse
 import json
-import csv
 import sys
 import os
 
 from tqdm import tqdm
+from collections import defaultdict
+
+import pandas as pd
 
 from megadetector.postprocessing.load_api_results import load_api_results_csv
+from megadetector.utils.wi_taxonomy_utils import load_md_or_speciesnet_file
 from megadetector.data_management.annotations import annotation_constants
 from megadetector.utils import ct_utils
 
@@ -35,16 +34,13 @@ def convert_json_to_csv(input_path,
                         min_confidence=None,
                         omit_bounding_boxes=False,
                         output_encoding=None,
-                        overwrite=True):
+                        overwrite=True,
+                        verbose=False):
     """
     Converts a MD results .json file to a totally non-standard .csv format.
 
     If [output_path] is None, will convert x.json to x.csv.
 
-    TODO: this function should obviously be using Pandas or some other sensible structured
-    representation of tabular data. Even a list of dicts. This implementation is quite
-    brittle and depends on adding fields to every row in exactly the right order.
-
     Args:
         input_path (str): the input .json file to convert
         output_path (str, optional): the output .csv file to generate; if this is None, uses
@@ -57,7 +53,7 @@ def convert_json_to_csv(input_path,
         output_encoding (str, optional): encoding to use for the .csv file
         overwrite (bool, optional): whether to overwrite an existing .csv file; if this is False and
            the output file exists, no-ops and returns
-
+        verbose (bool, optional): enable additional debug output
     """
 
     if output_path is None:
@@ -68,36 +64,28 @@ def convert_json_to_csv(input_path,
         return
 
     print('Loading json results from {}...'.format(input_path))
-    json_output = [...]
-
-    rows = []
+    json_output = load_md_or_speciesnet_file(input_path,
+                                             verbose=verbose)
 
-
+    def clean_category_name(s):
+        return s.replace(',','_').replace(' ','_').lower()
 
-    # [...]
-[...]
-[...]
-[...]
-[...]
-[...]
-    for cat_id in range(1,n_non_empty_detection_categories+1):
-        cat_name = annotation_constants.detector_bbox_category_id_to_name[cat_id]
-        detection_category_column_names.append('max_conf_' + cat_name)
+    # Create column names for max detection confidences
+    detection_category_id_to_max_conf_column_name = {}
+    for category_id in json_output['detection_categories'].keys():
+        category_name = clean_category_name(json_output['detection_categories'][category_id])
+        detection_category_id_to_max_conf_column_name[category_id] = \
+            'max_conf_' + category_name
 
-
+    classification_category_id_to_max_conf_column_name = {}
 
+    # Create column names for max classification confidences (if necessary)
     if 'classification_categories' in json_output.keys():
-[...]
-[...]
-[...]
-[...]
-[...]
-        category_name = classification_category_id_to_name[category_id].\
-            replace(' ','_').replace(',','')
-        classification_category_column_names.append('max_classification_conf_' + category_name)
-        classification_category_id_to_column_number[category_id] = i_category
-
-    n_classification_categories = len(classification_category_ids)
+
+        for category_id in json_output['classification_categories'].keys():
+            category_name = clean_category_name(json_output['classification_categories'][category_id])
+            classification_category_id_to_max_conf_column_name[category_id] = \
+                'max_classification_conf_' + category_name
 
     # There are several .json fields for which we add .csv columns; other random bespoke fields
     # will be ignored.
@@ -117,26 +105,43 @@ def convert_json_to_csv(input_path,
     if len(optional_fields_present) > 0:
         print('Found {} optional fields'.format(len(optional_fields_present)))
 
-    expected_row_length = len(fixed_columns) + len(detection_category_column_names) + \
-        n_classification_categories + len(optional_fields_present)
-
     print('Formatting results...')
 
+    output_records = []
+
     # i_image = 0; im = json_output['images'][i_image]
     for im in tqdm(json_output['images']):
 
-[...]
+        output_record = {}
+        output_records.append(output_record)
+
+        output_record['image_path'] = im['file']
+        output_record['max_confidence'] = ''
+        output_record['detections'] = ''
+
+        for field_name in optional_fields_present:
+            output_record[field_name] = ''
+            if field_name in im:
+                output_record[field_name] = im[field_name]
+
+        for detection_category_id in detection_category_id_to_max_conf_column_name:
+            column_name = detection_category_id_to_max_conf_column_name[detection_category_id]
+            output_record[column_name] = 0
+
+        for classification_category_id in classification_category_id_to_max_conf_column_name:
+            column_name = classification_category_id_to_max_conf_column_name[classification_category_id]
+            output_record[column_name] = 0
 
         if 'failure' in im and im['failure'] is not None:
-[...]
-[...]
+            output_record['max_confidence'] = 'failure'
+            output_record['detections'] = im['failure']
            # print('Skipping failed image {} ({})'.format(im['file'],im['failure']))
            continue
 
         max_conf = ct_utils.get_max_conf(im)
+        detection_category_id_to_max_conf = defaultdict(float)
+        classification_category_id_to_max_conf = defaultdict(float)
         detections = []
-        max_detection_category_probabilities = [None] * n_non_empty_detection_categories
-        max_classification_category_probabilities = [0] * n_classification_categories
 
         # d = im['detections'][0]
         for d in im['detections']:
@@ -155,31 +160,24 @@ def convert_json_to_csv(input_path,
             xmax = input_bbox[0] + input_bbox[2]
             ymax = input_bbox[1] + input_bbox[3]
             output_detection = [ymin, xmin, ymax, xmax]
-
             output_detection.append(d['conf'])
-
-            # Category 0 is empty, for which we don't have a column, so the max
-            # confidence for category N goes in column N-1
-            detection_category_id = int(d['category'])
-            assert detection_category_id > 0 and detection_category_id <= \
-                n_non_empty_detection_categories
-            detection_category_column = detection_category_id - 1
-            detection_category_max = max_detection_category_probabilities[detection_category_column]
-            if detection_category_max is None or d['conf'] > detection_category_max:
-                max_detection_category_probabilities[detection_category_column] = d['conf']
-
-            output_detection.append(detection_category_id)
+            output_detection.append(int(d['category']))
             detections.append(output_detection)
 
+            detection_category_id = d['category']
+            detection_category_max = detection_category_id_to_max_conf[detection_category_id]
+            if d['conf'] > detection_category_max:
+                detection_category_id_to_max_conf[detection_category_id] = d['conf']
+
             if 'classifications' in d:
-[...]
-                    'Oops, I have classification results, but no classification metadata'
+
                 for c in d['classifications']:
-[...]
-[...]
-[...]
-[...]
-[...]
+                    classification_category_id = c[0]
+                    classification_conf = c[1]
+                    classification_category_max = \
+                        classification_category_id_to_max_conf[classification_category_id]
+                    if classification_conf > classification_category_max:
+                        classification_category_id_to_max_conf[classification_category_id] = d['conf']
 
                # ...for each classification
 
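Two things are worth noting in the hunk above. First, the rewrite replaces index-based lists (which assumed contiguous integer category IDs starting at 1, hence the old assert) with running maxima kept in defaultdict(float) containers keyed by category ID. A minimal sketch of that pattern, with made-up category IDs and confidences:

from collections import defaultdict

# Unseen category IDs implicitly start at 0.0, so the running max
# needs no first-iteration special case.
max_conf_by_category = defaultdict(float)

for category_id, conf in [('1', 0.85), ('2', 0.40), ('1', 0.97)]:
    if conf > max_conf_by_category[category_id]:
        max_conf_by_category[category_id] = conf

assert max_conf_by_category['1'] == 0.97

Second, the new classification loop updates the per-category maximum with d['conf'] (the detection confidence) rather than classification_conf, which looks like an oversight that shipped in this release.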
@@ -191,40 +189,36 @@ def convert_json_to_csv(input_path,
         if not omit_bounding_boxes:
             detection_string = json.dumps(detections)
 
-[...]
-[...]
-        row.extend(max_classification_category_probabilities)
+        output_record['detections'] = detection_string
+        output_record['max_confidence'] = max_conf
 
-        for [...]
-[...]
-[...]
-[...]
-            row.append(str(im[field_name]))
+        for detection_category_id in detection_category_id_to_max_conf_column_name:
+            column_name = detection_category_id_to_max_conf_column_name[detection_category_id]
+            output_record[column_name] = \
+                detection_category_id_to_max_conf[detection_category_id]
 
-[...]
-[...]
+        for classification_category_id in classification_category_id_to_max_conf_column_name:
+            column_name = classification_category_id_to_max_conf_column_name[classification_category_id]
+            output_record[column_name] = \
+                classification_category_id_to_max_conf[classification_category_id]
 
     # ...for each image
 
     print('Writing to csv...')
 
-[...]
-[...]
-[...]
-[...]
-[...]
-    header.extend(classification_category_column_names)
-    for field_name in optional_fields_present:
-        header.append(field_name)
-    writer.writerow(header)
-    writer.writerows(rows)
+    df = pd.DataFrame(output_records)
+
+    if omit_bounding_boxes:
+        df = df.drop('detections',axis=1)
+    df.to_csv(output_path,index=False,header=True)
 
 # ...def convert_json_to_csv(...)
 
 
 def convert_csv_to_json(input_path,output_path=None,overwrite=True):
     """
-    Convert .csv to .json. If output_path is None, will convert x.csv to x.json.
+    Convert .csv to .json. If output_path is None, will convert x.csv to x.json. This
+    supports a largely obsolete .csv format, there's almost no reason you want to do this.
 
     Args:
         input_path (str): .csv filename to convert to .json
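Net effect of the changes to convert_json_to_csv: results are loaded via load_md_or_speciesnet_file, accumulated as one dict per image, and serialized by pandas, replacing the old order-sensitive csv.writer rows. A minimal usage sketch ('md_results.json' is a hypothetical input path; the parameters are those in the new signature):

from megadetector.postprocessing.convert_output_format import convert_json_to_csv

# With output_path=None, this writes md_results.csv next to the input;
# verbose is the parameter added in this release.
convert_json_to_csv('md_results.json',
                    output_path=None,
                    omit_bounding_boxes=True,
                    verbose=True)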
megadetector/postprocessing/subset_json_detector_output.py
CHANGED

@@ -83,6 +83,9 @@ class SubsetJsonDetectorOutputOptions:
     def __init__(self):
 
         #: Only process files containing the token 'query'
+        #:
+        #: Does not support general regexes, but supports ^ as a special case
+        #: regex-like notation for "starts with"
         self.query = None
 
         #: Replace 'query' with 'replacement' if 'replacement' is not None. If 'query' is None,
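The new comment pins down the query semantics: plain substring matching, with ^ as the only regex-like token. Illustratively, the documented rule amounts to the following (a hypothetical helper, not code from the module):

def matches_query(filename, query):
    # '^' anchors the query to the start of the filename; any other
    # query is a plain substring test.
    if query.startswith('^'):
        return filename.startswith(query[1:])
    return query in filename

assert matches_query('camera1/img0001.jpg', '^camera1/')
assert not matches_query('backup/camera1/img0001.jpg', '^camera1/')
assert matches_query('backup/camera1/img0001.jpg', 'camera1/')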
megadetector/utils/directory_listing.py
CHANGED

@@ -21,7 +21,7 @@ from megadetector.utils.path_utils import is_image_file
 
 #%% Directory enumeration functions
 
-def create_plain_index(root, dirs, files, dirname=None):
+def _create_plain_index(root, dirs, files, dirname=None):
     """
     Creates the fairly plain HTML folder index including a preview of a single image file,
     if any is present.
@@ -40,6 +40,7 @@ def create_plain_index(root, dirs, files, dirname=None):
 
     if dirname is None:
         dirname = root or '/'
+    dirname = dirname.replace('\\','/')
 
     html = "<!DOCTYPE html>\n"
     html += "<html lang='en'><head>"
@@ -104,13 +105,14 @@ def create_plain_index(root, dirs, files, dirname=None):
     html += "</body></html>\n"
     return html
 
-# ...def create_plain_index(...)
+# ...def _create_plain_index(...)
 
 
-def traverse_and_create_index(dir,
-[...]
-[...]
-[...]
+def create_html_index(dir,
+                      overwrite=False,
+                      template_fun=_create_plain_index,
+                      basepath=None,
+                      recursive=True):
     """
     Recursively traverses the local directory [dir] and generates a index
     file for each folder using [template_fun] to generate the HTML output.
@@ -118,12 +120,13 @@ def traverse_and_create_index(dir,
 
     Args:
         dir (str): directory to process
-[...]
+        overwrite (bool, optional): whether to over-write existing index file
         template_fun (func, optional): function taking three arguments (string,
             list of string, list of string) representing the current root, the list of folders,
             and the list of files. Should return the HTML source of the index file.
         basepath (str, optional): if not None, the name used for each subfolder in [dir]
             in the output files will be relative to [basepath]
+        recursive (bool, optional): recurse into subfolders
     """
 
     print('Traversing {}'.format(dir))
@@ -141,7 +144,7 @@ def traverse_and_create_index(dir,
         # Output is written to file *root*/index.html
         output_file = os.path.join(root, "index.html")
 
-        if not overwrite and os.path.isfile(output_file):
+        if (not overwrite) and os.path.isfile(output_file):
             print('Skipping {}, file exists'.format(output_file))
             continue
 
@@ -157,7 +160,10 @@ def traverse_and_create_index(dir,
         with open(output_file, 'wt') as fi:
             fi.write(html)
 
-[...]
+        if not recursive:
+            break
+
+# ...def create_html_index(...)
 
 
 #%% Command-line driver
@@ -171,7 +177,7 @@ def main(): # noqa
     parser.add_argument("--basepath", type=str,
                         help='Folder names will be printed relative to basepath, if specified',
                         default=None)
-    parser.add_argument("--[...]
+    parser.add_argument("--overwrite", action='store_true', default=False,
                         help='If set, the script will overwrite existing index.html files.')
 
     if len(sys.argv[1:]) == 0:
@@ -182,9 +188,9 @@ def main(): # noqa
 
     assert os.path.isdir(args.directory), "{} is not a valid directory".format(args.directory)
 
-[...]
-[...]
-[...]
+    create_html_index(args.directory,
+                      overwrite=args.overwrite,
+                      basepath=args.basepath)
 
 if __name__ == '__main__':
     main()
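A short usage sketch of the renamed entry point (the folder path is hypothetical; overwrite and the new recursive parameter come straight from the signature above):

from megadetector.utils.directory_listing import create_html_index

# Regenerate index.html files under a preview folder, replacing any
# existing indices; recursive=False would index only the top level.
create_html_index('/data/camera-trap-previews',
                  overwrite=True,
                  recursive=True)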
megadetector/utils/path_utils.py
CHANGED

@@ -528,7 +528,8 @@ def find_images(dirname,
 def clean_filename(filename,
                    allow_list=VALID_FILENAME_CHARS,
                    char_limit=CHAR_LIMIT,
-                   force_lower=False):
+                   force_lower=False,
+                   remove_trailing_leading_whitespace=True):
     r"""
     Removes non-ASCII and other invalid filename characters (on any
     reasonable OS) from a filename, then optionally trims to a maximum length.
@@ -544,11 +545,27 @@ def clean_filename(filename,
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
-
+        remove_trailing_leading_whitespace (bool, optional): remove trailing and
+            leading whitespace from each component of a path, e.g. does not allow
+            a/b/c /d.jpg
     Returns:
         str: cleaned version of [filename]
     """
 
+    if remove_trailing_leading_whitespace:
+
+        # Best effort to preserve the original separator
+        separator = '/'
+        if '\\' in filename:
+            separator = '\\'
+
+        filename = filename.replace('\\','/')
+        components = filename.split('/')
+        clean_components = [c.strip() for c in components]
+        filename = separator.join(clean_components)
+        if separator == '\\':
+            filename = filename.replace('/','\\')
+
     # keep only valid ascii chars
     cleaned_filename = (unicodedata.normalize('NFKD', filename)
                         .encode('ASCII', 'ignore').decode())
@@ -565,7 +582,8 @@ def clean_filename(filename,
 def clean_path(pathname,
                allow_list=VALID_PATH_CHARS,
                char_limit=CHAR_LIMIT,
-               force_lower=False):
+               force_lower=False,
+               remove_trailing_leading_whitespace=True):
     """
     Removes non-ASCII and other invalid path characters (on any reasonable
     OS) from a path, then optionally trims to a maximum length.
@@ -576,13 +594,20 @@ def clean_path(pathname,
         char_limit (int, optional): maximum allowable filename length, if None will skip this
             step
         force_lower (bool, optional): convert the resulting filename to lowercase
+        remove_trailing_leading_whitespace (bool, optional): remove trailing and
+            leading whitespace from each component of a path, e.g. does not allow
+            a/b/c /d.jpg
 
     Returns:
         str: cleaned version of [filename]
     """
 
-    return clean_filename(pathname,
-[...]
+    return clean_filename(pathname,
+                          allow_list=allow_list,
+                          char_limit=char_limit,
+                          force_lower=force_lower,
+                          remove_trailing_leading_whitespace=\
+                              remove_trailing_leading_whitespace)
 
 
 def flatten_path(pathname,separator_chars=SEPARATOR_CHARS,separator_char_replacement='~'):
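Based on the component-wise stripping logic added above, whitespace around each path component is removed before the ASCII cleanup. Expected behavior, inferred from the new code rather than taken from the package's tests (and assuming spaces remain in VALID_PATH_CHARS):

from megadetector.utils.path_utils import clean_path

# Each component between separators is strip()ed; clean_path permits
# separators, unlike clean_filename, which removes them entirely.
assert clean_path('a/b/c /d.jpg') == 'a/b/c/d.jpg'

# The old behavior remains available via the new keyword
assert clean_path('a/b/c /d.jpg',
                  remove_trailing_leading_whitespace=False) == 'a/b/c /d.jpg'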
@@ -1553,6 +1578,7 @@ class TestPathUtils:
         """
 
         self.test_dir = make_test_folder(subfolder='megadetector/path_utils_tests')
+        print('Using temporary folder {} for path utils testing'.format(self.test_dir))
         os.makedirs(self.test_dir, exist_ok=True)
 
 
@@ -1776,7 +1802,11 @@ class TestPathUtils:
             ])
         folders_non_recursive_abs = folder_list(folder_list_dir, recursive=False,
                                                 return_relative_paths=False)
-        assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs
+        assert sorted(folders_non_recursive_abs) == expected_folders_non_recursive_abs, \
+            'Non-recursive folder list failured, expected:\n\n{}\n\nFound:\n\n{}'.format(
+                str(expected_folders_non_recursive_abs),
+                str(folders_non_recursive_abs)
+            )
 
         # Test non-recursive, relative paths
         expected_folders_non_recursive_rel = sorted(['subdir1', 'subdir2'])
@@ -2114,7 +2144,17 @@ class TestPathUtils:
         assert clean_filename("test*file?.txt", char_limit=10) == "testfile.t"
         assert clean_filename("TestFile.TXT", force_lower=True) == "testfile.txt"
         assert clean_filename("file:with<illegal>chars.txt") == "filewithillegalchars.txt"
-
+
+        s = " accented_name_éà.txt"
+
+        assert clean_filename(s,
+            remove_trailing_leading_whitespace=False) == " accented_name_ea.txt", \
+            'clean_filename with remove_trailing_leading_whitespace=False: {}'.format(
+            clean_filename(s, remove_trailing_leading_whitespace=False))
+
+        assert clean_filename(s, remove_trailing_leading_whitespace=True) == "accented_name_ea.txt", \
+            'clean_filename with remove_trailing_leading_whitespace=False: {}'.format(
+            clean_filename(s, remove_trailing_leading_whitespace=True))
 
         # Separators are not allowed by default in clean_filename
         assert clean_filename("path/to/file.txt") == "pathtofile.txt"
@@ -2444,7 +2484,13 @@ class TestPathUtils:
         un_tar_dir = os.path.join(self.test_dir, "un_tar_contents")
         os.makedirs(un_tar_dir, exist_ok=True)
         with tarfile.open(output_tar_path, 'r:gz') as tf:
-            tf.extractall(path=un_tar_dir)
+            # The "filter" option was added as of Python 3.12, and *not* specifying
+            # filter=None will change behavior as of Python 3.14. We want the unmodified
+            # behavior, but we want to support Python <3.12, so we do a version check.
+            if sys.version_info >= (3, 12):
+                tf.extractall(path=un_tar_dir, filter=None)
+            else:
+                tf.extractall(path=un_tar_dir)
 
         expected_untarred_file1 = os.path.join(un_tar_dir, os.path.relpath(file1_path, self.test_dir))
         expected_untarred_file2 = os.path.join(un_tar_dir, os.path.relpath(file2_path, self.test_dir))
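The comment added to the test explains the motivation: tarfile's extraction methods gained a filter parameter in Python 3.12, and the default extraction behavior is slated to change in Python 3.14, so the test pins down the unmodified behavior while staying runnable on older interpreters. The same version-gated pattern is reusable as a generic helper (a sketch, not code from this package):

import sys
import tarfile

def extract_all_compat(tar_path, dest_dir):
    # Pass the 3.12+ 'filter' argument only where it exists, so the
    # call runs unchanged on older interpreters.
    with tarfile.open(tar_path, 'r:gz') as tf:
        if sys.version_info >= (3, 12):
            tf.extractall(path=dest_dir, filter=None)
        else:
            tf.extractall(path=dest_dir)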
@@ -2618,7 +2664,9 @@ def test_path_utils():
 
     test_instance = TestPathUtils()
     test_instance.set_up()
+
     try:
+
         test_instance.test_is_image_file()
         test_instance.test_find_image_strings()
         test_instance.test_find_images()
@@ -2643,5 +2691,7 @@ def test_path_utils():
         test_instance.test_add_files_to_single_tar_file()
         test_instance.test_parallel_zip_individual_files_and_folders()
         test_instance.test_compute_file_hash()
+
     finally:
+
         test_instance.tear_down()
megadetector/utils/url_utils.py
CHANGED

@@ -2,7 +2,7 @@
 
 url_utils.py
 
-Frequently-used functions for downloading or [...]
+Frequently-used functions for downloading, manipulating, or serving URLs
 
 """
 
@@ -16,6 +16,9 @@ import urllib.error
 import requests
 import shutil
 import pytest
+import socketserver
+import threading
+import http.server
 
 from functools import partial
 from tqdm import tqdm
@@ -453,6 +456,93 @@ def get_url_sizes(urls,n_workers=1,pool_type='thread',timeout=None,verbose=False
     return url_to_size
 
 
+#%% Singleton HTTP server
+
+class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
+    """
+    SimpleHTTPRequestHandler sublcass that suppresses console printouts
+    """
+    def __init__(self, *args, directory=None, **kwargs):
+        super().__init__(*args, directory=directory, **kwargs)
+
+    def log_message(self, format, *args): # noqa
+        pass
+
+
+class SingletonHTTPServer:
+    """
+    HTTP server that runs on a local port, serving a particular local folder. Runs as a
+    singleton, so starting a server in a new folder closes the previous server. I use this
+    primarily to serve MD/SpeciesNet previews from manage_local_batch, which can exceed
+    the 260-character filename length limitation imposed by browser on Windows, so really the
+    point here is just to remove characters from the URL.
+    """
+
+    _server = None
+    _thread = None
+
+    @classmethod
+    def start_server(cls, directory, port=8000, host='localhost'):
+        """
+        Start or restart the HTTP server with a specific directory
+
+        Args:
+            directory (str): the root folder served by the server
+            port (int, optional): the port on which to create the server
+            host (str, optional): the host on which to listen, typically
+                either "localhost" (default) or "0.0.0.0"
+
+        Returns:
+            str: URL to the running host
+        """
+
+        # Stop the existing server instance if necessary
+        cls.stop_server()
+
+        # Create new server
+        handler = partial(QuietHTTPRequestHandler, directory=directory)
+        cls._server = socketserver.TCPServer((host, port), handler)
+
+        # Start server in daemon thread (dies when parent process dies)
+        cls._thread = threading.Thread(target=cls._server.serve_forever)
+        cls._thread.daemon = True
+        cls._thread.start()
+
+        print(f"Serving {directory} at http://{host}:{port}")
+        return f"http://{host}:{port}"
+
+
+    @classmethod
+    def stop_server(cls):
+        """
+        Stop the current server (if one is running)
+        """
+
+        if cls._server:
+            cls._server.shutdown()
+            cls._server.server_close()
+            cls._server = None
+        if cls._thread:
+            cls._thread.join(timeout=1)
+            cls._thread = None
+
+
+    @classmethod
+    def is_running(cls):
+        """
+        Check whether the server is currently running.
+
+        Returns:
+            bool: True if the server is running
+        """
+
+        return (cls._server is not None) and \
+               (cls._thread is not None) and \
+               (cls._thread.is_alive())
+
+# ...class SingletonHTTPServer
+
+
 #%% Tests
 
 # Constants for tests
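The SingletonHTTPServer class is the bulk of this release's additions to url_utils.py. A usage sketch (the served folder is hypothetical):

from megadetector.utils.url_utils import SingletonHTTPServer

# Serve a local preview folder; starting a server on a new folder
# shuts down the previous one, per the singleton contract.
base_url = SingletonHTTPServer.start_server('/data/preview-output', port=8000)
print(base_url)  # http://localhost:8000

assert SingletonHTTPServer.is_running()
SingletonHTTPServer.stop_server()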