dclab 0.67.0__cp314-cp314-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dclab might be problematic. Click here for more details.
- dclab/__init__.py +41 -0
- dclab/_version.py +34 -0
- dclab/cached.py +97 -0
- dclab/cli/__init__.py +10 -0
- dclab/cli/common.py +237 -0
- dclab/cli/task_compress.py +126 -0
- dclab/cli/task_condense.py +223 -0
- dclab/cli/task_join.py +229 -0
- dclab/cli/task_repack.py +98 -0
- dclab/cli/task_split.py +154 -0
- dclab/cli/task_tdms2rtdc.py +186 -0
- dclab/cli/task_verify_dataset.py +75 -0
- dclab/definitions/__init__.py +79 -0
- dclab/definitions/feat_const.py +202 -0
- dclab/definitions/feat_logic.py +182 -0
- dclab/definitions/meta_const.py +252 -0
- dclab/definitions/meta_logic.py +111 -0
- dclab/definitions/meta_parse.py +94 -0
- dclab/downsampling.cpython-314-darwin.so +0 -0
- dclab/downsampling.pyx +230 -0
- dclab/external/__init__.py +4 -0
- dclab/external/packaging/LICENSE +3 -0
- dclab/external/packaging/LICENSE.APACHE +177 -0
- dclab/external/packaging/LICENSE.BSD +23 -0
- dclab/external/packaging/__init__.py +6 -0
- dclab/external/packaging/_structures.py +61 -0
- dclab/external/packaging/version.py +505 -0
- dclab/external/skimage/LICENSE +28 -0
- dclab/external/skimage/__init__.py +2 -0
- dclab/external/skimage/_find_contours.py +216 -0
- dclab/external/skimage/_find_contours_cy.cpython-314-darwin.so +0 -0
- dclab/external/skimage/_find_contours_cy.pyx +188 -0
- dclab/external/skimage/_pnpoly.cpython-314-darwin.so +0 -0
- dclab/external/skimage/_pnpoly.pyx +99 -0
- dclab/external/skimage/_shared/__init__.py +1 -0
- dclab/external/skimage/_shared/geometry.cpython-314-darwin.so +0 -0
- dclab/external/skimage/_shared/geometry.pxd +6 -0
- dclab/external/skimage/_shared/geometry.pyx +55 -0
- dclab/external/skimage/measure.py +7 -0
- dclab/external/skimage/pnpoly.py +53 -0
- dclab/external/statsmodels/LICENSE +35 -0
- dclab/external/statsmodels/__init__.py +6 -0
- dclab/external/statsmodels/nonparametric/__init__.py +1 -0
- dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
- dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
- dclab/external/statsmodels/nonparametric/kernels.py +36 -0
- dclab/features/__init__.py +9 -0
- dclab/features/bright.py +81 -0
- dclab/features/bright_bc.py +93 -0
- dclab/features/bright_perc.py +63 -0
- dclab/features/contour.py +161 -0
- dclab/features/emodulus/__init__.py +339 -0
- dclab/features/emodulus/load.py +252 -0
- dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
- dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
- dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
- dclab/features/emodulus/pxcorr.py +135 -0
- dclab/features/emodulus/scale_linear.py +247 -0
- dclab/features/emodulus/viscosity.py +260 -0
- dclab/features/fl_crosstalk.py +95 -0
- dclab/features/inert_ratio.py +377 -0
- dclab/features/volume.py +242 -0
- dclab/http_utils.py +322 -0
- dclab/isoelastics/__init__.py +468 -0
- dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
- dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
- dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
- dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
- dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
- dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
- dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
- dclab/kde/__init__.py +1 -0
- dclab/kde/base.py +459 -0
- dclab/kde/contours.py +222 -0
- dclab/kde/methods.py +313 -0
- dclab/kde_contours.py +10 -0
- dclab/kde_methods.py +11 -0
- dclab/lme4/__init__.py +5 -0
- dclab/lme4/lme4_template.R +94 -0
- dclab/lme4/rsetup.py +204 -0
- dclab/lme4/wrapr.py +386 -0
- dclab/polygon_filter.py +398 -0
- dclab/rtdc_dataset/__init__.py +15 -0
- dclab/rtdc_dataset/check.py +902 -0
- dclab/rtdc_dataset/config.py +533 -0
- dclab/rtdc_dataset/copier.py +353 -0
- dclab/rtdc_dataset/core.py +896 -0
- dclab/rtdc_dataset/export.py +867 -0
- dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
- dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
- dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
- dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
- dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
- dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
- dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
- dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
- dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
- dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
- dclab/rtdc_dataset/feat_basin.py +762 -0
- dclab/rtdc_dataset/feat_temp.py +102 -0
- dclab/rtdc_dataset/filter.py +263 -0
- dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
- dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
- dclab/rtdc_dataset/fmt_dcor/api.py +173 -0
- dclab/rtdc_dataset/fmt_dcor/base.py +299 -0
- dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
- dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
- dclab/rtdc_dataset/fmt_dcor/tables.py +66 -0
- dclab/rtdc_dataset/fmt_dict.py +103 -0
- dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
- dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
- dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
- dclab/rtdc_dataset/fmt_hdf5/events.py +276 -0
- dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
- dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
- dclab/rtdc_dataset/fmt_hdf5/tables.py +60 -0
- dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
- dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
- dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
- dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
- dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
- dclab/rtdc_dataset/fmt_http.py +102 -0
- dclab/rtdc_dataset/fmt_s3.py +354 -0
- dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
- dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
- dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
- dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
- dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
- dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
- dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
- dclab/rtdc_dataset/load.py +77 -0
- dclab/rtdc_dataset/meta_table.py +25 -0
- dclab/rtdc_dataset/writer.py +1019 -0
- dclab/statistics.py +226 -0
- dclab/util.py +176 -0
- dclab/warn.py +15 -0
- dclab-0.67.0.dist-info/METADATA +153 -0
- dclab-0.67.0.dist-info/RECORD +142 -0
- dclab-0.67.0.dist-info/WHEEL +6 -0
- dclab-0.67.0.dist-info/entry_points.txt +8 -0
- dclab-0.67.0.dist-info/licenses/LICENSE +283 -0
- dclab-0.67.0.dist-info/top_level.txt +1 -0
dclab/features/volume.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""Volume computation based on contour revolution"""
|
|
2
|
+
import numpy as np
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def get_volume(cont, pos_x, pos_y, pix, fix_orientation=False):
    """Calculate the volume of a polygon revolved around an axis

    The volume estimation assumes rotational symmetry.

    Parameters
    ----------
    cont: ndarray or list of ndarrays of shape (N,2)
        A 2D array that holds the contour of an event [px]
        e.g. obtained using `mm.contour` where `mm` is an instance
        of `RTDCBase`. The first and second columns of `cont`
        correspond to the x- and y-coordinates of the contour.
    pos_x: float or ndarray of length N
        The x coordinate(s) of the centroid of the event(s) [µm]
        e.g. obtained using `mm.pos_x`
    pos_y: float or ndarray of length N
        The y coordinate(s) of the centroid of the event(s) [µm]
        e.g. obtained using `mm.pos_y`
    pix: float
        The detector pixel size in µm.
        e.g. obtained using: `mm.config["imaging"]["pixel size"]`
    fix_orientation: bool
        If set to True, make sure that the orientation of the
        contour is counter-clockwise in the r-z plane
        (see :func:`vol_revolve`). This is False by default, because
        (1) Shape-In always stores the contours in the correct
        orientation and (2) there may be events with high porosity
        where "fixing" the orientation makes things worse and a
        negative volume is returned.

    Returns
    -------
    volume: float or ndarray
        volume in um^3; a scalar is returned if `pos_x` is a scalar
        (or a zero-dimensional array), otherwise an array with the
        shape of `pos_x`. Events whose contour has fewer than four
        points yield `np.nan`.

    Raises
    ------
    ValueError
        If `pos_x` and `pos_y` differ in size or if several positions
        but at most one contour are given.

    Notes
    -----
    The computation of the volume is based on a full rotation of the
    upper and the lower halves of the contour from which the
    average is then used.

    The volume is computed radially from the center position
    given by (`pos_x`, `pos_y`). For sufficiently smooth contours,
    such as densely sampled ellipses, the center position does not
    play an important role. For contours that are given on a coarse
    grid, as is the case for RT-DC, the center position must be
    given.

    References
    ----------
    - https://de.wikipedia.org/wiki/Kegelstumpf#Formeln
    - Yields identical results to the Matlab script by `Geoff Olynyk
      <https://de.mathworks.com/matlabcentral/fileexchange/36525-volrevolve>`_
    """
    # `np.ndim` also recognizes zero-dimensional arrays as a single event
    # (`np.isscalar` does not), so that those are not mistaken for a list.
    if np.ndim(pos_x) == 0:
        cont = [cont]
        ret_list = False
    else:
        ret_list = True

    # Convert input to 1D arrays
    pos_x = np.atleast_1d(pos_x)
    pos_y = np.atleast_1d(pos_y)

    if pos_x.size != pos_y.size:
        raise ValueError("Size of `pos_x` and `pos_y` must match!")

    if pos_x.size > 1 and len(cont) <= 1:
        raise ValueError("Number of given contours too small!")

    # results are stored in a separate array initialized with nans
    v_avg = np.zeros_like(pos_x, dtype=np.float64) * np.nan

    # v_avg has the shape of `pos_x`. We are iterating over the smallest
    # length for `cont` and `pos_x`.
    for ii in range(min(len(cont), pos_x.shape[0])):
        # If the contour has less than 4 pixels, the computation will fail.
        # In that case, the value np.nan is already assigned.
        cc = cont[ii]
        if cc.shape[0] >= 4:
            # Center contour coordinates with given centroid
            contour_x = cc[:, 0] - pos_x[ii] / pix
            contour_y = cc[:, 1] - pos_y[ii] / pix
            # Switch to r and z to follow notation of vol_revolve
            # (In RT-DC the axis of rotation is x, but for vol_revolve
            # we need the axis vertically)
            contour_r = contour_y
            contour_z = contour_x
            if fix_orientation:
                # Make sure the contour is counter-clockwise
                contour_r, contour_z = counter_clockwise(contour_r, contour_z)

            # From here on, only `contour_r` and `contour_z` may be used:
            # `counter_clockwise` may have reversed both of them, so
            # pairing `contour_r` with the original `contour_x` would mix
            # up the coordinates of different contour points.

            # Compute right volume
            # Which points are at negative r-values (r<0)?
            inx_neg = np.where(contour_r < 0)
            # These points will be shifted up to r=0 directly on the z-axis
            contour_right = np.copy(contour_r)
            contour_right[inx_neg] = 0
            vol_right = vol_revolve(contour_right, contour_z, pix)

            # Compute left volume
            # Which points are at positive r-values? (r>0)?
            idx_pos = np.where(contour_r > 0)
            # These points will be shifted down to r=0 onto the z-axis
            contour_left = np.copy(contour_r)
            contour_left[idx_pos] = 0
            # Now we still have negative r values, but vol_revolve needs
            # positive values, so we flip the sign...
            contour_left[:] *= -1
            # ... but in doing so, we have switched to clockwise rotation
            # and we need to pass the array in reverse order
            vol_left = vol_revolve(contour_left[::-1], contour_z[::-1], pix)

            # Compute the average
            v_avg[ii] = (vol_right + vol_left) / 2

    if not ret_list:
        # Do not return a list if the input contour was not in a list
        v_avg = v_avg[0]

    return v_avg
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def counter_clockwise(cx, cy):
    """Return contour coordinates in counter-clockwise order

    The orientation is determined from the average change of the
    (unwrapped) polar angle along the contour. If the angle decreases
    on average, the contour is considered clockwise and both
    coordinate arrays are returned in reversed order.

    Parameters
    ----------
    cx, cy: 1d ndarrays
        The x- and y-coordinates of the contour

    Returns
    -------
    cx_cc, cy_cc:
        The x- and y-coordinates of the contour in
        counter-clockwise orientation.

    Notes
    -----
    The contour must be centered around (0, 0).
    """
    # polar angle of every contour point without 2π jumps
    phase = np.unwrap(np.arctan2(cy, cx))
    mean_step = np.average(np.diff(phase))
    if not mean_step < 0:
        # already counter-clockwise (or orientation undetermined)
        return cx, cy
    return cx[::-1], cy[::-1]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def vol_revolve(r, z, point_scale=1.):
    r"""Volume of a polygon revolved around the Z-axis

    The result matches that of the volRevolve Matlab function by
    Geoff Olynyk (from 2012-05-03)
    https://de.mathworks.com/matlabcentral/fileexchange/36525-volrevolve.

    In contrast to that script, the volume is computed from the
    (more approachable) formula for the volume of a truncated
    cone (https://de.wikipedia.org/wiki/Kegelstumpf).

    .. math::

        V = \frac{h \cdot \pi}{3} \cdot (R^2 + R \cdot r + r^2)

    Where :math:`h` is the height of the cone and :math:`r` and
    `R` are the smaller and larger radii of the truncated cone.

    Every line segment of the contour corresponds to one truncated
    cone. For a positive z-step (counter-clockwise contour), the
    volume of the truncated cone is added to the total volume. For
    a negative z-step (e.g. inclusion), it is subtracted from the
    total volume.

    .. versionchanged:: 0.37.0

        The volume in previous versions was overestimated by on average
        2µm³.

    Parameters
    ----------
    r: 1d np.ndarray
        radial coordinates (perpendicular to the z axis)
    z: 1d np.ndarray
        coordinate along the axis of rotation
    point_scale: float
        point size in your preferred units; The volume is multiplied
        by a factor of `point_scale**3`.

    Notes
    -----
    The coordinates must be given in counter-clockwise order,
    otherwise the volume will be negative.
    """
    radii = np.array(r).flatten()
    heights = np.array(z).flatten()

    # sanity checks
    assert len(radii) == len(heights)
    assert len(radii) >= 3
    assert len(radii.shape) == len(heights.shape) == 1
    assert np.all(radii >= 0)

    # An open contour is closed by appending its first point:
    # `np.resize` fills the additional element by repeating the
    # array from its beginning.
    is_open = (radii[-1] != radii[0]) or (heights[-1] != heights[0])
    if is_open:
        radii = np.resize(radii, len(radii) + 1)
        heights = np.resize(heights, len(heights) + 1)

    # radius at the beginning of each segment
    seg_r = radii[:-1]
    # radius differences (R - r) and height differences (h) per segment
    delta_r = np.diff(radii)
    delta_z = np.diff(heights)

    # Substituting R = r + dr in the truncated cone formula yields three
    # terms that all resemble area slices (to be multiplied by dz).
    area_terms = 3 * seg_r**2 + 3 * seg_r*delta_r + delta_r**2

    # The formula is symmetric in r and R, so the sign of dr does not
    # matter. The sign of dz is kept on purpose: For a counter-clockwise
    # contour, positive dz adds volume while negative dz removes volume.
    cone_volumes = np.pi / 3 * delta_z * np.abs(area_terms)

    return np.sum(cone_volumes) * point_scale ** 3
|
dclab/http_utils.py
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
import io
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import socket
|
|
6
|
+
from unittest import mock
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
import warnings
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
import requests
|
|
15
|
+
except ModuleNotFoundError:
|
|
16
|
+
requests = mock.Mock()
|
|
17
|
+
REQUESTS_AVAILABLE = False
|
|
18
|
+
else:
|
|
19
|
+
REQUESTS_AVAILABLE = True
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
#: Regular expression for matching a regular HTTP URL
#: (the protocol prefix is mandatory and a non-empty path is required)
REGEXP_HTTP_URL = re.compile(
    r"^(https?:\/\/)"  # protocol (http or https; must be present)
    r"([a-z0-9-\.]*)(\:[0-9]*)?\/"  # host (lowercase) with optional :port
    r".+"  # path
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ETagNotInResponseHeaderWarning(UserWarning):
    """Warning issued when a requests.Response carries no usable ETag"""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ConnectionTimeoutWarning(UserWarning):
    """Warning issued whenever a connection attempt fails or times out"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class HTTPFile(io.IOBase):
    def __init__(self, url, chunk_size=2**18, keep_chunks=200):
        """Chunk-cached access to a URL supporting range requests

        Range requests (https://en.wikipedia.org/wiki/Byte_serving)
        allow clients to access specific parts of a file via HTTP
        without downloading the entire file.

        This class creates a file-like object from a URL that can
        then be passed on to e.g. h5py for reading. In addition, this
        class keeps a chunk cache of the URL, making it (A) fast to
        access frequently used parts of the file and (B) fast to slice
        through large files since the ratio of data downloaded versus
        (time-consuming) HTTP requests is very large.

        Parameters
        ----------
        url: str
            Path to the URL
        chunk_size: int
            Download chunk size. The entire file is split up into
            equally-sized (and thus indexable) chunks.
        keep_chunks: int
            Number of downloaded chunks to keep in memory. For a
            `chunk_size` of 2**18 bytes, a `keep_chunks` of 200
            implies a chunk cache size of 50 MiB.
        """
        self.url = url
        self._chunk_size = chunk_size
        self._keep_chunks = keep_chunks
        self.session = session_cache.get_session(url)
        # populated lazily by `_parse_header`
        self._len = None
        self._etag = None
        # current position of the file pointer
        self._pos = 0
        # chunk index -> downloaded bytes
        self.cache = {}

    def _parse_header(self):
        """Parse the header sent by the server, populates length and etag

        The request is only made once; subsequent calls return
        immediately.

        Raises
        ------
        ValueError
            If the server does not reply with status code 200
        """
        if self._len is not None:
            return
        # Do not use `self.session.head`, because it might return a
        # wrong content-length for pre-signed S3 URLS.
        resp = self.session.get(self.url, stream=True, timeout=0.5)
        try:
            if resp.status_code != 200:
                raise ValueError(
                    f"Server replied with status code {resp.status_code} "
                    f"{resp.reason} for '{self.url}'")
            self._len = int(resp.headers["content-length"])
            # Try to determine the etag of the file.
            etag = resp.headers.get("etag", "").strip("'").strip('"')
        finally:
            # The body of a streamed response is never consumed here, so
            # the response must be closed explicitly to release the
            # underlying connection.
            resp.close()
        if len(etag) < 5:
            etag = None
            warnings.warn(f"Got empty ETag header for {self.url}",
                          ETagNotInResponseHeaderWarning)
        self._etag = etag

    @property
    def etag(self):
        """Unique identifier for this resource (version) by the web server"""
        self._parse_header()
        return self._etag

    @property
    def length(self):
        """Size of the remote file in bytes (from `content-length`)"""
        self._parse_header()
        return self._len

    @property
    def max_cache_size(self):
        """The maximum cache size allowed by `chunk_size` and `keep_chunks`"""
        return self._chunk_size * self._keep_chunks

    def close(self):
        """Close the file

        This closes the requests session and then calls `close` on
        the super class.
        """
        self.session.close()
        super(HTTPFile, self).close()

    def download_range(self, start, stop):
        """Download bytes given by the range (`start`, `stop`)

        `stop` is not inclusive (In the HTTP range request it normally is).
        """
        resp = self.session.get(self.url,
                                headers={"Range": f"bytes={start}-{stop-1}"}
                                )
        return resp.content

    def get_cache_chunk(self, index):
        """Return the cache chunk defined by `index`

        If the chunk is not in `self.cache`, it is downloaded.
        """
        try:
            return self.cache[index]
        except KeyError:
            pass

        start = index*self._chunk_size
        stop = min((index+1)*self._chunk_size, self.length)
        # Keep a local reference, so that the chunk can be returned even
        # if the eviction below removes it from the cache again (this may
        # happen for very small values of `keep_chunks`).
        chunk = self.download_range(start, stop)
        self.cache[index] = chunk
        if len(self.cache) > self._keep_chunks:
            # Remove the chunk that was inserted first (dictionaries
            # preserve insertion order).
            for kk in self.cache:
                if kk != 0:  # always keep the first chunk
                    del self.cache[kk]
                    break
        return chunk

    def read(self, size=-1, /):
        """Cache-supported read operation (file object)

        Parameters
        ----------
        size: int or None
            Number of bytes to read; if negative or None, everything
            up to the end of the file is read.

        Returns
        -------
        data: bytes
            The requested data; fewer than `size` bytes are returned
            if the end of the file is reached. The file position is
            advanced by the number of bytes returned.
        """
        start = self._pos
        if size is None or size < 0:
            stop = self.length
        else:
            stop = min(start + size, self.length)
        data = self.read_range_cached(start, stop)
        self._pos = start + len(data)
        return data

    def read_range_cached(self, start, stop):
        """Concatenate the requested bytes from the cached chunks

        This calls `get_cache_chunk` and thus downloads cache
        chunks when necessary. `stop` is not inclusive and is limited
        to the length of the file.
        """
        start = int(start)
        # never request data beyond the end of the file
        stop = min(int(stop), self.length)
        if stop <= start:
            return b""

        parts = []
        pos = start
        while pos < stop:
            # A chunk is only fetched if there is still data to be read
            # from it. This avoids requesting a chunk that is located
            # after the end of the requested range (or of the file).
            chunk_index, offset = divmod(pos, self._chunk_size)
            chunk = self.get_cache_chunk(chunk_index)
            take = min(self._chunk_size - offset, stop - pos)
            parts.append(chunk[offset:offset + take])
            pos += take

        return b"".join(parts)

    def seek(self, offset, whence=os.SEEK_SET):
        """Seek to a position (file object)

        Returns
        -------
        pos: int
            The new absolute position

        Raises
        ------
        ValueError
            If `whence` is not one of `os.SEEK_SET`, `os.SEEK_CUR`,
            or `os.SEEK_END`
        """
        if whence == os.SEEK_SET:
            self._pos = offset
        elif whence == os.SEEK_CUR:
            self._pos += offset
        elif whence == os.SEEK_END:
            self._pos = self.length + offset
        else:
            raise ValueError(f"Unsupported whence value: {whence!r}")
        return self._pos

    def seekable(self):
        """The HTTP file is seekable"""
        return True

    def tell(self):
        """Tell the position (file object)"""
        return self._pos
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class ResoluteRequestsSessionCache:
    def __init__(self):
        """A multiprocessing-safe cache for requests session objects

        This class implements `__getstate__` and `__setstate__`
        methods that drop all sessions, so that when used in a
        multiprocessing context, sessions are never mirrored to the
        subprocesses. Each subprocess creates its own sessions.

        Note that only :class:`ResoluteRequestsSession` objects are used,
        which is ideal for the use-case of unstable internet connections.
        """
        #: This dictionary holds all sessions in use by the current process.
        #: Sessions are stored with the host name / netloc as the key.
        self.sessions = {}

    def __getstate__(self):
        """Return a state without sessions (not pickled into subprocesses)

        The state must not be None (or otherwise falsy): pickle does not
        call `__setstate__` in that case, and because `__init__` is not
        run during unpickling, the restored instance would lack the
        `sessions` attribute.
        """
        return {"sessions": {}}

    def __setstate__(self, state):
        """Start with an empty session dictionary (see `__getstate__`)"""
        self.sessions = {}

    def get_session(self, url: str):
        """Return a requests session for the specified URL

        For each hostname, a different session is returned,
        but for identical hostnames, cached sessions are used.
        """
        urlp = urlparse(url)
        key = urlp.netloc
        if key not in self.sessions:
            self.sessions[key] = ResoluteRequestsSession()
        return self.sessions[key]
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
class ResoluteRequestsSession(requests.Session):
    """A session with built-in retry for `get`"""
    def get(self, *args, **kwargs):
        """Perform a GET request, retrying on connection problems

        A `timeout` of 0.5 is used unless specified otherwise. Up to
        100 attempts are made; for each failed attempt a
        :class:`ConnectionTimeoutWarning` is issued. If all attempts
        fail, `requests.exceptions.ReadTimeout` is raised.
        """
        if "timeout" not in kwargs:
            kwargs["timeout"] = 0.5
        attempt = 0
        while attempt < 100:
            attempt += 1
            try:
                # NOTE(review): the explicit two-argument `super` is kept
                # on purpose; presumably it is the safer choice with the
                # `mock.Mock()` fallback for `requests` - confirm before
                # changing.
                return super(ResoluteRequestsSession,
                             self).get(*args, **kwargs)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectTimeout,
                    requests.urllib3.exceptions.ConnectionError,
                    requests.urllib3.exceptions.ReadTimeoutError) as exc:
                warnings.warn(f"Encountered {exc} for {args} {kwargs}",
                              ConnectionTimeoutWarning)
        # none of the attempts succeeded
        raise requests.exceptions.ReadTimeout(
            f"Resolute session failed for {args} and {kwargs}!")
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def is_url_available(url: str, ret_reason=False):
    """Check whether a URL is available

    Parameters
    ----------
    url: str
        full URL to the object
    ret_reason: bool
        whether to return reason for unavailability

    Returns
    -------
    available: bool
        whether the URL is available
    reason: str
        reason for the URL not being available if `available` is False
        (only returned if `ret_reason` is True)
    """
    avail = False
    reason = "none"
    if is_http_url(url):
        urlp = urlparse(url)
        try:
            # default to https if no scheme or port is specified
            port = urlp.port or (80 if urlp.scheme == "http" else 443)
        except ValueError:
            # `urlp.port` raises ValueError for invalid port numbers
            port = None
        # Use `hostname`, not `netloc`, because `netloc` contains
        # the port number which we do not want here.
        host = urlp.hostname
        if not host or port is None:
            reason = "invalid"
        else:
            # Try to connect to the host. `create_connection` resolves
            # the host name and tries all returned addresses (IPv4 and
            # IPv6) instead of IPv4 only.
            try:
                sock = socket.create_connection((host, port), timeout=1)
            except OSError:
                # includes `socket.gaierror` (a subclass of OSError)
                reason = "no connection"
            else:
                sock.close()
                # Try to access the url
                try:
                    ses = session_cache.get_session(url)
                    req = ses.get(url, stream=True, timeout=1)
                except OSError:
                    reason = "oserror"
                else:
                    try:
                        avail = req.ok
                        if not avail:
                            reason = req.reason.lower()
                    finally:
                        # The body of the streamed response is not
                        # consumed; close the response to release
                        # the connection.
                        req.close()
    else:
        reason = "invalid"
    if ret_reason:
        return avail, reason
    else:
        return avail
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
@functools.lru_cache()
def _match_http_url(string):
    """Cached regexp match of a `str` against `REGEXP_HTTP_URL`"""
    return REGEXP_HTTP_URL.match(string.strip())


def is_http_url(string):
    """Check whether `string` is a valid URL using regexp

    Returns False for anything that is not a `str`; otherwise the
    result of the regexp match (a match object or None) is returned.
    """
    # The type check must take place before the cached lookup, because
    # `lru_cache` hashes its arguments and would raise a TypeError for
    # unhashable objects (e.g. lists or dictionaries).
    if not isinstance(string, str):
        return False
    return _match_http_url(string)


# Keep the cache management functions available on the public function.
is_http_url.cache_clear = _match_http_url.cache_clear
is_http_url.cache_info = _match_http_url.cache_info
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
#: cache of requests sessions for current process
#: (one session per netloc, see `ResoluteRequestsSessionCache.get_session`)
session_cache = ResoluteRequestsSessionCache()
|