dclab 0.67.0__cp314-cp314t-macosx_10_13_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dclab might be problematic. Click here for more details.
- dclab/__init__.py +41 -0
- dclab/_version.py +34 -0
- dclab/cached.py +97 -0
- dclab/cli/__init__.py +10 -0
- dclab/cli/common.py +237 -0
- dclab/cli/task_compress.py +126 -0
- dclab/cli/task_condense.py +223 -0
- dclab/cli/task_join.py +229 -0
- dclab/cli/task_repack.py +98 -0
- dclab/cli/task_split.py +154 -0
- dclab/cli/task_tdms2rtdc.py +186 -0
- dclab/cli/task_verify_dataset.py +75 -0
- dclab/definitions/__init__.py +79 -0
- dclab/definitions/feat_const.py +202 -0
- dclab/definitions/feat_logic.py +182 -0
- dclab/definitions/meta_const.py +252 -0
- dclab/definitions/meta_logic.py +111 -0
- dclab/definitions/meta_parse.py +94 -0
- dclab/downsampling.cpython-314t-darwin.so +0 -0
- dclab/downsampling.pyx +230 -0
- dclab/external/__init__.py +4 -0
- dclab/external/packaging/LICENSE +3 -0
- dclab/external/packaging/LICENSE.APACHE +177 -0
- dclab/external/packaging/LICENSE.BSD +23 -0
- dclab/external/packaging/__init__.py +6 -0
- dclab/external/packaging/_structures.py +61 -0
- dclab/external/packaging/version.py +505 -0
- dclab/external/skimage/LICENSE +28 -0
- dclab/external/skimage/__init__.py +2 -0
- dclab/external/skimage/_find_contours.py +216 -0
- dclab/external/skimage/_find_contours_cy.cpython-314t-darwin.so +0 -0
- dclab/external/skimage/_find_contours_cy.pyx +188 -0
- dclab/external/skimage/_pnpoly.cpython-314t-darwin.so +0 -0
- dclab/external/skimage/_pnpoly.pyx +99 -0
- dclab/external/skimage/_shared/__init__.py +1 -0
- dclab/external/skimage/_shared/geometry.cpython-314t-darwin.so +0 -0
- dclab/external/skimage/_shared/geometry.pxd +6 -0
- dclab/external/skimage/_shared/geometry.pyx +55 -0
- dclab/external/skimage/measure.py +7 -0
- dclab/external/skimage/pnpoly.py +53 -0
- dclab/external/statsmodels/LICENSE +35 -0
- dclab/external/statsmodels/__init__.py +6 -0
- dclab/external/statsmodels/nonparametric/__init__.py +1 -0
- dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
- dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
- dclab/external/statsmodels/nonparametric/kernels.py +36 -0
- dclab/features/__init__.py +9 -0
- dclab/features/bright.py +81 -0
- dclab/features/bright_bc.py +93 -0
- dclab/features/bright_perc.py +63 -0
- dclab/features/contour.py +161 -0
- dclab/features/emodulus/__init__.py +339 -0
- dclab/features/emodulus/load.py +252 -0
- dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
- dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
- dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
- dclab/features/emodulus/pxcorr.py +135 -0
- dclab/features/emodulus/scale_linear.py +247 -0
- dclab/features/emodulus/viscosity.py +260 -0
- dclab/features/fl_crosstalk.py +95 -0
- dclab/features/inert_ratio.py +377 -0
- dclab/features/volume.py +242 -0
- dclab/http_utils.py +322 -0
- dclab/isoelastics/__init__.py +468 -0
- dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
- dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
- dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
- dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
- dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
- dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
- dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
- dclab/kde/__init__.py +1 -0
- dclab/kde/base.py +459 -0
- dclab/kde/contours.py +222 -0
- dclab/kde/methods.py +313 -0
- dclab/kde_contours.py +10 -0
- dclab/kde_methods.py +11 -0
- dclab/lme4/__init__.py +5 -0
- dclab/lme4/lme4_template.R +94 -0
- dclab/lme4/rsetup.py +204 -0
- dclab/lme4/wrapr.py +386 -0
- dclab/polygon_filter.py +398 -0
- dclab/rtdc_dataset/__init__.py +15 -0
- dclab/rtdc_dataset/check.py +902 -0
- dclab/rtdc_dataset/config.py +533 -0
- dclab/rtdc_dataset/copier.py +353 -0
- dclab/rtdc_dataset/core.py +896 -0
- dclab/rtdc_dataset/export.py +867 -0
- dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
- dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
- dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
- dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
- dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
- dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
- dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
- dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
- dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
- dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
- dclab/rtdc_dataset/feat_basin.py +762 -0
- dclab/rtdc_dataset/feat_temp.py +102 -0
- dclab/rtdc_dataset/filter.py +263 -0
- dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
- dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
- dclab/rtdc_dataset/fmt_dcor/api.py +173 -0
- dclab/rtdc_dataset/fmt_dcor/base.py +299 -0
- dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
- dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
- dclab/rtdc_dataset/fmt_dcor/tables.py +66 -0
- dclab/rtdc_dataset/fmt_dict.py +103 -0
- dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
- dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
- dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
- dclab/rtdc_dataset/fmt_hdf5/events.py +276 -0
- dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
- dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
- dclab/rtdc_dataset/fmt_hdf5/tables.py +60 -0
- dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
- dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
- dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
- dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
- dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
- dclab/rtdc_dataset/fmt_http.py +102 -0
- dclab/rtdc_dataset/fmt_s3.py +354 -0
- dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
- dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
- dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
- dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
- dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
- dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
- dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
- dclab/rtdc_dataset/load.py +77 -0
- dclab/rtdc_dataset/meta_table.py +25 -0
- dclab/rtdc_dataset/writer.py +1019 -0
- dclab/statistics.py +226 -0
- dclab/util.py +176 -0
- dclab/warn.py +15 -0
- dclab-0.67.0.dist-info/METADATA +153 -0
- dclab-0.67.0.dist-info/RECORD +142 -0
- dclab-0.67.0.dist-info/WHEEL +6 -0
- dclab-0.67.0.dist-info/entry_points.txt +8 -0
- dclab-0.67.0.dist-info/licenses/LICENSE +283 -0
- dclab-0.67.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def map_indices_child2parent(child, child_indices):
    """Translate event indices of a hierarchy child to its parent

    A hierarchy child is defined by the boolean filter array of its
    parent (`child.hparent.filter.all`). Events where that array is
    `True` are present in the child. For example, if the parent filter
    is `[False, True, False, True, ...]` (every second event kept),
    then `child_indices=[2, 3, 4]` maps to `parent_indices=[5, 7, 9]`
    (0-based indexing): index 5 in the parent is index 2 in the child.

    Parameters
    ----------
    child: RTDC_Hierarchy
        RTDCBase hierarchy child to map from
    child_indices: 1d int ndarray
        integer indices in `child`

    Returns
    -------
    parent_indices: 1d int ndarray
        integer indices in `child.hparent`
    """
    # Boolean mask in the parent defining which events form the child
    parent_filter = child.hparent.filter.all
    # Parent positions of all events that are present in the child;
    # position k of this array is the parent index of child event k.
    child_in_parent = np.where(parent_filter)[0]
    # Select the parent positions of the requested child events
    return child_in_parent[child_indices]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def map_indices_child2root(child, child_indices):
    """Translate event indices of a hierarchy child to its root ancestor

    Like :func:`map_indices_child2parent`, but climbs the whole
    hierarchy so the returned indices refer to the root dataset.

    Parameters
    ----------
    child: RTDC_Hierarchy
        RTDCBase hierarchy child to map from
    child_indices: 1d ndarray
        integer indices in `child`

    Returns
    -------
    root_indices: 1d ndarray
        integer indices in the child's root parent
        (not necessarily the indices of `child.hparent`)
    """
    # Map one level up first, then keep ascending as long as the
    # parent is itself a hierarchy child.
    indices = map_indices_child2parent(child=child,
                                       child_indices=child_indices)
    while child.hparent.format == "hierarchy":
        child = child.hparent
        indices = map_indices_child2parent(child=child,
                                           child_indices=indices)
    return indices
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def map_indices_parent2child(child, parent_indices):
    """Translate event indices of a parent to a hierarchy child

    Given a hierarchy child and indices defined for `child.hparent`,
    return the corresponding indices in `child`.

    Parameters
    ----------
    child: RTDC_Hierarchy
        RTDCBase hierarchy child to map to
    parent_indices: 1d ndarray
        integer indices in `child.hparent`

    Returns
    -------
    child_indices: 1d ndarray
        integer indices in `child`, corresponding to `parent_indices`
        in `child.hparent`
    """
    # Boolean mask in the parent that defines the child
    mask = child.hparent.filter.all
    # Parent positions of all events present in the child; entry k is
    # the parent index of child event k.
    present = np.flatnonzero(mask)
    # Child positions whose parent counterpart is in `parent_indices`
    return np.where(np.isin(present, parent_indices))[0]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def map_indices_root2child(child, root_indices):
    """Translate event indices of the root dataset to a hierarchy child

    Like :func:`map_indices_parent2child`, but accepts indices of the
    root ancestor and maps them all the way down to `child`.

    Parameters
    ----------
    child: RTDCBase
        RTDCBase hierarchy child to map to
    root_indices: 1d ndarray
        integer indices in the root parent of `child`

    Returns
    -------
    child_indices: 1d ndarray
        integer indices in `child`, corresponding to `root_indices`
        in `child`s root parent
    """
    # Collect the chain of hierarchy datasets from `child` upwards,
    # stopping at the first ancestor that is not itself a hierarchy.
    lineage = [child]
    node = child
    while node.hparent.format == "hierarchy":
        node = node.hparent
        lineage.append(node)

    # Walk back down the chain (root-most hierarchy first), mapping
    # the indices one level at a time.
    indices = root_indices
    for node in reversed(lineage):
        indices = map_indices_parent2child(child=node,
                                           parent_indices=indices)
    return indices
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
|
|
3
|
+
from ..http_utils import HTTPFile, REQUESTS_AVAILABLE, is_url_available
|
|
4
|
+
from ..http_utils import is_http_url # noqa: F401
|
|
5
|
+
|
|
6
|
+
from .feat_basin import Basin
|
|
7
|
+
from .fmt_hdf5 import RTDC_HDF5
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RTDC_HTTP(RTDC_HDF5):
    def __init__(self,
                 url: str,
                 *args, **kwargs):
        """Access RT-DC measurements via HTTP

        This class allows you to open .rtdc files accessible via an
        HTTP URL, for instance files on an S3 object storage or
        figshare download links.

        This is essentially just a wrapper around :class:`.RTDC_HDF5`
        with :class:`.HTTPFile` passing a file object to h5py.

        Parameters
        ----------
        url: str
            Full URL to an HDF5 file
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`

        Attributes
        ----------
        path: str
            The URL to the object

        Notes
        -----
        Since this format still requires random access to the file online,
        i.e. not the entire file is downloaded, only parts of it, the
        web server must support range requests.
        """
        if not REQUESTS_AVAILABLE:
            raise ModuleNotFoundError(
                f"Package `requests` required for loading http data '{url}'!")

        self._fhttp = HTTPFile(url)
        if kwargs.get("identifier") is None:
            if self._fhttp.etag is not None:
                # Set the HTTP ETag as the identifier, it doesn't get
                # more unique than that!
                kwargs["identifier"] = self._fhttp.etag
            else:
                # Compute a hash of the first data chunk
                kwargs["identifier"] = hashlib.md5(
                    self._fhttp.get_cache_chunk(0)).hexdigest()

        try:
            # Initialize the HDF5 dataset
            super(RTDC_HTTP, self).__init__(
                h5path=self._fhttp,
                *args,
                **kwargs)
        except BaseException:
            # Bug fix: do not leak the open HTTP file object (and its
            # underlying session) when HDF5 initialization fails.
            self._fhttp.close()
            raise
        # Override self.path with the actual HTTP URL
        self.path = url

    def close(self):
        # Close the HDF5 layer first, then the underlying HTTP file.
        super(RTDC_HTTP, self).close()
        self._fhttp.close()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class HTTPBasin(Basin):
    """Basin describing a dataset reachable via plain HTTP"""
    basin_format = "http"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        # Tri-state availability cache: None (not yet determined),
        # True, or False.
        self._available_verified = None
        super(HTTPBasin, self).__init__(*args, **kwargs)

    def _load_dataset(self, location, **kwargs):
        # Open the remote HDF5 file via the HTTP wrapper.
        return RTDC_HTTP(location, **kwargs)

    def is_available(self):
        """Check for `requests` and object availability

        Caching policy: Once this method returns True, it will always
        return True.
        """
        if self._available_verified is not None:
            return self._available_verified
        with self._av_check_lock:
            if not REQUESTS_AVAILABLE:
                # without `requests` there is nothing to check
                self._available_verified = False
            else:
                avail, reason = is_url_available(self.location,
                                                 ret_reason=True)
                if reason in ["forbidden", "not found"]:
                    # we cannot access the URL in the near future
                    self._available_verified = False
                elif avail:
                    self._available_verified = True
        return self._available_verified
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
import functools
|
|
2
|
+
# import multiprocessing BaseManager here, because there is some kind
|
|
3
|
+
# of circular dependency issue with s3transfer.compat and multiprocessing.
|
|
4
|
+
from multiprocessing.managers import BaseManager # noqa: F401
|
|
5
|
+
import os
|
|
6
|
+
import pathlib
|
|
7
|
+
import re
|
|
8
|
+
import socket
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
import warnings
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
try:
|
|
14
|
+
import boto3
|
|
15
|
+
import botocore
|
|
16
|
+
import botocore.client
|
|
17
|
+
import botocore.exceptions
|
|
18
|
+
import botocore.session
|
|
19
|
+
except ModuleNotFoundError:
|
|
20
|
+
BOTO3_AVAILABLE = False
|
|
21
|
+
else:
|
|
22
|
+
BOTO3_AVAILABLE = True
|
|
23
|
+
|
|
24
|
+
from ..http_utils import HTTPFile
|
|
25
|
+
|
|
26
|
+
from .feat_basin import Basin
|
|
27
|
+
|
|
28
|
+
from .fmt_hdf5 import RTDC_HDF5
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
#: Regular expression for matching a full S3 object URL
REGEXP_S3_URL = re.compile(
    r"^(https?:\/\/)"  # protocol (http or https or omitted)
    r"([a-z0-9-\.]*)(\:[0-9]*)?\/"  # host:port
    r".+\/"  # bucket
    r".+"  # key
)
#: Regular expression for matching a bare `bucket_name/object_key` string
REGEXP_S3_BUCKET_KEY = re.compile(r"^[0-9a-z-]+(\/[0-9a-z-]+)+$")

# Default S3 endpoint and credentials taken from the environment at
# import time (each may be None if the variable is not set).
S3_ENDPOINT_URL = os.environ.get("DCLAB_S3_ENDPOINT_URL")
S3_ACCESS_KEY_ID = os.environ.get("DCLAB_S3_ACCESS_KEY_ID")
S3_SECRET_ACCESS_KEY = os.environ.get("DCLAB_S3_SECRET_ACCESS_KEY")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@functools.lru_cache(maxsize=1000)
def get_s3_session_client(access_key_id: str,
                          secret_access_key: str,
                          use_ssl: bool,
                          verify_ssl: bool,
                          endpoint_url: str
                          ):
    """Return a (botocore session, boto3 session, S3 client) triple

    Results are memoized with ``functools.lru_cache`` so that repeated
    calls with identical credentials and endpoint reuse the same
    session and client objects.
    """
    core_session = botocore.session.get_session()
    session = boto3.Session(
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        botocore_session=core_session)
    client = session.client(
        service_name='s3',
        use_ssl=use_ssl,
        verify=verify_ssl,
        endpoint_url=endpoint_url,
    )
    return core_session, session, client
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class S3File(HTTPFile):
    """Monkeypatched `HTTPFile` to support authenticated access to S3"""

    def __init__(self,
                 object_path: str,
                 endpoint_url: str,
                 access_key_id: str = "",
                 secret_access_key: str = "",
                 use_ssl: bool = True,
                 verify_ssl: bool = True):
        """

        Parameters
        ----------
        object_path: str
            bucket/key path to object in the object store
        endpoint_url: str
            the explicit endpoint URL for accessing the object store
        access_key_id:
            S3 access key
        secret_access_key:
            secret S3 key matching `access_key_id`
        use_ssl: bool
            use SSL to connect to the endpoint, only disabled for testing
        verify_ssl: bool
            make sure the SSL certificate is sound, only used for testing
        """
        if endpoint_url is None:
            raise ValueError(
                "The S3 endpoint URL is empty. This could mean that you did "
                "not specify the full S3 URL or that you forgot to set "
                "the `S3_ENDPOINT_URL` environment variable.")
        endpoint_url = endpoint_url.strip().rstrip("/")

        (self.botocore_session,
         self.s3_session,
         self.s3_client) = get_s3_session_client(
            access_key_id=access_key_id,
            secret_access_key=secret_access_key,
            use_ssl=use_ssl,
            verify_ssl=verify_ssl,
            endpoint_url=endpoint_url,
        )

        if secret_access_key:
            config = None
        else:
            # Without credentials, use an unsigned configuration that
            # allows anonymous access
            # https://stackoverflow.com/a/34866092
            config = botocore.client.Config(
                signature_version=botocore.UNSIGNED,
                region_name='us-east-1')

        self.s3_resource = self.s3_session.resource(
            service_name="s3",
            use_ssl=use_ssl,
            verify=verify_ssl,
            endpoint_url=endpoint_url,
            config=config)

        # `object_path` is "bucket/key..."; split off the bucket name.
        bucket_name, object_name = object_path.strip("/").split("/", 1)
        self.s3_object = self.s3_resource.Object(
            bucket_name=bucket_name,
            key=object_name)

        super(S3File, self).__init__(f"{endpoint_url}/{object_path}")

    def _parse_header(self):
        # Fetch size and ETag lazily from the S3 object metadata.
        if self._len is None:
            self._len = self.s3_object.content_length
            self._etag = self.s3_object.e_tag

    def close(self):
        super(S3File, self).close()
        self.s3_client.close()

    def download_range(self, start, stop):
        """Download bytes in the half-open interval [`start`, `stop`)

        `stop` is not inclusive (In the HTTP range request it normally is).
        """
        body = self.s3_object.get(Range=f"bytes={start}-{stop - 1}")['Body']
        return body.read()
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class RTDC_S3(RTDC_HDF5):
    def __init__(self,
                 url: str,
                 endpoint_url: str = None,
                 access_key_id: str = None,
                 secret_access_key: str = None,
                 use_ssl: bool = True,
                 *args, **kwargs):
        """Access RT-DC measurements in an S3-compatible object store

        This is essentially just a wrapper around :class:`.RTDC_HDF5`
        with :mod:`boto3` and :class:`.HTTPFile` passing a file object to h5py.

        Parameters
        ----------
        url: str
            URL to an object in an S3 instance; this can be either a full
            URL (including the endpoint), or just `bucket/key`
        endpoint_url: str
            Explicit endpoint URL; if not given, it is derived from
            `url` or taken from the environment
        access_key_id: str
            S3 access identifier
        secret_access_key: str
            Secret S3 access key
        use_ssl: bool
            Whether to enforce SSL (defaults to True)
        *args:
            Arguments for `RTDCBase`
        **kwargs:
            Keyword arguments for `RTDCBase`

        Attributes
        ----------
        path: str
            The URL to the object
        """
        if not BOTO3_AVAILABLE:
            raise ModuleNotFoundError(
                f"Package `boto3` required for loading S3 data '{url}'!")

        # Resolve the endpoint and credentials: explicit argument first,
        # then module-level defaults, then the current environment.
        final_endpoint = (endpoint_url
                          or get_endpoint_url(url)
                          or S3_ENDPOINT_URL
                          or os.environ.get("DCLAB_S3_ENDPOINT_URL"))
        final_key_id = (access_key_id
                        or S3_ACCESS_KEY_ID
                        or os.environ.get("DCLAB_S3_ACCESS_KEY_ID")
                        or "")
        final_secret = (secret_access_key
                        or S3_SECRET_ACCESS_KEY
                        or os.environ.get("DCLAB_S3_SECRET_ACCESS_KEY")
                        or "")

        self._s3file = S3File(
            object_path=get_object_path(url),
            endpoint_url=final_endpoint,
            access_key_id=final_key_id,
            secret_access_key=final_secret,
            use_ssl=use_ssl,
            verify_ssl=use_ssl,
        )
        # Initialize the HDF5 dataset
        super(RTDC_S3, self).__init__(
            h5path=self._s3file,
            *args,
            **kwargs)
        # Override self.path with the actual S3 URL
        self.path = self._s3file.url

    def close(self):
        # Close the HDF5 layer first, then the underlying S3 file.
        super(RTDC_S3, self).close()
        self._s3file.close()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
class S3Basin(Basin):
    """Basin describing a dataset stored in an S3 object store"""
    basin_format = "s3"
    basin_type = "remote"

    def __init__(self, *args, **kwargs):
        # Tri-state availability cache: None (not yet determined),
        # True, or False.
        self._available_verified = None
        super(S3Basin, self).__init__(*args, **kwargs)

    def _load_dataset(self, location, **kwargs):
        # Open the remote HDF5 file via the S3 wrapper.
        return RTDC_S3(location, **kwargs)

    def is_available(self):
        """Check for boto3 and object availability

        Caching policy: Once this method returns True, it will always
        return True.
        """
        if self._available_verified is not None:
            return self._available_verified
        with self._av_check_lock:
            if not BOTO3_AVAILABLE:
                self._available_verified = False
            else:
                self._available_verified = \
                    is_s3_object_available(self.location)
        return self._available_verified
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def is_s3_object_available(url: str,
                           access_key_id: str = None,
                           secret_access_key: str = None,
                           ):
    """Check whether an S3 object is available

    Parameters
    ----------
    url: str
        full URL to the object
    access_key_id: str
        S3 access identifier
    secret_access_key: str
        Secret S3 access key
    """
    if not is_s3_url(url):
        return False

    endpoint_url = (get_endpoint_url(url)
                    or S3_ENDPOINT_URL
                    or os.environ.get("DCLAB_S3_ENDPOINT_URL")
                    )
    if not endpoint_url:
        warnings.warn(
            f"Could not determine endpoint from URL '{url}'. Please "
            f"set the `S3_ENDPOINT_URL` environment variable or pass "
            f"a full object URL.")
        return False

    # default to https if no scheme or port is specified
    urlp = urlparse(endpoint_url)
    port = urlp.port or (80 if urlp.scheme == "http" else 443)

    # Cheap connectivity probe before attempting any S3 requests.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.settimeout(1)
        try:
            # Use `hostname`, not `netloc`, because `netloc` contains
            # the port number which we do not want here.
            probe.connect((urlp.hostname, port))
        except (socket.gaierror, OSError):
            # Host unreachable or name resolution failed.
            return False

    # The host is reachable; try to access the object itself.
    s3file = S3File(
        object_path=get_object_path(url),
        endpoint_url=endpoint_url,
        access_key_id=(
            access_key_id
            or S3_ACCESS_KEY_ID
            or os.environ.get("DCLAB_S3_ACCESS_KEY_ID")
            or ""
        ),
        secret_access_key=(
            secret_access_key
            or S3_SECRET_ACCESS_KEY
            or os.environ.get("DCLAB_S3_SECRET_ACCESS_KEY")
            or ""
        ),
    )
    try:
        s3file.s3_object.load()
    except botocore.exceptions.ClientError:
        return False
    return True
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
@functools.lru_cache()
def get_endpoint_url(url):
    """Given a URL of an S3 object, return the endpoint URL

    Return None if no endpoint URL can be extracted (e.g. because
    just `bucket_name/object_path` was passed).
    """
    parsed = urlparse(url=url)
    if not parsed.hostname:
        # e.g. a bare `bucket/key` string carries no host information
        return None
    # Default to https (and its port) when the scheme is omitted.
    scheme = parsed.scheme or "https"
    default_port = 80 if scheme == "http" else 443
    return f"{scheme}://{parsed.hostname}:{parsed.port or default_port}"
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
@functools.lru_cache()
def get_object_path(url):
    """Given a URL of an S3 object, return the `bucket_name/object_path` part

    Return object paths always without leading slash `/`.
    """
    object_path = urlparse(url=url).path
    return object_path.lstrip("/")
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
@functools.lru_cache()
def is_s3_url(string):
    """Check whether `string` is a valid S3 URL using regexp"""
    if not isinstance(string, str):
        return False
    candidate = string.strip()
    if REGEXP_S3_URL.match(candidate):
        # full URL with scheme/host — unambiguous
        return True
    if pathlib.Path(string).exists():
        # this is actually a file on disk
        return False
    if REGEXP_S3_BUCKET_KEY.match(candidate):
        # bare `bucket_name/key` form
        return True
    return False
|