dclab-0.67.0-cp314-cp314t-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dclab might be problematic.

Files changed (142)
  1. dclab/__init__.py +41 -0
  2. dclab/_version.py +34 -0
  3. dclab/cached.py +97 -0
  4. dclab/cli/__init__.py +10 -0
  5. dclab/cli/common.py +237 -0
  6. dclab/cli/task_compress.py +126 -0
  7. dclab/cli/task_condense.py +223 -0
  8. dclab/cli/task_join.py +229 -0
  9. dclab/cli/task_repack.py +98 -0
  10. dclab/cli/task_split.py +154 -0
  11. dclab/cli/task_tdms2rtdc.py +186 -0
  12. dclab/cli/task_verify_dataset.py +75 -0
  13. dclab/definitions/__init__.py +79 -0
  14. dclab/definitions/feat_const.py +202 -0
  15. dclab/definitions/feat_logic.py +182 -0
  16. dclab/definitions/meta_const.py +252 -0
  17. dclab/definitions/meta_logic.py +111 -0
  18. dclab/definitions/meta_parse.py +94 -0
  19. dclab/downsampling.cpython-314t-darwin.so +0 -0
  20. dclab/downsampling.pyx +230 -0
  21. dclab/external/__init__.py +4 -0
  22. dclab/external/packaging/LICENSE +3 -0
  23. dclab/external/packaging/LICENSE.APACHE +177 -0
  24. dclab/external/packaging/LICENSE.BSD +23 -0
  25. dclab/external/packaging/__init__.py +6 -0
  26. dclab/external/packaging/_structures.py +61 -0
  27. dclab/external/packaging/version.py +505 -0
  28. dclab/external/skimage/LICENSE +28 -0
  29. dclab/external/skimage/__init__.py +2 -0
  30. dclab/external/skimage/_find_contours.py +216 -0
  31. dclab/external/skimage/_find_contours_cy.cpython-314t-darwin.so +0 -0
  32. dclab/external/skimage/_find_contours_cy.pyx +188 -0
  33. dclab/external/skimage/_pnpoly.cpython-314t-darwin.so +0 -0
  34. dclab/external/skimage/_pnpoly.pyx +99 -0
  35. dclab/external/skimage/_shared/__init__.py +1 -0
  36. dclab/external/skimage/_shared/geometry.cpython-314t-darwin.so +0 -0
  37. dclab/external/skimage/_shared/geometry.pxd +6 -0
  38. dclab/external/skimage/_shared/geometry.pyx +55 -0
  39. dclab/external/skimage/measure.py +7 -0
  40. dclab/external/skimage/pnpoly.py +53 -0
  41. dclab/external/statsmodels/LICENSE +35 -0
  42. dclab/external/statsmodels/__init__.py +6 -0
  43. dclab/external/statsmodels/nonparametric/__init__.py +1 -0
  44. dclab/external/statsmodels/nonparametric/_kernel_base.py +203 -0
  45. dclab/external/statsmodels/nonparametric/kernel_density.py +165 -0
  46. dclab/external/statsmodels/nonparametric/kernels.py +36 -0
  47. dclab/features/__init__.py +9 -0
  48. dclab/features/bright.py +81 -0
  49. dclab/features/bright_bc.py +93 -0
  50. dclab/features/bright_perc.py +63 -0
  51. dclab/features/contour.py +161 -0
  52. dclab/features/emodulus/__init__.py +339 -0
  53. dclab/features/emodulus/load.py +252 -0
  54. dclab/features/emodulus/lut_HE-2D-FEM-22.txt +16432 -0
  55. dclab/features/emodulus/lut_HE-3D-FEM-22.txt +1276 -0
  56. dclab/features/emodulus/lut_LE-2D-FEM-19.txt +13082 -0
  57. dclab/features/emodulus/pxcorr.py +135 -0
  58. dclab/features/emodulus/scale_linear.py +247 -0
  59. dclab/features/emodulus/viscosity.py +260 -0
  60. dclab/features/fl_crosstalk.py +95 -0
  61. dclab/features/inert_ratio.py +377 -0
  62. dclab/features/volume.py +242 -0
  63. dclab/http_utils.py +322 -0
  64. dclab/isoelastics/__init__.py +468 -0
  65. dclab/isoelastics/iso_HE-2D-FEM-22-area_um-deform.txt +2440 -0
  66. dclab/isoelastics/iso_HE-2D-FEM-22-volume-deform.txt +2635 -0
  67. dclab/isoelastics/iso_HE-3D-FEM-22-area_um-deform.txt +1930 -0
  68. dclab/isoelastics/iso_HE-3D-FEM-22-volume-deform.txt +2221 -0
  69. dclab/isoelastics/iso_LE-2D-FEM-19-area_um-deform.txt +2151 -0
  70. dclab/isoelastics/iso_LE-2D-FEM-19-volume-deform.txt +2250 -0
  71. dclab/isoelastics/iso_LE-2D-ana-18-area_um-deform.txt +1266 -0
  72. dclab/kde/__init__.py +1 -0
  73. dclab/kde/base.py +459 -0
  74. dclab/kde/contours.py +222 -0
  75. dclab/kde/methods.py +313 -0
  76. dclab/kde_contours.py +10 -0
  77. dclab/kde_methods.py +11 -0
  78. dclab/lme4/__init__.py +5 -0
  79. dclab/lme4/lme4_template.R +94 -0
  80. dclab/lme4/rsetup.py +204 -0
  81. dclab/lme4/wrapr.py +386 -0
  82. dclab/polygon_filter.py +398 -0
  83. dclab/rtdc_dataset/__init__.py +15 -0
  84. dclab/rtdc_dataset/check.py +902 -0
  85. dclab/rtdc_dataset/config.py +533 -0
  86. dclab/rtdc_dataset/copier.py +353 -0
  87. dclab/rtdc_dataset/core.py +896 -0
  88. dclab/rtdc_dataset/export.py +867 -0
  89. dclab/rtdc_dataset/feat_anc_core/__init__.py +24 -0
  90. dclab/rtdc_dataset/feat_anc_core/af_basic.py +75 -0
  91. dclab/rtdc_dataset/feat_anc_core/af_emodulus.py +160 -0
  92. dclab/rtdc_dataset/feat_anc_core/af_fl_max_ctc.py +133 -0
  93. dclab/rtdc_dataset/feat_anc_core/af_image_contour.py +113 -0
  94. dclab/rtdc_dataset/feat_anc_core/af_ml_class.py +102 -0
  95. dclab/rtdc_dataset/feat_anc_core/ancillary_feature.py +320 -0
  96. dclab/rtdc_dataset/feat_anc_ml/__init__.py +32 -0
  97. dclab/rtdc_dataset/feat_anc_plugin/__init__.py +3 -0
  98. dclab/rtdc_dataset/feat_anc_plugin/plugin_feature.py +329 -0
  99. dclab/rtdc_dataset/feat_basin.py +762 -0
  100. dclab/rtdc_dataset/feat_temp.py +102 -0
  101. dclab/rtdc_dataset/filter.py +263 -0
  102. dclab/rtdc_dataset/fmt_dcor/__init__.py +7 -0
  103. dclab/rtdc_dataset/fmt_dcor/access_token.py +52 -0
  104. dclab/rtdc_dataset/fmt_dcor/api.py +173 -0
  105. dclab/rtdc_dataset/fmt_dcor/base.py +299 -0
  106. dclab/rtdc_dataset/fmt_dcor/basin.py +73 -0
  107. dclab/rtdc_dataset/fmt_dcor/logs.py +26 -0
  108. dclab/rtdc_dataset/fmt_dcor/tables.py +66 -0
  109. dclab/rtdc_dataset/fmt_dict.py +103 -0
  110. dclab/rtdc_dataset/fmt_hdf5/__init__.py +6 -0
  111. dclab/rtdc_dataset/fmt_hdf5/base.py +192 -0
  112. dclab/rtdc_dataset/fmt_hdf5/basin.py +30 -0
  113. dclab/rtdc_dataset/fmt_hdf5/events.py +276 -0
  114. dclab/rtdc_dataset/fmt_hdf5/feat_defect.py +164 -0
  115. dclab/rtdc_dataset/fmt_hdf5/logs.py +33 -0
  116. dclab/rtdc_dataset/fmt_hdf5/tables.py +60 -0
  117. dclab/rtdc_dataset/fmt_hierarchy/__init__.py +11 -0
  118. dclab/rtdc_dataset/fmt_hierarchy/base.py +278 -0
  119. dclab/rtdc_dataset/fmt_hierarchy/events.py +146 -0
  120. dclab/rtdc_dataset/fmt_hierarchy/hfilter.py +140 -0
  121. dclab/rtdc_dataset/fmt_hierarchy/mapper.py +134 -0
  122. dclab/rtdc_dataset/fmt_http.py +102 -0
  123. dclab/rtdc_dataset/fmt_s3.py +354 -0
  124. dclab/rtdc_dataset/fmt_tdms/__init__.py +476 -0
  125. dclab/rtdc_dataset/fmt_tdms/event_contour.py +264 -0
  126. dclab/rtdc_dataset/fmt_tdms/event_image.py +220 -0
  127. dclab/rtdc_dataset/fmt_tdms/event_mask.py +62 -0
  128. dclab/rtdc_dataset/fmt_tdms/event_trace.py +146 -0
  129. dclab/rtdc_dataset/fmt_tdms/exc.py +37 -0
  130. dclab/rtdc_dataset/fmt_tdms/naming.py +151 -0
  131. dclab/rtdc_dataset/load.py +77 -0
  132. dclab/rtdc_dataset/meta_table.py +25 -0
  133. dclab/rtdc_dataset/writer.py +1019 -0
  134. dclab/statistics.py +226 -0
  135. dclab/util.py +176 -0
  136. dclab/warn.py +15 -0
  137. dclab-0.67.0.dist-info/METADATA +153 -0
  138. dclab-0.67.0.dist-info/RECORD +142 -0
  139. dclab-0.67.0.dist-info/WHEEL +6 -0
  140. dclab-0.67.0.dist-info/entry_points.txt +8 -0
  141. dclab-0.67.0.dist-info/licenses/LICENSE +283 -0
  142. dclab-0.67.0.dist-info/top_level.txt +1 -0
dclab/rtdc_dataset/fmt_hierarchy/mapper.py
@@ -0,0 +1,134 @@
+ import numpy as np
+
+
+ def map_indices_child2parent(child, child_indices):
+     """Map child RTDCBase event indices to parent RTDCBase
+
+     Given a hierarchy child and indices defined for that child,
+     return the corresponding indices for its parent.
+
+     For instance, a child is defined in such a way that it
+     has every second event of its parent (`parent.filter.all[::2]=False`,
+     i.e. the filtering array is `[False, True, False, ...]`). When passing
+     `child_indices=[2,3,4]`, the return value of this method would be
+     `parent_indices=[5,7,9]` (indexing starts at 0). Index 5 in the
+     parent dataset corresponds to index 2 in the child dataset.
+
+     Parameters
+     ----------
+     child: RTDC_Hierarchy
+         RTDCBase hierarchy child to map from
+     child_indices: 1d int ndarray
+         integer indices in `child`
+
+     Returns
+     -------
+     parent_indices: 1d int ndarray
+         integer indices in `child.hparent`
+     """
+     parent = child.hparent
+     # filters
+     pf = parent.filter.all
+     # indices corresponding to all child events
+     idx = np.where(pf)[0]  # True means present in the child
+     # indices corresponding to selected child events
+     parent_indices = idx[child_indices]
+     return parent_indices
+
+
+ def map_indices_child2root(child, child_indices):
+     """Map RTDC_Hierarchy event indices to root RTDCBase
+
+     Like :func:`map_indices_child2parent`, but maps the
+     child indices to the root parent.
+
+     Parameters
+     ----------
+     child: RTDC_Hierarchy
+         RTDCBase hierarchy child to map from
+     child_indices: 1d ndarray
+         integer indices in `child`
+
+     Returns
+     -------
+     root_indices: 1d ndarray
+         integer indices in the child's root parent
+         (not necessarily the indices of `child.hparent`)
+     """
+     while True:
+         indices = map_indices_child2parent(child=child,
+                                            child_indices=child_indices)
+         if child.hparent.format == "hierarchy":
+             child = child.hparent
+             child_indices = indices
+         else:
+             break
+     return indices
+
+
+ def map_indices_parent2child(child, parent_indices):
+     """Map parent RTDCBase event indices to RTDC_Hierarchy child
+
+     Given a hierarchy child and indices defined for its `child.hparent`,
+     return the corresponding indices for the `child`.
+
+     Parameters
+     ----------
+     child: RTDC_Hierarchy
+         RTDCBase hierarchy child to map to
+     parent_indices: 1d ndarray
+         integer indices in `child.hparent`
+
+     Returns
+     -------
+     child_indices: 1d ndarray
+         integer indices in `child`, corresponding to `parent_indices`
+         in `child.hparent`
+     """
+     parent = child.hparent
+     # this boolean array defines `child` in the parent
+     pf = parent.filter.all
+     # all event indices in parent that define `child`
+     pf_loc = np.where(pf)[0]
+     # boolean array with size `len(child)` indicating where the
+     # `parent_indices` are set.
+     same = np.isin(pf_loc, parent_indices)
+     return np.where(same)[0]
+
+
+ def map_indices_root2child(child, root_indices):
+     """Map root RTDCBase event indices to RTDC_Hierarchy child
+
+     Like :func:`map_indices_parent2child`, but accepts
+     `root_indices` and maps them to `child`.
+
+     Parameters
+     ----------
+     child: RTDCBase
+         RTDCBase hierarchy child to map to
+     root_indices: 1d ndarray
+         integer indices in the root parent of `child`
+
+     Returns
+     -------
+     child_indices: 1d ndarray
+         integer indices in `child`, corresponding to `root_indices`
+         in `child`'s root parent
+     """
+     # construct hierarchy tree containing only RTDC_Hierarchy instances
+     hierarchy = [child]
+     while True:
+         if child.hparent.format == "hierarchy":
+             # the parent is a hierarchy tree
+             hierarchy.append(child.hparent)
+             child = child.hparent
+         else:
+             break
+
+     indices = root_indices
+     for hp in hierarchy[::-1]:  # reverse order
+         # For each hierarchy parent, map the indices down the
+         # hierarchy tree.
+         indices = map_indices_parent2child(child=hp,
+                                            parent_indices=indices)
+     return indices
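The mapping logic above can be checked with a plain boolean array standing in for `parent.filter.all`. This is a minimal sketch (the filter array and index values are made up to mirror the docstring example), not part of the package:

    import numpy as np

    # Stand-in for `parent.filter.all`: keep every second parent event,
    # i.e. the child consists of parent events 1, 3, 5, 7, 9, 11.
    pf = np.zeros(12, dtype=bool)
    pf[1::2] = True

    # child -> parent, same logic as map_indices_child2parent()
    idx = np.where(pf)[0]            # parent indices of all child events
    parent_indices = idx[[2, 3, 4]]  # -> array([5, 7, 9])

    # parent -> child, same logic as map_indices_parent2child()
    child_indices = np.where(np.isin(idx, parent_indices))[0]  # -> array([2, 3, 4])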
dclab/rtdc_dataset/fmt_http.py
@@ -0,0 +1,102 @@
+ import hashlib
+
+ from ..http_utils import HTTPFile, REQUESTS_AVAILABLE, is_url_available
+ from ..http_utils import is_http_url  # noqa: F401
+
+ from .feat_basin import Basin
+ from .fmt_hdf5 import RTDC_HDF5
+
+
+ class RTDC_HTTP(RTDC_HDF5):
+     def __init__(self,
+                  url: str,
+                  *args, **kwargs):
+         """Access RT-DC measurements via HTTP
+
+         This class allows you to open .rtdc files accessible via an
+         HTTP URL, for instance files on an S3 object storage or
+         figshare download links.
+
+         This is essentially just a wrapper around :class:`.RTDC_HDF5`
+         with :class:`.HTTPFile` passing a file object to h5py.
+
+         Parameters
+         ----------
+         url: str
+             Full URL to an HDF5 file
+         *args:
+             Arguments for `RTDCBase`
+         **kwargs:
+             Keyword arguments for `RTDCBase`
+
+         Attributes
+         ----------
+         path: str
+             The URL to the object
+
+         Notes
+         -----
+         Since this format requires random access to the file online
+         (only parts of the file are downloaded, not the entire file),
+         the web server must support range requests.
+         """
+         if not REQUESTS_AVAILABLE:
+             raise ModuleNotFoundError(
+                 f"Package `requests` required for loading http data '{url}'!")
+
+         self._fhttp = HTTPFile(url)
+         if kwargs.get("identifier") is None:
+             if self._fhttp.etag is not None:
+                 # Set the HTTP ETag as the identifier, it doesn't get
+                 # more unique than that!
+                 kwargs["identifier"] = self._fhttp.etag
+             else:
+                 # Compute a hash of the first data chunk
+                 kwargs["identifier"] = hashlib.md5(
+                     self._fhttp.get_cache_chunk(0)).hexdigest()
+
+         # Initialize the HDF5 dataset
+         super(RTDC_HTTP, self).__init__(
+             h5path=self._fhttp,
+             *args,
+             **kwargs)
+         # Override self.path with the actual HTTP URL
+         self.path = url
+
+     def close(self):
+         super(RTDC_HTTP, self).close()
+         self._fhttp.close()
+
+
+ class HTTPBasin(Basin):
+     basin_format = "http"
+     basin_type = "remote"
+
+     def __init__(self, *args, **kwargs):
+         self._available_verified = None
+         super(HTTPBasin, self).__init__(*args, **kwargs)
+
+     def _load_dataset(self, location, **kwargs):
+         h5file = RTDC_HTTP(location, **kwargs)
+         return h5file
+
+     def is_available(self):
+         """Check for `requests` and object availability
+
+         Caching policy: Once this method returns True, it will always
+         return True.
+         """
+         if self._available_verified is None:
+             with self._av_check_lock:
+                 if not REQUESTS_AVAILABLE:
+                     # don't even bother
+                     self._available_verified = False
+                 else:
+                     avail, reason = is_url_available(self.location,
+                                                      ret_reason=True)
+                     if reason in ["forbidden", "not found"]:
+                         # we cannot access the URL in the near future
+                         self._available_verified = False
+                     elif avail:
+                         self._available_verified = True
+         return self._available_verified
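A minimal usage sketch for this format, assuming `requests` is installed, the placeholder URL points to an .rtdc file, and the server supports range requests; `dclab.new_dataset` should dispatch plain HTTP(S) URLs to `RTDC_HTTP`:

    import dclab

    # hypothetical URL to an .rtdc file served with range-request support
    url = "https://example.com/data/measurement.rtdc"
    with dclab.new_dataset(url) as ds:
        print(len(ds))      # number of events
        print(ds.features)  # available features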
dclab/rtdc_dataset/fmt_s3.py
@@ -0,0 +1,354 @@
+ import functools
+ # import multiprocessing BaseManager here, because there is some kind
+ # of circular dependency issue with s3transfer.compat and multiprocessing.
+ from multiprocessing.managers import BaseManager  # noqa: F401
+ import os
+ import pathlib
+ import re
+ import socket
+ from urllib.parse import urlparse
+ import warnings
+
+
+ try:
+     import boto3
+     import botocore
+     import botocore.client
+     import botocore.exceptions
+     import botocore.session
+ except ModuleNotFoundError:
+     BOTO3_AVAILABLE = False
+ else:
+     BOTO3_AVAILABLE = True
+
+ from ..http_utils import HTTPFile
+
+ from .feat_basin import Basin
+
+ from .fmt_hdf5 import RTDC_HDF5
+
+
+ #: Regular expression for matching a full S3 URL
+ REGEXP_S3_URL = re.compile(
+     r"^(https?:\/\/)"  # protocol (http or https or omitted)
+     r"([a-z0-9-\.]*)(\:[0-9]*)?\/"  # host:port
+     r".+\/"  # bucket
+     r".+"  # key
+ )
+ REGEXP_S3_BUCKET_KEY = re.compile(r"^[0-9a-z-]+(\/[0-9a-z-]+)+$")
+
+ S3_ENDPOINT_URL = os.environ.get("DCLAB_S3_ENDPOINT_URL")
+ S3_ACCESS_KEY_ID = os.environ.get("DCLAB_S3_ACCESS_KEY_ID")
+ S3_SECRET_ACCESS_KEY = os.environ.get("DCLAB_S3_SECRET_ACCESS_KEY")
+
+
+ @functools.lru_cache(maxsize=1000)
+ def get_s3_session_client(access_key_id: str,
+                           secret_access_key: str,
+                           use_ssl: bool,
+                           verify_ssl: bool,
+                           endpoint_url: str
+                           ):
+     botocore_session = botocore.session.get_session()
+     s3_session = boto3.Session(
+         aws_access_key_id=access_key_id,
+         aws_secret_access_key=secret_access_key,
+         botocore_session=botocore_session)
+     s3_client = s3_session.client(
+         service_name='s3',
+         use_ssl=use_ssl,
+         verify=verify_ssl,
+         endpoint_url=endpoint_url,
+     )
+     return botocore_session, s3_session, s3_client
+
+
+ class S3File(HTTPFile):
+     """Monkeypatched `HTTPFile` to support authenticated access to S3"""
+     def __init__(self,
+                  object_path: str,
+                  endpoint_url: str,
+                  access_key_id: str = "",
+                  secret_access_key: str = "",
+                  use_ssl: bool = True,
+                  verify_ssl: bool = True):
+         """
+
+         Parameters
+         ----------
+         object_path: str
+             bucket/key path to object in the object store
+         endpoint_url: str
+             the explicit endpoint URL for accessing the object store
+         access_key_id:
+             S3 access key
+         secret_access_key:
+             secret S3 key matching `access_key_id`
+         use_ssl: bool
+             use SSL to connect to the endpoint, only disabled for testing
+         verify_ssl: bool
+             make sure the SSL certificate is sound, only used for testing
+         """
+         if endpoint_url is None:
+             raise ValueError(
+                 "The S3 endpoint URL is empty. This could mean that you did "
+                 "not specify the full S3 URL or that you forgot to set "
+                 "the `S3_ENDPOINT_URL` environment variable.")
+         endpoint_url = endpoint_url.strip().rstrip("/")
+         self.botocore_session, self.s3_session, self.s3_client = \
+             get_s3_session_client(
+                 access_key_id=access_key_id,
+                 secret_access_key=secret_access_key,
+                 use_ssl=use_ssl,
+                 verify_ssl=verify_ssl,
+                 endpoint_url=endpoint_url,
+             )
+
+         # Use a configuration that allows anonymous access
+         # https://stackoverflow.com/a/34866092
+         if not secret_access_key:
+             config = botocore.client.Config(
+                 signature_version=botocore.UNSIGNED,
+                 region_name='us-east-1')
+         else:
+             config = None
+
+         self.s3_resource = self.s3_session.resource(
+             service_name="s3",
+             use_ssl=use_ssl,
+             verify=verify_ssl,
+             endpoint_url=endpoint_url,
+             config=config)
+
+         bucket_name, object_name = object_path.strip("/").split("/", 1)
+         self.s3_object = self.s3_resource.Object(
+             bucket_name=bucket_name,
+             key=object_name)
+
+         super(S3File, self).__init__(f"{endpoint_url}/{object_path}")
+
+     def _parse_header(self):
+         if self._len is None:
+             self._len = self.s3_object.content_length
+             self._etag = self.s3_object.e_tag
+
+     def close(self):
+         super(S3File, self).close()
+         self.s3_client.close()
+
+     def download_range(self, start, stop):
+         """Download bytes given by the range (`start`, `stop`)
+
+         `stop` is not inclusive (in the HTTP range request it normally is).
+         """
+         stream = self.s3_object.get(Range=f"bytes={start}-{stop-1}")['Body']
+         return stream.read()
+
+
+ class RTDC_S3(RTDC_HDF5):
+     def __init__(self,
+                  url: str,
+                  endpoint_url: str = None,
+                  access_key_id: str = None,
+                  secret_access_key: str = None,
+                  use_ssl: bool = True,
+                  *args, **kwargs):
+         """Access RT-DC measurements in an S3-compatible object store
+
+         This is essentially just a wrapper around :class:`.RTDC_HDF5`
+         with :mod:`boto3` and :class:`.HTTPFile` passing a file object to h5py.
+
+         Parameters
+         ----------
+         url: str
+             URL to an object in an S3 instance; this can be either a full
+             URL (including the endpoint), or just `bucket/key`
+         access_key_id: str
+             S3 access identifier
+         secret_access_key: str
+             Secret S3 access key
+         use_ssl: bool
+             Whether to enforce SSL (defaults to True)
+         *args:
+             Arguments for `RTDCBase`
+         **kwargs:
+             Keyword arguments for `RTDCBase`
+
+         Attributes
+         ----------
+         path: str
+             The URL to the object
+         """
+         if not BOTO3_AVAILABLE:
+             raise ModuleNotFoundError(
+                 f"Package `boto3` required for loading S3 data '{url}'!")
+
+         self._s3file = S3File(
+             object_path=get_object_path(url),
+             endpoint_url=(endpoint_url
+                           or get_endpoint_url(url)
+                           or S3_ENDPOINT_URL
+                           or os.environ.get("DCLAB_S3_ENDPOINT_URL")
+                           ),
+             access_key_id=(access_key_id
+                            or S3_ACCESS_KEY_ID
+                            or os.environ.get("DCLAB_S3_ACCESS_KEY_ID")
+                            or ""
+                            ),
+             secret_access_key=(secret_access_key
+                                or S3_SECRET_ACCESS_KEY
+                                or os.environ.get("DCLAB_S3_SECRET_ACCESS_KEY")
+                                or ""
+                                ),
+             use_ssl=use_ssl,
+             verify_ssl=use_ssl,
+         )
+         # Initialize the HDF5 dataset
+         super(RTDC_S3, self).__init__(
+             h5path=self._s3file,
+             *args,
+             **kwargs)
+         # Override self.path with the actual S3 URL
+         self.path = self._s3file.url
+
+     def close(self):
+         super(RTDC_S3, self).close()
+         self._s3file.close()
+
+
+ class S3Basin(Basin):
+     basin_format = "s3"
+     basin_type = "remote"
+
+     def __init__(self, *args, **kwargs):
+         self._available_verified = None
+         super(S3Basin, self).__init__(*args, **kwargs)
+
+     def _load_dataset(self, location, **kwargs):
+         h5file = RTDC_S3(location, **kwargs)
+         return h5file
+
+     def is_available(self):
+         """Check for boto3 and object availability
+
+         Caching policy: Once this method returns True, it will always
+         return True.
+         """
+         if self._available_verified is None:
+             with self._av_check_lock:
+                 if not BOTO3_AVAILABLE:
+                     self._available_verified = False
+                 else:
+                     self._available_verified = \
+                         is_s3_object_available(self.location)
+         return self._available_verified
+
+
+ def is_s3_object_available(url: str,
+                            access_key_id: str = None,
+                            secret_access_key: str = None,
+                            ):
+     """Check whether an S3 object is available
+
+     Parameters
+     ----------
+     url: str
+         full URL to the object
+     access_key_id: str
+         S3 access identifier
+     secret_access_key: str
+         Secret S3 access key
+     """
+     avail = False
+     if is_s3_url(url):
+         endpoint_url = (get_endpoint_url(url)
+                         or S3_ENDPOINT_URL
+                         or os.environ.get("DCLAB_S3_ENDPOINT_URL")
+                         )
+         if not endpoint_url:
+             warnings.warn(
+                 f"Could not determine endpoint from URL '{url}'. Please "
+                 f"set the `S3_ENDPOINT_URL` environment variable or pass "
+                 f"a full object URL.")
+         else:
+             # default to https if no scheme or port is specified
+             urlp = urlparse(endpoint_url)
+             port = urlp.port or (80 if urlp.scheme == "http" else 443)
+             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                 s.settimeout(1)
+                 # Try to connect to the host
+                 try:
+                     # Use `hostname`, not `netloc`, because `netloc` contains
+                     # the port number which we do not want here.
+                     s.connect((urlp.hostname, port))
+                 except (socket.gaierror, OSError):
+                     pass
+                 else:
+                     # Try to access the object
+                     s3file = S3File(
+                         object_path=get_object_path(url),
+                         endpoint_url=endpoint_url,
+                         access_key_id=(
+                             access_key_id
+                             or S3_ACCESS_KEY_ID
+                             or os.environ.get("DCLAB_S3_ACCESS_KEY_ID")
+                             or ""
+                         ),
+                         secret_access_key=(
+                             secret_access_key
+                             or S3_SECRET_ACCESS_KEY
+                             or os.environ.get("DCLAB_S3_SECRET_ACCESS_KEY")
+                             or ""
+                         ),
+                     )
+                     try:
+                         s3file.s3_object.load()
+                     except botocore.exceptions.ClientError:
+                         avail = False
+                     else:
+                         avail = True
+     return avail
+
+
+ @functools.lru_cache()
+ def get_endpoint_url(url):
+     """Given a URL of an S3 object, return the endpoint URL
+
+     Return None if no endpoint URL can be extracted (e.g. because
+     just `bucket_name/object_path` was passed).
+     """
+     urlp = urlparse(url=url)
+     if urlp.hostname:
+         scheme = urlp.scheme or "https"
+         port = urlp.port or (80 if scheme == "http" else 443)
+         return f"{scheme}://{urlp.hostname}:{port}"
+     else:
+         return None
+
+
+ @functools.lru_cache()
+ def get_object_path(url):
+     """Given a URL of an S3 object, return the `bucket_name/object_path` part
+
+     Return object paths always without leading slash `/`.
+     """
+     urlp = urlparse(url=url)
+     return urlp.path.lstrip("/")
+
+
+ @functools.lru_cache()
+ def is_s3_url(string):
+     """Check whether `string` is a valid S3 URL using regexp"""
+     if not isinstance(string, str):
+         return False
+     elif REGEXP_S3_URL.match(string.strip()):
+         # this is pretty clear
+         return True
+     elif pathlib.Path(string).exists():
+         # this is actually a file
+         return False
+     elif REGEXP_S3_BUCKET_KEY.match(string.strip()):
+         # bucket_name/key
+         return True
+     else:
+         return False
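To illustrate the URL helpers defined above (the object store URL is a made-up example, not a real endpoint):

    from dclab.rtdc_dataset.fmt_s3 import (
        get_endpoint_url, get_object_path, is_s3_url)

    url = "https://objectstore.example.org:9000/circle-bucket/blood/data.rtdc"

    get_endpoint_url(url)  # "https://objectstore.example.org:9000"
    get_object_path(url)   # "circle-bucket/blood/data.rtdc"
    is_s3_url(url)         # True, matches REGEXP_S3_URL
    is_s3_url("circle-bucket/blood")  # True via REGEXP_S3_BUCKET_KEY,
                                      # provided no local path of that name exists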