karton-core 5.7.0__py3-none-any.whl → 5.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,384 @@
1
+ import contextlib
2
+ import os
3
+ import shutil
4
+ import tempfile
5
+ import zipfile
6
+ from io import BytesIO
7
+ from typing import IO, TYPE_CHECKING, Any, AsyncIterator, Dict, List, Optional, Union
8
+
9
+ from karton.core.resource import LocalResourceBase, ResourceBase
10
+
11
+ if TYPE_CHECKING:
12
+ from .backend import KartonAsyncBackend
13
+
14
+
15
class LocalResource(LocalResourceBase):
    """
    Resource created on the producer side that wraps arbitrary binary data,
    e.g. the contents of a file.

    Local resources are uploaded to the object hub (S3) when the task
    carrying them is dispatched.

    .. code-block:: python

        # Resource built from in-memory bytes
        sample = Resource("original_name.exe", content=b"file contents")

        # Resource built from a file on disk
        sample = Resource("original_name.exe", path="sample/original_name.exe")

    :param name: Name of the resource (e.g. name of file)
    :param content: Resource content
    :param path: Path of file with resource content
    :param bucket: Alternative S3 bucket for resource
    :param metadata: Resource metadata
    :param uid: Alternative S3 resource id
    :param sha256: Resource sha256 hash
    :param fd: Seekable file descriptor
    :param _flags: Resource flags
    :param _close_fd: Close file descriptor after upload (default: False)
    """

    def __init__(
        self,
        name: str,
        content: Optional[Union[str, bytes]] = None,
        path: Optional[str] = None,
        bucket: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        uid: Optional[str] = None,
        sha256: Optional[str] = None,
        fd: Optional[IO[bytes]] = None,
        _flags: Optional[List[str]] = None,
        _close_fd: bool = False,
    ) -> None:
        # Everything is handled by the shared base class; this subclass only
        # contributes the asynchronous upload methods below.
        super().__init__(
            name=name,
            content=content,
            path=path,
            bucket=bucket,
            metadata=metadata,
            uid=uid,
            sha256=sha256,
            fd=fd,
            _flags=_flags,
            _close_fd=_close_fd,
        )

    async def _upload(self, backend: "KartonAsyncBackend") -> None:
        """Push the resource payload to object storage.

        The payload source is picked in this order: in-memory content,
        file descriptor, file path.

        :param backend: KartonBackend to use while uploading the resource
        :raises RuntimeError: when the bucket is not set, or when the file
            descriptor is not positioned at its first byte

        :meta private:
        """
        # Note: the resource is deliberately never turned into a remote one.
        # The same local resource may be dispatched with several tasks and
        # the uploaded object can be deleted between them.
        if self.bucket is None:
            raise RuntimeError(
                "Resource object can't be uploaded because its bucket is not set"
            )

        # 1. In-memory content
        if self._content:
            await backend.upload_object(self.bucket, self.uid, self._content)
            return

        # 2. File descriptor, which must be rewound to the beginning
        if self.fd:
            if self.fd.tell() != 0:
                raise RuntimeError(
                    "Resource object can't be uploaded: "
                    "file descriptor must point at first byte "
                    f"(fd.tell = {self.fd.tell()})"
                )
            await backend.upload_object(self.bucket, self.uid, self.fd)
            # A descriptor owned by the resource is closed once uploaded
            if self._close_fd:
                self.fd.close()
            return

        # 3. File referenced by path
        if self._path:
            await backend.upload_object_from_file(self.bucket, self.uid, self._path)

    async def upload(self, backend: "KartonAsyncBackend") -> None:
        """Validate that the resource has a payload and upload it.

        :param backend: KartonBackend to use while uploading the resource
        :raises RuntimeError: when neither content, path nor file descriptor
            is available

        :meta private:
        """
        if not (self._content or self._path or self.fd):
            raise RuntimeError("Can't upload resource without content")
        await self._upload(backend)


Resource = LocalResource
117
+
118
+
119
class RemoteResource(ResourceBase):
    """
    Keeps reference to remote resource object shared between subsystems
    via object storage (S3)

    Should never be instantiated directly by subsystem, but can be directly passed to
    outgoing payload.

    :param name: Name of the resource (e.g. name of file)
    :param bucket: Alternative S3 bucket for resource
    :param metadata: Resource metadata
    :param uid: Alternative S3 resource id
    :param size: Resource size
    :param backend: :py:meth:`KartonBackend` to bind to this resource
    :param sha256: Resource sha256 hash
    :param _flags: Resource flags
    """

    def __init__(
        self,
        name: str,
        bucket: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        uid: Optional[str] = None,
        size: Optional[int] = None,
        backend: Optional["KartonAsyncBackend"] = None,
        sha256: Optional[str] = None,
        _flags: Optional[List[str]] = None,
    ) -> None:
        super(RemoteResource, self).__init__(
            name,
            bucket=bucket,
            metadata=metadata,
            sha256=sha256,
            _uid=uid,
            _size=size,
            _flags=_flags,
        )
        # Backend used by the download methods; may be None (unbound resource)
        self.backend = backend

    def loaded(self) -> bool:
        """
        Checks whether resource is loaded into memory

        :return: Flag indicating if the resource is loaded or not
        """
        return self._content is not None

    @property
    def content(self) -> bytes:
        """
        Resource content.

        The content is not fetched implicitly: it must be loaded beforehand
        with :py:meth:`RemoteResource.download`.

        :return: Content bytes
        :raises RuntimeError: when the resource was not downloaded yet
        """
        if self._content is None:
            raise RuntimeError(
                "Resource object needs to be explicitly downloaded first"
            )
        return self._content

    @classmethod
    def from_dict(
        cls, dict: Dict[str, Any], backend: Optional["KartonAsyncBackend"]
    ) -> "RemoteResource":
        """
        Internal deserialization method for remote resources

        :param dict: Serialized information about resource
        :param backend: KartonBackend object
        :return: Deserialized :py:meth:`RemoteResource` object

        :meta private:
        """
        # Backwards compatibility: a top-level "sha256" key is folded into
        # the metadata dictionary
        metadata = dict.get("metadata", {})
        if "sha256" in dict:
            metadata["sha256"] = dict["sha256"]

        return cls(
            name=dict["name"],
            metadata=metadata,
            bucket=dict["bucket"],
            uid=dict["uid"],
            size=dict.get("size"),  # Backwards compatibility (2.x.x)
            backend=backend,
            _flags=dict.get("flags"),  # Backwards compatibility (3.x.x)
        )

    def unload(self) -> None:
        """
        Unloads resource object from memory
        """
        self._content = None

    async def download(self) -> bytes:
        """
        Downloads remote resource content from object hub into memory.

        .. code-block:: python

            sample = self.current_task.get_resource("sample")

            # Ensure that resource will be downloaded before it will be
            # passed to processing method
            await sample.download()

            self.process_sample(sample)

        :return: Downloaded content bytes
        :raises RuntimeError: when the resource is not bound to a backend
            or its bucket is not set
        """
        if self.backend is None:
            raise RuntimeError(
                (
                    "Resource object can't be downloaded because it's not bound to "
                    "the backend"
                )
            )
        if self.bucket is None:
            raise RuntimeError(
                "Resource object can't be downloaded because its bucket is not set"
            )

        self._content = await self.backend.download_object(self.bucket, self.uid)
        return self._content

    async def download_to_file(self, path: str) -> None:
        """
        Downloads remote resource into file.

        .. code-block:: python

            sample = self.current_task.get_resource("sample")

            await sample.download_to_file("sample/sample.exe")

            with open("sample/sample.exe", "rb") as f:
                contents = f.read()

        :param path: Path to download the resource into
        :raises RuntimeError: when the resource is not bound to a backend
            or its bucket is not set
        """
        if self.backend is None:
            raise RuntimeError(
                (
                    "Resource object can't be downloaded because it's not bound to "
                    "the backend"
                )
            )
        if self.bucket is None:
            raise RuntimeError(
                "Resource object can't be downloaded because its bucket is not set"
            )

        await self.backend.download_object_to_file(self.bucket, self.uid, path)

    @contextlib.asynccontextmanager
    async def download_temporary_file(
        self, suffix: Optional[str] = None
    ) -> AsyncIterator[IO[bytes]]:
        """
        Downloads remote resource into named temporary file.

        .. code-block:: python

            sample = self.current_task.get_resource("sample")

            async with sample.download_temporary_file() as f:
                contents = f.read()
                path = f.name

            # Temporary file is deleted after exiting the "async with" scope

        :param suffix: Optional suffix of the temporary file name
        :return: Async context manager with the temporary file opened
            in binary read mode
        """
        # That tempfile-fu is necessary because minio.fget_object removes file
        # under provided path and renames its own part-file with downloaded content
        # under previously deleted path
        # Weird move, but ok...
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        tmp.close()
        try:
            await self.download_to_file(tmp.name)
            with open(tmp.name, "rb") as f:
                yield f
        finally:
            # delete=False above, so the file has to be removed by hand
            os.remove(tmp.name)

    @contextlib.asynccontextmanager
    async def zip_file(self) -> AsyncIterator[zipfile.ZipFile]:
        """
        If resource contains a Zip file, downloads it to the temporary file
        and wraps it with ZipFile object.

        .. code-block:: python

            dumps = self.current_task.get_resource("dumps")

            async with dumps.zip_file() as zipf:
                print("Fetched dumps: ", zipf.namelist())

        By default: method downloads zip into temporary file, which is deleted after
        leaving the context. If you want to load zip into memory,
        call :py:meth:`RemoteResource.download` first.

        If you want to pre-download Zip under specified path and open it using
        zipfile module, you need to do this manually:

        .. code-block:: python

            dumps = self.current_task.get_resource("dumps")

            # Download zip file
            zip_path = "./dumps.zip"
            await dumps.download_to_file(zip_path)

            zipf = zipfile.ZipFile(zip_path)

        :return: Async context manager with zipfile
        """
        if self._content:
            # Content already loaded (and non-empty): read the archive
            # straight from memory
            yield zipfile.ZipFile(BytesIO(self._content))
        else:
            async with self.download_temporary_file() as f:
                yield zipfile.ZipFile(f)

    async def extract_to_directory(self, path: str) -> None:
        """
        If resource contains a Zip file, extracts files contained in Zip into
        provided path.

        By default: method downloads zip into temporary file, which is deleted
        after extraction. If you want to load zip into memory, call
        :py:meth:`RemoteResource.download` first.

        :param path: Directory path where the resource should be unpacked
        """
        async with self.zip_file() as zf:
            zf.extractall(path)

    @contextlib.asynccontextmanager
    async def extract_temporary(self) -> AsyncIterator[str]:
        """
        If resource contains a Zip file, extracts files contained in Zip
        to the temporary directory.

        Returns path of directory with extracted files. Directory is recursively
        deleted after leaving the context.

        .. code-block:: python

            dumps = self.current_task.get_resource("dumps")

            async with dumps.extract_temporary() as dumps_path:
                print("Fetched dumps:", os.listdir(dumps_path))

        By default: method downloads zip into temporary file, which is deleted
        after extraction. If you want to load zip into memory, call
        :py:meth:`RemoteResource.download` first.

        :return: Async context manager with the temporary directory
        """
        tmpdir = tempfile.mkdtemp()
        try:
            await self.extract_to_directory(tmpdir)
            # An async context manager generator must yield exactly once;
            # a second yield makes __aexit__ raise
            # RuntimeError("generator didn't stop").
            yield tmpdir
        finally:
            shutil.rmtree(tmpdir)