dcicutils 8.8.4.1b15__py3-none-any.whl → 8.8.4.1b17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dcicutils/file_utils.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import glob
2
+ import hashlib
3
+ import io
2
4
  import os
3
5
  import pathlib
4
6
  from datetime import datetime
@@ -103,6 +105,76 @@ def are_files_equal(filea: str, fileb: str) -> bool:
103
105
  return False
104
106
 
105
107
 
108
def compute_file_md5(file: str) -> str:
    """
    Returns the md5 checksum (as a hex digest string) for the given file.

    The file is opened in binary mode and hashed in fixed-size chunks so that
    the whole file is never held in memory at once. This is a best-effort
    function: an empty string is returned if the given argument is not a
    string, or if the file cannot be opened or read for any reason.
    """
    if not isinstance(file, str):
        return ""
    try:
        md5 = hashlib.md5()
        # Use a distinct name for the open handle; rebinding the `file` argument
        # (a path) to the handle makes the code needlessly hard to follow.
        with open(file, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                md5.update(chunk)
        return md5.hexdigest()
    except Exception:
        # Deliberately swallow errors (missing file, permissions, et cetera);
        # callers treat an empty string as "no checksum available".
        return ""
122
+
123
+
124
def compute_file_etag(file: str) -> Optional[str]:
    """
    Returns the AWS S3 "etag" for the given file. This value is md5-like but
    is not the same as a normal md5. It is used to check that a file in S3
    appears to be exactly the same file as a local file. Returns None if the
    file cannot be opened or if the etag computation fails for any reason.
    """
    etag = None
    try:
        with io.open(file, "rb") as stream:
            etag = _compute_file_etag(stream)
    except Exception:
        # Best-effort: any failure simply means no etag is available.
        pass
    return etag
135
+
136
+
137
def _compute_file_etag(f: io.BufferedReader) -> str:
    """
    Returns the AWS S3 style "etag" for the data read from the given binary
    stream (read from its current position through to end-of-file).

    If fewer than MULTIPART_THRESHOLD bytes are read, the result is the plain
    md5 hex digest of the data. Otherwise the data is treated as a multipart
    upload with parts of MULTIPART_CHUNKSIZE bytes: the result is the md5 hex
    digest of the concatenated binary md5 digests of each part, followed by
    a dash and the number of parts.

    See: https://stackoverflow.com/questions/75723647/calculate-md5-from-aws-s3-etag
    """
    MULTIPART_THRESHOLD = 8388608
    MULTIPART_CHUNKSIZE = 8388608
    BUFFER_SIZE = 1048576
    # This single-hash approach relies on the threshold and the part size being
    # equal: data shorter than the threshold never completes a part, so the
    # hash of the (only, partial) first part is also the hash of all the data.
    part_hash = hashlib.md5()
    part_nbytes = 0  # Number of bytes hashed so far into the current part.
    total_nbytes = 0
    part_digests = []
    while True:
        # Never read past the end of the current part. This keeps the part
        # boundaries exact even if the stream returns fewer bytes than were
        # requested, rather than relying on reads happening to align.
        data = f.read(min(BUFFER_SIZE, MULTIPART_CHUNKSIZE - part_nbytes))
        if not data:
            break
        part_hash.update(data)
        part_nbytes += len(data)
        total_nbytes += len(data)
        if part_nbytes == MULTIPART_CHUNKSIZE:
            # Done with a part; save its digest to be hashed later.
            part_digests.append(part_hash.digest())
            part_hash = hashlib.md5()
            part_nbytes = 0
    if total_nbytes < MULTIPART_THRESHOLD:
        # Normal (non-multipart) upload; the etag is just the md5 of the data.
        return part_hash.hexdigest()
    if part_nbytes > 0:
        # Include the trailing partial part, if any.
        part_digests.append(part_hash.digest())
    # Multipart upload; the etag is the hash of the part hashes plus the part count.
    return hashlib.md5(b"".join(part_digests)).hexdigest() + "-" + str(len(part_digests))
176
+
177
+
106
178
  def create_random_file(file: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
107
179
  nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
108
180
  """
dcicutils/http_utils.py CHANGED
@@ -1,20 +1,31 @@
1
1
  from contextlib import contextmanager
2
2
  import requests
3
- from typing import Optional
3
+ from typing import Callable, Optional
4
4
  from dcicutils.tmpfile_utils import temporary_file
5
5
 
6
6
 
7
7
@contextmanager
def download(url: str, suffix: Optional[str] = None, binary: bool = True,
             progress: Optional[Callable] = None) -> Optional[str]:
    """
    Context manager to download the given URL into a temporary file and yield the
    path to it. An optional file suffix may be specified. Defaults to binary file mode;
    if this is not desired then pass False as the binary argument, in which case the
    downloaded bytes are decoded to text (using the encoding reported for the response,
    or UTF-8 if none is reported) before being written.

    If a callable is given as the progress argument then it is called after each
    chunk is written, with two arguments: the number of bytes downloaded so far and
    the total number of bytes expected, the latter taken from the Content-Length
    response header, or None if that header is missing or not a plain integer.
    A non-callable progress argument is ignored.
    """
    import codecs  # Local import; only needed for the non-binary (text) mode.

    if not callable(progress):
        progress = None
    with temporary_file(suffix=suffix) as file:
        # Use the response as a context manager so that the streamed connection
        # is always released, even if writing the file fails part way through.
        with requests.get(url, stream=True) as response:
            nbytes = 0
            nbytes_total = None
            content_length = response.headers.get("Content-Length")
            if isinstance(content_length, str) and content_length.isdigit():
                nbytes_total = int(content_length)
            decoder = None
            if binary is not True:
                # The chunks from iter_content are bytes, which cannot be written
                # to a file opened in text mode; decode them incrementally so that
                # multi-byte characters split across chunks are handled correctly.
                try:
                    decoder_class = codecs.getincrementaldecoder(response.encoding or "utf-8")
                except LookupError:
                    decoder_class = codecs.getincrementaldecoder("utf-8")
                decoder = decoder_class(errors="replace")
            with open(file, "wb" if binary is True else "w") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    f.write(decoder.decode(chunk) if decoder else chunk)
                    if progress:
                        nbytes += len(chunk)
                        progress(nbytes, nbytes_total)
                if decoder:
                    # Flush anything still buffered in the decoder.
                    remainder = decoder.decode(b"", final=True)
                    if remainder:
                        f.write(remainder)
        yield file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcicutils
3
- Version: 8.8.4.1b15
3
+ Version: 8.8.4.1b17
4
4
  Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
5
5
  Home-page: https://github.com/4dn-dcic/utils
6
6
  License: MIT
@@ -28,10 +28,10 @@ dcicutils/es_utils.py,sha256=ZksLh5ei7kRUfiFltk8sd2ZSfh15twbstrMzBr8HNw4,7541
28
28
  dcicutils/exceptions.py,sha256=4giQGtpak-omQv7BP6Ckeu91XK5fnDosC8gfdmN_ccA,9931
29
29
  dcicutils/ff_mocks.py,sha256=6RKS4eUiu_Wl8yP_8V0CaV75w4ZdWxdCuL1CVlnMrek,36918
30
30
  dcicutils/ff_utils.py,sha256=oIhuZPnGtfwj6bWyCc1u23JbMB_6InPp01ZqUOljd8M,73123
31
- dcicutils/file_utils.py,sha256=vPPFiGK5scq7opVQ_Ne8eeTAjy5OtSxyAgeJUievygQ,6283
31
+ dcicutils/file_utils.py,sha256=msxA3fFTtK09Qc_I3-r9Y5Pp5WVJRPPpLlFYv3Rju-E,8697
32
32
  dcicutils/function_cache_decorator.py,sha256=XMyiEGODVr2WoAQ68vcoX_9_Xb9p8pZXdXl7keU8i2g,10026
33
33
  dcicutils/glacier_utils.py,sha256=Q4CVXsZCbP-SoZIsZ5NMcawDfelOLzbQnIlQn-GdlTo,34149
34
- dcicutils/http_utils.py,sha256=RB0x9hRMZM9Xd1x00c5J0iUzUdYzIQR0XKFiQ94HWO0,807
34
+ dcicutils/http_utils.py,sha256=Je5ErNjR5e6lfSXGRncK_lcR_-zP38PIpmHjApy9Wi4,1289
35
35
  dcicutils/jh_utils.py,sha256=Gpsxb9XEzggF_-Eq3ukjKvTnuyb9V1SCSUXkXsES4Kg,11502
36
36
  dcicutils/kibana/dashboards.json,sha256=wHMB_mpJ8OaYhRRgvpZuihaB2lmSF64ADt_8hkBWgQg,16225
37
37
  dcicutils/kibana/readme.md,sha256=3KmHF9FH6A6xwYsNxRFLw27q0XzHYnjZOlYUnn3VkQQ,2164
@@ -73,8 +73,8 @@ dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
73
73
  dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
74
74
  dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
75
75
  dcicutils/zip_utils.py,sha256=_Y9EmL3D2dUZhxucxHvrtmmlbZmK4FpSsHEb7rGSJLU,3265
76
- dcicutils-8.8.4.1b15.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
77
- dcicutils-8.8.4.1b15.dist-info/METADATA,sha256=-eKROBZs2uM-7YOnS8nHPkO9U1nHnRz5UJhZYU9svgQ,3440
78
- dcicutils-8.8.4.1b15.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
79
- dcicutils-8.8.4.1b15.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
80
- dcicutils-8.8.4.1b15.dist-info/RECORD,,
76
+ dcicutils-8.8.4.1b17.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
77
+ dcicutils-8.8.4.1b17.dist-info/METADATA,sha256=KLVzlNXL2JHkY6n3bL-fbnoVa1kNKAnIbK7bRluuxlM,3440
78
+ dcicutils-8.8.4.1b17.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
79
+ dcicutils-8.8.4.1b17.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
80
+ dcicutils-8.8.4.1b17.dist-info/RECORD,,