dcicutils 8.8.4.1b13__py3-none-any.whl → 8.8.4.1b16__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
dcicutils/file_utils.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import glob
2
+ import hashlib
3
+ import io
2
4
  import os
3
5
  import pathlib
4
6
  from datetime import datetime
@@ -103,6 +105,76 @@ def are_files_equal(filea: str, fileb: str) -> bool:
103
105
  return False
104
106
 
105
107
 
108
def compute_file_md5(file: str) -> str:
    """
    Returns the md5 checksum (as a hex digest string) for the given file.

    The file is opened in binary mode and read in fixed-size chunks, so the
    whole file is never held in memory at once. Returns an empty string if the
    given argument is not a string, or if the file cannot be opened or read.
    """
    if not isinstance(file, str):
        return ""
    try:
        md5 = hashlib.md5()
        # Use a separate name for the open handle; rebinding the "file" argument
        # to the handle makes the path unavailable and the code easy to misread.
        with open(file, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                md5.update(chunk)
        return md5.hexdigest()
    except Exception:
        # Deliberately best-effort: any failure is reported as an empty string
        # rather than raised, so callers only need to check for a falsy result.
        return ""
122
+
123
+
124
def compute_file_etag(file: str) -> Optional[str]:
    """
    Returns the AWS S3 "etag" for the given file; this value is md5-like but,
    for files at or above the multipart threshold, not the same as a normal md5.
    We use this to check that a file in S3 appears to be exactly the same file
    as a local file.

    Returns None if the given argument is not a file path, or if the file
    cannot be opened or read.
    """
    if not isinstance(file, (str, bytes, os.PathLike)):
        # io.open would also accept an integer file descriptor, read from it and
        # then close it on exit; only real paths are meaningful here.
        return None
    try:
        with io.open(file, "rb") as f:
            return _compute_file_etag(f)
    except Exception:
        # Deliberately best-effort: any failure is reported as None.
        return None


def _compute_file_etag(f: io.BufferedReader) -> str:
    """
    Computes the S3-style etag of the content read from the given binary stream.

    Content shorter than MULTIPART_THRESHOLD yields the plain md5 hex digest.
    Otherwise the content is split into MULTIPART_CHUNKSIZE parts and the result
    is the md5 of the concatenated per-part md5 digests, followed by a dash and
    the number of parts.
    """
    # See: https://stackoverflow.com/questions/75723647/calculate-md5-from-aws-s3-etag
    # NOTE(review): these sizes presumably mirror the settings used by the uploader
    # (they look like the boto3 transfer defaults) - confirm; an upload done with
    # different part sizes will have a different etag.
    MULTIPART_THRESHOLD = 8388608
    MULTIPART_CHUNKSIZE = 8388608
    BUFFER_SIZE = 1048576
    # The single-hash shortcut below relies on the threshold not exceeding the part
    # size, so that content below the threshold always lies within the first part.
    part_hash = hashlib.md5()
    part_digests = []
    total_bytes = 0
    part_bytes = 0
    while True:
        # Never read across a part boundary; this keeps the per-part hashes correct
        # even if the stream returns fewer bytes than requested.
        data = f.read(min(BUFFER_SIZE, MULTIPART_CHUNKSIZE - part_bytes))
        if not data:
            # End of the stream.
            break
        part_hash.update(data)
        total_bytes += len(data)
        part_bytes += len(data)
        if part_bytes == MULTIPART_CHUNKSIZE:
            # Done with a part; record its digest and start hashing the next one.
            part_digests.append(part_hash.digest())
            part_hash = hashlib.md5()
            part_bytes = 0
    if total_bytes < MULTIPART_THRESHOLD:
        # Normal upload; the etag is just the md5 of the whole content.
        return part_hash.hexdigest()
    if part_bytes:
        # Add the last part if it is a partial one.
        part_digests.append(part_hash.digest())
    # Multipart upload; the etag is the hash of the part hashes plus the part count.
    return hashlib.md5(b"".join(part_digests)).hexdigest() + "-" + str(len(part_digests))
176
+
177
+
106
178
  def create_random_file(file: Optional[str] = None, prefix: Optional[str] = None, suffix: Optional[str] = None,
107
179
  nbytes: int = 1024, binary: bool = False, line_length: Optional[int] = None) -> str:
108
180
  """
dcicutils/misc_utils.py CHANGED
@@ -19,6 +19,7 @@ import pytz
19
19
  import re
20
20
  import rfc3986.validators
21
21
  import rfc3986.exceptions
22
+ import shortuuid
22
23
  import time
23
24
  import uuid
24
25
  import warnings
@@ -2698,3 +2699,9 @@ def get_cpu_architecture_name() -> str:
2698
2699
  if os_architecture_name == "x86_64": return "amd64" # noqa
2699
2700
  return os_architecture_name
2700
2701
  return ""
2702
+
2703
+
2704
def short_uuid(length: Optional[int] = None):
    """
    Returns the value produced by shortuuid.ShortUUID().random for the given
    length. If the given length is not an integer, or is less than one, then
    a length of 16 is used instead.
    """
    usable = isinstance(length, int) and length >= 1
    generator = shortuuid.ShortUUID()
    return generator.random(length=length if usable else 16)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dcicutils
3
- Version: 8.8.4.1b13
3
+ Version: 8.8.4.1b16
4
4
  Summary: Utility package for interacting with the 4DN Data Portal and other 4DN resources
5
5
  Home-page: https://github.com/4dn-dcic/utils
6
6
  License: MIT
@@ -26,8 +26,8 @@ Requires-Dist: PyJWT (>=2.6.0,<3.0.0)
26
26
  Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
27
27
  Requires-Dist: appdirs (>=1.4.4,<2.0.0)
28
28
  Requires-Dist: aws-requests-auth (>=0.4.2,<1)
29
- Requires-Dist: boto3 (>=1.34.90,<2.0.0)
30
- Requires-Dist: botocore (>=1.34.90,<2.0.0)
29
+ Requires-Dist: boto3 (>=1.34.93,<2.0.0)
30
+ Requires-Dist: botocore (>=1.34.93,<2.0.0)
31
31
  Requires-Dist: chardet (>=5.2.0,<6.0.0)
32
32
  Requires-Dist: docker (>=4.4.4,<5.0.0)
33
33
  Requires-Dist: elasticsearch (==7.13.4)
@@ -43,6 +43,7 @@ Requires-Dist: pytz (>=2020.4)
43
43
  Requires-Dist: redis (>=4.5.1,<5.0.0)
44
44
  Requires-Dist: requests (>=2.21.0,<3.0.0)
45
45
  Requires-Dist: rfc3986 (>=1.4.0,<2.0.0)
46
+ Requires-Dist: shortuuid (>=1.0.13,<2.0.0)
46
47
  Requires-Dist: structlog (>=19.2.0,<20.0.0)
47
48
  Requires-Dist: toml (>=0.10.1,<1)
48
49
  Requires-Dist: tqdm (>=4.66.2,<5.0.0)
@@ -28,7 +28,7 @@ dcicutils/es_utils.py,sha256=ZksLh5ei7kRUfiFltk8sd2ZSfh15twbstrMzBr8HNw4,7541
28
28
  dcicutils/exceptions.py,sha256=4giQGtpak-omQv7BP6Ckeu91XK5fnDosC8gfdmN_ccA,9931
29
29
  dcicutils/ff_mocks.py,sha256=6RKS4eUiu_Wl8yP_8V0CaV75w4ZdWxdCuL1CVlnMrek,36918
30
30
  dcicutils/ff_utils.py,sha256=oIhuZPnGtfwj6bWyCc1u23JbMB_6InPp01ZqUOljd8M,73123
31
- dcicutils/file_utils.py,sha256=vPPFiGK5scq7opVQ_Ne8eeTAjy5OtSxyAgeJUievygQ,6283
31
+ dcicutils/file_utils.py,sha256=msxA3fFTtK09Qc_I3-r9Y5Pp5WVJRPPpLlFYv3Rju-E,8697
32
32
  dcicutils/function_cache_decorator.py,sha256=XMyiEGODVr2WoAQ68vcoX_9_Xb9p8pZXdXl7keU8i2g,10026
33
33
  dcicutils/glacier_utils.py,sha256=Q4CVXsZCbP-SoZIsZ5NMcawDfelOLzbQnIlQn-GdlTo,34149
34
34
  dcicutils/http_utils.py,sha256=RB0x9hRMZM9Xd1x00c5J0iUzUdYzIQR0XKFiQ94HWO0,807
@@ -44,7 +44,7 @@ dcicutils/license_policies/park-lab-gpl-pipeline.jsonc,sha256=vLZkwm3Js-kjV44nug
44
44
  dcicutils/license_policies/park-lab-pipeline.jsonc,sha256=9qlY0ASy3iUMQlr3gorVcXrSfRHnVGbLhkS427UaRy4,283
45
45
  dcicutils/license_utils.py,sha256=d1cq6iwv5Ju-VjdoINi6q7CPNNL7Oz6rcJdLMY38RX0,46978
46
46
  dcicutils/log_utils.py,sha256=7pWMc6vyrorUZQf-V-M3YC6zrPgNhuV_fzm9xqTPph0,10883
47
- dcicutils/misc_utils.py,sha256=Nw47AZs-cSOzGp5TZWrQZftePcahz1yfw6iNwOzUt-s,105530
47
+ dcicutils/misc_utils.py,sha256=eVZ3lEkDebweKCeza2GIo7x3qEqqkj61Ilr17eMFlR0,105744
48
48
  dcicutils/obfuscation_utils.py,sha256=fo2jOmDRC6xWpYX49u80bVNisqRRoPskFNX3ymFAmjw,5963
49
49
  dcicutils/opensearch_utils.py,sha256=V2exmFYW8Xl2_pGFixF4I2Cc549Opwe4PhFi5twC0M8,1017
50
50
  dcicutils/portal_object_utils.py,sha256=gDXRgPsRvqCFwbC8WatsuflAxNiigOnqr0Hi93k3AgE,15422
@@ -73,8 +73,8 @@ dcicutils/trace_utils.py,sha256=g8kwV4ebEy5kXW6oOrEAUsurBcCROvwtZqz9fczsGRE,1769
73
73
  dcicutils/validation_utils.py,sha256=cMZIU2cY98FYtzK52z5WUYck7urH6JcqOuz9jkXpqzg,14797
74
74
  dcicutils/variant_utils.py,sha256=2H9azNx3xAj-MySg-uZ2SFqbWs4kZvf61JnK6b-h4Qw,4343
75
75
  dcicutils/zip_utils.py,sha256=_Y9EmL3D2dUZhxucxHvrtmmlbZmK4FpSsHEb7rGSJLU,3265
76
- dcicutils-8.8.4.1b13.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
77
- dcicutils-8.8.4.1b13.dist-info/METADATA,sha256=gPRDYcqVxkYQNNEWUBaES-znrDZQhf-BT2Ie5ooqFwU,3397
78
- dcicutils-8.8.4.1b13.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
79
- dcicutils-8.8.4.1b13.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
80
- dcicutils-8.8.4.1b13.dist-info/RECORD,,
76
+ dcicutils-8.8.4.1b16.dist-info/LICENSE.txt,sha256=qnwSmfnEWMl5l78VPDEzAmEbLVrRqQvfUQiHT0ehrOo,1102
77
+ dcicutils-8.8.4.1b16.dist-info/METADATA,sha256=7R_Eatzjy4Ez8_JufgAKLIQ_O6z0fVvjqHzLEwfE9O0,3440
78
+ dcicutils-8.8.4.1b16.dist-info/WHEEL,sha256=7Z8_27uaHI_UZAc4Uox4PpBhQ9Y5_modZXWMxtUi4NU,88
79
+ dcicutils-8.8.4.1b16.dist-info/entry_points.txt,sha256=51Q4F_2V10L0282W7HFjP4jdzW4K8lnWDARJQVFy_hw,270
80
+ dcicutils-8.8.4.1b16.dist-info/RECORD,,