cloud-files 4.27.0__py3-none-any.whl → 6.0.0__py3-none-any.whl

This diff compares the published contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
cloudfiles/paths.py CHANGED
@@ -1,5 +1,5 @@
  from functools import lru_cache
- from collections import namedtuple
+ from collections import namedtuple, defaultdict
  import orjson
  import os.path
  import posixpath
@@ -8,17 +8,21 @@ import sys
  import urllib.parse

  from typing import Tuple, Optional
+ from .typing import GetPathType

  from .exceptions import UnsupportedProtocolError
- from .lib import yellow, toabs, jsonify, mkdir
+ from .lib import yellow, toabs, jsonify, mkdir, toiter
  from .secrets import CLOUD_FILES_DIR

  ExtractedPath = namedtuple('ExtractedPath',
    ('format', 'protocol', 'bucket', 'path', 'host', 'alias')
  )

+ PRECOMPUTED_SUFFIX = '|neuroglancer-precomputed:'
+
  ALIAS_FILE = os.path.join(CLOUD_FILES_DIR, "aliases.json")
  OFFICIAL_ALIASES = {
+   "nokura": "s3://https://c10s.pni.princeton.edu/",
    "matrix": "s3://https://s3-hpcrc.rc.princeton.edu/",
    "tigerdata": "s3://https://td.princeton.edu/",
  }
@@ -37,6 +41,10 @@ ALLOWED_FORMATS = [
    'render', 'vtk', 'nifti', 'dvid',
  ]

+ CLOUD_PROTOCOLS = [
+   "gs", "s3"
+ ] + list(OFFICIAL_ALIASES.keys())
+
  def update_aliases_from_file():
    global ALIASES_FROM_FILE
    global ALIAS_FILE
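
The new CLOUD_PROTOCOLS list folds the official aliases (matrix, tigerdata, and the newly added nokura) in with gs and s3, so the bucket-handling branches in extract() and to_https_protocol() further down no longer hardcode ('gs', 's3', 'matrix'). A minimal sketch of the effect; the bucket name is hypothetical and the expected output simply follows the replace chain shown in the to_https_protocol() hunk below:

from cloudfiles.paths import to_https_protocol

# Alias protocols now convert the same way gs/s3 do: "nokura://" expands
# to its host entry, then the leading "s3://" wrapper is stripped.
print(to_https_protocol("nokura://my-bucket/layer"))
# expected: https://c10s.pni.princeton.edu/my-bucket/layer
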
@@ -55,11 +63,14 @@ def update_aliases_from_file():

  def cloudpath_error(cloudpath):
    return yellow(f"""
- Cloud Path must conform to [FORMAT://]PROTOCOL://PATH
+ Cloud Path must conform to one of:
+   (a) [FORMAT://]PROTOCOL://PATH
+   (b) PROTOCOL://PATH|FORMAT:
  Examples:
    precomputed://gs://test_bucket/em
    gs://test_bucket/em
    graphene://https://example.com/image/em
+   gs://text_bucket/em/|zarr2:

  Supported Formats: None (precomputed), {", ".join(ALLOWED_FORMATS)}
  Supported Protocols: {", ".join(ALLOWED_PROTOCOLS)}
@@ -84,6 +95,9 @@ def mkregexp():
  CLOUDPATH_REGEXP = re.compile(mkregexp())
  BUCKET_RE = re.compile(r'^(/?[~\d\w_\.\-]+(?::\d+)?)(?:/|$)') # posix /what/a/great/path

+ # |neuroglancer-precomputed: or |zarr2: suffixes etc
+ TAIL_FORMAT_REGEXP = re.compile('\\|(?P<fmt>[\\w\\d-]+):$')
+
  def add_alias(alias:str, host:str):
    global ALIASES
    global ALLOWED_PROTOCOLS
@@ -158,7 +172,15 @@ for alias, host in OFFICIAL_ALIASES.items():

  ## Other Path Library Functions

+ @lru_cache(maxsize=10, typed=False)
  def normalize(path):
+
+   path = path.removesuffix(PRECOMPUTED_SUFFIX)
+   m = re.search(TAIL_FORMAT_REGEXP, path)
+   if m is not None:
+     path = re.sub(TAIL_FORMAT_REGEXP, "", path)
+     path = f"{m.group('fmt')}://{path}"
+
    fmt, proto, endpoint, cloudpath, alias = extract_format_protocol(
      path, allow_defaults=False
    )
@@ -284,6 +306,13 @@ def pop_protocol(cloudpath):
  def extract_format_protocol(cloudpath:str, allow_defaults=True) -> tuple:
    error = UnsupportedProtocolError(cloudpath_error(cloudpath))

+   cloudpath = cloudpath.removesuffix(PRECOMPUTED_SUFFIX)
+
+   m = re.search(TAIL_FORMAT_REGEXP, cloudpath)
+   if m is not None:
+     cloudpath = re.sub(TAIL_FORMAT_REGEXP, "", cloudpath)
+     cloudpath = f"{m.group('fmt')}://{cloudpath}"
+
    alias, cloudpath = resolve_alias(cloudpath)

    m = re.match(CLOUDPATH_REGEXP, cloudpath)
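
With these two hunks, both normalize() and extract_format_protocol() understand the Neuroglancer-style trailing format syntax: a "|FORMAT:" suffix is stripped and reattached as a leading "FORMAT://" before ordinary parsing, and the "|neuroglancer-precomputed:" suffix is simply dropped since precomputed is the default format. A hedged sketch with a hypothetical path; the rewritten intermediate form is exactly what the code above constructs:

from cloudfiles import paths

# "gs://test_bucket/em|zarr2:" is first rewritten internally to
# "zarr2://gs://test_bucket/em", then parsed as usual.
ep = paths.extract("gs://test_bucket/em|zarr2:")
print(ep.format)  # expected: zarr2
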
@@ -350,7 +379,7 @@ def extract(cloudpath:str, windows=None) -> ExtractedPath:
      cloudpath = toabs(cloudpath)

    bucket = None
-   if protocol in ('gs', 's3', 'matrix', 'mem'):
+   if protocol in CLOUD_PROTOCOLS + ['mem']:
      match = re.match(BUCKET_RE, cloudpath)
      if not match:
        raise error
@@ -374,7 +403,7 @@ def extract(cloudpath:str, windows=None) -> ExtractedPath:

  def to_https_protocol(cloudpath):
    if isinstance(cloudpath, ExtractedPath):
-     if cloudpath.protocol in ('gs', 's3', 'matrix'):
+     if cloudpath.protocol in CLOUD_PROTOCOLS:
        return extract(to_https_protocol(ascloudpath(cloudpath)))
      return cloudpath

@@ -390,3 +419,30 @@ def to_https_protocol(cloudpath):
      cloudpath = cloudpath.replace(f"{alias}://", host, 1)

    return cloudpath.replace("s3://", "", 1)
+
+ def find_common_buckets(cloudpaths:GetPathType):
+   cloudpaths, is_multiple = toiter(cloudpaths, is_iter=True)
+   clustered = defaultdict(list)
+
+   for path in cloudpaths:
+     pth = path
+     byte_range = None
+     if isinstance(path, dict):
+       pth = path["path"]
+       byte_range = path["byte_range"]
+
+     epath = extract(pth)
+     if epath.protocol == "file":
+       path = os.sep.join(asfilepath(epath).split(os.sep)[2:])
+       bucketpath = "file://" + os.sep.join(asfilepath(epath).split(os.sep)[:2])
+     else:
+       path = epath.path
+       bucketpath = asbucketpath(epath)
+
+     clustered[bucketpath].append({
+       "path": path,
+       "start": (byte_range[0] if byte_range else None), # type: ignore
+       "end": (byte_range[1] if byte_range else None), # type: ignore
+     })
+
+   return clustered
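
find_common_buckets() groups a mixed sequence of plain path strings and {path, byte_range} dicts by their containing bucket, so callers can issue batched per-bucket requests. A usage sketch with hypothetical paths; the exact bucket-key strings are whatever asbucketpath() produces:

from cloudfiles.paths import find_common_buckets

reqs = [
  "gs://bucket_a/image/1.bin",
  { "path": "gs://bucket_a/image/2.bin", "byte_range": (0, 1024) },
  "s3://bucket_b/vol/info",
]
for bucket, files in find_common_buckets(reqs).items():
  print(bucket, files)  # each entry: {"path": ..., "start": ..., "end": ...}
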
cloudfiles/resumable_tools.py CHANGED
@@ -39,6 +39,9 @@ class ResumableFileSet:
      self.conn = sqlite3.connect(db_path)
      self.lease_msec = int(lease_msec)

+     self._total = 0
+     self._total_dirty = True
+
    def __del__(self):
      self.conn.close()

@@ -46,6 +49,7 @@ class ResumableFileSet:
      cur = self.conn.cursor()
      cur.execute("""DROP TABLE IF EXISTS filelist""")
      cur.execute("""DROP TABLE IF EXISTS xfermeta""")
+     cur.execute("""DROP TABLE IF EXISTS stats""")
      cur.close()

    def create(self, src, dest, reencode=None):
@@ -53,6 +57,7 @@ class ResumableFileSet:

      cur.execute("""DROP TABLE IF EXISTS filelist""")
      cur.execute("""DROP TABLE IF EXISTS xfermeta""")
+     cur.execute("""DROP TABLE IF EXISTS stats""")

      cur.execute(f"""
        CREATE TABLE xfermeta (
@@ -78,6 +83,18 @@ class ResumableFileSet:
      """)
      cur.execute("CREATE INDEX resumableidxfin ON filelist(finished,lease)")
      cur.execute("CREATE INDEX resumableidxfile ON filelist(filename)")
+
+     cur.execute(f"""
+       CREATE TABLE stats (
+         id {INTEGER} PRIMARY KEY {AUTOINC},
+         key TEXT NOT NULL,
+         value {INTEGER}
+       )
+     """)
+     cur.execute(
+       "INSERT INTO stats(id, key, value) VALUES (?,?,?)",
+       [1, 'finished', 0]
+     )
      cur.close()

    def insert(self, fname_iter):
@@ -91,7 +108,9 @@ class ResumableFileSet:
        cur.execute(f"INSERT INTO filelist(filename,finished,lease) VALUES {bindlist}", filenames)
      cur.execute("commit")

-     cur.close()
+     cur.close()
+
+     self._total_dirty = True

    def metadata(self):
      cur = self.conn.cursor()
@@ -111,6 +130,7 @@ class ResumableFileSet:
      for filenames in sip(fname_iter, SQLITE_MAX_PARAMS):
        bindlist = ",".join([f"{BIND}"] * len(filenames))
        cur.execute(f"UPDATE filelist SET finished = 1 WHERE filename in ({bindlist})", filenames)
+       cur.execute(f"UPDATE stats SET value = value + {len(filenames)} WHERE id = 1")
      cur.execute("commit")
      cur.close()

@@ -120,7 +140,7 @@ class ResumableFileSet:
      N = 0

      while True:
-       ts = now_msec() + self.lease_msec
+       ts = now_msec()
        cur.execute(f"""SELECT filename FROM filelist WHERE finished = 0 AND lease <= {ts} LIMIT {int(block_size)}""")
        rows = cur.fetchmany(block_size)
        N += len(rows)
@@ -140,31 +160,46 @@ class ResumableFileSet:

      cur.close()

-   def total(self):
+   def _scalar_query(self, sql:str) -> int:
      cur = self.conn.cursor()
-     cur.execute(f"SELECT count(filename) FROM filelist")
+     cur.execute(sql)
      res = cur.fetchone()
      cur.close()
      return int(res[0])

+   def total(self):
+     """Returns the total number of tasks (both processed and unprocessed)."""
+     if not self._total_dirty:
+       return self._total
+
+     self._total = self._scalar_query(f"SELECT max(id) FROM filelist")
+     self._total_dirty = False
+     return self._total
+
+   def finished(self):
+     return self._scalar_query(f"SELECT value FROM stats WHERE id = 1")
+
    def remaining(self):
-     cur = self.conn.cursor()
-     cur.execute(f"SELECT count(filename) FROM filelist WHERE finished = 0")
-     res = cur.fetchone()
-     cur.close()
-     return int(res[0])
+     return self.total() - self.finished()
+
+   def num_leased(self):
+     ts = int(now_msec())
+     return self._scalar_query(
+       f"SELECT count(filename) FROM filelist WHERE finished = 0 AND lease > {ts}"
+     )

    def available(self):
-     cur = self.conn.cursor()
-     ts = int(now_msec() + self.lease_msec)
-     cur.execute(f"SELECT count(filename) FROM filelist WHERE finished = 0 AND lease < {ts}")
-     res = cur.fetchone()
-     cur.close()
-     return int(res[0])
+     ts = int(now_msec())
+     return self._scalar_query(
+       f"SELECT count(filename) FROM filelist WHERE finished = 0 AND lease <= {ts}"
+     )

    def release(self):
+     cur = self.conn.cursor()
      cur.execute(f"UPDATE filelist SET lease = 0")
      cur.execute("commit")
+     cur.close()
+

    def __len__(self):
      return self.remaining()
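
Two behavioral changes are bundled into these hunks. Leasing now compares against now_msec() directly, so a file counts as available only once its lease has actually expired; previously the comparison point was offset by lease_msec. Progress queries also stop scanning with count(filename): total() becomes max(id) cached behind a dirty flag, and finished() reads a single-row stats table that mark_finished() increments in step with its updates. A standalone sketch of that counter pattern, independent of cloud-files and assuming rows are only ever inserted (as in this class):

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("CREATE TABLE filelist (id INTEGER PRIMARY KEY AUTOINCREMENT, filename TEXT, finished INTEGER DEFAULT 0)")
cur.execute("CREATE TABLE stats (id INTEGER PRIMARY KEY, key TEXT NOT NULL, value INTEGER)")
cur.execute("INSERT INTO stats(id, key, value) VALUES (1, 'finished', 0)")
cur.executemany("INSERT INTO filelist(filename) VALUES (?)", [("a",), ("b",), ("c",)])

# Marking work done costs two O(1) writes instead of a later O(N) count.
cur.execute("UPDATE filelist SET finished = 1 WHERE filename = 'a'")
cur.execute("UPDATE stats SET value = value + 1 WHERE id = 1")

total = cur.execute("SELECT max(id) FROM filelist").fetchone()[0]
finished = cur.execute("SELECT value FROM stats WHERE id = 1").fetchone()[0]
print(total, finished, total - finished)  # 3 1 2
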
cloudfiles/scheduler.py CHANGED
@@ -137,7 +137,12 @@ def schedule_jobs(
      or (hasattr(fns, "__len__") and len(fns) <= 1)
    ):
      return schedule_single_threaded_jobs(fns, progress, total, count_return)
-
+
+   if isinstance(total, int):
+     concurrency = min(concurrency, max(total, 1))
+   elif hasattr(fns, "__len__"):
+     concurrency = min(concurrency, max(len(fns), 1))
+
    if green == True or (green is None and gevent.monkey.saved):
      return schedule_green_jobs(fns, concurrency, progress, total, count_return)
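
The added clamp keeps schedule_jobs() from spinning up more workers than there are jobs while still guaranteeing at least one. The same arithmetic as a standalone helper (illustrative only, not the library's API):

def clamp_concurrency(concurrency, num_jobs):
  # never more workers than jobs, never fewer than one
  return min(concurrency, max(num_jobs, 1))

assert clamp_concurrency(40, 3) == 3
assert clamp_concurrency(40, 0) == 1
assert clamp_concurrency(2, 100) == 2
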
cloudfiles/secrets.py CHANGED
@@ -137,23 +137,27 @@ def aws_credentials(bucket = '', service = 'aws', skip_files=False):
    AWS_CREDENTIALS_CACHE[service][bucket] = aws_credentials
    return aws_credentials

- CAVE_CREDENTIALS = None
- def cave_credentials():
+ CAVE_CREDENTIALS:CredentialCacheType = {}
+ def cave_credentials(server = ''):
    global CAVE_CREDENTIALS
-   default_file_path = 'cave-secret.json'
-   path = secretpath(default_file_path)

-   if CAVE_CREDENTIALS:
-     return CAVE_CREDENTIALS
+   paths = [
+     secretpath('cave-secret.json')
+   ]

-   if os.path.exists(path):
-     with open(path, 'rt') as f:
-       CAVE_CREDENTIALS = json.loads(f.read())
-   else:
-     CAVE_CREDENTIALS = None
+   if server:
+     paths = [ secretpath(f'{server}-cave-secret.json') ] + paths
+
+   if server in CAVE_CREDENTIALS:
+     return CAVE_CREDENTIALS.get(server, None)

-   return CAVE_CREDENTIALS
+   for path in paths:
+     if os.path.exists(path):
+       with open(path, 'rt') as f:
+         CAVE_CREDENTIALS[server] = json.loads(f.read())
+       break

+   return CAVE_CREDENTIALS.get(server, None)

  HTTP_CREDENTIALS = None
  def http_credentials():
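
cave_credentials() now takes an optional server argument and caches per server: a server-specific "{server}-cave-secret.json" is consulted before the generic "cave-secret.json", and None is returned when neither file exists. A hedged usage sketch; the server name is hypothetical and the search directories are whatever secretpath() resolves:

from cloudfiles.secrets import cave_credentials

# Tries "my.cave.server-cave-secret.json" first, then "cave-secret.json".
creds = cave_credentials("my.cave.server")
if creds is not None:
  token = creds.get("token")  # assuming the usual CAVE secret layout
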
cloudfiles/test.py ADDED
@@ -0,0 +1,28 @@
+ import igneous.task_creation as tc
+ import os
+ from taskqueue import totask
+ import igneous.tasks
+
+
+ def process_task(msg):
+   task = totask(msg)
+   task.execute()
+   return None
+
+ def submit_tasks():
+   paths = [
+     "gs://ng_scratch_ranl_7/make_cv_happy/seg/20250403024338",]
+   all_tasks = []
+   for img_path in paths:
+     mip = 0
+     num_mips = 2
+     tasks = tc.create_downsampling_tasks(img_path,
+       fill_missing=False,
+       delete_black_uploads=True,
+       mip=mip, num_mips=num_mips)
+     all_tasks += list(tasks)
+   return all_tasks
+
+ if __name__ == "__main__":
+   tasks = submit_tasks()
+   process_task(tasks[0])