cloudfs 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cloudfs
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: An interface to interact with cloud storage as if it's a local filesystem.
5
5
  License-Expression: Apache-2.0
6
6
  License-File: LICENSE
@@ -21,8 +21,10 @@ rmdir():
21
21
  is_dir() returns True.
22
22
 
23
23
  open():
24
- Read modes download the full blob into memory. Write modes buffer in memory
25
- and upload on close.
24
+ Read modes stream the blob via a chunked reader; write modes stream the
25
+ upload by staging blocks and committing them on close. Memory use is bounded
26
+ by the upload chunk size, not the blob size. read_bytes/write_bytes still
27
+ load the whole blob, matching pathlib semantics.
26
28
 
27
29
  rename():
28
30
  Implemented as copy + delete. Not atomic — a crash between the two steps
@@ -38,11 +40,16 @@ Performance:
38
40
 
39
41
  from __future__ import annotations
40
42
 
43
+ import base64
41
44
  import io
42
45
  from typing import IO, Any, Generator, Iterator
43
46
 
44
47
  from cloudfs.base import CloudPath
45
48
 
49
+ # Streaming upload block size. Bounds the memory held during a streaming
50
+ # open("wb"); a chunk is staged as a block and the block list committed on close.
51
+ _UPLOAD_CHUNK_SIZE = 8 * 1024 * 1024
52
+
46
53
 
47
54
  class AzurePath(CloudPath):
48
55
  """pathlib.Path-compatible interface for Azure Blob Storage.
@@ -323,25 +330,23 @@ class AzurePath(CloudPath):
323
330
  newline: str | None = None,
324
331
  ) -> IO:
325
332
  if mode in ("rb", "r"):
326
- data = self.read_bytes()
327
- buf = io.BytesIO(data)
333
+ downloader = self._container.download_blob(self._key)
334
+ buf = io.BufferedReader(_AzureReadBuffer(downloader))
328
335
  if mode == "r":
329
336
  return io.TextIOWrapper(
330
- buf,
337
+ buf, encoding=encoding or "utf-8", errors=errors, newline=newline
338
+ )
339
+ return buf
340
+ if mode in ("wb", "w"):
341
+ raw = _AzureWriteBuffer(self._container, self._key)
342
+ if mode == "w":
343
+ return io.TextIOWrapper(
344
+ io.BufferedWriter(raw),
331
345
  encoding=encoding or "utf-8",
332
346
  errors=errors,
333
347
  newline=newline,
334
348
  )
335
- return buf
336
- if mode in ("wb", "w"):
337
- return _AzureWriteBuffer(
338
- self._container,
339
- self._key,
340
- binary=mode == "wb",
341
- encoding=encoding or "utf-8",
342
- errors=errors,
343
- newline=newline,
344
- )
349
+ return raw
345
350
  raise ValueError(f"Unsupported mode: {mode!r}")
346
351
 
347
352
  def read_bytes(self) -> bytes:
@@ -355,9 +360,7 @@ class AzurePath(CloudPath):
355
360
  return len(data)
356
361
 
357
362
  def write_text(self, data: str, encoding: str = "utf-8") -> int:
358
- encoded = data.encode(encoding)
359
- self._container.upload_blob(self._key, encoded, overwrite=True)
360
- return len(encoded)
363
+ return self.write_bytes(data.encode(encoding))
361
364
 
362
365
  def touch(self, mode: int = 0o666, exist_ok: bool = True) -> None:
363
366
  if self._blob_exists():
@@ -426,36 +429,60 @@ class AzurePath(CloudPath):
426
429
  return cls(container, key, _client=_client)
427
430
 
428
431
 
432
+ class _AzureReadBuffer(io.RawIOBase):
433
+ def __init__(self, downloader):
434
+ self._chunks = downloader.chunks()
435
+ self._leftover = b""
436
+
437
+ def readable(self) -> bool:
438
+ return True
439
+
440
+ def readinto(self, b) -> int:
441
+ if not self._leftover:
442
+ try:
443
+ self._leftover = next(self._chunks)
444
+ except StopIteration:
445
+ return 0
446
+ n = min(len(b), len(self._leftover))
447
+ b[:n] = self._leftover[:n]
448
+ self._leftover = self._leftover[n:]
449
+ return n
450
+
451
+
429
452
  class _AzureWriteBuffer(io.RawIOBase):
430
- def __init__(self, container, key, binary, encoding, errors, newline):
431
- self._container = container
432
- self._key = key
433
- self._binary = binary
434
- self._buf = io.BytesIO()
435
- self._text_wrapper = None
436
- if not binary:
437
- self._text_wrapper = io.TextIOWrapper(
438
- self._buf, encoding=encoding, errors=errors, newline=newline
439
- )
453
+ def __init__(self, container, key):
454
+ self._blob = container.get_blob_client(key)
455
+ self._buf = bytearray()
456
+ self._block_ids: list[str] = []
457
+
458
+ def writable(self) -> bool:
459
+ return True
440
460
 
441
461
  def write(self, data) -> int:
442
- if self._text_wrapper:
443
- return self._text_wrapper.write(data)
444
- return self._buf.write(data)
462
+ self._buf.extend(data)
463
+ while len(self._buf) >= _UPLOAD_CHUNK_SIZE:
464
+ self._flush_block(_UPLOAD_CHUNK_SIZE)
465
+ return len(data)
466
+
467
+ def _flush_block(self, size: int) -> None:
468
+ chunk = bytes(self._buf[:size])
469
+ del self._buf[:size]
470
+ block_id = base64.b64encode(f"{len(self._block_ids):032d}".encode()).decode()
471
+ self._blob.stage_block(block_id, chunk)
472
+ self._block_ids.append(block_id)
445
473
 
446
474
  def close(self) -> None:
447
- if not self.closed:
448
- if self._text_wrapper:
449
- self._text_wrapper.flush()
450
- self._buf.seek(0)
451
- self._container.upload_blob(self._key, self._buf.read(), overwrite=True)
452
- super().close()
453
-
454
- def __enter__(self):
455
- return self
456
-
457
- def __exit__(self, *args):
458
- self.close()
475
+ if self.closed:
476
+ return
477
+ from azure.storage.blob import BlobBlock
478
+
479
+ try:
480
+ if self._buf:
481
+ self._flush_block(len(self._buf))
482
+ block_list = [BlobBlock(block_id=bid) for bid in self._block_ids]
483
+ self._blob.commit_block_list(block_list)
484
+ finally:
485
+ super().close()
459
486
 
460
487
 
461
488
  class AzureStatResult:
@@ -20,8 +20,10 @@ rmdir():
20
20
  is_dir() returns True.
21
21
 
22
22
  open():
23
- S3 has no native streaming file object. Read modes download the full object
24
- into memory. Write modes buffer in memory and upload on close.
23
+ Read modes stream the object via a chunked reader; write modes stream the
24
+ upload as an S3 multipart upload, flushing one part at a time. Memory use is
25
+ bounded by the upload chunk size, not the object size. read_bytes/write_bytes
26
+ still load the whole object, matching pathlib semantics.
25
27
 
26
28
  Consistency:
27
29
  S3 provides strong read-after-write consistency for all operations since
@@ -39,6 +41,10 @@ from typing import IO, Any, Generator, Iterator
39
41
 
40
42
  from cloudfs.base import CloudPath
41
43
 
44
+ # Streaming upload part size. Must stay >= 5 MiB, S3's minimum non-final
45
+ # multipart part size. Bounds the memory held during a streaming open("wb").
46
+ _UPLOAD_CHUNK_SIZE = 8 * 1024 * 1024
47
+
42
48
 
43
49
  class S3Path(CloudPath):
44
50
  """pathlib.Path-compatible interface for AWS S3."""
@@ -321,26 +327,25 @@ class S3Path(CloudPath):
321
327
  newline: str | None = None,
322
328
  ) -> IO:
323
329
  if mode in ("rb", "r"):
324
- data = self.read_bytes()
325
- buf = io.BytesIO(data)
330
+ body = self._client.get_object(Bucket=self._bucket_name, Key=self._key)[
331
+ "Body"
332
+ ]
333
+ buf = io.BufferedReader(_S3ReadBuffer(body))
326
334
  if mode == "r":
327
335
  return io.TextIOWrapper(
328
- buf,
336
+ buf, encoding=encoding or "utf-8", errors=errors, newline=newline
337
+ )
338
+ return buf
339
+ if mode in ("wb", "w"):
340
+ raw = _S3WriteBuffer(self._client, self._bucket_name, self._key)
341
+ if mode == "w":
342
+ return io.TextIOWrapper(
343
+ io.BufferedWriter(raw),
329
344
  encoding=encoding or "utf-8",
330
345
  errors=errors,
331
346
  newline=newline,
332
347
  )
333
- return buf
334
- if mode in ("wb", "w"):
335
- return _S3WriteBuffer(
336
- self._client,
337
- self._bucket_name,
338
- self._key,
339
- binary=mode == "wb",
340
- encoding=encoding or "utf-8",
341
- errors=errors,
342
- newline=newline,
343
- )
348
+ return raw
344
349
  raise ValueError(f"Unsupported mode: {mode!r}")
345
350
 
346
351
  def read_bytes(self) -> bytes:
@@ -355,9 +360,7 @@ class S3Path(CloudPath):
355
360
  return len(data)
356
361
 
357
362
  def write_text(self, data: str, encoding: str = "utf-8") -> int:
358
- encoded = data.encode(encoding)
359
- self._client.put_object(Bucket=self._bucket_name, Key=self._key, Body=encoded)
360
- return len(encoded)
363
+ return self.write_bytes(data.encode(encoding))
361
364
 
362
365
  def touch(self, mode: int = 0o666, exist_ok: bool = True) -> None:
363
366
  if self._object_exists():
@@ -428,42 +431,91 @@ class S3Path(CloudPath):
428
431
  return cls(bucket, key, _client=_client)
429
432
 
430
433
 
434
+ class _S3ReadBuffer(io.RawIOBase):
435
+ def __init__(self, body):
436
+ self._body = body
437
+
438
+ def readable(self) -> bool:
439
+ return True
440
+
441
+ def readinto(self, b) -> int:
442
+ chunk = self._body.read(len(b))
443
+ if not chunk:
444
+ return 0
445
+ n = len(chunk)
446
+ b[:n] = chunk
447
+ return n
448
+
449
+ def close(self) -> None:
450
+ if not self.closed:
451
+ try:
452
+ self._body.close()
453
+ finally:
454
+ super().close()
455
+
456
+
431
457
  class _S3WriteBuffer(io.RawIOBase):
432
- def __init__(self, client, bucket, key, binary, encoding, errors, newline):
458
+ def __init__(self, client, bucket, key):
433
459
  self._client = client
434
460
  self._bucket = bucket
435
461
  self._key = key
436
- self._binary = binary
437
- self._encoding = encoding
438
- self._errors = errors
439
- self._newline = newline
440
- self._buf = io.BytesIO()
441
- self._text_wrapper = None
442
- if not binary:
443
- self._text_wrapper = io.TextIOWrapper(
444
- self._buf, encoding=encoding, errors=errors, newline=newline
445
- )
462
+ self._buf = bytearray()
463
+ self._parts: list[dict] = []
464
+ self._part_num = 1
465
+ resp = client.create_multipart_upload(Bucket=bucket, Key=key)
466
+ self._upload_id = resp["UploadId"]
467
+
468
+ def writable(self) -> bool:
469
+ return True
446
470
 
447
471
  def write(self, data) -> int:
448
- if self._text_wrapper:
449
- return self._text_wrapper.write(data)
450
- return self._buf.write(data)
472
+ self._buf.extend(data)
473
+ while len(self._buf) >= _UPLOAD_CHUNK_SIZE:
474
+ self._flush_part(_UPLOAD_CHUNK_SIZE)
475
+ return len(data)
451
476
 
452
- def close(self) -> None:
453
- if not self.closed:
454
- if self._text_wrapper:
455
- self._text_wrapper.flush()
456
- self._buf.seek(0)
457
- self._client.put_object(
458
- Bucket=self._bucket, Key=self._key, Body=self._buf.read()
459
- )
460
- super().close()
477
+ def _flush_part(self, size: int) -> None:
478
+ chunk = bytes(self._buf[:size])
479
+ del self._buf[:size]
480
+ resp = self._client.upload_part(
481
+ Bucket=self._bucket,
482
+ Key=self._key,
483
+ PartNumber=self._part_num,
484
+ UploadId=self._upload_id,
485
+ Body=chunk,
486
+ )
487
+ self._parts.append({"PartNumber": self._part_num, "ETag": resp["ETag"]})
488
+ self._part_num += 1
461
489
 
462
- def __enter__(self):
463
- return self
490
+ def _safe_abort(self) -> None:
491
+ try:
492
+ self._client.abort_multipart_upload(
493
+ Bucket=self._bucket, Key=self._key, UploadId=self._upload_id
494
+ )
495
+ except Exception:
496
+ pass
464
497
 
465
- def __exit__(self, *args):
466
- self.close()
498
+ def close(self) -> None:
499
+ if self.closed:
500
+ return
501
+ try:
502
+ if self._buf:
503
+ self._flush_part(len(self._buf))
504
+ if self._parts:
505
+ self._client.complete_multipart_upload(
506
+ Bucket=self._bucket,
507
+ Key=self._key,
508
+ UploadId=self._upload_id,
509
+ MultipartUpload={"Parts": self._parts},
510
+ )
511
+ else:
512
+ self._safe_abort()
513
+ self._client.put_object(Bucket=self._bucket, Key=self._key, Body=b"")
514
+ except Exception:
515
+ self._safe_abort()
516
+ raise
517
+ finally:
518
+ super().close()
467
519
 
468
520
 
469
521
  class S3StatResult:
@@ -0,0 +1 @@
1
+ VERSION = "0.3.0"
@@ -1,22 +1,22 @@
1
1
  [project]
2
- name = "cloudfs"
3
- version = "0.2.0"
4
- description = "An interface to interact with cloud storage as if it's a local filesystem."
5
2
  authors = [{ name = "Allen Chou", email = "f1470891079@gmail.com" }]
3
+ dependencies = []
4
+ description = "An interface to interact with cloud storage as if it's a local filesystem."
6
5
  license = "Apache-2.0"
6
+ name = "cloudfs"
7
7
  readme = "README.md"
8
8
  requires-python = ">=3.11,<4.0"
9
- dependencies = []
9
+ version = "0.3.0"
10
10
 
11
11
  [project.optional-dependencies]
12
- google = ["google-cloud-storage>=3,<4"]
13
- s3 = ["boto3>=1.35,<2"]
14
- azure = ["azure-storage-blob>=12,<13"]
15
12
  all = [
16
- "google-cloud-storage>=3,<4",
17
- "boto3>=1.35,<2",
18
13
  "azure-storage-blob>=12,<13",
14
+ "boto3>=1.35,<2",
15
+ "google-cloud-storage>=3,<4",
19
16
  ]
17
+ azure = ["azure-storage-blob>=12,<13"]
18
+ google = ["google-cloud-storage>=3,<4"]
19
+ s3 = ["boto3>=1.35,<2"]
20
20
 
21
21
  [dependency-groups]
22
22
  dev = [
@@ -29,19 +29,29 @@ dev = [
29
29
  "pytest",
30
30
  "pytest-xdist",
31
31
  "python-dotenv",
32
+ "ruff",
32
33
  ]
33
34
 
34
35
  [tool.black]
36
+ line-length = 88
35
37
  target-version = ["py311"]
36
38
 
37
39
  [tool.flake8]
38
- max-line-length = 88
39
40
  extend-ignore = ["E203"]
41
+ max-line-length = 88
40
42
 
41
43
  [tool.isort]
42
- profile = "black"
43
44
  line_length = 88
45
+ profile = "black"
46
+
47
+ [tool.ruff]
48
+ line-length = 88
49
+ target-version = "py311"
50
+
51
+ [tool.ruff.lint]
52
+ # E203 (whitespace before ':') conflicts with black; mirror the flake8 ignore.
53
+ extend-ignore = ["E203"]
44
54
 
45
55
  [build-system]
46
- requires = ["poetry-core>=2.0.0,<3.0.0"]
47
56
  build-backend = "poetry.core.masonry.api"
57
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
@@ -1 +0,0 @@
1
- VERSION = "0.2.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes