rclone-api 1.3.27__py2.py3-none-any.whl → 1.4.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. rclone_api/__init__.py +491 -4
  2. rclone_api/cmd/copy_large_s3.py +17 -10
  3. rclone_api/db/db.py +3 -3
  4. rclone_api/detail/copy_file_parts.py +382 -0
  5. rclone_api/dir.py +1 -1
  6. rclone_api/dir_listing.py +1 -1
  7. rclone_api/file.py +8 -0
  8. rclone_api/file_part.py +198 -0
  9. rclone_api/file_stream.py +52 -0
  10. rclone_api/http_server.py +15 -21
  11. rclone_api/{rclone.py → rclone_impl.py} +153 -321
  12. rclone_api/remote.py +3 -3
  13. rclone_api/rpath.py +11 -4
  14. rclone_api/s3/chunk_task.py +3 -19
  15. rclone_api/s3/multipart/file_info.py +7 -0
  16. rclone_api/s3/multipart/finished_piece.py +38 -0
  17. rclone_api/s3/multipart/upload_info.py +62 -0
  18. rclone_api/s3/{chunk_types.py → multipart/upload_state.py} +3 -99
  19. rclone_api/s3/s3_multipart_uploader.py +138 -0
  20. rclone_api/s3/types.py +1 -1
  21. rclone_api/s3/upload_file_multipart.py +14 -14
  22. rclone_api/scan_missing_folders.py +1 -1
  23. rclone_api/types.py +136 -165
  24. rclone_api/util.py +22 -2
  25. {rclone_api-1.3.27.dist-info → rclone_api-1.4.1.dist-info}/METADATA +1 -1
  26. rclone_api-1.4.1.dist-info/RECORD +55 -0
  27. rclone_api/mount_read_chunker.py +0 -130
  28. rclone_api/profile/mount_copy_bytes.py +0 -311
  29. rclone_api-1.3.27.dist-info/RECORD +0 -50
  30. /rclone_api/{walk.py → detail/walk.py} +0 -0
  31. {rclone_api-1.3.27.dist-info → rclone_api-1.4.1.dist-info}/LICENSE +0 -0
  32. {rclone_api-1.3.27.dist-info → rclone_api-1.4.1.dist-info}/WHEEL +0 -0
  33. {rclone_api-1.3.27.dist-info → rclone_api-1.4.1.dist-info}/entry_points.txt +0 -0
  34. {rclone_api-1.3.27.dist-info → rclone_api-1.4.1.dist-info}/top_level.txt +0 -0
rclone_api/detail/copy_file_parts.py ADDED
@@ -0,0 +1,382 @@
+ import _thread
+ import hashlib
+ import json
+ import os
+ import threading
+ import warnings
+ from concurrent.futures import Future, ThreadPoolExecutor
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+ from tempfile import TemporaryDirectory
+
+ from rclone_api.dir_listing import DirListing
+ from rclone_api.http_server import HttpServer
+ from rclone_api.rclone_impl import RcloneImpl
+ from rclone_api.types import (
+     PartInfo,
+     Range,
+     SizeSuffix,
+ )
+
+
+ @dataclass
+ class UploadPart:
+     chunk: Path
+     dst_part: str
+     exception: Exception | None = None
+     finished: bool = False
+
+     def dispose(self):
+         try:
+             if self.chunk.exists():
+                 self.chunk.unlink()
+             self.finished = True
+         except Exception as e:
+             warnings.warn(f"Failed to delete file {self.chunk}: {e}")
+
+     def __del__(self):
+         self.dispose()
+
+
+ def _gen_name(part_number: int, offset: SizeSuffix, end: SizeSuffix) -> str:
+     return f"part.{part_number:05d}_{offset.as_int()}-{end.as_int()}"
+
+
+ def upload_task(self: RcloneImpl, upload_part: UploadPart) -> UploadPart:
+     try:
+         if upload_part.exception is not None:
+             return upload_part
+         self.copy_to(upload_part.chunk.as_posix(), upload_part.dst_part)
+         return upload_part
+     except Exception as e:
+         upload_part.exception = e
+         return upload_part
+     finally:
+         upload_part.dispose()
+
+
+ def read_task(
+     http_server: HttpServer,
+     src_name: str,
+     tmpdir: Path,
+     offset: SizeSuffix,
+     length: SizeSuffix,
+     part_dst: str,
+ ) -> UploadPart:
+     outchunk: Path = tmpdir / f"{offset.as_int()}-{(offset + length).as_int()}.chunk"
+     range = Range(offset.as_int(), (offset + length).as_int())
+
+     try:
+         err = http_server.download(
+             path=src_name,
+             range=range,
+             dst=outchunk,
+         )
+         if isinstance(err, Exception):
+             out = UploadPart(chunk=outchunk, dst_part="", exception=err)
+             out.dispose()
+             return out
+         return UploadPart(chunk=outchunk, dst_part=part_dst)
+     except KeyboardInterrupt as ke:
+         _thread.interrupt_main()
+         raise ke
+     except SystemExit as se:
+         _thread.interrupt_main()
+         raise se
+     except Exception as e:
+         return UploadPart(chunk=outchunk, dst_part=part_dst, exception=e)
+
+
+ def _fetch_all_names(
+     self: RcloneImpl,
+     src: str,
+ ) -> list[str]:
+     dl: DirListing = self.ls(src)
+     files = dl.files
+     filenames: list[str] = [f.name for f in files]
+     filtered: list[str] = [f for f in filenames if f.startswith("part.")]
+     return filtered
+
+
+ def _get_info_json(self: RcloneImpl, src: str, src_info: str) -> dict:
+     from rclone_api.file import File
+
+     src_stat: File | Exception = self.stat(src)
+     if isinstance(src_stat, Exception):
+         raise FileNotFoundError(f"Failed to stat {src}: {src_stat}")
+
+     now: datetime = datetime.now()
+     new_data = {
+         "new": True,
+         "created": now.isoformat(),
+         "src": src,
+         "src_modtime": src_stat.mod_time(),
+         "size": src_stat.size,
+         "chunksize": None,
+         "chunksize_int": None,
+         "first_part": None,
+         "last_part": None,
+         "hash": None,
+     }
+
+     text_or_err = self.read_text(src_info)
+     if isinstance(text_or_err, Exception):
+         warnings.warn(f"Failed to read {src_info}: {text_or_err}")
+         return new_data
+     assert isinstance(text_or_err, str)
+     text: str = text_or_err
+
+     data: dict = {}
+     try:
+         data = json.loads(text)
+         return data
+     except Exception as e:
+         warnings.warn(f"Failed to parse JSON: {e} at {src_info}")
+         return new_data
+
+
+ def _save_info_json(self: RcloneImpl, src: str, data: dict) -> None:
+     data = data.copy()
+     data["new"] = False
+     # hash
+
+     h = hashlib.md5()
+     tmp = [
+         data.get("src"),
+         data.get("src_modtime"),
+         data.get("size"),
+         data.get("chunksize_int"),
+     ]
+     data_vals: list[str] = [str(v) for v in tmp]
+     str_data = "".join(data_vals)
+     h.update(str_data.encode("utf-8"))
+     data["hash"] = h.hexdigest()
+     json_str = json.dumps(data, indent=0)
+     self.write_text(dst=src, text=json_str)
+
+
+ class InfoJson:
+     def __init__(self, rclone: RcloneImpl, src: str, src_info: str) -> None:
+         self.rclone = rclone
+         self.src = src
+         self.src_info = src_info
+         self.data: dict = {}
+
+     def load(self) -> bool:
+         self.data = _get_info_json(self.rclone, self.src, self.src_info)
+         return not self.data.get("new", False)
+
+     def save(self) -> None:
+         _save_info_json(self.rclone, self.src_info, self.data)
+
+     def print(self) -> None:
+         self.rclone.print(self.src_info)
+
+     def fetch_all_finished(self) -> list[str]:
+         parent_path = os.path.dirname(self.src_info)
+         out = _fetch_all_names(self.rclone, parent_path)
+         return out
+
+     def fetch_all_finished_part_numbers(self) -> list[int]:
+         names = self.fetch_all_finished()
+         part_numbers = [int(name.split("_")[0].split(".")[1]) for name in names]
+         return part_numbers
+
+     @property
+     def new(self) -> bool:
+         return self.data.get("new", False)
+
+     @property
+     def chunksize(self) -> SizeSuffix | None:
+         chunksize: str | None = self.data.get("chunksize")
+         if chunksize is None:
+             return None
+         return SizeSuffix(chunksize)
+
+     @chunksize.setter
+     def chunksize(self, value: SizeSuffix) -> None:
+         self.data["chunksize"] = str(value)
+         self.data["chunksize_int"] = value.as_int()
+
+     @property
+     def src_modtime(self) -> datetime:
+         return datetime.fromisoformat(self.data["src_modtime"])
+
+     @src_modtime.setter
+     def src_modtime(self, value: datetime) -> None:
+         self.data["src_modtime"] = value.isoformat()
+
+     @property
+     def first_part(self) -> int | None:
+         return self.data.get("first_part")
+
+     @first_part.setter
+     def first_part(self, value: int) -> None:
+         self.data["first_part"] = value
+
+     @property
+     def last_part(self) -> int | None:
+         return self.data.get("last_part")
+
+     @last_part.setter
+     def last_part(self, value: int) -> None:
+         self.data["last_part"] = value
+
+     @property
+     def hash(self) -> str | None:
+         return self.data.get("hash")
+
+     def to_json_str(self) -> str:
+         return json.dumps(self.data)
+
+     def __repr__(self):
+         return f"InfoJson({self.src}, {self.src_info}, {self.data})"
+
+     def __str__(self):
+         return self.to_json_str()
+
+
+ def copy_file_parts(
+     self: RcloneImpl,
+     src: str,  # src:/Bucket/path/myfile.large.zst
+     dst_dir: str,  # dst:/Bucket/path/myfile.large.zst-parts/
+     part_infos: list[PartInfo] | None = None,
+     threads: int = 1,
+ ) -> Exception | None:
+     """Copy parts of a file from source to destination."""
+     if dst_dir.endswith("/"):
+         dst_dir = dst_dir[:-1]
+     src_size = self.size_file(src)
+     if isinstance(src_size, Exception):
+         return src_size
+
+     part_info: PartInfo
+     src_dir = os.path.dirname(src)
+     src_name = os.path.basename(src)
+     http_server: HttpServer
+
+     full_part_infos: list[PartInfo] | Exception = PartInfo.split_parts(
+         src_size, SizeSuffix("96MB")
+     )
+     if isinstance(full_part_infos, Exception):
+         return full_part_infos
+     assert isinstance(full_part_infos, list)
+
+     if part_infos is None:
+         part_infos = full_part_infos.copy()
+
+     all_part_numbers: list[int] = [p.part_number for p in part_infos]
+     src_info_json = f"{dst_dir}/info.json"
+     info_json = InfoJson(self, src, src_info_json)
+
+     if not info_json.load():
+         print(f"New: {src_info_json}")
+
+     all_numbers_already_done: set[int] = set(
+         info_json.fetch_all_finished_part_numbers()
+     )
+     print(f"all_numbers_already_done: {sorted(list(all_numbers_already_done))}")
+
+     filtered_part_infos: list[PartInfo] = []
+     for part_info in part_infos:
+         if part_info.part_number not in all_numbers_already_done:
+             filtered_part_infos.append(part_info)
+     part_infos = filtered_part_infos
+
+     remaining_part_numbers: list[int] = [p.part_number for p in part_infos]
+     print(f"remaining_part_numbers: {remaining_part_numbers}")
+
+     if len(part_infos) == 0:
+         return Exception(f"No parts to copy for {src}")
+     chunk_size = SizeSuffix(part_infos[0].range.end - part_infos[0].range.start)
+
+     info_json.chunksize = chunk_size
+     info_json.first_part = part_infos[0].part_number
+     info_json.last_part = part_infos[-1].part_number
+     info_json.save()
+
+     # We are now validated
+     info_json.load()
+     info_json.print()
+
+     print(info_json)
+
+     finished_tasks: list[UploadPart] = []
+
+     with self.serve_http(src_dir) as http_server:
+         with TemporaryDirectory() as tmp_dir:
+             tmpdir: Path = Path(tmp_dir)
+             write_semaphore = threading.Semaphore(threads)
+             with ThreadPoolExecutor(max_workers=threads) as upload_executor:
+                 with ThreadPoolExecutor(max_workers=threads) as read_executor:
+                     for part_info in part_infos:
+                         part_number: int = part_info.part_number
+                         range: Range = part_info.range
+                         offset: SizeSuffix = SizeSuffix(range.start)
+                         length: SizeSuffix = SizeSuffix(range.end - range.start)
+                         end = offset + length
+                         suffix = _gen_name(part_number, offset, end)
+                         part_dst = f"{dst_dir}/{suffix}"
+
+                         def _read_task(
+                             src_name=src_name,
+                             http_server=http_server,
+                             tmpdir=tmpdir,
+                             offset=offset,
+                             length=length,
+                             part_dst=part_dst,
+                         ) -> UploadPart:
+                             return read_task(
+                                 src_name=src_name,
+                                 http_server=http_server,
+                                 tmpdir=tmpdir,
+                                 offset=offset,
+                                 length=length,
+                                 part_dst=part_dst,
+                             )
+
+                         read_fut: Future[UploadPart] = read_executor.submit(_read_task)
+
+                         # Releases the semaphore when the write task is done
+                         def queue_upload_task(
+                             read_fut=read_fut,
+                         ) -> None:
+                             upload_part = read_fut.result()
+                             upload_fut: Future[UploadPart] = upload_executor.submit(
+                                 upload_task, self, upload_part
+                             )
+                             # SEMAPHORE RELEASE!!!
+                             upload_fut.add_done_callback(
+                                 lambda _: write_semaphore.release()
+                             )
+                             upload_fut.add_done_callback(
+                                 lambda fut: finished_tasks.append(fut.result())
+                             )
+
+                         read_fut.add_done_callback(queue_upload_task)
+                         # SEMAPHORE ACQUIRE!!!
+                         # If we are back filled on the writers, then we stall.
+                         write_semaphore.acquire()
+
+     exceptions: list[Exception] = [
+         t.exception for t in finished_tasks if t.exception is not None
+     ]
+     if len(exceptions) > 0:
+         return Exception(f"Failed to copy parts: {exceptions}", exceptions)
+
+     finished_parts: list[int] = info_json.fetch_all_finished_part_numbers()
+     print(f"finished_parts: {finished_parts}")
+
+     diff_set = set(all_part_numbers).symmetric_difference(set(finished_parts))
+     all_part_numbers_done = len(diff_set) == 0
+     print(f"all_part_numbers_done: {all_part_numbers_done}")
+     return None
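Taken together, copy_file_parts splits a large remote object into part.NNNNN_<start>-<end> objects under the destination directory, records the chunk size and part bounds in an info.json sidecar, and skips part numbers that already exist, so an interrupted run resumes where it left off. A minimal usage sketch, assuming an already-constructed RcloneImpl and hypothetical remote paths:

    from rclone_api.detail.copy_file_parts import copy_file_parts

    # rclone: RcloneImpl, constructed elsewhere; the remotes below are hypothetical.
    err = copy_file_parts(
        rclone,
        src="src:Bucket/path/myfile.large.zst",
        dst_dir="dst:Bucket/path/myfile.large.zst-parts/",
        part_infos=None,  # None: split the whole file into 96MB parts
        threads=4,        # bounds both the read pool and the upload pool
    )
    if err is not None:
        raise err
    # A re-run lists the -parts/ directory and copies only the missing parts.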
rclone_api/dir.py CHANGED
@@ -72,7 +72,7 @@ class Dir:
          self, breadth_first: bool, max_depth: int = -1
      ) -> Generator[DirListing, None, None]:
          """List files and directories in the given path."""
-         from rclone_api.walk import walk
+         from rclone_api.detail.walk import walk
  
          assert self.path.rclone is not None
          return walk(self, breadth_first=breadth_first, max_depth=max_depth)
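The only change here follows the module move (rclone_api/walk.py → rclone_api/detail/walk.py, item 30 in the list above). Callers that imported walk directly would update the same way:

    from rclone_api.walk import walk         # 1.3.27
    from rclone_api.detail.walk import walk  # 1.4.1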
rclone_api/dir_listing.py CHANGED
@@ -42,7 +42,7 @@ class DirListing:
      def __str__(self) -> str:
          n_files = len(self.files)
          n_dirs = len(self.dirs)
-         msg = f"Files: {n_files}\n"
+         msg = f"\nFiles: {n_files}\n"
          if n_files > 0:
              for f in self.files:
                  msg += f" {f}\n"
rclone_api/file.py CHANGED
@@ -1,6 +1,7 @@
  import json
  import warnings
  from dataclasses import dataclass
+ from datetime import datetime
  from pathlib import Path
  
  from rclone_api.rpath import RPath
@@ -146,6 +147,13 @@ class File:
      def name(self) -> str:
          return self.path.name
  
+     def mod_time(self) -> str:
+         return self.path.mod_time
+
+     def mod_time_dt(self) -> datetime:
+         """Return the modification time as a datetime object."""
+         return self.path.mod_time_dt()
+
      def read_text(self) -> str:
          """Read the file contents as bytes.
  
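File.mod_time() exposes the raw modification-time string from the underlying RPath, while mod_time_dt() parses it into a datetime; copy_file_parts.py above uses the string form in its info.json. A brief sketch, assuming rclone is an RcloneImpl and the remote path is hypothetical:

    stat = rclone.stat("remote:Bucket/path/file.txt")  # File | Exception
    if not isinstance(stat, Exception):
        print(stat.mod_time())     # modification time as reported by rclone (str)
        print(stat.mod_time_dt())  # parsed datetime, usable for comparisons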
rclone_api/file_part.py ADDED
@@ -0,0 +1,198 @@
+ import atexit
+ import os
+ import time
+ import warnings
+ from pathlib import Path
+ from threading import Lock
+ from typing import Any
+
+ _TMP_DIR_ACCESS_LOCK = Lock()
+
+
+ def _clean_old_files(out: Path) -> None:
+     # clean up files older than 1 day
+     from rclone_api.util import locked_print
+
+     now = time.time()
+     # Erase all stale files and then purge empty directories.
+     for root, dirs, files in os.walk(out):
+         for name in files:
+             f = Path(root) / name
+             filemod = f.stat().st_mtime
+             diff_secs = now - filemod
+             diff_days = diff_secs / (60 * 60 * 24)
+             if diff_days > 1:
+                 locked_print(f"Removing old file: {f}")
+                 f.unlink()
+
+     for root, dirs, _ in os.walk(out):
+         for dir in dirs:
+             d = Path(root) / dir
+             if not list(d.iterdir()):
+                 locked_print(f"Removing empty directory: {d}")
+                 d.rmdir()
+
+
+ def get_chunk_tmpdir() -> Path:
+     with _TMP_DIR_ACCESS_LOCK:
+         dat = get_chunk_tmpdir.__dict__
+         if "out" in dat:
+             return dat["out"]  # Folder already validated.
+         out = Path("chunk_store")
+         if out.exists():
+             # first access, clean up directory
+             _clean_old_files(out)
+         out.mkdir(exist_ok=True, parents=True)
+         dat["out"] = out
+         return out
+
+
+ _CLEANUP_LIST: list[Path] = []
+
+
+ def _add_for_cleanup(path: Path) -> None:
+     _CLEANUP_LIST.append(path)
+
+
+ def _on_exit_cleanup() -> None:
+     paths = list(_CLEANUP_LIST)
+     for path in paths:
+         try:
+             if path.exists():
+                 path.unlink()
+         except Exception as e:
+             warnings.warn(f"Cannot cleanup {path}: {e}")
+
+
+ atexit.register(_on_exit_cleanup)
+
+
+ _FILEPARTS: list["FilePart"] = []
+
+ _FILEPARTS_LOCK = Lock()
+
+
+ def _add_filepart(part: "FilePart") -> None:
+     with _FILEPARTS_LOCK:
+         if part not in _FILEPARTS:
+             _FILEPARTS.append(part)
+
+
+ def _remove_filepart(part: "FilePart") -> None:
+     with _FILEPARTS_LOCK:
+         if part in _FILEPARTS:
+             _FILEPARTS.remove(part)
+
+
+ def run_debug_parts():
+     while True:
+         print("\nAlive file parts:")
+         for part in list(_FILEPARTS):
+             print(part)
+             # print(part.stacktrace)
+         print("\n\n")
+         time.sleep(60)
+
+
+ # dbg_thread = threading.Thread(target=run_debug_parts)
+ # dbg_thread.start()
+
+
+ class FilePart:
+     def __init__(self, payload: Path | bytes | Exception, extra: Any) -> None:
+         import traceback
+
+         from rclone_api.util import random_str
+
+         stacktrace = traceback.format_stack()
+         stacktrace_str = "".join(stacktrace)
+         self.stacktrace = stacktrace_str
+         _add_filepart(self)
+
+         self.extra = extra
+         self._lock = Lock()
+         self.payload: Path | Exception
+         if isinstance(payload, Exception):
+             self.payload = payload
+             return
+         if isinstance(payload, bytes):
+             print(f"Creating file part with payload: {len(payload)}")
+             self.payload = get_chunk_tmpdir() / f"{random_str(12)}.chunk"
+             with _TMP_DIR_ACCESS_LOCK:
+                 if not self.payload.parent.exists():
+                     self.payload.parent.mkdir(parents=True, exist_ok=True)
+             self.payload.write_bytes(payload)
+             _add_for_cleanup(self.payload)
+         if isinstance(payload, Path):
+             print("Adopting payload: ", payload)
+             self.payload = payload
+             _add_for_cleanup(self.payload)
+
+     def get_file(self) -> Path | Exception:
+         return self.payload
+
+     @property
+     def size(self) -> int:
+         with self._lock:
+             if isinstance(self.payload, Path):
+                 return self.payload.stat().st_size
+             return -1
+
+     def n_bytes(self) -> int:
+         with self._lock:
+             if isinstance(self.payload, Path):
+                 return self.payload.stat().st_size
+             return -1
+
+     def load(self) -> bytes:
+         with self._lock:
+             if isinstance(self.payload, Path):
+                 with open(self.payload, "rb") as f:
+                     return f.read()
+             raise ValueError("Cannot load from error")
+
+     def __post_init__(self):
+         if isinstance(self.payload, Path):
+             assert self.payload.exists(), f"File part {self.payload} does not exist"
+             assert self.payload.is_file(), f"File part {self.payload} is not a file"
+             assert self.payload.stat().st_size > 0, f"File part {self.payload} is empty"
+         elif isinstance(self.payload, Exception):
+             warnings.warn(f"File part error: {self.payload}")
+         print(f"File part created with payload: {self.payload}")
+
+     def is_error(self) -> bool:
+         return isinstance(self.payload, Exception)
+
+     def dispose(self) -> None:
+         _remove_filepart(self)
+         print("Disposing file part")
+         with self._lock:
+             if isinstance(self.payload, Exception):
+                 warnings.warn(
+                     f"Cannot close file part because the payload represents an error: {self.payload}"
+                 )
+                 print("Cannot close file part because the payload represents an error")
+                 return
+             if self.payload.exists():
+                 print(f"File part {self.payload} exists")
+                 try:
+                     print(f"Unlinking file part {self.payload}")
+                     self.payload.unlink()
+                     print(f"File part {self.payload} deleted")
+                 except Exception as e:
+                     warnings.warn(f"Cannot close file part because of error: {e}")
+             else:
+                 warnings.warn(
+                     f"Cannot close file part because it does not exist: {self.payload}"
+                 )
+
+     def __del__(self):
+         self.dispose()
+
+     def __repr__(self):
+         from rclone_api.types import SizeSuffix
+
+         payload_str = "err" if self.is_error() else f"{SizeSuffix(self.n_bytes())}"
+         return f"FilePart(payload={payload_str}, extra={self.extra})"
rclone_api/file_stream.py ADDED
@@ -0,0 +1,52 @@
+ """
+ Streaming file listing: iterate FileItem objects from a running rclone process.
+ """
+
+ from typing import Generator
+
+ from rclone_api.file import FileItem
+ from rclone_api.process import Process
+
+
+ class FilesStream:
+
+     def __init__(self, path: str, process: Process) -> None:
+         self.path = path
+         self.process = process
+
+     def __enter__(self) -> "FilesStream":
+         self.process.__enter__()
+         return self
+
+     def __exit__(self, *exc_info):
+         self.process.__exit__(*exc_info)
+
+     def files(self) -> Generator[FileItem, None, None]:
+         line: bytes
+         for line in self.process.stdout:
+             linestr: str = line.decode("utf-8").strip()
+             if linestr.startswith("["):
+                 continue
+             if linestr.endswith(","):
+                 linestr = linestr[:-1]
+             if linestr.endswith("]"):
+                 continue
+             fileitem: FileItem | None = FileItem.from_json_str(self.path, linestr)
+             if fileitem is None:
+                 continue
+             yield fileitem
+
+     def files_paged(
+         self, page_size: int = 1000
+     ) -> Generator[list[FileItem], None, None]:
+         page: list[FileItem] = []
+         for fileitem in self.files():
+             page.append(fileitem)
+             if len(page) >= page_size:
+                 yield page
+                 page = []
+         if len(page) > 0:
+             yield page
+
+     def __iter__(self) -> Generator[FileItem, None, None]:
+         return self.files()
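FilesStream adapts the streaming JSON array an rclone listing process writes to stdout into FileItem objects, one per line, with optional paging. A consumption sketch; the entry point that returns a FilesStream is not shown in this diff, so stream is assumed:

    # stream: FilesStream, obtained from the library's listing API (assumed).
    with stream:  # enters/exits the underlying Process
        for page in stream.files_paged(page_size=500):
            for item in page:  # item: FileItem
                print(item)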