s4fs 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
s4fs-1.1.0/PKG-INFO ADDED
@@ -0,0 +1,107 @@
1
+ Metadata-Version: 2.4
2
+ Name: s4fs
3
+ Version: 1.1.0
4
+ Summary: fsspec adapter for reading S4 gateway-written objects (S4F2 framed + compressed) directly from the backend — no gateway required. Read-only by design.
5
+ Author: abyo software 合同会社
6
+ Author-email: masumi-ryugo <abyo.software@gmail.com>
7
+ License: Apache-2.0
8
+ Project-URL: Homepage, https://github.com/abyo-software/s4
9
+ Project-URL: Repository, https://github.com/abyo-software/s4
10
+ Project-URL: Issues, https://github.com/abyo-software/s4/issues
11
+ Keywords: s3,fsspec,compression,zstd,pandas,pyarrow
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: System :: Archiving :: Compression
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: fsspec>=2023.1.0
20
+ Requires-Dist: s4-codec<2,>=1.1.0
21
+ Provides-Extra: s3
22
+ Requires-Dist: s3fs>=2023.1.0; extra == "s3"
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=7.0; extra == "dev"
25
+
26
+ # s4fs — fsspec filesystem for S4 objects (no gateway required)
27
+
28
+ `s4fs` lets pandas / pyarrow / DuckDB / Polars (anything fsspec-aware) read
29
+ [S4](https://github.com/abyo-software/s4) gateway-written objects **directly
30
+ from the S3 backend**. Objects are transparently decompressed on read,
31
+ `ls`/`info` report the original (decompressed) sizes, and range reads use the
32
+ `<key>.s4index` sidecar to fetch + decode only the frames that overlap the
33
+ requested range. Objects that never went through the gateway pass through
34
+ byte-for-byte. This is the lock-in escape hatch: if you stop running the
35
+ gateway, your data stays readable.
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install -e python/s4fs[s3] # from a source checkout
41
+ # requires the s4-codec wheel: cd crates/s4-codec-py && maturin build --release
42
+ ```
43
+
44
+ ## Use
45
+
46
+ ```python
47
+ import pandas as pd
48
+ opts = {"target_options": {"endpoint_url": "http://backend:9000"}}
49
+ df = pd.read_parquet("s4://bucket/data.parquet", storage_options=opts)
50
+ ```
51
+
52
+ ```python
53
+ import fsspec, pyarrow.parquet as pq
54
+ fs = fsspec.filesystem("s4", target_options={"endpoint_url": "http://backend:9000"})
55
+ table = pq.read_table("bucket/data.parquet", filesystem=fs)
56
+ ```
57
+
58
+ ```python
59
+ import duckdb
60
+ con = duckdb.connect(); con.register_filesystem(fs)
61
+ con.sql("SELECT count(*) FROM read_parquet('s4://bucket/data.parquet')")
62
+ ```
63
+
64
+ Any underlying fsspec filesystem can be injected instead of s3fs:
65
+ `S4FileSystem(fs=my_fs)` (used by the unit tests with an in-memory stub).
66
+
67
+ ## Decoded formats
68
+
69
+ - S4F2-framed objects (single-PUT and multipart), S4P1 padding skipped
70
+ - codecs: `passthrough`, `cpu-zstd`, `cpu-gzip`, `cpu-zstd-dict`
71
+ (dictionaries are fetched from `.s4dict/<id>` and fingerprint-verified)
72
+ - unframed gateway objects carrying a metadata manifest (`cpu-gzip`,
73
+ legacy raw zstd, `passthrough`)
74
+ - `.s4index` sidecars v1/v2/v3 with ETag staleness checks (a stale sidecar
75
+ falls back to a full-object read)
76
+
77
+ ## Limitations
78
+
79
+ - **Read-only.** All write APIs raise `NotImplementedError` — write through
80
+ the S4 gateway, which owns the framing / sidecar / metadata contract.
81
+ - **GPU frames are refused loudly.** `nvcomp-*` / `dietgpu-ans` frames raise
82
+ `NotImplementedError` (decode them through the gateway); s4fs never
83
+ returns silently-wrong bytes.
84
+ - **SSE-encrypted objects are refused loudly.** Reads raise
85
+ `NotImplementedError` (the keyring / KMS / SSE-C key lives in the
86
+ gateway — read encrypted objects through the gateway). Detection is
87
+ threefold: the `s4-encrypted` object metadata stamp, the sidecar's v3
88
+ SSE binding, and the `S4E1`–`S4E6` envelope magic in the body; s4fs
89
+ never returns ciphertext as if it were data.
90
+ - Exact-size resolution in `ls`/`info` may cost one extra backend request
91
+ per object (sidecar GET or metadata HEAD); results are cached per
92
+ filesystem instance.
93
+ - Range reads on framed objects without a usable sidecar fall back to a
94
+ full-object read (with a warning when the object is multi-frame).
95
+ Legacy v1 sidecars (no source ETag/size binding) are treated as
96
+ unusable — they cannot be tied to the live object.
97
+ - `open()` refuses framed objects whose original size is inexact (no
98
+ usable sidecar, no `s4-original-size` metadata) instead of silently
99
+ truncating buffered reads at the compressed size; opt back in with
100
+ `S4FileSystem(allow_inexact_open=True)`. `cat_file()` is unaffected.
101
+
102
+ ## Tests
103
+
104
+ ```bash
105
+ pytest python/s4fs/tests # unit (gateway-captured fixtures)
106
+ pytest python/s4fs/tests/test_e2e_minio.py -m e2e # docker + MinIO + real gateway
107
+ ```
s4fs-1.1.0/README.md ADDED
@@ -0,0 +1,82 @@
1
+ # s4fs — fsspec filesystem for S4 objects (no gateway required)
2
+
3
+ `s4fs` lets pandas / pyarrow / DuckDB / Polars (anything fsspec-aware) read
4
+ [S4](https://github.com/abyo-software/s4) gateway-written objects **directly
5
+ from the S3 backend**. Objects are transparently decompressed on read,
6
+ `ls`/`info` report the original (decompressed) sizes, and range reads use the
7
+ `<key>.s4index` sidecar to fetch + decode only the frames that overlap the
8
+ requested range. Objects that never went through the gateway pass through
9
+ byte-for-byte. This is the lock-in escape hatch: if you stop running the
10
+ gateway, your data stays readable.
11
+
12
+ ## Install
13
+
14
+ ```bash
15
+ pip install -e "python/s4fs[s3]" # from a source checkout
16
+ # requires the s4-codec wheel: cd crates/s4-codec-py && maturin build --release
17
+ ```
18
+
19
+ ## Use
20
+
21
+ ```python
22
+ import pandas as pd
23
+ opts = {"target_options": {"endpoint_url": "http://backend:9000"}}
24
+ df = pd.read_parquet("s4://bucket/data.parquet", storage_options=opts)
25
+ ```
26
+
27
+ ```python
28
+ import fsspec, pyarrow.parquet as pq
29
+ fs = fsspec.filesystem("s4", target_options={"endpoint_url": "http://backend:9000"})
30
+ table = pq.read_table("bucket/data.parquet", filesystem=fs)
31
+ ```
32
+
33
+ ```python
34
+ import duckdb
35
+ con = duckdb.connect(); con.register_filesystem(fs)
36
+ con.sql("SELECT count(*) FROM read_parquet('s4://bucket/data.parquet')")
37
+ ```
38
+
39
+ Any underlying fsspec filesystem can be injected instead of s3fs:
40
+ `S4FileSystem(fs=my_fs)` (used by the unit tests with an in-memory stub).
41
+
42
+ ## Decoded formats
43
+
44
+ - S4F2-framed objects (single-PUT and multipart), S4P1 padding skipped
45
+ - codecs: `passthrough`, `cpu-zstd`, `cpu-gzip`, `cpu-zstd-dict`
46
+ (dictionaries are fetched from `.s4dict/<id>` and fingerprint-verified)
47
+ - unframed gateway objects carrying a metadata manifest (`cpu-gzip`,
48
+ legacy raw zstd, `passthrough`)
49
+ - `.s4index` sidecars v1/v2/v3 with ETag staleness checks (a stale sidecar
50
+ falls back to a full-object read)
51
+
52
+ ## Limitations
53
+
54
+ - **Read-only.** All write APIs raise `NotImplementedError` — write through
55
+ the S4 gateway, which owns the framing / sidecar / metadata contract.
56
+ - **GPU frames are refused loudly.** `nvcomp-*` / `dietgpu-ans` frames raise
57
+ `NotImplementedError` (decode them through the gateway); s4fs never
58
+ returns silently-wrong bytes.
59
+ - **SSE-encrypted objects are refused loudly.** Reads raise
60
+ `NotImplementedError` (the keyring / KMS / SSE-C key lives in the
61
+ gateway — read encrypted objects through the gateway). Detection is
62
+ threefold: the `s4-encrypted` object metadata stamp, the sidecar's v3
63
+ SSE binding, and the `S4E1`–`S4E6` envelope magic in the body; s4fs
64
+ never returns ciphertext as if it were data.
65
+ - Exact-size resolution in `ls`/`info` may cost one extra backend request
66
+ per object (sidecar GET or metadata HEAD); results are cached per
67
+ filesystem instance.
68
+ - Range reads on framed objects without a usable sidecar fall back to a
69
+ full-object read (with a warning when the object is multi-frame).
70
+ Legacy v1 sidecars (no source ETag/size binding) are treated as
71
+ unusable — they cannot be tied to the live object.
72
+ - `open()` refuses framed objects whose original size is inexact (no
73
+ usable sidecar, no `s4-original-size` metadata) instead of silently
74
+ truncating buffered reads at the compressed size; opt back in with
75
+ `S4FileSystem(allow_inexact_open=True)`. `cat_file()` is unaffected.
76
+
77
+ ## Tests
78
+
79
+ ```bash
80
+ pytest python/s4fs/tests # unit (gateway-captured fixtures)
81
+ pytest python/s4fs/tests/test_e2e_minio.py -m e2e # docker + MinIO + real gateway
82
+ ```
@@ -0,0 +1,58 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "s4fs"
7
+ version = "1.1.0"
8
+ description = "fsspec adapter for reading S4 gateway-written objects (S4F2 framed + compressed) directly from the backend — no gateway required. Read-only by design."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [
13
+ { name = "abyo software 合同会社" },
14
+ { name = "masumi-ryugo", email = "abyo.software@gmail.com" },
15
+ ]
16
+ keywords = ["s3", "fsspec", "compression", "zstd", "pandas", "pyarrow"]
17
+ classifiers = [
18
+ "Development Status :: 4 - Beta",
19
+ "Intended Audience :: Developers",
20
+ "License :: OSI Approved :: Apache Software License",
21
+ "Programming Language :: Python :: 3",
22
+ "Topic :: System :: Archiving :: Compression",
23
+ ]
24
+ dependencies = [
25
+ "fsspec>=2023.1.0",
26
+ # PyO3 binding crate (crates/s4-codec-py) — provides the S4F2 frame /
27
+ # .s4index sidecar decoders and the CPU codecs. Floor is 1.1.0: the
28
+ # binding APIs s4fs uses (SIDECAR_SUFFIX / frame_iter / decode_index /
29
+ # crc32c / CpuZstdDict) do not exist in the 1.0.0 wheel; they first
30
+ # ship in s4-codec 1.1.0.
31
+ "s4-codec>=1.1.0,<2",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ # Default underlying filesystem for s3:// backends.
36
+ s3 = ["s3fs>=2023.1.0"]
37
+ dev = ["pytest>=7.0"]
38
+
39
+ [project.urls]
40
+ Homepage = "https://github.com/abyo-software/s4"
41
+ Repository = "https://github.com/abyo-software/s4"
42
+ Issues = "https://github.com/abyo-software/s4/issues"
43
+
44
+ # fsspec discovers the implementation lazily via the `fsspec.specs` entry
45
+ # point group — `fsspec.filesystem("s4")` works without importing s4fs first.
46
+ [project.entry-points."fsspec.specs"]
47
+ s4 = "s4fs.core:S4FileSystem"
48
+
49
+ [tool.setuptools.packages.find]
50
+ where = ["src"]
51
+
52
+ [tool.pytest.ini_options]
53
+ markers = [
54
+ "e2e: end-to-end tests that require docker (MinIO) and the s4 binary",
55
+ ]
56
+ # e2e is opt-in: `pytest -m e2e` (an explicit -m on the command line
57
+ # overrides this default).
58
+ addopts = "-m 'not e2e'"
s4fs-1.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,27 @@
1
+ """s4fs — fsspec adapter for S4 gateway-written objects (read-only).
2
+
3
+ Usage::
4
+
5
+ import s4fs # registers the "s4" protocol
6
+ import pandas as pd
7
+
8
+ df = pd.read_parquet(
9
+ "s4://bucket/data.parquet",
10
+ storage_options={"target_options": {"endpoint_url": "http://backend:9000"}},
11
+ )
12
+ """
13
+
14
+ from fsspec import register_implementation
15
+
16
+ from s4fs.core import S4File, S4FileSystem
17
+
18
+ __all__ = ["S4File", "S4FileSystem"]
19
+ __version__ = "1.0.0"
20
+
21
+ # Idempotent: the `fsspec.specs` entry point in pyproject.toml already
22
+ # advertises the implementation; registering here covers source checkouts
23
+ # and older fsspec versions. clobber=False keeps an operator override.
24
+ try:
25
+ register_implementation("s4", S4FileSystem, clobber=False)
26
+ except ValueError:
27
+ pass  # presumably "s4" is already bound to another class; keep that registration