s4fs 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- s4fs-1.1.0/PKG-INFO +107 -0
- s4fs-1.1.0/README.md +82 -0
- s4fs-1.1.0/pyproject.toml +58 -0
- s4fs-1.1.0/setup.cfg +4 -0
- s4fs-1.1.0/src/s4fs/__init__.py +27 -0
- s4fs-1.1.0/src/s4fs/core.py +632 -0
- s4fs-1.1.0/src/s4fs.egg-info/PKG-INFO +107 -0
- s4fs-1.1.0/src/s4fs.egg-info/SOURCES.txt +12 -0
- s4fs-1.1.0/src/s4fs.egg-info/dependency_links.txt +1 -0
- s4fs-1.1.0/src/s4fs.egg-info/entry_points.txt +2 -0
- s4fs-1.1.0/src/s4fs.egg-info/requires.txt +8 -0
- s4fs-1.1.0/src/s4fs.egg-info/top_level.txt +1 -0
- s4fs-1.1.0/tests/test_e2e_minio.py +337 -0
- s4fs-1.1.0/tests/test_s4fs_unit.py +486 -0
s4fs-1.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: s4fs
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: fsspec adapter for reading S4 gateway-written objects (S4F2 framed + compressed) directly from the backend — no gateway required. Read-only by design.
|
|
5
|
+
Author: abyo software 合同会社
|
|
6
|
+
Author-email: masumi-ryugo <abyo.software@gmail.com>
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Project-URL: Homepage, https://github.com/abyo-software/s4
|
|
9
|
+
Project-URL: Repository, https://github.com/abyo-software/s4
|
|
10
|
+
Project-URL: Issues, https://github.com/abyo-software/s4/issues
|
|
11
|
+
Keywords: s3,fsspec,compression,zstd,pandas,pyarrow
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: System :: Archiving :: Compression
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: fsspec>=2023.1.0
|
|
20
|
+
Requires-Dist: s4-codec<2,>=1.1.0
|
|
21
|
+
Provides-Extra: s3
|
|
22
|
+
Requires-Dist: s3fs>=2023.1.0; extra == "s3"
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
25
|
+
|
|
26
|
+
# s4fs — fsspec filesystem for S4 objects (no gateway required)
|
|
27
|
+
|
|
28
|
+
`s4fs` lets pandas / pyarrow / DuckDB / Polars (anything fsspec-aware) read
|
|
29
|
+
[S4](https://github.com/abyo-software/s4) gateway-written objects **directly
|
|
30
|
+
from the S3 backend**. Objects are transparently decompressed on read,
|
|
31
|
+
`ls`/`info` report the original (decompressed) sizes, and range reads use the
|
|
32
|
+
`<key>.s4index` sidecar to fetch + decode only the frames that overlap the
|
|
33
|
+
requested range. Objects that never went through the gateway pass through
|
|
34
|
+
byte-for-byte. This is the lock-in escape hatch: if you stop running the
|
|
35
|
+
gateway, your data stays readable.
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -e "python/s4fs[s3]"   # from a source checkout (quoted so zsh does not glob the brackets)
|
|
41
|
+
# requires the s4-codec wheel: cd crates/s4-codec-py && maturin build --release
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Use
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import pandas as pd
|
|
48
|
+
opts = {"target_options": {"endpoint_url": "http://backend:9000"}}
|
|
49
|
+
df = pd.read_parquet("s4://bucket/data.parquet", storage_options=opts)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
import fsspec, pyarrow.parquet as pq
|
|
54
|
+
fs = fsspec.filesystem("s4", target_options={"endpoint_url": "http://backend:9000"})
|
|
55
|
+
table = pq.read_table("bucket/data.parquet", filesystem=fs)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import duckdb
|
|
60
|
+
con = duckdb.connect(); con.register_filesystem(fs)
|
|
61
|
+
con.sql("SELECT count(*) FROM read_parquet('s4://bucket/data.parquet')")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Any underlying fsspec filesystem can be injected instead of s3fs:
|
|
65
|
+
`S4FileSystem(fs=my_fs)` (used by the unit tests with an in-memory stub).
|
|
66
|
+
|
|
67
|
+
## Decoded formats
|
|
68
|
+
|
|
69
|
+
- S4F2-framed objects (single-PUT and multipart), S4P1 padding skipped
|
|
70
|
+
- codecs: `passthrough`, `cpu-zstd`, `cpu-gzip`, `cpu-zstd-dict`
|
|
71
|
+
(dictionaries are fetched from `.s4dict/<id>` and fingerprint-verified)
|
|
72
|
+
- unframed gateway objects carrying a metadata manifest (`cpu-gzip`,
|
|
73
|
+
legacy raw zstd, `passthrough`)
|
|
74
|
+
- `.s4index` sidecars v1/v2/v3 with ETag staleness checks (a stale sidecar
|
|
75
|
+
falls back to a full-object read)
|
|
76
|
+
|
|
77
|
+
## Limitations
|
|
78
|
+
|
|
79
|
+
- **Read-only.** All write APIs raise `NotImplementedError` — write through
|
|
80
|
+
the S4 gateway, which owns the framing / sidecar / metadata contract.
|
|
81
|
+
- **GPU frames are refused loudly.** `nvcomp-*` / `dietgpu-ans` frames raise
|
|
82
|
+
`NotImplementedError` (decode them through the gateway); s4fs never
|
|
83
|
+
returns silently-wrong bytes.
|
|
84
|
+
- **SSE-encrypted objects are refused loudly.** Reads raise
|
|
85
|
+
`NotImplementedError` (the keyring / KMS / SSE-C key lives in the
|
|
86
|
+
gateway — read encrypted objects through the gateway). Detection is
|
|
87
|
+
threefold: the `s4-encrypted` object metadata stamp, the sidecar's v3
|
|
88
|
+
SSE binding, and the `S4E1`–`S4E6` envelope magic in the body; s4fs
|
|
89
|
+
never returns ciphertext as if it were data.
|
|
90
|
+
- Exact-size resolution in `ls`/`info` may cost one extra backend request
|
|
91
|
+
per object (sidecar GET or metadata HEAD); results are cached per
|
|
92
|
+
filesystem instance.
|
|
93
|
+
- Range reads on framed objects without a usable sidecar fall back to a
|
|
94
|
+
full-object read (with a warning when the object is multi-frame).
|
|
95
|
+
Legacy v1 sidecars (no source ETag/size binding) are treated as
|
|
96
|
+
unusable — they cannot be tied to the live object.
|
|
97
|
+
- `open()` refuses framed objects whose original size is inexact (no
|
|
98
|
+
usable sidecar, no `s4-original-size` metadata) instead of silently
|
|
99
|
+
truncating buffered reads at the compressed size; opt back in with
|
|
100
|
+
`S4FileSystem(allow_inexact_open=True)`. `cat_file()` is unaffected.
|
|
101
|
+
|
|
102
|
+
## Tests
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
pytest python/s4fs/tests # unit (gateway-captured fixtures)
|
|
106
|
+
pytest python/s4fs/tests/test_e2e_minio.py -m e2e # docker + MinIO + real gateway
|
|
107
|
+
```
|
s4fs-1.1.0/README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# s4fs — fsspec filesystem for S4 objects (no gateway required)
|
|
2
|
+
|
|
3
|
+
`s4fs` lets pandas / pyarrow / DuckDB / Polars (anything fsspec-aware) read
|
|
4
|
+
[S4](https://github.com/abyo-software/s4) gateway-written objects **directly
|
|
5
|
+
from the S3 backend**. Objects are transparently decompressed on read,
|
|
6
|
+
`ls`/`info` report the original (decompressed) sizes, and range reads use the
|
|
7
|
+
`<key>.s4index` sidecar to fetch + decode only the frames that overlap the
|
|
8
|
+
requested range. Objects that never went through the gateway pass through
|
|
9
|
+
byte-for-byte. This is the lock-in escape hatch: if you stop running the
|
|
10
|
+
gateway, your data stays readable.
|
|
11
|
+
|
|
12
|
+
## Install
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install -e "python/s4fs[s3]"   # from a source checkout (quoted so zsh does not glob the brackets)
|
|
16
|
+
# requires the s4-codec wheel: cd crates/s4-codec-py && maturin build --release
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Use
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
import pandas as pd
|
|
23
|
+
opts = {"target_options": {"endpoint_url": "http://backend:9000"}}
|
|
24
|
+
df = pd.read_parquet("s4://bucket/data.parquet", storage_options=opts)
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import fsspec, pyarrow.parquet as pq
|
|
29
|
+
fs = fsspec.filesystem("s4", target_options={"endpoint_url": "http://backend:9000"})
|
|
30
|
+
table = pq.read_table("bucket/data.parquet", filesystem=fs)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import duckdb
|
|
35
|
+
con = duckdb.connect(); con.register_filesystem(fs)
|
|
36
|
+
con.sql("SELECT count(*) FROM read_parquet('s4://bucket/data.parquet')")
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Any underlying fsspec filesystem can be injected instead of s3fs:
|
|
40
|
+
`S4FileSystem(fs=my_fs)` (used by the unit tests with an in-memory stub).
|
|
41
|
+
|
|
42
|
+
## Decoded formats
|
|
43
|
+
|
|
44
|
+
- S4F2-framed objects (single-PUT and multipart), S4P1 padding skipped
|
|
45
|
+
- codecs: `passthrough`, `cpu-zstd`, `cpu-gzip`, `cpu-zstd-dict`
|
|
46
|
+
(dictionaries are fetched from `.s4dict/<id>` and fingerprint-verified)
|
|
47
|
+
- unframed gateway objects carrying a metadata manifest (`cpu-gzip`,
|
|
48
|
+
legacy raw zstd, `passthrough`)
|
|
49
|
+
- `.s4index` sidecars v1/v2/v3 with ETag staleness checks (a stale sidecar
|
|
50
|
+
falls back to a full-object read)
|
|
51
|
+
|
|
52
|
+
## Limitations
|
|
53
|
+
|
|
54
|
+
- **Read-only.** All write APIs raise `NotImplementedError` — write through
|
|
55
|
+
the S4 gateway, which owns the framing / sidecar / metadata contract.
|
|
56
|
+
- **GPU frames are refused loudly.** `nvcomp-*` / `dietgpu-ans` frames raise
|
|
57
|
+
`NotImplementedError` (decode them through the gateway); s4fs never
|
|
58
|
+
returns silently-wrong bytes.
|
|
59
|
+
- **SSE-encrypted objects are refused loudly.** Reads raise
|
|
60
|
+
`NotImplementedError` (the keyring / KMS / SSE-C key lives in the
|
|
61
|
+
gateway — read encrypted objects through the gateway). Detection is
|
|
62
|
+
threefold: the `s4-encrypted` object metadata stamp, the sidecar's v3
|
|
63
|
+
SSE binding, and the `S4E1`–`S4E6` envelope magic in the body; s4fs
|
|
64
|
+
never returns ciphertext as if it were data.
|
|
65
|
+
- Exact-size resolution in `ls`/`info` may cost one extra backend request
|
|
66
|
+
per object (sidecar GET or metadata HEAD); results are cached per
|
|
67
|
+
filesystem instance.
|
|
68
|
+
- Range reads on framed objects without a usable sidecar fall back to a
|
|
69
|
+
full-object read (with a warning when the object is multi-frame).
|
|
70
|
+
Legacy v1 sidecars (no source ETag/size binding) are treated as
|
|
71
|
+
unusable — they cannot be tied to the live object.
|
|
72
|
+
- `open()` refuses framed objects whose original size is inexact (no
|
|
73
|
+
usable sidecar, no `s4-original-size` metadata) instead of silently
|
|
74
|
+
truncating buffered reads at the compressed size; opt back in with
|
|
75
|
+
`S4FileSystem(allow_inexact_open=True)`. `cat_file()` is unaffected.
|
|
76
|
+
|
|
77
|
+
## Tests
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pytest python/s4fs/tests # unit (gateway-captured fixtures)
|
|
81
|
+
pytest python/s4fs/tests/test_e2e_minio.py -m e2e # docker + MinIO + real gateway
|
|
82
|
+
```
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "s4fs"
|
|
7
|
+
version = "1.1.0"
|
|
8
|
+
description = "fsspec adapter for reading S4 gateway-written objects (S4F2 framed + compressed) directly from the backend — no gateway required. Read-only by design."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "abyo software 合同会社" },
|
|
14
|
+
{ name = "masumi-ryugo", email = "abyo.software@gmail.com" },
|
|
15
|
+
]
|
|
16
|
+
keywords = ["s3", "fsspec", "compression", "zstd", "pandas", "pyarrow"]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 4 - Beta",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"License :: OSI Approved :: Apache Software License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Topic :: System :: Archiving :: Compression",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"fsspec>=2023.1.0",
|
|
26
|
+
# PyO3 binding crate (crates/s4-codec-py) — provides the S4F2 frame /
|
|
27
|
+
# .s4index sidecar decoders and the CPU codecs. Floor is 1.1.0: the
|
|
28
|
+
# binding APIs s4fs uses (SIDECAR_SUFFIX / frame_iter / decode_index /
|
|
29
|
+
# crc32c / CpuZstdDict) do not exist in the 1.0.0 wheel; they ship
|
|
30
|
+
  # first in s4-codec 1.1.0.
|
|
31
|
+
"s4-codec>=1.1.0,<2",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
# Default underlying filesystem for s3:// backends.
|
|
36
|
+
s3 = ["s3fs>=2023.1.0"]
|
|
37
|
+
dev = ["pytest>=7.0"]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://github.com/abyo-software/s4"
|
|
41
|
+
Repository = "https://github.com/abyo-software/s4"
|
|
42
|
+
Issues = "https://github.com/abyo-software/s4/issues"
|
|
43
|
+
|
|
44
|
+
# fsspec discovers the implementation lazily via the `fsspec.specs` entry
|
|
45
|
+
# point group — `fsspec.filesystem("s4")` works without importing s4fs first.
|
|
46
|
+
[project.entry-points."fsspec.specs"]
|
|
47
|
+
s4 = "s4fs.core:S4FileSystem"
|
|
48
|
+
|
|
49
|
+
[tool.setuptools.packages.find]
|
|
50
|
+
where = ["src"]
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
markers = [
|
|
54
|
+
"e2e: end-to-end tests that require docker (MinIO) and the s4 binary",
|
|
55
|
+
]
|
|
56
|
+
# e2e is opt-in: `pytest -m e2e` (an explicit -m on the command line
|
|
57
|
+
# overrides this default).
|
|
58
|
+
addopts = "-m 'not e2e'"
|
s4fs-1.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""s4fs — fsspec adapter for S4 gateway-written objects (read-only).
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
import s4fs # registers the "s4" protocol
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
df = pd.read_parquet(
|
|
9
|
+
"s4://bucket/data.parquet",
|
|
10
|
+
storage_options={"target_options": {"endpoint_url": "http://backend:9000"}},
|
|
11
|
+
)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from fsspec import register_implementation
|
|
15
|
+
|
|
16
|
+
from s4fs.core import S4File, S4FileSystem
|
|
17
|
+
|
|
18
|
+
__all__ = ["S4File", "S4FileSystem"]
|
|
19
|
+
# Runtime-visible package version. Must match `version` in pyproject.toml
# (1.1.0 for this release); it previously still reported "1.0.0".
__version__ = "1.1.0"
|
|
20
|
+
|
|
21
|
+
# Idempotent: the `fsspec.specs` entry point in pyproject.toml already
|
|
22
|
+
# advertises the implementation; registering here covers source checkouts
|
|
23
|
+
# and older fsspec versions. clobber=False keeps an operator override.
|
|
24
|
+
try:
|
|
25
|
+
register_implementation("s4", S4FileSystem, clobber=False)
|
|
26
|
+
except ValueError:
|
|
27
|
+
pass
|