archive-r-python 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/LICENSE.txt +28 -7
  2. {archive_r_python-0.1.0/archive_r_python.egg-info → archive_r_python-0.1.2}/PKG-INFO +3 -3
  3. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/README.md +1 -1
  4. archive_r_python-0.1.2/VERSION +1 -0
  5. {archive_r_python-0.1.0 → archive_r_python-0.1.2/archive_r_python.egg-info}/PKG-INFO +3 -3
  6. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/setup.cfg +1 -1
  7. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/setup.py +1 -1
  8. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/src/archive_r_py.cc +256 -158
  9. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/test/test_traverser.py +71 -23
  10. archive_r_python-0.1.0/VERSION +0 -1
  11. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/MANIFEST.in +0 -0
  12. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/data_stream.h +0 -0
  13. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/entry.h +0 -0
  14. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/entry_fault.h +0 -0
  15. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/entry_metadata.h +0 -0
  16. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/multi_volume_stream_base.h +0 -0
  17. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/path_hierarchy.h +0 -0
  18. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/path_hierarchy_utils.h +0 -0
  19. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/include/archive_r/traverser.h +0 -0
  20. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/archive_stack_cursor.cc +0 -0
  21. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/archive_stack_cursor.h +0 -0
  22. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/archive_stack_orchestrator.cc +0 -0
  23. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/archive_stack_orchestrator.h +0 -0
  24. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/archive_type.cc +0 -0
  25. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/archive_type.h +0 -0
  26. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/data_stream.cc +0 -0
  27. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/entry.cc +0 -0
  28. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/entry_fault.cc +0 -0
  29. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/entry_fault_error.cc +0 -0
  30. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/entry_fault_error.h +0 -0
  31. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/entry_impl.h +0 -0
  32. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/multi_volume_manager.cc +0 -0
  33. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/multi_volume_manager.h +0 -0
  34. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/multi_volume_stream_base.cc +0 -0
  35. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/path_hierarchy.cc +0 -0
  36. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/path_hierarchy_utils.cc +0 -0
  37. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/simple_profiler.h +0 -0
  38. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/system_file_stream.cc +0 -0
  39. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/system_file_stream.h +0 -0
  40. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/_vendor/archive_r/src/traverser.cc +0 -0
  41. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/archive_r_python.egg-info/SOURCES.txt +0 -0
  42. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/archive_r_python.egg-info/dependency_links.txt +0 -0
  43. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/archive_r_python.egg-info/not-zip-safe +0 -0
  44. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/archive_r_python.egg-info/top_level.txt +0 -0
  45. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/examples/traverse_archive.py +0 -0
  46. {archive_r_python-0.1.0 → archive_r_python-0.1.2}/pyproject.toml +0 -0
@@ -1,5 +1,5 @@
1
1
  archive_r License
2
- Version: 0.1.0 (2025-10-25)
2
+ Version: 0.1.2 (2025-12-02)
3
3
 
4
4
  ----------------------------------------
5
5
  Primary License
@@ -44,13 +44,34 @@ License shown above.
44
44
  - Purpose: header-only binding generator for the Python extension module.
45
45
  - License: BSD-style License (https://github.com/pybind/pybind11)
46
46
 
47
- 3. rake (development dependency for Ruby bindings)
48
- - Purpose: build and release tasks for the Ruby gem.
49
- - License: MIT License (https://github.com/ruby/rake)
47
+ The following components are redistributed only because libarchive (bundled with archive_r) depends on them at runtime:
50
48
 
51
- 4. minitest (development dependency for Ruby bindings)
52
- - Purpose: unit testing framework for the Ruby gem.
53
- - License: MIT License (https://github.com/minitest/minitest)
49
+ 3. zlib
50
+ - Purpose: libarchive dependency providing DEFLATE compression; bundled inside archive_r binaries and wheels because libarchive requires it.
51
+ - License: zlib License (https://zlib.net/zlib_license.html)
54
52
 
53
+ 4. bzip2
54
+ - Purpose: libarchive dependency providing bzip2 compression support; distributed with archive_r artifacts.
55
+ - License: BSD-style license (https://sourceware.org/bzip2/)
56
+
57
+ 5. liblzma (XZ Utils)
58
+ - Purpose: libarchive dependency providing LZMA/XZ compression; included with archive_r packages.
59
+ - License: Public Domain + GNU LGPLv2.1+ (https://tukaani.org/xz/)
60
+
61
+ 6. libxml2
62
+ - Purpose: libarchive dependency used for archive formats such as xar; distributed alongside archive_r.
63
+ - License: MIT-style License (http://xmlsoft.org/)
64
+
65
+ 7. zstd
66
+ - Purpose: libarchive dependency providing Zstandard compression; shipped within archive_r binaries.
67
+ - License: BSD License (https://github.com/facebook/zstd)
68
+
69
+ 8. OpenSSL 3
70
+ - Purpose: libarchive dependency providing cryptographic support for encrypted archives; included with archive_r packages.
71
+ - License: Apache License 2.0 with OpenSSL exception (https://www.openssl.org/source/license.html)
72
+
73
+ 9. libiconv / libcharset
74
+ - Purpose: libxml2/libarchive dependency for character set conversion; redistributed with archive_r artifacts.
75
+ - License: GNU LGPLv2.1+ (https://www.gnu.org/software/libiconv/)
55
76
  Users of archive_r should review the linked third-party licenses to ensure
56
77
  compliance with their terms when redistributing this software.
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: archive_r_python
3
- Version: 0.1.0
4
- Summary: Python bindings for the archive_r traverser library
3
+ Version: 0.1.2
4
+ Summary: Python bindings for archive_r that recursively traverse nested archives without creating temporary extraction files
5
5
  Home-page: https://github.com/Raizo-TCS/archive_r
6
6
  Author: archive_r Team
7
7
  Author-email: raizo.tcs@users.noreply.github.com
@@ -609,4 +609,4 @@ The Python bindings are distributed under the MIT License, consistent with the a
609
609
 
610
610
  ---
611
611
 
612
- **Note**: This document describes archive_r Python bindings version 0.1.0.
612
+ **Note**: This document describes archive_r Python bindings version 0.1.2.
@@ -580,4 +580,4 @@ The Python bindings are distributed under the MIT License, consistent with the a
580
580
 
581
581
  ---
582
582
 
583
- **Note**: This document describes archive_r Python bindings version 0.1.0.
583
+ **Note**: This document describes archive_r Python bindings version 0.1.2.
@@ -0,0 +1 @@
1
+ 0.1.2
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: archive_r_python
3
- Version: 0.1.0
4
- Summary: Python bindings for the archive_r traverser library
3
+ Version: 0.1.2
4
+ Summary: Python bindings for archive_r that recursively traverse nested archives without creating temporary extraction files
5
5
  Home-page: https://github.com/Raizo-TCS/archive_r
6
6
  Author: archive_r Team
7
7
  Author-email: raizo.tcs@users.noreply.github.com
@@ -609,4 +609,4 @@ The Python bindings are distributed under the MIT License, consistent with the a
609
609
 
610
610
  ---
611
611
 
612
- **Note**: This document describes archive_r Python bindings version 0.1.0.
612
+ **Note**: This document describes archive_r Python bindings version 0.1.2.
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = archive_r_python
3
- description = Python bindings for the archive_r traverser library
3
+ description = Python bindings for archive_r that recursively traverse nested archives without creating temporary extraction files
4
4
  author = archive_r Team
5
5
  author_email = raizo.tcs@users.noreply.github.com
6
6
  url = https://github.com/Raizo-TCS/archive_r
@@ -98,7 +98,7 @@ def read_readme() -> str:
98
98
  for candidate in paths:
99
99
  if candidate.exists():
100
100
  return candidate.read_text(encoding='utf-8')
101
- return 'Fast archive traversal library with support for nested archives and multipart files.'
101
+ return 'Python bindings for archive_r that recursively traverse nested archives without creating temporary extraction files.'
102
102
 
103
103
 
104
104
  def resolve_core_paths() -> Tuple[Path, Path]:
@@ -4,10 +4,12 @@
4
4
  #include "archive_r/data_stream.h"
5
5
  #include "archive_r/entry.h"
6
6
  #include "archive_r/entry_fault.h"
7
+ #include "archive_r/multi_volume_stream_base.h"
7
8
  #include "archive_r/path_hierarchy_utils.h"
8
9
  #include "archive_r/traverser.h"
9
10
  #include "archive_stack_orchestrator.h"
10
11
  #include <cctype>
12
+ #include <cstdio>
11
13
  #include <cstring>
12
14
  #include <iostream>
13
15
  #include <memory>
@@ -66,6 +68,96 @@ py::list path_hierarchy_to_python(const PathHierarchy &hierarchy) {
66
68
  return result;
67
69
  }
68
70
 
71
+ PathEntry python_path_entry_from_object(const py::handle &obj) {
72
+ if (py::isinstance<py::str>(obj)) {
73
+ return PathEntry::single(obj.cast<std::string>());
74
+ }
75
+
76
+ py::list sequence;
77
+ try {
78
+ sequence = py::list(py::reinterpret_borrow<py::object>(obj));
79
+ } catch (const py::cast_error &) {
80
+ throw std::invalid_argument("PathEntry must be string or nested sequence");
81
+ }
82
+
83
+ if (sequence.size() == 0) {
84
+ throw std::invalid_argument("PathEntry sequence cannot be empty");
85
+ }
86
+
87
+ bool all_strings = true;
88
+ for (py::handle item : sequence) {
89
+ if (!py::isinstance<py::str>(item)) {
90
+ all_strings = false;
91
+ break;
92
+ }
93
+ }
94
+
95
+ if (all_strings) {
96
+ std::vector<std::string> parts;
97
+ parts.reserve(static_cast<size_t>(sequence.size()));
98
+ for (py::handle item : sequence) {
99
+ parts.emplace_back(item.cast<std::string>());
100
+ }
101
+ return PathEntry::multi_volume(std::move(parts));
102
+ }
103
+
104
+ PathEntry::NodeList nodes;
105
+ nodes.reserve(static_cast<size_t>(sequence.size()));
106
+ for (py::handle item : sequence) {
107
+ nodes.emplace_back(python_path_entry_from_object(item));
108
+ }
109
+ return PathEntry::nested(std::move(nodes));
110
+ }
111
+
112
+ PathHierarchy python_path_hierarchy_from_object(const py::handle &obj) {
113
+ if (py::isinstance<py::str>(obj)) {
114
+ return make_single_path(obj.cast<std::string>());
115
+ }
116
+
117
+ py::list sequence;
118
+ try {
119
+ sequence = py::list(py::reinterpret_borrow<py::object>(obj));
120
+ } catch (const py::cast_error &) {
121
+ throw std::invalid_argument("path hierarchy must be string or sequence");
122
+ }
123
+
124
+ if (sequence.size() == 0) {
125
+ throw std::invalid_argument("path hierarchy cannot be empty");
126
+ }
127
+
128
+ PathHierarchy hierarchy;
129
+ hierarchy.reserve(static_cast<size_t>(sequence.size()));
130
+ for (py::handle component : sequence) {
131
+ hierarchy.emplace_back(python_path_entry_from_object(component));
132
+ }
133
+ return hierarchy;
134
+ }
135
+
136
+ std::vector<PathHierarchy> python_normalize_paths(const py::object &paths_obj) {
137
+ if (py::isinstance<py::str>(paths_obj)) {
138
+ return { make_single_path(paths_obj.cast<std::string>()) };
139
+ }
140
+
141
+ py::list path_list;
142
+ try {
143
+ path_list = py::list(paths_obj);
144
+ } catch (const py::cast_error &) {
145
+ throw std::invalid_argument("paths must be a string or a sequence of path hierarchies");
146
+ }
147
+
148
+ if (path_list.size() == 0) {
149
+ throw std::invalid_argument("paths cannot be empty");
150
+ }
151
+
152
+ std::vector<PathHierarchy> result;
153
+ result.reserve(static_cast<size_t>(path_list.size()));
154
+ for (py::handle item : path_list) {
155
+ result.emplace_back(python_path_hierarchy_from_object(item));
156
+ }
157
+
158
+ return result;
159
+ }
160
+
69
161
  py::dict entry_fault_to_python(const EntryFault &fault) {
70
162
  py::dict result;
71
163
  result[py::str("message")] = py::str(fault.message);
@@ -93,42 +185,98 @@ FaultCallback make_python_fault_callback(const py::object &callable) {
93
185
  };
94
186
  }
95
187
 
96
- class PyObjectStream : public IDataStream {
188
+ class PyUserStream : public MultiVolumeStreamBase {
97
189
  public:
98
- PyObjectStream(py::object io, PathHierarchy hierarchy)
99
- : io_(std::move(io))
100
- , hierarchy_(std::move(hierarchy))
101
- , at_end_(false)
102
- , seekable_(py::hasattr(io_, "seek"))
103
- , tellable_(py::hasattr(io_, "tell"))
104
- , has_custom_rewind_(py::hasattr(io_, "rewind")) {
105
- if (!py::hasattr(io_, "read")) {
106
- throw py::type_error("stream objects must provide a read() method");
190
+ PyUserStream(PathHierarchy hierarchy, bool supports_seek)
191
+ : MultiVolumeStreamBase(std::move(hierarchy), supports_seek)
192
+ , active_io_(py::none())
193
+ , io_seekable_(false)
194
+ , io_tellable_(false)
195
+ , io_closeable_(false)
196
+ , self_reference_(nullptr) {}
197
+
198
+ ~PyUserStream() override {
199
+ py::gil_scoped_acquire gil;
200
+ release_active_io();
201
+ if (self_reference_ != nullptr) {
202
+ Py_DECREF(self_reference_);
203
+ self_reference_ = nullptr;
204
+ }
107
205
  }
108
- if (!has_custom_rewind_ && !seekable_) {
109
- throw py::type_error("stream objects must provide either rewind() or seek()");
206
+
207
+ void retain_python_owner(const py::object &owner) {
208
+ PyObject *new_owner = owner.ptr();
209
+ if (new_owner == self_reference_) {
210
+ return;
211
+ }
212
+ Py_XINCREF(new_owner);
213
+ if (self_reference_ != nullptr) {
214
+ Py_DECREF(self_reference_);
215
+ }
216
+ self_reference_ = new_owner;
110
217
  }
111
- }
112
218
 
113
- ~PyObjectStream() override {
219
+ protected:
220
+ void open_single_part(const PathHierarchy &single_part) override {
114
221
  py::gil_scoped_acquire gil;
115
- if (py::hasattr(io_, "close")) {
116
- try {
117
- io_.attr("close")();
118
- } catch (const py::error_already_set &) {
119
- PyErr_Clear();
222
+ py::object open_io = py::get_override(this, "open_part_io");
223
+ if (open_io) {
224
+ py::object result = open_io(path_hierarchy_to_python(single_part));
225
+ if (result.is_none()) {
226
+ throw py::value_error("open_part_io must return an IO-like object");
120
227
  }
228
+ activate_io(std::move(result));
229
+ return;
230
+ }
231
+
232
+ py::object override = py::get_override(this, "open_part");
233
+ if (!override) {
234
+ throw py::type_error("Stream subclasses must implement open_part_io() or open_part()");
121
235
  }
236
+ override(path_hierarchy_to_python(single_part));
122
237
  }
123
238
 
124
- ssize_t read(void *buffer, size_t size) override {
239
+ void close_single_part() override {
240
+ py::gil_scoped_acquire gil;
241
+ release_active_io();
242
+ py::object override = py::get_override(this, "close_part");
243
+ if (override) {
244
+ override();
245
+ }
246
+ }
247
+
248
+ ssize_t read_from_single_part(void *buffer, size_t size) override {
125
249
  if (size == 0) {
126
250
  return 0;
127
251
  }
128
252
  py::gil_scoped_acquire gil;
129
- py::object result = io_.attr("read")(py::int_(size));
253
+ if (!active_io_.is_none()) {
254
+ py::object result = active_io_.attr("read")(py::int_(size));
255
+ if (result.is_none()) {
256
+ return 0;
257
+ }
258
+ py::bytes data = py::reinterpret_borrow<py::bytes>(py::bytes(result));
259
+ Py_ssize_t length = PyBytes_Size(data.ptr());
260
+ if (length < 0) {
261
+ throw py::error_already_set();
262
+ }
263
+ if (length == 0) {
264
+ return 0;
265
+ }
266
+ char *raw = PyBytes_AsString(data.ptr());
267
+ if (!raw) {
268
+ throw py::error_already_set();
269
+ }
270
+ std::memcpy(buffer, raw, static_cast<size_t>(length));
271
+ return static_cast<ssize_t>(length);
272
+ }
273
+
274
+ py::object override = py::get_override(this, "read_part");
275
+ if (!override) {
276
+ throw py::type_error("Stream subclasses must implement open_part_io() or read_part(size)");
277
+ }
278
+ py::object result = override(py::int_(size));
130
279
  if (result.is_none()) {
131
- at_end_ = true;
132
280
  return 0;
133
281
  }
134
282
  py::bytes data = py::reinterpret_borrow<py::bytes>(py::bytes(result));
@@ -137,7 +285,6 @@ public:
137
285
  throw py::error_already_set();
138
286
  }
139
287
  if (length == 0) {
140
- at_end_ = true;
141
288
  return 0;
142
289
  }
143
290
  char *raw = PyBytes_AsString(data.ptr());
@@ -148,61 +295,82 @@ public:
148
295
  return static_cast<ssize_t>(length);
149
296
  }
150
297
 
151
- void rewind() override {
298
+ int64_t seek_within_single_part(int64_t offset, int whence) override {
152
299
  py::gil_scoped_acquire gil;
153
- if (has_custom_rewind_) {
154
- io_.attr("rewind")();
155
- } else if (seekable_) {
156
- io_.attr("seek")(py::int_(0));
157
- } else {
158
- throw py::type_error("stream object does not support rewind");
300
+ if (!active_io_.is_none() && io_seekable_) {
301
+ py::object result = active_io_.attr("seek")(py::int_(offset), py::int_(whence));
302
+ return result.cast<int64_t>();
159
303
  }
160
- at_end_ = false;
161
- }
162
-
163
- bool at_end() const override { return at_end_; }
164
-
165
- int64_t seek(int64_t offset, int whence) override {
166
- if (!seekable_) {
304
+ py::object override = py::get_override(this, "seek_part");
305
+ if (!override) {
167
306
  return -1;
168
307
  }
169
- py::gil_scoped_acquire gil;
170
- py::object result = io_.attr("seek")(py::int_(offset), py::int_(whence));
171
- at_end_ = false;
308
+ py::object result = override(py::int_(offset), py::int_(whence));
172
309
  return result.cast<int64_t>();
173
310
  }
174
311
 
175
- int64_t tell() const override {
176
- if (!tellable_) {
312
+ int64_t size_of_single_part(const PathHierarchy &single_part) override {
313
+ py::gil_scoped_acquire gil;
314
+ if (!active_io_.is_none()) {
315
+ if (io_seekable_ && io_tellable_) {
316
+ py::object current = active_io_.attr("tell")();
317
+ active_io_.attr("seek")(py::int_(0), py::int_(SEEK_END));
318
+ py::object end_pos = active_io_.attr("tell")();
319
+ active_io_.attr("seek")(current, py::int_(SEEK_SET));
320
+ return end_pos.cast<int64_t>();
321
+ }
322
+ }
323
+
324
+ py::object override = py::get_override(this, "part_size");
325
+ if (!override) {
177
326
  return -1;
178
327
  }
179
- py::gil_scoped_acquire gil;
180
- py::object result = io_.attr("tell")();
328
+ py::object result = override(path_hierarchy_to_python(single_part));
181
329
  return result.cast<int64_t>();
182
330
  }
183
331
 
184
- bool can_seek() const override { return seekable_; }
185
-
186
- PathHierarchy source_hierarchy() const override { return hierarchy_; }
187
-
188
332
  private:
189
- py::object io_;
190
- PathHierarchy hierarchy_;
191
- mutable bool at_end_;
192
- bool seekable_;
193
- bool tellable_;
194
- bool has_custom_rewind_;
195
- };
196
-
197
- std::shared_ptr<IDataStream> stream_from_python_result(const py::object &result, const PathHierarchy &requested) {
198
- if (result.is_none()) {
199
- return nullptr;
333
+ void activate_io(py::object io) {
334
+ if (!py::hasattr(io, "read")) {
335
+ throw py::type_error("open_part_io must return an object with read()");
336
+ }
337
+ release_active_io();
338
+ active_io_ = std::move(io);
339
+ io_seekable_ = py::hasattr(active_io_, "seek");
340
+ io_tellable_ = py::hasattr(active_io_, "tell");
341
+ io_closeable_ = py::hasattr(active_io_, "close");
342
+ if (can_seek() && io_seekable_) {
343
+ try {
344
+ active_io_.attr("seek")(py::int_(0), py::int_(SEEK_CUR));
345
+ } catch (const py::error_already_set &) {
346
+ PyErr_Clear();
347
+ }
348
+ }
200
349
  }
201
- if (!py::hasattr(result, "read")) {
202
- throw py::type_error("stream factory must return None or an object with read()");
350
+
351
+ void release_active_io() {
352
+ if (active_io_.is_none()) {
353
+ return;
354
+ }
355
+ if (io_closeable_) {
356
+ try {
357
+ active_io_.attr("close")();
358
+ } catch (const py::error_already_set &) {
359
+ PyErr_Clear();
360
+ }
361
+ }
362
+ active_io_ = py::none();
363
+ io_seekable_ = false;
364
+ io_tellable_ = false;
365
+ io_closeable_ = false;
203
366
  }
204
- return std::make_shared<PyObjectStream>(result, requested);
205
- }
367
+
368
+ py::object active_io_;
369
+ bool io_seekable_;
370
+ bool io_tellable_;
371
+ bool io_closeable_;
372
+ PyObject *self_reference_;
373
+ };
206
374
 
207
375
  void register_python_stream_factory(const py::object &callable) {
208
376
  if (callable.is_none()) {
@@ -217,7 +385,18 @@ void register_python_stream_factory(const py::object &callable) {
217
385
  RootStreamFactory factory = [func](const PathHierarchy &hierarchy) -> std::shared_ptr<IDataStream> {
218
386
  py::gil_scoped_acquire gil;
219
387
  py::object result = func(path_hierarchy_to_python(hierarchy));
220
- return stream_from_python_result(result, hierarchy);
388
+ if (result.is_none()) {
389
+ return nullptr;
390
+ }
391
+ if (!py::isinstance<PyUserStream>(result)) {
392
+ throw py::type_error("stream factory must return None or an archive_r.Stream instance");
393
+ }
394
+ auto stream = result.cast<std::shared_ptr<PyUserStream>>();
395
+ stream->retain_python_owner(result);
396
+ if (!hierarchies_equal(stream->source_hierarchy(), hierarchy)) {
397
+ throw py::value_error("Stream subclass must be initialized with the hierarchy provided to the factory");
398
+ }
399
+ return stream;
221
400
  };
222
401
 
223
402
  set_root_stream_factory(factory);
@@ -475,7 +654,7 @@ public:
475
654
  std::optional<std::vector<std::string>> metadata_keys = std::nullopt, std::optional<bool> descend_archives = std::nullopt)
476
655
  : traverser_options_(build_options(passphrases, formats, metadata_keys, descend_archives))
477
656
  , archive_options_snapshot_(to_archive_option(traverser_options_))
478
- , traverser(normalize_paths(paths), traverser_options_)
657
+ , traverser(python_normalize_paths(paths), traverser_options_)
479
658
  , it(traverser.end()) {
480
659
  }
481
660
 
@@ -551,96 +730,6 @@ private:
551
730
  return options;
552
731
  }
553
732
 
554
- static PathEntry py_to_path_entry(const py::handle &obj) {
555
- if (py::isinstance<py::str>(obj)) {
556
- return PathEntry::single(obj.cast<std::string>());
557
- }
558
-
559
- py::list sequence;
560
- try {
561
- sequence = py::list(py::reinterpret_borrow<py::object>(obj));
562
- } catch (const py::cast_error &) {
563
- throw std::invalid_argument("PathEntry must be string or nested sequence");
564
- }
565
-
566
- if (sequence.size() == 0) {
567
- throw std::invalid_argument("PathEntry sequence cannot be empty");
568
- }
569
-
570
- bool all_strings = true;
571
- for (py::handle item : sequence) {
572
- if (!py::isinstance<py::str>(item)) {
573
- all_strings = false;
574
- break;
575
- }
576
- }
577
-
578
- if (all_strings) {
579
- std::vector<std::string> parts;
580
- parts.reserve(static_cast<size_t>(sequence.size()));
581
- for (py::handle item : sequence) {
582
- parts.emplace_back(item.cast<std::string>());
583
- }
584
- return PathEntry::multi_volume(std::move(parts));
585
- }
586
-
587
- PathEntry::NodeList nodes;
588
- nodes.reserve(static_cast<size_t>(sequence.size()));
589
- for (py::handle item : sequence) {
590
- nodes.emplace_back(py_to_path_entry(item));
591
- }
592
- return PathEntry::nested(std::move(nodes));
593
- }
594
-
595
- static PathHierarchy py_to_path_hierarchy(const py::handle &obj) {
596
- if (py::isinstance<py::str>(obj)) {
597
- return make_single_path(obj.cast<std::string>());
598
- }
599
-
600
- py::list sequence;
601
- try {
602
- sequence = py::list(py::reinterpret_borrow<py::object>(obj));
603
- } catch (const py::cast_error &) {
604
- throw std::invalid_argument("path hierarchy must be string or sequence");
605
- }
606
-
607
- if (sequence.size() == 0) {
608
- throw std::invalid_argument("path hierarchy cannot be empty");
609
- }
610
-
611
- PathHierarchy hierarchy;
612
- hierarchy.reserve(static_cast<size_t>(sequence.size()));
613
- for (py::handle component : sequence) {
614
- hierarchy.emplace_back(py_to_path_entry(component));
615
- }
616
- return hierarchy;
617
- }
618
-
619
- static std::vector<PathHierarchy> normalize_paths(const py::object &paths_obj) {
620
- if (py::isinstance<py::str>(paths_obj)) {
621
- return { make_single_path(paths_obj.cast<std::string>()) };
622
- }
623
-
624
- py::list path_list;
625
- try {
626
- path_list = py::list(paths_obj);
627
- } catch (const py::cast_error &) {
628
- throw std::invalid_argument("paths must be a string or a sequence of path hierarchies");
629
- }
630
-
631
- if (path_list.size() == 0) {
632
- throw std::invalid_argument("paths cannot be empty");
633
- }
634
-
635
- std::vector<PathHierarchy> result;
636
- result.reserve(static_cast<size_t>(path_list.size()));
637
- for (py::handle item : path_list) {
638
- result.emplace_back(py_to_path_hierarchy(item));
639
- }
640
-
641
- return result;
642
- }
643
-
644
733
  TraverserOptions traverser_options_;
645
734
  ArchiveOption archive_options_snapshot_;
646
735
  Traverser traverser;
@@ -680,13 +769,22 @@ PYBIND11_MODULE(archive_r, m) {
680
769
  m.attr("STANDARD_FORMATS") = formats_tuple;
681
770
  m.attr("SAFE_FORMATS") = formats_tuple;
682
771
 
683
- m.def("register_stream_factory", &register_python_stream_factory, py::arg("factory") = py::none(),
684
- "Register a callable returning a file-like object for custom root streams. Pass None to reset.");
772
+ m.def("register_stream_factory", &register_python_stream_factory, py::arg("factory") = py::none(),
773
+ "Register a callable returning archive_r.Stream instances for custom root streams. Pass None to reset.");
685
774
 
686
- m.def("on_fault",
775
+ m.def("on_fault",
687
776
  [](const py::object &callback) { register_fault_callback(make_python_fault_callback(callback)); }, py::arg("callback") = py::none(),
688
777
  "Register or clear the global EntryFault callback (None clears)");
689
778
 
779
+ py::class_<PyUserStream, std::shared_ptr<PyUserStream>>(m, "Stream")
780
+ .def(py::init([](py::object hierarchy, py::object supports_seek_obj) {
781
+ auto path = python_path_hierarchy_from_object(hierarchy);
782
+ bool supports_seek = supports_seek_obj.is_none() ? true : supports_seek_obj.cast<bool>();
783
+ return std::make_shared<PyUserStream>(std::move(path), supports_seek);
784
+ }),
785
+ py::arg("hierarchy"), py::arg("supports_seek") = py::none(),
786
+ "Base class for custom multi-volume streams returned from register_stream_factory.");
787
+
690
788
  // Entry class
691
789
  py::class_<PyEntry, std::shared_ptr<PyEntry>>(m, "Entry")
692
790
  .def_property_readonly("path", &PyEntry::path, "Get full path of the entry")
@@ -24,15 +24,16 @@ import archive_r
24
24
  class TestTraverser(unittest.TestCase):
25
25
  DEFAULT_FORMATS = tuple(list(archive_r.STANDARD_FORMATS) + ["mtree"])
26
26
 
27
- class MinimalStream:
28
- def __init__(self, payload: bytes):
29
- self._buffer = io.BytesIO(payload)
30
-
31
- def read(self, length: int = -1) -> bytes:
32
- return self._buffer.read(length)
27
+ class PayloadStream(archive_r.Stream):
28
+ def __init__(self, hierarchy, payload: bytes, *, supports_seek: bool | None = None):
29
+ if supports_seek is None:
30
+ super().__init__(hierarchy)
31
+ else:
32
+ super().__init__(hierarchy, supports_seek=supports_seek)
33
+ self._payload = payload
33
34
 
34
- def rewind(self) -> None:
35
- self._buffer.seek(0)
35
+ def open_part_io(self, _part_hierarchy):
36
+ return io.BytesIO(self._payload)
36
37
 
37
38
  @classmethod
38
39
  def setUpClass(cls):
@@ -44,6 +45,7 @@ class TestTraverser(unittest.TestCase):
44
45
  cls.directory_path = str(test_data_dir / 'directory_test')
45
46
  cls.broken_archive = str(test_data_dir / 'broken_nested.tar')
46
47
  cls.stress_archive = str(test_data_dir / 'stress_test_ultimate.tar.gz')
48
+ cls.multi_volume_parts = sorted(str(path.resolve()) for path in test_data_dir.glob('test_input.tar.gz.part*'))
47
49
 
48
50
  if not os.path.exists(cls.simple_archive):
49
51
  raise FileNotFoundError(f"Test archive not found: {cls.simple_archive}")
@@ -57,6 +59,8 @@ class TestTraverser(unittest.TestCase):
57
59
  raise FileNotFoundError(f"Broken archive not found: {cls.broken_archive}")
58
60
  if not os.path.exists(cls.stress_archive):
59
61
  raise FileNotFoundError(f"Stress archive not found: {cls.stress_archive}")
62
+ if not cls.multi_volume_parts:
63
+ raise FileNotFoundError('Multi-volume parts test_input.tar.gz.part* not found')
60
64
 
61
65
  def _normalized_options(self, **kwargs):
62
66
  options = dict(kwargs)
@@ -334,7 +338,7 @@ class TestTraverser(unittest.TestCase):
334
338
  def factory(hierarchy):
335
339
  calls["count"] += 1
336
340
  if hierarchy[0] == os.path.abspath(self.simple_archive):
337
- return io.BytesIO(payload)
341
+ return self.PayloadStream(hierarchy, payload)
338
342
  return None
339
343
 
340
344
  archive_r.register_stream_factory(factory)
@@ -351,7 +355,7 @@ class TestTraverser(unittest.TestCase):
351
355
 
352
356
  def factory(hierarchy):
353
357
  if hierarchy[0] == os.path.abspath(virtual):
354
- return open(self.simple_archive, "rb")
358
+ return self.PayloadStream(hierarchy, Path(self.simple_archive).read_bytes())
355
359
  return None
356
360
 
357
361
  archive_r.register_stream_factory(factory)
@@ -368,13 +372,46 @@ class TestTraverser(unittest.TestCase):
368
372
  if hierarchy[0] != absolute:
369
373
  return None
370
374
  calls["count"] += 1
371
- return self.MinimalStream(payload)
375
+ return self.PayloadStream(hierarchy, payload)
372
376
 
373
377
  archive_r.register_stream_factory(factory)
374
378
  actual = self._collect_paths(self.simple_archive)
375
379
  self.assertEqual(expected, actual)
376
380
  self.assertEqual(1, calls["count"])
377
381
 
382
+ def test_stream_factory_multi_volume_custom_stream(self):
383
+ class RecordingStream(archive_r.Stream):
384
+ def __init__(self, hierarchy):
385
+ super().__init__(hierarchy, supports_seek=True)
386
+ self.requests = []
387
+
388
+ def open_part_io(self, part_hierarchy):
389
+ head = part_hierarchy[0]
390
+ self.requests.append(head)
391
+ return open(head, 'rb')
392
+
393
+ parts_hierarchy = [[part for part in self.multi_volume_parts]]
394
+ archive_r.register_stream_factory(None)
395
+ expected = self._collect_paths(parts_hierarchy)
396
+
397
+ streams = []
398
+
399
+ def factory(hierarchy):
400
+ head = hierarchy[0]
401
+ self.assertIsInstance(head, list)
402
+ stream = RecordingStream(hierarchy)
403
+ streams.append(stream)
404
+ return stream
405
+
406
+ try:
407
+ archive_r.register_stream_factory(factory)
408
+ actual = self._collect_paths(parts_hierarchy)
409
+ self.assertEqual(expected, actual)
410
+ self.assertTrue(streams)
411
+ self.assertEqual(self.multi_volume_parts, streams[0].requests)
412
+ finally:
413
+ archive_r.register_stream_factory(None)
414
+
378
415
  def test_stream_factory_requires_callable(self):
379
416
  """register_stream_factory should reject non-callables"""
380
417
  with self.assertRaises(TypeError):
@@ -386,24 +423,27 @@ class TestTraverser(unittest.TestCase):
386
423
  payload = Path(self.simple_archive).read_bytes()
387
424
  streams = []
388
425
 
389
- class SeekableOnlyStream:
390
- def __init__(self, data: bytes):
391
- self._buffer = io.BytesIO(data)
392
- self.seek_calls = 0
426
+ class TrackingBuffer(io.BytesIO):
427
+ def __init__(self, owner, data: bytes):
428
+ super().__init__(data)
429
+ self._owner = owner
393
430
 
394
- def read(self, length: int = -1) -> bytes:
395
- return self._buffer.read(length)
431
+ def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
432
+ self._owner.seek_calls += 1
433
+ return super().seek(offset, whence)
396
434
 
397
- def seek(self, offset: int, whence: int = 0) -> int:
398
- self.seek_calls += 1
399
- return self._buffer.seek(offset, whence)
435
+ class SeekableOnlyStream(archive_r.Stream):
436
+ def __init__(self, hierarchy, data: bytes):
437
+ super().__init__(hierarchy, supports_seek=True)
438
+ self._data = data
439
+ self.seek_calls = 0
400
440
 
401
- def tell(self) -> int:
402
- return self._buffer.tell()
441
+ def open_part_io(self, _hierarchy):
442
+ return TrackingBuffer(self, self._data)
403
443
 
404
444
  def factory(hierarchy):
405
445
  if hierarchy[0] == os.path.abspath(self.simple_archive):
406
- stream = SeekableOnlyStream(payload)
446
+ stream = SeekableOnlyStream(hierarchy, payload)
407
447
  streams.append(stream)
408
448
  return stream
409
449
  return None
@@ -414,6 +454,14 @@ class TestTraverser(unittest.TestCase):
414
454
  self.assertTrue(streams)
415
455
  self.assertTrue(any(stream.seek_calls >= 1 for stream in streams))
416
456
 
457
+ def test_stream_factory_rejects_plain_io(self):
458
+ def factory(hierarchy):
459
+ if hierarchy[0] == head:
460
+ stream = RecordingStream(hierarchy)
461
+ streams.append(stream)
462
+ return stream
463
+ self._collect_paths(self.simple_archive)
464
+
417
465
  def test_multi_volume_grouping(self):
418
466
  """Verify multi-volume archives can be grouped and traversed"""
419
467
  part_paths = []
@@ -1 +0,0 @@
1
- 0.1.0