netpluck 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
netpluck-0.9.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) [2026] [Jesse Janzer]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: netpluck
3
+ Version: 0.9.1
4
+ Author-email: Jesse Janzer <jjanzer@gmail.com>
5
+ License-Expression: MIT
6
+ Project-URL: Homepage, https://github.com/jjanzer/netpluck
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: requests
10
+ Provides-Extra: bucket
11
+ Requires-Dist: buckethandler>=0.9.0; extra == "bucket"
12
+ Dynamic: license-file
13
+
14
+ # NetPluck
15
+
16
+ NetPluck is a tool that allows you to extract assets, typically files, from local and remote archives, such as a zip, without fetching or opening the entire archive. NetPluck was specifically designed to be used for very large .zip files that are stored in remote locations such as buckets on Backblaze and CDNs. It is extremely useful if you want to fetch the list of files or extract specific files from very large remote archives.
17
+
18
+ If you intend to fetch and extract all the contents of the archive or your files are very small you don't need this library.
19
+
20
+ There are three key features of NetPluck:
21
+
22
+ 1. Works very efficiently on large files
23
+ 1. Can tell you what's in the file without reading the entire thing
24
+ 1. Extracts specific files without reading the entire thing
25
+
26
+ NetPluck provides a CLI tool via `netpluck` and the core python libraries.
27
+
28
+ ## Formats Supported
29
+
30
+ 1. zip
31
+ 1. zip64
32
+
33
+ ## Protocols Supported
34
+
35
+ 1. Local Files
36
+ 1. Remote Files over HTTP or HTTPS that support range
37
+ 1. Backblaze B2
38
+
39
+ ## CLI Usage
40
+
41
+ ### Retrieving the List of Files in the Archive
42
+ ```
43
+ netpluck --path sample_data.zip --toc
44
+ empty.txt
45
+ lava.png
46
+ message.txt
47
+ nova.bmp
48
+ triangles.png
49
+ ```
50
+
51
+ ### Extracting Files
52
+ You can pass regular expressions using `--filter` to extract files. If you do not specify the `--out` argument it will default to `./output/`
53
+ ```
54
+ netpluck --path sample_data.zip --filter="\.*bmp" --out ./output/
55
+ [1/1] 100.00% nova.bmp => ./output/nova.bmp
56
+ ```
57
+
58
+ You can also use the `--flatten` flag if you want to strip all directories from the resulting output; this will dump all extracted matches into your output folder with no hierarchy.
59
+
60
+ ### Getting Statistics
61
+ You can enable the `--stats` flag to see data about bytes and lookups made.
62
+ ```
63
+ netpluck --path sample_data.zip --stats --filter="\.*txt" --out ./output/
64
+ [1/2] 50.00% empty.txt => ./output/empty.txt
65
+ [2/2] 100.00% message.txt => ./output/message.txt
66
+
67
+ File size: 1.52MB
68
+ Cache hits: 7 size: 1.37KB
69
+ Uncached reads: 3 size: 64.10KB
70
+ Bytes saved: 1.46MB 95.88%
71
+ ```
72
+ In this instance we read 64kb of the 1556kb file while extracting the two txt files.
73
+
74
+
75
+ ## Extending
76
+
77
+ ### Adding New Protocols and Archives
78
+
79
+ Protocols in NetPluck are selected by a prefix such as `https://` or `b2://`. To add a new one you must modify the main class in `netpluck.py` and the `ProtocolType` enum. Make sure you also update the `_guess_protocol` method so it can automatically determine the appropriate protocol.
80
+
81
+ Archive types can be extended by adding your own handler to `netpluck/virtual_archive` and subclassing the `VirtualArchive`. Be sure to implement all interfaces exposed by `VirtualArchive`. See the existing zip archive handler for an example. Typically this means you implement `__init__` and `_read_uncached_range`. See `netpluck/virtual_file/local.py` or `netpluck/virtual_file/http.py` for a simple example.
82
+
83
+ You may also need to add a new virtual file type depending on how your data is queried which can be done just like `VirtualArchive` but with `VirtualFile`.
@@ -0,0 +1,70 @@
1
+ # NetPluck
2
+
3
+ NetPluck is a tool that allows you to extract assets, typically files, from local and remote archives, such as a zip, without fetching or opening the entire archive. NetPluck was specifically designed to be used for very large .zip files that are stored in remote locations such as buckets on Backblaze and CDNs. It is extremely useful if you want to fetch the list of files or extract specific files from very large remote archives.
4
+
5
+ If you intend to fetch and extract all the contents of the archive or your files are very small you don't need this library.
6
+
7
+ There are three key features of NetPluck:
8
+
9
+ 1. Works very efficiently on large files
10
+ 1. Can tell you what's in the file without reading the entire thing
11
+ 1. Extracts specific files without reading the entire thing
12
+
13
+ NetPluck provides a CLI tool via `netpluck` and the core python libraries.
14
+
15
+ ## Formats Supported
16
+
17
+ 1. zip
18
+ 1. zip64
19
+
20
+ ## Protocols Supported
21
+
22
+ 1. Local Files
23
+ 1. Remote Files over HTTP or HTTPS that support range
24
+ 1. Backblaze B2
25
+
26
+ ## CLI Usage
27
+
28
+ ### Retrieving the List of Files in the Archive
29
+ ```
30
+ netpluck --path sample_data.zip --toc
31
+ empty.txt
32
+ lava.png
33
+ message.txt
34
+ nova.bmp
35
+ triangles.png
36
+ ```
37
+
38
+ ### Extracting Files
39
+ You can pass regular expressions using `--filter` to extract files. If you do not specify the `--out` argument it will default to `./output/`
40
+ ```
41
+ netpluck --path sample_data.zip --filter="\.*bmp" --out ./output/
42
+ [1/1] 100.00% nova.bmp => ./output/nova.bmp
43
+ ```
44
+
45
+ You can also use the `--flatten` flag if you want to strip all directories from the resulting output; this will dump all extracted matches into your output folder with no hierarchy.
46
+
47
+ ### Getting Statistics
48
+ You can enable the `--stats` flag to see data about bytes and lookups made.
49
+ ```
50
+ netpluck --path sample_data.zip --stats --filter="\.*txt" --out ./output/
51
+ [1/2] 50.00% empty.txt => ./output/empty.txt
52
+ [2/2] 100.00% message.txt => ./output/message.txt
53
+
54
+ File size: 1.52MB
55
+ Cache hits: 7 size: 1.37KB
56
+ Uncached reads: 3 size: 64.10KB
57
+ Bytes saved: 1.46MB 95.88%
58
+ ```
59
+ In this instance we read 64kb of the 1556kb file while extracting the two txt files.
60
+
61
+
62
+ ## Extending
63
+
64
+ ### Adding New Protocols and Archives
65
+
66
+ Protocols in NetPluck are selected by a prefix such as `https://` or `b2://`. To add a new one you must modify the main class in `netpluck.py` and the `ProtocolType` enum. Make sure you also update the `_guess_protocol` method so it can automatically determine the appropriate protocol.
67
+
68
+ Archive types can be extended by adding your own handler to `netpluck/virtual_archive` and subclassing the `VirtualArchive`. Be sure to implement all interfaces exposed by `VirtualArchive`. See the existing zip archive handler for an example. Typically this means you implement `__init__` and `_read_uncached_range`. See `netpluck/virtual_file/local.py` or `netpluck/virtual_file/http.py` for a simple example.
69
+
70
+ You may also need to add a new virtual file type depending on how your data is queried which can be done just like `VirtualArchive` but with `VirtualFile`.
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "netpluck"
7
+ version = "0.9.1"
8
+ readme = "README.md"
9
+ license = "MIT"
10
+ license-files = ["LICENSE"]
11
+ authors = [
12
+ {name = "Jesse Janzer", email = "jjanzer@gmail.com"}
13
+ ]
14
+
15
+ dependencies = [
16
+ "requests"
17
+ ]
18
+
19
+ [project.optional-dependencies]
20
+ # bucket handler adds support for things like backblaze
21
+ bucket = [
22
+ "buckethandler>=0.9.0",
23
+ ]
24
+
25
+ [project.scripts]
26
+ netpluck = "netpluck.cli:main"
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/jjanzer/netpluck"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,8 @@
1
+ from .netpluck import NetPluck
2
+ from .cli import main
3
+
4
# Keep in sync with the `version` field of pyproject.toml; the distribution
# is published as 0.9.1, the previous "0.1.0" here was stale.
__version__ = "0.9.1"

# Names re-exported as the public package API.
__all__ = [
    "NetPluck",
    "main",
]
@@ -0,0 +1,74 @@
1
+ import argparse
2
+ import sys
3
+ import pprint
4
+ from enum import Enum
5
+ from .netpluck import NetPluck
6
+
7
def main(argv=None):
    '''
    Command line entry point of the `netpluck` tool.

    Opens the archive given by `--path`, then optionally prints its table of
    contents (`--toc`), extracts the files matching `--filter` into `--out`
    and prints read statistics (`--stats`).

    Args:
        argv: optional list of argument strings, when None argparse reads sys.argv[1:]
    '''
    parser = argparse.ArgumentParser(description="Selectively extract files from an archive without reading the entire file")
    # --path is mandatory: NetPluck inspects the path prefix to pick a protocol,
    # so a missing value would only surface later as an AttributeError on None.
    parser.add_argument("--path", type=str, required=True, help="The input zip file, use 'use b2://' for remote backblaze urls")
    parser.add_argument("--toc", action="store_true", help="Print the table of contents file")
    parser.add_argument("--filter", type=str, help="If passed will match the regex and output files", default=None)
    parser.add_argument("--flatten", action="store_true", help="If passed will flatten the output files so they do not expand their relative directories from the archive, eg: foo/bar/car.png => output/car.png")
    # The help text used to promise that every file is unpacked when no filter
    # is given, the code below only ever extracts when --filter is set.
    parser.add_argument("--out", type=str, help="The output folder to write the results to, files are only extracted when --filter is set", default="output")
    parser.add_argument("--stats", action="store_true", help="Print statistics about the virtual file reads")
    parser.add_argument("--config", type=str, help="Path to the config file for something like backblaze not used for local paths", default='config.zh.json')
    parser.add_argument("--quiet", action="store_true", help="Suppress non-error output")
    args = parser.parse_args(argv)

    plucker = NetPluck(args.path, config=args.config)
    plucker.quiet = args.quiet

    # The file list is always requested, even when it is not printed.
    files = plucker.get_file_list()

    if args.toc:
        for name in files:
            print(name)

    if args.out and args.filter is not None:
        plucker.extract_from_filter(args.filter, args.out, flatten=args.flatten)

    if args.stats:
        vf = plucker.vf
        cached_bytes_str = plucker.pretty_file_size(vf.cached_read_bytes)
        uncached_bytes_str = plucker.pretty_file_size(vf.uncached_read_bytes)
        size_bytes_str = plucker.pretty_file_size(vf.size)

        # Bytes saved = everything in the file that never had to be read uncached
        bytes_saved = vf.size - vf.uncached_read_bytes
        bytes_saved_str = plucker.pretty_file_size(bytes_saved)
        # Guard against a zero sized file
        percent_saved = (bytes_saved / vf.size) * 100 if vf.size > 0 else 0.0
        percent_saved_str = f"{percent_saved:.2f}%"

        print("")
        print(f"File size: {size_bytes_str}")
        print(f"Cache hits: {vf.cached_reads} size: {cached_bytes_str}")
        print(f"Uncached reads: {vf.uncached_reads} size: {uncached_bytes_str}")
        print(f"Bytes saved: {bytes_saved_str} {percent_saved_str}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,242 @@
1
+ from enum import Enum
2
+ import re
3
+ import math
4
+ import os
5
+ from typing import List
6
+
7
+ from netpluck.virtual_archive.zh import ZipHopper
8
+ from netpluck.virtual_file import VirtualFile, VirtualFileHTTP, VirtualFileLocal
9
+ from netpluck.virtual_archive.base import VirtualArchive
10
+
11
class ArchiveType(Enum):
    '''
    Archive formats that NetPluck can be told to open
    '''
    # Zip archives
    ZIP = 1
    # No format given, NetPluck guesses one when it is constructed with this value
    UNKNOWN = 99
17
+
18
class ProtocolType(Enum):
    '''
    Ways NetPluck can reach the bytes of an archive
    '''
    # Local file system
    LOCAL = 1
    # Both HTTP AND HTTPS are covered by this
    HTTP = 2
    # Backblaze B2
    B2 = 3
    # Protocol has not been determined
    UNKNOWN = 99
26
+
27
+
28
+ class NetPluck:
29
+
30
def __init__(self, path, config=None, archive_type:ArchiveType=ArchiveType.UNKNOWN):
    '''
    Binds a new NetPluck instance to the archive found at `path`

    Args:
        path: this should be a path to a local file, url, or a backblaze b2 file path, or similar
        config: can be a path to a json file or a key/value pair,
            it is used with VirtualFile handlers, so check the specifics of each one
        archive_type: if you know the type of archive you are working with you can specify it here, otherwise NetPluck will attempt to determine it by the file extension or magic bytes
    Raises:
        ImportError: `path` starts with b2:// but the optional bucket dependencies cannot be imported
        Exception: the protocol is not supported or the archive type could not be determined
    '''

    self.quiet = False
    self.ah:VirtualArchive
    self.at:ArchiveType = archive_type
    self.vf:VirtualFile
    # The original path (including any protocol prefix) is kept for the extension check
    self.path = path
    self.protocol:ProtocolType = self._guess_protocol(path)

    if self.protocol == ProtocolType.B2:
        # B2 support is only available if you install buckethandler
        try:
            from netpluck.virtual_file.backblaze import VirtualFileBackblaze
            from buckethandler import BackblazeB2Handler
        except ImportError as err:
            # Chain the original error so the traceback shows which import failed
            raise ImportError("To use backblaze b2 support you need to install the bucket handler package, you can do this via: pip install netpluck[bucket]") from err

        # The handler gets the path without the b2:// prefix
        bucket_path = path[len("b2://"):]
        self.vf = VirtualFileBackblaze(BackblazeB2Handler(config), bucket_path)
    elif self.protocol == ProtocolType.HTTP:
        self.vf = VirtualFileHTTP(path)
    elif self.protocol == ProtocolType.LOCAL:
        # file:// urls are accepted for local files, the prefix is dropped
        if path.startswith("file://"):
            path = path[len("file://"):]
        self.vf = VirtualFileLocal(path)
    else:
        raise Exception("Unsupported protocol type")

    if archive_type == ArchiveType.UNKNOWN:
        archive_type = self._guess_archive_type()
    self.at = archive_type

    # Finally let's instantiate the archive handler based on the archive type
    # (ZIP is the only archive type that can be reached here)
    self.ah = ZipHopper(self.vf)
79
+
80
+
81
+ # Private Methods
82
+
83
def _guess_protocol(self,path) -> ProtocolType:
    '''
    Determines the protocol from the prefix of the path
    Args:
        path: the path or url handed to NetPluck
    Returns:
        B2 for b2:// paths, HTTP for http:// and https:// urls, LOCAL for everything else
    '''
    if path.startswith("b2://"):
        return ProtocolType.B2
    if path.startswith(("http://", "https://")):
        return ProtocolType.HTTP
    # Anything without a known remote prefix is treated as a local file
    return ProtocolType.LOCAL
93
+
94
def _guess_archive_type(self) -> ArchiveType:
    '''
    Determines the archive type from the file extension, falling back to the magic bytes
    Returns:
        ArchiveType.ZIP when the path ends in .zip or the file starts with the zip signature
    Raises:
        Exception: neither the extension nor the first four bytes identify a zip
    '''
    if not self.path.endswith(".zip"):
        # The extension did not help, try the magic bytes at the start of the file
        signature = self.vf[0:4]
        if signature != b'PK\x03\x04':
            raise Exception("Could not determine archive type")
    return ArchiveType.ZIP
104
+
105
+
106
+ # Public Methods
107
+
108
+
109
def get_file_list(self) -> List[str]:
    '''
    Asks the archive handler for every file stored in the archive
    Returns:
        The list of file paths reported by the archive handler
    '''
    archive = self.ah
    return archive.get_file_list()
116
+
117
def get_file(self, filename) -> bytes:
    '''
    Reads a single file out of the archive through the archive handler
    Args:
        filename: the path to the file within the archive
    Returns:
        Whatever the archive handler returns for that file, expected to be its contents as bytes
    '''
    archive = self.ah
    return archive.get_file(filename)
126
+
127
def extract(self,internal_path:str, output_dir:str, flatten:bool=False, replace:bool=True) -> str:
    '''
    Hands the extraction of a single file over to the archive handler
    Args:
        internal_path: the path to the file within the archive
        output_dir: the directory to extract the file to
        flatten: if true the extracted file will not be placed in child directories
        replace: if true, existing files will be replaced
    Returns:
        The value returned by the archive handler, the path to the extracted file
    '''
    archive = self.ah
    destination = archive.extract(internal_path, output_dir, flatten=flatten, replace=replace)
    return destination
139
+
140
def extract_from_filter(self, pattern, output_dir, flatten=False, replace=True) -> List[str]:
    '''
    Extracts every file matching a regex pattern (or list of patterns) into an output directory
    Args:
        pattern: a regex pattern or list of regex patterns to match against the file list of the archive
        output_dir: the directory to extract the matched files to, it is created when missing
        flatten: if true the extracted files are not placed in child directories, files with the same name may overwrite each other
        replace: if true, existing files will be replaced
    Returns:
        A list of the paths to the extracted files
    '''
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir,exist_ok=True)

    targets = self.filter(pattern)
    total = len(targets)
    extracted = []
    for position, name in enumerate(targets, start=1):
        destination = self.extract(name, output_dir, flatten=flatten, replace=replace)
        extracted.append(destination)
        percent = (position / total) * 100.0

        # Progress line, one per extracted file
        if not self.quiet:
            print(f"[{position}/{total}] {percent:6.2f}% {name} => {destination}")

    return extracted
168
+
169
def filter(self, patterns) -> List[str]:
    '''
    Finds all files in the TOC of an archive that match a regex pattern or list of regex patterns
    Args:
        patterns: a regex pattern (string or compiled) or a list of regex patterns to match against the file list of the archive
    Returns:
        A list of files that match the regex pattern(s), without duplicates,
        ordered by pattern first and by position in the file list second
    '''
    files = self.get_file_list()

    # A single pattern is treated like a list with one entry
    if isinstance(patterns, (str, re.Pattern)):
        patterns = [patterns]

    # A dict keeps insertion order and gives constant time duplicate checks,
    # which matters for archives with a very large number of entries
    matches = {}

    for pattern in patterns:
        # Compile once per pattern instead of once per file
        matcher = re.compile(pattern)
        for filename in files:
            if matcher.search(filename):
                matches.setdefault(filename, None)

    return list(matches)
192
+
193
+
194
+
195
+
196
+
197
+ # misc helpers
198
def pretty_file_size(self,bytes) -> str:
    '''
    Formats a byte count as a human readable size with two decimals, 10 * 1024 * 1024 becomes 10.00MB

    Args:
        bytes: the number of bytes to convert to a human readable format
    Returns:
        A string like 10.00MB, 5.00KB or 46.00B, or 0B when bytes is None
    '''
    if bytes is None:
        return "0B"

    units = ('B', 'KB', 'MB', 'GB', 'TB')
    remaining = bytes
    index = 0
    # Divide by 1024 until the value fits the current unit
    while index < len(units):
        if remaining < 1024:
            return f"{remaining:.2f}{units[index]}"
        remaining = remaining / 1024
        index += 1

    # Larger than anything expressible in TB
    return f"{remaining:.2f}PB"
214
+
215
def from_pretty_file_size(self,raw_str) -> int:
    '''
    Converts a human readable size like 10MB to 10*1024*1024, 5KB to 5*1024 or 46B to 46
    Args:
        raw_str: a string representing a human readable file size, like 10MB or 5KB or 46B,
            the unit is required and matched case insensitively
    Returns:
        The number of bytes represented by the input string, rounded up to a whole byte
    Raises:
        ValueError: the string does not start with a number followed by B, KB, MB, GB, TB or PB
    '''
    # PB is accepted by the pattern below, so it needs a multiplier of its own
    # (it used to fall through and be treated as plain bytes)
    multipliers = {
        'B': 1,
        'KB': 1024,
        'MB': 1024 ** 2,
        'GB': 1024 ** 3,
        'TB': 1024 ** 4,
        'PB': 1024 ** 5,
    }

    raw_str = raw_str.strip().upper()
    match = re.match(r'([0-9\.]+)([KMGTP]?B)', raw_str)
    if not match:
        raise ValueError(f"Invalid file size format: {raw_str}")

    try:
        size = float(match.group(1))
    except ValueError:
        # Things like "1.2.3" pass the pattern but are not a number
        raise ValueError(f"Invalid file size format: {raw_str}") from None

    return math.ceil(size * multipliers[match.group(2)])
@@ -0,0 +1,2 @@
1
+ from .base import VirtualArchive
2
+ from .zh import ZipHopper
@@ -0,0 +1,16 @@
1
+ from netpluck.virtual_file import VirtualFile
2
+
3
class VirtualArchive:
    '''
    Interface for archive handlers, subclasses are expected to override every method below.
    '''

    def __init__(self, vf:VirtualFile):
        # The virtual file the handler works on
        self.vf = vf

    def get_file(self, filename):
        '''
        Must be overridden, takes the path of a file within the archive
        '''
        raise NotImplementedError()

    def get_file_list(self):
        '''
        Must be overridden, takes no arguments
        '''
        raise NotImplementedError()

    def extract(self,internal_path:str, output_dir:str, flatten:bool=False, replace:bool=True) -> str:
        '''
        Must be overridden, takes the path within the archive, the output directory and the flatten/replace flags
        '''
        raise NotImplementedError()

    def extract_from_filter(self, pattern, output_dir, flatten=False, replace=True):
        '''
        Must be overridden, takes a pattern, the output directory and the flatten/replace flags
        '''
        raise NotImplementedError()