PyPI - netpluck - Versions diffs - 0.9.1__tar.gz - Mend

netpluck 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

netpluck-0.9.1/LICENSE +21 -0
netpluck-0.9.1/PKG-INFO +83 -0
netpluck-0.9.1/README.md +70 -0
netpluck-0.9.1/pyproject.toml +29 -0
netpluck-0.9.1/setup.cfg +4 -0
netpluck-0.9.1/src/netpluck/__init__.py +8 -0
netpluck-0.9.1/src/netpluck/cli.py +74 -0
netpluck-0.9.1/src/netpluck/netpluck.py +242 -0
netpluck-0.9.1/src/netpluck/virtual_archive/__init__.py +2 -0
netpluck-0.9.1/src/netpluck/virtual_archive/base.py +16 -0
netpluck-0.9.1/src/netpluck/virtual_archive/zh.py +443 -0
netpluck-0.9.1/src/netpluck/virtual_file/__init__.py +7 -0
netpluck-0.9.1/src/netpluck/virtual_file/backblaze.py +35 -0
netpluck-0.9.1/src/netpluck/virtual_file/base.py +126 -0
netpluck-0.9.1/src/netpluck/virtual_file/http.py +44 -0
netpluck-0.9.1/src/netpluck/virtual_file/local.py +31 -0
netpluck-0.9.1/src/netpluck.egg-info/PKG-INFO +83 -0
netpluck-0.9.1/src/netpluck.egg-info/SOURCES.txt +20 -0
netpluck-0.9.1/src/netpluck.egg-info/dependency_links.txt +1 -0
netpluck-0.9.1/src/netpluck.egg-info/entry_points.txt +2 -0
netpluck-0.9.1/src/netpluck.egg-info/requires.txt +4 -0
netpluck-0.9.1/src/netpluck.egg-info/top_level.txt +1 -0

netpluck-0.9.1/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) [2026] [Jesse Janzer]
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

netpluck-0.9.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,83 @@
+Metadata-Version: 2.4
+Name: netpluck
+Version: 0.9.1
+Author-email: Jesse Janzer <jjanzer@gmail.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/jjanzer/netpluck
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: requests
+Provides-Extra: bucket
+Requires-Dist: buckethandler>=0.9.0; extra == "bucket"
+Dynamic: license-file
+# NetPluck
+NetPluck is a tool that allows you to remotely extract assets, typically files, from local and remote archives, such as a zip, without fetching or opening the entire archive. NetPluck was specifically designed to be used for very large .zip files that are stored in remote locations such as buckets on Backblaze and CDNs. It is extremely useful if you want to fetch the list of files in, or extract specific files from, very large remote archives.
+If you intend to fetch and extract all the contents of the archive or your files are very small you don't need this library.
+There are three key features of NetPluck:
+1. Works very efficiently on large files
+1. Can tell you what's in the file without reading the entire thing
+1. Extracts specific files without reading the entire thing
+NetPluck provides a CLI tool via `netpluck` and the core python libraries.
+## Formats Supported
+1. zip
+1. zip64
+## Protocols Supported
+1. Local Files
+1. Remote Files over HTTP or HTTPS that support range
+1. Backblaze B2
+## CLI Usage
+### Retrieving the List of Files in the Archive
+```
+netpluck --path sample_data.zip --toc
+empty.txt
+lava.png
+message.txt
+nova.bmp
+triangles.png
+```
+### Extracting Files
+You can pass regular expressions using `--filter` to extract files. If you do not specify the `--out` argument it will default to `./output/`
+```
+netpluck --path sample_data.zip --filter="\.*bmp" --out ./output/
+[1/1] 100.00% nova.bmp => ./output/nova.bmp
+```
+You can also use the `--flatten` flag if you want to strip all directories from the resulting output, this will dump all extracted matches into your output folder with no hierarchy.
+### Getting Statistics
+You can enable the `--stats` flag to see data about bytes and lookups made.
+```
+netpluck --path sample_data.zip --stats --filter="\.*txt" --out ./output/
+[1/2]  50.00% empty.txt => ./output/empty.txt
+[2/2] 100.00% message.txt => ./output/message.txt
+File size: 1.52MB
+Cache hits: 7 size: 1.37KB
+Uncached reads: 3 size: 64.10KB
+Bytes saved: 1.46MB 95.88%
+```
+In this instance we read 64kb of the 1556kb file while extracting the two txt files.
+## Extending
+### Adding New Protocols and Archives
+Protocols in NetPluck are selected by a prefix such as `https://` or `b2://`. To add a new one you must modify the main class in `netpluck.py` and the `ProtocolType` enum. Make sure you also update the `_guess_protocol` method so it can automatically determine the appropriate protocol.
+Archive types can be extended by adding your own handler to `netpluck/virtual_archive` and subclassing the `VirtualArchive`. Be sure to implement all interfaces exposed by `VirtualArchive`. See the existing zip archive handler for an example. Typically this means you implement `__init__` and `_read_uncached_range`. See `netpluck/virtual_file/local.py` or `netpluck/virtual_file/http.py` for a simple example.
+You may also need to add a new virtual file type depending on how your data is queried which can be done just like `VirtualArchive` but with `VirtualFile`.

netpluck-0.9.1/README.md ADDED Viewed

@@ -0,0 +1,70 @@
+# NetPluck
+NetPluck is a tool that allows you to remotely extract assets, typically files, from local and remote archives, such as a zip, without fetching or opening the entire archive. NetPluck was specifically designed to be used for very large .zip files that are stored in remote locations such as buckets on Backblaze and CDNs. It is extremely useful if you want to fetch the list of files in, or extract specific files from, very large remote archives.
+If you intend to fetch and extract all the contents of the archive or your files are very small you don't need this library.
+There are three key features of NetPluck:
+1. Works very efficiently on large files
+1. Can tell you what's in the file without reading the entire thing
+1. Extracts specific files without reading the entire thing
+NetPluck provides a CLI tool via `netpluck` and the core python libraries.
+## Formats Supported
+1. zip
+1. zip64
+## Protocols Supported
+1. Local Files
+1. Remote Files over HTTP or HTTPS that support range
+1. Backblaze B2
+## CLI Usage
+### Retrieving the List of Files in the Archive
+```
+netpluck --path sample_data.zip --toc
+empty.txt
+lava.png
+message.txt
+nova.bmp
+triangles.png
+```
+### Extracting Files
+You can pass regular expressions using `--filter` to extract files. If you do not specify the `--out` argument it will default to `./output/`
+```
+netpluck --path sample_data.zip --filter="\.*bmp" --out ./output/
+[1/1] 100.00% nova.bmp => ./output/nova.bmp
+```
+You can also use the `--flatten` flag if you want to strip all directories from the resulting output, this will dump all extracted matches into your output folder with no hierarchy.
+### Getting Statistics
+You can enable the `--stats` flag to see data about bytes and lookups made.
+```
+netpluck --path sample_data.zip --stats --filter="\.*txt" --out ./output/
+[1/2]  50.00% empty.txt => ./output/empty.txt
+[2/2] 100.00% message.txt => ./output/message.txt
+File size: 1.52MB
+Cache hits: 7 size: 1.37KB
+Uncached reads: 3 size: 64.10KB
+Bytes saved: 1.46MB 95.88%
+```
+In this instance we read 64kb of the 1556kb file while extracting the two txt files.
+## Extending
+### Adding New Protocols and Archives
+Protocols in NetPluck are selected by a prefix such as `https://` or `b2://`. To add a new one you must modify the main class in `netpluck.py` and the `ProtocolType` enum. Make sure you also update the `_guess_protocol` method so it can automatically determine the appropriate protocol.
+Archive types can be extended by adding your own handler to `netpluck/virtual_archive` and subclassing the `VirtualArchive`. Be sure to implement all interfaces exposed by `VirtualArchive`. See the existing zip archive handler for an example. Typically this means you implement `__init__` and `_read_uncached_range`. See `netpluck/virtual_file/local.py` or `netpluck/virtual_file/http.py` for a simple example.
+You may also need to add a new virtual file type depending on how your data is queried which can be done just like `VirtualArchive` but with `VirtualFile`.

netpluck-0.9.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,29 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "netpluck"
+version = "0.9.1"
+readme = "README.md"
+license = "MIT"
+license-files = ["LICENSE"]
+authors = [
+	{name = "Jesse Janzer", email = "jjanzer@gmail.com"}
+]
+dependencies = [
+	"requests"
+]
+[project.optional-dependencies]
+# bucket handler adds support for things like backblaze
+bucket = [
+	"buckethandler>=0.9.0",
+]
+[project.scripts]
+netpluck = "netpluck.cli:main"
+[project.urls]
+Homepage = "https://github.com/jjanzer/netpluck"

netpluck-0.9.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

netpluck-0.9.1/src/netpluck/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+from .netpluck import NetPluck
+from .cli import main
+__version__ = "0.1.0"
+__all__ = [
+   "NetPluck",
+   "main",
+]

netpluck-0.9.1/src/netpluck/cli.py ADDED Viewed

@@ -0,0 +1,74 @@
+import argparse
+import sys
+import pprint
+from enum import Enum
+from .netpluck import NetPluck
+def main():
+	def str2bool(v):
+		'''
+		Helper to parse bool values
+		'''
+		if isinstance(v, bool):
+			return v
+		if v.lower() in ('yes', 'true', 't', 'y', '1'):
+			return True
+		elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+			return False
+		else:
+			raise argparse.ArgumentTypeError('Boolean value expected.')
+	parser = argparse.ArgumentParser(description="Selectively extract files from an archive without reading the entire file")
+	parser.add_argument("--path", type=str, help="The input zip file, use 'use b2://' for remote backblaze urls")
+	#parser.add_argument("--toc", type=str2bool, nargs="?", help="Print the table of contents file", const=False)
+	parser.add_argument("--toc", action="store_true", help="Print the table of contents file")
+	parser.add_argument("--filter", type=str, help="If passed will match the regex and output files", default=None)
+	parser.add_argument("--flatten", action="store_true", help="If passed will flatten the output files so they do not expand their relative directories from the archive, eg: foo/bar/car.png => output/car.png")
+	parser.add_argument("--out", type=str, help="The output folder to write the results to, if no filter is set all files are unpacked", default="output")
+	parser.add_argument("--stats", action="store_true", help="Print statistics about the virtual file reads")
+	parser.add_argument("--config", type=str, help="Path to the config file for something like backblaze not used for local paths", default='config.zh.json')
+	parser.add_argument("--quiet", action="store_true", help="Suppress non-error output")
+	args = parser.parse_args()
+	in_file = args.path
+	print_toc = args.toc
+	filter = args.filter
+	flatten = args.flatten
+	config_path = args.config
+	output_dir = args.out
+	print_stats = args.stats
+	quiet = args.quiet
+	np = NetPluck(in_file, config=config_path)
+	np.quiet = quiet
+	files = np.get_file_list()
+	if print_toc:
+		for file in files:
+			print(file)
+	if output_dir:
+		if filter != None:
+			np.extract_from_filter(filter, output_dir, flatten=flatten)
+	if print_stats:
+		cached_bytes_str = np.pretty_file_size(np.vf.cached_read_bytes)
+		uncached_bytes_str = np.pretty_file_size(np.vf.uncached_read_bytes)
+		size_bytes_str = np.pretty_file_size(np.vf.size)
+		bytes_saved = np.vf.size - (np.vf.uncached_read_bytes)
+		bytes_saved_str = np.pretty_file_size(bytes_saved)
+		percent_saved = (bytes_saved / np.vf.size) * 100 if np.vf.size > 0 else 0.0
+		percent_saved_str = f"{percent_saved:.2f}%"
+		#cached_bytes = np.vf.cached_read_bytes
+		print("")
+		print(f"File size: {size_bytes_str}")
+		print(f"Cache hits: {np.vf.cached_reads} size: {cached_bytes_str}")
+		print(f"Uncached reads: {np.vf.uncached_reads} size: {uncached_bytes_str}")
+		print(f"Bytes saved: {bytes_saved_str} {percent_saved_str}")
+if __name__ == "__main__":
+	main()

netpluck-0.9.1/src/netpluck/netpluck.py ADDED Viewed

@@ -0,0 +1,242 @@
+from enum import Enum
+import re
+import math
+import os
+from typing import List
+from netpluck.virtual_archive.zh import ZipHopper
+from netpluck.virtual_file import VirtualFile, VirtualFileHTTP, VirtualFileLocal
+from netpluck.virtual_archive.base import VirtualArchive
+class ArchiveType(Enum):
+	'''
+	The list of archive types NetPluck supports
+	UNKNOWN is the default passed to NetPluck and makes it guess the type from the path or magic bytes
+	'''
+	ZIP = 1
+	UNKNOWN = 99
+class ProtocolType(Enum):
+	'''
+	The list of protocol types NetPluck supports
+	The protocol is chosen from the prefix of the path handed to NetPluck
+	'''
+	LOCAL = 1 # Local file system, also used for file:// paths and anything without a known prefix
+	HTTP = 2 # Both HTTP AND HTTPS are covered by this
+	B2 = 3 # Backblaze B2, selected by the b2:// prefix
+	UNKNOWN = 99 # Placeholder value before the protocol has been guessed
+class NetPluck:
+	def __init__(self, path, config=None, archive_type:ArchiveType=ArchiveType.UNKNOWN):
+		'''
+		Returns:
+			An instance of NetPluck bound to the archive type
+		Args:
+			path: this should be a path to a local file, url, or a backblaze b2 file path, or similar
+			config: can be a path to a json file or a key/value pair,
+				it is used with VirtualFile handlers, so check the specifics of each one
+			archive_type: if you know the type of archive you are working with you can specify it here, otherwise NetPluck will attempt to determine it by the file extension or magic bytes
+		'''
+		self.quiet = False
+		self.ah:VirtualArchive
+		self.at:ArchiveType = archive_type
+		self.protocol:ProtocolType = ProtocolType.UNKNOWN
+		self.vf:VirtualFile
+		self.path = path
+		self.protocol = self._guess_protocol(path)
+		if self.protocol == ProtocolType.B2:
+			# B2 support is only available if you install buckethandler
+			try:
+				from netpluck.virtual_file.backblaze import VirtualFileBackblaze
+				from buckethandler import BackblazeB2Handler
+			except ImportError:
+				raise ImportError("To use backblaze b2 support you need to install the bucket handler package, you can do this via: pip install netpluck[bucket]")
+			protocol_prefix_b2 = "b2://"
+			path = path[len(protocol_prefix_b2):]
+			handler = BackblazeB2Handler(config)
+			self.vf = VirtualFileBackblaze(handler, path)
+		elif self.protocol == ProtocolType.HTTP:
+			self.vf = VirtualFileHTTP(path)
+		elif self.protocol == ProtocolType.LOCAL:
+			if path.startswith("file://"):
+				path = path[len("file://"):]
+			self.vf = VirtualFileLocal(path)
+		else:
+			raise Exception("Unsupported protocol type")
+		if archive_type == ArchiveType.UNKNOWN:
+			archive_type = self._guess_archive_type()
+		self.at = archive_type
+		# Finally let's instantiate the archive handler based on the archive type
+		self.ah = ZipHopper(self.vf)
+	# Private Methods
+	def _guess_protocol(self,path) -> ProtocolType:
+		protocol_prefix_b2 = "b2://"
+		protocol_prefix_http = "http://"
+		protocol_prefix_https = "https://"
+		if path.startswith(protocol_prefix_b2):
+			return ProtocolType.B2
+		elif path.startswith(protocol_prefix_http) or path.startswith(protocol_prefix_https):
+			return ProtocolType.HTTP
+		else:
+			return ProtocolType.LOCAL
+	def _guess_archive_type(self) -> ArchiveType:
+		if self.path.endswith(".zip"):
+			return ArchiveType.ZIP
+		else:
+			# Try fetching the magic bytes of the file to determine the type
+			magic_bytes = self.vf[0:4]
+			if magic_bytes == b'PK\x03\x04':
+				return ArchiveType.ZIP
+			else:
+				raise Exception("Could not determine archive type")
+	# Public Methods
+	def get_file_list(self) -> List[str]:
+		'''
+		Returns the list of all files in the archive
+		Returns:
+			A list of file paths in the archive
+		'''
+		return self.ah.get_file_list()
+	def get_file(self, filename) -> bytes:
+		'''
+		Returns the contents of a file in the archive
+		Args:
+			filename: the path to the file within the archive
+		Returns:
+			The contents of the file as bytes
+		'''
+		return self.ah.get_file(filename)
+	def extract(self,internal_path:str, output_dir:str, flatten:bool=False, replace:bool=True) -> str:
+		'''
+		Extracts a single file from the archive to an output directory, it will maintain the relative path of the file in the archive unless flatten is set to True
+		Args:
+			internal_path: the path to the file within the archive
+			output_dir: the directory to extract the file to
+			flatten: if true the extracted file will not be placed in child directories
+			replace: if true, existing files will be replaced
+		Returns:
+			The path to the extracted file
+		'''
+		return self.ah.extract(internal_path, output_dir, flatten=flatten, replace=replace)
+	def extract_from_filter(self, pattern, output_dir, flatten=False, replace=True) -> List[str]:
+		'''
+		Extracts files that match a regex pattern or list of regex patterns to an output directory, it will maintain the relative paths of the files in the archive unless flatten is set to True
+		Args:
+			pattern: a regex pattern or list of regex patterns to match against the file list of the archive
+			output_dir: the directory to extract the matched files to
+			flatten: if true the extracted files will not be placed in child directories, they will lay directly in the output folder (they may clobber over each other if named the same)
+			replace: if true, existing files will be replaced
+		Returns:
+			A list of the paths to the extracted files
+		'''
+		if os.path.isdir(output_dir) == False:
+			os.makedirs(output_dir,exist_ok=True)
+		matched_files = self.filter(pattern)
+		i = 0
+		count = len(matched_files)
+		result = []
+		for file in matched_files:
+			dst = self.extract(file, output_dir, flatten=flatten, replace=replace)
+			result.append(dst)
+			i += 1
+			percent = (i / count) * 100.0
+			if not self.quiet:
+				print(f"[{i}/{count}] {percent:6.2f}% {file} => {dst}")
+		return result
+	def filter(self, patterns) -> List[str]:
+		'''
+		Finds all files in the TOC of an archive that match a regex pattern or list of regex patterns
+		Args:
+			patterns: a regex pattern or list of regex patterns to match against the file list of the archive
+		Returns:
+			A list of files that match the regex pattern(s)
+		'''
+		files = self.get_file_list()
+		if isinstance(patterns, str):
+			patterns = [patterns]
+		result = []
+		for pattern in patterns:
+			matching_files = [filename for filename in files if re.search(pattern, filename)]
+			for filename in matching_files:
+				if filename not in result:
+					result.append(filename)
+		return result
+	# misc helpers
+	def pretty_file_size(self,bytes) -> str:
+		'''
+		Converts a number like 10 * 1024 * 1024 to 10MB
+		Args:
+			bytes: the number of bytes to convert to a human readable format
+		Returns:
+			A string representing the human readable file size, like 10MB or 5KB or 46B
+		'''
+		if bytes is None:
+			return "0B"
+		for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
+			if bytes < 1024:
+				return f"{bytes:.2f}{unit}"
+			bytes /= 1024
+		return f"{bytes:.2f}PB"
+	def from_pretty_file_size(self,raw_str) -> int:
+		'''
+		Converts something like 10MB to 10*1024*1024 or 5*1024 to 5KB or 46 to 46B
+		Args:
+			raw_str: a string representing a human readable file size, like 10MB or 5KB or 46B
+		Returns:
+			The number of bytes represented by the input string, like 10*1024*1024 for 10MB or 5*1024 for 5KB or 46 for 46B
+		'''
+		raw_str = raw_str.strip().upper()
+		match = re.match(r'([0-9\.]+)([KMGTP]?B)', raw_str)
+		if not match:
+			raise ValueError(f"Invalid file size format: {raw_str}")
+		size = float(match.group(1))
+		unit = match.group(2)
+		if unit == 'KB':
+			bytes = size * 1024
+		elif unit == 'MB':
+			bytes = size * 1024 * 1024
+		elif unit == 'GB':
+			bytes = size * 1024 * 1024 * 1024
+		elif unit == 'TB':
+			bytes = size * 1024 * 1024 * 1024 * 1024
+		else:
+			bytes = size
+		return math.ceil(bytes)

netpluck-0.9.1/src/netpluck/virtual_archive/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ from .base import VirtualArchive
2	+ from .zh import ZipHopper

netpluck-0.9.1/src/netpluck/virtual_archive/base.py ADDED Viewed

@@ -0,0 +1,16 @@
+from netpluck.virtual_file import VirtualFile
+class VirtualArchive:
+	'''
+	VirtualArchive is an interface that defines the methods that an archive handler should implement.
+	Every method except __init__ raises NotImplementedError here and is meant to be overridden by a subclass.
+	'''
+	def __init__(self, vf:VirtualFile):
+		'''
+		Args:
+			vf: the virtual file the archive handler works on
+		'''
+		self.vf = vf
+	def get_file(self, filename):
+		'''
+		Args:
+			filename: the path to the file within the archive
+		Returns:
+			Presumably the contents of the file as bytes, to be confirmed against the concrete handler
+		'''
+		raise NotImplementedError()
+	def get_file_list(self):
+		'''
+		Returns:
+			Presumably the list of file paths in the archive, to be confirmed against the concrete handler
+		'''
+		raise NotImplementedError()
+	def extract(self,internal_path:str, output_dir:str, flatten:bool=False, replace:bool=True) -> str:
+		'''
+		Args:
+			internal_path: the path to the file within the archive
+			output_dir: the directory to extract the file to
+			flatten: flag passed by the caller, presumably to drop the directories of the internal path
+			replace: flag passed by the caller, presumably to allow overwriting existing files
+		Returns:
+			A string, presumably the path of the extracted file
+		'''
+		raise NotImplementedError()
+	def extract_from_filter(self, pattern, output_dir, flatten=False, replace=True):
+		'''
+		Args:
+			pattern: presumably a regex pattern or list of patterns selecting the files to extract
+			output_dir: the directory to extract the matched files to
+			flatten: same meaning as in extract
+			replace: same meaning as in extract
+		'''
+		raise NotImplementedError()