occystrap 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- occystrap/_version.py +34 -0
- occystrap/filters/__init__.py +10 -0
- occystrap/filters/base.py +67 -0
- occystrap/filters/exclude.py +136 -0
- occystrap/filters/inspect.py +179 -0
- occystrap/filters/normalize_timestamps.py +123 -0
- occystrap/filters/search.py +177 -0
- occystrap/inputs/__init__.py +1 -0
- occystrap/inputs/base.py +40 -0
- occystrap/inputs/docker.py +171 -0
- occystrap/inputs/registry.py +260 -0
- occystrap/inputs/tarfile.py +88 -0
- occystrap/main.py +330 -31
- occystrap/outputs/__init__.py +1 -0
- occystrap/outputs/base.py +46 -0
- occystrap/{output_directory.py → outputs/directory.py} +10 -9
- occystrap/outputs/docker.py +137 -0
- occystrap/{output_mounts.py → outputs/mounts.py} +2 -1
- occystrap/{output_ocibundle.py → outputs/ocibundle.py} +1 -1
- occystrap/outputs/registry.py +240 -0
- occystrap/{output_tarfile.py → outputs/tarfile.py} +18 -2
- occystrap/pipeline.py +297 -0
- occystrap/tarformat.py +122 -0
- occystrap/tests/test_inspect.py +355 -0
- occystrap/tests/test_tarformat.py +199 -0
- occystrap/uri.py +231 -0
- occystrap/util.py +67 -38
- occystrap-0.4.1.dist-info/METADATA +444 -0
- occystrap-0.4.1.dist-info/RECORD +38 -0
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/WHEEL +1 -1
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/entry_points.txt +0 -1
- occystrap/docker_extract.py +0 -36
- occystrap/docker_registry.py +0 -192
- occystrap-0.3.0.dist-info/METADATA +0 -131
- occystrap-0.3.0.dist-info/RECORD +0 -20
- occystrap-0.3.0.dist-info/pbr.json +0 -1
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info/licenses}/AUTHORS +0 -0
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info/licenses}/LICENSE +0 -0
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import tarfile
|
|
6
|
+
|
|
7
|
+
from occystrap import constants
|
|
8
|
+
from occystrap.filters.base import ImageFilter
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
LOG = logging.getLogger(__name__)
|
|
12
|
+
LOG.setLevel(logging.INFO)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SearchFilter(ImageFilter):
    """Searches layers for files matching a pattern.

    This filter can operate in two modes:
    - Search-only: wrapped_output is None, just prints results
    - Passthrough: searches AND passes elements to wrapped output

    In passthrough mode, this allows searching while also writing output,
    enabling pipelines like:
        input -> search -> tarfile (search while creating tarball)
    """

    def __init__(self, wrapped_output, pattern, use_regex=False,
                 image=None, tag=None, script_friendly=False):
        """Initialize the search filter.

        Args:
            wrapped_output: The ImageOutput to pass elements to, or None
                for search-only mode.
            pattern: Glob pattern or regex to match file paths.
            use_regex: If True, treat pattern as a regex instead of glob.
            image: Image name for output formatting.
            tag: Image tag for output formatting.
            script_friendly: If True, output in machine-parseable format.
        """
        super().__init__(wrapped_output)
        self.pattern = pattern
        self.use_regex = use_regex
        self.image = image
        self.tag = tag
        self.script_friendly = script_friendly
        self.results = []  # List of (layer_digest, path, file_info_dict)

        if use_regex:
            self._compiled_pattern = re.compile(pattern)

    def fetch_callback(self, digest):
        """Always fetch all layers for searching.

        Even if a wrapped output's callback would skip this layer, we still
        need the layer contents to search them, so we unconditionally
        request the data.
        """
        return True

    def _matches(self, path):
        """Check if a path matches the search pattern."""
        if self.use_regex:
            return self._compiled_pattern.search(path) is not None

        # Match against full path or just the filename.
        # This allows patterns like "*bash" to match "/bin/bash".
        filename = os.path.basename(path)
        return (fnmatch.fnmatch(path, self.pattern) or
                fnmatch.fnmatch(filename, self.pattern))

    def _get_file_type(self, member):
        """Get a human-readable file type string for a tarfile member."""
        if member.isfile():
            return 'file'
        elif member.isdir():
            return 'directory'
        elif member.issym():
            return 'symlink'
        elif member.islnk():
            return 'hardlink'
        elif member.isfifo():
            return 'fifo'
        elif member.ischr():
            return 'character device'
        elif member.isblk():
            return 'block device'
        else:
            return 'unknown'

    def _search_layer(self, name, data):
        """Search a single layer tarball for files matching the pattern.

        Matches are accumulated into self.results; they are only printed
        at finalize() time.
        """
        LOG.info('Searching layer %s' % name)

        data.seek(0)
        try:
            with tarfile.open(fileobj=data, mode='r') as layer_tar:
                for member in layer_tar:
                    if not self._matches(member.name):
                        continue

                    file_info = {
                        'type': self._get_file_type(member),
                        'size': member.size,
                        'mode': member.mode,
                        'uid': member.uid,
                        'gid': member.gid,
                        'mtime': member.mtime,
                    }
                    # Only links carry a meaningful linkname.
                    if member.issym() or member.islnk():
                        file_info['linkname'] = member.linkname

                    self.results.append((name, member.name, file_info))
        except tarfile.TarError as e:
            # A corrupt layer should not abort the whole search.
            LOG.error('Failed to read layer %s: %s' % (name, e))

    def process_image_element(self, element_type, name, data):
        """Process an image element, searching layers for matches."""
        # Search layers; skipped layers arrive with data=None.
        if element_type == constants.IMAGE_LAYER and data is not None:
            self._search_layer(name, data)

        # Pass through to wrapped output if present
        if self._wrapped is not None:
            if data is not None:
                data.seek(0)  # Reset for next consumer
            self._wrapped.process_image_element(element_type, name, data)

    def _print_results(self):
        """Print search results to stdout."""
        if not self.results:
            if not self.script_friendly:
                print('No matches found.')
            return

        if self.script_friendly:
            # Output format: image:tag:layer:path
            # One line per match, suitable for piping to other tools
            for layer_digest, path, file_info in self.results:
                print('%s:%s:%s:%s'
                      % (self.image, self.tag, layer_digest, path))
            return

        # Group results by layer (insertion order is preserved).
        results_by_layer = {}
        for layer_digest, path, file_info in self.results:
            results_by_layer.setdefault(layer_digest, []).append(
                (path, file_info))

        # Print results, one section per layer.
        for layer_digest, matches in results_by_layer.items():
            print('Layer: %s' % layer_digest)
            for path, file_info in matches:
                kind = file_info['type']
                if kind in ('symlink', 'hardlink'):
                    print(' %s -> %s (%s)'
                          % (path, file_info['linkname'], kind))
                elif kind == 'file':
                    print(' %s (%s, %d bytes)'
                          % (path, kind, file_info['size']))
                else:
                    # Directories and special files have no useful size.
                    print(' %s (%s)' % (path, kind))
            print()

        layer_count = len(results_by_layer)
        match_count = len(self.results)
        print('Found %d match%s in %d layer%s.'
              % (match_count, '' if match_count == 1 else 'es',
                 layer_count, '' if layer_count == 1 else 's'))

    def finalize(self):
        """Print search results and finalize wrapped output."""
        self._print_results()

        if self._wrapped is not None:
            self._wrapped.finalize()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Input modules for reading container images from various sources
|
occystrap/inputs/base.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ImageInput(ABC):
    """Abstract base class for image input sources.

    Input sources are responsible for fetching container images from various
    sources (registries, local Docker daemon, tarfiles) and yielding image
    elements (config files and layers) in a standard format.

    Concrete implementations are presumably the sibling modules under
    occystrap/inputs/ (docker.py, registry.py, tarfile.py) — confirm
    against the package layout.
    """

    @property
    @abstractmethod
    def image(self):
        """Return the image name (e.g. 'library/busybox')."""
        pass

    @property
    @abstractmethod
    def tag(self):
        """Return the image tag (e.g. 'latest')."""
        pass

    @abstractmethod
    def fetch(self, fetch_callback=None):
        """Fetch image elements (config files and layers).

        Args:
            fetch_callback: Optional callable that takes a layer digest and
                returns True if the layer should be fetched, False to skip.
                If None, all layers are fetched.

        Yields:
            Tuples of (element_type, name, data) where:
            - element_type is constants.CONFIG_FILE or constants.IMAGE_LAYER
            - name is the element identifier (config filename or layer digest)
            - data is a file-like object containing the element data,
              or None if the layer was skipped by fetch_callback
        """
        pass
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
# Fetch images from the local Docker or Podman daemon via the Docker Engine API.
|
|
2
|
+
# This communicates over a Unix domain socket (default: /var/run/docker.sock).
|
|
3
|
+
#
|
|
4
|
+
# Docker Engine API documentation:
|
|
5
|
+
# https://docs.docker.com/engine/api/
|
|
6
|
+
#
|
|
7
|
+
# Podman compatibility:
|
|
8
|
+
# Podman provides a Docker-compatible API via podman.socket. Use the --socket
|
|
9
|
+
# option to point to the Podman socket:
|
|
10
|
+
# - Rootful: /run/podman/podman.sock
|
|
11
|
+
# - Rootless: /run/user/<uid>/podman/podman.sock
|
|
12
|
+
# See: https://docs.podman.io/en/latest/markdown/podman-system-service.1.html
|
|
13
|
+
#
|
|
14
|
+
# The API returns images in the same format as 'docker save', which is the
|
|
15
|
+
# same format that inputs/tarfile.py reads. We stream the tarball and parse
|
|
16
|
+
# it on the fly.
|
|
17
|
+
#
|
|
18
|
+
# API Limitation: Unlike the registry API (inputs/registry.py) which can fetch
|
|
19
|
+
# individual layer blobs via GET /v2/<name>/blobs/<digest>, the Docker Engine
|
|
20
|
+
# API only provides /images/{name}/get which returns a complete tarball. There
|
|
21
|
+
# is no endpoint to fetch individual image components (config, layers)
|
|
22
|
+
# separately. This is a fundamental limitation of the Docker Engine API.
|
|
23
|
+
# See: https://github.com/moby/moby/issues/24851
|
|
24
|
+
#
|
|
25
|
+
# The tarball streaming approach used here is the official supported method
|
|
26
|
+
# and matches what 'docker save' does internally.
|
|
27
|
+
|
|
28
|
+
import io
|
|
29
|
+
import json
|
|
30
|
+
import logging
|
|
31
|
+
import os
|
|
32
|
+
import tarfile
|
|
33
|
+
|
|
34
|
+
import requests_unixsocket
|
|
35
|
+
|
|
36
|
+
from occystrap import constants
|
|
37
|
+
from occystrap.inputs.base import ImageInput
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
LOG = logging.getLogger(__name__)
|
|
41
|
+
LOG.setLevel(logging.INFO)
|
|
42
|
+
|
|
43
|
+
DEFAULT_SOCKET_PATH = '/var/run/docker.sock'
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def always_fetch(digest):
    """Default fetch callback: request every layer, whatever its digest."""
    return True
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Image(ImageInput):
    """Reads an image from a local Docker/Podman daemon over its Unix socket.

    The image is exported via GET /images/{name}/get (the 'docker save'
    format), buffered to a temporary file, then parsed with tarfile.
    """

    def __init__(self, image, tag='latest', socket_path=DEFAULT_SOCKET_PATH):
        """Initialize the input.

        Args:
            image: Image name known to the daemon.
            tag: Image tag, defaults to 'latest'.
            socket_path: Filesystem path of the daemon's Unix socket.
        """
        self._image = image
        self._tag = tag
        self.socket_path = socket_path
        # Lazily created requests_unixsocket session (see _get_session).
        self._session = None

    @property
    def image(self):
        """Return the image name."""
        return self._image

    @property
    def tag(self):
        """Return the image tag."""
        return self._tag

    def _get_session(self):
        """Return the HTTP session, creating it on first use."""
        if self._session is None:
            self._session = requests_unixsocket.Session()
        return self._session

    def _socket_url(self, path):
        """Build an http+unix:// URL for an API path on the daemon socket."""
        # requests_unixsocket uses http+unix:// scheme with URL-encoded path
        encoded_socket = self.socket_path.replace('/', '%2F')
        return 'http+unix://%s%s' % (encoded_socket, path)

    def _request(self, method, path, stream=False):
        """Make a Docker API request, raising on non-200 responses."""
        session = self._get_session()
        url = self._socket_url(path)
        LOG.debug('Docker API request: %s %s' % (method, path))
        r = session.request(method, url, stream=stream)
        # 404 gets a friendlier message than other errors.
        if r.status_code == 404:
            raise Exception('Image not found: %s:%s' % (self.image, self.tag))
        if r.status_code != 200:
            raise Exception('Docker API error %d: %s' % (r.status_code, r.text))
        return r

    def _get_image_reference(self):
        # Return the image reference in the format Docker expects
        return '%s:%s' % (self.image, self.tag)

    def inspect(self):
        """Get image metadata from the Docker daemon."""
        ref = self._get_image_reference()
        r = self._request('GET', '/images/%s/json' % ref)
        return r.json()

    def fetch(self, fetch_callback=always_fetch):
        """Fetch image layers from the local Docker daemon.

        This uses the Docker Engine API to export the image as a tarball
        (equivalent to 'docker save') and streams/parses it on the fly.

        Yields (element_type, name, data) tuples per the ImageInput
        contract; skipped layers are yielded with data=None.
        """
        ref = self._get_image_reference()
        LOG.info('Fetching image %s from Docker daemon at %s'
                 % (ref, self.socket_path))

        # First verify the image exists
        try:
            self.inspect()
        except Exception as e:
            LOG.error('Failed to inspect image: %s' % str(e))
            raise

        # Stream the image tarball from Docker
        LOG.info('Streaming image tarball from Docker daemon')
        r = self._request('GET', '/images/%s/get' % ref, stream=True)

        # We need to buffer the stream into a file-like object for tarfile
        # because tarfile needs to seek. We use a temporary file approach
        # similar to the registry input.
        import tempfile
        # delete=False so the file survives the 'with' and can be reopened
        # by tarfile below; it is removed in the finally clause.
        with tempfile.NamedTemporaryFile(delete=False) as tf:
            LOG.info('Buffering image to temporary file %s' % tf.name)
            for chunk in r.iter_content(8192):
                tf.write(chunk)
            temp_path = tf.name

        try:
            # Parse the tarball (same format as 'docker save')
            with tarfile.open(temp_path, 'r') as tar:
                # Read manifest.json
                manifest_member = tar.getmember('manifest.json')
                manifest_file = tar.extractfile(manifest_member)
                manifest = json.loads(manifest_file.read().decode('utf-8'))

                # Yield config file
                config_filename = manifest[0]['Config']
                LOG.info('Reading config file %s' % config_filename)
                config_member = tar.getmember(config_filename)
                config_file = tar.extractfile(config_member)
                config_data = config_file.read()
                yield (constants.CONFIG_FILE, config_filename,
                       io.BytesIO(config_data))

                # Yield each layer
                layers = manifest[0]['Layers']
                LOG.info('There are %d image layers' % len(layers))

                for layer_path in layers:
                    # Layer path is like "abc123/layer.tar"; the directory
                    # component is used as the layer identifier.
                    layer_digest = os.path.dirname(layer_path)
                    if not fetch_callback(layer_digest):
                        LOG.info('Fetch callback says skip layer %s'
                                 % layer_digest)
                        yield (constants.IMAGE_LAYER, layer_digest, None)
                        continue

                    LOG.info('Reading layer %s' % layer_path)
                    layer_member = tar.getmember(layer_path)
                    layer_file = tar.extractfile(layer_member)
                    # NOTE(review): the whole layer is read into memory here;
                    # large layers could be streamed instead — confirm this
                    # is acceptable for the expected image sizes.
                    layer_data = layer_file.read()
                    yield (constants.IMAGE_LAYER, layer_digest,
                           io.BytesIO(layer_data))

        finally:
            # Clean up temp file
            if os.path.exists(temp_path):
                os.unlink(temp_path)

        LOG.info('Done')
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
# A simple implementation of a docker registry client. Fetches an image to a tarball.
|
|
2
|
+
# With a big nod to https://github.com/NotGlop/docker-drag/blob/master/docker_pull.py
|
|
3
|
+
|
|
4
|
+
# https://docs.docker.com/registry/spec/manifest-v2-2/ documents the image manifest
|
|
5
|
+
# format, noting that the response format you get back varies based on what you have
|
|
6
|
+
# in your accept header for the request.
|
|
7
|
+
|
|
8
|
+
# https://github.com/opencontainers/image-spec/blob/main/media-types.md documents
|
|
9
|
+
# the new OCI mime types.
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import io
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
from requests.exceptions import ChunkedEncodingError, ConnectionError
|
|
17
|
+
import sys
|
|
18
|
+
import tempfile
|
|
19
|
+
import time
|
|
20
|
+
import zlib
|
|
21
|
+
|
|
22
|
+
from occystrap import constants
|
|
23
|
+
from occystrap import util
|
|
24
|
+
from occystrap.inputs.base import ImageInput
|
|
25
|
+
|
|
26
|
+
# Retry configuration
|
|
27
|
+
MAX_RETRIES = 3
|
|
28
|
+
RETRY_BACKOFF_BASE = 2 # Exponential backoff: 2^attempt seconds
|
|
29
|
+
|
|
30
|
+
LOG = logging.getLogger(__name__)
|
|
31
|
+
LOG.setLevel(logging.INFO)
|
|
32
|
+
|
|
33
|
+
DELETED_FILE_RE = re.compile(r'.*/\.wh\.(.*)$')
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def always_fetch(digest=None):
    """Default fetch callback: never skip a layer.

    Bug fix: Image.fetch invokes fetch_callback(layer_filename) with one
    positional argument, so the previous zero-argument signature raised
    TypeError whenever the default callback was used. The digest parameter
    keeps a default so any existing zero-argument callers still work.
    """
    return True
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Image(ImageInput):
    """Reads an image from a Docker registry via the v2 HTTP API.

    Handles token (Bearer) authentication, manifest-list / OCI-index
    platform selection, sha256 verification of blobs, and retrying
    streaming layer downloads that fail mid-transfer.
    """

    def __init__(self, registry, image, tag, os='linux', architecture='amd64',
                 variant='', secure=True, username=None, password=None):
        """Initialize the registry input.

        Args:
            registry: Registry hostname (and optional port).
            image: Repository name within the registry.
            tag: Tag or digest to fetch.
            os: Target OS when resolving a manifest list.
            architecture: Target CPU architecture for manifest lists.
            variant: Target CPU variant ('' matches entries without one).
            secure: If False, use http instead of https.
            username: Optional username for token auth.
            password: Optional password for token auth.
        """
        self.registry = registry
        self._image = image
        self._tag = tag
        self.os = os
        self.architecture = architecture
        self.variant = variant
        self.secure = secure
        self.username = username
        self.password = password

        # Bearer token from the most recent successful auth handshake,
        # reused on later requests.
        self._cached_auth = None

    @property
    def image(self):
        """Return the image name."""
        return self._image

    @property
    def tag(self):
        """Return the image tag."""
        return self._tag

    def request_url(self, method, url, headers=None, data=None, stream=False):
        """Make a registry request, performing token auth on a 401.

        On an UnauthorizedException, the Www-Authenticate challenge is
        parsed for the token realm and service, a token is fetched (using
        Basic auth when credentials were supplied), cached, and the
        original request is retried once.
        """
        if not headers:
            headers = {}

        if self._cached_auth:
            headers.update({'Authorization': 'Bearer %s' % self._cached_auth})

        try:
            return util.request_url(method, url, headers=headers, data=data,
                                    stream=stream)
        except util.UnauthorizedException as e:
            auth_re = re.compile('Bearer realm="([^"]*)",service="([^"]*)"')
            # NOTE(review): assumes util.request_url raises
            # UnauthorizedException with the response headers as args[5] —
            # confirm against occystrap.util.
            m = auth_re.match(e.args[5].get('Www-Authenticate', ''))
            if m:
                auth_url = ('%s?service=%s&scope=repository:%s:pull'
                            % (m.group(1), m.group(2), self.image))
                # If credentials are provided, use Basic auth for token request
                if self.username and self.password:
                    r = util.request_url(
                        'GET', auth_url,
                        auth=(self.username, self.password))
                else:
                    r = util.request_url('GET', auth_url)
                token = r.json().get('token')
                headers.update({'Authorization': 'Bearer %s' % token})
                self._cached_auth = token

            # Retry the original request with the (possibly new) token.
            return util.request_url(
                method, url, headers=headers, data=data, stream=stream)

    def fetch(self, fetch_callback=always_fetch):
        """Fetch the image's config and layers from the registry.

        Yields (element_type, name, data) tuples per the ImageInput
        contract. Layer data is decompressed to a temporary file and
        yielded as an open file object; the file is unlinked once the
        consumer returns. Skipped layers are yielded with data=None.

        Raises:
            Exception: on unknown manifest content types or when no
                manifest matches the requested os/architecture/variant.
        Exits the process (sys.exit(1)) on sha256 verification failure.
        """
        LOG.info('Fetching manifest')
        moniker = 'https'
        if not self.secure:
            moniker = 'http'

        # Accept both Docker and OCI manifest (and index) content types;
        # the registry's Content-Type response header tells us which shape
        # we actually got.
        r = self.request_url(
            'GET',
            '%(moniker)s://%(registry)s/v2/%(image)s/manifests/%(tag)s'
            % {
                'moniker': moniker,
                'registry': self.registry,
                'image': self.image,
                'tag': self.tag
            },
            headers={
                'Accept': ('application/vnd.docker.distribution.manifest.v2+json,'
                           'application/vnd.docker.distribution.manifest.list.v2+json,'
                           'application/vnd.oci.image.manifest.v1+json,'
                           'application/vnd.oci.image.index.v1+json')
            })

        config_digest = None
        if r.headers['Content-Type'] in [
                'application/vnd.docker.distribution.manifest.v2+json',
                'application/vnd.oci.image.manifest.v1+json']:
            # A single-platform manifest: use it directly.
            manifest = r.json()
            config_digest = manifest['config']['digest']
        elif r.headers['Content-Type'] in [
                'application/vnd.docker.distribution.manifest.list.v2+json',
                'application/vnd.oci.image.index.v1+json']:
            # A manifest list / index: pick the entry matching our
            # os/architecture/variant and fetch that manifest by digest.
            for m in r.json()['manifests']:
                if 'variant' in m['platform']:
                    LOG.info('Found manifest for %s on %s %s'
                             % (m['platform']['os'],
                                m['platform']['architecture'],
                                m['platform']['variant']))
                else:
                    LOG.info('Found manifest for %s on %s'
                             % (m['platform']['os'],
                                m['platform']['architecture']))

                if (m['platform']['os'] == self.os and
                        m['platform']['architecture'] == self.architecture and
                        m['platform'].get('variant', '') == self.variant):
                    LOG.info('Fetching matching manifest')
                    r = self.request_url(
                        'GET',
                        '%(moniker)s://%(registry)s/v2/%(image)s/manifests/%(tag)s'
                        % {
                            'moniker': moniker,
                            'registry': self.registry,
                            'image': self.image,
                            'tag': m['digest']
                        },
                        headers={
                            'Accept': ('application/vnd.docker.distribution.'
                                       'manifest.v2+json, '
                                       'application/vnd.oci.image.manifest.v1+json')
                        })
                    manifest = r.json()
                    config_digest = manifest['config']['digest']

            if not config_digest:
                raise Exception('Could not find a matching manifest for this '
                                'os / architecture / variant')
        else:
            raise Exception('Unknown manifest content type %s!' %
                            r.headers['Content-Type'])

        LOG.info('Fetching config file')
        r = self.request_url(
            'GET',
            '%(moniker)s://%(registry)s/v2/%(image)s/blobs/%(config)s'
            % {
                'moniker': moniker,
                'registry': self.registry,
                'image': self.image,
                'config': config_digest
            })
        config = r.content
        # Verify the config blob against its content-addressed digest.
        h = hashlib.sha256()
        h.update(config)
        if h.hexdigest() != config_digest.split(':')[1]:
            LOG.error('Hash verification failed for image config blob (%s vs %s)'
                      % (config_digest.split(':')[1], h.hexdigest()))
            sys.exit(1)

        config_filename = ('%s.json' % config_digest.split(':')[1])
        yield (constants.CONFIG_FILE, config_filename,
               io.BytesIO(config))

        LOG.info('There are %d image layers' % len(manifest['layers']))
        for layer in manifest['layers']:
            # Layers are identified by the hex part of their digest.
            layer_filename = layer['digest'].split(':')[1]
            if not fetch_callback(layer_filename):
                LOG.info('Fetch callback says skip layer %s' % layer['digest'])
                yield (constants.IMAGE_LAYER, layer_filename, None)
                continue

            LOG.info('Fetching layer %s (%d bytes)'
                     % (layer['digest'], layer['size']))

            # Retry logic for streaming downloads which can fail mid-transfer
            last_exception = None
            for attempt in range(MAX_RETRIES + 1):
                try:
                    r = self.request_url(
                        'GET',
                        '%(moniker)s://%(registry)s/v2/%(image)s/blobs/%(layer)s'
                        % {
                            'moniker': moniker,
                            'registry': self.registry,
                            'image': self.image,
                            'layer': layer['digest']
                        },
                        stream=True)

                    # We can use zlib for streaming decompression, but we need
                    # to tell it to ignore the gzip header which it doesn't
                    # understand. Unfortunately tarfile doesn't do streaming
                    # writes (and we need to know the decompressed size before
                    # we can write to the tarfile), so we stream to a temporary
                    # file on disk.
                    h = hashlib.sha256()
                    d = zlib.decompressobj(16 + zlib.MAX_WBITS)

                    with tempfile.NamedTemporaryFile(delete=False) as tf:
                        LOG.info('Temporary file for layer is %s' % tf.name)
                        for chunk in r.iter_content(8192):
                            # Write the decompressed bytes, but hash the
                            # compressed bytes: the manifest digest is of
                            # the compressed blob.
                            tf.write(d.decompress(chunk))
                            h.update(chunk)

                    if h.hexdigest() != layer_filename:
                        LOG.error('Hash verification failed for layer (%s vs %s)'
                                  % (layer_filename, h.hexdigest()))
                        sys.exit(1)

                    try:
                        with open(tf.name, 'rb') as f:
                            # The consumer reads the open file during this
                            # yield; the temp file is removed afterwards.
                            yield (constants.IMAGE_LAYER, layer_filename, f)
                    finally:
                        os.unlink(tf.name)

                    # Success - break out of retry loop
                    break

                except (ChunkedEncodingError, ConnectionError) as e:
                    last_exception = e
                    # Clean up temp file if it exists
                    if 'tf' in dir() and tf.name and os.path.exists(tf.name):
                        os.unlink(tf.name)

                    if attempt < MAX_RETRIES:
                        wait_time = RETRY_BACKOFF_BASE ** attempt
                        LOG.warning(
                            'Layer download failed (attempt %d/%d): %s. '
                            'Retrying in %d seconds...'
                            % (attempt + 1, MAX_RETRIES + 1, str(e), wait_time))
                        time.sleep(wait_time)
                    else:
                        LOG.error('Layer download failed after %d attempts: %s'
                                  % (MAX_RETRIES + 1, str(e)))
                        raise last_exception

        LOG.info('Done')
|