occystrap 0.3.0-py3-none-any.whl → 0.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- occystrap/_version.py +34 -0
- occystrap/filters/__init__.py +10 -0
- occystrap/filters/base.py +67 -0
- occystrap/filters/exclude.py +136 -0
- occystrap/filters/inspect.py +179 -0
- occystrap/filters/normalize_timestamps.py +123 -0
- occystrap/filters/search.py +177 -0
- occystrap/inputs/__init__.py +1 -0
- occystrap/inputs/base.py +40 -0
- occystrap/inputs/docker.py +171 -0
- occystrap/inputs/registry.py +260 -0
- occystrap/inputs/tarfile.py +88 -0
- occystrap/main.py +330 -31
- occystrap/outputs/__init__.py +1 -0
- occystrap/outputs/base.py +46 -0
- occystrap/{output_directory.py → outputs/directory.py} +10 -9
- occystrap/outputs/docker.py +137 -0
- occystrap/{output_mounts.py → outputs/mounts.py} +2 -1
- occystrap/{output_ocibundle.py → outputs/ocibundle.py} +1 -1
- occystrap/outputs/registry.py +240 -0
- occystrap/{output_tarfile.py → outputs/tarfile.py} +18 -2
- occystrap/pipeline.py +297 -0
- occystrap/tarformat.py +122 -0
- occystrap/tests/test_inspect.py +355 -0
- occystrap/tests/test_tarformat.py +199 -0
- occystrap/uri.py +231 -0
- occystrap/util.py +67 -38
- occystrap-0.4.1.dist-info/METADATA +444 -0
- occystrap-0.4.1.dist-info/RECORD +38 -0
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/WHEEL +1 -1
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/entry_points.txt +0 -1
- occystrap/docker_extract.py +0 -36
- occystrap/docker_registry.py +0 -192
- occystrap-0.3.0.dist-info/METADATA +0 -131
- occystrap-0.3.0.dist-info/RECORD +0 -20
- occystrap-0.3.0.dist-info/pbr.json +0 -1
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info/licenses}/AUTHORS +0 -0
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info/licenses}/LICENSE +0 -0
- {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/top_level.txt +0 -0
occystrap/_version.py
ADDED
@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+
+__version__ = version = '0.4.1'
+__version_tuple__ = version_tuple = (0, 4, 1)
+
+__commit_id__ = commit_id = None
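As a quick sanity check of the generated module, the attributes it declares can be read back directly. A minimal sketch, assuming occystrap 0.4.1 is installed:

```python
# Assumes occystrap 0.4.1 is installed; the attribute names come from the
# generated _version.py shown above.
from occystrap._version import __version__, __version_tuple__

print(__version__)        # '0.4.1'
print(__version_tuple__)  # (0, 4, 1)
```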
occystrap/filters/__init__.py
ADDED
@@ -0,0 +1,10 @@
+from occystrap.filters.base import ImageFilter
+from occystrap.filters.exclude import ExcludeFilter
+from occystrap.filters.inspect import InspectFilter
+from occystrap.filters.normalize_timestamps import TimestampNormalizer
+from occystrap.filters.search import SearchFilter
+
+__all__ = [
+    'ImageFilter', 'ExcludeFilter', 'InspectFilter',
+    'TimestampNormalizer', 'SearchFilter',
+]
occystrap/filters/base.py
ADDED
@@ -0,0 +1,67 @@
+from abc import ABC, abstractmethod
+
+from occystrap.outputs.base import ImageOutput
+
+
+class ImageFilter(ImageOutput, ABC):
+    """Abstract base class for image filters.
+
+    Filters wrap an ImageOutput and can transform or inspect image elements
+    as they pass through the pipeline. Filters implement the ImageOutput
+    interface so they can be chained together or used as the final output.
+
+    The decorator pattern allows filters to be stacked:
+        input -> filter1 -> filter2 -> output
+
+    Each filter can:
+    - Transform element data (e.g., normalize timestamps)
+    - Transform element names (e.g., recalculate hashes)
+    - Inspect elements without modification (e.g., search)
+    - Skip elements entirely
+    - Accumulate state across elements (e.g., collect search results)
+    """
+
+    def __init__(self, wrapped_output):
+        """Wrap another output (or filter) to form a chain.
+
+        Args:
+            wrapped_output: The ImageOutput to pass processed elements to.
+                Can be None for terminal filters that don't produce output
+                (e.g., search-only mode).
+        """
+        self._wrapped = wrapped_output
+
+    def fetch_callback(self, digest):
+        """Determine whether a layer should be fetched.
+
+        Default implementation delegates to the wrapped output.
+        Override to implement custom filtering logic.
+        """
+        if self._wrapped is None:
+            return True
+        return self._wrapped.fetch_callback(digest)
+
+    @abstractmethod
+    def process_image_element(self, element_type, name, data):
+        """Process and optionally transform an image element.
+
+        Implementations should typically:
+        1. Perform any transformation or inspection
+        2. Pass the (possibly modified) element to self._wrapped
+
+        Args:
+            element_type: constants.CONFIG_FILE or constants.IMAGE_LAYER
+            name: The element name/digest
+            data: File-like object containing the element data, or None
+                if the element was skipped by fetch_callback
+        """
+        pass
+
+    def finalize(self):
+        """Complete the filter operation.
+
+        Default implementation delegates to the wrapped output.
+        Override to perform cleanup or output accumulated results.
+        """
+        if self._wrapped is not None:
+            self._wrapped.finalize()
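The docstring above describes a decorator pattern, so a chain is built by wrapping outputs from the inside out. A minimal sketch, assuming occystrap 0.4.1 is installed; the `PrintOutput` class and the chosen patterns are illustrative assumptions, not part of the package:

```python
# Minimal sketch of assembling a filter chain. PrintOutput is a hypothetical
# stand-in for a real ImageOutput implementation.
from occystrap.filters import ExcludeFilter, TimestampNormalizer


class PrintOutput:
    """Hypothetical terminal output that only reports what it receives."""

    def fetch_callback(self, digest):
        # Always ask for every layer to be fetched.
        return True

    def process_image_element(self, element_type, name, data):
        print('received %s %s' % (element_type, name))

    def finalize(self):
        print('done')


# input -> ExcludeFilter -> TimestampNormalizer -> PrintOutput
chain = ExcludeFilter(
    TimestampNormalizer(PrintOutput(), timestamp=0),
    patterns=['*/__pycache__/*', '*/.git/*'])
```

An input implementation would then call `chain.process_image_element()` for each config file and layer, and `chain.finalize()` once at the end.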
occystrap/filters/exclude.py
ADDED
@@ -0,0 +1,136 @@
+import fnmatch
+import hashlib
+import logging
+import os
+import tarfile
+import tempfile
+
+from occystrap import constants
+from occystrap.filters.base import ImageFilter
+from occystrap.tarformat import select_tar_format_for_layer
+
+
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+
+class ExcludeFilter(ImageFilter):
+    """Excludes files matching glob patterns from image layers.
+
+    This filter rewrites layer tarballs to remove files and directories
+    that match any of the specified glob patterns. Since this changes the
+    layer content, the SHA256 hash is recalculated and the layer name is
+    updated to match.
+
+    This is useful for stripping unwanted content like .git directories,
+    __pycache__ folders, or other files before writing output.
+    """
+
+    def __init__(self, wrapped_output, patterns):
+        """Initialize the exclude filter.
+
+        Args:
+            wrapped_output: The ImageOutput to pass filtered elements to.
+            patterns: List of glob patterns to exclude. Each pattern is
+                matched against the full path using fnmatch.
+        """
+        super().__init__(wrapped_output)
+        self.patterns = patterns
+
+    def _matches_exclusion(self, path):
+        """Check if a path matches any exclusion pattern.
+
+        Args:
+            path: The file path to check.
+
+        Returns:
+            True if the path should be excluded, False otherwise.
+        """
+        for pattern in self.patterns:
+            if fnmatch.fnmatch(path, pattern):
+                return True
+        return False
+
+    def _filter_layer(self, layer_data):
+        """Filter a layer tarball, excluding matching entries.
+
+        Creates a new tarball with entries that don't match exclusion
+        patterns, calculates the new SHA256 hash, and returns both.
+
+        Uses USTAR format when possible (smaller output), falls back to
+        PAX format when layer contents require it. See tarformat.py.
+
+        Args:
+            layer_data: File-like object containing the original layer.
+
+        Returns:
+            Tuple of (filtered_file_handle, new_sha256_hex)
+        """
+        # Determine optimal tar format, skipping excluded members
+        tar_format = select_tar_format_for_layer(
+            layer_data,
+            skip_fn=lambda m: self._matches_exclusion(m.name)
+        )
+
+        excluded_count = 0
+
+        with tempfile.NamedTemporaryFile(delete=False) as filtered_tf:
+            try:
+                with tarfile.open(fileobj=filtered_tf, mode='w',
+                                  format=tar_format) as filtered_tar:
+                    layer_data.seek(0)
+                    with tarfile.open(fileobj=layer_data, mode='r') as layer_tar:
+                        for member in layer_tar:
+                            if self._matches_exclusion(member.name):
+                                excluded_count += 1
+                                continue
+
+                            if member.isfile():
+                                fileobj = layer_tar.extractfile(member)
+                                filtered_tar.addfile(member, fileobj)
+                            else:
+                                filtered_tar.addfile(member)
+
+                if excluded_count > 0:
+                    LOG.info('Excluded %d entries from layer' % excluded_count)
+
+                filtered_tf.flush()
+                filtered_tf.seek(0)
+                h = hashlib.sha256()
+                while True:
+                    chunk = filtered_tf.read(8192)
+                    if not chunk:
+                        break
+                    h.update(chunk)
+
+                new_sha = h.hexdigest()
+
+                filtered_tf.seek(0)
+                return open(filtered_tf.name, 'rb'), new_sha
+
+            except Exception:
+                os.unlink(filtered_tf.name)
+                raise
+
+    def process_image_element(self, element_type, name, data):
+        """Process an image element, filtering layer contents.
+
+        Config files are passed through unchanged. Layers have matching
+        entries excluded and their names updated to reflect the new
+        SHA256 hash.
+        """
+        if element_type == constants.IMAGE_LAYER and data is not None:
+            LOG.info('Filtering layer %s' % name)
+            filtered_data, new_name = self._filter_layer(data)
+
+            try:
+                self._wrapped.process_image_element(
+                    element_type, new_name, filtered_data)
+            finally:
+                try:
+                    filtered_data.close()
+                    os.unlink(filtered_data.name)
+                except Exception:
+                    pass
+        else:
+            self._wrapped.process_image_element(element_type, name, data)
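The exclusion test above is a plain fnmatch of each pattern against the full member path. A standalone illustration of how typical patterns behave, using only the standard library; the member names are invented for the example:

```python
import fnmatch

# Example tar member names as they typically appear inside a layer
# (no leading slash); these paths are invented for illustration.
members = [
    'usr/lib/python3.11/site-packages/foo/__init__.py',
    'usr/lib/python3.11/site-packages/foo/__pycache__/__init__.cpython-311.pyc',
    'app/.git/config',
]
patterns = ['*/__pycache__/*', '*/.git/*']

for name in members:
    excluded = any(fnmatch.fnmatch(name, p) for p in patterns)
    print('%-75s excluded=%s' % (name, excluded))
```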
occystrap/filters/inspect.py
ADDED
@@ -0,0 +1,179 @@
+import json
+import logging
+
+from occystrap import constants
+from occystrap.filters.base import ImageFilter
+
+
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+
+class InspectFilter(ImageFilter):
+    """Collects layer metadata and appends it to a JSONL file.
+
+    This filter is a passthrough that records layer digests, sizes,
+    and history information as elements flow through the pipeline.
+    In finalize(), it appends one JSON line per image to the output
+    file.
+
+    This can be placed at any point in a filter chain to capture
+    the state of layers at that stage of processing. Multiple
+    inspect filters with different output files can be used to
+    compare before/after effects of other filters.
+
+    Output format (one JSON object per line):
+        {"name": "image:tag", "layers": [
+            {"Id": "sha256:...", "Size": N,
+             "Created": N, "CreatedBy": "...",
+             "Comment": "", "Tags": [...] or null},
+            ...
+        ]}
+    """
+
+    def __init__(self, wrapped_output, output_file,
+                 image=None, tag=None):
+        """Initialize the inspect filter.
+
+        Args:
+            wrapped_output: The ImageOutput to pass elements to,
+                or None for inspect-only mode.
+            output_file: Path to the file to append JSON lines to.
+            image: Image name for output formatting.
+            tag: Image tag for output formatting.
+        """
+        super().__init__(wrapped_output)
+        self.output_file = output_file
+        self.image = image
+        self.tag = tag
+
+        # Parsed from CONFIG_FILE
+        self._history = []  # Non-empty-layer history entries
+
+        # Collected from IMAGE_LAYER elements
+        self._layers = []  # List of (digest, size) tuples
+
+    def _parse_config(self, data):
+        """Parse the image config to extract history entries.
+
+        The config's history array has entries for all Dockerfile
+        steps, including no-op steps (ENV, LABEL, CMD, etc.)
+        marked with empty_layer=True. We extract only the entries
+        that correspond to actual filesystem layers.
+        """
+        data.seek(0)
+        try:
+            config = json.load(data)
+        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+            LOG.warning('Failed to parse image config: %s' % e)
+            return
+
+        history = config.get('history', [])
+        for entry in history:
+            if not entry.get('empty_layer', False):
+                self._history.append(entry)
+
+    def _normalize_digest(self, name):
+        """Ensure digest has sha256: prefix."""
+        if name and not name.startswith('sha256:'):
+            return 'sha256:%s' % name
+        return name
+
+    def process_image_element(self, element_type, name, data):
+        """Process an image element, recording layer metadata."""
+        if element_type == constants.CONFIG_FILE and data is not None:
+            self._parse_config(data)
+
+        if element_type == constants.IMAGE_LAYER:
+            if data is not None:
+                data.seek(0, 2)
+                size = data.tell()
+                data.seek(0)
+            else:
+                size = 0
+            self._layers.append((name, size))
+
+        # Pass through to wrapped output
+        if self._wrapped is not None:
+            if data is not None:
+                data.seek(0)
+            self._wrapped.process_image_element(
+                element_type, name, data)
+
+    def _build_layer_entries(self):
+        """Build layer entry dicts by correlating layers with
+        history.
+
+        Returns layers in reverse order (newest first) to match
+        the convention used by docker history.
+        """
+        entries = []
+        image_tag = None
+        if self.image and self.tag:
+            image_tag = '%s:%s' % (self.image, self.tag)
+
+        for i, (digest, size) in enumerate(self._layers):
+            entry = {
+                'Id': self._normalize_digest(digest),
+                'Size': size,
+                'Created': 0,
+                'CreatedBy': '',
+                'Comment': '',
+                'Tags': None,
+            }
+
+            # Correlate with history if available
+            if i < len(self._history):
+                hist = self._history[i]
+                created = hist.get('created', '')
+                if isinstance(created, str):
+                    # Convert ISO format to unix timestamp
+                    import datetime
+                    try:
+                        dt = datetime.datetime.fromisoformat(
+                            created.replace('Z', '+00:00'))
+                        entry['Created'] = int(dt.timestamp())
+                    except (ValueError, OSError):
+                        entry['Created'] = 0
+                elif isinstance(created, (int, float)):
+                    entry['Created'] = int(created)
+                entry['CreatedBy'] = hist.get('created_by', '')
+                entry['Comment'] = hist.get('comment', '')
+
+            entries.append(entry)
+
+        # Reverse to match docker history convention
+        # (newest first) and tag the topmost layer
+        entries.reverse()
+        if entries and image_tag:
+            entries[0]['Tags'] = [image_tag]
+
+        return entries
+
+    def _write_output(self):
+        """Append a JSON line to the output file."""
+        image_tag = ''
+        if self.image and self.tag:
+            image_tag = '%s:%s' % (self.image, self.tag)
+        elif self.image:
+            image_tag = self.image
+
+        record = {
+            'name': image_tag,
+            'layers': self._build_layer_entries(),
+        }
+
+        line = json.dumps(record, sort_keys=True)
+        with open(self.output_file, 'a') as f:
+            f.write(line + '\n')
+
+        LOG.info(
+            'Wrote inspect data for %s (%d layers) to %s'
+            % (image_tag, len(self._layers), self.output_file))
+
+    def finalize(self):
+        """Write collected metadata and finalize wrapped output."""
+        self._write_output()
+
+        if self._wrapped is not None:
+            self._wrapped.finalize()
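Because the inspect output is one JSON object per line, it can be consumed with the standard library alone. A small sketch based on the output format documented in the docstring above; the file name is an arbitrary example:

```python
import json

# 'inspect.jsonl' is an arbitrary example path for an InspectFilter output file.
with open('inspect.jsonl') as f:
    for line in f:
        record = json.loads(line)
        print(record['name'])
        for layer in record['layers']:
            # Keys follow the documented format: Id, Size, CreatedBy, etc.
            print('  %s  %d bytes  created_by=%r'
                  % (layer['Id'], layer['Size'], layer['CreatedBy']))
```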
occystrap/filters/normalize_timestamps.py
ADDED
@@ -0,0 +1,123 @@
+import hashlib
+import logging
+import os
+import tarfile
+import tempfile
+
+from occystrap import constants
+from occystrap.filters.base import ImageFilter
+from occystrap.tarformat import select_tar_format_for_layer
+
+
+LOG = logging.getLogger(__name__)
+LOG.setLevel(logging.INFO)
+
+
+class TimestampNormalizer(ImageFilter):
+    """Normalizes timestamps in image layers for reproducible builds.
+
+    This filter rewrites layer tarballs to set all file modification times
+    to a consistent value (default: 0, Unix epoch). Since this changes the
+    layer content, the SHA256 hash is recalculated and the layer name is
+    updated to match.
+
+    This is useful for creating reproducible image tarballs where the same
+    source content always produces the same output, regardless of when the
+    files were originally created or modified.
+    """
+
+    def __init__(self, wrapped_output, timestamp=0):
+        """Initialize the timestamp normalizer.
+
+        Args:
+            wrapped_output: The ImageOutput to pass normalized elements to.
+            timestamp: The Unix timestamp to set for all files (default: 0).
+        """
+        super().__init__(wrapped_output)
+        self.timestamp = timestamp
+
+    def _normalize_layer(self, layer_data):
+        """Normalize timestamps in a layer tarball.
+
+        Creates a new tarball with all timestamps set to self.timestamp,
+        calculates the new SHA256 hash, and returns both.
+
+        Uses USTAR format when possible (smaller output), falls back to
+        PAX format when layer contents require it. See tarformat.py.
+
+        Args:
+            layer_data: File-like object containing the original layer.
+
+        Returns:
+            Tuple of (normalized_file_handle, new_sha256_hex)
+        """
+        # Determine optimal tar format based on transformed members
+        def transform(member):
+            member.mtime = self.timestamp
+            return member
+
+        tar_format = select_tar_format_for_layer(layer_data, transform)
+
+        with tempfile.NamedTemporaryFile(delete=False) as normalized_tf:
+            try:
+                # Create a new tarball with normalized timestamps
+                with tarfile.open(fileobj=normalized_tf, mode='w',
+                                  format=tar_format) as normalized_tar:
+                    layer_data.seek(0)
+                    with tarfile.open(fileobj=layer_data, mode='r') as \
+                            layer_tar:
+                        for member in layer_tar:
+                            # Normalize all timestamp fields
+                            member.mtime = self.timestamp
+
+                            # Extract the file data if it's a regular file
+                            if member.isfile():
+                                fileobj = layer_tar.extractfile(member)
+                                normalized_tar.addfile(member, fileobj)
+                            else:
+                                normalized_tar.addfile(member)
+
+                # Calculate SHA256 of the normalized tarball
+                normalized_tf.flush()
+                normalized_tf.seek(0)
+                h = hashlib.sha256()
+                while True:
+                    chunk = normalized_tf.read(8192)
+                    if not chunk:
+                        break
+                    h.update(chunk)
+
+                new_sha = h.hexdigest()
+
+                # Return a new file handle and the hash
+                normalized_tf.seek(0)
+                return open(normalized_tf.name, 'rb'), new_sha
+
+            except Exception:
+                os.unlink(normalized_tf.name)
+                raise
+
+    def process_image_element(self, element_type, name, data):
+        """Process an image element, normalizing layer timestamps.
+
+        Config files are passed through unchanged. Layers have their
+        timestamps normalized and their names updated to reflect the
+        new SHA256 hash.
+        """
+        if element_type == constants.IMAGE_LAYER and data is not None:
+            LOG.info('Normalizing timestamps in layer %s' % name)
+            normalized_data, new_name = self._normalize_layer(data)
+
+            try:
+                self._wrapped.process_image_element(
+                    element_type, new_name, normalized_data)
+            finally:
+                # Clean up the temporary file
+                try:
+                    normalized_data.close()
+                    os.unlink(normalized_data.name)
+                except Exception:
+                    pass
+        else:
+            # Pass through unchanged (config files, skipped layers)
+            self._wrapped.process_image_element(element_type, name, data)
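The heart of the normalization is rewriting each member's mtime and re-hashing the rewritten archive, which makes the layer digest stable across builds. A self-contained sketch of that idea using only the standard library (independent of occystrap; the file name and payload are invented):

```python
import hashlib
import io
import tarfile
import time


def build_demo_layer():
    """Create a tiny in-memory tarball with the current time as mtime."""
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode='w') as tar:
        info = tarfile.TarInfo('etc/example.conf')
        payload = b'hello\n'
        info.size = len(payload)
        info.mtime = int(time.time())
        tar.addfile(info, io.BytesIO(payload))
    buf.seek(0)
    return buf


def normalize(layer, timestamp=0):
    """Rewrite a tarball with every member's mtime forced to `timestamp`."""
    out = io.BytesIO()
    with tarfile.open(fileobj=out, mode='w', format=tarfile.USTAR_FORMAT) as dst, \
            tarfile.open(fileobj=layer, mode='r') as src:
        for member in src:
            member.mtime = timestamp
            if member.isfile():
                dst.addfile(member, src.extractfile(member))
            else:
                dst.addfile(member)
    out.seek(0)
    return out


original = build_demo_layer()
rewritten = normalize(build_demo_layer())
print(hashlib.sha256(original.read()).hexdigest())   # changes with build time
print(hashlib.sha256(rewritten.read()).hexdigest())  # stable across runs
```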