occystrap 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39):
  1. occystrap/_version.py +34 -0
  2. occystrap/filters/__init__.py +10 -0
  3. occystrap/filters/base.py +67 -0
  4. occystrap/filters/exclude.py +136 -0
  5. occystrap/filters/inspect.py +179 -0
  6. occystrap/filters/normalize_timestamps.py +123 -0
  7. occystrap/filters/search.py +177 -0
  8. occystrap/inputs/__init__.py +1 -0
  9. occystrap/inputs/base.py +40 -0
  10. occystrap/inputs/docker.py +171 -0
  11. occystrap/inputs/registry.py +260 -0
  12. occystrap/inputs/tarfile.py +88 -0
  13. occystrap/main.py +330 -31
  14. occystrap/outputs/__init__.py +1 -0
  15. occystrap/outputs/base.py +46 -0
  16. occystrap/{output_directory.py → outputs/directory.py} +10 -9
  17. occystrap/outputs/docker.py +137 -0
  18. occystrap/{output_mounts.py → outputs/mounts.py} +2 -1
  19. occystrap/{output_ocibundle.py → outputs/ocibundle.py} +1 -1
  20. occystrap/outputs/registry.py +240 -0
  21. occystrap/{output_tarfile.py → outputs/tarfile.py} +18 -2
  22. occystrap/pipeline.py +297 -0
  23. occystrap/tarformat.py +122 -0
  24. occystrap/tests/test_inspect.py +355 -0
  25. occystrap/tests/test_tarformat.py +199 -0
  26. occystrap/uri.py +231 -0
  27. occystrap/util.py +67 -38
  28. occystrap-0.4.1.dist-info/METADATA +444 -0
  29. occystrap-0.4.1.dist-info/RECORD +38 -0
  30. {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/WHEEL +1 -1
  31. {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/entry_points.txt +0 -1
  32. occystrap/docker_extract.py +0 -36
  33. occystrap/docker_registry.py +0 -192
  34. occystrap-0.3.0.dist-info/METADATA +0 -131
  35. occystrap-0.3.0.dist-info/RECORD +0 -20
  36. occystrap-0.3.0.dist-info/pbr.json +0 -1
  37. {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info/licenses}/AUTHORS +0 -0
  38. {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info/licenses}/LICENSE +0 -0
  39. {occystrap-0.3.0.dist-info → occystrap-0.4.1.dist-info}/top_level.txt +0 -0
occystrap/_version.py ADDED
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
# Names exported by ``from occystrap._version import *``.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# TYPE_CHECKING is hard-coded to False here, so at runtime the ``else``
# branch below always executes and the ``typing`` imports never run.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # Runtime placeholders: both aliases are simply ``object``.
    VERSION_TUPLE = object
    COMMIT_ID = object

# Annotation-only declarations; the values are assigned further down.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

# Each value is bound under both a dunder name and a plain name.
__version__ = version = '0.4.1'
__version_tuple__ = version_tuple = (0, 4, 1)

# No commit id is recorded in this file.
__commit_id__ = commit_id = None
@@ -0,0 +1,10 @@
1
+ from occystrap.filters.base import ImageFilter
2
+ from occystrap.filters.exclude import ExcludeFilter
3
+ from occystrap.filters.inspect import InspectFilter
4
+ from occystrap.filters.normalize_timestamps import TimestampNormalizer
5
+ from occystrap.filters.search import SearchFilter
6
+
7
+ __all__ = [
8
+ 'ImageFilter', 'ExcludeFilter', 'InspectFilter',
9
+ 'TimestampNormalizer', 'SearchFilter',
10
+ ]
@@ -0,0 +1,67 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from occystrap.outputs.base import ImageOutput
4
+
5
+
6
class ImageFilter(ImageOutput, ABC):
    """Base class for pipeline stages placed in front of an ImageOutput.

    A filter holds a reference to a downstream ImageOutput (which may
    itself be another filter) and exposes the same ImageOutput interface,
    so stages can be stacked decorator-style:

        input -> filter1 -> filter2 -> output

    A concrete filter may, for each element it sees:
      - rewrite the element data (e.g., normalize timestamps)
      - rewrite the element name (e.g., recalculate hashes)
      - look at the element without changing it (e.g., search)
      - drop the element
      - keep state across elements (e.g., collect search results)
    """

    def __init__(self, wrapped_output):
        """Remember the downstream output this filter feeds.

        Args:
            wrapped_output: The ImageOutput that receives processed
                elements. May be None for a terminal filter that
                produces no output of its own (e.g., search-only mode).
        """
        self._wrapped = wrapped_output

    def fetch_callback(self, digest):
        """Say whether the layer identified by ``digest`` should be fetched.

        With no downstream output every layer is wanted (returns True);
        otherwise the downstream output's answer is returned unchanged.
        Subclasses override this to apply their own fetch policy.
        """
        downstream = self._wrapped
        return True if downstream is None else downstream.fetch_callback(
            digest)

    @abstractmethod
    def process_image_element(self, element_type, name, data):
        """Handle one image element; must be provided by subclasses.

        A typical implementation inspects or transforms the element and
        then hands the (possibly modified) element to ``self._wrapped``.

        Args:
            element_type: constants.CONFIG_FILE or constants.IMAGE_LAYER
            name: The element name/digest
            data: File-like object holding the element data, or None
                when the element was skipped by fetch_callback
        """

    def finalize(self):
        """Finish the filter run.

        The base behaviour is to finalize the downstream output when one
        exists. Subclasses override this to clean up or to emit results
        they accumulated.
        """
        downstream = self._wrapped
        if downstream is None:
            return
        downstream.finalize()
@@ -0,0 +1,136 @@
1
+ import fnmatch
2
+ import hashlib
3
+ import logging
4
+ import os
5
+ import tarfile
6
+ import tempfile
7
+
8
+ from occystrap import constants
9
+ from occystrap.filters.base import ImageFilter
10
+ from occystrap.tarformat import select_tar_format_for_layer
11
+
12
+
13
+ LOG = logging.getLogger(__name__)
14
+ LOG.setLevel(logging.INFO)
15
+
16
+
17
class ExcludeFilter(ImageFilter):
    """Excludes files matching glob patterns from image layers.

    This filter rewrites layer tarballs to remove entries whose path
    matches any of the specified glob patterns. Since this changes the
    layer content, the SHA256 hash is recalculated and the layer name is
    updated to match.

    This is useful for stripping unwanted content like .git directories,
    __pycache__ folders, or other files before writing output.
    """

    def __init__(self, wrapped_output, patterns):
        """Initialize the exclude filter.

        Args:
            wrapped_output: The ImageOutput to pass filtered elements to.
            patterns: Iterable of glob patterns to exclude. Each pattern
                is matched against the full member path using fnmatch.
        """
        super().__init__(wrapped_output)
        self.patterns = patterns

    def _matches_exclusion(self, path):
        """Check if a path matches any exclusion pattern.

        Args:
            path: The file path to check.

        Returns:
            True if the path should be excluded, False otherwise.
        """
        return any(
            fnmatch.fnmatch(path, pattern) for pattern in self.patterns)

    def _filter_layer(self, layer_data):
        """Filter a layer tarball, excluding matching entries.

        Creates a new tarball with entries that don't match exclusion
        patterns, calculates the new SHA256 hash, and returns both.

        The tar format is chosen by select_tar_format_for_layer (see
        tarformat.py), with excluded members skipped during selection.

        Args:
            layer_data: Seekable file-like object containing the
                original layer.

        Returns:
            Tuple of (filtered_file_handle, new_sha256_hex). The handle
            is a freshly opened read-only file on a temporary path; the
            caller owns it and must close and unlink it.

        Raises:
            Whatever tarfile / file I/O raises. The temporary file is
            removed before the exception propagates.
        """
        # Determine optimal tar format, skipping excluded members
        tar_format = select_tar_format_for_layer(
            layer_data,
            skip_fn=lambda m: self._matches_exclusion(m.name)
        )

        excluded_count = 0
        filtered_tf = tempfile.NamedTemporaryFile(delete=False)
        try:
            with filtered_tf:
                with tarfile.open(fileobj=filtered_tf, mode='w',
                                  format=tar_format) as filtered_tar:
                    layer_data.seek(0)
                    with tarfile.open(fileobj=layer_data,
                                      mode='r') as layer_tar:
                        for member in layer_tar:
                            if self._matches_exclusion(member.name):
                                excluded_count += 1
                                continue

                            if member.isfile():
                                filtered_tar.addfile(
                                    member, layer_tar.extractfile(member))
                            else:
                                filtered_tar.addfile(member)

                if excluded_count > 0:
                    LOG.info('Excluded %d entries from layer',
                             excluded_count)

                # Hash the finished tarball in fixed-size chunks so the
                # whole layer is never held in memory.
                filtered_tf.flush()
                filtered_tf.seek(0)
                h = hashlib.sha256()
                for chunk in iter(lambda: filtered_tf.read(8192), b''):
                    h.update(chunk)

            # The writer handle is closed; hand back a fresh reader.
            return open(filtered_tf.name, 'rb'), h.hexdigest()

        except BaseException:
            # Also covers KeyboardInterrupt/SystemExit so an interrupted
            # run does not leave the temporary layer behind. The file is
            # already closed by the ``with`` block above.
            try:
                os.unlink(filtered_tf.name)
            except OSError:
                # Nothing more can be done; keep the original error.
                pass
            raise

    def process_image_element(self, element_type, name, data):
        """Process an image element, filtering layer contents.

        Config files (and layers with no data) are passed through
        unchanged. Layers have matching entries excluded and their names
        updated to reflect the new SHA256 hash.

        If no output is wrapped there is nothing to hand the element to,
        so the call is a no-op.

        Args:
            element_type: constants.CONFIG_FILE or constants.IMAGE_LAYER
            name: The element name/digest.
            data: File-like object with the element data, or None.
        """
        if self._wrapped is None:
            return

        if element_type != constants.IMAGE_LAYER or data is None:
            self._wrapped.process_image_element(element_type, name, data)
            return

        LOG.info('Filtering layer %s', name)
        filtered_data, new_name = self._filter_layer(data)

        try:
            self._wrapped.process_image_element(
                element_type, new_name, filtered_data)
        finally:
            # Best-effort cleanup. Close and unlink are attempted
            # independently so a failing close cannot leak the file.
            try:
                filtered_data.close()
            except OSError as e:
                LOG.warning('Failed to close temporary layer %s: %s',
                            filtered_data.name, e)
            try:
                os.unlink(filtered_data.name)
            except OSError as e:
                LOG.warning('Failed to remove temporary layer %s: %s',
                            filtered_data.name, e)
@@ -0,0 +1,179 @@
1
+ import json
2
+ import logging
3
+
4
+ from occystrap import constants
5
+ from occystrap.filters.base import ImageFilter
6
+
7
+
8
+ LOG = logging.getLogger(__name__)
9
+ LOG.setLevel(logging.INFO)
10
+
11
+
12
class InspectFilter(ImageFilter):
    """Collects layer metadata and appends it to a JSONL file.

    This filter is a passthrough that records layer digests, sizes,
    and history information as elements flow through the pipeline.
    In finalize(), it appends one JSON line per image to the output
    file.

    This can be placed at any point in a filter chain to capture
    the state of layers at that stage of processing. Multiple
    inspect filters with different output files can be used to
    compare before/after effects of other filters.

    Output format (one JSON object per line):
        {"name": "image:tag", "layers": [
            {"Id": "sha256:...", "Size": N,
             "Created": N, "CreatedBy": "...",
             "Comment": "", "Tags": [...] or null},
            ...
        ]}
    """

    def __init__(self, wrapped_output, output_file,
                 image=None, tag=None):
        """Initialize the inspect filter.

        Args:
            wrapped_output: The ImageOutput to pass elements to,
                or None for inspect-only mode.
            output_file: Path to the file to append JSON lines to.
            image: Image name for output formatting.
            tag: Image tag for output formatting.
        """
        super().__init__(wrapped_output)
        self.output_file = output_file
        self.image = image
        self.tag = tag

        # Parsed from CONFIG_FILE
        self._history = []  # Non-empty-layer history entries

        # Collected from IMAGE_LAYER elements
        self._layers = []  # List of (digest, size) tuples

    def _parse_config(self, data):
        """Parse the image config to extract history entries.

        The config's history array has entries for all Dockerfile
        steps, including no-op steps (ENV, LABEL, CMD, etc.)
        marked with empty_layer=True. We extract only the entries
        that correspond to actual filesystem layers.

        A config that cannot be decoded, is not a JSON object, or has
        a missing/null history is tolerated: a warning is logged where
        appropriate and no history is recorded.
        """
        data.seek(0)
        try:
            config = json.load(data)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            LOG.warning('Failed to parse image config: %s', e)
            return

        if not isinstance(config, dict):
            LOG.warning('Image config is not a JSON object, ignoring')
            return

        # "history": null yields None from .get(), hence the ``or``.
        history = config.get('history') or []
        self._history.extend(
            entry for entry in history
            if isinstance(entry, dict)
            and not entry.get('empty_layer', False))

    def _normalize_digest(self, name):
        """Ensure digest has sha256: prefix."""
        if name and not name.startswith('sha256:'):
            return 'sha256:%s' % name
        return name

    @staticmethod
    def _parse_created(created):
        """Convert a history 'created' value to an int Unix timestamp.

        Args:
            created: An ISO 8601 / RFC 3339 string, or a number of
                seconds since the epoch.

        Returns:
            The timestamp as an int, or 0 when the value is missing,
            of an unexpected type, or cannot be parsed.
        """
        import datetime

        if isinstance(created, (int, float)):
            try:
                return int(created)
            except (ValueError, OverflowError):
                # NaN / Infinity are accepted by the json module.
                return 0
        if not isinstance(created, str):
            return 0

        text = created.strip()
        if text[-1:] in ('Z', 'z'):
            text = text[:-1] + '+00:00'

        # Container tooling commonly emits nanosecond precision, which
        # fromisoformat() rejects before Python 3.11. Normalize the
        # fractional part to exactly six digits (microseconds).
        head, dot, rest = text.partition('.')
        if dot:
            ndigits = 0
            while ndigits < len(rest) and rest[ndigits] in '0123456789':
                ndigits += 1
            if ndigits:
                fraction = rest[:ndigits][:6].ljust(6, '0')
                text = '%s.%s%s' % (head, fraction, rest[ndigits:])

        try:
            dt = datetime.datetime.fromisoformat(text)
            if dt.tzinfo is None:
                # No offset given: interpret as UTC so the result does
                # not depend on the host's local timezone.
                dt = dt.replace(tzinfo=datetime.timezone.utc)
            return int(dt.timestamp())
        except (ValueError, OverflowError, OSError):
            return 0

    def process_image_element(self, element_type, name, data):
        """Process an image element, recording layer metadata.

        Config files are parsed for history; layers have their name and
        byte size recorded (size 0 when no data was supplied). The
        element is then forwarded, rewound to the start, to the wrapped
        output if there is one.
        """
        if element_type == constants.CONFIG_FILE and data is not None:
            self._parse_config(data)

        if element_type == constants.IMAGE_LAYER:
            size = 0
            if data is not None:
                # Seek to the end to learn the size without reading.
                data.seek(0, 2)
                size = data.tell()
            self._layers.append((name, size))

        # Inspection above moves the file position; always rewind so
        # whoever uses ``data`` next starts from the beginning.
        if data is not None:
            data.seek(0)

        # Pass through to wrapped output
        if self._wrapped is not None:
            self._wrapped.process_image_element(
                element_type, name, data)

    def _build_layer_entries(self):
        """Build layer entry dicts by correlating layers with
        history.

        Layers are matched to non-empty history entries by position.
        Returns layers in reverse order (newest first) to match
        the convention used by docker history.
        """
        image_tag = None
        if self.image and self.tag:
            image_tag = '%s:%s' % (self.image, self.tag)

        entries = []
        for i, (digest, size) in enumerate(self._layers):
            entry = {
                'Id': self._normalize_digest(digest),
                'Size': size,
                'Created': 0,
                'CreatedBy': '',
                'Comment': '',
                'Tags': None,
            }

            # Correlate with history if available
            if i < len(self._history):
                hist = self._history[i]
                entry['Created'] = self._parse_created(
                    hist.get('created', ''))
                entry['CreatedBy'] = hist.get('created_by', '')
                entry['Comment'] = hist.get('comment', '')

            entries.append(entry)

        # Reverse to match docker history convention
        # (newest first) and tag the topmost layer
        entries.reverse()
        if entries and image_tag:
            entries[0]['Tags'] = [image_tag]

        return entries

    def _write_output(self):
        """Append a JSON line to the output file."""
        image_tag = ''
        if self.image and self.tag:
            image_tag = '%s:%s' % (self.image, self.tag)
        elif self.image:
            image_tag = self.image

        record = {
            'name': image_tag,
            'layers': self._build_layer_entries(),
        }

        line = json.dumps(record, sort_keys=True)
        with open(self.output_file, 'a') as f:
            f.write(line + '\n')

        LOG.info(
            'Wrote inspect data for %s (%d layers) to %s',
            image_tag, len(self._layers), self.output_file)

    def finalize(self):
        """Write collected metadata and finalize wrapped output."""
        self._write_output()

        if self._wrapped is not None:
            self._wrapped.finalize()
@@ -0,0 +1,123 @@
1
+ import hashlib
2
+ import logging
3
+ import os
4
+ import tarfile
5
+ import tempfile
6
+
7
+ from occystrap import constants
8
+ from occystrap.filters.base import ImageFilter
9
+ from occystrap.tarformat import select_tar_format_for_layer
10
+
11
+
12
+ LOG = logging.getLogger(__name__)
13
+ LOG.setLevel(logging.INFO)
14
+
15
+
16
class TimestampNormalizer(ImageFilter):
    """Normalizes timestamps in image layers for reproducible builds.

    This filter rewrites layer tarballs to set all file modification times
    to a consistent value (default: 0, Unix epoch). Since this changes the
    layer content, the SHA256 hash is recalculated and the layer name is
    updated to match.

    This is useful for creating reproducible image tarballs where the same
    source content always produces the same output, regardless of when the
    files were originally created or modified.
    """

    # PAX extended-header records that carry per-file timestamps. tarfile
    # copies a member's pax_headers into the output when writing PAX
    # format, so stale values must be dropped explicitly.
    _PAX_TIME_KEYS = ('mtime', 'atime', 'ctime')

    def __init__(self, wrapped_output, timestamp=0):
        """Initialize the timestamp normalizer.

        Args:
            wrapped_output: The ImageOutput to pass normalized elements to.
            timestamp: The Unix timestamp to set for all files (default: 0).
        """
        super().__init__(wrapped_output)
        self.timestamp = timestamp

    def _normalize_member(self, member):
        """Reset every timestamp carried by a tar member.

        Sets ``mtime`` to self.timestamp and removes any mtime/atime/ctime
        PAX records read from the source archive; left in place, those
        records would be re-emitted into a PAX-format output and keep the
        original times.

        Args:
            member: The tarfile.TarInfo to modify in place.

        Returns:
            The same member, for use as a transform callback.
        """
        member.mtime = self.timestamp
        pax_headers = getattr(member, 'pax_headers', None)
        if pax_headers:
            for key in self._PAX_TIME_KEYS:
                pax_headers.pop(key, None)
        return member

    def _normalize_layer(self, layer_data):
        """Normalize timestamps in a layer tarball.

        Creates a new tarball with all timestamps set to self.timestamp,
        calculates the new SHA256 hash, and returns both.

        The tar format is chosen by select_tar_format_for_layer (see
        tarformat.py) based on the transformed members.

        Args:
            layer_data: Seekable file-like object containing the
                original layer.

        Returns:
            Tuple of (normalized_file_handle, new_sha256_hex). The handle
            is a freshly opened read-only file on a temporary path; the
            caller owns it and must close and unlink it.

        Raises:
            Whatever tarfile / file I/O raises. The temporary file is
            removed before the exception propagates.
        """
        # Determine optimal tar format based on transformed members
        tar_format = select_tar_format_for_layer(
            layer_data, self._normalize_member)

        normalized_tf = tempfile.NamedTemporaryFile(delete=False)
        try:
            with normalized_tf:
                # Create a new tarball with normalized timestamps
                with tarfile.open(fileobj=normalized_tf, mode='w',
                                  format=tar_format) as normalized_tar:
                    layer_data.seek(0)
                    with tarfile.open(fileobj=layer_data,
                                      mode='r') as layer_tar:
                        for member in layer_tar:
                            self._normalize_member(member)

                            # Only regular files carry a data payload
                            if member.isfile():
                                normalized_tar.addfile(
                                    member, layer_tar.extractfile(member))
                            else:
                                normalized_tar.addfile(member)

                # Hash the finished tarball in fixed-size chunks so the
                # whole layer is never held in memory.
                normalized_tf.flush()
                normalized_tf.seek(0)
                h = hashlib.sha256()
                for chunk in iter(lambda: normalized_tf.read(8192), b''):
                    h.update(chunk)

            # The writer handle is closed; hand back a fresh reader.
            return open(normalized_tf.name, 'rb'), h.hexdigest()

        except BaseException:
            # Also covers KeyboardInterrupt/SystemExit so an interrupted
            # run does not leave the temporary layer behind. The file is
            # already closed by the ``with`` block above.
            try:
                os.unlink(normalized_tf.name)
            except OSError:
                # Nothing more can be done; keep the original error.
                pass
            raise

    def process_image_element(self, element_type, name, data):
        """Process an image element, normalizing layer timestamps.

        Config files (and layers with no data) are passed through
        unchanged. Layers have their timestamps normalized and their
        names updated to reflect the new SHA256 hash.

        If no output is wrapped there is nothing to hand the element to,
        so the call is a no-op.

        Args:
            element_type: constants.CONFIG_FILE or constants.IMAGE_LAYER
            name: The element name/digest.
            data: File-like object with the element data, or None.
        """
        if self._wrapped is None:
            return

        if element_type != constants.IMAGE_LAYER or data is None:
            # Pass through unchanged (config files, skipped layers)
            self._wrapped.process_image_element(element_type, name, data)
            return

        LOG.info('Normalizing timestamps in layer %s', name)
        normalized_data, new_name = self._normalize_layer(data)

        try:
            self._wrapped.process_image_element(
                element_type, new_name, normalized_data)
        finally:
            # Best-effort cleanup. Close and unlink are attempted
            # independently so a failing close cannot leak the file.
            try:
                normalized_data.close()
            except OSError as e:
                LOG.warning('Failed to close temporary layer %s: %s',
                            normalized_data.name, e)
            try:
                os.unlink(normalized_data.name)
            except OSError as e:
                LOG.warning('Failed to remove temporary layer %s: %s',
                            normalized_data.name, e)