occystrap 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. occystrap/_version.py +34 -0
  2. occystrap/filters/__init__.py +10 -0
  3. occystrap/filters/base.py +67 -0
  4. occystrap/filters/exclude.py +136 -0
  5. occystrap/filters/inspect.py +179 -0
  6. occystrap/filters/normalize_timestamps.py +123 -0
  7. occystrap/filters/search.py +177 -0
  8. occystrap/inputs/__init__.py +1 -0
  9. occystrap/inputs/base.py +40 -0
  10. occystrap/inputs/docker.py +171 -0
  11. occystrap/{docker_registry.py → inputs/registry.py} +112 -50
  12. occystrap/inputs/tarfile.py +88 -0
  13. occystrap/main.py +330 -31
  14. occystrap/outputs/__init__.py +1 -0
  15. occystrap/outputs/base.py +46 -0
  16. occystrap/{output_directory.py → outputs/directory.py} +10 -9
  17. occystrap/outputs/docker.py +137 -0
  18. occystrap/{output_mounts.py → outputs/mounts.py} +2 -1
  19. occystrap/{output_ocibundle.py → outputs/ocibundle.py} +1 -1
  20. occystrap/outputs/registry.py +240 -0
  21. occystrap/{output_tarfile.py → outputs/tarfile.py} +18 -2
  22. occystrap/pipeline.py +297 -0
  23. occystrap/tarformat.py +122 -0
  24. occystrap/tests/test_inspect.py +355 -0
  25. occystrap/tests/test_tarformat.py +199 -0
  26. occystrap/uri.py +231 -0
  27. occystrap/util.py +67 -38
  28. occystrap-0.4.1.dist-info/METADATA +444 -0
  29. occystrap-0.4.1.dist-info/RECORD +38 -0
  30. {occystrap-0.4.0.dist-info → occystrap-0.4.1.dist-info}/WHEEL +1 -1
  31. {occystrap-0.4.0.dist-info → occystrap-0.4.1.dist-info}/entry_points.txt +0 -1
  32. occystrap/docker_extract.py +0 -36
  33. occystrap-0.4.0.dist-info/METADATA +0 -131
  34. occystrap-0.4.0.dist-info/RECORD +0 -20
  35. occystrap-0.4.0.dist-info/pbr.json +0 -1
  36. {occystrap-0.4.0.dist-info → occystrap-0.4.1.dist-info/licenses}/AUTHORS +0 -0
  37. {occystrap-0.4.0.dist-info → occystrap-0.4.1.dist-info/licenses}/LICENSE +0 -0
  38. {occystrap-0.4.0.dist-info → occystrap-0.4.1.dist-info}/top_level.txt +0 -0
occystrap/pipeline.py ADDED
@@ -0,0 +1,297 @@
1
+ """Pipeline builder for occystrap.
2
+
3
+ This module provides a PipelineBuilder class that constructs input -> filter
4
+ chain -> output pipelines from URI specifications.
5
+ """
6
+
7
+ import os
8
+
9
+ from occystrap.inputs import docker as input_docker
10
+ from occystrap.inputs import registry as input_registry
11
+ from occystrap.inputs import tarfile as input_tarfile
12
+ from occystrap.outputs import directory as output_directory
13
+ from occystrap.outputs import docker as output_docker
14
+ from occystrap.outputs import mounts as output_mounts
15
+ from occystrap.outputs import ocibundle as output_ocibundle
16
+ from occystrap.outputs import registry as output_registry
17
+ from occystrap.outputs import tarfile as output_tarfile
18
+ from occystrap.filters import (
19
+ ExcludeFilter, InspectFilter, TimestampNormalizer, SearchFilter
20
+ )
21
+ from occystrap import uri
22
+
23
+
24
class PipelineError(Exception):
    """Raised when a pipeline cannot be built."""


class PipelineBuilder:
    """Builds input -> filter chain -> output pipelines from URIs.

    Options given on a URI or filter spec take precedence over the
    matching global option stored on the Click context object.
    """

    def __init__(self, ctx=None):
        """Initialize the pipeline builder.

        Args:
            ctx: Click context object containing global options like
                 OS, ARCHITECTURE, VARIANT, USERNAME, PASSWORD, INSECURE.
                 Can be None for defaults.
        """
        self.ctx = ctx
        # Fall back to an empty mapping so _get_ctx() never needs to care
        # whether a context (or its obj) was supplied.
        self._ctx_obj = ctx.obj if ctx and ctx.obj else {}

    def _get_ctx(self, key, default=None):
        """Get a value from the context object."""
        return self._ctx_obj.get(key, default)

    @staticmethod
    def _get_option(options, names, default=None):
        """Return the value of the first key in names present in options.

        Lets an option be spelled several ways (e.g. 'unique_names' and
        'unique-names') with a single, consistent precedence rule: earlier
        names win, and default is used only when none is present.
        """
        for name in names:
            if name in options:
                return options[name]
        return default

    @staticmethod
    def _require_path(uri_spec, message):
        """Return uri_spec.path, raising PipelineError(message) if empty."""
        path = uri_spec.path
        if not path:
            raise PipelineError(message)
        return path

    def _registry_auth(self, uri_spec):
        """Return (username, password, insecure) for a registry URI.

        Each value comes from the URI options when present, otherwise from
        the USERNAME / PASSWORD / INSECURE entries of the context object.
        """
        options = uri_spec.options
        username = options.get('username', self._get_ctx('USERNAME'))
        password = options.get('password', self._get_ctx('PASSWORD'))
        insecure = options.get('insecure', self._get_ctx('INSECURE', False))
        return username, password, insecure

    def build_input(self, uri_spec):
        """Create an ImageInput from a URI spec.

        Args:
            uri_spec: A URISpec from uri.parse_uri()

        Returns:
            An ImageInput instance.

        Raises:
            PipelineError: If the input cannot be created.
        """
        scheme = uri_spec.scheme

        if scheme == 'registry':
            host, image, tag = uri.parse_registry_uri(uri_spec)

            # Get options from URI or context
            options = uri_spec.options
            os_name = options.get('os', self._get_ctx('OS', 'linux'))
            arch = self._get_option(
                options, ('arch', 'architecture'),
                self._get_ctx('ARCHITECTURE', 'amd64'))
            variant = options.get('variant', self._get_ctx('VARIANT', ''))
            username, password, insecure = self._registry_auth(uri_spec)

            return input_registry.Image(
                host, image, tag,
                os=os_name,
                architecture=arch,
                variant=variant,
                secure=(not insecure),
                username=username,
                password=password)

        if scheme == 'docker':
            image, tag, socket = uri.parse_docker_uri(uri_spec)
            return input_docker.Image(image, tag, socket_path=socket)

        if scheme == 'tar':
            path = self._require_path(
                uri_spec, 'tar:// URI requires a path')
            return input_tarfile.Image(path)

        raise PipelineError('Unknown input scheme: %s' % uri_spec.scheme)

    def build_output(self, uri_spec, image, tag):
        """Create an ImageOutput from a URI spec.

        Args:
            uri_spec: A URISpec from uri.parse_uri()
            image: Image name (from input source)
            tag: Image tag (from input source)

        Returns:
            An ImageOutput instance.

        Raises:
            PipelineError: If the output cannot be created.
        """
        scheme = uri_spec.scheme

        if scheme == 'tar':
            path = self._require_path(
                uri_spec, 'tar:// output URI requires a path')
            return output_tarfile.TarWriter(image, tag, path)

        if scheme == 'dir':
            path = self._require_path(
                uri_spec, 'dir:// URI requires a path')
            options = uri_spec.options
            # Accept the hyphenated spelling too, matching how the search
            # filter treats script_friendly / script-friendly.
            unique_names = self._get_option(
                options, ('unique_names', 'unique-names'), False)
            expand = options.get('expand', False)
            return output_directory.DirWriter(
                image, tag, path,
                unique_names=unique_names,
                expand=expand)

        if scheme == 'oci':
            path = self._require_path(
                uri_spec, 'oci:// URI requires a path')
            return output_ocibundle.OCIBundleWriter(image, tag, path)

        if scheme == 'mounts':
            path = self._require_path(
                uri_spec, 'mounts:// URI requires a path')

            # Check for required OS features
            if not hasattr(os, 'setxattr'):
                raise PipelineError(
                    'mounts:// output requires setxattr support')
            if not hasattr(os, 'mknod'):
                raise PipelineError(
                    'mounts:// output requires mknod support')

            return output_mounts.MountWriter(image, tag, path)

        if scheme == 'docker':
            # Only the socket comes from the URI; the image name and tag
            # passed in by the caller are what gets written.
            _, _, socket = uri.parse_docker_uri(uri_spec)
            return output_docker.DockerWriter(image, tag, socket_path=socket)

        if scheme == 'registry':
            # The destination image and tag come from the output URI, not
            # from the image/tag arguments.
            host, dest_image, dest_tag = uri.parse_registry_uri(uri_spec)
            username, password, insecure = self._registry_auth(uri_spec)
            return output_registry.RegistryWriter(
                host, dest_image, dest_tag,
                secure=(not insecure),
                username=username,
                password=password)

        raise PipelineError('Unknown output scheme: %s' % uri_spec.scheme)

    def build_filter(self, filter_spec, wrapped_output, image=None, tag=None):
        """Wrap an output with a filter.

        Args:
            filter_spec: A FilterSpec from uri.parse_filter()
            wrapped_output: The ImageOutput to wrap
            image: Image name (for search filter output)
            tag: Image tag (for search filter output)

        Returns:
            An ImageFilter wrapping the output.

        Raises:
            PipelineError: If the filter cannot be created.
        """
        # Filter names are case-insensitive and accept '_' or '-'.
        name = filter_spec.name.lower().replace('_', '-')
        options = filter_spec.options

        if name == 'normalize-timestamps':
            timestamp = self._get_option(options, ('timestamp', 'ts'), 0)
            return TimestampNormalizer(wrapped_output, timestamp=timestamp)

        if name == 'search':
            pattern = options.get('pattern')
            if not pattern:
                raise PipelineError(
                    'search filter requires pattern option')
            use_regex = options.get('regex', False)
            script_friendly = self._get_option(
                options, ('script_friendly', 'script-friendly'), False)
            return SearchFilter(
                wrapped_output,
                pattern=pattern,
                use_regex=use_regex,
                image=image,
                tag=tag,
                script_friendly=script_friendly)

        if name == 'exclude':
            pattern_str = options.get('pattern')
            if not pattern_str:
                raise PipelineError(
                    'exclude filter requires pattern option')
            # Drop blank entries produced by stray or trailing commas so
            # the filter never receives an empty pattern.
            patterns = [p.strip() for p in pattern_str.split(',')]
            patterns = [p for p in patterns if p]
            if not patterns:
                raise PipelineError(
                    'exclude filter requires pattern option')
            return ExcludeFilter(wrapped_output, patterns=patterns)

        if name == 'inspect':
            output_file = options.get('file')
            if not output_file:
                raise PipelineError(
                    'inspect filter requires file option')
            return InspectFilter(
                wrapped_output,
                output_file=output_file,
                image=image,
                tag=tag)

        raise PipelineError('Unknown filter: %s' % filter_spec.name)

    def build_pipeline(self, source_uri_str, dest_uri_str, filter_strs=None):
        """Build complete pipeline from URI strings.

        Args:
            source_uri_str: Input URI string
            dest_uri_str: Output URI string
            filter_strs: List of filter specification strings

        Returns:
            Tuple of (input_source, output_chain)

        Raises:
            PipelineError: If the pipeline cannot be built.
            uri.URIParseError: If a URI cannot be parsed.
        """
        if filter_strs is None:
            filter_strs = []

        # Parse everything first so malformed specs fail before any
        # input or output object is constructed.
        source_spec = uri.parse_uri(source_uri_str)
        dest_spec = uri.parse_uri(dest_uri_str)
        filter_specs = [uri.parse_filter(f) for f in filter_strs]

        input_source = self.build_input(source_spec)

        output = self.build_output(
            dest_spec, input_source.image, input_source.tag)

        # Wrap with filters (in reverse order so first filter is outermost)
        for filter_spec in reversed(filter_specs):
            output = self.build_filter(
                filter_spec, output,
                image=input_source.image,
                tag=input_source.tag)

        return input_source, output

    def build_search_pipeline(self, source_uri_str, pattern, use_regex=False,
                              script_friendly=False):
        """Build a search-only pipeline (no output destination).

        Args:
            source_uri_str: Input URI string
            pattern: Search pattern
            use_regex: If True, treat pattern as regex
            script_friendly: If True, output in machine-parseable format

        Returns:
            Tuple of (input_source, search_filter)
        """
        source_spec = uri.parse_uri(source_uri_str)
        input_source = self.build_input(source_spec)

        # Create search filter with no wrapped output
        searcher = SearchFilter(
            None,  # No wrapped output
            pattern=pattern,
            use_regex=use_regex,
            image=input_source.image,
            tag=input_source.tag,
            script_friendly=script_friendly)

        return input_source, searcher
occystrap/tarformat.py ADDED
@@ -0,0 +1,122 @@
1
+ # Smart tar format selection for occystrap.
2
+ #
3
+ # Uses USTAR format by default (smaller output), falls back to PAX when needed.
4
+ # This can save ~1KB per file with long names (>100 chars) which adds up to
5
+ # tens of megabytes on large container layers.
6
+ #
7
+ # See docs/tar-format-selection.md for detailed explanation.
8
+
9
+ import logging
10
+ import os
11
+ import tarfile
12
+
13
+
14
+ LOG = logging.getLogger(__name__)
15
+
16
# USTAR format limits (POSIX.1-1988)
#
# USTAR stores paths using two fields:
#   - name:   100 bytes for the trailing part of the path
#   - prefix: 155 bytes for the leading part of the path
#
# A path longer than 100 bytes is stored by splitting it at any '/' such
# that the part before fits in prefix and the part after fits in name.
# Combined, this allows paths up to 256 characters (prefix + '/' + name)
# without requiring extended headers.
#
# PAX format (POSIX.1-2001) adds extended header blocks for metadata that
# doesn't fit in the USTAR header. Each extended header adds ~1KB overhead.
USTAR_MAX_PATH = 256
USTAR_MAX_NAME = 100
USTAR_MAX_PREFIX = 155
USTAR_MAX_LINKNAME = 100
USTAR_MAX_SIZE = 8 * 1024 * 1024 * 1024 - 1  # 8 GiB - 1 byte
USTAR_MAX_ID = 0o7777777  # 2097151 (max value in 8-byte octal field)
USTAR_MAX_MTIME = 8 ** 11 - 1  # max value in 12-byte octal field
USTAR_MAX_OWNER_NAME = 32  # uname / gname field width in bytes


def _is_ascii(text):
    """Return True if text can be encoded as pure ASCII."""
    try:
        text.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True


def _fits_ustar_path(path):
    """Return True if path fits the USTAR name field or name+prefix split.

    Mirrors the split rule used by the standard library tarfile writer:
    any '/' is a valid split point as long as the text before it fits in
    the prefix field and the text after it fits in the name field.
    """
    total = len(path)
    if total <= USTAR_MAX_NAME:
        return True
    if total > USTAR_MAX_PATH:
        return False
    return any(
        idx <= USTAR_MAX_PREFIX and total - idx - 1 <= USTAR_MAX_NAME
        for idx, char in enumerate(path) if char == '/')


def _fits_ustar_number(value, max_value):
    """Return True if value is representable in a USTAR octal field."""
    return 0 <= int(value) <= max_value


def needs_pax_format(member):
    """
    Check if a TarInfo member requires PAX format due to USTAR limitations.

    USTAR format is more compact but has restrictions. This function checks
    if a member exceeds any of those restrictions: non-ASCII text fields,
    a path that cannot be stored in the name/prefix fields, over-long link
    targets or owner names, and numeric fields (size, uid, gid, mtime) that
    are negative or too large for their octal header field.

    Args:
        member: A TarInfo object to check. Only the name, linkname, size,
                uid and gid attributes are required; type, mtime, uname and
                gname are used when present.

    Returns:
        bool: True if PAX format is required, False if USTAR suffices.
    """
    name = member.name
    linkname = member.linkname or ''
    uname = getattr(member, 'uname', '') or ''
    gname = getattr(member, 'gname', '') or ''

    # Check for non-ASCII characters (USTAR only supports ASCII). Done
    # first so the length checks below can treat characters as bytes.
    if not all(_is_ascii(text) for text in (name, linkname, uname, gname)):
        return True

    # tarfile stores directory names with a trailing '/', which counts
    # against the field limits, so check the name as it will be written.
    stored_name = name
    is_directory = getattr(member, 'type', None) == tarfile.DIRTYPE
    if is_directory and not stored_name.endswith('/'):
        stored_name += '/'
    if not _fits_ustar_path(stored_name):
        return True

    # Check symlink/hardlink target length
    if len(linkname) > USTAR_MAX_LINKNAME:
        return True

    # Owner names longer than the field would be silently truncated.
    if (len(uname) > USTAR_MAX_OWNER_NAME or
            len(gname) > USTAR_MAX_OWNER_NAME):
        return True

    # Check file size (USTAR uses 12-byte octal, max ~8 GiB)
    if not _fits_ustar_number(member.size, USTAR_MAX_SIZE):
        return True

    # Check UID/GID (USTAR uses 8-byte octal fields)
    if not (_fits_ustar_number(member.uid, USTAR_MAX_ID) and
            _fits_ustar_number(member.gid, USTAR_MAX_ID)):
        return True

    # Negative (pre-1970) or very large timestamps overflow the field.
    if not _fits_ustar_number(getattr(member, 'mtime', 0), USTAR_MAX_MTIME):
        return True

    return False
83
+
84
+
85
def select_tar_format_for_layer(layer_fileobj, transform_fn=None, skip_fn=None):
    """
    Pick the most compact tar format able to hold a (transformed) layer.

    The layer is scanned read-only from its start. Members rejected by
    skip_fn are ignored; the remaining members are passed through
    transform_fn (when given) and checked with needs_pax_format(). The
    scan stops at the first member that needs PAX. The file object is
    rewound to offset 0 before returning.

    Args:
        layer_fileobj: Seekable file-like object containing the tar layer.
        transform_fn: Optional function(TarInfo) -> TarInfo that will be
                      applied to members. The format check uses the
                      transformed member attributes.
        skip_fn: Optional function(TarInfo) -> bool that returns True for
                 members that will be skipped/excluded. These members are
                 not considered in the format selection.

    Returns:
        tarfile format constant: tarfile.USTAR_FORMAT or tarfile.PAX_FORMAT
    """
    def _members_as_written(archive):
        # Lazily yield each member the way it would end up in the output.
        for entry in archive:
            if skip_fn and skip_fn(entry):
                continue
            yield transform_fn(entry) if transform_fn else entry

    layer_fileobj.seek(0)

    with tarfile.open(fileobj=layer_fileobj, mode='r') as archive:
        # any() stops consuming the generator at the first PAX-only member.
        if any(needs_pax_format(entry)
               for entry in _members_as_written(archive)):
            layer_fileobj.seek(0)
            LOG.debug('Layer requires PAX format')
            return tarfile.PAX_FORMAT

    layer_fileobj.seek(0)
    LOG.debug('Layer compatible with USTAR format')
    return tarfile.USTAR_FORMAT