code-provenance 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ from code_provenance.models import ImageRef, ImageResult
code_provenance/cli.py ADDED
@@ -0,0 +1,69 @@
1
+ import argparse
2
+ import sys
3
+ from pathlib import Path
4
+ from code_provenance.compose_parser import parse_compose, parse_image_ref
5
+ from code_provenance.resolver import resolve_image
6
+ from code_provenance.output import format_json, format_table
7
+
8
+
9
+ def main(argv: list[str] | None = None) -> int:
10
+ parser = argparse.ArgumentParser(
11
+ prog="code-provenance",
12
+ description="Resolve Docker images to their source code commits on GitHub.",
13
+ )
14
+ parser.add_argument(
15
+ "compose_file",
16
+ nargs="?",
17
+ default="docker-compose.yml",
18
+ help="Path to docker-compose file (default: docker-compose.yml)",
19
+ )
20
+ parser.add_argument(
21
+ "--json",
22
+ action="store_true",
23
+ dest="json_output",
24
+ help="Output results as JSON",
25
+ )
26
+ parser.add_argument(
27
+ "--verbose", "-v",
28
+ action="store_true",
29
+ help="Show resolution steps",
30
+ )
31
+
32
+ args = parser.parse_args(argv)
33
+
34
+ compose_path = Path(args.compose_file)
35
+ if not compose_path.exists():
36
+ print(f"Error: {compose_path} not found", file=sys.stderr)
37
+ return 1
38
+
39
+ yaml_content = compose_path.read_text()
40
+ services = parse_compose(yaml_content)
41
+
42
+ if not services:
43
+ print("No services with images found.", file=sys.stderr)
44
+ return 0
45
+
46
+ results = []
47
+ for service_name, image_string in services:
48
+ ref = parse_image_ref(image_string)
49
+ result = resolve_image(service_name, ref)
50
+ results.append(result)
51
+
52
+ if args.verbose:
53
+ for result in results:
54
+ print(f"\nResolving {result.image} ...", file=sys.stderr)
55
+ for step in result.steps:
56
+ print(f" {step}", file=sys.stderr)
57
+ print(f" → {result.status}" + (f" ({result.resolution_method}, {result.confidence})" if result.status == "resolved" else ""), file=sys.stderr)
58
+ print(file=sys.stderr)
59
+
60
+ if args.json_output:
61
+ print(format_json(results))
62
+ else:
63
+ print(format_table(results))
64
+
65
+ return 0
66
+
67
+
68
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,58 @@
1
+ import yaml
2
+ from code_provenance.models import ImageRef
3
+
4
+
5
def parse_image_ref(image_string: str) -> ImageRef:
    """Parse a Docker image string into an ImageRef.

    Handles ``name``, ``name:tag``, ``name@digest`` and ``name:tag@digest``,
    with an optional registry host as the first path component.

    Args:
        image_string: Image reference as written in the compose file.

    Returns:
        An ``ImageRef`` whose ``tag`` is the digest for digest references,
        the explicit tag otherwise, or ``"latest"`` when neither is given.
        ``raw`` always holds the unmodified input.
    """
    raw = image_string

    def _has_tag(reference: str) -> bool:
        # A colon only denotes a tag in the last path component; in the first
        # component it is a registry port (e.g. "localhost:5000/app").
        return ":" in reference.rsplit("/", 1)[-1]

    if "@" in image_string:
        # Digest reference (image@sha256:...): the digest is stored as the tag.
        image_string, tag = image_string.split("@", 1)
        # "name:tag@digest" is valid; the digest wins, so drop the tag
        # instead of leaving ":tag" glued to the image name.
        if _has_tag(image_string):
            image_string = image_string[: image_string.rfind(":")]
    elif _has_tag(image_string):
        image_string, tag = image_string.rsplit(":", 1)
    else:
        tag = "latest"

    # Determine registry: the first component is a host when it contains a
    # dot or a port, or is the bare "localhost".
    parts = image_string.split("/")
    first = parts[0]
    if len(parts) >= 2 and ("." in first or ":" in first or first == "localhost"):
        registry = first
        remaining = parts[1:]
    else:
        registry = "docker.io"
        remaining = parts

    # Determine namespace and name; extra path levels stay inside the name.
    if len(remaining) == 1:
        namespace = "library"
        name = remaining[0]
    else:
        namespace = remaining[0]
        name = "/".join(remaining[1:])

    return ImageRef(
        registry=registry,
        namespace=namespace,
        name=name,
        tag=tag,
        raw=raw,
    )
48
+
49
+
50
def parse_compose(yaml_content: str) -> list[tuple[str, str]]:
    """Parse docker-compose YAML and return list of (service_name, image_string).

    Args:
        yaml_content: Text of a docker-compose file.

    Returns:
        One ``(service_name, image_string)`` pair per service that declares a
        non-empty string ``image``, in document order. An empty document, a
        non-mapping document, or a missing/non-mapping ``services`` section
        yields an empty list.
    """
    data = yaml.safe_load(yaml_content)
    # safe_load returns None for an empty file and may return a list or a
    # scalar; only a mapping can hold a "services" section.
    if not isinstance(data, dict):
        return []

    services = data.get("services")
    if not isinstance(services, dict):
        return []

    return [
        (service_name, service_config["image"])
        for service_name, service_config in services.items()
        if isinstance(service_config, dict)
        # Skip `image: null` or numeric values: callers parse this as a string.
        and isinstance(service_config.get("image"), str)
        and service_config["image"]
    ]
@@ -0,0 +1,250 @@
1
+ import os
2
+ import re
3
+ import requests
4
+
5
+
6
def github_headers() -> dict[str, str]:
    """Build GitHub API headers, with optional token auth.

    Returns:
        A dict with the GitHub JSON ``Accept`` header, plus a bearer
        ``Authorization`` header when the ``GITHUB_TOKEN`` environment
        variable holds a non-blank value.
    """
    headers = {"Accept": "application/vnd.github+json"}
    # Strip surrounding whitespace: a token read from a file often carries a
    # trailing newline, which makes an invalid header value, and a blank
    # token must not be sent as credentials.
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        headers["Authorization"] = f"Bearer {token}"
    return headers
13
+
14
+
15
def _normalize_tag(tag: str) -> str:
    """Strip a single leading 'v' from a version-like tag for comparison.

    Only a ``v`` directly followed by a digit is removed, so ``v2.10``
    becomes ``2.10`` while non-version names such as ``vault`` are left
    untouched (``str.lstrip("v")`` would strip every leading ``v``).
    """
    if len(tag) > 1 and tag[0] == "v" and "0" <= tag[1] <= "9":
        return tag[1:]
    return tag
18
+
19
+
20
def _is_prefix_match(image_tag: str, git_tag: str) -> bool:
    """Tell whether ``git_tag`` is a more specific version of ``image_tag``.

    Both tags are normalized first. The git tag must start with the image
    tag followed by a dot, so image_tag='v2.10' matches git_tag='v2.10.7'
    but neither 'v2.1' nor 'v2.100'.
    """
    required_prefix = _normalize_tag(image_tag) + "."
    candidate = _normalize_tag(git_tag)
    return candidate.startswith(required_prefix)
28
+
29
+
30
def _parse_version_tuple(tag: str) -> tuple[int, ...] | None:
    """Turn a version tag into a tuple of ints suitable for ordering.

    The tag is normalized, anything from the first ``-`` or ``+`` onward
    (e.g. ``-rc1``, ``-beta2``) is discarded, and the rest is split on dots.
    Returns ``None`` when any dotted component is not an integer.
    """
    core = re.split(r"[-+]", _normalize_tag(tag))[0]
    try:
        return tuple(map(int, core.split(".")))
    except ValueError:
        return None
40
+
41
+
42
def resolve_tag_to_commit(owner: str, repo: str, tag: str) -> tuple[str, bool] | None:
    """Resolve an image tag to a commit SHA by matching against git tags.

    Pages through the repository's tags on the GitHub API. Tries exact match
    first, then prefix match (e.g., v2.10 -> highest v2.10.x).

    Args:
        owner: GitHub owner (user or organisation).
        repo: Repository name.
        tag: Image tag to look for.

    Returns:
        ``(commit_sha, is_exact_match)``, or ``None`` when nothing matches,
        the API answers with a non-200 status, or the request fails.
    """
    headers = github_headers()
    url = f"https://api.github.com/repos/{owner}/{repo}/tags"

    prefix_candidates: list[tuple[tuple[int, ...], str]] = []

    while url:
        # Treat network and decoding failures like an API error, as the
        # sibling helpers in this module do, instead of letting them escape.
        try:
            resp = requests.get(url, headers=headers, params={"per_page": 100}, timeout=10)
            if resp.status_code != 200:
                return None
            git_tags = resp.json()
        except (requests.RequestException, ValueError):
            return None
        if not isinstance(git_tags, list):
            return None

        for git_tag in git_tags:
            try:
                name = git_tag["name"]
                sha = git_tag["commit"]["sha"]
            except (KeyError, TypeError):
                # Skip malformed entries rather than aborting the whole lookup.
                continue

            # Exact match (with/without v prefix)
            if name == tag or name == f"v{tag}" or _normalize_tag(name) == _normalize_tag(tag):
                return sha, True

            # Collect prefix match candidates
            if _is_prefix_match(tag, name):
                version = _parse_version_tuple(name)
                if version is not None:
                    prefix_candidates.append((version, sha))

        url = resp.links.get("next", {}).get("url")

    # Return the highest version among prefix matches
    if prefix_candidates:
        return max(prefix_candidates)[1], False

    return None
78
+
79
+
80
def get_latest_release_commit(owner: str, repo: str) -> tuple[str, str] | None:
    """Look up the commit behind the repository's latest GitHub release.

    Fetches ``releases/latest`` and resolves its ``tag_name`` through
    ``resolve_tag_to_commit``.

    Returns:
        ``(commit_sha, tag_name)``, or ``None`` when the request fails, the
        status is not 200, the release has no tag name, or the tag cannot be
        resolved.
    """
    headers = github_headers()
    release_url = f"https://api.github.com/repos/{owner}/{repo}/releases/latest"
    try:
        resp = requests.get(release_url, headers=headers, timeout=10)
        if resp.status_code != 200:
            return None
        tag_name = resp.json().get("tag_name")
    except requests.RequestException:
        return None

    if not tag_name:
        return None

    # Map the release tag onto a commit.
    resolved = resolve_tag_to_commit(owner, repo, tag_name)
    if not resolved:
        return None
    commit_sha, _ = resolved
    return commit_sha, tag_name
106
+
107
+
108
def get_latest_commit(owner: str, repo: str) -> str | None:
    """Return the SHA of the first commit GitHub lists for the repository.

    Requests ``/commits`` with ``per_page=1`` and no branch parameter.
    Returns ``None`` on a request failure, a non-200 status, an empty list
    or an unexpected payload shape.
    """
    headers = github_headers()
    commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    try:
        resp = requests.get(
            commits_url,
            headers=headers,
            params={"per_page": 1},
            timeout=10,
        )
        if resp.status_code == 200:
            commits = resp.json()
            if commits:
                return commits[0]["sha"]
    except (requests.RequestException, KeyError, IndexError):
        pass
    return None
126
+
127
+
128
def check_github_repo_exists(owner: str, repo: str) -> bool:
    """Report whether the GitHub API answers 200 for ``owner/repo``.

    Any other status, or a request failure, counts as "does not exist".
    """
    headers = github_headers()
    try:
        resp = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            headers=headers,
            timeout=10,
        )
    except requests.RequestException:
        return False
    return resp.status_code == 200
140
+
141
+
142
def _find_ghcr_package_version(
    owner: str, package_name: str, *, match_digest: str | None = None, match_tag: str | None = None,
) -> dict | None:
    """Find a GHCR package version by digest or tag via the GitHub Packages API.

    Requires GITHUB_TOKEN (the docs call for ``read:packages`` scope); without
    an ``Authorization`` header the lookup is skipped. The package is searched
    under ``orgs/<owner>`` first, then ``users/<owner>``.

    Args:
        owner: GHCR namespace that owns the package.
        package_name: Container package name; may contain ``/``.
        match_digest: Version name (digest) to match. Only enforced when
            ``match_tag`` is ``None``.
        match_tag: Tag that must be present on the version.

    Returns:
        ``{"repo": "owner/repo", "commit": sha_or_None, "tags": [...]}`` for
        the first matching version, or ``None`` when no token is set, access
        is forbidden (403), or nothing matches.
    """
    from urllib.parse import quote  # local import: only this helper needs it

    headers = github_headers()
    if "Authorization" not in headers:
        return None

    # Nested package names ("repo/image") must be percent-encoded to form a
    # single path segment in the Packages API URL.
    encoded_name = quote(package_name, safe="")

    for entity_type in ("orgs", "users"):
        pkg_base = f"https://api.github.com/{entity_type}/{owner}/packages/container/{encoded_name}"

        # Get package metadata for source repo
        try:
            pkg_resp = requests.get(pkg_base, headers=headers, timeout=10)
            if pkg_resp.status_code == 403:
                return None
            if pkg_resp.status_code != 200:
                continue
            pkg_data = pkg_resp.json()
        except requests.RequestException:
            continue

        # "repository" can be present but null for unlinked packages.
        repo_info = pkg_data.get("repository") or {}
        full_name = repo_info.get("full_name")
        if not full_name:
            continue

        # Search versions
        url = f"{pkg_base}/versions"
        try:
            while url:
                resp = requests.get(url, headers=headers, params={"per_page": 50}, timeout=10)
                if resp.status_code != 200:
                    break

                for version in resp.json():
                    name = version.get("name", "")
                    metadata = (version.get("metadata") or {}).get("container") or {}
                    tags = metadata.get("tags") or []

                    # Match by digest (version name is the digest); a digest
                    # mismatch only disqualifies when no tag was requested.
                    if match_digest and name != match_digest and match_tag is None:
                        continue
                    # Match by tag
                    if match_tag and match_tag not in tags:
                        continue

                    # Found matching version — resolve tags to a commit
                    repo_owner, repo_name = full_name.split("/", 1)
                    for tag in tags:
                        if tag == "latest":
                            continue
                        tag_result = resolve_tag_to_commit(repo_owner, repo_name, tag)
                        if tag_result:
                            commit_sha, _ = tag_result
                            return {"repo": full_name, "commit": commit_sha, "tags": tags}

                    return {"repo": full_name, "commit": None, "tags": tags}

                url = resp.links.get("next", {}).get("url")
        except requests.RequestException:
            continue

    return None
210
+
211
+
212
def resolve_ghcr_digest_via_packages(owner: str, package_name: str, digest: str) -> dict | None:
    """Look up a GHCR package version by digest and return its repo/commit info.

    Thin wrapper around ``_find_ghcr_package_version`` with ``match_digest``.
    """
    lookup = _find_ghcr_package_version(owner, package_name, match_digest=digest)
    return lookup
215
+
216
+
217
def resolve_ghcr_latest_via_packages(owner: str, package_name: str) -> dict | None:
    """Look up the GHCR package version tagged ``latest`` and return its repo/commit info.

    Thin wrapper around ``_find_ghcr_package_version`` with ``match_tag="latest"``.
    """
    lookup = _find_ghcr_package_version(owner, package_name, match_tag="latest")
    return lookup
220
+
221
+
222
def infer_repo_from_dockerhub(namespace: str, name: str) -> tuple[str, str] | None:
    """Try to find the GitHub repo for a Docker Hub image.

    First guesses a repository directly: ``name/name`` for official images
    (namespace ``library``, e.g. traefik -> traefik/traefik) and
    ``namespace/name`` otherwise. If that repo does not exist, falls back to
    the first GitHub link in the image's Docker Hub description.

    Returns:
        ``(owner, repo)`` or ``None`` when nothing could be inferred.
    """
    owner_guess = name if namespace == "library" else namespace
    if check_github_repo_exists(owner_guess, name):
        return owner_guess, name

    # Fall back to scraping Docker Hub description for GitHub links
    url = f"https://hub.docker.com/v2/repositories/{namespace}/{name}"
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return None
        data = resp.json()
    except (requests.RequestException, ValueError):
        return None

    text = (data.get("full_description") or "") + " " + (data.get("description") or "")
    for match in re.finditer(r"https?://github\.com/([\w.-]+)/([\w.-]+)", text):
        # The character class also swallows sentence punctuation and clone-URL
        # suffixes ("…/repo." or "…/repo.git"); trim them to get the real name.
        repo = match.group(2).rstrip(".").removesuffix(".git")
        if repo:
            return match.group(1), repo

    return None
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass, field
2
+
3
+
4
@dataclass
class ImageRef:
    """Components of a Docker image reference after parsing."""

    registry: str   # registry host, e.g. "ghcr.io", "docker.io"
    namespace: str  # owner part, e.g. "myorg", "library"
    name: str       # image name, e.g. "excalidraw", "postgres"
    tag: str        # tag, e.g. "v3.4.12", "latest"
    raw: str        # the image string exactly as found in docker-compose

    @property
    def full_name(self) -> str:
        """Return ``registry/namespace/name`` (the reference minus its tag)."""
        return "/".join((self.registry, self.namespace, self.name))
17
+
18
+
19
+ @dataclass
20
+ class ImageResult:
21
+ """Resolution result for a single image."""
22
+ service: str
23
+ image: str # original image string
24
+ registry: str
25
+ repo: str | None = None
26
+ tag: str = ""
27
+ commit: str | None = None
28
+ commit_url: str | None = None
29
+ status: str = "repo_not_found"
30
+ resolution_method: str | None = None
31
+ confidence: str | None = None # "exact", "approximate", or None if unresolved
32
+ steps: list[str] = field(default_factory=list)
@@ -0,0 +1,33 @@
1
+ import json
2
+ from dataclasses import asdict
3
+ from io import StringIO
4
+ from rich.console import Console
5
+ from rich.table import Table
6
+ from code_provenance.models import ImageResult
7
+
8
+
9
+ def format_json(results: list[ImageResult]) -> str:
10
+ """Format results as a JSON array."""
11
+ return json.dumps([asdict(r) for r in results], indent=2)
12
+
13
+
14
def format_table(results: list[ImageResult]) -> str:
    """Render the results as a rich table and return the rendered text.

    Missing repo, commit or confidence values are shown as "-", commits are
    cut to 12 characters and the "https://" scheme is dropped from the repo.
    """
    table = Table(show_header=True, header_style="bold")
    for heading in ("SERVICE", "IMAGE", "REPO", "COMMIT", "STATUS", "CONFIDENCE"):
        table.add_column(heading)

    for item in results:
        table.add_row(
            item.service,
            item.image,
            item.repo.replace("https://", "") if item.repo else "-",
            item.commit[:12] if item.commit else "-",
            item.status,
            item.confidence or "-",
        )

    # Render into an in-memory buffer with a fixed width instead of a terminal.
    sink = StringIO()
    Console(file=sink, force_terminal=False, width=160).print(table)
    return sink.getvalue()
@@ -0,0 +1,121 @@
1
+ import requests
2
+ from code_provenance.models import ImageRef
3
+
4
+
5
+ def get_registry_token(registry: str, repo_path: str) -> str | None:
6
+ """Get an anonymous pull token from an OCI registry."""
7
+ if registry == "ghcr.io":
8
+ url = "https://ghcr.io/token"
9
+ params = {"scope": f"repository:{repo_path}:pull"}
10
+ elif registry == "docker.io":
11
+ url = "https://auth.docker.io/token"
12
+ params = {
13
+ "service": "registry.docker.io",
14
+ "scope": f"repository:{repo_path}:pull",
15
+ }
16
+ else:
17
+ return None
18
+
19
+ try:
20
+ resp = requests.get(url, params=params, timeout=10)
21
+ resp.raise_for_status()
22
+ return resp.json()["token"]
23
+ except (requests.RequestException, KeyError):
24
+ return None
25
+
26
+
27
def _registry_base_url(registry: str) -> str:
    """Return the HTTPS base URL of the registry API.

    ``docker.io`` is served from ``registry-1.docker.io``; every other
    registry name is used as the host unchanged.
    """
    host = "registry-1.docker.io" if registry == "docker.io" else registry
    return f"https://{host}"
31
+
32
+
33
# Media types that denote a multi-platform index / manifest list.
_INDEX_MEDIA_TYPES = {
    "application/vnd.docker.distribution.manifest.list.v2+json",
    "application/vnd.oci.image.index.v1+json",
}

# Accept header for manifest requests: single-image manifests first, then
# the index types.
_MANIFEST_ACCEPT = ", ".join((
    "application/vnd.docker.distribution.manifest.v2+json",
    "application/vnd.oci.image.manifest.v1+json",
    "application/vnd.docker.distribution.manifest.list.v2+json",
    "application/vnd.oci.image.index.v1+json",
))
44
+
45
+
46
def _resolve_manifest_to_config_digest(
    base_url: str, repo_path: str, reference: str, token: str,
) -> str | None:
    """Resolve a manifest reference to a config blob digest, handling multi-arch indexes.

    Args:
        base_url: Registry API base URL.
        repo_path: Repository path inside the registry (``namespace/name``).
        reference: Tag or digest of the manifest to fetch.
        token: Bearer token for the registry.

    Returns:
        The config blob digest, or ``None`` when the manifest cannot be
        fetched or decoded, or contains no usable entry.
    """
    headers = {"Authorization": f"Bearer {token}", "Accept": _MANIFEST_ACCEPT}

    try:
        resp = requests.get(
            f"{base_url}/v2/{repo_path}/manifests/{reference}",
            headers=headers,
            timeout=10,
        )
        if resp.status_code != 200:
            return None
        data = resp.json()
    except (requests.RequestException, KeyError, ValueError):
        return None
    if not isinstance(data, dict):
        return None

    media_type = data.get("mediaType")
    manifests = data.get("manifests")
    # The "mediaType" field is not always present in an OCI index, so a
    # "manifests" list is also accepted as evidence of an index.
    is_index = (
        isinstance(media_type, str) and media_type in _INDEX_MEDIA_TYPES
    ) or isinstance(manifests, list)

    if not is_index:
        # Single manifest — extract config digest
        return (data.get("config") or {}).get("digest")

    # Keep only real platform manifests that carry a digest.
    candidates: list[tuple[dict, str]] = []
    for entry in manifests or []:
        if not isinstance(entry, dict):
            continue
        platform = entry.get("platform") or {}
        # Skip attestation manifests
        if platform.get("os") == "unknown":
            continue
        digest = entry.get("digest")
        if digest:
            candidates.append((platform, digest))

    if not candidates:
        return None

    # Prefer the first linux/amd64 manifest, else the first usable one.
    platform_digest = next(
        (
            digest
            for platform, digest in candidates
            if platform.get("architecture") == "amd64" and platform.get("os") == "linux"
        ),
        candidates[0][1],
    )

    # Recursively resolve the platform-specific manifest
    return _resolve_manifest_to_config_digest(base_url, repo_path, platform_digest, token)
91
+
92
+
93
def fetch_oci_labels(ref: ImageRef) -> dict[str, str]:
    """Read the labels from an image's config blob via the registry API.

    Obtains a pull token, resolves ``ref.tag`` to a config digest and
    downloads only that blob. Returns an empty dict when any step fails or
    the config carries no labels.
    """
    repo_path = f"{ref.namespace}/{ref.name}"
    token = get_registry_token(ref.registry, repo_path)
    if not token:
        return {}

    base_url = _registry_base_url(ref.registry)
    config_digest = _resolve_manifest_to_config_digest(base_url, repo_path, ref.tag, token)
    if not config_digest:
        return {}

    # Download the config blob, which holds the labels.
    blob_url = f"{base_url}/v2/{repo_path}/blobs/{config_digest}"
    try:
        config_resp = requests.get(
            blob_url,
            headers={
                "Authorization": f"Bearer {token}",
                "Accept": "application/vnd.docker.container.image.v1+json",
            },
            timeout=10,
            allow_redirects=True,
        )
        if config_resp.status_code == 200:
            image_config = config_resp.json().get("config", {})
            return image_config.get("Labels", {}) or {}
    except (requests.RequestException, KeyError):
        pass
    return {}
@@ -0,0 +1,160 @@
1
+ import re
2
+ from code_provenance.models import ImageRef, ImageResult
3
+ from code_provenance.registry import fetch_oci_labels
4
+ from code_provenance.github import (
5
+ resolve_tag_to_commit, infer_repo_from_dockerhub,
6
+ resolve_ghcr_digest_via_packages, resolve_ghcr_latest_via_packages,
7
+ get_latest_release_commit, get_latest_commit,
8
+ )
9
+
10
# A tag made only of 40 or more lowercase hex characters is taken as a commit SHA.
_COMMIT_SHA_RE = re.compile(r"^[0-9a-f]{40,}$")
# A "sha256:" prefix followed by exactly 64 lowercase hex characters.
_DIGEST_RE = re.compile(r"^sha256:[0-9a-f]{64}$")


def _is_resolvable_tag(tag: str) -> bool:
    """Tell whether ``tag`` is worth matching against git tags.

    Empty tags, ``latest`` and sha256 digests are not.
    """
    if not tag or tag == "latest":
        return False
    return _DIGEST_RE.match(tag) is None
17
+
18
+
19
def _mark_resolved(
    result: ImageResult, commit: str, method: str, confidence: str | None,
) -> ImageResult:
    """Record a successful resolution on ``result``; its ``repo`` must already be set."""
    result.commit = commit
    result.commit_url = f"{result.repo}/commit/{commit}"
    result.status = "resolved"
    result.resolution_method = method
    result.confidence = confidence
    return result


def resolve_image(service: str, ref: ImageRef) -> ImageResult:
    """Resolve one image reference to a source commit.

    Strategies are tried in order, and each appends a line to
    ``result.steps``:

    1. OCI ``source``/``revision`` labels on the image.
    2. Repo inference, then a tag that is itself a commit SHA.
    3. Matching a versioned tag against the repo's git tags.
    4. For ghcr.io digests or ``latest``: the GitHub Packages API.
    5. For ``latest``: the latest GitHub release, then the latest commit.

    Returns:
        An ``ImageResult`` whose ``status`` is ``"resolved"``,
        ``"repo_not_found"``, ``"repo_found_tag_not_matched"`` or ``"no_tag"``.
    """
    outcome = ImageResult(
        service=service,
        image=ref.raw,
        registry=ref.registry,
        tag=ref.tag,
    )
    log = outcome.steps.append
    image_path = f"{ref.registry}/{ref.namespace}/{ref.name}"
    wants_latest = ref.tag == "latest" or not ref.tag

    # Step 1: labels baked into the image.
    log(f"[1/5] Fetching OCI labels from {image_path}:{ref.tag}")
    labels = fetch_oci_labels(ref)
    source = labels.get("org.opencontainers.image.source")
    revision = labels.get("org.opencontainers.image.revision")
    if source and revision:
        log(f"[1/5] Found OCI labels: source={source}, revision={revision[:12]}")
        outcome.repo = source
        return _mark_resolved(
            outcome,
            revision,
            "oci_labels",
            "approximate" if ref.tag == "latest" else "exact",
        )
    log("[1/5] No OCI labels found")

    # Step 2: work out which GitHub repo the image comes from.
    log(f"[2/5] Inferring GitHub repo from {image_path}")
    owner, repo_name = _infer_repo(ref)
    if not (owner and repo_name):
        log("[2/5] Could not infer GitHub repo")
        outcome.status = "repo_not_found"
        return outcome
    log(f"[2/5] Repo inferred: {owner}/{repo_name}")
    outcome.repo = f"https://github.com/{owner}/{repo_name}"

    # A tag that is already a commit SHA needs no lookup.
    if _COMMIT_SHA_RE.match(ref.tag):
        log("[2/5] Tag is a commit SHA, using directly")
        return _mark_resolved(outcome, ref.tag, "commit_sha_tag", "exact")

    # Step 3: versioned tags are matched against git tags; this step always returns.
    if _is_resolvable_tag(ref.tag):
        log(f'[3/5] Matching tag "{ref.tag}" against git tags in {owner}/{repo_name}')
        tag_result = resolve_tag_to_commit(owner, repo_name, ref.tag)
        if not tag_result:
            log("[3/5] No matching git tag found")
            outcome.status = "repo_found_tag_not_matched"
            return outcome
        commit_sha, is_exact = tag_result
        if is_exact:
            log(f"[3/5] Exact tag match: {commit_sha[:12]}")
        else:
            log(f"[3/5] Prefix match (e.g. v2.10 -> v2.10.x): {commit_sha[:12]}")
        return _mark_resolved(
            outcome, commit_sha, "tag_match", "exact" if is_exact else "approximate",
        )

    # Step 4: GHCR digests and :latest go through the packages API.
    if ref.registry == "ghcr.io":
        pkg_result = None
        pkg_confidence = None
        if _DIGEST_RE.match(ref.tag):
            log(f"[4/5] Trying GHCR packages API for digest {ref.tag[:20]}...")
            pkg_result = resolve_ghcr_digest_via_packages(ref.namespace, ref.name, ref.tag)
            pkg_confidence = "exact"  # digest is immutable
        elif wants_latest:
            log("[4/5] Trying GHCR packages API for :latest tag")
            pkg_result = resolve_ghcr_latest_via_packages(ref.namespace, ref.name)
            pkg_confidence = "approximate"  # :latest is mutable

        if pkg_result:
            repo_full = pkg_result["repo"]
            outcome.repo = f"https://github.com/{repo_full}"
            commit = pkg_result.get("commit")
            if commit:
                log(f"[4/5] Packages API: repo={repo_full}, commit={commit[:12]}")
                return _mark_resolved(outcome, commit, "packages_api", pkg_confidence)
            log("[4/5] Packages API: found package but no resolvable tags")
            has_versioned_tag = any(t != "latest" for t in pkg_result.get("tags", []))
            outcome.status = "repo_found_tag_not_matched" if has_versioned_tag else "no_tag"
            return outcome

    # Step 5: :latest on any registry falls back to the latest release, then
    # to the latest commit. The owner and repo are known to be set here.
    if wants_latest:
        log(f"[5/5] Trying latest GitHub release for {owner}/{repo_name}")
        release_result = get_latest_release_commit(owner, repo_name)
        if release_result:
            commit_sha, tag_name = release_result
            log(f"[5/5] Latest release: tag={tag_name}, commit={commit_sha[:12]}")
            return _mark_resolved(outcome, commit_sha, "latest_release", "approximate")

        # No release available.
        log("[5/5] No release found, trying latest commit on default branch")
        latest_sha = get_latest_commit(owner, repo_name)
        if latest_sha:
            log(f"[5/5] Latest commit: {latest_sha[:12]}")
            return _mark_resolved(outcome, latest_sha, "latest_commit", "approximate")

    log("[5/5] Could not resolve to a commit")
    outcome.status = "no_tag"
    return outcome
148
+
149
+
150
+ def _infer_repo(ref: ImageRef) -> tuple[str | None, str | None]:
151
+ """Infer the GitHub owner and repo name from an image reference."""
152
+ if ref.registry == "ghcr.io":
153
+ return ref.namespace, ref.name
154
+
155
+ if ref.registry == "docker.io":
156
+ hub_result = infer_repo_from_dockerhub(ref.namespace, ref.name)
157
+ if hub_result:
158
+ return hub_result
159
+
160
+ return None, None
@@ -0,0 +1,122 @@
1
+ Metadata-Version: 2.4
2
+ Name: code-provenance
3
+ Version: 0.1.0
4
+ Summary: Resolve Docker images to their source code commits on GitHub
5
+ Author: SCRT Labs
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/scrtlabs/code-provenance
8
+ Project-URL: Repository, https://github.com/scrtlabs/code-provenance
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Topic :: Software Development :: Build Tools
16
+ Classifier: Topic :: System :: Software Distribution
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: pyyaml>=6.0
20
+ Requires-Dist: requests>=2.31
21
+ Requires-Dist: rich>=13.0
22
+ Provides-Extra: dev
23
+ Requires-Dist: pytest>=7.0; extra == "dev"
24
+
25
+ # code-provenance
26
+
27
+ Resolve Docker images in a docker-compose file to their exact source code commits on GitHub.
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install code-provenance
33
+ ```
34
+
35
+ Requires Python 3.10+.
36
+
37
+ ## CLI Usage
38
+
39
+ ```bash
40
+ code-provenance [compose-file] [--json] [--verbose]
41
+ ```
42
+
43
+ - `compose-file` -- path to a docker-compose file (default: `docker-compose.yml`)
44
+ - `--json` -- output results as JSON
45
+ - `--verbose`, `-v` -- show resolution steps for each image
46
+
47
+ ### Example
48
+
49
+ ```bash
50
+ code-provenance docker-compose.yml
51
+ ```
52
+
53
+ ```
54
+ ┌─────────┬────────────────┬────────────────────────────┬──────────────┬──────────┬────────────┐
55
+ │ SERVICE │ IMAGE │ REPO │ COMMIT │ STATUS │ CONFIDENCE │
56
+ ├─────────┼────────────────┼────────────────────────────┼──────────────┼──────────┼────────────┤
57
+ │ web │ traefik:v3.6.0 │ github.com/traefik/traefik │ 06db5168c0d9 │ resolved │ exact │
58
+ └─────────┴────────────────┴────────────────────────────┴──────────────┴──────────┴────────────┘
59
+ ```
60
+
61
+ ## Library Usage
62
+
63
+ ```python
64
+ from code_provenance.compose_parser import parse_compose, parse_image_ref
65
+ from code_provenance.resolver import resolve_image
66
+
67
+ yaml_content = open("docker-compose.yml").read()
68
+ for service, image in parse_compose(yaml_content):
69
+ ref = parse_image_ref(image)
70
+ result = resolve_image(service, ref)
71
+ print(f"{result.service}: {result.commit} ({result.confidence})")
72
+ ```
73
+
74
+ ## API Reference
75
+
76
+ ### Functions
77
+
78
+ - `parse_compose(yaml_content: str) -> list[tuple[str, str]]` -- parse a docker-compose YAML string and return `(service_name, image_string)` pairs
79
+ - `parse_image_ref(image: str) -> ImageRef` -- parse a Docker image string into its components
80
+ - `resolve_image(service: str, ref: ImageRef) -> ImageResult` -- resolve an image reference to its source code commit
81
+
82
+ ### ImageRef
83
+
84
+ | Field | Type | Description |
85
+ |-------|------|-------------|
86
+ | `registry` | `str` | e.g. `"ghcr.io"`, `"docker.io"` |
87
+ | `namespace` | `str` | e.g. `"myorg"`, `"library"` |
88
+ | `name` | `str` | e.g. `"traefik"`, `"postgres"` |
89
+ | `tag` | `str` | e.g. `"v3.6.0"`, `"latest"` |
90
+ | `raw` | `str` | original image string from docker-compose |
91
+
92
+ ### ImageResult
93
+
94
+ | Field | Type | Description |
95
+ |-------|------|-------------|
96
+ | `service` | `str` | service name from docker-compose |
97
+ | `image` | `str` | original image string |
98
+ | `registry` | `str` | image registry |
99
+ | `repo` | `str \| None` | GitHub repository URL |
100
+ | `tag` | `str` | image tag |
101
+ | `commit` | `str \| None` | resolved commit SHA |
102
+ | `commit_url` | `str \| None` | URL to the commit on GitHub |
103
+ | `status` | `str` | `"resolved"`, `"repo_not_found"`, `"repo_found_tag_not_matched"`, or `"no_tag"` |
104
+ | `resolution_method` | `str \| None` | how the commit was resolved: `"oci_labels"`, `"commit_sha_tag"`, `"tag_match"`, `"packages_api"`, `"latest_release"`, or `"latest_commit"`; `None` if unresolved |
105
+ | `confidence` | `str \| None` | `"exact"` or `"approximate"`; `None` if unresolved |
106
+ | `steps` | `list[str]` | resolution steps taken (useful with `--verbose`) |
107
+
108
+ ## Authentication
109
+
110
+ Set `GITHUB_TOKEN` for full functionality (digest resolution, `:latest` on GHCR, higher rate limits):
111
+
112
+ ```bash
113
+ export GITHUB_TOKEN=ghp_your_token_here
114
+ ```
115
+
116
+ Create a classic token at https://github.com/settings/tokens with `read:packages` scope. If using the `gh` CLI, run `gh auth refresh -h github.com -s read:packages` first.
117
+
118
+ The `run.sh` wrapper auto-detects the token from `gh` CLI if available.
119
+
120
+ ## License
121
+
122
+ MIT
@@ -0,0 +1,13 @@
1
+ code_provenance/__init__.py,sha256=QXuTuB3BUwFyeHy-1C344W90AwTq_sylBVcfoq-eGuw,57
2
+ code_provenance/cli.py,sha256=mv1HbSP1NmaN08Pr-_8-jHyKo88kTLnoVAPWXdAn66Q,2061
3
+ code_provenance/compose_parser.py,sha256=KYi8rQsKHgjHxAKCX9lE-aXjnMmhuaXBeCnpfG31TaU,1747
4
+ code_provenance/github.py,sha256=TdUH7d__CaC_5si-APzGH9SnSdA0eQcpF3HpXonx-8A,8721
5
+ code_provenance/models.py,sha256=Ajst43R3AMTlGSZ8m03LoUjJukr5NdL-86u-qQdWB9E,1048
6
+ code_provenance/output.py,sha256=ntIlGAQIJRePpX9T4jEAZzANjL20opogRWUBFUUhTyM,1143
7
+ code_provenance/registry.py,sha256=1Y0H7sxZOXIy9fpQRzVF2fUVgECcd1W1xI6VBx7QZ1U,4195
8
+ code_provenance/resolver.py,sha256=MD23eZFm9VZ0ftqzNBzMkzKS4Ggo3OG8CSAyUErsiYE,7084
9
+ code_provenance-0.1.0.dist-info/METADATA,sha256=GjZskUG-1V5UutExejROe1Q59giRfsX65W2IJRkg5mE,4885
10
+ code_provenance-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
11
+ code_provenance-0.1.0.dist-info/entry_points.txt,sha256=iAIV7fGMBwmskCD1Vn15yrQt4VM52di9xjYf3cPC4jI,61
12
+ code_provenance-0.1.0.dist-info/top_level.txt,sha256=TJ3A2kuZ0yVRV3rQYBZ6-1ILu_umwN8m7lmdYnhT2ak,16
13
+ code_provenance-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ code-provenance = code_provenance.cli:main
@@ -0,0 +1 @@
1
+ code_provenance