osslag 1.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- osslag/__init__.py +0 -0
- osslag/cli.py +1380 -0
- osslag/distro/__init__.py +0 -0
- osslag/distro/debian.py +382 -0
- osslag/distro/fedora.py +38 -0
- osslag/metrics/__init__.py +0 -0
- osslag/metrics/malta.py +585 -0
- osslag/metrics/pvac.py +166 -0
- osslag/utils/__init__.py +0 -0
- osslag/utils/github_helper.py +240 -0
- osslag/utils/vcs.py +543 -0
- osslag-1.0.0.dist-info/METADATA +46 -0
- osslag-1.0.0.dist-info/RECORD +15 -0
- osslag-1.0.0.dist-info/WHEEL +4 -0
- osslag-1.0.0.dist-info/entry_points.txt +3 -0
osslag/distro/debian.py
ADDED
@@ -0,0 +1,382 @@
from __future__ import annotations

import io
import logging
import lzma
import os
from typing import Any

import pandas as pd
from debian import deb822  # from python-debian
from debian.debian_support import Version
from pandas._libs.missing import NAType  # pyright: ignore[reportPrivateImportUsage]

import osslag.utils.vcs as gh

debian_packages_source_url_template = (
    "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"
)
logger = logging.getLogger(__name__)


def extract_upstream_version(version_string: str) -> str | None:
    """Extract the upstream version from a Debian package version string using
    the official debian.debian_support.Version parser.

    Debian version format: [epoch:]upstream_version[-debian_revision]

    Args:
        version_string: A Debian package version string

    Returns:
        The upstream version, or None if the input is invalid

    Examples:
        >>> extract_upstream_version("1.2.3-4")
        '1.2.3'
        >>> extract_upstream_version("2:1.2.3-4")
        '1.2.3'
        >>> extract_upstream_version("1.2.3")
        '1.2.3'
        >>> extract_upstream_version("1:2.0")
        '2.0'

    """
    if not version_string or not isinstance(version_string, str):
        return None

    try:
        version = Version(version_string.strip())
        upstream = version.upstream_version

        if isinstance(upstream, str):
            # Drop Debian repack/metadata suffixes (e.g., +dfsg, +gitYYYY..., +ds)
            if "+" in upstream:
                upstream = upstream.split("+", 1)[0]

            # Drop prerelease-style suffixes that use '~' (e.g., ~rc1)
            if "~" in upstream:
                upstream = upstream.split("~", 1)[0]

        upstream = upstream.strip() if isinstance(upstream, str) else upstream
        return upstream if upstream else None
    except (ValueError, AttributeError):
        return None


def add_upstream_version_column(
    df: pd.DataFrame, version_column: str, new_column_name: str | None = None
) -> pd.DataFrame:
    """Extract upstream version for each row in a DataFrame and add it as a new column.

    Args:
        df: DataFrame containing version strings
        version_column: Name of the column containing Debian version strings
        new_column_name: Name for the new column (default: "{version_column}_upstream")

    Returns:
        DataFrame with the new upstream version column added

    Raises:
        ValueError: If the specified version_column doesn't exist in the DataFrame

    Examples:
        >>> df = pd.DataFrame(
        ...     {"source": ["pkg1", "pkg2"], "version": ["1.2.3-4", "2:1.0-1"]}
        ... )
        >>> result = add_upstream_version_column(df, "version")
        >>> result["version_upstream"].tolist()
        ['1.2.3', '1.0']

    """
    if version_column not in df.columns:
        raise ValueError(f"Column '{version_column}' not found in DataFrame")

    # Determine the new column name
    if new_column_name is None:
        new_column_name = f"{version_column}_upstream"

    # Apply the extraction function to each row
    df = df.copy()
    df[new_column_name] = df[version_column].apply(extract_upstream_version)

    return df


def add_local_repo_cache_path_column(
    df: pd.DataFrame,
    repo_url_column: str = "homepage",
    cache_dir: str | os.PathLike = "./cache",
    new_column_name: str = "repo_cache_path",
) -> pd.DataFrame:
    """Add a column to the DataFrame with the local repository cache path for each repository URL.

    Args:
        df: DataFrame containing repository URLs
        repo_url_column: Name of the column containing repository URLs
        cache_dir: Base cache directory (default: "./cache")
        new_column_name: Name for the new column (default: "repo_cache_path")

    Returns:
        DataFrame with the new repository cache path column added

    Raises:
        ValueError: If the specified repo_url_column doesn't exist in the DataFrame

    Examples:
        >>> df = pd.DataFrame(
        ...     {
        ...         "source": ["pkg1", "pkg2"],
        ...         "repo_url": [
        ...             "https://github.com/owner/repo1",
        ...             "https://github.com/owner/repo2",
        ...         ],
        ...     }
        ... )
        >>> result = add_local_repo_cache_path_column(
        ...     df, "repo_url", cache_dir="./cache"
        ... )
        >>> "repo_cache_path" in result.columns
        True

    """
    if repo_url_column not in df.columns:
        raise ValueError(f"Column '{repo_url_column}' not found in DataFrame")

    if new_column_name is None:
        new_column_name = "repo_cache_path"

    df = df.copy()

    def _get_cache_path(url: Any) -> str | NAType:
        if (
            url is None
            or (isinstance(url, float) and pd.isna(url))
            or not isinstance(url, str)
        ):
            return pd.NA
        path = gh.construct_repo_local_path(url, cache_dir)
        return str(path) if path is not None else pd.NA

    df[new_column_name] = df[repo_url_column].map(_get_cache_path)
    return df


def filter_github_repos(df: pd.DataFrame) -> pd.DataFrame:
    """Filter DataFrame to only include rows with valid GitHub repository URLs.

    Normalizes all GitHub URLs using normalize_https_repo_url and updates the
    homepage column with the cleaned URLs. Rows with invalid or non-GitHub URLs
    are excluded.

    Args:
        df: DataFrame containing a 'homepage' column with repository URLs

    Returns:
        DataFrame filtered to valid GitHub repos with normalized homepage URLs

    """
    if "homepage" not in df.columns:
        raise ValueError(
            "DataFrame must contain 'homepage' column to filter GitHub repositories."
        )

    # First filter to rows containing github.com
    mask = df["homepage"].str.contains("github.com", na=False)
    filtered_df = df[mask].copy()

    # Normalize all GitHub URLs
    normalized = filtered_df["homepage"].apply(gh.normalize_https_repo_url)

    # Extract the normalized URL (or None if invalid)
    filtered_df["homepage"] = normalized.apply(lambda r: r.url)

    # Drop rows where normalization failed (url is None)
    filtered_df = filtered_df[filtered_df["homepage"].notna()]

    # Drop duplicates based on normalized homepage
    github_repos_df = filtered_df.drop_duplicates(subset=["homepage"], keep="first")

    return github_repos_df


def validate_merge_safety(
    df1: pd.DataFrame, df2: pd.DataFrame, merge_key: str = "source"
) -> tuple[bool, list[str]]:
    """Validate if two DataFrames can be safely merged.

    Returns:
        (is_safe, warnings) tuple where is_safe is True if no critical issues found

    """
    warnings = []
    is_safe = True
    logger.info(f"Validating merge safety on key '{merge_key}'")
    # Check if merge key exists in both
    if merge_key not in df1.columns:
        warnings.append(f"Merge key '{merge_key}' missing in first DataFrame")
        is_safe = False
    if merge_key not in df2.columns:
        warnings.append(f"Merge key '{merge_key}' missing in second DataFrame")
        is_safe = False

    if not is_safe:
        return is_safe, warnings

    # Check for duplicates in merge key
    df1_dupes = df1[merge_key].duplicated().sum()
    df2_dupes = df2[merge_key].duplicated().sum()

    if df1_dupes > 0:
        warnings.append(
            f"First DataFrame has {df1_dupes} duplicate '{merge_key}' values"
        )
    if df2_dupes > 0:
        warnings.append(
            f"Second DataFrame has {df2_dupes} duplicate '{merge_key}' values"
        )

    # Check overlapping columns and their dtypes
    common_cols = set(df1.columns) & set(df2.columns) - {merge_key}

    for col in common_cols:
        if df1[col].dtype != df2[col].dtype:
            warnings.append(
                f"Column '{col}' has different dtypes: {df1[col].dtype} vs {df2[col].dtype}"
            )

    # Check merge key overlap
    overlap = set(df1[merge_key]) & set(df2[merge_key])
    overlap_pct = len(overlap) / max(len(df1), len(df2)) * 100

    if overlap_pct < 10:
        warnings.append(f"Low overlap: only {overlap_pct:.1f}% of keys match")

    return is_safe, warnings


def merge_release_packages(
    dfs: list[pd.DataFrame],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Merge multiple Debian package DataFrames from different releases on the 'source' column.

    Args:
        dfs: List of DataFrames to merge (expects exactly 2 DataFrames)

    Returns:
        Tuple of (merged_df, unmerged_df) where:
        - merged_df contains rows that matched on 'source' column
        - unmerged_df contains rows that didn't match (from either dataframe)

    """
    if len(dfs) != 2:
        raise ValueError(f"Expected exactly 2 DataFrames, got {len(dfs)}")

    df1, df2 = dfs[0], dfs[1]

    # drop redundant columns before merge
    redundant_columns = set(df1.columns) & set(df2.columns) - {"source"}
    df2 = df2.drop(columns=list(redundant_columns))
    logger.info(
        f"Dropped redundant columns from second DataFrame before merge: {redundant_columns}"
    )

    # Validate merge safety before proceeding
    is_safe, merge_warnings = validate_merge_safety(df1, df2, merge_key="source")

    if not is_safe:
        error_msg = "Cannot safely merge DataFrames:\n - " + "\n - ".join(
            merge_warnings
        )
        raise ValueError(error_msg)

    if merge_warnings:
        for warning in merge_warnings:
            logger.warning(f"Merge validation: {warning}")

    # Merge on 'source' column with indicator to track merge status
    merged_df = pd.merge(
        df1, df2, on="source", how="outer", indicator=True, suffixes=("_left", "_right")
    )

    # Separate matched and unmatched rows
    matched = merged_df[merged_df["_merge"] == "both"].copy()
    unmatched = merged_df[merged_df["_merge"].isin(["left_only", "right_only"])].copy()

    # Remove the merge indicator column
    matched = matched.drop(columns=["_merge"])
    unmatched = unmatched.drop(columns=["_merge"])

    # Handle _left/_right column pairs: check if they match, then consolidate
    left_cols = [col for col in matched.columns if col.endswith("_left")]

    for left_col in left_cols:
        base_name = left_col[:-5]  # Remove '_left' suffix
        right_col = base_name + "_right"

        if right_col in matched.columns:
            # Check if columns match (ignoring NaN values)
            mismatches = matched[left_col] != matched[right_col]
            # Account for NaN != NaN being True
            both_nan = matched[left_col].isna() & matched[right_col].isna()
            actual_mismatches = mismatches & ~both_nan

            if actual_mismatches.any():
                mismatch_count = actual_mismatches.sum()
                logger.warning(
                    f"Column '{base_name}' has {mismatch_count} mismatches between releases"
                )

            # Keep left column and rename it, drop right column
            matched = matched.rename(columns={left_col: base_name})
            matched = matched.drop(columns=[right_col])

    matched = matched.rename(columns={"homepage": "upstream_repo_url"})
    unmatched = unmatched.rename(columns={"homepage": "upstream_repo_url"})

    return matched, unmatched


def fetch_packages(release: str) -> pd.DataFrame | None:
    packages_url = debian_packages_source_url_template.format(release=release)
    xz_bytes = gh.fetch_file(packages_url)
    if xz_bytes is None:
        logger.error(f"Failed to fetch Sources.xz for release {release}")
        return None
    # Now xz_bytes contains the raw .xz data of the Sources file
    data = lzma.decompress(xz_bytes)

    # A deb822 Sources file may contain multiple stanzas
    buf = io.BytesIO(data)
    items = []
    stanza = b""
    for line in buf.readlines():
        if line.strip() == b"":
            if stanza.strip():
                items.append(stanza)
            stanza = b""
        else:
            stanza += line
    if stanza.strip():
        items.append(stanza)
    rows = []
    for st in items:
        try:
            d = deb822.Deb822(st.decode("utf-8", "ignore"))
            rows.append(
                {
                    "source": d.get("Package") or d.get("Source"),
                    f"{release}_version": d.get("Version"),
                    "homepage": d.get("Homepage"),
                    "depends": d.get("Depends"),
                    "maintainer": d.get("Maintainer"),
                }
            )
        except Exception:
            # Log parse failures to error log for visibility
            try:
                logger.error(f"Failed to parse stanza:\n{st.decode('utf-8', 'ignore')}")
            except Exception:
                pass
            continue
    return pd.DataFrame(rows)
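For orientation, the sketch below shows one way the helpers in this module could be chained. It is not part of the package and not the osslag CLI flow; the release names, the ./cache directory, and the rename before filter_github_repos are illustrative assumptions.

# Hypothetical usage sketch (not shipped with the package); release names and
# cache location are examples only.
import logging

from osslag.distro import debian

logging.basicConfig(level=logging.INFO)

releases = ["bookworm", "trixie"]  # example Debian release names
frames = [debian.fetch_packages(r) for r in releases]

if all(df is not None for df in frames):
    # Add a parsed upstream version next to each release's Debian version column
    # (fetch_packages names that column "<release>_version").
    frames = [
        debian.add_upstream_version_column(df, f"{r}_version")
        for df, r in zip(frames, releases)
    ]

    # Join the two releases on the shared 'source' name; merge_release_packages
    # renames 'homepage' to 'upstream_repo_url' in its outputs.
    matched, unmatched = debian.merge_release_packages(frames)

    # filter_github_repos expects a 'homepage' column, so rename it back here.
    github_df = debian.filter_github_repos(
        matched.rename(columns={"upstream_repo_url": "homepage"})
    )

    # Attach local clone paths under ./cache for later VCS analysis.
    github_df = debian.add_local_repo_cache_path_column(
        github_df, repo_url_column="homepage", cache_dir="./cache"
    )
    print(github_df.head())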
osslag/distro/fedora.py
ADDED
@@ -0,0 +1,38 @@
from __future__ import annotations

import os

import pandas as pd
import requests

FEDORA_DISTGIT = os.environ.get("FEDORA_DISTGIT", "https://src.fedoraproject.org")


def list_rpms(
    namespace: str = "rpms", page: int = 1, per_page: int = 100
) -> list[dict]:
    # Pagure API: https://src.fedoraproject.org/api/0/projects?namespace=rpms
    url = f"{FEDORA_DISTGIT}/api/0/projects?namespace={namespace}&page={page}&per_page={per_page}"
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.json().get("projects", [])


def fetch_all_rpms(max_pages: int = 200) -> pd.DataFrame:
    rows = []
    for p in range(1, max_pages + 1):
        items = list_rpms(page=p)
        if not items:
            break
        for prj in items:
            rows.append(
                {
                    "pkg_name": prj.get("name"),
                    "fullname": prj.get("fullname"),
                    "url": prj.get("url"),
                    "summary": prj.get("summary"),
                    "upstream_url": prj.get("upstream_url"),
                    "namespace": prj.get("namespace"),
                }
            )
    return pd.DataFrame(rows)
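Similarly, a minimal sketch of pulling the Fedora dist-git listing into a DataFrame; the small page limit and the upstream_url filter are illustrative assumptions, not packaged behaviour.

# Hypothetical usage sketch (not shipped with the package); the small max_pages
# value only keeps the example fast.
from osslag.distro import fedora

# Page through the Pagure /api/0/projects listing for the "rpms" namespace.
rpms_df = fedora.fetch_all_rpms(max_pages=5)

# Projects that declare an upstream_url are candidates for matching against
# upstream repositories, analogous to the Debian Homepage field above.
with_upstream = rpms_df[rpms_df["upstream_url"].notna()]
print(f"{len(rpms_df)} projects fetched, {len(with_upstream)} with an upstream URL")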