osslag-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,382 @@
from __future__ import annotations

import io
import logging
import lzma
import os
from typing import Any

import pandas as pd
from debian import deb822  # from python-debian
from debian.debian_support import Version
from pandas._libs.missing import NAType  # pyright: ignore[reportPrivateImportUsage]

import osslag.utils.vcs as gh

debian_packages_source_url_template = (
    "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"
)
logger = logging.getLogger(__name__)

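# Illustrative sketch, not part of the packaged module: the template above
# resolves to a concrete Sources index URL for a release codename
# ("bookworm" here is just an example value).
_example_sources_url = debian_packages_source_url_template.format(release="bookworm")
# -> "https://ftp.debian.org/debian/dists/bookworm/main/source/Sources.xz"
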
def extract_upstream_version(version_string: str) -> str | None:
    """Extract the upstream version from a Debian package version string using
    the official debian.debian_support.Version parser.

    Debian version format: [epoch:]upstream_version[-debian_revision]

    Args:
        version_string: A Debian package version string

    Returns:
        The upstream version, or None if the input is invalid

    Examples:
        >>> extract_upstream_version("1.2.3-4")
        '1.2.3'
        >>> extract_upstream_version("2:1.2.3-4")
        '1.2.3'
        >>> extract_upstream_version("1.2.3")
        '1.2.3'
        >>> extract_upstream_version("1:2.0")
        '2.0'

    """
    if not version_string or not isinstance(version_string, str):
        return None

    try:
        version = Version(version_string.strip())
        upstream = version.upstream_version

        if isinstance(upstream, str):
            # Drop Debian repack/metadata suffixes (e.g., +dfsg, +gitYYYY..., +ds)
            if "+" in upstream:
                upstream = upstream.split("+", 1)[0]

            # Drop prerelease-style suffixes that use '~' (e.g., ~rc1)
            if "~" in upstream:
                upstream = upstream.split("~", 1)[0]

        upstream = upstream.strip() if isinstance(upstream, str) else upstream
        return upstream if upstream else None
    except (ValueError, AttributeError):
        return None

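# Illustrative sketch, not part of the packaged module: how repack and
# pre-release suffixes are stripped, in addition to the doctest cases above.
# The version strings are made-up examples.
def _example_extract_upstream_version() -> None:
    # The "+dfsg" repack suffix is dropped along with the Debian revision
    assert extract_upstream_version("1.2.3+dfsg-4") == "1.2.3"
    # The "~rc1" pre-release marker is dropped as well
    assert extract_upstream_version("2.0~rc1-1") == "2.0"
    # Invalid input yields None rather than raising
    assert extract_upstream_version("") is None
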
def add_upstream_version_column(
    df: pd.DataFrame, version_column: str, new_column_name: str | None = None
) -> pd.DataFrame:
    """Extract upstream version for each row in a DataFrame and add it as a new column.

    Args:
        df: DataFrame containing version strings
        version_column: Name of the column containing Debian version strings
        new_column_name: Name for the new column (default: "{version_column}_upstream")

    Returns:
        DataFrame with the new upstream version column added

    Raises:
        ValueError: If the specified version_column doesn't exist in the DataFrame

    Examples:
        >>> df = pd.DataFrame(
        ...     {"source": ["pkg1", "pkg2"], "version": ["1.2.3-4", "2:1.0-1"]}
        ... )
        >>> result = add_upstream_version_column(df, "version")
        >>> result["version_upstream"].tolist()
        ['1.2.3', '1.0']

    """
    if version_column not in df.columns:
        raise ValueError(f"Column '{version_column}' not found in DataFrame")

    # Determine the new column name
    if new_column_name is None:
        new_column_name = f"{version_column}_upstream"

    # Apply the extraction function to each row
    df = df.copy()
    df[new_column_name] = df[version_column].apply(extract_upstream_version)

    return df

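# Illustrative sketch, not part of the packaged module: overriding the default
# "<version_column>_upstream" name via new_column_name. Values are made up.
def _example_add_upstream_version_column() -> None:
    df = pd.DataFrame({"source": ["pkg1"], "bookworm_version": ["2:1.4.2-3"]})
    out = add_upstream_version_column(
        df, "bookworm_version", new_column_name="bookworm_upstream"
    )
    assert out["bookworm_upstream"].tolist() == ["1.4.2"]
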
def add_local_repo_cache_path_column(
    df: pd.DataFrame,
    repo_url_column: str = "homepage",
    cache_dir: str | os.PathLike = "./cache",
    new_column_name: str = "repo_cache_path",
) -> pd.DataFrame:
    """Add a column to the DataFrame with the local repository cache path for each repository URL.

    Args:
        df: DataFrame containing repository URLs
        repo_url_column: Name of the column containing repository URLs
        cache_dir: Base cache directory (default: "./cache")
        new_column_name: Name for the new column (default: "repo_cache_path")

    Returns:
        DataFrame with the new repository cache path column added

    Raises:
        ValueError: If the specified repo_url_column doesn't exist in the DataFrame

    Examples:
        >>> df = pd.DataFrame(
        ...     {
        ...         "source": ["pkg1", "pkg2"],
        ...         "repo_url": [
        ...             "https://github.com/owner/repo1",
        ...             "https://github.com/owner/repo2",
        ...         ],
        ...     }
        ... )
        >>> result = add_local_repo_cache_path_column(
        ...     df, "repo_url", cache_dir="./cache"
        ... )
        >>> "repo_cache_path" in result.columns
        True

    """
    if repo_url_column not in df.columns:
        raise ValueError(f"Column '{repo_url_column}' not found in DataFrame")

    if new_column_name is None:
        new_column_name = "repo_cache_path"

    df = df.copy()

    def _get_cache_path(url: Any) -> str | NAType:
        if (
            url is None
            or (isinstance(url, float) and pd.isna(url))
            or not isinstance(url, str)
        ):
            return pd.NA
        path = gh.construct_repo_local_path(url, cache_dir)
        return str(path) if path is not None else pd.NA

    df[new_column_name] = df[repo_url_column].map(_get_cache_path)
    return df

def filter_github_repos(df: pd.DataFrame) -> pd.DataFrame:
    """Filter DataFrame to only include rows with valid GitHub repository URLs.

    Normalizes all GitHub URLs using normalize_https_repo_url and updates the
    homepage column with the cleaned URLs. Rows with invalid or non-GitHub URLs
    are excluded.

    Args:
        df: DataFrame containing a 'homepage' column with repository URLs

    Returns:
        DataFrame filtered to valid GitHub repos with normalized homepage URLs

    Raises:
        ValueError: If the DataFrame has no 'homepage' column

    """
    if "homepage" not in df.columns:
        raise ValueError(
            "DataFrame must contain 'homepage' column to filter GitHub repositories."
        )

    # First filter to rows containing github.com
    mask = df["homepage"].str.contains("github.com", na=False)
    filtered_df = df[mask].copy()

    # Normalize all GitHub URLs
    normalized = filtered_df["homepage"].apply(gh.normalize_https_repo_url)

    # Extract the normalized URL (or None if invalid)
    filtered_df["homepage"] = normalized.apply(lambda r: r.url)

    # Drop rows where normalization failed (url is None)
    filtered_df = filtered_df[filtered_df["homepage"].notna()]

    # Drop duplicates based on normalized homepage
    github_repos_df = filtered_df.drop_duplicates(subset=["homepage"], keep="first")

    return github_repos_df

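# Illustrative sketch, not part of the packaged module. It assumes that
# gh.normalize_https_repo_url returns an object whose .url attribute holds the
# normalized https URL (or None when normalization fails), which is what
# filter_github_repos relies on. The URLs are made-up examples.
def _example_filter_github_repos() -> None:
    df = pd.DataFrame(
        {
            "source": ["pkg1", "pkg2", "pkg3"],
            "homepage": [
                "https://github.com/owner/repo1",
                "https://example.org/not-github",
                None,
            ],
        }
    )
    filtered = filter_github_repos(df)
    # Rows without "github.com" in the homepage are always dropped by the
    # substring mask, so at most the first row can survive normalization.
    assert len(filtered) <= 1
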
def validate_merge_safety(
    df1: pd.DataFrame, df2: pd.DataFrame, merge_key: str = "source"
) -> tuple[bool, list[str]]:
    """Validate if two DataFrames can be safely merged.

    Returns:
        (is_safe, warnings) tuple where is_safe is True if no critical issues found

    """
    warnings = []
    is_safe = True
    logger.info(f"Validating merge safety on key '{merge_key}'")
    # Check if merge key exists in both
    if merge_key not in df1.columns:
        warnings.append(f"Merge key '{merge_key}' missing in first DataFrame")
        is_safe = False
    if merge_key not in df2.columns:
        warnings.append(f"Merge key '{merge_key}' missing in second DataFrame")
        is_safe = False

    if not is_safe:
        return is_safe, warnings

    # Check for duplicates in merge key
    df1_dupes = df1[merge_key].duplicated().sum()
    df2_dupes = df2[merge_key].duplicated().sum()

    if df1_dupes > 0:
        warnings.append(
            f"First DataFrame has {df1_dupes} duplicate '{merge_key}' values"
        )
    if df2_dupes > 0:
        warnings.append(
            f"Second DataFrame has {df2_dupes} duplicate '{merge_key}' values"
        )

    # Check overlapping columns and their dtypes
    common_cols = (set(df1.columns) & set(df2.columns)) - {merge_key}

    for col in common_cols:
        if df1[col].dtype != df2[col].dtype:
            warnings.append(
                f"Column '{col}' has different dtypes: {df1[col].dtype} vs {df2[col].dtype}"
            )

    # Check merge key overlap
    overlap = set(df1[merge_key]) & set(df2[merge_key])
    overlap_pct = len(overlap) / max(len(df1), len(df2)) * 100

    if overlap_pct < 10:
        warnings.append(f"Low overlap: only {overlap_pct:.1f}% of keys match")

    return is_safe, warnings

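# Illustrative sketch, not part of the packaged module: a clean case where the
# merge key exists in both frames, has no duplicates, and overlaps fully, so
# the check passes with no warnings. The data is made up.
def _example_validate_merge_safety() -> None:
    left = pd.DataFrame({"source": ["a", "b"], "bookworm_version": ["1.0-1", "2.0-1"]})
    right = pd.DataFrame({"source": ["a", "b"], "trixie_version": ["1.1-1", "2.1-1"]})
    is_safe, merge_warnings = validate_merge_safety(left, right, merge_key="source")
    assert is_safe is True
    assert merge_warnings == []
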
def merge_release_packages(
    dfs: list[pd.DataFrame],
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Merge two Debian package DataFrames from different releases on the 'source' column.

    Args:
        dfs: List of DataFrames to merge (expects exactly 2 DataFrames)

    Returns:
        Tuple of (merged_df, unmerged_df) where:
        - merged_df contains rows that matched on 'source' column
        - unmerged_df contains rows that didn't match (from either dataframe)

    """
    if len(dfs) != 2:
        raise ValueError(f"Expected exactly 2 DataFrames, got {len(dfs)}")

    df1, df2 = dfs[0], dfs[1]

    # Drop redundant columns from the second DataFrame before merging
    redundant_columns = (set(df1.columns) & set(df2.columns)) - {"source"}
    df2 = df2.drop(columns=list(redundant_columns))
    logger.info(
        f"Dropped redundant columns from second DataFrame before merge: {redundant_columns}"
    )

    # Validate merge safety before proceeding
    is_safe, merge_warnings = validate_merge_safety(df1, df2, merge_key="source")

    if not is_safe:
        error_msg = "Cannot safely merge DataFrames:\n - " + "\n - ".join(
            merge_warnings
        )
        raise ValueError(error_msg)

    if merge_warnings:
        for warning in merge_warnings:
            logger.warning(f"Merge validation: {warning}")

    # Merge on 'source' column with indicator to track merge status
    merged_df = pd.merge(
        df1, df2, on="source", how="outer", indicator=True, suffixes=("_left", "_right")
    )

    # Separate matched and unmatched rows
    matched = merged_df[merged_df["_merge"] == "both"].copy()
    unmatched = merged_df[merged_df["_merge"].isin(["left_only", "right_only"])].copy()

    # Remove the merge indicator column
    matched = matched.drop(columns=["_merge"])
    unmatched = unmatched.drop(columns=["_merge"])

    # Handle _left/_right column pairs: check if they match, then consolidate
    left_cols = [col for col in matched.columns if col.endswith("_left")]

    for left_col in left_cols:
        base_name = left_col[:-5]  # Remove '_left' suffix
        right_col = base_name + "_right"

        if right_col in matched.columns:
            # Check if columns match (ignoring NaN values)
            mismatches = matched[left_col] != matched[right_col]
            # Account for NaN != NaN being True
            both_nan = matched[left_col].isna() & matched[right_col].isna()
            actual_mismatches = mismatches & ~both_nan

            if actual_mismatches.any():
                mismatch_count = actual_mismatches.sum()
                logger.warning(
                    f"Column '{base_name}' has {mismatch_count} mismatches between releases"
                )

            # Keep left column and rename it, drop right column
            matched = matched.rename(columns={left_col: base_name})
            matched = matched.drop(columns=[right_col])

    matched = matched.rename(columns={"homepage": "upstream_repo_url"})
    unmatched = unmatched.rename(columns={"homepage": "upstream_repo_url"})

    return matched, unmatched

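# Illustrative sketch, not part of the packaged module: merging two release
# snapshots that share 'source' and 'homepage' columns. All values are made
# up; the matched/unmatched split and the homepage rename are the behaviour
# guaranteed by merge_release_packages above.
def _example_merge_release_packages() -> None:
    bookworm = pd.DataFrame(
        {
            "source": ["pkg-a", "pkg-b"],
            "bookworm_version": ["1.0-1", "2.0-1"],
            "homepage": ["https://github.com/o/a", "https://github.com/o/b"],
        }
    )
    trixie = pd.DataFrame(
        {
            "source": ["pkg-a", "pkg-c"],
            "trixie_version": ["1.1-1", "3.0-1"],
            "homepage": ["https://github.com/o/a", "https://github.com/o/c"],
        }
    )
    matched, unmatched = merge_release_packages([bookworm, trixie])
    assert matched["source"].tolist() == ["pkg-a"]
    assert set(unmatched["source"]) == {"pkg-b", "pkg-c"}
    assert "upstream_repo_url" in matched.columns
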
def fetch_packages(release: str) -> pd.DataFrame | None:
    """Fetch and parse the Debian Sources index for a release into a DataFrame.

    Returns None if the Sources.xz file could not be fetched.
    """
    packages_url = debian_packages_source_url_template.format(release=release)
    xz_bytes = gh.fetch_file(packages_url)
    if xz_bytes is None:
        logger.error(f"Failed to fetch Sources.xz for release {release}")
        return None
    # xz_bytes contains the raw .xz data of the Sources file
    data = lzma.decompress(xz_bytes)

    # The Sources index contains one deb822 stanza per source package,
    # separated by blank lines
    buf = io.BytesIO(data)
    items = []
    stanza = b""
    for line in buf.readlines():
        if line.strip() == b"":
            if stanza.strip():
                items.append(stanza)
            stanza = b""
        else:
            stanza += line
    if stanza.strip():
        items.append(stanza)
    rows = []
    for st in items:
        try:
            d = deb822.Deb822(st.decode("utf-8", "ignore"))
            rows.append(
                {
                    "source": d.get("Package") or d.get("Source"),
                    f"{release}_version": d.get("Version"),
                    "homepage": d.get("Homepage"),
                    "depends": d.get("Depends"),
                    "maintainer": d.get("Maintainer"),
                }
            )
        except Exception:
            # Log parse failures to error log for visibility
            try:
                logger.error(f"Failed to parse stanza:\n{st.decode('utf-8', 'ignore')}")
            except Exception:
                pass
            continue
    return pd.DataFrame(rows)
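

# Illustrative end-to-end sketch, not part of the packaged module. It needs
# network access to ftp.debian.org; "bookworm" and "trixie" are assumed
# release codenames used only for this example.
def _example_debian_pipeline() -> tuple[pd.DataFrame, pd.DataFrame] | None:
    frames = []
    for release in ("bookworm", "trixie"):
        packages = fetch_packages(release)
        if packages is None:
            return None
        frames.append(add_upstream_version_column(packages, f"{release}_version"))
    # Split into packages present in both releases and those in only one
    return merge_release_packages(frames)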
@@ -0,0 +1,38 @@
from __future__ import annotations

import os

import pandas as pd
import requests

FEDORA_DISTGIT = os.environ.get("FEDORA_DISTGIT", "https://src.fedoraproject.org")


def list_rpms(
    namespace: str = "rpms", page: int = 1, per_page: int = 100
) -> list[dict]:
    """Return one page of projects from the Fedora dist-git (Pagure) API."""
    # Pagure API: https://src.fedoraproject.org/api/0/projects?namespace=rpms
    url = f"{FEDORA_DISTGIT}/api/0/projects?namespace={namespace}&page={page}&per_page={per_page}"
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.json().get("projects", [])

def fetch_all_rpms(max_pages: int = 200) -> pd.DataFrame:
    """Page through the dist-git project list and collect basic package metadata."""
    rows = []
    for p in range(1, max_pages + 1):
        items = list_rpms(page=p)
        if not items:
            break
        for prj in items:
            rows.append(
                {
                    "pkg_name": prj.get("name"),
                    "fullname": prj.get("fullname"),
                    "url": prj.get("url"),
                    "summary": prj.get("summary"),
                    "upstream_url": prj.get("upstream_url"),
                    "namespace": prj.get("namespace"),
                }
            )
    return pd.DataFrame(rows)
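

# Illustrative sketch, not part of the packaged module: pulling a few pages of
# Fedora dist-git projects. Requires network access to src.fedoraproject.org
# (or whatever FEDORA_DISTGIT points at); max_pages=5 is an arbitrary cap.
def _example_fetch_all_rpms() -> pd.DataFrame:
    rpms = fetch_all_rpms(max_pages=5)
    # Columns follow the dict keys built in fetch_all_rpms above
    return rpms[["pkg_name", "upstream_url"]] if not rpms.empty else rpms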
File without changes