osslag 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- osslag/__init__.py +9 -0
- osslag/cli.py +100 -243
- osslag/distro/debian.py +13 -41
- osslag/distro/fedora.py +1 -3
- osslag/metrics/malta.py +412 -125
- osslag/metrics/pvac.py +2 -6
- osslag/utils/github_helper.py +7 -23
- osslag/utils/vcs.py +16 -49
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/METADATA +14 -8
- osslag-1.0.1.dist-info/RECORD +17 -0
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/WHEEL +2 -1
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/entry_points.txt +0 -1
- osslag-1.0.1.dist-info/licenses/LICENSE +21 -0
- osslag-1.0.1.dist-info/top_level.txt +1 -0
- osslag-1.0.0.dist-info/RECORD +0 -15
osslag/distro/debian.py
CHANGED
@@ -13,9 +13,7 @@ from pandas._libs.missing import NAType # pyright: ignore[reportPrivateImportUs

 import osslag.utils.vcs as gh

-debian_packages_source_url_template = (
-    "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"
-)
+debian_packages_source_url_template = "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"

 logger = logging.getLogger(__name__)

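The hunk above only reflows the Sources.xz URL template onto one line. As context, the sketch below shows one way to fetch and parse that index using only the standard library; it is not osslag's implementation, and the helper name `iter_source_stanzas`, the `bookworm` release, and the stanza-parsing details are illustrative assumptions.

```python
import lzma
import urllib.request

SOURCES_URL = "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"

def iter_source_stanzas(release: str = "bookworm"):
    """Yield one {field: value} dict per source-package stanza in the index."""
    with urllib.request.urlopen(SOURCES_URL.format(release=release)) as resp:
        text = lzma.decompress(resp.read()).decode("utf-8", errors="replace")
    for stanza in text.split("\n\n"):
        fields = {}
        for line in stanza.splitlines():
            # Indented continuation lines belong to the previous field; this
            # sketch keeps only the first line of each field.
            if line and not line[0].isspace() and ":" in line:
                key, _, value = line.partition(":")
                fields[key.strip()] = value.strip()
        if fields:
            yield fields

# e.g. the first stanza's source package name:
# next(iter_source_stanzas())["Package"]
```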
@@ -81,9 +79,7 @@ def add_upstream_version_column(
         ValueError: If the specified version_column doesn't exist in the DataFrame

     Examples:
-        >>> df = pd.DataFrame(
-        ...     {"source": ["pkg1", "pkg2"], "version": ["1.2.3-4", "2:1.0-1"]}
-        ... )
+        >>> df = pd.DataFrame({"source": ["pkg1", "pkg2"], "version": ["1.2.3-4", "2:1.0-1"]})
         >>> result = add_upstream_version_column(df, "version")
         >>> result["version_upstream"].tolist()
         ['1.2.3', '1.0']
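The doctest above shows the expected behaviour: the Debian epoch (`2:`) and revision (`-4`) are stripped, leaving the upstream version. The function body is not part of this diff; a minimal sketch consistent with the doctest, using a hypothetical helper `upstream_version`, could look like:

```python
import pandas as pd

def upstream_version(debian_version: str) -> str:
    """Strip the epoch ('2:') and Debian revision ('-4'), keeping the upstream part."""
    without_epoch = debian_version.split(":", 1)[-1]   # '2:1.0-1' -> '1.0-1'
    return without_epoch.rsplit("-", 1)[0]             # '1.0-1'   -> '1.0'

df = pd.DataFrame({"source": ["pkg1", "pkg2"], "version": ["1.2.3-4", "2:1.0-1"]})
df["version_upstream"] = df["version"].map(upstream_version)
print(df["version_upstream"].tolist())  # ['1.2.3', '1.0']
```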
@@ -133,9 +129,7 @@ def add_local_repo_cache_path_column(
         ...     ],
         ...     }
         ... )
-        >>> result = add_local_repo_cache_path_column(
-        ...     df, "repo_url", cache_dir="./cache"
-        ... )
+        >>> result = add_local_repo_cache_path_column(df, "repo_url", cache_dir="./cache")
         >>> "repo_cache_path" in result.columns
         True

@@ -149,11 +143,7 @@ def add_local_repo_cache_path_column(
     df = df.copy()

     def _get_cache_path(url: Any) -> str | NAType:
-        if (
-            url is None
-            or (isinstance(url, float) and pd.isna(url))
-            or not isinstance(url, str)
-        ):
+        if url is None or (isinstance(url, float) and pd.isna(url)) or not isinstance(url, str):
            return pd.NA
        path = gh.construct_repo_local_path(url, cache_dir)
        return str(path) if path is not None else pd.NA
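The collapsed condition above guards against None, float NaN, and other non-string values before a cache path is built. A small illustration of why the float check matters when a URL column contains missing values; `safe_len` is an invented stand-in for the real `_get_cache_path`:

```python
import pandas as pd

def safe_len(url):
    # Same guard shape as _get_cache_path above, applied to a trivial operation.
    if url is None or (isinstance(url, float) and pd.isna(url)) or not isinstance(url, str):
        return pd.NA
    return len(url)

urls = pd.Series(["https://github.com/org/repo", None, float("nan")], dtype=object)
print(urls.map(safe_len).tolist())  # [27, <NA>, <NA>]
```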
@@ -177,9 +167,7 @@ def filter_github_repos(df: pd.DataFrame) -> pd.DataFrame:

     """
     if "homepage" not in df.columns:
-        raise ValueError(
-            "DataFrame must contain 'homepage' column to filter GitHub repositories."
-        )
+        raise ValueError("DataFrame must contain 'homepage' column to filter GitHub repositories.")

     # First filter to rows containing github.com
     mask = df["homepage"].str.contains("github.com", na=False)
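The mask above relies on `Series.str.contains` with `na=False`, so rows with a missing homepage are treated as non-matches rather than producing NA in the boolean mask. A self-contained illustration of the same pattern (the sample data is invented):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "source": ["a", "b", "c"],
        "homepage": ["https://github.com/org/a", None, "https://example.org/c"],
    }
)
mask = df["homepage"].str.contains("github.com", na=False)
print(df.loc[mask, "source"].tolist())  # ['a']
```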
@@ -200,9 +188,7 @@ def filter_github_repos(df: pd.DataFrame) -> pd.DataFrame:
     return github_repos_df


-def validate_merge_safety(
-    df1: pd.DataFrame, df2: pd.DataFrame, merge_key: str = "source"
-) -> tuple[bool, list[str]]:
+def validate_merge_safety(df1: pd.DataFrame, df2: pd.DataFrame, merge_key: str = "source") -> tuple[bool, list[str]]:
     """Validate if two DataFrames can be safely merged.

     Returns:
@@ -228,22 +214,16 @@ def validate_merge_safety(
     df2_dupes = df2[merge_key].duplicated().sum()

     if df1_dupes > 0:
-        warnings.append(
-            f"First DataFrame has {df1_dupes} duplicate '{merge_key}' values"
-        )
+        warnings.append(f"First DataFrame has {df1_dupes} duplicate '{merge_key}' values")
     if df2_dupes > 0:
-        warnings.append(
-            f"Second DataFrame has {df2_dupes} duplicate '{merge_key}' values"
-        )
+        warnings.append(f"Second DataFrame has {df2_dupes} duplicate '{merge_key}' values")

     # Check overlapping columns and their dtypes
     common_cols = set(df1.columns) & set(df2.columns) - {merge_key}

     for col in common_cols:
         if df1[col].dtype != df2[col].dtype:
-            warnings.append(
-                f"Column '{col}' has different dtypes: {df1[col].dtype} vs {df2[col].dtype}"
-            )
+            warnings.append(f"Column '{col}' has different dtypes: {df1[col].dtype} vs {df2[col].dtype}")

     # Check merge key overlap
     overlap = set(df1[merge_key]) & set(df2[merge_key])
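Based only on the signature shown above, `validate_merge_safety` returns a `(bool, list[str])` pair. A hedged usage sketch follows; the sample frames are invented and the exact warnings produced for them are not asserted here:

```python
import pandas as pd
from osslag.distro.debian import validate_merge_safety

df1 = pd.DataFrame({"source": ["pkg1", "pkg2"], "version": ["1.0", "2.0"]})
df2 = pd.DataFrame({"source": ["pkg2", "pkg3"], "homepage": ["https://example.org", None]})

is_safe, merge_warnings = validate_merge_safety(df1, df2, merge_key="source")
if not is_safe:
    for warning in merge_warnings:
        print(warning)
```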
@@ -277,17 +257,13 @@ def merge_release_packages(
     # drop redundant columns before merge
     redundant_columns = set(df1.columns) & set(df2.columns) - {"source"}
     df2 = df2.drop(columns=list(redundant_columns))
-    logger.info(
-        f"Dropped redundant columns from second DataFrame before merge: {redundant_columns}"
-    )
+    logger.info(f"Dropped redundant columns from second DataFrame before merge: {redundant_columns}")

     # Validate merge safety before proceeding
     is_safe, merge_warnings = validate_merge_safety(df1, df2, merge_key="source")

     if not is_safe:
-        error_msg = "Cannot safely merge DataFrames:\n - " + "\n - ".join(
-            merge_warnings
-        )
+        error_msg = "Cannot safely merge DataFrames:\n - " + "\n - ".join(merge_warnings)
         raise ValueError(error_msg)

     if merge_warnings:
@@ -295,9 +271,7 @@ def merge_release_packages(
             logger.warning(f"Merge validation: {warning}")

     # Merge on 'source' column with indicator to track merge status
-    merged_df = pd.merge(
-        df1, df2, on="source", how="outer", indicator=True, suffixes=("_left", "_right")
-    )
+    merged_df = pd.merge(df1, df2, on="source", how="outer", indicator=True, suffixes=("_left", "_right"))

     # Separate matched and unmatched rows
     matched = merged_df[merged_df["_merge"] == "both"].copy()
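The merge above uses `how="outer"` with `indicator=True`, so every row carries a `_merge` value of `left_only`, `right_only`, or `both`, and `suffixes` disambiguates overlapping columns. A standalone sketch of that pattern with invented data:

```python
import pandas as pd

df1 = pd.DataFrame({"source": ["pkg1", "pkg2"], "version": ["1.0", "2.0"]})
df2 = pd.DataFrame({"source": ["pkg2", "pkg3"], "version": ["2.1", "3.0"]})

merged = pd.merge(df1, df2, on="source", how="outer", indicator=True, suffixes=("_left", "_right"))
matched = merged[merged["_merge"] == "both"]
unmatched = merged[merged["_merge"] != "both"]
print(merged[["source", "_merge"]])
```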
@@ -323,9 +297,7 @@ def merge_release_packages(

         if actual_mismatches.any():
             mismatch_count = actual_mismatches.sum()
-            logger.warning(
-                f"Column '{base_name}' has {mismatch_count} mismatches between releases"
-            )
+            logger.warning(f"Column '{base_name}' has {mismatch_count} mismatches between releases")

         # Keep left column and rename it, drop right column
         matched = matched.rename(columns={left_col: base_name})
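The hunk above logs per-column mismatches and then keeps the left-hand column under its base name. How `actual_mismatches` is computed is not shown in this diff; the sketch below is one plausible reading of that reconciliation step, with invented data and an assumed "values differ unless both sides are missing" comparison:

```python
import pandas as pd

matched = pd.DataFrame(
    {
        "source": ["pkg1", "pkg2"],
        "homepage_left": ["https://a.example", "https://b.example"],
        "homepage_right": ["https://a.example", "https://b2.example"],
    }
)
base_name, left_col, right_col = "homepage", "homepage_left", "homepage_right"

# Assumed comparison: values differ, unless both sides are missing.
actual_mismatches = matched[left_col].ne(matched[right_col]) & ~(
    matched[left_col].isna() & matched[right_col].isna()
)
if actual_mismatches.any():
    mismatch_count = actual_mismatches.sum()
    print(f"Column '{base_name}' has {mismatch_count} mismatches between releases")

# Keep the left column under the base name, drop the right one.
matched = matched.rename(columns={left_col: base_name}).drop(columns=[right_col])
```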
osslag/distro/fedora.py
CHANGED
@@ -8,9 +8,7 @@ import requests
 FEDORA_DISTGIT = os.environ.get("FEDORA_DISTGIT", "https://src.fedoraproject.org")


-def list_rpms(
-    namespace: str = "rpms", page: int = 1, per_page: int = 100
-) -> list[dict]:
+def list_rpms(namespace: str = "rpms", page: int = 1, per_page: int = 100) -> list[dict]:
     # Pagure API: https://src.fedoraproject.org/api/0/projects?namespace=rpms
     url = f"{FEDORA_DISTGIT}/api/0/projects?namespace={namespace}&page={page}&per_page={per_page}"
     r = requests.get(url, timeout=30)
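The hunk above only reflows the `list_rpms` signature. A hedged usage sketch for paging through it follows; it assumes `list_rpms` returns an empty list once pagination is exhausted, which this diff does not show, and the upper bound of 50 pages is arbitrary:

```python
from osslag.distro.fedora import list_rpms

all_rpms: list[dict] = []
for page in range(1, 51):  # arbitrary upper bound for the sketch
    batch = list_rpms(namespace="rpms", page=page, per_page=100)
    if not batch:  # assumes an empty list past the last page
        break
    all_rpms.extend(batch)

print(f"Fetched {len(all_rpms)} rpm projects")
```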