osslag 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
osslag/distro/debian.py CHANGED
@@ -13,9 +13,7 @@ from pandas._libs.missing import NAType  # pyright: ignore[reportPrivateImportUsage]
 
 import osslag.utils.vcs as gh
 
-debian_packages_source_url_template = (
-    "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"
-)
+debian_packages_source_url_template = "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"
 
 logger = logging.getLogger(__name__)
 
 
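For context, this template is presumably filled with a release codename and the result downloaded and decompressed. A minimal sketch under that assumption; the helper name fetch_sources_index is hypothetical, and requests plus the standard-library lzma module are assumptions not shown in this hunk:

import lzma

import requests

debian_packages_source_url_template = "https://ftp.debian.org/debian/dists/{release}/main/source/Sources.xz"


def fetch_sources_index(release: str = "bookworm") -> str:
    # Fill the {release} placeholder, fetch the xz-compressed Sources index,
    # and return the decompressed text.
    url = debian_packages_source_url_template.format(release=release)
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return lzma.decompress(response.content).decode("utf-8")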
@@ -81,9 +79,7 @@ def add_upstream_version_column(
         ValueError: If the specified version_column doesn't exist in the DataFrame
 
     Examples:
-        >>> df = pd.DataFrame(
-        ...     {"source": ["pkg1", "pkg2"], "version": ["1.2.3-4", "2:1.0-1"]}
-        ... )
+        >>> df = pd.DataFrame({"source": ["pkg1", "pkg2"], "version": ["1.2.3-4", "2:1.0-1"]})
         >>> result = add_upstream_version_column(df, "version")
         >>> result["version_upstream"].tolist()
         ['1.2.3', '1.0']
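The doctest above implies the Debian version grammar epoch:upstream_version-debian_revision, where both epoch and revision are optional; '2:1.0-1' reduces to '1.0'. A minimal sketch of that splitting rule, not this package's actual implementation:

def upstream_version(debian_version: str) -> str:
    # Drop the optional epoch prefix ("2:" in "2:1.0-1").
    version = debian_version.split(":", 1)[-1]
    # Drop the Debian revision after the last hyphen ("-4" in "1.2.3-4");
    # rsplit keeps hyphens that belong to the upstream version itself.
    return version.rsplit("-", 1)[0]


assert upstream_version("1.2.3-4") == "1.2.3"
assert upstream_version("2:1.0-1") == "1.0"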
@@ -133,9 +129,7 @@ def add_local_repo_cache_path_column(
         ...         ],
         ...     }
         ... )
-        >>> result = add_local_repo_cache_path_column(
-        ...     df, "repo_url", cache_dir="./cache"
-        ... )
+        >>> result = add_local_repo_cache_path_column(df, "repo_url", cache_dir="./cache")
         >>> "repo_cache_path" in result.columns
         True
 
@@ -149,11 +143,7 @@
     df = df.copy()
 
     def _get_cache_path(url: Any) -> str | NAType:
-        if (
-            url is None
-            or (isinstance(url, float) and pd.isna(url))
-            or not isinstance(url, str)
-        ):
+        if url is None or (isinstance(url, float) and pd.isna(url)) or not isinstance(url, str):
             return pd.NA
         path = gh.construct_repo_local_path(url, cache_dir)
         return str(path) if path is not None else pd.NA
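The consolidated guard covers the three non-string shapes a pandas object column can yield: None, a float NaN for missing values, and any other non-str object. A small illustration, assuming only stock pandas/NumPy behavior:

import numpy as np
import pandas as pd

urls = pd.Series(["https://github.com/org/repo", None, np.nan], dtype=object)
for url in urls:
    # Missing entries arrive as None or float('nan'), so both early branches
    # fire before the final isinstance check.
    usable = not (url is None or (isinstance(url, float) and pd.isna(url)) or not isinstance(url, str))
    print(repr(url), usable)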
@@ -177,9 +167,7 @@ def filter_github_repos(df: pd.DataFrame) -> pd.DataFrame:
 
     """
    if "homepage" not in df.columns:
-        raise ValueError(
-            "DataFrame must contain 'homepage' column to filter GitHub repositories."
-        )
+        raise ValueError("DataFrame must contain 'homepage' column to filter GitHub repositories.")
 
     # First filter to rows containing github.com
     mask = df["homepage"].str.contains("github.com", na=False)
@@ -200,9 +188,7 @@ def filter_github_repos(df: pd.DataFrame) -> pd.DataFrame:
     return github_repos_df
 
 
-def validate_merge_safety(
-    df1: pd.DataFrame, df2: pd.DataFrame, merge_key: str = "source"
-) -> tuple[bool, list[str]]:
+def validate_merge_safety(df1: pd.DataFrame, df2: pd.DataFrame, merge_key: str = "source") -> tuple[bool, list[str]]:
     """Validate if two DataFrames can be safely merged.
 
     Returns:
@@ -228,22 +214,16 @@ def validate_merge_safety(
     df2_dupes = df2[merge_key].duplicated().sum()
 
     if df1_dupes > 0:
-        warnings.append(
-            f"First DataFrame has {df1_dupes} duplicate '{merge_key}' values"
-        )
+        warnings.append(f"First DataFrame has {df1_dupes} duplicate '{merge_key}' values")
     if df2_dupes > 0:
-        warnings.append(
-            f"Second DataFrame has {df2_dupes} duplicate '{merge_key}' values"
-        )
+        warnings.append(f"Second DataFrame has {df2_dupes} duplicate '{merge_key}' values")
 
     # Check overlapping columns and their dtypes
     common_cols = set(df1.columns) & set(df2.columns) - {merge_key}
 
     for col in common_cols:
         if df1[col].dtype != df2[col].dtype:
-            warnings.append(
-                f"Column '{col}' has different dtypes: {df1[col].dtype} vs {df2[col].dtype}"
-            )
+            warnings.append(f"Column '{col}' has different dtypes: {df1[col].dtype} vs {df2[col].dtype}")
 
     # Check merge key overlap
     overlap = set(df1[merge_key]) & set(df2[merge_key])
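A usage sketch for the validator, relying only on the signature visible in this diff; the sample frames are illustrative:

import pandas as pd

left = pd.DataFrame({"source": ["pkg1", "pkg2", "pkg2"], "version": ["1.0", "2.0", "2.1"]})
right = pd.DataFrame({"source": ["pkg2", "pkg3"], "homepage": ["https://example.org", None]})

is_safe, merge_warnings = validate_merge_safety(left, right, merge_key="source")
# The duplicated "pkg2" in `left` should surface as a warning such as
# "First DataFrame has 1 duplicate 'source' values".
for warning in merge_warnings:
    print(warning)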
@@ -277,17 +257,13 @@ def merge_release_packages(
     # drop redundant columns before merge
     redundant_columns = set(df1.columns) & set(df2.columns) - {"source"}
     df2 = df2.drop(columns=list(redundant_columns))
-    logger.info(
-        f"Dropped redundant columns from second DataFrame before merge: {redundant_columns}"
-    )
+    logger.info(f"Dropped redundant columns from second DataFrame before merge: {redundant_columns}")
 
     # Validate merge safety before proceeding
     is_safe, merge_warnings = validate_merge_safety(df1, df2, merge_key="source")
 
     if not is_safe:
-        error_msg = "Cannot safely merge DataFrames:\n - " + "\n - ".join(
-            merge_warnings
-        )
+        error_msg = "Cannot safely merge DataFrames:\n - " + "\n - ".join(merge_warnings)
         raise ValueError(error_msg)
 
     if merge_warnings:
@@ -295,9 +271,7 @@ def merge_release_packages(
         logger.warning(f"Merge validation: {warning}")
 
     # Merge on 'source' column with indicator to track merge status
-    merged_df = pd.merge(
-        df1, df2, on="source", how="outer", indicator=True, suffixes=("_left", "_right")
-    )
+    merged_df = pd.merge(df1, df2, on="source", how="outer", indicator=True, suffixes=("_left", "_right"))
 
     # Separate matched and unmatched rows
     matched = merged_df[merged_df["_merge"] == "both"].copy()
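indicator=True is standard pandas: it adds a categorical _merge column whose values are 'left_only', 'right_only', or 'both', which is exactly what the matched/unmatched split keys on. A self-contained illustration:

import pandas as pd

df1 = pd.DataFrame({"source": ["a", "b"], "left_val": [1, 2]})
df2 = pd.DataFrame({"source": ["b", "c"], "right_val": [3, 4]})

merged = pd.merge(df1, df2, on="source", how="outer", indicator=True)
# "a" appears only in df1 -> left_only; "b" is in both; "c" -> right_only.
print(merged[["source", "_merge"]])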
@@ -323,9 +297,7 @@ def merge_release_packages(
 
         if actual_mismatches.any():
             mismatch_count = actual_mismatches.sum()
-            logger.warning(
-                f"Column '{base_name}' has {mismatch_count} mismatches between releases"
-            )
+            logger.warning(f"Column '{base_name}' has {mismatch_count} mismatches between releases")
 
         # Keep left column and rename it, drop right column
         matched = matched.rename(columns={left_col: base_name})
osslag/distro/fedora.py CHANGED
@@ -8,9 +8,7 @@ import requests
 FEDORA_DISTGIT = os.environ.get("FEDORA_DISTGIT", "https://src.fedoraproject.org")
 
 
-def list_rpms(
-    namespace: str = "rpms", page: int = 1, per_page: int = 100
-) -> list[dict]:
+def list_rpms(namespace: str = "rpms", page: int = 1, per_page: int = 100) -> list[dict]:
     # Pagure API: https://src.fedoraproject.org/api/0/projects?namespace=rpms
     url = f"{FEDORA_DISTGIT}/api/0/projects?namespace={namespace}&page={page}&per_page={per_page}"
     r = requests.get(url, timeout=30)
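A paging sketch built on the signature shown here; it assumes (not confirmed by this diff) that an empty list marks the last page of the Pagure project listing:

def iter_all_rpms(per_page: int = 100):
    # Walk successive pages until the API stops returning projects.
    page = 1
    while True:
        batch = list_rpms(namespace="rpms", page=page, per_page=per_page)
        if not batch:
            break
        yield from batch
        page += 1


# Each item is a project dict from the Pagure API, e.g. project.get("name").
for project in iter_all_rpms(per_page=100):
    ...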