osslag 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- osslag/__init__.py +0 -0
- osslag/cli.py +1380 -0
- osslag/distro/__init__.py +0 -0
- osslag/distro/debian.py +382 -0
- osslag/distro/fedora.py +38 -0
- osslag/metrics/__init__.py +0 -0
- osslag/metrics/malta.py +585 -0
- osslag/metrics/pvac.py +166 -0
- osslag/utils/__init__.py +0 -0
- osslag/utils/github_helper.py +240 -0
- osslag/utils/vcs.py +543 -0
- osslag-1.0.0.dist-info/METADATA +46 -0
- osslag-1.0.0.dist-info/RECORD +15 -0
- osslag-1.0.0.dist-info/WHEEL +4 -0
- osslag-1.0.0.dist-info/entry_points.txt +3 -0
osslag/utils/vcs.py
ADDED
|
@@ -0,0 +1,543 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import pathlib
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from typing import TYPE_CHECKING, List, NamedTuple, cast
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from pygit2.enums import DiffOption
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import pygit2
|
|
16
|
+
import requests
|
|
17
|
+
from dateutil.relativedelta import relativedelta
|
|
18
|
+
from dotenv import load_dotenv
|
|
19
|
+
|
|
20
|
+
load_dotenv() # Load environment variables from .env file if present
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NormalizeRepoResult(NamedTuple):
|
|
25
|
+
url: str | None
|
|
26
|
+
error: str | None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class RepoOwnerName(NamedTuple):
|
|
30
|
+
owner: str | None
|
|
31
|
+
name: str | None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class CloneResult(NamedTuple):
|
|
35
|
+
success: bool
|
|
36
|
+
error: str | None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def normalize_https_repo_url(url: str) -> NormalizeRepoResult:
|
|
40
|
+
if not url or not isinstance(url, str):
|
|
41
|
+
return NormalizeRepoResult(None, "Invalid URL: None or not a string")
|
|
42
|
+
url = url.strip()
|
|
43
|
+
|
|
44
|
+
# Strip fragment (e.g., #readme) from URLs
|
|
45
|
+
if "#" in url:
|
|
46
|
+
url = url.split("#")[0]
|
|
47
|
+
|
|
48
|
+
# Convert http:// to https://
|
|
49
|
+
if url.startswith("http://"):
|
|
50
|
+
url = "https://" + url[7:]
|
|
51
|
+
|
|
52
|
+
# Strip .git suffix early if present (for all protocols)
|
|
53
|
+
url_for_matching = url
|
|
54
|
+
if url.endswith(".git"):
|
|
55
|
+
url_for_matching = url[:-4]
|
|
56
|
+
|
|
57
|
+
# Strip trailing slash
|
|
58
|
+
if url_for_matching.endswith("/"):
|
|
59
|
+
url_for_matching = url_for_matching[:-1]
|
|
60
|
+
|
|
61
|
+
# GitHub SSH pattern - return None for SSH URLs
|
|
62
|
+
github_ssh_pattern = re.compile(r"^git@github\.com:([\w.-]+)/([\w.-]+)$")
|
|
63
|
+
match = github_ssh_pattern.match(url_for_matching)
|
|
64
|
+
if match:
|
|
65
|
+
return NormalizeRepoResult(None, None)
|
|
66
|
+
|
|
67
|
+
# GitHub HTTPS pattern - clean up garbage
|
|
68
|
+
github_https_pattern = re.compile(r"^https://github\.com/([\w.-]+)/([\w.-]+)")
|
|
69
|
+
match = github_https_pattern.match(url_for_matching)
|
|
70
|
+
if match:
|
|
71
|
+
# Reconstruct clean URL without .git
|
|
72
|
+
clean_url = f"https://github.com/{match.group(1)}/{match.group(2)}"
|
|
73
|
+
return NormalizeRepoResult(clean_url, None)
|
|
74
|
+
|
|
75
|
+
return NormalizeRepoResult(
|
|
76
|
+
None, "URL does not match expected git repository patterns"
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def ensure_dir(p: str | os.PathLike) -> pathlib.Path:
|
|
81
|
+
p = pathlib.Path(p)
|
|
82
|
+
p.mkdir(parents=True, exist_ok=True)
|
|
83
|
+
return p
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def fetch_file(url) -> bytes:
    """Download a file from a URL and return its full content as bytes.

    Args:
        url: The URL of the file to fetch.

    Returns: The content of the file as bytes.

    """
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        # Stream in 1 KiB chunks and join them once at the end.
        return b"".join(response.iter_content(chunk_size=1024))
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def extract_owner_name_repo(repo_url: str) -> RepoOwnerName:
    """Extract the owner and repository name from a GitHub repository URL.

    The URL is passed through normalize_https_repo_url first, so any form
    accepted there (http, trailing slash, .git suffix, fragment) works.

    Args:
        repo_url: The GitHub repository URL.

    Returns:
        A tuple containing the owner and repository name.

    """
    normalized = normalize_https_repo_url(repo_url)
    if normalized is None or normalized.url is None:
        return RepoOwnerName(None, None)

    segments = normalized.url.split("/")
    if len(segments) < 2:
        logger.error(
            "Could not extract owner/repo from URL: call normalize_repo_url first"
        )
        return RepoOwnerName(None, None)

    # Normalized form is https://github.com/<owner>/<repo>.
    return RepoOwnerName(segments[-2], segments[-1])
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def clone_repo(
    repo_url: str,
    dest_dir: str | os.PathLike,
    include_tags: bool = True,
    branch: str | None = None,
    server_connect_timeout: int = 10000,
    server_timeout: int = 300000,
) -> CloneResult:
    """Clone a Git repository using pygit2 to get the full repository with history.

    This function is written to run concurrently in multiple processes (pygit2
    is not thread-safe), so Python's multiprocessing module is recommended for
    parallelism. On any failure a ``<dest>.failed`` marker file is written next
    to the destination directory with the failure reason.

    Args:
        repo_url: The URL of the Git repository (GitHub URLs).
        dest_dir: Destination directory for the clone.
        include_tags: If True, fetches tags after the clone completes.
        branch: Optional ref to check out after cloning; the default branch is
            used when None.
        server_connect_timeout: Connection timeout in milliseconds, default 10000 (10 seconds).
        server_timeout: General server timeout in milliseconds, default 300000 (300 seconds).

    Returns:
        CloneResult with success flag and error message if any.

    """
    # Validate input
    if not repo_url or not isinstance(repo_url, str):
        return CloneResult(False, "Invalid URL: None or not a string")

    dest_path = pathlib.Path(dest_dir)
    failed_marker = dest_path.parent / f"{dest_path.name}.failed"

    def _record_failure(text: str) -> None:
        # Best-effort: the marker file is diagnostic only, never fatal.
        try:
            failed_marker.write_text(text)
        except Exception:
            pass

    def _remove_partial_clone() -> None:
        # Best-effort cleanup of a partially-cloned destination directory.
        if dest_path.exists():
            try:
                shutil.rmtree(dest_path)
            except Exception:
                pass

    # Normalize the URL
    sanitize_url = normalize_https_repo_url(repo_url)
    if sanitize_url.url is None:
        error_msg = sanitize_url.error or "Failed to normalize repository URL"
        logger.warning(f"Failed to normalize URL {repo_url}: {error_msg}")
        _record_failure(f"normalize: {error_msg}")
        return CloneResult(False, error_msg)

    repo_url = sanitize_url.url
    owner, repo = extract_owner_name_repo(repo_url)
    if owner is None or repo is None:
        error_msg = "Failed to extract owner/repo from URL"
        logger.warning(f"Failed to extract owner/repo from {repo_url}")
        _record_failure(f"extract: {error_msg}")
        return CloneResult(False, error_msg)

    # A pre-existing .git directory means the repo was already cloned.
    git_dir = pathlib.Path(dest_dir) / ".git"
    if git_dir.exists():
        return CloneResult(True, None)

    # Check if repo exists on GitHub (lazy import to avoid circular dependency)
    from osslag.utils.github_helper import gh_check_repo_exists

    repo_exists_result = gh_check_repo_exists(owner, repo)
    if not repo_exists_result.success:
        error_msg = repo_exists_result.error or "Repository not found on GitHub"
        _record_failure(f"gh_check: {error_msg} url: {repo_url}")
        # Pass through rate limit info
        return CloneResult(
            success=False,
            error=error_msg,
        )

    # Timeouts are process-global pygit2 settings (milliseconds).
    pygit2.settings.server_connect_timeout = server_connect_timeout
    pygit2.settings.server_timeout = server_timeout

    # Clone using pygit2
    try:
        github_token = os.getenv("GITHUB_TOKEN")
        github_username = os.getenv("GITHUB_USERNAME")
        # Authenticated clones when credentials are present in the environment.
        if github_token and github_username:
            callbacks = pygit2.RemoteCallbacks(
                pygit2.UserPass(github_username, github_token)
            )
        else:
            callbacks = None

        repo_obj = pygit2.clone_repository(
            sanitize_url.url, dest_dir, callbacks=callbacks, bare=False
        )

        # Checkout specific branch if requested
        if branch:
            repo_obj.revparse_single(branch)
            repo_obj.checkout(branch)

        # Fetch tags if requested
        if include_tags:
            remote = repo_obj.remotes["origin"]
            remote.fetch()
        return CloneResult(True, None)

    except pygit2.GitError as e:
        _remove_partial_clone()
        error_msg = f"pygit2 error: {str(e)}"
        _record_failure(f"clone: {error_msg}")
        return CloneResult(False, error_msg)
    except Exception as e:
        _remove_partial_clone()
        error_msg = f"General error: {str(e)}"
        _record_failure(f"clone: {error_msg}")
        return CloneResult(False, error_msg)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def construct_repo_local_path(
    repo_url: str, cache_dir: str | os.PathLike = "./cache", must_exist: bool = True
) -> pathlib.Path | None:
    """Get the local path for a cloned repository based on its URL and cache directory.

    The base directory comes from the REPOS_CACHE_DIR environment variable when
    set; otherwise *cache_dir* is used. The repository directory is named
    "<owner>--<name>".

    Args:
        repo_url: The URL of the repository
        cache_dir: Base cache directory (default: "./cache")
        must_exist: If True, returns None if the path doesn't exist (default: True)

    Returns:
        The local path as a Path object, or None if the URL is invalid
        (or if must_exist=True and path doesn't exist)

    Examples:
        >>> construct_repo_local_path("https://github.com/owner/repo", "./cache")
        None  # if not cloned yet
        >>> construct_repo_local_path(
        ...     "https://github.com/owner/repo", "./cache", must_exist=False
        ... )
        PosixPath('./cache/owner--repo')

    """
    normalized = normalize_https_repo_url(repo_url)
    if normalized.url is None:
        return None

    owner_name = extract_owner_name_repo(normalized.url)
    if owner_name.owner is None or owner_name.name is None:
        return None

    # The environment variable takes precedence over the cache_dir argument.
    base_dir = os.getenv("REPOS_CACHE_DIR") or str(cache_dir)
    repo_dir = pathlib.Path(base_dir) / f"{owner_name.owner}--{owner_name.name}"

    if not repo_dir.exists():
        return None if must_exist else repo_dir
    # Resolve existing paths to an absolute form.
    return repo_dir.resolve()
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def label_trivial_commits(
|
|
303
|
+
commits_df: pd.DataFrame,
|
|
304
|
+
files_column: str = "files",
|
|
305
|
+
label_column: str = "is_trivial",
|
|
306
|
+
cache_dir: os.PathLike | None = None,
|
|
307
|
+
cache_name: str = "commits_with_trivial_labels.parquet",
|
|
308
|
+
) -> pd.DataFrame:
|
|
309
|
+
"""Add a boolean column marking commits that only changed README.md.
|
|
310
|
+
|
|
311
|
+
A commit is marked True when the files list contains exactly one entry
|
|
312
|
+
and that entry's basename is README.md (case-insensitive, any path).
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
commits_df: DataFrame returned by get_commits_between_tags (expects a 'files' column).
|
|
316
|
+
files_column: Name of the column containing file path lists.
|
|
317
|
+
label_column: Name of the output boolean column to create.
|
|
318
|
+
cache_dir: Optional cache directory
|
|
319
|
+
cache_name: Optional cache file name (defaults to 'commits_with_trivial_labels.parquet' if cache_dir is set)
|
|
320
|
+
|
|
321
|
+
Returns:
|
|
322
|
+
The same DataFrame with an added boolean column.
|
|
323
|
+
|
|
324
|
+
"""
|
|
325
|
+
if files_column not in commits_df.columns:
|
|
326
|
+
logger.error(f"Missing '{files_column}' column; cannot label trivial commits")
|
|
327
|
+
return commits_df
|
|
328
|
+
|
|
329
|
+
def _is_trivial(files: List[str]) -> bool:
|
|
330
|
+
# Empty files list is considered trivial
|
|
331
|
+
if len(files) == 0:
|
|
332
|
+
return True
|
|
333
|
+
|
|
334
|
+
# Documentation-only change if all files are .md files
|
|
335
|
+
rval = all(
|
|
336
|
+
isinstance(f, str) and pathlib.PurePosixPath(f).suffix.lower() == ".md"
|
|
337
|
+
for f in files
|
|
338
|
+
)
|
|
339
|
+
return rval
|
|
340
|
+
|
|
341
|
+
commits_df[label_column] = commits_df[files_column].apply(_is_trivial)
|
|
342
|
+
commits_df = commits_df.reset_index(drop=True)
|
|
343
|
+
if cache_dir is not None:
|
|
344
|
+
cache_path = pathlib.Path(cache_dir)
|
|
345
|
+
cache_path.mkdir(parents=True, exist_ok=True)
|
|
346
|
+
cache_file = cache_path / cache_name
|
|
347
|
+
commits_df.to_parquet(cache_file)
|
|
348
|
+
return commits_df
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def load_commits(
    repo_path: str | os.PathLike,
    branch: str | None = None,
    include_files: bool = True,
    since: datetime | None = None,
) -> pd.DataFrame:
    """Retrieve commits from a local Git repository and return as a pandas DataFrame.

    Args:
        repo_path: Path to the local Git repository directory.
        branch: Branch name to walk from. If None, uses HEAD. Local branches
            are tried first, then remote-tracking branches under origin.
        include_files: If True, include list of changed files per commit. Default True.
        since: Only include commits after this date. If None, defaults to 4 years ago.
            Pass a very old date (e.g., datetime(1970, 1, 1)) to get all commits.

    Returns:
        A pandas DataFrame with columns: 'hash', 'author', 'email', 'message',
        'timestamp', 'date', 'tags', and optionally 'files'. Rows are sorted by
        timestamp, oldest first. NOTE(review): 'date' is a naive datetime built
        with datetime.fromtimestamp, i.e. local time — confirm callers do not
        expect UTC.

    Examples:
        >>> df = load_commits("/path/to/repo")
        >>> df = load_commits("/path/to/repo", branch="main")
        >>> df = load_commits("/path/to/repo", since=datetime(2020, 1, 1))

    Raises:
        FileNotFoundError: If repo path doesn't exist or is not a git repository.
        ValueError: If the specified branch is not found or repo has no HEAD.
        RuntimeError: For other git-related errors.

    """
    # Default to 4 years ago if not specified
    if since is None:
        since = datetime.now() - relativedelta(years=4)
    repo_path = pathlib.Path(repo_path)

    if not repo_path.exists():
        raise FileNotFoundError(f"Repository path does not exist: {repo_path}")

    # Requires a .git subdirectory, so bare repositories are rejected too.
    git_dir = repo_path / ".git"
    if not git_dir.exists():
        raise FileNotFoundError(
            f"Not a Git repository (missing .git directory): {repo_path}"
        )

    try:
        repo = pygit2.Repository(str(repo_path))

        # Determine starting point
        if branch:
            try:
                ref = repo.references[f"refs/heads/{branch}"]
                start_id = ref.peel(pygit2.Commit).id
            except KeyError:
                # Try remote branch
                try:
                    ref = repo.references[f"refs/remotes/origin/{branch}"]
                    start_id = ref.peel(pygit2.Commit).id
                except KeyError as e:
                    raise ValueError(
                        f"Branch '{branch}' not found in {repo_path}"
                    ) from e
        else:
            try:
                start_id = repo.head.peel(pygit2.Commit).id
            except pygit2.GitError as e:
                raise ValueError(
                    f"Repository has no HEAD (empty repository?): {repo_path}"
                ) from e

        def _changed_paths(c: pygit2.Commit) -> list[str]:
            """Return list of paths touched by the commit (optimized, skips merge commits)."""
            # Skip merge commits (more than one parent)
            if len(c.parents) > 1:
                return []
            try:
                # Diff flags for speed:
                # - SKIP_BINARY_CHECK: Don't examine binary file contents
                # Cast needed because pygit2 exports int constants but stubs expect DiffOption enum
                flags = cast("DiffOption", pygit2.GIT_DIFF_SKIP_BINARY_CHECK)

                if c.parents:
                    # Diff from parent to commit (shows what changed in this commit)
                    # context_lines=0 skips computing context around changes
                    diff = c.parents[0].tree.diff_to_tree(
                        c.tree, flags=flags, context_lines=0
                    )
                else:
                    # Initial commit - diff against empty tree
                    diff = c.tree.diff_to_tree(flags=flags, context_lines=0)

                # Extract paths directly with list comprehension
                # new_file.path is set for adds/modifies, old_file.path for deletes
                return [
                    delta.new_file.path or delta.old_file.path for delta in diff.deltas
                ]
            except Exception:
                # Diff failures degrade to "no files" rather than aborting the walk.
                return []

        def _safe_str(accessor, default=None):
            """Safely access string attributes that may have encoding issues."""
            try:
                value = accessor()
                return value.strip() if isinstance(value, str) and value else value
            except (LookupError, UnicodeDecodeError):
                # LookupError covers "unknown encoding: System" and similar
                return default

        # Walk all commits
        walker = repo.walk(start_id, pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_TIME)  # type: ignore[arg-type]
        # NOTE(review): if `since` is naive, .timestamp() interprets it as
        # local time, while commit_time is a UTC epoch — confirm this is intended.
        since_timestamp = since.timestamp()

        # Build a mapping from commit id to tag names
        commit_tags: dict[str, list[str]] = {}
        for ref_name in repo.references:
            if ref_name.startswith("refs/tags/"):
                tag_name = ref_name[len("refs/tags/") :]
                try:
                    # peel() resolves annotated tags down to the tagged commit.
                    ref = repo.references[ref_name]
                    obj = ref.peel()
                    if isinstance(obj, pygit2.Commit):
                        commit_id = str(obj.id)
                        commit_tags.setdefault(commit_id, []).append(tag_name)
                except Exception:
                    # Broken or unresolvable tag refs are skipped silently.
                    continue

        commits_data = []
        for commit in walker:
            # Skip commits older than the cutoff date
            # (continue, not break: topological order is not strictly chronological)
            if commit.commit_time < since_timestamp:
                continue
            # The c=commit default argument binds the current commit eagerly;
            # a plain closure would late-bind to the loop variable.
            row = {
                "hash": str(commit.id),
                "author": _safe_str(
                    lambda c=commit: c.author.name if c.author else None
                ),
                "email": _safe_str(
                    lambda c=commit: c.author.email if c.author else None
                ),
                "message": _safe_str(lambda c=commit: c.message),
                "timestamp": commit.commit_time,
                "date": datetime.fromtimestamp(commit.commit_time),
            }
            if include_files:
                row["files"] = _changed_paths(commit)
            # Add tags associated with this commit
            row["tags"] = commit_tags.get(str(commit.id), [])
            commits_data.append(row)

        # Sort by timestamp (oldest to newest)
        commits_data.sort(key=lambda x: x["timestamp"])

        return pd.DataFrame(commits_data)

    except (FileNotFoundError, ValueError):
        # Re-raise the specific errors produced above unchanged.
        raise
    except Exception as e:
        raise RuntimeError(f"Failed to retrieve commits from {repo_path}: {e}") from e
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def find_upstream_version_tag_commit(
|
|
511
|
+
commits: pd.DataFrame,
|
|
512
|
+
version: str,
|
|
513
|
+
) -> str | None:
|
|
514
|
+
"""Find the commit hash corresponding to the upstream version tag.
|
|
515
|
+
|
|
516
|
+
Args:
|
|
517
|
+
commits: DataFrame of commits with a 'tags' column.
|
|
518
|
+
version: The version string to search for (e.g., '1.2.3').
|
|
519
|
+
|
|
520
|
+
Returns:
|
|
521
|
+
The commit hash of the matching tag, or None if not found.
|
|
522
|
+
|
|
523
|
+
"""
|
|
524
|
+
version_tag_patterns = [
|
|
525
|
+
re.compile(rf"^v{re.escape(version)}$"), # v1.2.3
|
|
526
|
+
re.compile(rf"^{re.escape(version)}$"), # 1.2.3
|
|
527
|
+
re.compile(
|
|
528
|
+
rf"^release[-_]?{re.escape(version)}$"
|
|
529
|
+
), # release-1.2.3 or release_1.2.3
|
|
530
|
+
re.compile(
|
|
531
|
+
rf"^version[-_]?{re.escape(version)}$"
|
|
532
|
+
), # version-1.2.3 or version_1.2.3
|
|
533
|
+
]
|
|
534
|
+
|
|
535
|
+
for _, row in commits.iterrows():
|
|
536
|
+
tags = row.get("tags", [])
|
|
537
|
+
if not isinstance(tags, list):
|
|
538
|
+
continue
|
|
539
|
+
for tag in tags:
|
|
540
|
+
for pattern in version_tag_patterns:
|
|
541
|
+
if pattern.match(tag):
|
|
542
|
+
return str(row["hash"])
|
|
543
|
+
return None
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: osslag
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Technical Lag tools for Open Source Software Projects
|
|
5
|
+
Keywords: oss,open source,technical lag,software lag,software maintenance
|
|
6
|
+
Author: Shane Panter
|
|
7
|
+
Author-email: Shane Panter <shanepanter@boisestate.edu>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Intended Audience :: Science/Research
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
12
|
+
Requires-Dist: dotenv>=0.9.9
|
|
13
|
+
Requires-Dist: pandas>=3.0.0
|
|
14
|
+
Requires-Dist: pygit2>=1.19.1
|
|
15
|
+
Requires-Dist: pygithub>=2.8.1
|
|
16
|
+
Requires-Dist: python-dateutil>=2.9.0.post0
|
|
17
|
+
Requires-Dist: python-debian>=1.0.1
|
|
18
|
+
Requires-Dist: requests>=2.32.5
|
|
19
|
+
Requires-Dist: rich>=14.3.1
|
|
20
|
+
Requires-Dist: typer>=0.21.1
|
|
21
|
+
Requires-Python: >=3.14
|
|
22
|
+
Project-URL: Homepage, https://github.com/shanep/osslag
|
|
23
|
+
Project-URL: Issues, https://github.com/shanep/osslag/issues
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# OSS-Lag: Open Source Software Lag Dataset
|
|
27
|
+
|
|
28
|
+
This repository contains code to build a dataset measuring technical lag and
|
|
29
|
+
abandonment of open source packages across multiple Linux distributions.
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
You can install from PyPI using pip:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install osslag
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
Run the CLI tool with `--help` to see the available commands and options:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
osslag --help
|
|
46
|
+
```
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
osslag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
osslag/cli.py,sha256=APVFUBACfJ-PRs9pwvD359aaCRd4w6KCZcAjGdTWNoM,52658
|
|
3
|
+
osslag/distro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
osslag/distro/debian.py,sha256=sjodEtq8rboTkWfywn3wrbC-RiDmIURev7XvB9KJ_Ig,12867
|
|
5
|
+
osslag/distro/fedora.py,sha256=88b9gIMUaixCwsjQpX5oBK654tFLVI_8Bv3QwwdV3Yo,1174
|
|
6
|
+
osslag/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
osslag/metrics/malta.py,sha256=ToT4imGHXHkws-y9m9qUMn913OMakRF1qPMiXR-wGEI,19563
|
|
8
|
+
osslag/metrics/pvac.py,sha256=tfkMbYFS6Mk-pEocu6sMOIoniD5to0QA-p8OyXW2lyo,6136
|
|
9
|
+
osslag/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
osslag/utils/github_helper.py,sha256=vsWPoSqKEcNaa_1MdUJlWJhP4CCwW-Es4CyodOr7tUs,8752
|
|
11
|
+
osslag/utils/vcs.py,sha256=EMtHgZ6ftQVn-B9H0D6gUhWK-YjGBp05ArAymcH7V4w,19203
|
|
12
|
+
osslag-1.0.0.dist-info/WHEEL,sha256=fAguSjoiATBe7TNBkJwOjyL1Tt4wwiaQGtNtjRPNMQA,80
|
|
13
|
+
osslag-1.0.0.dist-info/entry_points.txt,sha256=-MrR4OE3-FnFkYTVyMYLbhwzpLBKgTtrDvCyvHUQqP8,44
|
|
14
|
+
osslag-1.0.0.dist-info/METADATA,sha256=o1c5-RahYQoECta_PLJlukYme-azqzKTuikE4DlJ5z8,1290
|
|
15
|
+
osslag-1.0.0.dist-info/RECORD,,
|