osslag 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- osslag/__init__.py +9 -0
- osslag/cli.py +100 -243
- osslag/distro/debian.py +13 -41
- osslag/distro/fedora.py +1 -3
- osslag/metrics/malta.py +412 -125
- osslag/metrics/pvac.py +2 -6
- osslag/utils/github_helper.py +7 -23
- osslag/utils/vcs.py +16 -49
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/METADATA +14 -8
- osslag-1.0.1.dist-info/RECORD +17 -0
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/WHEEL +2 -1
- {osslag-1.0.0.dist-info → osslag-1.0.1.dist-info}/entry_points.txt +0 -1
- osslag-1.0.1.dist-info/licenses/LICENSE +21 -0
- osslag-1.0.1.dist-info/top_level.txt +1 -0
- osslag-1.0.0.dist-info/RECORD +0 -15
osslag/metrics/malta.py
CHANGED
|
@@ -4,10 +4,13 @@ from __future__ import annotations
|
|
|
4
4
|
import math
|
|
5
5
|
from statistics import median
|
|
6
6
|
from datetime import datetime
|
|
7
|
-
from typing import
|
|
7
|
+
from typing import TYPE_CHECKING, Any, NamedTuple, Sequence
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from dateutil.relativedelta import relativedelta
|
|
10
10
|
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from tqdm import tqdm as TqdmType
|
|
13
|
+
|
|
11
14
|
|
|
12
15
|
class EvaluationWindow(NamedTuple):
|
|
13
16
|
"""Represents a time window for evaluation."""
|
|
@@ -78,11 +81,10 @@ class AggregateScoreComponents(NamedTuple):
|
|
|
78
81
|
"""Final maintenance score and its components"""
|
|
79
82
|
|
|
80
83
|
s_final: float # Final S_dev
|
|
84
|
+
s_final_100: float # in [0,100]
|
|
81
85
|
s_dev: float # Development activity score
|
|
82
86
|
s_resp: float # Responsiveness score
|
|
83
87
|
s_meta: float # Metadata score
|
|
84
|
-
s_final: float # in [0,1]
|
|
85
|
-
s_final_100: float # in [0,100]
|
|
86
88
|
|
|
87
89
|
|
|
88
90
|
class MaltaConstants(NamedTuple):
|
|
@@ -139,11 +141,11 @@ class Malta:
|
|
|
139
141
|
commits_df: pd.DataFrame,
|
|
140
142
|
pull_requests_df: pd.DataFrame,
|
|
141
143
|
repo_meta_df: pd.DataFrame,
|
|
142
|
-
malta_constants:
|
|
143
|
-
das_constants:
|
|
144
|
-
mrs_constants:
|
|
145
|
-
repo_meta_constants:
|
|
146
|
-
final_agg_constants:
|
|
144
|
+
malta_constants: MaltaConstants | None = None,
|
|
145
|
+
das_constants: DevelopmentActivityScoreConstants | None = None,
|
|
146
|
+
mrs_constants: MaintainerResponsivenessScoreConstants | None = None,
|
|
147
|
+
repo_meta_constants: RepoViabilityScoreConstants | None = None,
|
|
148
|
+
final_agg_constants: AggregateScoreConstants | None = None,
|
|
147
149
|
):
|
|
148
150
|
self.package = package
|
|
149
151
|
self.github_repo_url = github_repo_url
|
|
@@ -151,49 +153,33 @@ class Malta:
|
|
|
151
153
|
self.pull_requests_df = pull_requests_df
|
|
152
154
|
self.repo_meta_df = repo_meta_df
|
|
153
155
|
|
|
154
|
-
|
|
155
|
-
self.
|
|
156
|
-
self.
|
|
157
|
-
self.
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
self.
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
)
|
|
167
|
-
self.mrs_constants = (
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
else MaintainerResponsivenessScoreConstants()
|
|
171
|
-
)
|
|
172
|
-
self.rmv_constants = (
|
|
173
|
-
repo_meta_constants
|
|
174
|
-
if repo_meta_constants is not None
|
|
175
|
-
else RepoViabilityScoreConstants()
|
|
176
|
-
)
|
|
177
|
-
self.final_constants = (
|
|
178
|
-
final_agg_constants
|
|
179
|
-
if final_agg_constants is not None
|
|
180
|
-
else AggregateScoreConstants()
|
|
181
|
-
)
|
|
156
|
+
# Score components (populated by calling the respective methods)
|
|
157
|
+
self.das: DASComponents | None = None
|
|
158
|
+
self.mrs: MRSComponents | None = None
|
|
159
|
+
self.rmvs: RMVSComponents | None = None
|
|
160
|
+
self.final: AggregateScoreComponents | None = None
|
|
161
|
+
|
|
162
|
+
# Cached extracted data (lazily populated)
|
|
163
|
+
self._commits_cache: Sequence[Commit] | None = None
|
|
164
|
+
self._prs_cache: Sequence[PullRequest] | None = None
|
|
165
|
+
self._meta_cache: RepoMeta | None = None
|
|
166
|
+
|
|
167
|
+
self.malta_constants = malta_constants if malta_constants is not None else MaltaConstants()
|
|
168
|
+
self.das_constants = das_constants if das_constants is not None else DevelopmentActivityScoreConstants()
|
|
169
|
+
self.mrs_constants = mrs_constants if mrs_constants is not None else MaintainerResponsivenessScoreConstants()
|
|
170
|
+
self.rmv_constants = repo_meta_constants if repo_meta_constants is not None else RepoViabilityScoreConstants()
|
|
171
|
+
self.final_constants = final_agg_constants if final_agg_constants is not None else AggregateScoreConstants()
|
|
182
172
|
# Precompute evaluation and baseline windows
|
|
183
173
|
if eval_end.tzinfo is None:
|
|
184
174
|
raise ValueError("Datetime object must be timezone-aware")
|
|
185
|
-
eval_window_start = eval_end - relativedelta(
|
|
186
|
-
months=self.malta_constants.eval_months
|
|
187
|
-
)
|
|
175
|
+
eval_window_start = eval_end - relativedelta(months=self.malta_constants.eval_months)
|
|
188
176
|
# Evaluation window
|
|
189
177
|
self.eval_window = EvaluationWindow(
|
|
190
178
|
start=eval_window_start,
|
|
191
179
|
end=eval_end,
|
|
192
180
|
days=(eval_end - eval_window_start).days,
|
|
193
181
|
)
|
|
194
|
-
baseline_window_start = eval_window_start - relativedelta(
|
|
195
|
-
months=self.malta_constants.baseline_months
|
|
196
|
-
)
|
|
182
|
+
baseline_window_start = eval_window_start - relativedelta(months=self.malta_constants.baseline_months)
|
|
197
183
|
baseline_window_end = eval_window_start
|
|
198
184
|
# Baseline window
|
|
199
185
|
self.baseline_window = EvaluationWindow(
|
|
@@ -215,61 +201,82 @@ class Malta:
|
|
|
215
201
|
return min(1.0, math.log1p(x) / math.log1p(K))
|
|
216
202
|
|
|
217
203
|
def get_commits_for_package(self) -> Sequence[Commit]:
|
|
218
|
-
"""Extract commits for a given repository URL."""
|
|
204
|
+
"""Extract commits for a given repository URL. Results are cached."""
|
|
205
|
+
if self._commits_cache is not None:
|
|
206
|
+
return self._commits_cache
|
|
207
|
+
|
|
219
208
|
repo_url_column = self.malta_constants.repo_url_column
|
|
220
209
|
repo_dates_column = self.malta_constants.repo_dates_column
|
|
221
210
|
repo_is_trivial_column = self.malta_constants.repo_is_trivial_column
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
]
|
|
211
|
+
|
|
212
|
+
mask = self.commits_df[repo_url_column] == self.github_repo_url
|
|
213
|
+
repo_commits_df = self.commits_df.loc[mask]
|
|
225
214
|
if len(repo_commits_df) == 0:
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
)
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
]
|
|
235
|
-
return
|
|
215
|
+
self._commits_cache = []
|
|
216
|
+
return self._commits_cache
|
|
217
|
+
|
|
218
|
+
# Extract columns as arrays (vectorized datetime conversion)
|
|
219
|
+
dates = pd.to_datetime(repo_commits_df[repo_dates_column], utc=True).to_numpy()
|
|
220
|
+
trivials = repo_commits_df[repo_is_trivial_column].to_numpy()
|
|
221
|
+
|
|
222
|
+
# Build Commit objects from arrays (faster than iterrows/itertuples)
|
|
223
|
+
self._commits_cache = [Commit(date=d, is_trivial=t) for d, t in zip(dates, trivials)]
|
|
224
|
+
return self._commits_cache
|
|
236
225
|
|
|
237
226
|
def get_pull_requests_for_package(self) -> Sequence[PullRequest]:
|
|
238
|
-
"""Extract pull requests for a given repository URL."""
|
|
227
|
+
"""Extract pull requests for a given repository URL. Results are cached."""
|
|
228
|
+
if self._prs_cache is not None:
|
|
229
|
+
return self._prs_cache
|
|
230
|
+
|
|
239
231
|
repo_url_column = self.malta_constants.repo_url_column
|
|
240
232
|
pr_created_at_column = self.malta_constants.pr_created_at_column
|
|
241
233
|
pr_closed_at_column = self.malta_constants.pr_closed_at_column
|
|
242
234
|
pr_merged_at_column = self.malta_constants.pr_merged_at_column
|
|
243
235
|
pr_state_column = self.malta_constants.pr_state_column
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
]
|
|
236
|
+
|
|
237
|
+
mask = self.pull_requests_df[repo_url_column] == self.github_repo_url
|
|
238
|
+
repo_prs_df = self.pull_requests_df.loc[mask]
|
|
247
239
|
if len(repo_prs_df) == 0:
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
)
|
|
253
|
-
repo_prs_df[pr_closed_at_column] =
|
|
254
|
-
|
|
255
|
-
)
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
created_at=row[pr_created_at_column],
|
|
262
|
-
closed_at=row[pr_closed_at_column],
|
|
263
|
-
merged_at=row[pr_merged_at_column],
|
|
264
|
-
state=row[pr_state_column],
|
|
265
|
-
)
|
|
266
|
-
for _, row in repo_prs_df.iterrows()
|
|
240
|
+
self._prs_cache = []
|
|
241
|
+
return self._prs_cache
|
|
242
|
+
|
|
243
|
+
# Extract columns as arrays (vectorized datetime conversion)
|
|
244
|
+
created = pd.to_datetime(repo_prs_df[pr_created_at_column], utc=True).to_numpy()
|
|
245
|
+
closed = pd.to_datetime(repo_prs_df[pr_closed_at_column], utc=True).to_numpy()
|
|
246
|
+
merged = pd.to_datetime(repo_prs_df[pr_merged_at_column], utc=True).to_numpy()
|
|
247
|
+
states = repo_prs_df[pr_state_column].to_numpy()
|
|
248
|
+
|
|
249
|
+
# Build PullRequest objects from arrays
|
|
250
|
+
self._prs_cache = [
|
|
251
|
+
PullRequest(created_at=c, closed_at=cl, merged_at=m, state=s)
|
|
252
|
+
for c, cl, m, s in zip(created, closed, merged, states)
|
|
267
253
|
]
|
|
268
|
-
return
|
|
254
|
+
return self._prs_cache
|
|
255
|
+
|
|
256
|
+
def get_repo_meta_for_package(self) -> RepoMeta:
|
|
257
|
+
"""Extract repository metadata for a given repository URL. Results are cached."""
|
|
258
|
+
if self._meta_cache is not None:
|
|
259
|
+
return self._meta_cache
|
|
260
|
+
|
|
261
|
+
repo_url_column = self.malta_constants.repo_url_column
|
|
262
|
+
mask = self.repo_meta_df[repo_url_column] == self.github_repo_url
|
|
263
|
+
repo_meta_row = self.repo_meta_df.loc[mask]
|
|
264
|
+
if len(repo_meta_row) == 0:
|
|
265
|
+
self._meta_cache = RepoMeta()
|
|
266
|
+
return self._meta_cache
|
|
267
|
+
|
|
268
|
+
row = repo_meta_row.iloc[0]
|
|
269
|
+
self._meta_cache = RepoMeta(
|
|
270
|
+
stars=int(row.get("stars", 0)),
|
|
271
|
+
forks=int(row.get("forks", 0)),
|
|
272
|
+
watchers=int(row.get("watchers", 0)),
|
|
273
|
+
open_issues=int(row.get("open_issues", 0)),
|
|
274
|
+
archived=bool(row.get("archived", False)),
|
|
275
|
+
)
|
|
276
|
+
return self._meta_cache
|
|
269
277
|
|
|
270
278
|
def development_activity_score(
|
|
271
279
|
self,
|
|
272
|
-
commits: Sequence[Commit],
|
|
273
280
|
include_trivial: bool = False,
|
|
274
281
|
) -> DASComponents:
|
|
275
282
|
"""Computes S_dev in [0,1]:
|
|
@@ -280,10 +287,6 @@ class Malta:
|
|
|
280
287
|
|
|
281
288
|
Parameters
|
|
282
289
|
----------
|
|
283
|
-
commits : Sequence[Commit]
|
|
284
|
-
All commits in both baseline and evaluation windows.
|
|
285
|
-
tau_days : float
|
|
286
|
-
Decay half-life in days (default 180).
|
|
287
290
|
include_trivial : bool
|
|
288
291
|
If False, exclude trivial commits from counts and recency calculations.
|
|
289
292
|
|
|
@@ -294,21 +297,18 @@ class Malta:
|
|
|
294
297
|
|
|
295
298
|
Notes
|
|
296
299
|
-----
|
|
300
|
+
- Uses commits from self.commits_df via get_commits_for_package().
|
|
297
301
|
- If baseline rate is 0, we treat D_c as 1 when eval also has activity,
|
|
298
302
|
else 0. This avoids division-by-zero while remaining conservative.
|
|
299
303
|
- t_last is measured since most recent non-trivial commit (unless include_trivial=True).
|
|
300
304
|
|
|
301
305
|
"""
|
|
306
|
+
commits = self.get_commits_for_package()
|
|
302
307
|
# Partition commits into the baseline and evaluation sets, filtering trivial if needed.
|
|
303
308
|
commits_baseline: Sequence[Commit] = []
|
|
304
309
|
commits_eval: Sequence[Commit] = []
|
|
305
|
-
window_baseline_days: int = (
|
|
306
|
-
|
|
307
|
-
- self.baseline_window.start.toordinal()
|
|
308
|
-
)
|
|
309
|
-
window_eval_days: int = (
|
|
310
|
-
self.eval_window.end.toordinal() - self.eval_window.start.toordinal()
|
|
311
|
-
)
|
|
310
|
+
window_baseline_days: int = self.baseline_window.end.toordinal() - self.baseline_window.start.toordinal()
|
|
311
|
+
window_eval_days: int = self.eval_window.end.toordinal() - self.eval_window.start.toordinal()
|
|
312
312
|
|
|
313
313
|
for c in commits:
|
|
314
314
|
if self.baseline_window.start <= c.date < self.baseline_window.end:
|
|
@@ -319,9 +319,7 @@ class Malta:
|
|
|
319
319
|
if window_baseline_days <= 0 or window_eval_days <= 0:
|
|
320
320
|
raise ValueError("window_baseline_days and window_eval_days must be > 0")
|
|
321
321
|
if self.baseline_window.end > self.eval_window.start:
|
|
322
|
-
raise ValueError(
|
|
323
|
-
"Baseline window must end before evaluation window starts."
|
|
324
|
-
)
|
|
322
|
+
raise ValueError("Baseline window must end before evaluation window starts.")
|
|
325
323
|
if self.eval_window.start < self.baseline_window.end:
|
|
326
324
|
raise ValueError("Evaluation window must start after baseline window ends.")
|
|
327
325
|
|
|
@@ -351,12 +349,8 @@ class Malta:
|
|
|
351
349
|
if candidates:
|
|
352
350
|
last_commit_time = max(c.date for c in candidates)
|
|
353
351
|
if last_commit_time.tzinfo is None:
|
|
354
|
-
raise ValueError(
|
|
355
|
-
|
|
356
|
-
)
|
|
357
|
-
t_last = max(
|
|
358
|
-
0.0, (self.eval_window.end - last_commit_time).total_seconds() / 86400.0
|
|
359
|
-
)
|
|
352
|
+
raise ValueError("Commit.authored_at must be timezone-aware (UTC recommended).")
|
|
353
|
+
t_last = max(0.0, (self.eval_window.end - last_commit_time).total_seconds() / 86400.0)
|
|
360
354
|
R_c = math.exp(-t_last / self.das_constants.tau_days)
|
|
361
355
|
else:
|
|
362
356
|
# No commits at all -> fully inactive.
|
|
@@ -369,23 +363,23 @@ class Malta:
|
|
|
369
363
|
|
|
370
364
|
def maintainer_responsiveness_score(
|
|
371
365
|
self,
|
|
372
|
-
pull_requests: Sequence[PullRequest],
|
|
373
366
|
) -> MRSComponents:
|
|
374
367
|
"""Compute the PR-outcome-bound Maintainer Responsiveness Score (S_resp).
|
|
375
368
|
|
|
376
|
-
Parameters
|
|
377
|
-
----------
|
|
378
|
-
pull_requests : Sequence[PullRequest]
|
|
379
|
-
|
|
380
369
|
Returns
|
|
381
370
|
-------
|
|
382
371
|
MRSComponents
|
|
383
372
|
Components of the maintainer responsiveness score.
|
|
384
373
|
|
|
374
|
+
Notes
|
|
375
|
+
-----
|
|
376
|
+
- Uses pull requests from self.pull_requests_df via get_pull_requests_for_package().
|
|
377
|
+
|
|
385
378
|
"""
|
|
379
|
+
pull_requests = self.get_pull_requests_for_package()
|
|
386
380
|
if not pull_requests:
|
|
387
|
-
# No external contribution signal
|
|
388
|
-
|
|
381
|
+
# No external contribution signal - Sresp is undefined per paper
|
|
382
|
+
self.mrs = MRSComponents(
|
|
389
383
|
s_resp=0.0,
|
|
390
384
|
r_dec=0.0,
|
|
391
385
|
d_dec=0.0,
|
|
@@ -394,15 +388,13 @@ class Malta:
|
|
|
394
388
|
n_terminated=0,
|
|
395
389
|
n_open=0,
|
|
396
390
|
)
|
|
391
|
+
return self.mrs
|
|
392
|
+
|
|
397
393
|
# Filter PRs to those created within the evaluation window
|
|
398
|
-
P = [
|
|
399
|
-
pr
|
|
400
|
-
for pr in pull_requests
|
|
401
|
-
if self.eval_window.start <= pr.created_at < self.eval_window.end
|
|
402
|
-
]
|
|
394
|
+
P = [pr for pr in pull_requests if self.eval_window.start <= pr.created_at < self.eval_window.end]
|
|
403
395
|
if not P:
|
|
404
396
|
# No PRs in evaluation window
|
|
405
|
-
|
|
397
|
+
self.mrs = MRSComponents(
|
|
406
398
|
s_resp=0.0,
|
|
407
399
|
r_dec=0.0,
|
|
408
400
|
d_dec=0.0,
|
|
@@ -411,6 +403,8 @@ class Malta:
|
|
|
411
403
|
n_terminated=0,
|
|
412
404
|
n_open=0,
|
|
413
405
|
)
|
|
406
|
+
return self.mrs
|
|
407
|
+
|
|
414
408
|
# Partition PRs
|
|
415
409
|
P_term = []
|
|
416
410
|
P_open = []
|
|
@@ -423,11 +417,9 @@ class Malta:
|
|
|
423
417
|
else:
|
|
424
418
|
raise ValueError(f"Unknown PR state: {pr.state}")
|
|
425
419
|
|
|
426
|
-
# If PRs exist but none are
|
|
427
|
-
if P_term:
|
|
428
|
-
|
|
429
|
-
else:
|
|
430
|
-
return MRSComponents(
|
|
420
|
+
# If PRs exist but none are terminated, Sresp = 0 per paper
|
|
421
|
+
if not P_term:
|
|
422
|
+
self.mrs = MRSComponents(
|
|
431
423
|
s_resp=0.0,
|
|
432
424
|
r_dec=0.0,
|
|
433
425
|
d_dec=0.0,
|
|
@@ -436,11 +428,14 @@ class Malta:
|
|
|
436
428
|
n_terminated=0,
|
|
437
429
|
n_open=len(P_open),
|
|
438
430
|
)
|
|
431
|
+
return self.mrs
|
|
432
|
+
|
|
433
|
+
R_dec = len(P_term) / len(P)
|
|
439
434
|
|
|
440
435
|
# ---- Decision Timeliness (D_dec) ----
|
|
441
436
|
decision_delays = []
|
|
442
437
|
for pr in P_term:
|
|
443
|
-
closed_time = pr.merged_at
|
|
438
|
+
closed_time = pr.merged_at if pd.notna(pr.merged_at) else pr.closed_at
|
|
444
439
|
delta_days = (closed_time - pr.created_at).days
|
|
445
440
|
decision_delays.append(min(1.0, delta_days / self.mrs_constants.tref_days))
|
|
446
441
|
|
|
@@ -459,7 +454,7 @@ class Malta:
|
|
|
459
454
|
# ---- Responsiveness Aggregation ----
|
|
460
455
|
S_resp = R_dec * (1.0 - D_dec) * (1.0 - P_open_penalty)
|
|
461
456
|
|
|
462
|
-
self.
|
|
457
|
+
self.mrs = MRSComponents(
|
|
463
458
|
s_resp=self.__clamp(S_resp),
|
|
464
459
|
r_dec=R_dec,
|
|
465
460
|
d_dec=D_dec,
|
|
@@ -469,11 +464,10 @@ class Malta:
|
|
|
469
464
|
n_open=len(P_open),
|
|
470
465
|
)
|
|
471
466
|
|
|
472
|
-
return self.
|
|
467
|
+
return self.mrs
|
|
473
468
|
|
|
474
469
|
def repo_metadata_viability_score(
|
|
475
470
|
self,
|
|
476
|
-
meta: RepoMeta,
|
|
477
471
|
) -> RMVSComponents:
|
|
478
472
|
"""Compute repository metadata viability score S_meta.
|
|
479
473
|
|
|
@@ -487,7 +481,12 @@ class Malta:
|
|
|
487
481
|
Missing-data handling:
|
|
488
482
|
- If all counts are None: return None.
|
|
489
483
|
- If some counts are missing: renormalize betas over observed fields.
|
|
484
|
+
|
|
485
|
+
Notes
|
|
486
|
+
-----
|
|
487
|
+
- Uses repo metadata from self.repo_meta_df via __get_repo_meta_for_package().
|
|
490
488
|
"""
|
|
489
|
+
meta = self.get_repo_meta_for_package()
|
|
491
490
|
if not (0.0 <= self.rmv_constants.alpha_archived <= 1.0):
|
|
492
491
|
raise ValueError("alpha_archived must be in [0,1].")
|
|
493
492
|
betas = {
|
|
@@ -505,7 +504,7 @@ class Malta:
|
|
|
505
504
|
f = self.__phi_count(meta.forks, K)
|
|
506
505
|
w = self.__phi_count(meta.watchers, K)
|
|
507
506
|
i = self.__phi_count(meta.open_issues, K)
|
|
508
|
-
i_pen =
|
|
507
|
+
i_pen = 1.0 - i
|
|
509
508
|
|
|
510
509
|
parts = {"stars": s, "forks": f, "watchers": w, "issues": i_pen}
|
|
511
510
|
observed = {k: v for k, v in parts.items() if v is not None}
|
|
@@ -553,9 +552,21 @@ class Malta:
|
|
|
553
552
|
Archived override:
|
|
554
553
|
- If archived and S_resp is None: set S_resp = 0.0 (explicit cessation)
|
|
555
554
|
before aggregation (still renormalizes if S_meta is None).
|
|
555
|
+
|
|
556
|
+
Raises
|
|
557
|
+
------
|
|
558
|
+
ValueError
|
|
559
|
+
If component scores have not been computed yet.
|
|
556
560
|
"""
|
|
561
|
+
if self.das is None or self.mrs is None or self.rmvs is None:
|
|
562
|
+
raise ValueError(
|
|
563
|
+
"Component scores must be computed before final aggregation. "
|
|
564
|
+
"Call development_activity_score(), maintainer_responsiveness_score(), "
|
|
565
|
+
"and repo_metadata_viability_score() first."
|
|
566
|
+
)
|
|
567
|
+
|
|
557
568
|
s_dev = self.das.s_dev
|
|
558
|
-
s_resp = self.
|
|
569
|
+
s_resp = self.mrs.s_resp
|
|
559
570
|
s_meta = self.rmvs.s_meta
|
|
560
571
|
|
|
561
572
|
# If the repo is archived, treat as 0.0
|
|
@@ -583,3 +594,279 @@ class Malta:
|
|
|
583
594
|
)
|
|
584
595
|
|
|
585
596
|
return self.final
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
class MaltaResult(NamedTuple):
|
|
600
|
+
"""Result from scoring a single package with MALTA metrics."""
|
|
601
|
+
|
|
602
|
+
source: str
|
|
603
|
+
repo_url: str
|
|
604
|
+
# DAS components
|
|
605
|
+
das_score: float | None
|
|
606
|
+
das_dc: float | None
|
|
607
|
+
das_rc: float | None
|
|
608
|
+
# MRS components
|
|
609
|
+
mrs_score: float | None
|
|
610
|
+
mrs_rdec: float | None
|
|
611
|
+
mrs_ddec: float | None
|
|
612
|
+
mrs_popen: float | None
|
|
613
|
+
mrs_n_prs: int | None
|
|
614
|
+
mrs_n_terminated: int | None
|
|
615
|
+
mrs_n_open: int | None
|
|
616
|
+
# RMVS components
|
|
617
|
+
rmvs_score: float | None
|
|
618
|
+
rmvs_archived: bool | None
|
|
619
|
+
rmvs_stars_phi: float | None
|
|
620
|
+
rmvs_forks_phi: float | None
|
|
621
|
+
rmvs_issues_penalty: float | None
|
|
622
|
+
# Final score
|
|
623
|
+
final_score: float | None
|
|
624
|
+
final_score_100: float | None
|
|
625
|
+
# Counts
|
|
626
|
+
n_commits_total: int | None
|
|
627
|
+
n_commits_window: int | None
|
|
628
|
+
n_prs_total: int | None
|
|
629
|
+
n_prs_window: int | None
|
|
630
|
+
# Metadata
|
|
631
|
+
stars: int | None
|
|
632
|
+
forks: int | None
|
|
633
|
+
watchers: int | None
|
|
634
|
+
open_issues: int | None
|
|
635
|
+
archived: bool | None
|
|
636
|
+
# Error tracking
|
|
637
|
+
error: str | None
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def _score_single_repo(
|
|
641
|
+
source: str,
|
|
642
|
+
repo_url: str,
|
|
643
|
+
repo_commits_df: pd.DataFrame,
|
|
644
|
+
repo_prs_df: pd.DataFrame,
|
|
645
|
+
repo_meta_df: pd.DataFrame,
|
|
646
|
+
eval_end: datetime,
|
|
647
|
+
malta_constants: MaltaConstants | None,
|
|
648
|
+
das_constants: DevelopmentActivityScoreConstants | None,
|
|
649
|
+
mrs_constants: MaintainerResponsivenessScoreConstants | None,
|
|
650
|
+
repo_meta_constants: RepoViabilityScoreConstants | None,
|
|
651
|
+
final_agg_constants: AggregateScoreConstants | None,
|
|
652
|
+
) -> MaltaResult:
|
|
653
|
+
"""Score a single repository. Internal function used by score_repos."""
|
|
654
|
+
try:
|
|
655
|
+
m = Malta(
|
|
656
|
+
package=source,
|
|
657
|
+
github_repo_url=repo_url,
|
|
658
|
+
eval_end=eval_end,
|
|
659
|
+
commits_df=repo_commits_df,
|
|
660
|
+
pull_requests_df=repo_prs_df,
|
|
661
|
+
repo_meta_df=repo_meta_df,
|
|
662
|
+
malta_constants=malta_constants,
|
|
663
|
+
das_constants=das_constants,
|
|
664
|
+
mrs_constants=mrs_constants,
|
|
665
|
+
repo_meta_constants=repo_meta_constants,
|
|
666
|
+
final_agg_constants=final_agg_constants,
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
das = m.development_activity_score()
|
|
670
|
+
mrs = m.maintainer_responsiveness_score()
|
|
671
|
+
rmvs = m.repo_metadata_viability_score()
|
|
672
|
+
final = m.final_aggregation_score()
|
|
673
|
+
|
|
674
|
+
commits = m.get_commits_for_package()
|
|
675
|
+
prs = m.get_pull_requests_for_package()
|
|
676
|
+
meta = m.get_repo_meta_for_package()
|
|
677
|
+
|
|
678
|
+
return MaltaResult(
|
|
679
|
+
source=source,
|
|
680
|
+
repo_url=repo_url,
|
|
681
|
+
das_score=das.s_dev,
|
|
682
|
+
das_dc=das.d_c,
|
|
683
|
+
das_rc=das.r_c,
|
|
684
|
+
mrs_score=mrs.s_resp,
|
|
685
|
+
mrs_rdec=mrs.r_dec,
|
|
686
|
+
mrs_ddec=mrs.d_dec,
|
|
687
|
+
mrs_popen=mrs.p_open,
|
|
688
|
+
mrs_n_prs=mrs.n_prs,
|
|
689
|
+
mrs_n_terminated=mrs.n_terminated,
|
|
690
|
+
mrs_n_open=mrs.n_open,
|
|
691
|
+
rmvs_score=rmvs.s_meta,
|
|
692
|
+
rmvs_archived=rmvs.archived,
|
|
693
|
+
rmvs_stars_phi=rmvs.stars_phi,
|
|
694
|
+
rmvs_forks_phi=rmvs.forks_phi,
|
|
695
|
+
rmvs_issues_penalty=rmvs.open_issues_penalty,
|
|
696
|
+
final_score=final.s_final,
|
|
697
|
+
final_score_100=final.s_final_100,
|
|
698
|
+
n_commits_total=len(repo_commits_df),
|
|
699
|
+
n_commits_window=len(commits),
|
|
700
|
+
n_prs_total=len(repo_prs_df),
|
|
701
|
+
n_prs_window=len(prs),
|
|
702
|
+
stars=meta.stars,
|
|
703
|
+
forks=meta.forks,
|
|
704
|
+
watchers=meta.watchers,
|
|
705
|
+
open_issues=meta.open_issues,
|
|
706
|
+
archived=meta.archived,
|
|
707
|
+
error=None,
|
|
708
|
+
)
|
|
709
|
+
except Exception as e:
|
|
710
|
+
return MaltaResult(
|
|
711
|
+
source=source,
|
|
712
|
+
repo_url=repo_url,
|
|
713
|
+
das_score=None,
|
|
714
|
+
das_dc=None,
|
|
715
|
+
das_rc=None,
|
|
716
|
+
mrs_score=None,
|
|
717
|
+
mrs_rdec=None,
|
|
718
|
+
mrs_ddec=None,
|
|
719
|
+
mrs_popen=None,
|
|
720
|
+
mrs_n_prs=None,
|
|
721
|
+
mrs_n_terminated=None,
|
|
722
|
+
mrs_n_open=None,
|
|
723
|
+
rmvs_score=None,
|
|
724
|
+
rmvs_archived=None,
|
|
725
|
+
rmvs_stars_phi=None,
|
|
726
|
+
rmvs_forks_phi=None,
|
|
727
|
+
rmvs_issues_penalty=None,
|
|
728
|
+
final_score=None,
|
|
729
|
+
final_score_100=None,
|
|
730
|
+
n_commits_total=None,
|
|
731
|
+
n_commits_window=None,
|
|
732
|
+
n_prs_total=None,
|
|
733
|
+
n_prs_window=None,
|
|
734
|
+
stars=None,
|
|
735
|
+
forks=None,
|
|
736
|
+
watchers=None,
|
|
737
|
+
open_issues=None,
|
|
738
|
+
archived=None,
|
|
739
|
+
error=str(e),
|
|
740
|
+
)
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def score_repos(
|
|
744
|
+
packages: Sequence[tuple[str, str]],
|
|
745
|
+
commits_df: pd.DataFrame,
|
|
746
|
+
pull_requests_df: pd.DataFrame,
|
|
747
|
+
repo_meta_df: pd.DataFrame,
|
|
748
|
+
eval_end: datetime,
|
|
749
|
+
n_workers: int | None = None,
|
|
750
|
+
malta_constants: MaltaConstants | None = None,
|
|
751
|
+
das_constants: DevelopmentActivityScoreConstants | None = None,
|
|
752
|
+
mrs_constants: MaintainerResponsivenessScoreConstants | None = None,
|
|
753
|
+
repo_meta_constants: RepoViabilityScoreConstants | None = None,
|
|
754
|
+
final_agg_constants: AggregateScoreConstants | None = None,
|
|
755
|
+
show_progress: bool = True,
|
|
756
|
+
) -> pd.DataFrame:
|
|
757
|
+
"""Score multiple repositories concurrently using MALTA metrics.
|
|
758
|
+
|
|
759
|
+
Parameters
|
|
760
|
+
----------
|
|
761
|
+
packages : Sequence[tuple[str, str]]
|
|
762
|
+
List of (source, repo_url) tuples identifying packages to score.
|
|
763
|
+
commits_df : pd.DataFrame
|
|
764
|
+
DataFrame containing commit data with 'repo_url' column.
|
|
765
|
+
pull_requests_df : pd.DataFrame
|
|
766
|
+
DataFrame containing PR data with 'repo_url' column.
|
|
767
|
+
repo_meta_df : pd.DataFrame
|
|
768
|
+
DataFrame containing repository metadata with 'repo_url' column.
|
|
769
|
+
eval_end : datetime
|
|
770
|
+
End of evaluation window (must be timezone-aware).
|
|
771
|
+
n_workers : int | None
|
|
772
|
+
Number of worker processes. None for auto (CPU count).
|
|
773
|
+
malta_constants : MaltaConstants | None
|
|
774
|
+
Custom MALTA constants.
|
|
775
|
+
das_constants : DevelopmentActivityScoreConstants | None
|
|
776
|
+
Custom DAS constants.
|
|
777
|
+
mrs_constants : MaintainerResponsivenessScoreConstants | None
|
|
778
|
+
Custom MRS constants.
|
|
779
|
+
repo_meta_constants : RepoViabilityScoreConstants | None
|
|
780
|
+
Custom RMVS constants.
|
|
781
|
+
final_agg_constants : AggregateScoreConstants | None
|
|
782
|
+
Custom aggregation constants.
|
|
783
|
+
show_progress : bool
|
|
784
|
+
Whether to show a progress bar (requires tqdm).
|
|
785
|
+
|
|
786
|
+
Returns
|
|
787
|
+
-------
|
|
788
|
+
pd.DataFrame
|
|
789
|
+
DataFrame with MALTA scores for each package. Columns include:
|
|
790
|
+
- source, repo_url: Package identifiers
|
|
791
|
+
- das_score, das_dc, das_rc: Development Activity Score components
|
|
792
|
+
- mrs_score, mrs_rdec, mrs_ddec, mrs_popen, mrs_n_*: MRS components
|
|
793
|
+
- rmvs_score, rmvs_archived, rmvs_*_phi, rmvs_issues_penalty: RMVS components
|
|
794
|
+
- final_score, final_score_100: Aggregated scores
|
|
795
|
+
- n_commits_total, n_commits_window, n_prs_total, n_prs_window: Counts
|
|
796
|
+
- stars, forks, watchers, open_issues, archived: Repository metadata
|
|
797
|
+
- error: Error message if scoring failed
|
|
798
|
+
|
|
799
|
+
Example
|
|
800
|
+
-------
|
|
801
|
+
>>> packages = [("pkg1", "https://github.com/owner/repo1"), ...]
|
|
802
|
+
>>> results_df = score_repos(
|
|
803
|
+
... packages=packages,
|
|
804
|
+
... commits_df=commits_df,
|
|
805
|
+
... pull_requests_df=prs_df,
|
|
806
|
+
... repo_meta_df=meta_df,
|
|
807
|
+
... eval_end=datetime(2026, 1, 1, tzinfo=timezone.utc),
|
|
808
|
+
... n_workers=8,
|
|
809
|
+
... )
|
|
810
|
+
"""
|
|
811
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
812
|
+
import os
|
|
813
|
+
|
|
814
|
+
if n_workers is None:
|
|
815
|
+
n_workers = os.cpu_count() or 4
|
|
816
|
+
|
|
817
|
+
# Get column name for repo_url (default or from constants)
|
|
818
|
+
repo_url_col = (malta_constants or MaltaConstants()).repo_url_column
|
|
819
|
+
|
|
820
|
+
# Pre-group DataFrames by repo_url for efficient lookup
|
|
821
|
+
commits_grouped = {url: group for url, group in commits_df.groupby(repo_url_col)}
|
|
822
|
+
prs_grouped = {url: group for url, group in pull_requests_df.groupby(repo_url_col)}
|
|
823
|
+
meta_grouped = {url: group for url, group in repo_meta_df.groupby(repo_url_col)}
|
|
824
|
+
|
|
825
|
+
# Empty DataFrames for repos with no data
|
|
826
|
+
empty_commits = commits_df.iloc[:0]
|
|
827
|
+
empty_prs = pull_requests_df.iloc[:0]
|
|
828
|
+
empty_meta = repo_meta_df.iloc[:0]
|
|
829
|
+
|
|
830
|
+
# Prepare work items
|
|
831
|
+
work_items = []
|
|
832
|
+
for source, repo_url in packages:
|
|
833
|
+
work_items.append((
|
|
834
|
+
source,
|
|
835
|
+
repo_url,
|
|
836
|
+
commits_grouped.get(repo_url, empty_commits),
|
|
837
|
+
prs_grouped.get(repo_url, empty_prs),
|
|
838
|
+
meta_grouped.get(repo_url, empty_meta),
|
|
839
|
+
eval_end,
|
|
840
|
+
malta_constants,
|
|
841
|
+
das_constants,
|
|
842
|
+
mrs_constants,
|
|
843
|
+
repo_meta_constants,
|
|
844
|
+
final_agg_constants,
|
|
845
|
+
))
|
|
846
|
+
|
|
847
|
+
results: list[MaltaResult] = []
|
|
848
|
+
|
|
849
|
+
# Set up progress bar if requested
|
|
850
|
+
progress: Any = None
|
|
851
|
+
if show_progress:
|
|
852
|
+
try:
|
|
853
|
+
from tqdm import tqdm
|
|
854
|
+
progress = tqdm(total=len(work_items), desc="Scoring repos")
|
|
855
|
+
except ImportError:
|
|
856
|
+
show_progress = False
|
|
857
|
+
|
|
858
|
+
# Process in parallel
|
|
859
|
+
with ProcessPoolExecutor(max_workers=n_workers) as executor:
|
|
860
|
+
futures = {executor.submit(_score_single_repo, *item): i for i, item in enumerate(work_items)}
|
|
861
|
+
|
|
862
|
+
for future in as_completed(futures):
|
|
863
|
+
result = future.result()
|
|
864
|
+
results.append(result)
|
|
865
|
+
if progress:
|
|
866
|
+
progress.update(1)
|
|
867
|
+
|
|
868
|
+
if progress:
|
|
869
|
+
progress.close()
|
|
870
|
+
|
|
871
|
+
# Convert to DataFrame
|
|
872
|
+
return pd.DataFrame([r._asdict() for r in results])
|