osslag 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,585 @@
1
+ # Maintenance-Aware Lag and Technical Abandonment (MALTA) metrics
2
+ from __future__ import annotations
3
+
4
+ import math
5
+ from statistics import median
6
+ from datetime import datetime
7
+ from typing import NamedTuple, Optional, Sequence
8
+ import pandas as pd
9
+ from dateutil.relativedelta import relativedelta
10
+
11
+
12
class EvaluationWindow(NamedTuple):
    """A half-open time window [start, end) used for metric evaluation."""

    start: datetime  # window start (inclusive)
    end: datetime  # window end (exclusive)
    days: int  # precomputed (end - start) in whole days
18
+
19
+
20
class Commit(NamedTuple):
    """Minimal commit record used for activity scoring."""

    date: datetime  # commit timestamp; expected timezone-aware (UTC)
    is_trivial: bool = False  # True if the commit should be excluded as noise
25
+
26
+
27
class PullRequest(NamedTuple):
    """Minimal pull request record used for responsiveness scoring."""

    created_at: datetime  # PR creation time; expected timezone-aware (UTC)
    closed_at: datetime | None  # close time, or None if still open
    merged_at: datetime | None  # merge time, or None if not merged
    state: str  # 'open' or 'closed' (merged PRs count as 'closed')
34
+
35
+
36
class RepoMeta(NamedTuple):
    """Repository metadata snapshot used for viability scoring."""

    stars: int = 0  # stargazer count
    forks: int = 0  # fork count
    watchers: int = 0  # watcher/subscriber count
    open_issues: int = 0  # currently open issues
    archived: bool = False  # True if the repository is archived (read-only)
44
+
45
+
46
class DASComponents(NamedTuple):
    """Development Activity Score and its components."""

    s_dev: float  # development activity score in [0, 1]
    d_c: float  # decay ratio: eval commit rate / baseline commit rate
    r_c: float  # recency factor: exp(-t_last / tau)
52
+
53
+
54
class MRSComponents(NamedTuple):
    """Maintenance Responsiveness Score and its components."""

    s_resp: float  # responsiveness score in [0, 1]
    r_dec: float  # decision rate: decided PRs / all PRs in window
    d_dec: float  # decision delay (normalized median, in [0, 1])
    p_open: float  # open-PR staleness penalty (normalized median age)
    n_prs: int  # total PRs created in the evaluation window
    n_terminated: int  # closed/merged PRs
    n_open: int  # still-open PRs
64
+
65
+
66
class RMVSComponents(NamedTuple):
    """Repository Metadata Viability Score and its components."""

    s_meta: float  # metadata viability score in [0, 1]
    stars_phi: float  # log-normalized stars, phi(stars)
    forks_phi: float  # log-normalized forks, phi(forks)
    watchers_phi: float  # log-normalized watchers, phi(watchers)
    open_issues_penalty: float  # open-issues penalty term
    archived: bool  # archived status (drives the archived penalty)
75
+
76
+
77
class AggregateScoreComponents(NamedTuple):
    """Final maintenance score and its components.

    Note: an earlier revision declared ``s_final`` twice; duplicate
    annotations silently collapse in a NamedTuple, so only one field
    existed anyway. The duplicate has been removed.
    """

    s_final: float  # aggregated maintenance score, in [0, 1]
    s_dev: float  # development activity score
    s_resp: float  # maintainer responsiveness score
    s_meta: float  # repository metadata viability score
    s_final_100: float  # s_final rescaled to [0, 100]
86
+
87
+
88
class MaltaConstants(NamedTuple):
    """Constants for MALTA metric computations."""

    # Window lengths in calendar months.
    eval_months: int = 18  # length of the evaluation window
    baseline_months: int = 24  # length of the baseline window before it
    # Column names expected in the input dataframes.
    repo_url_column: str = "repo_url"
    repo_dates_column: str = "date"
    repo_is_trivial_column: str = "is_trivial"
    pr_created_at_column: str = "created_at"
    pr_closed_at_column: str = "closed_at"
    pr_merged_at_column: str = "merged_at"
    pr_state_column: str = "state"
100
+
101
+
102
class DevelopmentActivityScoreConstants(NamedTuple):
    """Constants for development activity score computation."""

    tau_days: float = 180.0  # exponential-decay time constant for recency, in days
106
+
107
+
108
class MaintainerResponsivenessScoreConstants(NamedTuple):
    """Constants for maintainer responsiveness score computation."""

    tref_days: int = 180  # reference horizon for PR decision timeliness, in days
112
+
113
+
114
class RepoViabilityScoreConstants(NamedTuple):
    """Constants for repository viability score computation."""

    K: int = 10_000  # saturation point for log-normalized counts (phi)
    alpha_archived: float = 0.7  # penalty factor applied when repo is archived
    # Per-signal weights; validated elsewhere to sum to 1.0.
    beta_stars: float = 0.25
    beta_forks: float = 0.25
    beta_watchers: float = 0.25
    beta_issues: float = 0.25
123
+
124
+
125
class AggregateScoreConstants(NamedTuple):
    """Weights for final maintenance score aggregation (sum to 1.0)."""

    w_dev: float = 0.55  # weight of the development activity score
    w_resp: float = 0.35  # weight of the maintainer responsiveness score
    w_meta: float = 0.10  # weight of the metadata viability score
131
+
132
+
133
class Malta:
    """Maintenance-Aware Lag and Technical Abandonment (MALTA) scorer.

    Computes, for a single repository:

    * ``S_dev``  -- development activity (commit-velocity decay x recency),
    * ``S_resp`` -- maintainer responsiveness (PR decision rate, decision
      timeliness, open-PR staleness),
    * ``S_meta`` -- repository metadata viability (stars/forks/watchers/
      issues with an archived penalty),

    then aggregates them into a final maintenance score in [0, 1].

    Usage: call the three score methods (which populate ``self.das``,
    ``self.mrs`` and ``self.rmvs``) before ``final_aggregation_score``.
    """

    def __init__(
        self,
        package: str,
        github_repo_url: str,
        eval_end: datetime,
        commits_df: pd.DataFrame,
        pull_requests_df: pd.DataFrame,
        repo_meta_df: pd.DataFrame,
        malta_constants: Optional[MaltaConstants] = None,
        das_constants: Optional[DevelopmentActivityScoreConstants] = None,
        mrs_constants: Optional[MaintainerResponsivenessScoreConstants] = None,
        repo_meta_constants: Optional[RepoViabilityScoreConstants] = None,
        final_agg_constants: Optional[AggregateScoreConstants] = None,
    ):
        """Store inputs, resolve constants, and precompute the two windows.

        Parameters
        ----------
        package : str
            Package name (informational).
        github_repo_url : str
            Repository URL used to select rows in the dataframes.
        eval_end : datetime
            End of the evaluation window; must be timezone-aware.
        commits_df, pull_requests_df, repo_meta_df : pd.DataFrame
            Raw commit / PR / metadata tables (may cover many repos).
        malta_constants, das_constants, mrs_constants, repo_meta_constants,
        final_agg_constants : optional
            Overrides for the default tuning constants.

        Raises
        ------
        ValueError
            If ``eval_end`` is naive (not timezone-aware).
        """
        self.package = package
        self.github_repo_url = github_repo_url
        self.commits_df = commits_df
        self.pull_requests_df = pull_requests_df
        self.repo_meta_df = repo_meta_df

        # Populated later by the corresponding score methods.
        self.das: DASComponents
        self.mrs: MRSComponents
        self.rmvs: RMVSComponents
        self.final: AggregateScoreComponents

        self.malta_constants = (
            malta_constants if malta_constants is not None else MaltaConstants()
        )
        self.das_constants = (
            das_constants
            if das_constants is not None
            else DevelopmentActivityScoreConstants()
        )
        self.mrs_constants = (
            mrs_constants
            if mrs_constants is not None
            else MaintainerResponsivenessScoreConstants()
        )
        self.rmv_constants = (
            repo_meta_constants
            if repo_meta_constants is not None
            else RepoViabilityScoreConstants()
        )
        self.final_constants = (
            final_agg_constants
            if final_agg_constants is not None
            else AggregateScoreConstants()
        )

        # Precompute evaluation and baseline windows.
        if eval_end.tzinfo is None:
            raise ValueError("Datetime object must be timezone-aware")
        eval_window_start = eval_end - relativedelta(
            months=self.malta_constants.eval_months
        )
        # Evaluation window: [eval_window_start, eval_end)
        self.eval_window = EvaluationWindow(
            start=eval_window_start,
            end=eval_end,
            days=(eval_end - eval_window_start).days,
        )
        # Baseline window ends exactly where the evaluation window starts.
        baseline_window_start = eval_window_start - relativedelta(
            months=self.malta_constants.baseline_months
        )
        baseline_window_end = eval_window_start
        self.baseline_window = EvaluationWindow(
            start=baseline_window_start,
            end=baseline_window_end,
            days=(baseline_window_end - baseline_window_start).days,
        )

    @staticmethod
    def __clamp(x: float, lo: float = 0.0, hi: float = 1.0) -> float:
        """Clamp ``x`` into the closed interval [lo, hi]."""
        return max(lo, min(hi, x))

    @staticmethod
    def __phi_count(x: int, K: int) -> float:
        """Log-saturating normalization: phi(x) = min(1, log(1+x)/log(1+K)).

        Negative counts are treated as 0; raises ValueError if ``K <= 0``.
        """
        x = max(0, x)
        if K <= 0:
            raise ValueError("K must be positive.")
        return min(1.0, math.log1p(x) / math.log1p(K))

    def get_commits_for_package(self) -> Sequence[Commit]:
        """Extract this repository's commits as ``Commit`` records.

        Filters ``commits_df`` by ``github_repo_url`` and coerces the date
        column to timezone-aware UTC. Returns an empty list when the repo
        has no rows.
        """
        repo_url_column = self.malta_constants.repo_url_column
        repo_dates_column = self.malta_constants.repo_dates_column
        repo_is_trivial_column = self.malta_constants.repo_is_trivial_column
        repo_commits_df = self.commits_df[
            self.commits_df[repo_url_column] == self.github_repo_url
        ].copy()
        if len(repo_commits_df) == 0:
            return []
        # Ensure datetime is timezone-aware UTC.
        repo_commits_df[repo_dates_column] = pd.to_datetime(
            repo_commits_df[repo_dates_column], utc=True
        )
        commits = [
            Commit(date=row[repo_dates_column], is_trivial=row[repo_is_trivial_column])
            for _, row in repo_commits_df.iterrows()
        ]
        return commits

    def get_pull_requests_for_package(self) -> Sequence[PullRequest]:
        """Extract this repository's PRs as ``PullRequest`` records.

        Filters ``pull_requests_df`` by ``github_repo_url`` and coerces the
        created/closed/merged columns to timezone-aware UTC. Returns an
        empty list when the repo has no rows.
        """
        repo_url_column = self.malta_constants.repo_url_column
        pr_created_at_column = self.malta_constants.pr_created_at_column
        pr_closed_at_column = self.malta_constants.pr_closed_at_column
        pr_merged_at_column = self.malta_constants.pr_merged_at_column
        pr_state_column = self.malta_constants.pr_state_column
        repo_prs_df = self.pull_requests_df[
            self.pull_requests_df[repo_url_column] == self.github_repo_url
        ].copy()
        if len(repo_prs_df) == 0:
            return []
        # Ensure datetime is timezone-aware UTC.
        repo_prs_df[pr_created_at_column] = pd.to_datetime(
            repo_prs_df[pr_created_at_column], utc=True
        )
        repo_prs_df[pr_closed_at_column] = pd.to_datetime(
            repo_prs_df[pr_closed_at_column], utc=True
        )
        repo_prs_df[pr_merged_at_column] = pd.to_datetime(
            repo_prs_df[pr_merged_at_column], utc=True
        )
        prs = [
            PullRequest(
                created_at=row[pr_created_at_column],
                closed_at=row[pr_closed_at_column],
                merged_at=row[pr_merged_at_column],
                state=row[pr_state_column],
            )
            for _, row in repo_prs_df.iterrows()
        ]
        return prs

    def development_activity_score(
        self,
        commits: Sequence[Commit],
        include_trivial: bool = False,
    ) -> DASComponents:
        """Compute S_dev in [0,1]:

            D_c   = (C_e / |W_e|) / (C_b / |W_b|)
            R_c   = exp(-t_last / tau)
            S_dev = min(1, D_c) * R_c

        Parameters
        ----------
        commits : Sequence[Commit]
            All commits in both baseline and evaluation windows.
        include_trivial : bool
            If False, exclude trivial commits from counts and recency
            calculations.

        Returns
        -------
        DASComponents
            Components of the development activity score (also stored on
            ``self.das``).

        Notes
        -----
        - The decay half-life tau is taken from ``self.das_constants.tau_days``.
        - If baseline rate is 0, we treat D_c as 1 when eval also has activity,
          else 0. This avoids division-by-zero while remaining conservative.
        - t_last is measured since most recent non-trivial commit (unless
          include_trivial=True).
        """
        # Window lengths in whole days (date-level resolution via toordinal).
        window_baseline_days: int = (
            self.baseline_window.end.toordinal()
            - self.baseline_window.start.toordinal()
        )
        window_eval_days: int = (
            self.eval_window.end.toordinal() - self.eval_window.start.toordinal()
        )
        # Validate the windows up front, before partitioning commits.
        if window_baseline_days <= 0 or window_eval_days <= 0:
            raise ValueError("window_baseline_days and window_eval_days must be > 0")
        if self.baseline_window.end > self.eval_window.start:
            # (A previous revision tested this same condition twice.)
            raise ValueError(
                "Baseline window must end before evaluation window starts."
            )

        # Partition commits into the baseline and evaluation sets (half-open
        # windows), filtering trivial commits later if requested.
        commits_baseline: list[Commit] = []
        commits_eval: list[Commit] = []
        for c in commits:
            if self.baseline_window.start <= c.date < self.baseline_window.end:
                commits_baseline.append(c)
            elif self.eval_window.start <= c.date < self.eval_window.end:
                commits_eval.append(c)

        def _filter(cs: Sequence[Commit]) -> list[Commit]:
            # Drop trivial commits unless explicitly included.
            if include_trivial:
                return list(cs)
            return [c for c in cs if not c.is_trivial]

        b = _filter(commits_baseline)
        e = _filter(commits_eval)

        C_b = len(b)
        C_e = len(e)

        # Commit rates per day.
        rate_b = C_b / float(window_baseline_days)
        rate_e = C_e / float(window_eval_days)

        # Velocity decay with careful handling when baseline has no commits.
        if rate_b == 0.0:
            D_c = 1.0 if rate_e > 0.0 else 0.0
        else:
            D_c = rate_e / rate_b

        # Recency term based on last commit in eval, else fallback to baseline.
        candidates = e if e else b
        if candidates:
            last_commit_time = max(c.date for c in candidates)
            if last_commit_time.tzinfo is None:
                raise ValueError(
                    "Commit.authored_at must be timezone-aware (UTC recommended)."
                )
            t_last = max(
                0.0, (self.eval_window.end - last_commit_time).total_seconds() / 86400.0
            )
            R_c = math.exp(-t_last / self.das_constants.tau_days)
        else:
            # No commits at all -> fully inactive.
            R_c = 0.0

        S_dev = self.__clamp(min(1.0, D_c) * R_c)

        self.das = DASComponents(d_c=D_c, r_c=R_c, s_dev=S_dev)
        return self.das

    def maintainer_responsiveness_score(
        self,
        pull_requests: Sequence[PullRequest],
    ) -> MRSComponents:
        """Compute the PR-outcome-bound Maintainer Responsiveness Score.

            S_resp = R_dec * (1 - D_dec) * (1 - P_open)

        Parameters
        ----------
        pull_requests : Sequence[PullRequest]
            Candidate PRs; only those created inside the evaluation window
            are considered.

        Returns
        -------
        MRSComponents
            Components of the maintainer responsiveness score.

        Notes
        -----
        The result is ALWAYS stored on ``self.mrs`` -- including the
        degenerate no-PR cases -- so ``final_aggregation_score`` can run
        afterwards. (A previous revision returned early without storing,
        which made the final aggregation raise AttributeError.)
        ``self.mrsc`` is kept as a backward-compatible alias.
        """
        components = self.__compute_mrs(pull_requests)
        self.mrs = components
        self.mrsc = components  # backward-compatible alias
        return components

    def __compute_mrs(self, pull_requests: Sequence[PullRequest]) -> MRSComponents:
        """Actual MRS computation; helper for maintainer_responsiveness_score."""
        if not pull_requests:
            # No external contribution signal.
            return MRSComponents(
                s_resp=0.0,
                r_dec=0.0,
                d_dec=0.0,
                p_open=0.0,
                n_prs=0,
                n_terminated=0,
                n_open=0,
            )
        # Filter PRs to those created within the evaluation window.
        P = [
            pr
            for pr in pull_requests
            if self.eval_window.start <= pr.created_at < self.eval_window.end
        ]
        if not P:
            # No PRs in evaluation window.
            return MRSComponents(
                s_resp=0.0,
                r_dec=0.0,
                d_dec=0.0,
                p_open=0.0,
                n_prs=0,
                n_terminated=0,
                n_open=0,
            )
        # Partition PRs into decided (closed/merged) vs. still open.
        P_term: list[PullRequest] = []
        P_open: list[PullRequest] = []
        for pr in P:
            if pr.state == "closed":
                P_term.append(pr)
            elif pr.state == "open":
                P_open.append(pr)
            else:
                raise ValueError(f"Unknown PR state: {pr.state}")

        # If PRs exist but none were ever decided, score is 0.
        if not P_term:
            return MRSComponents(
                s_resp=0.0,
                r_dec=0.0,
                d_dec=0.0,
                p_open=0.0,
                n_prs=len(P),
                n_terminated=0,
                n_open=len(P_open),
            )
        R_dec = len(P_term) / len(P)

        # ---- Decision Timeliness (D_dec): normalized median delay ----
        decision_delays = []
        for pr in P_term:
            # Prefer merge time; fall back to close time for unmerged PRs.
            closed_time = pr.merged_at or pr.closed_at
            delta_days = (closed_time - pr.created_at).days
            # Clamp at 0 to guard against closed-before-created data errors.
            decision_delays.append(
                min(1.0, max(0.0, delta_days / self.mrs_constants.tref_days))
            )
        D_dec = median(decision_delays)

        # ---- Open PR Staleness Penalty (P_open): normalized median age ----
        if P_open:
            open_ages = [
                min(
                    1.0,
                    (self.eval_window.end - pr.created_at).days
                    / self.mrs_constants.tref_days,
                )
                for pr in P_open
            ]
            P_open_penalty = median(open_ages)
        else:
            P_open_penalty = 0.0

        # ---- Responsiveness Aggregation ----
        S_resp = R_dec * (1.0 - D_dec) * (1.0 - P_open_penalty)

        return MRSComponents(
            s_resp=self.__clamp(S_resp),
            r_dec=R_dec,
            d_dec=D_dec,
            p_open=P_open_penalty,
            n_prs=len(P),
            n_terminated=len(P_term),
            n_open=len(P_open),
        )

    def repo_metadata_viability_score(
        self,
        meta: RepoMeta,
    ) -> RMVSComponents:
        """Compute repository metadata viability score S_meta.

            phi(x) = min(1, log(1+x)/log(1+K))
            S* = phi(stars), F* = phi(forks), W* = phi(watchers), I* = phi(open_issues)
            I_pen = 1 - I*
            A_pen = 1 - alpha * A   (A = 1 if archived else 0)

            S_meta = A_pen * (beta_s*S* + beta_f*F* + beta_w*W* + beta_i*I_pen)

        Missing-data handling:
        - If some counts are missing, betas are renormalized over the
          observed fields.
        - NOTE(review): with the current ``RepoMeta`` defaults the counts are
          always ints, so the None-filtering below never removes anything --
          it is kept for forward compatibility with optional fields.

        Raises
        ------
        ValueError
            If ``alpha_archived`` is outside [0, 1] or the beta weights do
            not sum to 1.0.
        """
        if not (0.0 <= self.rmv_constants.alpha_archived <= 1.0):
            raise ValueError("alpha_archived must be in [0,1].")
        betas = {
            "stars": self.rmv_constants.beta_stars,
            "forks": self.rmv_constants.beta_forks,
            "watchers": self.rmv_constants.beta_watchers,
            "issues": self.rmv_constants.beta_issues,
        }
        beta_sum = sum(betas.values())
        if abs(beta_sum - 1.0) > 1e-9:
            # Keep strict for reproducibility.
            raise ValueError("beta weights must sum to 1.0 exactly.")
        K = self.rmv_constants.K
        s = self.__phi_count(meta.stars, K)
        f = self.__phi_count(meta.forks, K)
        w = self.__phi_count(meta.watchers, K)
        i = self.__phi_count(meta.open_issues, K)
        # NOTE(review): zero open issues yields i_pen = 0 rather than
        # 1 - phi(0) = 1, i.e. no credit for an empty issue tracker --
        # presumably intentional (no issue traffic ~ no usage signal);
        # confirm against the metric definition.
        i_pen = 0 if i == 0 else (1.0 - i)

        parts = {"stars": s, "forks": f, "watchers": w, "issues": i_pen}
        observed = {k: v for k, v in parts.items() if v is not None}

        if not observed:
            # All fields missing.
            self.rmvs = RMVSComponents(
                s_meta=0.0,
                stars_phi=s,
                forks_phi=f,
                watchers_phi=w,
                open_issues_penalty=i_pen,
                archived=meta.archived,
            )
            return self.rmvs

        # Renormalize betas over observed fields.
        wsum = sum(betas[k] for k in observed.keys())
        linear = sum((betas[k] / wsum) * observed[k] for k in observed.keys())

        A = 1.0 if meta.archived else 0.0
        A_pen = 1.0 - self.rmv_constants.alpha_archived * A
        self.rmvs = RMVSComponents(
            s_meta=self.__clamp(A_pen * linear),
            stars_phi=s,
            forks_phi=f,
            watchers_phi=w,
            open_issues_penalty=i_pen,
            archived=meta.archived,
        )

        return self.rmvs

    def final_aggregation_score(
        self,
    ) -> AggregateScoreComponents:
        """Aggregate S_dev, S_resp, S_meta into S_final.

        Requires ``development_activity_score``,
        ``maintainer_responsiveness_score`` and
        ``repo_metadata_viability_score`` to have been called first (they
        populate ``self.das``, ``self.mrs`` and ``self.rmvs``).

        Base:
            S_final = w_dev*S_dev + w_resp*S_resp + w_meta*S_meta

        Missing components (None) are dropped and the remaining weights are
        renormalized. If the repository is archived, S_resp is forced to
        0.0 (explicit cessation of maintenance) before aggregation.

        Raises
        ------
        ValueError
            If the sum of active weights is not positive.
        """
        s_dev = self.das.s_dev
        s_resp = self.mrs.s_resp
        s_meta = self.rmvs.s_meta

        # Archived repos: treat responsiveness as fully ceased.
        if self.rmvs.archived:
            s_resp = 0.0

        # (weight, value) pairs for the components that are present.
        terms: list[tuple[float, float]] = [(self.final_constants.w_dev, s_dev)]
        if s_resp is not None:
            terms.append((self.final_constants.w_resp, s_resp))
        if s_meta is not None:
            terms.append((self.final_constants.w_meta, s_meta))

        wsum = sum(w for w, _ in terms)
        if wsum <= 0:
            raise ValueError("Sum of active weights must be > 0.")
        s_final = self.__clamp(sum((w / wsum) * v for w, v in terms))

        self.final = AggregateScoreComponents(
            s_final=s_final,
            s_dev=s_dev,
            s_resp=s_resp,
            s_meta=s_meta,
            s_final_100=100.0 * s_final,
        )

        return self.final