osslag 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
osslag/metrics/malta.py CHANGED
@@ -4,10 +4,13 @@ from __future__ import annotations
4
4
  import math
5
5
  from statistics import median
6
6
  from datetime import datetime
7
- from typing import NamedTuple, Optional, Sequence
7
+ from typing import TYPE_CHECKING, Any, NamedTuple, Sequence
8
8
  import pandas as pd
9
9
  from dateutil.relativedelta import relativedelta
10
10
 
11
+ if TYPE_CHECKING:
12
+ from tqdm import tqdm as TqdmType
13
+
11
14
 
12
15
  class EvaluationWindow(NamedTuple):
13
16
  """Represents a time window for evaluation."""
@@ -78,11 +81,10 @@ class AggregateScoreComponents(NamedTuple):
78
81
  """Final maintenance score and its components"""
79
82
 
80
83
  s_final: float # Final aggregated maintenance score, in [0,1]
84
+ s_final_100: float # in [0,100]
81
85
  s_dev: float # Development activity score
82
86
  s_resp: float # Responsiveness score
83
87
  s_meta: float # Metadata score
84
- s_final: float # in [0,1]
85
- s_final_100: float # in [0,100]
86
88
 
87
89
 
88
90
  class MaltaConstants(NamedTuple):
@@ -139,11 +141,11 @@ class Malta:
139
141
  commits_df: pd.DataFrame,
140
142
  pull_requests_df: pd.DataFrame,
141
143
  repo_meta_df: pd.DataFrame,
142
- malta_constants: Optional[MaltaConstants] = None,
143
- das_constants: Optional[DevelopmentActivityScoreConstants] = None,
144
- mrs_constants: Optional[MaintainerResponsivenessScoreConstants] = None,
145
- repo_meta_constants: Optional[RepoViabilityScoreConstants] = None,
146
- final_agg_constants: Optional[AggregateScoreConstants] = None,
144
+ malta_constants: MaltaConstants | None = None,
145
+ das_constants: DevelopmentActivityScoreConstants | None = None,
146
+ mrs_constants: MaintainerResponsivenessScoreConstants | None = None,
147
+ repo_meta_constants: RepoViabilityScoreConstants | None = None,
148
+ final_agg_constants: AggregateScoreConstants | None = None,
147
149
  ):
148
150
  self.package = package
149
151
  self.github_repo_url = github_repo_url
@@ -151,49 +153,33 @@ class Malta:
151
153
  self.pull_requests_df = pull_requests_df
152
154
  self.repo_meta_df = repo_meta_df
153
155
 
154
- self.das: DASComponents
155
- self.mrs: MRSComponents
156
- self.rmvs: RMVSComponents
157
- self.final: AggregateScoreComponents
158
-
159
- self.malta_constants = (
160
- malta_constants if malta_constants is not None else MaltaConstants()
161
- )
162
- self.das_constants = (
163
- das_constants
164
- if das_constants is not None
165
- else DevelopmentActivityScoreConstants()
166
- )
167
- self.mrs_constants = (
168
- mrs_constants
169
- if mrs_constants is not None
170
- else MaintainerResponsivenessScoreConstants()
171
- )
172
- self.rmv_constants = (
173
- repo_meta_constants
174
- if repo_meta_constants is not None
175
- else RepoViabilityScoreConstants()
176
- )
177
- self.final_constants = (
178
- final_agg_constants
179
- if final_agg_constants is not None
180
- else AggregateScoreConstants()
181
- )
156
+ # Score components (populated by calling the respective methods)
157
+ self.das: DASComponents | None = None
158
+ self.mrs: MRSComponents | None = None
159
+ self.rmvs: RMVSComponents | None = None
160
+ self.final: AggregateScoreComponents | None = None
161
+
162
+ # Cached extracted data (lazily populated)
163
+ self._commits_cache: Sequence[Commit] | None = None
164
+ self._prs_cache: Sequence[PullRequest] | None = None
165
+ self._meta_cache: RepoMeta | None = None
166
+
167
+ self.malta_constants = malta_constants if malta_constants is not None else MaltaConstants()
168
+ self.das_constants = das_constants if das_constants is not None else DevelopmentActivityScoreConstants()
169
+ self.mrs_constants = mrs_constants if mrs_constants is not None else MaintainerResponsivenessScoreConstants()
170
+ self.rmv_constants = repo_meta_constants if repo_meta_constants is not None else RepoViabilityScoreConstants()
171
+ self.final_constants = final_agg_constants if final_agg_constants is not None else AggregateScoreConstants()
182
172
  # Precompute evaluation and baseline windows
183
173
  if eval_end.tzinfo is None:
184
174
  raise ValueError("Datetime object must be timezone-aware")
185
- eval_window_start = eval_end - relativedelta(
186
- months=self.malta_constants.eval_months
187
- )
175
+ eval_window_start = eval_end - relativedelta(months=self.malta_constants.eval_months)
188
176
  # Evaluation window
189
177
  self.eval_window = EvaluationWindow(
190
178
  start=eval_window_start,
191
179
  end=eval_end,
192
180
  days=(eval_end - eval_window_start).days,
193
181
  )
194
- baseline_window_start = eval_window_start - relativedelta(
195
- months=self.malta_constants.baseline_months
196
- )
182
+ baseline_window_start = eval_window_start - relativedelta(months=self.malta_constants.baseline_months)
197
183
  baseline_window_end = eval_window_start
198
184
  # Baseline window
199
185
  self.baseline_window = EvaluationWindow(
@@ -215,61 +201,82 @@ class Malta:
215
201
  return min(1.0, math.log1p(x) / math.log1p(K))
216
202
 
217
203
  def get_commits_for_package(self) -> Sequence[Commit]:
218
- """Extract commits for a given repository URL."""
204
+ """Extract commits for a given repository URL. Results are cached."""
205
+ if self._commits_cache is not None:
206
+ return self._commits_cache
207
+
219
208
  repo_url_column = self.malta_constants.repo_url_column
220
209
  repo_dates_column = self.malta_constants.repo_dates_column
221
210
  repo_is_trivial_column = self.malta_constants.repo_is_trivial_column
222
- repo_commits_df = self.commits_df[
223
- self.commits_df[repo_url_column] == self.github_repo_url
224
- ].copy()
211
+
212
+ mask = self.commits_df[repo_url_column] == self.github_repo_url
213
+ repo_commits_df = self.commits_df.loc[mask]
225
214
  if len(repo_commits_df) == 0:
226
- return []
227
- # Ensure datetime is timezone-aware UTC
228
- repo_commits_df[repo_dates_column] = pd.to_datetime(
229
- repo_commits_df[repo_dates_column], utc=True
230
- )
231
- commits = [
232
- Commit(date=row[repo_dates_column], is_trivial=row[repo_is_trivial_column])
233
- for _, row in repo_commits_df.iterrows()
234
- ]
235
- return commits
215
+ self._commits_cache = []
216
+ return self._commits_cache
217
+
218
+ # Extract columns as arrays (vectorized datetime conversion)
219
+ dates = pd.to_datetime(repo_commits_df[repo_dates_column], utc=True).to_numpy()
220
+ trivials = repo_commits_df[repo_is_trivial_column].to_numpy()
221
+
222
+ # Build Commit objects from arrays (faster than iterrows/itertuples)
223
+ self._commits_cache = [Commit(date=d, is_trivial=t) for d, t in zip(dates, trivials)]
224
+ return self._commits_cache
236
225
 
237
226
  def get_pull_requests_for_package(self) -> Sequence[PullRequest]:
238
- """Extract pull requests for a given repository URL."""
227
+ """Extract pull requests for a given repository URL. Results are cached."""
228
+ if self._prs_cache is not None:
229
+ return self._prs_cache
230
+
239
231
  repo_url_column = self.malta_constants.repo_url_column
240
232
  pr_created_at_column = self.malta_constants.pr_created_at_column
241
233
  pr_closed_at_column = self.malta_constants.pr_closed_at_column
242
234
  pr_merged_at_column = self.malta_constants.pr_merged_at_column
243
235
  pr_state_column = self.malta_constants.pr_state_column
244
- repo_prs_df = self.pull_requests_df[
245
- self.pull_requests_df[repo_url_column] == self.github_repo_url
246
- ].copy()
236
+
237
+ mask = self.pull_requests_df[repo_url_column] == self.github_repo_url
238
+ repo_prs_df = self.pull_requests_df.loc[mask]
247
239
  if len(repo_prs_df) == 0:
248
- return []
249
- # Ensure datetime is timezone-aware UTC
250
- repo_prs_df[pr_created_at_column] = pd.to_datetime(
251
- repo_prs_df[pr_created_at_column], utc=True
252
- )
253
- repo_prs_df[pr_closed_at_column] = pd.to_datetime(
254
- repo_prs_df[pr_closed_at_column], utc=True
255
- )
256
- repo_prs_df[pr_merged_at_column] = pd.to_datetime(
257
- repo_prs_df[pr_merged_at_column], utc=True
258
- )
259
- prs = [
260
- PullRequest(
261
- created_at=row[pr_created_at_column],
262
- closed_at=row[pr_closed_at_column],
263
- merged_at=row[pr_merged_at_column],
264
- state=row[pr_state_column],
265
- )
266
- for _, row in repo_prs_df.iterrows()
240
+ self._prs_cache = []
241
+ return self._prs_cache
242
+
243
+ # Extract columns as arrays (vectorized datetime conversion)
244
+ created = pd.to_datetime(repo_prs_df[pr_created_at_column], utc=True).to_numpy()
245
+ closed = pd.to_datetime(repo_prs_df[pr_closed_at_column], utc=True).to_numpy()
246
+ merged = pd.to_datetime(repo_prs_df[pr_merged_at_column], utc=True).to_numpy()
247
+ states = repo_prs_df[pr_state_column].to_numpy()
248
+
249
+ # Build PullRequest objects from arrays
250
+ self._prs_cache = [
251
+ PullRequest(created_at=c, closed_at=cl, merged_at=m, state=s)
252
+ for c, cl, m, s in zip(created, closed, merged, states)
267
253
  ]
268
- return prs
254
+ return self._prs_cache
255
+
256
def get_repo_meta_for_package(self) -> RepoMeta:
    """Extract repository metadata for ``self.github_repo_url``. Results are cached.

    The first row of ``self.repo_meta_df`` whose repo-URL column equals
    ``self.github_repo_url`` is converted into a ``RepoMeta``. When no row
    matches, a default-constructed ``RepoMeta()`` is cached and returned.

    Missing cells (absent column, ``None``, ``NaN``/``NA``) fall back to the
    same defaults the lookup already used for absent columns: ``0`` for the
    counts and ``False`` for ``archived``. Without this, ``int(NaN)`` raises
    ``ValueError`` and ``bool(NaN)`` evaluates to ``True``, which would flag
    a repository with an unknown archive status as archived.

    Returns
    -------
    RepoMeta
        Metadata for the repository (cached on ``self._meta_cache``).
    """
    if self._meta_cache is not None:
        return self._meta_cache

    repo_url_column = self.malta_constants.repo_url_column
    mask = self.repo_meta_df[repo_url_column] == self.github_repo_url
    repo_meta_row = self.repo_meta_df.loc[mask]
    if len(repo_meta_row) == 0:
        self._meta_cache = RepoMeta()
        return self._meta_cache

    # If several rows match, only the first one is used.
    row = repo_meta_row.iloc[0]

    def _cell(column: str, fallback):
        """Return the cell value, or ``fallback`` when it is absent or null."""
        value = row.get(column, fallback)
        return fallback if pd.isna(value) else value

    self._meta_cache = RepoMeta(
        stars=int(_cell("stars", 0)),
        forks=int(_cell("forks", 0)),
        watchers=int(_cell("watchers", 0)),
        open_issues=int(_cell("open_issues", 0)),
        archived=bool(_cell("archived", False)),
    )
    return self._meta_cache
269
277
 
270
278
  def development_activity_score(
271
279
  self,
272
- commits: Sequence[Commit],
273
280
  include_trivial: bool = False,
274
281
  ) -> DASComponents:
275
282
  """Computes S_dev in [0,1]:
@@ -280,10 +287,6 @@ class Malta:
280
287
 
281
288
  Parameters
282
289
  ----------
283
- commits : Sequence[Commit]
284
- All commits in both baseline and evaluation windows.
285
- tau_days : float
286
- Decay half-life in days (default 180).
287
290
  include_trivial : bool
288
291
  If False, exclude trivial commits from counts and recency calculations.
289
292
 
@@ -294,21 +297,18 @@ class Malta:
294
297
 
295
298
  Notes
296
299
  -----
300
+ - Uses commits from self.commits_df via get_commits_for_package().
297
301
  - If baseline rate is 0, we treat D_c as 1 when eval also has activity,
298
302
  else 0. This avoids division-by-zero while remaining conservative.
299
303
  - t_last is measured since most recent non-trivial commit (unless include_trivial=True).
300
304
 
301
305
  """
306
+ commits = self.get_commits_for_package()
302
307
  # Partition commits into the baseline and evaluation sets, filtering trivial if needed.
303
308
  commits_baseline: Sequence[Commit] = []
304
309
  commits_eval: Sequence[Commit] = []
305
- window_baseline_days: int = (
306
- self.baseline_window.end.toordinal()
307
- - self.baseline_window.start.toordinal()
308
- )
309
- window_eval_days: int = (
310
- self.eval_window.end.toordinal() - self.eval_window.start.toordinal()
311
- )
310
+ window_baseline_days: int = self.baseline_window.end.toordinal() - self.baseline_window.start.toordinal()
311
+ window_eval_days: int = self.eval_window.end.toordinal() - self.eval_window.start.toordinal()
312
312
 
313
313
  for c in commits:
314
314
  if self.baseline_window.start <= c.date < self.baseline_window.end:
@@ -319,9 +319,7 @@ class Malta:
319
319
  if window_baseline_days <= 0 or window_eval_days <= 0:
320
320
  raise ValueError("window_baseline_days and window_eval_days must be > 0")
321
321
  if self.baseline_window.end > self.eval_window.start:
322
- raise ValueError(
323
- "Baseline window must end before evaluation window starts."
324
- )
322
+ raise ValueError("Baseline window must end before evaluation window starts.")
325
323
  if self.eval_window.start < self.baseline_window.end:
326
324
  raise ValueError("Evaluation window must start after baseline window ends.")
327
325
 
@@ -351,12 +349,8 @@ class Malta:
351
349
  if candidates:
352
350
  last_commit_time = max(c.date for c in candidates)
353
351
  if last_commit_time.tzinfo is None:
354
- raise ValueError(
355
- "Commit.authored_at must be timezone-aware (UTC recommended)."
356
- )
357
- t_last = max(
358
- 0.0, (self.eval_window.end - last_commit_time).total_seconds() / 86400.0
359
- )
352
+ raise ValueError("Commit.authored_at must be timezone-aware (UTC recommended).")
353
+ t_last = max(0.0, (self.eval_window.end - last_commit_time).total_seconds() / 86400.0)
360
354
  R_c = math.exp(-t_last / self.das_constants.tau_days)
361
355
  else:
362
356
  # No commits at all -> fully inactive.
@@ -369,23 +363,23 @@ class Malta:
369
363
 
370
364
  def maintainer_responsiveness_score(
371
365
  self,
372
- pull_requests: Sequence[PullRequest],
373
366
  ) -> MRSComponents:
374
367
  """Compute the PR-outcome-bound Maintainer Responsiveness Score (S_resp).
375
368
 
376
- Parameters
377
- ----------
378
- pull_requests : Sequence[PullRequest]
379
-
380
369
  Returns
381
370
  -------
382
371
  MRSComponents
383
372
  Components of the maintainer responsiveness score.
384
373
 
374
+ Notes
375
+ -----
376
+ - Uses pull requests from self.pull_requests_df via get_pull_requests_for_package().
377
+
385
378
  """
379
+ pull_requests = self.get_pull_requests_for_package()
386
380
  if not pull_requests:
387
- # No external contribution signal
388
- return MRSComponents(
381
+ # No external contribution signal - Sresp is undefined per paper
382
+ self.mrs = MRSComponents(
389
383
  s_resp=0.0,
390
384
  r_dec=0.0,
391
385
  d_dec=0.0,
@@ -394,15 +388,13 @@ class Malta:
394
388
  n_terminated=0,
395
389
  n_open=0,
396
390
  )
391
+ return self.mrs
392
+
397
393
  # Filter PRs to those created within the evaluation window
398
- P = [
399
- pr
400
- for pr in pull_requests
401
- if self.eval_window.start <= pr.created_at < self.eval_window.end
402
- ]
394
+ P = [pr for pr in pull_requests if self.eval_window.start <= pr.created_at < self.eval_window.end]
403
395
  if not P:
404
396
  # No PRs in evaluation window
405
- return MRSComponents(
397
+ self.mrs = MRSComponents(
406
398
  s_resp=0.0,
407
399
  r_dec=0.0,
408
400
  d_dec=0.0,
@@ -411,6 +403,8 @@ class Malta:
411
403
  n_terminated=0,
412
404
  n_open=0,
413
405
  )
406
+ return self.mrs
407
+
414
408
  # Partition PRs
415
409
  P_term = []
416
410
  P_open = []
@@ -423,11 +417,9 @@ class Malta:
423
417
  else:
424
418
  raise ValueError(f"Unknown PR state: {pr.state}")
425
419
 
426
- # If PRs exist but none are handled, score is 0
427
- if P_term:
428
- R_dec = len(P_term) / len(P)
429
- else:
430
- return MRSComponents(
420
+ # If PRs exist but none are terminated, Sresp = 0 per paper
421
+ if not P_term:
422
+ self.mrs = MRSComponents(
431
423
  s_resp=0.0,
432
424
  r_dec=0.0,
433
425
  d_dec=0.0,
@@ -436,11 +428,14 @@ class Malta:
436
428
  n_terminated=0,
437
429
  n_open=len(P_open),
438
430
  )
431
+ return self.mrs
432
+
433
+ R_dec = len(P_term) / len(P)
439
434
 
440
435
  # ---- Decision Timeliness (D_dec) ----
441
436
  decision_delays = []
442
437
  for pr in P_term:
443
- closed_time = pr.merged_at or pr.closed_at
438
+ closed_time = pr.merged_at if pd.notna(pr.merged_at) else pr.closed_at
444
439
  delta_days = (closed_time - pr.created_at).days
445
440
  decision_delays.append(min(1.0, delta_days / self.mrs_constants.tref_days))
446
441
 
@@ -459,7 +454,7 @@ class Malta:
459
454
  # ---- Responsiveness Aggregation ----
460
455
  S_resp = R_dec * (1.0 - D_dec) * (1.0 - P_open_penalty)
461
456
 
462
- self.mrsc = MRSComponents(
457
+ self.mrs = MRSComponents(
463
458
  s_resp=self.__clamp(S_resp),
464
459
  r_dec=R_dec,
465
460
  d_dec=D_dec,
@@ -469,11 +464,10 @@ class Malta:
469
464
  n_open=len(P_open),
470
465
  )
471
466
 
472
- return self.mrsc
467
+ return self.mrs
473
468
 
474
469
  def repo_metadata_viability_score(
475
470
  self,
476
- meta: RepoMeta,
477
471
  ) -> RMVSComponents:
478
472
  """Compute repository metadata viability score S_meta.
479
473
 
@@ -487,7 +481,12 @@ class Malta:
487
481
  Missing-data handling:
488
482
  - If all counts are None: return None.
489
483
  - If some counts are missing: renormalize betas over observed fields.
484
+
485
+ Notes
486
+ -----
487
+ - Uses repo metadata from self.repo_meta_df via get_repo_meta_for_package().
490
488
  """
489
+ meta = self.get_repo_meta_for_package()
491
490
  if not (0.0 <= self.rmv_constants.alpha_archived <= 1.0):
492
491
  raise ValueError("alpha_archived must be in [0,1].")
493
492
  betas = {
@@ -505,7 +504,7 @@ class Malta:
505
504
  f = self.__phi_count(meta.forks, K)
506
505
  w = self.__phi_count(meta.watchers, K)
507
506
  i = self.__phi_count(meta.open_issues, K)
508
- i_pen = 0 if i == 0 else (1.0 - i)
507
+ i_pen = 1.0 - i
509
508
 
510
509
  parts = {"stars": s, "forks": f, "watchers": w, "issues": i_pen}
511
510
  observed = {k: v for k, v in parts.items() if v is not None}
@@ -553,9 +552,21 @@ class Malta:
553
552
  Archived override:
554
553
  - If archived and S_resp is None: set S_resp = 0.0 (explicit cessation)
555
554
  before aggregation (still renormalizes if S_meta is None).
555
+
556
+ Raises
557
+ ------
558
+ ValueError
559
+ If component scores have not been computed yet.
556
560
  """
561
+ if self.das is None or self.mrs is None or self.rmvs is None:
562
+ raise ValueError(
563
+ "Component scores must be computed before final aggregation. "
564
+ "Call development_activity_score(), maintainer_responsiveness_score(), "
565
+ "and repo_metadata_viability_score() first."
566
+ )
567
+
557
568
  s_dev = self.das.s_dev
558
- s_resp = self.mrsc.s_resp
569
+ s_resp = self.mrs.s_resp
559
570
  s_meta = self.rmvs.s_meta
560
571
 
561
572
  # If the repo is archived, treat as 0.0
@@ -583,3 +594,279 @@ class Malta:
583
594
  )
584
595
 
585
596
  return self.final
597
+
598
+
599
+ class MaltaResult(NamedTuple):
600
+ """Result from scoring a single package with MALTA metrics."""
601
+
602
+ source: str
603
+ repo_url: str
604
+ # DAS components
605
+ das_score: float | None
606
+ das_dc: float | None
607
+ das_rc: float | None
608
+ # MRS components
609
+ mrs_score: float | None
610
+ mrs_rdec: float | None
611
+ mrs_ddec: float | None
612
+ mrs_popen: float | None
613
+ mrs_n_prs: int | None
614
+ mrs_n_terminated: int | None
615
+ mrs_n_open: int | None
616
+ # RMVS components
617
+ rmvs_score: float | None
618
+ rmvs_archived: bool | None
619
+ rmvs_stars_phi: float | None
620
+ rmvs_forks_phi: float | None
621
+ rmvs_issues_penalty: float | None
622
+ # Final score
623
+ final_score: float | None
624
+ final_score_100: float | None
625
+ # Counts
626
+ n_commits_total: int | None
627
+ n_commits_window: int | None
628
+ n_prs_total: int | None
629
+ n_prs_window: int | None
630
+ # Metadata
631
+ stars: int | None
632
+ forks: int | None
633
+ watchers: int | None
634
+ open_issues: int | None
635
+ archived: bool | None
636
+ # Error tracking
637
+ error: str | None
638
+
639
+
640
def _score_single_repo(
    source: str,
    repo_url: str,
    repo_commits_df: pd.DataFrame,
    repo_prs_df: pd.DataFrame,
    repo_meta_df: pd.DataFrame,
    eval_end: datetime,
    malta_constants: MaltaConstants | None,
    das_constants: DevelopmentActivityScoreConstants | None,
    mrs_constants: MaintainerResponsivenessScoreConstants | None,
    repo_meta_constants: RepoViabilityScoreConstants | None,
    final_agg_constants: AggregateScoreConstants | None,
) -> MaltaResult:
    """Score a single repository. Internal function used by score_repos.

    Builds a ``Malta`` instance from the given frames and constants, computes
    the three component scores and the final aggregation, and flattens
    everything into a ``MaltaResult``.

    Any exception raised while constructing ``Malta`` or computing scores is
    caught and reported through the ``error`` field instead of being raised.
    In that case every field except ``source``, ``repo_url`` and ``error`` is
    ``None``. The message has the form ``"<ExceptionType>: <message>"``, so it
    is never an empty string and the failure kind stays visible.

    Returns
    -------
    MaltaResult
        Populated scores on success, or a ``None``-filled record with
        ``error`` set on failure.
    """
    try:
        m = Malta(
            package=source,
            github_repo_url=repo_url,
            eval_end=eval_end,
            commits_df=repo_commits_df,
            pull_requests_df=repo_prs_df,
            repo_meta_df=repo_meta_df,
            malta_constants=malta_constants,
            das_constants=das_constants,
            mrs_constants=mrs_constants,
            repo_meta_constants=repo_meta_constants,
            final_agg_constants=final_agg_constants,
        )

        # Component scores must be computed before the final aggregation.
        das = m.development_activity_score()
        mrs = m.maintainer_responsiveness_score()
        rmvs = m.repo_metadata_viability_score()
        final = m.final_aggregation_score()

        # These are served from the per-instance caches filled by the calls above.
        commits = m.get_commits_for_package()
        prs = m.get_pull_requests_for_package()
        meta = m.get_repo_meta_for_package()

        return MaltaResult(
            source=source,
            repo_url=repo_url,
            das_score=das.s_dev,
            das_dc=das.d_c,
            das_rc=das.r_c,
            mrs_score=mrs.s_resp,
            mrs_rdec=mrs.r_dec,
            mrs_ddec=mrs.d_dec,
            mrs_popen=mrs.p_open,
            mrs_n_prs=mrs.n_prs,
            mrs_n_terminated=mrs.n_terminated,
            mrs_n_open=mrs.n_open,
            rmvs_score=rmvs.s_meta,
            rmvs_archived=rmvs.archived,
            rmvs_stars_phi=rmvs.stars_phi,
            rmvs_forks_phi=rmvs.forks_phi,
            rmvs_issues_penalty=rmvs.open_issues_penalty,
            final_score=final.s_final,
            final_score_100=final.s_final_100,
            n_commits_total=len(repo_commits_df),
            # NOTE(review): the getters return every row matching repo_url with
            # no time-window filter, so the two "*_window" counts below are not
            # restricted to the evaluation window — confirm intended meaning.
            n_commits_window=len(commits),
            n_prs_total=len(repo_prs_df),
            n_prs_window=len(prs),
            stars=meta.stars,
            forks=meta.forks,
            watchers=meta.watchers,
            open_issues=meta.open_issues,
            archived=meta.archived,
            error=None,
        )
    except Exception as e:
        # Broad catch kept from the original: failures are reported per repo
        # through `error` rather than propagated to the caller.
        # Deriving the record from _fields keeps it in sync with MaltaResult.
        failure = dict.fromkeys(MaltaResult._fields)
        failure.update(
            source=source,
            repo_url=repo_url,
            error=f"{type(e).__name__}: {e}",
        )
        return MaltaResult(**failure)
741
+
742
+
743
def score_repos(
    packages: Sequence[tuple[str, str]],
    commits_df: pd.DataFrame,
    pull_requests_df: pd.DataFrame,
    repo_meta_df: pd.DataFrame,
    eval_end: datetime,
    n_workers: int | None = None,
    malta_constants: MaltaConstants | None = None,
    das_constants: DevelopmentActivityScoreConstants | None = None,
    mrs_constants: MaintainerResponsivenessScoreConstants | None = None,
    repo_meta_constants: RepoViabilityScoreConstants | None = None,
    final_agg_constants: AggregateScoreConstants | None = None,
    show_progress: bool = True,
) -> pd.DataFrame:
    """Score multiple repositories concurrently using MALTA metrics.

    Parameters
    ----------
    packages : Sequence[tuple[str, str]]
        List of (source, repo_url) tuples identifying packages to score.
    commits_df : pd.DataFrame
        DataFrame containing commit data with the repo-URL column.
    pull_requests_df : pd.DataFrame
        DataFrame containing PR data with the repo-URL column.
    repo_meta_df : pd.DataFrame
        DataFrame containing repository metadata with the repo-URL column.
    eval_end : datetime
        End of evaluation window (must be timezone-aware).
    n_workers : int | None
        Number of worker processes. None uses ``os.cpu_count()``, or 4 when
        that is unavailable.
    malta_constants : MaltaConstants | None
        Custom MALTA constants; also supplies the repo-URL column name.
    das_constants : DevelopmentActivityScoreConstants | None
        Custom DAS constants.
    mrs_constants : MaintainerResponsivenessScoreConstants | None
        Custom MRS constants.
    repo_meta_constants : RepoViabilityScoreConstants | None
        Custom RMVS constants.
    final_agg_constants : AggregateScoreConstants | None
        Custom aggregation constants.
    show_progress : bool
        Whether to show a progress bar. Silently skipped if tqdm is not
        installed.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``packages``, in the same order as ``packages``,
        with one column per ``MaltaResult`` field:
        - source, repo_url: Package identifiers
        - das_score, das_dc, das_rc: Development Activity Score components
        - mrs_score, mrs_rdec, mrs_ddec, mrs_popen, mrs_n_*: MRS components
        - rmvs_score, rmvs_archived, rmvs_*_phi, rmvs_issues_penalty: RMVS components
        - final_score, final_score_100: Aggregated scores
        - n_commits_total, n_commits_window, n_prs_total, n_prs_window: Counts
        - stars, forks, watchers, open_issues, archived: Repository metadata
        - error: Error message if scoring failed
        An empty ``packages`` yields an empty frame that still has these columns.

    Example
    -------
    >>> packages = [("pkg1", "https://github.com/owner/repo1"), ...]
    >>> results_df = score_repos(
    ...     packages=packages,
    ...     commits_df=commits_df,
    ...     pull_requests_df=prs_df,
    ...     repo_meta_df=meta_df,
    ...     eval_end=datetime(2026, 1, 1, tzinfo=timezone.utc),
    ...     n_workers=8,
    ... )
    """
    from concurrent.futures import ProcessPoolExecutor, as_completed
    import os

    columns = list(MaltaResult._fields)

    if n_workers is None:
        n_workers = os.cpu_count() or 4

    # Get column name for repo_url (default or from constants)
    repo_url_col = (malta_constants or MaltaConstants()).repo_url_column

    # Pre-group DataFrames by repo_url so each worker only receives its own rows
    commits_grouped = {url: group for url, group in commits_df.groupby(repo_url_col)}
    prs_grouped = {url: group for url, group in pull_requests_df.groupby(repo_url_col)}
    meta_grouped = {url: group for url, group in repo_meta_df.groupby(repo_url_col)}

    # Zero-row frames (same columns) for repos with no data
    empty_commits = commits_df.iloc[:0]
    empty_prs = pull_requests_df.iloc[:0]
    empty_meta = repo_meta_df.iloc[:0]

    work_items = [
        (
            source,
            repo_url,
            commits_grouped.get(repo_url, empty_commits),
            prs_grouped.get(repo_url, empty_prs),
            meta_grouped.get(repo_url, empty_meta),
            eval_end,
            malta_constants,
            das_constants,
            mrs_constants,
            repo_meta_constants,
            final_agg_constants,
        )
        for source, repo_url in packages
    ]

    # Nothing to score: skip the pool and progress bar, keep the schema.
    if not work_items:
        return pd.DataFrame(columns=columns)

    # Set up progress bar if requested; tqdm is optional.
    progress: Any = None
    if show_progress:
        try:
            from tqdm import tqdm
        except ImportError:
            pass
        else:
            progress = tqdm(total=len(work_items), desc="Scoring repos")

    # Slots are filled by submission index so output order matches `packages`
    # even though futures complete in arbitrary order.
    results: list[MaltaResult | None] = [None] * len(work_items)

    try:
        with ProcessPoolExecutor(max_workers=n_workers) as executor:
            futures = {executor.submit(_score_single_repo, *item): i for i, item in enumerate(work_items)}

            for future in as_completed(futures):
                results[futures[future]] = future.result()
                if progress is not None:
                    progress.update(1)
    finally:
        # Close the bar even if a worker failed (e.g. a broken process pool).
        if progress is not None:
            progress.close()

    # Convert to DataFrame
    return pd.DataFrame([r._asdict() for r in results], columns=columns)