ossuary-risk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,326 @@
1
+ """Composite reputation scoring for maintainers."""
2
+
3
+ import logging
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from typing import Optional
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ReputationTier(str, Enum):
    """Reputation tier classification."""

    TIER_1 = "TIER_1"  # Strong reputation, -25 risk points
    TIER_2 = "TIER_2"  # Established, -10 risk points
    UNKNOWN = "UNKNOWN"  # No reduction

    @classmethod
    def from_score(cls, score: int) -> "ReputationTier":
        """Map a composite reputation score onto a tier (60+ => TIER_1, 30+ => TIER_2)."""
        if score >= 60:
            return cls.TIER_1
        if score >= 30:
            return cls.TIER_2
        return cls.UNKNOWN

    @property
    def risk_reduction(self) -> int:
        """Risk points subtracted for this tier (0 for UNKNOWN)."""
        if self is ReputationTier.TIER_1:
            return -25
        if self is ReputationTier.TIER_2:
            return -10
        return 0
37
+
38
+
39
# Recognized organizations that confer institutional backing.
# Grouped by ecosystem; membership checks are done against the union.
RECOGNIZED_ORGS = (
    # JavaScript / Node
    {"nodejs", "openjs-foundation", "npm", "expressjs", "mochajs",
     "eslint", "webpack", "babel", "rollup", "vitejs"}
    # Python
    | {"python", "psf", "pypa", "pallets", "django", "encode", "tiangolo"}
    # General foundations
    | {"apache", "cncf", "linux-foundation", "mozilla", "rust-lang", "golang"}
    # Cloud / infrastructure
    | {"kubernetes", "docker", "hashicorp"}
)
72
+
73
# Top packages by ecosystem (starter list, should be expanded).
# Lookups are done with lowercased package names.
TOP_PACKAGES = {
    "npm": {
        "lodash", "chalk", "express", "react", "vue", "axios", "moment",
        "webpack", "babel", "eslint", "typescript", "next", "prettier",
        "jest", "mocha", "commander", "debug", "async", "request",
        "underscore", "uuid", "minimist", "glob", "yargs", "semver",
        "fs-extra", "bluebird", "rxjs", "socket.io", "mongoose",
    },
    "pypi": {
        "requests", "numpy", "pandas", "django", "flask", "pytest",
        "boto3", "urllib3", "setuptools", "pip", "certifi", "pyyaml",
        "cryptography", "pillow", "sqlalchemy", "jinja2", "click",
        "scipy", "matplotlib", "tensorflow", "pytorch", "fastapi",
        "pydantic", "httpx", "aiohttp", "redis", "celery", "scrapy",
        "beautifulsoup4", "lxml",
    },
}
140
+
141
+
142
+ @dataclass
143
+ class ReputationBreakdown:
144
+ """Detailed breakdown of reputation score."""
145
+
146
+ username: str = ""
147
+
148
+ # Individual signal scores
149
+ tenure_score: int = 0 # +15 for >5 years
150
+ portfolio_score: int = 0 # +15 for >50 original repos with stars
151
+ stars_score: int = 0 # +15 for >50K total stars
152
+ sponsors_score: int = 0 # +15 for sponsors with >=10 backers
153
+ packages_score: int = 0 # +10 for >20 packages published
154
+ top_package_score: int = 0 # +15 for maintaining top-1000 package
155
+ org_membership_score: int = 0 # +15 for recognized org membership
156
+
157
+ # Evidence for each signal
158
+ account_age_years: float = 0.0
159
+ original_repos_with_stars: int = 0
160
+ total_stars: int = 0
161
+ sponsor_count: Optional[int] = None
162
+ packages_published: int = 0
163
+ top_packages_maintained: list[str] = field(default_factory=list)
164
+ recognized_orgs: list[str] = field(default_factory=list)
165
+
166
+ @property
167
+ def total_score(self) -> int:
168
+ """Calculate total reputation score."""
169
+ return (
170
+ self.tenure_score
171
+ + self.portfolio_score
172
+ + self.stars_score
173
+ + self.sponsors_score
174
+ + self.packages_score
175
+ + self.top_package_score
176
+ + self.org_membership_score
177
+ )
178
+
179
+ @property
180
+ def tier(self) -> ReputationTier:
181
+ """Get reputation tier."""
182
+ return ReputationTier.from_score(self.total_score)
183
+
184
+ def to_dict(self) -> dict:
185
+ """Convert to dictionary for JSON serialization."""
186
+ return {
187
+ "username": self.username,
188
+ "total_score": self.total_score,
189
+ "tier": self.tier.value,
190
+ "risk_reduction": self.tier.risk_reduction,
191
+ "signals": {
192
+ "tenure": {
193
+ "score": self.tenure_score,
194
+ "years": self.account_age_years,
195
+ },
196
+ "portfolio": {
197
+ "score": self.portfolio_score,
198
+ "original_repos_with_stars": self.original_repos_with_stars,
199
+ },
200
+ "stars": {
201
+ "score": self.stars_score,
202
+ "total": self.total_stars,
203
+ },
204
+ "sponsors": {
205
+ "score": self.sponsors_score,
206
+ "count": self.sponsor_count,
207
+ },
208
+ "packages": {
209
+ "score": self.packages_score,
210
+ "count": self.packages_published,
211
+ },
212
+ "top_packages": {
213
+ "score": self.top_package_score,
214
+ "packages": self.top_packages_maintained,
215
+ },
216
+ "organizations": {
217
+ "score": self.org_membership_score,
218
+ "recognized": self.recognized_orgs,
219
+ },
220
+ },
221
+ }
222
+
223
+
224
class ReputationScorer:
    """Calculate composite reputation score for maintainers."""

    # Thresholds for each reputation signal (points awarded in calculate()).
    TENURE_YEARS = 5
    MIN_REPOS_WITH_STARS = 50
    MIN_STARS_PER_REPO = 10
    TOTAL_STARS_THRESHOLD = 50_000
    MIN_SPONSORS = 10
    MIN_PACKAGES = 20

    def calculate(
        self,
        username: str,
        account_created: Optional[datetime],
        repos: list[dict],
        sponsor_count: Optional[int],
        orgs: list[str],
        packages_maintained: list[str],
        ecosystem: str = "npm",
        as_of_date: Optional[datetime] = None,
    ) -> ReputationBreakdown:
        """
        Calculate reputation score for a maintainer.

        Args:
            username: GitHub username
            account_created: Account creation date
            repos: List of repo dicts with 'fork', 'stargazers_count' keys
            sponsor_count: Number of sponsors (None if unknown)
            orgs: List of organization logins user belongs to
            packages_maintained: List of package names maintained
            ecosystem: Package ecosystem for top-package lookup
            as_of_date: Date to use as "now" for T-1 analysis (default: actual now)

        Returns:
            ReputationBreakdown with scores and evidence
        """
        breakdown = ReputationBreakdown(username=username)

        # Signal 1: Tenure (+15 for >5 years)
        if account_created:
            # Normalize timezone-aware vs naive datetimes so subtraction
            # cannot raise TypeError.
            now = as_of_date or datetime.now()
            if account_created.tzinfo is not None and now.tzinfo is None:
                now = datetime.now(account_created.tzinfo)
            elif account_created.tzinfo is None and now.tzinfo is not None:
                now = now.replace(tzinfo=None)
            age_years = (now - account_created).days / 365.25
            breakdown.account_age_years = round(age_years, 1)
            if age_years >= self.TENURE_YEARS:
                breakdown.tenure_score = 15

        # Signal 2: Portfolio - original (non-fork) repos with stars (+15)
        original_repos_with_stars = 0
        total_stars = 0
        for repo in repos:
            if not repo.get("fork", False):
                stars = repo.get("stargazers_count", 0)
                total_stars += stars
                if stars >= self.MIN_STARS_PER_REPO:
                    original_repos_with_stars += 1

        breakdown.original_repos_with_stars = original_repos_with_stars
        breakdown.total_stars = total_stars

        if original_repos_with_stars >= self.MIN_REPOS_WITH_STARS:
            breakdown.portfolio_score = 15

        # Signal 3: Total stars across original repos (+15 for >50K)
        if total_stars >= self.TOTAL_STARS_THRESHOLD:
            breakdown.stars_score = 15

        # Signal 4: Sponsors (+15 for >=10; None means "unknown", no score)
        breakdown.sponsor_count = sponsor_count
        if sponsor_count is not None and sponsor_count >= self.MIN_SPONSORS:
            breakdown.sponsors_score = 15

        # Signal 5: Packages published (+10 for >20) -- compute len() once
        package_count = len(packages_maintained)
        breakdown.packages_published = package_count
        if package_count >= self.MIN_PACKAGES:
            breakdown.packages_score = 10

        # Signal 6: Top package maintainer (+15); case-insensitive lookup
        top_packages = TOP_PACKAGES.get(ecosystem, set())
        maintained_top = [p for p in packages_maintained if p.lower() in top_packages]
        breakdown.top_packages_maintained = maintained_top
        if maintained_top:
            breakdown.top_package_score = 15

        # Signal 7: Recognized org membership (+15); case-insensitive lookup
        recognized = [org for org in orgs if org.lower() in RECOGNIZED_ORGS]
        breakdown.recognized_orgs = recognized
        if recognized:
            breakdown.org_membership_score = 15

        # Lazy %-style args: formatting is skipped entirely when INFO is
        # disabled (the previous f-string always paid the formatting cost).
        logger.info(
            "Reputation for %s: %d (%s) - tenure=%d, portfolio=%d, stars=%d, sponsors=%d",
            username,
            breakdown.total_score,
            breakdown.tier.value,
            breakdown.tenure_score,
            breakdown.portfolio_score,
            breakdown.stars_score,
            breakdown.sponsors_score,
        )

        return breakdown
@@ -0,0 +1,5 @@
1
+ """Sentiment analysis for maintainer communications."""
2
+
3
+ from ossuary.sentiment.analyzer import SentimentAnalyzer
4
+
5
+ __all__ = ["SentimentAnalyzer"]
@@ -0,0 +1,232 @@
1
+ """Sentiment analysis for maintainer communications."""
2
+
3
+ import hashlib
4
+ import logging
5
+ import re
6
+ from dataclasses import dataclass, field
7
+ from typing import Optional
8
+
9
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
# Keywords indicating maintainer frustration/burnout.
# These should be specific enough to avoid false positives on normal
# development discussions. Order matters: matches are reported in list order.
FRUSTRATION_KEYWORDS = [
    # Direct economic frustration (high signal)
    "not getting paid", "unpaid work", "free labor", "work for free",
    "donating my time", "corporate exploitation", "open source exploitation",
    "mass resignation",
    # Burnout signals (moderate signal)
    "burned out", "burnout", "stepping down", "giving up on this",
    "abandoning this project",
    # Economic frustration (moderate signal)
    "fortune 500", "pay developers", "fund open source",
    "companies make millions",
    # Protest signals (high signal)
    "protest", "on strike", "boycott",
    # Explicit negative emotions (only strong ones)
    "resentment", "exploitation", "taken advantage of",
]
45
+
46
+
47
@dataclass
class SentimentResult:
    """Result of sentiment analysis for a single text."""

    # SHA-256 hex digest of the analyzed text, used for deduplication.
    text_hash: str
    compound_score: float  # -1 (negative) to +1 (positive)
    # VADER component scores for the same text.
    positive_score: float
    negative_score: float
    neutral_score: float
    # True when at least one frustration keyword matched the text.
    frustration_detected: bool = False
    # The specific keywords that matched (empty when none did).
    frustration_keywords: list[str] = field(default_factory=list)
58
+
59
+
60
@dataclass
class AggregatedSentiment:
    """Aggregated sentiment analysis results."""

    # Number of non-blank texts actually analyzed.
    total_analyzed: int = 0
    # Mean VADER scores across the analyzed texts.
    average_compound: float = 0.0
    average_positive: float = 0.0
    average_negative: float = 0.0
    # How many texts triggered frustration-keyword detection.
    frustration_count: int = 0
    # Human-readable descriptions of frustration matches (capped by the producer).
    frustration_evidence: list[str] = field(default_factory=list)
    most_negative_texts: list[tuple[str, float]] = field(default_factory=list)  # (text_preview, score)
71
+
72
+
73
+ class SentimentAnalyzer:
74
+ """
75
+ Sentiment analyzer for OSS maintainer communications.
76
+
77
+ Uses VADER for general sentiment analysis and keyword matching
78
+ for frustration detection.
79
+ """
80
+
81
+ def __init__(self):
82
+ """Initialize the sentiment analyzer."""
83
+ self.vader = SentimentIntensityAnalyzer()
84
+ self.frustration_patterns = [re.compile(rf"\b{kw}\b", re.IGNORECASE) for kw in FRUSTRATION_KEYWORDS]
85
+
86
+ @staticmethod
87
+ def text_hash(text: str) -> str:
88
+ """Generate hash for text deduplication."""
89
+ return hashlib.sha256(text.encode()).hexdigest()
90
+
91
+ def _detect_frustration(self, text: str) -> tuple[bool, list[str]]:
92
+ """
93
+ Detect frustration keywords in text.
94
+
95
+ Args:
96
+ text: Text to analyze
97
+
98
+ Returns:
99
+ Tuple of (detected, keywords_found)
100
+ """
101
+ text_lower = text.lower()
102
+ found_keywords = []
103
+
104
+ for i, pattern in enumerate(self.frustration_patterns):
105
+ if pattern.search(text_lower):
106
+ found_keywords.append(FRUSTRATION_KEYWORDS[i])
107
+
108
+ return len(found_keywords) > 0, found_keywords
109
+
110
+ def analyze_text(self, text: str) -> SentimentResult:
111
+ """
112
+ Analyze sentiment of a single text.
113
+
114
+ Args:
115
+ text: Text to analyze
116
+
117
+ Returns:
118
+ SentimentResult with scores
119
+ """
120
+ if not text or not text.strip():
121
+ return SentimentResult(
122
+ text_hash=self.text_hash(""),
123
+ compound_score=0.0,
124
+ positive_score=0.0,
125
+ negative_score=0.0,
126
+ neutral_score=1.0,
127
+ )
128
+
129
+ # VADER sentiment scores
130
+ scores = self.vader.polarity_scores(text)
131
+
132
+ # Frustration detection
133
+ frustration_detected, keywords = self._detect_frustration(text)
134
+
135
+ return SentimentResult(
136
+ text_hash=self.text_hash(text),
137
+ compound_score=scores["compound"],
138
+ positive_score=scores["pos"],
139
+ negative_score=scores["neg"],
140
+ neutral_score=scores["neu"],
141
+ frustration_detected=frustration_detected,
142
+ frustration_keywords=keywords,
143
+ )
144
+
145
+ def analyze_texts(self, texts: list[str], source_type: str = "unknown") -> AggregatedSentiment:
146
+ """
147
+ Analyze multiple texts and aggregate results.
148
+
149
+ Args:
150
+ texts: List of texts to analyze
151
+ source_type: Type of source (commit, issue, comment) for reporting
152
+
153
+ Returns:
154
+ AggregatedSentiment with aggregated results
155
+ """
156
+ if not texts:
157
+ return AggregatedSentiment()
158
+
159
+ results = []
160
+ frustration_evidence = []
161
+ negative_texts = []
162
+
163
+ for text in texts:
164
+ if not text or not text.strip():
165
+ continue
166
+
167
+ result = self.analyze_text(text)
168
+ results.append(result)
169
+
170
+ if result.frustration_detected:
171
+ preview = text[:100] + "..." if len(text) > 100 else text
172
+ frustration_evidence.append(f"[{source_type}] Found keywords: {result.frustration_keywords}")
173
+
174
+ if result.compound_score < -0.3:
175
+ preview = text[:100] + "..." if len(text) > 100 else text
176
+ negative_texts.append((preview, result.compound_score))
177
+
178
+ if not results:
179
+ return AggregatedSentiment()
180
+
181
+ # Calculate averages
182
+ avg_compound = sum(r.compound_score for r in results) / len(results)
183
+ avg_positive = sum(r.positive_score for r in results) / len(results)
184
+ avg_negative = sum(r.negative_score for r in results) / len(results)
185
+ frustration_count = sum(1 for r in results if r.frustration_detected)
186
+
187
+ # Sort negative texts by score
188
+ negative_texts.sort(key=lambda x: x[1])
189
+
190
+ return AggregatedSentiment(
191
+ total_analyzed=len(results),
192
+ average_compound=avg_compound,
193
+ average_positive=avg_positive,
194
+ average_negative=avg_negative,
195
+ frustration_count=frustration_count,
196
+ frustration_evidence=frustration_evidence[:10], # Limit to 10 examples
197
+ most_negative_texts=negative_texts[:5], # Top 5 most negative
198
+ )
199
+
200
+ def analyze_commits(self, commit_messages: list[str]) -> AggregatedSentiment:
201
+ """Analyze sentiment of commit messages."""
202
+ return self.analyze_texts(commit_messages, source_type="commit")
203
+
204
+ def analyze_issues(self, issues: list[dict]) -> AggregatedSentiment:
205
+ """
206
+ Analyze sentiment of issues and their comments.
207
+
208
+ Args:
209
+ issues: List of issue dicts with 'title', 'body', and 'comments' keys
210
+
211
+ Returns:
212
+ AggregatedSentiment for all issue content
213
+ """
214
+ texts = []
215
+
216
+ for issue in issues:
217
+ # Issue title and body
218
+ title = issue.get("title", "")
219
+ body = issue.get("body", "")
220
+ if title:
221
+ texts.append(title)
222
+ if body:
223
+ texts.append(body)
224
+
225
+ # Comments
226
+ comments = issue.get("comments", [])
227
+ for comment in comments:
228
+ comment_body = comment.get("body", "") if isinstance(comment, dict) else str(comment)
229
+ if comment_body:
230
+ texts.append(comment_body)
231
+
232
+ return self.analyze_texts(texts, source_type="issue")