ossuary-risk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,326 @@
1
+ """Composite reputation scoring for maintainers."""
2
+
3
+ import logging
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from typing import Optional
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ReputationTier(str, Enum):
    """Reputation tier classification."""

    TIER_1 = "TIER_1"  # Strong reputation, -25 risk points
    TIER_2 = "TIER_2"  # Established, -10 risk points
    UNKNOWN = "UNKNOWN"  # No reduction

    @classmethod
    def from_score(cls, score: int) -> "ReputationTier":
        """Map a composite reputation score onto a tier (60+ => TIER_1, 30+ => TIER_2)."""
        if score >= 60:
            return cls.TIER_1
        if score >= 30:
            return cls.TIER_2
        return cls.UNKNOWN

    @property
    def risk_reduction(self) -> int:
        """Risk points subtracted for this tier (0 for UNKNOWN)."""
        if self is ReputationTier.TIER_1:
            return -25
        if self is ReputationTier.TIER_2:
            return -10
        return 0
37
+
38
+
39
# Recognized organizations that confer institutional backing.
# Grouped by ecosystem; membership checks are done against the union.
RECOGNIZED_ORGS = (
    # JavaScript / Node
    {"nodejs", "openjs-foundation", "npm", "expressjs", "mochajs",
     "eslint", "webpack", "babel", "rollup", "vitejs"}
    # Python
    | {"python", "psf", "pypa", "pallets", "django", "encode", "tiangolo"}
    # General foundations
    | {"apache", "cncf", "linux-foundation", "mozilla", "rust-lang", "golang"}
    # Cloud / infrastructure
    | {"kubernetes", "docker", "hashicorp"}
)
72
+
73
# Top packages by ecosystem (starter list, should be expanded).
# Lookups are done with lowercased package names.
TOP_PACKAGES = {
    "npm": {
        "lodash", "chalk", "express", "react", "vue", "axios", "moment",
        "webpack", "babel", "eslint", "typescript", "next", "prettier",
        "jest", "mocha", "commander", "debug", "async", "request",
        "underscore", "uuid", "minimist", "glob", "yargs", "semver",
        "fs-extra", "bluebird", "rxjs", "socket.io", "mongoose",
    },
    "pypi": {
        "requests", "numpy", "pandas", "django", "flask", "pytest",
        "boto3", "urllib3", "setuptools", "pip", "certifi", "pyyaml",
        "cryptography", "pillow", "sqlalchemy", "jinja2", "click",
        "scipy", "matplotlib", "tensorflow", "pytorch", "fastapi",
        "pydantic", "httpx", "aiohttp", "redis", "celery", "scrapy",
        "beautifulsoup4", "lxml",
    },
}
140
+
141
+
142
+ @dataclass
143
+ class ReputationBreakdown:
144
+ """Detailed breakdown of reputation score."""
145
+
146
+ username: str = ""
147
+
148
+ # Individual signal scores
149
+ tenure_score: int = 0 # +15 for >5 years
150
+ portfolio_score: int = 0 # +15 for >50 original repos with stars
151
+ stars_score: int = 0 # +15 for >50K total stars
152
+ sponsors_score: int = 0 # +15 for sponsors with >=10 backers
153
+ packages_score: int = 0 # +10 for >20 packages published
154
+ top_package_score: int = 0 # +15 for maintaining top-1000 package
155
+ org_membership_score: int = 0 # +15 for recognized org membership
156
+
157
+ # Evidence for each signal
158
+ account_age_years: float = 0.0
159
+ original_repos_with_stars: int = 0
160
+ total_stars: int = 0
161
+ sponsor_count: Optional[int] = None
162
+ packages_published: int = 0
163
+ top_packages_maintained: list[str] = field(default_factory=list)
164
+ recognized_orgs: list[str] = field(default_factory=list)
165
+
166
+ @property
167
+ def total_score(self) -> int:
168
+ """Calculate total reputation score."""
169
+ return (
170
+ self.tenure_score
171
+ + self.portfolio_score
172
+ + self.stars_score
173
+ + self.sponsors_score
174
+ + self.packages_score
175
+ + self.top_package_score
176
+ + self.org_membership_score
177
+ )
178
+
179
+ @property
180
+ def tier(self) -> ReputationTier:
181
+ """Get reputation tier."""
182
+ return ReputationTier.from_score(self.total_score)
183
+
184
+ def to_dict(self) -> dict:
185
+ """Convert to dictionary for JSON serialization."""
186
+ return {
187
+ "username": self.username,
188
+ "total_score": self.total_score,
189
+ "tier": self.tier.value,
190
+ "risk_reduction": self.tier.risk_reduction,
191
+ "signals": {
192
+ "tenure": {
193
+ "score": self.tenure_score,
194
+ "years": self.account_age_years,
195
+ },
196
+ "portfolio": {
197
+ "score": self.portfolio_score,
198
+ "original_repos_with_stars": self.original_repos_with_stars,
199
+ },
200
+ "stars": {
201
+ "score": self.stars_score,
202
+ "total": self.total_stars,
203
+ },
204
+ "sponsors": {
205
+ "score": self.sponsors_score,
206
+ "count": self.sponsor_count,
207
+ },
208
+ "packages": {
209
+ "score": self.packages_score,
210
+ "count": self.packages_published,
211
+ },
212
+ "top_packages": {
213
+ "score": self.top_package_score,
214
+ "packages": self.top_packages_maintained,
215
+ },
216
+ "organizations": {
217
+ "score": self.org_membership_score,
218
+ "recognized": self.recognized_orgs,
219
+ },
220
+ },
221
+ }
222
+
223
+
224
class ReputationScorer:
    """Calculate composite reputation score for maintainers."""

    # Thresholds for each reputation signal (points awarded in calculate()).
    TENURE_YEARS = 5
    MIN_REPOS_WITH_STARS = 50
    MIN_STARS_PER_REPO = 10
    TOTAL_STARS_THRESHOLD = 50_000
    MIN_SPONSORS = 10
    MIN_PACKAGES = 20

    def calculate(
        self,
        username: str,
        account_created: Optional[datetime],
        repos: list[dict],
        sponsor_count: Optional[int],
        orgs: list[str],
        packages_maintained: list[str],
        ecosystem: str = "npm",
        as_of_date: Optional[datetime] = None,
    ) -> ReputationBreakdown:
        """
        Calculate reputation score for a maintainer.

        Args:
            username: GitHub username
            account_created: Account creation date
            repos: List of repo dicts with 'fork', 'stargazers_count' keys
            sponsor_count: Number of sponsors (None if unknown)
            orgs: List of organization logins user belongs to
            packages_maintained: List of package names maintained
            ecosystem: Package ecosystem for top-package lookup
            as_of_date: Date to use as "now" for T-1 analysis (default: actual now)

        Returns:
            ReputationBreakdown with scores and evidence
        """
        breakdown = ReputationBreakdown(username=username)

        # Signal 1: Tenure (+15 for >5 years)
        if account_created:
            # Normalize timezone-aware vs naive datetimes so subtraction
            # cannot raise TypeError.
            now = as_of_date or datetime.now()
            if account_created.tzinfo is not None and now.tzinfo is None:
                now = datetime.now(account_created.tzinfo)
            elif account_created.tzinfo is None and now.tzinfo is not None:
                now = now.replace(tzinfo=None)
            age_years = (now - account_created).days / 365.25
            breakdown.account_age_years = round(age_years, 1)
            if age_years >= self.TENURE_YEARS:
                breakdown.tenure_score = 15

        # Signal 2: Portfolio - original (non-fork) repos with stars (+15)
        original_repos_with_stars = 0
        total_stars = 0
        for repo in repos:
            if not repo.get("fork", False):
                stars = repo.get("stargazers_count", 0)
                total_stars += stars
                if stars >= self.MIN_STARS_PER_REPO:
                    original_repos_with_stars += 1

        breakdown.original_repos_with_stars = original_repos_with_stars
        breakdown.total_stars = total_stars

        if original_repos_with_stars >= self.MIN_REPOS_WITH_STARS:
            breakdown.portfolio_score = 15

        # Signal 3: Total stars across original repos (+15 for >50K)
        if total_stars >= self.TOTAL_STARS_THRESHOLD:
            breakdown.stars_score = 15

        # Signal 4: Sponsors (+15 for >=10; None means "unknown", no score)
        breakdown.sponsor_count = sponsor_count
        if sponsor_count is not None and sponsor_count >= self.MIN_SPONSORS:
            breakdown.sponsors_score = 15

        # Signal 5: Packages published (+10 for >20) -- compute len() once
        package_count = len(packages_maintained)
        breakdown.packages_published = package_count
        if package_count >= self.MIN_PACKAGES:
            breakdown.packages_score = 10

        # Signal 6: Top package maintainer (+15); case-insensitive lookup
        top_packages = TOP_PACKAGES.get(ecosystem, set())
        maintained_top = [p for p in packages_maintained if p.lower() in top_packages]
        breakdown.top_packages_maintained = maintained_top
        if maintained_top:
            breakdown.top_package_score = 15

        # Signal 7: Recognized org membership (+15); case-insensitive lookup
        recognized = [org for org in orgs if org.lower() in RECOGNIZED_ORGS]
        breakdown.recognized_orgs = recognized
        if recognized:
            breakdown.org_membership_score = 15

        # Lazy %-style args: formatting is skipped entirely when INFO is
        # disabled (the previous f-string always paid the formatting cost).
        logger.info(
            "Reputation for %s: %d (%s) - tenure=%d, portfolio=%d, stars=%d, sponsors=%d",
            username,
            breakdown.total_score,
            breakdown.tier.value,
            breakdown.tenure_score,
            breakdown.portfolio_score,
            breakdown.stars_score,
            breakdown.sponsors_score,
        )

        return breakdown
@@ -0,0 +1,5 @@
1
+ """Sentiment analysis for maintainer communications."""
2
+
3
+ from ossuary.sentiment.analyzer import SentimentAnalyzer
4
+
5
+ __all__ = ["SentimentAnalyzer"]
@@ -0,0 +1,232 @@
1
+ """Sentiment analysis for maintainer communications."""
2
+
3
+ import hashlib
4
+ import logging
5
+ import re
6
+ from dataclasses import dataclass, field
7
+ from typing import Optional
8
+
9
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
# Keywords indicating maintainer frustration/burnout.
# These should be specific enough to avoid false positives on normal
# development discussions. Order matters: matches are reported in list order.
FRUSTRATION_KEYWORDS = [
    # Direct economic frustration (high signal)
    "not getting paid", "unpaid work", "free labor", "work for free",
    "donating my time", "corporate exploitation", "open source exploitation",
    "mass resignation",
    # Burnout signals (moderate signal)
    "burned out", "burnout", "stepping down", "giving up on this",
    "abandoning this project",
    # Economic frustration (moderate signal)
    "fortune 500", "pay developers", "fund open source",
    "companies make millions",
    # Protest signals (high signal)
    "protest", "on strike", "boycott",
    # Explicit negative emotions (only strong ones)
    "resentment", "exploitation", "taken advantage of",
]
45
+
46
+
47
@dataclass
class SentimentResult:
    """Result of sentiment analysis for a single text."""

    # SHA-256 hex digest of the analyzed text, used for deduplication.
    text_hash: str
    compound_score: float  # -1 (negative) to +1 (positive)
    # VADER component scores for the same text.
    positive_score: float
    negative_score: float
    neutral_score: float
    # True when at least one frustration keyword matched the text.
    frustration_detected: bool = False
    # The specific keywords that matched (empty when none did).
    frustration_keywords: list[str] = field(default_factory=list)
58
+
59
+
60
@dataclass
class AggregatedSentiment:
    """Aggregated sentiment analysis results."""

    # Number of non-blank texts actually analyzed.
    total_analyzed: int = 0
    # Mean VADER scores across the analyzed texts.
    average_compound: float = 0.0
    average_positive: float = 0.0
    average_negative: float = 0.0
    # How many texts triggered frustration-keyword detection.
    frustration_count: int = 0
    # Human-readable descriptions of frustration matches (capped by the producer).
    frustration_evidence: list[str] = field(default_factory=list)
    most_negative_texts: list[tuple[str, float]] = field(default_factory=list)  # (text_preview, score)
71
+
72
+
73
+ class SentimentAnalyzer:
74
+ """
75
+ Sentiment analyzer for OSS maintainer communications.
76
+
77
+ Uses VADER for general sentiment analysis and keyword matching
78
+ for frustration detection.
79
+ """
80
+
81
+ def __init__(self):
82
+ """Initialize the sentiment analyzer."""
83
+ self.vader = SentimentIntensityAnalyzer()
84
+ self.frustration_patterns = [re.compile(rf"\b{kw}\b", re.IGNORECASE) for kw in FRUSTRATION_KEYWORDS]
85
+
86
+ @staticmethod
87
+ def text_hash(text: str) -> str:
88
+ """Generate hash for text deduplication."""
89
+ return hashlib.sha256(text.encode()).hexdigest()
90
+
91
+ def _detect_frustration(self, text: str) -> tuple[bool, list[str]]:
92
+ """
93
+ Detect frustration keywords in text.
94
+
95
+ Args:
96
+ text: Text to analyze
97
+
98
+ Returns:
99
+ Tuple of (detected, keywords_found)
100
+ """
101
+ text_lower = text.lower()
102
+ found_keywords = []
103
+
104
+ for i, pattern in enumerate(self.frustration_patterns):
105
+ if pattern.search(text_lower):
106
+ found_keywords.append(FRUSTRATION_KEYWORDS[i])
107
+
108
+ return len(found_keywords) > 0, found_keywords
109
+
110
+ def analyze_text(self, text: str) -> SentimentResult:
111
+ """
112
+ Analyze sentiment of a single text.
113
+
114
+ Args:
115
+ text: Text to analyze
116
+
117
+ Returns:
118
+ SentimentResult with scores
119
+ """
120
+ if not text or not text.strip():
121
+ return SentimentResult(
122
+ text_hash=self.text_hash(""),
123
+ compound_score=0.0,
124
+ positive_score=0.0,
125
+ negative_score=0.0,
126
+ neutral_score=1.0,
127
+ )
128
+
129
+ # VADER sentiment scores
130
+ scores = self.vader.polarity_scores(text)
131
+
132
+ # Frustration detection
133
+ frustration_detected, keywords = self._detect_frustration(text)
134
+
135
+ return SentimentResult(
136
+ text_hash=self.text_hash(text),
137
+ compound_score=scores["compound"],
138
+ positive_score=scores["pos"],
139
+ negative_score=scores["neg"],
140
+ neutral_score=scores["neu"],
141
+ frustration_detected=frustration_detected,
142
+ frustration_keywords=keywords,
143
+ )
144
+
145
+ def analyze_texts(self, texts: list[str], source_type: str = "unknown") -> AggregatedSentiment:
146
+ """
147
+ Analyze multiple texts and aggregate results.
148
+
149
+ Args:
150
+ texts: List of texts to analyze
151
+ source_type: Type of source (commit, issue, comment) for reporting
152
+
153
+ Returns:
154
+ AggregatedSentiment with aggregated results
155
+ """
156
+ if not texts:
157
+ return AggregatedSentiment()
158
+
159
+ results = []
160
+ frustration_evidence = []
161
+ negative_texts = []
162
+
163
+ for text in texts:
164
+ if not text or not text.strip():
165
+ continue
166
+
167
+ result = self.analyze_text(text)
168
+ results.append(result)
169
+
170
+ if result.frustration_detected:
171
+ preview = text[:100] + "..." if len(text) > 100 else text
172
+ frustration_evidence.append(f"[{source_type}] Found keywords: {result.frustration_keywords}")
173
+
174
+ if result.compound_score < -0.3:
175
+ preview = text[:100] + "..." if len(text) > 100 else text
176
+ negative_texts.append((preview, result.compound_score))
177
+
178
+ if not results:
179
+ return AggregatedSentiment()
180
+
181
+ # Calculate averages
182
+ avg_compound = sum(r.compound_score for r in results) / len(results)
183
+ avg_positive = sum(r.positive_score for r in results) / len(results)
184
+ avg_negative = sum(r.negative_score for r in results) / len(results)
185
+ frustration_count = sum(1 for r in results if r.frustration_detected)
186
+
187
+ # Sort negative texts by score
188
+ negative_texts.sort(key=lambda x: x[1])
189
+
190
+ return AggregatedSentiment(
191
+ total_analyzed=len(results),
192
+ average_compound=avg_compound,
193
+ average_positive=avg_positive,
194
+ average_negative=avg_negative,
195
+ frustration_count=frustration_count,
196
+ frustration_evidence=frustration_evidence[:10], # Limit to 10 examples
197
+ most_negative_texts=negative_texts[:5], # Top 5 most negative
198
+ )
199
+
200
+ def analyze_commits(self, commit_messages: list[str]) -> AggregatedSentiment:
201
+ """Analyze sentiment of commit messages."""
202
+ return self.analyze_texts(commit_messages, source_type="commit")
203
+
204
+ def analyze_issues(self, issues: list[dict]) -> AggregatedSentiment:
205
+ """
206
+ Analyze sentiment of issues and their comments.
207
+
208
+ Args:
209
+ issues: List of issue dicts with 'title', 'body', and 'comments' keys
210
+
211
+ Returns:
212
+ AggregatedSentiment for all issue content
213
+ """
214
+ texts = []
215
+
216
+ for issue in issues:
217
+ # Issue title and body
218
+ title = issue.get("title", "")
219
+ body = issue.get("body", "")
220
+ if title:
221
+ texts.append(title)
222
+ if body:
223
+ texts.append(body)
224
+
225
+ # Comments
226
+ comments = issue.get("comments", [])
227
+ for comment in comments:
228
+ comment_body = comment.get("body", "") if isinstance(comment, dict) else str(comment)
229
+ if comment_body:
230
+ texts.append(comment_body)
231
+
232
+ return self.analyze_texts(texts, source_type="issue")