ossuary-risk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ossuary/__init__.py +7 -0
- ossuary/api/__init__.py +1 -0
- ossuary/api/main.py +173 -0
- ossuary/cli.py +309 -0
- ossuary/collectors/__init__.py +8 -0
- ossuary/collectors/base.py +26 -0
- ossuary/collectors/git.py +231 -0
- ossuary/collectors/github.py +495 -0
- ossuary/collectors/npm.py +113 -0
- ossuary/collectors/pypi.py +118 -0
- ossuary/db/__init__.py +15 -0
- ossuary/db/models.py +197 -0
- ossuary/db/session.py +49 -0
- ossuary/scoring/__init__.py +16 -0
- ossuary/scoring/engine.py +318 -0
- ossuary/scoring/factors.py +175 -0
- ossuary/scoring/reputation.py +326 -0
- ossuary/sentiment/__init__.py +5 -0
- ossuary/sentiment/analyzer.py +232 -0
- ossuary_risk-0.1.0.dist-info/METADATA +241 -0
- ossuary_risk-0.1.0.dist-info/RECORD +23 -0
- ossuary_risk-0.1.0.dist-info/WHEEL +4 -0
- ossuary_risk-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""PyPI registry collector."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from ossuary.collectors.base import BaseCollector
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class PyPIData:
    """Registry metadata gathered for a single PyPI package.

    Every field has a default value, so the record can be constructed with
    only the attributes that are known.
    """

    # Identity and descriptive metadata.
    name: str = ""
    version: str = ""
    description: str = ""
    homepage: str = ""
    repository_url: str = ""

    # Download volume.
    weekly_downloads: int = 0

    # None is only a placeholder default; __post_init__ replaces it with a
    # fresh list so instances never share one list object.
    maintainers: list[str] = None

    def __post_init__(self):
        """Swap a missing maintainers value for a new empty list."""
        self.maintainers = [] if self.maintainers is None else self.maintainers
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class PyPICollector(BaseCollector):
    """Collector for package metadata and download statistics from PyPI.

    Metadata is read from the PyPI JSON endpoint and download counts from
    pypistats.org. Network and decoding failures are logged and reported to
    the caller as "no data" (``None`` / ``0``) rather than raised.
    """

    PYPI_URL = "https://pypi.org/pypi"
    STATS_URL = "https://pypistats.org/api"

    # project_urls labels that identify the source repository, in priority order.
    _REPO_URL_KEYS = ("Repository", "Source", "Source Code", "GitHub", "Code")

    def __init__(self):
        """Initialize PyPI collector with a shared async HTTP client."""
        self.client = httpx.AsyncClient(timeout=30.0)

    def is_available(self) -> bool:
        """PyPI collector is always available."""
        return True

    async def get_package_info(self, package_name: str) -> Optional[dict]:
        """Get package metadata from PyPI.

        Args:
            package_name: PyPI package name

        Returns:
            The decoded JSON document, or None when the request fails, the
            status is not 200, or the body is not a JSON object.
        """
        try:
            response = await self.client.get(f"{self.PYPI_URL}/{package_name}/json")
            if response.status_code != 200:
                return None
            payload = response.json()
        except httpx.HTTPError as e:
            logger.error("PyPI API error: %s", e)
            return None
        except ValueError as e:
            # A 200 response with an undecodable body must not crash the caller.
            logger.error("PyPI API returned invalid JSON: %s", e)
            return None
        return payload if isinstance(payload, dict) else None

    async def get_weekly_downloads(self, package_name: str) -> int:
        """Get approximate weekly download count.

        The stats endpoint reports a monthly total; a quarter of it is used
        as the weekly estimate.

        Args:
            package_name: PyPI package name

        Returns:
            Estimated weekly downloads, or 0 when no usable figure is available.
        """
        try:
            response = await self.client.get(f"{self.STATS_URL}/packages/{package_name}/recent")
            if response.status_code != 200:
                return 0
            payload = response.json()
        except httpx.HTTPError as e:
            logger.error("PyPI stats error: %s", e)
            return 0
        except ValueError as e:
            logger.error("PyPI stats returned invalid JSON: %s", e)
            return 0

        # Any level may be missing or null; treat that as "no downloads known"
        # instead of failing on None.get(...) or None // 4.
        stats = payload.get("data") if isinstance(payload, dict) else None
        monthly = stats.get("last_month") if isinstance(stats, dict) else None
        if not isinstance(monthly, int):
            return 0
        return monthly // 4  # Approximate weekly

    def _extract_repo_url(self, info: dict) -> str:
        """Extract repository URL from package info.

        Preferred ``project_urls`` labels are tried in priority order, first
        by exact label and then case-insensitively. Empty values are skipped.
        If none match, ``home_page`` is used when it points at GitHub or GitLab.

        Args:
            info: the ``info`` mapping of a PyPI JSON document

        Returns:
            The repository URL, or "" when none can be determined.
        """
        project_urls = info.get("project_urls") or {}

        # Labels are author-chosen text, so also index them by lowercase form.
        by_lower: dict = {}
        for label, url in project_urls.items():
            if url:
                by_lower.setdefault(str(label).lower(), url)

        for key in self._REPO_URL_KEYS:
            url = project_urls.get(key) or by_lower.get(key.lower())
            if url:
                return url

        home_page = info.get("home_page") or ""
        if "github.com" in home_page or "gitlab.com" in home_page:
            return home_page

        return ""

    async def collect(self, package_name: str) -> PyPIData:
        """
        Collect PyPI package data.

        Args:
            package_name: PyPI package name

        Returns:
            PyPIData with package information; fields that could not be
            fetched keep their defaults.
        """
        data = PyPIData(name=package_name)

        pkg_info = await self.get_package_info(package_name)
        if pkg_info:
            # Fields can be present but null; `or` keeps the str-typed
            # attributes from being set to None.
            info = pkg_info.get("info") or {}
            data.version = info.get("version") or ""
            data.description = info.get("summary") or ""
            data.homepage = info.get("home_page") or ""
            data.repository_url = self._extract_repo_url(info)

            # Prefer the declared maintainer, fall back to the author.
            owner = info.get("maintainer") or info.get("author")
            if owner:
                data.maintainers = [owner]

        data.weekly_downloads = await self.get_weekly_downloads(package_name)

        return data

    async def close(self):
        """Close the HTTP client."""
        await self.client.aclose()
|
ossuary/db/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Database models and session management."""
|
|
2
|
+
|
|
3
|
+
from ossuary.db.models import Base, Package, Commit, Issue, Score, SentimentRecord
|
|
4
|
+
from ossuary.db.session import get_session, init_db
|
|
5
|
+
|
|
6
|
+
# Public API of the db package: ORM models first, then session helpers.
__all__ = [
    "Base", "Package", "Commit", "Issue", "Score", "SentimentRecord",
    "get_session", "init_db",
]
|
ossuary/db/models.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""SQLAlchemy models for ossuary."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import (
|
|
8
|
+
JSON,
|
|
9
|
+
DateTime,
|
|
10
|
+
Float,
|
|
11
|
+
ForeignKey,
|
|
12
|
+
Index,
|
|
13
|
+
Integer,
|
|
14
|
+
String,
|
|
15
|
+
Text,
|
|
16
|
+
UniqueConstraint,
|
|
17
|
+
)
|
|
18
|
+
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Base(DeclarativeBase):
    """Declarative base class that every model in this module derives from."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Ecosystem(str, Enum):
    """Package ecosystems the tool supports.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    NPM = "npm"
    PYPI = "pypi"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Package(Base):
    """A tracked package, unique per (name, ecosystem) pair.

    Parent row for the commits, issues, scores and sentiment records
    collected for the package.
    """

    __tablename__ = "packages"

    # Identity
    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    name: Mapped[str] = mapped_column(String(255), nullable=False)
    ecosystem: Mapped[str] = mapped_column(String(50), nullable=False)
    repo_url: Mapped[Optional[str]] = mapped_column(String(500))

    # Registry-provided metadata
    description: Mapped[Optional[str]] = mapped_column(Text)
    homepage: Mapped[Optional[str]] = mapped_column(String(500))

    # Bookkeeping timestamps.
    # NOTE(review): datetime.utcnow yields naive datetimes and is deprecated
    # as of Python 3.12 - consider a timezone-aware default.
    created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    last_analyzed: Mapped[Optional[datetime]] = mapped_column(DateTime)

    # Child collections; each uses the ORM cascade "all, delete-orphan".
    commits: Mapped[list["Commit"]] = relationship(
        back_populates="package", cascade="all, delete-orphan"
    )
    issues: Mapped[list["Issue"]] = relationship(
        back_populates="package", cascade="all, delete-orphan"
    )
    scores: Mapped[list["Score"]] = relationship(
        back_populates="package", cascade="all, delete-orphan"
    )
    sentiment_records: Mapped[list["SentimentRecord"]] = relationship(
        back_populates="package", cascade="all, delete-orphan"
    )

    __table_args__ = (
        UniqueConstraint("name", "ecosystem", name="uq_package_name_ecosystem"),
        Index("ix_package_ecosystem", "ecosystem"),
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class Commit(Base):
    """One commit from a package's repository, unique per (package, sha)."""

    __tablename__ = "commits"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Owning package; the foreign key is declared with ON DELETE CASCADE.
    package_id: Mapped[int] = mapped_column(ForeignKey("packages.id", ondelete="CASCADE"))

    sha: Mapped[str] = mapped_column(String(40), nullable=False)

    # Author details
    author_name: Mapped[str] = mapped_column(String(255))
    author_email: Mapped[str] = mapped_column(String(255))
    authored_date: Mapped[datetime] = mapped_column(DateTime)

    # Committer details (optional)
    committer_name: Mapped[Optional[str]] = mapped_column(String(255))
    committer_email: Mapped[Optional[str]] = mapped_column(String(255))
    committed_date: Mapped[Optional[datetime]] = mapped_column(DateTime)

    message: Mapped[str] = mapped_column(Text)

    # Back-reference to the owning package
    package: Mapped["Package"] = relationship(back_populates="commits")

    __table_args__ = (
        UniqueConstraint("package_id", "sha", name="uq_commit_package_sha"),
        Index("ix_commit_authored_date", "authored_date"),
        Index("ix_commit_author_email", "author_email"),
    )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class Issue(Base):
    """An issue or pull request from a package's repository.

    Unique per (package, number); ``is_pull_request`` tells the two kinds apart.
    """

    __tablename__ = "issues"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Owning package; the foreign key is declared with ON DELETE CASCADE.
    package_id: Mapped[int] = mapped_column(ForeignKey("packages.id", ondelete="CASCADE"))

    # Content
    number: Mapped[int] = mapped_column(Integer)
    title: Mapped[str] = mapped_column(String(500))
    body: Mapped[Optional[str]] = mapped_column(Text)
    state: Mapped[str] = mapped_column(String(20))  # open, closed
    is_pull_request: Mapped[bool] = mapped_column(default=False)

    # Authorship and lifecycle timestamps
    author_login: Mapped[Optional[str]] = mapped_column(String(255))
    created_at: Mapped[datetime] = mapped_column(DateTime)
    updated_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
    closed_at: Mapped[Optional[datetime]] = mapped_column(DateTime)

    # Comments are stored as a JSON array.
    # NOTE(review): the annotation says dict while the stored value is
    # described as an array - confirm which is intended.
    comments: Mapped[Optional[dict]] = mapped_column(JSON)

    # Back-reference to the owning package
    package: Mapped["Package"] = relationship(back_populates="issues")

    __table_args__ = (
        UniqueConstraint("package_id", "number", name="uq_issue_package_number"),
        Index("ix_issue_created_at", "created_at"),
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class Score(Base):
    """A risk score calculated for a package, with its components and inputs."""

    __tablename__ = "scores"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Owning package; the foreign key is declared with ON DELETE CASCADE.
    package_id: Mapped[int] = mapped_column(ForeignKey("packages.id", ondelete="CASCADE"))

    # When the score was calculated, and the cutoff date it was calculated for.
    # NOTE(review): datetime.utcnow yields naive datetimes and is deprecated
    # as of Python 3.12 - consider a timezone-aware default.
    calculated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
    cutoff_date: Mapped[datetime] = mapped_column(DateTime)

    # Overall result
    final_score: Mapped[int] = mapped_column(Integer)
    risk_level: Mapped[str] = mapped_column(String(20))  # CRITICAL, HIGH, MODERATE, LOW, VERY_LOW

    # Individual components of the score
    base_risk: Mapped[int] = mapped_column(Integer)
    activity_modifier: Mapped[int] = mapped_column(Integer)
    protective_factors_total: Mapped[int] = mapped_column(Integer)
    sentiment_modifier: Mapped[int] = mapped_column(Integer, default=0)

    # Full breakdown, kept as JSON
    breakdown: Mapped[dict] = mapped_column(JSON)

    # Snapshot of the core metrics at scoring time
    maintainer_concentration: Mapped[float] = mapped_column(Float)
    commits_last_year: Mapped[int] = mapped_column(Integer)
    unique_contributors: Mapped[int] = mapped_column(Integer)
    weekly_downloads: Mapped[int] = mapped_column(Integer, default=0)

    # Back-reference to the owning package
    package: Mapped["Package"] = relationship(back_populates="scores")

    __table_args__ = (
        Index("ix_score_calculated_at", "calculated_at"),
    )
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class SentimentRecord(Base):
    """Sentiment analysis result for one piece of text of a package.

    The (package_id, text_hash) pair is unique.
    """

    __tablename__ = "sentiment_records"

    id: Mapped[int] = mapped_column(Integer, primary_key=True)
    # Owning package; the foreign key is declared with ON DELETE CASCADE.
    package_id: Mapped[int] = mapped_column(ForeignKey("packages.id", ondelete="CASCADE"))

    # Where the analysed text came from
    source_type: Mapped[str] = mapped_column(String(50))  # commit, issue, comment
    source_id: Mapped[str] = mapped_column(String(255))  # sha or issue number

    # Hash of the text, used for deduplication
    text_hash: Mapped[str] = mapped_column(String(64))

    # Sentiment scores
    compound_score: Mapped[float] = mapped_column(Float)  # -1 to 1
    positive_score: Mapped[float] = mapped_column(Float)
    negative_score: Mapped[float] = mapped_column(Float)
    neutral_score: Mapped[float] = mapped_column(Float)

    # Frustration detection outcome and the keywords recorded for it
    frustration_detected: Mapped[bool] = mapped_column(default=False)
    frustration_keywords: Mapped[Optional[list]] = mapped_column(JSON)

    # Analysis timestamp.
    # NOTE(review): datetime.utcnow yields naive datetimes and is deprecated
    # as of Python 3.12 - consider a timezone-aware default.
    analyzed_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)

    # Back-reference to the owning package
    package: Mapped["Package"] = relationship(back_populates="sentiment_records")

    __table_args__ = (
        UniqueConstraint("package_id", "text_hash", name="uq_sentiment_package_hash"),
        Index("ix_sentiment_source_type", "source_type"),
    )
|
ossuary/db/session.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Database session management."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from typing import Generator
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import create_engine
|
|
8
|
+
from sqlalchemy.orm import Session, sessionmaker
|
|
9
|
+
|
|
10
|
+
from ossuary.db.models import Base
|
|
11
|
+
|
|
12
|
+
# The connection string comes from the environment; a local SQLite file is
# the development fallback.
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./ossuary.db")

# SQLite URLs get check_same_thread=False passed to the driver; any other
# URL is given no extra connect arguments.
engine = create_engine(
    DATABASE_URL,
    connect_args={"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {},
)

# Factory for sessions bound to the engine, with autoflush and autocommit off.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def init_db() -> None:
    """Create every table registered on ``Base.metadata`` using the module engine."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_session() -> Generator[Session, None, None]:
    """Yield a new database session and close it once the generator finishes.

    Nothing is committed here; committing is left to the consumer.
    """
    # Leaving the ``with`` block closes the session, also when an exception
    # is thrown into the generator.
    with SessionLocal() as db:
        yield db
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@contextmanager
def session_scope() -> Generator[Session, None, None]:
    """Context manager giving a session with commit-on-success semantics.

    The session is committed when the managed block completes, rolled back
    (and the exception re-raised) when it or the commit raises ``Exception``,
    and closed in every case.
    """
    # Leaving the ``with`` block closes the session on every exit path.
    with SessionLocal() as db:
        try:
            yield db
            # Commit stays inside the try so a failing commit is rolled back too.
            db.commit()
        except Exception:
            db.rollback()
            raise
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Risk scoring engine."""
|
|
2
|
+
|
|
3
|
+
from ossuary.scoring.engine import PackageMetrics, RiskScorer
|
|
4
|
+
from ossuary.scoring.factors import ProtectiveFactors, RiskBreakdown, RiskLevel
|
|
5
|
+
from ossuary.scoring.reputation import ReputationBreakdown, ReputationScorer, ReputationTier
|
|
6
|
+
|
|
7
|
+
# Public API of the scoring package, grouped by originating module.
__all__ = [
    # engine
    "PackageMetrics", "RiskScorer",
    # factors
    "ProtectiveFactors", "RiskBreakdown", "RiskLevel",
    # reputation
    "ReputationBreakdown", "ReputationScorer", "ReputationTier",
]
|