buildlog 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/cli.py +391 -3
- buildlog/data/__init__.py +0 -0
- buildlog/data/seeds/security_karen.yaml +162 -0
- buildlog/data/seeds/test_terrorist.yaml +280 -0
- buildlog/seed_engine/__init__.py +74 -0
- buildlog/seed_engine/categorizers.py +145 -0
- buildlog/seed_engine/extractors.py +148 -0
- buildlog/seed_engine/generators.py +144 -0
- buildlog/seed_engine/models.py +113 -0
- buildlog/seed_engine/pipeline.py +202 -0
- buildlog/seed_engine/sources.py +362 -0
- buildlog/seeds.py +261 -0
- buildlog/skills.py +26 -3
- {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/METADATA +82 -11
- buildlog-0.6.1.dist-info/RECORD +41 -0
- buildlog-0.5.0.dist-info/RECORD +0 -30
- {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/copier.yml +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/WHEEL +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/entry_points.txt +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
"""Source management and fetching for the seed engine.
|
|
2
|
+
|
|
3
|
+
Handles:
|
|
4
|
+
- Source manifests (what to fetch)
|
|
5
|
+
- Content caching (fetched markdown)
|
|
6
|
+
- Incremental fetching (fetch on demand)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import logging
|
|
13
|
+
import re
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from enum import Enum
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
from urllib.parse import urlparse
|
|
20
|
+
|
|
21
|
+
import yaml
|
|
22
|
+
|
|
23
|
+
from buildlog.seed_engine.models import Source, SourceType
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FetchStatus(Enum):
    """Lifecycle state of a source fetch."""

    # Not yet fetched.
    PENDING = "pending"
    # Successfully fetched and cached.
    CACHED = "cached"
    # Fetch attempt failed.
    FAILED = "failed"
    # Cached but needs refresh.
    STALE = "stale"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class SourceEntry:
    """A source entry in the manifest with fetch status."""

    source: Source
    status: FetchStatus = FetchStatus.PENDING
    fetched_at: datetime | None = None
    cache_path: str | None = None
    error: str | None = None
    content_hash: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to dictionary."""
        src = self.source
        timestamp = self.fetched_at.isoformat() if self.fetched_at else None
        return {
            "name": src.name,
            "url": src.url,
            "source_type": src.source_type.value,
            "domain": src.domain,
            "description": src.description,
            "sections": src.sections,
            "status": self.status.value,
            "fetched_at": timestamp,
            "cache_path": self.cache_path,
            "error": self.error,
            "content_hash": self.content_hash,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SourceEntry:
        """Deserialize from dictionary."""
        raw_timestamp = data.get("fetched_at")
        return cls(
            source=Source(
                name=data["name"],
                url=data["url"],
                source_type=SourceType(data["source_type"]),
                domain=data["domain"],
                description=data.get("description", ""),
                sections=data.get("sections", []),
            ),
            status=FetchStatus(data.get("status", "pending")),
            fetched_at=datetime.fromisoformat(raw_timestamp) if raw_timestamp else None,
            cache_path=data.get("cache_path"),
            error=data.get("error"),
            content_hash=data.get("content_hash"),
        )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class SourceManifest:
    """Manifest of sources for a persona.

    Tracks what sources exist, their fetch status, and where
    cached content lives.

    Usage:
        manifest = SourceManifest(persona="test_terrorist")
        manifest.add_source(Source(...))
        manifest.save(Path(".buildlog/sources/test_terrorist"))

        # Later
        manifest = SourceManifest.load(Path(".buildlog/sources/test_terrorist"))
        pending = manifest.get_pending()
    """

    persona: str
    entries: list[SourceEntry] = field(default_factory=list)
    version: int = 1
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    def add_source(self, source: Source) -> SourceEntry:
        """Add a source to the manifest, returning the existing entry on duplicate URL."""
        existing = self.get_by_url(source.url)
        if existing is not None:
            logger.warning(f"Source already exists: {source.url}")
            return existing

        entry = SourceEntry(source=source)
        self.entries.append(entry)
        self.updated_at = datetime.now(timezone.utc)
        return entry

    def get_by_url(self, url: str) -> SourceEntry | None:
        """Get entry by URL, or None if absent."""
        return next((e for e in self.entries if e.source.url == url), None)

    def _with_status(self, status: FetchStatus) -> list[SourceEntry]:
        """Return entries whose fetch status matches *status*."""
        return [e for e in self.entries if e.status == status]

    def get_pending(self) -> list[SourceEntry]:
        """Get all entries that haven't been fetched."""
        return self._with_status(FetchStatus.PENDING)

    def get_cached(self) -> list[SourceEntry]:
        """Get all entries that have been fetched."""
        return self._with_status(FetchStatus.CACHED)

    def get_failed(self) -> list[SourceEntry]:
        """Get all entries that failed to fetch."""
        return self._with_status(FetchStatus.FAILED)

    def summary(self) -> dict[str, int]:
        """Get a count of entries per fetch status."""
        return {
            "total": len(self.entries),
            "pending": len(self.get_pending()),
            "cached": len(self.get_cached()),
            "failed": len(self.get_failed()),
        }

    def to_dict(self) -> dict[str, Any]:
        """Serialize to dictionary."""
        return {
            "persona": self.persona,
            "version": self.version,
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
            "entries": [e.to_dict() for e in self.entries],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SourceManifest:
        """Deserialize from dictionary."""
        return cls(
            persona=data["persona"],
            version=data.get("version", 1),
            created_at=datetime.fromisoformat(data["created_at"]),
            updated_at=datetime.fromisoformat(data["updated_at"]),
            entries=[SourceEntry.from_dict(e) for e in data.get("entries", [])],
        )

    def save(self, base_dir: Path) -> Path:
        """Save manifest to disk.

        Args:
            base_dir: Base directory (e.g., .buildlog/sources/test_terrorist)

        Returns:
            Path to saved manifest file.
        """
        base_dir.mkdir(parents=True, exist_ok=True)
        manifest_path = base_dir / "manifest.yaml"

        with open(manifest_path, "w") as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)

        return manifest_path

    @classmethod
    def load(cls, base_dir: Path) -> SourceManifest | None:
        """Load manifest from disk.

        Args:
            base_dir: Base directory containing manifest.yaml

        Returns:
            Loaded manifest or None if not found.
        """
        manifest_path = base_dir / "manifest.yaml"
        if not manifest_path.exists():
            return None

        with open(manifest_path) as f:
            return cls.from_dict(yaml.safe_load(f))
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def url_to_cache_filename(url: str) -> str:
    """Convert URL to a safe cache filename.

    Examples:
        https://testing.googleblog.com/2015/04/test.html
        → testing_googleblog_com_2015_04_test.md

    Args:
        url: The source URL to derive a filename from.

    Returns:
        A sanitized ``*.md`` filename (stem truncated to 100 chars).
    """
    parsed = urlparse(url)

    # Combine host and path components.
    parts = [parsed.netloc] + [p for p in parsed.path.split("/") if p]

    # BUG FIX: strip the file extension only from the final path
    # component. The old code stripped a trailing ".xyz" from every
    # part, which ate the TLD (".com") off the hostname — contradicting
    # the docstring example above.
    if len(parts) > 1:
        parts[-1] = re.sub(r"\.[a-z]+$", "", parts[-1])

    clean_parts = []
    for part in parts:
        # Replace non-alphanumeric with underscore.
        part = re.sub(r"[^a-zA-Z0-9]", "_", part)
        # Collapse multiple underscores.
        part = re.sub(r"_+", "_", part)
        # Strip leading/trailing underscores.
        part = part.strip("_")
        if part:
            clean_parts.append(part)

    # Join and truncate to keep filenames filesystem-friendly.
    filename = "_".join(clean_parts)[:100]
    # BUG FIX: the computed filename was previously discarded and a
    # placeholder literal returned instead.
    return f"{filename}.md"
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def content_hash(content: str) -> str:
    """Return a short, stable SHA-256 fingerprint of *content* for change detection."""
    digest = hashlib.sha256(content.encode())
    return digest.hexdigest()[:16]
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
@dataclass
class SourceFetcher:
    """Fetches and caches source content.

    Usage:
        fetcher = SourceFetcher(cache_dir=Path(".buildlog/sources/test_terrorist/cache"))

        # Fetch a single source
        content = fetcher.fetch(entry)

        # Fetch all pending
        results = fetcher.fetch_pending(manifest)
    """

    cache_dir: Path

    def __post_init__(self):
        # Ensure the cache directory exists before any fetch writes to it.
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def fetch(self, entry: SourceEntry, force: bool = False) -> str | None:
        """Fetch a single source and cache it.

        Args:
            entry: The source entry to fetch.
            force: If True, refetch even if cached.

        Returns:
            Content as markdown, or None if failed.
        """
        # Serve from cache when present and not forced.
        if entry.status == FetchStatus.CACHED and not force:
            cached = self.get_cached_content(entry)
            if cached is not None:
                return cached

        # Fetch content
        try:
            content = self._fetch_url(entry.source.url)
            if content is None:
                entry.status = FetchStatus.FAILED
                entry.error = "Failed to fetch content"
                return None

            # Cache it
            filename = url_to_cache_filename(entry.source.url)
            cache_path = self.cache_dir / filename
            cache_path.write_text(content)

            # Update entry bookkeeping so the manifest reflects the cache.
            entry.status = FetchStatus.CACHED
            entry.fetched_at = datetime.now(timezone.utc)
            entry.cache_path = filename
            entry.content_hash = content_hash(content)
            entry.error = None

            # BUG FIX: log the actual cache filename; the previous message
            # ended in a placeholder literal instead of the filename.
            logger.info(f"Fetched and cached: {entry.source.name} → {filename}")
            return content

        except Exception as e:
            entry.status = FetchStatus.FAILED
            entry.error = str(e)
            logger.error(f"Failed to fetch {entry.source.url}: {e}")
            return None

    def _fetch_url(self, url: str) -> str | None:
        """Fetch URL and convert to markdown.

        This is a placeholder - in production you'd use:
        - requests + html2text for web pages
        - PDF extractors for PDFs
        - API clients for structured sources

        For now, returns a placeholder indicating manual fetch needed.
        """
        # TODO: Implement actual fetching
        # For now, return a template indicating manual population needed
        return f"""# {url}

> **Note**: This source requires manual population.
>
> Fetch the content from: {url}
> Then paste the relevant sections below.

---

## Content

[Paste content here]

---

## Extracted Rules

[Document rules extracted from this source]
"""

    def fetch_pending(self, manifest: SourceManifest) -> dict[str, bool]:
        """Fetch all pending sources in a manifest.

        Returns:
            Dict mapping URL to success status.
        """
        results = {}
        for entry in manifest.get_pending():
            content = self.fetch(entry)
            results[entry.source.url] = content is not None
        return results

    def get_cached_content(self, entry: SourceEntry) -> str | None:
        """Get cached content for an entry, or None if not on disk."""
        if entry.cache_path:
            cache_path = self.cache_dir / entry.cache_path
            if cache_path.exists():
                return cache_path.read_text()
        return None
|
buildlog/seeds.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Load curated seed rules for reviewer personas.
|
|
2
|
+
|
|
3
|
+
Seed files provide defensible, human-curated rules that reviewers
|
|
4
|
+
can use immediately without requiring learned data. Each persona
|
|
5
|
+
(security_karen, test_terrorist, ruthless_reviewer) can have its
|
|
6
|
+
own seed file with domain-specific rules.
|
|
7
|
+
|
|
8
|
+
Seed files are YAML with the following format:
|
|
9
|
+
|
|
10
|
+
```yaml
|
|
11
|
+
persona: security_karen
|
|
12
|
+
version: 1
|
|
13
|
+
rules:
|
|
14
|
+
- rule: "Parameterize all SQL queries"
|
|
15
|
+
category: security
|
|
16
|
+
context: "Any code constructing SQL from user input"
|
|
17
|
+
antipattern: "String concatenation or f-strings with user data in SQL"
|
|
18
|
+
rationale: "SQL injection is OWASP A03 - prevents data breach"
|
|
19
|
+
tags: [sql, injection, owasp]
|
|
20
|
+
references:
|
|
21
|
+
- url: "https://owasp.org/Top10/A03_2021-Injection/"
|
|
22
|
+
title: "OWASP A03:2021 Injection"
|
|
23
|
+
```
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"SeedRule",
|
|
30
|
+
"SeedFile",
|
|
31
|
+
"load_seed_file",
|
|
32
|
+
"load_all_seeds",
|
|
33
|
+
"seeds_to_skills",
|
|
34
|
+
"get_package_seeds_dir",
|
|
35
|
+
"get_default_seeds_dir",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
import logging
|
|
39
|
+
from dataclasses import dataclass, field
|
|
40
|
+
from importlib import resources
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from typing import Any
|
|
43
|
+
|
|
44
|
+
import yaml
|
|
45
|
+
|
|
46
|
+
from buildlog.skills import Skill, _generate_skill_id
|
|
47
|
+
|
|
48
|
+
logger = logging.getLogger(__name__)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def get_package_seeds_dir() -> Path | None:
    """Get the path to bundled seed files in the package.

    Returns:
        Path to the package's data/seeds directory, or None if not found.
    """
    try:
        # Python 3.9+ way to get package resources.
        # BUG FIX: the previous code returned the path from *inside* a
        # ``resources.as_file(...)`` context manager. For packages imported
        # from a zip archive, as_file() extracts to a temporary location
        # that is deleted when the context exits, so the returned Path
        # could be dangling. Only return the path when it maps to a real
        # on-disk directory; zip installs fall through to the fallback.
        seeds = resources.files("buildlog").joinpath("data/seeds")
        candidate = Path(str(seeds))
        if seeds.is_dir() and candidate.is_dir():
            return candidate
    except (TypeError, FileNotFoundError, ModuleNotFoundError):
        # ModuleNotFoundError added: a missing package should mean
        # "not found" (None), not a crash.
        pass

    # Fallback: try relative to this file
    fallback = Path(__file__).parent / "data" / "seeds"
    if fallback.exists():
        return fallback

    return None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_default_seeds_dir() -> Path | None:
    """Get the default seeds directory, checking multiple locations.

    Priority:
        1. Local .buildlog/seeds/ (project-specific overrides)
        2. Local buildlog/.buildlog/seeds/ (buildlog template structure)
        3. Package bundled seeds (installed with pip)

    Returns:
        Path to the seeds directory with most precedence, or None if none found.
    """
    # Highest precedence first: a candidate wins only if it exists AND
    # contains at least one YAML seed file.
    candidates = (
        Path(".buildlog") / "seeds",               # project-specific overrides
        Path("buildlog") / ".buildlog" / "seeds",  # buildlog template structure
    )
    for candidate in candidates:
        if candidate.exists() and any(candidate.glob("*.yaml")):
            return candidate

    # Fall back to package seeds
    return get_package_seeds_dir()
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
class SeedReference:
    """A reference/citation for a seed rule.

    Pairs a URL with a human-readable title so rules can point at the
    authoritative source they derive from (see the module docstring's
    ``references`` example).
    """

    # Link to the authoritative source document.
    url: str
    # Human-readable title for the link.
    title: str
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass
class SeedRule:
    """A curated seed rule for a reviewer persona.

    Unlike learned Skills, seed rules come with full defensibility
    metadata from the start: context, antipattern, rationale, and
    references to authoritative sources.
    """

    # The rule text itself, e.g. "Parameterize all SQL queries".
    rule: str
    # Category bucket, e.g. "security".
    category: str
    # When the rule applies.
    context: str
    # What a violation of the rule looks like.
    antipattern: str
    # Why the rule matters.
    rationale: str
    # Free-form topic tags, e.g. [sql, injection, owasp].
    tags: list[str] = field(default_factory=list)
    # Citations backing the rule.
    references: list[SeedReference] = field(default_factory=list)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass
class SeedFile:
    """A collection of seed rules for a persona."""

    persona: str
    version: int
    rules: list[SeedRule]

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SeedFile:
        """Parse a seed file from dictionary (loaded YAML)."""

        def build_rule(rule_data: dict[str, Any]) -> SeedRule:
            # References require both url and title; a missing key
            # surfaces as a KeyError to the caller (same as before).
            references = [
                SeedReference(url=ref["url"], title=ref["title"])
                for ref in rule_data.get("references", [])
            ]
            return SeedRule(
                rule=rule_data["rule"],
                category=rule_data.get("category", "security"),
                context=rule_data.get("context", ""),
                antipattern=rule_data.get("antipattern", ""),
                rationale=rule_data.get("rationale", ""),
                tags=rule_data.get("tags", []),
                references=references,
            )

        return cls(
            persona=data.get("persona", "unknown"),
            version=data.get("version", 1),
            rules=[build_rule(rd) for rd in data.get("rules", [])],
        )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def load_seed_file(path: Path) -> SeedFile | None:
    """Load a single seed file from disk.

    Args:
        path: Path to the YAML seed file.

    Returns:
        Parsed SeedFile or None if loading fails.
    """
    if not path.exists():
        logger.warning(f"Seed file not found: {path}")
        return None

    try:
        with path.open() as fh:
            raw = yaml.safe_load(fh)
        return SeedFile.from_dict(raw)
    except (yaml.YAMLError, KeyError, TypeError) as e:
        # KeyError/TypeError cover malformed rule entries during parsing.
        logger.error(f"Failed to parse seed file {path}: {e}")
        return None
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def load_all_seeds(seeds_dir: Path) -> dict[str, SeedFile]:
    """Load all seed files from a directory.

    Args:
        seeds_dir: Directory containing persona seed files.

    Returns:
        Dict mapping persona name to SeedFile.
    """
    loaded: dict[str, SeedFile] = {}

    if not seeds_dir.exists():
        logger.info(f"Seeds directory not found: {seeds_dir}")
        return loaded

    for seed_path in seeds_dir.glob("*.yaml"):
        parsed = load_seed_file(seed_path)
        if not parsed:
            # Parse failures are already logged by load_seed_file.
            continue
        loaded[parsed.persona] = parsed
        logger.info(
            f"Loaded {len(parsed.rules)} seed rules for {parsed.persona}"
        )

    return loaded
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def seeds_to_skills(seed_file: SeedFile) -> list[Skill]:
    """Convert seed rules to Skill objects.

    Seed rules become Skills with:
    - frequency=0 (not learned, seeded)
    - confidence="high" (curated by humans)
    - Full defensibility metadata

    Args:
        seed_file: The seed file to convert.

    Returns:
        List of Skill objects.
    """

    def to_skill(seed: SeedRule) -> Skill:
        # Provenance: the seed file itself first, then each citation URL.
        provenance = [f"seed:{seed_file.persona}:v{seed_file.version}"]
        provenance.extend(ref.url for ref in seed.references)

        return Skill(
            id=_generate_skill_id(seed.category, seed.rule),  # Stable ID
            category=seed.category,
            rule=seed.rule,
            frequency=0,  # Seeded, not learned
            confidence="high",  # Human-curated
            sources=provenance,
            tags=seed.tags,
            confidence_score=1.0,  # Full confidence in curated rules
            confidence_tier="entrenched",
            context=seed.context,
            antipattern=seed.antipattern,
            rationale=seed.rationale,
            persona_tags=[seed_file.persona],
        )

    return [to_skill(seed) for seed in seed_file.rules]
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def get_rules_for_persona(all_skills: list[Skill], persona: str) -> list[Skill]:
    """Filter skills to those relevant for a specific persona.

    Args:
        all_skills: All available skills (seeded + learned).
        persona: The persona to filter for.

    Returns:
        Skills tagged for this persona.
    """
    matching: list[Skill] = []
    for skill in all_skills:
        if persona in skill.persona_tags:
            matching.append(skill)
    return matching
|
buildlog/skills.py
CHANGED
|
@@ -72,11 +72,17 @@ class SkillDict(_SkillDictRequired, total=False):
|
|
|
72
72
|
"""Type for skill dictionary representation.
|
|
73
73
|
|
|
74
74
|
Inherits required fields from _SkillDictRequired.
|
|
75
|
-
Optional fields are only present when continuous confidence is enabled
|
|
75
|
+
Optional fields are only present when continuous confidence is enabled
|
|
76
|
+
or when defensibility fields are populated.
|
|
76
77
|
"""
|
|
77
78
|
|
|
78
79
|
confidence_score: float
|
|
79
80
|
confidence_tier: str
|
|
81
|
+
# Defensibility fields (from #24 - tighter schema)
|
|
82
|
+
context: str # When does this rule apply?
|
|
83
|
+
antipattern: str # What does violation look like?
|
|
84
|
+
rationale: str # Why does this matter?
|
|
85
|
+
persona_tags: list[str] # Which reviewers use this rule?
|
|
80
86
|
|
|
81
87
|
|
|
82
88
|
class SkillSetDict(TypedDict):
|
|
@@ -105,6 +111,10 @@ class Skill:
|
|
|
105
111
|
tags: Extracted technology/concept tags.
|
|
106
112
|
confidence_score: Continuous confidence score (0-1), if calculated.
|
|
107
113
|
confidence_tier: Descriptive tier (speculative/provisional/stable/entrenched).
|
|
114
|
+
context: When does this rule apply? (defensibility)
|
|
115
|
+
antipattern: What does violation look like? (defensibility)
|
|
116
|
+
rationale: Why does this rule matter? (defensibility)
|
|
117
|
+
persona_tags: Which reviewer personas use this rule?
|
|
108
118
|
"""
|
|
109
119
|
|
|
110
120
|
id: str
|
|
@@ -116,12 +126,16 @@ class Skill:
|
|
|
116
126
|
tags: list[str] = field(default_factory=list)
|
|
117
127
|
confidence_score: float | None = None
|
|
118
128
|
confidence_tier: str | None = None
|
|
129
|
+
# Defensibility fields (#24)
|
|
130
|
+
context: str | None = None
|
|
131
|
+
antipattern: str | None = None
|
|
132
|
+
rationale: str | None = None
|
|
133
|
+
persona_tags: list[str] = field(default_factory=list)
|
|
119
134
|
|
|
120
135
|
def to_dict(self) -> SkillDict:
|
|
121
136
|
"""Convert to dictionary for serialization.
|
|
122
137
|
|
|
123
|
-
Only includes optional fields
|
|
124
|
-
when they are set.
|
|
138
|
+
Only includes optional fields when they are set.
|
|
125
139
|
"""
|
|
126
140
|
result = SkillDict(
|
|
127
141
|
id=self.id,
|
|
@@ -136,6 +150,15 @@ class Skill:
|
|
|
136
150
|
result["confidence_score"] = self.confidence_score
|
|
137
151
|
if self.confidence_tier is not None:
|
|
138
152
|
result["confidence_tier"] = self.confidence_tier
|
|
153
|
+
# Defensibility fields
|
|
154
|
+
if self.context is not None:
|
|
155
|
+
result["context"] = self.context
|
|
156
|
+
if self.antipattern is not None:
|
|
157
|
+
result["antipattern"] = self.antipattern
|
|
158
|
+
if self.rationale is not None:
|
|
159
|
+
result["rationale"] = self.rationale
|
|
160
|
+
if self.persona_tags:
|
|
161
|
+
result["persona_tags"] = self.persona_tags
|
|
139
162
|
return result
|
|
140
163
|
|
|
141
164
|
|