buildlog 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,362 @@
+"""Source management and fetching for the seed engine.
+
+Handles:
+- Source manifests (what to fetch)
+- Content caching (fetched markdown)
+- Incremental fetching (fetch on demand)
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+import re
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse
+
+import yaml
+
+from buildlog.seed_engine.models import Source, SourceType
+
+logger = logging.getLogger(__name__)
+
+
+class FetchStatus(Enum):
+    """Status of a source fetch."""
+
+    PENDING = "pending"  # Not yet fetched
+    CACHED = "cached"  # Successfully fetched and cached
+    FAILED = "failed"  # Fetch failed
+    STALE = "stale"  # Cached but needs refresh
+
+
+@dataclass
+class SourceEntry:
+    """A source entry in the manifest with fetch status."""
+
+    source: Source
+    status: FetchStatus = FetchStatus.PENDING
+    fetched_at: datetime | None = None
+    cache_path: str | None = None
+    error: str | None = None
+    content_hash: str | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize to dictionary."""
+        return {
+            "name": self.source.name,
+            "url": self.source.url,
+            "source_type": self.source.source_type.value,
+            "domain": self.source.domain,
+            "description": self.source.description,
+            "sections": self.source.sections,
+            "status": self.status.value,
+            "fetched_at": self.fetched_at.isoformat() if self.fetched_at else None,
+            "cache_path": self.cache_path,
+            "error": self.error,
+            "content_hash": self.content_hash,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> SourceEntry:
+        """Deserialize from dictionary."""
+        source = Source(
+            name=data["name"],
+            url=data["url"],
+            source_type=SourceType(data["source_type"]),
+            domain=data["domain"],
+            description=data.get("description", ""),
+            sections=data.get("sections", []),
+        )
+        fetched_at = None
+        if data.get("fetched_at"):
+            fetched_at = datetime.fromisoformat(data["fetched_at"])
+
+        return cls(
+            source=source,
+            status=FetchStatus(data.get("status", "pending")),
+            fetched_at=fetched_at,
+            cache_path=data.get("cache_path"),
+            error=data.get("error"),
+            content_hash=data.get("content_hash"),
+        )
+
+
+@dataclass
+class SourceManifest:
+    """Manifest of sources for a persona.
+
+    Tracks what sources exist, their fetch status, and where
+    cached content lives.
+
+    Usage:
+        manifest = SourceManifest(persona="test_terrorist")
+        manifest.add_source(Source(...))
+        manifest.save(Path(".buildlog/sources/test_terrorist"))
+
+        # Later
+        manifest = SourceManifest.load(Path(".buildlog/sources/test_terrorist"))
+        pending = manifest.get_pending()
+    """
+
+    persona: str
+    entries: list[SourceEntry] = field(default_factory=list)
+    version: int = 1
+    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+    updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+    def add_source(self, source: Source) -> SourceEntry:
+        """Add a source to the manifest."""
+        # Check for duplicates
+        for entry in self.entries:
+            if entry.source.url == source.url:
+                logger.warning(f"Source already exists: {source.url}")
+                return entry
+
+        entry = SourceEntry(source=source)
+        self.entries.append(entry)
+        self.updated_at = datetime.now(timezone.utc)
+        return entry
+
+    def get_by_url(self, url: str) -> SourceEntry | None:
+        """Get entry by URL."""
+        for entry in self.entries:
+            if entry.source.url == url:
+                return entry
+        return None
+
+    def get_pending(self) -> list[SourceEntry]:
+        """Get all entries that haven't been fetched."""
+        return [e for e in self.entries if e.status == FetchStatus.PENDING]
+
+    def get_cached(self) -> list[SourceEntry]:
+        """Get all entries that have been fetched."""
+        return [e for e in self.entries if e.status == FetchStatus.CACHED]
+
+    def get_failed(self) -> list[SourceEntry]:
+        """Get all entries that failed to fetch."""
+        return [e for e in self.entries if e.status == FetchStatus.FAILED]
+
+    def summary(self) -> dict[str, int]:
+        """Get summary of fetch statuses."""
+        return {
+            "total": len(self.entries),
+            "pending": len(self.get_pending()),
+            "cached": len(self.get_cached()),
+            "failed": len(self.get_failed()),
+        }
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize to dictionary."""
+        return {
+            "persona": self.persona,
+            "version": self.version,
+            "created_at": self.created_at.isoformat(),
+            "updated_at": self.updated_at.isoformat(),
+            "entries": [e.to_dict() for e in self.entries],
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> SourceManifest:
+        """Deserialize from dictionary."""
+        return cls(
+            persona=data["persona"],
+            version=data.get("version", 1),
+            created_at=datetime.fromisoformat(data["created_at"]),
+            updated_at=datetime.fromisoformat(data["updated_at"]),
+            entries=[SourceEntry.from_dict(e) for e in data.get("entries", [])],
+        )
+
+    def save(self, base_dir: Path) -> Path:
+        """Save manifest to disk.
+
+        Args:
+            base_dir: Base directory (e.g., .buildlog/sources/test_terrorist)
+
+        Returns:
+            Path to saved manifest file.
+        """
+        base_dir.mkdir(parents=True, exist_ok=True)
+        manifest_path = base_dir / "manifest.yaml"
+
+        with open(manifest_path, "w") as f:
+            yaml.dump(self.to_dict(), f, default_flow_style=False, sort_keys=False)
+
+        return manifest_path
+
+    @classmethod
+    def load(cls, base_dir: Path) -> SourceManifest | None:
+        """Load manifest from disk.
+
+        Args:
+            base_dir: Base directory containing manifest.yaml
+
+        Returns:
+            Loaded manifest or None if not found.
+        """
+        manifest_path = base_dir / "manifest.yaml"
+        if not manifest_path.exists():
+            return None
+
+        with open(manifest_path) as f:
+            data = yaml.safe_load(f)
+
+        return cls.from_dict(data)
+
+
+def url_to_cache_filename(url: str) -> str:
+    """Convert URL to a safe cache filename.
+
+    Examples:
+        https://testing.googleblog.com/2015/04/test.html
+        → testing_googleblog_com_2015_04_test.md
+    """
+    parsed = urlparse(url)
+
+    # Combine host and path
+    parts = [parsed.netloc] + [p for p in parsed.path.split("/") if p]
+
+    # Clean each part
+    clean_parts = []
+    for part in parts:
+        # Remove extension
+        part = re.sub(r"\.[a-z]+$", "", part)
+        # Replace non-alphanumeric with underscore
+        part = re.sub(r"[^a-zA-Z0-9]", "_", part)
+        # Collapse multiple underscores
+        part = re.sub(r"_+", "_", part)
+        # Strip leading/trailing underscores
+        part = part.strip("_")
+        if part:
+            clean_parts.append(part)
+
+    # Join and truncate
+    filename = "_".join(clean_parts)[:100]
+    return f"{filename}.md"
+
+
+def content_hash(content: str) -> str:
+    """Generate hash of content for change detection."""
+    return hashlib.sha256(content.encode()).hexdigest()[:16]
+
+
+@dataclass
+class SourceFetcher:
+    """Fetches and caches source content.
+
+    Usage:
+        fetcher = SourceFetcher(cache_dir=Path(".buildlog/sources/test_terrorist/cache"))
+
+        # Fetch a single source
+        content = fetcher.fetch(entry)
+
+        # Fetch all pending
+        results = fetcher.fetch_pending(manifest)
+    """
+
+    cache_dir: Path
+
+    def __post_init__(self):
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+    def fetch(self, entry: SourceEntry, force: bool = False) -> str | None:
+        """Fetch a single source and cache it.
+
+        Args:
+            entry: The source entry to fetch.
+            force: If True, refetch even if cached.
+
+        Returns:
+            Content as markdown, or None if failed.
+        """
+        # Check cache
+        if entry.status == FetchStatus.CACHED and not force:
+            if entry.cache_path:
+                cache_path = self.cache_dir / entry.cache_path
+                if cache_path.exists():
+                    return cache_path.read_text()
+
+        # Fetch content
+        try:
+            content = self._fetch_url(entry.source.url)
+            if content is None:
+                entry.status = FetchStatus.FAILED
+                entry.error = "Failed to fetch content"
+                return None
+
+            # Cache it
+            filename = url_to_cache_filename(entry.source.url)
+            cache_path = self.cache_dir / filename
+            cache_path.write_text(content)
+
+            # Update entry
+            entry.status = FetchStatus.CACHED
+            entry.fetched_at = datetime.now(timezone.utc)
+            entry.cache_path = filename
+            entry.content_hash = content_hash(content)
+            entry.error = None
+
+            logger.info(f"Fetched and cached: {entry.source.name} → {filename}")
+            return content
+
+        except Exception as e:
+            entry.status = FetchStatus.FAILED
+            entry.error = str(e)
+            logger.error(f"Failed to fetch {entry.source.url}: {e}")
+            return None
+
+    def _fetch_url(self, url: str) -> str | None:
+        """Fetch URL and convert to markdown.
+
+        This is a placeholder - in production you'd use:
+        - requests + html2text for web pages
+        - PDF extractors for PDFs
+        - API clients for structured sources
+
+        For now, returns a placeholder indicating manual fetch needed.
+        """
+        # TODO: Implement actual fetching
+        # For now, return a template indicating manual population needed
+        return f"""# {url}
+
+> **Note**: This source requires manual population.
+>
+> Fetch the content from: {url}
+> Then paste the relevant sections below.
+
+---
+
+## Content
+
+[Paste content here]
+
+---
+
+## Extracted Rules
+
+[Document rules extracted from this source]
+"""
+
+    def fetch_pending(self, manifest: SourceManifest) -> dict[str, bool]:
+        """Fetch all pending sources in a manifest.
+
+        Returns:
+            Dict mapping URL to success status.
+        """
+        results = {}
+        for entry in manifest.get_pending():
+            content = self.fetch(entry)
+            results[entry.source.url] = content is not None
+        return results
+
+    def get_cached_content(self, entry: SourceEntry) -> str | None:
+        """Get cached content for an entry."""
+        if entry.cache_path:
+            cache_path = self.cache_dir / entry.cache_path
+            if cache_path.exists():
+                return cache_path.read_text()
+        return None
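
Taken together, the two entry points added above, `SourceManifest` and `SourceFetcher`, are meant to be driven as a build-then-fetch loop. The following is a minimal sketch of that loop, not shipped code: it assumes the new module is importable as `buildlog.seed_engine.sources` (the diff does not name the file), the persona, URL, and source metadata are illustrative, and since the concrete `SourceType` members are not shown here the sketch just picks one generically. Note that `_fetch_url` is still a placeholder, so fetched entries will contain the manual-population template.

```python
from pathlib import Path

from buildlog.seed_engine.models import Source, SourceType
# Import path for the new module is an assumption; this diff omits the file name.
from buildlog.seed_engine.sources import SourceFetcher, SourceManifest

base = Path(".buildlog/sources/test_terrorist")  # layout mirrors the docstrings above

# Reload an existing manifest or start a fresh one for the persona.
manifest = SourceManifest.load(base) or SourceManifest(persona="test_terrorist")

# Register a source (name, URL, and domain are illustrative values).
manifest.add_source(
    Source(
        name="Google Testing Blog: end-to-end tests",
        url="https://testing.googleblog.com/2015/04/just-say-no-to-more-end-to-end-tests.html",
        source_type=next(iter(SourceType)),  # any member; concrete values aren't in this diff
        domain="testing",
        description="Guidance on the test pyramid",
        sections=[],
    )
)

# Fetch everything still pending (currently yields the placeholder markdown) and persist.
fetcher = SourceFetcher(cache_dir=base / "cache")
results = fetcher.fetch_pending(manifest)
manifest.save(base)

print(manifest.summary())  # e.g. {"total": 1, "pending": 0, "cached": 1, "failed": 0}
print(results)             # {url: True/False}
```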
buildlog/seeds.py ADDED
@@ -0,0 +1,211 @@
+"""Load curated seed rules for reviewer personas.
+
+Seed files provide defensible, human-curated rules that reviewers
+can use immediately without requiring learned data. Each persona
+(security_karen, test_terrorist, ruthless_reviewer) can have its
+own seed file with domain-specific rules.
+
+Seed files are YAML with the following format:
+
+```yaml
+persona: security_karen
+version: 1
+rules:
+  - rule: "Parameterize all SQL queries"
+    category: security
+    context: "Any code constructing SQL from user input"
+    antipattern: "String concatenation or f-strings with user data in SQL"
+    rationale: "SQL injection is OWASP A03 - prevents data breach"
+    tags: [sql, injection, owasp]
+    references:
+      - url: "https://owasp.org/Top10/A03_2021-Injection/"
+        title: "OWASP A03:2021 Injection"
+```
+"""
+
+from __future__ import annotations
+
+__all__ = [
+    "SeedRule",
+    "SeedFile",
+    "load_seed_file",
+    "load_all_seeds",
+    "seeds_to_skills",
+]
+
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from buildlog.skills import Skill, _generate_skill_id
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SeedReference:
+    """A reference/citation for a seed rule."""
+
+    url: str
+    title: str
+
+
+@dataclass
+class SeedRule:
+    """A curated seed rule for a reviewer persona.
+
+    Unlike learned Skills, seed rules come with full defensibility
+    metadata from the start: context, antipattern, rationale, and
+    references to authoritative sources.
+    """
+
+    rule: str
+    category: str
+    context: str
+    antipattern: str
+    rationale: str
+    tags: list[str] = field(default_factory=list)
+    references: list[SeedReference] = field(default_factory=list)
+
+
+@dataclass
+class SeedFile:
+    """A collection of seed rules for a persona."""
+
+    persona: str
+    version: int
+    rules: list[SeedRule]
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> SeedFile:
+        """Parse a seed file from dictionary (loaded YAML)."""
+        rules = []
+        for rule_data in data.get("rules", []):
+            refs = [
+                SeedReference(url=r["url"], title=r["title"])
+                for r in rule_data.get("references", [])
+            ]
+            rules.append(
+                SeedRule(
+                    rule=rule_data["rule"],
+                    category=rule_data.get("category", "security"),
+                    context=rule_data.get("context", ""),
+                    antipattern=rule_data.get("antipattern", ""),
+                    rationale=rule_data.get("rationale", ""),
+                    tags=rule_data.get("tags", []),
+                    references=refs,
+                )
+            )
+        return cls(
+            persona=data.get("persona", "unknown"),
+            version=data.get("version", 1),
+            rules=rules,
+        )
+
+
+def load_seed_file(path: Path) -> SeedFile | None:
+    """Load a single seed file from disk.
+
+    Args:
+        path: Path to the YAML seed file.
+
+    Returns:
+        Parsed SeedFile or None if loading fails.
+    """
+    if not path.exists():
+        logger.warning(f"Seed file not found: {path}")
+        return None
+
+    try:
+        with open(path) as f:
+            data = yaml.safe_load(f)
+        return SeedFile.from_dict(data)
+    except (yaml.YAMLError, KeyError, TypeError) as e:
+        logger.error(f"Failed to parse seed file {path}: {e}")
+        return None
+
+
+def load_all_seeds(seeds_dir: Path) -> dict[str, SeedFile]:
+    """Load all seed files from a directory.
+
+    Args:
+        seeds_dir: Directory containing persona seed files.
+
+    Returns:
+        Dict mapping persona name to SeedFile.
+    """
+    result: dict[str, SeedFile] = {}
+
+    if not seeds_dir.exists():
+        logger.info(f"Seeds directory not found: {seeds_dir}")
+        return result
+
+    for seed_path in seeds_dir.glob("*.yaml"):
+        seed_file = load_seed_file(seed_path)
+        if seed_file:
+            result[seed_file.persona] = seed_file
+            logger.info(
+                f"Loaded {len(seed_file.rules)} seed rules for {seed_file.persona}"
+            )
+
+    return result
+
+
+def seeds_to_skills(seed_file: SeedFile) -> list[Skill]:
+    """Convert seed rules to Skill objects.
+
+    Seed rules become Skills with:
+    - frequency=0 (not learned, seeded)
+    - confidence="high" (curated by humans)
+    - Full defensibility metadata
+
+    Args:
+        seed_file: The seed file to convert.
+
+    Returns:
+        List of Skill objects.
+    """
+    skills = []
+
+    for seed in seed_file.rules:
+        # Generate stable ID
+        skill_id = _generate_skill_id(seed.category, seed.rule)
+
+        # Build source references from citations
+        sources = [f"seed:{seed_file.persona}:v{seed_file.version}"]
+        sources.extend(ref.url for ref in seed.references)
+
+        skill = Skill(
+            id=skill_id,
+            category=seed.category,
+            rule=seed.rule,
+            frequency=0,  # Seeded, not learned
+            confidence="high",  # Human-curated
+            sources=sources,
+            tags=seed.tags,
+            confidence_score=1.0,  # Full confidence in curated rules
+            confidence_tier="entrenched",
+            context=seed.context,
+            antipattern=seed.antipattern,
+            rationale=seed.rationale,
+            persona_tags=[seed_file.persona],
+        )
+        skills.append(skill)
+
+    return skills
+
+
+def get_rules_for_persona(all_skills: list[Skill], persona: str) -> list[Skill]:
+    """Filter skills to those relevant for a specific persona.
+
+    Args:
+        all_skills: All available skills (seeded + learned).
+        persona: The persona to filter for.
+
+    Returns:
+        Skills tagged for this persona.
+    """
+    return [s for s in all_skills if persona in s.persona_tags]
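
The loading path in `buildlog/seeds.py` goes YAML file → `SeedFile` → `Skill` objects. A short sketch of that round trip follows, assuming a seeds directory at `.buildlog/seeds/` containing per-persona YAML files in the format shown in the module docstring (the module itself does not fix that location):

```python
from pathlib import Path

from buildlog.seeds import get_rules_for_persona, load_all_seeds, seeds_to_skills
from buildlog.skills import Skill

# Directory location is an assumption; load_all_seeds() only needs *.yaml files inside it.
seeds_dir = Path(".buildlog/seeds")

all_skills: list[Skill] = []
for persona, seed_file in load_all_seeds(seeds_dir).items():
    skills = seeds_to_skills(seed_file)
    print(f"{persona}: {len(skills)} seeded rules (seed file v{seed_file.version})")
    all_skills.extend(skills)

# Each seeded skill is tagged with its persona, so a reviewer later pulls only its own rules.
karen_rules = get_rules_for_persona(all_skills, "security_karen")
```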
buildlog/skills.py CHANGED
@@ -72,11 +72,17 @@ class SkillDict(_SkillDictRequired, total=False):
     """Type for skill dictionary representation.

     Inherits required fields from _SkillDictRequired.
-    Optional fields are only present when continuous confidence is enabled.
+    Optional fields are only present when continuous confidence is enabled
+    or when defensibility fields are populated.
     """

     confidence_score: float
     confidence_tier: str
+    # Defensibility fields (from #24 - tighter schema)
+    context: str  # When does this rule apply?
+    antipattern: str  # What does violation look like?
+    rationale: str  # Why does this matter?
+    persona_tags: list[str]  # Which reviewers use this rule?


 class SkillSetDict(TypedDict):
@@ -105,6 +111,10 @@ class Skill:
         tags: Extracted technology/concept tags.
         confidence_score: Continuous confidence score (0-1), if calculated.
         confidence_tier: Descriptive tier (speculative/provisional/stable/entrenched).
+        context: When does this rule apply? (defensibility)
+        antipattern: What does violation look like? (defensibility)
+        rationale: Why does this rule matter? (defensibility)
+        persona_tags: Which reviewer personas use this rule?
     """

     id: str
@@ -116,12 +126,16 @@ class Skill:
     tags: list[str] = field(default_factory=list)
     confidence_score: float | None = None
     confidence_tier: str | None = None
+    # Defensibility fields (#24)
+    context: str | None = None
+    antipattern: str | None = None
+    rationale: str | None = None
+    persona_tags: list[str] = field(default_factory=list)

     def to_dict(self) -> SkillDict:
         """Convert to dictionary for serialization.

-        Only includes optional fields (confidence_score, confidence_tier)
-        when they are set.
+        Only includes optional fields when they are set.
         """
         result = SkillDict(
             id=self.id,
@@ -136,6 +150,15 @@ class Skill:
             result["confidence_score"] = self.confidence_score
         if self.confidence_tier is not None:
             result["confidence_tier"] = self.confidence_tier
+        # Defensibility fields
+        if self.context is not None:
+            result["context"] = self.context
+        if self.antipattern is not None:
+            result["antipattern"] = self.antipattern
+        if self.rationale is not None:
+            result["rationale"] = self.rationale
+        if self.persona_tags:
+            result["persona_tags"] = self.persona_tags
         return result

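
The net effect of the `skills.py` changes is that the four defensibility fields are written by `to_dict()` only when they are set, so previously serialized skills are unaffected. A brief sketch of that behaviour, constructing a `Skill` with the same keyword arguments `seeds_to_skills()` uses above (any `Skill` defaults beyond what this diff shows are assumed):

```python
from buildlog.skills import Skill, _generate_skill_id

rule = "Parameterize all SQL queries"
skill = Skill(
    id=_generate_skill_id("security", rule),  # same helper seeds_to_skills() uses
    category="security",
    rule=rule,
    frequency=0,
    confidence="high",
    sources=["seed:security_karen:v1"],
    tags=["sql", "injection"],
    context="Any code constructing SQL from user input",
    antipattern="String concatenation or f-strings with user data in SQL",
    rationale="SQL injection is OWASP A03",
    persona_tags=["security_karen"],
)

data = skill.to_dict()
assert data["context"] == skill.context            # set, so serialized
assert data["persona_tags"] == ["security_karen"]  # non-empty, so serialized
assert "confidence_score" not in data              # left unset, so omitted
```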