buildlog 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/cli.py +379 -3
- buildlog/seed_engine/__init__.py +74 -0
- buildlog/seed_engine/categorizers.py +145 -0
- buildlog/seed_engine/extractors.py +148 -0
- buildlog/seed_engine/generators.py +144 -0
- buildlog/seed_engine/models.py +113 -0
- buildlog/seed_engine/pipeline.py +202 -0
- buildlog/seed_engine/sources.py +362 -0
- buildlog/seeds.py +211 -0
- buildlog/skills.py +26 -3
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/METADATA +82 -11
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/RECORD +22 -14
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/copier.yml +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/WHEEL +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Rule extractors for Step 2 of the seed engine pipeline.
|
|
2
|
+
|
|
3
|
+
Extractors take sources and produce candidate rules.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from typing import Callable
|
|
10
|
+
|
|
11
|
+
from buildlog.seed_engine.models import CandidateRule, Source
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RuleExtractor(ABC):
    """Abstract interface for turning sources into candidate rules.

    Implementations:
    - ManualExtractor: Human-curated rules (highest quality)
    - LLMExtractor: LLM-assisted extraction (future)
    - StructuredExtractor: Parse structured docs like OWASP (future)
    """

    @abstractmethod
    def extract(self, source: Source) -> list[CandidateRule]:
        """Produce candidate rules drawn from a source.

        Args:
            source: The source to extract rules from.

        Returns:
            List of candidate rules with defensibility fields.
        """

    @abstractmethod
    def validate(self, rule: CandidateRule) -> list[str]:
        """Check a candidate rule for problems.

        Args:
            rule: The rule to validate.

        Returns:
            List of validation issues (empty if valid).
        """
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ManualExtractor(RuleExtractor):
    """Manual rule extraction via human curation.

    This is the gold standard—humans read the source and
    extract rules with full defensibility metadata.

    Usage:
        extractor = ManualExtractor()

        # Register rules for a source
        extractor.register(
            source=google_testing_blog,
            rules=[
                CandidateRule(
                    rule="Tests must not depend on execution order",
                    context="Test suites with multiple tests",
                    antipattern="Test A sets state that Test B relies on",
                    rationale="Order-dependent tests are flaky",
                    source=google_testing_blog,
                    raw_tags=["isolation", "flaky"],
                )
            ]
        )

        # Extract returns registered rules
        rules = extractor.extract(google_testing_blog)
    """

    def __init__(self) -> None:
        # Curated rules keyed by the URL of the source they came from.
        self._rules_by_source: dict[str, list[CandidateRule]] = {}

    def register(self, source: Source, rules: list[CandidateRule]) -> None:
        """Register manually curated rules for a source.

        Args:
            source: The source these rules come from.
            rules: The curated rules.

        Raises:
            ValueError: If any rule fails validation; nothing is registered.
        """
        # Reject the whole batch if a single rule is incomplete.
        for candidate in rules:
            problems = self.validate(candidate)
            if problems:
                raise ValueError(
                    f"Invalid rule '{candidate.rule[:50]}...': {'; '.join(problems)}"
                )
        self._rules_by_source[source.url] = rules

    def extract(self, source: Source) -> list[CandidateRule]:
        """Return registered rules for this source (empty if none registered)."""
        return self._rules_by_source.get(source.url, [])

    def validate(self, rule: CandidateRule) -> list[str]:
        """Validate that every defensibility field is populated."""
        # Pair each required field with the message reported when it is blank.
        required = (
            (rule.rule, "Rule text is empty"),
            (rule.context, "Context is required for defensibility"),
            (rule.antipattern, "Antipattern is required for defensibility"),
            (rule.rationale, "Rationale is required for defensibility"),
        )
        return [message for value, message in required if not value.strip()]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class FunctionExtractor(RuleExtractor):
    """Extraction via custom function (for structured sources).

    Allows plugging in custom extraction logic for sources
    with known structure (e.g., OWASP pages, API docs).

    Usage:
        def extract_from_owasp(source: Source) -> list[CandidateRule]:
            # Custom parsing logic for OWASP format
            ...

        extractor = FunctionExtractor(extract_from_owasp)
        rules = extractor.extract(owasp_source)
    """

    def __init__(
        self,
        extract_fn: Callable[[Source], list[CandidateRule]],
        validate_fn: Callable[[CandidateRule], list[str]] | None = None,
    ) -> None:
        self._extract_fn = extract_fn
        # Fall back to the built-in completeness check when no validator given.
        if validate_fn is None:
            self._validate_fn = self._default_validate
        else:
            self._validate_fn = validate_fn

    def extract(self, source: Source) -> list[CandidateRule]:
        """Delegate to the injected extraction function."""
        return self._extract_fn(source)

    def validate(self, rule: CandidateRule) -> list[str]:
        """Delegate to the injected (or default) validation function."""
        return self._validate_fn(rule)

    def _default_validate(self, rule: CandidateRule) -> list[str]:
        """Default validation: require all defensibility fields."""
        if rule.is_complete():
            return []
        return ["Rule is missing required defensibility fields"]
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Seed file generators for Step 4 of the seed engine pipeline.
|
|
2
|
+
|
|
3
|
+
Generators take categorized rules and produce the final seed file.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
from buildlog.seed_engine.models import CategorizedRule
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class SeedGenerator:
    """Generate YAML seed files from categorized rules.

    Usage:
        generator = SeedGenerator(
            persona="test_terrorist",
            version=1,
            output_dir=Path(".buildlog/seeds"),
        )

        seed_file = generator.generate(categorized_rules)
        generator.write(seed_file)
    """

    persona: str
    version: int = 1
    output_dir: Path | None = None
    header_comment: str | None = None

    def generate(self, rules: list[CategorizedRule]) -> dict[str, Any]:
        """Generate seed file dictionary from categorized rules.

        Args:
            rules: The categorized rules to include.

        Returns:
            Seed file as dictionary (ready for YAML serialization).

        Raises:
            ValueError: If any rule is missing a required field.
        """
        # Validate all rules are complete before serializing anything.
        incomplete = [r for r in rules if not self._is_complete(r)]
        if incomplete:
            raise ValueError(
                f"{len(incomplete)} rules are incomplete. "
                f"First: '{incomplete[0].rule[:50]}...'"
            )

        return {
            "persona": self.persona,
            "version": self.version,
            "rules": [r.to_seed_dict() for r in rules],
        }

    def write(
        self,
        seed_data: dict[str, Any],
        path: Path | None = None,
    ) -> Path:
        """Write seed file to disk.

        Args:
            seed_data: The seed file dictionary.
            path: Output path. If None, uses output_dir/persona.yaml.

        Returns:
            Path to written file.

        Raises:
            ValueError: If neither ``path`` nor ``output_dir`` is set.
        """
        if path is None:
            if self.output_dir is None:
                raise ValueError("No output path or output_dir specified")
            self.output_dir.mkdir(parents=True, exist_ok=True)
            path = self.output_dir / f"{self.persona}.yaml"

        # Build YAML content with optional header
        yaml_content = yaml.dump(
            seed_data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=100,
        )

        # Add header comment if provided
        if self.header_comment:
            lines = [f"# {line}" for line in self.header_comment.split("\n")]
            header = "\n".join(lines) + "\n\n"
            yaml_content = header + yaml_content

        # allow_unicode=True means the dump may contain non-ASCII text, so pin
        # the encoding instead of relying on the platform default (which may
        # be cp1252 on Windows and would raise UnicodeEncodeError).
        path.write_text(yaml_content, encoding="utf-8")
        return path

    def _is_complete(self, rule: CategorizedRule) -> bool:
        """Check if a rule has all required fields (including category)."""
        return bool(
            rule.rule.strip()
            and rule.context.strip()
            and rule.antipattern.strip()
            and rule.rationale.strip()
            and rule.category.strip()
        )

    def validate(self, seed_data: dict[str, Any]) -> list[str]:
        """Validate seed file structure.

        Args:
            seed_data: The seed file dictionary.

        Returns:
            List of validation issues (empty if valid).
        """
        issues: list[str] = []

        if "persona" not in seed_data:
            issues.append("Missing 'persona' field")
        if "version" not in seed_data:
            issues.append("Missing 'version' field")
        if "rules" not in seed_data:
            issues.append("Missing 'rules' field")
            # Without a rules list there is nothing further to check.
            return issues

        for i, rule in enumerate(seed_data.get("rules", [])):
            prefix = f"Rule {i + 1}"
            if not rule.get("rule"):
                issues.append(f"{prefix}: Missing 'rule' text")
            if not rule.get("context"):
                issues.append(
                    f"{prefix}: Missing 'context' (required for defensibility)"
                )
            if not rule.get("antipattern"):
                issues.append(
                    f"{prefix}: Missing 'antipattern' (required for defensibility)"
                )
            if not rule.get("rationale"):
                issues.append(
                    f"{prefix}: Missing 'rationale' (required for defensibility)"
                )

        return issues
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Data models for the seed engine pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SourceType(Enum):
    """Kinds of authoritative source a rule can be extracted from."""

    REFERENCE_DOC = "reference_doc"  # e.g. OWASP, RFC, official docs
    BLOG_POST = "blog_post"  # e.g. Google Testing Blog, Fowler
    BOOK = "book"  # e.g. Clean Code, xUnit Patterns
    STANDARD = "standard"  # e.g. ISO, IEEE standards
    CHEATSHEET = "cheatsheet"  # e.g. OWASP cheatsheets
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class Source:
    """An authoritative source for rule extraction.

    Step 1 output: a source records where domain knowledge comes from,
    so every derived rule stays citable and defensible.
    """

    name: str
    url: str
    source_type: SourceType
    domain: str  # e.g., "security", "testing", "code-quality"
    description: str = ""
    sections: list[str] = field(default_factory=list)  # Specific sections to extract

    def to_reference(self) -> dict[str, str]:
        """Render this source in seed-file reference format."""
        return dict(url=self.url, title=self.name)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class CandidateRule:
    """A rule extracted from a source, before categorization.

    Step 2 output: raw rule carrying all defensibility fields;
    category and final tags may not be assigned yet.
    """

    rule: str  # The prescription
    context: str  # When it applies
    antipattern: str  # What violation looks like
    rationale: str  # Why it matters
    source: Source  # Where it came from
    raw_tags: list[str] = field(default_factory=list)  # Tags from extraction
    confidence: float = 1.0  # Extraction confidence (1.0 for manual)
    metadata: dict[str, Any] = field(default_factory=dict)

    def is_complete(self) -> bool:
        """True when every defensibility field contains non-blank text."""
        required = (self.rule, self.context, self.antipattern, self.rationale)
        return all(text.strip() for text in required)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class CategorizedRule:
    """A rule after categorization and tagging.

    Step 3 output: carries its final category and tags and is
    ready for seed file generation.
    """

    rule: str
    category: str  # Final category (e.g., "testing", "security")
    context: str
    antipattern: str
    rationale: str
    tags: list[str]  # Final tags
    references: list[dict[str, str]]  # [{"url": ..., "title": ...}]
    confidence: float = 1.0

    @classmethod
    def from_candidate(
        cls,
        candidate: CandidateRule,
        category: str,
        tags: list[str],
    ) -> CategorizedRule:
        """Promote a candidate rule using the assigned category and tags."""
        # The candidate's source becomes the single citation for this rule.
        reference = candidate.source.to_reference()
        return cls(
            rule=candidate.rule,
            category=category,
            context=candidate.context,
            antipattern=candidate.antipattern,
            rationale=candidate.rationale,
            tags=tags,
            references=[reference],
            confidence=candidate.confidence,
        )

    def to_seed_dict(self) -> dict[str, Any]:
        """Render this rule in seed-file format (confidence omitted)."""
        keys = (
            "rule",
            "category",
            "context",
            "antipattern",
            "rationale",
            "tags",
            "references",
        )
        return {key: getattr(self, key) for key in keys}
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Pipeline orchestration for the seed engine.
|
|
2
|
+
|
|
3
|
+
The Pipeline ties together all 4 steps:
|
|
4
|
+
1. Source identification (input)
|
|
5
|
+
2. Rule extraction
|
|
6
|
+
3. Categorization
|
|
7
|
+
4. Seed generation
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from buildlog.seed_engine.categorizers import Categorizer, TagBasedCategorizer
|
|
18
|
+
from buildlog.seed_engine.extractors import ManualExtractor, RuleExtractor
|
|
19
|
+
from buildlog.seed_engine.generators import SeedGenerator
|
|
20
|
+
from buildlog.seed_engine.models import CandidateRule, CategorizedRule, Source
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class PipelineResult:
    """Everything produced by one run of the seed engine pipeline."""

    persona: str
    sources: list[Source]
    candidates: list[CandidateRule]
    categorized: list[CategorizedRule]
    seed_data: dict[str, Any]
    output_path: Path | None = None

    @property
    def rule_count(self) -> int:
        # Rules that made it through categorization.
        return len(self.categorized)

    @property
    def source_count(self) -> int:
        # Sources fed into the run.
        return len(self.sources)

    def summary(self) -> str:
        """Human-readable summary of the pipeline run."""
        parts = [
            f"Seed Engine Pipeline Result: {self.persona}",
            f" Sources: {self.source_count}",
            f" Candidates extracted: {len(self.candidates)}",
            f" Rules categorized: {self.rule_count}",
        ]
        # Only mention output when something was actually written.
        if self.output_path:
            parts.append(f" Output: {self.output_path}")
        return "\n".join(parts)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class Pipeline:
    """The seed engine pipeline.

    Orchestrates the 4-step process for creating reviewer personas:

    1. SOURCES → Define authoritative domain sources
    2. EXTRACT → Pull rules with defensibility fields
    3. CATEGORIZE → Assign categories and tags
    4. GENERATE → Output validated YAML seed file

    Usage:
        # Create pipeline with default components
        pipeline = Pipeline(
            persona="test_terrorist",
            default_category="testing",
        )

        # Or customize each step
        pipeline = Pipeline(
            persona="test_terrorist",
            extractor=MyCustomExtractor(),
            categorizer=MyCustomCategorizer(),
            generator=SeedGenerator(persona="test_terrorist", version=1),
        )

        # Run the pipeline
        result = pipeline.run(sources, output_dir=Path(".buildlog/seeds"))
    """

    persona: str
    default_category: str = "general"
    version: int = 1
    extractor: RuleExtractor | None = None
    categorizer: Categorizer | None = None
    generator: SeedGenerator | None = None

    def __post_init__(self) -> None:
        # Fill in default components for any step the caller did not supply.
        if self.extractor is None:
            self.extractor = ManualExtractor()
        if self.categorizer is None:
            self.categorizer = TagBasedCategorizer(
                default_category=self.default_category
            )
        if self.generator is None:
            self.generator = SeedGenerator(
                persona=self.persona,
                version=self.version,
            )

    def run(
        self,
        sources: list[Source],
        output_dir: Path | None = None,
        write: bool = True,
    ) -> PipelineResult:
        """Run the full pipeline.

        Args:
            sources: Step 1 - The authoritative sources to extract from.
            output_dir: Where to write the seed file.
            write: Whether to write the seed file to disk.

        Returns:
            PipelineResult with all intermediate and final outputs.
        """
        # Lazy %-style args: messages are only formatted when actually emitted.
        logger.info("Starting seed engine pipeline for '%s'", self.persona)
        logger.info("Processing %d sources", len(sources))

        # These are guaranteed set by __post_init__
        assert self.extractor is not None
        assert self.categorizer is not None
        assert self.generator is not None

        # Step 2: Extract rules from each source
        candidates: list[CandidateRule] = []
        for source in sources:
            extracted = self.extractor.extract(source)
            logger.info("  Extracted %d rules from %s", len(extracted), source.name)
            candidates.extend(extracted)

        logger.info("Total candidates: %d", len(candidates))

        # Step 3: Categorize each rule
        categorized: list[CategorizedRule] = [
            self.categorizer.categorize(candidate) for candidate in candidates
        ]

        logger.info("Categorized %d rules", len(categorized))

        # Step 4: Generate seed file. Only override the generator's output
        # directory when one was given, so a caller-supplied generator keeps
        # its own preconfigured output_dir.
        if output_dir is not None:
            self.generator.output_dir = output_dir
        seed_data = self.generator.generate(categorized)

        # Optionally write to disk
        output_path = None
        if write and output_dir:
            output_path = self.generator.write(seed_data)
            logger.info("Wrote seed file to %s", output_path)

        return PipelineResult(
            persona=self.persona,
            sources=sources,
            candidates=candidates,
            categorized=categorized,
            seed_data=seed_data,
            output_path=output_path,
        )

    def validate_sources(self, sources: list[Source]) -> list[str]:
        """Validate that sources are properly defined.

        Args:
            sources: The sources to validate.

        Returns:
            List of validation issues (empty if valid).
        """
        issues: list[str] = []
        for i, source in enumerate(sources):
            prefix = f"Source {i + 1} ({source.name})"
            if not source.name.strip():
                issues.append(f"{prefix}: Missing name")
            if not source.url.strip():
                issues.append(f"{prefix}: Missing URL")
            if not source.domain.strip():
                issues.append(f"{prefix}: Missing domain")
        return issues

    def dry_run(self, sources: list[Source]) -> dict[str, Any]:
        """Run pipeline without writing, returning preview.

        Useful for validation before committing to disk.
        """
        result = self.run(sources, write=False)
        return {
            "persona": result.persona,
            "rule_count": result.rule_count,
            "source_count": result.source_count,
            # Sorted for deterministic output (set order is arbitrary).
            "categories": sorted({r.category for r in result.categorized}),
            "sample_rules": [
                {"rule": r.rule, "category": r.category} for r in result.categorized[:3]
            ],
        }
|