buildlog 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. buildlog/cli.py +391 -3
  2. buildlog/data/__init__.py +0 -0
  3. buildlog/data/seeds/security_karen.yaml +162 -0
  4. buildlog/data/seeds/test_terrorist.yaml +280 -0
  5. buildlog/seed_engine/__init__.py +74 -0
  6. buildlog/seed_engine/categorizers.py +145 -0
  7. buildlog/seed_engine/extractors.py +148 -0
  8. buildlog/seed_engine/generators.py +144 -0
  9. buildlog/seed_engine/models.py +113 -0
  10. buildlog/seed_engine/pipeline.py +202 -0
  11. buildlog/seed_engine/sources.py +362 -0
  12. buildlog/seeds.py +261 -0
  13. buildlog/skills.py +26 -3
  14. {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/METADATA +82 -11
  15. buildlog-0.6.1.dist-info/RECORD +41 -0
  16. buildlog-0.5.0.dist-info/RECORD +0 -30
  17. {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/copier.yml +0 -0
  18. {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/post_gen.py +0 -0
  19. {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
  20. {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
  21. {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
  22. {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
  23. {buildlog-0.5.0.data → buildlog-0.6.1.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
  24. {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/WHEEL +0 -0
  25. {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/entry_points.txt +0 -0
  26. {buildlog-0.5.0.dist-info → buildlog-0.6.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,280 @@
1
+ persona: test_terrorist
2
+ version: 1
3
+ rules:
4
+ - rule: Tests must not depend on execution order
5
+ category: isolation
6
+ context: Test suites with multiple tests, shared fixtures, database state
7
+ antipattern: Test A creates data that Test B asserts on; tests fail when run individually
8
+ rationale: Order-dependent tests are flaky and hide real failures. Each test must be hermetic.
9
+ tags:
10
+ - isolation
11
+ - order-independent
12
+ - hermetic
13
+ - test_terrorist
14
+ references:
15
+ - url: https://testing.googleblog.com/2010/12/test-sizes.html
16
+ title: Google Testing Blog - Test Sizes
17
+ - rule: Tests must clean up after themselves
18
+ category: isolation
19
+ context: Tests using databases, files, external services, global state
20
+ antipattern: Tests leaving data in shared resources; no teardown; assuming clean state
21
+ rationale: Test pollution causes cascading failures and makes debugging impossible.
22
+ tags:
23
+ - isolation
24
+ - test_terrorist
25
+ - cleanup
26
+ - teardown
27
+ references:
28
+ - url: https://testing.googleblog.com/2010/12/test-sizes.html
29
+ title: Google Testing Blog - Test Sizes
30
+ - rule: Tests should run in under 10 seconds for fast feedback
31
+ category: anti-patterns
32
+ context: Unit test suites, developer workflows, pre-commit hooks
33
+ antipattern: Minute-long test suites; 'just run CI'; tests that require coffee breaks
34
+ rationale: Slow tests don't get run. Fast feedback enables TDD and catches bugs early.
35
+ tags:
36
+ - feedback
37
+ - anti-patterns
38
+ - slow-tests
39
+ - test_terrorist
40
+ references:
41
+ - url: https://testing.googleblog.com/2010/12/test-sizes.html
42
+ title: Google Testing Blog - Test Sizes
43
+ - rule: Every public API must have at least one happy path test
44
+ category: coverage
45
+ context: New endpoints, public functions, exported modules
46
+ antipattern: Shipping code with no tests; 'I'll add tests later'; PRs without test changes
47
+ rationale: Untested code is legacy code the moment it's merged. Tests are executable documentation.
48
+ tags:
49
+ - happy-path
50
+ - public-api
51
+ - test_terrorist
52
+ - coverage
53
+ references:
54
+ - url: https://testing.googleblog.com/2015/04/just-say-no-to-more-end-to-end-tests.html
55
+ title: Google Testing Blog - Just Say No to More E2E Tests
56
+ - rule: New bug fixes must include a regression test
57
+ category: coverage
58
+ context: Any bug fix PR or commit
59
+ antipattern: Fixing bugs without adding tests that would have caught them
60
+ rationale: A bug that escapes once will escape again. Regression tests prevent recurrence.
61
+ tags:
62
+ - regression
63
+ - bug-fix
64
+ - test_terrorist
65
+ - coverage
66
+ references:
67
+ - url: https://testing.googleblog.com/2015/04/just-say-no-to-more-end-to-end-tests.html
68
+ title: Google Testing Blog - Just Say No to More E2E Tests
69
+ - rule: Critical paths require edge case and error path coverage
70
+ category: coverage
71
+ context: Payment flows, authentication, data mutations, external integrations
72
+ antipattern: Only happy path tests for critical code; no error handling tests
73
+ rationale: Edge cases in critical paths cause production incidents. Murphy's law applies.
74
+ tags:
75
+ - test_terrorist
76
+ - critical-path
77
+ - edge-cases
78
+ - coverage
79
+ references:
80
+ - url: https://martinfowler.com/articles/practical-test-pyramid.html
81
+ title: Martin Fowler - Testing Pyramid
82
+ - rule: Assert on behavior, not implementation details
83
+ category: assertions
84
+ context: Unit tests, refactoring scenarios
85
+ antipattern: Asserting on private method calls; testing internal state; mock call counts
86
+ rationale: Implementation-coupled tests break on refactoring. Test the contract, not the code.
87
+ tags:
88
+ - assertions
89
+ - behavior
90
+ - test_terrorist
91
+ - contract
92
+ references:
93
+ - url: https://martinfowler.com/articles/practical-test-pyramid.html
94
+ title: Martin Fowler - Testing Pyramid
95
+ - rule: Mock external dependencies at test boundaries
96
+ category: isolation
97
+ context: Tests calling APIs, databases, file systems, network services
98
+ antipattern: Real network calls in unit tests; tests that require running services
99
+ rationale: External dependencies make tests slow, flaky, and expensive. Mock at boundaries.
100
+ tags:
101
+ - isolation
102
+ - test_terrorist
103
+ - mocking
104
+ - boundaries
105
+ references:
106
+ - url: https://martinfowler.com/bliki/TestDouble.html
107
+ title: Martin Fowler - Test Double
108
+ - rule: Use property-based testing for functions with clear invariants
109
+ category: property-testing
110
+ context: Serializers, parsers, encoders/decoders, sorting, mathematical operations
111
+ antipattern: Only example-based tests for encode/decode pairs; hand-picked edge cases
112
+ rationale: Property tests generate thousands of examples, finding edge cases humans miss.
113
+ tags:
114
+ - invariants
115
+ - hypothesis
116
+ - test_terrorist
117
+ - property
118
+ references:
119
+ - url: https://hypothesis.readthedocs.io/en/latest/
120
+ title: Hypothesis Documentation
121
+ - rule: Define roundtrip properties for serialization code
122
+ category: property-testing
123
+ context: JSON, protobuf, custom serializers, data transformation pipelines
124
+ antipattern: Testing serialize and deserialize separately with fixed examples
125
+ rationale: decode(encode(x)) == x is a universal property. Hypothesis finds corner cases.
126
+ tags:
127
+ - roundtrip
128
+ - test_terrorist
129
+ - property
130
+ - serialization
131
+ references:
132
+ - url: https://hypothesis.readthedocs.io/en/latest/
133
+ title: Hypothesis Documentation
134
+ - rule: Apply metamorphic relations when test oracles are unavailable
135
+ category: metamorphic-testing
136
+ context: ML models, search engines, optimization algorithms, complex computations
137
+ antipattern: No testing because 'we don't know the right answer'; only manual inspection
138
+ rationale: Metamorphic testing validates input-output relationships without ground truth.
139
+ tags:
140
+ - metamorphic
141
+ - test_terrorist
142
+ - ml
143
+ - oracle-free
144
+ references:
145
+ - url: https://www.sciencedirect.com/science/article/pii/S0950584918300016
146
+ title: Metamorphic Testing - Chen et al. Survey
147
+ - rule: Define permutation invariance for order-independent operations
148
+ category: metamorphic-testing
149
+ context: Aggregations, set operations, commutative functions
150
+ antipattern: Testing with single fixed input order; assuming order doesn't matter
151
+ rationale: sum([1,2,3]) == sum([3,1,2]) is a metamorphic relation that catches bugs.
152
+ tags:
153
+ - metamorphic
154
+ - test_terrorist
155
+ - permutation
156
+ - invariance
157
+ references:
158
+ - url: https://www.sciencedirect.com/science/article/pii/S0950584918300016
159
+ title: Metamorphic Testing - Chen et al. Survey
160
+ - rule: Validate data distributions at pipeline boundaries
161
+ category: statistical-testing
162
+ context: ETL pipelines, ML feature stores, data ingestion, API responses
163
+ antipattern: Assuming input data matches expected distribution; no schema validation
164
+ rationale: Distribution drift breaks models silently. Validate expectations at boundaries.
165
+ tags:
166
+ - statistical
167
+ - drift
168
+ - test_terrorist
169
+ - distribution
170
+ references:
171
+ - url: https://docs.greatexpectations.io/docs/
172
+ title: Great Expectations Documentation
173
+ - rule: Define and enforce data contracts with schemas
174
+ category: statistical-testing
175
+ context: Data pipelines, API integrations, database migrations
176
+ antipattern: Implicit schemas; duck typing for data; hoping fields exist
177
+ rationale: Schema validation catches contract violations before they cause failures.
178
+ tags:
179
+ - statistical
180
+ - contracts
181
+ - test_terrorist
182
+ - schema
183
+ references:
184
+ - url: https://pandera.readthedocs.io/en/stable/
185
+ title: Pandera Documentation
186
+ - rule: LLM outputs require structured validation beyond string matching
187
+ category: llm-testing
188
+ context: Any code using LLM-generated content in production paths
189
+ antipattern: No validation; trusting raw LLM output; regex-only validation
190
+ rationale: '[GAP] Standard test frameworks don''t cover LLM eval. See Guardrails/DeepEval for emerging
191
+ patterns. This is a known gap requiring specialized tooling.'
192
+ tags:
193
+ - test_terrorist
194
+ - emerging
195
+ - llm-testing
196
+ - validation
197
+ - gap
198
+ references:
199
+ - url: https://www.guardrailsai.com/docs/concepts/guard
200
+ title: Guardrails AI Documentation
201
+ - rule: LLM-based features need evaluation datasets and metrics
202
+ category: llm-testing
203
+ context: RAG systems, chatbots, content generation, code assistants
204
+ antipattern: Vibes-based testing; manual spot checks; no regression tracking
205
+ rationale: '[GAP] LLM behavior is non-deterministic. Evaluation datasets with metrics enable regression
206
+ detection. Tooling is immature.'
207
+ tags:
208
+ - emerging
209
+ - llm-testing
210
+ - evaluation
211
+ - gap
212
+ - test_terrorist
213
+ references:
214
+ - url: https://docs.confident-ai.com/docs/getting-started
215
+ title: DeepEval Documentation
216
+ - rule: Every test must have at least one meaningful assertion
217
+ category: assertions
218
+ context: All test functions
219
+ antipattern: Tests that only call code without asserting; assert True; empty test bodies
220
+ rationale: A test without assertions is not a test. It's a false sense of security.
221
+ tags:
222
+ - assertions
223
+ - no-pass-through
224
+ - meaningful
225
+ - test_terrorist
226
+ references:
227
+ - url: http://xunitpatterns.com/Test%20Smells.html
228
+ title: xUnit Test Patterns - Test Smells
229
+ - rule: Follow Arrange-Act-Assert (AAA) pattern
230
+ category: structure
231
+ context: All test functions
232
+ antipattern: Interleaved setup and assertions; multiple acts per test; unclear test phases
233
+ rationale: AAA makes tests readable and debuggable. One logical assertion per test.
234
+ tags:
235
+ - structure
236
+ - aaa
237
+ - test_terrorist
238
+ - readability
239
+ references:
240
+ - url: http://xunitpatterns.com/Test%20Smells.html
241
+ title: xUnit Test Patterns - Test Smells
242
+ - rule: Avoid testing implementation details that change frequently
243
+ category: anti-patterns
244
+ context: Refactoring scenarios, internal APIs, private methods
245
+ antipattern: Tests break on every refactor; testing private method behavior
246
+ rationale: Fragile tests slow development. Test stable interfaces, not implementation.
247
+ tags:
248
+ - fragile
249
+ - implementation-details
250
+ - anti-patterns
251
+ - test_terrorist
252
+ references:
253
+ - url: http://xunitpatterns.com/Test%20Smells.html
254
+ title: xUnit Test Patterns - Test Smells
255
+ - rule: Flaky tests must be fixed or quarantined immediately
256
+ category: anti-patterns
257
+ context: CI pipelines, test suites with intermittent failures
258
+ antipattern: Rerunning CI until green; ignoring flaky tests; 'it works on my machine'
259
+ rationale: Flaky tests erode trust in the test suite. A flaky test is worse than no test.
260
+ tags:
261
+ - anti-patterns
262
+ - flaky
263
+ - ci
264
+ - test_terrorist
265
+ references:
266
+ - url: https://testing.googleblog.com/2016/05/flaky-tests-at-google-and-how-we.html
267
+ title: Google Testing Blog - Test Flakiness
268
+ - rule: Test names should describe the scenario and expected outcome
269
+ category: structure
270
+ context: Test function naming
271
+ antipattern: test_1, test_function, test_it_works; names that don't explain the test
272
+ rationale: Test names are documentation. A failing test name should tell you what broke.
273
+ tags:
274
+ - structure
275
+ - documentation
276
+ - test_terrorist
277
+ - naming
278
+ references:
279
+ - url: https://docs.pytest.org/en/stable/explanation/goodpractices.html
280
+ title: pytest - Good Integration Practices
@@ -0,0 +1,74 @@
1
+ """Seed Engine - Formalized pipeline for creating reviewer personas.
2
+
3
+ The seed engine abstracts the 4-step process for bootstrapping
4
+ defensible reviewer personas from authoritative domain sources:
5
+
6
+ 1. SOURCE IDENTIFICATION - Define authoritative sources
7
+ 2. RULE EXTRACTION - Extract candidate rules with defensibility fields
8
+ 3. CATEGORIZATION - Map rules to persona concern categories
9
+ 4. SEED GENERATION - Output validated YAML seed file
10
+
11
+ Usage:
12
+ from buildlog.seed_engine import Pipeline, Source, SourceType
13
+
14
+ # Define sources
15
+ sources = [
16
+ Source(
17
+ name="OWASP Top 10",
18
+ url="https://owasp.org/Top10/",
19
+ source_type=SourceType.REFERENCE_DOC,
20
+ domain="security",
21
+ )
22
+ ]
23
+
24
+ # Run pipeline
25
+ pipeline = Pipeline(persona="security_karen")
26
+ seed_file = pipeline.run(sources)
27
+ """
28
+
29
# Re-export the public surface of the seed engine. Every concrete
# categorizer and extractor defined in the submodules is exposed here so
# callers can import them from ``buildlog.seed_engine`` directly.
from buildlog.seed_engine.categorizers import (
    Categorizer,
    CategoryMapping,
    MappingCategorizer,
    TagBasedCategorizer,
)
from buildlog.seed_engine.extractors import (
    FunctionExtractor,
    ManualExtractor,
    RuleExtractor,
)
from buildlog.seed_engine.generators import SeedGenerator
from buildlog.seed_engine.models import (
    CandidateRule,
    CategorizedRule,
    Source,
    SourceType,
)
from buildlog.seed_engine.pipeline import Pipeline
from buildlog.seed_engine.sources import (
    FetchStatus,
    SourceEntry,
    SourceFetcher,
    SourceManifest,
    url_to_cache_filename,
)

__all__ = [
    # Models
    "Source",
    "SourceType",
    "CandidateRule",
    "CategorizedRule",
    # Pipeline
    "Pipeline",
    # Extractors
    "RuleExtractor",
    "ManualExtractor",
    "FunctionExtractor",
    # Categorizers
    "Categorizer",
    "TagBasedCategorizer",
    "MappingCategorizer",
    "CategoryMapping",
    # Generators
    "SeedGenerator",
    # Sources
    "FetchStatus",
    "SourceEntry",
    "SourceManifest",
    "SourceFetcher",
    "url_to_cache_filename",
]
@@ -0,0 +1,145 @@
1
+ """Rule categorizers for Step 3 of the seed engine pipeline.
2
+
3
+ Categorizers take candidate rules and assign final categories and tags.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from abc import ABC, abstractmethod
9
+ from collections.abc import Callable
10
+ from dataclasses import dataclass
11
+
12
+ from buildlog.seed_engine.models import CandidateRule, CategorizedRule
13
+
14
+
15
class Categorizer(ABC):
    """Abstract base class for rule categorizers (pipeline step 3).

    Despite being an interface, this is an ``abc.ABC`` rather than a
    ``typing.Protocol``: concrete categorizers must subclass it and
    implement :meth:`categorize`.

    Implementations in this module:
    - TagBasedCategorizer: Category from tags/keywords
    - MappingCategorizer: Explicit source→category mapping
    """

    @abstractmethod
    def categorize(self, rule: CandidateRule) -> CategorizedRule:
        """Assign category and final tags to a rule.

        Args:
            rule: The candidate rule to categorize.

        Returns:
            Categorized rule ready for seed generation.
        """
        ...
34
+
35
+
36
@dataclass
class CategoryMapping:
    """Mapping from keywords/tags to category.

    Consumed by ``TagBasedCategorizer``: a rule is assigned ``category``
    when any entry of ``keywords`` matches one of the rule's normalized
    tags or occurs as a substring of the lower-cased rule text.
    """

    # Category name assigned to a rule when this mapping matches.
    category: str
    # If any of these appear in tags/rule, assign this category.
    # Keywords are lower-cased before comparison.
    keywords: list[str]  # If any of these appear in tags/rule, assign this category
    # Mappings are tried in descending priority order; the first match wins.
    priority: int = 0  # Higher priority wins on conflicts
43
+
44
+
45
class TagBasedCategorizer(Categorizer):
    """Categorize rules based on their tags and keywords.

    Mappings are tried in descending ``priority`` order (ties keep the
    order in which they were supplied). The first mapping with a keyword
    that matches a normalized tag, or that occurs as a substring of the
    lower-cased rule text, decides the category. When nothing matches,
    ``default_category`` is used.

    Usage:
        categorizer = TagBasedCategorizer(
            default_category="testing",
            mappings=[
                CategoryMapping("coverage", ["coverage", "untested"]),
                CategoryMapping("isolation", ["flaky", "order", "hermetic"]),
                CategoryMapping("assertions", ["assert", "expect", "verify"]),
            ],
            tag_normalizer=lambda t: t.lower().replace("-", "_"),
        )

        categorized = categorizer.categorize(candidate_rule)
    """

    def __init__(
        self,
        default_category: str,
        mappings: list[CategoryMapping] | None = None,
        tag_normalizer: Callable[[str], str] | None = None,
        additional_tags: list[str] | None = None,
    ) -> None:
        """Configure the categorizer.

        Args:
            default_category: Category used when no mapping matches.
            mappings: Keyword-to-category mappings; may be omitted.
            tag_normalizer: Function applied to every raw tag (and to
                keywords when comparing against tags). Defaults to
                lower-casing.
            additional_tags: Tags appended to every categorized rule.
        """
        self.default_category = default_category
        # sorted() is stable, so equal-priority mappings keep caller order.
        self.mappings = sorted(mappings or [], key=lambda m: m.priority, reverse=True)
        self.tag_normalizer = tag_normalizer or (lambda t: t.lower())
        # Copy so later mutation of the caller's sequence cannot change
        # this categorizer, and so any iterable of tags is accepted.
        self.additional_tags = list(additional_tags or [])

    def categorize(self, rule: CandidateRule) -> CategorizedRule:
        """Assign category based on tag matching.

        Args:
            rule: Candidate rule; its ``raw_tags`` and ``rule`` text are
                inspected.

        Returns:
            The result of ``CategorizedRule.from_candidate`` with the
            chosen category and the de-duplicated, normalized tags.
        """
        normalized_tags = [self.tag_normalizer(t) for t in rule.raw_tags]
        tag_lookup = set(normalized_tags)

        # Rule text is also searched for keywords.
        rule_text_lower = rule.rule.lower()

        category = self.default_category
        for mapping in self.mappings:
            if self._mapping_matches(mapping, tag_lookup, rule_text_lower):
                category = mapping.category
                break

        # dict.fromkeys de-duplicates while preserving first-seen order.
        # A plain set would reorder tags differently on every run because
        # string hashing is randomized, making generated seeds unstable.
        final_tags = list(dict.fromkeys(normalized_tags + self.additional_tags))

        return CategorizedRule.from_candidate(
            candidate=rule,
            category=category,
            tags=final_tags,
        )

    def _mapping_matches(
        self,
        mapping: CategoryMapping,
        tag_lookup: set[str],
        rule_text_lower: str,
    ) -> bool:
        """Return True when any keyword of ``mapping`` matches the rule."""
        for keyword in mapping.keywords:
            keyword_lower = keyword.lower()
            if keyword_lower in tag_lookup or keyword_lower in rule_text_lower:
                return True
            # Tags were passed through tag_normalizer; compare the keyword
            # in the same normalized form so e.g. "edge-cases" still
            # matches a tag that was normalized to "edge_cases".
            if self.tag_normalizer(keyword) in tag_lookup:
                return True
        return False
102
+
103
+
104
class MappingCategorizer(Categorizer):
    """Categorize rules via explicit source→category mapping.

    Useful when sources map directly to categories
    (e.g., OWASP A03 → "injection").

    Keys of ``source_category_map`` are treated as URL prefixes. When
    several prefixes match a rule's source URL, the longest (most
    specific) one decides the category.

    Usage:
        categorizer = MappingCategorizer(
            source_category_map={
                "https://owasp.org/Top10/A03": "injection",
                "https://owasp.org/Top10/A01": "access-control",
            },
            default_category="security",
        )
    """

    def __init__(
        self,
        source_category_map: dict[str, str],
        default_category: str,
        tag_transform: Callable[[list[str]], list[str]] | None = None,
    ) -> None:
        """Configure the categorizer.

        Args:
            source_category_map: Mapping of source URL prefix to category.
            default_category: Category used when no prefix matches.
            tag_transform: Optional function applied to a rule's raw tags
                to produce the final tags. Defaults to the identity.
        """
        self.source_category_map = source_category_map
        self.default_category = default_category
        self.tag_transform = tag_transform or (lambda tags: tags)

    def categorize(self, rule: CandidateRule) -> CategorizedRule:
        """Assign category based on source URL.

        Args:
            rule: Candidate rule; ``rule.source.url`` and ``rule.raw_tags``
                are read.

        Returns:
            The result of ``CategorizedRule.from_candidate`` with the
            matched (or default) category and the transformed tags.
        """
        source_url = rule.source.url
        matching_prefixes = [
            prefix
            for prefix in self.source_category_map
            if source_url.startswith(prefix)
        ]
        if matching_prefixes:
            # Prefer the most specific prefix so a general entry that was
            # inserted earlier cannot shadow a more precise one.
            category = self.source_category_map[max(matching_prefixes, key=len)]
        else:
            category = self.default_category

        # Hand the transform a copy and copy its result: with the identity
        # default the categorized rule would otherwise share one list
        # object with rule.raw_tags.
        final_tags = list(self.tag_transform(list(rule.raw_tags)))

        return CategorizedRule.from_candidate(
            candidate=rule,
            category=category,
            tags=final_tags,
        )
@@ -0,0 +1,148 @@
1
+ """Rule extractors for Step 2 of the seed engine pipeline.
2
+
3
+ Extractors take sources and produce candidate rules.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import Callable
10
+
11
+ from buildlog.seed_engine.models import CandidateRule, Source
12
+
13
+
14
class RuleExtractor(ABC):
    """Abstract base class for extracting rules from sources (pipeline step 2).

    This is an ``abc.ABC``, not a ``typing.Protocol``: concrete extractors
    must subclass it and implement both :meth:`extract` and
    :meth:`validate`.

    Implementations:
    - ManualExtractor: Human-curated rules (highest quality)
    - FunctionExtractor: Delegates to a caller-supplied function
    - LLMExtractor: LLM-assisted extraction (future)
    - StructuredExtractor: Parse structured docs like OWASP (future)
    """

    @abstractmethod
    def extract(self, source: Source) -> list[CandidateRule]:
        """Extract candidate rules from a source.

        Args:
            source: The source to extract rules from.

        Returns:
            List of candidate rules with defensibility fields.
        """
        ...

    @abstractmethod
    def validate(self, rule: CandidateRule) -> list[str]:
        """Validate a candidate rule, returning any issues.

        Args:
            rule: The rule to validate.

        Returns:
            List of validation issues (empty if valid).
        """
        ...
46
+
47
+
48
class ManualExtractor(RuleExtractor):
    """Manual rule extraction via human curation.

    This is the gold standard—humans read the source and
    extract rules with full defensibility metadata.

    Rules are stored per source URL. Registering again for the same URL
    replaces the previously registered rules.

    Usage:
        extractor = ManualExtractor()

        # Register rules for a source
        extractor.register(
            source=google_testing_blog,
            rules=[
                CandidateRule(
                    rule="Tests must not depend on execution order",
                    context="Test suites with multiple tests",
                    antipattern="Test A sets state that Test B relies on",
                    rationale="Order-dependent tests are flaky",
                    source=google_testing_blog,
                    raw_tags=["isolation", "flaky"],
                )
            ]
        )

        # Extract returns registered rules
        rules = extractor.extract(google_testing_blog)
    """

    # (attribute name, issue message) for every field validate() requires.
    _REQUIRED_FIELDS = (
        ("rule", "Rule text is empty"),
        ("context", "Context is required for defensibility"),
        ("antipattern", "Antipattern is required for defensibility"),
        ("rationale", "Rationale is required for defensibility"),
    )

    def __init__(self) -> None:
        """Create an extractor with no registered rules."""
        # Keyed by source URL.
        self._rules_by_source: dict[str, list[CandidateRule]] = {}

    def register(self, source: Source, rules: list[CandidateRule]) -> None:
        """Register manually curated rules for a source.

        Args:
            source: The source these rules come from.
            rules: The curated rules.

        Raises:
            ValueError: If any rule fails :meth:`validate`. Nothing is
                stored in that case.
        """
        # Validate every rule before storing anything.
        for rule in rules:
            issues = self.validate(rule)
            if issues:
                raise ValueError(
                    f"Invalid rule '{rule.rule[:50]}...': {'; '.join(issues)}"
                )
        # Store a copy so later mutation of the caller's list cannot add
        # unvalidated rules to the registry.
        self._rules_by_source[source.url] = list(rules)

    def extract(self, source: Source) -> list[CandidateRule]:
        """Return registered rules for this source.

        Args:
            source: The source whose URL is looked up.

        Returns:
            A new list of the rules registered for ``source.url``; empty
            when nothing was registered.
        """
        # Always hand out a fresh list so callers cannot mutate the registry.
        return list(self._rules_by_source.get(source.url, ()))

    def validate(self, rule: CandidateRule) -> list[str]:
        """Validate defensibility fields are populated.

        Args:
            rule: The rule to check.

        Returns:
            One message per field that is missing, empty, or
            whitespace-only; an empty list when the rule is valid.
        """
        issues = []
        for field_name, message in self._REQUIRED_FIELDS:
            value = getattr(rule, field_name)
            # Treat None like an empty string instead of raising
            # AttributeError on .strip().
            if not (value or "").strip():
                issues.append(message)
        return issues
111
+
112
+
113
class FunctionExtractor(RuleExtractor):
    """Extractor that delegates to caller-supplied functions.

    Intended for sources with a known structure (e.g., OWASP pages,
    API docs) where custom parsing logic can be plugged in.

    Usage:
        def extract_from_owasp(source: Source) -> list[CandidateRule]:
            # Custom parsing logic for OWASP format
            ...

        extractor = FunctionExtractor(extract_from_owasp)
        rules = extractor.extract(owasp_source)
    """

    def __init__(
        self,
        extract_fn: Callable[[Source], list[CandidateRule]],
        validate_fn: Callable[[CandidateRule], list[str]] | None = None,
    ) -> None:
        """Store the extraction function and pick the validator.

        Args:
            extract_fn: Called with a source; returns candidate rules.
            validate_fn: Called with a rule; returns a list of issues.
                When omitted, the built-in completeness check is used.
        """
        self._extract_fn = extract_fn
        if validate_fn:
            self._validate_fn = validate_fn
        else:
            self._validate_fn = self._default_validate

    def extract(self, source: Source) -> list[CandidateRule]:
        """Return whatever the custom extraction function produces."""
        candidates = self._extract_fn(source)
        return candidates

    def validate(self, rule: CandidateRule) -> list[str]:
        """Return the issues reported by the configured validator."""
        problems = self._validate_fn(rule)
        return problems

    def _default_validate(self, rule: CandidateRule) -> list[str]:
        """Fallback validator: report a single issue for incomplete rules."""
        if rule.is_complete():
            return []
        return ["Rule is missing required defensibility fields"]