buildlog 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/cli.py +379 -3
- buildlog/seed_engine/__init__.py +74 -0
- buildlog/seed_engine/categorizers.py +145 -0
- buildlog/seed_engine/extractors.py +148 -0
- buildlog/seed_engine/generators.py +144 -0
- buildlog/seed_engine/models.py +113 -0
- buildlog/seed_engine/pipeline.py +202 -0
- buildlog/seed_engine/sources.py +362 -0
- buildlog/seeds.py +211 -0
- buildlog/skills.py +26 -3
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/METADATA +82 -11
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/RECORD +22 -14
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/copier.yml +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.5.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/WHEEL +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.5.0.dist-info → buildlog-0.6.0.dist-info}/licenses/LICENSE +0 -0
buildlog/cli.py
CHANGED
|
@@ -172,8 +172,8 @@ def new(slug: str, entry_date: str | None):
|
|
|
172
172
|
click.echo(f"\nOpen it: $EDITOR {entry_path}")
|
|
173
173
|
|
|
174
174
|
|
|
175
|
-
@main.command()
|
|
176
|
-
def
|
|
175
|
+
@main.command("list")
|
|
176
|
+
def list_entries():
|
|
177
177
|
"""List all buildlog entries."""
|
|
178
178
|
buildlog_dir = Path("buildlog")
|
|
179
179
|
|
|
@@ -182,7 +182,8 @@ def list():
|
|
|
182
182
|
raise SystemExit(1)
|
|
183
183
|
|
|
184
184
|
entries = sorted(
|
|
185
|
-
buildlog_dir.glob("20??-??-??-*.md"),
|
|
185
|
+
buildlog_dir.glob("20??-??-??-*.md"),
|
|
186
|
+
reverse=True, # Most recent first
|
|
186
187
|
)
|
|
187
188
|
|
|
188
189
|
if not entries:
|
|
@@ -876,5 +877,380 @@ def experiment_report(output_json: bool):
|
|
|
876
877
|
)
|
|
877
878
|
|
|
878
879
|
|
|
880
|
+
# -----------------------------------------------------------------------------
|
|
881
|
+
# Gauntlet Commands (Review Personas)
|
|
882
|
+
# -----------------------------------------------------------------------------
|
|
883
|
+
|
|
884
|
+
# Built-in reviewer personas, keyed by seed name, with the one-line
# description displayed next to each persona in listings.
PERSONAS = dict(
    security_karen="OWASP Top 10 security review",
    test_terrorist="Comprehensive testing coverage audit",
    ruthless_reviewer="Code quality and functional principles",
)
|
|
889
|
+
|
|
890
|
+
|
|
891
|
+
@main.group()
def gauntlet():
    """Run the review gauntlet with curated personas.

    The gauntlet runs your code through multiple ruthless reviewers,
    each with domain-specific rules loaded from seed files.

    Personas:
    - security_karen: OWASP security review (12 rules)
    - test_terrorist: Testing coverage audit (21 rules)
    - ruthless_reviewer: Code quality review (coming soon)

    Example workflow:

    buildlog gauntlet list # See available personas
    buildlog gauntlet rules --persona all # Show all rules
    buildlog gauntlet prompt src/ # Generate review prompt
    """
    # Intentionally empty: this function only anchors the `gauntlet` command
    # group. Its docstring is the help text, so it is left untouched.
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
@gauntlet.command("list")
@click.option("--json", "output_json", is_flag=True, help="Output as JSON")
def gauntlet_list(output_json: bool):
    """List available reviewer personas and their rule counts.

    Examples:

    buildlog gauntlet list
    buildlog gauntlet list --json
    """
    # NOTE: the docstring above is rendered as CLI help, so implementation
    # notes live in comments instead.
    #
    # output_json: when true, emit a JSON document with per-persona
    # description / rules_count / version plus a "total_rules" sum; otherwise
    # print a human-readable listing.
    import json as json_module

    from buildlog.seeds import load_all_seeds

    # Probe the seed locations in the same order as `gauntlet rules` and
    # `gauntlet prompt` (.buildlog/seeds first, buildlog/.buildlog/seeds as the
    # fallback) so that all subcommands agree on which seeds are active when
    # both directories exist.
    seeds_dir = Path(".buildlog") / "seeds"
    if not seeds_dir.exists():
        seeds_dir = Path("buildlog") / ".buildlog" / "seeds"

    # Sort once so the JSON and text listings share one deterministic order.
    personas = sorted(load_all_seeds(seeds_dir).items())
    total_rules = sum(len(sf.rules) for _, sf in personas)

    if output_json:
        data = {
            "personas": {
                name: {
                    "description": PERSONAS.get(name, "Custom persona"),
                    "rules_count": len(sf.rules),
                    "version": sf.version,
                }
                for name, sf in personas
            },
            "total_rules": total_rules,
        }
        click.echo(json_module.dumps(data, indent=2))
        return

    click.echo("Review Gauntlet Personas")
    click.echo("=" * 50)

    if not personas:
        click.echo("\nNo seed files found.")
        click.echo("Initialize with: buildlog init")
        click.echo("Or create seeds in: .buildlog/seeds/")
        return

    for name, sf in personas:
        desc = PERSONAS.get(name, "Custom persona")
        click.echo(f"\n {name}")
        click.echo(f" {desc}")
        click.echo(f" Rules: {len(sf.rules)} (v{sf.version})")

    click.echo(f"\nTotal: {len(personas)} personas, {total_rules} rules")
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
def _seed_rules_to_data(seeds):
    """Convert loaded seed files into plain dicts/lists for JSON or YAML export.

    Each persona maps to its ``version`` and a list of rule dicts. References
    are included so that every structured export carries the same fields.
    """
    return {
        name: {
            "version": sf.version,
            "rules": [
                {
                    "rule": r.rule,
                    "category": r.category,
                    "context": r.context,
                    "antipattern": r.antipattern,
                    "rationale": r.rationale,
                    "tags": r.tags,
                    "references": [
                        {"url": ref.url, "title": ref.title} for ref in r.references
                    ],
                }
                for r in sf.rules
            ],
        }
        for name, sf in seeds.items()
    }


@gauntlet.command("rules")
@click.option(
    "--persona",
    "-p",
    default="all",
    help="Persona to show rules for (or 'all')",
)
@click.option(
    "--format",
    "fmt",
    type=click.Choice(["yaml", "json", "markdown"]),
    default="yaml",
    help="Output format",
)
@click.option("--output", "-o", type=click.Path(), help="Output file")
def gauntlet_rules(persona: str, fmt: str, output: str | None):
    """Show rules for reviewer personas.

    Use this to see what rules are loaded for each persona,
    or export them for use in prompts.

    Examples:

    buildlog gauntlet rules # All rules (YAML)
    buildlog gauntlet rules -p security_karen # Single persona
    buildlog gauntlet rules --format json -o rules.json
    buildlog gauntlet rules --format markdown # For docs
    """
    # NOTE: the docstring above is rendered as CLI help, so implementation
    # notes live in comments instead.
    #
    # persona: a single persona name, or "all" (default) for every seed file.
    # fmt:     "yaml" (default), "json" or "markdown".
    # output:  optional path; when given the rendered text is written there
    #          (UTF-8) instead of being echoed.
    # Exits with status 1 when no seeds are found or the persona is unknown.
    import json as json_module

    from buildlog.seeds import load_all_seeds

    # Find seeds directory
    seeds_dir = Path(".buildlog") / "seeds"
    if not seeds_dir.exists():
        seeds_dir = Path("buildlog") / ".buildlog" / "seeds"

    seeds = load_all_seeds(seeds_dir)

    if not seeds:
        click.echo("No seed files found.", err=True)
        click.echo("Initialize with: buildlog init", err=True)
        raise SystemExit(1)

    # Filter personas
    if persona != "all":
        if persona not in seeds:
            available = ", ".join(seeds.keys())
            click.echo(f"Unknown persona: {persona}", err=True)
            click.echo(f"Available: {available}", err=True)
            raise SystemExit(1)
        seeds = {persona: seeds[persona]}

    # Render. JSON and YAML share one serialisation so the two structured
    # exports cannot drift apart (YAML previously dropped `references`).
    if fmt == "json":
        formatted = json_module.dumps(_seed_rules_to_data(seeds), indent=2)

    elif fmt == "markdown":
        lines = ["# Review Gauntlet Rules\n"]
        for name, sf in seeds.items():
            lines.append(f"## {name.replace('_', ' ').title()}\n")
            lines.append(f"*{len(sf.rules)} rules, v{sf.version}*\n")
            for i, r in enumerate(sf.rules, 1):
                lines.append(f"### {i}. {r.rule}\n")
                lines.append(f"**Category**: {r.category} ")
                lines.append(f"**Tags**: {', '.join(r.tags)}\n")
                if r.context:
                    lines.append(f"**When**: {r.context}\n")
                if r.antipattern:
                    lines.append(f"**Antipattern**: {r.antipattern}\n")
                if r.rationale:
                    lines.append(f"**Why**: {r.rationale}\n")
                if r.references:
                    lines.append("**References**:")
                    for ref in r.references:
                        lines.append(f"- [{ref.title}]({ref.url})")
                    lines.append("")
        formatted = "\n".join(lines)

    else:  # yaml
        import yaml as yaml_module

        formatted = yaml_module.dump(
            _seed_rules_to_data(seeds), default_flow_style=False, sort_keys=False
        )

    # Output
    if output:
        output_path = Path(output)
        output_path.write_text(formatted, encoding="utf-8")
        total = sum(len(sf.rules) for sf in seeds.values())
        click.echo(f"Wrote {total} rules to {output_path}")
    else:
        click.echo(formatted)
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
@gauntlet.command("prompt")
@click.argument("target", type=click.Path(exists=True))
@click.option(
    "--persona",
    "-p",
    multiple=True,
    help="Personas to include (default: all)",
)
@click.option("--output", "-o", type=click.Path(), help="Output file")
def gauntlet_prompt(target: str, persona: tuple[str, ...], output: str | None):
    """Generate a review prompt for the gauntlet.

    Creates a prompt with rules and target code that can be
    used with Claude or another LLM to run a review.

    Examples:

    buildlog gauntlet prompt src/
    buildlog gauntlet prompt src/api.py -p security_karen
    buildlog gauntlet prompt . -o review_prompt.md
    """
    # NOTE: the docstring above is rendered as CLI help, so implementation
    # notes live in comments instead.
    #
    # target:  existing path to review; only its name is embedded in the
    #          prompt, the file contents are not read here.
    # persona: zero or more persona names; empty means every loaded seed.
    # output:  optional path; when given the prompt is written there (UTF-8)
    #          instead of being echoed.
    # Exits with status 1 when no seeds are found or no persona matches.
    from buildlog.seeds import load_all_seeds

    # Find seeds directory
    seeds_dir = Path(".buildlog") / "seeds"
    if not seeds_dir.exists():
        seeds_dir = Path("buildlog") / ".buildlog" / "seeds"

    seeds = load_all_seeds(seeds_dir)

    if not seeds:
        click.echo("No seed files found.", err=True)
        raise SystemExit(1)

    # Filter personas
    if persona:
        # Record misses before narrowing `seeds`, so a typo in one of several
        # -p values is reported instead of silently producing a prompt that
        # lacks a reviewer.
        unknown = [p for p in persona if p not in seeds]
        seeds = {k: v for k, v in seeds.items() if k in persona}
        if not seeds:
            click.echo(f"No matching personas: {', '.join(persona)}", err=True)
            raise SystemExit(1)
        if unknown:
            # stderr only: stdout must stay a clean, pipeable prompt.
            click.echo(
                f"Warning: ignoring unknown personas: {', '.join(unknown)}",
                err=True,
            )

    # Build the prompt
    target_path = Path(target)
    lines = [
        "# Review Gauntlet Prompt\n",
        "You are running the Review Gauntlet. Apply these rules ruthlessly.\n",
        "## Target\n",
        f"Review: `{target_path}`\n",
        "## Reviewers and Rules\n",
    ]

    for name, sf in seeds.items():
        persona_name = name.replace("_", " ").title()
        lines.append(f"### {persona_name}\n")
        for r in sf.rules:
            lines.append(f"- **{r.rule}**")
            if r.antipattern:
                lines.append(f" - Antipattern: {r.antipattern}")
        lines.append("")

    lines.extend(
        [
            "## Output Format\n",
            "For each issue found, output:\n",
            "```json",
            "{",
            ' "reviewer": "<persona>",',
            ' "severity": "critical|major|minor|nitpick",',
            ' "category": "<category>",',
            ' "location": "<file:line>",',
            ' "description": "<what is wrong>",',
            ' "rule_learned": "<generalizable rule>"',
            "}",
            "```\n",
            "## Instructions\n",
            "1. Read the target code thoroughly",
            "2. Apply each rule from each reviewer",
            "3. Report ALL violations found",
            "4. Be ruthless - this is the gauntlet",
            "",
        ]
    )

    formatted = "\n".join(lines)

    if output:
        output_path = Path(output)
        output_path.write_text(formatted, encoding="utf-8")
        click.echo(f"Wrote prompt to {output_path}")
    else:
        click.echo(formatted)
|
|
1190
|
+
|
|
1191
|
+
|
|
1192
|
+
@gauntlet.command("learn")
@click.argument("issues_file", type=click.Path(exists=True))
@click.option("--source", "-s", help="Source identifier (e.g., 'gauntlet:PR#42')")
@click.option("--json", "output_json", is_flag=True, help="Output as JSON")
def gauntlet_learn(issues_file: str, source: str | None, output_json: bool):
    """Persist learnings from a gauntlet review.

    Takes a JSON file of issues (in the gauntlet output format)
    and calls learn_from_review to persist them.

    Examples:

    buildlog gauntlet learn review_issues.json
    buildlog gauntlet learn issues.json --source "gauntlet:2026-01-22"
    """
    # NOTE: the docstring above is rendered as CLI help, so implementation
    # notes live in comments instead.
    #
    # issues_file: existing JSON file holding either a list of issues or a
    #              dict with an "all_issues" or "issues" list.
    # source:      label passed to learn_from_review; defaults to "gauntlet".
    # output_json: emit the result dataclass as JSON instead of a summary.
    # Exits with status 1 on a missing buildlog/ directory, unreadable JSON,
    # an unrecognised payload shape, or an empty issue list.
    import json as json_module
    from dataclasses import asdict

    from buildlog.core import learn_from_review

    buildlog_dir = Path("buildlog")

    if not buildlog_dir.exists():
        click.echo("No buildlog/ directory found. Run 'buildlog init' first.", err=True)
        raise SystemExit(1)

    # Load issues. Decode explicitly as UTF-8: the platform default encoding
    # is locale-dependent and would mis-read non-ASCII descriptions.
    try:
        with open(issues_file, encoding="utf-8") as f:
            data = json_module.load(f)
    except (json_module.JSONDecodeError, UnicodeDecodeError) as e:
        click.echo(f"Invalid JSON: {e}", err=True)
        raise SystemExit(1)

    # Handle different formats
    if isinstance(data, list):
        issues = data
    elif isinstance(data, dict) and "all_issues" in data:
        issues = data["all_issues"]
    elif isinstance(data, dict) and "issues" in data:
        issues = data["issues"]
    else:
        click.echo(
            "Expected list of issues or dict with 'issues'/'all_issues'", err=True
        )
        raise SystemExit(1)

    if not issues:
        click.echo("No issues found in file.", err=True)
        raise SystemExit(1)

    # A dict wrapper may hold something other than a list under its key;
    # reject it here rather than handing an unexpected shape downstream.
    if not isinstance(issues, list):
        click.echo(
            "Expected list of issues or dict with 'issues'/'all_issues'", err=True
        )
        raise SystemExit(1)

    # Learn from review
    result = learn_from_review(buildlog_dir, issues, source=source or "gauntlet")

    if output_json:
        click.echo(json_module.dumps(asdict(result), indent=2))
    else:
        click.echo(f"✓ {result.message}")
        click.echo(f" New learnings: {result.new_learnings}")
        click.echo(f" Reinforced: {result.reinforced_learnings}")
        click.echo(f" Total processed: {result.total_issues_processed}")
|
|
1253
|
+
|
|
1254
|
+
|
|
879
1255
|
if __name__ == "__main__":
|
|
880
1256
|
main()
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Seed Engine - Formalized pipeline for creating reviewer personas.
|
|
2
|
+
|
|
3
|
+
The seed engine abstracts the 4-step process for bootstrapping
|
|
4
|
+
defensible reviewer personas from authoritative domain sources:
|
|
5
|
+
|
|
6
|
+
1. SOURCE IDENTIFICATION - Define authoritative sources
|
|
7
|
+
2. RULE EXTRACTION - Extract candidate rules with defensibility fields
|
|
8
|
+
3. CATEGORIZATION - Map rules to persona concern categories
|
|
9
|
+
4. SEED GENERATION - Output validated YAML seed file
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
from buildlog.seed_engine import Pipeline, Source, SourceType
|
|
13
|
+
|
|
14
|
+
# Define sources
|
|
15
|
+
sources = [
|
|
16
|
+
Source(
|
|
17
|
+
name="OWASP Top 10",
|
|
18
|
+
url="https://owasp.org/Top10/",
|
|
19
|
+
source_type=SourceType.REFERENCE_DOC,
|
|
20
|
+
domain="security",
|
|
21
|
+
)
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
# Run pipeline
|
|
25
|
+
pipeline = Pipeline(persona="security_karen")
|
|
26
|
+
seed_file = pipeline.run(sources)
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from buildlog.seed_engine.categorizers import (
    Categorizer,
    CategoryMapping,
    MappingCategorizer,
    TagBasedCategorizer,
)
from buildlog.seed_engine.extractors import ManualExtractor, RuleExtractor
from buildlog.seed_engine.generators import SeedGenerator
from buildlog.seed_engine.models import (
    CandidateRule,
    CategorizedRule,
    Source,
    SourceType,
)
from buildlog.seed_engine.pipeline import Pipeline
from buildlog.seed_engine.sources import (
    FetchStatus,
    SourceEntry,
    SourceFetcher,
    SourceManifest,
    url_to_cache_filename,
)

# Public API of the seed engine package, grouped by pipeline stage.
__all__ = [
    # Models
    "Source",
    "SourceType",
    "CandidateRule",
    "CategorizedRule",
    # Pipeline
    "Pipeline",
    # Extractors
    "RuleExtractor",
    "ManualExtractor",
    # Categorizers
    "Categorizer",
    "TagBasedCategorizer",
    # MappingCategorizer is defined alongside TagBasedCategorizer in
    # categorizers.py; export it so both implementations are importable
    # from the package root.
    "MappingCategorizer",
    "CategoryMapping",
    # Generators
    "SeedGenerator",
    # Sources
    "FetchStatus",
    "SourceEntry",
    "SourceManifest",
    "SourceFetcher",
    "url_to_cache_filename",
]
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Rule categorizers for Step 3 of the seed engine pipeline.
|
|
2
|
+
|
|
3
|
+
Categorizers take candidate rules and assign final categories and tags.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from buildlog.seed_engine.models import CandidateRule, CategorizedRule
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Categorizer(ABC):
    """Abstract base class for rule categorizers.

    Subclasses must override :meth:`categorize`; this class cannot be
    instantiated directly.

    Implementations:
    - TagBasedCategorizer: Category from tags/keywords
    - MappingCategorizer: Explicit source→category mapping
    """

    @abstractmethod
    def categorize(self, rule: CandidateRule) -> CategorizedRule:
        """Turn a candidate rule into a categorized rule.

        Args:
            rule: The candidate rule to categorize.

        Returns:
            The rule with its category and final tags assigned.
        """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class CategoryMapping:
    """Associates a set of trigger keywords with a single category."""

    # Category assigned when one of the keywords matches.
    category: str
    # Trigger words; a match in a rule's tags or rule text selects `category`.
    keywords: list[str]
    # Conflict tie-breaker: mappings with a higher value win.
    priority: int = 0
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TagBasedCategorizer(Categorizer):
    """Categorize rules based on their tags and keywords.

    A rule receives the category of the first mapping (highest priority
    first) that has a keyword equal to one of the rule's normalized tags or
    contained in the lower-cased rule text; otherwise it receives
    ``default_category``.

    Usage:
        categorizer = TagBasedCategorizer(
            default_category="testing",
            mappings=[
                CategoryMapping("coverage", ["coverage", "untested"]),
                CategoryMapping("isolation", ["flaky", "order", "hermetic"]),
                CategoryMapping("assertions", ["assert", "expect", "verify"]),
            ],
            tag_normalizer=lambda t: t.lower().replace("-", "_"),
        )

        categorized = categorizer.categorize(candidate_rule)
    """

    def __init__(
        self,
        default_category: str,
        mappings: list[CategoryMapping] | None = None,
        tag_normalizer: Callable[[str], str] | None = None,
        additional_tags: list[str] | None = None,
    ) -> None:
        """Configure the categorizer.

        Args:
            default_category: Category used when no mapping matches.
            mappings: Keyword-to-category mappings; evaluated in descending
                ``priority`` order.
            tag_normalizer: Applied to every raw tag; defaults to lower-casing.
            additional_tags: Tags appended to every categorized rule.
        """
        self.default_category = default_category
        # sorted() is stable, so mappings with equal priority keep the
        # caller's order.
        self.mappings = sorted(mappings or [], key=lambda m: m.priority, reverse=True)
        self.tag_normalizer = tag_normalizer or (lambda t: t.lower())
        self.additional_tags = additional_tags or []

    def categorize(self, rule: CandidateRule) -> CategorizedRule:
        """Assign category based on tag matching.

        Args:
            rule: Candidate whose ``raw_tags`` and ``rule`` text are inspected.

        Returns:
            The result of ``CategorizedRule.from_candidate`` with the chosen
            category and the de-duplicated normalized + additional tags, in
            first-seen order.
        """
        normalized_tags = [self.tag_normalizer(t) for t in rule.raw_tags]
        tag_lookup = set(normalized_tags)

        # Keywords are also searched for (as substrings) in the rule text.
        rule_text_lower = rule.rule.lower()

        category = self.default_category
        for mapping in self.mappings:
            keywords = (keyword.lower() for keyword in mapping.keywords)
            if any(kw in tag_lookup or kw in rule_text_lower for kw in keywords):
                category = mapping.category
                break

        # dict.fromkeys de-duplicates while preserving insertion order, so
        # the same input always yields the same tag order; list(set(...))
        # made the order vary with string hash randomisation between runs.
        final_tags = list(dict.fromkeys([*normalized_tags, *self.additional_tags]))

        return CategorizedRule.from_candidate(
            candidate=rule,
            category=category,
            tags=final_tags,
        )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class MappingCategorizer(Categorizer):
    """Categorize rules via explicit source→category mapping.

    Useful when sources map directly to categories
    (e.g., OWASP A03 → "injection").

    Usage:
        categorizer = MappingCategorizer(
            source_category_map={
                "https://owasp.org/Top10/A03": "injection",
                "https://owasp.org/Top10/A01": "access-control",
            },
            default_category="security",
        )
    """

    def __init__(
        self,
        source_category_map: dict[str, str],
        default_category: str,
        tag_transform: Callable[[list[str]], list[str]] | None = None,
    ) -> None:
        """Configure the categorizer.

        Args:
            source_category_map: URL prefix → category. Prefixes are tried in
                the dict's iteration order and the first match wins, so list
                more specific prefixes before broader ones.
            default_category: Category used when no prefix matches.
            tag_transform: Optional function applied to a rule's raw tags to
                produce its final tags; defaults to leaving them unchanged.
        """
        self.source_category_map = source_category_map
        self.default_category = default_category
        self.tag_transform = tag_transform or (lambda tags: tags)

    def categorize(self, rule: CandidateRule) -> CategorizedRule:
        """Assign category based on source URL.

        Args:
            rule: Candidate whose ``source.url`` is matched against the
                configured URL prefixes.

        Returns:
            The result of ``CategorizedRule.from_candidate`` with the matched
            (or default) category and the transformed tags.
        """
        source_url = rule.source.url
        category = next(
            (
                cat
                for url_prefix, cat in self.source_category_map.items()
                if source_url.startswith(url_prefix)
            ),
            self.default_category,
        )

        # Copy the transform's result: the default transform returns
        # rule.raw_tags itself, and handing that same list on would let a
        # later mutation of one rule's tags leak into the other.
        final_tags = list(self.tag_transform(rule.raw_tags))

        return CategorizedRule.from_candidate(
            candidate=rule,
            category=category,
            tags=final_tags,
        )
|