codebase-intel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. codebase_intel/__init__.py +3 -0
  2. codebase_intel/analytics/__init__.py +1 -0
  3. codebase_intel/analytics/benchmark.py +406 -0
  4. codebase_intel/analytics/feedback.py +496 -0
  5. codebase_intel/analytics/tracker.py +439 -0
  6. codebase_intel/cli/__init__.py +1 -0
  7. codebase_intel/cli/main.py +740 -0
  8. codebase_intel/contracts/__init__.py +1 -0
  9. codebase_intel/contracts/auto_generator.py +438 -0
  10. codebase_intel/contracts/evaluator.py +531 -0
  11. codebase_intel/contracts/models.py +433 -0
  12. codebase_intel/contracts/registry.py +225 -0
  13. codebase_intel/core/__init__.py +1 -0
  14. codebase_intel/core/config.py +248 -0
  15. codebase_intel/core/exceptions.py +454 -0
  16. codebase_intel/core/types.py +375 -0
  17. codebase_intel/decisions/__init__.py +1 -0
  18. codebase_intel/decisions/miner.py +297 -0
  19. codebase_intel/decisions/models.py +302 -0
  20. codebase_intel/decisions/store.py +411 -0
  21. codebase_intel/drift/__init__.py +1 -0
  22. codebase_intel/drift/detector.py +443 -0
  23. codebase_intel/graph/__init__.py +1 -0
  24. codebase_intel/graph/builder.py +391 -0
  25. codebase_intel/graph/parser.py +1232 -0
  26. codebase_intel/graph/query.py +377 -0
  27. codebase_intel/graph/storage.py +736 -0
  28. codebase_intel/mcp/__init__.py +1 -0
  29. codebase_intel/mcp/server.py +710 -0
  30. codebase_intel/orchestrator/__init__.py +1 -0
  31. codebase_intel/orchestrator/assembler.py +649 -0
  32. codebase_intel-0.1.0.dist-info/METADATA +361 -0
  33. codebase_intel-0.1.0.dist-info/RECORD +36 -0
  34. codebase_intel-0.1.0.dist-info/WHEEL +4 -0
  35. codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
  36. codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,375 @@
1
+ """Core type definitions shared across all modules.
2
+
3
+ Design principles:
4
+ - Pydantic models for all structured data — never raw dicts
5
+ - Immutable by default (frozen=True) — mutation must be explicit
6
+ - Content-addressable where possible — xxhash for identity
7
+ - Serializable to JSON for MCP transport and SQLite storage
8
+
9
+ Edge cases addressed in type design:
10
+ - File paths: always resolved to absolute, normalized (no symlink ambiguity)
11
+ - Timestamps: always UTC, never naive datetimes
12
+ - Identifiers: content-hash based where possible (stable across renames)
13
+ - Line ranges: 1-indexed to match editor conventions, validated min<=max
14
+ - Token counts: approximate — different models tokenize differently
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import hashlib
20
+ from datetime import UTC, datetime
21
+ from enum import Enum
22
+ from pathlib import Path
23
+ from typing import Annotated, Any, Self
24
+
25
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Enums
29
+ # ---------------------------------------------------------------------------
30
+
31
+
32
class NodeKind(str, Enum):
    """Types of nodes in the semantic code graph.

    Subclasses ``str``, so members compare equal to — and serialize as —
    their plain string values (JSON/SQLite friendly).
    """

    MODULE = "module"  # A file / compilation unit
    CLASS = "class"
    FUNCTION = "function"  # Free function (not bound to a class)
    METHOD = "method"  # Function defined inside a class
    VARIABLE = "variable"  # Module-level constants, global state
    INTERFACE = "interface"  # TS interfaces, Python Protocols
    TYPE_ALIAS = "type_alias"
    ENDPOINT = "endpoint"  # HTTP/gRPC/GraphQL endpoints
    CONFIG = "config"  # Config files that affect behavior
    UNKNOWN = "unknown"  # Parsed but unclassifiable
45
+
46
+
47
class EdgeKind(str, Enum):
    """Types of relationships between graph nodes.

    Edges are directed, source → target (e.g. source IMPORTS target).
    Subclasses ``str`` so values serialize directly.
    """

    IMPORTS = "imports"  # Static import
    DYNAMIC_IMPORT = "dynamic_import"  # importlib, dynamic require()
    CALLS = "calls"  # Function/method invocation
    INHERITS = "inherits"  # Class inheritance
    IMPLEMENTS = "implements"  # Interface implementation
    INSTANTIATES = "instantiates"  # Object creation
    READS = "reads"  # Reads from variable/config
    WRITES = "writes"  # Mutates variable/state
    DEPENDS_ON = "depends_on"  # Generic dependency (package-level)
    TESTS = "tests"  # Test file → source file relationship
    CONFIGURES = "configures"  # Config file → module it affects
    RE_EXPORTS = "re_exports"  # Barrel file re-exporting
62
+
63
+
64
class Language(str, Enum):
    """Supported source languages — 19 languages via tree-sitter-language-pack.

    Values presumably follow tree-sitter grammar naming — note the
    irregular "c_sharp" (not "csharp"); confirm against the language pack.
    """

    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    TSX = "tsx"  # TypeScript + JSX, parsed by a separate grammar
    GO = "go"
    RUST = "rust"
    JAVA = "java"
    RUBY = "ruby"
    C = "c"
    CPP = "cpp"
    CSHARP = "c_sharp"  # NOTE: underscore value, not "csharp"
    PHP = "php"
    SWIFT = "swift"
    KOTLIN = "kotlin"
    SCALA = "scala"
    LUA = "lua"
    DART = "dart"
    ELIXIR = "elixir"
    HASKELL = "haskell"
    UNKNOWN = "unknown"  # tracked in graph but not parsed internally
87
+
88
+
89
class DecisionStatus(str, Enum):
    """Lifecycle of a decision record."""

    DRAFT = "draft"  # Not yet finalized (e.g. auto-mined, awaiting human review)
    ACTIVE = "active"  # Currently in effect
    SUPERSEDED = "superseded"  # Replaced by another decision
    DEPRECATED = "deprecated"  # Still in code but being removed
    EXPIRED = "expired"  # Past its review date
97
+
98
+
99
class ContractSeverity(str, Enum):
    """How strictly a contract rule is enforced.

    Ordered from strictest to most lenient.
    """

    ERROR = "error"  # Must not violate — blocks generation
    WARNING = "warning"  # Should not violate — agent sees it as guidance
    INFO = "info"  # Nice to know — lowest priority context
105
+
106
+
107
class DriftLevel(str, Enum):
    """Severity of detected drift between an anchor and current code.

    Ordered from no drift to total loss of the anchored region.
    """

    NONE = "none"  # Anchor still matches exactly
    LOW = "low"  # Minor: line numbers shifted
    MEDIUM = "medium"  # Moderate: content changed but structure intact
    HIGH = "high"  # Major: code significantly different
    CRITICAL = "critical"  # Anchor deleted or file removed
115
+
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # Value objects (immutable, content-addressed)
119
+ # ---------------------------------------------------------------------------
120
+
121
+
122
class FileFingerprint(BaseModel):
    """Immutable snapshot identity for one file: path + content hash + metadata.

    Edge cases:
    - Renamed files: an identical content hash lets a rename be told apart
      from a delete + create.
    - Binary files: hashed like any other file; internals are never parsed.
    - Empty files: a valid fingerprint (the empty byte string hashes fine).
    - Encoding issues: hashing reads raw bytes; decoding happens elsewhere.
    """

    model_config = ConfigDict(frozen=True)

    path: Path
    content_hash: str = Field(description="xxhash of file bytes")
    size_bytes: int = Field(ge=0)
    last_modified: datetime
    language: Language = Language.UNKNOWN

    @field_validator("last_modified")
    @classmethod
    def ensure_utc(cls, v: datetime) -> datetime:
        # Naive datetimes are assumed to already be UTC; aware ones convert.
        return v.replace(tzinfo=UTC) if v.tzinfo is None else v.astimezone(UTC)

    @field_validator("path")
    @classmethod
    def normalize_path(cls, v: Path) -> Path:
        # Resolve symlinks / relative segments so equal files compare equal.
        return v.resolve()
151
+
152
+
153
class LineRange(BaseModel):
    """An inclusive, 1-indexed range of lines within a file.

    Edge cases:
    - Single-line range: start == end is legal.
    - Whole-file range: start=1, end=line_count.
    - After refactors line numbers drift; re-anchoring uses content hashes.
    """

    model_config = ConfigDict(frozen=True)

    start: int = Field(ge=1)
    end: int = Field(ge=1)

    @model_validator(mode="after")
    def start_before_end(self) -> Self:
        # Reject inverted ranges up front rather than at use sites.
        if self.end < self.start:
            raise ValueError(
                f"Line range start ({self.start}) must be <= end ({self.end})"
            )
        return self

    @property
    def span(self) -> int:
        """Number of lines covered, counting both endpoints."""
        return 1 + self.end - self.start
177
+
178
+
179
class CodeAnchor(BaseModel):
    """Ties a non-code artifact (decision, contract) to a spot in the codebase.

    Edge cases:
    - File renamed: content_hash lets the region be found at its new path.
    - Lines shifted: hashing the anchored region allows re-location.
    - File deleted: the anchor becomes orphaned; drift detection flags it.
    - Function renamed: symbol_name aids re-anchoring when the hash changes.
    """

    model_config = ConfigDict(frozen=True)

    file_path: Path
    line_range: LineRange | None = None
    symbol_name: str | None = None  # e.g., "RateLimiter.check"
    content_hash: str | None = None  # hash of the anchored code region

    @field_validator("file_path")
    @classmethod
    def normalize_path(cls, v: Path) -> Path:
        # Store absolute, resolved paths so comparisons are unambiguous.
        return v.resolve()

    def is_orphaned(self, existing_paths: set[Path]) -> bool:
        """Return True when no path in the given set matches this anchor's file."""
        resolved = self.file_path.resolve()
        return resolved not in existing_paths
204
+
205
+
206
class TokenBudget(BaseModel):
    """Describes how many tokens context assembly is allowed to spend.

    Edge cases:
    - Zero usable budget: callers should emit metadata-only context.
    - Budget below any useful minimum: highest-priority items win, with a
      truncation warning.
    - Tokenizer variance: counts are approximate (tiktoken cl100k_base is
      the stated baseline), so a safety margin is held back.
    """

    model_config = ConfigDict(frozen=True)

    total: int = Field(gt=0, description="Total tokens available for context")
    reserved_for_response: int = Field(
        default=0,
        ge=0,
        description="Tokens to reserve for the agent's response",
    )
    safety_margin_pct: float = Field(
        default=0.1,
        ge=0,
        le=0.5,
        description="Percentage to hold back for tokenization variance",
    )

    @property
    def usable(self) -> int:
        """Tokens actually spendable on context once reserves are deducted.

        Never negative, even when the reserve exceeds the total.
        """
        after_reserve = self.total - self.reserved_for_response
        remaining = after_reserve - int(after_reserve * self.safety_margin_pct)
        return remaining if remaining > 0 else 0
239
+
240
+
241
+ # ---------------------------------------------------------------------------
242
+ # Graph node & edge
243
+ # ---------------------------------------------------------------------------
244
+
245
+
246
class GraphNode(BaseModel):
    """One node of the semantic code graph.

    Edge cases:
    - Generated code: is_generated=True signals a lower trust weight.
    - Vendored/third-party code: is_external=True, treated as read-only.
    - Test files: is_test=True, tied to their subject via a TESTS edge.
    - Barrel/index files: mostly RE_EXPORTS edges with little own content.
    """

    model_config = ConfigDict(frozen=True)

    node_id: str = Field(description="Stable ID: hash of (path, kind, name)")
    kind: NodeKind
    name: str
    qualified_name: str = Field(
        description="Full path: module.Class.method"
    )
    file_path: Path
    line_range: LineRange | None = None
    language: Language = Language.UNKNOWN
    content_hash: str | None = None
    docstring: str | None = None

    is_generated: bool = False
    is_external: bool = False
    is_test: bool = False
    is_entry_point: bool = False

    metadata: dict[str, Any] = Field(default_factory=dict)

    @staticmethod
    def make_id(file_path: Path, kind: NodeKind, name: str) -> str:
        """Derive the deterministic node ID from its identity triple.

        The path is resolved first so the ID is stable regardless of how
        the caller spelled the path.
        """
        identity = ":".join((str(file_path.resolve()), kind.value, name))
        return hashlib.sha256(identity.encode()).hexdigest()[:16]
282
+
283
+
284
class GraphEdge(BaseModel):
    """A directed edge between two graph nodes (source → target).

    Edge cases:
    - Conditional edges: import inside if TYPE_CHECKING — marked is_type_only
    - Dynamic edges: importlib.import_module() — marked with confidence < 1.0
    - Circular edges: A → B → A — valid, traversal must handle
    - Cross-language edges: Python calling C extension — low confidence
    """

    model_config = ConfigDict(frozen=True)

    # Node IDs — presumably GraphNode.make_id output; confirm at call sites.
    source_id: str
    target_id: str
    kind: EdgeKind
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="1.0 = static import, <1.0 = inferred/dynamic",
    )
    is_type_only: bool = Field(
        default=False,
        description="True for TYPE_CHECKING imports, type annotations only",
    )
    # Free-form extras; not validated beyond being a dict.
    metadata: dict[str, Any] = Field(default_factory=dict)
310
+
311
+
312
+ # ---------------------------------------------------------------------------
313
+ # Context assembly types
314
+ # ---------------------------------------------------------------------------
315
+
316
+
317
class ContextPriority(str, Enum):
    """Priority levels for context items during budget allocation.

    Ordered from must-include down to expendable.
    """

    CRITICAL = "critical"  # Must include — directly referenced files
    HIGH = "high"  # Should include — immediate dependencies, active decisions
    MEDIUM = "medium"  # Include if budget allows — transitive deps, older decisions
    LOW = "low"  # Nice to have — tangential context, info-level contracts
324
+
325
+
326
class ContextItem(BaseModel):
    """A single item in the assembled context payload.

    This is the universal wrapper — every piece of context (file content,
    decision record, contract rule) gets wrapped in this for the orchestrator
    to prioritize and budget.

    NOTE(review): unlike the graph/value models above, this model is not
    frozen — presumably so assembly can adjust items in place; confirm.
    """

    # Which subsystem produced the item; free-form string, not an enum.
    source: str = Field(description="Which module provided this: graph|decisions|contracts")
    item_type: str = Field(description="Specific type: file_content|decision|contract_rule|warning")
    priority: ContextPriority
    # Approximate inclusion cost — see TokenBudget for tokenizer caveats.
    estimated_tokens: int = Field(ge=0)
    content: str
    metadata: dict[str, Any] = Field(default_factory=dict)
    freshness_score: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="1.0 = just validated, 0.0 = very stale",
    )
346
+
347
+
348
class AssembledContext(BaseModel):
    """The final context payload sent to an AI agent.

    Edge cases:
    - Empty context: valid if the task has no relevant files (new greenfield code)
    - Truncated context: items were dropped due to budget — truncated=True with
      a summary of what was dropped
    - Conflicting context: contains contradictions — conflicts list populated
    - Partial context: some modules unavailable — warnings list populated
    """

    # The budgeted, prioritized items in final order.
    items: list[ContextItem] = Field(default_factory=list)
    # Actual tokens used vs. the budget they were fitted into.
    total_tokens: int = 0
    budget_tokens: int = 0
    truncated: bool = False
    dropped_count: int = Field(
        default=0,
        description="Number of items dropped due to budget",
    )
    conflicts: list[str] = Field(
        default_factory=list,
        description="Human-readable conflict descriptions",
    )
    warnings: list[str] = Field(
        default_factory=list,
        description="Degradation notices (partial init, stale data, etc.)",
    )
    # Wall-clock assembly duration, for diagnostics.
    assembly_time_ms: float = 0.0
@@ -0,0 +1 @@
1
+ """Decision Journal — structured records of architectural and business decisions."""
@@ -0,0 +1,297 @@
1
+ """Git history miner — extracts decision candidates from PRs, commits, and comments.
2
+
3
+ The goal is to auto-discover decisions that were made but never formally recorded.
4
+ Developers make decisions constantly in PR descriptions, commit messages, and code
5
+ review comments — this module surfaces them.
6
+
7
+ This is SUGGESTION-based, not automatic. Every mined decision has confidence < 1.0
8
+ and should be reviewed by a human before becoming official.
9
+
10
+ Edge cases:
11
+ - PR description is empty: skip (nothing to mine)
12
+ - PR description is a template with checkboxes: extract only non-template content
13
+ - Commit message is "fix" or "wip": skip (no decision content)
14
+ - Multiple decisions in one PR: extract each as a separate candidate
15
+ - Decision language in non-English: basic support via keyword matching only
16
+ - Merge commits: skip (they reference the PR, not new decisions)
17
+ - Squash commits: contain the full PR description — high-value target
18
+ - Revert commits: flag the original decision as potentially superseded
19
+ - Git history is very large: limit mining depth with max_commits parameter
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import logging
25
+ import re
26
+ from dataclasses import dataclass, field
27
+ from datetime import UTC, datetime
28
+ from pathlib import Path
29
+ from typing import TYPE_CHECKING
30
+
31
+ from codebase_intel.core.types import CodeAnchor, DecisionStatus
32
+ from codebase_intel.decisions.models import DecisionRecord
33
+
34
+ if TYPE_CHECKING:
35
+ from codebase_intel.core.config import DecisionConfig
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+ # Keywords that suggest a commit/PR contains a decision
40
+ DECISION_KEYWORDS = [
41
+ # Architecture
42
+ "decided", "decision", "chose", "chosen", "opted for", "switched to",
43
+ "migrated from", "replaced", "instead of", "rather than",
44
+ # Reasoning
45
+ "because", "reason:", "rationale:", "why:", "trade-off", "tradeoff",
46
+ "considered", "evaluated", "compared",
47
+ # Constraints
48
+ "compliance", "regulation", "requirement", "sla", "must not", "cannot",
49
+ "forbidden", "prohibited",
50
+ # Breaking changes
51
+ "breaking change", "breaking:", "deprecated", "removed",
52
+ # Architecture-specific
53
+ "adr", "architecture decision", "design decision", "rfc",
54
+ ]
55
+
56
+ # Patterns that indicate a commit should be skipped
57
+ SKIP_PATTERNS = [
58
+ r"^merge\s",
59
+ r"^wip\b",
60
+ r"^fix\s*(typo|lint|format|style)",
61
+ r"^chore\s*:",
62
+ r"^bump\s+version",
63
+ r"^update\s+lock",
64
+ r"^auto-generated",
65
+ ]
66
+
67
+
68
+ @dataclass
69
+ class DecisionCandidate:
70
+ """A potential decision extracted from git history, pending human review."""
71
+
72
+ title: str
73
+ context: str
74
+ decision_text: str
75
+ source_type: str # "commit", "pr_description", "pr_comment"
76
+ source_ref: str # commit hash, PR URL
77
+ author: str
78
+ created_at: datetime
79
+ changed_files: list[Path] = field(default_factory=list)
80
+ confidence: float = 0.5
81
+ keywords_matched: list[str] = field(default_factory=list)
82
+
83
+ def to_decision_record(self, decision_id: str) -> DecisionRecord:
84
+ """Convert this candidate to a draft DecisionRecord."""
85
+ anchors = [
86
+ CodeAnchor(file_path=fp) for fp in self.changed_files[:10]
87
+ ]
88
+
89
+ return DecisionRecord(
90
+ id=decision_id,
91
+ title=self.title,
92
+ status=DecisionStatus.DRAFT,
93
+ context=self.context,
94
+ decision=self.decision_text,
95
+ code_anchors=anchors,
96
+ created_at=self.created_at,
97
+ author=self.author,
98
+ source="git-mined",
99
+ source_ref=self.source_ref,
100
+ confidence=self.confidence,
101
+ tags=["auto-mined"],
102
+ )
103
+
104
+
105
+ class GitMiner:
106
+ """Mines git history for decision candidates."""
107
+
108
+ def __init__(
109
+ self,
110
+ config: DecisionConfig,
111
+ project_root: Path,
112
+ ) -> None:
113
+ self._config = config
114
+ self._project_root = project_root
115
+
116
+ async def mine_commits(
117
+ self,
118
+ max_commits: int = 500,
119
+ since_days: int = 90,
120
+ ) -> list[DecisionCandidate]:
121
+ """Mine recent commit messages for decision candidates.
122
+
123
+ Edge cases:
124
+ - Not a git repo: return empty list with warning
125
+ - Shallow clone: limited history available — mine what we have
126
+ - Binary commits (large media files): skip based on file extensions
127
+ - Encoding issues in commit messages: handle gracefully
128
+ """
129
+ try:
130
+ from git import Repo
131
+ except ImportError:
132
+ logger.warning("GitPython not installed — cannot mine commits")
133
+ return []
134
+
135
+ try:
136
+ repo = Repo(self._project_root, search_parent_directories=True)
137
+ except Exception:
138
+ logger.warning("Not a git repository: %s", self._project_root)
139
+ return []
140
+
141
+ candidates: list[DecisionCandidate] = []
142
+ count = 0
143
+
144
+ since_dt = datetime.now(UTC).replace(
145
+ day=max(1, datetime.now(UTC).day),
146
+ )
147
+
148
+ for commit in repo.iter_commits(max_count=max_commits):
149
+ count += 1
150
+
151
+ message = commit.message.strip()
152
+ if not message:
153
+ continue
154
+
155
+ # Skip noise commits
156
+ if self._should_skip(message):
157
+ continue
158
+
159
+ # Check for decision keywords
160
+ matched_keywords = self._match_keywords(message)
161
+ if not matched_keywords:
162
+ continue
163
+
164
+ # Extract changed files
165
+ changed_files: list[Path] = []
166
+ try:
167
+ if commit.parents:
168
+ diff = commit.parents[0].diff(commit)
169
+ changed_files = [
170
+ self._project_root / d.a_path
171
+ for d in diff
172
+ if d.a_path
173
+ ]
174
+ except Exception:
175
+ pass # Diff extraction is best-effort
176
+
177
+ # Build candidate
178
+ title = self._extract_title(message)
179
+ context, decision_text = self._extract_context_and_decision(message)
180
+
181
+ confidence = self._compute_confidence(
182
+ message, matched_keywords, changed_files
183
+ )
184
+
185
+ candidates.append(DecisionCandidate(
186
+ title=title,
187
+ context=context,
188
+ decision_text=decision_text,
189
+ source_type="commit",
190
+ source_ref=str(commit.hexsha)[:12],
191
+ author=commit.author.name if commit.author else "unknown",
192
+ created_at=datetime.fromtimestamp(commit.committed_date, tz=UTC),
193
+ changed_files=changed_files[:20],
194
+ confidence=confidence,
195
+ keywords_matched=matched_keywords,
196
+ ))
197
+
198
+ logger.info(
199
+ "Mined %d commits, found %d decision candidates",
200
+ count,
201
+ len(candidates),
202
+ )
203
+
204
+ return candidates
205
+
206
+ def _should_skip(self, message: str) -> bool:
207
+ """Check if a commit message should be skipped.
208
+
209
+ Edge case: multi-line messages — check only the first line for
210
+ skip patterns but check all lines for decision keywords.
211
+ """
212
+ first_line = message.split("\n")[0].lower().strip()
213
+ return any(re.match(pattern, first_line) for pattern in SKIP_PATTERNS)
214
+
215
+ def _match_keywords(self, message: str) -> list[str]:
216
+ """Find decision-indicating keywords in the message."""
217
+ message_lower = message.lower()
218
+ return [kw for kw in DECISION_KEYWORDS if kw in message_lower]
219
+
220
+ def _extract_title(self, message: str) -> str:
221
+ """Extract a title from the commit message.
222
+
223
+ Convention: first line of the commit message, truncated at 80 chars.
224
+ Edge case: first line is very long (someone put everything on one line).
225
+ """
226
+ first_line = message.split("\n")[0].strip()
227
+ if len(first_line) > 80:
228
+ return first_line[:77] + "..."
229
+ return first_line
230
+
231
+ def _extract_context_and_decision(
232
+ self, message: str
233
+ ) -> tuple[str, str]:
234
+ """Separate the context ("why") from the decision ("what") in a message.
235
+
236
+ Heuristic: lines before "because"/"reason:" are the decision,
237
+ lines after are the context. If no such separator, the whole
238
+ message is both context and decision.
239
+
240
+ Edge case: message with no clear separation — use the first line
241
+ as the decision and the rest as context.
242
+ """
243
+ lines = message.strip().split("\n")
244
+
245
+ if len(lines) <= 1:
246
+ return message, message
247
+
248
+ first_line = lines[0].strip()
249
+ body = "\n".join(lines[1:]).strip()
250
+
251
+ if not body:
252
+ return first_line, first_line
253
+
254
+ return body, first_line
255
+
256
+ def _compute_confidence(
257
+ self,
258
+ message: str,
259
+ keywords: list[str],
260
+ changed_files: list[Path],
261
+ ) -> float:
262
+ """Compute confidence score for a mined decision candidate.
263
+
264
+ Factors:
265
+ - Number of keywords matched (more = higher)
266
+ - Message length (longer = more context = higher)
267
+ - Number of files changed (fewer = more focused = higher)
268
+ - Presence of reasoning words (because, reason) = higher
269
+ - Presence of "adr" or "decision" = much higher
270
+
271
+ Score range: 0.2 (barely qualifying) to 0.8 (strong signal).
272
+ Never 1.0 — that requires human confirmation.
273
+ """
274
+ score = 0.3 # Base score for matching any keyword
275
+
276
+ # Keyword count bonus
277
+ score += min(0.2, len(keywords) * 0.05)
278
+
279
+ # Message length bonus (meaningful messages are longer)
280
+ if len(message) > 200:
281
+ score += 0.1
282
+ if len(message) > 500:
283
+ score += 0.1
284
+
285
+ # Focused changes (fewer files = clearer decision)
286
+ if 1 <= len(changed_files) <= 5:
287
+ score += 0.05
288
+
289
+ # High-signal keywords
290
+ message_lower = message.lower()
291
+ if any(kw in message_lower for kw in ("adr", "architecture decision", "design decision")):
292
+ score += 0.15
293
+
294
+ if any(kw in message_lower for kw in ("because", "reason:", "rationale:")):
295
+ score += 0.1
296
+
297
+ return min(0.8, score)