codebase-intel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_intel/__init__.py +3 -0
- codebase_intel/analytics/__init__.py +1 -0
- codebase_intel/analytics/benchmark.py +406 -0
- codebase_intel/analytics/feedback.py +496 -0
- codebase_intel/analytics/tracker.py +439 -0
- codebase_intel/cli/__init__.py +1 -0
- codebase_intel/cli/main.py +740 -0
- codebase_intel/contracts/__init__.py +1 -0
- codebase_intel/contracts/auto_generator.py +438 -0
- codebase_intel/contracts/evaluator.py +531 -0
- codebase_intel/contracts/models.py +433 -0
- codebase_intel/contracts/registry.py +225 -0
- codebase_intel/core/__init__.py +1 -0
- codebase_intel/core/config.py +248 -0
- codebase_intel/core/exceptions.py +454 -0
- codebase_intel/core/types.py +375 -0
- codebase_intel/decisions/__init__.py +1 -0
- codebase_intel/decisions/miner.py +297 -0
- codebase_intel/decisions/models.py +302 -0
- codebase_intel/decisions/store.py +411 -0
- codebase_intel/drift/__init__.py +1 -0
- codebase_intel/drift/detector.py +443 -0
- codebase_intel/graph/__init__.py +1 -0
- codebase_intel/graph/builder.py +391 -0
- codebase_intel/graph/parser.py +1232 -0
- codebase_intel/graph/query.py +377 -0
- codebase_intel/graph/storage.py +736 -0
- codebase_intel/mcp/__init__.py +1 -0
- codebase_intel/mcp/server.py +710 -0
- codebase_intel/orchestrator/__init__.py +1 -0
- codebase_intel/orchestrator/assembler.py +649 -0
- codebase_intel-0.1.0.dist-info/METADATA +361 -0
- codebase_intel-0.1.0.dist-info/RECORD +36 -0
- codebase_intel-0.1.0.dist-info/WHEEL +4 -0
- codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
- codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""Core type definitions shared across all modules.
|
|
2
|
+
|
|
3
|
+
Design principles:
|
|
4
|
+
- Pydantic models for all structured data — never raw dicts
|
|
5
|
+
- Immutable by default (frozen=True) — mutation must be explicit
|
|
6
|
+
- Content-addressable where possible — xxhash for identity
|
|
7
|
+
- Serializable to JSON for MCP transport and SQLite storage
|
|
8
|
+
|
|
9
|
+
Edge cases addressed in type design:
|
|
10
|
+
- File paths: always resolved to absolute, normalized (no symlink ambiguity)
|
|
11
|
+
- Timestamps: always UTC, never naive datetimes
|
|
12
|
+
- Identifiers: content-hash based where possible (stable across renames)
|
|
13
|
+
- Line ranges: 1-indexed to match editor conventions, validated min<=max
|
|
14
|
+
- Token counts: approximate — different models tokenize differently
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import hashlib
|
|
20
|
+
from datetime import UTC, datetime
|
|
21
|
+
from enum import Enum
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Annotated, Any, Self
|
|
24
|
+
|
|
25
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Enums
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class NodeKind(str, Enum):
    """Types of nodes in the semantic code graph.

    Inherits ``str`` so members serialize directly to JSON (MCP transport)
    and SQLite without custom encoders.
    """

    MODULE = "module"  # A file / compilation unit
    CLASS = "class"
    FUNCTION = "function"
    METHOD = "method"
    VARIABLE = "variable"  # Module-level constants, global state
    INTERFACE = "interface"  # TS interfaces, Python Protocols
    TYPE_ALIAS = "type_alias"
    ENDPOINT = "endpoint"  # HTTP/gRPC/GraphQL endpoints
    CONFIG = "config"  # Config files that affect behavior
    UNKNOWN = "unknown"  # Parsed but unclassifiable
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class EdgeKind(str, Enum):
    """Types of directed relationships between graph nodes.

    str-valued for direct JSON/SQLite serialization, like the other enums
    in this module.
    """

    IMPORTS = "imports"  # Static import
    DYNAMIC_IMPORT = "dynamic_import"  # importlib, dynamic require()
    CALLS = "calls"  # Function/method invocation
    INHERITS = "inherits"  # Class inheritance
    IMPLEMENTS = "implements"  # Interface implementation
    INSTANTIATES = "instantiates"  # Object creation
    READS = "reads"  # Reads from variable/config
    WRITES = "writes"  # Mutates variable/state
    DEPENDS_ON = "depends_on"  # Generic dependency (package-level)
    TESTS = "tests"  # Test file → source file relationship
    CONFIGURES = "configures"  # Config file → module it affects
    RE_EXPORTS = "re_exports"  # Barrel file re-exporting
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Language(str, Enum):
    """Supported source languages — 19 languages via tree-sitter-language-pack.

    Values follow the tree-sitter grammar naming (note ``c_sharp``, not
    ``csharp``). Files in unrecognized languages are still tracked in the
    graph under UNKNOWN; they are just not parsed internally.
    """

    PYTHON = "python"
    JAVASCRIPT = "javascript"
    TYPESCRIPT = "typescript"
    TSX = "tsx"
    GO = "go"
    RUST = "rust"
    JAVA = "java"
    RUBY = "ruby"
    C = "c"
    CPP = "cpp"
    CSHARP = "c_sharp"
    PHP = "php"
    SWIFT = "swift"
    KOTLIN = "kotlin"
    SCALA = "scala"
    LUA = "lua"
    DART = "dart"
    ELIXIR = "elixir"
    HASKELL = "haskell"
    UNKNOWN = "unknown"  # tracked in graph but not parsed internally
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class DecisionStatus(str, Enum):
    """Lifecycle of a decision record.

    str-valued for direct JSON/SQLite serialization.
    """

    DRAFT = "draft"  # Not yet finalized
    ACTIVE = "active"  # Currently in effect
    SUPERSEDED = "superseded"  # Replaced by another decision
    DEPRECATED = "deprecated"  # Still in code but being removed
    EXPIRED = "expired"  # Past its review date
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class ContractSeverity(str, Enum):
    """How strictly a contract rule is enforced.

    Ordered from strictest (ERROR) to most advisory (INFO).
    """

    ERROR = "error"  # Must not violate — blocks generation
    WARNING = "warning"  # Should not violate — agent sees it as guidance
    INFO = "info"  # Nice to know — lowest priority context
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class DriftLevel(str, Enum):
    """Severity of detected drift."""

    NONE = "none"  # No drift detected
    LOW = "low"  # Minor: line numbers shifted
    MEDIUM = "medium"  # Moderate: content changed but structure intact
    HIGH = "high"  # Major: code significantly different
    CRITICAL = "critical"  # Anchor deleted or file removed
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# Value objects (immutable, content-addressed)
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class FileFingerprint(BaseModel):
    """Immutable, content-addressed snapshot of a file's identity.

    Because identity keys on the hash of the raw bytes, a renamed file is
    recognizable as a rename (same hash, new path) rather than a
    delete-plus-create. Binary and empty files hash consistently; decoding
    for parsing happens elsewhere, so encoding issues never break hashing.
    """

    model_config = ConfigDict(frozen=True)

    path: Path
    content_hash: str = Field(description="xxhash of file bytes")
    size_bytes: int = Field(ge=0)
    last_modified: datetime
    language: Language = Language.UNKNOWN

    @field_validator("last_modified")
    @classmethod
    def ensure_utc(cls, v: datetime) -> datetime:
        # Naive timestamps are assumed to already be UTC; aware ones are
        # converted, so stored values always compare consistently.
        return v.replace(tzinfo=UTC) if v.tzinfo is None else v.astimezone(UTC)

    @field_validator("path")
    @classmethod
    def normalize_path(cls, v: Path) -> Path:
        # Absolute, symlink-free paths keep identity lookups unambiguous.
        return v.resolve()
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class LineRange(BaseModel):
    """Inclusive, 1-indexed span of lines within a file.

    A single line is represented with start == end; a whole file with
    start=1, end=line_count. When refactors shift line numbers, the range
    is re-anchored elsewhere via content hashing.
    """

    model_config = ConfigDict(frozen=True)

    start: int = Field(ge=1)
    end: int = Field(ge=1)

    @model_validator(mode="after")
    def start_before_end(self) -> Self:
        # Reject inverted ranges at construction time, not at use sites.
        if self.end < self.start:
            raise ValueError(
                f"Line range start ({self.start}) must be <= end ({self.end})"
            )
        return self

    @property
    def span(self) -> int:
        """Number of lines covered, counting both endpoints."""
        return 1 + self.end - self.start
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class CodeAnchor(BaseModel):
    """Links a non-code artifact (decision, contract) to a code location.

    Resilience to change: the region's content hash lets the anchor be
    re-located after renames or line shifts, the symbol name helps
    re-anchoring when the hash itself changes, and a deleted file leaves
    the anchor orphaned for the drift detector to flag.
    """

    model_config = ConfigDict(frozen=True)

    file_path: Path
    line_range: LineRange | None = None
    symbol_name: str | None = None  # dotted symbol, e.g. "RateLimiter.check"
    content_hash: str | None = None  # hash of the anchored code region

    @field_validator("file_path")
    @classmethod
    def normalize_path(cls, v: Path) -> Path:
        # Store absolute paths so orphan checks compare like with like.
        return v.resolve()

    def is_orphaned(self, existing_paths: set[Path]) -> bool:
        """Return True when the anchored file is absent from *existing_paths*."""
        resolved = self.file_path.resolve()
        return resolved not in existing_paths
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class TokenBudget(BaseModel):
    """Token budget governing how much context may be assembled.

    Budgets are approximate — different models tokenize differently — so a
    configurable safety margin is held back on top of any tokens reserved
    for the agent's response. A usable budget of zero is a valid state:
    callers then return metadata-only context, and budgets too small for a
    minimum useful payload yield only the highest-priority items plus a
    truncation warning.
    """

    model_config = ConfigDict(frozen=True)

    total: int = Field(gt=0, description="Total tokens available for context")
    reserved_for_response: int = Field(
        default=0,
        ge=0,
        description="Tokens to reserve for the agent's response",
    )
    safety_margin_pct: float = Field(
        default=0.1,
        ge=0,
        le=0.5,
        description="Percentage to hold back for tokenization variance",
    )

    @property
    def usable(self) -> int:
        """Tokens actually spendable on context, never negative."""
        after_reserve = self.total - self.reserved_for_response
        remaining = after_reserve - int(after_reserve * self.safety_margin_pct)
        return remaining if remaining > 0 else 0
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
# ---------------------------------------------------------------------------
|
|
242
|
+
# Graph node & edge
|
|
243
|
+
# ---------------------------------------------------------------------------
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class GraphNode(BaseModel):
    """A node in the semantic code graph.

    Boolean flags mark special populations: generated code (lower trust
    weight), vendored/external code (read-only context), test files
    (linked to sources via TESTS edges), and entry points. Barrel/index
    files typically carry little content of their own and many RE_EXPORTS
    edges.
    """

    model_config = ConfigDict(frozen=True)

    node_id: str = Field(description="Stable ID: hash of (path, kind, name)")
    kind: NodeKind
    name: str
    qualified_name: str = Field(
        description="Full path: module.Class.method"
    )
    file_path: Path
    line_range: LineRange | None = None
    language: Language = Language.UNKNOWN
    content_hash: str | None = None
    docstring: str | None = None

    is_generated: bool = False
    is_external: bool = False
    is_test: bool = False
    is_entry_point: bool = False

    metadata: dict[str, Any] = Field(default_factory=dict)

    @staticmethod
    def make_id(file_path: Path, kind: NodeKind, name: str) -> str:
        """Deterministic 16-hex-char ID derived from the identity triple."""
        identity = ":".join((str(file_path.resolve()), kind.value, name))
        return hashlib.sha256(identity.encode()).hexdigest()[:16]
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class GraphEdge(BaseModel):
    """A directed edge between two graph nodes.

    Edge cases:
    - Conditional edges: import inside if TYPE_CHECKING — marked is_type_only
    - Dynamic edges: importlib.import_module() — marked with confidence < 1.0
    - Circular edges: A → B → A — valid, traversal must handle
    - Cross-language edges: Python calling C extension — low confidence
    """

    model_config = ConfigDict(frozen=True)

    source_id: str  # node_id of the edge's origin
    target_id: str  # node_id of the edge's destination
    kind: EdgeKind
    confidence: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="1.0 = static import, <1.0 = inferred/dynamic",
    )
    is_type_only: bool = Field(
        default=False,
        description="True for TYPE_CHECKING imports, type annotations only",
    )
    metadata: dict[str, Any] = Field(default_factory=dict)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
# Context assembly types
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
class ContextPriority(str, Enum):
    """Priority levels for context items during budget allocation.

    str-valued for direct JSON/SQLite serialization.
    """

    CRITICAL = "critical"  # Must include — directly referenced files
    HIGH = "high"  # Should include — immediate dependencies, active decisions
    MEDIUM = "medium"  # Include if budget allows — transitive deps, older decisions
    LOW = "low"  # Nice to have — tangential context, info-level contracts
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
class ContextItem(BaseModel):
    """A single item in the assembled context payload.

    This is the universal wrapper — every piece of context (file content,
    decision record, contract rule) gets wrapped in this for the orchestrator
    to prioritize and budget.

    NOTE(review): unlike the frozen value objects in this module, this model
    has no ConfigDict(frozen=True) — confirm mutability is intentional.
    """

    source: str = Field(description="Which module provided this: graph|decisions|contracts")
    item_type: str = Field(description="Specific type: file_content|decision|contract_rule|warning")
    priority: ContextPriority
    estimated_tokens: int = Field(ge=0)  # approximate — tokenizers differ per model
    content: str  # the wrapped payload text
    metadata: dict[str, Any] = Field(default_factory=dict)
    freshness_score: float = Field(
        default=1.0,
        ge=0.0,
        le=1.0,
        description="1.0 = just validated, 0.0 = very stale",
    )
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class AssembledContext(BaseModel):
    """The final context payload sent to an AI agent.

    Edge cases:
    - Empty context: valid if the task has no relevant files (new greenfield code)
    - Truncated context: items were dropped due to budget — truncated=True with
      a summary of what was dropped
    - Conflicting context: contains contradictions — conflicts list populated
    - Partial context: some modules unavailable — warnings list populated
    """

    items: list[ContextItem] = Field(default_factory=list)
    total_tokens: int = 0  # estimated tokens actually included
    budget_tokens: int = 0  # budget the assembly was run against
    truncated: bool = False  # True when anything was dropped for budget
    dropped_count: int = Field(
        default=0,
        description="Number of items dropped due to budget",
    )
    conflicts: list[str] = Field(
        default_factory=list,
        description="Human-readable conflict descriptions",
    )
    warnings: list[str] = Field(
        default_factory=list,
        description="Degradation notices (partial init, stale data, etc.)",
    )
    assembly_time_ms: float = 0.0  # wall-clock assembly duration
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Decision Journal — structured records of architectural and business decisions."""
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"""Git history miner — extracts decision candidates from PRs, commits, and comments.
|
|
2
|
+
|
|
3
|
+
The goal is to auto-discover decisions that were made but never formally recorded.
|
|
4
|
+
Developers make decisions constantly in PR descriptions, commit messages, and code
|
|
5
|
+
review comments — this module surfaces them.
|
|
6
|
+
|
|
7
|
+
This is SUGGESTION-based, not automatic. Every mined decision has confidence < 1.0
|
|
8
|
+
and should be reviewed by a human before becoming official.
|
|
9
|
+
|
|
10
|
+
Edge cases:
|
|
11
|
+
- PR description is empty: skip (nothing to mine)
|
|
12
|
+
- PR description is a template with checkboxes: extract only non-template content
|
|
13
|
+
- Commit message is "fix" or "wip": skip (no decision content)
|
|
14
|
+
- Multiple decisions in one PR: extract each as a separate candidate
|
|
15
|
+
- Decision language in non-English: basic support via keyword matching only
|
|
16
|
+
- Merge commits: skip (they reference the PR, not new decisions)
|
|
17
|
+
- Squash commits: contain the full PR description — high-value target
|
|
18
|
+
- Revert commits: flag the original decision as potentially superseded
|
|
19
|
+
- Git history is very large: limit mining depth with max_commits parameter
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import re
|
|
26
|
+
from dataclasses import dataclass, field
|
|
27
|
+
from datetime import UTC, datetime
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import TYPE_CHECKING
|
|
30
|
+
|
|
31
|
+
from codebase_intel.core.types import CodeAnchor, DecisionStatus
|
|
32
|
+
from codebase_intel.decisions.models import DecisionRecord
|
|
33
|
+
|
|
34
|
+
if TYPE_CHECKING:
|
|
35
|
+
from codebase_intel.core.config import DecisionConfig
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)

# Substrings whose presence in a (lowercased) commit/PR message suggests it
# records a decision. Matching is plain containment, not regex.
DECISION_KEYWORDS = [
    # Architecture
    "decided", "decision", "chose", "chosen", "opted for", "switched to",
    "migrated from", "replaced", "instead of", "rather than",
    # Reasoning
    "because", "reason:", "rationale:", "why:", "trade-off", "tradeoff",
    "considered", "evaluated", "compared",
    # Constraints
    "compliance", "regulation", "requirement", "sla", "must not", "cannot",
    "forbidden", "prohibited",
    # Breaking changes
    "breaking change", "breaking:", "deprecated", "removed",
    # Architecture-specific
    "adr", "architecture decision", "design decision", "rfc",
]

# Regexes matched against the lowercased FIRST line of a commit message;
# a hit marks the commit as noise (merge/wip/chore/...) and skips mining.
SKIP_PATTERNS = [
    r"^merge\s",
    r"^wip\b",
    r"^fix\s*(typo|lint|format|style)",
    r"^chore\s*:",
    r"^bump\s+version",
    r"^update\s+lock",
    r"^auto-generated",
]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class DecisionCandidate:
    """A possible decision surfaced from git history, awaiting human review."""

    title: str
    context: str
    decision_text: str
    source_type: str  # "commit", "pr_description", "pr_comment"
    source_ref: str  # commit hash, PR URL
    author: str
    created_at: datetime
    changed_files: list[Path] = field(default_factory=list)
    confidence: float = 0.5
    keywords_matched: list[str] = field(default_factory=list)

    def to_decision_record(self, decision_id: str) -> DecisionRecord:
        """Materialize this candidate as a draft DecisionRecord."""
        # Anchor at most ten files so sweeping commits don't flood the record.
        capped_files = self.changed_files[:10]
        anchors = [CodeAnchor(file_path=path) for path in capped_files]

        return DecisionRecord(
            id=decision_id,
            title=self.title,
            status=DecisionStatus.DRAFT,
            context=self.context,
            decision=self.decision_text,
            code_anchors=anchors,
            created_at=self.created_at,
            author=self.author,
            source="git-mined",
            source_ref=self.source_ref,
            confidence=self.confidence,
            tags=["auto-mined"],
        )
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class GitMiner:
|
|
106
|
+
"""Mines git history for decision candidates."""
|
|
107
|
+
|
|
108
|
+
def __init__(
|
|
109
|
+
self,
|
|
110
|
+
config: DecisionConfig,
|
|
111
|
+
project_root: Path,
|
|
112
|
+
) -> None:
|
|
113
|
+
self._config = config
|
|
114
|
+
self._project_root = project_root
|
|
115
|
+
|
|
116
|
+
async def mine_commits(
|
|
117
|
+
self,
|
|
118
|
+
max_commits: int = 500,
|
|
119
|
+
since_days: int = 90,
|
|
120
|
+
) -> list[DecisionCandidate]:
|
|
121
|
+
"""Mine recent commit messages for decision candidates.
|
|
122
|
+
|
|
123
|
+
Edge cases:
|
|
124
|
+
- Not a git repo: return empty list with warning
|
|
125
|
+
- Shallow clone: limited history available — mine what we have
|
|
126
|
+
- Binary commits (large media files): skip based on file extensions
|
|
127
|
+
- Encoding issues in commit messages: handle gracefully
|
|
128
|
+
"""
|
|
129
|
+
try:
|
|
130
|
+
from git import Repo
|
|
131
|
+
except ImportError:
|
|
132
|
+
logger.warning("GitPython not installed — cannot mine commits")
|
|
133
|
+
return []
|
|
134
|
+
|
|
135
|
+
try:
|
|
136
|
+
repo = Repo(self._project_root, search_parent_directories=True)
|
|
137
|
+
except Exception:
|
|
138
|
+
logger.warning("Not a git repository: %s", self._project_root)
|
|
139
|
+
return []
|
|
140
|
+
|
|
141
|
+
candidates: list[DecisionCandidate] = []
|
|
142
|
+
count = 0
|
|
143
|
+
|
|
144
|
+
since_dt = datetime.now(UTC).replace(
|
|
145
|
+
day=max(1, datetime.now(UTC).day),
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
for commit in repo.iter_commits(max_count=max_commits):
|
|
149
|
+
count += 1
|
|
150
|
+
|
|
151
|
+
message = commit.message.strip()
|
|
152
|
+
if not message:
|
|
153
|
+
continue
|
|
154
|
+
|
|
155
|
+
# Skip noise commits
|
|
156
|
+
if self._should_skip(message):
|
|
157
|
+
continue
|
|
158
|
+
|
|
159
|
+
# Check for decision keywords
|
|
160
|
+
matched_keywords = self._match_keywords(message)
|
|
161
|
+
if not matched_keywords:
|
|
162
|
+
continue
|
|
163
|
+
|
|
164
|
+
# Extract changed files
|
|
165
|
+
changed_files: list[Path] = []
|
|
166
|
+
try:
|
|
167
|
+
if commit.parents:
|
|
168
|
+
diff = commit.parents[0].diff(commit)
|
|
169
|
+
changed_files = [
|
|
170
|
+
self._project_root / d.a_path
|
|
171
|
+
for d in diff
|
|
172
|
+
if d.a_path
|
|
173
|
+
]
|
|
174
|
+
except Exception:
|
|
175
|
+
pass # Diff extraction is best-effort
|
|
176
|
+
|
|
177
|
+
# Build candidate
|
|
178
|
+
title = self._extract_title(message)
|
|
179
|
+
context, decision_text = self._extract_context_and_decision(message)
|
|
180
|
+
|
|
181
|
+
confidence = self._compute_confidence(
|
|
182
|
+
message, matched_keywords, changed_files
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
candidates.append(DecisionCandidate(
|
|
186
|
+
title=title,
|
|
187
|
+
context=context,
|
|
188
|
+
decision_text=decision_text,
|
|
189
|
+
source_type="commit",
|
|
190
|
+
source_ref=str(commit.hexsha)[:12],
|
|
191
|
+
author=commit.author.name if commit.author else "unknown",
|
|
192
|
+
created_at=datetime.fromtimestamp(commit.committed_date, tz=UTC),
|
|
193
|
+
changed_files=changed_files[:20],
|
|
194
|
+
confidence=confidence,
|
|
195
|
+
keywords_matched=matched_keywords,
|
|
196
|
+
))
|
|
197
|
+
|
|
198
|
+
logger.info(
|
|
199
|
+
"Mined %d commits, found %d decision candidates",
|
|
200
|
+
count,
|
|
201
|
+
len(candidates),
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
return candidates
|
|
205
|
+
|
|
206
|
+
def _should_skip(self, message: str) -> bool:
|
|
207
|
+
"""Check if a commit message should be skipped.
|
|
208
|
+
|
|
209
|
+
Edge case: multi-line messages — check only the first line for
|
|
210
|
+
skip patterns but check all lines for decision keywords.
|
|
211
|
+
"""
|
|
212
|
+
first_line = message.split("\n")[0].lower().strip()
|
|
213
|
+
return any(re.match(pattern, first_line) for pattern in SKIP_PATTERNS)
|
|
214
|
+
|
|
215
|
+
def _match_keywords(self, message: str) -> list[str]:
|
|
216
|
+
"""Find decision-indicating keywords in the message."""
|
|
217
|
+
message_lower = message.lower()
|
|
218
|
+
return [kw for kw in DECISION_KEYWORDS if kw in message_lower]
|
|
219
|
+
|
|
220
|
+
def _extract_title(self, message: str) -> str:
|
|
221
|
+
"""Extract a title from the commit message.
|
|
222
|
+
|
|
223
|
+
Convention: first line of the commit message, truncated at 80 chars.
|
|
224
|
+
Edge case: first line is very long (someone put everything on one line).
|
|
225
|
+
"""
|
|
226
|
+
first_line = message.split("\n")[0].strip()
|
|
227
|
+
if len(first_line) > 80:
|
|
228
|
+
return first_line[:77] + "..."
|
|
229
|
+
return first_line
|
|
230
|
+
|
|
231
|
+
def _extract_context_and_decision(
|
|
232
|
+
self, message: str
|
|
233
|
+
) -> tuple[str, str]:
|
|
234
|
+
"""Separate the context ("why") from the decision ("what") in a message.
|
|
235
|
+
|
|
236
|
+
Heuristic: lines before "because"/"reason:" are the decision,
|
|
237
|
+
lines after are the context. If no such separator, the whole
|
|
238
|
+
message is both context and decision.
|
|
239
|
+
|
|
240
|
+
Edge case: message with no clear separation — use the first line
|
|
241
|
+
as the decision and the rest as context.
|
|
242
|
+
"""
|
|
243
|
+
lines = message.strip().split("\n")
|
|
244
|
+
|
|
245
|
+
if len(lines) <= 1:
|
|
246
|
+
return message, message
|
|
247
|
+
|
|
248
|
+
first_line = lines[0].strip()
|
|
249
|
+
body = "\n".join(lines[1:]).strip()
|
|
250
|
+
|
|
251
|
+
if not body:
|
|
252
|
+
return first_line, first_line
|
|
253
|
+
|
|
254
|
+
return body, first_line
|
|
255
|
+
|
|
256
|
+
def _compute_confidence(
|
|
257
|
+
self,
|
|
258
|
+
message: str,
|
|
259
|
+
keywords: list[str],
|
|
260
|
+
changed_files: list[Path],
|
|
261
|
+
) -> float:
|
|
262
|
+
"""Compute confidence score for a mined decision candidate.
|
|
263
|
+
|
|
264
|
+
Factors:
|
|
265
|
+
- Number of keywords matched (more = higher)
|
|
266
|
+
- Message length (longer = more context = higher)
|
|
267
|
+
- Number of files changed (fewer = more focused = higher)
|
|
268
|
+
- Presence of reasoning words (because, reason) = higher
|
|
269
|
+
- Presence of "adr" or "decision" = much higher
|
|
270
|
+
|
|
271
|
+
Score range: 0.2 (barely qualifying) to 0.8 (strong signal).
|
|
272
|
+
Never 1.0 — that requires human confirmation.
|
|
273
|
+
"""
|
|
274
|
+
score = 0.3 # Base score for matching any keyword
|
|
275
|
+
|
|
276
|
+
# Keyword count bonus
|
|
277
|
+
score += min(0.2, len(keywords) * 0.05)
|
|
278
|
+
|
|
279
|
+
# Message length bonus (meaningful messages are longer)
|
|
280
|
+
if len(message) > 200:
|
|
281
|
+
score += 0.1
|
|
282
|
+
if len(message) > 500:
|
|
283
|
+
score += 0.1
|
|
284
|
+
|
|
285
|
+
# Focused changes (fewer files = clearer decision)
|
|
286
|
+
if 1 <= len(changed_files) <= 5:
|
|
287
|
+
score += 0.05
|
|
288
|
+
|
|
289
|
+
# High-signal keywords
|
|
290
|
+
message_lower = message.lower()
|
|
291
|
+
if any(kw in message_lower for kw in ("adr", "architecture decision", "design decision")):
|
|
292
|
+
score += 0.15
|
|
293
|
+
|
|
294
|
+
if any(kw in message_lower for kw in ("because", "reason:", "rationale:")):
|
|
295
|
+
score += 0.1
|
|
296
|
+
|
|
297
|
+
return min(0.8, score)
|