ai-code-stats 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ai_code_stats/__init__.py +15 -0
- ai_code_stats/agents/__init__.py +1 -0
- ai_code_stats/agents/base.py +40 -0
- ai_code_stats/agents/claude_code.py +95 -0
- ai_code_stats/agents/codex.py +174 -0
- ai_code_stats/agents/registry.py +25 -0
- ai_code_stats/attribution.py +141 -0
- ai_code_stats/classify.py +203 -0
- ai_code_stats/cli.py +216 -0
- ai_code_stats/config.py +171 -0
- ai_code_stats/diffutil.py +96 -0
- ai_code_stats/githook/__init__.py +1 -0
- ai_code_stats/githook/post_commit.py +214 -0
- ai_code_stats/gitutil.py +51 -0
- ai_code_stats/hooks/__init__.py +1 -0
- ai_code_stats/hooks/session_event.py +14 -0
- ai_code_stats/hooks/tool_event.py +141 -0
- ai_code_stats/identity.py +89 -0
- ai_code_stats/install/__init__.py +5 -0
- ai_code_stats/install/agent_install.py +182 -0
- ai_code_stats/install/git_install.py +114 -0
- ai_code_stats/models.py +237 -0
- ai_code_stats/paths.py +85 -0
- ai_code_stats/py.typed +0 -0
- ai_code_stats/reporters/__init__.py +1 -0
- ai_code_stats/reporters/base.py +60 -0
- ai_code_stats/reporters/command.py +45 -0
- ai_code_stats/reporters/http_webhook.py +79 -0
- ai_code_stats/reporters/json_file.py +24 -0
- ai_code_stats/reporters/registry.py +104 -0
- ai_code_stats/storage.py +119 -0
- ai_code_stats/tokens.py +68 -0
- ai_code_stats/util.py +39 -0
- ai_code_stats-0.1.0.dist-info/METADATA +179 -0
- ai_code_stats-0.1.0.dist-info/RECORD +38 -0
- ai_code_stats-0.1.0.dist-info/WHEEL +5 -0
- ai_code_stats-0.1.0.dist-info/entry_points.txt +2 -0
- ai_code_stats-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""ai-code-stats: 统计 CodingAgent 生成代码的采纳率、AI 代码行数与 token 消耗。
|
|
2
|
+
|
|
3
|
+
按 git 仓库 × 提交人维度,在每次提交时上报本次提交的总代码行数、AI 代码行数、
|
|
4
|
+
AI 占比、采纳率以及 token 用量。支持 Claude Code 与 Codex 两种 Agent,
|
|
5
|
+
上报后端可插拔(HTTP webhook / 本地 JSON 文件 / 自定义命令)。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
__version__ = "0.1.0"

# Data-contract version: shared by the JSON Schema and the report envelope;
# increment it whenever a breaking change is made.
SCHEMA_VERSION = "1.0"

PLUGIN_NAME = "ai-code-stats"

__all__ = ["__version__", "SCHEMA_VERSION", "PLUGIN_NAME"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Agent 适配层:把不同 CodingAgent 的钩子载荷归一成统一的捕获结果。"""
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Agent 适配器抽象接口。
|
|
2
|
+
|
|
3
|
+
每个适配器把某个 Agent 的钩子 stdin 载荷解析成统一的 :class:`HookCapture`:
|
|
4
|
+
一次 hook 触发涉及的文件编辑列表 + 该 session 的累计 token 用量 + session/cwd。
|
|
5
|
+
上层 ``hooks/tool_event`` 不关心 Agent 差异,只消费 ``HookCapture``。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any, Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ParsedEdit:
    """Result of parsing one file edit.

    ``added`` and ``removed`` hold plain text lines with the diff prefix
    already stripped.
    """

    file_path: str
    tool: str
    added: List[str] = field(default_factory=list)
    removed: List[str] = field(default_factory=list)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class HookCapture:
    """Agent-independent result of parsing one hook invocation.

    Bundles the session id and working directory with the file edits touched
    by this hook trigger and the session's cumulative token usage.
    """

    session_id: str
    cwd: Optional[str]
    edits: List[ParsedEdit] = field(default_factory=list)
    # Cumulative token usage of the session so far ({} means unknown).
    cumulative_usage: Dict[str, int] = field(default_factory=dict)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AgentAdapter(ABC):
    """Abstract interface every agent adapter implements.

    Concrete adapters set ``name`` and implement :meth:`parse`, which turns
    an agent-specific hook payload into a uniform :class:`HookCapture`.
    """

    # Identifier of the adapter; concrete subclasses override it.
    name: str = ""

    @abstractmethod
    def parse(self, payload: Dict[str, Any]) -> HookCapture:
        """Parse the hook's stdin payload into a :class:`HookCapture`.

        Args:
            payload: Decoded hook payload.

        Returns:
            The normalized capture for this hook invocation.
        """
        raise NotImplementedError
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Claude Code 适配器。
|
|
2
|
+
|
|
3
|
+
PostToolUse 钩子 stdin 载荷(JSON):
|
|
4
|
+
{
|
|
5
|
+
"session_id": "...",
|
|
6
|
+
"transcript_path": "/path/to/session.jsonl",
|
|
7
|
+
"cwd": "/abs/cwd",
|
|
8
|
+
"hook_event_name": "PostToolUse",
|
|
9
|
+
"tool_name": "Edit" | "Write" | "MultiEdit",
|
|
10
|
+
"tool_input": {...},
|
|
11
|
+
"tool_response": {...}
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
token 用量从 transcript JSONL 累计:每条 assistant 消息带 ``message.usage``。
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
from typing import Any, Dict, List
|
|
21
|
+
|
|
22
|
+
from ..diffutil import added_lines_between
|
|
23
|
+
from .base import AgentAdapter, HookCapture, ParsedEdit
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ClaudeCodeAdapter(AgentAdapter):
    """Adapter for Claude Code ``PostToolUse`` hook payloads.

    File edits are derived from ``tool_input`` of the ``Write``, ``Edit`` and
    ``MultiEdit`` tools. Token usage is summed over every ``message.usage``
    object found in the JSONL transcript referenced by ``transcript_path``.
    Malformed payload fields and transcript lines are skipped instead of
    raising, so a bad payload cannot crash the hook.
    """

    name = "claude_code"

    def parse(self, payload: Dict[str, Any]) -> HookCapture:
        """Normalize one hook payload into a :class:`HookCapture`.

        Args:
            payload: Decoded JSON object read from the hook's stdin.

        Returns:
            A capture with the session id, cwd, parsed edits and cumulative
            token usage (``{}`` when the transcript cannot be read).
        """
        session_id = str(payload.get("session_id", ""))
        cwd = payload.get("cwd")
        edits = self._parse_edits(payload)
        usage = self._read_usage(payload.get("transcript_path"))
        return HookCapture(
            session_id=session_id, cwd=cwd, edits=edits, cumulative_usage=usage
        )

    # ------------------------------------------------------------------
    @staticmethod
    def _text(value: Any) -> str:
        """Return ``value`` when it is a string, otherwise ``""``."""
        return value if isinstance(value, str) else ""

    @staticmethod
    def _count(value: Any) -> int:
        """Coerce a token counter to ``int``; unusable values count as 0."""
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    def _parse_edits(self, payload: Dict[str, Any]) -> List[ParsedEdit]:
        """Extract the file edit described by ``tool_name`` / ``tool_input``.

        Returns:
            A one-element list for ``Write`` / ``Edit`` / ``MultiEdit``;
            an empty list for any other tool or a non-dict ``tool_input``.
        """
        tool = str(payload.get("tool_name", ""))
        ti = payload.get("tool_input") or {}
        if not isinstance(ti, dict):
            return []
        file_path = ti.get("file_path") or ti.get("path") or ""

        if tool == "Write":
            # A write counts every line of the new content as added.
            content = self._text(ti.get("content"))
            return [ParsedEdit(file_path=file_path, tool=tool,
                               added=content.splitlines(), removed=[])]

        if tool == "Edit":
            added, removed = added_lines_between(
                self._text(ti.get("old_string")), self._text(ti.get("new_string"))
            )
            return [ParsedEdit(file_path=file_path, tool=tool, added=added, removed=removed)]

        if tool == "MultiEdit":
            added = []
            removed = []
            entries = ti.get("edits")
            if not isinstance(entries, list):
                entries = []
            for entry in entries:
                # Skip malformed entries instead of failing on ``.get``.
                if not isinstance(entry, dict):
                    continue
                a, r = added_lines_between(
                    self._text(entry.get("old_string")),
                    self._text(entry.get("new_string")),
                )
                added.extend(a)
                removed.extend(r)
            return [ParsedEdit(file_path=file_path, tool=tool, added=added, removed=removed)]

        # Other tools (Bash/Read/...) produce no AI code attribution.
        return []

    def _read_usage(self, transcript_path: Any) -> Dict[str, int]:
        """Sum token usage over the transcript JSONL file.

        Args:
            transcript_path: Path of the session transcript; anything that is
                not a path-like value is treated as "no transcript".

        Returns:
            ``{"input", "output", "cache_read"}`` totals, or ``{}`` when the
            path is missing/unusable or the file cannot be read.
        """
        # Reject non-path values: an int would be opened as a file descriptor.
        path_like = isinstance(transcript_path, (str, bytes)) or hasattr(
            transcript_path, "__fspath__"
        )
        if not transcript_path or not path_like:
            return {}

        agg = {"input": 0, "output": 0, "cache_read": 0}
        try:
            # errors="replace": one undecodable byte must not abort the whole
            # read; a line damaged by it simply fails JSON parsing below.
            with open(transcript_path, "r", encoding="utf-8", errors="replace") as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    msg = obj.get("message") if isinstance(obj, dict) else None
                    usage = msg.get("usage") if isinstance(msg, dict) else None
                    if not isinstance(usage, dict):
                        continue
                    # NOTE(review): every line carrying ``message.usage`` is
                    # counted; if a transcript can repeat the same message the
                    # totals would be inflated - confirm against real data.
                    agg["input"] += self._count(usage.get("input_tokens"))
                    # Cache-creation tokens are accounted as input.
                    agg["input"] += self._count(usage.get("cache_creation_input_tokens"))
                    agg["output"] += self._count(usage.get("output_tokens"))
                    agg["cache_read"] += self._count(usage.get("cache_read_input_tokens"))
        except (OSError, ValueError):
            # ValueError covers paths open() rejects (e.g. embedded NUL).
            return {}
        return agg
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Codex CLI 适配器。
|
|
2
|
+
|
|
3
|
+
Codex 的两条信息源:
|
|
4
|
+
- **编辑**:通过 ``apply_patch`` 工具调用,补丁文本随钩子载荷传入;本适配器从
|
|
5
|
+
载荷中提取补丁字符串并解析新增/删除行。
|
|
6
|
+
- **token**:每个会话持久化为 ``$CODEX_HOME/sessions/YYYY/MM/DD/rollout-<id>.jsonl``
|
|
7
|
+
(``CODEX_HOME`` 默认 ``~/.codex``),其中 ``token_count`` 事件带累计用量。
|
|
8
|
+
本适配器按 session_id 定位该文件并读取最新累计值。
|
|
9
|
+
|
|
10
|
+
由于 Codex 钩子载荷 schema 仍在演进,编辑解析做成「容错扫描」:在载荷里找任何
|
|
11
|
+
形似补丁的字符串来解析,找不到就只上报 token。
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any, Dict, List, Optional
|
|
20
|
+
|
|
21
|
+
from ..diffutil import parse_patch
|
|
22
|
+
from .base import AgentAdapter, HookCapture, ParsedEdit
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class CodexAdapter(AgentAdapter):
    """Adapter for Codex CLI hook payloads.

    Edits are parsed from any patch-looking string found under a fixed set of
    payload keys. Token usage is taken from the payload when present,
    otherwise from the session's rollout JSONL file located under
    ``<codex home>/sessions``. If no patch text is found only token usage is
    reported.
    """

    name = "codex"

    def parse(self, payload: Dict[str, Any]) -> HookCapture:
        """Normalize one hook payload into a :class:`HookCapture`.

        Args:
            payload: Decoded JSON object read from the hook's stdin.

        Returns:
            A capture with session id, cwd, parsed edits and cumulative token
            usage (``{}`` when unknown).
        """
        session_id = self._session_id(payload)
        cwd = payload.get("cwd") or self._nested(payload, ["session", "cwd"])
        edits = self._parse_edits(payload)
        usage = self._read_usage(session_id, payload)
        return HookCapture(
            session_id=session_id, cwd=cwd, edits=edits, cumulative_usage=usage
        )

    # ------------------------------------------------------------------
    def _session_id(self, payload: Dict[str, Any]) -> str:
        """Return the session id from known payload locations, or ``""``."""
        for key in ("session_id", "sessionId"):
            if payload.get(key):
                return str(payload[key])
        nested = self._nested(payload, ["session", "id"])
        return str(nested) if nested else ""

    @staticmethod
    def _nested(d: Dict[str, Any], path: List[str]) -> Any:
        """Follow ``path`` through nested dicts; ``None`` if any step is missing."""
        node: Any = d
        for p in path:
            if isinstance(node, dict) and p in node:
                node = node[p]
            else:
                return None
        return node

    def _candidate_patch_strings(self, payload: Dict[str, Any]) -> List[str]:
        """Collect distinct strings in the payload that look like patch text.

        A string qualifies when it contains ``*** Begin Patch`` or ``@@``, or
        starts (after leading whitespace) with ``+`` or ``-``. Identical
        strings reachable under several keys are returned once so the same
        patch is not parsed - and its lines counted - more than once.
        """
        out: List[str] = []
        seen = set()

        def collect(v: Any) -> None:
            if isinstance(v, str):
                looks_like_patch = (
                    "*** Begin Patch" in v
                    or "@@" in v
                    or v.lstrip().startswith(("+", "-"))
                )
                if looks_like_patch and v not in seen:
                    seen.add(v)
                    out.append(v)
            elif isinstance(v, dict):
                for sub in v.values():
                    collect(sub)
            elif isinstance(v, list):
                for sub in v:
                    collect(sub)

        for key in ("tool_input", "input", "patch", "arguments", "command", "tool_response"):
            if key in payload:
                collect(payload[key])
        return out

    def _parse_edits(self, payload: Dict[str, Any]) -> List[ParsedEdit]:
        """Parse every candidate patch string into per-file edits."""
        tool = str(payload.get("tool_name", "apply_patch"))
        edits: List[ParsedEdit] = []
        for patch in self._candidate_patch_strings(payload):
            for pf in parse_patch(patch):
                edits.append(
                    ParsedEdit(file_path=pf.path, tool=tool,
                               added=pf.added, removed=pf.removed)
                )
        return edits

    # ------------------------------------------------------------------
    def codex_home(self) -> Path:
        """Return ``$CODEX_HOME``, falling back to ``~/.codex``.

        An empty ``CODEX_HOME`` is treated as unset; the home directory is
        only resolved when the fallback is actually needed.
        """
        configured = os.environ.get("CODEX_HOME")
        if configured:
            return Path(configured)
        return Path.home() / ".codex"

    def _find_rollout(self, session_id: str, payload: Dict[str, Any]) -> Optional[Path]:
        """Locate the rollout JSONL file of a session.

        Args:
            session_id: Session id used to match file names under
                ``<codex home>/sessions``.
            payload: Hook payload, which may carry the path directly.

        Returns:
            The path given in the payload if it is an existing file, else the
            most recently modified ``*.jsonl`` whose name contains
            ``session_id``, else ``None``.
        """
        # The payload may carry the rollout path directly.
        for key in ("rollout_path", "transcript_path", "session_path"):
            p = payload.get(key)
            if isinstance(p, str) and p and Path(p).is_file():
                return Path(p)
        if not session_id:
            return None
        sessions = self.codex_home() / "sessions"
        if not sessions.is_dir():
            return None

        suffix = ".jsonl"
        try:
            # Match by substring rather than interpolating the id into the
            # glob pattern, so glob metacharacters in the id are not
            # interpreted.
            matches = [
                candidate
                for candidate in sessions.rglob("*" + suffix)
                if session_id in candidate.name[: -len(suffix)]
            ]
        except OSError:
            return None
        if not matches:
            return None

        def mtime(candidate: Path) -> float:
            # A file may vanish between listing and stat; rank it last.
            try:
                return candidate.stat().st_mtime
            except OSError:
                return float("-inf")

        # Pick the most recently modified match.
        return max(matches, key=mtime)

    def _read_usage(self, session_id: str, payload: Dict[str, Any]) -> Dict[str, int]:
        """Return cumulative token usage for the session.

        Usage embedded in the payload wins; otherwise the rollout file is
        scanned and the last usage record found is returned. ``{}`` means
        unknown (no file, unreadable file, or no usage record).
        """
        # Prefer usage carried by the payload itself.
        inline = self._extract_usage_obj(payload)
        if inline:
            return inline

        path = self._find_rollout(session_id, payload)
        if path is None:
            return {}
        last: Dict[str, int] = {}
        try:
            # errors="replace": an undecodable byte must not abort the read;
            # a line damaged by it simply fails JSON parsing below.
            with open(path, "r", encoding="utf-8", errors="replace") as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        obj = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    found = self._extract_usage_obj(obj)
                    if found:
                        last = found  # records are cumulative; keep the last one
        except OSError:
            return {}
        return last

    def _extract_usage_obj(self, obj: Any) -> Dict[str, int]:
        """Find usage in an arbitrarily nested structure.

        Returns:
            Usage normalized to ``{input, output, cache_read}``, or ``{}`` if
            none is found. A ``total_token_usage`` dict (cumulative) always
            takes precedence; a usage-shaped dict (one holding both
            ``input_tokens`` and ``output_tokens``) is only a fallback,
            regardless of where in the structure each one appears.
        """
        total: Dict[str, int] = {}
        fallback: Dict[str, int] = {}

        def walk(node: Any) -> None:
            nonlocal total, fallback
            if isinstance(node, dict):
                cumulative = node.get("total_token_usage")
                if isinstance(cumulative, dict):
                    total = self._normalize_codex_usage(cumulative)
                    return
                if "input_tokens" in node and "output_tokens" in node:
                    fallback = self._normalize_codex_usage(node)
                # Keep descending: a nested total_token_usage is preferred.
                for v in node.values():
                    walk(v)
            elif isinstance(node, list):
                for v in node:
                    walk(v)

        walk(obj)
        return total or fallback

    @staticmethod
    def _normalize_codex_usage(u: Dict[str, Any]) -> Dict[str, int]:
        """Map a Codex usage dict onto ``{input, output, cache_read}``.

        Reasoning output tokens are added to ``output``. Missing, ``None`` or
        non-numeric counters count as 0.
        """
        def g(*keys: str) -> int:
            # First key that is present and not None decides the value.
            for k in keys:
                if k in u and u[k] is not None:
                    try:
                        return int(u[k])
                    except (TypeError, ValueError, OverflowError):
                        return 0
            return 0

        # NOTE(review): whether ``input_tokens`` already includes the cached
        # tokens is not visible here - confirm the accounting matches the
        # other adapters.
        return {
            "input": g("input_tokens"),
            "output": g("output_tokens") + g("reasoning_output_tokens"),
            "cache_read": g("cached_input_tokens", "cache_read_input_tokens"),
        }
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Agent 适配器注册表。"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, Type
|
|
6
|
+
|
|
7
|
+
from .base import AgentAdapter
|
|
8
|
+
from .claude_code import ClaudeCodeAdapter
|
|
9
|
+
from .codex import CodexAdapter
|
|
10
|
+
|
|
11
|
+
# Maps each adapter's ``name`` to its class, in registration order.
_REGISTRY: Dict[str, Type[AgentAdapter]] = {
    adapter_cls.name: adapter_cls
    for adapter_cls in (ClaudeCodeAdapter, CodexAdapter)
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_adapter(name: str) -> AgentAdapter:
    """Instantiate the adapter registered under ``name``.

    Args:
        name: Adapter name as registered in ``_REGISTRY``.

    Returns:
        A new instance of the matching adapter class.

    Raises:
        KeyError: If no adapter is registered under ``name``; the message
            lists the available names.
    """
    adapter_cls = _REGISTRY.get(name)
    if adapter_cls is not None:
        return adapter_cls()
    known = ", ".join(_REGISTRY)
    raise KeyError(f"未知 Agent 适配器: {name}(可用: {known})")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def available_agents() -> list:
    """Return the names of all registered adapters, in registration order."""
    return [agent_name for agent_name in _REGISTRY]
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""归因引擎:把 commit 的新增行与窗口内 AI 生成行做指纹匹配,算采纳率与 AI 占比。
|
|
2
|
+
|
|
3
|
+
定义(raw 与 effective 各算一份):
|
|
4
|
+
采纳率 adoption_rate = 落入本次 commit 的 AI 行数 / 窗口内 AI 生成的行数
|
|
5
|
+
AI 占比 ai_share = 落入本次 commit 的 AI 行数 / 本次 commit 总新增行数
|
|
6
|
+
|
|
7
|
+
匹配用「归一化内容哈希的多重集」实现:
|
|
8
|
+
- 同一文件优先匹配;匹配不到再回退全局(覆盖文件移动/重命名导致的行位移)。
|
|
9
|
+
- 消费式匹配:每个 AI 指纹最多被一条 commit 行命中一次,避免重复计数。
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
from collections import Counter, defaultdict
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Dict, Iterable, List, Tuple
|
|
18
|
+
|
|
19
|
+
from .classify import Classifier
|
|
20
|
+
from .models import AIEditEvent, AIModeMetrics, ModeMetrics
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def normalize_line(text: str, mode: str = "strip") -> str:
    """Normalize a line before fingerprinting.

    Args:
        text: The raw line.
        mode: ``"strip"`` removes leading/trailing whitespace; any other
            value leaves the line untouched.

    Returns:
        The normalized line.
    """
    return text.strip() if mode == "strip" else text
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def line_hash(normalized: str) -> str:
    """Return the fingerprint of a normalized line.

    Args:
        normalized: The already-normalized line text.

    Returns:
        ``"sha256:"`` followed by the hex SHA-256 of the UTF-8 encoded text.
    """
    # surrogatepass: text decoded from JSON may contain lone surrogates, which
    # strict UTF-8 encoding rejects with UnicodeEncodeError. Hashes of valid
    # text are unaffected.
    encoded = normalized.encode("utf-8", errors="surrogatepass")
    return "sha256:" + hashlib.sha256(encoded).hexdigest()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _participates(normalized: str, min_line_length: int) -> bool:
    """Tell whether a normalized line takes part in fingerprint matching.

    Lines shorter than the threshold (e.g. a lone ``}``) are excluded to
    reduce false matches. The threshold is never below 1, so empty lines are
    always excluded.
    """
    threshold = max(1, min_line_length)
    return threshold <= len(normalized)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class FileDiff:
    """Changes to one file in the current commit.

    Instances are expected to be already filtered down to files that are
    included in the statistics.
    """

    path: str
    added: List[str] = field(default_factory=list)
    removed: List[str] = field(default_factory=list)
    is_rename: bool = False
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class FingerprintPool:
    """Consumable multiset of AI line fingerprints, bucketed per file.

    Matching prefers the bucket of the requested file and falls back to any
    other bucket when that one has no remaining copy.
    """

    def __init__(self) -> None:
        self.by_file: Dict[str, Counter] = defaultdict(Counter)

    def add(self, file_path: str, h: str) -> None:
        """Record one more occurrence of fingerprint ``h`` for ``file_path``."""
        bucket = self.by_file[file_path]
        bucket[h] = bucket[h] + 1

    def consume(self, file_path: str, h: str) -> bool:
        """Consume one ``h``: same-file bucket first, then any bucket.

        Returns:
            ``True`` if a remaining occurrence was found and used up,
            ``False`` if no bucket holds one.
        """
        def take(bucket: Counter) -> bool:
            # Use up a single occurrence from this bucket if one is left.
            if bucket.get(h, 0) <= 0:
                return False
            bucket[h] -= 1
            return True

        # ``.get`` avoids creating an empty bucket for an unseen file.
        own_bucket = self.by_file.get(file_path)
        if own_bucket is not None and take(own_bucket):
            return True
        # ``any`` stops at the first bucket that yields a match.
        return any(take(bucket) for bucket in self.by_file.values())
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class AttributionResult:
    """Outcome of one attribution run, in raw and effective variants."""

    # Commit-wide added/removed line totals.
    totals_raw: ModeMetrics
    totals_effective: ModeMetrics
    # AI-attributed line counts and the derived rates.
    ai_raw: AIModeMetrics
    ai_effective: AIModeMetrics
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _rate(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator``, or ``0.0`` for a zero denominator."""
    if not denominator:
        return 0.0
    return numerator / denominator
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def compute_attribution(
    events: Iterable[AIEditEvent],
    file_diffs: Iterable[FileDiff],
    classifier: Classifier,
    min_line_length: int = 1,
    normalize: str = "strip",
) -> AttributionResult:
    """Match commit lines against AI-generated lines and derive the metrics.

    Args:
        events: AI edit events inside the attribution window.
        file_diffs: Per-file changes of the current commit.
        classifier: Supplies one effectiveness flag per line via
            ``classify_lines(lines, path)``.
        min_line_length: Normalized lines shorter than this do not take part
            in fingerprint matching.
        normalize: Normalization mode passed to ``normalize_line``.

    Returns:
        Commit totals and AI metrics, each in a raw and an effective variant.
        Adoption rate is matched AI lines over AI lines generated in the
        window; AI share is matched AI lines over lines added by the commit.
    """
    # Phase 1: load every participating AI line into the consumable pool.
    fingerprints = FingerprintPool()
    generated_raw = 0
    generated_eff = 0
    for event in events:
        for ai_line in event.added_lines:
            canonical = normalize_line(ai_line.text, normalize)
            if _participates(canonical, min_line_length):
                generated_raw += 1
                if ai_line.is_effective:
                    generated_eff += 1
                fingerprints.add(event.file_path, line_hash(canonical))

    # Phase 2: tally the commit and consume matching fingerprints.
    added_raw = 0
    added_eff = 0
    removed_raw = 0
    removed_eff = 0
    matched_raw = 0
    matched_eff = 0
    for diff in file_diffs:
        added_flags = classifier.classify_lines(diff.added, diff.path)
        removed_flags = classifier.classify_lines(diff.removed, diff.path)
        added_raw += len(diff.added)
        added_eff += len([flag for flag in added_flags if flag])
        removed_raw += len(diff.removed)
        removed_eff += len([flag for flag in removed_flags if flag])

        for text, effective in zip(diff.added, added_flags):
            canonical = normalize_line(text, normalize)
            if not _participates(canonical, min_line_length):
                continue
            # Each fingerprint can be consumed once, so an AI line is never
            # credited to more than one commit line.
            if not fingerprints.consume(diff.path, line_hash(canonical)):
                continue
            matched_raw += 1
            if effective:
                matched_eff += 1

    return AttributionResult(
        totals_raw=ModeMetrics(added_raw, removed_raw),
        totals_effective=ModeMetrics(added_eff, removed_eff),
        ai_raw=AIModeMetrics(
            ai_lines_added=matched_raw,
            ai_lines_generated_in_window=generated_raw,
            adoption_rate=_rate(matched_raw, generated_raw),
            ai_share_of_commit=_rate(matched_raw, added_raw),
        ),
        ai_effective=AIModeMetrics(
            ai_lines_added=matched_eff,
            ai_lines_generated_in_window=generated_eff,
            adoption_rate=_rate(matched_eff, generated_eff),
            ai_share_of_commit=_rate(matched_eff, added_eff),
        ),
    )
|