gitinsight-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitinsight/__init__.py +7 -0
- gitinsight/__main__.py +92 -0
- gitinsight/analysis.py +541 -0
- gitinsight/charts.py +849 -0
- gitinsight/dashboard.py +622 -0
- gitinsight/git_reader.py +225 -0
- gitinsight/report.py +26 -0
- gitinsight_cli-0.1.0.dist-info/METADATA +152 -0
- gitinsight_cli-0.1.0.dist-info/RECORD +11 -0
- gitinsight_cli-0.1.0.dist-info/WHEEL +4 -0
- gitinsight_cli-0.1.0.dist-info/entry_points.txt +2 -0
gitinsight/__init__.py
ADDED
gitinsight/__main__.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
__main__.py — entry point of the Git project contributor analysis and visualization tool.
"""
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from loguru import logger
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
|
|
12
|
+
# Force UTF-8 output so Chinese text and emoji survive on Windows consoles.
# reconfigure() switches the encoding of the existing stream objects in place:
# unlike re-wrapping ``stream.buffer`` it leaves the other stream settings
# untouched, and the hasattr() guard skips streams that cannot be
# reconfigured (a replaced stream without that method, or ``None``).
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8", errors="replace")
del _stream
|
|
15
|
+
|
|
16
|
+
# Configure loguru: remove the handlers registered so far and log to stderr
# using a compact "timestamp | level | message" layout.
logger.remove()
logger.add(
    sys.stderr,
    format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>",
)
|
|
22
|
+
|
|
23
|
+
from .analysis import compute_insights, filter_automated_commits, prepare_dataframe
|
|
24
|
+
from .dashboard import build_dashboard_html
|
|
25
|
+
from .git_reader import get_git_log, parse_git_log
|
|
26
|
+
from .report import print_summary
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def resolve_git_dir() -> str | None:
|
|
30
|
+
if len(sys.argv) > 1:
|
|
31
|
+
return sys.argv[1]
|
|
32
|
+
|
|
33
|
+
git_dir = input("请输入Git仓库目录路径: ").strip()
|
|
34
|
+
if not git_dir:
|
|
35
|
+
logger.error("未指定Git仓库目录。")
|
|
36
|
+
logger.info("使用方法: python main.py [git_repo_directory]")
|
|
37
|
+
return None
|
|
38
|
+
return git_dir
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def main() -> None:
    """Run the pipeline: read the git log, analyse it and write the HTML dashboard.

    The process exits with status 1 when ``get_git_log`` returns ``None``.
    Every other dead end (no path given, empty log, nothing parsed, nothing
    left after filtering, no parseable timestamps) is logged and the function
    simply returns.
    """
    repo_path = resolve_git_dir()
    if not repo_path:
        return

    # Step 1: raw git log text.
    logger.info("[1/5] 正在读取 Git 日志...")
    raw_log = get_git_log(repo_path)
    if raw_log is None:
        sys.exit(1)
    if not raw_log.strip():
        logger.warning("当前仓库没有提交记录。")
        return

    # Step 2: parse into commit rows and per-file change rows.
    logger.info("[2/5] 正在解析提交记录...")
    commits, file_changes = parse_git_log(raw_log)
    if commits.empty:
        logger.error("无法读取任何有效提交记录。")
        return
    logger.info(f"解析到 {len(commits)} 条提交,{len(file_changes)} 条文件变更记录")

    # Drop automated commits before any metric is computed.
    human_commits, filter_stats = filter_automated_commits(commits)
    if human_commits.empty:
        logger.warning("过滤后没有有效的人工提交记录。")
        return

    # Step 3: parse timestamps, then compute every metric.
    logger.info("[3/5] 正在计算分析指标...")
    prepared = prepare_dataframe(human_commits)
    if prepared.empty:
        logger.error("所有提交时间都无法解析。")
        return
    metrics = compute_insights(prepared, file_changes)

    # Step 4: the dashboard file is named after the repository directory.
    repo_name = Path(repo_path).resolve().name or "git_repo"
    output_html = f"git_analysis_{repo_name}.html"
    logger.info("[4/5] 正在生成可视化仪表板...")
    build_dashboard_html(metrics, repo_name, output_html)

    # Step 5: console summary.
    logger.info("[5/5] 完成!")
    print_summary(metrics, filter_stats, {"html": output_html})
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# Run the analysis when this module is executed as a script.
if __name__ == "__main__":
    main()
|
gitinsight/analysis.py
ADDED
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
"""
|
|
2
|
+
analysis.py — 数据分析引擎。
|
|
3
|
+
|
|
4
|
+
职责:
|
|
5
|
+
- 过滤自动化提交
|
|
6
|
+
- 解析时间字段并补充分析所需列
|
|
7
|
+
- 计算月度人员变动趋势
|
|
8
|
+
- 计算作者维度统计(活跃状态、参与阶段、贡献程度、夜间提交等)
|
|
9
|
+
- 计算代码活动趋势(月度增删行数)
|
|
10
|
+
- 计算文件修改热度
|
|
11
|
+
- 计算代码稳定性分析
|
|
12
|
+
- 汇总核心指标
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from datetime import timedelta
|
|
18
|
+
from typing import Any, Dict, Optional
|
|
19
|
+
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from tqdm import tqdm
|
|
22
|
+
|
|
23
|
+
TARGET_TZ = "Asia/Shanghai"
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Data classes
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class FilterStats:
    """Immutable row counts reported by ``filter_automated_commits``."""

    before: int  # number of rows before filtering
    removed: int  # number of rows dropped by the filter
    after: int  # number of rows that were kept
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Time helpers
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
def get_adjusted_time_in_6am_day(timestamp: pd.Timestamp) -> float:
    """Express a time of day as fractional hours on a day that starts at 06:00.

    Times from 06:00 up to midnight keep their clock value; the early hours
    00:00-05:59 get 24 added, so they sort after the preceding evening.

    Examples: 06:00 -> 6.0, 23:59 -> 23.98, 00:00 -> 24.0, 05:59 -> 29.98.
    """
    # Hours before 06:00 are counted as a continuation of the previous day.
    day_shift = 24 if timestamp.hour < 6 else 0
    whole_hours = timestamp.hour + day_shift
    return whole_hours + timestamp.minute / 60.0 + timestamp.second / 3600.0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_commit_date_with_6am_cutoff(timestamp: pd.Timestamp):
    """Return the date a commit is attributed to when days start at 06:00.

    Commits made before 06:00 are counted towards the previous calendar date.
    """
    if timestamp.hour >= 6:
        return timestamp.date()
    previous_day = timestamp - timedelta(days=1)
    return previous_day.date()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# Filtering
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
def filter_automated_commits(df: pd.DataFrame) -> tuple[pd.DataFrame, FilterStats]:
    """Drop commits that are treated as automated and report the row counts.

    A row is considered automated when its ``datetime_str`` is missing or,
    after stripping whitespace, ends with the ``+0000`` offset.

    Returns:
        A copy of the remaining rows and a ``FilterStats`` holding the number
        of rows before filtering, removed, and kept.
    """

    def _looks_human(raw_value) -> bool:
        # Missing timestamps are discarded together with the +0000 commits.
        if pd.isna(raw_value):
            return False
        return not str(raw_value).strip().endswith("+0000")

    keep_mask = df["datetime_str"].apply(_looks_human)
    kept = df[keep_mask].copy()

    total, remaining = len(df), len(kept)
    return kept, FilterStats(before=total, removed=total - remaining, after=remaining)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# DataFrame preparation
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Parse commit timestamps and add the time columns used by the analysis.

    ``datetime_str`` is parsed with the format ``%Y-%m-%d %H:%M:%S %z`` and
    converted to UTC; rows that cannot be parsed are dropped. The remaining
    rows get a timezone-naive ``local_time`` in ``TARGET_TZ`` plus the derived
    columns ``hour``, ``minute``, ``second``, ``date``, ``adjusted_time``,
    ``date_6am_cutoff`` and ``time_in_6am_day``.

    Returns:
        A new DataFrame (the input is left untouched). If no row survives
        parsing it is returned empty, without the derived columns.
    """
    prepared = df.copy()
    prepared["datetime_utc"] = pd.to_datetime(
        prepared["datetime_str"],
        format="%Y-%m-%d %H:%M:%S %z",
        utc=True,
        errors="coerce",
    )
    prepared = prepared.dropna(subset=["datetime_utc"]).copy()
    if prepared.empty:
        return prepared

    # Registers ``progress_apply`` on pandas objects for the row-wise helpers.
    tqdm.pandas(desc="Processing timestamps", leave=False)

    # Wall-clock time in the target zone, stored without tz information.
    local = prepared["datetime_utc"].dt.tz_convert(TARGET_TZ).dt.tz_localize(None)
    prepared["local_time"] = local
    for component in ("hour", "minute", "second", "date"):
        prepared[component] = getattr(local.dt, component)

    adjusted = prepared["local_time"].progress_apply(get_adjusted_time_in_6am_day)
    prepared["adjusted_time"] = adjusted
    prepared["date_6am_cutoff"] = prepared["local_time"].progress_apply(
        get_commit_date_with_6am_cutoff
    )
    # Second name for the adjusted time, kept as its own column.
    prepared["time_in_6am_day"] = prepared["adjusted_time"]

    return prepared
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ---------------------------------------------------------------------------
|
|
110
|
+
# Time range filtering
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
def filter_by_time_range(
    df: pd.DataFrame,
    period: str = "all",
    ref_date: Optional[pd.Timestamp] = None,
) -> pd.DataFrame:
    """Keep only the commits that fall inside a trailing time window.

    Args:
        df: Commits with a ``local_time`` column holding timezone-naive
            timestamps expressed in ``TARGET_TZ`` wall-clock time.
        period: ``'half_year'`` (180 days), ``'one_year'`` (365 days),
            ``'five_years'`` (1825 days) or ``'all'``. ``'all'`` and any
            unknown value return ``df`` unchanged.
        ref_date: End of the window. Defaults to the current time in
            ``TARGET_TZ``. Timezone-aware values are converted to
            ``TARGET_TZ``; naive values are taken as already being in it.

    Returns:
        ``df`` itself when nothing is filtered, otherwise a filtered copy.
    """
    if period == "all" or df.empty:
        return df

    window_days = {
        "half_year": 180,
        "one_year": 365,
        "five_years": 1825,
    }.get(period)
    if window_days is None:
        return df

    if ref_date is None:
        # Use the analysis timezone rather than the machine's local clock so
        # the cutoff lines up with ``local_time``.
        ref_date = pd.Timestamp.now(tz=TARGET_TZ)
    else:
        ref_date = pd.Timestamp(ref_date)
    if ref_date.tzinfo is not None:
        # ``local_time`` is naive; comparing it with an aware timestamp would
        # raise, so bring the reference to naive TARGET_TZ wall-clock time.
        ref_date = ref_date.tz_convert(TARGET_TZ).tz_localize(None)

    cutoff = ref_date - pd.Timedelta(days=window_days)
    return df[df["local_time"] >= cutoff].copy()
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
# Monthly personnel trends
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
|
|
146
|
+
def compute_monthly_trends(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise contributor turnover per calendar month.

    Months are derived from ``date_6am_cutoff``. The result is indexed by the
    first day of each month that has commits and carries three integer
    columns: ``new_authors`` (authors whose earliest commit falls in that
    month), ``cumulative_authors`` (running total of ``new_authors``) and
    ``active_authors`` (distinct authors committing in that month).

    Returns an empty DataFrame for empty input.
    """
    if df.empty:
        return pd.DataFrame()

    month_start = (
        pd.to_datetime(df["date_6am_cutoff"]).dt.to_period("M").dt.to_timestamp()
    )
    work = df.assign(month_date=month_start)

    # Distinct authors seen in each month.
    active = work.groupby("month_date")["author"].nunique()

    # Month of each author's first commit, counted per month; months without
    # newcomers are filled with zero.
    first_month = work.groupby("author")["month_date"].min()
    newcomers = first_month.value_counts().sort_index().reindex(active.index, fill_value=0)

    trend = pd.DataFrame(
        {
            "new_authors": newcomers,
            "cumulative_authors": newcomers.cumsum(),
            "active_authors": active,
        }
    )
    return trend.astype(int)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# Author half-year trends
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
def compute_author_halfyear_trends(df: pd.DataFrame) -> pd.DataFrame:
    """Count commits per author for every half-year period.

    Periods are derived from ``date_6am_cutoff``: January-June is ``H1`` and
    July-December is ``H2``.

    Returns:
        A DataFrame indexed by ``(half_year_start, half_year_label)`` (for
        example ``(2024-07-01, "2024-H2")``), sorted chronologically, with one
        integer column per author; periods without commits by an author hold
        0. An empty DataFrame is returned for empty input.
    """
    if df.empty:
        return pd.DataFrame()

    dates = pd.to_datetime(df["date_6am_cutoff"])
    year_text = dates.dt.year.astype(str)
    half = ((dates.dt.month - 1) // 6) + 1  # 1 for Jan-Jun, 2 for Jul-Dec

    keys = pd.DataFrame(
        {
            "half_year_start": pd.to_datetime(
                year_text + "-" + half.map({1: "01", 2: "07"}) + "-01"
            ),
            "half_year_label": year_text + "-H" + half.astype(str),
            "author": df["author"],
        }
    )

    # size() already yields one count per (period, author); unstacking it
    # keeps the counts as integers. A pivot_table here would aggregate those
    # unique values a second time with its default ``mean`` and may hand back
    # float counts.
    counts = keys.groupby(["half_year_start", "half_year_label", "author"]).size()
    table = counts.unstack("author", fill_value=0)

    return table.sort_index(level=0)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def compute_author_halfyear_ranges(df: pd.DataFrame) -> pd.DataFrame:
    """Find the first and last half-year in which each author committed.

    Half-years start on 1 January and 1 July and are derived from
    ``date_6am_cutoff``.

    Returns:
        A DataFrame indexed by author with the columns ``first_half_start``
        and ``last_half_start``; empty for empty input.
    """
    if df.empty:
        return pd.DataFrame()

    dates = pd.to_datetime(df["date_6am_cutoff"])
    # 0 for January-June, 1 for July-December.
    half_index = (dates.dt.month - 1) // 6
    first_month = half_index.map({0: "01", 1: "07"})
    half_start = pd.to_datetime(dates.dt.year.astype(str) + "-" + first_month + "-01")

    per_author = half_start.groupby(df["author"])
    return pd.DataFrame(
        {
            "first_half_start": per_author.min(),
            "last_half_start": per_author.max(),
        }
    )
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# ---------------------------------------------------------------------------
|
|
229
|
+
# Author stats
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
|
|
232
|
+
def compute_author_stats(
    df: pd.DataFrame,
    ref_date: Optional[pd.Timestamp] = None,
) -> pd.DataFrame:
    """Build one row of statistics per author.

    Args:
        df: Prepared commits with at least ``author``, ``datetime_str``,
            ``local_time`` and ``time_in_6am_day``. ``insertions`` /
            ``deletions`` and ``email`` are used when present.
        ref_date: "Now" for the recency metrics; defaults to the current time
            in ``TARGET_TZ``.

    Returns:
        A DataFrame indexed by author, sorted by ``total_commits`` descending,
        with commit counts, first/last commit time, night commits (adjusted
        time >= 20.0) and their ratio, maintenance span in days, an
        ``is_active`` flag (last commit within 180 days), a participation
        ``phase``, the percentile rank by commit count and the resulting
        ``contribution_level``. Empty for empty input.
    """
    if df.empty:
        return pd.DataFrame()

    reference = pd.Timestamp.now(tz=TARGET_TZ) if ref_date is None else ref_date

    def _count_night(adjusted_times) -> int:
        # Adjusted time >= 20.0 covers 20:00 up to 05:59 the next morning.
        return int((adjusted_times >= 20.0).sum())

    # Base aggregation; line counts are only added when the data has them.
    aggregations: dict[str, Any] = dict(
        total_commits=("datetime_str", "count"),
        first_commit=("local_time", "min"),
        last_commit=("local_time", "max"),
        night_commits=("time_in_6am_day", _count_night),
    )
    if "insertions" in df.columns:
        aggregations.update(
            total_insertions=("insertions", "sum"),
            total_deletions=("deletions", "sum"),
        )
    stats = df.groupby("author").agg(**aggregations)

    # Span between an author's first and last commit.
    active_span = stats["last_commit"] - stats["first_commit"]
    stats["maintenance_days"] = active_span.dt.days

    # Days since the last commit; the naive local times are given the
    # reference's timezone so the subtraction is well defined.
    last_seen = stats["last_commit"].dt.tz_localize(reference.tz)
    idle_days = (reference - last_seen).dt.days
    stats["is_active"] = idle_days <= 180

    # Participation phase by recency of the last commit.
    def _phase_for(days: int) -> str:
        if days <= 90:
            return "近期参与"
        return "中期参与" if days <= 365 else "历史参与"

    stats["phase"] = idle_days.apply(_phase_for)

    # Contribution level from the percentile rank of the commit count.
    def _level_for(pct: float) -> str:
        if pct > 0.8:
            return "核心贡献者"
        return "常规贡献者" if pct > 0.2 else "偶尔贡献者"

    stats["rank_pct"] = stats["total_commits"].rank(pct=True)
    stats["contribution_level"] = stats["rank_pct"].apply(_level_for)

    # Share of commits made at night.
    stats["night_ratio"] = stats["night_commits"] / stats["total_commits"]

    # E-mail taken from the last row of each author's group.
    if "email" in df.columns:
        stats = stats.join(df.groupby("author")["email"].last())

    return stats.sort_values("total_commits", ascending=False)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# ---------------------------------------------------------------------------
|
|
302
|
+
# Daily commits (for calendar heatmap)
|
|
303
|
+
# ---------------------------------------------------------------------------
|
|
304
|
+
|
|
305
|
+
def compute_daily_commits(df: pd.DataFrame) -> pd.Series:
    """Count commits per ``date_6am_cutoff`` day, ordered by date.

    Returns an empty integer Series for empty input.
    """
    if not df.empty:
        per_day = df.groupby("date_6am_cutoff").size()
        return per_day.sort_index()
    return pd.Series(dtype=int)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
# Code activity (monthly insertions/deletions)
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
def compute_code_activity(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate line changes per calendar month.

    Months are derived from ``date_6am_cutoff``.

    Returns:
        A DataFrame indexed by the first day of each month with the columns
        ``commits`` (distinct hashes), ``insertions``, ``deletions`` and
        ``net_lines`` (insertions minus deletions). An empty DataFrame is
        returned when the input is empty or has no ``insertions`` column.
    """
    if df.empty or "insertions" not in df.columns:
        return pd.DataFrame()

    month_start = (
        pd.to_datetime(df["date_6am_cutoff"]).dt.to_period("M").dt.to_timestamp()
    )
    by_month = df.assign(month_date=month_start).groupby("month_date")

    activity = pd.DataFrame(
        {
            "commits": by_month["hash"].nunique(),
            "insertions": by_month["insertions"].sum(),
            "deletions": by_month["deletions"].sum(),
        }
    )
    activity["net_lines"] = activity["insertions"] - activity["deletions"]
    return activity
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# ---------------------------------------------------------------------------
|
|
344
|
+
# File modification heatmap (for sunburst)
|
|
345
|
+
# ---------------------------------------------------------------------------
|
|
346
|
+
|
|
347
|
+
def compute_file_heatmap(
    file_stats_df: pd.DataFrame,
    commits_df: Optional[pd.DataFrame] = None,
    max_depth: int = 4,
    max_files: int = 50,
) -> list[dict]:
    """Aggregate file modification counts by directory level for a sunburst chart.

    Args:
        file_stats_df: One row per (commit, file) with at least the columns
            ``filepath`` and ``hash``.
        commits_df: Not used by this function; kept so existing callers that
            pass it keep working.
        max_depth: Maximum number of tree levels. Deeper paths keep their
            first ``max_depth - 1`` components and merge the remainder into a
            single leaf name joined with ``/``.
        max_files: Only the ``max_files`` files touched by the most distinct
            commits are included (default 50).

    Returns:
        A nested list such as
        ``[{"name": "src", "children": [{"name": "a.py", "value": 3}]}]``.
        Leaves carry ``value`` (number of distinct commits touching the
        file); nodes with children carry ``children`` instead. Siblings are
        ordered by their aggregated modification count, largest first.
        An empty list is returned for empty input.
    """
    if file_stats_df.empty:
        return []

    # Number of distinct commits touching each file, most-modified first.
    mod_counts = file_stats_df.groupby("filepath")["hash"].nunique()
    top_files = mod_counts.sort_values(ascending=False).head(max_files)

    # Intermediate tree: name -> {"_children": {...}, "_value": subtree total}.
    root: dict[str, Any] = {}
    for filepath, mod_count in top_files.items():
        parts = str(filepath).replace("\\", "/").split("/")
        if len(parts) > max_depth:
            parts = parts[: max_depth - 1] + ["/".join(parts[max_depth - 1:])]

        level = root
        for part in parts:
            node = level.setdefault(part, {"_children": {}, "_value": 0})
            # Every ancestor accumulates the count, so ``_value`` is always
            # the total of the subtree below it.
            node["_value"] += int(mod_count)
            level = node["_children"]

    def _build_tree(node_dict: dict) -> list[dict]:
        # Rank siblings by subtree total. Directory entries have no "value"
        # key of their own, so ranking on the emitted entries would treat
        # every directory as 0.
        ranked = sorted(
            node_dict.items(), key=lambda item: item[1]["_value"], reverse=True
        )
        result = []
        for name, info in ranked:
            entry: dict[str, Any] = {"name": name}
            children = _build_tree(info["_children"])
            if children:
                entry["children"] = children
            else:
                entry["value"] = info["_value"]
            result.append(entry)
        return result

    return _build_tree(root)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# ---------------------------------------------------------------------------
|
|
406
|
+
# Code stability analysis
|
|
407
|
+
# ---------------------------------------------------------------------------
|
|
408
|
+
|
|
409
|
+
def compute_code_stability(df: pd.DataFrame) -> pd.DataFrame:
    """Classify each quarter by the share of added lines among changed lines.

    Quarters are derived from ``date_6am_cutoff``. For every quarter the
    result holds ``insertions``, ``deletions``, ``commits`` (distinct
    hashes), ``add_ratio`` = insertions / (insertions + deletions) rounded to
    three decimals, and ``phase``: feature development (ratio >= 0.7),
    refactoring (ratio <= 0.4) or stable (anything else).

    Returns an empty DataFrame when the input is empty or has no
    ``insertions`` column.
    """
    if df.empty or "insertions" not in df.columns:
        return pd.DataFrame()

    quarter_start = (
        pd.to_datetime(df["date_6am_cutoff"]).dt.to_period("Q").dt.to_timestamp()
    )
    stability = df.assign(quarter=quarter_start).groupby("quarter").agg(
        insertions=("insertions", "sum"),
        deletions=("deletions", "sum"),
        commits=("hash", "nunique"),
    )

    # A quarter without any changed line divides by 1 instead of 0, which
    # leaves its ratio at 0.
    changed_lines = stability["insertions"] + stability["deletions"]
    safe_divisor = changed_lines.replace(0, 1)
    stability["add_ratio"] = (stability["insertions"] / safe_divisor).round(3)

    def _phase_for(ratio: float) -> str:
        if ratio >= 0.7:
            return "功能开发期"
        if ratio <= 0.4:
            return "重构期"
        return "稳定期"

    stability["phase"] = stability["add_ratio"].apply(_phase_for)
    return stability
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
# ---------------------------------------------------------------------------
|
|
450
|
+
# Insights aggregation
|
|
451
|
+
# ---------------------------------------------------------------------------
|
|
452
|
+
|
|
453
|
+
def compute_insights(
    df: pd.DataFrame,
    file_stats_df: Optional[pd.DataFrame] = None,
) -> Dict[str, object]:
    """Collect the headline KPIs and every derived table into one dictionary.

    Args:
        df: Prepared commits (as produced by ``prepare_dataframe``).
        file_stats_df: Optional per-file change rows; when missing or empty
            the ``file_heatmap`` entry is an empty list.

    Returns:
        An empty dict for empty input. Otherwise a dict holding the scalar
        KPIs (commit/author counts, line totals, lifecycle length, date
        range), the tables computed by the other ``compute_*`` functions and
        the prepared DataFrame itself under ``prepared_df``.
    """
    if df.empty:
        return {}

    now_time = pd.Timestamp.now(tz=TARGET_TZ)

    # Project span, measured on the 06:00-cutoff dates.
    cutoff_dates = df["date_6am_cutoff"]
    first_commit_date = cutoff_dates.min()
    last_commit_date = cutoff_dates.max()
    date_range = f"{first_commit_date} ~ {last_commit_date}"
    if first_commit_date and last_commit_date:
        project_lifecycle_days = (last_commit_date - first_commit_date).days
    else:
        project_lifecycle_days = 0

    # Line totals stay at zero when the data carries no line counts.
    total_insertions = total_deletions = net_lines = 0
    if "insertions" in df.columns:
        total_insertions = int(df["insertions"].sum())
        total_deletions = int(df["deletions"].sum())
        net_lines = total_insertions - total_deletions

    has_file_stats = file_stats_df is not None and not file_stats_df.empty

    # Derived tables, computed one by one so the progress bar advances once
    # per step. The order here is also the order of the keys in the result.
    steps = (
        ("monthly_trends", lambda: compute_monthly_trends(df)),
        ("author_stats", lambda: compute_author_stats(df, ref_date=now_time)),
        ("author_halfyear_trends", lambda: compute_author_halfyear_trends(df)),
        ("author_halfyear_ranges", lambda: compute_author_halfyear_ranges(df)),
        ("daily_commits", lambda: compute_daily_commits(df)),
        ("code_activity", lambda: compute_code_activity(df)),
        ("code_stability", lambda: compute_code_stability(df)),
        (
            "file_heatmap",
            lambda: compute_file_heatmap(file_stats_df) if has_file_stats else [],
        ),
    )
    tables: Dict[str, object] = {}
    with tqdm(total=len(steps), desc="Computing metrics", unit="step") as pbar:
        for key, run_step in steps:
            tables[key] = run_step()
            pbar.update(1)

    # Authors flagged as active by compute_author_stats.
    author_stats = tables["author_stats"]
    active_authors_6m = 0 if author_stats.empty else int(author_stats["is_active"].sum())

    return {
        # KPI
        "total_commits": len(df),
        "total_authors": df["author"].nunique(),
        "active_authors_6m": active_authors_6m,
        "net_lines": net_lines,
        "total_insertions": total_insertions,
        "total_deletions": total_deletions,
        "project_lifecycle_days": project_lifecycle_days,
        "date_range": date_range,
        "first_commit_date": str(first_commit_date),
        "last_commit_date": str(last_commit_date),
        # Data
        **tables,
        # Raw df for developer detail panel
        "prepared_df": df,
    }
|