gitcode-insight 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,545 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ GitCode 社区数据爬取模块
4
+ 从 GitCode API 获取社区项目的统计数据
5
+ """
6
+
7
+ import json
8
+ import time
9
+ from datetime import datetime, timedelta, timezone
10
+ from typing import List, Dict, Optional
11
+ import csv
12
+ import os
13
+ import requests
14
+
15
+ from .utils import request_with_retry
16
+
17
+
18
class GitCodeCommunityStats:
    """Crawler that gathers community statistics for one GitCode organisation.

    For every repository of the configured owner it collects contributor
    counts, pull-request (PR) activity, CI gate ("gatekeeper") durations and
    PR turnaround times through the GitCode v5 REST API, and can export the
    result as CSV, JSON or a console report.
    """

    # Operate-log actions that represent a label change on a pull request.
    _LABEL_ACTIONS = ('enterprise_label', 'label')

    # Page size requested from every paginated endpoint.
    _PAGE_SIZE = 100

    def __init__(self, config_file: Optional[str] = None, output_dir: Optional[str] = None):
        """Load the JSON configuration and prepare the HTTP session.

        Args:
            config_file: Path of the JSON config file. Defaults to
                ``./config/gitcode.json`` relative to the working directory.
            output_dir: Directory for exported files. Defaults to
                ``./output`` relative to the working directory.

        Raises:
            OSError: If the config file cannot be opened.
            json.JSONDecodeError: If the config file is not valid JSON.
        """
        if config_file is None:
            config_file = os.path.join(os.getcwd(), "config", "gitcode.json")
        if output_dir is None:
            output_dir = os.path.join(os.getcwd(), "output")

        self.output_dir = output_dir

        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)

        self.base_url = "https://api.gitcode.com/api/v5"
        self.headers = {
            "Content-Type": "application/json"
        }
        self.access_token = config.get("access_token", "")
        self.label_ci_success = config.get("label_ci_success", "ci-pipeline-passed")
        self.label_ci_running = config.get("label_ci_running", "ci-pipeline-running")
        self.label_yellow_ci_running = config.get("label_yellow_ci_running", "SC-RUNNING")
        self.label_yellow_ci_success = config.get("label_yellow_ci_success", "SC-SUCC")
        self.owner = config.get("owner", "boostkit")
        self.repo_whitelist = self._normalize_repo_list(config.get("repo_whitelist"))
        self.repo_blacklist = self._normalize_repo_list(config.get("repo_blacklist"))

        self.session = requests.Session()
        self.session.headers.update(self.headers)

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_repo_list(value) -> List[str]:
        """Return the non-empty, stripped string items of *value*.

        Anything that is not a list yields an empty list; non-string items
        and blank strings are dropped.
        """
        if not isinstance(value, list):
            return []
        stripped = (item.strip() for item in value if isinstance(item, str))
        return [name for name in stripped if name]

    @staticmethod
    def _parse_time(value) -> Optional[datetime]:
        """Parse an ISO-8601 timestamp, returning ``None`` when it is unusable.

        A trailing ``Z`` is rewritten to ``+00:00`` because
        ``datetime.fromisoformat`` only accepts it from Python 3.11 on.
        """
        if not isinstance(value, str):
            return None
        text = value.strip()
        if not text:
            return None
        if text[-1] in ('Z', 'z'):
            text = text[:-1] + '+00:00'
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return None

    @staticmethod
    def _event_user_id(event: Dict):
        """Return the id of the user who triggered *event*, or ``''``.

        The ``user`` field may be absent or JSON ``null``; both are treated
        as "no user" instead of raising.
        """
        user = event.get('user')
        if isinstance(user, dict):
            return user.get('id', '')
        return ''

    @classmethod
    def _is_label_event(cls, event: Dict, verb: str, label: str) -> bool:
        """Tell whether *event* is a label operation ``"<verb> label <label>"``.

        Args:
            event: One entry of a PR operate log.
            verb: ``'add'`` or ``'delete'``.
            label: Label name to look for in the event content.
        """
        if event.get('action', '') not in cls._LABEL_ACTIONS:
            return False
        return f'{verb} label {label}' in (event.get('content') or '')

    @staticmethod
    def _ensure_parent_dir(filename: str) -> None:
        """Create the directory that will contain *filename* if it is missing."""
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)

    @staticmethod
    def _gate_type_label(yellow: bool, blue: bool) -> str:
        """Map the two CI flags to the gate-type text written to the CSV."""
        if yellow and blue:
            return '黄区 and 蓝区'
        if yellow:
            return '黄区'
        if blue:
            return '蓝区'
        return '无'

    def _apply_repo_filters(self, projects: List[Dict]) -> List[Dict]:
        """Filter *projects* by the configured whitelist or blacklist.

        A non-empty whitelist takes precedence and the blacklist is then
        ignored. A project matches a list when either its ``path`` or its
        ``name`` appears in it.
        """
        if not projects:
            return []

        if self.repo_whitelist:
            allowed = set(self.repo_whitelist)
            return [
                p for p in projects
                if p.get("path") in allowed or p.get("name") in allowed
            ]

        if self.repo_blacklist:
            denied = set(self.repo_blacklist)
            return [
                p for p in projects
                if p.get("path") not in denied and p.get("name") not in denied
            ]

        return projects

    # ------------------------------------------------------------------
    # API access
    # ------------------------------------------------------------------

    def get_community_projects(self, page: int = 1, per_page: int = 20) -> List[Dict]:
        """Fetch one page of the owner's repository list.

        Returns an empty list when the response is not a JSON list.
        """
        url = f"{self.base_url}/orgs/{self.owner}/repos"
        params = {
            "access_token": self.access_token,
            "page": page,
            "per_page": per_page
        }

        result = request_with_retry(self.session, url, params)
        return result if isinstance(result, list) else []

    def get_project_contributors(self, project_path: str) -> List[Dict]:
        """Fetch the contributor list of a repository.

        Only a single request with ``per_page=100`` is issued, so the
        result holds at most what the API returns for that one page.
        """
        url = f"{self.base_url}/repos/{self.owner}/{project_path}/contributors"
        params = {
            "access_token": self.access_token,
            "per_page": self._PAGE_SIZE
        }

        result = request_with_retry(self.session, url, params)
        return result if isinstance(result, list) else []

    def get_project_contributor_year(self, project_path: str) -> List[Dict]:
        """Fetch contributor statistics from Jan 1 to Dec 31 of the current year."""
        current_year = datetime.now().year
        url = f"{self.base_url}/repos/{self.owner}/{project_path}/contributors/statistic"
        params = {
            "access_token": self.access_token,
            "since": f"{current_year}-01-01",
            "until": f"{current_year}-12-31"
        }

        result = request_with_retry(self.session, url, params)
        return result if isinstance(result, list) else []

    def get_project_merge_requests(self, project_path: str, days: int = 30) -> List[Dict]:
        """Fetch the pull requests of a repository for the last *days* days.

        Pages of 100 items are requested until a short or empty page (or a
        non-list response) is seen, with a hard cap of 50 pages.
        """
        # The lower bound is expressed in UTC+8.
        tz = timezone(timedelta(hours=8))
        since_date = (datetime.now(tz) - timedelta(days=days)).isoformat()

        url = f"{self.base_url}/repos/{self.owner}/{project_path}/pulls"
        all_mrs = []
        max_pages = 50

        for page in range(1, max_pages + 1):
            params = {
                "access_token": self.access_token,
                "since": since_date,
                "per_page": self._PAGE_SIZE,
                "page": page
            }

            data = request_with_retry(self.session, url, params)
            if not isinstance(data, list) or not data:
                break

            all_mrs.extend(data)

            if len(data) < self._PAGE_SIZE:
                break

            # Pause between pages to stay gentle on the API.
            time.sleep(0.6)

        return all_mrs

    def get_pr_events(self, project_path: str, pr_number: int) -> List[Dict]:
        """Fetch the operate log of one pull request (single page of 100)."""
        url = f"{self.base_url}/repos/{self.owner}/{project_path}/pulls/{pr_number}/operate_logs"
        params = {
            "access_token": self.access_token,
            "per_page": self._PAGE_SIZE
        }

        result = request_with_retry(self.session, url, params)
        return result if isinstance(result, list) else []

    # ------------------------------------------------------------------
    # Gatekeeper (CI gate) duration
    # ------------------------------------------------------------------

    def _find_gate_pair(self, sorted_events: List[Dict], user_id):
        """Find a (running, passed) timestamp pair for one user.

        *sorted_events* must be ordered newest first. Walking back in time,
        the first "add <success label>" event of *user_id* is remembered as
        the pass time; an older "delete <success label>" event discards it
        again; the next older "add <running label>" event that is strictly
        earlier than the pass time completes the pair.

        Returns:
            ``(running_time, passed_time)`` or ``None`` when no pair exists.
        """
        passed_at = None
        for event in sorted_events:
            if self._event_user_id(event) != user_id:
                continue

            if passed_at is not None and self._is_label_event(event, 'delete', self.label_ci_success):
                passed_at = None
                continue

            if passed_at is None:
                if self._is_label_event(event, 'add', self.label_ci_success):
                    passed_at = self._parse_time(event.get('created_at'))
            elif self._is_label_event(event, 'add', self.label_ci_running):
                running_at = self._parse_time(event.get('created_at'))
                if running_at is not None and running_at < passed_at:
                    return running_at, passed_at
        return None

    def calculate_gatekeeper_duration(self, project_path: str, pr_number: int) -> Optional[Dict]:
        """Compute the CI gate duration of one pull request, in minutes.

        Returns:
            ``None`` when the PR has no operate log; otherwise a dict with
            ``yellow_ci_flag`` / ``blue_ci_flag`` (whether the respective
            success label was ever added) and ``duration_minutes`` (time
            between the running label and the success label, ``0`` when no
            matching pair was found or the blue label never appeared).
        """
        events = self.get_pr_events(project_path, pr_number)
        if not events:
            return None

        yellow_ci_flag = any(
            self._is_label_event(e, 'add', self.label_yellow_ci_success) for e in events
        )
        blue_ci_flag = any(
            self._is_label_event(e, 'add', self.label_ci_success) for e in events
        )
        result = {
            "yellow_ci_flag": yellow_ci_flag,
            "blue_ci_flag": blue_ci_flag,
            "duration_minutes": 0
        }
        if not blue_ci_flag:
            return result

        # Newest first; a missing/null timestamp sorts as the oldest entry.
        sorted_events = sorted(events, key=lambda x: x.get('created_at') or '', reverse=True)

        # Users that touched the gate labels, most recent actor first.
        # dict.fromkeys de-duplicates while keeping that order, so the user
        # whose pair is picked does not depend on set hash ordering.
        user_ids = list(dict.fromkeys(
            self._event_user_id(e)
            for e in sorted_events
            if self._is_label_event(e, 'add', self.label_ci_success)
            or self._is_label_event(e, 'add', self.label_ci_running)
        ))

        for user_id in user_ids:
            pair = self._find_gate_pair(sorted_events, user_id)
            if pair is not None:
                running_at, passed_at = pair
                duration_seconds = (passed_at - running_at).total_seconds()
                result["duration_minutes"] = max(0, duration_seconds / 60)
                break

        return result

    # ------------------------------------------------------------------
    # Per-project analysis
    # ------------------------------------------------------------------

    def _prs_within_days(self, all_prs: List[Dict], days: int) -> List[Dict]:
        """Return the PRs whose age in whole days is at most *days*.

        PRs with a missing or unparseable ``created_at`` are skipped.
        """
        now = datetime.now(timezone.utc)
        recent = []
        for pr in all_prs:
            created = self._parse_time(pr.get('created_at'))
            if created is not None and (now - created).days <= days:
                recent.append(pr)
        return recent

    def get_30_days_prs(self, all_prs: List[Dict]) -> List[Dict]:
        """Return the PRs created within the last 30 days."""
        return self._prs_within_days(all_prs, 30)

    def get_7_days_prs(self, all_prs: List[Dict]) -> int:
        """Return the number of PRs created within the last 7 days."""
        return len(self._prs_within_days(all_prs, 7))

    def _daily_peak(self, prs: List[Dict]):
        """Return ``(max PRs created on one day, that day as YYYY-MM-DD)``.

        Yields ``(0, "")`` when *prs* holds no usable creation date.
        """
        daily_pr_counts = {}
        for pr in prs:
            created = self._parse_time(pr.get('created_at'))
            if created is None:
                continue
            day = created.strftime('%Y-%m-%d')
            daily_pr_counts[day] = daily_pr_counts.get(day, 0) + 1

        if not daily_pr_counts:
            return 0, ""
        peak_day = max(daily_pr_counts, key=daily_pr_counts.get)
        return daily_pr_counts[peak_day], peak_day

    def _close_duration_stats(self, project_path: str, prs: List[Dict]):
        """Measure creation-to-close time of merged/closed PRs.

        Returns:
            ``(durations_in_minutes, longest_duration, url_of_longest_pr)``.
        """
        durations = []
        longest = 0.0
        longest_url = ""

        for pr in prs:
            state = pr.get('state')
            if state not in ('merged', 'closed'):
                continue

            # Merged PRs use merged_at when present, everything else closed_at.
            if state == 'merged' and pr.get('merged_at'):
                end_raw = pr['merged_at']
            elif pr.get('closed_at'):
                end_raw = pr['closed_at']
            else:
                continue

            try:
                created_at = self._parse_time(pr.get('created_at'))
                closed_at = self._parse_time(end_raw)
                if created_at is None or closed_at is None:
                    raise ValueError("unparseable timestamp")
                duration_minutes = (closed_at - created_at).total_seconds() / 60
            except (TypeError, ValueError) as e:
                # TypeError covers mixing naive and aware timestamps.
                print(f"计算PR #{pr.get('number')}闭环时间失败: {e}")
                continue

            durations.append(duration_minutes)
            if duration_minutes > longest:
                longest = duration_minutes
                longest_url = f"https://gitcode.com/{self.owner}/{project_path}/pull/{pr['number']}"

        return durations, longest, longest_url

    def _gatekeeper_stats(self, project_path: str, merged_prs: List[Dict], limit: int = 10):
        """Collect gate durations for up to *limit* merged PRs.

        Returns:
            ``(durations, longest_duration, url_of_longest_pr, ci_flags)``
            where ``ci_flags`` ORs the yellow/blue flags over all inspected PRs.
        """
        durations = []
        longest = 0.0
        longest_url = ""
        ci_flags = {"yellow_ci_flag": False, "blue_ci_flag": False}
        processed_prs = 0

        for pr in merged_prs:
            if processed_prs >= limit:
                break

            ci_info = self.calculate_gatekeeper_duration(project_path, pr['number'])
            if ci_info is None:
                continue

            ci_flags["yellow_ci_flag"] |= ci_info["yellow_ci_flag"]
            ci_flags["blue_ci_flag"] |= ci_info["blue_ci_flag"]

            # NOTE(review): PRs without a blue CI label contribute a 0 here
            # and therefore lower the average - confirm this is intended.
            durations.append(ci_info["duration_minutes"])
            if ci_info["duration_minutes"] > longest:
                longest = ci_info["duration_minutes"]
                longest_url = f"https://gitcode.com/{self.owner}/{project_path}/pull/{pr['number']}"

            # NOTE(review): only PRs that returned an operate log count
            # toward the limit - confirm against the intended behaviour.
            processed_prs += 1

        return durations, longest, longest_url, ci_flags

    def analyze_project_stats(self, project_path: str) -> Dict:
        """Compute all statistics of a single repository.

        Returns:
            A dict with contributor counts, PR counts (100/30/7 days), the
            30-day daily PR peak, average/maximum gate duration and
            average/maximum PR close duration (all durations in minutes),
            the URLs of the slowest PRs, and the yellow/blue CI flags.
        """
        print(f"[分析中] {project_path}")

        # Contributors (all time).
        print(" 获取贡献者数量...")
        contributor_count = len(self.get_project_contributors(project_path))
        print(f" 贡献者数量: {contributor_count}")

        # Contributors (current year).
        print(" 获取贡献者数量(一年)...")
        contributor_count_year = len(self.get_project_contributor_year(project_path))
        print(f" 贡献者数量(一年): {contributor_count_year}")

        # All PRs of the last 100 days; the shorter windows are derived
        # from this list without further requests.
        print(" 获取总PR数(100天)...")
        all_prs = self.get_project_merge_requests(project_path, days=100)
        total_pr_count = len(all_prs)
        print(f" 总PR数(100天): {total_pr_count}")

        print(" 获取最近30天PR数...")
        prs_30_days = self.get_30_days_prs(all_prs)
        pr_count_30_days = len(prs_30_days)
        print(f" 最近30天PR数: {pr_count_30_days}")

        print(" 获取最近7天PR数...")
        pr_count_7_days = self.get_7_days_prs(all_prs)
        print(f" 最近7天PR数: {pr_count_7_days}")

        # Busiest single day within the 30-day window.
        max_pr_count_30_days, max_pr_date_30_days = self._daily_peak(prs_30_days)

        # Merged PRs of the window, most recently merged first. `or` (rather
        # than a .get default) also covers keys that are present but null.
        merged_prs = sorted(
            (pr for pr in prs_30_days if pr.get('state') == 'merged'),
            key=lambda x: x.get('merged_at') or x.get('created_at') or '',
            reverse=True
        )

        pr_close_durations, max_pr_close_duration, max_close_duration_pr_url = (
            self._close_duration_stats(project_path, prs_30_days)
        )
        gatekeeper_durations, max_duration, max_duration_pr_url, ci_flags = (
            self._gatekeeper_stats(project_path, merged_prs)
        )

        avg_gatekeeper_duration = (
            round(sum(gatekeeper_durations) / len(gatekeeper_durations), 2)
            if gatekeeper_durations else 0
        )
        avg_pr_close_duration = (
            round(sum(pr_close_durations) / len(pr_close_durations), 2)
            if pr_close_durations else 0
        )

        return {
            'contributor_count': contributor_count,
            'contributor_count_year': contributor_count_year,
            'total_pr_count': total_pr_count,
            'pr_count_7_days': pr_count_7_days,
            'pr_count_30_days': pr_count_30_days,
            'max_pr_count_30_days': max_pr_count_30_days,
            'max_pr_date_30_days': max_pr_date_30_days,
            'avg_gatekeeper_duration': avg_gatekeeper_duration,
            'max_duration_pr_url': max_duration_pr_url,
            'max_duration': max_duration,
            'avg_pr_close_duration': avg_pr_close_duration,
            'max_pr_close_duration': max_pr_close_duration,
            'max_close_duration_pr_url': max_close_duration_pr_url,
            'yellow_ci_flag': ci_flags["yellow_ci_flag"],
            'blue_ci_flag': ci_flags["blue_ci_flag"]
        }

    # ------------------------------------------------------------------
    # Crawl driver
    # ------------------------------------------------------------------

    def get_all_community_projects(self) -> List[Dict]:
        """Fetch every repository of the owner and apply the repo filters.

        Pages of 100 are requested until an empty or short page is seen,
        with a hard cap of 20 pages.
        """
        all_projects = []
        page = 1
        max_pages = 20

        print(f"获取项目列表:开始(最大页数限制:{max_pages})")

        while page <= max_pages:
            print(f" 获取第 {page} 页项目...")
            projects = self.get_community_projects(page=page, per_page=self._PAGE_SIZE)

            if not projects:
                print(f" 第 {page} 页没有项目,结束获取")
                break

            print(f" 第 {page} 页获取到 {len(projects)} 个项目")
            all_projects.extend(projects)

            if len(projects) < self._PAGE_SIZE:
                print(f" 第 {page} 页项目不足100个,所有项目已获取完成")
                break

            page += 1
            time.sleep(0.5)

        print(f"获取项目列表:完成,共获取到 {len(all_projects)} 个项目")
        filtered_projects = self._apply_repo_filters(all_projects)
        if len(filtered_projects) != len(all_projects):
            if self.repo_whitelist:
                print(f"已启用仓库白名单过滤:{len(all_projects)} -> {len(filtered_projects)}")
            elif self.repo_blacklist:
                print(f"已启用仓库黑名单过滤:{len(all_projects)} -> {len(filtered_projects)}")
        return filtered_projects

    def crawl_community_stats(self) -> Dict:
        """Crawl every (filtered) repository and return the aggregated stats.

        Returns:
            ``{"total_repos": int, "project_stats": {name: {"project_info":
            {...}, "stats": {...}}}}``.
        """
        print(f"开始爬取{self.owner}社区统计数据...")

        projects = self.get_all_community_projects()
        total = len(projects)

        print(f"共获取到{total}个{self.owner}项目")

        community_stats = {
            "total_repos": total,
            "project_stats": {}
        }

        for index, project in enumerate(projects, start=1):
            project_name = project["name"]
            project_path = project["path"]
            project_url = project["html_url"]

            print(f"\n===== 开始处理第 {index}/{total} 个项目: {project_name} =====")

            stats = self.analyze_project_stats(project_path)

            community_stats["project_stats"][project_name] = {
                "project_info": {
                    "name": project_name,
                    "url": project_url,
                    "description": project.get("description", "")
                },
                "stats": stats
            }

            print(f" - {project_name}: 贡献者{stats['contributor_count']}人, PR数{stats['total_pr_count']}, 最近7天{stats['pr_count_7_days']}, 最近30天{stats['pr_count_30_days']}")
            print(f"===== 完成处理第 {index}/{total} 个项目: {project_name} =====")

            # Pause between repositories to limit the request rate.
            time.sleep(1)

        print("\n爬取完成!")
        return community_stats

    # ------------------------------------------------------------------
    # Output
    # ------------------------------------------------------------------

    def save_to_csv(self, stats: Dict, filename: str = None):
        """Write one CSV row per project (UTF-8 with BOM).

        Args:
            stats: Result of :meth:`crawl_community_stats`.
            filename: Target path. Defaults to
                ``<output_dir>/<owner>_community_stats.csv``. The parent
                directory is created when missing.

        Note:
            ``contributor_count_year`` is part of the stats but has no CSV
            column. The description is truncated to 100 characters.
        """
        if filename is None:
            filename = os.path.join(self.output_dir, f"{self.owner}_community_stats.csv")
        self._ensure_parent_dir(filename)

        fieldnames = [
            '项目名称', '项目URL', '项目描述',
            '贡献者数量', '总PR数',
            '最近7天PR数', '最近30天PR数', 'PR单日峰值数(最近30天内)', '最近30天最大PR日期',
            '门禁类型', '平均门禁时长(分钟)', '最长门禁时长(分钟)', '最长门禁时长PR链接',
            '平均PR闭环时间(分钟)', '最长PR闭环时间(分钟)', '最长闭环PR链接'
        ]

        with open(filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for project_name, data in stats["project_stats"].items():
                project_info = data["project_info"]
                project_stats = data["stats"]

                # The API may deliver a null description; `or ''` keeps the
                # slice below from failing on None.
                description = project_info.get('description') or ''

                writer.writerow({
                    '项目名称': project_name,
                    '项目URL': project_info.get('url', ''),
                    '项目描述': description[:100],
                    '贡献者数量': project_stats["contributor_count"],
                    '总PR数': project_stats["total_pr_count"],
                    '最近7天PR数': project_stats["pr_count_7_days"],
                    '最近30天PR数': project_stats["pr_count_30_days"],
                    'PR单日峰值数(最近30天内)': project_stats["max_pr_count_30_days"],
                    '最近30天最大PR日期': project_stats["max_pr_date_30_days"],
                    '门禁类型': self._gate_type_label(
                        project_stats['yellow_ci_flag'], project_stats['blue_ci_flag']
                    ),
                    '平均门禁时长(分钟)': project_stats["avg_gatekeeper_duration"],
                    '最长门禁时长(分钟)': project_stats["max_duration"],
                    '最长门禁时长PR链接': project_stats["max_duration_pr_url"],
                    '平均PR闭环时间(分钟)': project_stats["avg_pr_close_duration"],
                    '最长PR闭环时间(分钟)': project_stats["max_pr_close_duration"],
                    '最长闭环PR链接': project_stats["max_close_duration_pr_url"]
                })

        print(f"统计结果已保存到: {filename}")

    def save_to_json(self, stats: Dict, filename: str = None):
        """Dump the full stats dict as indented UTF-8 JSON.

        Args:
            stats: Result of :meth:`crawl_community_stats`.
            filename: Target path. Defaults to
                ``<output_dir>/<owner>_community_stats_detailed.json``. The
                parent directory is created when missing.
        """
        if filename is None:
            filename = os.path.join(self.output_dir, f"{self.owner}_community_stats_detailed.json")
        self._ensure_parent_dir(filename)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(stats, f, ensure_ascii=False, indent=4)

        print(f"详细JSON数据已保存到: {filename}")

    def generate_report(self, stats: Dict):
        """Print a community-level summary and a per-project table to stdout."""
        print("\n" + "="*80)
        print(f"{self.owner}社区统计报告")
        print("="*80)

        print("社区级指标:")
        print("-" * 40)
        print(f"总代码仓数: {stats['total_repos']}")
        print("\n")

        print("项目级指标:")
        print("-" * 120)
        print(f"{'项目名称':<25} {'贡献者数':<10} {'总PR数':<10} {'最近7天':<10} {'最近30天':<10} {'30天最大':<10} {'平均门禁时长':<15} {'平均PR闭环时间':<15}")
        print("-" * 120)

        for project_name, data in stats["project_stats"].items():
            project_stats = data["stats"]
            print(f"{project_name:<25} {project_stats['contributor_count']:<10} {project_stats['total_pr_count']:<10} {project_stats['pr_count_7_days']:<10} {project_stats['pr_count_30_days']:<10} {project_stats['max_pr_count_30_days']:<10} {project_stats['avg_gatekeeper_duration']:<15}分钟 {project_stats['avg_pr_close_duration']:<15}分钟")