gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4158 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +905 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +444 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1285 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
"""LLM-based developer identity analyzer."""
|
|
2
|
+
|
|
3
|
+
import difflib
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
from ..core.identity import DeveloperIdentityResolver
|
|
11
|
+
from .models import DeveloperAlias, DeveloperCluster, IdentityAnalysisResult
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LLMIdentityAnalyzer:
|
|
17
|
+
"""Analyzes developer identities using LLM for intelligent aliasing."""
|
|
18
|
+
|
|
19
|
+
def __init__(
|
|
20
|
+
self,
|
|
21
|
+
api_key: Optional[str] = None,
|
|
22
|
+
model: str = "openai/gpt-4o-mini",
|
|
23
|
+
confidence_threshold: float = 0.8,
|
|
24
|
+
):
|
|
25
|
+
"""Initialize the LLM identity analyzer."""
|
|
26
|
+
self.api_key = api_key
|
|
27
|
+
self.model = model
|
|
28
|
+
self.confidence_threshold = confidence_threshold
|
|
29
|
+
self._has_openrouter = api_key is not None
|
|
30
|
+
|
|
31
|
+
def analyze_identities(
|
|
32
|
+
self,
|
|
33
|
+
commits: list[dict[str, Any]],
|
|
34
|
+
existing_resolver: Optional[DeveloperIdentityResolver] = None,
|
|
35
|
+
) -> IdentityAnalysisResult:
|
|
36
|
+
"""Analyze commits to identify developer aliases using LLM."""
|
|
37
|
+
# Extract unique developer identities from commits
|
|
38
|
+
identities = self._extract_identities(commits)
|
|
39
|
+
|
|
40
|
+
# Pre-cluster using heuristics
|
|
41
|
+
pre_clusters = self._pre_cluster_identities(identities)
|
|
42
|
+
|
|
43
|
+
# Analyze with LLM if available
|
|
44
|
+
if self._has_openrouter and self.api_key:
|
|
45
|
+
clusters = self._analyze_with_llm(pre_clusters, identities)
|
|
46
|
+
else:
|
|
47
|
+
# Fall back to heuristic-only clustering
|
|
48
|
+
clusters = self._finalize_heuristic_clusters(pre_clusters, identities)
|
|
49
|
+
|
|
50
|
+
# Identify unresolved identities
|
|
51
|
+
clustered_emails = set()
|
|
52
|
+
for cluster in clusters:
|
|
53
|
+
clustered_emails.update(cluster.all_emails)
|
|
54
|
+
|
|
55
|
+
unresolved = [
|
|
56
|
+
identity for identity in identities.values() if identity.email not in clustered_emails
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
return IdentityAnalysisResult(
|
|
60
|
+
clusters=clusters,
|
|
61
|
+
unresolved_identities=unresolved,
|
|
62
|
+
analysis_metadata={
|
|
63
|
+
"total_identities": len(identities),
|
|
64
|
+
"total_clusters": len(clusters),
|
|
65
|
+
"analysis_method": "llm" if self._has_openrouter else "heuristic",
|
|
66
|
+
},
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
def _extract_identities(self, commits: list[dict[str, Any]]) -> dict[str, DeveloperAlias]:
|
|
70
|
+
"""Extract unique developer identities from commits."""
|
|
71
|
+
identities = {}
|
|
72
|
+
|
|
73
|
+
for commit in commits:
|
|
74
|
+
key = f"{commit['author_email'].lower()}:{commit['author_name']}"
|
|
75
|
+
|
|
76
|
+
if key not in identities:
|
|
77
|
+
identities[key] = DeveloperAlias(
|
|
78
|
+
name=commit["author_name"],
|
|
79
|
+
email=commit["author_email"].lower(),
|
|
80
|
+
commit_count=0,
|
|
81
|
+
first_seen=commit["timestamp"],
|
|
82
|
+
last_seen=commit["timestamp"],
|
|
83
|
+
repositories=set(),
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
identity = identities[key]
|
|
87
|
+
identity.commit_count += 1
|
|
88
|
+
identity.first_seen = min(identity.first_seen, commit["timestamp"])
|
|
89
|
+
identity.last_seen = max(identity.last_seen, commit["timestamp"])
|
|
90
|
+
|
|
91
|
+
# Track repository if available
|
|
92
|
+
if "repository" in commit:
|
|
93
|
+
identity.repositories.add(commit["repository"])
|
|
94
|
+
|
|
95
|
+
return identities
|
|
96
|
+
|
|
97
|
+
def _pre_cluster_identities(self, identities: dict[str, DeveloperAlias]) -> list[set[str]]:
|
|
98
|
+
"""Pre-cluster identities using heuristic rules."""
|
|
99
|
+
clusters = []
|
|
100
|
+
processed = set()
|
|
101
|
+
|
|
102
|
+
identity_list = list(identities.values())
|
|
103
|
+
|
|
104
|
+
for i, identity1 in enumerate(identity_list):
|
|
105
|
+
if identity1.email in processed:
|
|
106
|
+
continue
|
|
107
|
+
|
|
108
|
+
cluster = {identity1.email}
|
|
109
|
+
processed.add(identity1.email)
|
|
110
|
+
|
|
111
|
+
for _j, identity2 in enumerate(identity_list[i + 1 :], i + 1):
|
|
112
|
+
if identity2.email in processed:
|
|
113
|
+
continue
|
|
114
|
+
|
|
115
|
+
# Check various similarity criteria
|
|
116
|
+
if self._are_likely_same_developer(identity1, identity2):
|
|
117
|
+
cluster.add(identity2.email)
|
|
118
|
+
processed.add(identity2.email)
|
|
119
|
+
|
|
120
|
+
if len(cluster) > 1:
|
|
121
|
+
clusters.append(cluster)
|
|
122
|
+
|
|
123
|
+
return clusters
|
|
124
|
+
|
|
125
|
+
def _are_likely_same_developer(self, id1: DeveloperAlias, id2: DeveloperAlias) -> bool:
|
|
126
|
+
"""Check if two identities are likely the same developer."""
|
|
127
|
+
# Same email domain with similar name
|
|
128
|
+
domain1 = id1.email.split("@")[1] if "@" in id1.email else ""
|
|
129
|
+
domain2 = id2.email.split("@")[1] if "@" in id2.email else ""
|
|
130
|
+
|
|
131
|
+
name_similarity = difflib.SequenceMatcher(None, id1.name.lower(), id2.name.lower()).ratio()
|
|
132
|
+
|
|
133
|
+
# GitHub noreply emails
|
|
134
|
+
github_pattern = r"^\d+\+(.+)@users\.noreply\.github\.com$"
|
|
135
|
+
match1 = re.match(github_pattern, id1.email)
|
|
136
|
+
match2 = re.match(github_pattern, id2.email)
|
|
137
|
+
|
|
138
|
+
# Check GitHub noreply patterns
|
|
139
|
+
if match1 or match2:
|
|
140
|
+
github_username1 = match1.group(1).lower() if match1 else None
|
|
141
|
+
github_username2 = match2.group(1).lower() if match2 else None
|
|
142
|
+
|
|
143
|
+
# Compare GitHub username with name/email
|
|
144
|
+
if github_username1:
|
|
145
|
+
# Check against other's name or email local part
|
|
146
|
+
other_name = id2.name.lower().replace(" ", "").replace(".", "").replace("-", "")
|
|
147
|
+
other_local = id2.email.split("@")[0].lower().replace(".", "").replace("-", "")
|
|
148
|
+
|
|
149
|
+
if (
|
|
150
|
+
github_username1 in other_name
|
|
151
|
+
or other_name in github_username1
|
|
152
|
+
or github_username1 in other_local
|
|
153
|
+
or other_local in github_username1
|
|
154
|
+
):
|
|
155
|
+
return True
|
|
156
|
+
|
|
157
|
+
if github_username2:
|
|
158
|
+
# Check against other's name or email local part
|
|
159
|
+
other_name = id1.name.lower().replace(" ", "").replace(".", "").replace("-", "")
|
|
160
|
+
other_local = id1.email.split("@")[0].lower().replace(".", "").replace("-", "")
|
|
161
|
+
|
|
162
|
+
if (
|
|
163
|
+
github_username2 in other_name
|
|
164
|
+
or other_name in github_username2
|
|
165
|
+
or github_username2 in other_local
|
|
166
|
+
or other_local in github_username2
|
|
167
|
+
):
|
|
168
|
+
return True
|
|
169
|
+
|
|
170
|
+
# Check if one email's local part matches the other's name
|
|
171
|
+
local1 = id1.email.split("@")[0].lower()
|
|
172
|
+
local2 = id2.email.split("@")[0].lower()
|
|
173
|
+
|
|
174
|
+
# Remove common suffixes/prefixes for comparison
|
|
175
|
+
clean_local1 = local1
|
|
176
|
+
clean_local2 = local2
|
|
177
|
+
for suffix in ["-ewtn", "-zaelot", "dev", "developer", "zaelot"]:
|
|
178
|
+
clean_local1 = clean_local1.replace(suffix, "")
|
|
179
|
+
clean_local2 = clean_local2.replace(suffix, "")
|
|
180
|
+
|
|
181
|
+
# Check if cleaned locals match names
|
|
182
|
+
name1_clean = id1.name.lower().replace(" ", "").replace(".", "")
|
|
183
|
+
name2_clean = id2.name.lower().replace(" ", "").replace(".", "")
|
|
184
|
+
|
|
185
|
+
if (
|
|
186
|
+
clean_local1 in name2_clean
|
|
187
|
+
or name2_clean in clean_local1
|
|
188
|
+
or clean_local2 in name1_clean
|
|
189
|
+
or name1_clean in clean_local2
|
|
190
|
+
):
|
|
191
|
+
return True
|
|
192
|
+
|
|
193
|
+
# Strong indicators
|
|
194
|
+
if name_similarity > 0.9 and domain1 == domain2:
|
|
195
|
+
return True
|
|
196
|
+
|
|
197
|
+
if name_similarity > 0.8 and (
|
|
198
|
+
domain1 == domain2 or "github.com" in domain1 or "github.com" in domain2
|
|
199
|
+
):
|
|
200
|
+
return True
|
|
201
|
+
|
|
202
|
+
# Check if local part of email matches name parts
|
|
203
|
+
name1_parts = set(id1.name.lower().split())
|
|
204
|
+
name2_parts = set(id2.name.lower().split())
|
|
205
|
+
|
|
206
|
+
if local1 in name2_parts or local2 in name1_parts:
|
|
207
|
+
return True
|
|
208
|
+
|
|
209
|
+
# Check first/last name combinations
|
|
210
|
+
if len(name1_parts) >= 2 and len(name2_parts) >= 2:
|
|
211
|
+
# Check if initials match
|
|
212
|
+
initials1 = "".join(n[0] for n in sorted(name1_parts) if n)
|
|
213
|
+
initials2 = "".join(n[0] for n in sorted(name2_parts) if n)
|
|
214
|
+
|
|
215
|
+
if initials1 == initials2 and name_similarity > 0.6:
|
|
216
|
+
return True
|
|
217
|
+
|
|
218
|
+
# Check overlapping repositories with high name similarity
|
|
219
|
+
return bool(id1.repositories & id2.repositories and name_similarity > 0.7)
|
|
220
|
+
|
|
221
|
+
def _finalize_heuristic_clusters(
|
|
222
|
+
self, pre_clusters: list[set[str]], identities: dict[str, DeveloperAlias]
|
|
223
|
+
) -> list[DeveloperCluster]:
|
|
224
|
+
"""Convert pre-clusters to final clusters without LLM."""
|
|
225
|
+
clusters = []
|
|
226
|
+
|
|
227
|
+
for email_set in pre_clusters:
|
|
228
|
+
# Get all identities in this cluster
|
|
229
|
+
cluster_identities = [
|
|
230
|
+
identity for identity in identities.values() if identity.email in email_set
|
|
231
|
+
]
|
|
232
|
+
|
|
233
|
+
if not cluster_identities:
|
|
234
|
+
continue
|
|
235
|
+
|
|
236
|
+
# Choose canonical identity (most commits)
|
|
237
|
+
canonical = max(cluster_identities, key=lambda x: x.commit_count)
|
|
238
|
+
aliases = [id for id in cluster_identities if id.email != canonical.email]
|
|
239
|
+
|
|
240
|
+
# Calculate total stats
|
|
241
|
+
total_commits = sum(id.commit_count for id in cluster_identities)
|
|
242
|
+
|
|
243
|
+
cluster = DeveloperCluster(
|
|
244
|
+
canonical_name=canonical.name,
|
|
245
|
+
canonical_email=canonical.email,
|
|
246
|
+
aliases=aliases,
|
|
247
|
+
confidence=0.85, # Heuristic confidence
|
|
248
|
+
reasoning="Clustered based on name similarity and email patterns",
|
|
249
|
+
total_commits=total_commits,
|
|
250
|
+
total_story_points=0, # Would need commit data to calculate
|
|
251
|
+
)
|
|
252
|
+
clusters.append(cluster)
|
|
253
|
+
|
|
254
|
+
return clusters
|
|
255
|
+
|
|
256
|
+
    def _analyze_with_llm(
        self, pre_clusters: list[set[str]], identities: dict[str, DeveloperAlias]
    ) -> list[DeveloperCluster]:
        """Analyze pre-clusters with LLM for intelligent grouping.

        Each heuristic pre-cluster with at least two identities is sent to
        the model for confirmation; identities that remain unclustered are
        then grouped by name and analyzed in a second pass. Any failure
        (missing ``openai`` package, network/API error) falls back to pure
        heuristic clustering.

        Args:
            pre_clusters: Email sets produced by ``_pre_cluster_identities``.
            identities: All extracted identities, keyed by "email:name".

        Returns:
            List of confirmed DeveloperCluster objects.
        """
        try:
            import openai

            # Configure OpenAI client for OpenRouter (OpenAI-compatible API).
            client = openai.OpenAI(base_url="https://openrouter.ai/api/v1", api_key=self.api_key)

            clusters = []

            # Analyze each pre-cluster
            for email_set in pre_clusters:
                cluster_identities = [
                    identity for identity in identities.values() if identity.email in email_set
                ]

                # A cluster needs at least two identities to be worth confirming.
                if len(cluster_identities) < 2:
                    continue

                # Prepare data for LLM
                identity_data = []
                for identity in cluster_identities:
                    identity_data.append(
                        {
                            "name": identity.name,
                            "email": identity.email,
                            "commit_count": identity.commit_count,
                            "repositories": list(identity.repositories),
                        }
                    )

                prompt = self._create_analysis_prompt(identity_data)

                # Call LLM (low temperature for deterministic-ish judgments)
                response = client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": "You are an expert at analyzing developer identities and determining if different email/name combinations belong to the same person.",
                        },
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.3,
                    max_tokens=500,
                )

                # Parse LLM response; None means "not the same person" or low confidence.
                cluster = self._parse_llm_response(
                    response.choices[0].message.content, cluster_identities, identities
                )
                if cluster:
                    clusters.append(cluster)

            # Also analyze unclustered identities
            clustered_emails = set()
            for cluster in clusters:
                clustered_emails.update(cluster.all_emails)

            unclustered = [
                identity
                for identity in identities.values()
                if identity.email not in clustered_emails
            ]

            # Group unclustered by name similarity for LLM analysis
            if unclustered:
                additional_clusters = self._analyze_unclustered_with_llm(unclustered, client)
                clusters.extend(additional_clusters)

            return clusters

        except Exception as e:
            # Broad catch is deliberate: any LLM-path failure degrades to heuristics.
            logger.warning(f"LLM analysis failed, falling back to heuristics: {e}")
            return self._finalize_heuristic_clusters(pre_clusters, identities)
|
|
333
|
+
|
|
334
|
+
    def _create_analysis_prompt(self, identity_data: list[dict[str, Any]]) -> str:
        """Create prompt for LLM analysis.

        Args:
            identity_data: One dict per identity with ``name``, ``email``,
                ``commit_count`` and ``repositories`` keys.

        Returns:
            A prompt instructing the model to reply with a JSON verdict
            (``same_person``, ``confidence``, ``canonical_identity``,
            ``reasoning``) that ``_parse_llm_response`` can consume.
        """
        # Doubled braces ({{ }}) render literal JSON braces inside the f-string.
        return f"""Analyze these developer identities and determine if they belong to the same person:

{json.dumps(identity_data, indent=2)}

Consider:
1. Name variations (e.g., "John Doe" vs "John D" vs "jdoe")
2. Email patterns (company emails, personal emails, GitHub noreply)
3. Common repositories they work on
4. Email domain relationships

Respond with a JSON object:
{{
    "same_person": true/false,
    "confidence": 0.0-1.0,
    "canonical_identity": {{"name": "...", "email": "..."}},
    "reasoning": "explanation"
}}"""
|
|
353
|
+
|
|
354
|
+
def _parse_llm_response(
|
|
355
|
+
self,
|
|
356
|
+
response: str,
|
|
357
|
+
cluster_identities: list[DeveloperAlias],
|
|
358
|
+
all_identities: dict[str, DeveloperAlias],
|
|
359
|
+
) -> Optional[DeveloperCluster]:
|
|
360
|
+
"""Parse LLM response into a cluster."""
|
|
361
|
+
try:
|
|
362
|
+
# Extract JSON from response
|
|
363
|
+
json_match = re.search(r"\{.*\}", response, re.DOTALL)
|
|
364
|
+
if not json_match:
|
|
365
|
+
return None
|
|
366
|
+
|
|
367
|
+
data = json.loads(json_match.group())
|
|
368
|
+
|
|
369
|
+
if not data.get("same_person", False):
|
|
370
|
+
return None
|
|
371
|
+
|
|
372
|
+
confidence = float(data.get("confidence", 0.8))
|
|
373
|
+
if confidence < self.confidence_threshold:
|
|
374
|
+
return None
|
|
375
|
+
|
|
376
|
+
# Find canonical identity
|
|
377
|
+
canonical_data = data.get("canonical_identity", {})
|
|
378
|
+
canonical_email = canonical_data.get("email", "").lower()
|
|
379
|
+
|
|
380
|
+
# Find matching identity
|
|
381
|
+
canonical = None
|
|
382
|
+
for identity in cluster_identities:
|
|
383
|
+
if identity.email == canonical_email:
|
|
384
|
+
canonical = identity
|
|
385
|
+
break
|
|
386
|
+
|
|
387
|
+
if not canonical:
|
|
388
|
+
# Use highest commit count as canonical
|
|
389
|
+
canonical = max(cluster_identities, key=lambda x: x.commit_count)
|
|
390
|
+
|
|
391
|
+
# Create aliases list
|
|
392
|
+
aliases = [id for id in cluster_identities if id.email != canonical.email]
|
|
393
|
+
|
|
394
|
+
# Calculate total stats
|
|
395
|
+
total_commits = sum(id.commit_count for id in cluster_identities)
|
|
396
|
+
|
|
397
|
+
return DeveloperCluster(
|
|
398
|
+
canonical_name=canonical.name,
|
|
399
|
+
canonical_email=canonical.email,
|
|
400
|
+
aliases=aliases,
|
|
401
|
+
confidence=confidence,
|
|
402
|
+
reasoning=data.get("reasoning", "LLM analysis"),
|
|
403
|
+
total_commits=total_commits,
|
|
404
|
+
total_story_points=0,
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
except Exception as e:
|
|
408
|
+
logger.warning(f"Failed to parse LLM response: {e}")
|
|
409
|
+
return None
|
|
410
|
+
|
|
411
|
+
    def _analyze_unclustered_with_llm(
        self, unclustered: list[DeveloperAlias], client
    ) -> list[DeveloperCluster]:
        """Analyze unclustered identities with LLM.

        Identities are bucketed by the first five characters of their
        normalized name; only buckets with two or more members are sent to
        the model. Per-group failures are logged and skipped so one bad
        call cannot abort the whole pass.

        Args:
            unclustered: Identities not absorbed by any earlier cluster.
            client: Configured OpenAI-compatible client (OpenRouter).

        Returns:
            Additional DeveloperCluster objects confirmed by the model.
        """
        clusters = []

        # Group by similar names for analysis
        name_groups = defaultdict(list)
        for identity in unclustered:
            # Get simplified name for grouping: lowercase, spaces removed,
            # truncated to the first five characters.
            name_key = "".join(identity.name.lower().split())[:5]
            name_groups[name_key].append(identity)

        for group in name_groups.values():
            if len(group) < 2:
                continue

            # Prepare data for LLM
            identity_data = []
            for identity in group:
                identity_data.append(
                    {
                        "name": identity.name,
                        "email": identity.email,
                        "commit_count": identity.commit_count,
                        "repositories": list(identity.repositories),
                    }
                )

            prompt = self._create_analysis_prompt(identity_data)

            try:
                response = client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": "You are an expert at analyzing developer identities.",
                        },
                        {"role": "user", "content": prompt},
                    ],
                    temperature=0.3,
                    max_tokens=500,
                )

                # The empty dict is fine here: _parse_llm_response never
                # reads its all_identities argument.
                cluster = self._parse_llm_response(response.choices[0].message.content, group, {})
                if cluster:
                    clusters.append(cluster)

            except Exception as e:
                logger.warning(f"Failed to analyze group with LLM: {e}")
                continue

        return clusters
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Data models for LLM-based identity analysis."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class DeveloperAlias:
    """Represents a single developer alias.

    One (name, email) combination observed in commit history, together
    with aggregated activity statistics for that combination.
    """

    name: str  # display name as it appears in commits
    email: str  # author email (stored lowercased by the analyzer)
    commit_count: int  # number of commits attributed to this alias
    first_seen: datetime  # timestamp of the earliest observed commit
    last_seen: datetime  # timestamp of the most recent observed commit
    repositories: set[str]  # repositories this alias has committed to
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class DeveloperCluster:
    """Represents a cluster of related developer identities.

    One canonical (name, email) pair plus the aliases merged into it,
    with the evidence (confidence, reasoning) and aggregate stats.
    """

    canonical_name: str
    canonical_email: str
    aliases: list[DeveloperAlias]
    confidence: float  # 0.0 to 1.0
    reasoning: str
    total_commits: int
    total_story_points: int
    preferred_display_name: Optional[str] = None  # Optional preferred name for reports

    @property
    def all_emails(self) -> set[str]:
        """Get all emails in this cluster."""
        return {self.canonical_email} | {alias.email for alias in self.aliases}

    @property
    def all_names(self) -> set[str]:
        """Get all names in this cluster."""
        return {self.canonical_name} | {alias.name for alias in self.aliases}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class IdentityAnalysisResult:
    """Result of LLM identity analysis."""

    clusters: list[DeveloperCluster]
    unresolved_identities: list[DeveloperAlias]
    # BUG FIX: these annotations previously used the builtin ``any``
    # function instead of ``typing.Any``.
    analysis_metadata: dict[str, Any] = field(default_factory=dict)

    def get_manual_mappings(self) -> list[dict[str, Any]]:
        """Convert to manual mappings format for config.

        Returns:
            One mapping per multi-identity cluster with ``primary_email``
            and ``aliases`` keys, plus ``name`` when a preferred display
            name was set. Single-identity clusters are skipped.
        """
        mappings = []
        for cluster in self.clusters:
            if len(cluster.aliases) > 0:
                mapping = {}
                # Insert the display name first so it leads in emitted config.
                if cluster.preferred_display_name:
                    mapping["name"] = cluster.preferred_display_name
                mapping["primary_email"] = cluster.canonical_email
                mapping["aliases"] = [alias.email for alias in cluster.aliases]
                mappings.append(mapping)
        return mappings

    def get_cluster_by_email(self, email: str) -> Optional[DeveloperCluster]:
        """Find the cluster containing the given email (case-insensitive).

        Returns:
            The matching DeveloperCluster, or None when no cluster holds
            the address.
        """
        email_lower = email.lower()
        for cluster in self.clusters:
            if email_lower in {e.lower() for e in cluster.all_emails}:
                return cluster
        return None
|