gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
gitflow_analytics/identity_llm/analyzer.py (new file)
@@ -0,0 +1,464 @@
+"""LLM-based developer identity analyzer."""
+
+import difflib
+import json
+import logging
+import re
+from collections import defaultdict
+from typing import Any, Optional
+
+from ..core.identity import DeveloperIdentityResolver
+from .models import DeveloperAlias, DeveloperCluster, IdentityAnalysisResult
+
+logger = logging.getLogger(__name__)
+
+
+class LLMIdentityAnalyzer:
+    """Analyzes developer identities using LLM for intelligent aliasing."""
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "openai/gpt-4o-mini",
+        confidence_threshold: float = 0.8,
+    ):
+        """Initialize the LLM identity analyzer."""
+        self.api_key = api_key
+        self.model = model
+        self.confidence_threshold = confidence_threshold
+        self._has_openrouter = api_key is not None
+
+    def analyze_identities(
+        self,
+        commits: list[dict[str, Any]],
+        existing_resolver: Optional[DeveloperIdentityResolver] = None,
+    ) -> IdentityAnalysisResult:
+        """Analyze commits to identify developer aliases using LLM."""
+        # Extract unique developer identities from commits
+        identities = self._extract_identities(commits)
+
+        # Pre-cluster using heuristics
+        pre_clusters = self._pre_cluster_identities(identities)
+
+        # Analyze with LLM if available
+        if self._has_openrouter and self.api_key:
+            clusters = self._analyze_with_llm(pre_clusters, identities)
+        else:
+            # Fall back to heuristic-only clustering
+            clusters = self._finalize_heuristic_clusters(pre_clusters, identities)
+
+        # Identify unresolved identities
+        clustered_emails = set()
+        for cluster in clusters:
+            clustered_emails.update(cluster.all_emails)
+
+        unresolved = [
+            identity for identity in identities.values() if identity.email not in clustered_emails
+        ]
+
+        return IdentityAnalysisResult(
+            clusters=clusters,
+            unresolved_identities=unresolved,
+            analysis_metadata={
+                "total_identities": len(identities),
+                "total_clusters": len(clusters),
+                "analysis_method": "llm" if self._has_openrouter else "heuristic",
+            },
+        )
+
+    def _extract_identities(self, commits: list[dict[str, Any]]) -> dict[str, DeveloperAlias]:
+        """Extract unique developer identities from commits."""
+        identities = {}
+
+        for commit in commits:
+            key = f"{commit['author_email'].lower()}:{commit['author_name']}"
+
+            if key not in identities:
+                identities[key] = DeveloperAlias(
+                    name=commit["author_name"],
+                    email=commit["author_email"].lower(),
+                    commit_count=0,
+                    first_seen=commit["timestamp"],
+                    last_seen=commit["timestamp"],
+                    repositories=set(),
+                )
+
+            identity = identities[key]
+            identity.commit_count += 1
+            identity.first_seen = min(identity.first_seen, commit["timestamp"])
+            identity.last_seen = max(identity.last_seen, commit["timestamp"])
+
+            # Track repository if available
+            if "repository" in commit:
+                identity.repositories.add(commit["repository"])
+
+        return identities
+
+    def _pre_cluster_identities(self, identities: dict[str, DeveloperAlias]) -> list[set[str]]:
+        """Pre-cluster identities using heuristic rules."""
+        clusters = []
+        processed = set()
+
+        identity_list = list(identities.values())
+
+        for i, identity1 in enumerate(identity_list):
+            if identity1.email in processed:
+                continue
+
+            cluster = {identity1.email}
+            processed.add(identity1.email)
+
+            for _j, identity2 in enumerate(identity_list[i + 1 :], i + 1):
+                if identity2.email in processed:
+                    continue
+
+                # Check various similarity criteria
+                if self._are_likely_same_developer(identity1, identity2):
+                    cluster.add(identity2.email)
+                    processed.add(identity2.email)
+
+            if len(cluster) > 1:
+                clusters.append(cluster)
+
+        return clusters
+
+    def _are_likely_same_developer(self, id1: DeveloperAlias, id2: DeveloperAlias) -> bool:
+        """Check if two identities are likely the same developer."""
+        # Same email domain with similar name
+        domain1 = id1.email.split("@")[1] if "@" in id1.email else ""
+        domain2 = id2.email.split("@")[1] if "@" in id2.email else ""
+
+        name_similarity = difflib.SequenceMatcher(None, id1.name.lower(), id2.name.lower()).ratio()
+
+        # GitHub noreply emails
+        github_pattern = r"^\d+\+(.+)@users\.noreply\.github\.com$"
+        match1 = re.match(github_pattern, id1.email)
+        match2 = re.match(github_pattern, id2.email)
+
+        # Check GitHub noreply patterns
+        if match1 or match2:
+            github_username1 = match1.group(1).lower() if match1 else None
+            github_username2 = match2.group(1).lower() if match2 else None
+
+            # Compare GitHub username with name/email
+            if github_username1:
+                # Check against other's name or email local part
+                other_name = id2.name.lower().replace(" ", "").replace(".", "").replace("-", "")
+                other_local = id2.email.split("@")[0].lower().replace(".", "").replace("-", "")
+
+                if (
+                    github_username1 in other_name
+                    or other_name in github_username1
+                    or github_username1 in other_local
+                    or other_local in github_username1
+                ):
+                    return True
+
+            if github_username2:
+                # Check against other's name or email local part
+                other_name = id1.name.lower().replace(" ", "").replace(".", "").replace("-", "")
+                other_local = id1.email.split("@")[0].lower().replace(".", "").replace("-", "")
+
+                if (
+                    github_username2 in other_name
+                    or other_name in github_username2
+                    or github_username2 in other_local
+                    or other_local in github_username2
+                ):
+                    return True
+
+        # Check if one email's local part matches the other's name
+        local1 = id1.email.split("@")[0].lower()
+        local2 = id2.email.split("@")[0].lower()
+
+        # Remove common suffixes/prefixes for comparison
+        clean_local1 = local1
+        clean_local2 = local2
+        for suffix in ["-ewtn", "-zaelot", "dev", "developer", "zaelot"]:
+            clean_local1 = clean_local1.replace(suffix, "")
+            clean_local2 = clean_local2.replace(suffix, "")
+
+        # Check if cleaned locals match names
+        name1_clean = id1.name.lower().replace(" ", "").replace(".", "")
+        name2_clean = id2.name.lower().replace(" ", "").replace(".", "")
+
+        if (
+            clean_local1 in name2_clean
+            or name2_clean in clean_local1
+            or clean_local2 in name1_clean
+            or name1_clean in clean_local2
+        ):
+            return True
+
+        # Strong indicators
+        if name_similarity > 0.9 and domain1 == domain2:
+            return True
+
+        if name_similarity > 0.8 and (
+            domain1 == domain2 or "github.com" in domain1 or "github.com" in domain2
+        ):
+            return True
+
+        # Check if local part of email matches name parts
+        name1_parts = set(id1.name.lower().split())
+        name2_parts = set(id2.name.lower().split())
+
+        if local1 in name2_parts or local2 in name1_parts:
+            return True
+
+        # Check first/last name combinations
+        if len(name1_parts) >= 2 and len(name2_parts) >= 2:
+            # Check if initials match
+            initials1 = "".join(n[0] for n in sorted(name1_parts) if n)
+            initials2 = "".join(n[0] for n in sorted(name2_parts) if n)
+
+            if initials1 == initials2 and name_similarity > 0.6:
+                return True
+
+        # Check overlapping repositories with high name similarity
+        return bool(id1.repositories & id2.repositories and name_similarity > 0.7)
+
+    def _finalize_heuristic_clusters(
+        self, pre_clusters: list[set[str]], identities: dict[str, DeveloperAlias]
+    ) -> list[DeveloperCluster]:
+        """Convert pre-clusters to final clusters without LLM."""
+        clusters = []
+
+        for email_set in pre_clusters:
+            # Get all identities in this cluster
+            cluster_identities = [
+                identity for identity in identities.values() if identity.email in email_set
+            ]
+
+            if not cluster_identities:
+                continue
+
+            # Choose canonical identity (most commits)
+            canonical = max(cluster_identities, key=lambda x: x.commit_count)
+            aliases = [id for id in cluster_identities if id.email != canonical.email]
+
+            # Calculate total stats
+            total_commits = sum(id.commit_count for id in cluster_identities)
+
+            cluster = DeveloperCluster(
+                canonical_name=canonical.name,
+                canonical_email=canonical.email,
+                aliases=aliases,
+                confidence=0.85,  # Heuristic confidence
+                reasoning="Clustered based on name similarity and email patterns",
+                total_commits=total_commits,
+                total_story_points=0,  # Would need commit data to calculate
+            )
+            clusters.append(cluster)
+
+        return clusters
+
+    def _analyze_with_llm(
+        self, pre_clusters: list[set[str]], identities: dict[str, DeveloperAlias]
+    ) -> list[DeveloperCluster]:
+        """Analyze pre-clusters with LLM for intelligent grouping."""
+        try:
+            import openai
+
+            # Configure OpenAI client for OpenRouter
+            client = openai.OpenAI(base_url="https://openrouter.ai/api/v1", api_key=self.api_key)
+
+            clusters = []
+
+            # Analyze each pre-cluster
+            for email_set in pre_clusters:
+                cluster_identities = [
+                    identity for identity in identities.values() if identity.email in email_set
+                ]
+
+                if len(cluster_identities) < 2:
+                    continue
+
+                # Prepare data for LLM
+                identity_data = []
+                for identity in cluster_identities:
+                    identity_data.append(
+                        {
+                            "name": identity.name,
+                            "email": identity.email,
+                            "commit_count": identity.commit_count,
+                            "repositories": list(identity.repositories),
+                        }
+                    )
+
+                prompt = self._create_analysis_prompt(identity_data)
+
+                # Call LLM
+                response = client.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {
+                            "role": "system",
+                            "content": "You are an expert at analyzing developer identities and determining if different email/name combinations belong to the same person.",
+                        },
+                        {"role": "user", "content": prompt},
+                    ],
+                    temperature=0.3,
+                    max_tokens=500,
+                )
+
+                # Parse LLM response
+                cluster = self._parse_llm_response(
+                    response.choices[0].message.content, cluster_identities, identities
+                )
+                if cluster:
+                    clusters.append(cluster)
+
+            # Also analyze unclustered identities
+            clustered_emails = set()
+            for cluster in clusters:
+                clustered_emails.update(cluster.all_emails)
+
+            unclustered = [
+                identity
+                for identity in identities.values()
+                if identity.email not in clustered_emails
+            ]
+
+            # Group unclustered by name similarity for LLM analysis
+            if unclustered:
+                additional_clusters = self._analyze_unclustered_with_llm(unclustered, client)
+                clusters.extend(additional_clusters)
+
+            return clusters
+
+        except Exception as e:
+            logger.warning(f"LLM analysis failed, falling back to heuristics: {e}")
+            return self._finalize_heuristic_clusters(pre_clusters, identities)
+
+    def _create_analysis_prompt(self, identity_data: list[dict[str, Any]]) -> str:
+        """Create prompt for LLM analysis."""
+        return f"""Analyze these developer identities and determine if they belong to the same person:
+
+{json.dumps(identity_data, indent=2)}
+
+Consider:
+1. Name variations (e.g., "John Doe" vs "John D" vs "jdoe")
+2. Email patterns (company emails, personal emails, GitHub noreply)
+3. Common repositories they work on
+4. Email domain relationships
+
+Respond with a JSON object:
+{{
+    "same_person": true/false,
+    "confidence": 0.0-1.0,
+    "canonical_identity": {{"name": "...", "email": "..."}},
+    "reasoning": "explanation"
+}}"""
+
+    def _parse_llm_response(
+        self,
+        response: str,
+        cluster_identities: list[DeveloperAlias],
+        all_identities: dict[str, DeveloperAlias],
+    ) -> Optional[DeveloperCluster]:
+        """Parse LLM response into a cluster."""
+        try:
+            # Extract JSON from response
+            json_match = re.search(r"\{.*\}", response, re.DOTALL)
+            if not json_match:
+                return None
+
+            data = json.loads(json_match.group())
+
+            if not data.get("same_person", False):
+                return None
+
+            confidence = float(data.get("confidence", 0.8))
+            if confidence < self.confidence_threshold:
+                return None
+
+            # Find canonical identity
+            canonical_data = data.get("canonical_identity", {})
+            canonical_email = canonical_data.get("email", "").lower()
+
+            # Find matching identity
+            canonical = None
+            for identity in cluster_identities:
+                if identity.email == canonical_email:
+                    canonical = identity
+                    break
+
+            if not canonical:
+                # Use highest commit count as canonical
+                canonical = max(cluster_identities, key=lambda x: x.commit_count)
+
+            # Create aliases list
+            aliases = [id for id in cluster_identities if id.email != canonical.email]
+
+            # Calculate total stats
+            total_commits = sum(id.commit_count for id in cluster_identities)
+
+            return DeveloperCluster(
+                canonical_name=canonical.name,
+                canonical_email=canonical.email,
+                aliases=aliases,
+                confidence=confidence,
+                reasoning=data.get("reasoning", "LLM analysis"),
+                total_commits=total_commits,
+                total_story_points=0,
+            )
+
+        except Exception as e:
+            logger.warning(f"Failed to parse LLM response: {e}")
+            return None
+
+    def _analyze_unclustered_with_llm(
+        self, unclustered: list[DeveloperAlias], client
+    ) -> list[DeveloperCluster]:
+        """Analyze unclustered identities with LLM."""
+        clusters = []
+
+        # Group by similar names for analysis
+        name_groups = defaultdict(list)
+        for identity in unclustered:
+            # Get simplified name for grouping
+            name_key = "".join(identity.name.lower().split())[:5]
+            name_groups[name_key].append(identity)
+
+        for group in name_groups.values():
+            if len(group) < 2:
+                continue
+
+            # Prepare data for LLM
+            identity_data = []
+            for identity in group:
+                identity_data.append(
+                    {
+                        "name": identity.name,
+                        "email": identity.email,
+                        "commit_count": identity.commit_count,
+                        "repositories": list(identity.repositories),
+                    }
+                )
+
+            prompt = self._create_analysis_prompt(identity_data)
+
+            try:
+                response = client.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {
+                            "role": "system",
+                            "content": "You are an expert at analyzing developer identities.",
+                        },
+                        {"role": "user", "content": prompt},
+                    ],
+                    temperature=0.3,
+                    max_tokens=500,
+                )
+
+                cluster = self._parse_llm_response(response.choices[0].message.content, group, {})
+                if cluster:
+                    clusters.append(cluster)
+
+            except Exception as e:
+                logger.warning(f"Failed to analyze group with LLM: {e}")
+                continue
+
+        return clusters
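
For orientation, here is a minimal sketch (not part of the package) of how the new analyzer's public entry point might be driven, assuming the import path follows the wheel layout listed above. Commits are plain dicts with author_name, author_email, timestamp, and an optional repository key; with no API key, analyze_identities() takes the heuristic-only path shown above.

# Hypothetical usage sketch; LLMIdentityAnalyzer and the commit dict keys
# come from the diff above, the import path is assumed from the wheel layout.
from datetime import datetime, timezone

from gitflow_analytics.identity_llm.analyzer import LLMIdentityAnalyzer

commits = [
    {
        "author_name": "John Doe",
        "author_email": "jdoe@example.com",
        "timestamp": datetime(2024, 1, 15, tzinfo=timezone.utc),
        "repository": "service-api",
    },
    {
        "author_name": "John D",
        "author_email": "12345+jdoe@users.noreply.github.com",
        "timestamp": datetime(2024, 2, 1, tzinfo=timezone.utc),
        "repository": "service-api",
    },
]

# No API key: _has_openrouter is False, so clustering stays heuristic.
analyzer = LLMIdentityAnalyzer(api_key=None)
result = analyzer.analyze_identities(commits)

for cluster in result.clusters:
    print(cluster.canonical_name, sorted(cluster.all_emails), cluster.confidence)
print(result.analysis_metadata)

In this example the two identities cluster because the GitHub noreply username ("jdoe") matches the other email's local part, one of the checks in _are_likely_same_developer(), and the heuristic path assigns the fixed 0.85 confidence.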
gitflow_analytics/identity_llm/models.py (new file)
@@ -0,0 +1,76 @@
+"""Data models for LLM-based identity analysis."""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Optional
+
+
+@dataclass
+class DeveloperAlias:
+    """Represents a single developer alias."""
+
+    name: str
+    email: str
+    commit_count: int
+    first_seen: datetime
+    last_seen: datetime
+    repositories: set[str]
+
+
+@dataclass
+class DeveloperCluster:
+    """Represents a cluster of related developer identities."""
+
+    canonical_name: str
+    canonical_email: str
+    aliases: list[DeveloperAlias]
+    confidence: float  # 0.0 to 1.0
+    reasoning: str
+    total_commits: int
+    total_story_points: int
+    preferred_display_name: Optional[str] = None  # Optional preferred name for reports
+
+    @property
+    def all_emails(self) -> set[str]:
+        """Get all emails in this cluster."""
+        emails = {self.canonical_email}
+        emails.update(alias.email for alias in self.aliases)
+        return emails
+
+    @property
+    def all_names(self) -> set[str]:
+        """Get all names in this cluster."""
+        names = {self.canonical_name}
+        names.update(alias.name for alias in self.aliases)
+        return names
+
+
+@dataclass
+class IdentityAnalysisResult:
+    """Result of LLM identity analysis."""
+
+    clusters: list[DeveloperCluster]
+    unresolved_identities: list[DeveloperAlias]
+    analysis_metadata: dict[str, any] = field(default_factory=dict)
+
+    def get_manual_mappings(self) -> list[dict[str, any]]:
+        """Convert to manual mappings format for config."""
+        mappings = []
+        for cluster in self.clusters:
+            if len(cluster.aliases) > 0:
+                mapping = {}
+                # Add name first if specified
+                if cluster.preferred_display_name:
+                    mapping["name"] = cluster.preferred_display_name
+                mapping["primary_email"] = cluster.canonical_email
+                mapping["aliases"] = [alias.email for alias in cluster.aliases]
+                mappings.append(mapping)
+        return mappings
+
+    def get_cluster_by_email(self, email: str) -> Optional[DeveloperCluster]:
+        """Find cluster containing the given email."""
+        email_lower = email.lower()
+        for cluster in self.clusters:
+            if email_lower in [e.lower() for e in cluster.all_emails]:
+                return cluster
+        return None
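
To illustrate the data model, a minimal sketch (again, not part of the package; the import path is assumed from the wheel layout) that builds one cluster by hand and exercises the two helper methods: get_manual_mappings() emits config-style entries keyed by primary_email and aliases, and get_cluster_by_email() matches case-insensitively.

# Hypothetical example built from the dataclasses in the diff above.
from datetime import datetime, timezone

from gitflow_analytics.identity_llm.models import (
    DeveloperAlias,
    DeveloperCluster,
    IdentityAnalysisResult,
)

alias = DeveloperAlias(
    name="John D",
    email="12345+jdoe@users.noreply.github.com",
    commit_count=4,
    first_seen=datetime(2024, 2, 1, tzinfo=timezone.utc),
    last_seen=datetime(2024, 3, 1, tzinfo=timezone.utc),
    repositories={"service-api"},
)
cluster = DeveloperCluster(
    canonical_name="John Doe",
    canonical_email="jdoe@example.com",
    aliases=[alias],
    confidence=0.9,
    reasoning="GitHub noreply username matches email local part",
    total_commits=12,
    total_story_points=0,
)
result = IdentityAnalysisResult(clusters=[cluster], unresolved_identities=[])

# Yields entries such as:
# [{"primary_email": "jdoe@example.com",
#   "aliases": ["12345+jdoe@users.noreply.github.com"]}]
print(result.get_manual_mappings())

# Lookup is case-insensitive across every email in the cluster.
print(result.get_cluster_by_email("12345+JDOE@users.noreply.github.com") is cluster)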