gitflow-analytics 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,402 @@
+ """Developer identity resolution with persistence."""
+ import uuid
+ import difflib
+ from datetime import datetime
+ from typing import Dict, List, Optional, Tuple, Any
+ from collections import defaultdict
+ from contextlib import contextmanager
+
+ from sqlalchemy import and_
+
+ from ..models.database import Database, DeveloperIdentity, DeveloperAlias
+
+
+ class DeveloperIdentityResolver:
+     """Resolve and normalize developer identities across repositories."""
+
+     def __init__(self, db_path, similarity_threshold: float = 0.85,
+                  manual_mappings: Optional[List[Dict[str, Any]]] = None):
+         """Initialize with database for persistence."""
+         self.similarity_threshold = similarity_threshold
+         self.db = Database(db_path)
+         self._cache = {}  # In-memory cache for performance
+         self._load_cache()
+
+         # Store manual mappings to apply later
+         self.manual_mappings = manual_mappings
+
+     @contextmanager
+     def get_session(self):
+         """Get database session context manager."""
+         session = self.db.get_session()
+         try:
+             yield session
+             session.commit()
+         except Exception:
+             session.rollback()
+             raise
+         finally:
+             session.close()
+
+     def _load_cache(self):
+         """Load identities into memory cache."""
+         with self.get_session() as session:
+             # Load all identities
+             identities = session.query(DeveloperIdentity).all()
+             for identity in identities:
+                 self._cache[identity.canonical_id] = {
+                     'primary_name': identity.primary_name,
+                     'primary_email': identity.primary_email,
+                     'github_username': identity.github_username
+                 }
+
+             # Load all aliases
+             aliases = session.query(DeveloperAlias).all()
+             for alias in aliases:
+                 key = f"{alias.email.lower()}:{alias.name.lower()}"
+                 self._cache[key] = alias.canonical_id
+
+     def _apply_manual_mappings(self, manual_mappings: List[Dict[str, Any]]):
+         """Apply manual identity mappings from configuration."""
+         # Clear cache to ensure we get fresh data
+         self._cache.clear()
+         self._load_cache()
+
+         with self.get_session() as session:
+             for mapping in manual_mappings:
+                 canonical_email = mapping.get('canonical_email', '').lower().strip()
+                 aliases = mapping.get('aliases', [])
+
+                 if not canonical_email or not aliases:
+                     continue
+
+                 # Find the canonical identity
+                 canonical_identity = session.query(DeveloperIdentity).filter(
+                     DeveloperIdentity.primary_email == canonical_email
+                 ).first()
+
+                 if not canonical_identity:
+                     # Skip if canonical identity doesn't exist yet
+                     print(f"Warning: Canonical identity not found for email: {canonical_email}")
+                     continue
+
+                 # Process each alias
+                 for alias_email in aliases:
+                     alias_email = alias_email.lower().strip()
+
+                     # Check if alias identity exists as a primary identity
+                     alias_identity = session.query(DeveloperIdentity).filter(
+                         DeveloperIdentity.primary_email == alias_email
+                     ).first()
+
+                     if alias_identity:
+                         if alias_identity.canonical_id != canonical_identity.canonical_id:
+                             # Merge the identities - commit before merge to avoid locks
+                             session.commit()
+                             print(f"Merging identity: {alias_identity.primary_name} ({alias_email}) into {canonical_identity.primary_name} ({canonical_email})")
+                             self.merge_identities(canonical_identity.canonical_id, alias_identity.canonical_id)
+                             # Refresh session after merge
+                             session.expire_all()
+                     else:
+                         # Just add as an alias if not a primary identity
+                         existing_alias = session.query(DeveloperAlias).filter(
+                             and_(
+                                 DeveloperAlias.email == alias_email,
+                                 DeveloperAlias.canonical_id == canonical_identity.canonical_id
+                             )
+                         ).first()
+
+                         if not existing_alias:
+                             # Get the name from any existing alias with this email
+                             name_for_alias = None
+                             any_alias = session.query(DeveloperAlias).filter(
+                                 DeveloperAlias.email == alias_email
+                             ).first()
+                             if any_alias:
+                                 name_for_alias = any_alias.name
+                             else:
+                                 name_for_alias = canonical_identity.primary_name
+
+                             new_alias = DeveloperAlias(
+                                 canonical_id=canonical_identity.canonical_id,
+                                 name=name_for_alias,
+                                 email=alias_email
+                             )
+                             session.add(new_alias)
+                             print(f"Added alias: {alias_email} for {canonical_identity.primary_name}")
+
+         # Reload cache after all mappings
+         self._cache.clear()
+         self._load_cache()
+
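For orientation, a sketch of the mapping shape this method consumes; the emails and database path below are invented, and in practice the list would come from the tool's configuration:

```python
# Hypothetical mappings: each entry names one canonical email plus the
# alias emails that should resolve to the same developer.
manual_mappings = [
    {
        "canonical_email": "jane.doe@example.com",
        "aliases": ["jdoe@users.noreply.github.com", "jane@oldcorp.example"],
    },
]

resolver = DeveloperIdentityResolver("identities.db",
                                     manual_mappings=manual_mappings)
# The mappings are held on the instance and applied later, e.g. via
# apply_manual_mappings(), once identities exist in the database.
```
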
+     def resolve_developer(self, name: str, email: str,
+                           github_username: Optional[str] = None) -> str:
+         """Resolve developer identity and return canonical ID."""
+         # Normalize inputs
+         name = name.strip()
+         email = email.lower().strip()
+
+         # Check cache first
+         cache_key = f"{email}:{name.lower()}"
+         if cache_key in self._cache:
+             canonical_id = self._cache[cache_key]
+             # Update stats
+             self._update_developer_stats(canonical_id)
+             return canonical_id
+
+         # Check exact email match in database
+         with self.get_session() as session:
+             # Check aliases
+             alias = session.query(DeveloperAlias).filter(
+                 DeveloperAlias.email == email
+             ).first()
+
+             if alias:
+                 self._cache[cache_key] = alias.canonical_id
+                 self._update_developer_stats(alias.canonical_id)
+                 return alias.canonical_id
+
+             # Check primary identities
+             identity = session.query(DeveloperIdentity).filter(
+                 DeveloperIdentity.primary_email == email
+             ).first()
+
+             if identity:
+                 # Add as alias if name is different
+                 if identity.primary_name.lower() != name.lower():
+                     self._add_alias(identity.canonical_id, name, email)
+                 self._cache[cache_key] = identity.canonical_id
+                 return identity.canonical_id
+
+         # Find similar developer
+         best_match = self._find_best_match(name, email)
+
+         if best_match and best_match[1] >= self.similarity_threshold:
+             canonical_id = best_match[0]
+             self._add_alias(canonical_id, name, email)
+             self._cache[cache_key] = canonical_id
+             return canonical_id
+
+         # Create new identity
+         canonical_id = self._create_identity(name, email, github_username)
+         self._cache[cache_key] = canonical_id
+         return canonical_id
+
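A minimal usage sketch (names, emails, and the database path are invented): repeated spellings of the same author collapse to one canonical ID, since the email is lower-cased before lookup.

```python
resolver = DeveloperIdentityResolver("identities.db")

id_a = resolver.resolve_developer("Jane Doe", "jane.doe@example.com")
id_b = resolver.resolve_developer("Jane Doe", "JANE.DOE@Example.com")

# The second call normalizes the email, hits the cache entry written by
# the first call, and returns the same canonical ID.
assert id_a == id_b
```
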
+     def _find_best_match(self, name: str, email: str) -> Optional[Tuple[str, float]]:
+         """Find the best matching existing developer."""
+         best_score = 0.0
+         best_canonical_id = None
+
+         name_lower = name.lower().strip()
+         email_domain = email.split('@')[1] if '@' in email else ''
+
+         with self.get_session() as session:
+             # Get all identities for comparison
+             identities = session.query(DeveloperIdentity).all()
+
+             for identity in identities:
+                 score = 0.0
+
+                 # Name similarity (40% weight)
+                 name_sim = difflib.SequenceMatcher(
+                     None, name_lower, identity.primary_name.lower()
+                 ).ratio()
+                 score += name_sim * 0.4
+
+                 # Email domain similarity (30% weight)
+                 identity_domain = (identity.primary_email.split('@')[1]
+                                    if '@' in identity.primary_email else '')
+                 if email_domain and email_domain == identity_domain:
+                     score += 0.3
+
+                 # Check aliases (30% weight)
+                 aliases = session.query(DeveloperAlias).filter(
+                     DeveloperAlias.canonical_id == identity.canonical_id
+                 ).all()
+
+                 best_alias_score = 0.0
+                 for alias in aliases:
+                     alias_name_sim = difflib.SequenceMatcher(
+                         None, name_lower, alias.name.lower()
+                     ).ratio()
+
+                     # Bonus for same email domain in aliases
+                     alias_domain = alias.email.split('@')[1] if '@' in alias.email else ''
+                     domain_bonus = 0.2 if alias_domain == email_domain else 0.0
+
+                     alias_score = alias_name_sim + domain_bonus
+                     best_alias_score = max(best_alias_score, alias_score)
+
+                 score += min(best_alias_score * 0.3, 0.3)
+
+                 if score > best_score:
+                     best_score = score
+                     best_canonical_id = identity.canonical_id
+
+         return (best_canonical_id, best_score) if best_canonical_id else None
+
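To make the weighting concrete, here is a rough replay of the score for a single candidate; the names are invented, and the alias term is summarized in a comment rather than computed:

```python
import difflib

name_sim = difflib.SequenceMatcher(None, "jane doe", "jane m. doe").ratio()
score = name_sim * 0.4      # name similarity: ~0.842 * 0.4 = ~0.337
score += 0.3                # same primary email domain: +0.3
print(round(score, 3))      # ~0.637 so far; up to +0.3 more can come from the
                            # best alias (name ratio + 0.2 domain bonus, scaled
                            # by 0.3 and capped), which is what lets a candidate
                            # clear the default 0.85 threshold
```
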
+     def _create_identity(self, name: str, email: str,
+                          github_username: Optional[str] = None) -> str:
+         """Create new developer identity."""
+         canonical_id = str(uuid.uuid4())
+
+         with self.get_session() as session:
+             identity = DeveloperIdentity(
+                 canonical_id=canonical_id,
+                 primary_name=name,
+                 primary_email=email,
+                 github_username=github_username,
+                 total_commits=0,
+                 total_story_points=0
+             )
+             session.add(identity)
+
+         # Update cache
+         self._cache[canonical_id] = {
+             'primary_name': name,
+             'primary_email': email,
+             'github_username': github_username
+         }
+
+         return canonical_id
+
+     def _add_alias(self, canonical_id: str, name: str, email: str):
+         """Add alias for existing developer."""
+         with self.get_session() as session:
+             # Check if alias already exists
+             existing = session.query(DeveloperAlias).filter(
+                 and_(
+                     DeveloperAlias.canonical_id == canonical_id,
+                     DeveloperAlias.email == email.lower()
+                 )
+             ).first()
+
+             if not existing:
+                 alias = DeveloperAlias(
+                     canonical_id=canonical_id,
+                     name=name,
+                     email=email.lower()
+                 )
+                 session.add(alias)
+
+     def _update_developer_stats(self, canonical_id: str):
+         """Update developer statistics."""
+         with self.get_session() as session:
+             identity = session.query(DeveloperIdentity).filter(
+                 DeveloperIdentity.canonical_id == canonical_id
+             ).first()
+
+             if identity:
+                 identity.last_seen = datetime.utcnow()
+
+     def merge_identities(self, canonical_id1: str, canonical_id2: str):
+         """Merge two developer identities."""
+         # First, add the alias outside of the main merge transaction
+         with self.get_session() as session:
+             identity2 = session.query(DeveloperIdentity).filter(
+                 DeveloperIdentity.canonical_id == canonical_id2
+             ).first()
+             if identity2:
+                 identity2_name = identity2.primary_name
+                 identity2_email = identity2.primary_email
+
+                 # Add identity2's primary as alias to identity1 first
+                 self._add_alias(canonical_id1, identity2_name, identity2_email)
+
+         # Now do the merge in a separate transaction
+         with self.get_session() as session:
+             # Get both identities fresh
+             identity1 = session.query(DeveloperIdentity).filter(
+                 DeveloperIdentity.canonical_id == canonical_id1
+             ).first()
+             identity2 = session.query(DeveloperIdentity).filter(
+                 DeveloperIdentity.canonical_id == canonical_id2
+             ).first()
+
+             if not identity1 or not identity2:
+                 raise ValueError("One or both identities not found")
+
+             # Keep identity1, merge identity2 into it
+             identity1.total_commits += identity2.total_commits
+             identity1.total_story_points += identity2.total_story_points
+             identity1.first_seen = min(identity1.first_seen, identity2.first_seen)
+             identity1.last_seen = max(identity1.last_seen, identity2.last_seen)
+
+             # Move all aliases from identity2 to identity1
+             aliases = session.query(DeveloperAlias).filter(
+                 DeveloperAlias.canonical_id == canonical_id2
+             ).all()
+
+             for alias in aliases:
+                 alias.canonical_id = canonical_id1
+
+             # Delete identity2
+             session.delete(identity2)
+
+         # Clear cache to force reload
+         self._cache.clear()
+         self._load_cache()
+
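A sketch of a manual cleanup pass using this method; which identity survives is a human judgment here, and the indexing into the stats list is purely illustrative:

```python
resolver = DeveloperIdentityResolver("identities.db")  # hypothetical path

stats = resolver.get_developer_stats()
keep, dupe = stats[0], stats[1]   # chosen by review, not by the tool

resolver.merge_identities(keep["canonical_id"], dupe["canonical_id"])
# Totals are summed, first_seen/last_seen widened, aliases re-pointed at the
# surviving identity, and the duplicate identity row deleted.
```
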
+     def get_developer_stats(self) -> List[Dict[str, Any]]:
+         """Get statistics for all developers."""
+         stats = []
+
+         with self.get_session() as session:
+             identities = session.query(DeveloperIdentity).all()
+
+             for identity in identities:
+                 # Count aliases
+                 alias_count = session.query(DeveloperAlias).filter(
+                     DeveloperAlias.canonical_id == identity.canonical_id
+                 ).count()
+
+                 stats.append({
+                     'canonical_id': identity.canonical_id,
+                     'primary_name': identity.primary_name,
+                     'primary_email': identity.primary_email,
+                     'github_username': identity.github_username,
+                     'total_commits': identity.total_commits,
+                     'total_story_points': identity.total_story_points,
+                     'alias_count': alias_count,
+                     'first_seen': identity.first_seen,
+                     'last_seen': identity.last_seen
+                 })
+
+         # Sort by total commits
+         return sorted(stats, key=lambda x: x['total_commits'], reverse=True)
+
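The returned list is already sorted by commit count, so a simple report loop works directly on it; a sketch:

```python
resolver = DeveloperIdentityResolver("identities.db")  # hypothetical path

for dev in resolver.get_developer_stats():
    print(f"{dev['primary_name']:<30} "
          f"{dev['total_commits']:>5} commits  "
          f"{dev['total_story_points']:>5} pts  "
          f"{dev['alias_count']} aliases")
```
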
+     def update_commit_stats(self, commits: List[Dict[str, Any]]):
+         """Update developer statistics based on commits."""
+         # Aggregate stats by canonical ID
+         stats_by_dev = defaultdict(lambda: {'commits': 0, 'story_points': 0})
+
+         for commit in commits:
+             canonical_id = self.resolve_developer(
+                 commit['author_name'],
+                 commit['author_email']
+             )
+
+             stats_by_dev[canonical_id]['commits'] += 1
+             stats_by_dev[canonical_id]['story_points'] += commit.get('story_points', 0) or 0
+
+         # Update database
+         with self.get_session() as session:
+             for canonical_id, stats in stats_by_dev.items():
+                 identity = session.query(DeveloperIdentity).filter(
+                     DeveloperIdentity.canonical_id == canonical_id
+                 ).first()
+
+                 if identity:
+                     identity.total_commits += stats['commits']
+                     identity.total_story_points += stats['story_points']
+                     identity.last_seen = datetime.utcnow()
+
+         # Apply manual mappings after all identities are created
+         if self.manual_mappings:
+             self.apply_manual_mappings()
+
+     def apply_manual_mappings(self):
+         """Apply manual mappings - can be called explicitly after identities are created."""
+         if self.manual_mappings:
+             self._apply_manual_mappings(self.manual_mappings)
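
Putting the pieces together, a minimal end-to-end sketch; the commit dicts carry only the keys `update_commit_stats` actually reads, and all values are invented:

```python
commits = [
    {"author_name": "Jane Doe", "author_email": "jane@example.com",
     "story_points": 3},
    {"author_name": "J. Doe", "author_email": "jane@example.com",
     "story_points": None},  # the `or 0` in update_commit_stats absorbs None
]

resolver = DeveloperIdentityResolver("identities.db")
resolver.update_commit_stats(commits)  # resolves identities, sums stats,
                                       # then applies any manual mappings
```
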
File without changes
@@ -0,0 +1,41 @@
+ """Base classes for pluggable extractors."""
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional, List, Dict
+
+
+ class ExtractorBase(ABC):
+     """Base class for all extractors."""
+
+     @abstractmethod
+     def extract_from_text(self, text: str) -> Any:
+         """Extract information from text."""
+         pass
+
+
+ class StoryPointExtractorBase(ExtractorBase):
+     """Base class for story point extractors."""
+
+     @abstractmethod
+     def extract_from_text(self, text: str) -> Optional[int]:
+         """Extract story points from text."""
+         pass
+
+     @abstractmethod
+     def extract_from_pr(self, pr_data: Dict[str, Any],
+                         commit_messages: Optional[List[str]] = None) -> Optional[int]:
+         """Extract story points from pull request."""
+         pass
+
+
+ class TicketExtractorBase(ExtractorBase):
+     """Base class for ticket extractors."""
+
+     @abstractmethod
+     def extract_from_text(self, text: str) -> List[Dict[str, str]]:
+         """Extract ticket references from text."""
+         pass
+
+     @abstractmethod
+     def extract_by_platform(self, text: str) -> Dict[str, List[str]]:
+         """Extract tickets grouped by platform."""
+         pass
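
To show how these ABCs are meant to be used, a toy subclass of `StoryPointExtractorBase`; the regex and the fallback order are assumptions for the example, not part of the package:

```python
import re
from typing import Any, Dict, List, Optional

# Assumes StoryPointExtractorBase is importable from the module above.


class SimpleStoryPointExtractor(StoryPointExtractorBase):
    """Toy extractor recognising only 'SP: <n>' markers (hypothetical)."""

    _pattern = re.compile(r"SP:\s*(\d+)", re.IGNORECASE)

    def extract_from_text(self, text: str) -> Optional[int]:
        match = self._pattern.search(text or "")
        return int(match.group(1)) if match else None

    def extract_from_pr(self, pr_data: Dict[str, Any],
                        commit_messages: Optional[List[str]] = None) -> Optional[int]:
        # Prefer the PR title, then fall back to commit messages.
        points = self.extract_from_text(pr_data.get("title", ""))
        if points is None and commit_messages:
            for message in commit_messages:
                points = self.extract_from_text(message)
                if points is not None:
                    break
        return points
```
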
@@ -0,0 +1,128 @@
+ """Story point extraction from commits and pull requests."""
+ import re
+ from typing import Optional, List, Dict, Any
+
+
+ class StoryPointExtractor:
+     """Extract story points from text using configurable patterns."""
+
+     def __init__(self, patterns: Optional[List[str]] = None):
+         """Initialize with extraction patterns."""
+         if patterns is None:
+             patterns = [
+                 r'(?:story\s*points?|sp|pts?)\s*[:=]\s*(\d+)',  # SP: 5, story points = 3
+                 r'\[(\d+)\s*(?:sp|pts?)\]',                     # [3sp], [5 pts]
+                 r'#(\d+)sp',                                    # #3sp
+                 r'estimate:\s*(\d+)',                           # estimate: 5
+                 r'\bSP(\d+)\b',                                 # SP5, SP13
+                 r'points?:\s*(\d+)',                            # points: 8
+             ]
+
+         self.patterns = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
+
+     def extract_from_text(self, text: str) -> Optional[int]:
+         """Extract story points from text."""
+         if not text:
+             return None
+
+         for pattern in self.patterns:
+             match = pattern.search(text)
+             if match:
+                 try:
+                     points = int(match.group(1))
+                     # Sanity check - story points should be reasonable
+                     if 0 < points <= 100:
+                         return points
+                 except (ValueError, IndexError):
+                     continue
+
+         return None
+
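A few invented messages against the default patterns; note the sanity check rejecting values over 100:

```python
extractor = StoryPointExtractor()

print(extractor.extract_from_text("PROJ-12 fix login [3sp]"))       # 3
print(extractor.extract_from_text("story points: 8 for refactor"))  # 8
print(extractor.extract_from_text("estimate: 500 rows migrated"))   # None (>100)
```
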
+     def extract_from_pr(self, pr_data: Dict[str, Any],
+                         commit_messages: Optional[List[str]] = None) -> Optional[int]:
+         """Extract story points from PR with fallback to commits."""
+         # Try PR description first (most authoritative)
+         points = self.extract_from_text(pr_data.get('description', ''))
+         if points:
+             return points
+
+         # Try PR title
+         points = self.extract_from_text(pr_data.get('title', ''))
+         if points:
+             return points
+
+         # Try PR body (if different from description)
+         if 'body' in pr_data:
+             points = self.extract_from_text(pr_data['body'])
+             if points:
+                 return points
+
+         # Fallback to commit messages
+         if commit_messages:
+             commit_points = []
+             for message in commit_messages:
+                 points = self.extract_from_text(message)
+                 if points:
+                     commit_points.append(points)
+
+             if commit_points:
+                 # Use the most common value found across commit messages
+                 from collections import Counter
+                 point_counts = Counter(commit_points)
+                 most_common = point_counts.most_common(1)
+                 if most_common:
+                     return most_common[0][0]
+
+         return None
+
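The commit-message fallback in action, with invented data; no estimate appears in the PR fields, so the most common value across commits wins:

```python
extractor = StoryPointExtractor()

pr = {"title": "Refactor auth flow", "description": ""}
messages = ["SP: 5 extract token helper", "SP: 5 rotate keys", "SP: 8 add tests"]

print(extractor.extract_from_pr(pr, commit_messages=messages))  # 5
```
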
+     def aggregate_story_points(self, prs: List[Dict[str, Any]],
+                                commits: List[Dict[str, Any]]) -> Dict[str, Any]:
+         """Aggregate story points from PRs and commits."""
+         # Map commits to PRs
+         pr_by_commit = {}
+         for pr in prs:
+             for commit_hash in pr.get('commit_hashes', []):
+                 pr_by_commit[commit_hash] = pr
+
+         # Track which commits are associated with PRs
+         pr_commits = set(pr_by_commit.keys())
+
+         # Aggregate results
+         results = {
+             'total_story_points': 0,
+             'pr_story_points': 0,
+             'commit_story_points': 0,
+             'orphan_commits': [],   # Commits without PRs
+             'unestimated_prs': []   # PRs without story points
+         }
+
+         # Process PRs
+         for pr in prs:
+             pr_points = pr.get('story_points', 0)
+             if pr_points:
+                 results['pr_story_points'] += pr_points
+                 results['total_story_points'] += pr_points
+             else:
+                 results['unestimated_prs'].append({
+                     'number': pr['number'],
+                     'title': pr['title']
+                 })
+
+         # Process commits not in PRs
+         for commit in commits:
+             if commit['hash'] not in pr_commits:
+                 commit_points = commit.get('story_points', 0)
+                 if commit_points:
+                     results['commit_story_points'] += commit_points
+                     results['total_story_points'] += commit_points
+
+                 # Track significant orphan commits
+                 if commit['files_changed'] > 5 or commit['insertions'] > 100:
+                     results['orphan_commits'].append({
+                         'hash': commit['hash'][:7],
+                         'message': commit['message'].split('\n')[0][:80],
+                         'story_points': commit_points,
+                         'files_changed': commit['files_changed']
+                     })
+
+         return results
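
Finally, a usage sketch of the aggregation; the dicts below carry only the keys the method reads, with invented values:

```python
extractor = StoryPointExtractor()

prs = [
    {"number": 42, "title": "Add caching", "story_points": 5,
     "commit_hashes": ["abc1234"]},
    {"number": 43, "title": "Fix typo", "story_points": 0,
     "commit_hashes": ["def5678"]},
]
commits = [
    {"hash": "abc1234", "story_points": 5, "files_changed": 3,
     "insertions": 40, "message": "Add caching layer"},
    {"hash": "ffff000", "story_points": 2, "files_changed": 8,
     "insertions": 250, "message": "Unreviewed hotfix"},
]

totals = extractor.aggregate_story_points(prs, commits)
print(totals["total_story_points"])   # 7: 5 via the PR, 2 via the orphan commit
print(totals["unestimated_prs"])      # [{'number': 43, 'title': 'Fix typo'}]
print(len(totals["orphan_commits"]))  # 1: the large commit with no PR
```
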