gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1144 @@
1
+ """Story point correlation analysis for GitFlow Analytics.
2
+
3
+ This module provides comprehensive analysis of story point estimation accuracy and
4
+ correlation with actual development work metrics including commits, lines of code,
5
+ and time spent. It tracks velocity trends and generates actionable insights for
6
+ process improvement and team calibration.
7
+
8
+ WHY: Story point estimation is a critical part of agile development, but accuracy
9
+ varies significantly across teams and individuals. This analysis helps identify
10
+ which teams/developers have accurate estimates vs which need calibration training.
11
+
12
+ DESIGN DECISION: Week-based aggregation using Monday-Sunday boundaries to align
13
+ with sprint planning cycles and provide consistent reporting periods. All metrics
14
+ are calculated both at individual and team levels for targeted improvements.
15
+ """
16
+
17
+ import logging
18
+ from collections import defaultdict
19
+ from datetime import datetime, timedelta, timezone
20
+ from pathlib import Path
21
+ from typing import Any, Optional
22
+
23
+ import numpy as np
24
+ import pandas as pd
25
+ from scipy import stats
26
+
27
+ # Get logger for this module
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ class StoryPointCorrelationAnalyzer:
32
+ """Analyzes story point estimation accuracy and correlations with actual work."""
33
+
34
+ def __init__(self, anonymize: bool = False, identity_resolver=None):
35
+ """Initialize the correlation analyzer.
36
+
37
+ Args:
38
+ anonymize: Whether to anonymize developer names in reports
39
+ identity_resolver: Identity resolver for canonical developer names
40
+ """
41
+ self.anonymize = anonymize
42
+ self.identity_resolver = identity_resolver
43
+ self._anonymization_map: dict[str, str] = {}
44
+ self._anonymous_counter = 0
45
+
46
def calculate_weekly_correlations(
    self,
    commits: list[dict[str, Any]],
    prs: list[dict[str, Any]],
    pm_data: Optional[dict[str, Any]] = None,
    weeks: int = 12
) -> dict[str, Any]:
    """Compute weekly story point correlations against actual work metrics.

    WHY: Weekly aggregation aligns analysis periods with typical sprint
    cycles, so the results feed directly into planning and retrospectives.

    Args:
        commits: Commit records carrying story points and churn metrics
        prs: Pull request records with story points
        pm_data: PM platform payload with issue correlations
        weeks: Size of the analysis window, in weeks

    Returns:
        Dictionary containing weekly correlation metrics and analysis
    """
    logger.debug(f"Starting weekly correlation analysis for {weeks} weeks")

    # Analysis window: now back through `weeks` weeks.
    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(weeks=weeks)
    logger.debug(f"Analysis period: {start_date} to {end_date}")

    # Bucket raw activity by (week, developer) before correlating.
    weekly_metrics = self._aggregate_weekly_metrics(commits, prs, pm_data, start_date, end_date)

    # Per-week correlation pass.
    correlation_results = {
        week_start: self._calculate_week_correlations(week_data)
        for week_start, week_data in weekly_metrics.items()
    }
    logger.debug(f"Calculated correlations for {len(correlation_results)} weeks")

    return {
        "weekly_correlations": correlation_results,
        "summary_stats": self._calculate_correlation_summary(correlation_results),
        "trend_analysis": self._analyze_correlation_trends(correlation_results),
        "developer_accuracy": self._analyze_developer_accuracy(weekly_metrics),
        "recommendations": self._generate_correlation_recommendations(correlation_results, weekly_metrics)
    }
95
+
96
def analyze_estimation_accuracy(
    self,
    commits: list[dict[str, Any]],
    pm_data: Optional[dict[str, Any]] = None,
    weeks: int = 12
) -> dict[str, Any]:
    """Compare estimated story points (PM platform) against actual ones.

    WHY: Surfacing systematic over/under-estimation gives teams targeted
    feedback for improving their planning accuracy.

    DESIGN DECISION: Several complementary metrics (absolute error, relative
    error, accuracy percentage) are reported to paint a full picture of
    estimation quality.

    Args:
        commits: Commit records carrying story points
        pm_data: PM platform payload with original story point estimates
        weeks: Size of the analysis window, in weeks

    Returns:
        Dictionary containing estimation accuracy analysis
    """
    logger.debug("Starting estimation accuracy analysis")

    # Without PM platform data there are no original estimates to compare.
    if not pm_data or "correlations" not in pm_data:
        logger.warning("No PM data available for estimation accuracy analysis")
        return self._empty_accuracy_analysis()

    pairs = self._extract_estimation_pairs(commits, pm_data, weeks)
    if not pairs:
        logger.warning("No estimation pairs found for accuracy analysis")
        return self._empty_accuracy_analysis()

    # Three complementary views: overall, per-developer, and per story size.
    overall = self._calculate_accuracy_metrics(pairs)
    by_developer = self._analyze_developer_estimation_accuracy(pairs)
    by_size = self._analyze_size_based_accuracy(pairs)

    return {
        "overall_accuracy": overall,
        "developer_accuracy": by_developer,
        "size_based_accuracy": by_size,
        "improvement_suggestions": self._generate_accuracy_recommendations(
            overall, by_developer, by_size
        )
    }
148
+
149
def calculate_velocity_metrics(
    self,
    commits: list[dict[str, Any]],
    prs: list[dict[str, Any]],
    pm_data: Optional[dict[str, Any]] = None,
    weeks: int = 12
) -> dict[str, Any]:
    """Compute velocity trends and patterns over the analysis window.

    WHY: Tracking velocity over time reveals factors that impact delivery
    speed and supports better sprint planning and capacity management.

    Args:
        commits: Commit records carrying story points
        prs: Pull request records
        pm_data: PM platform payload for additional context (currently
            accepted for interface symmetry; not read here)
        weeks: Size of the analysis window, in weeks

    Returns:
        Dictionary containing velocity metrics and trends
    """
    logger.debug(f"Calculating velocity metrics for {weeks} weeks")

    # Analysis window: now back through `weeks` weeks.
    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(weeks=weeks)

    weekly_velocity = self._aggregate_weekly_velocity(commits, prs, window_start, window_end)
    per_developer = self._analyze_developer_velocity(commits, window_start, window_end)

    return {
        "weekly_velocity": weekly_velocity,
        "trends": self._calculate_velocity_trends(weekly_velocity),
        "developer_velocity": per_developer,
        "predictability": self._calculate_velocity_predictability(weekly_velocity),
        "capacity_analysis": self._analyze_team_capacity(weekly_velocity, per_developer)
    }
196
+
197
def generate_correlation_report(
    self,
    commits: list[dict[str, Any]],
    prs: list[dict[str, Any]],
    pm_data: Optional[dict[str, Any]],
    output_path: Path,
    weeks: int = 12
) -> Path:
    """Write the full story point correlation report as a CSV file.

    WHY: CSV imports cleanly into spreadsheet tools, making the detailed
    correlation data easy to analyze further and share with stakeholders.

    Args:
        commits: Commit records carrying story points
        prs: Pull request records
        pm_data: PM platform payload with correlations
        output_path: Destination path for the CSV file
        weeks: Size of the analysis window, in weeks

    Returns:
        Path to the generated CSV report

    Raises:
        Exception: re-raised from any underlying analysis failure, after an
            empty placeholder report has been written.
    """
    logger.debug(f"Generating story point correlation report: {output_path}")

    try:
        # Run all three analyses over the same window.
        correlations = self.calculate_weekly_correlations(commits, prs, pm_data, weeks)
        accuracy = self.analyze_estimation_accuracy(commits, pm_data, weeks)
        velocity = self.calculate_velocity_metrics(commits, prs, pm_data, weeks)

        rows = self._build_correlation_csv_rows(correlations, accuracy, velocity)

        if not rows:
            # No data: still emit a header-only CSV so downstream tooling
            # always finds a file.
            self._write_empty_correlation_csv(output_path)
            logger.debug("Generated empty correlation report (no data)")
        else:
            pd.DataFrame(rows).to_csv(output_path, index=False)
            logger.debug(f"Generated correlation report with {len(rows)} rows")

        return output_path

    except Exception as e:
        logger.error(f"Error generating story point correlation report: {e}")
        # Emit an empty-but-valid report file before propagating the failure.
        self._write_empty_correlation_csv(output_path)
        raise
249
+
250
+ def _aggregate_weekly_metrics(
251
+ self,
252
+ commits: list[dict[str, Any]],
253
+ prs: list[dict[str, Any]],
254
+ pm_data: Optional[dict[str, Any]],
255
+ start_date: datetime,
256
+ end_date: datetime
257
+ ) -> dict[datetime, dict[str, dict[str, Any]]]:
258
+ """Aggregate metrics by week and developer for correlation analysis."""
259
+ weekly_metrics = defaultdict(lambda: defaultdict(lambda: {
260
+ "story_points": 0,
261
+ "commits": 0,
262
+ "lines_added": 0,
263
+ "lines_removed": 0,
264
+ "files_changed": 0,
265
+ "prs": 0,
266
+ "complexity_delta": 0.0,
267
+ "time_spent_hours": 0.0, # Estimated from commit frequency
268
+ "estimated_story_points": 0, # From PM platform
269
+ "actual_story_points": 0 # From commits
270
+ }))
271
+
272
+ # Process commits
273
+ for commit in commits:
274
+ timestamp = self._ensure_timezone_aware(commit.get("timestamp"))
275
+ if not timestamp or timestamp < start_date or timestamp > end_date:
276
+ continue
277
+
278
+ week_start = self._get_week_start(timestamp)
279
+ developer_id = commit.get("canonical_id", commit.get("author_email", "unknown"))
280
+
281
+ metrics = weekly_metrics[week_start][developer_id]
282
+
283
+ # Aggregate commit metrics
284
+ metrics["commits"] += 1
285
+ metrics["story_points"] += commit.get("story_points", 0) or 0
286
+ metrics["actual_story_points"] += commit.get("story_points", 0) or 0
287
+ metrics["lines_added"] += commit.get("insertions", 0) or 0
288
+ metrics["lines_removed"] += commit.get("deletions", 0) or 0
289
+ metrics["files_changed"] += commit.get("files_changed", 0) or 0
290
+ metrics["complexity_delta"] += commit.get("complexity_delta", 0.0) or 0.0
291
+
292
+ # Process PRs
293
+ for pr in prs:
294
+ created_at = self._ensure_timezone_aware(pr.get("created_at"))
295
+ if not created_at or created_at < start_date or created_at > end_date:
296
+ continue
297
+
298
+ week_start = self._get_week_start(created_at)
299
+ developer_id = pr.get("canonical_id", pr.get("author", "unknown"))
300
+
301
+ if developer_id in weekly_metrics[week_start]:
302
+ weekly_metrics[week_start][developer_id]["prs"] += 1
303
+
304
+ # Add PM platform data if available
305
+ if pm_data and "correlations" in pm_data:
306
+ for correlation in pm_data["correlations"]:
307
+ commit_date = correlation.get("commit_date")
308
+ if not commit_date:
309
+ continue
310
+
311
+ timestamp = self._ensure_timezone_aware(
312
+ datetime.fromisoformat(commit_date.replace("Z", "+00:00"))
313
+ if isinstance(commit_date, str) else commit_date
314
+ )
315
+
316
+ if timestamp < start_date or timestamp > end_date:
317
+ continue
318
+
319
+ week_start = self._get_week_start(timestamp)
320
+ developer_id = correlation.get("commit_author", "unknown")
321
+
322
+ if developer_id in weekly_metrics[week_start]:
323
+ estimated_sp = correlation.get("story_points", 0) or 0
324
+ weekly_metrics[week_start][developer_id]["estimated_story_points"] += estimated_sp
325
+
326
+ # Convert defaultdicts to regular dicts for JSON serialization
327
+ return {week: dict(developers) for week, developers in weekly_metrics.items()}
328
+
329
def _calculate_week_correlations(self, week_data: dict[str, dict[str, Any]]) -> dict[str, Any]:
    """Compute story-point correlations for one week of developer metrics."""
    # Correlation is meaningless with fewer than two developers in the week.
    if len(week_data) < 2:
        return self._empty_week_correlations()

    metric_keys = ["sp_commits", "sp_lines", "sp_files", "sp_prs", "sp_complexity"]

    # Build parallel series, one entry per developer.
    developers = list(week_data.keys())
    rows = list(week_data.values())
    story_points = [m["story_points"] for m in rows]
    commits = [m["commits"] for m in rows]
    lines_changed = [m["lines_added"] + m["lines_removed"] for m in rows]
    files_changed = [m["files_changed"] for m in rows]
    prs = [m["prs"] for m in rows]
    complexity = [m["complexity_delta"] for m in rows]

    try:
        if len(story_points) < 2:
            # Not enough observations for a correlation coefficient.
            logger.debug(f"Insufficient data for correlation: only {len(story_points)} data points")
            correlations = dict.fromkeys(metric_keys, 0.0)
        elif np.std(story_points) == 0:
            # Pearson's r is undefined when one series has no variance.
            logger.debug("All story points are the same value - no variance for correlation")
            correlations = dict.fromkeys(metric_keys, 0.0)
        else:
            series_by_metric = {
                "sp_commits": commits,
                "sp_lines": lines_changed,
                "sp_files": files_changed,
                "sp_prs": prs,
                "sp_complexity": complexity,
            }
            correlations = {
                name: float(stats.pearsonr(story_points, series)[0])
                for name, series in series_by_metric.items()
            }
            logger.debug(f"Calculated correlations with {len(story_points)} data points")

    except Exception as e:
        logger.warning(f"Error calculating correlations: {e}")
        correlations = dict.fromkeys(metric_keys, 0.0)

    return {
        "correlations": correlations,
        "sample_size": len(developers),
        "total_story_points": sum(story_points),
        "total_commits": sum(commits),
        "total_lines_changed": sum(lines_changed)
    }
383
+
384
+ def _calculate_correlation_summary(self, correlation_results: dict[datetime, dict[str, Any]]) -> dict[str, Any]:
385
+ """Calculate summary statistics across all weeks."""
386
+ if not correlation_results:
387
+ return {"avg_correlations": {}, "trend_direction": "stable", "strength": "weak"}
388
+
389
+ # Aggregate correlations across weeks
390
+ all_correlations = defaultdict(list)
391
+
392
+ for week_data in correlation_results.values():
393
+ correlations = week_data.get("correlations", {})
394
+ for metric, value in correlations.items():
395
+ if not np.isnan(value): # Filter out NaN values
396
+ all_correlations[metric].append(value)
397
+
398
+ # Calculate averages
399
+ avg_correlations = {}
400
+ for metric, values in all_correlations.items():
401
+ if values:
402
+ avg_correlations[metric] = float(np.mean(values))
403
+ else:
404
+ avg_correlations[metric] = 0.0
405
+
406
+ # Determine overall correlation strength
407
+ avg_strength = np.mean(list(avg_correlations.values()))
408
+ if avg_strength > 0.7:
409
+ strength = "strong"
410
+ elif avg_strength > 0.4:
411
+ strength = "moderate"
412
+ else:
413
+ strength = "weak"
414
+
415
+ return {
416
+ "avg_correlations": avg_correlations,
417
+ "strength": strength,
418
+ "weeks_analyzed": len(correlation_results),
419
+ "max_correlation": max(avg_correlations.values()) if avg_correlations else 0.0,
420
+ "min_correlation": min(avg_correlations.values()) if avg_correlations else 0.0
421
+ }
422
+
423
+ def _analyze_correlation_trends(self, correlation_results: dict[datetime, dict[str, Any]]) -> dict[str, Any]:
424
+ """Analyze trends in correlations over time."""
425
+ if len(correlation_results) < 3:
426
+ return {"trend_direction": "insufficient_data", "trend_strength": 0.0}
427
+
428
+ # Sort by week for trend analysis
429
+ sorted_weeks = sorted(correlation_results.keys())
430
+
431
+ # Calculate trend for each correlation metric
432
+ trends = {}
433
+
434
+ for metric in ["sp_commits", "sp_lines", "sp_files", "sp_prs", "sp_complexity"]:
435
+ values = []
436
+ weeks = []
437
+
438
+ for week in sorted_weeks:
439
+ week_correlations = correlation_results[week].get("correlations", {})
440
+ if metric in week_correlations and not np.isnan(week_correlations[metric]):
441
+ values.append(week_correlations[metric])
442
+ weeks.append(len(weeks)) # Use index as x-value
443
+
444
+ if len(values) >= 3: # Need at least 3 points for trend
445
+ slope, intercept, r_value, p_value, std_err = stats.linregress(weeks, values)
446
+ trends[metric] = {
447
+ "slope": float(slope),
448
+ "r_squared": float(r_value ** 2),
449
+ "p_value": float(p_value),
450
+ "direction": "improving" if slope > 0.01 else "declining" if slope < -0.01 else "stable"
451
+ }
452
+ else:
453
+ trends[metric] = {"slope": 0.0, "direction": "insufficient_data"}
454
+
455
+ return trends
456
+
457
+ def _analyze_developer_accuracy(self, weekly_metrics: dict[datetime, dict[str, dict[str, Any]]]) -> dict[str, Any]:
458
+ """Analyze story point estimation accuracy by developer."""
459
+ developer_totals = defaultdict(lambda: {
460
+ "estimated_total": 0,
461
+ "actual_total": 0,
462
+ "weeks_active": 0,
463
+ "accuracy_scores": []
464
+ })
465
+
466
+ for week_data in weekly_metrics.values():
467
+ for dev_id, metrics in week_data.items():
468
+ estimated = metrics.get("estimated_story_points", 0)
469
+ actual = metrics.get("actual_story_points", 0)
470
+
471
+ if estimated > 0 or actual > 0: # Developer was active
472
+ dev_stats = developer_totals[dev_id]
473
+ dev_stats["estimated_total"] += estimated
474
+ dev_stats["actual_total"] += actual
475
+ dev_stats["weeks_active"] += 1
476
+
477
+ # Calculate weekly accuracy if both values exist
478
+ if estimated > 0 and actual > 0:
479
+ accuracy = 1.0 - abs(estimated - actual) / max(estimated, actual)
480
+ dev_stats["accuracy_scores"].append(accuracy)
481
+
482
+ # Calculate final accuracy metrics for each developer
483
+ developer_accuracy = {}
484
+
485
+ for dev_id, dev_stats in developer_totals.items():
486
+ if dev_stats["weeks_active"] > 0:
487
+ # Overall accuracy based on totals
488
+ if dev_stats["estimated_total"] > 0 and dev_stats["actual_total"] > 0:
489
+ overall_accuracy = 1.0 - abs(dev_stats["estimated_total"] - dev_stats["actual_total"]) / max(dev_stats["estimated_total"], dev_stats["actual_total"])
490
+ else:
491
+ overall_accuracy = 0.0
492
+
493
+ # Average weekly accuracy
494
+ if dev_stats["accuracy_scores"]:
495
+ avg_weekly_accuracy = float(np.mean(dev_stats["accuracy_scores"]))
496
+ consistency = 1.0 - float(np.std(dev_stats["accuracy_scores"]))
497
+ else:
498
+ avg_weekly_accuracy = 0.0
499
+ consistency = 0.0
500
+
501
+ developer_accuracy[self._anonymize_value(dev_id, "name")] = {
502
+ "overall_accuracy": float(overall_accuracy),
503
+ "avg_weekly_accuracy": avg_weekly_accuracy,
504
+ "consistency": consistency,
505
+ "weeks_active": dev_stats["weeks_active"],
506
+ "total_estimated": dev_stats["estimated_total"],
507
+ "total_actual": dev_stats["actual_total"],
508
+ "estimation_ratio": dev_stats["actual_total"] / max(dev_stats["estimated_total"], 1)
509
+ }
510
+
511
+ return developer_accuracy
512
+
513
def _generate_correlation_recommendations(
    self, correlation_results: dict[datetime, dict[str, Any]], weekly_metrics: dict[datetime, dict[str, dict[str, Any]]]
) -> list[dict[str, str]]:
    """Generate actionable recommendations based on correlation analysis.

    Args:
        correlation_results: Per-week correlation output.
        weekly_metrics: Per-week, per-developer aggregated metrics.

    Returns:
        List of recommendation dicts with type, priority, title,
        description, and action keys.
    """
    recommendations = []

    summary = self._calculate_correlation_summary(correlation_results)
    avg_correlations = summary.get("avg_correlations", {})

    # Check story points to commits correlation
    sp_commits_corr = avg_correlations.get("sp_commits", 0)
    if sp_commits_corr < 0.3:
        recommendations.append({
            "type": "process_improvement",
            "priority": "high",
            "title": "Weak Story Points to Commits Correlation",
            "description": f"Story points show weak correlation with commit count ({sp_commits_corr:.2f}). Consider story point training or breaking down large stories.",
            "action": "Review story point estimation guidelines and provide team training"
        })

    # Check story points to lines of code correlation
    sp_lines_corr = avg_correlations.get("sp_lines", 0)
    if sp_lines_corr < 0.4:
        recommendations.append({
            "type": "estimation_calibration",
            "priority": "medium",
            "title": "Story Points Don't Correlate with Code Changes",
            "description": f"Story points show weak correlation with lines of code changed ({sp_lines_corr:.2f}). This may indicate estimation inconsistency.",
            "action": "Analyze whether story points reflect complexity vs. effort, and align team understanding"
        })

    # Analyze developer accuracy.
    # FIX(idiom): the comprehension variable was previously named "stats",
    # shadowing the module-level scipy.stats import; renamed to "acc".
    developer_accuracy = self._analyze_developer_accuracy(weekly_metrics)
    low_accuracy_devs = [
        dev for dev, acc in developer_accuracy.items()
        if acc["overall_accuracy"] < 0.5 and acc["weeks_active"] >= 2
    ]

    if low_accuracy_devs:
        recommendations.append({
            "type": "individual_coaching",
            "priority": "medium",
            "title": "Developers Need Estimation Training",
            "description": f"{len(low_accuracy_devs)} developers have low estimation accuracy. Consider individual coaching sessions.",
            "action": f"Provide estimation training for: {', '.join(low_accuracy_devs[:3])}"
        })

    # Check overall correlation strength
    if summary.get("strength") == "weak":
        recommendations.append({
            "type": "process_review",
            "priority": "high",
            "title": "Overall Weak Correlations",
            "description": "Story points show weak correlations across all work metrics. The estimation process may need fundamental review.",
            "action": "Conduct team retrospective on story point estimation process and consider alternative estimation methods"
        })

    return recommendations
571
+
572
+ def _extract_estimation_pairs(
573
+ self, commits: list[dict[str, Any]], pm_data: dict[str, Any], weeks: int
574
+ ) -> list[tuple[int, int, str]]:
575
+ """Extract (estimated, actual, developer) pairs for accuracy analysis."""
576
+ pairs = []
577
+
578
+ if not pm_data or "correlations" not in pm_data:
579
+ return pairs
580
+
581
+ # Calculate date range
582
+ end_date = datetime.now(timezone.utc)
583
+ start_date = end_date - timedelta(weeks=weeks)
584
+
585
+ for correlation in pm_data["correlations"]:
586
+ commit_date = correlation.get("commit_date")
587
+ if not commit_date:
588
+ continue
589
+
590
+ timestamp = self._ensure_timezone_aware(
591
+ datetime.fromisoformat(commit_date.replace("Z", "+00:00"))
592
+ if isinstance(commit_date, str) else commit_date
593
+ )
594
+
595
+ if timestamp < start_date or timestamp > end_date:
596
+ continue
597
+
598
+ estimated_sp = correlation.get("story_points", 0) or 0
599
+ commit_hash = correlation.get("commit_hash", "")
600
+ developer = correlation.get("commit_author", "unknown")
601
+
602
+ # Find matching commit for actual story points
603
+ matching_commit = next(
604
+ (c for c in commits if c.get("hash", "") == commit_hash), None
605
+ )
606
+
607
+ if matching_commit:
608
+ actual_sp = matching_commit.get("story_points", 0) or 0
609
+ if estimated_sp > 0 and actual_sp > 0: # Valid pair
610
+ pairs.append((estimated_sp, actual_sp, developer))
611
+
612
+ return pairs
613
+
614
+ def _calculate_accuracy_metrics(self, estimation_pairs: list[tuple[int, int, str]]) -> dict[str, Any]:
615
+ """Calculate overall estimation accuracy metrics."""
616
+ if not estimation_pairs:
617
+ return {"mean_absolute_error": 0, "mean_relative_error": 0, "accuracy_percentage": 0}
618
+
619
+ estimated_values = [pair[0] for pair in estimation_pairs]
620
+ actual_values = [pair[1] for pair in estimation_pairs]
621
+
622
+ # Mean Absolute Error
623
+ mae = float(np.mean([abs(est - act) for est, act in zip(estimated_values, actual_values)]))
624
+
625
+ # Mean Relative Error (as percentage)
626
+ relative_errors = [
627
+ abs(est - act) / max(est, act) * 100
628
+ for est, act in zip(estimated_values, actual_values)
629
+ if max(est, act) > 0
630
+ ]
631
+ mre = float(np.mean(relative_errors)) if relative_errors else 0
632
+
633
+ # Accuracy percentage (within 20% tolerance)
634
+ accurate_estimates = sum(
635
+ 1 for est, act in zip(estimated_values, actual_values)
636
+ if abs(est - act) / max(est, act) <= 0.2
637
+ )
638
+ accuracy_percentage = (accurate_estimates / len(estimation_pairs)) * 100 if estimation_pairs else 0
639
+
640
+ return {
641
+ "mean_absolute_error": mae,
642
+ "mean_relative_error": mre,
643
+ "accuracy_percentage": float(accuracy_percentage),
644
+ "total_comparisons": len(estimation_pairs),
645
+ "correlation_coefficient": float(stats.pearsonr(estimated_values, actual_values)[0]) if len(estimation_pairs) > 1 else 0
646
+ }
647
+
648
+ def _analyze_developer_estimation_accuracy(self, estimation_pairs: list[tuple[int, int, str]]) -> dict[str, dict[str, Any]]:
649
+ """Analyze estimation accuracy by individual developer."""
650
+ developer_pairs = defaultdict(list)
651
+
652
+ for estimated, actual, developer in estimation_pairs:
653
+ developer_pairs[developer].append((estimated, actual))
654
+
655
+ developer_accuracy = {}
656
+
657
+ for developer, pairs in developer_pairs.items():
658
+ if len(pairs) >= 2: # Need multiple estimates for meaningful analysis
659
+ estimated_values = [pair[0] for pair in pairs]
660
+ actual_values = [pair[1] for pair in pairs]
661
+
662
+ # Calculate metrics for this developer
663
+ mae = float(np.mean([abs(est - act) for est, act in zip(estimated_values, actual_values)]))
664
+
665
+ relative_errors = [
666
+ abs(est - act) / max(est, act) * 100
667
+ for est, act in zip(estimated_values, actual_values)
668
+ ]
669
+ mre = float(np.mean(relative_errors))
670
+
671
+ accurate_count = sum(
672
+ 1 for est, act in zip(estimated_values, actual_values)
673
+ if abs(est - act) / max(est, act) <= 0.2
674
+ )
675
+ accuracy_pct = (accurate_count / len(pairs)) * 100
676
+
677
+ developer_accuracy[self._anonymize_value(developer, "name")] = {
678
+ "mean_absolute_error": mae,
679
+ "mean_relative_error": mre,
680
+ "accuracy_percentage": float(accuracy_pct),
681
+ "estimates_count": len(pairs),
682
+ "tends_to_overestimate": sum(estimated_values) > sum(actual_values),
683
+ "consistency": 1.0 - float(np.std(relative_errors) / 100) if relative_errors else 0
684
+ }
685
+
686
+ return developer_accuracy
687
+
688
+ def _analyze_size_based_accuracy(self, estimation_pairs: list[tuple[int, int, str]]) -> dict[str, dict[str, Any]]:
689
+ """Analyze estimation accuracy by story point size ranges."""
690
+ size_ranges = {
691
+ "small": (1, 3),
692
+ "medium": (4, 8),
693
+ "large": (9, 21),
694
+ "extra_large": (22, 100)
695
+ }
696
+
697
+ size_accuracy = {}
698
+
699
+ for size_name, (min_sp, max_sp) in size_ranges.items():
700
+ size_pairs = [
701
+ (est, act) for est, act, _ in estimation_pairs
702
+ if min_sp <= est <= max_sp
703
+ ]
704
+
705
+ if size_pairs:
706
+ estimated_values = [pair[0] for pair in size_pairs]
707
+ actual_values = [pair[1] for pair in size_pairs]
708
+
709
+ mae = float(np.mean([abs(est - act) for est, act in zip(estimated_values, actual_values)]))
710
+
711
+ relative_errors = [
712
+ abs(est - act) / max(est, act) * 100
713
+ for est, act in zip(estimated_values, actual_values)
714
+ ]
715
+ mre = float(np.mean(relative_errors))
716
+
717
+ size_accuracy[size_name] = {
718
+ "mean_absolute_error": mae,
719
+ "mean_relative_error": mre,
720
+ "sample_size": len(size_pairs),
721
+ "avg_estimated": float(np.mean(estimated_values)),
722
+ "avg_actual": float(np.mean(actual_values))
723
+ }
724
+ else:
725
+ size_accuracy[size_name] = {
726
+ "mean_absolute_error": 0,
727
+ "mean_relative_error": 0,
728
+ "sample_size": 0,
729
+ "avg_estimated": 0,
730
+ "avg_actual": 0
731
+ }
732
+
733
+ return size_accuracy
734
+
735
+ def _aggregate_weekly_velocity(
736
+ self, commits: list[dict[str, Any]], prs: list[dict[str, Any]], start_date: datetime, end_date: datetime
737
+ ) -> dict[str, dict[str, Any]]:
738
+ """Aggregate velocity metrics by week."""
739
+ weekly_velocity = defaultdict(lambda: {
740
+ "story_points_completed": 0,
741
+ "commits": 0,
742
+ "prs_merged": 0,
743
+ "developers_active": set()
744
+ })
745
+
746
+ # Process commits
747
+ for commit in commits:
748
+ timestamp = self._ensure_timezone_aware(commit.get("timestamp"))
749
+ if not timestamp or timestamp < start_date or timestamp > end_date:
750
+ continue
751
+
752
+ week_start = self._get_week_start(timestamp)
753
+ week_key = week_start.strftime("%Y-%m-%d")
754
+
755
+ weekly_velocity[week_key]["story_points_completed"] += commit.get("story_points", 0) or 0
756
+ weekly_velocity[week_key]["commits"] += 1
757
+ weekly_velocity[week_key]["developers_active"].add(
758
+ commit.get("canonical_id", commit.get("author_email", "unknown"))
759
+ )
760
+
761
+ # Process PRs
762
+ for pr in prs:
763
+ merged_at = self._ensure_timezone_aware(pr.get("merged_at"))
764
+ if not merged_at or merged_at < start_date or merged_at > end_date:
765
+ continue
766
+
767
+ week_start = self._get_week_start(merged_at)
768
+ week_key = week_start.strftime("%Y-%m-%d")
769
+
770
+ weekly_velocity[week_key]["prs_merged"] += 1
771
+
772
+ # Convert sets to counts
773
+ result = {}
774
+ for week_key, metrics in weekly_velocity.items():
775
+ metrics["developers_active"] = len(metrics["developers_active"])
776
+ result[week_key] = dict(metrics)
777
+
778
+ return result
779
+
780
+ def _calculate_velocity_trends(self, weekly_velocity: dict[str, dict[str, Any]]) -> dict[str, Any]:
781
+ """Calculate velocity trend analysis."""
782
+ if len(weekly_velocity) < 3:
783
+ return {"trend": "insufficient_data", "velocity_change": 0}
784
+
785
+ weeks = sorted(weekly_velocity.keys())
786
+ story_points = [weekly_velocity[week]["story_points_completed"] for week in weeks]
787
+
788
+ if not any(sp > 0 for sp in story_points):
789
+ return {"trend": "no_story_points", "velocity_change": 0}
790
+
791
+ # Calculate trend using linear regression
792
+ x_values = list(range(len(weeks)))
793
+ slope, intercept, r_value, p_value, std_err = stats.linregress(x_values, story_points)
794
+
795
+ # Determine trend direction
796
+ if slope > 0.5:
797
+ trend = "improving"
798
+ elif slope < -0.5:
799
+ trend = "declining"
800
+ else:
801
+ trend = "stable"
802
+
803
+ # Calculate velocity change (percentage)
804
+ if len(story_points) >= 2:
805
+ recent_avg = np.mean(story_points[-3:]) if len(story_points) >= 3 else story_points[-1]
806
+ early_avg = np.mean(story_points[:3]) if len(story_points) >= 3 else story_points[0]
807
+ velocity_change = ((recent_avg - early_avg) / max(early_avg, 1)) * 100
808
+ else:
809
+ velocity_change = 0
810
+
811
+ return {
812
+ "trend": trend,
813
+ "velocity_change": float(velocity_change),
814
+ "trend_strength": float(abs(r_value)),
815
+ "slope": float(slope),
816
+ "weeks_analyzed": len(weeks),
817
+ "avg_velocity": float(np.mean(story_points)),
818
+ "velocity_stability": 1.0 - float(np.std(story_points) / max(np.mean(story_points), 1))
819
+ }
820
+
821
+ def _analyze_developer_velocity(
822
+ self, commits: list[dict[str, Any]], start_date: datetime, end_date: datetime
823
+ ) -> dict[str, dict[str, Any]]:
824
+ """Analyze individual developer velocity patterns."""
825
+ developer_metrics = defaultdict(lambda: {
826
+ "total_story_points": 0,
827
+ "total_commits": 0,
828
+ "weeks_active": set(),
829
+ "weekly_velocity": []
830
+ })
831
+
832
+ # Aggregate by developer and week
833
+ for commit in commits:
834
+ timestamp = self._ensure_timezone_aware(commit.get("timestamp"))
835
+ if not timestamp or timestamp < start_date or timestamp > end_date:
836
+ continue
837
+
838
+ developer_id = commit.get("canonical_id", commit.get("author_email", "unknown"))
839
+ week_start = self._get_week_start(timestamp)
840
+
841
+ metrics = developer_metrics[developer_id]
842
+ metrics["total_story_points"] += commit.get("story_points", 0) or 0
843
+ metrics["total_commits"] += 1
844
+ metrics["weeks_active"].add(week_start)
845
+
846
+ # Calculate velocity metrics for each developer
847
+ developer_velocity = {}
848
+
849
+ for dev_id, metrics in developer_metrics.items():
850
+ if metrics["total_commits"] > 0:
851
+ weeks_active = len(metrics["weeks_active"])
852
+ avg_velocity = metrics["total_story_points"] / max(weeks_active, 1)
853
+
854
+ developer_velocity[self._anonymize_value(dev_id, "name")] = {
855
+ "total_story_points": metrics["total_story_points"],
856
+ "total_commits": metrics["total_commits"],
857
+ "weeks_active": weeks_active,
858
+ "avg_weekly_velocity": float(avg_velocity),
859
+ "story_points_per_commit": metrics["total_story_points"] / metrics["total_commits"]
860
+ }
861
+
862
+ return developer_velocity
863
+
864
+ def _calculate_velocity_predictability(self, weekly_velocity: dict[str, dict[str, Any]]) -> dict[str, Any]:
865
+ """Calculate how predictable the team's velocity is."""
866
+ if len(weekly_velocity) < 4:
867
+ return {"predictability": "insufficient_data", "confidence_interval": [0, 0]}
868
+
869
+ story_points = [metrics["story_points_completed"] for metrics in weekly_velocity.values()]
870
+
871
+ if not any(sp > 0 for sp in story_points):
872
+ return {"predictability": "no_velocity_data", "confidence_interval": [0, 0]}
873
+
874
+ mean_velocity = np.mean(story_points)
875
+ std_velocity = np.std(story_points)
876
+ coefficient_variation = std_velocity / max(mean_velocity, 1)
877
+
878
+ # Classify predictability
879
+ if coefficient_variation < 0.2:
880
+ predictability = "high"
881
+ elif coefficient_variation < 0.4:
882
+ predictability = "moderate"
883
+ else:
884
+ predictability = "low"
885
+
886
+ # Calculate 80% confidence interval
887
+ confidence_interval = [
888
+ float(max(0, mean_velocity - 1.28 * std_velocity)),
889
+ float(mean_velocity + 1.28 * std_velocity)
890
+ ]
891
+
892
+ return {
893
+ "predictability": predictability,
894
+ "coefficient_of_variation": float(coefficient_variation),
895
+ "confidence_interval": confidence_interval,
896
+ "mean_velocity": float(mean_velocity),
897
+ "std_deviation": float(std_velocity)
898
+ }
899
+
900
+ def _analyze_team_capacity(
901
+ self, weekly_velocity: dict[str, dict[str, Any]], developer_velocity: dict[str, dict[str, Any]]
902
+ ) -> dict[str, Any]:
903
+ """Analyze team capacity and workload distribution."""
904
+ if not weekly_velocity or not developer_velocity:
905
+ return {"analysis": "insufficient_data"}
906
+
907
+ # Calculate team metrics
908
+ total_developers = len(developer_velocity)
909
+ weeks_analyzed = len(weekly_velocity)
910
+
911
+ # Calculate capacity utilization
912
+ developer_contributions = [dev["total_story_points"] for dev in developer_velocity.values()]
913
+ total_story_points = sum(developer_contributions)
914
+
915
+ if total_story_points == 0:
916
+ return {"analysis": "no_story_points"}
917
+
918
+ # Analyze workload distribution
919
+ [
920
+ (contrib / total_story_points) * 100 for contrib in developer_contributions
921
+ ]
922
+
923
+ # Calculate Gini coefficient for workload inequality
924
+ sorted_contributions = sorted(developer_contributions)
925
+ n = len(sorted_contributions)
926
+ np.cumsum(sorted_contributions)
927
+ gini = (n + 1 - 2 * sum((n + 1 - i) * x for i, x in enumerate(sorted_contributions, 1))) / (n * sum(sorted_contributions))
928
+
929
+ # Capacity recommendations
930
+ recommendations = []
931
+
932
+ # Check for workload imbalance
933
+ if gini > 0.4: # High inequality
934
+ recommendations.append("Consider redistributing workload - significant imbalance detected")
935
+
936
+ # Check for low contributors
937
+ low_contributors = [
938
+ dev for dev, metrics in developer_velocity.items()
939
+ if metrics["avg_weekly_velocity"] < np.mean([m["avg_weekly_velocity"] for m in developer_velocity.values()]) * 0.5
940
+ ]
941
+
942
+ if low_contributors:
943
+ recommendations.append(f"Support developers with low velocity: {', '.join(low_contributors[:3])}")
944
+
945
+ return {
946
+ "total_developers": total_developers,
947
+ "weeks_analyzed": weeks_analyzed,
948
+ "total_story_points": total_story_points,
949
+ "avg_weekly_team_velocity": float(np.mean([w["story_points_completed"] for w in weekly_velocity.values()])),
950
+ "workload_distribution_gini": float(gini),
951
+ "workload_balance": "balanced" if gini < 0.3 else "imbalanced",
952
+ "capacity_recommendations": recommendations,
953
+ "top_contributors": sorted(
954
+ [(dev, metrics["total_story_points"]) for dev, metrics in developer_velocity.items()],
955
+ key=lambda x: x[1], reverse=True
956
+ )[:5]
957
+ }
958
+
959
+ def _build_correlation_csv_rows(
960
+ self,
961
+ weekly_correlations: dict[str, Any],
962
+ estimation_accuracy: dict[str, Any],
963
+ velocity_metrics: dict[str, Any]
964
+ ) -> list[dict[str, Any]]:
965
+ """Build CSV rows from correlation analysis results."""
966
+ rows = []
967
+
968
+ # Add weekly correlation data
969
+ correlation_results = weekly_correlations.get("weekly_correlations", {})
970
+
971
+ for week_start, week_data in correlation_results.items():
972
+ correlations = week_data.get("correlations", {})
973
+
974
+ row = {
975
+ "week_start": week_start.strftime("%Y-%m-%d"),
976
+ "metric_type": "weekly_correlations",
977
+ "sp_commits_correlation": round(correlations.get("sp_commits", 0), 3),
978
+ "sp_lines_correlation": round(correlations.get("sp_lines", 0), 3),
979
+ "sp_files_correlation": round(correlations.get("sp_files", 0), 3),
980
+ "sp_prs_correlation": round(correlations.get("sp_prs", 0), 3),
981
+ "sp_complexity_correlation": round(correlations.get("sp_complexity", 0), 3),
982
+ "sample_size": week_data.get("sample_size", 0),
983
+ "total_story_points": week_data.get("total_story_points", 0),
984
+ "total_commits": week_data.get("total_commits", 0)
985
+ }
986
+ rows.append(row)
987
+
988
+ # Add velocity data
989
+ weekly_velocity = velocity_metrics.get("weekly_velocity", {})
990
+ for week_key, velocity_data in weekly_velocity.items():
991
+ row = {
992
+ "week_start": week_key,
993
+ "metric_type": "velocity",
994
+ "story_points_completed": velocity_data.get("story_points_completed", 0),
995
+ "commits_count": velocity_data.get("commits", 0),
996
+ "prs_merged": velocity_data.get("prs_merged", 0),
997
+ "developers_active": velocity_data.get("developers_active", 0),
998
+ "velocity_trend": velocity_metrics.get("trends", {}).get("trend", "unknown")
999
+ }
1000
+ rows.append(row)
1001
+
1002
+ # Add developer accuracy summary
1003
+ developer_accuracy = estimation_accuracy.get("developer_accuracy", {})
1004
+ for developer, accuracy_data in developer_accuracy.items():
1005
+ row = {
1006
+ "developer_name": developer,
1007
+ "metric_type": "developer_accuracy",
1008
+ "overall_accuracy": round(accuracy_data.get("overall_accuracy", 0), 3),
1009
+ "avg_weekly_accuracy": round(accuracy_data.get("avg_weekly_accuracy", 0), 3),
1010
+ "consistency": round(accuracy_data.get("consistency", 0), 3),
1011
+ "weeks_active": accuracy_data.get("weeks_active", 0),
1012
+ "total_estimated_sp": accuracy_data.get("total_estimated", 0),
1013
+ "total_actual_sp": accuracy_data.get("total_actual", 0),
1014
+ "estimation_ratio": round(accuracy_data.get("estimation_ratio", 0), 3)
1015
+ }
1016
+ rows.append(row)
1017
+
1018
+ return rows
1019
+
1020
+ def _write_empty_correlation_csv(self, output_path: Path) -> None:
1021
+ """Write empty CSV file with proper headers."""
1022
+ headers = [
1023
+ "week_start", "metric_type", "developer_name",
1024
+ "sp_commits_correlation", "sp_lines_correlation", "sp_files_correlation",
1025
+ "sp_prs_correlation", "sp_complexity_correlation", "sample_size",
1026
+ "total_story_points", "total_commits", "story_points_completed",
1027
+ "commits_count", "prs_merged", "developers_active", "velocity_trend",
1028
+ "overall_accuracy", "avg_weekly_accuracy", "consistency",
1029
+ "weeks_active", "total_estimated_sp", "total_actual_sp", "estimation_ratio"
1030
+ ]
1031
+
1032
+ df = pd.DataFrame(columns=headers)
1033
+ df.to_csv(output_path, index=False)
1034
+
1035
+ def _empty_accuracy_analysis(self) -> dict[str, Any]:
1036
+ """Return empty accuracy analysis structure."""
1037
+ return {
1038
+ "overall_accuracy": {"mean_absolute_error": 0, "mean_relative_error": 0, "accuracy_percentage": 0},
1039
+ "developer_accuracy": {},
1040
+ "size_based_accuracy": {},
1041
+ "improvement_suggestions": []
1042
+ }
1043
+
1044
+ def _empty_week_correlations(self) -> dict[str, Any]:
1045
+ """Return empty week correlations structure."""
1046
+ return {
1047
+ "correlations": {k: 0.0 for k in ["sp_commits", "sp_lines", "sp_files", "sp_prs", "sp_complexity"]},
1048
+ "sample_size": 0,
1049
+ "total_story_points": 0,
1050
+ "total_commits": 0,
1051
+ "total_lines_changed": 0
1052
+ }
1053
+
1054
+ def _generate_accuracy_recommendations(
1055
+ self, accuracy_metrics: dict[str, Any], developer_accuracy: dict[str, dict[str, Any]], size_accuracy: dict[str, dict[str, Any]]
1056
+ ) -> list[dict[str, str]]:
1057
+ """Generate recommendations for improving estimation accuracy."""
1058
+ recommendations = []
1059
+
1060
+ overall_accuracy = accuracy_metrics.get("accuracy_percentage", 0)
1061
+
1062
+ if overall_accuracy < 50:
1063
+ recommendations.append({
1064
+ "priority": "high",
1065
+ "title": "Low Overall Estimation Accuracy",
1066
+ "description": f"Only {overall_accuracy:.1f}% of estimates are within 20% tolerance",
1067
+ "action": "Conduct team workshop on story point estimation techniques"
1068
+ })
1069
+
1070
+ # Check for developers with low accuracy
1071
+ low_accuracy_devs = [
1072
+ dev for dev, stats in developer_accuracy.items()
1073
+ if stats.get("overall_accuracy", 0) < 0.4
1074
+ ]
1075
+
1076
+ if low_accuracy_devs:
1077
+ recommendations.append({
1078
+ "priority": "medium",
1079
+ "title": "Individual Estimation Training Needed",
1080
+ "description": f"{len(low_accuracy_devs)} developers need estimation improvement",
1081
+ "action": f"Provide 1-on-1 training for: {', '.join(low_accuracy_devs[:3])}"
1082
+ })
1083
+
1084
+ # Check size-based accuracy patterns
1085
+ large_story_accuracy = size_accuracy.get("large", {}).get("mean_relative_error", 0)
1086
+ if large_story_accuracy > 40: # High error rate for large stories
1087
+ recommendations.append({
1088
+ "priority": "medium",
1089
+ "title": "Large Stories Are Poorly Estimated",
1090
+ "description": f"Large stories (9-21 pts) have {large_story_accuracy:.1f}% average error",
1091
+ "action": "Encourage breaking down large stories into smaller, more estimable pieces"
1092
+ })
1093
+
1094
+ return recommendations
1095
+
1096
+ def _ensure_timezone_aware(self, dt: Any) -> Optional[datetime]:
1097
+ """Ensure datetime is timezone-aware UTC."""
1098
+ if not dt:
1099
+ return None
1100
+
1101
+ if isinstance(dt, str):
1102
+ try:
1103
+ dt = datetime.fromisoformat(dt.replace("Z", "+00:00"))
1104
+ except (ValueError, AttributeError):
1105
+ return None
1106
+
1107
+ if not isinstance(dt, datetime):
1108
+ return None
1109
+
1110
+ if dt.tzinfo is None:
1111
+ return dt.replace(tzinfo=timezone.utc)
1112
+ elif dt.tzinfo != timezone.utc:
1113
+ return dt.astimezone(timezone.utc)
1114
+ else:
1115
+ return dt
1116
+
1117
+ def _get_week_start(self, date: datetime) -> datetime:
1118
+ """Get Monday of the week for consistent week boundaries."""
1119
+ if date.tzinfo is None:
1120
+ date = date.replace(tzinfo=timezone.utc)
1121
+ elif date.tzinfo != timezone.utc:
1122
+ date = date.astimezone(timezone.utc)
1123
+
1124
+ days_since_monday = date.weekday()
1125
+ monday = date - timedelta(days=days_since_monday)
1126
+ return monday.replace(hour=0, minute=0, second=0, microsecond=0)
1127
+
1128
+ def _anonymize_value(self, value: str, field_type: str) -> str:
1129
+ """Anonymize values if anonymization is enabled."""
1130
+ if not self.anonymize or not value:
1131
+ return value
1132
+
1133
+ if value not in self._anonymization_map:
1134
+ self._anonymous_counter += 1
1135
+ if field_type == "name":
1136
+ anonymous = f"Developer{self._anonymous_counter}"
1137
+ elif field_type == "id":
1138
+ anonymous = f"ID{self._anonymous_counter:04d}"
1139
+ else:
1140
+ anonymous = f"anon{self._anonymous_counter}"
1141
+
1142
+ self._anonymization_map[value] = anonymous
1143
+
1144
+ return self._anonymization_map[value]