gitflow-analytics 1.0.1-py3-none-any.whl → 1.3.6-py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/reports/classification_writer.py
@@ -0,0 +1,924 @@
+"""Classification report generator for GitFlow Analytics.
+
+This module provides comprehensive reporting capabilities for commit classification
+results, including aggregate statistics, developer breakdowns, confidence analysis,
+and temporal patterns. Designed to integrate with existing GitFlow Analytics
+reporting infrastructure.
+"""
+
+import csv
+import json
+import logging
+from collections import Counter, defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class ClassificationReportGenerator:
+    """Generator for comprehensive commit classification reports.
+
+    This class creates detailed reports from commit classification results,
+    providing insights into development patterns, team productivity, and
+    code quality metrics through the lens of commit categorization.
+
+    Key capabilities:
+    - Aggregate classification statistics
+    - Per-developer activity breakdowns
+    - Per-repository analysis
+    - Confidence score analysis
+    - Temporal pattern identification
+    - Export to multiple formats (CSV, JSON, Markdown)
+    """
+
+    def __init__(self, output_directory: Path, config: Optional[Dict[str, Any]] = None):
+        """Initialize the classification report generator.
+
+        Args:
+            output_directory: Directory where reports will be saved
+            config: Optional configuration for report generation
+        """
+        self.output_directory = Path(output_directory)
+        self.output_directory.mkdir(parents=True, exist_ok=True)
+
+        self.config = config or {}
+        self.include_low_confidence = self.config.get('include_low_confidence', True)
+        self.confidence_threshold = self.config.get('confidence_threshold', 0.6)
+        self.min_commits_for_analysis = self.config.get('min_commits_for_analysis', 5)
+
+        # Report metadata
+        self.generated_at = datetime.now()
+        self.reports_generated = []
+
+        logger.info(f"Classification report generator initialized - output: {self.output_directory}")
+
+    def generate_comprehensive_report(self, classified_commits: List[Dict[str, Any]],
+                                      metadata: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
+        """Generate all available classification reports.
+
+        Args:
+            classified_commits: List of commits with classification results
+            metadata: Optional metadata about the analysis (date range, repos, etc.)
+
+        Returns:
+            Dictionary mapping report types to file paths
+        """
+        if not classified_commits:
+            logger.warning("No classified commits provided - skipping report generation")
+            return {}
+
+        # Filter classified commits
+        classified_only = [c for c in classified_commits if 'predicted_class' in c]
+
+        if not classified_only:
+            logger.warning("No commits with classification results found")
+            return {}
+
+        logger.info(f"Generating comprehensive classification reports for {len(classified_only)} commits")
+
+        report_paths = {}
+
+        try:
+            # Generate individual reports
+            report_paths['summary'] = self.generate_summary_report(classified_only, metadata)
+            report_paths['detailed_csv'] = self.generate_detailed_csv_report(classified_only, metadata)
+            report_paths['developer_breakdown'] = self.generate_developer_breakdown_report(classified_only, metadata)
+            report_paths['repository_analysis'] = self.generate_repository_analysis_report(classified_only, metadata)
+            report_paths['confidence_analysis'] = self.generate_confidence_analysis_report(classified_only, metadata)
+            report_paths['temporal_patterns'] = self.generate_temporal_patterns_report(classified_only, metadata)
+            report_paths['classification_matrix'] = self.generate_classification_matrix_report(classified_only, metadata)
+            report_paths['executive_summary'] = self.generate_executive_summary_report(classified_only, metadata)
+
+            # Generate comprehensive JSON export
+            report_paths['comprehensive_json'] = self.generate_json_export(classified_only, metadata)
+
+            # Generate markdown summary
+            report_paths['markdown_summary'] = self.generate_markdown_summary(classified_only, metadata)
+
+            self.reports_generated = list(report_paths.keys())
+            logger.info(f"Generated {len(report_paths)} classification reports")
+
+            return report_paths
+
+        except Exception as e:
+            logger.error(f"Failed to generate comprehensive reports: {e}")
+            return report_paths
+
+    def generate_summary_report(self, classified_commits: List[Dict[str, Any]],
+                                metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate high-level summary report.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated summary CSV file
+        """
+        output_path = self.output_directory / f'classification_summary_{self._get_timestamp()}.csv'
+
+        # Calculate summary statistics
+        total_commits = len(classified_commits)
+        classification_counts = Counter(c['predicted_class'] for c in classified_commits)
+        confidence_scores = [c.get('classification_confidence', 0) for c in classified_commits]
+
+        high_confidence_count = sum(1 for score in confidence_scores if score >= self.confidence_threshold)
+        avg_confidence = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0
+
+        unique_developers = len(set(c.get('canonical_author_name', c.get('author_name', 'unknown'))
+                                    for c in classified_commits))
+        unique_repositories = len(set(c.get('repository', 'unknown') for c in classified_commits))
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            # Header information
+            writer.writerow(['Classification Analysis Summary'])
+            writer.writerow(['Generated:', self.generated_at.isoformat()])
+
+            if metadata:
+                writer.writerow(['Analysis Period:', f"{metadata.get('start_date', 'N/A')} to {metadata.get('end_date', 'N/A')}"])
+                writer.writerow(['Configuration:', metadata.get('config_path', 'N/A')])
+
+            writer.writerow([])
+
+            # Overall statistics
+            writer.writerow(['Overall Statistics'])
+            writer.writerow(['Metric', 'Value'])
+            writer.writerow(['Total Commits Analyzed', total_commits])
+            writer.writerow(['Unique Developers', unique_developers])
+            writer.writerow(['Unique Repositories', unique_repositories])
+            writer.writerow(['Average Confidence Score', f'{avg_confidence:.3f}'])
+            writer.writerow(['High Confidence Predictions', f'{high_confidence_count} ({(high_confidence_count/total_commits)*100:.1f}%)'])
+            writer.writerow([])
+
+            # Classification distribution
+            writer.writerow(['Classification Distribution'])
+            writer.writerow(['Classification Type', 'Count', 'Percentage'])
+
+            for class_type, count in classification_counts.most_common():
+                percentage = (count / total_commits) * 100
+                writer.writerow([class_type, count, f'{percentage:.1f}%'])
+
+        logger.info(f"Summary report generated: {output_path}")
+        return str(output_path)
+
+    def generate_detailed_csv_report(self, classified_commits: List[Dict[str, Any]],
+                                     metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate detailed CSV report with all commit information.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated detailed CSV file
+        """
+        output_path = self.output_directory / f'classification_detailed_{self._get_timestamp()}.csv'
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            # Write header
+            headers = [
+                'commit_hash', 'date', 'author', 'canonical_author', 'repository',
+                'predicted_class', 'confidence', 'is_reliable', 'message_preview',
+                'files_changed', 'insertions', 'deletions', 'lines_changed',
+                'primary_language', 'primary_activity', 'is_multilingual',
+                'branch', 'project_key', 'ticket_references'
+            ]
+            writer.writerow(headers)
+
+            # Write commit details
+            for commit in classified_commits:
+                file_analysis = commit.get('file_analysis_summary', {})
+
+                row = [
+                    commit.get('hash', '')[:12],  # Shortened hash
+                    commit.get('timestamp', '').strftime('%Y-%m-%d %H:%M:%S') if commit.get('timestamp') else '',
+                    commit.get('author_name', ''),
+                    commit.get('canonical_author_name', commit.get('author_name', '')),
+                    commit.get('repository', ''),
+                    commit.get('predicted_class', ''),
+                    f"{commit.get('classification_confidence', 0):.3f}",
+                    commit.get('is_reliable_prediction', False),
+                    commit.get('message', '')[:100].replace('\n', ' '),  # Preview of message
+                    commit.get('files_changed', 0),
+                    commit.get('insertions', 0),
+                    commit.get('deletions', 0),
+                    commit.get('insertions', 0) + commit.get('deletions', 0),
+                    file_analysis.get('primary_language', ''),
+                    file_analysis.get('primary_activity', ''),
+                    file_analysis.get('is_multilingual', False),
+                    commit.get('branch', ''),
+                    commit.get('project_key', ''),
+                    len(commit.get('ticket_references', []))
+                ]
+                writer.writerow(row)
+
+        logger.info(f"Detailed CSV report generated: {output_path}")
+        return str(output_path)
+
+    def generate_developer_breakdown_report(self, classified_commits: List[Dict[str, Any]],
+                                            metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate per-developer classification breakdown.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated developer breakdown CSV file
+        """
+        output_path = self.output_directory / f'classification_by_developer_{self._get_timestamp()}.csv'
+
+        # Aggregate developer statistics
+        developer_stats = defaultdict(lambda: {
+            'total_commits': 0,
+            'classifications': Counter(),
+            'confidence_scores': [],
+            'repositories': set(),
+            'total_lines_changed': 0,
+            'avg_files_per_commit': 0,
+            'commit_dates': []
+        })
+
+        for commit in classified_commits:
+            author = commit.get('canonical_author_name', commit.get('author_name', 'unknown'))
+            stats = developer_stats[author]
+
+            stats['total_commits'] += 1
+            stats['classifications'][commit.get('predicted_class', 'unknown')] += 1
+
+            if 'classification_confidence' in commit:
+                stats['confidence_scores'].append(commit['classification_confidence'])
+
+            stats['repositories'].add(commit.get('repository', 'unknown'))
+            stats['total_lines_changed'] += commit.get('insertions', 0) + commit.get('deletions', 0)
+            stats['avg_files_per_commit'] += commit.get('files_changed', 0)
+
+            if commit.get('timestamp'):
+                stats['commit_dates'].append(commit['timestamp'])
+
+        # Calculate derived metrics
+        for author, stats in developer_stats.items():
+            if stats['total_commits'] > 0:
+                stats['avg_confidence'] = sum(stats['confidence_scores']) / len(stats['confidence_scores']) if stats['confidence_scores'] else 0
+                stats['avg_files_per_commit'] = stats['avg_files_per_commit'] / stats['total_commits']
+                stats['avg_lines_per_commit'] = stats['total_lines_changed'] / stats['total_commits']
+                stats['primary_classification'] = stats['classifications'].most_common(1)[0][0] if stats['classifications'] else 'unknown'
+                stats['classification_diversity'] = len(stats['classifications'])
+                stats['repository_count'] = len(stats['repositories'])
+
+                # Calculate activity span
+                if stats['commit_dates']:
+                    date_range = max(stats['commit_dates']) - min(stats['commit_dates'])
+                    stats['activity_span_days'] = date_range.days
+                else:
+                    stats['activity_span_days'] = 0
+
+        # Filter developers with minimum commits
+        filtered_developers = {k: v for k, v in developer_stats.items()
+                               if v['total_commits'] >= self.min_commits_for_analysis}
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            # Write summary section
+            writer.writerow(['Developer Classification Analysis'])
+            writer.writerow(['Total Developers:', len(developer_stats)])
+            writer.writerow(['Developers with ≥{} commits:'.format(self.min_commits_for_analysis), len(filtered_developers)])
+            writer.writerow([])
+
+            # Write detailed breakdown
+            headers = [
+                'developer', 'total_commits', 'primary_classification', 'classification_diversity',
+                'avg_confidence', 'high_confidence_ratio', 'repository_count', 'repositories',
+                'avg_files_per_commit', 'avg_lines_per_commit', 'activity_span_days'
+            ]
+
+            # Add classification type columns
+            all_classifications = set()
+            for stats in filtered_developers.values():
+                all_classifications.update(stats['classifications'].keys())
+
+            classification_headers = [f'{cls}_count' for cls in sorted(all_classifications)]
+            headers.extend(classification_headers)
+
+            writer.writerow(headers)
+
+            # Sort developers by total commits (descending)
+            sorted_developers = sorted(filtered_developers.items(),
+                                       key=lambda x: x[1]['total_commits'], reverse=True)
+
+            for author, stats in sorted_developers:
+                high_confidence_count = sum(1 for score in stats['confidence_scores']
+                                            if score >= self.confidence_threshold)
+                high_confidence_ratio = high_confidence_count / len(stats['confidence_scores']) if stats['confidence_scores'] else 0
+
+                row = [
+                    author,
+                    stats['total_commits'],
+                    stats['primary_classification'],
+                    stats['classification_diversity'],
+                    f"{stats['avg_confidence']:.3f}",
+                    f"{high_confidence_ratio:.3f}",
+                    stats['repository_count'],
+                    '; '.join(sorted(stats['repositories'])),
+                    f"{stats['avg_files_per_commit']:.1f}",
+                    f"{stats['avg_lines_per_commit']:.0f}",
+                    stats['activity_span_days']
+                ]
+
+                # Add classification counts
+                for cls in sorted(all_classifications):
+                    row.append(stats['classifications'].get(cls, 0))
+
+                writer.writerow(row)
+
+        logger.info(f"Developer breakdown report generated: {output_path}")
+        return str(output_path)
+
+    def generate_repository_analysis_report(self, classified_commits: List[Dict[str, Any]],
+                                            metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate per-repository classification analysis.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated repository analysis CSV file
+        """
+        output_path = self.output_directory / f'classification_by_repository_{self._get_timestamp()}.csv'
+
+        # Aggregate repository statistics
+        repo_stats = defaultdict(lambda: {
+            'total_commits': 0,
+            'classifications': Counter(),
+            'developers': set(),
+            'confidence_scores': [],
+            'total_lines_changed': 0,
+            'languages': Counter(),
+            'activities': Counter()
+        })
+
+        for commit in classified_commits:
+            repo = commit.get('repository', 'unknown')
+            stats = repo_stats[repo]
+
+            stats['total_commits'] += 1
+            stats['classifications'][commit.get('predicted_class', 'unknown')] += 1
+            stats['developers'].add(commit.get('canonical_author_name', commit.get('author_name', 'unknown')))
+
+            if 'classification_confidence' in commit:
+                stats['confidence_scores'].append(commit['classification_confidence'])
+
+            stats['total_lines_changed'] += commit.get('insertions', 0) + commit.get('deletions', 0)
+
+            # File analysis information
+            file_analysis = commit.get('file_analysis_summary', {})
+            if file_analysis.get('primary_language'):
+                stats['languages'][file_analysis['primary_language']] += 1
+            if file_analysis.get('primary_activity'):
+                stats['activities'][file_analysis['primary_activity']] += 1
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            writer.writerow(['Repository Classification Analysis'])
+            writer.writerow(['Total Repositories:', len(repo_stats)])
+            writer.writerow([])
+
+            headers = [
+                'repository', 'total_commits', 'developer_count', 'primary_classification',
+                'avg_confidence', 'avg_lines_per_commit', 'primary_language', 'primary_activity',
+                'classification_diversity', 'language_diversity'
+            ]
+            writer.writerow(headers)
+
+            # Sort repositories by commit count
+            sorted_repos = sorted(repo_stats.items(), key=lambda x: x[1]['total_commits'], reverse=True)
+
+            for repo, stats in sorted_repos:
+                avg_confidence = sum(stats['confidence_scores']) / len(stats['confidence_scores']) if stats['confidence_scores'] else 0
+                avg_lines = stats['total_lines_changed'] / stats['total_commits'] if stats['total_commits'] > 0 else 0
+
+                primary_class = stats['classifications'].most_common(1)[0][0] if stats['classifications'] else 'unknown'
+                primary_lang = stats['languages'].most_common(1)[0][0] if stats['languages'] else 'unknown'
+                primary_activity = stats['activities'].most_common(1)[0][0] if stats['activities'] else 'unknown'
+
+                row = [
+                    repo,
+                    stats['total_commits'],
+                    len(stats['developers']),
+                    primary_class,
+                    f"{avg_confidence:.3f}",
+                    f"{avg_lines:.0f}",
+                    primary_lang,
+                    primary_activity,
+                    len(stats['classifications']),
+                    len(stats['languages'])
+                ]
+                writer.writerow(row)
+
+        logger.info(f"Repository analysis report generated: {output_path}")
+        return str(output_path)
+
+    def generate_confidence_analysis_report(self, classified_commits: List[Dict[str, Any]],
+                                            metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate confidence score analysis report.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated confidence analysis CSV file
+        """
+        output_path = self.output_directory / f'classification_confidence_analysis_{self._get_timestamp()}.csv'
+
+        confidence_scores = [c.get('classification_confidence', 0) for c in classified_commits]
+
+        # Calculate confidence statistics by classification type
+        confidence_by_class = defaultdict(list)
+        for commit in classified_commits:
+            class_type = commit.get('predicted_class', 'unknown')
+            confidence = commit.get('classification_confidence', 0)
+            confidence_by_class[class_type].append(confidence)
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            writer.writerow(['Classification Confidence Analysis'])
+            writer.writerow([])
+
+            # Overall confidence statistics
+            if confidence_scores:
+                writer.writerow(['Overall Confidence Statistics'])
+                writer.writerow(['Metric', 'Value'])
+                writer.writerow(['Total Predictions', len(confidence_scores)])
+                writer.writerow(['Average Confidence', f"{sum(confidence_scores) / len(confidence_scores):.3f}"])
+                writer.writerow(['Minimum Confidence', f"{min(confidence_scores):.3f}"])
+                writer.writerow(['Maximum Confidence', f"{max(confidence_scores):.3f}"])
+
+                # Confidence distribution
+                very_high = sum(1 for s in confidence_scores if s >= 0.9)
+                high = sum(1 for s in confidence_scores if 0.8 <= s < 0.9)
+                medium = sum(1 for s in confidence_scores if 0.6 <= s < 0.8)
+                low = sum(1 for s in confidence_scores if 0.4 <= s < 0.6)
+                very_low = sum(1 for s in confidence_scores if s < 0.4)
+
+                writer.writerow(['Very High (≥0.9)', f"{very_high} ({(very_high/len(confidence_scores))*100:.1f}%)"])
+                writer.writerow(['High (0.8-0.9)', f"{high} ({(high/len(confidence_scores))*100:.1f}%)"])
+                writer.writerow(['Medium (0.6-0.8)', f"{medium} ({(medium/len(confidence_scores))*100:.1f}%)"])
+                writer.writerow(['Low (0.4-0.6)', f"{low} ({(low/len(confidence_scores))*100:.1f}%)"])
+                writer.writerow(['Very Low (<0.4)', f"{very_low} ({(very_low/len(confidence_scores))*100:.1f}%)"])
+                writer.writerow([])
+
+            # Confidence by classification type
+            writer.writerow(['Confidence by Classification Type'])
+            writer.writerow(['Classification', 'Count', 'Avg Confidence', 'Min', 'Max', 'High Confidence Count'])
+
+            for class_type, scores in sorted(confidence_by_class.items()):
+                if scores:
+                    avg_conf = sum(scores) / len(scores)
+                    high_conf_count = sum(1 for s in scores if s >= self.confidence_threshold)
+
+                    writer.writerow([
+                        class_type,
+                        len(scores),
+                        f"{avg_conf:.3f}",
+                        f"{min(scores):.3f}",
+                        f"{max(scores):.3f}",
+                        f"{high_conf_count} ({(high_conf_count/len(scores))*100:.1f}%)"
+                    ])
+
+        logger.info(f"Confidence analysis report generated: {output_path}")
+        return str(output_path)
+
+    def generate_temporal_patterns_report(self, classified_commits: List[Dict[str, Any]],
+                                          metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate temporal patterns analysis report.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated temporal patterns CSV file
+        """
+        output_path = self.output_directory / f'classification_temporal_patterns_{self._get_timestamp()}.csv'
+
+        # Group commits by date
+        daily_stats = defaultdict(lambda: {
+            'total_commits': 0,
+            'classifications': Counter(),
+            'developers': set(),
+            'confidence_scores': []
+        })
+
+        for commit in classified_commits:
+            if commit.get('timestamp'):
+                date_key = commit['timestamp'].date()
+                stats = daily_stats[date_key]
+
+                stats['total_commits'] += 1
+                stats['classifications'][commit.get('predicted_class', 'unknown')] += 1
+                stats['developers'].add(commit.get('canonical_author_name', commit.get('author_name', 'unknown')))
+
+                if 'classification_confidence' in commit:
+                    stats['confidence_scores'].append(commit['classification_confidence'])
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            writer.writerow(['Temporal Classification Patterns'])
+            writer.writerow([])
+
+            # Get all classification types for column headers
+            all_classifications = set()
+            for stats in daily_stats.values():
+                all_classifications.update(stats['classifications'].keys())
+
+            headers = ['date', 'total_commits', 'developer_count', 'avg_confidence']
+            headers.extend([f'{cls}_count' for cls in sorted(all_classifications)])
+            writer.writerow(headers)
+
+            # Sort by date
+            for date, stats in sorted(daily_stats.items()):
+                avg_confidence = sum(stats['confidence_scores']) / len(stats['confidence_scores']) if stats['confidence_scores'] else 0
+
+                row = [
+                    date.isoformat(),
+                    stats['total_commits'],
+                    len(stats['developers']),
+                    f"{avg_confidence:.3f}"
+                ]
+
+                # Add classification counts for this date
+                for cls in sorted(all_classifications):
+                    row.append(stats['classifications'].get(cls, 0))
+
+                writer.writerow(row)
+
+        logger.info(f"Temporal patterns report generated: {output_path}")
+        return str(output_path)
+
+    def generate_classification_matrix_report(self, classified_commits: List[Dict[str, Any]],
+                                              metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate classification distribution matrix report.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated classification matrix CSV file
+        """
+        output_path = self.output_directory / f'classification_matrix_{self._get_timestamp()}.csv'
+
+        # Create cross-tabulation of classifications vs other dimensions
+        class_counts = Counter(c.get('predicted_class', 'unknown') for c in classified_commits)
+
+        # Developer vs Classification matrix
+        dev_class_matrix = defaultdict(Counter)
+        repo_class_matrix = defaultdict(Counter)
+        lang_class_matrix = defaultdict(Counter)
+
+        for commit in classified_commits:
+            class_type = commit.get('predicted_class', 'unknown')
+            developer = commit.get('canonical_author_name', commit.get('author_name', 'unknown'))
+            repository = commit.get('repository', 'unknown')
+
+            dev_class_matrix[developer][class_type] += 1
+            repo_class_matrix[repository][class_type] += 1
+
+            # Language information
+            file_analysis = commit.get('file_analysis_summary', {})
+            language = file_analysis.get('primary_language', 'unknown')
+            lang_class_matrix[language][class_type] += 1
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            writer.writerow(['Classification Distribution Matrix'])
+            writer.writerow([])
+
+            # Overall classification distribution
+            writer.writerow(['Overall Classification Distribution'])
+            writer.writerow(['Classification', 'Count', 'Percentage'])
+            total_commits = len(classified_commits)
+
+            for class_type, count in class_counts.most_common():
+                percentage = (count / total_commits) * 100
+                writer.writerow([class_type, count, f'{percentage:.1f}%'])
+
+            writer.writerow([])
+
+            # Top developers by classification diversity
+            writer.writerow(['Top Developers by Classification Diversity'])
+            writer.writerow(['Developer', 'Total Commits', 'Classifications Used', 'Primary Classification'])
+
+            dev_diversity = []
+            for dev, classifications in dev_class_matrix.items():
+                total_dev_commits = sum(classifications.values())
+                if total_dev_commits >= self.min_commits_for_analysis:
+                    diversity = len(classifications)
+                    primary = classifications.most_common(1)[0][0]
+                    dev_diversity.append((dev, total_dev_commits, diversity, primary))
+
+            # Sort by diversity, then by total commits
+            for dev, total, diversity, primary in sorted(dev_diversity, key=lambda x: (x[2], x[1]), reverse=True)[:10]:
+                writer.writerow([dev, total, diversity, primary])
+
+            writer.writerow([])
+
+            # Language vs Classification matrix
+            writer.writerow(['Language vs Classification Matrix'])
+            all_classes = sorted(class_counts.keys())
+            header = ['Language'] + all_classes + ['Total']
+            writer.writerow(header)
+
+            for language, classifications in sorted(lang_class_matrix.items(),
+                                                    key=lambda x: sum(x[1].values()), reverse=True):
+                row = [language]
+                total_lang_commits = sum(classifications.values())
+
+                for class_type in all_classes:
+                    count = classifications.get(class_type, 0)
+                    percentage = (count / total_lang_commits) * 100 if total_lang_commits > 0 else 0
+                    row.append(f"{count} ({percentage:.1f}%)")
+
+                row.append(total_lang_commits)
+                writer.writerow(row)
+
+        logger.info(f"Classification matrix report generated: {output_path}")
+        return str(output_path)
+
+    def generate_executive_summary_report(self, classified_commits: List[Dict[str, Any]],
+                                          metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate executive summary report for leadership.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated executive summary CSV file
+        """
+        output_path = self.output_directory / f'classification_executive_summary_{self._get_timestamp()}.csv'
+
+        # Calculate key metrics
+        total_commits = len(classified_commits)
+        unique_developers = len(set(c.get('canonical_author_name', c.get('author_name', 'unknown'))
+                                    for c in classified_commits))
+        unique_repositories = len(set(c.get('repository', 'unknown') for c in classified_commits))
+
+        classification_counts = Counter(c.get('predicted_class', 'unknown') for c in classified_commits)
+        confidence_scores = [c.get('classification_confidence', 0) for c in classified_commits]
+
+        # Productivity metrics
+        total_lines_changed = sum(c.get('insertions', 0) + c.get('deletions', 0) for c in classified_commits)
+        avg_lines_per_commit = total_lines_changed / total_commits if total_commits > 0 else 0
+
+        # Time span analysis
+        commit_dates = [c['timestamp'] for c in classified_commits if c.get('timestamp')]
+        if commit_dates:
+            analysis_span = (max(commit_dates) - min(commit_dates)).days
+        else:
+            analysis_span = 0
+
+        with open(output_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.writer(f)
+
+            writer.writerow(['Executive Summary - Commit Classification Analysis'])
+            writer.writerow(['Generated:', self.generated_at.strftime('%Y-%m-%d %H:%M:%S')])
+
+            if metadata:
+                writer.writerow(['Analysis Period:', f"{metadata.get('start_date', 'N/A')} to {metadata.get('end_date', 'N/A')}"])
+
+            writer.writerow([])
+
+            # Key metrics
+            writer.writerow(['KEY METRICS'])
+            writer.writerow(['Total Development Activity', f'{total_commits:,} commits'])
+            writer.writerow(['Team Size', f'{unique_developers} active developers'])
+            writer.writerow(['Codebase Scope', f'{unique_repositories} repositories'])
+            writer.writerow(['Analysis Timespan', f'{analysis_span} days'])
+            writer.writerow(['Average Code Changes per Commit', f'{avg_lines_per_commit:.0f} lines'])
+
+            if confidence_scores:
+                avg_confidence = sum(confidence_scores) / len(confidence_scores)
+                high_confidence_pct = (sum(1 for s in confidence_scores if s >= self.confidence_threshold) / len(confidence_scores)) * 100
+                writer.writerow(['Classification Confidence', f'{avg_confidence:.1%} average'])
+                writer.writerow(['High Confidence Predictions', f'{high_confidence_pct:.1f}%'])
+
+            writer.writerow([])
+
+            # Development focus areas
+            writer.writerow(['DEVELOPMENT FOCUS AREAS'])
+            writer.writerow(['Activity Type', 'Commits', '% of Total', 'Strategic Insight'])
+
+            # Define strategic insights for each classification type
+            strategic_insights = {
+                'feature': 'New capability development',
+                'bugfix': 'Quality maintenance and stability',
+                'refactor': 'Technical debt management',
+                'docs': 'Knowledge management and documentation',
+                'test': 'Quality assurance and testing',
+                'config': 'Infrastructure and configuration',
+                'chore': 'Maintenance and operational tasks',
+                'security': 'Security and compliance',
+                'hotfix': 'Critical issue resolution',
+                'style': 'Code quality and standards',
+                'build': 'Build system and deployment',
+                'ci': 'Automation and continuous integration'
+            }
+
+            for class_type, count in classification_counts.most_common():
+                percentage = (count / total_commits) * 100
+                insight = strategic_insights.get(class_type, 'Unclassified development activity')
+                writer.writerow([class_type.title(), f'{count:,}', f'{percentage:.1f}%', insight])
+
+            writer.writerow([])
+
+            # Recommendations
+            writer.writerow(['STRATEGIC RECOMMENDATIONS'])
+
+            # Generate recommendations based on the data
+            recommendations = []
+
+            # Feature vs maintenance balance
+            feature_pct = (classification_counts.get('feature', 0) / total_commits) * 100
+            maintenance_pct = ((classification_counts.get('bugfix', 0) +
+                                classification_counts.get('refactor', 0) +
+                                classification_counts.get('chore', 0)) / total_commits) * 100
+
+            if feature_pct > 60:
+                recommendations.append("High feature development velocity - consider increasing quality assurance")
+            elif feature_pct < 20:
+                recommendations.append("Low feature development - may indicate focus on maintenance or technical debt")
+
+            if maintenance_pct > 40:
+                recommendations.append("High maintenance overhead - consider technical debt reduction initiatives")
+
+            # Documentation analysis
+            docs_pct = (classification_counts.get('docs', 0) / total_commits) * 100
+            if docs_pct < 5:
+                recommendations.append("Low documentation activity - consider improving documentation practices")
+
+            # Testing analysis
+            test_pct = (classification_counts.get('test', 0) / total_commits) * 100
+            if test_pct < 10:
+                recommendations.append("Limited testing activity - consider strengthening testing practices")
+
+            # Security analysis
+            security_pct = (classification_counts.get('security', 0) / total_commits) * 100
+            if security_pct > 0:
+                recommendations.append(f"Active security focus ({security_pct:.1f}% of commits) - positive security posture")
+
+            # Confidence analysis
+            if confidence_scores:
+                low_confidence_pct = (sum(1 for s in confidence_scores if s < 0.6) / len(confidence_scores)) * 100
+                if low_confidence_pct > 20:
+                    recommendations.append("Consider improving commit message clarity for better classification")
+
+            for i, recommendation in enumerate(recommendations, 1):
+                writer.writerow([f'Recommendation {i}', recommendation])
+
+        logger.info(f"Executive summary report generated: {output_path}")
+        return str(output_path)
+
+    def generate_json_export(self, classified_commits: List[Dict[str, Any]],
+                             metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate comprehensive JSON export of all classification data.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated JSON file
+        """
+        output_path = self.output_directory / f'classification_comprehensive_{self._get_timestamp()}.json'
+
+        # Create comprehensive data structure
+        export_data = {
+            'metadata': {
+                'generated_at': self.generated_at.isoformat(),
+                'total_commits': len(classified_commits),
+                'generator_version': '1.0',
+                'config': self.config
+            },
+            'summary_statistics': self._calculate_summary_statistics(classified_commits),
+            'commits': classified_commits,
+            'analysis_metadata': metadata or {}
+        }
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(export_data, f, indent=2, default=str, ensure_ascii=False)
+
+        logger.info(f"JSON export generated: {output_path}")
+        return str(output_path)
+
+    def generate_markdown_summary(self, classified_commits: List[Dict[str, Any]],
+                                  metadata: Optional[Dict[str, Any]] = None) -> str:
+        """Generate markdown summary report.
+
+        Args:
+            classified_commits: List of classified commits
+            metadata: Optional analysis metadata
+
+        Returns:
+            Path to generated markdown file
+        """
+        output_path = self.output_directory / f'classification_summary_{self._get_timestamp()}.md'
+
+        # Calculate statistics
+        total_commits = len(classified_commits)
+        classification_counts = Counter(c.get('predicted_class', 'unknown') for c in classified_commits)
+        confidence_scores = [c.get('classification_confidence', 0) for c in classified_commits]
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write("# Commit Classification Analysis Report\n\n")
+            f.write(f"**Generated:** {self.generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+
+            if metadata:
+                f.write(f"**Analysis Period:** {metadata.get('start_date', 'N/A')} to {metadata.get('end_date', 'N/A')}\n\n")
+
+            f.write("## Summary Statistics\n\n")
+            f.write(f"- **Total Commits Analyzed:** {total_commits:,}\n")
+            f.write(f"- **Unique Developers:** {len(set(c.get('canonical_author_name', c.get('author_name', 'unknown')) for c in classified_commits))}\n")
+            f.write(f"- **Unique Repositories:** {len(set(c.get('repository', 'unknown') for c in classified_commits))}\n")
+
+            if confidence_scores:
+                avg_confidence = sum(confidence_scores) / len(confidence_scores)
+                high_confidence_count = sum(1 for s in confidence_scores if s >= self.confidence_threshold)
+                f.write(f"- **Average Confidence:** {avg_confidence:.1%}\n")
+                f.write(f"- **High Confidence Predictions:** {high_confidence_count:,} ({(high_confidence_count/total_commits)*100:.1f}%)\n")

+            f.write("\n## Classification Distribution\n\n")
+            f.write("| Classification Type | Count | Percentage |\n")
+            f.write("|-------------------|--------|------------|\n")
+
+            for class_type, count in classification_counts.most_common():
+                percentage = (count / total_commits) * 100
+                f.write(f"| {class_type.title()} | {count:,} | {percentage:.1f}% |\n")
+
+            f.write(f"\n## Analysis Details\n\n")
+            f.write(f"This report was generated using GitFlow Analytics commit classification system.\n")
+            f.write(f"Classification confidence threshold: {self.confidence_threshold}\n\n")
+
+            f.write("For detailed analysis, see the accompanying CSV reports:\n")
+            for report_type in self.reports_generated:
+                f.write(f"- {report_type.replace('_', ' ').title()}\n")
+
+        logger.info(f"Markdown summary generated: {output_path}")
+        return str(output_path)
+
+    def _calculate_summary_statistics(self, classified_commits: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Calculate comprehensive summary statistics.
+
+        Args:
+            classified_commits: List of classified commits
+
+        Returns:
+            Dictionary containing summary statistics
+        """
+        total_commits = len(classified_commits)
+
+        classification_counts = Counter(c.get('predicted_class', 'unknown') for c in classified_commits)
+        confidence_scores = [c.get('classification_confidence', 0) for c in classified_commits]
+
+        developers = set(c.get('canonical_author_name', c.get('author_name', 'unknown')) for c in classified_commits)
+        repositories = set(c.get('repository', 'unknown') for c in classified_commits)
+
+        return {
+            'total_commits': total_commits,
+            'unique_developers': len(developers),
+            'unique_repositories': len(repositories),
+            'classification_distribution': dict(classification_counts),
+            'confidence_statistics': {
+                'average': sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0,
+                'minimum': min(confidence_scores) if confidence_scores else 0,
+                'maximum': max(confidence_scores) if confidence_scores else 0,
+                'high_confidence_count': sum(1 for s in confidence_scores if s >= self.confidence_threshold),
+                'high_confidence_percentage': (sum(1 for s in confidence_scores if s >= self.confidence_threshold) / len(confidence_scores)) * 100 if confidence_scores else 0
+            },
+            'productivity_metrics': {
+                'total_lines_changed': sum(c.get('insertions', 0) + c.get('deletions', 0) for c in classified_commits),
+                'average_lines_per_commit': sum(c.get('insertions', 0) + c.get('deletions', 0) for c in classified_commits) / total_commits if total_commits > 0 else 0,
+                'average_files_per_commit': sum(c.get('files_changed', 0) for c in classified_commits) / total_commits if total_commits > 0 else 0
+            }
+        }
+
+    def _get_timestamp(self) -> str:
+        """Get timestamp string for file naming.
+
+        Returns:
+            Timestamp string in YYYYMMDD_HHMMSS format
+        """
+        return self.generated_at.strftime('%Y%m%d_%H%M%S')
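
For orientation, a minimal sketch of driving the new ClassificationReportGenerator on its own is shown below. The import path follows the file location in the listing above, and the commit dict mirrors the keys the class reads ('predicted_class', 'classification_confidence', 'timestamp', and so on); the sample values, output directory, and metadata are illustrative assumptions, not data from the package.

from datetime import datetime
from pathlib import Path

from gitflow_analytics.reports.classification_writer import ClassificationReportGenerator

# Illustrative input: one commit dict shaped the way the generator reads it above.
sample_commits = [
    {
        'hash': 'a1b2c3d4e5f6a7b8',                    # hypothetical commit hash
        'timestamp': datetime(2024, 5, 1, 14, 30),     # datetime, since .date()/.strftime() are called on it
        'author_name': 'Jane Doe',
        'canonical_author_name': 'Jane Doe',
        'repository': 'example-repo',                  # hypothetical repository name
        'predicted_class': 'feature',
        'classification_confidence': 0.87,
        'is_reliable_prediction': True,
        'message': 'Add weekly trend export',
        'files_changed': 3,
        'insertions': 120,
        'deletions': 15,
        'ticket_references': ['PROJ-123'],
        'file_analysis_summary': {
            'primary_language': 'Python',
            'primary_activity': 'source',
            'is_multilingual': False,
        },
        'branch': 'main',
        'project_key': 'PROJ',
    },
]

generator = ClassificationReportGenerator(
    output_directory=Path('./reports'),                # any writable directory
    config={'confidence_threshold': 0.6, 'min_commits_for_analysis': 1},
)

# Generates the summary, per-developer, per-repository, confidence, temporal,
# matrix, executive, JSON, and Markdown reports and returns their file paths.
report_paths = generator.generate_comprehensive_report(
    sample_commits,
    metadata={'start_date': '2024-05-01', 'end_date': '2024-05-31'},
)

for report_type, path in report_paths.items():
    print(f"{report_type}: {path}")

In normal use the commit dicts come from the analyzer pipeline rather than being built by hand; the config keys shown are the ones the constructor reads, with its defaults of 0.6 and 5 lowered here only so a single sample commit appears in the per-developer breakdown.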