gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4108 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +904 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +441 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1193 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -1,339 +1,1803 @@
1
1
  """CSV report generation for GitFlow Analytics."""
2
+
2
3
  import csv
4
+ import logging
5
+ from collections import defaultdict
3
6
  from datetime import datetime, timedelta, timezone
4
7
  from pathlib import Path
5
- from typing import List, Dict, Any, Optional, Tuple
6
- from collections import defaultdict
8
+ from typing import Any, Dict, List, Optional
9
+
7
10
  import pandas as pd
8
11
 
12
+ from ..metrics.activity_scoring import ActivityScorer
13
+ from .base import BaseReportGenerator, ReportData, ReportOutput
14
+ from .interfaces import ReportFormat
9
15
 
10
- class CSVReportGenerator:
16
+ # Get logger for this module
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class CSVReportGenerator(BaseReportGenerator):
11
21
  """Generate CSV reports with weekly metrics."""
12
-
13
- def __init__(self, anonymize: bool = False):
22
+
23
+ def __init__(self, anonymize: bool = False, exclude_authors: list[str] = None, identity_resolver=None, **kwargs):
14
24
  """Initialize report generator."""
15
- self.anonymize = anonymize
16
- self._anonymization_map: Dict[str, str] = {}
17
- self._anonymous_counter = 0
25
+ super().__init__(anonymize=anonymize, exclude_authors=exclude_authors,
26
+ identity_resolver=identity_resolver, **kwargs)
27
+ self.activity_scorer = ActivityScorer()
28
+
29
+ # Implementation of abstract methods from BaseReportGenerator
30
+
31
+ def generate(self, data: ReportData, output_path: Optional[Path] = None) -> ReportOutput:
32
+ """Generate CSV report from standardized data.
33
+
34
+ Args:
35
+ data: Standardized report data
36
+ output_path: Optional path to write the report to
37
+
38
+ Returns:
39
+ ReportOutput containing the results
40
+ """
41
+ try:
42
+ # Validate data
43
+ if not self.validate_data(data):
44
+ return ReportOutput(
45
+ success=False,
46
+ errors=["Invalid or incomplete data provided"]
47
+ )
48
+
49
+ # Pre-process data (apply filters and anonymization)
50
+ data = self.pre_process(data)
51
+
52
+ # Generate appropriate CSV based on available data
53
+ if output_path:
54
+ # Determine report type based on filename or available data
55
+ filename = output_path.name.lower()
56
+
57
+ if "weekly" in filename and data.commits:
58
+ self.generate_weekly_report(data.commits, data.developer_stats, output_path)
59
+ elif "developer" in filename and data.developer_stats:
60
+ self.generate_developer_report(data.developer_stats, output_path)
61
+ elif "activity" in filename and data.activity_data:
62
+ # Write activity data directly
63
+ df = pd.DataFrame(data.activity_data)
64
+ df.to_csv(output_path, index=False)
65
+ elif "focus" in filename and data.focus_data:
66
+ # Write focus data directly
67
+ df = pd.DataFrame(data.focus_data)
68
+ df.to_csv(output_path, index=False)
69
+ elif data.commits:
70
+ # Default to weekly report
71
+ self.generate_weekly_report(data.commits, data.developer_stats, output_path)
72
+ else:
73
+ return ReportOutput(
74
+ success=False,
75
+ errors=["No suitable data found for CSV generation"]
76
+ )
77
+
78
+ # Calculate file size
79
+ file_size = output_path.stat().st_size if output_path.exists() else 0
80
+
81
+ return ReportOutput(
82
+ success=True,
83
+ file_path=output_path,
84
+ format="csv",
85
+ size_bytes=file_size
86
+ )
87
+ else:
88
+ # Generate in-memory CSV
89
+ import io
90
+ buffer = io.StringIO()
91
+
92
+ # Default to generating weekly report in memory
93
+ if data.commits:
94
+ # Create temporary dataframe
95
+ df = pd.DataFrame(self._aggregate_weekly_data(data.commits,
96
+ datetime.now(timezone.utc) - timedelta(weeks=52),
97
+ datetime.now(timezone.utc)))
98
+ df.to_csv(buffer, index=False)
99
+ content = buffer.getvalue()
100
+
101
+ return ReportOutput(
102
+ success=True,
103
+ content=content,
104
+ format="csv",
105
+ size_bytes=len(content)
106
+ )
107
+ else:
108
+ return ReportOutput(
109
+ success=False,
110
+ errors=["No data available for CSV generation"]
111
+ )
112
+
113
+ except Exception as e:
114
+ self.logger.error(f"Error generating CSV report: {e}")
115
+ return ReportOutput(
116
+ success=False,
117
+ errors=[str(e)]
118
+ )
18
119
 
19
- def generate_weekly_report(self, commits: List[Dict[str, Any]],
20
- developer_stats: List[Dict[str, Any]],
21
- output_path: Path,
22
- weeks: int = 12) -> Path:
120
+ def get_required_fields(self) -> List[str]:
121
+ """Get the list of required data fields for CSV generation.
122
+
123
+ Returns:
124
+ List of required field names
125
+ """
126
+ # CSV reports can work with various combinations of data
127
+ # At minimum, we need either commits or developer_stats
128
+ return ["commits"] # Primary requirement
129
+
130
+ def get_format_type(self) -> str:
131
+ """Get the format type this generator produces.
132
+
133
+ Returns:
134
+ Format identifier
135
+ """
136
+ return ReportFormat.CSV.value
137
+
138
+ def _filter_excluded_authors_list(self, data_list: list[dict[str, Any]]) -> list[dict[str, Any]]:
139
+ """
140
+ Filter out excluded authors from any data list using canonical_id and enhanced bot detection.
141
+
142
+ WHY: Bot exclusion happens in Phase 2 (reporting) instead of Phase 1 (data collection)
143
+ to ensure manual identity mappings work correctly. This allows the system to see
144
+ consolidated bot identities via canonical_id instead of just original author_email/author_name.
145
+
146
+ ENHANCEMENT: Added enhanced bot pattern matching to catch bots that weren't properly
147
+ consolidated via manual mappings, preventing bot leakage in reports.
148
+
149
+ Args:
150
+ data_list: List of data dictionaries containing canonical_id field
151
+
152
+ Returns:
153
+ Filtered list with excluded authors removed
154
+ """
155
+ if not self.exclude_authors:
156
+ return data_list
157
+
158
+ logger.debug(f"DEBUG EXCLUSION: Starting filter with {len(self.exclude_authors)} excluded authors: {self.exclude_authors}")
159
+ logger.debug(f"DEBUG EXCLUSION: Filtering {len(data_list)} items from data list")
160
+
161
+ excluded_lower = [author.lower() for author in self.exclude_authors]
162
+ logger.debug(f"DEBUG EXCLUSION: Excluded authors (lowercase): {excluded_lower}")
163
+
164
+ # Separate explicit excludes from bot patterns
165
+ explicit_excludes = []
166
+ bot_patterns = []
167
+
168
+ for exclude in excluded_lower:
169
+ if '[bot]' in exclude or 'bot' in exclude.split():
170
+ bot_patterns.append(exclude)
171
+ else:
172
+ explicit_excludes.append(exclude)
173
+
174
+ logger.debug(f"DEBUG EXCLUSION: Explicit excludes: {explicit_excludes}")
175
+ logger.debug(f"DEBUG EXCLUSION: Bot patterns: {bot_patterns}")
176
+
177
+ filtered_data = []
178
+ excluded_count = 0
179
+
180
+ # Sample first 5 items to see data structure
181
+ for i, item in enumerate(data_list[:5]):
182
+ logger.debug(f"DEBUG EXCLUSION: Sample item {i}: canonical_id='{item.get('canonical_id', '')}', "
183
+ f"author_email='{item.get('author_email', '')}', author_name='{item.get('author_name', '')}', "
184
+ f"author='{item.get('author', '')}', primary_name='{item.get('primary_name', '')}', "
185
+ f"name='{item.get('name', '')}', developer='{item.get('developer', '')}', "
186
+ f"display_name='{item.get('display_name', '')}'")
187
+
188
+ for item in data_list:
189
+ canonical_id = item.get("canonical_id", "")
190
+ # Also check original author fields as fallback for data without canonical_id
191
+ author_email = item.get("author_email", "")
192
+ author_name = item.get("author_name", "")
193
+
194
+ # Check all possible author fields to ensure we catch every variation
195
+ author = item.get("author", "")
196
+ primary_name = item.get("primary_name", "")
197
+ name = item.get("name", "")
198
+ developer = item.get("developer", "") # Common in CSV data
199
+ display_name = item.get("display_name", "") # Common in some data structures
200
+
201
+ # Collect all identity fields for checking
202
+ identity_fields = [
203
+ canonical_id,
204
+ item.get("primary_email", ""),
205
+ author_email,
206
+ author_name,
207
+ author,
208
+ primary_name,
209
+ name,
210
+ developer,
211
+ display_name
212
+ ]
213
+
214
+ should_exclude = False
215
+ exclusion_reason = ""
216
+
217
+ # Check for exact matches with explicit excludes first
218
+ for field in identity_fields:
219
+ if field and field.lower() in explicit_excludes:
220
+ should_exclude = True
221
+ exclusion_reason = f"exact match with '{field}' in explicit excludes"
222
+ break
223
+
224
+ # If not explicitly excluded, check for bot patterns
225
+ if not should_exclude:
226
+ for field in identity_fields:
227
+ if not field:
228
+ continue
229
+ field_lower = field.lower()
230
+
231
+ # Enhanced bot detection: check if any field contains bot-like patterns
232
+ for bot_pattern in bot_patterns:
233
+ if bot_pattern in field_lower:
234
+ should_exclude = True
235
+ exclusion_reason = f"bot pattern '{bot_pattern}' matches field '{field}'"
236
+ break
237
+
238
+ # Additional bot detection: check for common bot patterns not in explicit list
239
+ if not should_exclude:
240
+ bot_indicators = ['[bot]', 'bot@', '-bot', 'automated', 'github-actions', 'dependabot', 'renovate']
241
+ for indicator in bot_indicators:
242
+ if indicator in field_lower:
243
+ # Only exclude if this bot-like pattern matches something in our exclude list
244
+ for exclude in excluded_lower:
245
+ if indicator.replace('[', '').replace(']', '') in exclude or exclude in field_lower:
246
+ should_exclude = True
247
+ exclusion_reason = f"bot indicator '{indicator}' in field '{field}' matches exclude pattern '{exclude}'"
248
+ break
249
+ if should_exclude:
250
+ break
251
+
252
+ if should_exclude:
253
+ break
254
+
255
+ if should_exclude:
256
+ excluded_count += 1
257
+ logger.debug(f"DEBUG EXCLUSION: EXCLUDING item - {exclusion_reason}")
258
+ logger.debug(f" canonical_id='{canonical_id}', primary_email='{item.get('primary_email', '')}', "
259
+ f"author_email='{author_email}', author_name='{author_name}', author='{author}', "
260
+ f"primary_name='{primary_name}', name='{name}', developer='{developer}', "
261
+ f"display_name='{display_name}'")
262
+ else:
263
+ filtered_data.append(item)
264
+
265
+ logger.debug(f"DEBUG EXCLUSION: Excluded {excluded_count} items, kept {len(filtered_data)} items")
266
+ return filtered_data
267
+
268
+ def _get_canonical_display_name(self, canonical_id: str, fallback_name: str) -> str:
269
+ """
270
+ Get the canonical display name for a developer.
271
+
272
+ WHY: Manual identity mappings may have updated display names that aren't
273
+ reflected in the developer_stats data passed to report generators. This
274
+ method ensures we get the most current display name from the identity resolver.
275
+
276
+ Args:
277
+ canonical_id: The canonical ID to get the display name for
278
+ fallback_name: The fallback name to use if identity resolver is not available
279
+
280
+ Returns:
281
+ The canonical display name or fallback name
282
+ """
283
+ if self.identity_resolver and canonical_id:
284
+ try:
285
+ canonical_name = self.identity_resolver.get_canonical_name(canonical_id)
286
+ if canonical_name and canonical_name != "Unknown":
287
+ return canonical_name
288
+ except Exception as e:
289
+ logger.debug(f"Error getting canonical name for {canonical_id}: {e}")
290
+
291
+ return fallback_name
292
+
293
+ def _log_datetime_comparison(
294
+ self, dt1: datetime, dt2: datetime, operation: str, location: str
295
+ ) -> None:
296
+ """Log datetime comparison details for debugging timezone issues."""
297
+ logger.debug(f"Comparing dates in {location} ({operation}):")
298
+ logger.debug(f" dt1: {dt1} (tzinfo: {dt1.tzinfo}, aware: {dt1.tzinfo is not None})")
299
+ logger.debug(f" dt2: {dt2} (tzinfo: {dt2.tzinfo}, aware: {dt2.tzinfo is not None})")
300
+
301
+ def _safe_datetime_compare(
302
+ self, dt1: datetime, dt2: datetime, operation: str, location: str
303
+ ) -> bool:
304
+ """Safely compare datetimes with logging and error handling."""
305
+ try:
306
+ self._log_datetime_comparison(dt1, dt2, operation, location)
307
+
308
+ if operation == "lt":
309
+ result = dt1 < dt2
310
+ elif operation == "gt":
311
+ result = dt1 > dt2
312
+ elif operation == "le":
313
+ result = dt1 <= dt2
314
+ elif operation == "ge":
315
+ result = dt1 >= dt2
316
+ elif operation == "eq":
317
+ result = dt1 == dt2
318
+ else:
319
+ raise ValueError(f"Unknown operation: {operation}")
320
+
321
+ logger.debug(f" Result: {result}")
322
+ return result
323
+
324
+ except TypeError as e:
325
+ logger.error(f"Timezone comparison error in {location}:")
326
+ logger.error(
327
+ f" dt1: {dt1} (type: {type(dt1)}, tzinfo: {getattr(dt1, 'tzinfo', 'N/A')})"
328
+ )
329
+ logger.error(
330
+ f" dt2: {dt2} (type: {type(dt2)}, tzinfo: {getattr(dt2, 'tzinfo', 'N/A')})"
331
+ )
332
+ logger.error(f" Operation: {operation}")
333
+ logger.error(f" Error: {e}")
334
+
335
+ # Import traceback for detailed error info
336
+ import traceback
337
+
338
+ logger.error(f" Full traceback:\n{traceback.format_exc()}")
339
+
340
+ # Try to fix by making both timezone-aware in UTC
341
+ try:
342
+ if dt1.tzinfo is None:
343
+ dt1 = dt1.replace(tzinfo=timezone.utc)
344
+ logger.debug(f" Fixed dt1 to UTC: {dt1}")
345
+ if dt2.tzinfo is None:
346
+ dt2 = dt2.replace(tzinfo=timezone.utc)
347
+ logger.debug(f" Fixed dt2 to UTC: {dt2}")
348
+
349
+ # Retry comparison
350
+ if operation == "lt":
351
+ result = dt1 < dt2
352
+ elif operation == "gt":
353
+ result = dt1 > dt2
354
+ elif operation == "le":
355
+ result = dt1 <= dt2
356
+ elif operation == "ge":
357
+ result = dt1 >= dt2
358
+ elif operation == "eq":
359
+ result = dt1 == dt2
360
+ else:
361
+ raise ValueError(f"Unknown operation: {operation}")
362
+
363
+ logger.info(f" Fixed comparison result: {result}")
364
+ return result
365
+
366
+ except Exception as fix_error:
367
+ logger.error(f" Failed to fix timezone issue: {fix_error}")
368
+ raise
369
+
370
+ def _safe_datetime_format(self, dt: datetime, format_str: str) -> str:
371
+ """Safely format datetime with logging."""
372
+ try:
373
+ logger.debug(
374
+ f"Formatting datetime: {dt} (tzinfo: {getattr(dt, 'tzinfo', 'N/A')}) with format {format_str}"
375
+ )
376
+ result = dt.strftime(format_str)
377
+ logger.debug(f" Format result: {result}")
378
+ return result
379
+ except Exception as e:
380
+ logger.error(f"Error formatting datetime {dt}: {e}")
381
+ return str(dt)
382
+
383
+ def generate_weekly_report(
384
+ self,
385
+ commits: list[dict[str, Any]],
386
+ developer_stats: list[dict[str, Any]],
387
+ output_path: Path,
388
+ weeks: int = 12,
389
+ ) -> Path:
23
390
  """Generate weekly metrics CSV report."""
391
+ # Apply exclusion filtering in Phase 2
392
+ commits = self._filter_excluded_authors_list(commits)
393
+ developer_stats = self._filter_excluded_authors_list(developer_stats)
24
394
  # Calculate week boundaries (timezone-aware to match commit timestamps)
25
395
  end_date = datetime.now(timezone.utc)
26
396
  start_date = end_date - timedelta(weeks=weeks)
27
-
397
+
398
+ logger.debug("Weekly report date range:")
399
+ logger.debug(f" start_date: {start_date} (tzinfo: {start_date.tzinfo})")
400
+ logger.debug(f" end_date: {end_date} (tzinfo: {end_date.tzinfo})")
401
+
28
402
  # Group commits by week and developer
29
- weekly_data: Dict[Tuple[datetime, str, str], Dict[str, Any]] = self._aggregate_weekly_data(commits, start_date, end_date)
30
-
403
+ weekly_data: dict[tuple[datetime, str, str], dict[str, Any]] = self._aggregate_weekly_data(
404
+ commits, start_date, end_date
405
+ )
406
+
31
407
  # Create developer lookup
32
- dev_lookup = {dev['canonical_id']: dev for dev in developer_stats}
408
+ dev_lookup = {dev["canonical_id"]: dev for dev in developer_stats}
409
+
410
+ # First pass: collect all raw scores for curve normalization
411
+ developer_raw_scores = {}
412
+ weekly_scores = {}
33
413
 
414
+ for (week_start, canonical_id, project_key), metrics in weekly_data.items():
415
+ activity_result = self.activity_scorer.calculate_activity_score(metrics)
416
+ raw_score = activity_result["raw_score"]
417
+
418
+ # Store for curve normalization
419
+ if canonical_id not in developer_raw_scores:
420
+ developer_raw_scores[canonical_id] = 0
421
+ developer_raw_scores[canonical_id] += raw_score
422
+
423
+ # Store weekly result for later use
424
+ weekly_scores[(week_start, canonical_id, project_key)] = activity_result
425
+
426
+ # Apply curve normalization to developer totals
427
+ curve_normalized = self.activity_scorer.normalize_scores_on_curve(developer_raw_scores)
428
+
34
429
  # Build CSV rows
35
430
  rows = []
36
431
  for (week_start, canonical_id, project_key), metrics in weekly_data.items():
37
432
  developer = dev_lookup.get(canonical_id, {})
433
+ activity_result = weekly_scores[(week_start, canonical_id, project_key)]
38
434
 
435
+ # Get curve data for this developer
436
+ curve_data = curve_normalized.get(canonical_id, {})
437
+
39
438
  row = {
40
- 'week_start': week_start.strftime('%Y-%m-%d'),
41
- 'developer_id': self._anonymize_value(canonical_id, 'id'),
42
- 'developer_name': self._anonymize_value(
43
- developer.get('primary_name', 'Unknown'), 'name'
439
+ "week_start": week_start.strftime("%Y-%m-%d"),
440
+ "developer_id": self._anonymize_value(canonical_id, "id"),
441
+ "developer_name": self._anonymize_value(
442
+ self._get_canonical_display_name(
443
+ canonical_id,
444
+ developer.get("primary_name", "Unknown")
445
+ ), "name"
44
446
  ),
45
- 'developer_email': self._anonymize_value(
46
- developer.get('primary_email', 'unknown@example.com'), 'email'
447
+ "developer_email": self._anonymize_value(
448
+ developer.get("primary_email", "unknown@example.com"), "email"
47
449
  ),
48
- 'project': project_key,
49
- 'commits': metrics['commits'],
50
- 'story_points': metrics['story_points'],
51
- 'lines_added': metrics['lines_added'],
52
- 'lines_removed': metrics['lines_removed'],
53
- 'files_changed': metrics['files_changed'],
54
- 'complexity_delta': round(metrics['complexity_delta'], 2),
55
- 'ticket_coverage_pct': round(metrics['ticket_coverage_pct'], 1),
56
- 'avg_commit_size': round(metrics['avg_commit_size'], 1),
57
- 'unique_tickets': metrics['unique_tickets'],
58
- 'prs_involved': metrics['prs_involved']
450
+ "project": project_key,
451
+ "commits": metrics["commits"],
452
+ "story_points": metrics["story_points"],
453
+ "lines_added": metrics["lines_added"],
454
+ "lines_removed": metrics["lines_removed"],
455
+ "files_changed": metrics["files_changed"],
456
+ "complexity_delta": round(metrics["complexity_delta"], 2),
457
+ "ticket_coverage_pct": round(metrics["ticket_coverage_pct"], 1),
458
+ "avg_commit_size": round(metrics["avg_commit_size"], 1),
459
+ "unique_tickets": metrics["unique_tickets"],
460
+ "prs_involved": metrics["prs_involved"],
461
+ # Activity score fields
462
+ "activity_score": round(activity_result["normalized_score"], 1),
463
+ "activity_level": activity_result["activity_level"],
464
+ "commit_score": round(activity_result["components"]["commit_score"], 1),
465
+ "pr_score": round(activity_result["components"]["pr_score"], 1),
466
+ "code_impact_score": round(activity_result["components"]["code_impact_score"], 1),
467
+ "complexity_score": round(activity_result["components"]["complexity_score"], 1),
468
+ # Curve normalization fields
469
+ "curved_score": curve_data.get("curved_score", 0),
470
+ "percentile": curve_data.get("percentile", 0),
471
+ "quintile": curve_data.get("quintile", 0),
472
+ "curved_activity_level": curve_data.get("activity_level", "unknown"),
59
473
  }
60
474
  rows.append(row)
61
-
475
+
62
476
  # Sort by week and developer
63
- rows.sort(key=lambda x: (x['week_start'], x['developer_name'], x['project']))
64
-
477
+ rows.sort(key=lambda x: (x["week_start"], x["developer_name"], x["project"]))
478
+
65
479
  # Write CSV
66
480
  if rows:
67
481
  df = pd.DataFrame(rows)
68
482
  df.to_csv(output_path, index=False)
69
483
  else:
70
484
  # Write empty CSV with headers
71
- with open(output_path, 'w', newline='') as f:
72
- writer = csv.DictWriter(f, fieldnames=[
73
- 'week_start', 'developer_id', 'developer_name', 'developer_email',
74
- 'project', 'commits', 'story_points', 'lines_added', 'lines_removed',
75
- 'files_changed', 'complexity_delta', 'ticket_coverage_pct',
76
- 'avg_commit_size', 'unique_tickets', 'prs_involved'
77
- ])
485
+ with open(output_path, "w", newline="") as f:
486
+ writer = csv.DictWriter(
487
+ f,
488
+ fieldnames=[
489
+ "week_start",
490
+ "developer_id",
491
+ "developer_name",
492
+ "developer_email",
493
+ "project",
494
+ "commits",
495
+ "story_points",
496
+ "lines_added",
497
+ "lines_removed",
498
+ "files_changed",
499
+ "complexity_delta",
500
+ "ticket_coverage_pct",
501
+ "avg_commit_size",
502
+ "unique_tickets",
503
+ "prs_involved",
504
+ "activity_score",
505
+ "activity_level",
506
+ "commit_score",
507
+ "pr_score",
508
+ "code_impact_score",
509
+ "complexity_score",
510
+ "curved_score",
511
+ "percentile",
512
+ "quintile",
513
+ "curved_activity_level",
514
+ ],
515
+ )
78
516
  writer.writeheader()
79
-
517
+
80
518
  return output_path
81
-
82
- def generate_summary_report(self, commits: List[Dict[str, Any]],
83
- prs: List[Dict[str, Any]],
84
- developer_stats: List[Dict[str, Any]],
85
- ticket_analysis: Dict[str, Any],
86
- output_path: Path) -> Path:
519
+
520
+ def generate_summary_report(
521
+ self,
522
+ commits: list[dict[str, Any]],
523
+ prs: list[dict[str, Any]],
524
+ developer_stats: list[dict[str, Any]],
525
+ ticket_analysis: dict[str, Any],
526
+ output_path: Path,
527
+ pm_data: Optional[dict[str, Any]] = None,
528
+ ) -> Path:
87
529
  """Generate summary statistics CSV."""
88
- summary_data = []
530
+ # Apply exclusion filtering in Phase 2
531
+ commits = self._filter_excluded_authors_list(commits)
532
+ developer_stats = self._filter_excluded_authors_list(developer_stats)
89
533
 
534
+ summary_data = []
535
+
90
536
  # Overall statistics
91
537
  total_commits = len(commits)
92
- total_story_points = sum(c.get('story_points', 0) or 0 for c in commits)
538
+ total_story_points = sum(c.get("story_points", 0) or 0 for c in commits)
93
539
  # Use filtered stats if available, otherwise fall back to raw stats
94
540
  total_lines = sum(
95
- c.get('filtered_insertions', c.get('insertions', 0)) +
96
- c.get('filtered_deletions', c.get('deletions', 0))
541
+ c.get("filtered_insertions", c.get("insertions", 0))
542
+ + c.get("filtered_deletions", c.get("deletions", 0))
97
543
  for c in commits
98
544
  )
99
-
100
- summary_data.append({
101
- 'metric': 'Total Commits',
102
- 'value': total_commits,
103
- 'category': 'Overall'
104
- })
105
-
106
- summary_data.append({
107
- 'metric': 'Total Story Points',
108
- 'value': total_story_points,
109
- 'category': 'Overall'
110
- })
111
-
112
- summary_data.append({
113
- 'metric': 'Total Lines Changed',
114
- 'value': total_lines,
115
- 'category': 'Overall'
116
- })
117
-
118
- summary_data.append({
119
- 'metric': 'Active Developers',
120
- 'value': len(developer_stats),
121
- 'category': 'Overall'
122
- })
123
-
545
+
546
+ summary_data.append(
547
+ {"metric": "Total Commits", "value": total_commits, "category": "Overall"}
548
+ )
549
+
550
+ summary_data.append(
551
+ {"metric": "Total Story Points", "value": total_story_points, "category": "Overall"}
552
+ )
553
+
554
+ summary_data.append(
555
+ {"metric": "Total Lines Changed", "value": total_lines, "category": "Overall"}
556
+ )
557
+
558
+ summary_data.append(
559
+ {"metric": "Active Developers", "value": len(developer_stats), "category": "Overall"}
560
+ )
561
+
124
562
  # Ticket coverage
125
- summary_data.append({
126
- 'metric': 'Commit Ticket Coverage %',
127
- 'value': round(ticket_analysis.get('commit_coverage_pct', 0), 1),
128
- 'category': 'Tracking'
129
- })
130
-
131
- summary_data.append({
132
- 'metric': 'PR Ticket Coverage %',
133
- 'value': round(ticket_analysis.get('pr_coverage_pct', 0), 1),
134
- 'category': 'Tracking'
135
- })
136
-
563
+ summary_data.append(
564
+ {
565
+ "metric": "Commit Ticket Coverage %",
566
+ "value": round(ticket_analysis.get("commit_coverage_pct", 0), 1),
567
+ "category": "Tracking",
568
+ }
569
+ )
570
+
571
+ summary_data.append(
572
+ {
573
+ "metric": "PR Ticket Coverage %",
574
+ "value": round(ticket_analysis.get("pr_coverage_pct", 0), 1),
575
+ "category": "Tracking",
576
+ }
577
+ )
578
+
137
579
  # Platform breakdown
138
- for platform, count in ticket_analysis.get('ticket_summary', {}).items():
139
- summary_data.append({
140
- 'metric': f'{platform.title()} Tickets',
141
- 'value': count,
142
- 'category': 'Platforms'
143
- })
144
-
580
+ for platform, count in ticket_analysis.get("ticket_summary", {}).items():
581
+ summary_data.append(
582
+ {"metric": f"{platform.title()} Tickets", "value": count, "category": "Platforms"}
583
+ )
584
+
145
585
  # Developer statistics
146
586
  if developer_stats:
147
- top_contributor = max(developer_stats, key=lambda x: x['total_commits'])
148
- summary_data.append({
149
- 'metric': 'Top Contributor',
150
- 'value': self._anonymize_value(top_contributor['primary_name'], 'name'),
151
- 'category': 'Developers'
152
- })
153
-
154
- summary_data.append({
155
- 'metric': 'Top Contributor Commits',
156
- 'value': top_contributor['total_commits'],
157
- 'category': 'Developers'
158
- })
159
-
587
+ top_contributor = max(developer_stats, key=lambda x: x["total_commits"])
588
+ summary_data.append(
589
+ {
590
+ "metric": "Top Contributor",
591
+ "value": self._anonymize_value(
592
+ self._get_canonical_display_name(
593
+ top_contributor["canonical_id"],
594
+ top_contributor["primary_name"]
595
+ ), "name"
596
+ ),
597
+ "category": "Developers",
598
+ }
599
+ )
600
+
601
+ summary_data.append(
602
+ {
603
+ "metric": "Top Contributor Commits",
604
+ "value": top_contributor["total_commits"],
605
+ "category": "Developers",
606
+ }
607
+ )
608
+
609
+ # PM Platform statistics
610
+ if pm_data and "metrics" in pm_data:
611
+ metrics = pm_data["metrics"]
612
+
613
+ # Total PM issues
614
+ summary_data.append(
615
+ {
616
+ "metric": "Total PM Issues",
617
+ "value": metrics.get("total_pm_issues", 0),
618
+ "category": "PM Platforms",
619
+ }
620
+ )
621
+
622
+ # Story point analysis
623
+ story_analysis = metrics.get("story_point_analysis", {})
624
+ summary_data.append(
625
+ {
626
+ "metric": "PM Story Points",
627
+ "value": story_analysis.get("pm_total_story_points", 0),
628
+ "category": "PM Platforms",
629
+ }
630
+ )
631
+
632
+ summary_data.append(
633
+ {
634
+ "metric": "Story Point Coverage %",
635
+ "value": round(story_analysis.get("story_point_coverage_pct", 0), 1),
636
+ "category": "PM Platforms",
637
+ }
638
+ )
639
+
640
+ # Issue type distribution
641
+ issue_types = metrics.get("issue_type_distribution", {})
642
+ for issue_type, count in issue_types.items():
643
+ summary_data.append(
644
+ {
645
+ "metric": f"{issue_type.title()} Issues",
646
+ "value": count,
647
+ "category": "Issue Types",
648
+ }
649
+ )
650
+
651
+ # Platform coverage
652
+ platform_coverage = metrics.get("platform_coverage", {})
653
+ for platform, coverage_data in platform_coverage.items():
654
+ summary_data.append(
655
+ {
656
+ "metric": f"{platform.title()} Issues",
657
+ "value": coverage_data.get("total_issues", 0),
658
+ "category": "Platform Coverage",
659
+ }
660
+ )
661
+
662
+ summary_data.append(
663
+ {
664
+ "metric": f"{platform.title()} Linked %",
665
+ "value": round(coverage_data.get("coverage_percentage", 0), 1),
666
+ "category": "Platform Coverage",
667
+ }
668
+ )
669
+
670
+ # Correlation quality
671
+ correlation_quality = metrics.get("correlation_quality", {})
672
+ summary_data.append(
673
+ {
674
+ "metric": "Issue-Commit Correlations",
675
+ "value": correlation_quality.get("total_correlations", 0),
676
+ "category": "Correlation Quality",
677
+ }
678
+ )
679
+
680
+ summary_data.append(
681
+ {
682
+ "metric": "Avg Correlation Confidence",
683
+ "value": round(correlation_quality.get("average_confidence", 0), 2),
684
+ "category": "Correlation Quality",
685
+ }
686
+ )
687
+
160
688
  # Write summary CSV
161
689
  df = pd.DataFrame(summary_data)
162
690
  df.to_csv(output_path, index=False)
163
-
691
+
164
692
  return output_path
165
-
166
- def generate_developer_report(self, developer_stats: List[Dict[str, Any]],
167
- output_path: Path) -> Path:
693
+
694
+ def generate_developer_report(
695
+ self, developer_stats: list[dict[str, Any]], output_path: Path
696
+ ) -> Path:
168
697
  """Generate developer statistics CSV."""
169
698
  rows = []
170
-
699
+
171
700
  for dev in developer_stats:
172
701
  row = {
173
- 'developer_id': self._anonymize_value(dev['canonical_id'], 'id'),
174
- 'name': self._anonymize_value(dev['primary_name'], 'name'),
175
- 'email': self._anonymize_value(dev['primary_email'], 'email'),
176
- 'github_username': self._anonymize_value(
177
- dev.get('github_username', ''), 'username'
178
- ) if dev.get('github_username') else '',
179
- 'total_commits': dev['total_commits'],
180
- 'total_story_points': dev['total_story_points'],
181
- 'alias_count': dev['alias_count'],
182
- 'first_seen': dev['first_seen'].strftime('%Y-%m-%d') if dev['first_seen'] else '',
183
- 'last_seen': dev['last_seen'].strftime('%Y-%m-%d') if dev['last_seen'] else '',
184
- 'avg_story_points_per_commit': round(
185
- dev['total_story_points'] / max(dev['total_commits'], 1), 2
186
- )
702
+ "developer_id": self._anonymize_value(dev["canonical_id"], "id"),
703
+ "name": self._anonymize_value(
704
+ self._get_canonical_display_name(
705
+ dev["canonical_id"],
706
+ dev["primary_name"]
707
+ ), "name"
708
+ ),
709
+ "email": self._anonymize_value(dev["primary_email"], "email"),
710
+ "github_username": (
711
+ self._anonymize_value(dev.get("github_username", ""), "username")
712
+ if dev.get("github_username")
713
+ else ""
714
+ ),
715
+ "total_commits": dev["total_commits"],
716
+ "total_story_points": dev["total_story_points"],
717
+ "alias_count": dev.get("alias_count", 1),
718
+ "first_seen": (
719
+ self._safe_datetime_format(dev["first_seen"], "%Y-%m-%d")
720
+ if dev["first_seen"]
721
+ else ""
722
+ ),
723
+ "last_seen": (
724
+ self._safe_datetime_format(dev["last_seen"], "%Y-%m-%d")
725
+ if dev["last_seen"]
726
+ else ""
727
+ ),
728
+ "avg_story_points_per_commit": round(
729
+ dev["total_story_points"] / max(dev["total_commits"], 1), 2
730
+ ),
187
731
  }
188
732
  rows.append(row)
189
-
733
+
190
734
  # Sort by total commits
191
- rows.sort(key=lambda x: x['total_commits'], reverse=True)
192
-
735
+ rows.sort(key=lambda x: x["total_commits"], reverse=True)
736
+
193
737
  # Write CSV
194
738
  df = pd.DataFrame(rows)
195
739
  df.to_csv(output_path, index=False)
196
-
740
+
197
741
  return output_path
198
-
199
- def _aggregate_weekly_data(self, commits: List[Dict[str, Any]],
200
- start_date: datetime,
201
- end_date: datetime) -> Dict[Tuple[datetime, str, str], Dict[str, Any]]:
742
+
743
+ def generate_pm_correlations_report(self, pm_data: dict[str, Any], output_path: Path) -> Path:
744
+ """Generate PM platform correlations CSV report.
745
+
746
+ WHY: PM platform integration provides valuable correlation data between
747
+ work items and code changes. This report enables analysis of story point
748
+ accuracy, development velocity, and work item completion patterns.
749
+
750
+ Args:
751
+ pm_data: PM platform data including correlations and metrics.
752
+ output_path: Path where the CSV report should be written.
753
+
754
+ Returns:
755
+ Path to the generated CSV file.
756
+ """
757
+ if not pm_data or "correlations" not in pm_data:
758
+ # Generate empty report if no PM data
759
+ df = pd.DataFrame(
760
+ columns=[
761
+ "commit_hash",
762
+ "commit_message",
763
+ "commit_author",
764
+ "commit_date",
765
+ "issue_key",
766
+ "issue_title",
767
+ "issue_type",
768
+ "issue_status",
769
+ "issue_platform",
770
+ "story_points",
771
+ "correlation_method",
772
+ "confidence",
773
+ "matched_text",
774
+ ]
775
+ )
776
+ df.to_csv(output_path, index=False)
777
+ return output_path
778
+
779
+ correlations = pm_data["correlations"]
780
+ rows = []
781
+
782
+ for correlation in correlations:
783
+ row = {
784
+ "commit_hash": correlation.get("commit_hash", ""),
785
+ "commit_message": correlation.get("commit_message", ""),
786
+ "commit_author": self._anonymize_value(
787
+ correlation.get("commit_author", ""), "name"
788
+ ),
789
+ "commit_date": correlation.get("commit_date", ""),
790
+ "issue_key": correlation.get("issue_key", ""),
791
+ "issue_title": correlation.get("issue_title", ""),
792
+ "issue_type": correlation.get("issue_type", ""),
793
+ "issue_status": correlation.get("issue_status", ""),
794
+ "issue_platform": correlation.get("issue_platform", ""),
795
+ "story_points": correlation.get("story_points", 0) or 0,
796
+ "correlation_method": correlation.get("correlation_method", ""),
797
+ "confidence": round(correlation.get("confidence", 0), 3),
798
+ "matched_text": correlation.get("matched_text", ""),
799
+ }
800
+ rows.append(row)
801
+
802
+ df = pd.DataFrame(rows)
803
+ df.to_csv(output_path, index=False)
804
+
805
+ return output_path
806
+
807
+ def _aggregate_weekly_data(
808
+ self, commits: list[dict[str, Any]], start_date: datetime, end_date: datetime
809
+ ) -> dict[tuple[datetime, str, str], dict[str, Any]]:
202
810
  """Aggregate commit data by week."""
203
- weekly_data: defaultdict[Tuple[datetime, str, str], Dict[str, Any]] = defaultdict(lambda: {
204
- 'commits': 0,
205
- 'story_points': 0,
206
- 'lines_added': 0,
207
- 'lines_removed': 0,
208
- 'files_changed': 0,
209
- 'complexity_delta': 0.0,
210
- 'commits_with_tickets': 0,
211
- 'tickets': set(),
212
- 'prs': set()
213
- })
214
-
811
+ weekly_data: defaultdict[tuple[datetime, str, str], dict[str, Any]] = defaultdict(
812
+ lambda: {
813
+ "commits": 0,
814
+ "story_points": 0,
815
+ "lines_added": 0,
816
+ "lines_removed": 0,
817
+ "files_changed": 0,
818
+ "complexity_delta": 0.0,
819
+ "commits_with_tickets": 0,
820
+ "tickets": set(),
821
+ "prs": set(),
822
+ }
823
+ )
824
+
215
825
  for commit in commits:
216
- timestamp = commit['timestamp']
826
+ timestamp = commit["timestamp"]
827
+ logger.debug(
828
+ f"Processing commit timestamp: {timestamp} (tzinfo: {getattr(timestamp, 'tzinfo', 'N/A')})"
829
+ )
830
+
217
831
  # Ensure consistent timezone handling
218
- if hasattr(timestamp, 'tzinfo') and timestamp.tzinfo is not None:
832
+ if hasattr(timestamp, "tzinfo") and timestamp.tzinfo is not None:
219
833
  # Keep timezone-aware but ensure it's UTC
220
834
  if timestamp.tzinfo != timezone.utc:
221
835
  timestamp = timestamp.astimezone(timezone.utc)
836
+ logger.debug(f" Converted to UTC: {timestamp}")
222
837
  else:
223
838
  # Convert naive datetime to UTC timezone-aware
224
839
  timestamp = timestamp.replace(tzinfo=timezone.utc)
225
-
226
- if timestamp < start_date or timestamp > end_date:
840
+ logger.debug(f" Made timezone-aware: {timestamp}")
841
+
842
+ # Use safe comparison functions with logging
843
+ if self._safe_datetime_compare(
844
+ timestamp, start_date, "lt", "_aggregate_weekly_data range check"
845
+ ) or self._safe_datetime_compare(
846
+ timestamp, end_date, "gt", "_aggregate_weekly_data range check"
847
+ ):
848
+ logger.debug(" Skipping commit outside date range")
227
849
  continue
228
-
850
+
229
851
  # Get week start (Monday)
230
852
  week_start = self._get_week_start(timestamp)
231
-
853
+
232
854
  # Get project key (default to 'unknown')
233
- project_key = commit.get('project_key', 'unknown')
234
-
855
+ project_key = commit.get("project_key", "unknown")
856
+
235
857
  # Get canonical developer ID
236
- canonical_id = commit.get('canonical_id', commit.get('author_email', 'unknown'))
237
-
858
+ canonical_id = commit.get("canonical_id", commit.get("author_email", "unknown"))
859
+
238
860
  key = (week_start, canonical_id, project_key)
239
-
861
+
240
862
  # Aggregate metrics
241
863
  data = weekly_data[key]
242
- data['commits'] += 1
243
- data['story_points'] += commit.get('story_points', 0) or 0
244
-
864
+ data["commits"] += 1
865
+ data["story_points"] += commit.get("story_points", 0) or 0
866
+
245
867
  # Use filtered stats if available, otherwise fall back to raw stats
246
- data['lines_added'] += commit.get('filtered_insertions', commit.get('insertions', 0)) or 0
247
- data['lines_removed'] += commit.get('filtered_deletions', commit.get('deletions', 0)) or 0
248
- data['files_changed'] += commit.get('filtered_files_changed', commit.get('files_changed', 0)) or 0
249
-
250
- data['complexity_delta'] += commit.get('complexity_delta', 0.0) or 0.0
251
-
868
+ data["lines_added"] += (
869
+ commit.get("filtered_insertions", commit.get("insertions", 0)) or 0
870
+ )
871
+ data["lines_removed"] += (
872
+ commit.get("filtered_deletions", commit.get("deletions", 0)) or 0
873
+ )
874
+ data["files_changed"] += (
875
+ commit.get("filtered_files_changed", commit.get("files_changed", 0)) or 0
876
+ )
877
+
878
+ data["complexity_delta"] += commit.get("complexity_delta", 0.0) or 0.0
879
+
252
880
  # Track tickets
253
- ticket_refs = commit.get('ticket_references', [])
881
+ ticket_refs = commit.get("ticket_references", [])
254
882
  if ticket_refs:
255
- data['commits_with_tickets'] += 1
256
- tickets_set = data['tickets']
883
+ data["commits_with_tickets"] += 1
884
+ tickets_set = data["tickets"]
257
885
  for ticket in ticket_refs:
258
886
  if isinstance(ticket, dict):
259
- tickets_set.add(ticket.get('full_id', ''))
887
+ tickets_set.add(ticket.get("full_id", ""))
260
888
  else:
261
889
  tickets_set.add(str(ticket))
262
-
890
+
263
891
  # Track PRs (if available)
264
- pr_number = commit.get('pr_number')
892
+ pr_number = commit.get("pr_number")
265
893
  if pr_number:
266
- prs_set = data['prs']
894
+ prs_set = data["prs"]
267
895
  prs_set.add(pr_number)
268
-
896
+
269
897
  # Calculate derived metrics
270
- result: Dict[Tuple[datetime, str, str], Dict[str, Any]] = {}
898
+ result: dict[tuple[datetime, str, str], dict[str, Any]] = {}
271
899
  for key, metrics in weekly_data.items():
272
- commits_count = metrics['commits']
900
+ commits_count = metrics["commits"]
273
901
  if commits_count > 0:
274
- metrics['ticket_coverage_pct'] = (
275
- metrics['commits_with_tickets'] / commits_count * 100
276
- )
277
- metrics['avg_commit_size'] = (
278
- (metrics['lines_added'] + metrics['lines_removed']) / commits_count
902
+ metrics["ticket_coverage_pct"] = (
903
+ metrics["commits_with_tickets"] / commits_count * 100
279
904
  )
905
+ metrics["avg_commit_size"] = (
906
+ metrics["lines_added"] + metrics["lines_removed"]
907
+ ) / commits_count
280
908
  else:
281
- metrics['ticket_coverage_pct'] = 0
282
- metrics['avg_commit_size'] = 0
283
-
284
- tickets_set = metrics['tickets']
285
- prs_set = metrics['prs']
286
- metrics['unique_tickets'] = len(tickets_set)
287
- metrics['prs_involved'] = len(prs_set)
288
-
909
+ metrics["ticket_coverage_pct"] = 0
910
+ metrics["avg_commit_size"] = 0
911
+
912
+ tickets_set = metrics["tickets"]
913
+ prs_set = metrics["prs"]
914
+ metrics["unique_tickets"] = len(tickets_set)
915
+ metrics["prs_involved"] = len(prs_set)
916
+
289
917
  # Remove sets before returning
290
- del metrics['tickets']
291
- del metrics['prs']
292
- del metrics['commits_with_tickets']
293
-
918
+ del metrics["tickets"]
919
+ del metrics["prs"]
920
+ del metrics["commits_with_tickets"]
921
+
294
922
  result[key] = metrics
295
-
923
+
296
924
  return result
297
-
925
+
298
926
  def _get_week_start(self, date: datetime) -> datetime:
299
927
  """Get Monday of the week for a given date."""
928
+ logger.debug(
929
+ f"Getting week start for date: {date} (tzinfo: {getattr(date, 'tzinfo', 'N/A')})"
930
+ )
931
+
300
932
  # Ensure consistent timezone handling - keep timezone info
301
- if hasattr(date, 'tzinfo') and date.tzinfo is not None:
933
+ if hasattr(date, "tzinfo") and date.tzinfo is not None:
302
934
  # Keep timezone-aware but ensure it's UTC
303
935
  if date.tzinfo != timezone.utc:
304
936
  date = date.astimezone(timezone.utc)
937
+ logger.debug(f" Converted to UTC: {date}")
305
938
  else:
306
939
  # Convert naive datetime to UTC timezone-aware
307
940
  date = date.replace(tzinfo=timezone.utc)
308
-
941
+ logger.debug(f" Made timezone-aware: {date}")
942
+
309
943
  days_since_monday = date.weekday()
310
944
  monday = date - timedelta(days=days_since_monday)
311
- return monday.replace(hour=0, minute=0, second=0, microsecond=0)
312
-
945
+ result = monday.replace(hour=0, minute=0, second=0, microsecond=0)
946
+
947
+ logger.debug(f" Week start result: {result} (tzinfo: {result.tzinfo})")
948
+ return result
949
+
950
+ def generate_developer_activity_summary(
951
+ self,
952
+ commits: list[dict[str, Any]],
953
+ developer_stats: list[dict[str, Any]],
954
+ prs: list[dict[str, Any]],
955
+ output_path: Path,
956
+ weeks: int = 12,
957
+ ) -> Path:
958
+ """Generate developer activity summary with curve-normalized scores.
959
+
960
+ This report provides a high-level view of developer activity with
961
+ curve-normalized scores that allow for fair comparison across the team.
962
+ """
963
+ # Apply exclusion filtering in Phase 2
964
+ commits = self._filter_excluded_authors_list(commits)
965
+ developer_stats = self._filter_excluded_authors_list(developer_stats)
966
+
967
+ # Calculate date range
968
+ end_date = datetime.now(timezone.utc)
969
+ start_date = end_date - timedelta(weeks=weeks)
970
+
971
+ # Aggregate metrics by developer
972
+ developer_metrics = defaultdict(lambda: {
973
+ "commits": 0,
974
+ "prs_involved": 0,
975
+ "lines_added": 0,
976
+ "lines_removed": 0,
977
+ "files_changed": 0,
978
+ "complexity_delta": 0.0,
979
+ "story_points": 0,
980
+ "unique_tickets": set(),
981
+ })
982
+
983
+ # Process commits
984
+ for commit in commits:
985
+ timestamp = commit["timestamp"]
986
+ if hasattr(timestamp, "tzinfo") and timestamp.tzinfo is None:
987
+ timestamp = timestamp.replace(tzinfo=timezone.utc)
988
+
989
+ if timestamp < start_date or timestamp > end_date:
990
+ continue
991
+
992
+ dev_id = commit.get("canonical_id", commit.get("author_email", "unknown"))
993
+ metrics = developer_metrics[dev_id]
994
+
995
+ metrics["commits"] += 1
996
+ metrics["lines_added"] += commit.get("filtered_insertions", commit.get("insertions", 0)) or 0
997
+ metrics["lines_removed"] += commit.get("filtered_deletions", commit.get("deletions", 0)) or 0
998
+ metrics["files_changed"] += commit.get("filtered_files_changed", commit.get("files_changed", 0)) or 0
999
+ metrics["complexity_delta"] += commit.get("complexity_delta", 0.0) or 0.0
1000
+ metrics["story_points"] += commit.get("story_points", 0) or 0
1001
+
1002
+ ticket_refs = commit.get("ticket_references", [])
1003
+ for ticket in ticket_refs:
1004
+ if isinstance(ticket, dict):
1005
+ metrics["unique_tickets"].add(ticket.get("full_id", ""))
1006
+ else:
1007
+ metrics["unique_tickets"].add(str(ticket))
1008
+
1009
+ # Process PRs
1010
+ for pr in prs:
1011
+ author_id = pr.get("canonical_id", pr.get("author", "unknown"))
1012
+ if author_id in developer_metrics:
1013
+ developer_metrics[author_id]["prs_involved"] += 1
1014
+
1015
+ # Calculate activity scores
1016
+ developer_scores = {}
1017
+ developer_results = {}
1018
+
1019
+ for dev_id, metrics in developer_metrics.items():
1020
+ # Convert sets to counts
1021
+ metrics["unique_tickets"] = len(metrics["unique_tickets"])
1022
+
1023
+ # Calculate activity score
1024
+ activity_result = self.activity_scorer.calculate_activity_score(metrics)
1025
+ developer_scores[dev_id] = activity_result["raw_score"]
1026
+ developer_results[dev_id] = activity_result
1027
+
1028
+ # Apply curve normalization
1029
+ curve_normalized = self.activity_scorer.normalize_scores_on_curve(developer_scores)
1030
+
1031
+ # Create developer lookup
1032
+ dev_lookup = {dev["canonical_id"]: dev for dev in developer_stats}
1033
+
1034
+ # Build rows
1035
+ rows = []
1036
+ for dev_id, metrics in developer_metrics.items():
1037
+ developer = dev_lookup.get(dev_id, {})
1038
+ activity_result = developer_results[dev_id]
1039
+ curve_data = curve_normalized.get(dev_id, {})
1040
+
1041
+ row = {
1042
+ "developer_id": self._anonymize_value(dev_id, "id"),
1043
+ "developer_name": self._anonymize_value(
1044
+ self._get_canonical_display_name(
1045
+ dev_id,
1046
+ developer.get("primary_name", "Unknown")
1047
+ ), "name"
1048
+ ),
1049
+ "commits": metrics["commits"],
1050
+ "prs": metrics["prs_involved"],
1051
+ "story_points": metrics["story_points"],
1052
+ "lines_added": metrics["lines_added"],
1053
+ "lines_removed": metrics["lines_removed"],
1054
+ "files_changed": metrics["files_changed"],
1055
+ "unique_tickets": metrics["unique_tickets"],
1056
+ # Raw activity scores
1057
+ "raw_activity_score": round(activity_result["raw_score"], 1),
1058
+ "normalized_activity_score": round(activity_result["normalized_score"], 1),
1059
+ "activity_level": activity_result["activity_level"],
1060
+ # Curve-normalized scores
1061
+ "curved_score": curve_data.get("curved_score", 0),
1062
+ "percentile": curve_data.get("percentile", 0),
1063
+ "quintile": curve_data.get("quintile", 0),
1064
+ "curved_activity_level": curve_data.get("activity_level", "unknown"),
1065
+ "level_description": curve_data.get("level_description", ""),
1066
+ # Component breakdown
1067
+ "commit_score": round(activity_result["components"]["commit_score"], 1),
1068
+ "pr_score": round(activity_result["components"]["pr_score"], 1),
1069
+ "code_impact_score": round(activity_result["components"]["code_impact_score"], 1),
1070
+ "complexity_score": round(activity_result["components"]["complexity_score"], 1),
1071
+ }
1072
+ rows.append(row)
1073
+
1074
+ # Sort by curved score (highest first)
1075
+ rows.sort(key=lambda x: x["curved_score"], reverse=True)
1076
+
1077
+ # Write CSV
1078
+ if rows:
1079
+ df = pd.DataFrame(rows)
1080
+ df.to_csv(output_path, index=False)
1081
+ else:
1082
+ # Write empty CSV with headers
1083
+ with open(output_path, "w", newline="") as f:
1084
+ writer = csv.DictWriter(
1085
+ f,
1086
+ fieldnames=[
1087
+ "developer_id",
1088
+ "developer_name",
1089
+ "commits",
1090
+ "prs",
1091
+ "story_points",
1092
+ "lines_added",
1093
+ "lines_removed",
1094
+ "files_changed",
1095
+ "unique_tickets",
1096
+ "raw_activity_score",
1097
+ "normalized_activity_score",
1098
+ "activity_level",
1099
+ "curved_score",
1100
+ "percentile",
1101
+ "quintile",
1102
+ "curved_activity_level",
1103
+ "level_description",
1104
+ "commit_score",
1105
+ "pr_score",
1106
+ "code_impact_score",
1107
+ "complexity_score",
1108
+ ],
1109
+ )
1110
+ writer.writeheader()
1111
+
1112
+ return output_path
1113
+
313
1114
  def _anonymize_value(self, value: str, field_type: str) -> str:
314
1115
  """Anonymize a value if anonymization is enabled."""
315
1116
  if not self.anonymize or not value:
316
1117
  return value
317
-
318
- if field_type == 'email' and '@' in value:
1118
+
1119
+ if field_type == "email" and "@" in value:
319
1120
  # Keep domain for email
320
- local, domain = value.split('@', 1)
1121
+ local, domain = value.split("@", 1)
321
1122
  value = local # Anonymize only local part
322
1123
  suffix = f"@{domain}"
323
1124
  else:
324
1125
  suffix = ""
325
-
1126
+
326
1127
  if value not in self._anonymization_map:
327
1128
  self._anonymous_counter += 1
328
- if field_type == 'name':
1129
+ if field_type == "name":
329
1130
  anonymous = f"Developer{self._anonymous_counter}"
330
- elif field_type == 'email':
1131
+ elif field_type == "email":
331
1132
  anonymous = f"dev{self._anonymous_counter}"
332
- elif field_type == 'id':
1133
+ elif field_type == "id":
333
1134
  anonymous = f"ID{self._anonymous_counter:04d}"
334
1135
  else:
335
1136
  anonymous = f"anon{self._anonymous_counter}"
336
-
1137
+
337
1138
  self._anonymization_map[value] = anonymous
1139
+
1140
+ return self._anonymization_map[value] + suffix
1141
+
1142
+ def generate_untracked_commits_report(
1143
+ self, ticket_analysis: dict[str, Any], output_path: Path
1144
+ ) -> Path:
1145
+ """Generate detailed CSV report for commits without ticket references.
1146
+
1147
+ WHY: Untracked commits represent work that may not be visible to project
1148
+ management tools. This report enables analysis of what types of work are
1149
+ being performed outside the tracked process, helping identify process
1150
+ improvements and training needs.
1151
+
1152
+ Args:
1153
+ ticket_analysis: Ticket analysis results containing untracked commits
1154
+ output_path: Path where the CSV report should be written
1155
+
1156
+ Returns:
1157
+ Path to the generated CSV file
1158
+ """
1159
+ untracked_commits = ticket_analysis.get("untracked_commits", [])
1160
+
1161
+ if not untracked_commits:
1162
+ # Generate empty report with headers
1163
+ headers = [
1164
+ "commit_hash",
1165
+ "short_hash",
1166
+ "author",
1167
+ "author_email",
1168
+ "canonical_id",
1169
+ "date",
1170
+ "project",
1171
+ "message",
1172
+ "category",
1173
+ "files_changed",
1174
+ "lines_added",
1175
+ "lines_removed",
1176
+ "lines_changed",
1177
+ "is_merge",
1178
+ ]
1179
+ with open(output_path, "w", newline="") as f:
1180
+ writer = csv.DictWriter(f, fieldnames=headers)
1181
+ writer.writeheader()
1182
+ return output_path
1183
+
1184
+ # Process untracked commits into CSV rows
1185
+ rows = []
1186
+ for commit in untracked_commits:
1187
+ # Handle datetime formatting
1188
+ timestamp = commit.get("timestamp")
1189
+ if timestamp:
1190
+ if hasattr(timestamp, "strftime"):
1191
+ date_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
1192
+ else:
1193
+ date_str = str(timestamp)
1194
+ else:
1195
+ date_str = ""
1196
+
1197
+ row = {
1198
+ "commit_hash": commit.get("full_hash", commit.get("hash", "")),
1199
+ "short_hash": commit.get("hash", ""),
1200
+ "author": self._anonymize_value(commit.get("author", "Unknown"), "name"),
1201
+ "author_email": self._anonymize_value(commit.get("author_email", ""), "email"),
1202
+ "canonical_id": self._anonymize_value(commit.get("canonical_id", ""), "id"),
1203
+ "date": date_str,
1204
+ "project": commit.get("project_key", "UNKNOWN"),
1205
+ "message": commit.get("message", ""),
1206
+ "category": commit.get("category", "other"),
1207
+ "files_changed": commit.get("files_changed", 0),
1208
+ "lines_added": commit.get("lines_added", 0),
1209
+ "lines_removed": commit.get("lines_removed", 0),
1210
+ "lines_changed": commit.get("lines_changed", 0),
1211
+ "is_merge": commit.get("is_merge", False),
1212
+ }
1213
+ rows.append(row)
1214
+
1215
+ # Write CSV
1216
+ if rows:
1217
+ df = pd.DataFrame(rows)
1218
+ df.to_csv(output_path, index=False)
1219
+
1220
+ return output_path
1221
+ def generate_weekly_categorization_report(
1222
+ self,
1223
+ all_commits: list[dict[str, Any]],
1224
+ ticket_extractor, # TicketExtractor or MLTicketExtractor instance
1225
+ output_path: Path,
1226
+ weeks: int = 12
1227
+ ) -> Path:
1228
+ """Generate weekly commit categorization metrics CSV report for ALL commits.
1229
+
1230
+ WHY: Categorization trends provide insights into development patterns
1231
+ over time, helping identify process improvements and training needs.
1232
+ This enhanced version processes ALL commits (tracked and untracked) to provide
1233
+ complete visibility into work patterns across the entire development flow.
1234
+
1235
+ DESIGN DECISION: Processes all commits using the same ML/rule-based categorization
1236
+ system used elsewhere in the application, ensuring consistent categorization
1237
+ across all reports and analysis.
1238
+
1239
+ Args:
1240
+ all_commits: Complete list of commits to categorize
1241
+ ticket_extractor: TicketExtractor instance for commit categorization
1242
+ output_path: Path where the CSV report should be written
1243
+ weeks: Number of weeks to analyze
1244
+
1245
+ Returns:
1246
+ Path to the generated CSV file
1247
+ """
1248
+ # Calculate week boundaries
1249
+ end_date = datetime.now(timezone.utc)
1250
+ start_date = end_date - timedelta(weeks=weeks)
1251
+
1252
+ # Initialize weekly aggregation structures
1253
+ weekly_categories = defaultdict(lambda: defaultdict(int))
1254
+ weekly_metrics = defaultdict(lambda: {
1255
+ 'lines_added': 0,
1256
+ 'lines_removed': 0,
1257
+ 'files_changed': 0,
1258
+ 'developers': set()
1259
+ })
1260
+
1261
+ # Process ALL commits with classification
1262
+ processed_commits = 0
1263
+ for commit in all_commits:
1264
+ if not isinstance(commit, dict):
1265
+ continue
1266
+
1267
+ # Get timestamp and validate date range
1268
+ timestamp = commit.get("timestamp")
1269
+ if not timestamp:
1270
+ continue
1271
+
1272
+ # Ensure timezone consistency
1273
+ if hasattr(timestamp, "tzinfo") and timestamp.tzinfo is None:
1274
+ timestamp = timestamp.replace(tzinfo=timezone.utc)
1275
+ elif hasattr(timestamp, "tzinfo") and timestamp.tzinfo != timezone.utc:
1276
+ timestamp = timestamp.astimezone(timezone.utc)
1277
+
1278
+ if timestamp < start_date or timestamp > end_date:
1279
+ continue
1280
+
1281
+ # Skip merge commits (consistent with untracked analysis)
1282
+ if commit.get("is_merge", False):
1283
+ continue
1284
+
1285
+ # Categorize the commit using the same system as untracked analysis
1286
+ message = commit.get("message", "")
1287
+ files_changed_raw = commit.get("files_changed", [])
1288
+
1289
+ # Handle both int and list types for files_changed
1290
+ if isinstance(files_changed_raw, int):
1291
+ files_changed_count = files_changed_raw
1292
+ files_changed_list = [] # Can't provide file names, only count
1293
+ elif isinstance(files_changed_raw, list):
1294
+ files_changed_count = len(files_changed_raw)
1295
+ files_changed_list = files_changed_raw
1296
+ else:
1297
+ files_changed_count = 0
1298
+ files_changed_list = []
1299
+
1300
+ # Handle both TicketExtractor and MLTicketExtractor signatures
1301
+ try:
1302
+ # Try ML signature first (message, files_changed as list)
1303
+ category = ticket_extractor.categorize_commit(message, files_changed_list)
1304
+ except TypeError:
1305
+ # Fall back to base signature (message only)
1306
+ category = ticket_extractor.categorize_commit(message)
1307
+
1308
+ # Get week boundary (Monday start)
1309
+ week_start = self._get_week_start(timestamp)
1310
+
1311
+ # Aggregate by category
1312
+ weekly_categories[week_start][category] += 1
1313
+
1314
+ # Aggregate metrics
1315
+ weekly_metrics[week_start]['lines_added'] += commit.get("insertions", 0)
1316
+ weekly_metrics[week_start]['lines_removed'] += commit.get("deletions", 0)
1317
+ weekly_metrics[week_start]['files_changed'] += files_changed_count
1318
+
1319
+ # Track unique developers (use canonical_id or fallback to email)
1320
+ developer_id = commit.get("canonical_id") or commit.get("author_email", "Unknown")
1321
+ weekly_metrics[week_start]['developers'].add(developer_id)
1322
+
1323
+ processed_commits += 1
1324
+
1325
+ # Build CSV rows with comprehensive metrics
1326
+ rows = []
1327
+ all_categories = set()
1328
+
1329
+ # Collect all categories across all weeks
1330
+ for week_data in weekly_categories.values():
1331
+ all_categories.update(week_data.keys())
1332
+
1333
+ # Ensure standard categories are included even if not found
1334
+ standard_categories = ["bug_fix", "feature", "refactor", "documentation",
1335
+ "maintenance", "test", "style", "build", "integration", "other"]
1336
+ all_categories.update(standard_categories)
1337
+ sorted_categories = sorted(all_categories)
1338
+
1339
+ # Generate weekly rows
1340
+ for week_start in sorted(weekly_categories.keys()):
1341
+ week_data = weekly_categories[week_start]
1342
+ week_metrics = weekly_metrics[week_start]
1343
+ total_commits = sum(week_data.values())
1344
+
1345
+ row = {
1346
+ "week_start": week_start.strftime("%Y-%m-%d"),
1347
+ "total_commits": total_commits,
1348
+ "lines_added": week_metrics['lines_added'],
1349
+ "lines_removed": week_metrics['lines_removed'],
1350
+ "files_changed": week_metrics['files_changed'],
1351
+ "developer_count": len(week_metrics['developers'])
1352
+ }
1353
+
1354
+ # Add each category count and percentage
1355
+ for category in sorted_categories:
1356
+ count = week_data.get(category, 0)
1357
+ pct = (count / total_commits * 100) if total_commits > 0 else 0
1358
+
1359
+ row[f"{category}_count"] = count
1360
+ row[f"{category}_pct"] = round(pct, 1)
1361
+
1362
+ rows.append(row)
1363
+
1364
+ # Write CSV with comprehensive headers
1365
+ if rows:
1366
+ df = pd.DataFrame(rows)
1367
+ df.to_csv(output_path, index=False)
1368
+ else:
1369
+ # Write empty CSV with comprehensive headers
1370
+ headers = ["week_start", "total_commits", "lines_added", "lines_removed",
1371
+ "files_changed", "developer_count"]
1372
+
1373
+ for category in sorted_categories:
1374
+ headers.extend([f"{category}_count", f"{category}_pct"])
1375
+
1376
+ with open(output_path, "w", newline="") as f:
1377
+ writer = csv.DictWriter(f, fieldnames=headers)
1378
+ writer.writeheader()
338
1379
 
339
- return self._anonymization_map[value] + suffix
1380
+ return output_path
1381
+
1382
+ def generate_story_point_correlation_report(
1383
+ self,
1384
+ commits: list[dict[str, Any]],
1385
+ prs: list[dict[str, Any]],
1386
+ pm_data: Optional[dict[str, Any]],
1387
+ output_path: Path,
1388
+ weeks: int = 12
1389
+ ) -> Path:
1390
+ """Generate story point correlation analysis CSV report.
1391
+
1392
+ WHY: Story point correlation analysis helps teams understand the relationship
1393
+ between estimated effort (story points) and actual work metrics (commits,
1394
+ lines of code, time). This enables process improvements and better estimation
1395
+ calibration.
1396
+
1397
+ INTEGRATION: Uses the StoryPointCorrelationAnalyzer to provide comprehensive
1398
+ correlation metrics including weekly trends, developer accuracy, and velocity
1399
+ analysis in a format suitable for spreadsheet analysis.
1400
+
1401
+ Args:
1402
+ commits: List of commit data with story points
1403
+ prs: List of pull request data
1404
+ pm_data: PM platform data with issue correlations
1405
+ output_path: Path for the output CSV file
1406
+ weeks: Number of weeks to analyze
1407
+
1408
+ Returns:
1409
+ Path to the generated CSV report
1410
+ """
1411
+ try:
1412
+ # Import here to avoid circular imports
1413
+ from .story_point_correlation import StoryPointCorrelationAnalyzer
1414
+
1415
+ # Create analyzer with same configuration as CSV writer
1416
+ analyzer = StoryPointCorrelationAnalyzer(
1417
+ anonymize=self.anonymize,
1418
+ identity_resolver=self.identity_resolver
1419
+ )
1420
+
1421
+ # Apply exclusion filtering consistent with other reports
1422
+ commits = self._filter_excluded_authors_list(commits)
1423
+
1424
+ # Generate the correlation report
1425
+ logger.debug(f"Generating story point correlation report: {output_path}")
1426
+ return analyzer.generate_correlation_report(commits, prs, pm_data, output_path, weeks)
1427
+
1428
+ except Exception as e:
1429
+ logger.error(f"Error generating story point correlation report: {e}")
1430
+
1431
+ # Create empty report as fallback
1432
+ headers = [
1433
+ "week_start", "metric_type", "developer_name",
1434
+ "sp_commits_correlation", "sp_lines_correlation", "sp_files_correlation",
1435
+ "sp_prs_correlation", "sp_complexity_correlation", "sample_size",
1436
+ "total_story_points", "total_commits", "story_points_completed",
1437
+ "commits_count", "prs_merged", "developers_active", "velocity_trend",
1438
+ "overall_accuracy", "avg_weekly_accuracy", "consistency",
1439
+ "weeks_active", "total_estimated_sp", "total_actual_sp", "estimation_ratio"
1440
+ ]
1441
+
1442
+ df = pd.DataFrame(columns=headers)
1443
+ df.to_csv(output_path, index=False)
1444
+
1445
+ raise
1446
+
1447
+ def generate_weekly_velocity_report(
1448
+ self,
1449
+ commits: list[dict[str, Any]],
1450
+ prs: list[dict[str, Any]],
1451
+ output_path: Path,
1452
+ weeks: int = 12,
1453
+ ) -> Path:
1454
+ """Generate weekly lines-per-story-point velocity analysis report.
1455
+
1456
+ WHY: Velocity analysis helps teams understand the relationship between
1457
+ estimated effort (story points) and actual work performed (lines of code).
1458
+ This enables process improvements, better estimation calibration, and
1459
+ identification of efficiency trends over time.
1460
+
1461
+ DESIGN DECISION: Combines both PR-based and commit-based story points
1462
+ to provide comprehensive coverage, as some organizations track story
1463
+ points differently across their development workflow.
1464
+
1465
+ Args:
1466
+ commits: List of commit data dictionaries with story points
1467
+ prs: List of pull request data dictionaries with story points
1468
+ output_path: Path where the CSV report should be written
1469
+ weeks: Number of weeks to analyze (default: 12)
1470
+
1471
+ Returns:
1472
+ Path to the generated CSV file
1473
+ """
1474
+ # Apply exclusion filtering in Phase 2
1475
+ commits = self._filter_excluded_authors_list(commits)
1476
+
1477
+ # Calculate date range (timezone-aware to match commit timestamps)
1478
+ end_date = datetime.now(timezone.utc)
1479
+ start_date = end_date - timedelta(weeks=weeks)
1480
+
1481
+ logger.debug("Weekly velocity report date range:")
1482
+ logger.debug(f" start_date: {start_date} (tzinfo: {start_date.tzinfo})")
1483
+ logger.debug(f" end_date: {end_date} (tzinfo: {end_date.tzinfo})")
1484
+
1485
+ # Initialize weekly aggregation structures
1486
+ weekly_data: dict[datetime, dict[str, Any]] = defaultdict(lambda: {
1487
+ 'total_story_points': 0,
1488
+ 'pr_story_points': 0,
1489
+ 'commit_story_points': 0,
1490
+ 'total_lines': 0,
1491
+ 'lines_added': 0,
1492
+ 'lines_removed': 0,
1493
+ 'files_changed': 0,
1494
+ 'commits_count': 0,
1495
+ 'developers': set(),
1496
+ 'prs_with_sp': 0,
1497
+ 'commits_with_sp': 0,
1498
+ })
1499
+
1500
+ # Process commits for weekly aggregation
1501
+ for commit in commits:
1502
+ timestamp = commit["timestamp"]
1503
+ logger.debug(
1504
+ f"Processing commit timestamp: {timestamp} (tzinfo: {getattr(timestamp, 'tzinfo', 'N/A')})"
1505
+ )
1506
+
1507
+ # Ensure consistent timezone handling
1508
+ if hasattr(timestamp, "tzinfo") and timestamp.tzinfo is not None:
1509
+ if timestamp.tzinfo != timezone.utc:
1510
+ timestamp = timestamp.astimezone(timezone.utc)
1511
+ else:
1512
+ timestamp = timestamp.replace(tzinfo=timezone.utc)
1513
+
1514
+ # Check date range
1515
+ if self._safe_datetime_compare(
1516
+ timestamp, start_date, "lt", "generate_weekly_velocity_report range check"
1517
+ ) or self._safe_datetime_compare(
1518
+ timestamp, end_date, "gt", "generate_weekly_velocity_report range check"
1519
+ ):
1520
+ continue
1521
+
1522
+ # Get week start (Monday)
1523
+ week_start = self._get_week_start(timestamp)
1524
+ week_data = weekly_data[week_start]
1525
+
1526
+ # Aggregate commit metrics
1527
+ story_points = commit.get("story_points", 0) or 0
1528
+ lines_added = commit.get("filtered_insertions", commit.get("insertions", 0)) or 0
1529
+ lines_removed = commit.get("filtered_deletions", commit.get("deletions", 0)) or 0
1530
+ files_changed = commit.get("filtered_files_changed", commit.get("files_changed", 0)) or 0
1531
+
1532
+ week_data['commits_count'] += 1
1533
+ week_data['commit_story_points'] += story_points
1534
+ week_data['total_story_points'] += story_points
1535
+ week_data['lines_added'] += lines_added
1536
+ week_data['lines_removed'] += lines_removed
1537
+ week_data['total_lines'] += lines_added + lines_removed
1538
+ week_data['files_changed'] += files_changed
1539
+
1540
+ # Track developers and story point coverage
1541
+ developer_id = commit.get("canonical_id", commit.get("author_email", "unknown"))
1542
+ week_data['developers'].add(developer_id)
1543
+
1544
+ if story_points > 0:
1545
+ week_data['commits_with_sp'] += 1
1546
+
1547
+ # Process PRs for weekly aggregation (by merge date or creation date)
1548
+ for pr in prs:
1549
+ # Use merged_at if available and valid, otherwise created_at
1550
+ pr_date = pr.get("merged_at") or pr.get("created_at")
1551
+ if not pr_date:
1552
+ continue
1553
+
1554
+ # Handle string dates (convert to datetime if needed)
1555
+ if isinstance(pr_date, str):
1556
+ try:
1557
+ from dateutil.parser import parse
1558
+ pr_date = parse(pr_date)
1559
+ except Exception:
1560
+ continue
1561
+
1562
+ # Ensure timezone consistency
1563
+ if hasattr(pr_date, "tzinfo") and pr_date.tzinfo is not None:
1564
+ if pr_date.tzinfo != timezone.utc:
1565
+ pr_date = pr_date.astimezone(timezone.utc)
1566
+ else:
1567
+ pr_date = pr_date.replace(tzinfo=timezone.utc)
1568
+
1569
+ # Check date range
1570
+ if self._safe_datetime_compare(
1571
+ pr_date, start_date, "lt", "generate_weekly_velocity_report PR range check"
1572
+ ) or self._safe_datetime_compare(
1573
+ pr_date, end_date, "gt", "generate_weekly_velocity_report PR range check"
1574
+ ):
1575
+ continue
1576
+
1577
+ # Get week start
1578
+ week_start = self._get_week_start(pr_date)
1579
+ week_data = weekly_data[week_start]
1580
+
1581
+ # Aggregate PR metrics
1582
+ story_points = pr.get("story_points", 0) or 0
1583
+ if story_points > 0:
1584
+ week_data['pr_story_points'] += story_points
1585
+ week_data['total_story_points'] += story_points
1586
+ week_data['prs_with_sp'] += 1
1587
+
1588
+ # Track developer from PR
1589
+ developer_id = pr.get("canonical_id", pr.get("author", "unknown"))
1590
+ week_data['developers'].add(developer_id)
1591
+
1592
+ # Build CSV rows with velocity metrics
1593
+ rows = []
1594
+ previous_week_lines_per_point = None
1595
+
1596
+ for week_start in sorted(weekly_data.keys()):
1597
+ week_data = weekly_data[week_start]
1598
+ total_story_points = week_data['total_story_points']
1599
+ total_lines = week_data['total_lines']
1600
+
1601
+ # Calculate key metrics with division by zero protection
1602
+ lines_per_point = (total_lines / total_story_points) if total_story_points > 0 else 0
1603
+ commits_per_point = (week_data['commits_count'] / total_story_points) if total_story_points > 0 else 0
1604
+
1605
+ # Calculate efficiency score (inverse of lines per point, normalized to 0-100 scale)
1606
+ # Higher efficiency = fewer lines needed per story point
1607
+ if lines_per_point > 0:
1608
+ # Use a logarithmic scale to handle wide ranges
1609
+ import math
1610
+ efficiency_score = max(0, 100 - (math.log10(max(lines_per_point, 1)) * 20))
1611
+ else:
1612
+ efficiency_score = 0
1613
+
1614
+ # Calculate velocity trend (week-over-week change in lines per point)
1615
+ if previous_week_lines_per_point is not None and previous_week_lines_per_point > 0:
1616
+ if lines_per_point > 0:
1617
+ velocity_trend = ((lines_per_point - previous_week_lines_per_point) / previous_week_lines_per_point) * 100
1618
+ else:
1619
+ velocity_trend = -100 # Went from some lines per point to zero
1620
+ else:
1621
+ velocity_trend = 0 # No previous data for comparison
1622
+
1623
+ row = {
1624
+ "week_start": week_start.strftime("%Y-%m-%d"),
1625
+ "total_story_points": total_story_points,
1626
+ "pr_story_points": week_data['pr_story_points'],
1627
+ "commit_story_points": week_data['commit_story_points'],
1628
+ "total_lines": total_lines,
1629
+ "lines_added": week_data['lines_added'],
1630
+ "lines_removed": week_data['lines_removed'],
1631
+ "files_changed": week_data['files_changed'],
1632
+ "lines_per_point": round(lines_per_point, 2) if lines_per_point > 0 else 0,
1633
+ "commits_per_point": round(commits_per_point, 2) if commits_per_point > 0 else 0,
1634
+ "developers_involved": len(week_data['developers']),
1635
+ "efficiency_score": round(efficiency_score, 1),
1636
+ "velocity_trend": round(velocity_trend, 1),
1637
+ # Additional metrics for deeper analysis
1638
+ "commits_count": week_data['commits_count'],
1639
+ "prs_with_story_points": week_data['prs_with_sp'],
1640
+ "commits_with_story_points": week_data['commits_with_sp'],
1641
+ "story_point_coverage_pct": round(
1642
+ (week_data['commits_with_sp'] / max(week_data['commits_count'], 1)) * 100, 1
1643
+ ),
1644
+ "avg_lines_per_commit": round(
1645
+ total_lines / max(week_data['commits_count'], 1), 1
1646
+ ),
1647
+ "avg_files_per_commit": round(
1648
+ week_data['files_changed'] / max(week_data['commits_count'], 1), 1
1649
+ ),
1650
+ }
1651
+ rows.append(row)
1652
+
1653
+ # Store for next iteration's trend calculation
1654
+ previous_week_lines_per_point = lines_per_point if lines_per_point > 0 else None
1655
+
1656
+ # Write CSV
1657
+ if rows:
1658
+ df = pd.DataFrame(rows)
1659
+ df.to_csv(output_path, index=False)
1660
+ else:
1661
+ # Write empty CSV with headers
1662
+ headers = [
1663
+ "week_start",
1664
+ "total_story_points",
1665
+ "pr_story_points",
1666
+ "commit_story_points",
1667
+ "total_lines",
1668
+ "lines_added",
1669
+ "lines_removed",
1670
+ "files_changed",
1671
+ "lines_per_point",
1672
+ "commits_per_point",
1673
+ "developers_involved",
1674
+ "efficiency_score",
1675
+ "velocity_trend",
1676
+ "commits_count",
1677
+ "prs_with_story_points",
1678
+ "commits_with_story_points",
1679
+ "story_point_coverage_pct",
1680
+ "avg_lines_per_commit",
1681
+ "avg_files_per_commit",
1682
+ ]
1683
+ with open(output_path, "w", newline="") as f:
1684
+ writer = csv.DictWriter(f, fieldnames=headers)
1685
+ writer.writeheader()
1686
+
1687
+ return output_path
1688
+
1689
+ def generate_weekly_dora_report(
1690
+ self,
1691
+ commits: list[dict[str, Any]],
1692
+ prs: list[dict[str, Any]],
1693
+ output_path: Path,
1694
+ weeks: int = 12,
1695
+ ) -> Path:
1696
+ """Generate weekly DORA metrics CSV report.
1697
+
1698
+ WHY: Weekly DORA metrics provide trend analysis for software delivery
1699
+ performance, enabling teams to track improvements and identify periods
1700
+ of degraded performance across the four key metrics.
1701
+
1702
+ DESIGN DECISION: Uses the DORAMetricsCalculator with weekly breakdown
1703
+ to provide consistent methodology while adding trend analysis and
1704
+ rolling averages for smoother interpretation.
1705
+
1706
+ Args:
1707
+ commits: List of commit data dictionaries
1708
+ prs: List of pull request data dictionaries
1709
+ output_path: Path where the CSV report should be written
1710
+ weeks: Number of weeks to analyze (default: 12)
1711
+
1712
+ Returns:
1713
+ Path to the generated CSV file
1714
+ """
1715
+ from ..metrics.dora import DORAMetricsCalculator
1716
+
1717
+ # Apply exclusion filtering in Phase 2
1718
+ commits = self._filter_excluded_authors_list(commits)
1719
+
1720
+ # Calculate date range
1721
+ end_date = datetime.now(timezone.utc)
1722
+ start_date = end_date - timedelta(weeks=weeks)
1723
+
1724
+ # Initialize DORA calculator
1725
+ dora_calculator = DORAMetricsCalculator()
1726
+
1727
+ try:
1728
+ # Calculate weekly DORA metrics
1729
+ weekly_metrics = dora_calculator.calculate_weekly_dora_metrics(
1730
+ commits=commits,
1731
+ prs=prs,
1732
+ start_date=start_date,
1733
+ end_date=end_date,
1734
+ )
1735
+
1736
+ if not weekly_metrics:
1737
+ # Generate empty report with headers
1738
+ headers = [
1739
+ "week_start",
1740
+ "week_end",
1741
+ "deployment_frequency",
1742
+ "lead_time_hours",
1743
+ "change_failure_rate",
1744
+ "mttr_hours",
1745
+ "total_failures",
1746
+ "total_commits",
1747
+ "total_prs",
1748
+ "deployment_frequency_4w_avg",
1749
+ "lead_time_4w_avg",
1750
+ "change_failure_rate_4w_avg",
1751
+ "mttr_4w_avg",
1752
+ "deployment_frequency_change_pct",
1753
+ "lead_time_change_pct",
1754
+ "change_failure_rate_change_pct",
1755
+ "mttr_change_pct",
1756
+ "deployment_frequency_trend",
1757
+ "lead_time_trend",
1758
+ "change_failure_rate_trend",
1759
+ "mttr_trend",
1760
+ ]
1761
+
1762
+ df = pd.DataFrame(columns=headers)
1763
+ df.to_csv(output_path, index=False)
1764
+ return output_path
1765
+
1766
+ # Convert to DataFrame and write CSV
1767
+ df = pd.DataFrame(weekly_metrics)
1768
+ df.to_csv(output_path, index=False)
1769
+
1770
+ return output_path
1771
+
1772
+ except Exception as e:
1773
+ logger.error(f"Error generating weekly DORA report: {e}")
1774
+
1775
+ # Create empty report as fallback
1776
+ headers = [
1777
+ "week_start",
1778
+ "week_end",
1779
+ "deployment_frequency",
1780
+ "lead_time_hours",
1781
+ "change_failure_rate",
1782
+ "mttr_hours",
1783
+ "total_failures",
1784
+ "total_commits",
1785
+ "total_prs",
1786
+ "deployment_frequency_4w_avg",
1787
+ "lead_time_4w_avg",
1788
+ "change_failure_rate_4w_avg",
1789
+ "mttr_4w_avg",
1790
+ "deployment_frequency_change_pct",
1791
+ "lead_time_change_pct",
1792
+ "change_failure_rate_change_pct",
1793
+ "mttr_change_pct",
1794
+ "deployment_frequency_trend",
1795
+ "lead_time_trend",
1796
+ "change_failure_rate_trend",
1797
+ "mttr_trend",
1798
+ ]
1799
+
1800
+ df = pd.DataFrame(columns=headers)
1801
+ df.to_csv(output_path, index=False)
1802
+
1803
+ raise