gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -1,319 +1,1803 @@
1
1
  """CSV report generation for GitFlow Analytics."""
2
+
2
3
  import csv
3
- from datetime import datetime, timedelta
4
- from pathlib import Path
5
- from typing import List, Dict, Any, Optional
4
+ import logging
6
5
  from collections import defaultdict
6
+ from datetime import datetime, timedelta, timezone
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Optional
9
+
7
10
  import pandas as pd
8
11
 
12
+ from ..metrics.activity_scoring import ActivityScorer
13
+ from .base import BaseReportGenerator, ReportData, ReportOutput
14
+ from .interfaces import ReportFormat
9
15
 
10
- class CSVReportGenerator:
16
+ # Get logger for this module
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class CSVReportGenerator(BaseReportGenerator):
11
21
  """Generate CSV reports with weekly metrics."""
12
-
13
- def __init__(self, anonymize: bool = False):
22
+
23
+ def __init__(self, anonymize: bool = False, exclude_authors: list[str] = None, identity_resolver=None, **kwargs):
14
24
  """Initialize report generator."""
15
- self.anonymize = anonymize
16
- self._anonymization_map = {}
17
- self._anonymous_counter = 0
25
+ super().__init__(anonymize=anonymize, exclude_authors=exclude_authors,
26
+ identity_resolver=identity_resolver, **kwargs)
27
+ self.activity_scorer = ActivityScorer()
28
+
29
+ # Implementation of abstract methods from BaseReportGenerator
18
30
 
19
- def generate_weekly_report(self, commits: List[Dict[str, Any]],
20
- developer_stats: List[Dict[str, Any]],
21
- output_path: Path,
22
- weeks: int = 12) -> Path:
31
+ def generate(self, data: ReportData, output_path: Optional[Path] = None) -> ReportOutput:
32
+ """Generate CSV report from standardized data.
33
+
34
+ Args:
35
+ data: Standardized report data
36
+ output_path: Optional path to write the report to
37
+
38
+ Returns:
39
+ ReportOutput containing the results
40
+ """
41
+ try:
42
+ # Validate data
43
+ if not self.validate_data(data):
44
+ return ReportOutput(
45
+ success=False,
46
+ errors=["Invalid or incomplete data provided"]
47
+ )
48
+
49
+ # Pre-process data (apply filters and anonymization)
50
+ data = self.pre_process(data)
51
+
52
+ # Generate appropriate CSV based on available data
53
+ if output_path:
54
+ # Determine report type based on filename or available data
55
+ filename = output_path.name.lower()
56
+
57
+ if "weekly" in filename and data.commits:
58
+ self.generate_weekly_report(data.commits, data.developer_stats, output_path)
59
+ elif "developer" in filename and data.developer_stats:
60
+ self.generate_developer_report(data.developer_stats, output_path)
61
+ elif "activity" in filename and data.activity_data:
62
+ # Write activity data directly
63
+ df = pd.DataFrame(data.activity_data)
64
+ df.to_csv(output_path, index=False)
65
+ elif "focus" in filename and data.focus_data:
66
+ # Write focus data directly
67
+ df = pd.DataFrame(data.focus_data)
68
+ df.to_csv(output_path, index=False)
69
+ elif data.commits:
70
+ # Default to weekly report
71
+ self.generate_weekly_report(data.commits, data.developer_stats, output_path)
72
+ else:
73
+ return ReportOutput(
74
+ success=False,
75
+ errors=["No suitable data found for CSV generation"]
76
+ )
77
+
78
+ # Calculate file size
79
+ file_size = output_path.stat().st_size if output_path.exists() else 0
80
+
81
+ return ReportOutput(
82
+ success=True,
83
+ file_path=output_path,
84
+ format="csv",
85
+ size_bytes=file_size
86
+ )
87
+ else:
88
+ # Generate in-memory CSV
89
+ import io
90
+ buffer = io.StringIO()
91
+
92
+ # Default to generating weekly report in memory
93
+ if data.commits:
94
+ # Create temporary dataframe
95
+ df = pd.DataFrame(self._aggregate_weekly_data(data.commits,
96
+ datetime.now(timezone.utc) - timedelta(weeks=52),
97
+ datetime.now(timezone.utc)))
98
+ df.to_csv(buffer, index=False)
99
+ content = buffer.getvalue()
100
+
101
+ return ReportOutput(
102
+ success=True,
103
+ content=content,
104
+ format="csv",
105
+ size_bytes=len(content)
106
+ )
107
+ else:
108
+ return ReportOutput(
109
+ success=False,
110
+ errors=["No data available for CSV generation"]
111
+ )
112
+
113
+ except Exception as e:
114
+ self.logger.error(f"Error generating CSV report: {e}")
115
+ return ReportOutput(
116
+ success=False,
117
+ errors=[str(e)]
118
+ )
119
+
120
+ def get_required_fields(self) -> List[str]:
121
+ """Get the list of required data fields for CSV generation.
122
+
123
+ Returns:
124
+ List of required field names
125
+ """
126
+ # CSV reports can work with various combinations of data
127
+ # At minimum, we need either commits or developer_stats
128
+ return ["commits"] # Primary requirement
129
+
130
+ def get_format_type(self) -> str:
131
+ """Get the format type this generator produces.
132
+
133
+ Returns:
134
+ Format identifier
135
+ """
136
+ return ReportFormat.CSV.value
137
+
138
+ def _filter_excluded_authors_list(self, data_list: list[dict[str, Any]]) -> list[dict[str, Any]]:
139
+ """
140
+ Filter out excluded authors from any data list using canonical_id and enhanced bot detection.
141
+
142
+ WHY: Bot exclusion happens in Phase 2 (reporting) instead of Phase 1 (data collection)
143
+ to ensure manual identity mappings work correctly. This allows the system to see
144
+ consolidated bot identities via canonical_id instead of just original author_email/author_name.
145
+
146
+ ENHANCEMENT: Added enhanced bot pattern matching to catch bots that weren't properly
147
+ consolidated via manual mappings, preventing bot leakage in reports.
148
+
149
+ Args:
150
+ data_list: List of data dictionaries containing canonical_id field
151
+
152
+ Returns:
153
+ Filtered list with excluded authors removed
154
+ """
155
+ if not self.exclude_authors:
156
+ return data_list
157
+
158
+ logger.debug(f"DEBUG EXCLUSION: Starting filter with {len(self.exclude_authors)} excluded authors: {self.exclude_authors}")
159
+ logger.debug(f"DEBUG EXCLUSION: Filtering {len(data_list)} items from data list")
160
+
161
+ excluded_lower = [author.lower() for author in self.exclude_authors]
162
+ logger.debug(f"DEBUG EXCLUSION: Excluded authors (lowercase): {excluded_lower}")
163
+
164
+ # Separate explicit excludes from bot patterns
165
+ explicit_excludes = []
166
+ bot_patterns = []
167
+
168
+ for exclude in excluded_lower:
169
+ if '[bot]' in exclude or 'bot' in exclude.split():
170
+ bot_patterns.append(exclude)
171
+ else:
172
+ explicit_excludes.append(exclude)
173
+
174
+ logger.debug(f"DEBUG EXCLUSION: Explicit excludes: {explicit_excludes}")
175
+ logger.debug(f"DEBUG EXCLUSION: Bot patterns: {bot_patterns}")
176
+
177
+ filtered_data = []
178
+ excluded_count = 0
179
+
180
+ # Sample first 5 items to see data structure
181
+ for i, item in enumerate(data_list[:5]):
182
+ logger.debug(f"DEBUG EXCLUSION: Sample item {i}: canonical_id='{item.get('canonical_id', '')}', "
183
+ f"author_email='{item.get('author_email', '')}', author_name='{item.get('author_name', '')}', "
184
+ f"author='{item.get('author', '')}', primary_name='{item.get('primary_name', '')}', "
185
+ f"name='{item.get('name', '')}', developer='{item.get('developer', '')}', "
186
+ f"display_name='{item.get('display_name', '')}'")
187
+
188
+ for item in data_list:
189
+ canonical_id = item.get("canonical_id", "")
190
+ # Also check original author fields as fallback for data without canonical_id
191
+ author_email = item.get("author_email", "")
192
+ author_name = item.get("author_name", "")
193
+
194
+ # Check all possible author fields to ensure we catch every variation
195
+ author = item.get("author", "")
196
+ primary_name = item.get("primary_name", "")
197
+ name = item.get("name", "")
198
+ developer = item.get("developer", "") # Common in CSV data
199
+ display_name = item.get("display_name", "") # Common in some data structures
200
+
201
+ # Collect all identity fields for checking
202
+ identity_fields = [
203
+ canonical_id,
204
+ item.get("primary_email", ""),
205
+ author_email,
206
+ author_name,
207
+ author,
208
+ primary_name,
209
+ name,
210
+ developer,
211
+ display_name
212
+ ]
213
+
214
+ should_exclude = False
215
+ exclusion_reason = ""
216
+
217
+ # Check for exact matches with explicit excludes first
218
+ for field in identity_fields:
219
+ if field and field.lower() in explicit_excludes:
220
+ should_exclude = True
221
+ exclusion_reason = f"exact match with '{field}' in explicit excludes"
222
+ break
223
+
224
+ # If not explicitly excluded, check for bot patterns
225
+ if not should_exclude:
226
+ for field in identity_fields:
227
+ if not field:
228
+ continue
229
+ field_lower = field.lower()
230
+
231
+ # Enhanced bot detection: check if any field contains bot-like patterns
232
+ for bot_pattern in bot_patterns:
233
+ if bot_pattern in field_lower:
234
+ should_exclude = True
235
+ exclusion_reason = f"bot pattern '{bot_pattern}' matches field '{field}'"
236
+ break
237
+
238
+ # Additional bot detection: check for common bot patterns not in explicit list
239
+ if not should_exclude:
240
+ bot_indicators = ['[bot]', 'bot@', '-bot', 'automated', 'github-actions', 'dependabot', 'renovate']
241
+ for indicator in bot_indicators:
242
+ if indicator in field_lower:
243
+ # Only exclude if this bot-like pattern matches something in our exclude list
244
+ for exclude in excluded_lower:
245
+ if indicator.replace('[', '').replace(']', '') in exclude or exclude in field_lower:
246
+ should_exclude = True
247
+ exclusion_reason = f"bot indicator '{indicator}' in field '{field}' matches exclude pattern '{exclude}'"
248
+ break
249
+ if should_exclude:
250
+ break
251
+
252
+ if should_exclude:
253
+ break
254
+
255
+ if should_exclude:
256
+ excluded_count += 1
257
+ logger.debug(f"DEBUG EXCLUSION: EXCLUDING item - {exclusion_reason}")
258
+ logger.debug(f" canonical_id='{canonical_id}', primary_email='{item.get('primary_email', '')}', "
259
+ f"author_email='{author_email}', author_name='{author_name}', author='{author}', "
260
+ f"primary_name='{primary_name}', name='{name}', developer='{developer}', "
261
+ f"display_name='{display_name}'")
262
+ else:
263
+ filtered_data.append(item)
264
+
265
+ logger.debug(f"DEBUG EXCLUSION: Excluded {excluded_count} items, kept {len(filtered_data)} items")
266
+ return filtered_data
267
+
268
+ def _get_canonical_display_name(self, canonical_id: str, fallback_name: str) -> str:
269
+ """
270
+ Get the canonical display name for a developer.
271
+
272
+ WHY: Manual identity mappings may have updated display names that aren't
273
+ reflected in the developer_stats data passed to report generators. This
274
+ method ensures we get the most current display name from the identity resolver.
275
+
276
+ Args:
277
+ canonical_id: The canonical ID to get the display name for
278
+ fallback_name: The fallback name to use if identity resolver is not available
279
+
280
+ Returns:
281
+ The canonical display name or fallback name
282
+ """
283
+ if self.identity_resolver and canonical_id:
284
+ try:
285
+ canonical_name = self.identity_resolver.get_canonical_name(canonical_id)
286
+ if canonical_name and canonical_name != "Unknown":
287
+ return canonical_name
288
+ except Exception as e:
289
+ logger.debug(f"Error getting canonical name for {canonical_id}: {e}")
290
+
291
+ return fallback_name
292
+
293
+ def _log_datetime_comparison(
294
+ self, dt1: datetime, dt2: datetime, operation: str, location: str
295
+ ) -> None:
296
+ """Log datetime comparison details for debugging timezone issues."""
297
+ logger.debug(f"Comparing dates in {location} ({operation}):")
298
+ logger.debug(f" dt1: {dt1} (tzinfo: {dt1.tzinfo}, aware: {dt1.tzinfo is not None})")
299
+ logger.debug(f" dt2: {dt2} (tzinfo: {dt2.tzinfo}, aware: {dt2.tzinfo is not None})")
300
+
301
+ def _safe_datetime_compare(
302
+ self, dt1: datetime, dt2: datetime, operation: str, location: str
303
+ ) -> bool:
304
+ """Safely compare datetimes with logging and error handling."""
305
+ try:
306
+ self._log_datetime_comparison(dt1, dt2, operation, location)
307
+
308
+ if operation == "lt":
309
+ result = dt1 < dt2
310
+ elif operation == "gt":
311
+ result = dt1 > dt2
312
+ elif operation == "le":
313
+ result = dt1 <= dt2
314
+ elif operation == "ge":
315
+ result = dt1 >= dt2
316
+ elif operation == "eq":
317
+ result = dt1 == dt2
318
+ else:
319
+ raise ValueError(f"Unknown operation: {operation}")
320
+
321
+ logger.debug(f" Result: {result}")
322
+ return result
323
+
324
+ except TypeError as e:
325
+ logger.error(f"Timezone comparison error in {location}:")
326
+ logger.error(
327
+ f" dt1: {dt1} (type: {type(dt1)}, tzinfo: {getattr(dt1, 'tzinfo', 'N/A')})"
328
+ )
329
+ logger.error(
330
+ f" dt2: {dt2} (type: {type(dt2)}, tzinfo: {getattr(dt2, 'tzinfo', 'N/A')})"
331
+ )
332
+ logger.error(f" Operation: {operation}")
333
+ logger.error(f" Error: {e}")
334
+
335
+ # Import traceback for detailed error info
336
+ import traceback
337
+
338
+ logger.error(f" Full traceback:\n{traceback.format_exc()}")
339
+
340
+ # Try to fix by making both timezone-aware in UTC
341
+ try:
342
+ if dt1.tzinfo is None:
343
+ dt1 = dt1.replace(tzinfo=timezone.utc)
344
+ logger.debug(f" Fixed dt1 to UTC: {dt1}")
345
+ if dt2.tzinfo is None:
346
+ dt2 = dt2.replace(tzinfo=timezone.utc)
347
+ logger.debug(f" Fixed dt2 to UTC: {dt2}")
348
+
349
+ # Retry comparison
350
+ if operation == "lt":
351
+ result = dt1 < dt2
352
+ elif operation == "gt":
353
+ result = dt1 > dt2
354
+ elif operation == "le":
355
+ result = dt1 <= dt2
356
+ elif operation == "ge":
357
+ result = dt1 >= dt2
358
+ elif operation == "eq":
359
+ result = dt1 == dt2
360
+ else:
361
+ raise ValueError(f"Unknown operation: {operation}")
362
+
363
+ logger.info(f" Fixed comparison result: {result}")
364
+ return result
365
+
366
+ except Exception as fix_error:
367
+ logger.error(f" Failed to fix timezone issue: {fix_error}")
368
+ raise
369
+
370
+ def _safe_datetime_format(self, dt: datetime, format_str: str) -> str:
371
+ """Safely format datetime with logging."""
372
+ try:
373
+ logger.debug(
374
+ f"Formatting datetime: {dt} (tzinfo: {getattr(dt, 'tzinfo', 'N/A')}) with format {format_str}"
375
+ )
376
+ result = dt.strftime(format_str)
377
+ logger.debug(f" Format result: {result}")
378
+ return result
379
+ except Exception as e:
380
+ logger.error(f"Error formatting datetime {dt}: {e}")
381
+ return str(dt)
382
+
383
+ def generate_weekly_report(
384
+ self,
385
+ commits: list[dict[str, Any]],
386
+ developer_stats: list[dict[str, Any]],
387
+ output_path: Path,
388
+ weeks: int = 12,
389
+ ) -> Path:
23
390
  """Generate weekly metrics CSV report."""
24
- # Calculate week boundaries
25
- end_date = datetime.now()
391
+ # Apply exclusion filtering in Phase 2
392
+ commits = self._filter_excluded_authors_list(commits)
393
+ developer_stats = self._filter_excluded_authors_list(developer_stats)
394
+ # Calculate week boundaries (timezone-aware to match commit timestamps)
395
+ end_date = datetime.now(timezone.utc)
26
396
  start_date = end_date - timedelta(weeks=weeks)
27
-
397
+
398
+ logger.debug("Weekly report date range:")
399
+ logger.debug(f" start_date: {start_date} (tzinfo: {start_date.tzinfo})")
400
+ logger.debug(f" end_date: {end_date} (tzinfo: {end_date.tzinfo})")
401
+
28
402
  # Group commits by week and developer
29
- weekly_data = self._aggregate_weekly_data(commits, start_date, end_date)
30
-
403
+ weekly_data: dict[tuple[datetime, str, str], dict[str, Any]] = self._aggregate_weekly_data(
404
+ commits, start_date, end_date
405
+ )
406
+
31
407
  # Create developer lookup
32
- dev_lookup = {dev['canonical_id']: dev for dev in developer_stats}
408
+ dev_lookup = {dev["canonical_id"]: dev for dev in developer_stats}
409
+
410
+ # First pass: collect all raw scores for curve normalization
411
+ developer_raw_scores = {}
412
+ weekly_scores = {}
33
413
 
414
+ for (week_start, canonical_id, project_key), metrics in weekly_data.items():
415
+ activity_result = self.activity_scorer.calculate_activity_score(metrics)
416
+ raw_score = activity_result["raw_score"]
417
+
418
+ # Store for curve normalization
419
+ if canonical_id not in developer_raw_scores:
420
+ developer_raw_scores[canonical_id] = 0
421
+ developer_raw_scores[canonical_id] += raw_score
422
+
423
+ # Store weekly result for later use
424
+ weekly_scores[(week_start, canonical_id, project_key)] = activity_result
425
+
426
+ # Apply curve normalization to developer totals
427
+ curve_normalized = self.activity_scorer.normalize_scores_on_curve(developer_raw_scores)
428
+
34
429
  # Build CSV rows
35
430
  rows = []
36
431
  for (week_start, canonical_id, project_key), metrics in weekly_data.items():
37
432
  developer = dev_lookup.get(canonical_id, {})
433
+ activity_result = weekly_scores[(week_start, canonical_id, project_key)]
38
434
 
435
+ # Get curve data for this developer
436
+ curve_data = curve_normalized.get(canonical_id, {})
437
+
39
438
  row = {
40
- 'week_start': week_start.strftime('%Y-%m-%d'),
41
- 'developer_id': self._anonymize_value(canonical_id, 'id'),
42
- 'developer_name': self._anonymize_value(
43
- developer.get('primary_name', 'Unknown'), 'name'
439
+ "week_start": week_start.strftime("%Y-%m-%d"),
440
+ "developer_id": self._anonymize_value(canonical_id, "id"),
441
+ "developer_name": self._anonymize_value(
442
+ self._get_canonical_display_name(
443
+ canonical_id,
444
+ developer.get("primary_name", "Unknown")
445
+ ), "name"
44
446
  ),
45
- 'developer_email': self._anonymize_value(
46
- developer.get('primary_email', 'unknown@example.com'), 'email'
447
+ "developer_email": self._anonymize_value(
448
+ developer.get("primary_email", "unknown@example.com"), "email"
47
449
  ),
48
- 'project': project_key,
49
- 'commits': metrics['commits'],
50
- 'story_points': metrics['story_points'],
51
- 'lines_added': metrics['lines_added'],
52
- 'lines_removed': metrics['lines_removed'],
53
- 'files_changed': metrics['files_changed'],
54
- 'complexity_delta': round(metrics['complexity_delta'], 2),
55
- 'ticket_coverage_pct': round(metrics['ticket_coverage_pct'], 1),
56
- 'avg_commit_size': round(metrics['avg_commit_size'], 1),
57
- 'unique_tickets': metrics['unique_tickets'],
58
- 'prs_involved': metrics['prs_involved']
450
+ "project": project_key,
451
+ "commits": metrics["commits"],
452
+ "story_points": metrics["story_points"],
453
+ "lines_added": metrics["lines_added"],
454
+ "lines_removed": metrics["lines_removed"],
455
+ "files_changed": metrics["files_changed"],
456
+ "complexity_delta": round(metrics["complexity_delta"], 2),
457
+ "ticket_coverage_pct": round(metrics["ticket_coverage_pct"], 1),
458
+ "avg_commit_size": round(metrics["avg_commit_size"], 1),
459
+ "unique_tickets": metrics["unique_tickets"],
460
+ "prs_involved": metrics["prs_involved"],
461
+ # Activity score fields
462
+ "activity_score": round(activity_result["normalized_score"], 1),
463
+ "activity_level": activity_result["activity_level"],
464
+ "commit_score": round(activity_result["components"]["commit_score"], 1),
465
+ "pr_score": round(activity_result["components"]["pr_score"], 1),
466
+ "code_impact_score": round(activity_result["components"]["code_impact_score"], 1),
467
+ "complexity_score": round(activity_result["components"]["complexity_score"], 1),
468
+ # Curve normalization fields
469
+ "curved_score": curve_data.get("curved_score", 0),
470
+ "percentile": curve_data.get("percentile", 0),
471
+ "quintile": curve_data.get("quintile", 0),
472
+ "curved_activity_level": curve_data.get("activity_level", "unknown"),
59
473
  }
60
474
  rows.append(row)
61
-
475
+
62
476
  # Sort by week and developer
63
- rows.sort(key=lambda x: (x['week_start'], x['developer_name'], x['project']))
64
-
477
+ rows.sort(key=lambda x: (x["week_start"], x["developer_name"], x["project"]))
478
+
65
479
  # Write CSV
66
480
  if rows:
67
481
  df = pd.DataFrame(rows)
68
482
  df.to_csv(output_path, index=False)
69
483
  else:
70
484
  # Write empty CSV with headers
71
- with open(output_path, 'w', newline='') as f:
72
- writer = csv.DictWriter(f, fieldnames=[
73
- 'week_start', 'developer_id', 'developer_name', 'developer_email',
74
- 'project', 'commits', 'story_points', 'lines_added', 'lines_removed',
75
- 'files_changed', 'complexity_delta', 'ticket_coverage_pct',
76
- 'avg_commit_size', 'unique_tickets', 'prs_involved'
77
- ])
485
+ with open(output_path, "w", newline="") as f:
486
+ writer = csv.DictWriter(
487
+ f,
488
+ fieldnames=[
489
+ "week_start",
490
+ "developer_id",
491
+ "developer_name",
492
+ "developer_email",
493
+ "project",
494
+ "commits",
495
+ "story_points",
496
+ "lines_added",
497
+ "lines_removed",
498
+ "files_changed",
499
+ "complexity_delta",
500
+ "ticket_coverage_pct",
501
+ "avg_commit_size",
502
+ "unique_tickets",
503
+ "prs_involved",
504
+ "activity_score",
505
+ "activity_level",
506
+ "commit_score",
507
+ "pr_score",
508
+ "code_impact_score",
509
+ "complexity_score",
510
+ "curved_score",
511
+ "percentile",
512
+ "quintile",
513
+ "curved_activity_level",
514
+ ],
515
+ )
78
516
  writer.writeheader()
79
-
517
+
80
518
  return output_path
81
-
82
- def generate_summary_report(self, commits: List[Dict[str, Any]],
83
- prs: List[Dict[str, Any]],
84
- developer_stats: List[Dict[str, Any]],
85
- ticket_analysis: Dict[str, Any],
86
- output_path: Path) -> Path:
519
+
520
+ def generate_summary_report(
521
+ self,
522
+ commits: list[dict[str, Any]],
523
+ prs: list[dict[str, Any]],
524
+ developer_stats: list[dict[str, Any]],
525
+ ticket_analysis: dict[str, Any],
526
+ output_path: Path,
527
+ pm_data: Optional[dict[str, Any]] = None,
528
+ ) -> Path:
87
529
  """Generate summary statistics CSV."""
88
- summary_data = []
530
+ # Apply exclusion filtering in Phase 2
531
+ commits = self._filter_excluded_authors_list(commits)
532
+ developer_stats = self._filter_excluded_authors_list(developer_stats)
89
533
 
534
+ summary_data = []
535
+
90
536
  # Overall statistics
91
537
  total_commits = len(commits)
92
- total_story_points = sum(c.get('story_points', 0) or 0 for c in commits)
538
+ total_story_points = sum(c.get("story_points", 0) or 0 for c in commits)
93
539
  # Use filtered stats if available, otherwise fall back to raw stats
94
540
  total_lines = sum(
95
- c.get('filtered_insertions', c.get('insertions', 0)) +
96
- c.get('filtered_deletions', c.get('deletions', 0))
541
+ c.get("filtered_insertions", c.get("insertions", 0))
542
+ + c.get("filtered_deletions", c.get("deletions", 0))
97
543
  for c in commits
98
544
  )
99
-
100
- summary_data.append({
101
- 'metric': 'Total Commits',
102
- 'value': total_commits,
103
- 'category': 'Overall'
104
- })
105
-
106
- summary_data.append({
107
- 'metric': 'Total Story Points',
108
- 'value': total_story_points,
109
- 'category': 'Overall'
110
- })
111
-
112
- summary_data.append({
113
- 'metric': 'Total Lines Changed',
114
- 'value': total_lines,
115
- 'category': 'Overall'
116
- })
117
-
118
- summary_data.append({
119
- 'metric': 'Active Developers',
120
- 'value': len(developer_stats),
121
- 'category': 'Overall'
122
- })
123
-
545
+
546
+ summary_data.append(
547
+ {"metric": "Total Commits", "value": total_commits, "category": "Overall"}
548
+ )
549
+
550
+ summary_data.append(
551
+ {"metric": "Total Story Points", "value": total_story_points, "category": "Overall"}
552
+ )
553
+
554
+ summary_data.append(
555
+ {"metric": "Total Lines Changed", "value": total_lines, "category": "Overall"}
556
+ )
557
+
558
+ summary_data.append(
559
+ {"metric": "Active Developers", "value": len(developer_stats), "category": "Overall"}
560
+ )
561
+
124
562
  # Ticket coverage
125
- summary_data.append({
126
- 'metric': 'Commit Ticket Coverage %',
127
- 'value': round(ticket_analysis.get('commit_coverage_pct', 0), 1),
128
- 'category': 'Tracking'
129
- })
130
-
131
- summary_data.append({
132
- 'metric': 'PR Ticket Coverage %',
133
- 'value': round(ticket_analysis.get('pr_coverage_pct', 0), 1),
134
- 'category': 'Tracking'
135
- })
136
-
563
+ summary_data.append(
564
+ {
565
+ "metric": "Commit Ticket Coverage %",
566
+ "value": round(ticket_analysis.get("commit_coverage_pct", 0), 1),
567
+ "category": "Tracking",
568
+ }
569
+ )
570
+
571
+ summary_data.append(
572
+ {
573
+ "metric": "PR Ticket Coverage %",
574
+ "value": round(ticket_analysis.get("pr_coverage_pct", 0), 1),
575
+ "category": "Tracking",
576
+ }
577
+ )
578
+
137
579
  # Platform breakdown
138
- for platform, count in ticket_analysis.get('ticket_summary', {}).items():
139
- summary_data.append({
140
- 'metric': f'{platform.title()} Tickets',
141
- 'value': count,
142
- 'category': 'Platforms'
143
- })
144
-
580
+ for platform, count in ticket_analysis.get("ticket_summary", {}).items():
581
+ summary_data.append(
582
+ {"metric": f"{platform.title()} Tickets", "value": count, "category": "Platforms"}
583
+ )
584
+
145
585
  # Developer statistics
146
586
  if developer_stats:
147
- top_contributor = max(developer_stats, key=lambda x: x['total_commits'])
148
- summary_data.append({
149
- 'metric': 'Top Contributor',
150
- 'value': self._anonymize_value(top_contributor['primary_name'], 'name'),
151
- 'category': 'Developers'
152
- })
153
-
154
- summary_data.append({
155
- 'metric': 'Top Contributor Commits',
156
- 'value': top_contributor['total_commits'],
157
- 'category': 'Developers'
158
- })
159
-
587
+ top_contributor = max(developer_stats, key=lambda x: x["total_commits"])
588
+ summary_data.append(
589
+ {
590
+ "metric": "Top Contributor",
591
+ "value": self._anonymize_value(
592
+ self._get_canonical_display_name(
593
+ top_contributor["canonical_id"],
594
+ top_contributor["primary_name"]
595
+ ), "name"
596
+ ),
597
+ "category": "Developers",
598
+ }
599
+ )
600
+
601
+ summary_data.append(
602
+ {
603
+ "metric": "Top Contributor Commits",
604
+ "value": top_contributor["total_commits"],
605
+ "category": "Developers",
606
+ }
607
+ )
608
+
609
+ # PM Platform statistics
610
+ if pm_data and "metrics" in pm_data:
611
+ metrics = pm_data["metrics"]
612
+
613
+ # Total PM issues
614
+ summary_data.append(
615
+ {
616
+ "metric": "Total PM Issues",
617
+ "value": metrics.get("total_pm_issues", 0),
618
+ "category": "PM Platforms",
619
+ }
620
+ )
621
+
622
+ # Story point analysis
623
+ story_analysis = metrics.get("story_point_analysis", {})
624
+ summary_data.append(
625
+ {
626
+ "metric": "PM Story Points",
627
+ "value": story_analysis.get("pm_total_story_points", 0),
628
+ "category": "PM Platforms",
629
+ }
630
+ )
631
+
632
+ summary_data.append(
633
+ {
634
+ "metric": "Story Point Coverage %",
635
+ "value": round(story_analysis.get("story_point_coverage_pct", 0), 1),
636
+ "category": "PM Platforms",
637
+ }
638
+ )
639
+
640
+ # Issue type distribution
641
+ issue_types = metrics.get("issue_type_distribution", {})
642
+ for issue_type, count in issue_types.items():
643
+ summary_data.append(
644
+ {
645
+ "metric": f"{issue_type.title()} Issues",
646
+ "value": count,
647
+ "category": "Issue Types",
648
+ }
649
+ )
650
+
651
+ # Platform coverage
652
+ platform_coverage = metrics.get("platform_coverage", {})
653
+ for platform, coverage_data in platform_coverage.items():
654
+ summary_data.append(
655
+ {
656
+ "metric": f"{platform.title()} Issues",
657
+ "value": coverage_data.get("total_issues", 0),
658
+ "category": "Platform Coverage",
659
+ }
660
+ )
661
+
662
+ summary_data.append(
663
+ {
664
+ "metric": f"{platform.title()} Linked %",
665
+ "value": round(coverage_data.get("coverage_percentage", 0), 1),
666
+ "category": "Platform Coverage",
667
+ }
668
+ )
669
+
670
+ # Correlation quality
671
+ correlation_quality = metrics.get("correlation_quality", {})
672
+ summary_data.append(
673
+ {
674
+ "metric": "Issue-Commit Correlations",
675
+ "value": correlation_quality.get("total_correlations", 0),
676
+ "category": "Correlation Quality",
677
+ }
678
+ )
679
+
680
+ summary_data.append(
681
+ {
682
+ "metric": "Avg Correlation Confidence",
683
+ "value": round(correlation_quality.get("average_confidence", 0), 2),
684
+ "category": "Correlation Quality",
685
+ }
686
+ )
687
+
160
688
  # Write summary CSV
161
689
  df = pd.DataFrame(summary_data)
162
690
  df.to_csv(output_path, index=False)
163
-
691
+
164
692
  return output_path
165
-
166
- def generate_developer_report(self, developer_stats: List[Dict[str, Any]],
167
- output_path: Path) -> Path:
693
+
694
+ def generate_developer_report(
695
+ self, developer_stats: list[dict[str, Any]], output_path: Path
696
+ ) -> Path:
168
697
  """Generate developer statistics CSV."""
169
698
  rows = []
170
-
699
+
171
700
  for dev in developer_stats:
172
701
  row = {
173
- 'developer_id': self._anonymize_value(dev['canonical_id'], 'id'),
174
- 'name': self._anonymize_value(dev['primary_name'], 'name'),
175
- 'email': self._anonymize_value(dev['primary_email'], 'email'),
176
- 'github_username': self._anonymize_value(
177
- dev.get('github_username', ''), 'username'
178
- ) if dev.get('github_username') else '',
179
- 'total_commits': dev['total_commits'],
180
- 'total_story_points': dev['total_story_points'],
181
- 'alias_count': dev['alias_count'],
182
- 'first_seen': dev['first_seen'].strftime('%Y-%m-%d') if dev['first_seen'] else '',
183
- 'last_seen': dev['last_seen'].strftime('%Y-%m-%d') if dev['last_seen'] else '',
184
- 'avg_story_points_per_commit': round(
185
- dev['total_story_points'] / max(dev['total_commits'], 1), 2
186
- )
702
+ "developer_id": self._anonymize_value(dev["canonical_id"], "id"),
703
+ "name": self._anonymize_value(
704
+ self._get_canonical_display_name(
705
+ dev["canonical_id"],
706
+ dev["primary_name"]
707
+ ), "name"
708
+ ),
709
+ "email": self._anonymize_value(dev["primary_email"], "email"),
710
+ "github_username": (
711
+ self._anonymize_value(dev.get("github_username", ""), "username")
712
+ if dev.get("github_username")
713
+ else ""
714
+ ),
715
+ "total_commits": dev["total_commits"],
716
+ "total_story_points": dev["total_story_points"],
717
+ "alias_count": dev.get("alias_count", 1),
718
+ "first_seen": (
719
+ self._safe_datetime_format(dev["first_seen"], "%Y-%m-%d")
720
+ if dev["first_seen"]
721
+ else ""
722
+ ),
723
+ "last_seen": (
724
+ self._safe_datetime_format(dev["last_seen"], "%Y-%m-%d")
725
+ if dev["last_seen"]
726
+ else ""
727
+ ),
728
+ "avg_story_points_per_commit": round(
729
+ dev["total_story_points"] / max(dev["total_commits"], 1), 2
730
+ ),
187
731
  }
188
732
  rows.append(row)
189
-
733
+
190
734
  # Sort by total commits
191
- rows.sort(key=lambda x: x['total_commits'], reverse=True)
192
-
735
+ rows.sort(key=lambda x: x["total_commits"], reverse=True)
736
+
193
737
  # Write CSV
194
738
  df = pd.DataFrame(rows)
195
739
  df.to_csv(output_path, index=False)
196
-
740
+
197
741
  return output_path
198
-
199
- def _aggregate_weekly_data(self, commits: List[Dict[str, Any]],
200
- start_date: datetime,
201
- end_date: datetime) -> Dict[tuple, Dict[str, Any]]:
742
+
743
+ def generate_pm_correlations_report(self, pm_data: dict[str, Any], output_path: Path) -> Path:
744
+ """Generate PM platform correlations CSV report.
745
+
746
+ WHY: PM platform integration provides valuable correlation data between
747
+ work items and code changes. This report enables analysis of story point
748
+ accuracy, development velocity, and work item completion patterns.
749
+
750
+ Args:
751
+ pm_data: PM platform data including correlations and metrics.
752
+ output_path: Path where the CSV report should be written.
753
+
754
+ Returns:
755
+ Path to the generated CSV file.
756
+ """
757
+ if not pm_data or "correlations" not in pm_data:
758
+ # Generate empty report if no PM data
759
+ df = pd.DataFrame(
760
+ columns=[
761
+ "commit_hash",
762
+ "commit_message",
763
+ "commit_author",
764
+ "commit_date",
765
+ "issue_key",
766
+ "issue_title",
767
+ "issue_type",
768
+ "issue_status",
769
+ "issue_platform",
770
+ "story_points",
771
+ "correlation_method",
772
+ "confidence",
773
+ "matched_text",
774
+ ]
775
+ )
776
+ df.to_csv(output_path, index=False)
777
+ return output_path
778
+
779
+ correlations = pm_data["correlations"]
780
+ rows = []
781
+
782
+ for correlation in correlations:
783
+ row = {
784
+ "commit_hash": correlation.get("commit_hash", ""),
785
+ "commit_message": correlation.get("commit_message", ""),
786
+ "commit_author": self._anonymize_value(
787
+ correlation.get("commit_author", ""), "name"
788
+ ),
789
+ "commit_date": correlation.get("commit_date", ""),
790
+ "issue_key": correlation.get("issue_key", ""),
791
+ "issue_title": correlation.get("issue_title", ""),
792
+ "issue_type": correlation.get("issue_type", ""),
793
+ "issue_status": correlation.get("issue_status", ""),
794
+ "issue_platform": correlation.get("issue_platform", ""),
795
+ "story_points": correlation.get("story_points", 0) or 0,
796
+ "correlation_method": correlation.get("correlation_method", ""),
797
+ "confidence": round(correlation.get("confidence", 0), 3),
798
+ "matched_text": correlation.get("matched_text", ""),
799
+ }
800
+ rows.append(row)
801
+
802
+ df = pd.DataFrame(rows)
803
+ df.to_csv(output_path, index=False)
804
+
805
+ return output_path
806
+
807
+ def _aggregate_weekly_data(
808
+ self, commits: list[dict[str, Any]], start_date: datetime, end_date: datetime
809
+ ) -> dict[tuple[datetime, str, str], dict[str, Any]]:
202
810
  """Aggregate commit data by week."""
203
- weekly_data = defaultdict(lambda: {
204
- 'commits': 0,
205
- 'story_points': 0,
206
- 'lines_added': 0,
207
- 'lines_removed': 0,
208
- 'files_changed': 0,
209
- 'complexity_delta': 0.0,
210
- 'commits_with_tickets': 0,
211
- 'tickets': set(),
212
- 'prs': set()
213
- })
214
-
811
+ weekly_data: defaultdict[tuple[datetime, str, str], dict[str, Any]] = defaultdict(
812
+ lambda: {
813
+ "commits": 0,
814
+ "story_points": 0,
815
+ "lines_added": 0,
816
+ "lines_removed": 0,
817
+ "files_changed": 0,
818
+ "complexity_delta": 0.0,
819
+ "commits_with_tickets": 0,
820
+ "tickets": set(),
821
+ "prs": set(),
822
+ }
823
+ )
824
+
215
825
  for commit in commits:
216
- timestamp = commit['timestamp']
217
- # Handle both timezone-aware and naive datetimes
218
- if hasattr(timestamp, 'tzinfo') and timestamp.tzinfo is not None:
219
- # Convert timezone-aware to naive (UTC)
220
- timestamp = timestamp.replace(tzinfo=None)
221
-
222
- if timestamp < start_date or timestamp > end_date:
826
+ timestamp = commit["timestamp"]
827
+ logger.debug(
828
+ f"Processing commit timestamp: {timestamp} (tzinfo: {getattr(timestamp, 'tzinfo', 'N/A')})"
829
+ )
830
+
831
+ # Ensure consistent timezone handling
832
+ if hasattr(timestamp, "tzinfo") and timestamp.tzinfo is not None:
833
+ # Keep timezone-aware but ensure it's UTC
834
+ if timestamp.tzinfo != timezone.utc:
835
+ timestamp = timestamp.astimezone(timezone.utc)
836
+ logger.debug(f" Converted to UTC: {timestamp}")
837
+ else:
838
+ # Convert naive datetime to UTC timezone-aware
839
+ timestamp = timestamp.replace(tzinfo=timezone.utc)
840
+ logger.debug(f" Made timezone-aware: {timestamp}")
841
+
842
+ # Use safe comparison functions with logging
843
+ if self._safe_datetime_compare(
844
+ timestamp, start_date, "lt", "_aggregate_weekly_data range check"
845
+ ) or self._safe_datetime_compare(
846
+ timestamp, end_date, "gt", "_aggregate_weekly_data range check"
847
+ ):
848
+ logger.debug(" Skipping commit outside date range")
223
849
  continue
224
-
850
+
225
851
  # Get week start (Monday)
226
852
  week_start = self._get_week_start(timestamp)
227
-
853
+
228
854
  # Get project key (default to 'unknown')
229
- project_key = commit.get('project_key', 'unknown')
230
-
855
+ project_key = commit.get("project_key", "unknown")
856
+
231
857
  # Get canonical developer ID
232
- canonical_id = commit.get('canonical_id', commit.get('author_email', 'unknown'))
233
-
858
+ canonical_id = commit.get("canonical_id", commit.get("author_email", "unknown"))
859
+
234
860
  key = (week_start, canonical_id, project_key)
235
-
861
+
236
862
  # Aggregate metrics
237
- weekly_data[key]['commits'] += 1
238
- weekly_data[key]['story_points'] += commit.get('story_points', 0) or 0
239
-
863
+ data = weekly_data[key]
864
+ data["commits"] += 1
865
+ data["story_points"] += commit.get("story_points", 0) or 0
866
+
240
867
  # Use filtered stats if available, otherwise fall back to raw stats
241
- weekly_data[key]['lines_added'] += commit.get('filtered_insertions', commit.get('insertions', 0))
242
- weekly_data[key]['lines_removed'] += commit.get('filtered_deletions', commit.get('deletions', 0))
243
- weekly_data[key]['files_changed'] += commit.get('filtered_files_changed', commit.get('files_changed', 0))
244
-
245
- weekly_data[key]['complexity_delta'] += commit.get('complexity_delta', 0.0)
246
-
868
+ data["lines_added"] += (
869
+ commit.get("filtered_insertions", commit.get("insertions", 0)) or 0
870
+ )
871
+ data["lines_removed"] += (
872
+ commit.get("filtered_deletions", commit.get("deletions", 0)) or 0
873
+ )
874
+ data["files_changed"] += (
875
+ commit.get("filtered_files_changed", commit.get("files_changed", 0)) or 0
876
+ )
877
+
878
+ data["complexity_delta"] += commit.get("complexity_delta", 0.0) or 0.0
879
+
247
880
  # Track tickets
248
- if commit.get('ticket_references'):
249
- weekly_data[key]['commits_with_tickets'] += 1
250
- for ticket in commit['ticket_references']:
881
+ ticket_refs = commit.get("ticket_references", [])
882
+ if ticket_refs:
883
+ data["commits_with_tickets"] += 1
884
+ tickets_set = data["tickets"]
885
+ for ticket in ticket_refs:
251
886
  if isinstance(ticket, dict):
252
- weekly_data[key]['tickets'].add(ticket.get('full_id', ''))
887
+ tickets_set.add(ticket.get("full_id", ""))
253
888
  else:
254
- weekly_data[key]['tickets'].add(str(ticket))
255
-
889
+ tickets_set.add(str(ticket))
890
+
256
891
  # Track PRs (if available)
257
- if commit.get('pr_number'):
258
- weekly_data[key]['prs'].add(commit['pr_number'])
259
-
892
+ pr_number = commit.get("pr_number")
893
+ if pr_number:
894
+ prs_set = data["prs"]
895
+ prs_set.add(pr_number)
896
+
260
897
  # Calculate derived metrics
261
- result = {}
898
+ result: dict[tuple[datetime, str, str], dict[str, Any]] = {}
262
899
  for key, metrics in weekly_data.items():
263
- commits = metrics['commits']
264
- if commits > 0:
265
- metrics['ticket_coverage_pct'] = (
266
- metrics['commits_with_tickets'] / commits * 100
267
- )
268
- metrics['avg_commit_size'] = (
269
- (metrics['lines_added'] + metrics['lines_removed']) / commits
900
+ commits_count = metrics["commits"]
901
+ if commits_count > 0:
902
+ metrics["ticket_coverage_pct"] = (
903
+ metrics["commits_with_tickets"] / commits_count * 100
270
904
  )
905
+ metrics["avg_commit_size"] = (
906
+ metrics["lines_added"] + metrics["lines_removed"]
907
+ ) / commits_count
271
908
  else:
272
- metrics['ticket_coverage_pct'] = 0
273
- metrics['avg_commit_size'] = 0
274
-
275
- metrics['unique_tickets'] = len(metrics['tickets'])
276
- metrics['prs_involved'] = len(metrics['prs'])
277
-
909
+ metrics["ticket_coverage_pct"] = 0
910
+ metrics["avg_commit_size"] = 0
911
+
912
+ tickets_set = metrics["tickets"]
913
+ prs_set = metrics["prs"]
914
+ metrics["unique_tickets"] = len(tickets_set)
915
+ metrics["prs_involved"] = len(prs_set)
916
+
278
917
  # Remove sets before returning
279
- del metrics['tickets']
280
- del metrics['prs']
281
- del metrics['commits_with_tickets']
282
-
918
+ del metrics["tickets"]
919
+ del metrics["prs"]
920
+ del metrics["commits_with_tickets"]
921
+
283
922
  result[key] = metrics
284
-
923
+
285
924
  return result
286
-
925
+
287
926
  def _get_week_start(self, date: datetime) -> datetime:
288
927
  """Get Monday of the week for a given date."""
928
+ logger.debug(
929
+ f"Getting week start for date: {date} (tzinfo: {getattr(date, 'tzinfo', 'N/A')})"
930
+ )
931
+
932
+ # Ensure consistent timezone handling - keep timezone info
933
+ if hasattr(date, "tzinfo") and date.tzinfo is not None:
934
+ # Keep timezone-aware but ensure it's UTC
935
+ if date.tzinfo != timezone.utc:
936
+ date = date.astimezone(timezone.utc)
937
+ logger.debug(f" Converted to UTC: {date}")
938
+ else:
939
+ # Convert naive datetime to UTC timezone-aware
940
+ date = date.replace(tzinfo=timezone.utc)
941
+ logger.debug(f" Made timezone-aware: {date}")
942
+
289
943
  days_since_monday = date.weekday()
290
944
  monday = date - timedelta(days=days_since_monday)
291
- return monday.replace(hour=0, minute=0, second=0, microsecond=0)
292
-
945
+ result = monday.replace(hour=0, minute=0, second=0, microsecond=0)
946
+
947
+ logger.debug(f" Week start result: {result} (tzinfo: {result.tzinfo})")
948
+ return result
949
+
950
+ def generate_developer_activity_summary(
951
+ self,
952
+ commits: list[dict[str, Any]],
953
+ developer_stats: list[dict[str, Any]],
954
+ prs: list[dict[str, Any]],
955
+ output_path: Path,
956
+ weeks: int = 12,
957
+ ) -> Path:
958
+ """Generate developer activity summary with curve-normalized scores.
959
+
960
+ This report provides a high-level view of developer activity with
961
+ curve-normalized scores that allow for fair comparison across the team.
962
+ """
963
+ # Apply exclusion filtering in Phase 2
964
+ commits = self._filter_excluded_authors_list(commits)
965
+ developer_stats = self._filter_excluded_authors_list(developer_stats)
966
+
967
+ # Calculate date range
968
+ end_date = datetime.now(timezone.utc)
969
+ start_date = end_date - timedelta(weeks=weeks)
970
+
971
+ # Aggregate metrics by developer
972
+ developer_metrics = defaultdict(lambda: {
973
+ "commits": 0,
974
+ "prs_involved": 0,
975
+ "lines_added": 0,
976
+ "lines_removed": 0,
977
+ "files_changed": 0,
978
+ "complexity_delta": 0.0,
979
+ "story_points": 0,
980
+ "unique_tickets": set(),
981
+ })
982
+
983
+ # Process commits
984
+ for commit in commits:
985
+ timestamp = commit["timestamp"]
986
+ if hasattr(timestamp, "tzinfo") and timestamp.tzinfo is None:
987
+ timestamp = timestamp.replace(tzinfo=timezone.utc)
988
+
989
+ if timestamp < start_date or timestamp > end_date:
990
+ continue
991
+
992
+ dev_id = commit.get("canonical_id", commit.get("author_email", "unknown"))
993
+ metrics = developer_metrics[dev_id]
994
+
995
+ metrics["commits"] += 1
996
+ metrics["lines_added"] += commit.get("filtered_insertions", commit.get("insertions", 0)) or 0
997
+ metrics["lines_removed"] += commit.get("filtered_deletions", commit.get("deletions", 0)) or 0
998
+ metrics["files_changed"] += commit.get("filtered_files_changed", commit.get("files_changed", 0)) or 0
999
+ metrics["complexity_delta"] += commit.get("complexity_delta", 0.0) or 0.0
1000
+ metrics["story_points"] += commit.get("story_points", 0) or 0
1001
+
1002
+ ticket_refs = commit.get("ticket_references", [])
1003
+ for ticket in ticket_refs:
1004
+ if isinstance(ticket, dict):
1005
+ metrics["unique_tickets"].add(ticket.get("full_id", ""))
1006
+ else:
1007
+ metrics["unique_tickets"].add(str(ticket))
1008
+
1009
+ # Process PRs
1010
+ for pr in prs:
1011
+ author_id = pr.get("canonical_id", pr.get("author", "unknown"))
1012
+ if author_id in developer_metrics:
1013
+ developer_metrics[author_id]["prs_involved"] += 1
1014
+
1015
+ # Calculate activity scores
1016
+ developer_scores = {}
1017
+ developer_results = {}
1018
+
1019
+ for dev_id, metrics in developer_metrics.items():
1020
+ # Convert sets to counts
1021
+ metrics["unique_tickets"] = len(metrics["unique_tickets"])
1022
+
1023
+ # Calculate activity score
1024
+ activity_result = self.activity_scorer.calculate_activity_score(metrics)
1025
+ developer_scores[dev_id] = activity_result["raw_score"]
1026
+ developer_results[dev_id] = activity_result
1027
+
1028
+ # Apply curve normalization
1029
+ curve_normalized = self.activity_scorer.normalize_scores_on_curve(developer_scores)
1030
+
1031
+ # Create developer lookup
1032
+ dev_lookup = {dev["canonical_id"]: dev for dev in developer_stats}
1033
+
1034
+ # Build rows
1035
+ rows = []
1036
+ for dev_id, metrics in developer_metrics.items():
1037
+ developer = dev_lookup.get(dev_id, {})
1038
+ activity_result = developer_results[dev_id]
1039
+ curve_data = curve_normalized.get(dev_id, {})
1040
+
1041
+ row = {
1042
+ "developer_id": self._anonymize_value(dev_id, "id"),
1043
+ "developer_name": self._anonymize_value(
1044
+ self._get_canonical_display_name(
1045
+ dev_id,
1046
+ developer.get("primary_name", "Unknown")
1047
+ ), "name"
1048
+ ),
1049
+ "commits": metrics["commits"],
1050
+ "prs": metrics["prs_involved"],
1051
+ "story_points": metrics["story_points"],
1052
+ "lines_added": metrics["lines_added"],
1053
+ "lines_removed": metrics["lines_removed"],
1054
+ "files_changed": metrics["files_changed"],
1055
+ "unique_tickets": metrics["unique_tickets"],
1056
+ # Raw activity scores
1057
+ "raw_activity_score": round(activity_result["raw_score"], 1),
1058
+ "normalized_activity_score": round(activity_result["normalized_score"], 1),
1059
+ "activity_level": activity_result["activity_level"],
1060
+ # Curve-normalized scores
1061
+ "curved_score": curve_data.get("curved_score", 0),
1062
+ "percentile": curve_data.get("percentile", 0),
1063
+ "quintile": curve_data.get("quintile", 0),
1064
+ "curved_activity_level": curve_data.get("activity_level", "unknown"),
1065
+ "level_description": curve_data.get("level_description", ""),
1066
+ # Component breakdown
1067
+ "commit_score": round(activity_result["components"]["commit_score"], 1),
1068
+ "pr_score": round(activity_result["components"]["pr_score"], 1),
1069
+ "code_impact_score": round(activity_result["components"]["code_impact_score"], 1),
1070
+ "complexity_score": round(activity_result["components"]["complexity_score"], 1),
1071
+ }
1072
+ rows.append(row)
1073
+
1074
+ # Sort by curved score (highest first)
1075
+ rows.sort(key=lambda x: x["curved_score"], reverse=True)
1076
+
1077
+ # Write CSV
1078
+ if rows:
1079
+ df = pd.DataFrame(rows)
1080
+ df.to_csv(output_path, index=False)
1081
+ else:
1082
+ # Write empty CSV with headers
1083
+ with open(output_path, "w", newline="") as f:
1084
+ writer = csv.DictWriter(
1085
+ f,
1086
+ fieldnames=[
1087
+ "developer_id",
1088
+ "developer_name",
1089
+ "commits",
1090
+ "prs",
1091
+ "story_points",
1092
+ "lines_added",
1093
+ "lines_removed",
1094
+ "files_changed",
1095
+ "unique_tickets",
1096
+ "raw_activity_score",
1097
+ "normalized_activity_score",
1098
+ "activity_level",
1099
+ "curved_score",
1100
+ "percentile",
1101
+ "quintile",
1102
+ "curved_activity_level",
1103
+ "level_description",
1104
+ "commit_score",
1105
+ "pr_score",
1106
+ "code_impact_score",
1107
+ "complexity_score",
1108
+ ],
1109
+ )
1110
+ writer.writeheader()
1111
+
1112
+ return output_path
1113
+
293
1114
  def _anonymize_value(self, value: str, field_type: str) -> str:
294
1115
  """Anonymize a value if anonymization is enabled."""
295
1116
  if not self.anonymize or not value:
296
1117
  return value
297
-
298
- if field_type == 'email' and '@' in value:
1118
+
1119
+ if field_type == "email" and "@" in value:
299
1120
  # Keep domain for email
300
- local, domain = value.split('@', 1)
1121
+ local, domain = value.split("@", 1)
301
1122
  value = local # Anonymize only local part
302
1123
  suffix = f"@{domain}"
303
1124
  else:
304
1125
  suffix = ""
305
-
1126
+
306
1127
  if value not in self._anonymization_map:
307
1128
  self._anonymous_counter += 1
308
- if field_type == 'name':
1129
+ if field_type == "name":
309
1130
  anonymous = f"Developer{self._anonymous_counter}"
310
- elif field_type == 'email':
1131
+ elif field_type == "email":
311
1132
  anonymous = f"dev{self._anonymous_counter}"
312
- elif field_type == 'id':
1133
+ elif field_type == "id":
313
1134
  anonymous = f"ID{self._anonymous_counter:04d}"
314
1135
  else:
315
1136
  anonymous = f"anon{self._anonymous_counter}"
316
-
1137
+
317
1138
  self._anonymization_map[value] = anonymous
1139
+
1140
+ return self._anonymization_map[value] + suffix
1141
+
1142
+ def generate_untracked_commits_report(
1143
+ self, ticket_analysis: dict[str, Any], output_path: Path
1144
+ ) -> Path:
1145
+ """Generate detailed CSV report for commits without ticket references.
1146
+
1147
+ WHY: Untracked commits represent work that may not be visible to project
1148
+ management tools. This report enables analysis of what types of work are
1149
+ being performed outside the tracked process, helping identify process
1150
+ improvements and training needs.
1151
+
1152
+ Args:
1153
+ ticket_analysis: Ticket analysis results containing untracked commits
1154
+ output_path: Path where the CSV report should be written
1155
+
1156
+ Returns:
1157
+ Path to the generated CSV file
1158
+ """
1159
+ untracked_commits = ticket_analysis.get("untracked_commits", [])
1160
+
1161
+ if not untracked_commits:
1162
+ # Generate empty report with headers
1163
+ headers = [
1164
+ "commit_hash",
1165
+ "short_hash",
1166
+ "author",
1167
+ "author_email",
1168
+ "canonical_id",
1169
+ "date",
1170
+ "project",
1171
+ "message",
1172
+ "category",
1173
+ "files_changed",
1174
+ "lines_added",
1175
+ "lines_removed",
1176
+ "lines_changed",
1177
+ "is_merge",
1178
+ ]
1179
+ with open(output_path, "w", newline="") as f:
1180
+ writer = csv.DictWriter(f, fieldnames=headers)
1181
+ writer.writeheader()
1182
+ return output_path
1183
+
1184
+ # Process untracked commits into CSV rows
1185
+ rows = []
1186
+ for commit in untracked_commits:
1187
+ # Handle datetime formatting
1188
+ timestamp = commit.get("timestamp")
1189
+ if timestamp:
1190
+ if hasattr(timestamp, "strftime"):
1191
+ date_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
1192
+ else:
1193
+ date_str = str(timestamp)
1194
+ else:
1195
+ date_str = ""
1196
+
1197
+ row = {
1198
+ "commit_hash": commit.get("full_hash", commit.get("hash", "")),
1199
+ "short_hash": commit.get("hash", ""),
1200
+ "author": self._anonymize_value(commit.get("author", "Unknown"), "name"),
1201
+ "author_email": self._anonymize_value(commit.get("author_email", ""), "email"),
1202
+ "canonical_id": self._anonymize_value(commit.get("canonical_id", ""), "id"),
1203
+ "date": date_str,
1204
+ "project": commit.get("project_key", "UNKNOWN"),
1205
+ "message": commit.get("message", ""),
1206
+ "category": commit.get("category", "other"),
1207
+ "files_changed": commit.get("files_changed", 0),
1208
+ "lines_added": commit.get("lines_added", 0),
1209
+ "lines_removed": commit.get("lines_removed", 0),
1210
+ "lines_changed": commit.get("lines_changed", 0),
1211
+ "is_merge": commit.get("is_merge", False),
1212
+ }
1213
+ rows.append(row)
1214
+
1215
+ # Write CSV
1216
+ if rows:
1217
+ df = pd.DataFrame(rows)
1218
+ df.to_csv(output_path, index=False)
1219
+
1220
+ return output_path
1221
+ def generate_weekly_categorization_report(
1222
+ self,
1223
+ all_commits: list[dict[str, Any]],
1224
+ ticket_extractor, # TicketExtractor or MLTicketExtractor instance
1225
+ output_path: Path,
1226
+ weeks: int = 12
1227
+ ) -> Path:
1228
+ """Generate weekly commit categorization metrics CSV report for ALL commits.
1229
+
1230
+ WHY: Categorization trends provide insights into development patterns
1231
+ over time, helping identify process improvements and training needs.
1232
+ This enhanced version processes ALL commits (tracked and untracked) to provide
1233
+ complete visibility into work patterns across the entire development flow.
1234
+
1235
+ DESIGN DECISION: Processes all commits using the same ML/rule-based categorization
1236
+ system used elsewhere in the application, ensuring consistent categorization
1237
+ across all reports and analysis.
318
1238
 
319
- return self._anonymization_map[value] + suffix
1239
+ Args:
1240
+ all_commits: Complete list of commits to categorize
1241
+ ticket_extractor: TicketExtractor instance for commit categorization
1242
+ output_path: Path where the CSV report should be written
1243
+ weeks: Number of weeks to analyze
1244
+
1245
+ Returns:
1246
+ Path to the generated CSV file
1247
+ """
1248
+ # Calculate week boundaries
1249
+ end_date = datetime.now(timezone.utc)
1250
+ start_date = end_date - timedelta(weeks=weeks)
1251
+
1252
+ # Initialize weekly aggregation structures
1253
+ weekly_categories = defaultdict(lambda: defaultdict(int))
1254
+ weekly_metrics = defaultdict(lambda: {
1255
+ 'lines_added': 0,
1256
+ 'lines_removed': 0,
1257
+ 'files_changed': 0,
1258
+ 'developers': set()
1259
+ })
1260
+
1261
+ # Process ALL commits with classification
1262
+ processed_commits = 0
1263
+ for commit in all_commits:
1264
+ if not isinstance(commit, dict):
1265
+ continue
1266
+
1267
+ # Get timestamp and validate date range
1268
+ timestamp = commit.get("timestamp")
1269
+ if not timestamp:
1270
+ continue
1271
+
1272
+ # Ensure timezone consistency
1273
+ if hasattr(timestamp, "tzinfo") and timestamp.tzinfo is None:
1274
+ timestamp = timestamp.replace(tzinfo=timezone.utc)
1275
+ elif hasattr(timestamp, "tzinfo") and timestamp.tzinfo != timezone.utc:
1276
+ timestamp = timestamp.astimezone(timezone.utc)
1277
+
1278
+ if timestamp < start_date or timestamp > end_date:
1279
+ continue
1280
+
1281
+ # Skip merge commits (consistent with untracked analysis)
1282
+ if commit.get("is_merge", False):
1283
+ continue
1284
+
1285
+ # Categorize the commit using the same system as untracked analysis
1286
+ message = commit.get("message", "")
1287
+ files_changed_raw = commit.get("files_changed", [])
1288
+
1289
+ # Handle both int and list types for files_changed
1290
+ if isinstance(files_changed_raw, int):
1291
+ files_changed_count = files_changed_raw
1292
+ files_changed_list = [] # Can't provide file names, only count
1293
+ elif isinstance(files_changed_raw, list):
1294
+ files_changed_count = len(files_changed_raw)
1295
+ files_changed_list = files_changed_raw
1296
+ else:
1297
+ files_changed_count = 0
1298
+ files_changed_list = []
1299
+
1300
+ # Handle both TicketExtractor and MLTicketExtractor signatures
1301
+ try:
1302
+ # Try ML signature first (message, files_changed as list)
1303
+ category = ticket_extractor.categorize_commit(message, files_changed_list)
1304
+ except TypeError:
1305
+ # Fall back to base signature (message only)
1306
+ category = ticket_extractor.categorize_commit(message)
1307
+
1308
+ # Get week boundary (Monday start)
1309
+ week_start = self._get_week_start(timestamp)
1310
+
1311
+ # Aggregate by category
1312
+ weekly_categories[week_start][category] += 1
1313
+
1314
+ # Aggregate metrics
1315
+ weekly_metrics[week_start]['lines_added'] += commit.get("insertions", 0)
1316
+ weekly_metrics[week_start]['lines_removed'] += commit.get("deletions", 0)
1317
+ weekly_metrics[week_start]['files_changed'] += files_changed_count
1318
+
1319
+ # Track unique developers (use canonical_id or fallback to email)
1320
+ developer_id = commit.get("canonical_id") or commit.get("author_email", "Unknown")
1321
+ weekly_metrics[week_start]['developers'].add(developer_id)
1322
+
1323
+ processed_commits += 1
1324
+
1325
+ # Build CSV rows with comprehensive metrics
1326
+ rows = []
1327
+ all_categories = set()
1328
+
1329
+ # Collect all categories across all weeks
1330
+ for week_data in weekly_categories.values():
1331
+ all_categories.update(week_data.keys())
1332
+
1333
+ # Ensure standard categories are included even if not found
1334
+ standard_categories = ["bug_fix", "feature", "refactor", "documentation",
1335
+ "maintenance", "test", "style", "build", "integration", "other"]
1336
+ all_categories.update(standard_categories)
1337
+ sorted_categories = sorted(all_categories)
1338
+
1339
+ # Generate weekly rows
1340
+ for week_start in sorted(weekly_categories.keys()):
1341
+ week_data = weekly_categories[week_start]
1342
+ week_metrics = weekly_metrics[week_start]
1343
+ total_commits = sum(week_data.values())
1344
+
1345
+ row = {
1346
+ "week_start": week_start.strftime("%Y-%m-%d"),
1347
+ "total_commits": total_commits,
1348
+ "lines_added": week_metrics['lines_added'],
1349
+ "lines_removed": week_metrics['lines_removed'],
1350
+ "files_changed": week_metrics['files_changed'],
1351
+ "developer_count": len(week_metrics['developers'])
1352
+ }
1353
+
1354
+ # Add each category count and percentage
1355
+ for category in sorted_categories:
1356
+ count = week_data.get(category, 0)
1357
+ pct = (count / total_commits * 100) if total_commits > 0 else 0
1358
+
1359
+ row[f"{category}_count"] = count
1360
+ row[f"{category}_pct"] = round(pct, 1)
1361
+
1362
+ rows.append(row)
1363
+
1364
+ # Write CSV with comprehensive headers
1365
+ if rows:
1366
+ df = pd.DataFrame(rows)
1367
+ df.to_csv(output_path, index=False)
1368
+ else:
1369
+ # Write empty CSV with comprehensive headers
1370
+ headers = ["week_start", "total_commits", "lines_added", "lines_removed",
1371
+ "files_changed", "developer_count"]
1372
+
1373
+ for category in sorted_categories:
1374
+ headers.extend([f"{category}_count", f"{category}_pct"])
1375
+
1376
+ with open(output_path, "w", newline="") as f:
1377
+ writer = csv.DictWriter(f, fieldnames=headers)
1378
+ writer.writeheader()
1379
+
1380
+ return output_path
1381
+
1382
def generate_story_point_correlation_report(
    self,
    commits: list[dict[str, Any]],
    prs: list[dict[str, Any]],
    pm_data: Optional[dict[str, Any]],
    output_path: Path,
    weeks: int = 12,
) -> Path:
    """Write the story point correlation analysis to a CSV file.

    WHY: Correlating estimated effort (story points) with observed work
    (commits, lines of code, time) lets teams calibrate their estimates and
    spot process problems.

    INTEGRATION: The heavy lifting is delegated to
    ``StoryPointCorrelationAnalyzer``, constructed with this writer's
    ``anonymize`` and ``identity_resolver`` settings. Commits are passed
    through ``self._filter_excluded_authors_list`` first, as the other
    reports in this class do.

    Args:
        commits: Commit dictionaries, including story point data.
        prs: Pull request dictionaries.
        pm_data: PM platform data with issue correlations, or ``None``.
        output_path: Destination of the CSV file.
        weeks: Number of weeks to analyze.

    Returns:
        Whatever ``analyzer.generate_correlation_report`` returns (annotated
        as the path of the generated CSV).

    Raises:
        Exception: Any failure is logged, a header-only CSV is written to
            ``output_path`` as a fallback, and the original exception is
            re-raised.
    """
    try:
        # Imported lazily to avoid a circular import at module load time.
        from .story_point_correlation import StoryPointCorrelationAnalyzer

        correlation_analyzer = StoryPointCorrelationAnalyzer(
            anonymize=self.anonymize,
            identity_resolver=self.identity_resolver,
        )

        # Same author-exclusion rule as the other reports.
        included_commits = self._filter_excluded_authors_list(commits)

        logger.debug(f"Generating story point correlation report: {output_path}")
        return correlation_analyzer.generate_correlation_report(
            included_commits, prs, pm_data, output_path, weeks
        )

    except Exception as exc:
        logger.error(f"Error generating story point correlation report: {exc}")

        # Leave a header-only file behind so the output path always exists,
        # then let the caller see the original failure.
        fallback_columns = [
            "week_start",
            "metric_type",
            "developer_name",
            "sp_commits_correlation",
            "sp_lines_correlation",
            "sp_files_correlation",
            "sp_prs_correlation",
            "sp_complexity_correlation",
            "sample_size",
            "total_story_points",
            "total_commits",
            "story_points_completed",
            "commits_count",
            "prs_merged",
            "developers_active",
            "velocity_trend",
            "overall_accuracy",
            "avg_weekly_accuracy",
            "consistency",
            "weeks_active",
            "total_estimated_sp",
            "total_actual_sp",
            "estimation_ratio",
        ]
        pd.DataFrame(columns=fallback_columns).to_csv(output_path, index=False)

        raise
1446
+
1447
def generate_weekly_velocity_report(
    self,
    commits: list[dict[str, Any]],
    prs: list[dict[str, Any]],
    output_path: Path,
    weeks: int = 12,
) -> Path:
    """Generate weekly lines-per-story-point velocity analysis report.

    WHY: Velocity analysis helps teams understand the relationship between
    estimated effort (story points) and actual work performed (lines of code).
    This enables process improvements, better estimation calibration, and
    identification of efficiency trends over time.

    DESIGN DECISION: Combines both PR-based and commit-based story points
    to provide comprehensive coverage, as some organizations track story
    points differently across their development workflow.

    Commits and PRs are normalised to UTC, dropped when they fall outside
    the last ``weeks`` weeks (measured back from "now"), and bucketed by
    the value returned from ``self._get_week_start``. One CSV row is
    produced per bucket, in ascending week order.

    Args:
        commits: List of commit data dictionaries with story points
        prs: List of pull request data dictionaries with story points
        output_path: Path where the CSV report should be written
        weeks: Number of weeks to analyze (default: 12)

    Returns:
        Path to the generated CSV file
    """
    # Function-scope import so it is resolved once, not once per week row.
    import math

    # Single source of truth for the CSV layout (used for both the populated
    # and the header-only file).
    headers = [
        "week_start",
        "total_story_points",
        "pr_story_points",
        "commit_story_points",
        "total_lines",
        "lines_added",
        "lines_removed",
        "files_changed",
        "lines_per_point",
        "commits_per_point",
        "developers_involved",
        "efficiency_score",
        "velocity_trend",
        "commits_count",
        "prs_with_story_points",
        "commits_with_story_points",
        "story_point_coverage_pct",
        "avg_lines_per_commit",
        "avg_files_per_commit",
    ]

    # Apply exclusion filtering in Phase 2
    commits = self._filter_excluded_authors_list(commits)

    # Calculate date range (timezone-aware to match commit timestamps)
    end_date = datetime.now(timezone.utc)
    start_date = end_date - timedelta(weeks=weeks)

    logger.debug("Weekly velocity report date range:")
    logger.debug(f"  start_date: {start_date} (tzinfo: {start_date.tzinfo})")
    logger.debug(f"  end_date: {end_date} (tzinfo: {end_date.tzinfo})")

    def _to_utc(moment):
        """Return ``moment`` as UTC-aware; naive values are treated as UTC."""
        if getattr(moment, "tzinfo", None) is not None:
            if moment.tzinfo != timezone.utc:
                return moment.astimezone(timezone.utc)
            return moment
        return moment.replace(tzinfo=timezone.utc)

    def _outside_window(moment, context: str) -> bool:
        """True when ``moment`` is before start_date or after end_date."""
        return self._safe_datetime_compare(
            moment, start_date, "lt", context
        ) or self._safe_datetime_compare(moment, end_date, "gt", context)

    # Initialize weekly aggregation structures
    weekly_data: dict[datetime, dict[str, Any]] = defaultdict(
        lambda: {
            "total_story_points": 0,
            "pr_story_points": 0,
            "commit_story_points": 0,
            "total_lines": 0,
            "lines_added": 0,
            "lines_removed": 0,
            "files_changed": 0,
            "commits_count": 0,
            "developers": set(),
            "prs_with_sp": 0,
            "commits_with_sp": 0,
        }
    )

    # Process commits for weekly aggregation
    for commit in commits:
        timestamp = commit["timestamp"]
        logger.debug(
            f"Processing commit timestamp: {timestamp} (tzinfo: {getattr(timestamp, 'tzinfo', 'N/A')})"
        )

        # Ensure consistent timezone handling
        timestamp = _to_utc(timestamp)

        # Check date range
        if _outside_window(timestamp, "generate_weekly_velocity_report range check"):
            continue

        # Get week start (Monday)
        week_data = weekly_data[self._get_week_start(timestamp)]

        # Aggregate commit metrics; "filtered_*" values win when present and
        # `or 0` guards against explicit None values.
        story_points = commit.get("story_points", 0) or 0
        lines_added = commit.get("filtered_insertions", commit.get("insertions", 0)) or 0
        lines_removed = commit.get("filtered_deletions", commit.get("deletions", 0)) or 0
        files_changed = (
            commit.get("filtered_files_changed", commit.get("files_changed", 0)) or 0
        )

        week_data["commits_count"] += 1
        week_data["commit_story_points"] += story_points
        week_data["total_story_points"] += story_points
        week_data["lines_added"] += lines_added
        week_data["lines_removed"] += lines_removed
        week_data["total_lines"] += lines_added + lines_removed
        week_data["files_changed"] += files_changed

        # Track developers. `or` (rather than a .get default) is required so
        # that a present-but-None canonical_id falls back to the email instead
        # of registering None as a developer.
        developer_id = (
            commit.get("canonical_id") or commit.get("author_email") or "unknown"
        )
        week_data["developers"].add(developer_id)

        if story_points > 0:
            week_data["commits_with_sp"] += 1

    # Process PRs for weekly aggregation (by merge date or creation date)
    for pr in prs:
        # Use merged_at if available and valid, otherwise created_at
        pr_date = pr.get("merged_at") or pr.get("created_at")
        if not pr_date:
            continue

        # Handle string dates (convert to datetime if needed)
        if isinstance(pr_date, str):
            try:
                from dateutil.parser import parse

                pr_date = parse(pr_date)
            except Exception as exc:
                # Best effort: an unparseable date only drops this PR.
                logger.debug(f"Skipping PR with unparseable date {pr_date!r}: {exc}")
                continue

        # Ensure timezone consistency
        pr_date = _to_utc(pr_date)

        # Check date range
        if _outside_window(pr_date, "generate_weekly_velocity_report PR range check"):
            continue

        # Get week start (this also creates the week bucket if it is new)
        week_data = weekly_data[self._get_week_start(pr_date)]

        # Aggregate PR metrics
        story_points = pr.get("story_points", 0) or 0
        if story_points > 0:
            week_data["pr_story_points"] += story_points
            week_data["total_story_points"] += story_points
            week_data["prs_with_sp"] += 1

            # Track developer from PR.
            # NOTE(review): the author is recorded only for PRs that carry
            # story points, following the original block's apparent
            # structure - confirm this is intended.
            developer_id = pr.get("canonical_id") or pr.get("author") or "unknown"
            week_data["developers"].add(developer_id)

    # Build CSV rows with velocity metrics
    rows = []
    previous_week_lines_per_point = None

    for week_start in sorted(weekly_data.keys()):
        week_data = weekly_data[week_start]
        total_story_points = week_data["total_story_points"]
        total_lines = week_data["total_lines"]
        commits_count = week_data["commits_count"]
        # Denominator for per-commit averages; avoids division by zero for
        # weeks that only contain PRs.
        commit_divisor = max(commits_count, 1)

        # Calculate key metrics with division by zero protection
        if total_story_points > 0:
            lines_per_point = total_lines / total_story_points
            commits_per_point = commits_count / total_story_points
        else:
            lines_per_point = 0
            commits_per_point = 0

        # Calculate efficiency score (inverse of lines per point, normalized
        # to 0-100 scale). Higher efficiency = fewer lines per story point.
        # A logarithmic scale is used to handle wide ranges.
        if lines_per_point > 0:
            efficiency_score = max(0, 100 - (math.log10(max(lines_per_point, 1)) * 20))
        else:
            efficiency_score = 0

        # Calculate velocity trend (week-over-week change in lines per point)
        if previous_week_lines_per_point is not None and previous_week_lines_per_point > 0:
            if lines_per_point > 0:
                velocity_trend = (
                    (lines_per_point - previous_week_lines_per_point)
                    / previous_week_lines_per_point
                ) * 100
            else:
                velocity_trend = -100  # Went from some lines per point to zero
        else:
            velocity_trend = 0  # No previous data for comparison

        rows.append(
            {
                "week_start": week_start.strftime("%Y-%m-%d"),
                "total_story_points": total_story_points,
                "pr_story_points": week_data["pr_story_points"],
                "commit_story_points": week_data["commit_story_points"],
                "total_lines": total_lines,
                "lines_added": week_data["lines_added"],
                "lines_removed": week_data["lines_removed"],
                "files_changed": week_data["files_changed"],
                "lines_per_point": round(lines_per_point, 2) if lines_per_point > 0 else 0,
                "commits_per_point": (
                    round(commits_per_point, 2) if commits_per_point > 0 else 0
                ),
                "developers_involved": len(week_data["developers"]),
                "efficiency_score": round(efficiency_score, 1),
                "velocity_trend": round(velocity_trend, 1),
                # Additional metrics for deeper analysis
                "commits_count": commits_count,
                "prs_with_story_points": week_data["prs_with_sp"],
                "commits_with_story_points": week_data["commits_with_sp"],
                "story_point_coverage_pct": round(
                    (week_data["commits_with_sp"] / commit_divisor) * 100, 1
                ),
                "avg_lines_per_commit": round(total_lines / commit_divisor, 1),
                "avg_files_per_commit": round(
                    week_data["files_changed"] / commit_divisor, 1
                ),
            }
        )

        # Store for next iteration's trend calculation; weeks without a
        # usable ratio reset the baseline.
        previous_week_lines_per_point = lines_per_point if lines_per_point > 0 else None

    # Write CSV
    if rows:
        pd.DataFrame(rows, columns=headers).to_csv(output_path, index=False)
    else:
        # Write empty CSV with headers
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            csv.DictWriter(f, fieldnames=headers).writeheader()

    return output_path
1688
+
1689
def generate_weekly_dora_report(
    self,
    commits: list[dict[str, Any]],
    prs: list[dict[str, Any]],
    output_path: Path,
    weeks: int = 12,
) -> Path:
    """Write a week-by-week DORA metrics report as CSV.

    WHY: Tracking the DORA metrics per week exposes trends in software
    delivery performance, so teams can see improvements and spot periods
    of degraded performance.

    DESIGN DECISION: The numbers come from
    ``DORAMetricsCalculator.calculate_weekly_dora_metrics`` so the weekly
    view shares the calculator's methodology; this method only selects the
    time window and serialises the result.

    Args:
        commits: Commit dictionaries (authors excluded by configuration are
            removed via ``self._filter_excluded_authors_list``).
        prs: Pull request dictionaries.
        output_path: Where the CSV file is written.
        weeks: Size of the analysis window, counted back from now
            (default: 12).

    Returns:
        ``output_path``.

    Raises:
        Exception: If calculation or writing fails, the error is logged, a
            header-only CSV is written to ``output_path``, and the original
            exception is re-raised.
    """
    from ..metrics.dora import DORAMetricsCalculator

    # Column layout used whenever there is no data to report.
    empty_report_columns = [
        "week_start",
        "week_end",
        "deployment_frequency",
        "lead_time_hours",
        "change_failure_rate",
        "mttr_hours",
        "total_failures",
        "total_commits",
        "total_prs",
        "deployment_frequency_4w_avg",
        "lead_time_4w_avg",
        "change_failure_rate_4w_avg",
        "mttr_4w_avg",
        "deployment_frequency_change_pct",
        "lead_time_change_pct",
        "change_failure_rate_change_pct",
        "mttr_change_pct",
        "deployment_frequency_trend",
        "lead_time_trend",
        "change_failure_rate_trend",
        "mttr_trend",
    ]

    def _write_header_only_report() -> None:
        """Write a CSV containing just the header row."""
        pd.DataFrame(columns=empty_report_columns).to_csv(output_path, index=False)

    # Apply exclusion filtering in Phase 2
    commits = self._filter_excluded_authors_list(commits)

    # Analysis window: the last `weeks` weeks, in UTC.
    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(weeks=weeks)

    calculator = DORAMetricsCalculator()

    try:
        weekly_rows = calculator.calculate_weekly_dora_metrics(
            commits=commits,
            prs=prs,
            start_date=window_start,
            end_date=window_end,
        )

        if weekly_rows:
            # Convert to DataFrame and write CSV
            pd.DataFrame(weekly_rows).to_csv(output_path, index=False)
        else:
            _write_header_only_report()

        return output_path

    except Exception as exc:
        logger.error(f"Error generating weekly DORA report: {exc}")

        # Create empty report as fallback, then surface the failure.
        _write_header_only_report()

        raise