gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -1,255 +1,1403 @@
 """Git repository analyzer with batch processing support."""
+
 import fnmatch
-from datetime import datetime
+import logging
+import os
+import re
+from collections.abc import Generator
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
-from typing import Any, Dict, Generator, List, Optional

 import git
 from git import Repo
-from tqdm import tqdm
+from typing import Any, Optional

 from ..extractors.story_points import StoryPointExtractor
 from ..extractors.tickets import TicketExtractor
 from .branch_mapper import BranchToProjectMapper
 from .cache import GitAnalysisCache
+from .progress import get_progress_service
+
+# Import ML extractor with fallback
+try:
+    from ..extractors.ml_tickets import MLTicketExtractor
+
+    ML_EXTRACTOR_AVAILABLE = True
+except ImportError:
+    ML_EXTRACTOR_AVAILABLE = False
+
+# Get logger for this module
+logger = logging.getLogger(__name__)


 class GitAnalyzer:
     """Analyze Git repositories with caching and batch processing."""
-
-    def __init__(self, cache: GitAnalysisCache, batch_size: int = 1000,
-                 branch_mapping_rules: Optional[Dict[str, List[str]]] = None,
-                 allowed_ticket_platforms: Optional[List[str]] = None,
-                 exclude_paths: Optional[List[str]] = None):
-        """Initialize analyzer with cache."""
+
+    def __init__(
+        self,
+        cache: GitAnalysisCache,
+        batch_size: int = 1000,
+        branch_mapping_rules: Optional[dict[str, list[str]]] = None,
+        allowed_ticket_platforms: Optional[list[str]] = None,
+        exclude_paths: Optional[list[str]] = None,
+        story_point_patterns: Optional[list[str]] = None,
+        ml_categorization_config: Optional[dict[str, Any]] = None,
+        llm_config: Optional[dict[str, Any]] = None,
+        classification_config: Optional[dict[str, Any]] = None,
+        branch_analysis_config: Optional[dict[str, Any]] = None,
+    ):
+        """Initialize analyzer with cache and optional ML categorization and commit classification.
+
+        Args:
+            cache: Git analysis cache instance
+            batch_size: Number of commits to process in each batch
+            branch_mapping_rules: Rules for mapping branches to projects
+            allowed_ticket_platforms: List of allowed ticket platforms
+            exclude_paths: List of file paths to exclude from analysis
+            story_point_patterns: List of regex patterns for extracting story points
+            ml_categorization_config: Configuration for ML-based categorization
+            llm_config: Configuration for LLM-based commit classification
+            classification_config: Configuration for commit classification
+            branch_analysis_config: Configuration for branch analysis optimization
+        """
         self.cache = cache
         self.batch_size = batch_size
-        self.story_point_extractor = StoryPointExtractor()
-        self.ticket_extractor = TicketExtractor(allowed_platforms=allowed_ticket_platforms)
+        self.story_point_extractor = StoryPointExtractor(patterns=story_point_patterns)
+
+        # Initialize ticket extractor (ML or standard based on config and availability)
+        if (
+            ml_categorization_config
+            and ml_categorization_config.get("enabled", True)
+            and ML_EXTRACTOR_AVAILABLE
+        ):
+            logger.info("Initializing ML-enhanced ticket extractor")
+
+            # Check if LLM classification is enabled
+            enable_llm = llm_config and llm_config.get("enabled", False)
+            if enable_llm:
+                logger.info("LLM-based commit classification enabled")
+
+            self.ticket_extractor = MLTicketExtractor(
+                allowed_platforms=allowed_ticket_platforms,
+                ml_config=ml_categorization_config,
+                llm_config=llm_config,
+                cache_dir=cache.cache_dir / "ml_predictions",
+                enable_ml=True,
+                enable_llm=enable_llm,
+            )
+        else:
+            if ml_categorization_config and ml_categorization_config.get("enabled", True):
+                if not ML_EXTRACTOR_AVAILABLE:
+                    logger.warning(
+                        "ML categorization requested but dependencies not available, using standard extractor"
+                    )
+                else:
+                    logger.info(
+                        "ML categorization disabled in configuration, using standard extractor"
+                    )
+            else:
+                logger.debug("Using standard ticket extractor")
+
+            self.ticket_extractor = TicketExtractor(allowed_platforms=allowed_ticket_platforms)
+
         self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
         self.exclude_paths = exclude_paths or []
-
-    def analyze_repository(self, repo_path: Path, since: datetime,
-                           branch: Optional[str] = None) -> List[Dict[str, Any]]:
-        """Analyze a Git repository with batch processing."""
+
+        # Initialize branch analysis configuration
+        self.branch_analysis_config = branch_analysis_config or {}
+        self.branch_strategy = self.branch_analysis_config.get("strategy", "all")
+        self.max_branches_per_repo = self.branch_analysis_config.get("max_branches_per_repo", 50)
+        self.active_days_threshold = self.branch_analysis_config.get("active_days_threshold", 90)
+        self.include_main_branches = self.branch_analysis_config.get("include_main_branches", True)
+        self.always_include_patterns = self.branch_analysis_config.get(
+            "always_include_patterns",
+            [r"^(main|master|develop|dev)$", r"^release/.*", r"^hotfix/.*"],
+        )
+        self.always_exclude_patterns = self.branch_analysis_config.get(
+            "always_exclude_patterns",
+            [r"^dependabot/.*", r"^renovate/.*", r".*-backup$", r".*-temp$"],
+        )
+        self.enable_progress_logging = self.branch_analysis_config.get(
+            "enable_progress_logging", True
+        )
+        self.branch_commit_limit = self.branch_analysis_config.get(
+            "branch_commit_limit", None
+        )  # No limit by default
+
+        # Initialize commit classifier if enabled
+        self.classification_enabled = classification_config and classification_config.get(
+            "enabled", False
+        )
+        self.commit_classifier = None
+
+        if self.classification_enabled:
+            try:
+                from ..classification.classifier import CommitClassifier
+
+                self.commit_classifier = CommitClassifier(
+                    config=classification_config, cache_dir=cache.cache_dir / "classification"
+                )
+                logger.info("Commit classification enabled")
+            except ImportError as e:
+                logger.warning(f"Classification dependencies not available: {e}")
+                self.classification_enabled = False
+            except Exception as e:
+                logger.error(f"Failed to initialize commit classifier: {e}")
+                self.classification_enabled = False
+
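Everything new in the constructor is driven by plain config dicts, with graceful fallback when optional dependencies are missing. A minimal wiring sketch (key names are taken from the code above; the cache constructor arguments and literal values are illustrative assumptions):

```python
from pathlib import Path

from gitflow_analytics.core.analyzer import GitAnalyzer
from gitflow_analytics.core.cache import GitAnalysisCache

cache = GitAnalysisCache(Path(".gitflow-cache"))  # constructor args assumed

analyzer = GitAnalyzer(
    cache,
    ml_categorization_config={"enabled": True},  # falls back if ML deps are absent
    llm_config={"enabled": False},               # LLM classification stays off
    classification_config={"enabled": True},     # CommitClassifier imported lazily
)
```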
+    def analyze_repository(
+        self, repo_path: Path, since: datetime, branch: Optional[str] = None
+    ) -> list[dict[str, Any]]:
+        """Analyze a Git repository with batch processing and optional classification."""
         try:
             repo = Repo(repo_path)
+            # Update repository from remote before analysis
+            self._update_repository(repo)
         except Exception as e:
             raise ValueError(f"Failed to open repository at {repo_path}: {e}") from e
-
-        # Get commits to analyze
-        commits = self._get_commits(repo, since, branch)
+
+        # Get commits to analyze with optimized branch selection
+        commits = self._get_commits_optimized(repo, since, branch)
         total_commits = len(commits)
-
+
         if total_commits == 0:
             return []
-
         analyzed_commits = []
-
+        total_cache_hits = 0
+        total_cache_misses = 0
+
         # Process in batches with progress bar
-        with tqdm(total=total_commits, desc=f"Analyzing {repo_path.name}") as pbar:
+        processed_commits = 0
+        progress_service = get_progress_service()
+
+        # Only create progress bar if enabled
+        if self.enable_progress_logging:
+            progress_ctx = progress_service.create_progress(
+                total=total_commits, description=f"Analyzing {repo_path.name}", unit="commits"
+            )
+        else:
+            progress_ctx = None
+
+        try:
             for batch in self._batch_commits(commits, self.batch_size):
-                batch_results = self._process_batch(repo, repo_path, batch)
+                batch_results, batch_hits, batch_misses = self._process_batch(
+                    repo, repo_path, batch
+                )
                 analyzed_commits.extend(batch_results)
-
-                # Cache the batch
-                self.cache.cache_commits_batch(str(repo_path), batch_results)
-
-                pbar.update(len(batch))
-
+
+                # Track overall cache performance
+                total_cache_hits += batch_hits
+                total_cache_misses += batch_misses
+
+                # Note: Caching is now handled within _process_batch for better performance
+
+                # Update progress tracking
+                batch_size = len(batch)
+                processed_commits += batch_size
+
+                # Update progress bar with cache info if enabled
+                if progress_ctx:
+                    hit_rate = (batch_hits / batch_size) * 100 if batch_size > 0 else 0
+                    progress_service.set_description(
+                        progress_ctx,
+                        f"Analyzing {repo_path.name} (cache hit: {hit_rate:.1f}%, {processed_commits}/{total_commits})",
+                    )
+                    progress_service.update(progress_ctx, batch_size)
+        finally:
+            if progress_ctx:
+                progress_service.complete(progress_ctx)
+
+        # Debug logging for progress tracking issues
+        if os.getenv("GITFLOW_DEBUG", "").lower() in ("1", "true", "yes"):
+            logger.debug(
+                f"Final progress: Processed: {processed_commits}/{total_commits} commits"
+            )
+
+        # Log overall cache performance
+        if total_cache_hits + total_cache_misses > 0:
+            overall_hit_rate = (total_cache_hits / (total_cache_hits + total_cache_misses)) * 100
+            logger.info(
+                f"Repository {repo_path.name}: {total_cache_hits} cached, {total_cache_misses} analyzed ({overall_hit_rate:.1f}% cache hit rate)"
+            )
+
+        # Apply commit classification if enabled
+        if self.classification_enabled and self.commit_classifier and analyzed_commits:
+            logger.info(f"Applying commit classification to {len(analyzed_commits)} commits")
+
+            try:
+                # Prepare commits for classification (add file changes information)
+                commits_with_files = self._prepare_commits_for_classification(
+                    repo, analyzed_commits
+                )
+
+                # Get classification results
+                classification_results = self.commit_classifier.classify_commits(commits_with_files)
+
+                # Merge classification results back into analyzed commits
+                for commit, classification in zip(analyzed_commits, classification_results):
+                    if classification:  # Classification might be empty if disabled or failed
+                        commit.update(
+                            {
+                                "predicted_class": classification.get("predicted_class"),
+                                "classification_confidence": classification.get("confidence"),
+                                "is_reliable_prediction": classification.get(
+                                    "is_reliable_prediction"
+                                ),
+                                "class_probabilities": classification.get("class_probabilities"),
+                                "file_analysis_summary": classification.get("file_analysis"),
+                                "classification_metadata": classification.get(
+                                    "classification_metadata"
+                                ),
+                            }
+                        )
+
+                logger.info(f"Successfully classified {len(classification_results)} commits")
+
+            except Exception as e:
+                logger.error(f"Commit classification failed: {e}")
+                # Continue without classification rather than failing entirely
+
         return analyzed_commits
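End to end the method now reads: fetch, select commits by branch strategy, batch through the cache, then optionally classify. A minimal call site, assuming the analyzer constructed earlier (the 90-day window is arbitrary):

```python
from datetime import datetime, timedelta, timezone
from pathlib import Path

since = datetime.now(timezone.utc) - timedelta(days=90)
commits = analyzer.analyze_repository(Path("/path/to/repo"), since)

# Each entry is a plain dict; classification keys appear only when enabled.
for c in commits[:3]:
    print(c["hash"][:8], c["timestamp"], c.get("predicted_class"))
```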
-
-    def _get_commits(self, repo: Repo, since: datetime,
-                     branch: Optional[str] = None) -> List[git.Commit]:
-        """Get commits from repository."""
-        if branch:
+
+    def _update_repository(self, repo) -> bool:
+        """Update repository from remote before analysis.
+
+        WHY: This ensures we have the latest commits from the remote repository
+        before performing analysis. Critical for getting accurate data especially
+        when analyzing repositories that are actively being developed.
+
+        DESIGN DECISION: Uses fetch() for all cases, then pull() only when on a
+        tracking branch that's not in detached HEAD state. This approach:
+        - Handles detached HEAD states gracefully (common in CI/CD)
+        - Always gets latest refs from remote via fetch
+        - Only attempts pull when it's safe to do so
+        - Continues analysis even if update fails (logs warning)
+
+        Args:
+            repo: GitPython Repo object
+
+        Returns:
+            bool: True if update succeeded, False if failed (but analysis continues)
+        """
+        try:
+            if repo.remotes:
+                origin = repo.remotes.origin
+                logger.info("Fetching latest changes from remote")
+                origin.fetch()
+
+                # Only try to pull if not in detached HEAD state
+                if not repo.head.is_detached:
+                    current_branch = repo.active_branch
+                    tracking = current_branch.tracking_branch()
+                    if tracking:
+                        # Pull latest changes
+                        origin.pull()
+                        logger.debug(f"Pulled latest changes for {current_branch.name}")
+                    else:
+                        logger.debug(
+                            f"Branch {current_branch.name} has no tracking branch, skipping pull"
+                        )
+                else:
+                    logger.debug("Repository in detached HEAD state, skipping pull")
+                return True
+            else:
+                logger.debug("No remotes configured, skipping repository update")
+                return True
+        except Exception as e:
+            logger.warning(f"Could not update repository: {e}")
+            # Continue with analysis using local state
+            return False
+
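The detached-HEAD guard matters because GitPython raises if you ask a detached checkout for its active branch; the check the method relies on looks like this in isolation:

```python
from git import Repo

repo = Repo("/path/to/repo")
if repo.head.is_detached:
    # repo.active_branch would raise TypeError here (common in CI checkouts)
    print("detached HEAD: fetch only, no pull")
else:
    branch = repo.active_branch
    print(branch.name, branch.tracking_branch())  # None for local-only branches
```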
+    def _get_commits_optimized(
+        self, repo: Repo, since: datetime, branch: Optional[str] = None
+    ) -> list[git.Commit]:
+        """Get commits from repository with branch analysis strategy.
+
+        WHY: Different analysis needs require different branch coverage approaches.
+        The default "all" strategy ensures complete commit coverage without missing
+        important development work that happens on feature branches.
+
+        DESIGN DECISION: Three strategies available:
+        1. "main_only": Only analyze main/master branch (fastest, least comprehensive)
+        2. "smart": Analyze active branches with smart filtering (balanced, may miss commits)
+        3. "all": Analyze all branches (comprehensive coverage, default)
+
+        DEFAULT STRATEGY CHANGED: Now defaults to "all" to ensure complete coverage
+        after reports that "smart" strategy was missing significant commits (~100+ commits
+        and entire developers working on feature branches).
+
+        The "smart" strategy filters branches based on:
+        - Recent activity (commits within active_days_threshold)
+        - Branch naming patterns (always include main, release, hotfix branches)
+        - Exclude automation branches (dependabot, renovate, etc.)
+        - Limit total branches per repository
+
+        The "all" strategy:
+        - Analyzes all local and remote branches/refs
+        - No artificial branch limits
+        - No commit limits per branch (unless explicitly configured)
+        - Ensures complete development history capture
+        """
+        logger.debug(f"Getting commits since: {since} (tzinfo: {getattr(since, 'tzinfo', 'N/A')})")
+        logger.debug(f"Using branch analysis strategy: {self.branch_strategy}")
+
+        if self.branch_strategy == "main_only":
+            return self._get_main_branch_commits(repo, since, branch)
+        elif self.branch_strategy == "all":
+            logger.info("Using 'all' branches strategy for complete commit coverage")
+            return self._get_all_branch_commits(repo, since)
+        else:  # smart strategy
+            return self._get_smart_branch_commits(repo, since)
+
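Which of the three private methods runs is decided solely by the `strategy` key. A hedged configuration sketch (the keys match the `branch_analysis_config` lookups in `__init__`; the values are examples, not the defaults):

```python
analyzer = GitAnalyzer(
    cache,
    branch_analysis_config={
        "strategy": "smart",          # "main_only" | "smart" | "all" (default)
        "max_branches_per_repo": 25,
        "active_days_threshold": 30,
        "branch_commit_limit": None,  # no per-branch cap unless set
    },
)
```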
+    def _get_main_branch_commits(
+        self, repo: Repo, since: datetime, branch: Optional[str] = None
+    ) -> list[git.Commit]:
+        """Get commits from main branch only (fastest strategy).
+
+        Args:
+            repo: Git repository object
+            since: Date to get commits since
+            branch: Specific branch to analyze (overrides main branch detection)
+
+        Returns:
+            List of commits from main branch only
+        """
+        target_branch = branch
+        if not target_branch:
+            # Auto-detect main branch
+            main_branch_names = ["main", "master", "develop", "dev"]
+            for branch_name in main_branch_names:
+                try:
+                    if branch_name in [b.name for b in repo.branches]:
+                        target_branch = branch_name
+                        break
+                except Exception:
+                    continue
+
+            if not target_branch and repo.branches:
+                target_branch = repo.branches[0].name  # Fallback to first branch
+
+        if not target_branch:
+            logger.warning("No main branch found, no commits will be analyzed")
+            return []
+
+        logger.debug(f"Analyzing main branch only: {target_branch}")
+
+        try:
+            if self.branch_commit_limit:
+                commits = list(
+                    repo.iter_commits(
+                        target_branch, since=since, max_count=self.branch_commit_limit
+                    )
+                )
+            else:
+                commits = list(repo.iter_commits(target_branch, since=since))
+            logger.debug(f"Found {len(commits)} commits in main branch {target_branch}")
+            return sorted(commits, key=lambda c: c.committed_datetime)
+        except git.GitCommandError as e:
+            logger.warning(f"Failed to get commits from branch {target_branch}: {e}")
+            return []
+
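Both arms of the try block funnel into `Repo.iter_commits`, which forwards keyword arguments to `git rev-list`, so the two forms used here are simply:

```python
# since becomes --since (date filter); max_count caps the walk.
limited = list(repo.iter_commits("main", since=since, max_count=500))
unlimited = list(repo.iter_commits("main", since=since))
```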
+    def _get_all_branch_commits(self, repo: Repo, since: datetime) -> list[git.Commit]:
+        """Get commits from all branches (comprehensive analysis).
+
+        WHY: This strategy captures ALL commits from ALL branches without artificial limitations.
+        It's designed to ensure complete coverage even if it takes longer to run.
+
+        DESIGN DECISION: Analyzes both local and remote branches to ensure we don't miss
+        commits that exist only on remote branches. Uses no commit limits per branch
+        to capture complete development history.
+
+        Args:
+            repo: Git repository object
+            since: Date to get commits since
+
+        Returns:
+            List of unique commits from all branches
+        """
+        logger.info("Analyzing all branches for complete commit coverage")
+
+        commits = []
+        branch_count = 0
+        processed_refs = set()  # Track processed refs to avoid duplicates
+
+        # Process all refs (local branches, remote branches, tags)
+        for ref in repo.refs:
+            # Skip if we've already processed this ref
+            ref_name = ref.name
+            if ref_name in processed_refs:
+                continue
+
+            processed_refs.add(ref_name)
+            branch_count += 1
+
             try:
-                commits = list(repo.iter_commits(branch, since=since))
-            except git.GitCommandError:
-                # Branch doesn't exist
-                return []
+                # No commit limit - get ALL commits from this branch
+                if self.branch_commit_limit:
+                    branch_commits = list(
+                        repo.iter_commits(ref, since=since, max_count=self.branch_commit_limit)
+                    )
+                    logger.debug(
+                        f"Branch {ref_name}: found {len(branch_commits)} commits (limited to {self.branch_commit_limit})"
+                    )
+                else:
+                    branch_commits = list(repo.iter_commits(ref, since=since))
+                    logger.debug(
+                        f"Branch {ref_name}: found {len(branch_commits)} commits (no limit)"
+                    )
+
+                commits.extend(branch_commits)
+
+                if self.enable_progress_logging and branch_count % 10 == 0:
+                    logger.info(
+                        f"Processed {branch_count} branches, found {len(commits)} total commits so far"
+                    )
+
+            except git.GitCommandError as e:
+                logger.debug(f"Skipping branch {ref_name} due to error: {e}")
+                continue
+
+        # Remove duplicates while preserving order
+        unique_commits = self._deduplicate_commits(commits)
+
+        logger.info(
+            f"Found {len(unique_commits)} unique commits across {branch_count} branches/refs"
+        )
+        return sorted(unique_commits, key=lambda c: c.committed_datetime)
+
+ def is_analysis_needed(
470
+ self,
471
+ repo_path: Path,
472
+ project_key: str,
473
+ analysis_start: datetime,
474
+ analysis_end: datetime,
475
+ weeks_analyzed: int,
476
+ config_hash: Optional[str] = None,
477
+ force_fetch: bool = False,
478
+ ) -> tuple[bool, Optional[dict[str, Any]]]:
479
+ """Check if repository analysis is needed or if cached data can be used.
480
+
481
+ WHY: Implements cache-first workflow by checking if repository has been
482
+ fully analyzed for the given period. Enables "fetch once, report many".
483
+
484
+ Args:
485
+ repo_path: Path to the repository
486
+ project_key: Project key for the repository
487
+ analysis_start: Start of the analysis period
488
+ analysis_end: End of the analysis period
489
+ weeks_analyzed: Number of weeks to analyze
490
+ config_hash: Hash of relevant configuration to detect changes
491
+ force_fetch: Force re-analysis even if cached data exists
492
+
493
+ Returns:
494
+ Tuple of (needs_analysis, cached_status_info)
495
+ """
496
+ if force_fetch:
497
+ logger.info(f"Force fetch enabled for {project_key} - analysis needed")
498
+ return True, None
499
+
500
+ # Check if analysis is already complete
501
+ status = self.cache.get_repository_analysis_status(
502
+ repo_path=str(repo_path),
503
+ analysis_start=analysis_start,
504
+ analysis_end=analysis_end,
505
+ config_hash=config_hash,
506
+ )
507
+
508
+ if not status:
509
+ logger.info(f"No cached analysis found for {project_key} - analysis needed")
510
+ return True, None
511
+
512
+ # Validate completeness
513
+ if (
514
+ status["git_analysis_complete"]
515
+ and status["weeks_analyzed"] >= weeks_analyzed
516
+ and status["commit_count"] > 0
517
+ ):
518
+ logger.info(
519
+ f"Using cached analysis for {project_key}: "
520
+ f"{status['commit_count']} commits, "
521
+ f"{status.get('unique_developers', 0)} developers"
522
+ )
523
+ return False, status
71
524
  else:
72
- # Get commits from all branches
73
- commits = []
74
- for ref in repo.refs:
75
- if ref.name.startswith('origin/'):
76
- continue # Skip remote branches
525
+ logger.info(f"Incomplete cached analysis for {project_key} - re-analysis needed")
526
+ return True, None
527
+
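Together with `mark_analysis_complete` below, this forms the "fetch once, report many" loop. A sketch of the intended call pattern (the surrounding orchestration is an assumption; the method signatures are not):

```python
needs_analysis, status = analyzer.is_analysis_needed(
    repo_path, "PROJ", start, end, weeks_analyzed=12, config_hash=cfg_hash
)
if needs_analysis:
    commits = analyzer.analyze_repository(repo_path, start)
    analyzer.mark_analysis_complete(
        repo_path, "repo-name", "PROJ", start, end,
        weeks_analyzed=12, commit_count=len(commits), config_hash=cfg_hash,
    )
else:
    print(f"reusing cache: {status['commit_count']} commits")
```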
+    def mark_analysis_complete(
+        self,
+        repo_path: Path,
+        repo_name: str,
+        project_key: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        weeks_analyzed: int,
+        commit_count: int,
+        unique_developers: int = 0,
+        processing_time_seconds: Optional[float] = None,
+        config_hash: Optional[str] = None,
+    ) -> None:
+        """Mark repository analysis as complete in the cache.
+
+        WHY: Records successful completion to enable cache-first workflow.
+        Should be called after successful repository analysis.
+
+        Args:
+            repo_path: Path to the repository
+            repo_name: Display name for the repository
+            project_key: Project key for the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            weeks_analyzed: Number of weeks analyzed
+            commit_count: Number of commits processed
+            unique_developers: Number of unique developers found
+            processing_time_seconds: Time taken for analysis
+            config_hash: Hash of relevant configuration
+        """
+        try:
+            self.cache.mark_repository_analysis_complete(
+                repo_path=str(repo_path),
+                repo_name=repo_name,
+                project_key=project_key,
+                analysis_start=analysis_start,
+                analysis_end=analysis_end,
+                weeks_analyzed=weeks_analyzed,
+                commit_count=commit_count,
+                unique_developers=unique_developers,
+                processing_time_seconds=processing_time_seconds,
+                config_hash=config_hash,
+            )
+            logger.info(f"Marked {project_key} analysis as complete: {commit_count} commits")
+        except Exception as e:
+            logger.warning(f"Failed to mark analysis complete for {project_key}: {e}")
+
+    def mark_analysis_failed(
+        self,
+        repo_path: Path,
+        repo_name: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        error_message: str,
+        config_hash: Optional[str] = None,
+    ) -> None:
+        """Mark repository analysis as failed in the cache.
+
+        Args:
+            repo_path: Path to the repository
+            repo_name: Display name for the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            error_message: Error message describing the failure
+            config_hash: Hash of relevant configuration
+        """
+        try:
+            self.cache.mark_repository_analysis_failed(
+                repo_path=str(repo_path),
+                repo_name=repo_name,
+                analysis_start=analysis_start,
+                analysis_end=analysis_end,
+                error_message=error_message,
+                config_hash=config_hash,
+            )
+            logger.warning(f"Marked {repo_name} analysis as failed: {error_message}")
+        except Exception as e:
+            logger.error(f"Failed to mark analysis failure for {repo_name}: {e}")
+
+    def _get_smart_branch_commits(self, repo: Repo, since: datetime) -> list[git.Commit]:
+        """Get commits using smart branch filtering (balanced approach).
+
+        This method implements intelligent branch selection that:
+        1. Always includes main/important branches
+        2. Includes recently active branches
+        3. Excludes automation/temporary branches
+        4. Limits total number of branches analyzed
+
+        Args:
+            repo: Git repository object
+            since: Date to get commits since
+
+        Returns:
+            List of unique commits from selected branches
+        """
+        logger.debug("Using smart branch analysis strategy")
+
+        # Get active date threshold
+        active_threshold = datetime.now(timezone.utc) - timedelta(days=self.active_days_threshold)
+
+        # Collect branch information
+        branch_info = []
+
+        for ref in repo.refs:
+            if ref.name.startswith("origin/"):
+                continue  # Skip remote tracking branches
+
+            try:
+                branch_name = ref.name
+
+                # Check if branch should be excluded
+                if self._should_exclude_branch(branch_name):
+                    continue
+
+                # Get latest commit date for this branch
                try:
-                    branch_commits = list(repo.iter_commits(ref, since=since))
+                    latest_commit = next(repo.iter_commits(ref, max_count=1))
+                    latest_date = latest_commit.committed_datetime
+
+                    # Convert to timezone-aware if needed
+                    if latest_date.tzinfo is None:
+                        latest_date = latest_date.replace(tzinfo=timezone.utc)
+                    elif latest_date.tzinfo != timezone.utc:
+                        latest_date = latest_date.astimezone(timezone.utc)
+
+                except StopIteration:
+                    continue  # Empty branch
+
+                # Determine branch priority
+                is_important = self._is_important_branch(branch_name)
+                is_active = latest_date >= active_threshold
+
+                branch_info.append(
+                    {
+                        "ref": ref,
+                        "name": branch_name,
+                        "latest_date": latest_date,
+                        "is_important": is_important,
+                        "is_active": is_active,
+                    }
+                )
+
+            except Exception as e:
+                logger.debug(f"Skipping branch {ref.name} due to error: {e}")
+                continue
+
+        # Sort branches by importance and activity
+        branch_info.sort(
+            key=lambda x: (
+                x["is_important"],  # Important branches first
+                x["is_active"],  # Then active branches
+                x["latest_date"],  # Then by recency
+            ),
+            reverse=True,
+        )
+
+        # Select branches to analyze
+        selected_branches = branch_info[: self.max_branches_per_repo]
+
+        if self.enable_progress_logging:
+            logger.info(
+                f"Selected {len(selected_branches)} branches out of {len(branch_info)} total branches"
+            )
+            important_count = sum(1 for b in selected_branches if b["is_important"])
+            active_count = sum(1 for b in selected_branches if b["is_active"])
+            logger.debug(f"Selected branches: {important_count} important, {active_count} active")
+
+        # Get commits from selected branches
+        commits = []
+
+        # Use centralized progress service
+        progress = get_progress_service()
+
+        # Only create progress if logging is enabled
+        if self.enable_progress_logging:
+            with progress.progress(
+                total=len(selected_branches),
+                description="Analyzing branches",
+                unit="branches",
+                leave=False,
+            ) as ctx:
+                for branch_data in selected_branches:
+                    try:
+                        if self.branch_commit_limit:
+                            branch_commits = list(
+                                repo.iter_commits(
+                                    branch_data["ref"],
+                                    since=since,
+                                    max_count=self.branch_commit_limit,
+                                )
+                            )
+                        else:
+                            branch_commits = list(
+                                repo.iter_commits(branch_data["ref"], since=since)
+                            )
+                        commits.extend(branch_commits)
+
+                        # Update progress description with branch info
+                        branch_display = branch_data["name"][:15] + (
+                            "..." if len(branch_data["name"]) > 15 else ""
+                        )
+                        progress.set_description(
+                            ctx,
+                            f"Analyzing branches [{branch_display}: {len(branch_commits)} commits]",
+                        )
+
+                    except git.GitCommandError as e:
+                        logger.debug(
+                            f"Failed to get commits from branch {branch_data['name']}: {e}"
+                        )
+
+                    progress.update(ctx, 1)
+        else:
+            # No progress bar when logging is disabled
+            for branch_data in selected_branches:
+                try:
+                    if self.branch_commit_limit:
+                        branch_commits = list(
+                            repo.iter_commits(
+                                branch_data["ref"], since=since, max_count=self.branch_commit_limit
+                            )
+                        )
+                    else:
+                        branch_commits = list(repo.iter_commits(branch_data["ref"], since=since))
                     commits.extend(branch_commits)
-                except git.GitCommandError:
-                    continue
-
-        # Remove duplicates while preserving order
-        seen = set()
-        unique_commits = []
-        for commit in commits:
-            if commit.hexsha not in seen:
-                seen.add(commit.hexsha)
-                unique_commits.append(commit)
-
-        commits = unique_commits
-
-        # Sort by date
-        return sorted(commits, key=lambda c: c.committed_datetime)
-
-    def _batch_commits(self, commits: List[git.Commit],
-                       batch_size: int) -> Generator[List[git.Commit], None, None]:
+
+                except git.GitCommandError as e:
+                    logger.debug(f"Failed to get commits from branch {branch_data['name']}: {e}")
+
+        # Remove duplicates while preserving order
+        unique_commits = self._deduplicate_commits(commits)
+
+        logger.info(
+            f"Smart analysis found {len(unique_commits)} unique commits from {len(selected_branches)} branches"
+        )
+        return sorted(unique_commits, key=lambda c: c.committed_datetime)
+
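The triple sort key leans on Python's lexicographic tuple ordering: with `reverse=True`, importance beats activity, which beats recency. A tiny worked example:

```python
rows = [
    {"name": "feature/x", "is_important": False, "is_active": True},
    {"name": "main", "is_important": True, "is_active": False},
]
rows.sort(key=lambda r: (r["is_important"], r["is_active"]), reverse=True)
assert [r["name"] for r in rows] == ["main", "feature/x"]  # importance wins first
```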
+    def _should_exclude_branch(self, branch_name: str) -> bool:
+        """Check if a branch should be excluded from analysis.
+
+        Args:
+            branch_name: Name of the branch to check
+
+        Returns:
+            True if the branch should be excluded, False otherwise
+        """
+        # Check against exclude patterns
+        for pattern in self.always_exclude_patterns:
+            if re.match(pattern, branch_name, re.IGNORECASE):
+                return True
+        return False
+
+    def _is_important_branch(self, branch_name: str) -> bool:
+        """Check if a branch is considered important and should always be included.
+
+        Args:
+            branch_name: Name of the branch to check
+
+        Returns:
+            True if the branch is important, False otherwise
+        """
+        # Check against important branch patterns
+        for pattern in self.always_include_patterns:
+            if re.match(pattern, branch_name, re.IGNORECASE):
+                return True
+        return False
+
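A quick sanity check of the default patterns against typical branch names (`re.match` anchors at the start of the string, so the `$` anchors only matter for the exact-name and suffix patterns):

```python
import re

exclude = [r"^dependabot/.*", r"^renovate/.*", r".*-backup$", r".*-temp$"]
assert any(re.match(p, "dependabot/pip/urllib3-2.0", re.IGNORECASE) for p in exclude)
assert not any(re.match(p, "feature/login", re.IGNORECASE) for p in exclude)

include = [r"^(main|master|develop|dev)$", r"^release/.*", r"^hotfix/.*"]
assert any(re.match(p, "release/2024.06", re.IGNORECASE) for p in include)
```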
+    def _deduplicate_commits(self, commits: list[git.Commit]) -> list[git.Commit]:
+        """Remove duplicate commits while preserving order.
+
+        Args:
+            commits: List of commits that may contain duplicates
+
+        Returns:
+            List of unique commits in original order
+        """
+        seen = set()
+        unique_commits = []
+
+        for commit in commits:
+            if commit.hexsha not in seen:
+                seen.add(commit.hexsha)
+                unique_commits.append(commit)
+
+        return unique_commits
+
+    def _batch_commits(
+        self, commits: list[git.Commit], batch_size: int
+    ) -> Generator[list[git.Commit], None, None]:
         """Yield batches of commits."""
         for i in range(0, len(commits), batch_size):
-            yield commits[i:i + batch_size]
-
-    def _process_batch(self, repo: Repo, repo_path: Path,
-                       commits: List[git.Commit]) -> List[Dict[str, Any]]:
-        """Process a batch of commits."""
+            yield commits[i : i + batch_size]
+
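The generator keeps memory flat no matter how large the history is; `commits[i : i + batch_size]` is just the Black-formatted spelling of the old slice. It is typed for `git.Commit` but works on any list:

```python
batches = list(analyzer._batch_commits(list(range(7)), batch_size=3))
assert batches == [[0, 1, 2], [3, 4, 5], [6]]
```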
+    def _process_batch(
+        self, repo: Repo, repo_path: Path, commits: list[git.Commit]
+    ) -> tuple[list[dict[str, Any]], int, int]:
+        """Process a batch of commits with optimized cache lookups.
+
+        WHY: Bulk cache lookups are much faster than individual queries.
+        This optimization can reduce subsequent run times from minutes to seconds
+        when most commits are already cached.
+
+        ENHANCEMENT: Now uses enhanced bulk_get_commits for better performance
+        and automatically detects when to use bulk_store_commits for new data.
+
+        Returns:
+            Tuple of (results, cache_hits, cache_misses)
+        """
         results = []
-
+
+        # Use enhanced bulk fetch with better performance
+        commit_hashes = [commit.hexsha for commit in commits]
+        cached_commits = self.cache.bulk_get_commits(str(repo_path), commit_hashes)
+
+        cache_hits = 0
+        cache_misses = 0
+        new_commits = []
+
         for commit in commits:
-            # Check cache first
-            cached = self.cache.get_cached_commit(str(repo_path), commit.hexsha)
-            if cached:
-                results.append(cached)
+            # Check bulk cache results
+            if commit.hexsha in cached_commits:
+                results.append(cached_commits[commit.hexsha])
+                cache_hits += 1
                 continue
-
+
             # Analyze commit
             commit_data = self._analyze_commit(repo, commit, repo_path)
             results.append(commit_data)
-
-        return results
-
-    def _analyze_commit(self, repo: Repo, commit: git.Commit, repo_path: Path) -> Dict[str, Any]:
+            new_commits.append(commit_data)
+            cache_misses += 1
+
+        # Use bulk_store_commits for better performance when we have many new commits
+        if len(new_commits) >= 10:  # Threshold for bulk operations
+            logger.debug(f"Using bulk_store_commits for {len(new_commits)} new commits")
+            stats = self.cache.bulk_store_commits(str(repo_path), new_commits)
+            if stats["inserted"] > 0:
+                logger.debug(
+                    f"Bulk stored {stats['inserted']} commits at {stats['commits_per_second']:.0f} commits/sec"
+                )
+        elif new_commits:
+            # Fall back to regular batch caching for small numbers
+            self.cache.cache_commits_batch(str(repo_path), new_commits)
+
+        # Log cache performance for debugging
+        if cache_hits + cache_misses > 0:
+            cache_hit_rate = (cache_hits / (cache_hits + cache_misses)) * 100
+            logger.debug(
+                f"Batch cache performance: {cache_hits} hits, {cache_misses} misses ({cache_hit_rate:.1f}% hit rate)"
+            )
+
+        return results, cache_hits, cache_misses
+
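The bulk path assumes two cache methods added in this release: `bulk_get_commits` returning a dict keyed by commit hash, and `bulk_store_commits` returning insert statistics. Their shapes as implied by the call sites above (the real signatures live in `core/cache.py`, not shown in this hunk):

```python
# Inferred contracts, not the actual implementation:
cached = cache.bulk_get_commits(str(repo_path), ["<sha1>", "<sha2>"])  # dict[hash, commit_dict]
stats = cache.bulk_store_commits(str(repo_path), new_commit_dicts)
print(stats["inserted"], stats["commits_per_second"])
```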
+    def _analyze_commit(self, repo: Repo, commit: git.Commit, repo_path: Path) -> dict[str, Any]:
         """Analyze a single commit."""
+        # Normalize timestamp handling
+        commit_timestamp = commit.committed_datetime
+        logger.debug(
+            f"Analyzing commit {commit.hexsha[:8]}: original timestamp={commit_timestamp} (tzinfo: {getattr(commit_timestamp, 'tzinfo', 'N/A')})"
+        )
+
+        # Ensure timezone-aware timestamp in UTC
+        from datetime import timezone
+
+        if commit_timestamp.tzinfo is None:
+            # Convert naive datetime to UTC
+            commit_timestamp = commit_timestamp.replace(tzinfo=timezone.utc)
+            logger.debug(f"  Converted naive timestamp to UTC: {commit_timestamp}")
+        elif commit_timestamp.tzinfo != timezone.utc:
+            # Convert to UTC if in different timezone
+            commit_timestamp = commit_timestamp.astimezone(timezone.utc)
+            logger.debug(f"  Converted timestamp to UTC: {commit_timestamp}")
+        else:
+            logger.debug(f"  Timestamp already in UTC: {commit_timestamp}")
+
+        # Get the local hour for the developer (before UTC conversion)
+        local_hour = commit.committed_datetime.hour
+
         # Basic commit data
         commit_data = {
-            'hash': commit.hexsha,
-            'author_name': commit.author.name,
-            'author_email': commit.author.email,
-            'message': commit.message,
-            'timestamp': commit.committed_datetime,
-            'is_merge': len(commit.parents) > 1
+            "hash": commit.hexsha,
+            "author_name": commit.author.name,
+            "author_email": commit.author.email,
+            "message": commit.message,
+            "timestamp": commit_timestamp,  # Now guaranteed to be UTC timezone-aware
+            "local_hour": local_hour,  # Hour in developer's local timezone
+            "is_merge": len(commit.parents) > 1,
         }
-
+
         # Get branch name
-        commit_data['branch'] = self._get_commit_branch(repo, commit)
-
+        commit_data["branch"] = self._get_commit_branch(repo, commit)
+
         # Map branch to project
-        commit_data['inferred_project'] = self.branch_mapper.map_branch_to_project(
-            commit_data['branch'], repo_path
+        commit_data["inferred_project"] = self.branch_mapper.map_branch_to_project(
+            str(commit_data["branch"]), repo_path
         )
-
-        # Calculate metrics - use raw stats for backward compatibility
-        stats = commit.stats.total
-        commit_data['files_changed'] = stats.get('files', 0)
-        commit_data['insertions'] = stats.get('insertions', 0)
-        commit_data['deletions'] = stats.get('deletions', 0)
-
+
+        # Calculate metrics using reliable git numstat for accurate line counts
+        raw_stats = self._calculate_raw_stats(commit)
+        commit_data["files_changed_count"] = raw_stats[
+            "files"
+        ]  # Integer count for backward compatibility
+        commit_data["files_changed"] = self._get_changed_file_paths(
+            commit
+        )  # List of file paths for ML
+        commit_data["insertions"] = raw_stats["insertions"]
+        commit_data["deletions"] = raw_stats["deletions"]
+
         # Calculate filtered metrics (excluding boilerplate/generated files)
         filtered_stats = self._calculate_filtered_stats(commit)
-        commit_data['filtered_files_changed'] = filtered_stats['files']
-        commit_data['filtered_insertions'] = filtered_stats['insertions']
-        commit_data['filtered_deletions'] = filtered_stats['deletions']
-
+        commit_data["filtered_files_changed"] = filtered_stats["files"]
+        commit_data["filtered_insertions"] = filtered_stats["insertions"]
+        commit_data["filtered_deletions"] = filtered_stats["deletions"]
+
         # Extract story points
-        commit_data['story_points'] = self.story_point_extractor.extract_from_text(
+        message_str = (
             commit.message
+            if isinstance(commit.message, str)
+            else commit.message.decode("utf-8", errors="ignore")
         )
-
+        commit_data["story_points"] = self.story_point_extractor.extract_from_text(message_str)
+
         # Extract ticket references
-        commit_data['ticket_references'] = self.ticket_extractor.extract_from_text(
-            commit.message
-        )
-
+        commit_data["ticket_references"] = self.ticket_extractor.extract_from_text(message_str)
+
         # Calculate complexity delta
-        commit_data['complexity_delta'] = self._calculate_complexity_delta(commit)
-
+        commit_data["complexity_delta"] = self._calculate_complexity_delta(commit)
+
         return commit_data
-
+
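The normalization rule is small enough to state standalone: naive timestamps are assumed to already be UTC, aware ones are converted. Note that `local_hour` is read from the original `committed_datetime`, so a 23:00 +02:00 commit keeps `local_hour == 23` while its stored timestamp becomes 21:00 UTC:

```python
from datetime import datetime, timezone

def to_utc(ts: datetime) -> datetime:
    if ts.tzinfo is None:
        return ts.replace(tzinfo=timezone.utc)  # naive: assume UTC
    return ts.astimezone(timezone.utc)          # aware: convert
```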
     def _get_commit_branch(self, repo: Repo, commit: git.Commit) -> str:
         """Get the branch name for a commit."""
         # This is a simplified approach - getting the first branch that contains the commit
         for branch in repo.branches:
             if commit in repo.iter_commits(branch):
                 return branch.name
-        return 'unknown'
-
+        return "unknown"
+
+    def _get_changed_file_paths(self, commit: git.Commit) -> list[str]:
+        """Extract list of changed file paths from a git commit.
+
+        Args:
+            commit: Git commit object
+
+        Returns:
+            List of file paths that were changed in the commit
+        """
+        file_paths = []
+
+        # Handle initial commits (no parents) and regular commits
+        parent = commit.parents[0] if commit.parents else None
+
+        try:
+            for diff in commit.diff(parent):
+                # Get file path - prefer the new path (b_path) for modifications and additions,
+                # fall back to old path (a_path) for deletions
+                file_path = diff.b_path if diff.b_path else diff.a_path
+                if file_path:
+                    file_paths.append(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to extract file paths from commit {commit.hexsha[:8]}: {e}")
+
+        return file_paths
+
     def _calculate_complexity_delta(self, commit: git.Commit) -> float:
-        """Calculate complexity change for a commit."""
+        """Calculate complexity change for a commit with graceful error handling.
+
+        WHY: Repository corruption or missing blobs can cause SHA resolution errors.
+        This method provides a fallback complexity calculation that continues
+        analysis even when individual blobs are missing or corrupt.
+        """
         total_delta = 0.0
-
-        for diff in commit.diff(commit.parents[0] if commit.parents else None):
-            if not self._is_code_file(diff.b_path or diff.a_path or ''):
+
+        try:
+            parent = commit.parents[0] if commit.parents else None
+            diffs = commit.diff(parent)
+        except Exception as e:
+            # If we can't get diffs at all, return 0 complexity delta
+            logger.debug(f"Cannot calculate complexity for commit {commit.hexsha[:8]}: {e}")
+            return 0.0
+
+        for diff in diffs:
+            try:
+                if not self._is_code_file(diff.b_path or diff.a_path or ""):
+                    continue
+
+                # Simple complexity estimation based on diff size
+                # In a real implementation, you'd parse the code and calculate cyclomatic complexity
+                if diff.new_file:
+                    try:
+                        if diff.b_blob and hasattr(diff.b_blob, "size"):
+                            total_delta += diff.b_blob.size / 100
+                    except (ValueError, AttributeError) as e:
+                        logger.debug(
+                            f"Cannot access b_blob for new file in {commit.hexsha[:8]}: {e}"
+                        )
+                        # Use a default small positive delta for new files
+                        total_delta += 1.0
+
+                elif diff.deleted_file:
+                    try:
+                        if diff.a_blob and hasattr(diff.a_blob, "size"):
+                            total_delta -= diff.a_blob.size / 100
+                    except (ValueError, AttributeError) as e:
+                        logger.debug(
+                            f"Cannot access a_blob for deleted file in {commit.hexsha[:8]}: {e}"
+                        )
+                        # Use a default small negative delta for deleted files
+                        total_delta -= 1.0
+
+                else:
+                    # Modified file - estimate based on change size
+                    try:
+                        if diff.diff:
+                            diff_content = (
+                                diff.diff
+                                if isinstance(diff.diff, str)
+                                else diff.diff.decode("utf-8", errors="ignore")
+                            )
+                            added = len(diff_content.split("\n+"))
+                            removed = len(diff_content.split("\n-"))
+                            total_delta += (added - removed) / 10
+                    except (ValueError, AttributeError, UnicodeDecodeError) as e:
+                        logger.debug(f"Cannot process diff content in {commit.hexsha[:8]}: {e}")
+                        # Skip this diff but continue processing
+                        pass
+
+            except Exception as e:
+                logger.debug(f"Error processing diff in commit {commit.hexsha[:8]}: {e}")
+                # Continue to next diff
                 continue
-
-            # Simple complexity estimation based on diff size
-            # In a real implementation, you'd parse the code and calculate cyclomatic complexity
-            if diff.new_file:
-                total_delta += diff.b_blob.size / 100 if diff.b_blob else 0
-            elif diff.deleted_file:
-                total_delta -= diff.a_blob.size / 100 if diff.a_blob else 0
-            else:
-                # Modified file - estimate based on change size
-                added = len(diff.diff.decode('utf-8', errors='ignore').split('\n+')) if diff.diff else 0
-                removed = len(diff.diff.decode('utf-8', errors='ignore').split('\n-')) if diff.diff else 0
-                total_delta += (added - removed) / 10
-
+
         return total_delta
-
+
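The heuristic stays deliberately crude; as its own comment admits, it is a diff-size proxy rather than cyclomatic complexity. A condensed restatement of the arithmetic (not the shipped code):

```python
def approx_delta(new_bytes=0, deleted_bytes=0, added_markers=0, removed_markers=0) -> float:
    # blob size / 100 for added or deleted files; net "\n+"/"\n-" marker count / 10
    # for modifications; missing blobs now degrade to +/-1.0 instead of raising.
    return (new_bytes - deleted_bytes) / 100 + (added_markers - removed_markers) / 10

assert approx_delta(new_bytes=500) == 5.0
assert approx_delta(added_markers=30, removed_markers=10) == 2.0
```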
     def _is_code_file(self, filepath: str) -> bool:
         """Check if file is a code file."""
         code_extensions = {
-            '.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.hpp',
-            '.go', '.rs', '.rb', '.php', '.swift', '.kt', '.scala',
-            '.cs', '.vb', '.r', '.m', '.mm', '.f90', '.f95', '.lua'
+            ".py",
+            ".js",
+            ".ts",
+            ".java",
+            ".cpp",
+            ".c",
+            ".h",
+            ".hpp",
+            ".go",
+            ".rs",
+            ".rb",
+            ".php",
+            ".swift",
+            ".kt",
+            ".scala",
+            ".cs",
+            ".vb",
+            ".r",
+            ".m",
+            ".mm",
+            ".f90",
+            ".f95",
+            ".lua",
         }
-
+
         return any(filepath.endswith(ext) for ext in code_extensions)
-
+
207
1091
  def _should_exclude_file(self, filepath: str) -> bool:
208
1092
  """Check if file should be excluded from line counting."""
209
1093
  if not filepath:
210
1094
  return False
211
-
1095
+
212
1096
  # Normalize path separators for consistent matching
213
- filepath = filepath.replace('\\', '/')
214
-
215
- # Check against exclude patterns
216
- return any(fnmatch.fnmatch(filepath, pattern) for pattern in self.exclude_paths)
217
-
218
- def _calculate_filtered_stats(self, commit: git.Commit) -> Dict[str, int]:
219
- """Calculate commit statistics excluding boilerplate/generated files."""
220
- filtered_stats = {
221
- 'files': 0,
222
- 'insertions': 0,
223
- 'deletions': 0
224
- }
225
-
1097
+ filepath = filepath.replace("\\", "/")
1098
+
1099
+ # Check against exclude patterns with proper ** handling
1100
+ return any(self._matches_glob_pattern(filepath, pattern) for pattern in self.exclude_paths)
1101
+
+    def _matches_glob_pattern(self, filepath: str, pattern: str) -> bool:
+        """Check if a file path matches a glob pattern, handling ** recursion correctly.
+
+        This method handles the different glob pattern types:
+        - **/vendor/**    : matches files inside vendor directories at any level
+        - **/*.min.js     : matches files with a specific suffix anywhere in the tree
+        - vendor/**       : matches files inside the vendor directory at the root only
+        - multiple **     : delegated to _match_recursive_pattern() for complex patterns
+        - other single ** : matched with pathlib's PurePath.match()
+        - simple patterns : matched with fnmatch for basic wildcards
+
+        Args:
+            filepath: The file path to check
+            pattern: The glob pattern to match against
+
+        Returns:
+            True if the file path matches the pattern, False otherwise
+        """
+        from pathlib import PurePath
+
+        # Handle empty or invalid inputs
+        if not filepath or not pattern:
+            return False
+
+        path = PurePath(filepath)
+
+        # Check for multiple ** patterns first (most complex case)
+        if "**" in pattern and pattern.count("**") > 1:
+            # Multiple ** patterns - use custom recursive matching
+            return self._match_recursive_pattern(filepath, pattern)
+
+        # Then handle simple ** patterns
+        elif pattern.startswith("**/") and pattern.endswith("/**"):
+            # Pattern like **/vendor/** - matches files inside vendor directories at any level
+            dir_name = pattern[3:-3]  # Extract 'vendor' from '**/vendor/**'
+            if not dir_name:  # Handle edge case of '**/**'
+                return True
+            return dir_name in path.parts
+
+        elif pattern.startswith("**/"):
+            # Pattern like **/*.min.js - matches files with a specific suffix anywhere
+            suffix_pattern = pattern[3:]
+            if not suffix_pattern:  # Handle edge case of '**/'
+                return True
+            # Check the filename for file patterns, or any path part for directory patterns
+            if suffix_pattern.endswith("/"):
+                # Directory pattern like **/build/
+                dir_name = suffix_pattern[:-1]
+                return dir_name in path.parts
+            else:
+                # File pattern like *.min.js
+                return fnmatch.fnmatch(path.name, suffix_pattern)
+
+        elif pattern.endswith("/**"):
+            # Pattern like vendor/** or docs/build/** - matches files inside the directory at root level
+            dir_name = pattern[:-3]
+            if not dir_name:  # Handle edge case of '/**'
+                return True
+
+            # Handle both single directory names and nested paths
+            expected_parts = PurePath(dir_name).parts
+            return (
+                len(path.parts) >= len(expected_parts)
+                and path.parts[: len(expected_parts)] == expected_parts
+            )
+
+        elif "**" in pattern:
+            # Single ** pattern - use pathlib matching with a fallback
+            try:
+                return path.match(pattern)
+            except (ValueError, TypeError):
+                # Fall back to fnmatch if pathlib fails (e.g., invalid pattern)
+                try:
+                    return fnmatch.fnmatch(filepath, pattern)
+                except re.error:
+                    # Invalid regex pattern - return False to be safe
+                    return False
+        else:
+            # Simple pattern - use fnmatch for basic wildcards
+            try:
+                return fnmatch.fnmatch(filepath, pattern)
+            except re.error:
+                # Invalid regex pattern - return False to be safe
+                return False
+
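To make the branch logic concrete, here are illustrative calls (a sketch assuming an analyzer instance; these examples are not from the package's test suite):

    analyzer._matches_glob_pattern("a/b/vendor/x.js", "**/vendor/**")  # True  - 'vendor' at any depth
    analyzer._matches_glob_pattern("src/app.min.js", "**/*.min.js")    # True  - filename suffix match
    analyzer._matches_glob_pattern("vendor/x.js", "vendor/**")         # True  - root-level prefix
    analyzer._matches_glob_pattern("src/vendor/x.js", "vendor/**")     # False - 'vendor' not at the root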
+    def _match_recursive_pattern(self, filepath: str, pattern: str) -> bool:
+        """Handle complex patterns with multiple ** wildcards.
+
+        Args:
+            filepath: The file path to check
+            pattern: The pattern with multiple ** wildcards
+
+        Returns:
+            True if the path matches the pattern, False otherwise
+        """
+        from pathlib import PurePath
+
+        # Split the pattern on ** and check each segment
+        parts = pattern.split("**")
+        path = PurePath(filepath)
+        path_str = str(path)
+
+        # Handle patterns like 'src/**/components/**/*.tsx' or '**/test/**/*.spec.js'
+        if len(parts) >= 2:
+            # The first segment must match from the beginning of the path (if not empty)
+            start_pattern = parts[0].rstrip("/")
+            if start_pattern and not path_str.startswith(start_pattern):
+                return False
+
+            # The last segment must match the filename (if not empty)
+            end_pattern = parts[-1].lstrip("/")
+            if end_pattern and not fnmatch.fnmatch(path.name, end_pattern):
+                return False
+
+            # Each middle segment must appear as a directory somewhere in the path
+            for i in range(1, len(parts) - 1):
+                middle_pattern = parts[i].strip("/")
+                if middle_pattern and middle_pattern not in path.parts:
+                    return False
+
+            return True
+
+        return False
+
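A sketch of how the splitting works, using the docstring's own example (hypothetical calls; note that middle segments are matched as literal directory names, not as nested globs):

    # 'src/**/components/**/*.tsx' splits on '**' into ['src/', '/components/', '/*.tsx']
    analyzer._match_recursive_pattern("src/ui/components/Button.tsx", "src/**/components/**/*.tsx")  # True
    analyzer._match_recursive_pattern("lib/ui/components/Button.tsx", "src/**/components/**/*.tsx")  # False - prefix mismatch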
+    def _calculate_filtered_stats(self, commit: git.Commit) -> dict[str, int]:
+        """Calculate commit statistics excluding boilerplate/generated files using git diff --numstat."""
+        filtered_stats = {"files": 0, "insertions": 0, "deletions": 0}
+
         # For initial commits or commits without parents
         parent = commit.parents[0] if commit.parents else None
-
+
         try:
-            for diff in commit.diff(parent):
-                # Get file path
-                file_path = diff.b_path if diff.b_path else diff.a_path
-                if not file_path:
+            # Use git command directly for accurate line counts
+            repo = commit.repo
+            if parent:
+                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+            else:
+                # Initial commit - use git show with --numstat
+                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            # Parse the numstat output: insertions\tdeletions\tfilename
+            for line in diff_output.strip().split("\n"):
+                if not line.strip():
                     continue
-
-                # Skip excluded files
-                if self._should_exclude_file(file_path):
+
+                parts = line.split("\t")
+                if len(parts) >= 3:
+                    try:
+                        insertions = int(parts[0]) if parts[0] != "-" else 0
+                        deletions = int(parts[1]) if parts[1] != "-" else 0
+                        filename = parts[2]
+
+                        # Skip excluded files using the existing filter logic
+                        if self._should_exclude_file(filename):
+                            continue
+
+                        # Count the file and its changes
+                        filtered_stats["files"] += 1
+                        filtered_stats["insertions"] += insertions
+                        filtered_stats["deletions"] += deletions
+
+                    except ValueError:
+                        # Skip binary files or malformed lines
+                        continue
+
+        except Exception as e:
+            # Log the error for debugging but don't crash
+            logger.warning(f"Error calculating filtered stats for commit {commit.hexsha[:8]}: {e}")
+
+        return filtered_stats
+
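Both stat helpers parse the same --numstat contract; here is a minimal, self-contained sketch of that parsing (the sample output below is made up for illustration):

    # numstat emits tab-separated 'insertions<TAB>deletions<TAB>path', with '-'
    # in the count columns for binary files.
    sample = "12\t3\tsrc/app.py\n-\t-\tassets/logo.png"
    for line in sample.split("\n"):
        ins, dels, name = line.split("\t")
        print(name, 0 if ins == "-" else int(ins), 0 if dels == "-" else int(dels))
    # -> src/app.py 12 3
    # -> assets/logo.png 0 0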
+    def _calculate_raw_stats(self, commit: git.Commit) -> dict[str, int]:
+        """Calculate commit statistics for all files (no filtering) using git diff --numstat."""
+        raw_stats = {"files": 0, "insertions": 0, "deletions": 0}
+
+        # For initial commits or commits without parents
+        parent = commit.parents[0] if commit.parents else None
+
+        try:
+            # Use git command directly for accurate line counts
+            repo = commit.repo
+            if parent:
+                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+            else:
+                # Initial commit - use git show with --numstat
+                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            # Parse the numstat output: insertions\tdeletions\tfilename
+            for line in diff_output.strip().split("\n"):
+                if not line.strip():
                     continue
-
-                # Count the file
-                filtered_stats['files'] += 1
-
-                # Count insertions and deletions
-                if diff.diff:
-                    diff_text = diff.diff.decode('utf-8', errors='ignore')
-                    for line in diff_text.split('\n'):
-                        if line.startswith('+') and not line.startswith('+++'):
-                            filtered_stats['insertions'] += 1
-                        elif line.startswith('-') and not line.startswith('---'):
-                            filtered_stats['deletions'] += 1
-        except Exception:
-            # If we can't calculate filtered stats, return zeros
-            pass
-
-        return filtered_stats
+
+                parts = line.split("\t")
+                if len(parts) >= 3:
+                    try:
+                        insertions = int(parts[0]) if parts[0] != "-" else 0
+                        deletions = int(parts[1]) if parts[1] != "-" else 0
+                        # filename = parts[2] - not used in raw stats
+
+                        # Count all files and their changes (no filtering)
+                        raw_stats["files"] += 1
+                        raw_stats["insertions"] += insertions
+                        raw_stats["deletions"] += deletions
+
+                    except ValueError:
+                        # Skip binary files or malformed lines
+                        continue
+
+        except Exception as e:
+            # Log the error for debugging but don't crash
+            logger.warning(f"Error calculating raw stats for commit {commit.hexsha[:8]}: {e}")
+
+        return raw_stats
+
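The filtered and raw variants are deliberately parallel: comparing their outputs shows how much of a commit's churn came from excluded (vendored/generated) files. A hedged usage sketch, assuming an analyzer instance and a GitPython commit object:

    filtered = analyzer._calculate_filtered_stats(commit)
    raw = analyzer._calculate_raw_stats(commit)
    # Line churn attributable to excluded files only
    excluded_churn = (raw["insertions"] + raw["deletions"]) - (
        filtered["insertions"] + filtered["deletions"]
    )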
+    def _prepare_commits_for_classification(
+        self, repo: Repo, commits: list[dict[str, Any]]
+    ) -> list[dict[str, Any]]:
+        """Prepare commits for classification by adding file change information.
+
+        Args:
+            repo: Git repository object
+            commits: List of analyzed commit dictionaries
+
+        Returns:
+            List of commits with the file change information needed for classification
+        """
+        prepared_commits = []
+
+        for commit_data in commits:
+            commit_hash = commit_data.get("hash")
+            if not commit_hash:
+                prepared_commits.append(commit_data)
+                continue
+
+            try:
+                # Use the file paths already extracted during analysis
+                files_changed = commit_data.get("files_changed", [])
+
+                # If files_changed is unavailable or empty, extract it as a fallback
+                if not files_changed:
+                    logger.warning(
+                        f"No file paths found for commit {commit_hash[:8]}, extracting as fallback"
+                    )
+                    files_changed = self._get_changed_file_paths(repo.commit(commit_hash))
+
+                # Create enhanced commit data for classification
+                enhanced_commit = commit_data.copy()
+                enhanced_commit["files_changed"] = files_changed
+
+                # Per-file details require the full commit object, so they are
+                # only extracted when the commit actually changed files
+                if files_changed:
+                    file_details = {}
+                    try:
+                        commit = repo.commit(commit_hash)
+                        parent = commit.parents[0] if commit.parents else None
+
+                        for diff in commit.diff(parent):
+                            file_path = diff.b_path if diff.b_path else diff.a_path
+                            if file_path and file_path in files_changed and diff.diff:
+                                # Count insertions and deletions per file
+                                diff_text = (
+                                    diff.diff
+                                    if isinstance(diff.diff, str)
+                                    else diff.diff.decode("utf-8", errors="ignore")
+                                )
+                                insertions = len(
+                                    [
+                                        line
+                                        for line in diff_text.split("\n")
+                                        if line.startswith("+") and not line.startswith("+++")
+                                    ]
+                                )
+                                deletions = len(
+                                    [
+                                        line
+                                        for line in diff_text.split("\n")
+                                        if line.startswith("-") and not line.startswith("---")
+                                    ]
+                                )
+
+                                file_details[file_path] = {
+                                    "insertions": insertions,
+                                    "deletions": deletions,
+                                }
+
+                        enhanced_commit["file_details"] = file_details
+                    except Exception as detail_error:
+                        logger.warning(
+                            f"Failed to extract file details for commit {commit_hash[:8]}: {detail_error}"
+                        )
+                        enhanced_commit["file_details"] = {}
+
+                prepared_commits.append(enhanced_commit)
+
+            except Exception as e:
+                logger.warning(f"Failed to prepare commit {commit_hash} for classification: {e}")
+                prepared_commits.append(commit_data)
+
+        return prepared_commits
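For orientation, the enhanced records built above have roughly this shape (a sketch; the keys are the ones set in this method, while the hash and counts are invented for illustration):

    enhanced_commit = {
        "hash": "a1b2c3d4",
        "files_changed": ["src/app.py"],
        "file_details": {"src/app.py": {"insertions": 12, "deletions": 3}},
        # ...plus whatever fields the earlier analysis pass attached
    }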