gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
gitflow_analytics/core/analyzer.py
@@ -1,19 +1,33 @@
 """Git repository analyzer with batch processing support."""
 
 import fnmatch
+import logging
+import os
+import re
 from collections.abc import Generator
-from datetime import datetime
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any, Optional
 
 import git
 from git import Repo
-from tqdm import tqdm
 
 from ..extractors.story_points import StoryPointExtractor
 from ..extractors.tickets import TicketExtractor
 from .branch_mapper import BranchToProjectMapper
 from .cache import GitAnalysisCache
+from .progress import get_progress_service
+
+# Import ML extractor with fallback
+try:
+    from ..extractors.ml_tickets import MLTicketExtractor
+
+    ML_EXTRACTOR_AVAILABLE = True
+except ImportError:
+    ML_EXTRACTOR_AVAILABLE = False
+
+# Get logger for this module
+logger = logging.getLogger(__name__)
 
 
 class GitAnalyzer:
@@ -26,80 +40,776 @@ class GitAnalyzer:
         branch_mapping_rules: Optional[dict[str, list[str]]] = None,
         allowed_ticket_platforms: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
+        story_point_patterns: Optional[list[str]] = None,
+        ml_categorization_config: Optional[dict[str, Any]] = None,
+        llm_config: Optional[dict[str, Any]] = None,
+        classification_config: Optional[dict[str, Any]] = None,
+        branch_analysis_config: Optional[dict[str, Any]] = None,
     ):
-        """Initialize analyzer with cache."""
+        """Initialize analyzer with cache and optional ML categorization and commit classification.
+
+        Args:
+            cache: Git analysis cache instance
+            batch_size: Number of commits to process in each batch
+            branch_mapping_rules: Rules for mapping branches to projects
+            allowed_ticket_platforms: List of allowed ticket platforms
+            exclude_paths: List of file paths to exclude from analysis
+            story_point_patterns: List of regex patterns for extracting story points
+            ml_categorization_config: Configuration for ML-based categorization
+            llm_config: Configuration for LLM-based commit classification
+            classification_config: Configuration for commit classification
+            branch_analysis_config: Configuration for branch analysis optimization
+        """
         self.cache = cache
         self.batch_size = batch_size
-        self.story_point_extractor = StoryPointExtractor()
-        self.ticket_extractor = TicketExtractor(allowed_platforms=allowed_ticket_platforms)
+        self.story_point_extractor = StoryPointExtractor(patterns=story_point_patterns)
+
+        # Initialize ticket extractor (ML or standard based on config and availability)
+        if (
+            ml_categorization_config
+            and ml_categorization_config.get("enabled", True)
+            and ML_EXTRACTOR_AVAILABLE
+        ):
+            logger.info("Initializing ML-enhanced ticket extractor")
+
+            # Check if LLM classification is enabled
+            enable_llm = llm_config and llm_config.get("enabled", False)
+            if enable_llm:
+                logger.info("LLM-based commit classification enabled")
+
+            self.ticket_extractor = MLTicketExtractor(
+                allowed_platforms=allowed_ticket_platforms,
+                ml_config=ml_categorization_config,
+                llm_config=llm_config,
+                cache_dir=cache.cache_dir / "ml_predictions",
+                enable_ml=True,
+                enable_llm=enable_llm,
+            )
+        else:
+            if ml_categorization_config and ml_categorization_config.get("enabled", True):
+                if not ML_EXTRACTOR_AVAILABLE:
+                    logger.warning(
+                        "ML categorization requested but dependencies not available, using standard extractor"
+                    )
+                else:
+                    logger.info(
+                        "ML categorization disabled in configuration, using standard extractor"
+                    )
+            else:
+                logger.debug("Using standard ticket extractor")
+
+            self.ticket_extractor = TicketExtractor(allowed_platforms=allowed_ticket_platforms)
+
         self.branch_mapper = BranchToProjectMapper(branch_mapping_rules)
         self.exclude_paths = exclude_paths or []
 
+        # Initialize branch analysis configuration
+        self.branch_analysis_config = branch_analysis_config or {}
+        self.branch_strategy = self.branch_analysis_config.get("strategy", "all")
+        self.max_branches_per_repo = self.branch_analysis_config.get("max_branches_per_repo", 50)
+        self.active_days_threshold = self.branch_analysis_config.get("active_days_threshold", 90)
+        self.include_main_branches = self.branch_analysis_config.get("include_main_branches", True)
+        self.always_include_patterns = self.branch_analysis_config.get(
+            "always_include_patterns",
+            [r"^(main|master|develop|dev)$", r"^release/.*", r"^hotfix/.*"],
+        )
+        self.always_exclude_patterns = self.branch_analysis_config.get(
+            "always_exclude_patterns",
+            [r"^dependabot/.*", r"^renovate/.*", r".*-backup$", r".*-temp$"],
+        )
+        self.enable_progress_logging = self.branch_analysis_config.get(
+            "enable_progress_logging", True
+        )
+        self.branch_commit_limit = self.branch_analysis_config.get(
+            "branch_commit_limit", None
+        )  # No limit by default
+
+        # Initialize commit classifier if enabled
+        self.classification_enabled = classification_config and classification_config.get(
+            "enabled", False
+        )
+        self.commit_classifier = None
+
+        if self.classification_enabled:
+            try:
+                from ..classification.classifier import CommitClassifier
+
+                self.commit_classifier = CommitClassifier(
+                    config=classification_config, cache_dir=cache.cache_dir / "classification"
+                )
+                logger.info("Commit classification enabled")
+            except ImportError as e:
+                logger.warning(f"Classification dependencies not available: {e}")
+                self.classification_enabled = False
+            except Exception as e:
+                logger.error(f"Failed to initialize commit classifier: {e}")
+                self.classification_enabled = False
+
     def analyze_repository(
         self, repo_path: Path, since: datetime, branch: Optional[str] = None
     ) -> list[dict[str, Any]]:
-        """Analyze a Git repository with batch processing."""
+        """Analyze a Git repository with batch processing and optional classification."""
         try:
             repo = Repo(repo_path)
+            # Update repository from remote before analysis
+            self._update_repository(repo)
         except Exception as e:
             raise ValueError(f"Failed to open repository at {repo_path}: {e}") from e
 
-        # Get commits to analyze
-        commits = self._get_commits(repo, since, branch)
+        # Get commits to analyze with optimized branch selection
+        commits = self._get_commits_optimized(repo, since, branch)
        total_commits = len(commits)
 
         if total_commits == 0:
             return []
 
         analyzed_commits = []
+        total_cache_hits = 0
+        total_cache_misses = 0
 
         # Process in batches with progress bar
-        with tqdm(total=total_commits, desc=f"Analyzing {repo_path.name}") as pbar:
+        processed_commits = 0
+        progress_service = get_progress_service()
+
+        # Only create progress bar if enabled
+        if self.enable_progress_logging:
+            progress_ctx = progress_service.create_progress(
+                total=total_commits, description=f"Analyzing {repo_path.name}", unit="commits"
+            )
+        else:
+            progress_ctx = None
+
+        try:
             for batch in self._batch_commits(commits, self.batch_size):
-                batch_results = self._process_batch(repo, repo_path, batch)
+                batch_results, batch_hits, batch_misses = self._process_batch(
+                    repo, repo_path, batch
+                )
                 analyzed_commits.extend(batch_results)
 
-                # Cache the batch
-                self.cache.cache_commits_batch(str(repo_path), batch_results)
+                # Track overall cache performance
+                total_cache_hits += batch_hits
+                total_cache_misses += batch_misses
 
-                pbar.update(len(batch))
+                # Note: Caching is now handled within _process_batch for better performance
+
+                # Update progress tracking
+                batch_size = len(batch)
+                processed_commits += batch_size
+
+                # Update progress bar with cache info if enabled
+                if progress_ctx:
+                    hit_rate = (batch_hits / batch_size) * 100 if batch_size > 0 else 0
+                    progress_service.set_description(
+                        progress_ctx,
+                        f"Analyzing {repo_path.name} (cache hit: {hit_rate:.1f}%, {processed_commits}/{total_commits})",
+                    )
+                    progress_service.update(progress_ctx, batch_size)
+        finally:
+            if progress_ctx:
+                progress_service.complete(progress_ctx)
+
+        # Debug logging for progress tracking issues
+        if os.getenv("GITFLOW_DEBUG", "").lower() in ("1", "true", "yes"):
+            logger.debug(
+                f"Final progress: Processed: {processed_commits}/{total_commits} commits"
+            )
+
+        # Log overall cache performance
+        if total_cache_hits + total_cache_misses > 0:
+            overall_hit_rate = (total_cache_hits / (total_cache_hits + total_cache_misses)) * 100
+            logger.info(
+                f"Repository {repo_path.name}: {total_cache_hits} cached, {total_cache_misses} analyzed ({overall_hit_rate:.1f}% cache hit rate)"
+            )
+
+        # Apply commit classification if enabled
+        if self.classification_enabled and self.commit_classifier and analyzed_commits:
+            logger.info(f"Applying commit classification to {len(analyzed_commits)} commits")
+
+            try:
+                # Prepare commits for classification (add file changes information)
+                commits_with_files = self._prepare_commits_for_classification(
+                    repo, analyzed_commits
+                )
+
+                # Get classification results
+                classification_results = self.commit_classifier.classify_commits(commits_with_files)
+
+                # Merge classification results back into analyzed commits
+                for commit, classification in zip(analyzed_commits, classification_results):
+                    if classification:  # Classification might be empty if disabled or failed
+                        commit.update(
+                            {
+                                "predicted_class": classification.get("predicted_class"),
+                                "classification_confidence": classification.get("confidence"),
+                                "is_reliable_prediction": classification.get(
+                                    "is_reliable_prediction"
+                                ),
+                                "class_probabilities": classification.get("class_probabilities"),
+                                "file_analysis_summary": classification.get("file_analysis"),
+                                "classification_metadata": classification.get(
+                                    "classification_metadata"
+                                ),
+                            }
+                        )
+
+                logger.info(f"Successfully classified {len(classification_results)} commits")
+
+            except Exception as e:
+                logger.error(f"Commit classification failed: {e}")
+                # Continue without classification rather than failing entirely
 
         return analyzed_commits
 
-    def _get_commits(
+    def _update_repository(self, repo) -> bool:
+        """Update repository from remote before analysis.
+
+        WHY: This ensures we have the latest commits from the remote repository
+        before performing analysis. Critical for getting accurate data especially
+        when analyzing repositories that are actively being developed.
+
+        DESIGN DECISION: Uses fetch() for all cases, then pull() only when on a
+        tracking branch that's not in detached HEAD state. This approach:
+        - Handles detached HEAD states gracefully (common in CI/CD)
+        - Always gets latest refs from remote via fetch
+        - Only attempts pull when it's safe to do so
+        - Continues analysis even if update fails (logs warning)
+
+        Args:
+            repo: GitPython Repo object
+
+        Returns:
+            bool: True if update succeeded, False if failed (but analysis continues)
+        """
+        try:
+            if repo.remotes:
+                origin = repo.remotes.origin
+                logger.info("Fetching latest changes from remote")
+                origin.fetch()
+
+                # Only try to pull if not in detached HEAD state
+                if not repo.head.is_detached:
+                    current_branch = repo.active_branch
+                    tracking = current_branch.tracking_branch()
+                    if tracking:
+                        # Pull latest changes
+                        origin.pull()
+                        logger.debug(f"Pulled latest changes for {current_branch.name}")
+                    else:
+                        logger.debug(
+                            f"Branch {current_branch.name} has no tracking branch, skipping pull"
+                        )
+                else:
+                    logger.debug("Repository in detached HEAD state, skipping pull")
+                return True
+            else:
+                logger.debug("No remotes configured, skipping repository update")
+                return True
+        except Exception as e:
+            logger.warning(f"Could not update repository: {e}")
+            # Continue with analysis using local state
+            return False
+
+    def _get_commits_optimized(
         self, repo: Repo, since: datetime, branch: Optional[str] = None
     ) -> list[git.Commit]:
-        """Get commits from repository."""
-        if branch:
+        """Get commits from repository with branch analysis strategy.
+
+        WHY: Different analysis needs require different branch coverage approaches.
+        The default "all" strategy ensures complete commit coverage without missing
+        important development work that happens on feature branches.
+
+        DESIGN DECISION: Three strategies available:
+        1. "main_only": Only analyze main/master branch (fastest, least comprehensive)
+        2. "smart": Analyze active branches with smart filtering (balanced, may miss commits)
+        3. "all": Analyze all branches (comprehensive coverage, default)
+
+        DEFAULT STRATEGY CHANGED: Now defaults to "all" to ensure complete coverage
+        after reports that "smart" strategy was missing significant commits (~100+ commits
+        and entire developers working on feature branches).
+
+        The "smart" strategy filters branches based on:
+        - Recent activity (commits within active_days_threshold)
+        - Branch naming patterns (always include main, release, hotfix branches)
+        - Exclude automation branches (dependabot, renovate, etc.)
+        - Limit total branches per repository
+
+        The "all" strategy:
+        - Analyzes all local and remote branches/refs
+        - No artificial branch limits
+        - No commit limits per branch (unless explicitly configured)
+        - Ensures complete development history capture
+        """
+        logger.debug(f"Getting commits since: {since} (tzinfo: {getattr(since, 'tzinfo', 'N/A')})")
+        logger.debug(f"Using branch analysis strategy: {self.branch_strategy}")
+
+        if self.branch_strategy == "main_only":
+            return self._get_main_branch_commits(repo, since, branch)
+        elif self.branch_strategy == "all":
+            logger.info("Using 'all' branches strategy for complete commit coverage")
+            return self._get_all_branch_commits(repo, since)
+        else:  # smart strategy
+            return self._get_smart_branch_commits(repo, since)
+
+    def _get_main_branch_commits(
+        self, repo: Repo, since: datetime, branch: Optional[str] = None
+    ) -> list[git.Commit]:
+        """Get commits from main branch only (fastest strategy).
+
+        Args:
+            repo: Git repository object
+            since: Date to get commits since
+            branch: Specific branch to analyze (overrides main branch detection)
+
+        Returns:
+            List of commits from main branch only
+        """
+        target_branch = branch
+        if not target_branch:
+            # Auto-detect main branch
+            main_branch_names = ["main", "master", "develop", "dev"]
+            for branch_name in main_branch_names:
+                try:
+                    if branch_name in [b.name for b in repo.branches]:
+                        target_branch = branch_name
+                        break
+                except Exception:
+                    continue
+
+            if not target_branch and repo.branches:
+                target_branch = repo.branches[0].name  # Fallback to first branch
+
+        if not target_branch:
+            logger.warning("No main branch found, no commits will be analyzed")
+            return []
+
+        logger.debug(f"Analyzing main branch only: {target_branch}")
+
+        try:
+            if self.branch_commit_limit:
+                commits = list(
+                    repo.iter_commits(
+                        target_branch, since=since, max_count=self.branch_commit_limit
+                    )
+                )
+            else:
+                commits = list(repo.iter_commits(target_branch, since=since))
+            logger.debug(f"Found {len(commits)} commits in main branch {target_branch}")
+            return sorted(commits, key=lambda c: c.committed_datetime)
+        except git.GitCommandError as e:
+            logger.warning(f"Failed to get commits from branch {target_branch}: {e}")
+            return []
+
+    def _get_all_branch_commits(self, repo: Repo, since: datetime) -> list[git.Commit]:
+        """Get commits from all branches (comprehensive analysis).
+
+        WHY: This strategy captures ALL commits from ALL branches without artificial limitations.
+        It's designed to ensure complete coverage even if it takes longer to run.
+
+        DESIGN DECISION: Analyzes both local and remote branches to ensure we don't miss
+        commits that exist only on remote branches. Uses no commit limits per branch
+        to capture complete development history.
+
+        Args:
+            repo: Git repository object
+            since: Date to get commits since
+
+        Returns:
+            List of unique commits from all branches
+        """
+        logger.info("Analyzing all branches for complete commit coverage")
+
+        commits = []
+        branch_count = 0
+        processed_refs = set()  # Track processed refs to avoid duplicates
+
+        # Process all refs (local branches, remote branches, tags)
+        for ref in repo.refs:
+            # Skip if we've already processed this ref
+            ref_name = ref.name
+            if ref_name in processed_refs:
+                continue
+
+            processed_refs.add(ref_name)
+            branch_count += 1
+
             try:
-                commits = list(repo.iter_commits(branch, since=since))
-            except git.GitCommandError:
-                # Branch doesn't exist
-                return []
+                # No commit limit - get ALL commits from this branch
+                if self.branch_commit_limit:
+                    branch_commits = list(
+                        repo.iter_commits(ref, since=since, max_count=self.branch_commit_limit)
+                    )
+                    logger.debug(
+                        f"Branch {ref_name}: found {len(branch_commits)} commits (limited to {self.branch_commit_limit})"
+                    )
+                else:
+                    branch_commits = list(repo.iter_commits(ref, since=since))
+                    logger.debug(
+                        f"Branch {ref_name}: found {len(branch_commits)} commits (no limit)"
+                    )
+
+                commits.extend(branch_commits)
+
+                if self.enable_progress_logging and branch_count % 10 == 0:
+                    logger.info(
+                        f"Processed {branch_count} branches, found {len(commits)} total commits so far"
+                    )
+
+            except git.GitCommandError as e:
+                logger.debug(f"Skipping branch {ref_name} due to error: {e}")
+                continue
+
+        # Remove duplicates while preserving order
+        unique_commits = self._deduplicate_commits(commits)
+
+        logger.info(
+            f"Found {len(unique_commits)} unique commits across {branch_count} branches/refs"
+        )
+        return sorted(unique_commits, key=lambda c: c.committed_datetime)
+
+    def is_analysis_needed(
+        self,
+        repo_path: Path,
+        project_key: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        weeks_analyzed: int,
+        config_hash: Optional[str] = None,
+        force_fetch: bool = False,
+    ) -> tuple[bool, Optional[dict[str, Any]]]:
+        """Check if repository analysis is needed or if cached data can be used.
+
+        WHY: Implements cache-first workflow by checking if repository has been
+        fully analyzed for the given period. Enables "fetch once, report many".
+
+        Args:
+            repo_path: Path to the repository
+            project_key: Project key for the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            weeks_analyzed: Number of weeks to analyze
+            config_hash: Hash of relevant configuration to detect changes
+            force_fetch: Force re-analysis even if cached data exists
+
+        Returns:
+            Tuple of (needs_analysis, cached_status_info)
+        """
+        if force_fetch:
+            logger.info(f"Force fetch enabled for {project_key} - analysis needed")
+            return True, None
+
+        # Check if analysis is already complete
+        status = self.cache.get_repository_analysis_status(
+            repo_path=str(repo_path),
+            analysis_start=analysis_start,
+            analysis_end=analysis_end,
+            config_hash=config_hash,
+        )
+
+        if not status:
+            logger.info(f"No cached analysis found for {project_key} - analysis needed")
+            return True, None
+
+        # Validate completeness
+        if (
+            status["git_analysis_complete"]
+            and status["weeks_analyzed"] >= weeks_analyzed
+            and status["commit_count"] > 0
+        ):
+            logger.info(
+                f"Using cached analysis for {project_key}: "
+                f"{status['commit_count']} commits, "
+                f"{status.get('unique_developers', 0)} developers"
+            )
+            return False, status
         else:
-            # Get commits from all branches
-            commits = []
-            for ref in repo.refs:
-                if ref.name.startswith("origin/"):
-                    continue  # Skip remote branches
+            logger.info(f"Incomplete cached analysis for {project_key} - re-analysis needed")
+            return True, None
+
+    def mark_analysis_complete(
+        self,
+        repo_path: Path,
+        repo_name: str,
+        project_key: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        weeks_analyzed: int,
+        commit_count: int,
+        unique_developers: int = 0,
+        processing_time_seconds: Optional[float] = None,
+        config_hash: Optional[str] = None,
+    ) -> None:
+        """Mark repository analysis as complete in the cache.
+
+        WHY: Records successful completion to enable cache-first workflow.
+        Should be called after successful repository analysis.
+
+        Args:
+            repo_path: Path to the repository
+            repo_name: Display name for the repository
+            project_key: Project key for the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            weeks_analyzed: Number of weeks analyzed
+            commit_count: Number of commits processed
+            unique_developers: Number of unique developers found
+            processing_time_seconds: Time taken for analysis
+            config_hash: Hash of relevant configuration
+        """
+        try:
+            self.cache.mark_repository_analysis_complete(
+                repo_path=str(repo_path),
+                repo_name=repo_name,
+                project_key=project_key,
+                analysis_start=analysis_start,
+                analysis_end=analysis_end,
+                weeks_analyzed=weeks_analyzed,
+                commit_count=commit_count,
+                unique_developers=unique_developers,
+                processing_time_seconds=processing_time_seconds,
+                config_hash=config_hash,
+            )
+            logger.info(f"Marked {project_key} analysis as complete: {commit_count} commits")
+        except Exception as e:
+            logger.warning(f"Failed to mark analysis complete for {project_key}: {e}")
+
+    def mark_analysis_failed(
+        self,
+        repo_path: Path,
+        repo_name: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        error_message: str,
+        config_hash: Optional[str] = None,
+    ) -> None:
+        """Mark repository analysis as failed in the cache.
+
+        Args:
+            repo_path: Path to the repository
+            repo_name: Display name for the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            error_message: Error message describing the failure
+            config_hash: Hash of relevant configuration
+        """
+        try:
+            self.cache.mark_repository_analysis_failed(
+                repo_path=str(repo_path),
+                repo_name=repo_name,
+                analysis_start=analysis_start,
+                analysis_end=analysis_end,
+                error_message=error_message,
+                config_hash=config_hash,
+            )
+            logger.warning(f"Marked {repo_name} analysis as failed: {error_message}")
+        except Exception as e:
+            logger.error(f"Failed to mark analysis failure for {repo_name}: {e}")
+
+    def _get_smart_branch_commits(self, repo: Repo, since: datetime) -> list[git.Commit]:
+        """Get commits using smart branch filtering (balanced approach).
+
+        This method implements intelligent branch selection that:
+        1. Always includes main/important branches
+        2. Includes recently active branches
+        3. Excludes automation/temporary branches
+        4. Limits total number of branches analyzed
+
+        Args:
+            repo: Git repository object
+            since: Date to get commits since
+
+        Returns:
+            List of unique commits from selected branches
+        """
+        logger.debug("Using smart branch analysis strategy")
+
+        # Get active date threshold
+        active_threshold = datetime.now(timezone.utc) - timedelta(days=self.active_days_threshold)
+
+        # Collect branch information
+        branch_info = []
+
+        for ref in repo.refs:
+            if ref.name.startswith("origin/"):
+                continue  # Skip remote tracking branches
+
+            try:
+                branch_name = ref.name
+
+                # Check if branch should be excluded
+                if self._should_exclude_branch(branch_name):
+                    continue
+
+                # Get latest commit date for this branch
                 try:
-                    branch_commits = list(repo.iter_commits(ref, since=since))
+                    latest_commit = next(repo.iter_commits(ref, max_count=1))
+                    latest_date = latest_commit.committed_datetime
+
+                    # Convert to timezone-aware if needed
+                    if latest_date.tzinfo is None:
+                        latest_date = latest_date.replace(tzinfo=timezone.utc)
+                    elif latest_date.tzinfo != timezone.utc:
+                        latest_date = latest_date.astimezone(timezone.utc)
+
+                except StopIteration:
+                    continue  # Empty branch
+
+                # Determine branch priority
+                is_important = self._is_important_branch(branch_name)
+                is_active = latest_date >= active_threshold
+
+                branch_info.append(
+                    {
+                        "ref": ref,
+                        "name": branch_name,
+                        "latest_date": latest_date,
+                        "is_important": is_important,
+                        "is_active": is_active,
+                    }
+                )
+
+            except Exception as e:
+                logger.debug(f"Skipping branch {ref.name} due to error: {e}")
+                continue
+
+        # Sort branches by importance and activity
+        branch_info.sort(
+            key=lambda x: (
+                x["is_important"],  # Important branches first
+                x["is_active"],  # Then active branches
+                x["latest_date"],  # Then by recency
+            ),
+            reverse=True,
+        )
+
+        # Select branches to analyze
+        selected_branches = branch_info[: self.max_branches_per_repo]
+
+        if self.enable_progress_logging:
+            logger.info(
+                f"Selected {len(selected_branches)} branches out of {len(branch_info)} total branches"
+            )
+            important_count = sum(1 for b in selected_branches if b["is_important"])
+            active_count = sum(1 for b in selected_branches if b["is_active"])
+            logger.debug(f"Selected branches: {important_count} important, {active_count} active")
+
+        # Get commits from selected branches
+        commits = []
+
+        # Use centralized progress service
+        progress = get_progress_service()
+
+        # Only create progress if logging is enabled
+        if self.enable_progress_logging:
+            with progress.progress(
+                total=len(selected_branches),
+                description="Analyzing branches",
+                unit="branches",
+                leave=False,
+            ) as ctx:
+                for branch_data in selected_branches:
+                    try:
+                        if self.branch_commit_limit:
+                            branch_commits = list(
+                                repo.iter_commits(
+                                    branch_data["ref"],
+                                    since=since,
+                                    max_count=self.branch_commit_limit,
+                                )
+                            )
+                        else:
+                            branch_commits = list(
+                                repo.iter_commits(branch_data["ref"], since=since)
+                            )
+                        commits.extend(branch_commits)
+
+                        # Update progress description with branch info
+                        branch_display = branch_data["name"][:15] + (
+                            "..." if len(branch_data["name"]) > 15 else ""
+                        )
+                        progress.set_description(
+                            ctx,
+                            f"Analyzing branches [{branch_display}: {len(branch_commits)} commits]",
+                        )
+
+                    except git.GitCommandError as e:
+                        logger.debug(
+                            f"Failed to get commits from branch {branch_data['name']}: {e}"
+                        )
+
+                    progress.update(ctx, 1)
+        else:
+            # No progress bar when logging is disabled
+            for branch_data in selected_branches:
+                try:
+                    if self.branch_commit_limit:
+                        branch_commits = list(
+                            repo.iter_commits(
+                                branch_data["ref"], since=since, max_count=self.branch_commit_limit
+                            )
+                        )
+                    else:
+                        branch_commits = list(repo.iter_commits(branch_data["ref"], since=since))
                     commits.extend(branch_commits)
-                except git.GitCommandError:
-                    continue
 
-            # Remove duplicates while preserving order
-            seen = set()
-            unique_commits = []
-            for commit in commits:
-                if commit.hexsha not in seen:
-                    seen.add(commit.hexsha)
-                    unique_commits.append(commit)
+                except git.GitCommandError as e:
+                    logger.debug(f"Failed to get commits from branch {branch_data['name']}: {e}")
+
+        # Remove duplicates while preserving order
+        unique_commits = self._deduplicate_commits(commits)
 
-            commits = unique_commits
+        logger.info(
+            f"Smart analysis found {len(unique_commits)} unique commits from {len(selected_branches)} branches"
+        )
+        return sorted(unique_commits, key=lambda c: c.committed_datetime)
+
+    def _should_exclude_branch(self, branch_name: str) -> bool:
+        """Check if a branch should be excluded from analysis.
 
-        # Sort by date
-        return sorted(commits, key=lambda c: c.committed_datetime)
+        Args:
+            branch_name: Name of the branch to check
+
+        Returns:
+            True if the branch should be excluded, False otherwise
+        """
+        # Check against exclude patterns
+        for pattern in self.always_exclude_patterns:
+            if re.match(pattern, branch_name, re.IGNORECASE):
+                return True
+        return False
+
+    def _is_important_branch(self, branch_name: str) -> bool:
+        """Check if a branch is considered important and should always be included.
+
+        Args:
+            branch_name: Name of the branch to check
+
+        Returns:
+            True if the branch is important, False otherwise
+        """
+        # Check against important branch patterns
+        for pattern in self.always_include_patterns:
+            if re.match(pattern, branch_name, re.IGNORECASE):
+                return True
+        return False
+
+    def _deduplicate_commits(self, commits: list[git.Commit]) -> list[git.Commit]:
+        """Remove duplicate commits while preserving order.
+
+        Args:
+            commits: List of commits that may contain duplicates
+
+        Returns:
+            List of unique commits in original order
+        """
+        seen = set()
+        unique_commits = []
+
+        for commit in commits:
+            if commit.hexsha not in seen:
+                seen.add(commit.hexsha)
+                unique_commits.append(commit)
+
+        return unique_commits
 
     def _batch_commits(
         self, commits: list[git.Commit], batch_size: int
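Taken together, this hunk adds configuration-driven construction plus a cache-first workflow (`is_analysis_needed` / `mark_analysis_complete`). A minimal usage sketch based only on the signatures visible in this diff; the `GitAnalysisCache` constructor argument and cache directory are assumptions, and `classification_config` keys beyond "enabled" are not shown here:

from datetime import datetime, timedelta, timezone
from pathlib import Path

from gitflow_analytics.core.analyzer import GitAnalyzer
from gitflow_analytics.core.cache import GitAnalysisCache

# Cache location and constructor signature assumed; see core/cache.py in this release.
cache = GitAnalysisCache(Path(".gitflow-cache"))

analyzer = GitAnalyzer(
    cache,
    branch_analysis_config={
        "strategy": "smart",          # "main_only", "smart", or "all" (the new default)
        "max_branches_per_repo": 50,
        "active_days_threshold": 90,
        "branch_commit_limit": None,  # no per-branch cap unless configured
    },
    classification_config={"enabled": False},
)

since = datetime.now(timezone.utc) - timedelta(weeks=12)
commits = analyzer.analyze_repository(Path("/path/to/repo"), since)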
@@ -110,32 +820,96 @@ class GitAnalyzer:
 
     def _process_batch(
         self, repo: Repo, repo_path: Path, commits: list[git.Commit]
-    ) -> list[dict[str, Any]]:
-        """Process a batch of commits."""
+    ) -> tuple[list[dict[str, Any]], int, int]:
+        """Process a batch of commits with optimized cache lookups.
+
+        WHY: Bulk cache lookups are much faster than individual queries.
+        This optimization can reduce subsequent run times from minutes to seconds
+        when most commits are already cached.
+
+        ENHANCEMENT: Now uses enhanced bulk_get_commits for better performance
+        and automatically detects when to use bulk_store_commits for new data.
+
+        Returns:
+            Tuple of (results, cache_hits, cache_misses)
+        """
         results = []
 
+        # Use enhanced bulk fetch with better performance
+        commit_hashes = [commit.hexsha for commit in commits]
+        cached_commits = self.cache.bulk_get_commits(str(repo_path), commit_hashes)
+
+        cache_hits = 0
+        cache_misses = 0
+        new_commits = []
+
         for commit in commits:
-            # Check cache first
-            cached = self.cache.get_cached_commit(str(repo_path), commit.hexsha)
-            if cached:
-                results.append(cached)
+            # Check bulk cache results
+            if commit.hexsha in cached_commits:
+                results.append(cached_commits[commit.hexsha])
+                cache_hits += 1
                 continue
 
             # Analyze commit
             commit_data = self._analyze_commit(repo, commit, repo_path)
            results.append(commit_data)
+            new_commits.append(commit_data)
+            cache_misses += 1
+
+        # Use bulk_store_commits for better performance when we have many new commits
+        if len(new_commits) >= 10:  # Threshold for bulk operations
+            logger.debug(f"Using bulk_store_commits for {len(new_commits)} new commits")
+            stats = self.cache.bulk_store_commits(str(repo_path), new_commits)
+            if stats["inserted"] > 0:
+                logger.debug(
+                    f"Bulk stored {stats['inserted']} commits at {stats['commits_per_second']:.0f} commits/sec"
+                )
+        elif new_commits:
+            # Fall back to regular batch caching for small numbers
+            self.cache.cache_commits_batch(str(repo_path), new_commits)
 
-        return results
+        # Log cache performance for debugging
+        if cache_hits + cache_misses > 0:
+            cache_hit_rate = (cache_hits / (cache_hits + cache_misses)) * 100
+            logger.debug(
+                f"Batch cache performance: {cache_hits} hits, {cache_misses} misses ({cache_hit_rate:.1f}% hit rate)"
+            )
+
+        return results, cache_hits, cache_misses
 
     def _analyze_commit(self, repo: Repo, commit: git.Commit, repo_path: Path) -> dict[str, Any]:
         """Analyze a single commit."""
+        # Normalize timestamp handling
+        commit_timestamp = commit.committed_datetime
+        logger.debug(
+            f"Analyzing commit {commit.hexsha[:8]}: original timestamp={commit_timestamp} (tzinfo: {getattr(commit_timestamp, 'tzinfo', 'N/A')})"
+        )
+
+        # Ensure timezone-aware timestamp in UTC
+        from datetime import timezone
+
+        if commit_timestamp.tzinfo is None:
+            # Convert naive datetime to UTC
+            commit_timestamp = commit_timestamp.replace(tzinfo=timezone.utc)
+            logger.debug(f" Converted naive timestamp to UTC: {commit_timestamp}")
+        elif commit_timestamp.tzinfo != timezone.utc:
+            # Convert to UTC if in different timezone
+            commit_timestamp = commit_timestamp.astimezone(timezone.utc)
+            logger.debug(f" Converted timestamp to UTC: {commit_timestamp}")
+        else:
+            logger.debug(f" Timestamp already in UTC: {commit_timestamp}")
+
+        # Get the local hour for the developer (before UTC conversion)
+        local_hour = commit.committed_datetime.hour
+
         # Basic commit data
         commit_data = {
             "hash": commit.hexsha,
             "author_name": commit.author.name,
             "author_email": commit.author.email,
             "message": commit.message,
-            "timestamp": commit.committed_datetime,
+            "timestamp": commit_timestamp,  # Now guaranteed to be UTC timezone-aware
+            "local_hour": local_hour,  # Hour in developer's local timezone
             "is_merge": len(commit.parents) > 1,
         }
 
@@ -147,11 +921,16 @@ class GitAnalyzer:
             str(commit_data["branch"]), repo_path
         )
 
-        # Calculate metrics - use raw stats for backward compatibility
-        stats = commit.stats.total
-        commit_data["files_changed"] = int(stats.get("files", 0)) if hasattr(stats, "get") else 0
-        commit_data["insertions"] = int(stats.get("insertions", 0)) if hasattr(stats, "get") else 0
-        commit_data["deletions"] = int(stats.get("deletions", 0)) if hasattr(stats, "get") else 0
+        # Calculate metrics using reliable git numstat for accurate line counts
+        raw_stats = self._calculate_raw_stats(commit)
+        commit_data["files_changed_count"] = raw_stats[
+            "files"
+        ]  # Integer count for backward compatibility
+        commit_data["files_changed"] = self._get_changed_file_paths(
+            commit
+        )  # List of file paths for ML
+        commit_data["insertions"] = raw_stats["insertions"]
+        commit_data["deletions"] = raw_stats["deletions"]
 
         # Calculate filtered metrics (excluding boilerplate/generated files)
         filtered_stats = self._calculate_filtered_stats(commit)
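The `_analyze_commit` changes above normalize every stored timestamp to timezone-aware UTC while `local_hour` keeps the committer's original wall-clock hour. A standalone sketch of that normalization rule; `normalize_to_utc` is illustrative, not a function in the package:

from datetime import datetime, timezone

def normalize_to_utc(ts: datetime) -> datetime:
    """Mirror of the rule in _analyze_commit: naive -> assume UTC, aware -> convert to UTC."""
    if ts.tzinfo is None:
        return ts.replace(tzinfo=timezone.utc)
    return ts.astimezone(timezone.utc)

# A commit made at 14:30 in UTC-05:00 keeps local_hour == 14,
# while its stored "timestamp" becomes 19:30 UTC.
aware = datetime.fromisoformat("2024-05-01T14:30:00-05:00")
assert aware.hour == 14                      # what the analyzer records as local_hour
assert normalize_to_utc(aware).hour == 19    # what ends up in commit_data["timestamp"]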
@@ -183,31 +962,99 @@ class GitAnalyzer:
                 return branch.name
         return "unknown"
 
+    def _get_changed_file_paths(self, commit: git.Commit) -> list[str]:
+        """Extract list of changed file paths from a git commit.
+
+        Args:
+            commit: Git commit object
+
+        Returns:
+            List of file paths that were changed in the commit
+        """
+        file_paths = []
+
+        # Handle initial commits (no parents) and regular commits
+        parent = commit.parents[0] if commit.parents else None
+
+        try:
+            for diff in commit.diff(parent):
+                # Get file path - prefer the new path (b_path) for modifications and additions,
+                # fall back to old path (a_path) for deletions
+                file_path = diff.b_path if diff.b_path else diff.a_path
+                if file_path:
+                    file_paths.append(file_path)
+        except Exception as e:
+            logger.warning(f"Failed to extract file paths from commit {commit.hexsha[:8]}: {e}")
+
+        return file_paths
+
     def _calculate_complexity_delta(self, commit: git.Commit) -> float:
-        """Calculate complexity change for a commit."""
+        """Calculate complexity change for a commit with graceful error handling.
+
+        WHY: Repository corruption or missing blobs can cause SHA resolution errors.
+        This method provides a fallback complexity calculation that continues
+        analysis even when individual blobs are missing or corrupt.
+        """
         total_delta = 0.0
 
-        for diff in commit.diff(commit.parents[0] if commit.parents else None):
-            if not self._is_code_file(diff.b_path or diff.a_path or ""):
-                continue
+        try:
+            parent = commit.parents[0] if commit.parents else None
+            diffs = commit.diff(parent)
+        except Exception as e:
+            # If we can't get diffs at all, return 0 complexity delta
+            logger.debug(f"Cannot calculate complexity for commit {commit.hexsha[:8]}: {e}")
+            return 0.0
 
-            # Simple complexity estimation based on diff size
-            # In a real implementation, you'd parse the code and calculate cyclomatic complexity
-            if diff.new_file:
-                total_delta += diff.b_blob.size / 100 if diff.b_blob else 0
-            elif diff.deleted_file:
-                total_delta -= diff.a_blob.size / 100 if diff.a_blob else 0
-            else:
-                # Modified file - estimate based on change size
-                if diff.diff:
-                    diff_content = (
-                        diff.diff
-                        if isinstance(diff.diff, str)
-                        else diff.diff.decode("utf-8", errors="ignore")
-                    )
-                    added = len(diff_content.split("\n+"))
-                    removed = len(diff_content.split("\n-"))
-                    total_delta += (added - removed) / 10
+        for diff in diffs:
+            try:
+                if not self._is_code_file(diff.b_path or diff.a_path or ""):
+                    continue
+
+                # Simple complexity estimation based on diff size
+                # In a real implementation, you'd parse the code and calculate cyclomatic complexity
+                if diff.new_file:
+                    try:
+                        if diff.b_blob and hasattr(diff.b_blob, "size"):
+                            total_delta += diff.b_blob.size / 100
+                    except (ValueError, AttributeError) as e:
+                        logger.debug(
+                            f"Cannot access b_blob for new file in {commit.hexsha[:8]}: {e}"
+                        )
+                        # Use a default small positive delta for new files
+                        total_delta += 1.0
+
+                elif diff.deleted_file:
+                    try:
+                        if diff.a_blob and hasattr(diff.a_blob, "size"):
+                            total_delta -= diff.a_blob.size / 100
+                    except (ValueError, AttributeError) as e:
+                        logger.debug(
+                            f"Cannot access a_blob for deleted file in {commit.hexsha[:8]}: {e}"
+                        )
+                        # Use a default small negative delta for deleted files
+                        total_delta -= 1.0
+
+                else:
+                    # Modified file - estimate based on change size
+                    try:
+                        if diff.diff:
+                            diff_content = (
+                                diff.diff
+                                if isinstance(diff.diff, str)
+                                else diff.diff.decode("utf-8", errors="ignore")
+                            )
+                            added = len(diff_content.split("\n+"))
+                            removed = len(diff_content.split("\n-"))
+                            total_delta += (added - removed) / 10
+                    except (ValueError, AttributeError, UnicodeDecodeError) as e:
+                        logger.debug(f"Cannot process diff content in {commit.hexsha[:8]}: {e}")
+                        # Skip this diff but continue processing
+                        pass
+
+            except Exception as e:
+                logger.debug(f"Error processing diff in commit {commit.hexsha[:8]}: {e}")
+                # Continue to next diff
+                continue
 
         return total_delta
 
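The reworked `_calculate_complexity_delta` keeps the earlier size-based heuristic (the inline comment itself notes it is not true cyclomatic complexity) and only adds per-diff error handling with small flat fallbacks. A short sketch of the magnitudes the formulas produce; the helper below is illustrative, not part of the package:

def modified_file_delta(patch_text: str) -> float:
    # Same arithmetic as the "modified file" branch above:
    # split("\n+") / split("\n-") each return occurrences + 1, so the extra +1s cancel.
    added = len(patch_text.split("\n+"))
    removed = len(patch_text.split("\n-"))
    return (added - removed) / 10

# A new 1,500-byte code file contributes 1500 / 100 = +15.0,
# a deleted 400-byte file contributes -400 / 100 = -4.0, and a patch with
# 30 added and 10 removed lines contributes (31 - 11) / 10 = +2.0.
# When a blob cannot be read, the fallback is a flat +1.0 (new file) or -1.0 (deleted file).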
@@ -249,44 +1096,308 @@ class GitAnalyzer:
         # Normalize path separators for consistent matching
         filepath = filepath.replace("\\", "/")
 
-        # Check against exclude patterns
-        return any(fnmatch.fnmatch(filepath, pattern) for pattern in self.exclude_paths)
+        # Check against exclude patterns with proper ** handling
+        return any(self._matches_glob_pattern(filepath, pattern) for pattern in self.exclude_paths)
+
+    def _matches_glob_pattern(self, filepath: str, pattern: str) -> bool:
+        """Check if a file path matches a glob pattern, handling ** recursion correctly.
+
+        This method properly handles different glob pattern types:
+        - **/vendor/** : matches files inside vendor directories at any level
+        - **/*.min.js : matches files with specific suffix anywhere in directory tree
+        - vendor/** : matches files inside vendor directory at root level only
+        - **pattern** : handles other complex patterns with pathlib.match()
+        - simple patterns : uses fnmatch for basic wildcards
+
+        Args:
+            filepath: The file path to check
+            pattern: The glob pattern to match against
+
+        Returns:
+            True if the file path matches the pattern, False otherwise
+        """
+        from pathlib import PurePath
+
+        # Handle empty or invalid inputs
+        if not filepath or not pattern:
+            return False
+
+        path = PurePath(filepath)
+
+        # Check for multiple ** patterns first (most complex)
+        if "**" in pattern and pattern.count("**") > 1:
+            # Multiple ** patterns - use custom recursive matching for complex patterns
+            return self._match_recursive_pattern(filepath, pattern)
+
+        # Then handle simple ** patterns
+        elif pattern.startswith("**/") and pattern.endswith("/**"):
+            # Pattern like **/vendor/** - matches files inside vendor directories at any level
+            dir_name = pattern[3:-3]  # Extract 'vendor' from '**/vendor/**'
+            if not dir_name:  # Handle edge case of '**/**'
+                return True
+            return dir_name in path.parts
+
+        elif pattern.startswith("**/"):
+            # Pattern like **/*.min.js - matches files with specific suffix anywhere
+            suffix_pattern = pattern[3:]
+            if not suffix_pattern:  # Handle edge case of '**/'
+                return True
+            # Check against filename for file patterns, or any path part for directory patterns
+            if suffix_pattern.endswith("/"):
+                # Directory pattern like **/build/
+                dir_name = suffix_pattern[:-1]
+                return dir_name in path.parts
+            else:
+                # File pattern like *.min.js
+                return fnmatch.fnmatch(path.name, suffix_pattern)
+
+        elif pattern.endswith("/**"):
+            # Pattern like vendor/** or docs/build/** - matches files inside directory at root level
+            dir_name = pattern[:-3]
+            if not dir_name:  # Handle edge case of '/**'
+                return True
+
+            # Handle both single directory names and nested paths
+            expected_parts = PurePath(dir_name).parts
+            return (
+                len(path.parts) >= len(expected_parts)
+                and path.parts[: len(expected_parts)] == expected_parts
+            )
+
+        elif "**" in pattern:
+            # Single ** pattern - use pathlib matching with fallback
+            try:
+                return path.match(pattern)
+            except (ValueError, TypeError):
+                # Fall back to fnmatch if pathlib fails (e.g., invalid pattern)
+                try:
+                    return fnmatch.fnmatch(filepath, pattern)
+                except re.error:
+                    # Invalid regex pattern - return False to be safe
+                    return False
+        else:
+            # Simple pattern - use fnmatch for basic wildcards
+            try:
+                return fnmatch.fnmatch(filepath, pattern)
+            except re.error:
+                # Invalid regex pattern - return False to be safe
+                return False
+
+    def _match_recursive_pattern(self, filepath: str, pattern: str) -> bool:
+        """Handle complex patterns with multiple ** wildcards.
+
+        Args:
+            filepath: The file path to check
+            pattern: The pattern with multiple ** wildcards
+
+        Returns:
+            True if the path matches the pattern, False otherwise
+        """
+        from pathlib import PurePath
+
+        # Split pattern by ** to handle each segment
+        parts = pattern.split("**")
+        path = PurePath(filepath)
+        path_str = str(path)
+
+        # Handle patterns like 'src/**/components/**/*.tsx' or '**/test/**/*.spec.js'
+        if len(parts) >= 2:
+            # First part should match from the beginning (if not empty)
+            start_pattern = parts[0].rstrip("/")
+            if start_pattern and not path_str.startswith(start_pattern):
+                return False
+
+            # Last part should match the filename/end pattern
+            end_pattern = parts[-1].lstrip("/")
+            if end_pattern and not fnmatch.fnmatch(path.name, end_pattern):
+                # Check if filename matches the end pattern
+                return False
+
+            # Middle parts should exist somewhere in the path between start and end
+            for i in range(1, len(parts) - 1):
+                middle_pattern = parts[i].strip("/")
+                if middle_pattern and middle_pattern not in path.parts:
+                    # Check if this directory exists in the path
+                    return False
+
+            return True
+
+        return False
 
     def _calculate_filtered_stats(self, commit: git.Commit) -> dict[str, int]:
-        """Calculate commit statistics excluding boilerplate/generated files."""
+        """Calculate commit statistics excluding boilerplate/generated files using git diff --numstat."""
         filtered_stats = {"files": 0, "insertions": 0, "deletions": 0}
 
         # For initial commits or commits without parents
         parent = commit.parents[0] if commit.parents else None
 
         try:
-            for diff in commit.diff(parent):
-                # Get file path
-                file_path = diff.b_path if diff.b_path else diff.a_path
-                if not file_path:
+            # Use git command directly for accurate line counts
+            repo = commit.repo
+            if parent:
+                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+            else:
+                # Initial commit - use git show with --numstat
+                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            # Parse the numstat output: insertions\tdeletions\tfilename
+            for line in diff_output.strip().split("\n"):
+                if not line.strip():
                     continue
 
-                # Skip excluded files
-                if self._should_exclude_file(file_path):
+                parts = line.split("\t")
+                if len(parts) >= 3:
+                    try:
+                        insertions = int(parts[0]) if parts[0] != "-" else 0
+                        deletions = int(parts[1]) if parts[1] != "-" else 0
+                        filename = parts[2]
+
+                        # Skip excluded files using the existing filter logic
+                        if self._should_exclude_file(filename):
+                            continue
+
+                        # Count the file and its changes
+                        filtered_stats["files"] += 1
+                        filtered_stats["insertions"] += insertions
+                        filtered_stats["deletions"] += deletions
+
+                    except ValueError:
+                        # Skip binary files or malformed lines
+                        continue
+
+        except Exception as e:
+            # Log the error for debugging but don't crash
+            logger.warning(f"Error calculating filtered stats for commit {commit.hexsha[:8]}: {e}")
+
+        return filtered_stats
+
+    def _calculate_raw_stats(self, commit: git.Commit) -> dict[str, int]:
+        """Calculate commit statistics for all files (no filtering) using git diff --numstat."""
+        raw_stats = {"files": 0, "insertions": 0, "deletions": 0}
+
+        # For initial commits or commits without parents
+        parent = commit.parents[0] if commit.parents else None
+
+        try:
+            # Use git command directly for accurate line counts
+            repo = commit.repo
+            if parent:
+                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+            else:
+                # Initial commit - use git show with --numstat
+                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            # Parse the numstat output: insertions\tdeletions\tfilename
+            for line in diff_output.strip().split("\n"):
+                if not line.strip():
                     continue
 
-                # Count the file
-                filtered_stats["files"] += 1
+                parts = line.split("\t")
+                if len(parts) >= 3:
+                    try:
+                        insertions = int(parts[0]) if parts[0] != "-" else 0
+                        deletions = int(parts[1]) if parts[1] != "-" else 0
+                        # filename = parts[2] - not used in raw stats
+
+                        # Count all files and their changes (no filtering)
+                        raw_stats["files"] += 1
+                        raw_stats["insertions"] += insertions
+                        raw_stats["deletions"] += deletions
+
+                    except ValueError:
+                        # Skip binary files or malformed lines
+                        continue
+
+        except Exception as e:
+            # Log the error for debugging but don't crash
+            logger.warning(f"Error calculating raw stats for commit {commit.hexsha[:8]}: {e}")
+
+        return raw_stats
 
-                # Count insertions and deletions
-                if diff.diff:
-                    diff_text = (
-                        diff.diff
-                        if isinstance(diff.diff, str)
-                        else diff.diff.decode("utf-8", errors="ignore")
+    def _prepare_commits_for_classification(
+        self, repo: Repo, commits: list[dict[str, Any]]
+    ) -> list[dict[str, Any]]:
+        """Prepare commits for classification by adding file change information.
+
+        Args:
+            repo: Git repository object
+            commits: List of analyzed commit dictionaries
+
+        Returns:
+            List of commits with file change information needed for classification
+        """
+        prepared_commits = []
+
+        for commit_data in commits:
+            commit_hash = commit_data.get("hash")
+            if not commit_hash:
+                prepared_commits.append(commit_data)
+                continue
+
+            try:
+                # Use the file paths already extracted during analysis
+                files_changed = commit_data.get("files_changed", [])
+
+                # If files_changed is somehow not available or empty, extract it as fallback
+                if not files_changed:
+                    logger.warning(
+                        f"No file paths found for commit {commit_hash[:8]}, extracting as fallback"
                     )
-                    for line in diff_text.split("\n"):
-                        if line.startswith("+") and not line.startswith("+++"):
-                            filtered_stats["insertions"] += 1
-                        elif line.startswith("-") and not line.startswith("---"):
-                            filtered_stats["deletions"] += 1
-        except Exception:
-            # If we can't calculate filtered stats, return zeros
-            pass
+                    files_changed = self._get_changed_file_paths(repo.commit(commit_hash))
 
-        return filtered_stats
+                # Create enhanced commit data for classification
+                enhanced_commit = commit_data.copy()
+                enhanced_commit["files_changed"] = files_changed
+
+                # Add file details if needed by classifier
+                if files_changed:
+                    file_details = {}
+                    # Only extract file details if we need to get commit object for other reasons
+                    # or if file details are specifically required by the classifier
+                    try:
+                        commit = repo.commit(commit_hash)
+                        parent = commit.parents[0] if commit.parents else None
+
+                        for diff in commit.diff(parent):
+                            file_path = diff.b_path if diff.b_path else diff.a_path
+                            if file_path and file_path in files_changed and diff.diff:
+                                # Calculate insertions and deletions per file
+                                diff_text = (
+                                    diff.diff
+                                    if isinstance(diff.diff, str)
+                                    else diff.diff.decode("utf-8", errors="ignore")
+                                )
+                                insertions = len(
+                                    [
+                                        line
+                                        for line in diff_text.split("\n")
+                                        if line.startswith("+") and not line.startswith("+++")
+                                    ]
+                                )
+                                deletions = len(
+                                    [
+                                        line
+                                        for line in diff_text.split("\n")
+                                        if line.startswith("-") and not line.startswith("---")
+                                    ]
+                                )
+
+                                file_details[file_path] = {
+                                    "insertions": insertions,
+                                    "deletions": deletions,
+                                }
+
+                        enhanced_commit["file_details"] = file_details
+                    except Exception as detail_error:
+                        logger.warning(
+                            f"Failed to extract file details for commit {commit_hash[:8]}: {detail_error}"
+                        )
+                        enhanced_commit["file_details"] = {}
+
+                prepared_commits.append(enhanced_commit)
+
+            except Exception as e:
+                logger.warning(f"Failed to prepare commit {commit_hash} for classification: {e}")
+                prepared_commits.append(commit_data)
+
+        return prepared_commits
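The `exclude_paths` entries are now evaluated through `_matches_glob_pattern` rather than plain `fnmatch`. The expected outcomes below follow the branches shown above (calling the private helper directly for illustration); they are a reading of this diff, not output captured from the released package:

# analyzer is a GitAnalyzer instance as constructed earlier.
cases = [
    ("node_modules/pkg/index.js", "**/node_modules/**", True),            # directory at any depth
    ("assets/js/app.min.js",      "**/*.min.js",        True),            # suffix anywhere in the tree
    ("vendor/lib.py",             "vendor/**",          True),            # rooted directory pattern
    ("src/vendor/lib.py",         "vendor/**",          False),           # not at the root, so no match
    ("src/ui/components/App.tsx", "src/**/components/**/*.tsx", True),    # multiple ** segments
]
for path, pattern, expected in cases:
    assert analyzer._matches_glob_pattern(path, pattern) is expected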