gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
gitflow_analytics/core/cache.py
@@ -1,25 +1,65 @@
  """Caching layer for Git analysis with SQLite backend."""
+
+ import hashlib
+ import json
+ import logging
+ import os
  from contextlib import contextmanager
  from datetime import datetime, timedelta
  from pathlib import Path
- from typing import Any, Dict, List, Optional
+ from typing import Any, Optional, Union

+ import git
  from sqlalchemy import and_

- from ..models.database import CachedCommit, Database, IssueCache, PullRequestCache
+ from ..models.database import (
+ CachedCommit,
+ Database,
+ IssueCache,
+ PullRequestCache,
+ RepositoryAnalysisStatus,
+ )
+
+ logger = logging.getLogger(__name__)


  class GitAnalysisCache:
  """Cache for Git analysis results."""
-
- def __init__(self, cache_dir: Path, ttl_hours: int = 168):
- """Initialize cache with SQLite backend."""
- self.cache_dir = cache_dir
+
+ def __init__(
+ self, cache_dir: Union[Path, str], ttl_hours: int = 168, batch_size: int = 1000
+ ) -> None:
+ """Initialize cache with SQLite backend and configurable batch size.
+
+ WHY: Adding configurable batch size allows tuning for different repository
+ sizes and system capabilities. Default of 1000 balances memory usage with
+ performance gains from bulk operations.
+
+ Args:
+ cache_dir: Directory for cache database
+ ttl_hours: Time-to-live for cache entries in hours
+ batch_size: Default batch size for bulk operations (default: 1000)
+ """
+ self.cache_dir = Path(cache_dir) # Ensure it's a Path object
  self.ttl_hours = ttl_hours
- self.db = Database(cache_dir / 'gitflow_cache.db')
-
+ self.batch_size = batch_size
+ self.db = Database(self.cache_dir / "gitflow_cache.db")
+
+ # Cache performance tracking with enhanced metrics
+ self.cache_hits = 0
+ self.cache_misses = 0
+ self.cache_start_time = datetime.now()
+ self.bulk_operations_count = 0
+ self.bulk_operations_time = 0.0
+ self.single_operations_count = 0
+ self.single_operations_time = 0.0
+ self.total_bytes_cached = 0
+
+ # Debug mode controlled by environment variable
+ self.debug_mode = os.getenv("GITFLOW_DEBUG", "").lower() in ("1", "true", "yes")
+
  @contextmanager
- def get_session(self):
+ def get_session(self) -> Any:
  """Get database session context manager."""
  session = self.db.get_session()
  try:
@@ -30,33 +70,95 @@ class GitAnalysisCache:
  raise
  finally:
  session.close()
-
- def get_cached_commit(self, repo_path: str, commit_hash: str) -> Optional[Dict[str, Any]]:
+
+ def get_cached_commit(self, repo_path: str, commit_hash: str) -> Optional[dict[str, Any]]:
  """Retrieve cached commit data if not stale."""
  with self.get_session() as session:
- cached = session.query(CachedCommit).filter(
- and_(
- CachedCommit.repo_path == repo_path,
- CachedCommit.commit_hash == commit_hash
+ cached = (
+ session.query(CachedCommit)
+ .filter(
+ and_(
+ CachedCommit.repo_path == repo_path, CachedCommit.commit_hash == commit_hash
+ )
  )
- ).first()
-
+ .first()
+ )
+
  if cached and not self._is_stale(cached.cached_at):
+ self.cache_hits += 1
+ if self.debug_mode:
+ print(f"DEBUG: Cache HIT for {commit_hash[:8]} in {repo_path}")
  return self._commit_to_dict(cached)
-
+
+ self.cache_misses += 1
+ if self.debug_mode:
+ print(f"DEBUG: Cache MISS for {commit_hash[:8]} in {repo_path}")
  return None
-
- def cache_commit(self, repo_path: str, commit_data: Dict[str, Any]):
+
+ def get_cached_commits_bulk(
+ self, repo_path: str, commit_hashes: list[str]
+ ) -> dict[str, dict[str, Any]]:
+ """Retrieve multiple cached commits in a single query.
+
+ WHY: Individual cache lookups are inefficient for large batches.
+ This method fetches multiple commits at once, reducing database overhead
+ and significantly improving performance for subsequent runs.
+
+ Args:
+ repo_path: Repository path for filtering
+ commit_hashes: List of commit hashes to look up
+
+ Returns:
+ Dictionary mapping commit hash to commit data (only non-stale entries)
+ """
+ if not commit_hashes:
+ return {}
+
+ cached_commits = {}
+ with self.get_session() as session:
+ cached_results = (
+ session.query(CachedCommit)
+ .filter(
+ and_(
+ CachedCommit.repo_path == repo_path,
+ CachedCommit.commit_hash.in_(commit_hashes),
+ )
+ )
+ .all()
+ )
+
+ for cached in cached_results:
+ if not self._is_stale(cached.cached_at):
+ cached_commits[cached.commit_hash] = self._commit_to_dict(cached)
+
+ # Track cache performance
+ hits = len(cached_commits)
+ misses = len(commit_hashes) - hits
+ self.cache_hits += hits
+ self.cache_misses += misses
+
+ if self.debug_mode:
+ print(
+ f"DEBUG: Bulk cache lookup - {hits} hits, {misses} misses for {len(commit_hashes)} commits"
+ )
+
+ return cached_commits
+
+ def cache_commit(self, repo_path: str, commit_data: dict[str, Any]) -> None:
  """Cache commit analysis results."""
  with self.get_session() as session:
  # Check if already exists
- existing = session.query(CachedCommit).filter(
- and_(
- CachedCommit.repo_path == repo_path,
- CachedCommit.commit_hash == commit_data['hash']
+ existing = (
+ session.query(CachedCommit)
+ .filter(
+ and_(
+ CachedCommit.repo_path == repo_path,
+ CachedCommit.commit_hash == commit_data["hash"],
+ )
  )
- ).first()
-
+ .first()
+ )
+
  if existing:
  # Update existing
  for key, value in commit_data.items():
@@ -67,207 +169,1484 @@ class GitAnalysisCache:
  # Create new
  cached_commit = CachedCommit(
  repo_path=repo_path,
- commit_hash=commit_data['hash'],
- author_name=commit_data.get('author_name'),
- author_email=commit_data.get('author_email'),
- message=commit_data.get('message'),
- timestamp=commit_data.get('timestamp'),
- branch=commit_data.get('branch'),
- is_merge=commit_data.get('is_merge', False),
- files_changed=commit_data.get('files_changed', 0),
- insertions=commit_data.get('insertions', 0),
- deletions=commit_data.get('deletions', 0),
- complexity_delta=commit_data.get('complexity_delta', 0.0),
- story_points=commit_data.get('story_points'),
- ticket_references=commit_data.get('ticket_references', [])
+ commit_hash=commit_data["hash"],
+ author_name=commit_data.get("author_name"),
+ author_email=commit_data.get("author_email"),
+ message=commit_data.get("message"),
+ timestamp=commit_data.get("timestamp"),
+ branch=commit_data.get("branch"),
+ is_merge=commit_data.get("is_merge", False),
+ files_changed=commit_data.get(
+ "files_changed_count",
+ (
+ commit_data.get("files_changed", 0)
+ if isinstance(commit_data.get("files_changed"), int)
+ else len(commit_data.get("files_changed", []))
+ ),
+ ),
+ insertions=commit_data.get("insertions", 0),
+ deletions=commit_data.get("deletions", 0),
+ complexity_delta=commit_data.get("complexity_delta", 0.0),
+ story_points=commit_data.get("story_points"),
+ ticket_references=commit_data.get("ticket_references", []),
  )
  session.add(cached_commit)
-
- def cache_commits_batch(self, repo_path: str, commits: List[Dict[str, Any]]):
- """Cache multiple commits in a single transaction."""
+
+ def cache_commits_batch(self, repo_path: str, commits: list[dict[str, Any]]) -> None:
+ """Cache multiple commits in a single transaction.
+
+ WHY: Optimized batch caching reduces database overhead by using
+ bulk queries to check for existing commits instead of individual lookups.
+ This significantly improves performance when caching large batches.
+ """
+ if not commits:
+ return
+
+ import time
+
+ start_time = time.time()
+
  with self.get_session() as session:
- for commit_data in commits:
- # Check if already exists
- existing = session.query(CachedCommit).filter(
+ # Get all commit hashes in this batch
+ commit_hashes = [commit_data["hash"] for commit_data in commits]
+
+ # Bulk fetch existing commits
+ existing_commits = {
+ cached.commit_hash: cached
+ for cached in session.query(CachedCommit)
+ .filter(
  and_(
  CachedCommit.repo_path == repo_path,
- CachedCommit.commit_hash == commit_data['hash']
+ CachedCommit.commit_hash.in_(commit_hashes),
  )
- ).first()
-
- if existing:
+ )
+ .all()
+ }
+
+ # Process each commit
+ for commit_data in commits:
+ commit_hash = commit_data["hash"]
+
+ if commit_hash in existing_commits:
  # Update existing
+ existing = existing_commits[commit_hash]
  for key, value in commit_data.items():
- if key != 'hash' and hasattr(existing, key):
+ if key != "hash" and hasattr(existing, key):
  setattr(existing, key, value)
  existing.cached_at = datetime.utcnow()
  else:
  # Create new
  cached_commit = CachedCommit(
  repo_path=repo_path,
- commit_hash=commit_data['hash'],
- author_name=commit_data.get('author_name'),
- author_email=commit_data.get('author_email'),
- message=commit_data.get('message'),
- timestamp=commit_data.get('timestamp'),
- branch=commit_data.get('branch'),
- is_merge=commit_data.get('is_merge', False),
- files_changed=commit_data.get('files_changed', 0),
- insertions=commit_data.get('insertions', 0),
- deletions=commit_data.get('deletions', 0),
- complexity_delta=commit_data.get('complexity_delta', 0.0),
- story_points=commit_data.get('story_points'),
- ticket_references=commit_data.get('ticket_references', [])
+ commit_hash=commit_data["hash"],
+ author_name=commit_data.get("author_name"),
+ author_email=commit_data.get("author_email"),
+ message=commit_data.get("message"),
+ timestamp=commit_data.get("timestamp"),
+ branch=commit_data.get("branch"),
+ is_merge=commit_data.get("is_merge", False),
+ files_changed=commit_data.get(
+ "files_changed_count",
+ (
+ commit_data.get("files_changed", 0)
+ if isinstance(commit_data.get("files_changed"), int)
+ else len(commit_data.get("files_changed", []))
+ ),
+ ),
+ insertions=commit_data.get("insertions", 0),
+ deletions=commit_data.get("deletions", 0),
+ complexity_delta=commit_data.get("complexity_delta", 0.0),
+ story_points=commit_data.get("story_points"),
+ ticket_references=commit_data.get("ticket_references", []),
  )
  session.add(cached_commit)
-
- def get_cached_pr(self, repo_path: str, pr_number: int) -> Optional[Dict[str, Any]]:
+
+ # Track performance metrics
+ elapsed = time.time() - start_time
+ self.bulk_operations_count += 1
+ self.bulk_operations_time += elapsed
+
+ if self.debug_mode:
+ print(f"DEBUG: Bulk cached {len(commits)} commits in {elapsed:.3f}s")
+
+ def get_cached_pr(self, repo_path: str, pr_number: int) -> Optional[dict[str, Any]]:
  """Retrieve cached pull request data."""
  with self.get_session() as session:
- cached = session.query(PullRequestCache).filter(
- and_(
- PullRequestCache.repo_path == repo_path,
- PullRequestCache.pr_number == pr_number
+ cached = (
+ session.query(PullRequestCache)
+ .filter(
+ and_(
+ PullRequestCache.repo_path == repo_path,
+ PullRequestCache.pr_number == pr_number,
+ )
  )
- ).first()
-
+ .first()
+ )
+
  if cached and not self._is_stale(cached.cached_at):
  return self._pr_to_dict(cached)
-
+
  return None
-
- def cache_pr(self, repo_path: str, pr_data: Dict[str, Any]):
+
+ def cache_pr(self, repo_path: str, pr_data: dict[str, Any]) -> None:
  """Cache pull request data."""
  with self.get_session() as session:
- cached_pr = PullRequestCache(
- repo_path=repo_path,
- pr_number=pr_data['number'],
- title=pr_data.get('title'),
- description=pr_data.get('description'),
- author=pr_data.get('author'),
- created_at=pr_data.get('created_at'),
- merged_at=pr_data.get('merged_at'),
- story_points=pr_data.get('story_points'),
- labels=pr_data.get('labels', []),
- commit_hashes=pr_data.get('commit_hashes', [])
+ # Check if already exists
+ existing = (
+ session.query(PullRequestCache)
+ .filter(
+ and_(
+ PullRequestCache.repo_path == repo_path,
+ PullRequestCache.pr_number == pr_data["number"],
+ )
+ )
+ .first()
  )
- session.merge(cached_pr)
-
- def cache_issue(self, platform: str, issue_data: Dict[str, Any]):
+
+ if existing:
+ # Update existing
+ existing.title = pr_data.get("title")
+ existing.description = pr_data.get("description")
+ existing.author = pr_data.get("author")
+ existing.created_at = pr_data.get("created_at")
+ existing.merged_at = pr_data.get("merged_at")
+ existing.story_points = pr_data.get("story_points")
+ existing.labels = pr_data.get("labels", [])
+ existing.commit_hashes = pr_data.get("commit_hashes", [])
+ existing.cached_at = datetime.utcnow()
+ else:
+ # Create new
+ cached_pr = PullRequestCache(
+ repo_path=repo_path,
+ pr_number=pr_data["number"],
+ title=pr_data.get("title"),
+ description=pr_data.get("description"),
+ author=pr_data.get("author"),
+ created_at=pr_data.get("created_at"),
+ merged_at=pr_data.get("merged_at"),
+ story_points=pr_data.get("story_points"),
+ labels=pr_data.get("labels", []),
+ commit_hashes=pr_data.get("commit_hashes", []),
+ )
+ session.add(cached_pr)
+
+ def cache_issue(self, platform: str, issue_data: dict[str, Any]) -> None:
  """Cache issue data from various platforms."""
  with self.get_session() as session:
- cached_issue = IssueCache(
- platform=platform,
- issue_id=str(issue_data['id']),
- project_key=issue_data['project_key'],
- title=issue_data.get('title'),
- description=issue_data.get('description'),
- status=issue_data.get('status'),
- assignee=issue_data.get('assignee'),
- created_at=issue_data.get('created_at'),
- updated_at=issue_data.get('updated_at'),
- resolved_at=issue_data.get('resolved_at'),
- story_points=issue_data.get('story_points'),
- labels=issue_data.get('labels', []),
- platform_data=issue_data.get('platform_data', {})
+ # Check if already exists
+ existing = (
+ session.query(IssueCache)
+ .filter(
+ and_(
+ IssueCache.platform == platform,
+ IssueCache.issue_id == str(issue_data["id"]),
+ )
+ )
+ .first()
  )
- session.merge(cached_issue)
-
- def get_cached_issues(self, platform: str, project_key: str) -> List[Dict[str, Any]]:
+
+ if existing:
+ # Update existing
+ existing.project_key = issue_data["project_key"]
+ existing.title = issue_data.get("title")
+ existing.description = issue_data.get("description")
+ existing.status = issue_data.get("status")
+ existing.assignee = issue_data.get("assignee")
+ existing.created_at = issue_data.get("created_at")
+ existing.updated_at = issue_data.get("updated_at")
+ existing.resolved_at = issue_data.get("resolved_at")
+ existing.story_points = issue_data.get("story_points")
+ existing.labels = issue_data.get("labels", [])
+ existing.platform_data = issue_data.get("platform_data", {})
+ existing.cached_at = datetime.utcnow()
+ else:
+ # Create new
+ cached_issue = IssueCache(
+ platform=platform,
+ issue_id=str(issue_data["id"]),
+ project_key=issue_data["project_key"],
+ title=issue_data.get("title"),
+ description=issue_data.get("description"),
+ status=issue_data.get("status"),
+ assignee=issue_data.get("assignee"),
+ created_at=issue_data.get("created_at"),
+ updated_at=issue_data.get("updated_at"),
+ resolved_at=issue_data.get("resolved_at"),
+ story_points=issue_data.get("story_points"),
+ labels=issue_data.get("labels", []),
+ platform_data=issue_data.get("platform_data", {}),
+ )
+ session.add(cached_issue)
+
+ def get_cached_issues(self, platform: str, project_key: str) -> list[dict[str, Any]]:
  """Get all cached issues for a platform and project."""
  with self.get_session() as session:
- issues = session.query(IssueCache).filter(
- and_(
- IssueCache.platform == platform,
- IssueCache.project_key == project_key
+ issues = (
+ session.query(IssueCache)
+ .filter(
+ and_(IssueCache.platform == platform, IssueCache.project_key == project_key)
  )
- ).all()
-
- return [self._issue_to_dict(issue) for issue in issues
- if not self._is_stale(issue.cached_at)]
-
- def clear_stale_cache(self):
+ .all()
+ )
+
+ return [
+ self._issue_to_dict(issue)
+ for issue in issues
+ if not self._is_stale(issue.cached_at)
+ ]
+
+ def clear_stale_cache(self) -> None:
  """Remove stale cache entries."""
  cutoff_time = datetime.utcnow() - timedelta(hours=self.ttl_hours)
-
+
  with self.get_session() as session:
- session.query(CachedCommit).filter(
- CachedCommit.cached_at < cutoff_time
- ).delete()
-
+ session.query(CachedCommit).filter(CachedCommit.cached_at < cutoff_time).delete()
+
  session.query(PullRequestCache).filter(
  PullRequestCache.cached_at < cutoff_time
  ).delete()
-
- session.query(IssueCache).filter(
- IssueCache.cached_at < cutoff_time
+
+ session.query(IssueCache).filter(IssueCache.cached_at < cutoff_time).delete()
+
+ # Also clear stale repository analysis status
+ session.query(RepositoryAnalysisStatus).filter(
+ RepositoryAnalysisStatus.last_updated < cutoff_time
  ).delete()
-
- def get_cache_stats(self) -> Dict[str, int]:
- """Get cache statistics."""
+
+ def clear_all_cache(self) -> dict[str, int]:
+ """Clear all cache entries including repository analysis status.
+
+ WHY: Used by --clear-cache flag to force complete re-analysis.
+ Returns counts of cleared entries for user feedback.
+
+ Returns:
+ Dictionary with counts of cleared entries by type
+ """
  with self.get_session() as session:
+ # Count before clearing
+ commit_count = session.query(CachedCommit).count()
+ pr_count = session.query(PullRequestCache).count()
+ issue_count = session.query(IssueCache).count()
+ status_count = session.query(RepositoryAnalysisStatus).count()
+
+ # Clear all entries
+ session.query(CachedCommit).delete()
+ session.query(PullRequestCache).delete()
+ session.query(IssueCache).delete()
+ session.query(RepositoryAnalysisStatus).delete()
+
+ return {
+ "commits": commit_count,
+ "pull_requests": pr_count,
+ "issues": issue_count,
+ "repository_status": status_count,
+ "total": commit_count + pr_count + issue_count + status_count,
+ }
+
+ def get_cache_stats(self) -> dict[str, Any]:
+ """Get comprehensive cache statistics including external API cache performance."""
+ with self.get_session() as session:
+ # Basic counts
+ total_commits = session.query(CachedCommit).count()
+ total_prs = session.query(PullRequestCache).count()
+ total_issues = session.query(IssueCache).count()
+
+ # Platform-specific issue counts
+ jira_issues = session.query(IssueCache).filter(IssueCache.platform == "jira").count()
+ github_issues = (
+ session.query(IssueCache).filter(IssueCache.platform == "github").count()
+ )
+
+ # Stale entries
+ cutoff_time = datetime.utcnow() - timedelta(hours=self.ttl_hours)
+ stale_commits = (
+ session.query(CachedCommit).filter(CachedCommit.cached_at < cutoff_time).count()
+ )
+ stale_prs = (
+ session.query(PullRequestCache)
+ .filter(PullRequestCache.cached_at < cutoff_time)
+ .count()
+ )
+ stale_issues = (
+ session.query(IssueCache).filter(IssueCache.cached_at < cutoff_time).count()
+ )
+
+ # Performance metrics
+ total_requests = self.cache_hits + self.cache_misses
+ hit_rate = (self.cache_hits / total_requests * 100) if total_requests > 0 else 0
+
+ # Bulk vs Single operation performance
+ avg_bulk_time = (
+ self.bulk_operations_time / self.bulk_operations_count
+ if self.bulk_operations_count > 0
+ else 0
+ )
+ avg_single_time = (
+ self.single_operations_time / self.single_operations_count
+ if self.single_operations_count > 0
+ else 0
+ )
+ bulk_speedup = (
+ avg_single_time / avg_bulk_time if avg_bulk_time > 0 and avg_single_time > 0 else 0
+ )
+
+ # Estimated time savings (conservative estimates)
+ commit_time_saved = self.cache_hits * 0.1 # 0.1 seconds per commit analysis
+ api_time_saved = (total_issues * 0.5) + (total_prs * 0.3) # API call time savings
+ bulk_time_saved = (
+ self.bulk_operations_count * 2.0
+ ) # Estimated 2 seconds saved per bulk op
+ total_time_saved = commit_time_saved + api_time_saved + bulk_time_saved
+
+ # Database file size
+ db_file = self.cache_dir / "gitflow_cache.db"
+ db_size_mb = db_file.stat().st_size / (1024 * 1024) if db_file.exists() else 0
+
+ # Session duration
+ session_duration = (datetime.now() - self.cache_start_time).total_seconds()
+
+ # Cache efficiency metrics
+ fresh_commits = total_commits - stale_commits
+ fresh_prs = total_prs - stale_prs
+ fresh_issues = total_issues - stale_issues
+ total_fresh_entries = fresh_commits + fresh_prs + fresh_issues
+
  stats = {
- 'cached_commits': session.query(CachedCommit).count(),
- 'cached_prs': session.query(PullRequestCache).count(),
- 'cached_issues': session.query(IssueCache).count(),
- 'stale_commits': session.query(CachedCommit).filter(
- CachedCommit.cached_at < datetime.utcnow() - timedelta(hours=self.ttl_hours)
- ).count()
+ # Counts by type
+ "cached_commits": total_commits,
+ "cached_prs": total_prs,
+ "cached_issues": total_issues,
+ "cached_jira_issues": jira_issues,
+ "cached_github_issues": github_issues,
+ # Freshness analysis
+ "stale_commits": stale_commits,
+ "stale_prs": stale_prs,
+ "stale_issues": stale_issues,
+ "fresh_commits": fresh_commits,
+ "fresh_prs": fresh_prs,
+ "fresh_issues": fresh_issues,
+ "total_fresh_entries": total_fresh_entries,
+ "freshness_rate_percent": (
+ total_fresh_entries / max(1, total_commits + total_prs + total_issues)
+ )
+ * 100,
+ # Performance metrics
+ "cache_hits": self.cache_hits,
+ "cache_misses": self.cache_misses,
+ "total_requests": total_requests,
+ "hit_rate_percent": hit_rate,
+ # Bulk operation metrics
+ "bulk_operations_count": self.bulk_operations_count,
+ "bulk_operations_time_seconds": self.bulk_operations_time,
+ "avg_bulk_operation_time": avg_bulk_time,
+ "single_operations_count": self.single_operations_count,
+ "single_operations_time_seconds": self.single_operations_time,
+ "avg_single_operation_time": avg_single_time,
+ "bulk_speedup_factor": bulk_speedup,
+ # Time savings
+ "commit_analysis_time_saved_seconds": commit_time_saved,
+ "api_call_time_saved_seconds": api_time_saved,
+ "bulk_operations_time_saved_seconds": bulk_time_saved,
+ "total_time_saved_seconds": total_time_saved,
+ "total_time_saved_minutes": total_time_saved / 60,
+ # Backward compatibility aliases for CLI
+ "time_saved_seconds": total_time_saved,
+ "time_saved_minutes": total_time_saved / 60,
+ "estimated_api_calls_avoided": total_issues + total_prs,
+ # Storage metrics
+ "database_size_mb": db_size_mb,
+ "session_duration_seconds": session_duration,
+ "avg_entries_per_mb": (total_commits + total_prs + total_issues)
+ / max(0.1, db_size_mb),
+ "total_bytes_cached": self.total_bytes_cached,
+ # Configuration
+ "ttl_hours": self.ttl_hours,
+ "batch_size": self.batch_size,
+ "debug_mode": self.debug_mode,
  }
+
  return stats
-
+
+ def print_cache_performance_summary(self) -> None:
+ """Print a user-friendly cache performance summary.
+
+ WHY: Users need visibility into cache performance to understand
+ why repeated runs are faster and to identify any caching issues.
+ This provides actionable insights into cache effectiveness.
+ """
+ stats = self.get_cache_stats()
+
+ print("📊 Cache Performance Summary")
+ print("─" * 50)
+
+ # Cache contents
+ print("📦 Cache Contents:")
+ print(
+ f" • Commits: {stats['cached_commits']:,} ({stats['fresh_commits']:,} fresh, {stats['stale_commits']:,} stale)"
+ )
+ print(
+ f" • Pull Requests: {stats['cached_prs']:,} ({stats['fresh_prs']:,} fresh, {stats['stale_prs']:,} stale)"
+ )
+ print(
+ f" • Issues: {stats['cached_issues']:,} ({stats['fresh_issues']:,} fresh, {stats['stale_issues']:,} stale)"
+ )
+
+ if stats["cached_jira_issues"] > 0:
+ print(f" ├─ JIRA: {stats['cached_jira_issues']:,} issues")
+ if stats["cached_github_issues"] > 0:
+ print(f" └─ GitHub: {stats['cached_github_issues']:,} issues")
+
+ # Performance metrics
+ if stats["total_requests"] > 0:
+ print("\n⚡ Session Performance:")
+ print(
+ f" • Cache Hit Rate: {stats['hit_rate_percent']:.1f}% ({stats['cache_hits']:,}/{stats['total_requests']:,})"
+ )
+
+ if stats["total_time_saved_minutes"] > 1:
+ print(f" • Time Saved: {stats['total_time_saved_minutes']:.1f} minutes")
+ else:
+ print(f" • Time Saved: {stats['total_time_saved_seconds']:.1f} seconds")
+
+ if stats["estimated_api_calls_avoided"] > 0:
+ print(f" • API Calls Avoided: {stats['estimated_api_calls_avoided']:,}")
+
+ # Bulk operation performance
+ if stats["bulk_operations_count"] > 0:
+ print("\n🚀 Bulk Operations:")
+ print(f" • Bulk Operations: {stats['bulk_operations_count']:,}")
+ print(f" • Avg Bulk Time: {stats['avg_bulk_operation_time']:.3f}s")
+ if stats["bulk_speedup_factor"] > 1:
+ print(
+ f" • Speedup Factor: {stats['bulk_speedup_factor']:.1f}x faster than single ops"
+ )
+ print(f" • Batch Size: {stats['batch_size']:,} items")
+
+ # Storage info
+ print("\n💾 Storage:")
+ print(f" • Database Size: {stats['database_size_mb']:.1f} MB")
+ print(f" • Cache TTL: {stats['ttl_hours']} hours")
+ print(f" • Overall Freshness: {stats['freshness_rate_percent']:.1f}%")
+
+ # Performance insights
+ if stats["hit_rate_percent"] > 80:
+ print(" ✅ Excellent cache performance!")
+ elif stats["hit_rate_percent"] > 50:
+ print(" 👍 Good cache performance")
+ elif stats["total_requests"] > 0:
+ print(" ⚠️ Consider clearing stale cache entries")
+
+ print()
+
+ def validate_cache(self) -> dict[str, Any]:
+ """Validate cache consistency and integrity.
+
+ WHY: Cache validation ensures data integrity and identifies issues
+ that could cause analysis errors or inconsistent results.
+
+ Returns:
+ Dictionary with validation results and issues found
+ """
+ validation_results = {
+ "is_valid": True,
+ "issues": [],
+ "warnings": [],
+ "stats": {},
+ }
+
+ with self.get_session() as session:
+ try:
+ # Check for missing required fields
+ commits_without_hash = (
+ session.query(CachedCommit).filter(CachedCommit.commit_hash.is_(None)).count()
+ )
+
+ if commits_without_hash > 0:
+ validation_results["issues"].append(
+ f"Found {commits_without_hash} cached commits without hash"
+ )
+ validation_results["is_valid"] = False
+
+ # Check for duplicate commits
+ from sqlalchemy import func
+
+ duplicates = (
+ session.query(
+ CachedCommit.repo_path,
+ CachedCommit.commit_hash,
+ func.count().label("count"),
+ )
+ .group_by(CachedCommit.repo_path, CachedCommit.commit_hash)
+ .having(func.count() > 1)
+ .all()
+ )
+
+ if duplicates:
+ validation_results["warnings"].append(
+ f"Found {len(duplicates)} duplicate commit entries"
+ )
+
+ # Check for very old entries (older than 2 * TTL)
+ very_old_cutoff = datetime.utcnow() - timedelta(hours=self.ttl_hours * 2)
+ very_old_count = (
+ session.query(CachedCommit)
+ .filter(CachedCommit.cached_at < very_old_cutoff)
+ .count()
+ )
+
+ if very_old_count > 0:
+ validation_results["warnings"].append(
+ f"Found {very_old_count} very old cache entries (older than {self.ttl_hours * 2}h)"
+ )
+
+ # Basic integrity checks
+ commits_with_negative_changes = (
+ session.query(CachedCommit)
+ .filter(
+ (CachedCommit.files_changed < 0)
+ | (CachedCommit.insertions < 0)
+ | (CachedCommit.deletions < 0)
+ )
+ .count()
+ )
+
+ if commits_with_negative_changes > 0:
+ validation_results["issues"].append(
+ f"Found {commits_with_negative_changes} commits with negative change counts"
+ )
+ validation_results["is_valid"] = False
+
+ # Statistics
+ validation_results["stats"] = {
+ "total_commits": session.query(CachedCommit).count(),
+ "duplicates": len(duplicates),
+ "very_old_entries": very_old_count,
+ "invalid_commits": commits_without_hash + commits_with_negative_changes,
+ }
+
+ except Exception as e:
+ validation_results["issues"].append(f"Validation error: {str(e)}")
+ validation_results["is_valid"] = False
+
+ return validation_results
+
+ def warm_cache(self, repo_paths: list[str], weeks: int = 12) -> dict[str, Any]:
+ """Pre-warm cache by analyzing all commits in repositories.
+
+ WHY: Cache warming ensures all commits are pre-analyzed and cached,
+ making subsequent runs much faster. This is especially useful for
+ CI/CD environments or when analyzing the same repositories repeatedly.
+
+ Args:
+ repo_paths: List of repository paths to warm cache for
+ weeks: Number of weeks of history to warm (default: 12)
+
+ Returns:
+ Dictionary with warming results and statistics
+ """
+ from datetime import datetime, timedelta
+
+ import git
+
+ from .progress import get_progress_service
+
+ warming_results = {
+ "repos_processed": 0,
+ "total_commits_found": 0,
+ "commits_cached": 0,
+ "commits_already_cached": 0,
+ "errors": [],
+ "duration_seconds": 0,
+ }
+
+ start_time = datetime.now()
+ cutoff_date = datetime.now() - timedelta(weeks=weeks)
+
+ try:
+ for repo_path in repo_paths:
+ try:
+ from pathlib import Path
+
+ repo_path_obj = Path(repo_path)
+ repo = git.Repo(repo_path)
+
+ # Get commits from the specified time period
+ commits = list(
+ repo.iter_commits(all=True, since=cutoff_date.strftime("%Y-%m-%d"))
+ )
+
+ warming_results["total_commits_found"] += len(commits)
+
+ # Check which commits are already cached
+ commit_hashes = [c.hexsha for c in commits]
+ cached_commits = self.get_cached_commits_bulk(str(repo_path_obj), commit_hashes)
+ already_cached = len(cached_commits)
+ to_analyze = len(commits) - already_cached
+
+ warming_results["commits_already_cached"] += already_cached
+
+ if to_analyze > 0:
+ # Use centralized progress service
+ progress = get_progress_service()
+
+ # Analyze uncached commits with progress bar
+ with progress.progress(
+ total=to_analyze,
+ description=f"Warming cache for {repo_path_obj.name}",
+ unit="commits",
+ leave=False,
+ ) as ctx:
+ new_commits = []
+ for commit in commits:
+ if commit.hexsha not in cached_commits:
+ # Basic commit analysis (minimal for cache warming)
+ commit_data = self._analyze_commit_minimal(
+ repo, commit, repo_path_obj
+ )
+ new_commits.append(commit_data)
+ progress.update(ctx, 1)
+
+ # Batch cache commits for efficiency
+ if len(new_commits) >= 100:
+ self.cache_commits_batch(str(repo_path_obj), new_commits)
+ warming_results["commits_cached"] += len(new_commits)
+ new_commits = []
+
+ # Cache remaining commits
+ if new_commits:
+ self.cache_commits_batch(str(repo_path_obj), new_commits)
+ warming_results["commits_cached"] += len(new_commits)
+
+ warming_results["repos_processed"] += 1
+
+ except Exception as e:
+ warming_results["errors"].append(f"Error processing {repo_path}: {str(e)}")
+
+ except Exception as e:
+ warming_results["errors"].append(f"General error during cache warming: {str(e)}")
+
+ warming_results["duration_seconds"] = (datetime.now() - start_time).total_seconds()
+ return warming_results
+
+ def _analyze_commit_minimal(
+ self, repo: git.Repo, commit: git.Commit, repo_path: Path
+ ) -> dict[str, Any]:
+ """Minimal commit analysis for cache warming.
+
+ WHY: Cache warming doesn't need full analysis complexity,
+ just enough data to populate the cache effectively.
+ """
+ # Basic commit data
+ commit_data = {
+ "hash": commit.hexsha,
+ "author_name": commit.author.name,
+ "author_email": commit.author.email,
+ "message": commit.message,
+ "timestamp": commit.committed_datetime,
+ "is_merge": len(commit.parents) > 1,
+ "files_changed": self._get_files_changed_count(commit),
+ "insertions": self._get_insertions_count(commit),
+ "deletions": self._get_deletions_count(commit),
+ "complexity_delta": 0.0, # Skip complexity calculation for warming
+ "story_points": None, # Skip story point extraction for warming
+ "ticket_references": [], # Skip ticket analysis for warming
+ }
+
+ # Try to get branch info (if available)
+ try:
+ branches = repo.git.branch("--contains", commit.hexsha).split("\n")
+ commit_data["branch"] = branches[0].strip("* ") if branches else "unknown"
+ except Exception:
+ commit_data["branch"] = "unknown"
+
+ return commit_data
+
  def _is_stale(self, cached_at: datetime) -> bool:
  """Check if cache entry is stale."""
  if self.ttl_hours == 0: # No expiration
  return False
  return cached_at < datetime.utcnow() - timedelta(hours=self.ttl_hours)
-
- def _commit_to_dict(self, commit: CachedCommit) -> Dict[str, Any]:
+
+ def _commit_to_dict(self, commit: CachedCommit) -> dict[str, Any]:
  """Convert CachedCommit to dictionary."""
  return {
- 'hash': commit.commit_hash,
- 'author_name': commit.author_name,
- 'author_email': commit.author_email,
- 'message': commit.message,
- 'timestamp': commit.timestamp,
- 'branch': commit.branch,
- 'is_merge': commit.is_merge,
- 'files_changed': commit.files_changed,
- 'insertions': commit.insertions,
- 'deletions': commit.deletions,
- 'complexity_delta': commit.complexity_delta,
- 'story_points': commit.story_points,
- 'ticket_references': commit.ticket_references or []
+ "hash": commit.commit_hash,
+ "author_name": commit.author_name,
+ "author_email": commit.author_email,
+ "message": commit.message,
+ "timestamp": commit.timestamp,
+ "branch": commit.branch,
+ "is_merge": commit.is_merge,
+ "files_changed": commit.files_changed,
+ "insertions": commit.insertions,
+ "deletions": commit.deletions,
+ "complexity_delta": commit.complexity_delta,
+ "story_points": commit.story_points,
+ "ticket_references": commit.ticket_references or [],
  }
-
- def _pr_to_dict(self, pr: PullRequestCache) -> Dict[str, Any]:
+
+ def _pr_to_dict(self, pr: PullRequestCache) -> dict[str, Any]:
  """Convert PullRequestCache to dictionary."""
  return {
- 'number': pr.pr_number,
- 'title': pr.title,
- 'description': pr.description,
- 'author': pr.author,
- 'created_at': pr.created_at,
- 'merged_at': pr.merged_at,
- 'story_points': pr.story_points,
- 'labels': pr.labels or [],
- 'commit_hashes': pr.commit_hashes or []
+ "number": pr.pr_number,
+ "title": pr.title,
+ "description": pr.description,
+ "author": pr.author,
+ "created_at": pr.created_at,
+ "merged_at": pr.merged_at,
+ "story_points": pr.story_points,
+ "labels": pr.labels or [],
+ "commit_hashes": pr.commit_hashes or [],
  }
-
- def _issue_to_dict(self, issue: IssueCache) -> Dict[str, Any]:
+
+ def _issue_to_dict(self, issue: IssueCache) -> dict[str, Any]:
  """Convert IssueCache to dictionary."""
  return {
- 'platform': issue.platform,
- 'id': issue.issue_id,
- 'project_key': issue.project_key,
- 'title': issue.title,
- 'description': issue.description,
- 'status': issue.status,
- 'assignee': issue.assignee,
- 'created_at': issue.created_at,
- 'updated_at': issue.updated_at,
- 'resolved_at': issue.resolved_at,
- 'story_points': issue.story_points,
- 'labels': issue.labels or [],
- 'platform_data': issue.platform_data or {}
- }
+ "platform": issue.platform,
+ "id": issue.issue_id,
+ "project_key": issue.project_key,
+ "title": issue.title,
+ "description": issue.description,
+ "status": issue.status,
+ "assignee": issue.assignee,
+ "created_at": issue.created_at,
+ "updated_at": issue.updated_at,
+ "resolved_at": issue.resolved_at,
+ "story_points": issue.story_points,
+ "labels": issue.labels or [],
+ "platform_data": issue.platform_data or {},
+ }
+
+ def bulk_store_commits(self, repo_path: str, commits: list[dict[str, Any]]) -> dict[str, Any]:
+ """Store multiple commits using SQLAlchemy bulk operations for maximum performance.
+
+ WHY: This method uses SQLAlchemy's bulk_insert_mappings which is significantly
+ faster than individual inserts. It's designed for initial data loading where
+ we know commits don't exist yet in the cache.
+
+ DESIGN DECISION: Unlike cache_commits_batch which handles updates, this method
+ only inserts new commits. Use this when you know commits are not in cache.
+
+ Args:
+ repo_path: Repository path
+ commits: List of commit dictionaries to store
+
+ Returns:
+ Dictionary with operation statistics
+ """
+ if not commits:
+ return {"inserted": 0, "time_seconds": 0}
+
+ import time
+
+ start_time = time.time()
+
+ # Prepare mappings for bulk insert
+ mappings = []
+ for commit_data in commits:
+ mapping = {
+ "repo_path": repo_path,
+ "commit_hash": commit_data["hash"],
+ "author_name": commit_data.get("author_name"),
+ "author_email": commit_data.get("author_email"),
+ "message": commit_data.get("message"),
+ "timestamp": commit_data.get("timestamp"),
+ "branch": commit_data.get("branch"),
+ "is_merge": commit_data.get("is_merge", False),
+ "files_changed": commit_data.get(
+ "files_changed_count",
+ (
+ commit_data.get("files_changed", 0)
+ if isinstance(commit_data.get("files_changed"), int)
+ else len(commit_data.get("files_changed", []))
+ ),
+ ),
+ "insertions": commit_data.get("insertions", 0),
+ "deletions": commit_data.get("deletions", 0),
+ "complexity_delta": commit_data.get("complexity_delta", 0.0),
+ "story_points": commit_data.get("story_points"),
+ "ticket_references": commit_data.get("ticket_references", []),
+ "cached_at": datetime.utcnow(),
+ }
+ mappings.append(mapping)
+
+ # Process in configurable batch sizes for memory efficiency
+ inserted_count = 0
+ with self.get_session() as session:
+ for i in range(0, len(mappings), self.batch_size):
+ batch = mappings[i : i + self.batch_size]
+ try:
+ session.bulk_insert_mappings(CachedCommit, batch)
+ inserted_count += len(batch)
+ except Exception as e:
+ # On error, fall back to individual inserts for this batch
+ logger.warning(f"Bulk insert failed, falling back to individual inserts: {e}")
+ session.rollback() # Important: rollback failed transaction
+
+ for mapping in batch:
+ try:
+ # Create new record
+ new_commit = CachedCommit(**mapping)
+ session.add(new_commit)
+ session.flush() # Try to save this individual record
+ inserted_count += 1
+ except Exception:
+ # Skip duplicate commits silently
+ session.rollback() # Rollback this specific failure
+ continue
+
+ elapsed = time.time() - start_time
+ self.bulk_operations_count += 1
+ self.bulk_operations_time += elapsed
+
+ if self.debug_mode:
+ rate = inserted_count / elapsed if elapsed > 0 else 0
+ print(
+ f"DEBUG: Bulk stored {inserted_count} commits in {elapsed:.3f}s ({rate:.0f} commits/sec)"
+ )
+
+ return {
+ "inserted": inserted_count,
+ "time_seconds": elapsed,
+ "commits_per_second": inserted_count / elapsed if elapsed > 0 else 0,
+ }
+
+ def bulk_update_commits(self, repo_path: str, commits: list[dict[str, Any]]) -> dict[str, Any]:
+ """Update multiple commits efficiently using bulk operations.
+
+ WHY: Bulk updates are faster than individual updates when modifying many
+ commits at once (e.g., after classification or enrichment).
+
+ Args:
+ repo_path: Repository path
+ commits: List of commit dictionaries with updates
+
+ Returns:
+ Dictionary with operation statistics
+ """
+ if not commits:
+ return {"updated": 0, "time_seconds": 0}
+
+ import time
+
+ start_time = time.time()
+
+ with self.get_session() as session:
+ # Get all commit hashes for bulk fetch
+ commit_hashes = [c["hash"] for c in commits]
+
+ # Bulk fetch existing commits to get their primary keys
+ existing = {
+ cached.commit_hash: cached
+ for cached in session.query(CachedCommit)
+ .filter(
+ and_(
+ CachedCommit.repo_path == repo_path,
+ CachedCommit.commit_hash.in_(commit_hashes),
+ )
+ )
+ .all()
+ }
+
+ # Prepare bulk update mappings with primary key
+ update_mappings = []
+ for commit_data in commits:
+ if commit_data["hash"] in existing:
+ cached_record = existing[commit_data["hash"]]
+ # Must include primary key for bulk_update_mappings
+ update_mapping = {"id": cached_record.id}
+
+ # Map commit data fields to database columns
+ field_mapping = {
+ "author_name": commit_data.get("author_name"),
+ "author_email": commit_data.get("author_email"),
+ "message": commit_data.get("message"),
+ "timestamp": commit_data.get("timestamp"),
+ "branch": commit_data.get("branch"),
+ "is_merge": commit_data.get("is_merge"),
+ "files_changed": commit_data.get(
+ "files_changed_count",
+ (
+ commit_data.get("files_changed", 0)
+ if isinstance(commit_data.get("files_changed"), int)
+ else len(commit_data.get("files_changed", []))
+ ),
+ ),
+ "insertions": commit_data.get("insertions"),
+ "deletions": commit_data.get("deletions"),
+ "complexity_delta": commit_data.get("complexity_delta"),
+ "story_points": commit_data.get("story_points"),
+ "ticket_references": commit_data.get("ticket_references"),
+ "cached_at": datetime.utcnow(),
+ }
+
+ # Only include non-None values in update
+ for key, value in field_mapping.items():
+ if value is not None:
+ update_mapping[key] = value
+
+ update_mappings.append(update_mapping)
+
+ # Perform bulk update
+ if update_mappings:
+ session.bulk_update_mappings(CachedCommit, update_mappings)
+
+ elapsed = time.time() - start_time
+ self.bulk_operations_count += 1
+ self.bulk_operations_time += elapsed
+
+ if self.debug_mode:
+ print(f"DEBUG: Bulk updated {len(update_mappings)} commits in {elapsed:.3f}s")
+
+ return {
+ "updated": len(update_mappings),
+ "time_seconds": elapsed,
+ "commits_per_second": len(update_mappings) / elapsed if elapsed > 0 else 0,
+ }
+
+ def bulk_exists(self, repo_path: str, commit_hashes: list[str]) -> dict[str, bool]:
+ """Check existence of multiple commits in a single query.
+
+ WHY: Checking existence of many commits individually is inefficient.
+ This method uses a single query to check all commits at once.
+
+ Args:
+ repo_path: Repository path
+ commit_hashes: List of commit hashes to check
+
+ Returns:
+ Dictionary mapping commit hash to existence boolean
+ """
+ if not commit_hashes:
+ return {}
+
+ with self.get_session() as session:
+ # Query for existing commits
+ existing = set(
+ row[0]
+ for row in session.query(CachedCommit.commit_hash)
+ .filter(
+ and_(
+ CachedCommit.repo_path == repo_path,
+ CachedCommit.commit_hash.in_(commit_hashes),
+ )
+ )
+ .all()
+ )
+
+ # Build result dictionary
+ return {hash: hash in existing for hash in commit_hashes}
+
+ def bulk_get_commits(
+ self, repo_path: str, commit_hashes: list[str], include_stale: bool = False
+ ) -> dict[str, dict[str, Any]]:
+ """Retrieve multiple commits with enhanced performance.
+
+ WHY: Enhanced version of get_cached_commits_bulk with better performance
+ characteristics and optional stale data inclusion.
+
+ Args:
+ repo_path: Repository path
+ commit_hashes: List of commit hashes to retrieve
+ include_stale: Whether to include stale entries (default: False)
+
+ Returns:
+ Dictionary mapping commit hash to commit data
+ """
+ if not commit_hashes:
+ return {}
+
+ import time
+
+ start_time = time.time()
+
+ # Process in batches to avoid query size limits
+ all_results = {}
+ for i in range(0, len(commit_hashes), self.batch_size):
+ batch_hashes = commit_hashes[i : i + self.batch_size]
+
+ with self.get_session() as session:
+ cached_results = (
+ session.query(CachedCommit)
+ .filter(
+ and_(
+ CachedCommit.repo_path == repo_path,
+ CachedCommit.commit_hash.in_(batch_hashes),
+ )
+ )
+ .all()
+ )
+
+ for cached in cached_results:
+ if include_stale or not self._is_stale(cached.cached_at):
+ all_results[cached.commit_hash] = self._commit_to_dict(cached)
+
+ # Track performance
+ elapsed = time.time() - start_time
+ hits = len(all_results)
+ misses = len(commit_hashes) - hits
+
+ self.cache_hits += hits
+ self.cache_misses += misses
+ self.bulk_operations_count += 1
+ self.bulk_operations_time += elapsed
+
+ if self.debug_mode:
+ hit_rate = (hits / len(commit_hashes)) * 100 if commit_hashes else 0
+ print(
+ f"DEBUG: Bulk get {hits}/{len(commit_hashes)} commits in {elapsed:.3f}s ({hit_rate:.1f}% hit rate)"
+ )
+
+ return all_results
+
+    def _get_files_changed_count(self, commit: git.Commit) -> int:
+        """Get the number of files changed using reliable git command."""
+        parent = commit.parents[0] if commit.parents else None
+
+        try:
+            repo = commit.repo
+            if parent:
+                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+            else:
+                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            file_count = 0
+            for line in diff_output.strip().split("\n"):
+                if line.strip() and "\t" in line:
+                    file_count += 1
+
+            return file_count
+        except Exception:
+            return 0
+
+    def _get_insertions_count(self, commit: git.Commit) -> int:
+        """Get the number of insertions using reliable git command."""
+        parent = commit.parents[0] if commit.parents else None
+
+        try:
+            repo = commit.repo
+            if parent:
+                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+            else:
+                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            total_insertions = 0
+            for line in diff_output.strip().split("\n"):
+                if not line.strip():
+                    continue
+
+                parts = line.split("\t")
+                if len(parts) >= 3:
+                    try:
+                        insertions = int(parts[0]) if parts[0] != "-" else 0
+                        total_insertions += insertions
+                    except ValueError:
+                        continue
+
+            return total_insertions
+        except Exception:
+            return 0
+
+    def _get_deletions_count(self, commit: git.Commit) -> int:
+        """Get the number of deletions using reliable git command."""
+        parent = commit.parents[0] if commit.parents else None
+
+        try:
+            repo = commit.repo
+            if parent:
+                diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
+            else:
+                diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
+
+            total_deletions = 0
+            for line in diff_output.strip().split("\n"):
+                if not line.strip():
+                    continue
+
+                parts = line.split("\t")
+                if len(parts) >= 3:
+                    try:
+                        deletions = int(parts[1]) if parts[1] != "-" else 0
+                        total_deletions += deletions
+                    except ValueError:
+                        continue
+
+            return total_deletions
+        except Exception:
+            return 0
+
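The three helpers above all parse `--numstat` output, which prints one tab-separated record per file: added lines, deleted lines, then the path, with `-` in the numeric columns for binary files. A standalone sketch of that parsing (the sample output is made up):

    # Each --numstat line is "<insertions>\t<deletions>\t<path>"; "-" marks binary files.
    sample = "10\t2\tsrc/app.py\n-\t-\tassets/logo.png\n3\t0\tREADME.md"

    files_changed = insertions = deletions = 0
    for line in sample.strip().split("\n"):
        parts = line.split("\t")
        if len(parts) < 3:
            continue
        files_changed += 1
        insertions += int(parts[0]) if parts[0] != "-" else 0
        deletions += int(parts[1]) if parts[1] != "-" else 0

    print(files_changed, insertions, deletions)  # 3 13 2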
+    def get_repository_analysis_status(
+        self,
+        repo_path: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        config_hash: Optional[str] = None,
+    ) -> Optional[dict[str, Any]]:
+        """Check if repository analysis is complete for the given period.
+
+        WHY: Enables "fetch once, report many" by tracking which repositories
+        have been fully analyzed. Prevents re-fetching Git data when generating
+        different report formats from the same cached data.
+
+        CRITICAL FIX: Now verifies actual commits exist in cache, not just metadata.
+        This prevents "Using cached data (X commits)" when commits aren't actually stored.
+
+        Args:
+            repo_path: Path to the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            config_hash: Optional hash of relevant configuration to detect changes
+
+        Returns:
+            Dictionary with analysis status or None if not found/incomplete
+        """
+        with self.get_session() as session:
+            status = (
+                session.query(RepositoryAnalysisStatus)
+                .filter(
+                    and_(
+                        RepositoryAnalysisStatus.repo_path == repo_path,
+                        RepositoryAnalysisStatus.analysis_start == analysis_start,
+                        RepositoryAnalysisStatus.analysis_end == analysis_end,
+                        RepositoryAnalysisStatus.status == "completed",
+                    )
+                )
+                .first()
+            )
+
+            if not status:
+                return None
+
+            # Check if configuration has changed (invalidates cache)
+            if config_hash and status.config_hash != config_hash:
+                return None
+
+            # CRITICAL FIX: Verify actual commits exist in the database
+            # Don't trust metadata if commits aren't actually stored
+            actual_commit_count = (
+                session.query(CachedCommit)
+                .filter(
+                    and_(
+                        CachedCommit.repo_path == repo_path,
+                        CachedCommit.timestamp >= analysis_start,
+                        CachedCommit.timestamp <= analysis_end,
+                        # Only count non-stale commits
+                        CachedCommit.cached_at
+                        >= datetime.utcnow() - timedelta(hours=self.ttl_hours),
+                    )
+                )
+                .count()
+            )
+
+            # If metadata says we have commits but no commits are actually stored,
+            # force a fresh fetch by returning None
+            if status.commit_count > 0 and actual_commit_count == 0:
+                if self.debug_mode:
+                    print(
+                        f"DEBUG: Metadata claims {status.commit_count} commits but found 0 in cache - forcing fresh fetch"
+                    )
+                # Log warning about inconsistent cache state
+                import logging
+
+                logger = logging.getLogger(__name__)
+                logger.warning(
+                    f"Cache inconsistency detected for {repo_path}: "
+                    f"metadata reports {status.commit_count} commits but "
+                    f"actual stored commits: {actual_commit_count}. Forcing fresh analysis."
+                )
+                return None
+
+            # Update the commit count to reflect actual stored commits
+            # This ensures the UI shows accurate information
+            status.commit_count = actual_commit_count
+
+            # Return status information
+            return {
+                "repo_path": status.repo_path,
+                "repo_name": status.repo_name,
+                "project_key": status.project_key,
+                "analysis_start": status.analysis_start,
+                "analysis_end": status.analysis_end,
+                "weeks_analyzed": status.weeks_analyzed,
+                "git_analysis_complete": status.git_analysis_complete,
+                "commit_count": actual_commit_count,  # Use verified count, not metadata
+                "pr_analysis_complete": status.pr_analysis_complete,
+                "pr_count": status.pr_count,
+                "ticket_analysis_complete": status.ticket_analysis_complete,
+                "ticket_count": status.ticket_count,
+                "identity_resolution_complete": status.identity_resolution_complete,
+                "unique_developers": status.unique_developers,
+                "last_updated": status.last_updated,
+                "processing_time_seconds": status.processing_time_seconds,
+                "cache_hit_rate_percent": status.cache_hit_rate_percent,
+                "config_hash": status.config_hash,
+            }
+
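A sketch of the "fetch once, report many" check this method enables (the `cache` instance, `current_config_hash`, and `run_full_analysis` are placeholders, not names from the package):

    from datetime import datetime, timedelta

    end = datetime.utcnow()
    start = end - timedelta(weeks=12)

    status = cache.get_repository_analysis_status(
        "/repos/my-service", start, end, config_hash=current_config_hash
    )
    if status is None:
        # Never analyzed, config changed, or the cache was inconsistent: re-fetch.
        run_full_analysis()
    else:
        print(f"Reusing cached analysis: {status['commit_count']} commits")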
+    def mark_repository_analysis_complete(
+        self,
+        repo_path: str,
+        repo_name: str,
+        project_key: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        weeks_analyzed: int,
+        commit_count: int = 0,
+        pr_count: int = 0,
+        ticket_count: int = 0,
+        unique_developers: int = 0,
+        processing_time_seconds: Optional[float] = None,
+        cache_hit_rate_percent: Optional[float] = None,
+        config_hash: Optional[str] = None,
+    ) -> None:
+        """Mark repository analysis as complete for the given period.
+
+        WHY: Records successful completion of repository analysis to enable
+        cache-first workflow. Subsequent runs can skip re-analysis and go
+        directly to report generation.
+
+        Args:
+            repo_path: Path to the repository
+            repo_name: Display name for the repository
+            project_key: Project key for the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            weeks_analyzed: Number of weeks analyzed
+            commit_count: Number of commits analyzed
+            pr_count: Number of pull requests analyzed
+            ticket_count: Number of tickets analyzed
+            unique_developers: Number of unique developers found
+            processing_time_seconds: Time taken for analysis
+            cache_hit_rate_percent: Cache hit rate during analysis
+            config_hash: Hash of relevant configuration
+        """
+        with self.get_session() as session:
+            # Check if status already exists
+            existing = (
+                session.query(RepositoryAnalysisStatus)
+                .filter(
+                    and_(
+                        RepositoryAnalysisStatus.repo_path == repo_path,
+                        RepositoryAnalysisStatus.analysis_start == analysis_start,
+                        RepositoryAnalysisStatus.analysis_end == analysis_end,
+                    )
+                )
+                .first()
+            )
+
+            if existing:
+                # Update existing record
+                existing.repo_name = repo_name
+                existing.project_key = project_key
+                existing.weeks_analyzed = weeks_analyzed
+                existing.git_analysis_complete = True
+                existing.commit_count = commit_count
+                existing.pr_analysis_complete = True
+                existing.pr_count = pr_count
+                existing.ticket_analysis_complete = True
+                existing.ticket_count = ticket_count
+                existing.identity_resolution_complete = True
+                existing.unique_developers = unique_developers
+                existing.processing_time_seconds = processing_time_seconds
+                existing.cache_hit_rate_percent = cache_hit_rate_percent
+                existing.config_hash = config_hash
+                existing.status = "completed"
+                existing.error_message = None
+                existing.last_updated = datetime.utcnow()
+            else:
+                # Create new record
+                status = RepositoryAnalysisStatus(
+                    repo_path=repo_path,
+                    repo_name=repo_name,
+                    project_key=project_key,
+                    analysis_start=analysis_start,
+                    analysis_end=analysis_end,
+                    weeks_analyzed=weeks_analyzed,
+                    git_analysis_complete=True,
+                    commit_count=commit_count,
+                    pr_analysis_complete=True,
+                    pr_count=pr_count,
+                    ticket_analysis_complete=True,
+                    ticket_count=ticket_count,
+                    identity_resolution_complete=True,
+                    unique_developers=unique_developers,
+                    processing_time_seconds=processing_time_seconds,
+                    cache_hit_rate_percent=cache_hit_rate_percent,
+                    config_hash=config_hash,
+                    status="completed",
+                )
+                session.add(status)
+
+    def mark_repository_analysis_failed(
+        self,
+        repo_path: str,
+        repo_name: str,
+        analysis_start: datetime,
+        analysis_end: datetime,
+        error_message: str,
+        config_hash: Optional[str] = None,
+    ) -> None:
+        """Mark repository analysis as failed.
+
+        Args:
+            repo_path: Path to the repository
+            repo_name: Display name for the repository
+            analysis_start: Start of the analysis period
+            analysis_end: End of the analysis period
+            error_message: Error message describing the failure
+            config_hash: Hash of relevant configuration
+        """
+        with self.get_session() as session:
+            # Check if status already exists
+            existing = (
+                session.query(RepositoryAnalysisStatus)
+                .filter(
+                    and_(
+                        RepositoryAnalysisStatus.repo_path == repo_path,
+                        RepositoryAnalysisStatus.analysis_start == analysis_start,
+                        RepositoryAnalysisStatus.analysis_end == analysis_end,
+                    )
+                )
+                .first()
+            )
+
+            if existing:
+                existing.repo_name = repo_name
+                existing.status = "failed"
+                existing.error_message = error_message
+                existing.config_hash = config_hash
+                existing.last_updated = datetime.utcnow()
+            else:
+                # Create new failed record
+                status = RepositoryAnalysisStatus(
+                    repo_path=repo_path,
+                    repo_name=repo_name,
+                    project_key="unknown",
+                    analysis_start=analysis_start,
+                    analysis_end=analysis_end,
+                    weeks_analyzed=0,
+                    status="failed",
+                    error_message=error_message,
+                    config_hash=config_hash,
+                )
+                session.add(status)
+
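The two methods above bracket an analysis run; a hedged sketch of how a caller might record the outcome (`analyze_repository`, `start`, `end`, and `current_config_hash` are placeholders, and `cache` is an assumed instance as in the earlier sketches):

    import time

    started = time.time()
    try:
        result = analyze_repository("/repos/my-service")  # placeholder analysis step
        cache.mark_repository_analysis_complete(
            repo_path="/repos/my-service",
            repo_name="my-service",
            project_key="SVC",
            analysis_start=start,
            analysis_end=end,
            weeks_analyzed=12,
            commit_count=result["commit_count"],
            processing_time_seconds=time.time() - started,
            config_hash=current_config_hash,
        )
    except Exception as exc:
        cache.mark_repository_analysis_failed(
            repo_path="/repos/my-service",
            repo_name="my-service",
            analysis_start=start,
            analysis_end=end,
            error_message=str(exc),
            config_hash=current_config_hash,
        )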
+    def clear_repository_analysis_status(
+        self, repo_path: Optional[str] = None, older_than_days: Optional[int] = None
+    ) -> int:
+        """Clear repository analysis status records.
+
+        WHY: Allows forcing re-analysis by clearing cached status records.
+        Used by --force-fetch flag and for cleanup of old status records.
+
+        Args:
+            repo_path: Specific repository path to clear (all repos if None)
+            older_than_days: Clear records older than N days (all if None)
+
+        Returns:
+            Number of records cleared
+        """
+        with self.get_session() as session:
+            query = session.query(RepositoryAnalysisStatus)
+
+            if repo_path:
+                query = query.filter(RepositoryAnalysisStatus.repo_path == repo_path)
+
+            if older_than_days:
+                cutoff = datetime.utcnow() - timedelta(days=older_than_days)
+                query = query.filter(RepositoryAnalysisStatus.last_updated < cutoff)
+
+            count = query.count()
+            query.delete()
+
+            return count
+
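Two illustrative calls matching the documented uses (a force-refresh path and periodic cleanup); as before, `cache` is an assumed instance and the paths are invented:

    # Force re-analysis of one repository (e.g. behind a --force-fetch flag).
    cleared = cache.clear_repository_analysis_status(repo_path="/repos/my-service")

    # Housekeeping: drop status records older than 90 days across all repositories.
    cleared_old = cache.clear_repository_analysis_status(older_than_days=90)
    print(f"Cleared {cleared + cleared_old} status records")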
+    def get_analysis_status_summary(self) -> dict[str, Any]:
+        """Get summary of repository analysis status records.
+
+        Returns:
+            Dictionary with summary statistics
+        """
+        with self.get_session() as session:
+            from sqlalchemy import func
+
+            # Count by status
+            status_counts = dict(
+                session.query(RepositoryAnalysisStatus.status, func.count().label("count"))
+                .group_by(RepositoryAnalysisStatus.status)
+                .all()
+            )
+
+            # Total records
+            total_records = session.query(RepositoryAnalysisStatus).count()
+
+            # Recent activity (last 7 days)
+            recent_cutoff = datetime.utcnow() - timedelta(days=7)
+            recent_completed = (
+                session.query(RepositoryAnalysisStatus)
+                .filter(
+                    and_(
+                        RepositoryAnalysisStatus.status == "completed",
+                        RepositoryAnalysisStatus.last_updated >= recent_cutoff,
+                    )
+                )
+                .count()
+            )
+
+            # Average processing time for completed analyses
+            avg_processing_time = (
+                session.query(func.avg(RepositoryAnalysisStatus.processing_time_seconds))
+                .filter(
+                    and_(
+                        RepositoryAnalysisStatus.status == "completed",
+                        RepositoryAnalysisStatus.processing_time_seconds.isnot(None),
+                    )
+                )
+                .scalar()
+            )
+
+            return {
+                "total_records": total_records,
+                "status_counts": status_counts,
+                "recent_completed": recent_completed,
+                "avg_processing_time_seconds": avg_processing_time,
+                "completed_count": status_counts.get("completed", 0),
+                "failed_count": status_counts.get("failed", 0),
+                "pending_count": status_counts.get("pending", 0),
+            }
+
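A small sketch of surfacing this summary, using only keys returned above (again with an assumed `cache` instance):

    summary = cache.get_analysis_status_summary()
    print(
        f"{summary['completed_count']} completed, {summary['failed_count']} failed "
        f"of {summary['total_records']} tracked analyses"
    )
    if summary["avg_processing_time_seconds"] is not None:
        print(f"Average processing time: {summary['avg_processing_time_seconds']:.1f}s")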
+    @staticmethod
+    def generate_config_hash(
+        branch_mapping_rules: Optional[dict] = None,
+        ticket_platforms: Optional[list] = None,
+        exclude_paths: Optional[list] = None,
+        ml_categorization_enabled: bool = False,
+        additional_config: Optional[dict] = None,
+    ) -> str:
+        """Generate MD5 hash of relevant configuration for cache invalidation.
+
+        WHY: Configuration changes can affect analysis results, so we need to
+        detect when cached analysis is no longer valid due to config changes.
+
+        Args:
+            branch_mapping_rules: Branch to project mapping rules
+            ticket_platforms: Allowed ticket platforms
+            exclude_paths: Paths to exclude from analysis
+            ml_categorization_enabled: Whether ML categorization is enabled
+            additional_config: Any additional configuration to include
+
+        Returns:
+            MD5 hash string representing the configuration
+        """
+        config_data = {
+            "branch_mapping_rules": branch_mapping_rules or {},
+            "ticket_platforms": sorted(ticket_platforms or []),
+            "exclude_paths": sorted(exclude_paths or []),
+            "ml_categorization_enabled": ml_categorization_enabled,
+            "additional_config": additional_config or {},
+        }
+
+        # Convert to JSON string with sorted keys for consistent hashing
+        config_json = json.dumps(config_data, sort_keys=True, default=str)
+
+        # Generate MD5 hash
+        return hashlib.md5(config_json.encode()).hexdigest()
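Finally, a sketch tying the config hash to the status check shown earlier; calling the staticmethod through the assumed `cache` instance avoids guessing the class name, and the configuration values are invented:

    current_config_hash = cache.generate_config_hash(
        branch_mapping_rules={"release/*": "RELEASE"},
        ticket_platforms=["jira", "github"],
        exclude_paths=["vendor/", "node_modules/"],
        ml_categorization_enabled=True,
    )

    # Any change to these settings produces a different hash, so a previously
    # "completed" status record no longer matches and analysis is re-run.
    status = cache.get_repository_analysis_status(
        "/repos/my-service", start, end, config_hash=current_config_hash
    )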