gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4158 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +905 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +444 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1285 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
gitflow_analytics/core/cache.py
CHANGED
@@ -1,23 +1,62 @@
 """Caching layer for Git analysis with SQLite backend."""
 
+import hashlib
+import json
+import logging
+import os
 from contextlib import contextmanager
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
+import git
 from sqlalchemy import and_
 
-from ..models.database import
+from ..models.database import (
+    CachedCommit,
+    Database,
+    IssueCache,
+    PullRequestCache,
+    RepositoryAnalysisStatus,
+)
+
+logger = logging.getLogger(__name__)
 
 
 class GitAnalysisCache:
     """Cache for Git analysis results."""
 
-    def __init__(
-
-
+    def __init__(
+        self, cache_dir: Union[Path, str], ttl_hours: int = 168, batch_size: int = 1000
+    ) -> None:
+        """Initialize cache with SQLite backend and configurable batch size.
+
+        WHY: Adding configurable batch size allows tuning for different repository
+        sizes and system capabilities. Default of 1000 balances memory usage with
+        performance gains from bulk operations.
+
+        Args:
+            cache_dir: Directory for cache database
+            ttl_hours: Time-to-live for cache entries in hours
+            batch_size: Default batch size for bulk operations (default: 1000)
+        """
+        self.cache_dir = Path(cache_dir)  # Ensure it's a Path object
         self.ttl_hours = ttl_hours
-        self.
+        self.batch_size = batch_size
+        self.db = Database(self.cache_dir / "gitflow_cache.db")
+
+        # Cache performance tracking with enhanced metrics
+        self.cache_hits = 0
+        self.cache_misses = 0
+        self.cache_start_time = datetime.now()
+        self.bulk_operations_count = 0
+        self.bulk_operations_time = 0.0
+        self.single_operations_count = 0
+        self.single_operations_time = 0.0
+        self.total_bytes_cached = 0
+
+        # Debug mode controlled by environment variable
+        self.debug_mode = os.getenv("GITFLOW_DEBUG", "").lower() in ("1", "true", "yes")
 
     @contextmanager
     def get_session(self) -> Any:
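The rewritten constructor above now accepts either a str or a Path for cache_dir, takes a batch_size for bulk operations, and gates its debug output on the GITFLOW_DEBUG environment variable. A minimal usage sketch based only on that signature; the cache directory path is a placeholder:

import os
from pathlib import Path

from gitflow_analytics.core.cache import GitAnalysisCache

# Assumption for illustration: enable the DEBUG prints shown in this diff.
os.environ["GITFLOW_DEBUG"] = "1"

# cache_dir may be a str or a Path; ttl_hours=168 keeps entries for a week,
# batch_size tunes how many rows the bulk helpers handle per round trip.
cache = GitAnalysisCache(Path(".gitflow-cache"), ttl_hours=168, batch_size=500)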
@@ -46,10 +85,65 @@ class GitAnalysisCache:
             )
 
             if cached and not self._is_stale(cached.cached_at):
+                self.cache_hits += 1
+                if self.debug_mode:
+                    print(f"DEBUG: Cache HIT for {commit_hash[:8]} in {repo_path}")
                 return self._commit_to_dict(cached)
 
+            self.cache_misses += 1
+            if self.debug_mode:
+                print(f"DEBUG: Cache MISS for {commit_hash[:8]} in {repo_path}")
             return None
 
+    def get_cached_commits_bulk(
+        self, repo_path: str, commit_hashes: list[str]
+    ) -> dict[str, dict[str, Any]]:
+        """Retrieve multiple cached commits in a single query.
+
+        WHY: Individual cache lookups are inefficient for large batches.
+        This method fetches multiple commits at once, reducing database overhead
+        and significantly improving performance for subsequent runs.
+
+        Args:
+            repo_path: Repository path for filtering
+            commit_hashes: List of commit hashes to look up
+
+        Returns:
+            Dictionary mapping commit hash to commit data (only non-stale entries)
+        """
+        if not commit_hashes:
+            return {}
+
+        cached_commits = {}
+        with self.get_session() as session:
+            cached_results = (
+                session.query(CachedCommit)
+                .filter(
+                    and_(
+                        CachedCommit.repo_path == repo_path,
+                        CachedCommit.commit_hash.in_(commit_hashes),
+                    )
+                )
+                .all()
+            )
+
+            for cached in cached_results:
+                if not self._is_stale(cached.cached_at):
+                    cached_commits[cached.commit_hash] = self._commit_to_dict(cached)
+
+        # Track cache performance
+        hits = len(cached_commits)
+        misses = len(commit_hashes) - hits
+        self.cache_hits += hits
+        self.cache_misses += misses
+
+        if self.debug_mode:
+            print(
+                f"DEBUG: Bulk cache lookup - {hits} hits, {misses} misses for {len(commit_hashes)} commits"
+            )
+
+        return cached_commits
+
     def cache_commit(self, repo_path: str, commit_data: dict[str, Any]) -> None:
         """Cache commit analysis results."""
         with self.get_session() as session:
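The get_cached_commits_bulk method added above replaces per-commit lookups with a single query. A short sketch of how a caller might use it, assuming a cache instance constructed as shown earlier; the repository path and hashes are placeholders:

repo_path = "/path/to/repo"      # placeholder
hashes = ["a" * 40, "b" * 40]    # placeholder commit hashes

# One query for the whole batch; only non-stale entries come back.
cached = cache.get_cached_commits_bulk(repo_path, hashes)

missing = [h for h in hashes if h not in cached]
for h in missing:
    pass  # analyze these commits and cache them, e.g. via cache_commits_batch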
@@ -82,7 +176,14 @@ class GitAnalysisCache:
                 timestamp=commit_data.get("timestamp"),
                 branch=commit_data.get("branch"),
                 is_merge=commit_data.get("is_merge", False),
-                files_changed=commit_data.get(
+                files_changed=commit_data.get(
+                    "files_changed_count",
+                    (
+                        commit_data.get("files_changed", 0)
+                        if isinstance(commit_data.get("files_changed"), int)
+                        else len(commit_data.get("files_changed", []))
+                    ),
+                ),
                 insertions=commit_data.get("insertions", 0),
                 deletions=commit_data.get("deletions", 0),
                 complexity_delta=commit_data.get("complexity_delta", 0.0),
@@ -92,23 +193,43 @@ class GitAnalysisCache:
             session.add(cached_commit)
 
     def cache_commits_batch(self, repo_path: str, commits: list[dict[str, Any]]) -> None:
-        """Cache multiple commits in a single transaction.
+        """Cache multiple commits in a single transaction.
+
+        WHY: Optimized batch caching reduces database overhead by using
+        bulk queries to check for existing commits instead of individual lookups.
+        This significantly improves performance when caching large batches.
+        """
+        if not commits:
+            return
+
+        import time
+
+        start_time = time.time()
+
         with self.get_session() as session:
-
-
-
-
-
-
-
-
-
+            # Get all commit hashes in this batch
+            commit_hashes = [commit_data["hash"] for commit_data in commits]
+
+            # Bulk fetch existing commits
+            existing_commits = {
+                cached.commit_hash: cached
+                for cached in session.query(CachedCommit)
+                .filter(
+                    and_(
+                        CachedCommit.repo_path == repo_path,
+                        CachedCommit.commit_hash.in_(commit_hashes),
                     )
-                .first()
                 )
+                .all()
+            }
+
+            # Process each commit
+            for commit_data in commits:
+                commit_hash = commit_data["hash"]
 
-            if
+                if commit_hash in existing_commits:
                     # Update existing
+                    existing = existing_commits[commit_hash]
                     for key, value in commit_data.items():
                         if key != "hash" and hasattr(existing, key):
                             setattr(existing, key, value)
@@ -124,7 +245,14 @@ class GitAnalysisCache:
                         timestamp=commit_data.get("timestamp"),
                         branch=commit_data.get("branch"),
                         is_merge=commit_data.get("is_merge", False),
-                        files_changed=commit_data.get(
+                        files_changed=commit_data.get(
+                            "files_changed_count",
+                            (
+                                commit_data.get("files_changed", 0)
+                                if isinstance(commit_data.get("files_changed"), int)
+                                else len(commit_data.get("files_changed", []))
+                            ),
+                        ),
                         insertions=commit_data.get("insertions", 0),
                         deletions=commit_data.get("deletions", 0),
                         complexity_delta=commit_data.get("complexity_delta", 0.0),
@@ -133,6 +261,14 @@ class GitAnalysisCache:
                     )
                     session.add(cached_commit)
 
+        # Track performance metrics
+        elapsed = time.time() - start_time
+        self.bulk_operations_count += 1
+        self.bulk_operations_time += elapsed
+
+        if self.debug_mode:
+            print(f"DEBUG: Bulk cached {len(commits)} commits in {elapsed:.3f}s")
+
     def get_cached_pr(self, repo_path: str, pr_number: int) -> Optional[dict[str, Any]]:
         """Retrieve cached pull request data."""
         with self.get_session() as session:
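The repeated files_changed normalization in the hunks above means the batch writer accepts either an integer count (files_changed_count / files_changed) or a list of changed paths. A sketch of calling cache_commits_batch with both shapes; the commit dictionaries are illustrative, not taken from the diff:

from datetime import datetime, timezone

commits = [
    {
        "hash": "c" * 40,
        "author_name": "Dev One",
        "author_email": "dev.one@example.com",
        "message": "fix: handle empty diff output",
        "timestamp": datetime.now(timezone.utc),
        "is_merge": False,
        "files_changed": 3,                                 # already an int count
        "insertions": 12,
        "deletions": 4,
    },
    {
        "hash": "d" * 40,
        "author_name": "Dev Two",
        "author_email": "dev.two@example.com",
        "message": "feat: add weekly report",
        "timestamp": datetime.now(timezone.utc),
        "is_merge": False,
        "files_changed": ["reports/weekly.py", "cli.py"],   # a list is stored as len(...)
        "insertions": 80,
        "deletions": 2,
    },
]

# Existing rows are updated, new rows inserted, all in one transaction.
cache.cache_commits_batch("/path/to/repo", commits)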
@@ -272,21 +408,459 @@ class GitAnalysisCache:
 
             session.query(IssueCache).filter(IssueCache.cached_at < cutoff_time).delete()
 
-
-
+            # Also clear stale repository analysis status
+            session.query(RepositoryAnalysisStatus).filter(
+                RepositoryAnalysisStatus.last_updated < cutoff_time
+            ).delete()
+
+    def clear_all_cache(self) -> dict[str, int]:
+        """Clear all cache entries including repository analysis status.
+
+        WHY: Used by --clear-cache flag to force complete re-analysis.
+        Returns counts of cleared entries for user feedback.
+
+        Returns:
+            Dictionary with counts of cleared entries by type
+        """
         with self.get_session() as session:
+            # Count before clearing
+            commit_count = session.query(CachedCommit).count()
+            pr_count = session.query(PullRequestCache).count()
+            issue_count = session.query(IssueCache).count()
+            status_count = session.query(RepositoryAnalysisStatus).count()
+
+            # Clear all entries
+            session.query(CachedCommit).delete()
+            session.query(PullRequestCache).delete()
+            session.query(IssueCache).delete()
+            session.query(RepositoryAnalysisStatus).delete()
+
+            return {
+                "commits": commit_count,
+                "pull_requests": pr_count,
+                "issues": issue_count,
+                "repository_status": status_count,
+                "total": commit_count + pr_count + issue_count + status_count,
+            }
+
+    def get_cache_stats(self) -> dict[str, Any]:
+        """Get comprehensive cache statistics including external API cache performance."""
+        with self.get_session() as session:
+            # Basic counts
+            total_commits = session.query(CachedCommit).count()
+            total_prs = session.query(PullRequestCache).count()
+            total_issues = session.query(IssueCache).count()
+
+            # Platform-specific issue counts
+            jira_issues = session.query(IssueCache).filter(IssueCache.platform == "jira").count()
+            github_issues = (
+                session.query(IssueCache).filter(IssueCache.platform == "github").count()
+            )
+
+            # Stale entries
+            cutoff_time = datetime.utcnow() - timedelta(hours=self.ttl_hours)
+            stale_commits = (
+                session.query(CachedCommit).filter(CachedCommit.cached_at < cutoff_time).count()
+            )
+            stale_prs = (
+                session.query(PullRequestCache)
+                .filter(PullRequestCache.cached_at < cutoff_time)
+                .count()
+            )
+            stale_issues = (
+                session.query(IssueCache).filter(IssueCache.cached_at < cutoff_time).count()
+            )
+
+            # Performance metrics
+            total_requests = self.cache_hits + self.cache_misses
+            hit_rate = (self.cache_hits / total_requests * 100) if total_requests > 0 else 0
+
+            # Bulk vs Single operation performance
+            avg_bulk_time = (
+                self.bulk_operations_time / self.bulk_operations_count
+                if self.bulk_operations_count > 0
+                else 0
+            )
+            avg_single_time = (
+                self.single_operations_time / self.single_operations_count
+                if self.single_operations_count > 0
+                else 0
+            )
+            bulk_speedup = (
+                avg_single_time / avg_bulk_time if avg_bulk_time > 0 and avg_single_time > 0 else 0
+            )
+
+            # Estimated time savings (conservative estimates)
+            commit_time_saved = self.cache_hits * 0.1  # 0.1 seconds per commit analysis
+            api_time_saved = (total_issues * 0.5) + (total_prs * 0.3)  # API call time savings
+            bulk_time_saved = (
+                self.bulk_operations_count * 2.0
+            )  # Estimated 2 seconds saved per bulk op
+            total_time_saved = commit_time_saved + api_time_saved + bulk_time_saved
+
+            # Database file size
+            db_file = self.cache_dir / "gitflow_cache.db"
+            db_size_mb = db_file.stat().st_size / (1024 * 1024) if db_file.exists() else 0
+
+            # Session duration
+            session_duration = (datetime.now() - self.cache_start_time).total_seconds()
+
+            # Cache efficiency metrics
+            fresh_commits = total_commits - stale_commits
+            fresh_prs = total_prs - stale_prs
+            fresh_issues = total_issues - stale_issues
+            total_fresh_entries = fresh_commits + fresh_prs + fresh_issues
+
             stats = {
-
-            "
-            "
-            "
-
-
+                # Counts by type
+                "cached_commits": total_commits,
+                "cached_prs": total_prs,
+                "cached_issues": total_issues,
+                "cached_jira_issues": jira_issues,
+                "cached_github_issues": github_issues,
+                # Freshness analysis
+                "stale_commits": stale_commits,
+                "stale_prs": stale_prs,
+                "stale_issues": stale_issues,
+                "fresh_commits": fresh_commits,
+                "fresh_prs": fresh_prs,
+                "fresh_issues": fresh_issues,
+                "total_fresh_entries": total_fresh_entries,
+                "freshness_rate_percent": (
+                    total_fresh_entries / max(1, total_commits + total_prs + total_issues)
                 )
-
+                * 100,
+                # Performance metrics
+                "cache_hits": self.cache_hits,
+                "cache_misses": self.cache_misses,
+                "total_requests": total_requests,
+                "hit_rate_percent": hit_rate,
+                # Bulk operation metrics
+                "bulk_operations_count": self.bulk_operations_count,
+                "bulk_operations_time_seconds": self.bulk_operations_time,
+                "avg_bulk_operation_time": avg_bulk_time,
+                "single_operations_count": self.single_operations_count,
+                "single_operations_time_seconds": self.single_operations_time,
+                "avg_single_operation_time": avg_single_time,
+                "bulk_speedup_factor": bulk_speedup,
+                # Time savings
+                "commit_analysis_time_saved_seconds": commit_time_saved,
+                "api_call_time_saved_seconds": api_time_saved,
+                "bulk_operations_time_saved_seconds": bulk_time_saved,
+                "total_time_saved_seconds": total_time_saved,
+                "total_time_saved_minutes": total_time_saved / 60,
+                # Backward compatibility aliases for CLI
+                "time_saved_seconds": total_time_saved,
+                "time_saved_minutes": total_time_saved / 60,
+                "estimated_api_calls_avoided": total_issues + total_prs,
+                # Storage metrics
+                "database_size_mb": db_size_mb,
+                "session_duration_seconds": session_duration,
+                "avg_entries_per_mb": (total_commits + total_prs + total_issues)
+                / max(0.1, db_size_mb),
+                "total_bytes_cached": self.total_bytes_cached,
+                # Configuration
+                "ttl_hours": self.ttl_hours,
+                "batch_size": self.batch_size,
+                "debug_mode": self.debug_mode,
             }
+
         return stats
 
+    def print_cache_performance_summary(self) -> None:
+        """Print a user-friendly cache performance summary.
+
+        WHY: Users need visibility into cache performance to understand
+        why repeated runs are faster and to identify any caching issues.
+        This provides actionable insights into cache effectiveness.
+        """
+        stats = self.get_cache_stats()
+
+        print("📊 Cache Performance Summary")
+        print("─" * 50)
+
+        # Cache contents
+        print("📦 Cache Contents:")
+        print(
+            f" • Commits: {stats['cached_commits']:,} ({stats['fresh_commits']:,} fresh, {stats['stale_commits']:,} stale)"
+        )
+        print(
+            f" • Pull Requests: {stats['cached_prs']:,} ({stats['fresh_prs']:,} fresh, {stats['stale_prs']:,} stale)"
+        )
+        print(
+            f" • Issues: {stats['cached_issues']:,} ({stats['fresh_issues']:,} fresh, {stats['stale_issues']:,} stale)"
+        )
+
+        if stats["cached_jira_issues"] > 0:
+            print(f" ├─ JIRA: {stats['cached_jira_issues']:,} issues")
+        if stats["cached_github_issues"] > 0:
+            print(f" └─ GitHub: {stats['cached_github_issues']:,} issues")
+
+        # Performance metrics
+        if stats["total_requests"] > 0:
+            print("\n⚡ Session Performance:")
+            print(
+                f" • Cache Hit Rate: {stats['hit_rate_percent']:.1f}% ({stats['cache_hits']:,}/{stats['total_requests']:,})"
+            )
+
+            if stats["total_time_saved_minutes"] > 1:
+                print(f" • Time Saved: {stats['total_time_saved_minutes']:.1f} minutes")
+            else:
+                print(f" • Time Saved: {stats['total_time_saved_seconds']:.1f} seconds")
+
+            if stats["estimated_api_calls_avoided"] > 0:
+                print(f" • API Calls Avoided: {stats['estimated_api_calls_avoided']:,}")
+
+        # Bulk operation performance
+        if stats["bulk_operations_count"] > 0:
+            print("\n🚀 Bulk Operations:")
+            print(f" • Bulk Operations: {stats['bulk_operations_count']:,}")
+            print(f" • Avg Bulk Time: {stats['avg_bulk_operation_time']:.3f}s")
+            if stats["bulk_speedup_factor"] > 1:
+                print(
+                    f" • Speedup Factor: {stats['bulk_speedup_factor']:.1f}x faster than single ops"
+                )
+            print(f" • Batch Size: {stats['batch_size']:,} items")
+
+        # Storage info
+        print("\n💾 Storage:")
+        print(f" • Database Size: {stats['database_size_mb']:.1f} MB")
+        print(f" • Cache TTL: {stats['ttl_hours']} hours")
+        print(f" • Overall Freshness: {stats['freshness_rate_percent']:.1f}%")
+
+        # Performance insights
+        if stats["hit_rate_percent"] > 80:
+            print(" ✅ Excellent cache performance!")
+        elif stats["hit_rate_percent"] > 50:
+            print(" 👍 Good cache performance")
+        elif stats["total_requests"] > 0:
+            print(" ⚠️ Consider clearing stale cache entries")
+
+        print()
+
+    def validate_cache(self) -> dict[str, Any]:
+        """Validate cache consistency and integrity.
+
+        WHY: Cache validation ensures data integrity and identifies issues
+        that could cause analysis errors or inconsistent results.
+
+        Returns:
+            Dictionary with validation results and issues found
+        """
+        validation_results = {
+            "is_valid": True,
+            "issues": [],
+            "warnings": [],
+            "stats": {},
+        }
+
+        with self.get_session() as session:
+            try:
+                # Check for missing required fields
+                commits_without_hash = (
+                    session.query(CachedCommit).filter(CachedCommit.commit_hash.is_(None)).count()
+                )
+
+                if commits_without_hash > 0:
+                    validation_results["issues"].append(
+                        f"Found {commits_without_hash} cached commits without hash"
+                    )
+                    validation_results["is_valid"] = False
+
+                # Check for duplicate commits
+                from sqlalchemy import func
+
+                duplicates = (
+                    session.query(
+                        CachedCommit.repo_path,
+                        CachedCommit.commit_hash,
+                        func.count().label("count"),
+                    )
+                    .group_by(CachedCommit.repo_path, CachedCommit.commit_hash)
+                    .having(func.count() > 1)
+                    .all()
+                )
+
+                if duplicates:
+                    validation_results["warnings"].append(
+                        f"Found {len(duplicates)} duplicate commit entries"
+                    )
+
+                # Check for very old entries (older than 2 * TTL)
+                very_old_cutoff = datetime.utcnow() - timedelta(hours=self.ttl_hours * 2)
+                very_old_count = (
+                    session.query(CachedCommit)
+                    .filter(CachedCommit.cached_at < very_old_cutoff)
+                    .count()
+                )
+
+                if very_old_count > 0:
+                    validation_results["warnings"].append(
+                        f"Found {very_old_count} very old cache entries (older than {self.ttl_hours * 2}h)"
+                    )
+
+                # Basic integrity checks
+                commits_with_negative_changes = (
+                    session.query(CachedCommit)
+                    .filter(
+                        (CachedCommit.files_changed < 0)
+                        | (CachedCommit.insertions < 0)
+                        | (CachedCommit.deletions < 0)
+                    )
+                    .count()
+                )
+
+                if commits_with_negative_changes > 0:
+                    validation_results["issues"].append(
+                        f"Found {commits_with_negative_changes} commits with negative change counts"
+                    )
+                    validation_results["is_valid"] = False
+
+                # Statistics
+                validation_results["stats"] = {
+                    "total_commits": session.query(CachedCommit).count(),
+                    "duplicates": len(duplicates),
+                    "very_old_entries": very_old_count,
+                    "invalid_commits": commits_without_hash + commits_with_negative_changes,
+                }
+
+            except Exception as e:
+                validation_results["issues"].append(f"Validation error: {str(e)}")
+                validation_results["is_valid"] = False
+
+        return validation_results
+
+    def warm_cache(self, repo_paths: list[str], weeks: int = 12) -> dict[str, Any]:
+        """Pre-warm cache by analyzing all commits in repositories.
+
+        WHY: Cache warming ensures all commits are pre-analyzed and cached,
+        making subsequent runs much faster. This is especially useful for
+        CI/CD environments or when analyzing the same repositories repeatedly.
+
+        Args:
+            repo_paths: List of repository paths to warm cache for
+            weeks: Number of weeks of history to warm (default: 12)
+
+        Returns:
+            Dictionary with warming results and statistics
+        """
+        from datetime import datetime, timedelta
+
+        import git
+
+        from .progress import get_progress_service
+
+        warming_results = {
+            "repos_processed": 0,
+            "total_commits_found": 0,
+            "commits_cached": 0,
+            "commits_already_cached": 0,
+            "errors": [],
+            "duration_seconds": 0,
+        }
+
+        start_time = datetime.now()
+        cutoff_date = datetime.now() - timedelta(weeks=weeks)
+
+        try:
+            for repo_path in repo_paths:
+                try:
+                    from pathlib import Path
+
+                    repo_path_obj = Path(repo_path)
+                    repo = git.Repo(repo_path)
+
+                    # Get commits from the specified time period
+                    commits = list(
+                        repo.iter_commits(all=True, since=cutoff_date.strftime("%Y-%m-%d"))
+                    )
+
+                    warming_results["total_commits_found"] += len(commits)
+
+                    # Check which commits are already cached
+                    commit_hashes = [c.hexsha for c in commits]
+                    cached_commits = self.get_cached_commits_bulk(str(repo_path_obj), commit_hashes)
+                    already_cached = len(cached_commits)
+                    to_analyze = len(commits) - already_cached
+
+                    warming_results["commits_already_cached"] += already_cached
+
+                    if to_analyze > 0:
+                        # Use centralized progress service
+                        progress = get_progress_service()
+
+                        # Analyze uncached commits with progress bar
+                        with progress.progress(
+                            total=to_analyze,
+                            description=f"Warming cache for {repo_path_obj.name}",
+                            unit="commits",
+                            leave=False,
+                        ) as ctx:
+                            new_commits = []
+                            for commit in commits:
+                                if commit.hexsha not in cached_commits:
+                                    # Basic commit analysis (minimal for cache warming)
+                                    commit_data = self._analyze_commit_minimal(
+                                        repo, commit, repo_path_obj
+                                    )
+                                    new_commits.append(commit_data)
+                                    progress.update(ctx, 1)
+
+                                    # Batch cache commits for efficiency
+                                    if len(new_commits) >= 100:
+                                        self.cache_commits_batch(str(repo_path_obj), new_commits)
+                                        warming_results["commits_cached"] += len(new_commits)
+                                        new_commits = []
+
+                            # Cache remaining commits
+                            if new_commits:
+                                self.cache_commits_batch(str(repo_path_obj), new_commits)
+                                warming_results["commits_cached"] += len(new_commits)
+
+                    warming_results["repos_processed"] += 1
+
+                except Exception as e:
+                    warming_results["errors"].append(f"Error processing {repo_path}: {str(e)}")
+
+        except Exception as e:
+            warming_results["errors"].append(f"General error during cache warming: {str(e)}")
+
+        warming_results["duration_seconds"] = (datetime.now() - start_time).total_seconds()
+        return warming_results
+
+    def _analyze_commit_minimal(
+        self, repo: git.Repo, commit: git.Commit, repo_path: Path
+    ) -> dict[str, Any]:
+        """Minimal commit analysis for cache warming.
+
+        WHY: Cache warming doesn't need full analysis complexity,
+        just enough data to populate the cache effectively.
+        """
+        # Basic commit data
+        commit_data = {
+            "hash": commit.hexsha,
+            "author_name": commit.author.name,
+            "author_email": commit.author.email,
+            "message": commit.message,
+            "timestamp": commit.committed_datetime,
+            "is_merge": len(commit.parents) > 1,
+            "files_changed": self._get_files_changed_count(commit),
+            "insertions": self._get_insertions_count(commit),
+            "deletions": self._get_deletions_count(commit),
+            "complexity_delta": 0.0,  # Skip complexity calculation for warming
+            "story_points": None,  # Skip story point extraction for warming
+            "ticket_references": [],  # Skip ticket analysis for warming
+        }
+
+        # Try to get branch info (if available)
+        try:
+            branches = repo.git.branch("--contains", commit.hexsha).split("\n")
+            commit_data["branch"] = branches[0].strip("* ") if branches else "unknown"
+        except Exception:
+            commit_data["branch"] = "unknown"
+
+        return commit_data
+
     def _is_stale(self, cached_at: datetime) -> bool:
         """Check if cache entry is stale."""
         if self.ttl_hours == 0:  # No expiration
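The hunk above introduces cache maintenance and warming APIs. A sketch of how they might be combined, using only the method names defined in the diff and placeholder repository paths:

# Pre-populate the cache so later runs mostly hit it.
warmed = cache.warm_cache(["/path/to/repo"], weeks=12)
print(warmed["commits_cached"], warmed["commits_already_cached"], warmed["errors"])

# Inspect effectiveness and integrity.
stats = cache.get_cache_stats()
print(f"hit rate: {stats['hit_rate_percent']:.1f}%, size: {stats['database_size_mb']:.1f} MB")
cache.print_cache_performance_summary()

report = cache.validate_cache()
if not report["is_valid"]:
    # Equivalent to what the --clear-cache flag uses, per the docstring above.
    cleared = cache.clear_all_cache()
    print(f"cleared {cleared['total']} cache entries")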
@@ -342,3 +916,737 @@ class GitAnalysisCache:
|
|
|
342
916
|
"labels": issue.labels or [],
|
|
343
917
|
"platform_data": issue.platform_data or {},
|
|
344
918
|
}
|
|
919
|
+
|
|
920
|
+
def bulk_store_commits(self, repo_path: str, commits: list[dict[str, Any]]) -> dict[str, Any]:
|
|
921
|
+
"""Store multiple commits using SQLAlchemy bulk operations for maximum performance.
|
|
922
|
+
|
|
923
|
+
WHY: This method uses SQLAlchemy's bulk_insert_mappings which is significantly
|
|
924
|
+
faster than individual inserts. It's designed for initial data loading where
|
|
925
|
+
we know commits don't exist yet in the cache.
|
|
926
|
+
|
|
927
|
+
DESIGN DECISION: Unlike cache_commits_batch which handles updates, this method
|
|
928
|
+
only inserts new commits. Use this when you know commits are not in cache.
|
|
929
|
+
|
|
930
|
+
Args:
|
|
931
|
+
repo_path: Repository path
|
|
932
|
+
commits: List of commit dictionaries to store
|
|
933
|
+
|
|
934
|
+
Returns:
|
|
935
|
+
Dictionary with operation statistics
|
|
936
|
+
"""
|
|
937
|
+
if not commits:
|
|
938
|
+
return {"inserted": 0, "time_seconds": 0}
|
|
939
|
+
|
|
940
|
+
import time
|
|
941
|
+
|
|
942
|
+
start_time = time.time()
|
|
943
|
+
|
|
944
|
+
# Prepare mappings for bulk insert
|
|
945
|
+
mappings = []
|
|
946
|
+
for commit_data in commits:
|
|
947
|
+
mapping = {
|
|
948
|
+
"repo_path": repo_path,
|
|
949
|
+
"commit_hash": commit_data["hash"],
|
|
950
|
+
"author_name": commit_data.get("author_name"),
|
|
951
|
+
"author_email": commit_data.get("author_email"),
|
|
952
|
+
"message": commit_data.get("message"),
|
|
953
|
+
"timestamp": commit_data.get("timestamp"),
|
|
954
|
+
"branch": commit_data.get("branch"),
|
|
955
|
+
"is_merge": commit_data.get("is_merge", False),
|
|
956
|
+
"files_changed": commit_data.get(
|
|
957
|
+
"files_changed_count",
|
|
958
|
+
(
|
|
959
|
+
commit_data.get("files_changed", 0)
|
|
960
|
+
if isinstance(commit_data.get("files_changed"), int)
|
|
961
|
+
else len(commit_data.get("files_changed", []))
|
|
962
|
+
),
|
|
963
|
+
),
|
|
964
|
+
"insertions": commit_data.get("insertions", 0),
|
|
965
|
+
"deletions": commit_data.get("deletions", 0),
|
|
966
|
+
"complexity_delta": commit_data.get("complexity_delta", 0.0),
|
|
967
|
+
"story_points": commit_data.get("story_points"),
|
|
968
|
+
"ticket_references": commit_data.get("ticket_references", []),
|
|
969
|
+
"cached_at": datetime.utcnow(),
|
|
970
|
+
}
|
|
971
|
+
mappings.append(mapping)
|
|
972
|
+
|
|
973
|
+
# Process in configurable batch sizes for memory efficiency
|
|
974
|
+
inserted_count = 0
|
|
975
|
+
with self.get_session() as session:
|
|
976
|
+
for i in range(0, len(mappings), self.batch_size):
|
|
977
|
+
batch = mappings[i : i + self.batch_size]
|
|
978
|
+
try:
|
|
979
|
+
session.bulk_insert_mappings(CachedCommit, batch)
|
|
980
|
+
inserted_count += len(batch)
|
|
981
|
+
except Exception as e:
|
|
982
|
+
# On error, fall back to individual inserts for this batch
|
|
983
|
+
logger.warning(f"Bulk insert failed, falling back to individual inserts: {e}")
|
|
984
|
+
session.rollback() # Important: rollback failed transaction
|
|
985
|
+
|
|
986
|
+
for mapping in batch:
|
|
987
|
+
try:
|
|
988
|
+
# Create new record
|
|
989
|
+
new_commit = CachedCommit(**mapping)
|
|
990
|
+
session.add(new_commit)
|
|
991
|
+
session.flush() # Try to save this individual record
|
|
992
|
+
inserted_count += 1
|
|
993
|
+
except Exception:
|
|
994
|
+
# Skip duplicate commits silently
|
|
995
|
+
session.rollback() # Rollback this specific failure
|
|
996
|
+
continue
|
|
997
|
+
|
|
998
|
+
elapsed = time.time() - start_time
|
|
999
|
+
self.bulk_operations_count += 1
|
|
1000
|
+
self.bulk_operations_time += elapsed
|
|
1001
|
+
|
|
1002
|
+
if self.debug_mode:
|
|
1003
|
+
rate = inserted_count / elapsed if elapsed > 0 else 0
|
|
1004
|
+
print(
|
|
1005
|
+
f"DEBUG: Bulk stored {inserted_count} commits in {elapsed:.3f}s ({rate:.0f} commits/sec)"
|
|
1006
|
+
)
|
|
1007
|
+
|
|
1008
|
+
return {
|
|
1009
|
+
"inserted": inserted_count,
|
|
1010
|
+
"time_seconds": elapsed,
|
|
1011
|
+
"commits_per_second": inserted_count / elapsed if elapsed > 0 else 0,
|
|
1012
|
+
}
|
|
1013
|
+
|
|
1014
|
+
def bulk_update_commits(self, repo_path: str, commits: list[dict[str, Any]]) -> dict[str, Any]:
|
|
1015
|
+
"""Update multiple commits efficiently using bulk operations.
|
|
1016
|
+
|
|
1017
|
+
WHY: Bulk updates are faster than individual updates when modifying many
|
|
1018
|
+
commits at once (e.g., after classification or enrichment).
|
|
1019
|
+
|
|
1020
|
+
Args:
|
|
1021
|
+
repo_path: Repository path
|
|
1022
|
+
commits: List of commit dictionaries with updates
|
|
1023
|
+
|
|
1024
|
+
Returns:
|
|
1025
|
+
Dictionary with operation statistics
|
|
1026
|
+
"""
|
|
1027
|
+
if not commits:
|
|
1028
|
+
return {"updated": 0, "time_seconds": 0}
|
|
1029
|
+
|
|
1030
|
+
import time
|
|
1031
|
+
|
|
1032
|
+
start_time = time.time()
|
|
1033
|
+
|
|
1034
|
+
with self.get_session() as session:
|
|
1035
|
+
# Get all commit hashes for bulk fetch
|
|
1036
|
+
commit_hashes = [c["hash"] for c in commits]
|
|
1037
|
+
|
|
1038
|
+
# Bulk fetch existing commits to get their primary keys
|
|
1039
|
+
existing = {
|
|
1040
|
+
cached.commit_hash: cached
|
|
1041
|
+
for cached in session.query(CachedCommit)
|
|
1042
|
+
.filter(
|
|
1043
|
+
and_(
|
|
1044
|
+
CachedCommit.repo_path == repo_path,
|
|
1045
|
+
CachedCommit.commit_hash.in_(commit_hashes),
|
|
1046
|
+
)
|
|
1047
|
+
)
|
|
1048
|
+
.all()
|
|
1049
|
+
}
|
|
1050
|
+
|
|
1051
|
+
# Prepare bulk update mappings with primary key
|
|
1052
|
+
update_mappings = []
|
|
1053
|
+
for commit_data in commits:
|
|
1054
|
+
if commit_data["hash"] in existing:
|
|
1055
|
+
cached_record = existing[commit_data["hash"]]
|
|
1056
|
+
# Must include primary key for bulk_update_mappings
|
|
1057
|
+
update_mapping = {"id": cached_record.id}
|
|
1058
|
+
|
|
1059
|
+
# Map commit data fields to database columns
|
|
1060
|
+
field_mapping = {
|
|
1061
|
+
"author_name": commit_data.get("author_name"),
|
|
1062
|
+
"author_email": commit_data.get("author_email"),
|
|
1063
|
+
"message": commit_data.get("message"),
|
|
1064
|
+
"timestamp": commit_data.get("timestamp"),
|
|
1065
|
+
"branch": commit_data.get("branch"),
|
|
1066
|
+
"is_merge": commit_data.get("is_merge"),
|
|
1067
|
+
"files_changed": commit_data.get(
|
|
1068
|
+
"files_changed_count",
|
|
1069
|
+
(
|
|
1070
|
+
commit_data.get("files_changed", 0)
|
|
1071
|
+
if isinstance(commit_data.get("files_changed"), int)
|
|
1072
|
+
else len(commit_data.get("files_changed", []))
|
|
1073
|
+
),
|
|
1074
|
+
),
|
|
1075
|
+
"insertions": commit_data.get("insertions"),
|
|
1076
|
+
"deletions": commit_data.get("deletions"),
|
|
1077
|
+
"complexity_delta": commit_data.get("complexity_delta"),
|
|
1078
|
+
"story_points": commit_data.get("story_points"),
|
|
1079
|
+
"ticket_references": commit_data.get("ticket_references"),
|
|
1080
|
+
"cached_at": datetime.utcnow(),
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
# Only include non-None values in update
|
|
1084
|
+
for key, value in field_mapping.items():
|
|
1085
|
+
if value is not None:
|
|
1086
|
+
update_mapping[key] = value
|
|
1087
|
+
|
|
1088
|
+
update_mappings.append(update_mapping)
|
|
1089
|
+
|
|
1090
|
+
# Perform bulk update
|
|
1091
|
+
if update_mappings:
|
|
1092
|
+
session.bulk_update_mappings(CachedCommit, update_mappings)
|
|
1093
|
+
|
|
1094
|
+
elapsed = time.time() - start_time
|
|
1095
|
+
self.bulk_operations_count += 1
|
|
1096
|
+
self.bulk_operations_time += elapsed
|
|
1097
|
+
|
|
1098
|
+
if self.debug_mode:
|
|
1099
|
+
print(f"DEBUG: Bulk updated {len(update_mappings)} commits in {elapsed:.3f}s")
|
|
1100
|
+
|
|
1101
|
+
return {
|
|
1102
|
+
"updated": len(update_mappings),
|
|
1103
|
+
"time_seconds": elapsed,
|
|
1104
|
+
"commits_per_second": len(update_mappings) / elapsed if elapsed > 0 else 0,
|
|
1105
|
+
}
|
|
1106
|
+
|
|
1107
|
+
def bulk_exists(self, repo_path: str, commit_hashes: list[str]) -> dict[str, bool]:
|
|
1108
|
+
"""Check existence of multiple commits in a single query.
|
|
1109
|
+
|
|
1110
|
+
WHY: Checking existence of many commits individually is inefficient.
|
|
1111
|
+
This method uses a single query to check all commits at once.
|
|
1112
|
+
|
|
1113
|
+
Args:
|
|
1114
|
+
repo_path: Repository path
|
|
1115
|
+
commit_hashes: List of commit hashes to check
|
|
1116
|
+
|
|
1117
|
+
Returns:
|
|
1118
|
+
Dictionary mapping commit hash to existence boolean
|
|
1119
|
+
"""
|
|
1120
|
+
if not commit_hashes:
|
|
1121
|
+
return {}
|
|
1122
|
+
|
|
1123
|
+
with self.get_session() as session:
|
|
1124
|
+
# Query for existing commits
|
|
1125
|
+
existing = set(
|
|
1126
|
+
row[0]
|
|
1127
|
+
for row in session.query(CachedCommit.commit_hash)
|
|
1128
|
+
.filter(
|
|
1129
|
+
and_(
|
|
1130
|
+
CachedCommit.repo_path == repo_path,
|
|
1131
|
+
CachedCommit.commit_hash.in_(commit_hashes),
|
|
1132
|
+
)
|
|
1133
|
+
)
|
|
1134
|
+
.all()
|
|
1135
|
+
)
|
|
1136
|
+
|
|
1137
|
+
# Build result dictionary
|
|
1138
|
+
return {hash: hash in existing for hash in commit_hashes}
|
|
1139
|
+
|
|
1140
|
+
def bulk_get_commits(
|
|
1141
|
+
self, repo_path: str, commit_hashes: list[str], include_stale: bool = False
|
|
1142
|
+
) -> dict[str, dict[str, Any]]:
|
|
1143
|
+
"""Retrieve multiple commits with enhanced performance.
|
|
1144
|
+
|
|
1145
|
+
WHY: Enhanced version of get_cached_commits_bulk with better performance
|
|
1146
|
+
characteristics and optional stale data inclusion.
|
|
1147
|
+
|
|
1148
|
+
Args:
|
|
1149
|
+
repo_path: Repository path
|
|
1150
|
+
commit_hashes: List of commit hashes to retrieve
|
|
1151
|
+
include_stale: Whether to include stale entries (default: False)
|
|
1152
|
+
|
|
1153
|
+
Returns:
|
|
1154
|
+
Dictionary mapping commit hash to commit data
|
|
1155
|
+
"""
|
|
1156
|
+
if not commit_hashes:
|
|
1157
|
+
return {}
|
|
1158
|
+
|
|
1159
|
+
import time
|
|
1160
|
+
|
|
1161
|
+
start_time = time.time()
|
|
1162
|
+
|
|
1163
|
+
# Process in batches to avoid query size limits
|
|
1164
|
+
all_results = {}
|
|
1165
|
+
for i in range(0, len(commit_hashes), self.batch_size):
|
|
1166
|
+
batch_hashes = commit_hashes[i : i + self.batch_size]
|
|
1167
|
+
|
|
1168
|
+
with self.get_session() as session:
|
|
1169
|
+
cached_results = (
|
|
1170
|
+
session.query(CachedCommit)
|
|
1171
|
+
.filter(
|
|
1172
|
+
and_(
|
|
1173
|
+
CachedCommit.repo_path == repo_path,
|
|
1174
|
+
CachedCommit.commit_hash.in_(batch_hashes),
|
|
1175
|
+
)
|
|
1176
|
+
)
|
|
1177
|
+
.all()
|
|
1178
|
+
)
|
|
1179
|
+
|
|
1180
|
+
for cached in cached_results:
|
|
1181
|
+
if include_stale or not self._is_stale(cached.cached_at):
|
|
1182
|
+
all_results[cached.commit_hash] = self._commit_to_dict(cached)
|
|
1183
|
+
|
|
1184
|
+
# Track performance
|
|
1185
|
+
elapsed = time.time() - start_time
|
|
1186
|
+
hits = len(all_results)
|
|
1187
|
+
misses = len(commit_hashes) - hits
|
|
1188
|
+
|
|
1189
|
+
self.cache_hits += hits
|
|
1190
|
+
self.cache_misses += misses
|
|
1191
|
+
self.bulk_operations_count += 1
|
|
1192
|
+
self.bulk_operations_time += elapsed
|
|
1193
|
+
|
|
1194
|
+
if self.debug_mode:
|
|
1195
|
+
hit_rate = (hits / len(commit_hashes)) * 100 if commit_hashes else 0
|
|
1196
|
+
print(
|
|
1197
|
+
f"DEBUG: Bulk get {hits}/{len(commit_hashes)} commits in {elapsed:.3f}s ({hit_rate:.1f}% hit rate)"
|
|
1198
|
+
)
|
|
1199
|
+
|
|
1200
|
+
return all_results
|
|
1201
|
+
|
|
1202
|
+
def _get_files_changed_count(self, commit: git.Commit) -> int:
|
|
1203
|
+
"""Get the number of files changed using reliable git command."""
|
|
1204
|
+
parent = commit.parents[0] if commit.parents else None
|
|
1205
|
+
|
|
1206
|
+
try:
|
|
1207
|
+
repo = commit.repo
|
|
1208
|
+
if parent:
|
|
1209
|
+
diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
|
|
1210
|
+
else:
|
|
1211
|
+
diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
|
|
1212
|
+
|
|
1213
|
+
file_count = 0
|
|
1214
|
+
for line in diff_output.strip().split("\n"):
|
|
1215
|
+
if line.strip() and "\t" in line:
|
|
1216
|
+
file_count += 1
|
|
1217
|
+
|
|
1218
|
+
return file_count
|
|
1219
|
+
except Exception:
|
|
1220
|
+
return 0
|
|
1221
|
+
|
|
1222
|
+
def _get_insertions_count(self, commit: git.Commit) -> int:
|
|
1223
|
+
"""Get the number of insertions using reliable git command."""
|
|
1224
|
+
parent = commit.parents[0] if commit.parents else None
|
|
1225
|
+
|
|
1226
|
+
try:
|
|
1227
|
+
repo = commit.repo
|
|
1228
|
+
if parent:
|
|
1229
|
+
diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
|
|
1230
|
+
else:
|
|
1231
|
+
diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
|
|
1232
|
+
|
|
1233
|
+
total_insertions = 0
|
|
1234
|
+
for line in diff_output.strip().split("\n"):
|
|
1235
|
+
if not line.strip():
|
|
1236
|
+
continue
|
|
1237
|
+
|
|
1238
|
+
parts = line.split("\t")
|
|
1239
|
+
if len(parts) >= 3:
|
|
1240
|
+
try:
|
|
1241
|
+
insertions = int(parts[0]) if parts[0] != "-" else 0
|
|
1242
|
+
total_insertions += insertions
|
|
1243
|
+
except ValueError:
|
|
1244
|
+
continue
|
|
1245
|
+
|
|
1246
|
+
return total_insertions
|
|
1247
|
+
except Exception:
|
|
1248
|
+
return 0
|
|
1249
|
+
|
|
1250
|
+
def _get_deletions_count(self, commit: git.Commit) -> int:
|
|
1251
|
+
"""Get the number of deletions using reliable git command."""
|
|
1252
|
+
parent = commit.parents[0] if commit.parents else None
|
|
1253
|
+
|
|
1254
|
+
try:
|
|
1255
|
+
repo = commit.repo
|
|
1256
|
+
if parent:
|
|
1257
|
+
diff_output = repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
|
|
1258
|
+
else:
|
|
1259
|
+
diff_output = repo.git.show(commit.hexsha, "--numstat", "--format=")
|
|
1260
|
+
|
|
1261
|
+
total_deletions = 0
|
|
1262
|
+
for line in diff_output.strip().split("\n"):
|
|
1263
|
+
if not line.strip():
|
|
1264
|
+
continue
|
|
1265
|
+
|
|
1266
|
+
parts = line.split("\t")
|
|
1267
|
+
if len(parts) >= 3:
|
|
1268
|
+
try:
|
|
1269
|
+
deletions = int(parts[1]) if parts[1] != "-" else 0
|
|
1270
|
+
total_deletions += deletions
|
|
1271
|
+
except ValueError:
|
|
1272
|
+
continue
|
|
1273
|
+
|
|
1274
|
+
return total_deletions
|
|
1275
|
+
except Exception:
|
|
1276
|
+
return 0
|
|
1277
|
+
|
|
1278
|
+
def get_repository_analysis_status(
|
|
1279
|
+
self,
|
|
1280
|
+
repo_path: str,
|
|
1281
|
+
analysis_start: datetime,
|
|
1282
|
+
analysis_end: datetime,
|
|
1283
|
+
config_hash: Optional[str] = None,
|
|
1284
|
+
) -> Optional[dict[str, Any]]:
|
|
1285
|
+
"""Check if repository analysis is complete for the given period.
|
|
1286
|
+
|
|
1287
|
+
WHY: Enables "fetch once, report many" by tracking which repositories
|
|
1288
|
+
have been fully analyzed. Prevents re-fetching Git data when generating
|
|
1289
|
+
different report formats from the same cached data.
|
|
1290
|
+
|
|
1291
|
+
CRITICAL FIX: Now verifies actual commits exist in cache, not just metadata.
|
|
1292
|
+
This prevents "Using cached data (X commits)" when commits aren't actually stored.
|
|
1293
|
+
|
|
1294
|
+
Args:
|
|
1295
|
+
repo_path: Path to the repository
|
|
1296
|
+
analysis_start: Start of the analysis period
|
|
1297
|
+
analysis_end: End of the analysis period
|
|
1298
|
+
config_hash: Optional hash of relevant configuration to detect changes
|
|
1299
|
+
|
|
1300
|
+
Returns:
|
|
1301
|
+
Dictionary with analysis status or None if not found/incomplete
|
|
1302
|
+
"""
|
|
1303
|
+
with self.get_session() as session:
|
|
1304
|
+
status = (
|
|
1305
|
+
session.query(RepositoryAnalysisStatus)
|
|
1306
|
+
.filter(
|
|
1307
|
+
and_(
|
|
1308
|
+
RepositoryAnalysisStatus.repo_path == repo_path,
|
|
1309
|
+
RepositoryAnalysisStatus.analysis_start == analysis_start,
|
|
1310
|
+
RepositoryAnalysisStatus.analysis_end == analysis_end,
|
|
1311
|
+
RepositoryAnalysisStatus.status == "completed",
|
|
1312
|
+
)
|
|
1313
|
+
)
|
|
1314
|
+
.first()
|
|
1315
|
+
)
|
|
1316
|
+
|
|
1317
|
+
if not status:
|
|
1318
|
+
return None
|
|
1319
|
+
|
|
1320
|
+
# Check if configuration has changed (invalidates cache)
|
|
1321
|
+
if config_hash and status.config_hash != config_hash:
|
|
1322
|
+
return None
|
|
1323
|
+
|
|
1324
|
+
# CRITICAL FIX: Verify actual commits exist in the database
|
|
1325
|
+
# Don't trust metadata if commits aren't actually stored
|
|
1326
|
+
actual_commit_count = (
|
|
1327
|
+
session.query(CachedCommit)
|
|
1328
|
+
.filter(
|
|
1329
|
+
and_(
|
|
1330
|
+
CachedCommit.repo_path == repo_path,
|
|
1331
|
+
CachedCommit.timestamp >= analysis_start,
|
|
1332
|
+
CachedCommit.timestamp <= analysis_end,
|
|
1333
|
+
# Only count non-stale commits
|
|
1334
|
+
CachedCommit.cached_at
|
|
1335
|
+
>= datetime.utcnow() - timedelta(hours=self.ttl_hours),
|
|
1336
|
+
)
|
|
1337
|
+
)
|
|
1338
|
+
.count()
|
|
1339
|
+
)
|
|
1340
|
+
|
|
1341
|
+
# If metadata says we have commits but no commits are actually stored,
|
|
1342
|
+
# force a fresh fetch by returning None
|
|
1343
|
+
if status.commit_count > 0 and actual_commit_count == 0:
|
|
1344
|
+
if self.debug_mode:
|
|
1345
|
+
print(
|
|
1346
|
+
f"DEBUG: Metadata claims {status.commit_count} commits but found 0 in cache - forcing fresh fetch"
|
|
1347
|
+
)
|
|
1348
|
+
# Log warning about inconsistent cache state
|
|
1349
|
+
import logging
|
|
1350
|
+
|
|
1351
|
+
logger = logging.getLogger(__name__)
|
|
1352
|
+
logger.warning(
|
|
1353
|
+
f"Cache inconsistency detected for {repo_path}: "
|
|
1354
|
+
f"metadata reports {status.commit_count} commits but "
|
|
1355
|
+
f"actual stored commits: {actual_commit_count}. Forcing fresh analysis."
|
|
1356
|
+
)
|
|
1357
|
+
return None
|
|
1358
|
+
|
|
1359
|
+
# Update the commit count to reflect actual stored commits
|
|
1360
|
+
# This ensures the UI shows accurate information
|
|
1361
|
+
status.commit_count = actual_commit_count
|
|
1362
|
+
|
|
1363
|
+
# Return status information
|
|
1364
|
+
return {
|
|
1365
|
+
"repo_path": status.repo_path,
|
|
1366
|
+
"repo_name": status.repo_name,
|
|
1367
|
+
"project_key": status.project_key,
|
|
1368
|
+
"analysis_start": status.analysis_start,
|
|
1369
|
+
"analysis_end": status.analysis_end,
|
|
1370
|
+
"weeks_analyzed": status.weeks_analyzed,
|
|
1371
|
+
"git_analysis_complete": status.git_analysis_complete,
|
|
1372
|
+
"commit_count": actual_commit_count, # Use verified count, not metadata
|
|
1373
|
+
"pr_analysis_complete": status.pr_analysis_complete,
|
|
1374
|
+
"pr_count": status.pr_count,
|
|
1375
|
+
"ticket_analysis_complete": status.ticket_analysis_complete,
|
|
1376
|
+
"ticket_count": status.ticket_count,
|
|
1377
|
+
"identity_resolution_complete": status.identity_resolution_complete,
|
|
1378
|
+
"unique_developers": status.unique_developers,
|
|
1379
|
+
"last_updated": status.last_updated,
|
|
1380
|
+
"processing_time_seconds": status.processing_time_seconds,
|
|
1381
|
+
"cache_hit_rate_percent": status.cache_hit_rate_percent,
|
|
1382
|
+
"config_hash": status.config_hash,
|
|
1383
|
+
}
|
|
1384
|
+
|
|
1385
|
+
    def mark_repository_analysis_complete(
        self,
        repo_path: str,
        repo_name: str,
        project_key: str,
        analysis_start: datetime,
        analysis_end: datetime,
        weeks_analyzed: int,
        commit_count: int = 0,
        pr_count: int = 0,
        ticket_count: int = 0,
        unique_developers: int = 0,
        processing_time_seconds: Optional[float] = None,
        cache_hit_rate_percent: Optional[float] = None,
        config_hash: Optional[str] = None,
    ) -> None:
        """Mark repository analysis as complete for the given period.

        WHY: Records successful completion of repository analysis to enable
        cache-first workflow. Subsequent runs can skip re-analysis and go
        directly to report generation.

        Args:
            repo_path: Path to the repository
            repo_name: Display name for the repository
            project_key: Project key for the repository
            analysis_start: Start of the analysis period
            analysis_end: End of the analysis period
            weeks_analyzed: Number of weeks analyzed
            commit_count: Number of commits analyzed
            pr_count: Number of pull requests analyzed
            ticket_count: Number of tickets analyzed
            unique_developers: Number of unique developers found
            processing_time_seconds: Time taken for analysis
            cache_hit_rate_percent: Cache hit rate during analysis
            config_hash: Hash of relevant configuration
        """
        with self.get_session() as session:
            # Check if status already exists
            existing = (
                session.query(RepositoryAnalysisStatus)
                .filter(
                    and_(
                        RepositoryAnalysisStatus.repo_path == repo_path,
                        RepositoryAnalysisStatus.analysis_start == analysis_start,
                        RepositoryAnalysisStatus.analysis_end == analysis_end,
                    )
                )
                .first()
            )

            if existing:
                # Update existing record
                existing.repo_name = repo_name
                existing.project_key = project_key
                existing.weeks_analyzed = weeks_analyzed
                existing.git_analysis_complete = True
                existing.commit_count = commit_count
                existing.pr_analysis_complete = True
                existing.pr_count = pr_count
                existing.ticket_analysis_complete = True
                existing.ticket_count = ticket_count
                existing.identity_resolution_complete = True
                existing.unique_developers = unique_developers
                existing.processing_time_seconds = processing_time_seconds
                existing.cache_hit_rate_percent = cache_hit_rate_percent
                existing.config_hash = config_hash
                existing.status = "completed"
                existing.error_message = None
                existing.last_updated = datetime.utcnow()
            else:
                # Create new record
                status = RepositoryAnalysisStatus(
                    repo_path=repo_path,
                    repo_name=repo_name,
                    project_key=project_key,
                    analysis_start=analysis_start,
                    analysis_end=analysis_end,
                    weeks_analyzed=weeks_analyzed,
                    git_analysis_complete=True,
                    commit_count=commit_count,
                    pr_analysis_complete=True,
                    pr_count=pr_count,
                    ticket_analysis_complete=True,
                    ticket_count=ticket_count,
                    identity_resolution_complete=True,
                    unique_developers=unique_developers,
                    processing_time_seconds=processing_time_seconds,
                    cache_hit_rate_percent=cache_hit_rate_percent,
                    config_hash=config_hash,
                    status="completed",
                )
                session.add(status)
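A hedged usage sketch for the method above: db is assumed to be an instance of the enclosing class (its name is not shown in this hunk), and the counts are sample values. Calling it again with the same repo_path and analysis window updates the existing record rather than inserting a duplicate.

from datetime import datetime, timedelta

def record_completed_run(db, repo_path: str, repo_name: str, project_key: str) -> None:
    """Illustrative only: mark a four-week analysis window as completed."""
    analysis_end = datetime.utcnow()  # naive UTC, consistent with datetime.utcnow() used above
    analysis_start = analysis_end - timedelta(weeks=4)
    db.mark_repository_analysis_complete(
        repo_path=repo_path,
        repo_name=repo_name,
        project_key=project_key,
        analysis_start=analysis_start,
        analysis_end=analysis_end,
        weeks_analyzed=4,
        commit_count=123,
        pr_count=7,
        ticket_count=15,
        unique_developers=4,
        processing_time_seconds=42.5,
    )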
    def mark_repository_analysis_failed(
        self,
        repo_path: str,
        repo_name: str,
        analysis_start: datetime,
        analysis_end: datetime,
        error_message: str,
        config_hash: Optional[str] = None,
    ) -> None:
        """Mark repository analysis as failed.

        Args:
            repo_path: Path to the repository
            repo_name: Display name for the repository
            analysis_start: Start of the analysis period
            analysis_end: End of the analysis period
            error_message: Error message describing the failure
            config_hash: Hash of relevant configuration
        """
        with self.get_session() as session:
            # Check if status already exists
            existing = (
                session.query(RepositoryAnalysisStatus)
                .filter(
                    and_(
                        RepositoryAnalysisStatus.repo_path == repo_path,
                        RepositoryAnalysisStatus.analysis_start == analysis_start,
                        RepositoryAnalysisStatus.analysis_end == analysis_end,
                    )
                )
                .first()
            )

            if existing:
                existing.repo_name = repo_name
                existing.status = "failed"
                existing.error_message = error_message
                existing.config_hash = config_hash
                existing.last_updated = datetime.utcnow()
            else:
                # Create new failed record
                status = RepositoryAnalysisStatus(
                    repo_path=repo_path,
                    repo_name=repo_name,
                    project_key="unknown",
                    analysis_start=analysis_start,
                    analysis_end=analysis_end,
                    weeks_analyzed=0,
                    status="failed",
                    error_message=error_message,
                    config_hash=config_hash,
                )
                session.add(status)
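A sketch of the failure path, assuming db is an instance of the enclosing class and analyze is a hypothetical callable standing in for the actual analysis step.

from datetime import datetime, timedelta

def record_failure_on_error(db, analyze, repo_path: str, repo_name: str) -> None:
    """Illustrative only: persist a failure record so the window is retried later."""
    analysis_end = datetime.utcnow()
    analysis_start = analysis_end - timedelta(weeks=12)
    try:
        analyze(repo_path, analysis_start, analysis_end)
    except Exception as exc:
        # Record the failure so a later run knows this window still needs analysis.
        db.mark_repository_analysis_failed(
            repo_path=repo_path,
            repo_name=repo_name,
            analysis_start=analysis_start,
            analysis_end=analysis_end,
            error_message=str(exc),
        )
        raise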
    def clear_repository_analysis_status(
        self, repo_path: Optional[str] = None, older_than_days: Optional[int] = None
    ) -> int:
        """Clear repository analysis status records.

        WHY: Allows forcing re-analysis by clearing cached status records.
        Used by --force-fetch flag and for cleanup of old status records.

        Args:
            repo_path: Specific repository path to clear (all repos if None)
            older_than_days: Clear records older than N days (all if None)

        Returns:
            Number of records cleared
        """
        with self.get_session() as session:
            query = session.query(RepositoryAnalysisStatus)

            if repo_path:
                query = query.filter(RepositoryAnalysisStatus.repo_path == repo_path)

            if older_than_days:
                cutoff = datetime.utcnow() - timedelta(days=older_than_days)
                query = query.filter(RepositoryAnalysisStatus.last_updated < cutoff)

            count = query.count()
            query.delete()

            return count
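Two illustrative wrappers around the method above, assuming db is an instance of the enclosing class: one mirrors a forced re-fetch for a single repository (or all repositories when no path is given), the other prunes stale records.

from typing import Optional

def force_reanalysis(db, repo_path: Optional[str] = None) -> int:
    """Illustrative only: drop cached status so the next run re-analyzes (all repos if repo_path is None)."""
    return db.clear_repository_analysis_status(repo_path=repo_path)

def prune_stale_status(db, days: int = 90) -> int:
    """Illustrative only: remove status records not updated within the last `days` days."""
    return db.clear_repository_analysis_status(older_than_days=days)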
    def get_analysis_status_summary(self) -> dict[str, Any]:
        """Get summary of repository analysis status records.

        Returns:
            Dictionary with summary statistics
        """
        with self.get_session() as session:
            from sqlalchemy import func

            # Count by status
            status_counts = dict(
                session.query(RepositoryAnalysisStatus.status, func.count().label("count"))
                .group_by(RepositoryAnalysisStatus.status)
                .all()
            )

            # Total records
            total_records = session.query(RepositoryAnalysisStatus).count()

            # Recent activity (last 7 days)
            recent_cutoff = datetime.utcnow() - timedelta(days=7)
            recent_completed = (
                session.query(RepositoryAnalysisStatus)
                .filter(
                    and_(
                        RepositoryAnalysisStatus.status == "completed",
                        RepositoryAnalysisStatus.last_updated >= recent_cutoff,
                    )
                )
                .count()
            )

            # Average processing time for completed analyses
            avg_processing_time = (
                session.query(func.avg(RepositoryAnalysisStatus.processing_time_seconds))
                .filter(
                    and_(
                        RepositoryAnalysisStatus.status == "completed",
                        RepositoryAnalysisStatus.processing_time_seconds.isnot(None),
                    )
                )
                .scalar()
            )

            return {
                "total_records": total_records,
                "status_counts": status_counts,
                "recent_completed": recent_completed,
                "avg_processing_time_seconds": avg_processing_time,
                "completed_count": status_counts.get("completed", 0),
                "failed_count": status_counts.get("failed", 0),
                "pending_count": status_counts.get("pending", 0),
            }
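A small reporting sketch built on the summary dictionary above; db is again assumed to be an instance of the enclosing class.

def print_status_summary(db) -> None:
    """Illustrative only: render the summary returned by get_analysis_status_summary()."""
    summary = db.get_analysis_status_summary()
    print(f"Tracked analyses: {summary['total_records']}")
    print(
        f"  completed: {summary['completed_count']}  "
        f"failed: {summary['failed_count']}  pending: {summary['pending_count']}"
    )
    print(f"  completed in the last 7 days: {summary['recent_completed']}")
    avg = summary["avg_processing_time_seconds"]
    print(f"  avg processing time: {avg:.1f}s" if avg is not None else "  avg processing time: n/a")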
    @staticmethod
    def generate_config_hash(
        branch_mapping_rules: Optional[dict] = None,
        ticket_platforms: Optional[list] = None,
        exclude_paths: Optional[list] = None,
        ml_categorization_enabled: bool = False,
        additional_config: Optional[dict] = None,
    ) -> str:
        """Generate MD5 hash of relevant configuration for cache invalidation.

        WHY: Configuration changes can affect analysis results, so we need to
        detect when cached analysis is no longer valid due to config changes.

        Args:
            branch_mapping_rules: Branch to project mapping rules
            ticket_platforms: Allowed ticket platforms
            exclude_paths: Paths to exclude from analysis
            ml_categorization_enabled: Whether ML categorization is enabled
            additional_config: Any additional configuration to include

        Returns:
            MD5 hash string representing the configuration
        """
        config_data = {
            "branch_mapping_rules": branch_mapping_rules or {},
            "ticket_platforms": sorted(ticket_platforms or []),
            "exclude_paths": sorted(exclude_paths or []),
            "ml_categorization_enabled": ml_categorization_enabled,
            "additional_config": additional_config or {},
        }

        # Convert to JSON string with sorted keys for consistent hashing
        config_json = json.dumps(config_data, sort_keys=True, default=str)

        # Generate MD5 hash
        return hashlib.md5(config_json.encode()).hexdigest()
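To make the invalidation behaviour concrete, a standalone sketch of the same recipe (sorted-key JSON serialisation plus an MD5 digest): equal configurations produce equal hashes, and any change yields a different hash, which is what signals that cached analysis should be discarded. The config_digest helper and the sample dictionaries are illustrative, not part of the package.

import hashlib
import json

def config_digest(config: dict) -> str:
    """Illustrative only: mirrors the hashing recipe used by generate_config_hash above."""
    return hashlib.md5(json.dumps(config, sort_keys=True, default=str).encode()).hexdigest()

old = {"ticket_platforms": ["github", "jira"], "exclude_paths": [], "ml_categorization_enabled": False}
new = dict(old, ml_categorization_enabled=True)

assert config_digest(old) == config_digest(dict(old))  # identical config -> identical hash
assert config_digest(old) != config_digest(new)        # changed config -> cached analysis invalidated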