gitflow-analytics 3.3.0__py3-none-any.whl → 3.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/cli.py +517 -15
- gitflow_analytics/cli_wizards/__init__.py +10 -0
- gitflow_analytics/cli_wizards/install_wizard.py +1181 -0
- gitflow_analytics/cli_wizards/run_launcher.py +433 -0
- gitflow_analytics/config/__init__.py +3 -0
- gitflow_analytics/config/aliases.py +306 -0
- gitflow_analytics/config/loader.py +35 -1
- gitflow_analytics/config/schema.py +13 -0
- gitflow_analytics/constants.py +75 -0
- gitflow_analytics/core/cache.py +7 -3
- gitflow_analytics/core/data_fetcher.py +66 -30
- gitflow_analytics/core/git_timeout_wrapper.py +6 -4
- gitflow_analytics/core/progress.py +2 -4
- gitflow_analytics/core/subprocess_git.py +31 -5
- gitflow_analytics/identity_llm/analysis_pass.py +13 -3
- gitflow_analytics/identity_llm/analyzer.py +14 -2
- gitflow_analytics/identity_llm/models.py +7 -1
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +5 -3
- gitflow_analytics/security/config.py +6 -6
- gitflow_analytics/security/extractors/dependency_checker.py +14 -14
- gitflow_analytics/security/extractors/secret_detector.py +8 -14
- gitflow_analytics/security/extractors/vulnerability_scanner.py +9 -9
- gitflow_analytics/security/llm_analyzer.py +10 -10
- gitflow_analytics/security/security_analyzer.py +17 -17
- gitflow_analytics/tui/screens/analysis_progress_screen.py +1 -1
- gitflow_analytics/ui/progress_display.py +36 -29
- gitflow_analytics/verify_activity.py +23 -26
- {gitflow_analytics-3.3.0.dist-info → gitflow_analytics-3.5.2.dist-info}/METADATA +1 -1
- {gitflow_analytics-3.3.0.dist-info → gitflow_analytics-3.5.2.dist-info}/RECORD +34 -31
- gitflow_analytics/security/reports/__init__.py +0 -5
- gitflow_analytics/security/reports/security_report.py +0 -358
- {gitflow_analytics-3.3.0.dist-info → gitflow_analytics-3.5.2.dist-info}/WHEEL +0 -0
- {gitflow_analytics-3.3.0.dist-info → gitflow_analytics-3.5.2.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-3.3.0.dist-info → gitflow_analytics-3.5.2.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-3.3.0.dist-info → gitflow_analytics-3.5.2.dist-info}/top_level.txt +0 -0
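Many hunks below replace hard-coded literals with named constants from the new `gitflow_analytics/constants.py` (+75 lines; its body is not included in this diff view). As orientation, a hypothetical sketch of that module, reconstructed purely from the attribute names the hunks reference: the two class names are confirmed by the imports, but every numeric value below is an assumption, not the published source.

```python
# Hypothetical sketch of gitflow_analytics/constants.py (values are guesses).

class Timeouts:
    """Timeout values, in seconds, for git and subprocess operations."""

    DEFAULT_GIT_OPERATION = 300  # assumed
    GIT_FETCH = 120              # assumed
    GIT_PULL = 120               # assumed
    GIT_DIFF = 60                # assumed
    GIT_CONFIG = 10              # assumed
    GIT_BRANCH_ITERATION = 60    # assumed
    SUBPROCESS_DEFAULT = 600     # assumed
    THREAD_JOIN = 5              # assumed


class BatchSizes:
    """Batch sizes and progress estimates."""

    TICKET_FETCH = 50                 # assumed
    COMMITS_PER_WEEK_ESTIMATE = 100   # assumed
    DEFAULT_PROGRESS_ESTIMATE = 1000  # assumed
```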
gitflow_analytics/core/data_fetcher.py

@@ -18,6 +18,7 @@ from typing import Any, Optional
 from sqlalchemy import func
 from sqlalchemy.orm import Session
 
+from ..constants import BatchSizes, Timeouts
 from ..extractors.story_points import StoryPointExtractor
 from ..extractors.tickets import TicketExtractor
 from ..integrations.jira_integration import JIRAIntegration

@@ -59,7 +60,7 @@ class GitDataFetcher:
         allowed_ticket_platforms: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
         skip_remote_fetch: bool = False,
-    ):
+    ) -> None:
         """Initialize the data fetcher.
 
         Args:

@@ -96,7 +97,7 @@ class GitDataFetcher:
         self.identity_resolver = DeveloperIdentityResolver(identity_db_path)
 
         # Initialize git timeout wrapper for safe operations
-        self.git_wrapper = GitTimeoutWrapper(default_timeout=
+        self.git_wrapper = GitTimeoutWrapper(default_timeout=Timeouts.DEFAULT_GIT_OPERATION)
 
         # Statistics for tracking repository processing
         self.processing_stats = {

@@ -170,10 +171,10 @@ class GitDataFetcher:
                     progress.start_repository(f"{project_key} (cloning)", 0)
                 else:
                     # Rough estimate based on weeks
-                    estimated_commits = weeks_back *
+                    estimated_commits = weeks_back * BatchSizes.COMMITS_PER_WEEK_ESTIMATE
                     progress.start_repository(project_key, estimated_commits)
             except Exception:
-                progress.start_repository(project_key,
+                progress.start_repository(project_key, BatchSizes.DEFAULT_PROGRESS_ESTIMATE)
 
         # Step 1: Collect all commits organized by day with enhanced progress tracking
         logger.info("🔍 DEBUG: About to fetch commits by day")
@@ -402,8 +403,13 @@ class GitDataFetcher:
                 if hasattr(_thread_local, "temp_dir"):
                     shutil.rmtree(_thread_local.temp_dir, ignore_errors=True)
                     delattr(_thread_local, "temp_dir")
-            except:
-
+            except OSError as e:
+                # Log cleanup failures but don't fail the operation
+                logger.debug(f"Failed to clean up temp directory for {project_key}: {e}")
+            except Exception as e:
+                logger.warning(
+                    f"Unexpected error during temp cleanup for {project_key}: {e}"
+                )
 
         try:
             # Configure git to never prompt for credentials
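A recurring theme in this release is replacing bare `except:` clauses with narrowed handlers, as in the cleanup hunk above. A bare `except:` also traps `SystemExit` and `KeyboardInterrupt`, so a Ctrl-C during cleanup would be silently swallowed. A minimal illustration of the difference (`risky_cleanup` is a hypothetical stand-in for the temp-dir removal):

```python
import logging

logger = logging.getLogger(__name__)

def risky_cleanup() -> None:  # hypothetical stand-in
    ...

# Bare except traps KeyboardInterrupt too, making the operation uninterruptible:
try:
    risky_cleanup()
except:          # catches SystemExit/KeyboardInterrupt as well
    pass

# Narrow handlers let control-flow exceptions propagate and keep a record:
try:
    risky_cleanup()
except OSError as e:      # expected failure mode (file system)
    logger.debug(f"Cleanup failed: {e}")
except Exception as e:    # anything unexpected, but still not Ctrl-C
    logger.warning(f"Unexpected cleanup error: {e}")
```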
@@ -506,8 +512,14 @@
             # Estimate based on first branch (multiply by number of branches for rough estimate)
             len(sample_commits) * len(branches_to_analyze)
             break
-        except:
-
+        except GitOperationTimeout:
+            logger.warning(
+                f"Timeout while sampling commits for {project_key}, using default estimate"
+            )
+            len(days_to_process) * BatchSizes.COMMITS_PER_WEEK_ESTIMATE  # Default estimate
+        except Exception as e:
+            logger.debug(f"Could not sample commits for {project_key}: {e}, using default estimate")
+            len(days_to_process) * BatchSizes.COMMITS_PER_WEEK_ESTIMATE  # Default estimate
 
         # Update repository in Rich display with estimated commit count
         if hasattr(progress, "_use_rich") and progress._use_rich:

@@ -537,18 +549,25 @@
                 for branch_name in branches_to_analyze:
                     try:
                         # Fetch commits for this specific day and branch with timeout protection
-                        def fetch_branch_commits(
+                        def fetch_branch_commits(
+                            branch: str = branch_name,
+                            start: datetime = day_start,
+                            end: datetime = day_end,
+                        ) -> list[Any]:
+                            """Fetch commits for a specific branch and day range.
+
+                            Returns:
+                                List of GitPython commit objects
+                            """
                             return list(
-                                repo.iter_commits(
-                                    branch_name, since=day_start, until=day_end, reverse=False
-                                )
+                                repo.iter_commits(branch, since=start, until=end, reverse=False)
                             )
 
                         # Use timeout wrapper to prevent hanging on iter_commits
                         try:
                             branch_commits = self.git_wrapper.run_with_timeout(
                                 fetch_branch_commits,
-                                timeout=
+                                timeout=Timeouts.GIT_BRANCH_ITERATION,
                                 operation_name=f"iter_commits_{branch_name}_{day_str}",
                             )
                         except GitOperationTimeout:
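The rewritten `fetch_branch_commits` binds `branch_name`, `day_start`, and `day_end` as default arguments. Because the function object outlives each loop iteration (it is handed to `run_with_timeout` and may execute later), defaults freeze the values at definition time, whereas a free variable is resolved only when the call runs. The classic demonstration of the difference:

```python
# Free variables are looked up when the closure runs (late binding):
fns = [lambda: i for i in range(3)]
print([f() for f in fns])  # [2, 2, 2]

# Default arguments are evaluated when the function is defined:
fns = [lambda i=i: i for i in range(3)]
print([f() for f in fns])  # [0, 1, 2]
```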
@@ -913,23 +932,28 @@ class GitDataFetcher:
         except Exception as e:
             logger.debug(f"Error getting local branches: {e}")
 
-        # If we have remotes, also consider remote branches (
+        # If we have remotes, also consider remote branches (keep full remote reference)
         # Skip remote branch checking if skip_remote_fetch is enabled to avoid auth prompts
         if not self.skip_remote_fetch:
             try:
                 if repo.remotes and hasattr(repo.remotes, "origin"):
+                    # CRITICAL FIX: Keep full remote reference (origin/branch-name) for accessibility testing
+                    # Remote branches need the full reference to work with iter_commits()
                     # THREAD SAFETY: Create a new list to avoid sharing references
                     remote_branches = list(
                         [
-                            ref.name
+                            ref.name  # Keep full "origin/branch-name" format
                             for ref in repo.remotes.origin.refs
                             if not ref.name.endswith("HEAD")  # Skip HEAD ref
                         ]
                     )
-                    #
-                    for
-
-
+                    # Add remote branches with full reference (origin/branch-name)
+                    # Extract short name only for duplicate checking against local branches
+                    for branch_ref in remote_branches:
+                        short_name = branch_ref.replace("origin/", "")
+                        # Only add if we don't have this branch locally
+                        if short_name not in available_branches:
+                            available_branches.append(branch_ref)  # Store full reference
                     logger.debug(f"Found remote branches: {remote_branches}")
             except Exception as e:
                 logger.debug(f"Error getting remote branches (may require authentication): {e}")
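The "keep the full remote reference" fix matters because GitPython resolves revision names the way `git rev-parse` does: a remote-tracking branch is addressable as `origin/feature-x`, while the bare short name only resolves when a local branch of that name exists. A small sketch of the behavior (the repository path and branch name are illustrative):

```python
from git import Repo  # GitPython

repo = Repo("/path/to/clone")  # illustrative path

# Works: full remote-tracking reference.
commits = list(repo.iter_commits("origin/feature-x", max_count=10))

# Raises git.exc.GitCommandError (unknown revision) unless a *local*
# branch named "feature-x" also exists:
commits = list(repo.iter_commits("feature-x", max_count=10))
```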
@@ -1043,7 +1067,9 @@ class GitDataFetcher:
             repo_path = Path(repo.working_dir)
 
             # Try to fetch with timeout protection
-            fetch_success = self.git_wrapper.fetch_with_timeout(
+            fetch_success = self.git_wrapper.fetch_with_timeout(
+                repo_path, timeout=Timeouts.GIT_FETCH
+            )
 
             if not fetch_success:
                 # Mark this repository as having authentication issues if applicable

@@ -1070,7 +1096,9 @@ class GitDataFetcher:
             tracking = current_branch.tracking_branch()
             if tracking:
                 # Pull latest changes using timeout wrapper
-                pull_success = self.git_wrapper.pull_with_timeout(
+                pull_success = self.git_wrapper.pull_with_timeout(
+                    repo_path, timeout=Timeouts.GIT_PULL
+                )
                 if pull_success:
                     logger.debug(f"Pulled latest changes for {current_branch.name}")
                 else:

@@ -1137,7 +1165,7 @@ class GitDataFetcher:
                 cwd=repo_path,
                 capture_output=True,
                 text=True,
-                timeout=
+                timeout=Timeouts.GIT_CONFIG,
                 env={"GIT_TERMINAL_PROMPT": "0"},
             )
 

@@ -1164,9 +1192,12 @@ class GitDataFetcher:
                         f"3) Git credential manager instead."
                     )
                     break
-        except:
-            #
-
+        except AttributeError as e:
+            # Repository might not have remotes attribute (e.g., in tests or unusual repo structures)
+            logger.debug(f"Could not check remote URLs for security scan: {e}")
+        except Exception as e:
+            # Don't fail analysis due to security check, but log unexpected errors
+            logger.warning(f"Error during credential security check: {e}")
 
     def get_repository_status_summary(self) -> dict[str, Any]:
         """Get a summary of repository fetch status.

@@ -1244,7 +1275,7 @@ class GitDataFetcher:
         logger.info(f"Fetching {len(tickets_to_fetch)} new tickets")
 
         # Fetch tickets in batches
-        batch_size =
+        batch_size = BatchSizes.TICKET_FETCH
         tickets_list = list(tickets_to_fetch)
 
         # Use centralized progress service
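The fetch loop that consumes `batch_size` is unchanged and therefore not shown, but slicing a list into fixed-size chunks is the standard shape for this kind of batching. A generic sketch (the value 2 is a placeholder for whatever `BatchSizes.TICKET_FETCH` actually is):

```python
tickets_list = ["PROJ-1", "PROJ-2", "PROJ-3", "PROJ-4", "PROJ-5"]
batch_size = 2  # placeholder; the release reads BatchSizes.TICKET_FETCH

for i in range(0, len(tickets_list), batch_size):
    batch = tickets_list[i : i + batch_size]
    print(batch)  # ['PROJ-1', 'PROJ-2'], then ['PROJ-3', 'PROJ-4'], then ['PROJ-5']
```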
@@ -1787,7 +1818,12 @@ class GitDataFetcher:
         repo = commit.repo
         Path(repo.working_dir)
 
-        def get_diff_output():
+        def get_diff_output() -> str:
+            """Get diff output for commit using git numstat.
+
+            Returns:
+                Git diff output string in numstat format
+            """
             if parent:
                 return repo.git.diff(parent.hexsha, commit.hexsha, "--numstat")
             else:

@@ -1798,7 +1834,7 @@ class GitDataFetcher:
         try:
             diff_output = self.git_wrapper.run_with_timeout(
                 get_diff_output,
-                timeout=
+                timeout=Timeouts.GIT_DIFF,
                 operation_name=f"diff_{commit.hexsha[:8]}",
             )
         except GitOperationTimeout:
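`get_diff_output` returns `git diff --numstat` text, which is one `added<TAB>deleted<TAB>path` record per line, with `-` in both count columns for binary files. A minimal parser for that format (a sketch, not the package's own parsing code, which this diff does not show):

```python
def parse_numstat(output: str) -> list[tuple[int, int, str]]:
    """Parse `git diff --numstat` output into (added, deleted, path) tuples."""
    stats = []
    for line in output.splitlines():
        if not line.strip():
            continue
        added, deleted, path = line.split("\t", 2)
        # Binary files report "-" for both counts; treat them as zero.
        stats.append(
            (int(added) if added != "-" else 0,
             int(deleted) if deleted != "-" else 0,
             path)
        )
    return stats
```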
@@ -2003,7 +2039,7 @@ class GitDataFetcher:
                 elapsed_time = time.time() - repo_info["start_time"]
 
                 try:
-                    result = future.result(timeout=
+                    result = future.result(timeout=Timeouts.SUBPROCESS_DEFAULT)
 
                     if result:
                         self.processing_stats["success"] += 1

@@ -2167,7 +2203,7 @@ class GitDataFetcher:
         jira_integration: Optional[JIRAIntegration] = None,
         start_date: Optional[datetime] = None,
         end_date: Optional[datetime] = None,
-        timeout_per_operation: int =
+        timeout_per_operation: int = Timeouts.DEFAULT_GIT_OPERATION,
     ) -> Optional[dict[str, Any]]:
         """Process a single repository with comprehensive timeout protection.
 
gitflow_analytics/core/git_timeout_wrapper.py

@@ -13,6 +13,8 @@ from contextlib import contextmanager
 from pathlib import Path
 from typing import Callable, Optional, TypeVar
 
+from ..constants import Timeouts
+
 logger = logging.getLogger(__name__)
 
 T = TypeVar("T")

@@ -27,7 +29,7 @@ class GitOperationTimeout(Exception):
 class GitTimeoutWrapper:
     """Wrapper for git operations with timeout protection."""
 
-    def __init__(self, default_timeout: int =
+    def __init__(self, default_timeout: int = Timeouts.DEFAULT_GIT_OPERATION):
         """Initialize the git timeout wrapper.
 
         Args:
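`GitTimeoutWrapper.run_with_timeout` is called throughout `data_fetcher.py`, but its body is unchanged in this release and so does not appear in the diff. One common way to build such a wrapper is to run the callable on a worker thread and enforce the deadline via `concurrent.futures`; a sketch of that general pattern, not the package's actual implementation:

```python
import concurrent.futures

class GitOperationTimeout(Exception):
    """Raised when a git operation exceeds its deadline."""

def run_with_timeout(fn, timeout: float, operation_name: str = "git op"):
    # Submit to a single-worker pool and wait up to `timeout` seconds.
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = pool.submit(fn)
        return future.result(timeout=timeout)
    except concurrent.futures.TimeoutError as e:
        # The worker thread cannot be force-killed; it is simply abandoned.
        raise GitOperationTimeout(
            f"{operation_name} exceeded {timeout}s"
        ) from e
    finally:
        pool.shutdown(wait=False)  # don't block on a hung worker
```

This is why subprocess-based git calls with real OS-level timeouts (see `SubprocessGit` below) are the safer fallback: a thread running a hung `iter_commits` can only be abandoned, not terminated.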
@@ -195,7 +197,7 @@ class GitTimeoutWrapper:
             logger.error(f"  Error details: {e.stderr}")
             raise
 
-    def fetch_with_timeout(self, repo_path: Path, timeout: int =
+    def fetch_with_timeout(self, repo_path: Path, timeout: int = Timeouts.GIT_FETCH) -> bool:
         """Fetch from remote with timeout protection.
 
         Args:

@@ -238,7 +240,7 @@ class GitTimeoutWrapper:
         logger.warning(f"Git fetch failed for {repo_path.name}: {e}")
         return False
 
-    def pull_with_timeout(self, repo_path: Path, timeout: int =
+    def pull_with_timeout(self, repo_path: Path, timeout: int = Timeouts.GIT_PULL) -> bool:
         """Pull from remote with timeout protection.
 
         Args:

@@ -309,7 +311,7 @@ class HeartbeatLogger:
         """Stop the heartbeat logging thread."""
         self._stop_event.set()
         if self._thread:
-            self._thread.join(timeout=
+            self._thread.join(timeout=Timeouts.THREAD_JOIN)
 
     def _heartbeat_loop(self):
         """Main heartbeat loop that logs current operations."""
gitflow_analytics/core/progress.py

@@ -31,7 +31,7 @@ import sys
 import threading
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any,
+from typing import Any, Optional
 
 from tqdm import tqdm
 

@@ -39,8 +39,6 @@ from tqdm import tqdm
 try:
     from ..ui.progress_display import (
         RICH_AVAILABLE,
-        RepositoryInfo,
-        RepositoryStatus,
         create_progress_display,
     )
 

@@ -109,7 +107,7 @@ class ProgressService:
 
         # Rich display components
         self._rich_display = None
-        self._repository_contexts:
+        self._repository_contexts: dict[str, Any] = {}
         self._use_rich = False
 
         # Initialize display based on configuration
gitflow_analytics/core/subprocess_git.py

@@ -105,7 +105,14 @@ class SubprocessGit:
 
     @staticmethod
     def check_remotes_safe(repo_path: Path) -> bool:
-        """Check if repository has remotes without triggering authentication.
+        """Check if repository has remotes without triggering authentication.
+
+        Args:
+            repo_path: Path to the git repository
+
+        Returns:
+            True if repository has remotes, False otherwise or on error
+        """
         cmd = ["git", "remote", "-v"]
 
         env = {
@@ -118,12 +125,26 @@
                 cmd, cwd=repo_path, capture_output=True, text=True, env=env, timeout=5
             )
             return bool(result.stdout.strip())
-        except:
+        except subprocess.TimeoutExpired:
+            logger.warning(f"Git remote check timed out for {repo_path}")
+            return False
+        except OSError as e:
+            logger.warning(f"Failed to check git remotes for {repo_path}: {e}")
+            return False
+        except Exception as e:
+            logger.error(f"Unexpected error checking git remotes for {repo_path}: {e}")
             return False
 
     @staticmethod
     def get_branches_safe(repo_path: Path) -> list[str]:
-        """Get list of branches without triggering authentication.
+        """Get list of branches without triggering authentication.
+
+        Args:
+            repo_path: Path to the git repository
+
+        Returns:
+            List of branch names, defaults to common branch names if detection fails
+        """
         branches = []
 
         # Get local branches

@@ -135,11 +156,16 @@
             if result.returncode == 0:
                 branches = [b.strip() for b in result.stdout.split("\n") if b.strip()]
 
-        except:
-
+        except subprocess.TimeoutExpired:
+            logger.warning(f"Git branch listing timed out for {repo_path}")
+        except OSError as e:
+            logger.warning(f"Failed to list git branches for {repo_path}: {e}")
+        except Exception as e:
+            logger.error(f"Unexpected error listing git branches for {repo_path}: {e}")
 
         # Default to common branch names if none found
         if not branches:
+            logger.info(f"No branches detected for {repo_path}, using default branch names")
             branches = ["main", "master", "develop"]
 
         return branches
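Both `SubprocessGit` helpers follow the recipe visible in the context lines: shell out to git with a short timeout and an environment that forbids interactive credential prompts. A standalone sketch of that recipe; only `GIT_TERMINAL_PROMPT` appears in the diff, so the SSH option and the merged environment are assumptions:

```python
import os
import subprocess
from pathlib import Path

def list_remotes(repo_path: Path) -> str:
    """Run a read-only git command that can never block on a credential prompt."""
    result = subprocess.run(
        ["git", "remote", "-v"],
        cwd=repo_path,
        capture_output=True,
        text=True,
        timeout=5,  # bound the call even if git stalls
        env={
            **os.environ,  # keep PATH etc.; the diff itself builds a minimal env
            "GIT_TERMINAL_PROMPT": "0",  # never ask for username/password
            "GIT_SSH_COMMAND": "ssh -oBatchMode=yes",  # assumption: fail fast over SSH
        },
    )
    return result.stdout
```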
gitflow_analytics/identity_llm/analysis_pass.py

@@ -150,23 +150,33 @@ class IdentityAnalysisPass:
         # Merge new mappings
         existing_emails = set()
         for mapping in existing_mappings:
-
+            # Support both canonical_email and primary_email for backward compatibility
+            email = mapping.get("canonical_email") or mapping.get("primary_email", "")
+            existing_emails.add(email.lower())
 
         for new_mapping in new_mappings:
-
+            # New mappings use primary_email
+            canonical_email = new_mapping["primary_email"].lower()
             if canonical_email not in existing_emails:
                 existing_mappings.append(new_mapping)
                 logger.info(f"Added identity mapping for: {canonical_email}")
             else:
                 # Update existing mapping with new aliases
                 for existing in existing_mappings:
-
+                    existing_email = existing.get("canonical_email") or existing.get(
+                        "primary_email", ""
+                    )
+                    if existing_email.lower() == canonical_email:
                         existing_aliases = set(
                             alias.lower() for alias in existing.get("aliases", [])
                         )
                         new_aliases = set(alias.lower() for alias in new_mapping["aliases"])
                         combined_aliases = existing_aliases | new_aliases
                         existing["aliases"] = list(combined_aliases)
+                        # Update confidence and reasoning if new mapping has higher confidence
+                        if new_mapping.get("confidence", 0) > existing.get("confidence", 0):
+                            existing["confidence"] = new_mapping.get("confidence")
+                            existing["reasoning"] = new_mapping.get("reasoning")
                         if new_aliases - existing_aliases:
                             logger.info(f"Updated aliases for: {canonical_email}")
                         break
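The backward-compatibility lookup above leans on Python's `or` falling through on any falsy value, so a mapping whose `canonical_email` key is present but empty still falls back to `primary_email`. A two-line check of that behavior (the dict contents are illustrative):

```python
mapping = {"canonical_email": "", "primary_email": "dev@example.com"}
email = mapping.get("canonical_email") or mapping.get("primary_email", "")
print(email)  # dev@example.com (empty string is falsy, so the fallback wins)
```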
gitflow_analytics/identity_llm/analyzer.py

@@ -20,9 +20,15 @@ class LLMIdentityAnalyzer:
         self,
         api_key: Optional[str] = None,
         model: str = "openai/gpt-4o-mini",
-        confidence_threshold: float = 0.
+        confidence_threshold: float = 0.9,
     ):
-        """Initialize the LLM identity analyzer.
+        """Initialize the LLM identity analyzer.
+
+        Args:
+            api_key: OpenRouter API key for LLM-based analysis
+            model: LLM model to use (default: openai/gpt-4o-mini)
+            confidence_threshold: Minimum confidence for identity matches (default: 0.9 = 90%)
+        """
         self.api_key = api_key
         self.model = model
         self.confidence_threshold = confidence_threshold

@@ -371,6 +377,12 @@ Respond with a JSON object:
 
             confidence = float(data.get("confidence", 0.8))
             if confidence < self.confidence_threshold:
+                # Log why this cluster was rejected
+                cluster_emails = [id.email for id in cluster_identities]
+                logger.info(
+                    f"Rejected identity cluster: {', '.join(cluster_emails)} "
+                    f"(confidence {confidence:.1%} < threshold {self.confidence_threshold:.1%})"
+                )
                 return None
 
             # Find canonical identity
gitflow_analytics/identity_llm/models.py

@@ -54,7 +54,10 @@ class IdentityAnalysisResult:
     analysis_metadata: dict[str, any] = field(default_factory=dict)
 
     def get_manual_mappings(self) -> list[dict[str, any]]:
-        """Convert to manual mappings format for config.
+        """Convert to manual mappings format for config.
+
+        Returns mappings with confidence scores and reasoning for display.
+        """
         mappings = []
         for cluster in self.clusters:
             if len(cluster.aliases) > 0:

@@ -64,6 +67,9 @@ class IdentityAnalysisResult:
                 mapping["name"] = cluster.preferred_display_name
                 mapping["primary_email"] = cluster.canonical_email
                 mapping["aliases"] = [alias.email for alias in cluster.aliases]
+                # Include confidence and reasoning for user review
+                mapping["confidence"] = cluster.confidence
+                mapping["reasoning"] = cluster.reasoning[:100]  # Truncate for readability
                 mappings.append(mapping)
         return mappings
 
gitflow_analytics/qualitative/classifiers/llm/openai_client.py

@@ -330,13 +330,15 @@ class OpenAIClassifier(BaseLLMClassifier):
             )
         except requests.exceptions.Timeout as e:
             logger.error(f"API request timed out after {self.config.timeout_seconds}s: {e}")
-            raise Exception(
+            raise Exception(
+                f"API request timed out after {self.config.timeout_seconds} seconds"
+            ) from e
         except requests.exceptions.ConnectionError as e:
             logger.error(f"Connection error during API request: {e}")
-            raise Exception(f"Connection error: Unable to reach API at {url}")
+            raise Exception(f"Connection error: Unable to reach API at {url}") from e
         except requests.exceptions.RequestException as e:
             logger.error(f"Request failed: {e}")
-            raise Exception(f"Request failed: {str(e)}")
+            raise Exception(f"Request failed: {str(e)}") from e
 
         # Check response
         if response.status_code != 200:
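Adding `from e` preserves the original exception as `__cause__`, so tracebacks read "The above exception was the direct cause of the following exception" instead of the misleading "During handling of the above exception, another exception occurred". A compact demonstration:

```python
def fetch():
    raise ConnectionError("socket closed")

try:
    try:
        fetch()
    except ConnectionError as e:
        raise Exception("Request failed") from e  # chains via __cause__
except Exception as wrapped:
    assert isinstance(wrapped.__cause__, ConnectionError)
    print(repr(wrapped.__cause__))  # ConnectionError('socket closed')
```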
gitflow_analytics/security/config.py

@@ -1,7 +1,7 @@
 """Security configuration module."""
 
 from dataclasses import dataclass, field
-from typing import
+from typing import Optional
 
 
 @dataclass

@@ -9,7 +9,7 @@ class SecretScanningConfig:
     """Configuration for secret detection."""
 
     enabled: bool = True
-    patterns:
+    patterns: dict[str, str] = field(
         default_factory=lambda: {
             # AWS
             "aws_access_key": r"AKIA[0-9A-Z]{16}",

@@ -36,7 +36,7 @@ class SecretScanningConfig:
         }
     )
     entropy_threshold: float = 4.5
-    exclude_paths:
+    exclude_paths: list[str] = field(
         default_factory=lambda: [
             "*.test.*",
             "*.spec.*",

@@ -58,7 +58,7 @@ class VulnerabilityScanningConfig:
 
     # Tool-specific configurations
     enable_semgrep: bool = True
-    semgrep_rules:
+    semgrep_rules: list[str] = field(
         default_factory=lambda: [
             "auto",  # Use Semgrep's auto configuration
             "p/security-audit",

@@ -74,7 +74,7 @@ class VulnerabilityScanningConfig:
     enable_brakeman: bool = False  # Ruby on Rails
 
     # Custom patterns for quick checks
-    vulnerability_patterns:
+    vulnerability_patterns: dict[str, str] = field(
         default_factory=lambda: {
             "sql_injection": r"(SELECT|DELETE|INSERT|UPDATE|DROP).*\+.*(?:request|params|input)",
             "command_injection": r"(exec|eval|system|popen|subprocess).*\+.*(?:request|params|input)",

@@ -160,7 +160,7 @@ class SecurityConfig:
     scan_timeout_seconds: int = 30
 
     @classmethod
-    def from_dict(cls, data:
+    def from_dict(cls, data: dict) -> "SecurityConfig":
         """Create SecurityConfig from dictionary."""
         if not data:
             return cls()
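The annotations added in `security/config.py` all route mutable defaults through `field(default_factory=...)`. That is not just style: `dataclasses` rejects a bare `dict`/`list`/`set` default outright, precisely to stop instances from sharing one mutable object. A minimal reproduction:

```python
from dataclasses import dataclass, field

# This raises at class-definition time:
#   ValueError: mutable default <class 'dict'> for field patterns
#   is not allowed: use default_factory
# @dataclass
# class Bad:
#     patterns: dict[str, str] = {}

@dataclass
class Good:
    patterns: dict[str, str] = field(default_factory=dict)

a, b = Good(), Good()
a.patterns["k"] = "v"
print(b.patterns)  # {} (each instance gets its own dict)
```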
gitflow_analytics/security/extractors/dependency_checker.py

@@ -4,7 +4,7 @@ import json
 import logging
 import re
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import toml
 

@@ -23,7 +23,7 @@ class DependencyChecker:
         self.config = config
         self.vulnerability_cache = {}
 
-    def check_files(self, files_changed:
+    def check_files(self, files_changed: list[str], repo_path: Path) -> list[dict]:
         """Check dependency files for vulnerable packages.
 
         Args:

@@ -68,7 +68,7 @@
         file_name = Path(file_path).name
         return file_name in dependency_files
 
-    def _check_dependency_file(self, file_path: Path, relative_path: str) ->
+    def _check_dependency_file(self, file_path: Path, relative_path: str) -> list[dict]:
         """Check a specific dependency file for vulnerabilities."""
         findings = []
         file_name = file_path.name

@@ -99,7 +99,7 @@
 
         return findings
 
-    def _parse_package_json(self, file_path: Path) ->
+    def _parse_package_json(self, file_path: Path) -> dict[str, str]:
         """Parse package.json for dependencies."""
         dependencies = {}
 

@@ -120,7 +120,7 @@
 
         return dependencies
 
-    def _parse_requirements_txt(self, file_path: Path) ->
+    def _parse_requirements_txt(self, file_path: Path) -> dict[str, str]:
         """Parse requirements.txt for Python packages."""
         dependencies = {}
 

@@ -143,7 +143,7 @@
 
         return dependencies
 
-    def _parse_pyproject_toml(self, file_path: Path) ->
+    def _parse_pyproject_toml(self, file_path: Path) -> dict[str, str]:
         """Parse pyproject.toml for Python dependencies."""
         dependencies = {}
 

@@ -183,7 +183,7 @@
 
         return dependencies
 
-    def _parse_go_mod(self, file_path: Path) ->
+    def _parse_go_mod(self, file_path: Path) -> dict[str, str]:
         """Parse go.mod for Go dependencies."""
         dependencies = {}
 

@@ -211,7 +211,7 @@
 
         return dependencies
 
-    def _parse_gemfile(self, file_path: Path) ->
+    def _parse_gemfile(self, file_path: Path) -> dict[str, str]:
         """Parse Gemfile for Ruby dependencies."""
         dependencies = {}
 

@@ -232,7 +232,7 @@
 
         return dependencies
 
-    def _check_npm_dependencies(self, dependencies:
+    def _check_npm_dependencies(self, dependencies: dict[str, str], file_path: str) -> list[dict]:
         """Check NPM packages for vulnerabilities using GitHub Advisory Database."""
         findings = []
 

@@ -259,8 +259,8 @@
         return findings
 
     def _check_python_dependencies(
-        self, dependencies:
-    ) ->
+        self, dependencies: dict[str, str], file_path: str
+    ) -> list[dict]:
         """Check Python packages for vulnerabilities."""
         findings = []
 

@@ -286,7 +286,7 @@
 
         return findings
 
-    def _check_go_dependencies(self, dependencies:
+    def _check_go_dependencies(self, dependencies: dict[str, str], file_path: str) -> list[dict]:
         """Check Go modules for vulnerabilities."""
         findings = []
 

@@ -312,7 +312,7 @@
 
         return findings
 
-    def _check_ruby_dependencies(self, dependencies:
+    def _check_ruby_dependencies(self, dependencies: dict[str, str], file_path: str) -> list[dict]:
         """Check Ruby gems for vulnerabilities."""
         findings = []
 

@@ -338,7 +338,7 @@
 
     def _query_vulnerability_db(
         self, ecosystem: str, package: str, package_version: str
-    ) ->
+    ) -> list[dict]:
         """Query vulnerability database for package vulnerabilities.
 
         This is a simplified implementation. In production, you would: