gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gitflow_analytics/_version.py +1 -1
- gitflow_analytics/classification/__init__.py +31 -0
- gitflow_analytics/classification/batch_classifier.py +752 -0
- gitflow_analytics/classification/classifier.py +464 -0
- gitflow_analytics/classification/feature_extractor.py +725 -0
- gitflow_analytics/classification/linguist_analyzer.py +574 -0
- gitflow_analytics/classification/model.py +455 -0
- gitflow_analytics/cli.py +4158 -350
- gitflow_analytics/cli_rich.py +198 -48
- gitflow_analytics/config/__init__.py +43 -0
- gitflow_analytics/config/errors.py +261 -0
- gitflow_analytics/config/loader.py +905 -0
- gitflow_analytics/config/profiles.py +264 -0
- gitflow_analytics/config/repository.py +124 -0
- gitflow_analytics/config/schema.py +444 -0
- gitflow_analytics/config/validator.py +154 -0
- gitflow_analytics/config.py +44 -508
- gitflow_analytics/core/analyzer.py +1209 -98
- gitflow_analytics/core/cache.py +1337 -29
- gitflow_analytics/core/data_fetcher.py +1285 -0
- gitflow_analytics/core/identity.py +363 -14
- gitflow_analytics/core/metrics_storage.py +526 -0
- gitflow_analytics/core/progress.py +372 -0
- gitflow_analytics/core/schema_version.py +269 -0
- gitflow_analytics/extractors/ml_tickets.py +1100 -0
- gitflow_analytics/extractors/story_points.py +8 -1
- gitflow_analytics/extractors/tickets.py +749 -11
- gitflow_analytics/identity_llm/__init__.py +6 -0
- gitflow_analytics/identity_llm/analysis_pass.py +231 -0
- gitflow_analytics/identity_llm/analyzer.py +464 -0
- gitflow_analytics/identity_llm/models.py +76 -0
- gitflow_analytics/integrations/github_integration.py +175 -11
- gitflow_analytics/integrations/jira_integration.py +461 -24
- gitflow_analytics/integrations/orchestrator.py +124 -1
- gitflow_analytics/metrics/activity_scoring.py +322 -0
- gitflow_analytics/metrics/branch_health.py +470 -0
- gitflow_analytics/metrics/dora.py +379 -20
- gitflow_analytics/models/database.py +843 -53
- gitflow_analytics/pm_framework/__init__.py +115 -0
- gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
- gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
- gitflow_analytics/pm_framework/base.py +406 -0
- gitflow_analytics/pm_framework/models.py +211 -0
- gitflow_analytics/pm_framework/orchestrator.py +652 -0
- gitflow_analytics/pm_framework/registry.py +333 -0
- gitflow_analytics/qualitative/__init__.py +9 -10
- gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
- gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
- gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
- gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
- gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
- gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
- gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
- gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
- gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
- gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
- gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
- gitflow_analytics/qualitative/core/__init__.py +4 -4
- gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
- gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
- gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
- gitflow_analytics/qualitative/core/processor.py +381 -248
- gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
- gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
- gitflow_analytics/qualitative/models/__init__.py +7 -7
- gitflow_analytics/qualitative/models/schemas.py +155 -121
- gitflow_analytics/qualitative/utils/__init__.py +4 -4
- gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
- gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
- gitflow_analytics/qualitative/utils/metrics.py +172 -158
- gitflow_analytics/qualitative/utils/text_processing.py +146 -104
- gitflow_analytics/reports/__init__.py +100 -0
- gitflow_analytics/reports/analytics_writer.py +539 -14
- gitflow_analytics/reports/base.py +648 -0
- gitflow_analytics/reports/branch_health_writer.py +322 -0
- gitflow_analytics/reports/classification_writer.py +924 -0
- gitflow_analytics/reports/cli_integration.py +427 -0
- gitflow_analytics/reports/csv_writer.py +1676 -212
- gitflow_analytics/reports/data_models.py +504 -0
- gitflow_analytics/reports/database_report_generator.py +427 -0
- gitflow_analytics/reports/example_usage.py +344 -0
- gitflow_analytics/reports/factory.py +499 -0
- gitflow_analytics/reports/formatters.py +698 -0
- gitflow_analytics/reports/html_generator.py +1116 -0
- gitflow_analytics/reports/interfaces.py +489 -0
- gitflow_analytics/reports/json_exporter.py +2770 -0
- gitflow_analytics/reports/narrative_writer.py +2287 -158
- gitflow_analytics/reports/story_point_correlation.py +1144 -0
- gitflow_analytics/reports/weekly_trends_writer.py +389 -0
- gitflow_analytics/training/__init__.py +5 -0
- gitflow_analytics/training/model_loader.py +377 -0
- gitflow_analytics/training/pipeline.py +550 -0
- gitflow_analytics/tui/__init__.py +1 -1
- gitflow_analytics/tui/app.py +129 -126
- gitflow_analytics/tui/screens/__init__.py +3 -3
- gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
- gitflow_analytics/tui/screens/configuration_screen.py +154 -178
- gitflow_analytics/tui/screens/loading_screen.py +100 -110
- gitflow_analytics/tui/screens/main_screen.py +89 -72
- gitflow_analytics/tui/screens/results_screen.py +305 -281
- gitflow_analytics/tui/widgets/__init__.py +2 -2
- gitflow_analytics/tui/widgets/data_table.py +67 -69
- gitflow_analytics/tui/widgets/export_modal.py +76 -76
- gitflow_analytics/tui/widgets/progress_widget.py +41 -46
- gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
- gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
- gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
"""Feature extraction for commit classification.
|
|
2
|
+
|
|
3
|
+
This module extracts 68-dimensional feature vectors from git commits for machine learning
|
|
4
|
+
classification. Features include keyword analysis, file patterns, commit statistics,
|
|
5
|
+
temporal patterns, and author information.
|
|
6
|
+
|
|
7
|
+
The feature vector is designed to capture comprehensive information about commits
|
|
8
|
+
while maintaining computational efficiency and interpretability.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import re
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import Any, Optional
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
from .linguist_analyzer import LinguistAnalyzer
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class FeatureExtractor:
    """Extracts 68-dimensional feature vectors from git commits.

    The feature extraction process creates a comprehensive representation of each commit
    by analyzing multiple aspects:

    - Keyword features (20 dimensions): Semantic analysis of commit messages
    - File-based features (20 dimensions): Programming languages and activities
    - Commit statistics (15 dimensions): Size, complexity, and change metrics
    - Temporal features (8 dimensions): Time-based patterns and trends
    - Author features (5 dimensions): Developer behavior and collaboration patterns

    This design balances comprehensiveness with computational efficiency, allowing
    for accurate classification while maintaining fast processing speeds.
    """

    def __init__(self):
        """Initialize the feature extractor with analysis components."""
        self.linguist = LinguistAnalyzer()

        # Keyword categories for semantic analysis (20 dimensions).
        # One feature slot per category, in insertion order; the order here
        # must stay stable because it defines the feature-vector layout.
        self.keyword_categories = {
            "feature_keywords": [
                "add",
                "implement",
                "create",
                "build",
                "introduce",
                "develop",
                "feature",
                "new",
                "functionality",
                "capability",
                "enhancement",
            ],
            "bugfix_keywords": [
                "fix",
                "bug",
                "issue",
                "resolve",
                "correct",
                "repair",
                "patch",
                "error",
                "problem",
                "defect",
                "broken",
                "wrong",
                "crash",
            ],
            "refactor_keywords": [
                "refactor",
                "restructure",
                "cleanup",
                "optimize",
                "improve",
                "simplify",
                "reorganize",
                "consolidate",
                "streamline",
            ],
            "docs_keywords": [
                "doc",
                "docs",
                "documentation",
                "readme",
                "comment",
                "explain",
                "guide",
                "tutorial",
                "example",
                "specification",
                "manual",
            ],
            "test_keywords": [
                "test",
                "testing",
                "spec",
                "unit",
                "integration",
                "e2e",
                "coverage",
                "mock",
                "stub",
                "fixture",
                "assert",
            ],
            "config_keywords": [
                "config",
                "configuration",
                "setting",
                "environment",
                "setup",
                "property",
                "parameter",
                "option",
                "flag",
                "variable",
            ],
            "security_keywords": [
                "security",
                "secure",
                "auth",
                "authentication",
                "authorization",
                "permission",
                "vulnerability",
                "exploit",
                "sanitize",
                "validate",
            ],
            "performance_keywords": [
                "performance",
                "optimize",
                "fast",
                "slow",
                "cache",
                "memory",
                "cpu",
                "speed",
                "efficient",
                "bottleneck",
                "profile",
            ],
            "ui_keywords": [
                "ui",
                "interface",
                "frontend",
                "design",
                "layout",
                "style",
                "component",
                "widget",
                "view",
                "screen",
                "page",
            ],
            "api_keywords": [
                "api",
                "endpoint",
                "service",
                "backend",
                "server",
                "client",
                "request",
                "response",
                "http",
                "rest",
                "graphql",
            ],
            "database_keywords": [
                "database",
                "db",
                "sql",
                "query",
                "table",
                "schema",
                "migration",
                "model",
                "data",
                "repository",
            ],
            "deployment_keywords": [
                "deploy",
                "deployment",
                "release",
                "build",
                "ci",
                "cd",
                "docker",
                "kubernetes",
                "infrastructure",
                "production",
            ],
            "dependency_keywords": [
                "dependency",
                "package",
                "library",
                "module",
                "import",
                "require",
                "install",
                "update",
                "upgrade",
                "version",
            ],
            "maintenance_keywords": [
                "maintenance",
                "cleanup",
                "housekeeping",
                "chore",
                "routine",
                "update",
                "bump",
                "remove",
                "delete",
                "deprecated",
            ],
            "hotfix_keywords": [
                "hotfix",
                "urgent",
                "critical",
                "emergency",
                "immediate",
                "asap",
                "production",
                "live",
                "quick",
                "temporary",
            ],
            "merge_keywords": [
                "merge",
                "cherry-pick",
                "rebase",
                "conflict",
                "branch",
                "pull",
                "request",
                "pr",
                "integration",
                "combine",
            ],
            "revert_keywords": [
                "revert",
                "rollback",
                "undo",
                "back",
                "restore",
                "reset",
                "previous",
                "original",
                "cancel",
                "abort",
            ],
            "wip_keywords": [
                "wip",
                "progress",
                "partial",
                "incomplete",
                "draft",
                "temporary",
                "placeholder",
                "todo",
                "fixme",
                "hack",
            ],
            "breaking_keywords": [
                "breaking",
                "break",
                "incompatible",
                "major",
                "change",
                "migration",
                "upgrade",
                "deprecated",
                "removed",
                "api",
            ],
            "experimental_keywords": [
                "experimental",
                "prototype",
                "poc",
                "spike",
                "trial",
                "test",
                "experiment",
                "explore",
                "research",
                "investigate",
            ],
        }

        # Compile regex patterns once up front for efficiency.
        self._compile_keyword_patterns()

    def _compile_keyword_patterns(self) -> None:
        """Compile one combined, case-insensitive regex per keyword category.

        Word-boundary anchors (``\\b``) prevent substring false positives
        (e.g. "add" matching inside "address").
        """
        self.compiled_keyword_patterns = {}
        for category, keywords in self.keyword_categories.items():
            # Create word boundary patterns for precise matching
            patterns = [rf"\b{re.escape(keyword)}\b" for keyword in keywords]
            combined_pattern = "|".join(patterns)
            self.compiled_keyword_patterns[category] = re.compile(combined_pattern, re.IGNORECASE)

    def extract_features(
        self, commit_data: dict[str, Any], author_stats: Optional[dict[str, Any]] = None
    ) -> np.ndarray:
        """Extract 68-dimensional feature vector from commit data.

        Args:
            commit_data: Dictionary containing commit information:
                - hash: Commit hash
                - message: Commit message
                - author_name: Author name
                - author_email: Author email
                - timestamp: Commit timestamp (datetime)
                - files_changed: List of changed file paths
                - insertions: Number of lines added
                - deletions: Number of lines deleted
            author_stats: Optional dictionary with author statistics:
                - total_commits: Total commits by this author
                - avg_commit_size: Average commit size for this author
                - languages_used: Set of languages this author typically uses

        Returns:
            68-dimensional numpy array with extracted features
        """
        features = np.zeros(68, dtype=np.float32)

        # Use .get() with defaults so a sparse commit dict (e.g. from a cache
        # miss) degrades to zero-valued features instead of raising KeyError.
        keyword_features = self._extract_keyword_features(commit_data.get("message", ""))

        # Handle files_changed being either a list or an integer
        files_changed = commit_data.get("files_changed", [])
        if isinstance(files_changed, int):
            # If it's an integer, we can't extract file features, use empty list
            files_changed = []

        file_features = self._extract_file_features(files_changed)
        stats_features = self._extract_stats_features(commit_data)
        temporal_features = self._extract_temporal_features(commit_data.get("timestamp"))
        author_features = self._extract_author_features(commit_data, author_stats)

        # Combine all features into single vector
        idx = 0

        # Keyword features (20 dimensions)
        features[idx : idx + 20] = keyword_features
        idx += 20

        # File-based features (20 dimensions)
        features[idx : idx + 20] = file_features
        idx += 20

        # Commit statistics (15 dimensions)
        features[idx : idx + 15] = stats_features
        idx += 15

        # Temporal features (8 dimensions)
        features[idx : idx + 8] = temporal_features
        idx += 8

        # Author features (5 dimensions)
        features[idx : idx + 5] = author_features

        return features

    def _extract_keyword_features(self, message: str) -> np.ndarray:
        """Extract keyword-based features from commit message.

        Each feature is the category's match count normalized by message word
        count (capped at 1.0), so long and short messages are comparable.

        Args:
            message: Commit message text

        Returns:
            20-dimensional array with keyword features
        """
        features = np.zeros(20, dtype=np.float32)

        if not message:
            return features

        # Normalize message for consistent analysis
        normalized_message = message.lower().strip()
        message_length = len(normalized_message.split())

        # Extract features for each keyword category
        for i, (_category, pattern) in enumerate(self.compiled_keyword_patterns.items()):
            if i >= 20:
                # Guard against a subclass registering extra categories:
                # the vector layout only has 20 keyword slots.
                break
            matches = pattern.findall(normalized_message)
            match_count = len(matches)

            # Normalize by message length to handle varying message sizes
            if message_length > 0:
                features[i] = min(1.0, match_count / message_length)
            else:
                features[i] = 0.0

        return features

    def _extract_file_features(self, file_paths: list[str]) -> np.ndarray:
        """Extract file-based features using linguist analysis.

        Args:
            file_paths: List of changed file paths

        Returns:
            20-dimensional array with file-based features
        """
        features = np.zeros(20, dtype=np.float32)

        if not file_paths:
            return features

        # Get linguist analysis (languages/activities are Counter-like;
        # assumes file_count > 0 for non-empty input -- provided by
        # LinguistAnalyzer, verify against its contract).
        analysis = self.linguist.analyze_commit_files(file_paths)

        # Feature 0-4: Language distribution (top 5 languages)
        top_languages = analysis["languages"].most_common(5)
        for i, (_lang, count) in enumerate(top_languages):
            features[i] = count / analysis["file_count"]

        # Feature 5-9: Activity distribution (top 5 activities)
        top_activities = analysis["activities"].most_common(5)
        for i, (_activity, count) in enumerate(top_activities):
            features[5 + i] = count / len(file_paths)  # Activities can overlap

        # Feature 10: Language diversity (normalized)
        features[10] = min(1.0, analysis["language_diversity"] / 5.0)

        # Feature 11: Activity diversity (normalized)
        features[11] = min(1.0, analysis["activity_diversity"] / 5.0)

        # Feature 12: Generated file ratio
        features[12] = analysis["generated_ratio"]

        # Feature 13: Is multilingual
        features[13] = 1.0 if analysis["is_multilingual"] else 0.0

        # Feature 14: Is cross-functional
        features[14] = 1.0 if analysis["is_cross_functional"] else 0.0

        # Feature 15-19: File type patterns
        common_extensions = [".py", ".js", ".java", ".go", ".sql"]
        for i, ext in enumerate(common_extensions):
            if ext in analysis["file_types"]:
                features[15 + i] = analysis["file_types"][ext] / analysis["file_count"]

        return features

    def _extract_stats_features(self, commit_data: dict[str, Any]) -> np.ndarray:
        """Extract statistical features from commit data.

        Size-related features are log-scaled and capped at 1.0 so extreme
        commits do not dominate the feature space.

        Args:
            commit_data: Commit data dictionary

        Returns:
            15-dimensional array with statistical features
        """
        features = np.zeros(15, dtype=np.float32)

        files_changed = len(commit_data.get("files_changed", []))
        insertions = commit_data.get("insertions", 0)
        deletions = commit_data.get("deletions", 0)
        message = commit_data.get("message", "")

        # Feature 0: Number of files changed (log-scaled)
        features[0] = min(1.0, np.log1p(files_changed) / np.log1p(100))

        # Feature 1: Lines inserted (log-scaled)
        features[1] = min(1.0, np.log1p(insertions) / np.log1p(1000))

        # Feature 2: Lines deleted (log-scaled)
        features[2] = min(1.0, np.log1p(deletions) / np.log1p(1000))

        # Feature 3: Total lines changed (log-scaled)
        total_lines = insertions + deletions
        features[3] = min(1.0, np.log1p(total_lines) / np.log1p(2000))

        # Feature 4: Insert/delete ratio
        if total_lines > 0:
            features[4] = insertions / total_lines

        # Feature 5: Commit message length (normalized)
        features[5] = min(1.0, len(message) / 200.0)

        # Feature 6: Message word count (normalized)
        word_count = len(message.split())
        features[6] = min(1.0, word_count / 50.0)

        # Feature 7: Message lines count (normalized)
        line_count = len(message.split("\n"))
        features[7] = min(1.0, line_count / 10.0)

        # Feature 8: Average lines per file
        if files_changed > 0:
            features[8] = min(1.0, total_lines / files_changed / 100.0)

        # Feature 9: Has conventional commit format.
        # BUGFIX: the scope group previously used doubled backslashes inside a
        # raw string (r"\\(.+\\)"), which matched a literal backslash character
        # instead of parentheses, so scoped commits like "feat(api): ..." were
        # never recognized. Single backslashes are correct in a raw string.
        conventional_pattern = (
            r"^(feat|fix|docs|style|refactor|test|chore|perf|ci|build|revert)(\(.+\))?: .+"
        )
        features[9] = 1.0 if re.match(conventional_pattern, message.strip()) else 0.0

        # Feature 10: Contains ticket reference.
        # BUGFIX: same doubled-backslash issue (r"#\\d+" matched "#\d..."
        # literally); real references like "#123" or "ABC-123" never matched.
        ticket_pattern = r"(#\d+|[A-Z]+-\d+|JIRA-\d+|CU-\d+)"
        features[10] = 1.0 if re.search(ticket_pattern, message) else 0.0

        # Feature 11: Is merge commit
        features[11] = 1.0 if message.lower().startswith("merge") else 0.0

        # Feature 12: Contains code in message (backticks or brackets)
        code_pattern = r"(`[^`]+`|\[[^\]]+\]|\{[^}]+\})"
        features[12] = 1.0 if re.search(code_pattern, message) else 0.0

        # Feature 13: Message complexity (punctuation diversity)
        punctuation = set(char for char in message if not char.isalnum() and not char.isspace())
        features[13] = min(1.0, len(punctuation) / 10.0)

        # Feature 14: Large commit indicator
        is_large = files_changed > 10 or total_lines > 500
        features[14] = 1.0 if is_large else 0.0

        return features

    def _extract_temporal_features(self, timestamp: Optional[datetime]) -> np.ndarray:
        """Extract temporal features from commit timestamp.

        Args:
            timestamp: Commit timestamp (naive timestamps are assumed UTC);
                a falsy value yields an all-zero vector.

        Returns:
            8-dimensional array with temporal features
        """
        features = np.zeros(8, dtype=np.float32)

        if not timestamp:
            return features

        # Ensure timezone awareness; naive timestamps are treated as UTC.
        if timestamp.tzinfo is None:
            timestamp = timestamp.replace(tzinfo=timezone.utc)

        # Feature 0: Hour of day (normalized)
        features[0] = timestamp.hour / 24.0

        # Feature 1: Day of week (0=Monday, 6=Sunday, normalized)
        features[1] = timestamp.weekday() / 6.0

        # Feature 2: Day of month (normalized)
        features[2] = (timestamp.day - 1) / 30.0  # 0-based, max ~30 days

        # Feature 3: Month of year (normalized)
        features[3] = (timestamp.month - 1) / 11.0  # 0-based, 12 months

        # Feature 4: Is weekend
        features[4] = 1.0 if timestamp.weekday() >= 5 else 0.0

        # Feature 5: Is business hours (9 AM - 5 PM)
        features[5] = 1.0 if 9 <= timestamp.hour < 17 else 0.0

        # Feature 6: Is late night (10 PM - 6 AM)
        features[6] = 1.0 if timestamp.hour >= 22 or timestamp.hour < 6 else 0.0

        # Feature 7: Quarter of year (normalized)
        quarter = (timestamp.month - 1) // 3
        features[7] = quarter / 3.0

        return features

    def _extract_author_features(
        self, commit_data: dict[str, Any], author_stats: Optional[dict[str, Any]] = None
    ) -> np.ndarray:
        """Extract author-based features.

        Args:
            commit_data: Commit data dictionary
            author_stats: Optional author statistics

        Returns:
            5-dimensional array with author features (note: feature 4 is a
            size ratio deliberately capped at 2.0, not 1.0)
        """
        features = np.zeros(5, dtype=np.float32)

        author_name = commit_data.get("author_name", "")
        author_email = commit_data.get("author_email", "")

        # Feature 0: Author name length (normalized)
        features[0] = min(1.0, len(author_name) / 50.0)

        # Feature 1: Has corporate email (GitHub noreply addresses excluded,
        # since they end in ".com" but carry no organizational signal)
        corporate_domains = [".com", ".org", ".net", ".io", ".co"]
        has_corporate = any(domain in author_email.lower() for domain in corporate_domains)
        is_github_noreply = "noreply.github.com" in author_email.lower()
        features[1] = 1.0 if has_corporate and not is_github_noreply else 0.0

        # Feature 2: Is likely automated (bot/CI)
        automated_indicators = ["bot", "ci", "github-actions", "dependabot", "renovate"]
        is_automated = any(
            indicator in author_name.lower() or indicator in author_email.lower()
            for indicator in automated_indicators
        )
        features[2] = 1.0 if is_automated else 0.0

        # Features 3-4: Author statistics (if available)
        if author_stats:
            # Feature 3: Author experience (normalized commit count)
            total_commits = author_stats.get("total_commits", 1)
            features[3] = min(1.0, np.log1p(total_commits) / np.log1p(1000))

            # Feature 4: Typical commit size compared to this commit
            avg_size = author_stats.get("avg_commit_size", 0)
            current_size = commit_data.get("insertions", 0) + commit_data.get("deletions", 0)
            if avg_size > 0:
                features[4] = min(2.0, current_size / avg_size)  # Ratio, capped at 2x

        return features

    def get_feature_names(self) -> list[str]:
        """Get human-readable names for all 68 features.

        Returns:
            List of feature names corresponding to the feature vector indices
        """
        names = []

        # Keyword features (20) -- one per category, in insertion order
        for category in self.keyword_categories:
            names.append(f"keyword_{category}")

        # File features (20)
        file_feature_names = [
            "lang_1st",
            "lang_2nd",
            "lang_3rd",
            "lang_4th",
            "lang_5th",
            "activity_1st",
            "activity_2nd",
            "activity_3rd",
            "activity_4th",
            "activity_5th",
            "lang_diversity",
            "activity_diversity",
            "generated_ratio",
            "is_multilingual",
            "is_cross_functional",
            "ext_py",
            "ext_js",
            "ext_java",
            "ext_go",
            "ext_sql",
        ]
        names.extend(file_feature_names)

        # Statistics features (15)
        stats_feature_names = [
            "files_changed",
            "insertions",
            "deletions",
            "total_lines",
            "insert_delete_ratio",
            "message_length",
            "word_count",
            "line_count",
            "avg_lines_per_file",
            "has_conventional_format",
            "has_ticket_ref",
            "is_merge",
            "has_code_in_msg",
            "message_complexity",
            "is_large_commit",
        ]
        names.extend(stats_feature_names)

        # Temporal features (8)
        temporal_feature_names = [
            "hour_of_day",
            "day_of_week",
            "day_of_month",
            "month_of_year",
            "is_weekend",
            "is_business_hours",
            "is_late_night",
            "quarter",
        ]
        names.extend(temporal_feature_names)

        # Author features (5)
        author_feature_names = [
            "author_name_length",
            "has_corporate_email",
            "is_automated",
            "author_experience",
            "commit_size_vs_typical",
        ]
        names.extend(author_feature_names)

        return names

    def extract_batch_features(
        self,
        commit_batch: list[dict[str, Any]],
        author_stats_batch: Optional[list[dict[str, Any]]] = None,
    ) -> np.ndarray:
        """Extract features for a batch of commits efficiently.

        Args:
            commit_batch: List of commit data dictionaries
            author_stats_batch: Optional list of author statistics (aligned by
                index with commit_batch; a shorter list means missing stats
                for trailing commits)

        Returns:
            2D numpy array of shape (n_commits, 68) with feature vectors
        """
        n_commits = len(commit_batch)
        features = np.zeros((n_commits, 68), dtype=np.float32)

        for i, commit_data in enumerate(commit_batch):
            author_stats = None
            if author_stats_batch and i < len(author_stats_batch):
                author_stats = author_stats_batch[i]

            features[i] = self.extract_features(commit_data, author_stats)

        return features