gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4158 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +905 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +444 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1285 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.11.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.11.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,725 @@
1
+ """Feature extraction for commit classification.
2
+
3
+ This module extracts 68-dimensional feature vectors from git commits for machine learning
4
+ classification. Features include keyword analysis, file patterns, commit statistics,
5
+ temporal patterns, and author information.
6
+
7
+ The feature vector is designed to capture comprehensive information about commits
8
+ while maintaining computational efficiency and interpretability.
9
+ """
10
+
11
+ import logging
12
+ import re
13
+ from datetime import datetime, timezone
14
+ from typing import Any, Optional
15
+
16
+ import numpy as np
17
+
18
+ from .linguist_analyzer import LinguistAnalyzer
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class FeatureExtractor:
    """Turn git commits into 68-dimensional numeric feature vectors.

    Each vector concatenates five groups of signals:

    - keyword features (20): semantic hits in the commit message
    - file-based features (20): languages and activities of the touched files
    - commit statistics (15): size, shape and formatting of the change
    - temporal features (8): when the commit happened
    - author features (5): who made it and how typical it is for them

    The layout trades a little expressiveness for fast extraction and for
    features that remain individually interpretable.
    """

    def __init__(self) -> None:
        """Set up the linguist helper, keyword tables and compiled patterns."""
        self.linguist = LinguistAnalyzer()

        # Ordered mapping of semantic categories to their trigger words.
        # Insertion order matters: it fixes which of the 20 keyword feature
        # slots each category occupies (see _extract_keyword_features).
        self.keyword_categories = {
            "feature_keywords": [
                "add", "implement", "create", "build", "introduce", "develop",
                "feature", "new", "functionality", "capability", "enhancement",
            ],
            "bugfix_keywords": [
                "fix", "bug", "issue", "resolve", "correct", "repair", "patch",
                "error", "problem", "defect", "broken", "wrong", "crash",
            ],
            "refactor_keywords": [
                "refactor", "restructure", "cleanup", "optimize", "improve",
                "simplify", "reorganize", "consolidate", "streamline",
            ],
            "docs_keywords": [
                "doc", "docs", "documentation", "readme", "comment", "explain",
                "guide", "tutorial", "example", "specification", "manual",
            ],
            "test_keywords": [
                "test", "testing", "spec", "unit", "integration", "e2e",
                "coverage", "mock", "stub", "fixture", "assert",
            ],
            "config_keywords": [
                "config", "configuration", "setting", "environment", "setup",
                "property", "parameter", "option", "flag", "variable",
            ],
            "security_keywords": [
                "security", "secure", "auth", "authentication", "authorization",
                "permission", "vulnerability", "exploit", "sanitize", "validate",
            ],
            "performance_keywords": [
                "performance", "optimize", "fast", "slow", "cache", "memory",
                "cpu", "speed", "efficient", "bottleneck", "profile",
            ],
            "ui_keywords": [
                "ui", "interface", "frontend", "design", "layout", "style",
                "component", "widget", "view", "screen", "page",
            ],
            "api_keywords": [
                "api", "endpoint", "service", "backend", "server", "client",
                "request", "response", "http", "rest", "graphql",
            ],
            "database_keywords": [
                "database", "db", "sql", "query", "table", "schema",
                "migration", "model", "data", "repository",
            ],
            "deployment_keywords": [
                "deploy", "deployment", "release", "build", "ci", "cd",
                "docker", "kubernetes", "infrastructure", "production",
            ],
            "dependency_keywords": [
                "dependency", "package", "library", "module", "import",
                "require", "install", "update", "upgrade", "version",
            ],
            "maintenance_keywords": [
                "maintenance", "cleanup", "housekeeping", "chore", "routine",
                "update", "bump", "remove", "delete", "deprecated",
            ],
            "hotfix_keywords": [
                "hotfix", "urgent", "critical", "emergency", "immediate",
                "asap", "production", "live", "quick", "temporary",
            ],
            "merge_keywords": [
                "merge", "cherry-pick", "rebase", "conflict", "branch", "pull",
                "request", "pr", "integration", "combine",
            ],
            "revert_keywords": [
                "revert", "rollback", "undo", "back", "restore", "reset",
                "previous", "original", "cancel", "abort",
            ],
            "wip_keywords": [
                "wip", "progress", "partial", "incomplete", "draft",
                "temporary", "placeholder", "todo", "fixme", "hack",
            ],
            "breaking_keywords": [
                "breaking", "break", "incompatible", "major", "change",
                "migration", "upgrade", "deprecated", "removed", "api",
            ],
            "experimental_keywords": [
                "experimental", "prototype", "poc", "spike", "trial", "test",
                "experiment", "explore", "research", "investigate",
            ],
        }

        # Pre-compile one regex per category so message scanning stays cheap.
        self._compile_keyword_patterns()
297
+
298
+ def _compile_keyword_patterns(self) -> None:
299
+ """Compile keyword patterns for efficient matching."""
300
+ self.compiled_keyword_patterns = {}
301
+ for category, keywords in self.keyword_categories.items():
302
+ # Create word boundary patterns for precise matching
303
+ patterns = [rf"\b{re.escape(keyword)}\b" for keyword in keywords]
304
+ combined_pattern = "|".join(patterns)
305
+ self.compiled_keyword_patterns[category] = re.compile(combined_pattern, re.IGNORECASE)
306
+
307
+ def extract_features(
308
+ self, commit_data: dict[str, Any], author_stats: Optional[dict[str, Any]] = None
309
+ ) -> np.ndarray:
310
+ """Extract 68-dimensional feature vector from commit data.
311
+
312
+ Args:
313
+ commit_data: Dictionary containing commit information:
314
+ - hash: Commit hash
315
+ - message: Commit message
316
+ - author_name: Author name
317
+ - author_email: Author email
318
+ - timestamp: Commit timestamp (datetime)
319
+ - files_changed: List of changed file paths
320
+ - insertions: Number of lines added
321
+ - deletions: Number of lines deleted
322
+ author_stats: Optional dictionary with author statistics:
323
+ - total_commits: Total commits by this author
324
+ - avg_commit_size: Average commit size for this author
325
+ - languages_used: Set of languages this author typically uses
326
+
327
+ Returns:
328
+ 68-dimensional numpy array with extracted features
329
+ """
330
+ features = np.zeros(68, dtype=np.float32)
331
+
332
+ # Extract different feature categories
333
+ keyword_features = self._extract_keyword_features(commit_data["message"])
334
+
335
+ # Handle files_changed being either a list or an integer
336
+ files_changed = commit_data.get("files_changed", [])
337
+ if isinstance(files_changed, int):
338
+ # If it's an integer, we can't extract file features, use empty list
339
+ files_changed = []
340
+
341
+ file_features = self._extract_file_features(files_changed)
342
+ stats_features = self._extract_stats_features(commit_data)
343
+ temporal_features = self._extract_temporal_features(commit_data["timestamp"])
344
+ author_features = self._extract_author_features(commit_data, author_stats)
345
+
346
+ # Combine all features into single vector
347
+ idx = 0
348
+
349
+ # Keyword features (20 dimensions)
350
+ features[idx : idx + 20] = keyword_features
351
+ idx += 20
352
+
353
+ # File-based features (20 dimensions)
354
+ features[idx : idx + 20] = file_features
355
+ idx += 20
356
+
357
+ # Commit statistics (15 dimensions)
358
+ features[idx : idx + 15] = stats_features
359
+ idx += 15
360
+
361
+ # Temporal features (8 dimensions)
362
+ features[idx : idx + 8] = temporal_features
363
+ idx += 8
364
+
365
+ # Author features (5 dimensions)
366
+ features[idx : idx + 5] = author_features
367
+
368
+ return features
369
+
370
+ def _extract_keyword_features(self, message: str) -> np.ndarray:
371
+ """Extract keyword-based features from commit message.
372
+
373
+ Args:
374
+ message: Commit message text
375
+
376
+ Returns:
377
+ 20-dimensional array with keyword features
378
+ """
379
+ features = np.zeros(20, dtype=np.float32)
380
+
381
+ if not message:
382
+ return features
383
+
384
+ # Normalize message for consistent analysis
385
+ normalized_message = message.lower().strip()
386
+ message_length = len(normalized_message.split())
387
+
388
+ # Extract features for each keyword category
389
+ for i, (_category, pattern) in enumerate(self.compiled_keyword_patterns.items()):
390
+ matches = pattern.findall(normalized_message)
391
+ match_count = len(matches)
392
+
393
+ # Normalize by message length to handle varying message sizes
394
+ if message_length > 0:
395
+ features[i] = min(1.0, match_count / message_length)
396
+ else:
397
+ features[i] = 0.0
398
+
399
+ return features
400
+
401
+ def _extract_file_features(self, file_paths: list[str]) -> np.ndarray:
402
+ """Extract file-based features using linguist analysis.
403
+
404
+ Args:
405
+ file_paths: List of changed file paths
406
+
407
+ Returns:
408
+ 20-dimensional array with file-based features
409
+ """
410
+ features = np.zeros(20, dtype=np.float32)
411
+
412
+ if not file_paths:
413
+ return features
414
+
415
+ # Get linguist analysis
416
+ analysis = self.linguist.analyze_commit_files(file_paths)
417
+
418
+ # Feature 0-4: Language distribution (top 5 languages)
419
+ top_languages = analysis["languages"].most_common(5)
420
+ for i, (_lang, count) in enumerate(top_languages):
421
+ features[i] = count / analysis["file_count"]
422
+
423
+ # Feature 5-9: Activity distribution (top 5 activities)
424
+ top_activities = analysis["activities"].most_common(5)
425
+ for i, (_activity, count) in enumerate(top_activities):
426
+ features[5 + i] = count / len(file_paths) # Activities can overlap
427
+
428
+ # Feature 10: Language diversity (normalized)
429
+ features[10] = min(1.0, analysis["language_diversity"] / 5.0)
430
+
431
+ # Feature 11: Activity diversity (normalized)
432
+ features[11] = min(1.0, analysis["activity_diversity"] / 5.0)
433
+
434
+ # Feature 12: Generated file ratio
435
+ features[12] = analysis["generated_ratio"]
436
+
437
+ # Feature 13: Is multilingual
438
+ features[13] = 1.0 if analysis["is_multilingual"] else 0.0
439
+
440
+ # Feature 14: Is cross-functional
441
+ features[14] = 1.0 if analysis["is_cross_functional"] else 0.0
442
+
443
+ # Feature 15-19: File type patterns
444
+ common_extensions = [".py", ".js", ".java", ".go", ".sql"]
445
+ for i, ext in enumerate(common_extensions):
446
+ if ext in analysis["file_types"]:
447
+ features[15 + i] = analysis["file_types"][ext] / analysis["file_count"]
448
+
449
+ return features
450
+
451
+ def _extract_stats_features(self, commit_data: dict[str, Any]) -> np.ndarray:
452
+ """Extract statistical features from commit data.
453
+
454
+ Args:
455
+ commit_data: Commit data dictionary
456
+
457
+ Returns:
458
+ 15-dimensional array with statistical features
459
+ """
460
+ features = np.zeros(15, dtype=np.float32)
461
+
462
+ files_changed = len(commit_data.get("files_changed", []))
463
+ insertions = commit_data.get("insertions", 0)
464
+ deletions = commit_data.get("deletions", 0)
465
+ message = commit_data.get("message", "")
466
+
467
+ # Feature 0: Number of files changed (log-scaled)
468
+ features[0] = min(1.0, np.log1p(files_changed) / np.log1p(100))
469
+
470
+ # Feature 1: Lines inserted (log-scaled)
471
+ features[1] = min(1.0, np.log1p(insertions) / np.log1p(1000))
472
+
473
+ # Feature 2: Lines deleted (log-scaled)
474
+ features[2] = min(1.0, np.log1p(deletions) / np.log1p(1000))
475
+
476
+ # Feature 3: Total lines changed (log-scaled)
477
+ total_lines = insertions + deletions
478
+ features[3] = min(1.0, np.log1p(total_lines) / np.log1p(2000))
479
+
480
+ # Feature 4: Insert/delete ratio
481
+ if total_lines > 0:
482
+ features[4] = insertions / total_lines
483
+
484
+ # Feature 5: Commit message length (normalized)
485
+ features[5] = min(1.0, len(message) / 200.0)
486
+
487
+ # Feature 6: Message word count (normalized)
488
+ word_count = len(message.split())
489
+ features[6] = min(1.0, word_count / 50.0)
490
+
491
+ # Feature 7: Message lines count (normalized)
492
+ line_count = len(message.split("\n"))
493
+ features[7] = min(1.0, line_count / 10.0)
494
+
495
+ # Feature 8: Average lines per file
496
+ if files_changed > 0:
497
+ features[8] = min(1.0, total_lines / files_changed / 100.0)
498
+
499
+ # Feature 9: Has conventional commit format
500
+ conventional_pattern = (
501
+ r"^(feat|fix|docs|style|refactor|test|chore|perf|ci|build|revert)(\\(.+\\))?: .+"
502
+ )
503
+ features[9] = 1.0 if re.match(conventional_pattern, message.strip()) else 0.0
504
+
505
+ # Feature 10: Contains ticket reference
506
+ ticket_pattern = r"(#\\d+|[A-Z]+-\\d+|JIRA-\\d+|CU-\\d+)"
507
+ features[10] = 1.0 if re.search(ticket_pattern, message) else 0.0
508
+
509
+ # Feature 11: Is merge commit
510
+ features[11] = 1.0 if message.lower().startswith("merge") else 0.0
511
+
512
+ # Feature 12: Contains code in message (backticks or brackets)
513
+ code_pattern = r"(`[^`]+`|\[[^\]]+\]|\{[^}]+\})"
514
+ features[12] = 1.0 if re.search(code_pattern, message) else 0.0
515
+
516
+ # Feature 13: Message complexity (punctuation diversity)
517
+ punctuation = set(char for char in message if not char.isalnum() and not char.isspace())
518
+ features[13] = min(1.0, len(punctuation) / 10.0)
519
+
520
+ # Feature 14: Large commit indicator
521
+ is_large = files_changed > 10 or total_lines > 500
522
+ features[14] = 1.0 if is_large else 0.0
523
+
524
+ return features
525
+
526
+ def _extract_temporal_features(self, timestamp: datetime) -> np.ndarray:
527
+ """Extract temporal features from commit timestamp.
528
+
529
+ Args:
530
+ timestamp: Commit timestamp
531
+
532
+ Returns:
533
+ 8-dimensional array with temporal features
534
+ """
535
+ features = np.zeros(8, dtype=np.float32)
536
+
537
+ if not timestamp:
538
+ return features
539
+
540
+ # Ensure timezone awareness
541
+ if timestamp.tzinfo is None:
542
+ timestamp = timestamp.replace(tzinfo=timezone.utc)
543
+
544
+ # Feature 0: Hour of day (normalized)
545
+ features[0] = timestamp.hour / 24.0
546
+
547
+ # Feature 1: Day of week (0=Monday, 6=Sunday, normalized)
548
+ features[1] = timestamp.weekday() / 6.0
549
+
550
+ # Feature 2: Day of month (normalized)
551
+ features[2] = (timestamp.day - 1) / 30.0 # 0-based, max ~30 days
552
+
553
+ # Feature 3: Month of year (normalized)
554
+ features[3] = (timestamp.month - 1) / 11.0 # 0-based, 12 months
555
+
556
+ # Feature 4: Is weekend
557
+ features[4] = 1.0 if timestamp.weekday() >= 5 else 0.0
558
+
559
+ # Feature 5: Is business hours (9 AM - 5 PM)
560
+ features[5] = 1.0 if 9 <= timestamp.hour < 17 else 0.0
561
+
562
+ # Feature 6: Is late night (10 PM - 6 AM)
563
+ features[6] = 1.0 if timestamp.hour >= 22 or timestamp.hour < 6 else 0.0
564
+
565
+ # Feature 7: Quarter of year (normalized)
566
+ quarter = (timestamp.month - 1) // 3
567
+ features[7] = quarter / 3.0
568
+
569
+ return features
570
+
571
+ def _extract_author_features(
572
+ self, commit_data: dict[str, Any], author_stats: Optional[dict[str, Any]] = None
573
+ ) -> np.ndarray:
574
+ """Extract author-based features.
575
+
576
+ Args:
577
+ commit_data: Commit data dictionary
578
+ author_stats: Optional author statistics
579
+
580
+ Returns:
581
+ 5-dimensional array with author features
582
+ """
583
+ features = np.zeros(5, dtype=np.float32)
584
+
585
+ author_name = commit_data.get("author_name", "")
586
+ author_email = commit_data.get("author_email", "")
587
+
588
+ # Feature 0: Author name length (normalized)
589
+ features[0] = min(1.0, len(author_name) / 50.0)
590
+
591
+ # Feature 1: Has corporate email
592
+ corporate_domains = [".com", ".org", ".net", ".io", ".co"]
593
+ has_corporate = any(domain in author_email.lower() for domain in corporate_domains)
594
+ is_github_noreply = "noreply.github.com" in author_email.lower()
595
+ features[1] = 1.0 if has_corporate and not is_github_noreply else 0.0
596
+
597
+ # Feature 2: Is likely automated (bot/CI)
598
+ automated_indicators = ["bot", "ci", "github-actions", "dependabot", "renovate"]
599
+ is_automated = any(
600
+ indicator in author_name.lower() or indicator in author_email.lower()
601
+ for indicator in automated_indicators
602
+ )
603
+ features[2] = 1.0 if is_automated else 0.0
604
+
605
+ # Features 3-4: Author statistics (if available)
606
+ if author_stats:
607
+ # Feature 3: Author experience (normalized commit count)
608
+ total_commits = author_stats.get("total_commits", 1)
609
+ features[3] = min(1.0, np.log1p(total_commits) / np.log1p(1000))
610
+
611
+ # Feature 4: Typical commit size compared to this commit
612
+ avg_size = author_stats.get("avg_commit_size", 0)
613
+ current_size = commit_data.get("insertions", 0) + commit_data.get("deletions", 0)
614
+ if avg_size > 0:
615
+ features[4] = min(2.0, current_size / avg_size) # Ratio, capped at 2x
616
+
617
+ return features
618
+
619
+ def get_feature_names(self) -> list[str]:
620
+ """Get human-readable names for all 68 features.
621
+
622
+ Returns:
623
+ List of feature names corresponding to the feature vector indices
624
+ """
625
+ names = []
626
+
627
+ # Keyword features (20)
628
+ for category in self.keyword_categories:
629
+ names.append(f"keyword_{category}")
630
+
631
+ # File features (20)
632
+ file_feature_names = [
633
+ "lang_1st",
634
+ "lang_2nd",
635
+ "lang_3rd",
636
+ "lang_4th",
637
+ "lang_5th",
638
+ "activity_1st",
639
+ "activity_2nd",
640
+ "activity_3rd",
641
+ "activity_4th",
642
+ "activity_5th",
643
+ "lang_diversity",
644
+ "activity_diversity",
645
+ "generated_ratio",
646
+ "is_multilingual",
647
+ "is_cross_functional",
648
+ "ext_py",
649
+ "ext_js",
650
+ "ext_java",
651
+ "ext_go",
652
+ "ext_sql",
653
+ ]
654
+ names.extend(file_feature_names)
655
+
656
+ # Statistics features (15)
657
+ stats_feature_names = [
658
+ "files_changed",
659
+ "insertions",
660
+ "deletions",
661
+ "total_lines",
662
+ "insert_delete_ratio",
663
+ "message_length",
664
+ "word_count",
665
+ "line_count",
666
+ "avg_lines_per_file",
667
+ "has_conventional_format",
668
+ "has_ticket_ref",
669
+ "is_merge",
670
+ "has_code_in_msg",
671
+ "message_complexity",
672
+ "is_large_commit",
673
+ ]
674
+ names.extend(stats_feature_names)
675
+
676
+ # Temporal features (8)
677
+ temporal_feature_names = [
678
+ "hour_of_day",
679
+ "day_of_week",
680
+ "day_of_month",
681
+ "month_of_year",
682
+ "is_weekend",
683
+ "is_business_hours",
684
+ "is_late_night",
685
+ "quarter",
686
+ ]
687
+ names.extend(temporal_feature_names)
688
+
689
+ # Author features (5)
690
+ author_feature_names = [
691
+ "author_name_length",
692
+ "has_corporate_email",
693
+ "is_automated",
694
+ "author_experience",
695
+ "commit_size_vs_typical",
696
+ ]
697
+ names.extend(author_feature_names)
698
+
699
+ return names
700
+
701
+ def extract_batch_features(
702
+ self,
703
+ commit_batch: list[dict[str, Any]],
704
+ author_stats_batch: Optional[list[dict[str, Any]]] = None,
705
+ ) -> np.ndarray:
706
+ """Extract features for a batch of commits efficiently.
707
+
708
+ Args:
709
+ commit_batch: List of commit data dictionaries
710
+ author_stats_batch: Optional list of author statistics
711
+
712
+ Returns:
713
+ 2D numpy array of shape (n_commits, 68) with feature vectors
714
+ """
715
+ n_commits = len(commit_batch)
716
+ features = np.zeros((n_commits, 68), dtype=np.float32)
717
+
718
+ for i, commit_data in enumerate(commit_batch):
719
+ author_stats = None
720
+ if author_stats_batch and i < len(author_stats_batch):
721
+ author_stats = author_stats_batch[i]
722
+
723
+ features[i] = self.extract_features(commit_data, author_stats)
724
+
725
+ return features