gitflow-analytics 1.0.1__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. gitflow_analytics/__init__.py +11 -11
  2. gitflow_analytics/_version.py +2 -2
  3. gitflow_analytics/classification/__init__.py +31 -0
  4. gitflow_analytics/classification/batch_classifier.py +752 -0
  5. gitflow_analytics/classification/classifier.py +464 -0
  6. gitflow_analytics/classification/feature_extractor.py +725 -0
  7. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  8. gitflow_analytics/classification/model.py +455 -0
  9. gitflow_analytics/cli.py +4490 -378
  10. gitflow_analytics/cli_rich.py +503 -0
  11. gitflow_analytics/config/__init__.py +43 -0
  12. gitflow_analytics/config/errors.py +261 -0
  13. gitflow_analytics/config/loader.py +904 -0
  14. gitflow_analytics/config/profiles.py +264 -0
  15. gitflow_analytics/config/repository.py +124 -0
  16. gitflow_analytics/config/schema.py +441 -0
  17. gitflow_analytics/config/validator.py +154 -0
  18. gitflow_analytics/config.py +44 -398
  19. gitflow_analytics/core/analyzer.py +1320 -172
  20. gitflow_analytics/core/branch_mapper.py +132 -132
  21. gitflow_analytics/core/cache.py +1554 -175
  22. gitflow_analytics/core/data_fetcher.py +1193 -0
  23. gitflow_analytics/core/identity.py +571 -185
  24. gitflow_analytics/core/metrics_storage.py +526 -0
  25. gitflow_analytics/core/progress.py +372 -0
  26. gitflow_analytics/core/schema_version.py +269 -0
  27. gitflow_analytics/extractors/base.py +13 -11
  28. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  29. gitflow_analytics/extractors/story_points.py +77 -59
  30. gitflow_analytics/extractors/tickets.py +841 -89
  31. gitflow_analytics/identity_llm/__init__.py +6 -0
  32. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  33. gitflow_analytics/identity_llm/analyzer.py +464 -0
  34. gitflow_analytics/identity_llm/models.py +76 -0
  35. gitflow_analytics/integrations/github_integration.py +258 -87
  36. gitflow_analytics/integrations/jira_integration.py +572 -123
  37. gitflow_analytics/integrations/orchestrator.py +206 -82
  38. gitflow_analytics/metrics/activity_scoring.py +322 -0
  39. gitflow_analytics/metrics/branch_health.py +470 -0
  40. gitflow_analytics/metrics/dora.py +542 -179
  41. gitflow_analytics/models/database.py +986 -59
  42. gitflow_analytics/pm_framework/__init__.py +115 -0
  43. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  44. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  45. gitflow_analytics/pm_framework/base.py +406 -0
  46. gitflow_analytics/pm_framework/models.py +211 -0
  47. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  48. gitflow_analytics/pm_framework/registry.py +333 -0
  49. gitflow_analytics/qualitative/__init__.py +29 -0
  50. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  51. gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
  52. gitflow_analytics/qualitative/classifiers/change_type.py +742 -0
  53. gitflow_analytics/qualitative/classifiers/domain_classifier.py +506 -0
  54. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +535 -0
  55. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  56. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  57. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  58. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  59. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  60. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  61. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  62. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  63. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  64. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +438 -0
  65. gitflow_analytics/qualitative/core/__init__.py +13 -0
  66. gitflow_analytics/qualitative/core/llm_fallback.py +657 -0
  67. gitflow_analytics/qualitative/core/nlp_engine.py +382 -0
  68. gitflow_analytics/qualitative/core/pattern_cache.py +479 -0
  69. gitflow_analytics/qualitative/core/processor.py +673 -0
  70. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  71. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  72. gitflow_analytics/qualitative/models/__init__.py +25 -0
  73. gitflow_analytics/qualitative/models/schemas.py +306 -0
  74. gitflow_analytics/qualitative/utils/__init__.py +13 -0
  75. gitflow_analytics/qualitative/utils/batch_processor.py +339 -0
  76. gitflow_analytics/qualitative/utils/cost_tracker.py +345 -0
  77. gitflow_analytics/qualitative/utils/metrics.py +361 -0
  78. gitflow_analytics/qualitative/utils/text_processing.py +285 -0
  79. gitflow_analytics/reports/__init__.py +100 -0
  80. gitflow_analytics/reports/analytics_writer.py +550 -18
  81. gitflow_analytics/reports/base.py +648 -0
  82. gitflow_analytics/reports/branch_health_writer.py +322 -0
  83. gitflow_analytics/reports/classification_writer.py +924 -0
  84. gitflow_analytics/reports/cli_integration.py +427 -0
  85. gitflow_analytics/reports/csv_writer.py +1700 -216
  86. gitflow_analytics/reports/data_models.py +504 -0
  87. gitflow_analytics/reports/database_report_generator.py +427 -0
  88. gitflow_analytics/reports/example_usage.py +344 -0
  89. gitflow_analytics/reports/factory.py +499 -0
  90. gitflow_analytics/reports/formatters.py +698 -0
  91. gitflow_analytics/reports/html_generator.py +1116 -0
  92. gitflow_analytics/reports/interfaces.py +489 -0
  93. gitflow_analytics/reports/json_exporter.py +2770 -0
  94. gitflow_analytics/reports/narrative_writer.py +2289 -158
  95. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  96. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  97. gitflow_analytics/training/__init__.py +5 -0
  98. gitflow_analytics/training/model_loader.py +377 -0
  99. gitflow_analytics/training/pipeline.py +550 -0
  100. gitflow_analytics/tui/__init__.py +5 -0
  101. gitflow_analytics/tui/app.py +724 -0
  102. gitflow_analytics/tui/screens/__init__.py +8 -0
  103. gitflow_analytics/tui/screens/analysis_progress_screen.py +496 -0
  104. gitflow_analytics/tui/screens/configuration_screen.py +523 -0
  105. gitflow_analytics/tui/screens/loading_screen.py +348 -0
  106. gitflow_analytics/tui/screens/main_screen.py +321 -0
  107. gitflow_analytics/tui/screens/results_screen.py +722 -0
  108. gitflow_analytics/tui/widgets/__init__.py +7 -0
  109. gitflow_analytics/tui/widgets/data_table.py +255 -0
  110. gitflow_analytics/tui/widgets/export_modal.py +301 -0
  111. gitflow_analytics/tui/widgets/progress_widget.py +187 -0
  112. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  113. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  114. gitflow_analytics-1.0.1.dist-info/METADATA +0 -463
  115. gitflow_analytics-1.0.1.dist-info/RECORD +0 -31
  116. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  117. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  118. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  119. {gitflow_analytics-1.0.1.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
"""Change type classifier using semantic analysis of commit messages."""

import importlib.util
import logging
import re
from typing import Any, Optional

from ..models.schemas import ChangeTypeConfig

# Check if spacy is available without importing it.  find_spec only probes
# the import machinery, so startup stays cheap and spacy remains an
# optional dependency.
SPACY_AVAILABLE = importlib.util.find_spec("spacy") is not None

if SPACY_AVAILABLE:
    from spacy.tokens import Doc
else:
    # Fallback alias so the `doc: Doc` annotations below still resolve
    # when spacy is not installed.
    Doc = Any
19
class ChangeTypeClassifier:
    """Classify commits by change type using semantic analysis.

    This classifier determines the type of change represented by a commit
    (feature, bugfix, refactor, etc.) by analyzing the commit message semantics
    and file patterns.

    The classification uses a combination of:
    - Semantic keyword matching with action/object/context patterns
    - File pattern analysis for additional signals
    - Rule-based patterns for common commit message formats
    """

    def __init__(self, config: ChangeTypeConfig):
        """Initialize change type classifier.

        Args:
            config: Configuration for change type classification (provides
                ``min_confidence``, ``semantic_weight`` and
                ``file_pattern_weight`` used during classification).
        """
        self.config = config
        self.logger = logging.getLogger(__name__)

        # Semantic vocabulary per change type.  Each type carries three tiers
        # that _calculate_semantic_similarity weighs differently:
        #   action_words  - verbs describing what the commit does (weight 0.5)
        #   object_words  - nouns naming what is acted upon (weight 0.3)
        #   context_words - qualifiers that co-occur with the type (weight 0.2)
        # NOTE(review): several entries ("beacon", "sticky", "posthog",
        # "iubenda", "mena", "spanish", ...) look tuned to one project's
        # commit history rather than being generic - confirm before reuse.
        self.change_patterns: dict[str, dict[str, set[str]]] = {
            "feature": {
                "action_words": {
                    "add", "implement", "create", "build", "introduce",
                    "develop", "enable", "support", "allow", "provide",
                    "include", "addition", "initialize", "prepare", "extend",
                },
                "object_words": {
                    "feature", "functionality", "capability", "component",
                    "module", "endpoint", "api", "service", "interface",
                    "system", "integration", "column", "field", "property",
                },
                "context_words": {
                    "new", "initial", "first", "user", "client", "support",
                    "enhancement", "improvement", "missing", "space", "sticky",
                },
            },
            "bugfix": {
                "action_words": {
                    "fix", "resolve", "correct", "repair", "patch", "address",
                    "handle", "solve", "debug", "prevent", "corrected",
                },
                "object_words": {
                    "bug", "issue", "problem", "error", "defect", "exception",
                    "crash", "failure", "leak", "regression", "beacon",
                    "beacons",
                },
                "context_words": {
                    "broken", "failing", "incorrect", "wrong", "invalid",
                    "missing", "null", "undefined", "not", "allowing",
                },
            },
            "refactor": {
                "action_words": {
                    "refactor", "restructure", "reorganize", "cleanup",
                    "simplify", "optimize", "improve", "enhance", "streamline",
                    "consolidate", "refine", "ensure", "replace", "improves",
                },
                "object_words": {
                    "code", "structure", "architecture", "design", "logic",
                    "method", "function", "class", "module", "combo",
                    "behavior", "focus",
                },
                "context_words": {
                    "better", "cleaner", "simpler", "efficient",
                    "maintainable", "readable", "performance", "box",
                    "hacking",
                },
            },
            "docs": {
                "action_words": {
                    "update", "add", "improve", "write", "document", "clarify",
                    "explain", "describe", "detail", "added",
                },
                "object_words": {
                    "documentation", "readme", "docs", "comment", "docstring",
                    "guide", "tutorial", "example", "specification",
                    "translations", "spanish", "label",
                },
                "context_words": {
                    "explain", "clarify", "describe", "instruction", "help",
                    "change", "dynamically", "language",
                },
            },
            "test": {
                "action_words": {
                    "add", "update", "fix", "improve", "write", "create",
                    "enhance", "extend",
                },
                "object_words": {
                    "test", "spec", "coverage", "unit", "integration", "e2e",
                    "testing", "mock", "stub", "fixture",
                },
                "context_words": {
                    "testing", "verify", "validate", "check", "ensure",
                    "coverage", "assertion",
                },
            },
            "chore": {
                "action_words": {
                    "update", "bump", "upgrade", "configure", "setup",
                    "install", "remove", "delete", "clean", "sync", "merge",
                },
                "object_words": {
                    "dependency", "package", "config", "configuration",
                    "build", "version", "tool", "script", "workflow",
                    "console", "log", "main",
                },
                "context_words": {
                    "maintenance", "housekeeping", "routine", "automated",
                    "ci", "cd", "pipeline", "auto", "removal",
                },
            },
            "security": {
                "action_words": {
                    "fix", "secure", "protect", "validate", "sanitize",
                    "encrypt", "authenticate", "authorize",
                },
                "object_words": {
                    "security", "vulnerability", "exploit", "xss", "csrf",
                    "injection", "authentication", "authorization",
                    "permission",
                },
                "context_words": {
                    "secure", "safe", "protected", "validated", "sanitized",
                    "encrypted", "threat", "attack",
                },
            },
            "hotfix": {
                "action_words": {"hotfix", "fix", "patch", "urgent", "critical", "emergency"},
                "object_words": {
                    "production", "critical", "urgent", "emergency", "hotfix",
                    "issue", "bug", "problem",
                },
                "context_words": {
                    "urgent", "critical", "immediate", "production", "live",
                    "emergency", "asap",
                },
            },
            "config": {
                "action_words": {
                    "configure", "setup", "adjust", "modify", "change",
                    "update", "tweak", "changing",
                },
                "object_words": {
                    "config", "configuration", "settings", "environment",
                    "parameter", "option", "flag", "variable", "roles",
                    "user", "schema", "access", "levels",
                },
                "context_words": {
                    "environment", "production", "development", "staging",
                    "deployment", "setup", "roles", "permission", "api",
                },
            },
            "integration": {
                "action_words": {
                    "integrate", "add", "implement", "connect", "setup",
                    "remove", "extend", "removing",
                },
                "object_words": {
                    "integration", "posthog", "iubenda", "auth0", "oauth",
                    "api", "service", "third-party", "external", "mena",
                },
                "context_words": {
                    "collection", "data", "privacy", "policy",
                    "implementation", "access", "redirect",
                },
            },
        }

        # File-path regexes providing an independent signal per change type
        # (e.g. a commit touching only *.md files is probably "docs").
        # Matched case-insensitively after compilation below.
        self.file_patterns: dict[str, list[str]] = {
            "test": [
                r".*test.*\.py$",
                r".*spec.*\.js$",
                r".*test.*\.java$",
                r"test_.*\.py$",
                r".*_test\.go$",
                r".*\.test\.(js|ts)$",
                r"__tests__/.*",
                r"tests?/.*",
                r"spec/.*",
            ],
            "docs": [
                r".*\.md$",
                r".*\.rst$",
                r".*\.txt$",
                r"README.*",
                r"CHANGELOG.*",
                r"docs?/.*",
                r"documentation/.*",
            ],
            "config": [
                r".*\.ya?ml$",
                r".*\.json$",
                r".*\.toml$",
                r".*\.ini$",
                r".*\.env.*",
                r"Dockerfile.*",
                r".*config.*",
                r"\.github/.*",
            ],
            "chore": [
                r"package.*\.json$",
                r"requirements.*\.txt$",
                r"Pipfile.*",
                r"pom\.xml$",
                r"build\.gradle$",
                r".*\.lock$",
            ],
        }

        # Compile regex patterns once up front for efficiency.
        self._compile_file_patterns()

        # Conventional-commit prefix -> change type.  Checked first by
        # classify() because an explicit prefix is the strongest signal.
        self.prefix_patterns: dict[str, str] = {
            "feat": "feature",
            "feature": "feature",
            "fix": "bugfix",
            "bugfix": "bugfix",
            "refactor": "refactor",
            "docs": "docs",
            "test": "test",
            "chore": "chore",
            "security": "security",
            "hotfix": "hotfix",
            "config": "config",
            "integration": "integration",
            "integrate": "integration",
            "style": "chore",  # Style changes are usually chores
            "perf": "refactor",  # Performance improvements are refactoring
            "build": "chore",
            "ci": "chore",
        }
480
+ def _compile_file_patterns(self) -> None:
481
+ """Compile regex patterns for file matching."""
482
+ self.compiled_file_patterns = {}
483
+ for change_type, patterns in self.file_patterns.items():
484
+ self.compiled_file_patterns[change_type] = [
485
+ re.compile(pattern, re.IGNORECASE) for pattern in patterns
486
+ ]
487
+
488
+ def classify(self, message: str, doc: Doc, files: list[str]) -> tuple[str, float]:
489
+ """Classify commit change type with confidence score.
490
+
491
+ Args:
492
+ message: Commit message
493
+ doc: spaCy processed document
494
+ files: List of changed files
495
+
496
+ Returns:
497
+ Tuple of (change_type, confidence_score)
498
+ """
499
+ if not message:
500
+ return "unknown", 0.0
501
+
502
+ # Step 1: Check for conventional commit prefixes
503
+ prefix_result = self._check_conventional_prefix(message)
504
+ if prefix_result:
505
+ change_type, confidence = prefix_result
506
+ if confidence >= self.config.min_confidence:
507
+ return change_type, confidence
508
+
509
+ # Step 2: Semantic analysis of message content
510
+ semantic_scores = self._analyze_semantic_content(message, doc)
511
+
512
+ # Step 3: File pattern analysis
513
+ file_scores = self._analyze_file_patterns(files)
514
+
515
+ # Step 4: Combine scores with weights
516
+ combined_scores = self._combine_scores(semantic_scores, file_scores)
517
+
518
+ # Step 5: Select best match
519
+ if not combined_scores:
520
+ return "unknown", 0.0
521
+
522
+ best_type = max(combined_scores.keys(), key=lambda k: combined_scores[k])
523
+ confidence = combined_scores[best_type]
524
+
525
+ # Apply confidence threshold
526
+ if confidence < self.config.min_confidence:
527
+ return "unknown", confidence
528
+
529
+ return best_type, confidence
530
+
531
+ def _check_conventional_prefix(self, message: str) -> Optional[tuple[str, float]]:
532
+ """Check for conventional commit message prefixes.
533
+
534
+ Args:
535
+ message: Commit message
536
+
537
+ Returns:
538
+ Tuple of (change_type, confidence) if found, None otherwise
539
+ """
540
+ # Look for conventional commit format: type(scope): description
541
+ conventional_pattern = r"^(\w+)(?:\([^)]*\))?\s*:\s*(.+)"
542
+ match = re.match(conventional_pattern, message.strip(), re.IGNORECASE)
543
+
544
+ if match:
545
+ prefix = match.group(1).lower()
546
+ if prefix in self.prefix_patterns:
547
+ return self.prefix_patterns[prefix], 0.9 # High confidence for explicit prefixes
548
+
549
+ # Check for simple prefixes at start of message
550
+ words = message.lower().split()
551
+ if words:
552
+ first_word = words[0].rstrip(":").rstrip("-")
553
+ if first_word in self.prefix_patterns:
554
+ return self.prefix_patterns[first_word], 0.8
555
+
556
+ return None
557
+
558
+ def _analyze_semantic_content(self, message: str, doc: Doc) -> dict[str, float]:
559
+ """Analyze semantic content of commit message.
560
+
561
+ Args:
562
+ message: Commit message
563
+ doc: spaCy processed document
564
+
565
+ Returns:
566
+ Dictionary of change_type -> confidence_score
567
+ """
568
+ if not SPACY_AVAILABLE or not doc:
569
+ # Fallback to simple keyword matching
570
+ return self._simple_keyword_analysis(message.lower())
571
+
572
+ # Extract semantic features from spaCy doc
573
+ features = self._extract_semantic_features(doc)
574
+
575
+ # Calculate similarity to each change type
576
+ scores = {}
577
+ for change_type, patterns in self.change_patterns.items():
578
+ similarity = self._calculate_semantic_similarity(features, patterns)
579
+ if similarity > 0:
580
+ scores[change_type] = similarity
581
+
582
+ return scores
583
+
584
+ def _extract_semantic_features(self, doc: Doc) -> dict[str, set[str]]:
585
+ """Extract semantic features from spaCy document.
586
+
587
+ Args:
588
+ doc: spaCy processed document
589
+
590
+ Returns:
591
+ Dictionary of feature_type -> set_of_words
592
+ """
593
+ features = {
594
+ "verbs": set(),
595
+ "nouns": set(),
596
+ "adjectives": set(),
597
+ "entities": set(),
598
+ "lemmas": set(),
599
+ }
600
+
601
+ for token in doc:
602
+ if token.is_stop or token.is_punct or len(token.text) < 2:
603
+ continue
604
+
605
+ lemma = token.lemma_.lower()
606
+ features["lemmas"].add(lemma)
607
+
608
+ if token.pos_ == "VERB":
609
+ features["verbs"].add(lemma)
610
+ elif token.pos_ in ["NOUN", "PROPN"]:
611
+ features["nouns"].add(lemma)
612
+ elif token.pos_ == "ADJ":
613
+ features["adjectives"].add(lemma)
614
+
615
+ # Add named entities
616
+ for ent in doc.ents:
617
+ features["entities"].add(ent.text.lower())
618
+
619
+ return features
620
+
621
+ def _calculate_semantic_similarity(
622
+ self, features: dict[str, set[str]], patterns: dict[str, set[str]]
623
+ ) -> float:
624
+ """Calculate semantic similarity between features and patterns.
625
+
626
+ Args:
627
+ features: Extracted semantic features
628
+ patterns: Change type patterns
629
+
630
+ Returns:
631
+ Similarity score (0.0 to 1.0)
632
+ """
633
+ similarity_score = 0.0
634
+
635
+ # Action words (verbs) - highest weight
636
+ action_matches = len(features["verbs"].intersection(patterns["action_words"]))
637
+ if action_matches > 0:
638
+ similarity_score += action_matches * 0.5
639
+
640
+ # Object words (nouns) - medium weight
641
+ object_matches = len(features["nouns"].intersection(patterns["object_words"]))
642
+ if object_matches > 0:
643
+ similarity_score += object_matches * 0.3
644
+
645
+ # Context words (any lemma) - lower weight
646
+ all_lemmas = features["lemmas"]
647
+ context_matches = len(all_lemmas.intersection(patterns["context_words"]))
648
+ if context_matches > 0:
649
+ similarity_score += context_matches * 0.2
650
+
651
+ # Normalize by maximum possible score
652
+ max_possible = (
653
+ len(patterns["action_words"]) * 0.5
654
+ + len(patterns["object_words"]) * 0.3
655
+ + len(patterns["context_words"]) * 0.2
656
+ )
657
+
658
+ return min(1.0, similarity_score / max_possible) if max_possible > 0 else 0.0
659
+
660
+ def _simple_keyword_analysis(self, message: str) -> dict[str, float]:
661
+ """Simple keyword-based analysis fallback.
662
+
663
+ Args:
664
+ message: Lowercase commit message
665
+
666
+ Returns:
667
+ Dictionary of change_type -> confidence_score
668
+ """
669
+ scores = {}
670
+ words = set(re.findall(r"\b\w+\b", message))
671
+
672
+ for change_type, patterns in self.change_patterns.items():
673
+ all_pattern_words = (
674
+ patterns["action_words"] | patterns["object_words"] | patterns["context_words"]
675
+ )
676
+ matches = len(words.intersection(all_pattern_words))
677
+
678
+ if matches > 0:
679
+ # Simple scoring based on keyword matches
680
+ scores[change_type] = min(1.0, matches / 5.0) # Scale to 0-1
681
+
682
+ return scores
683
+
684
+ def _analyze_file_patterns(self, files: list[str]) -> dict[str, float]:
685
+ """Analyze file patterns for change type signals.
686
+
687
+ Args:
688
+ files: List of changed file paths
689
+
690
+ Returns:
691
+ Dictionary of change_type -> confidence_score
692
+ """
693
+ if not files:
694
+ return {}
695
+
696
+ scores = {}
697
+
698
+ for change_type, patterns in self.compiled_file_patterns.items():
699
+ matching_files = 0
700
+
701
+ for file_path in files:
702
+ for pattern in patterns:
703
+ if pattern.search(file_path):
704
+ matching_files += 1
705
+ break # Don't double-count same file
706
+
707
+ if matching_files > 0:
708
+ # File pattern confidence based on proportion of matching files
709
+ confidence = min(1.0, matching_files / len(files))
710
+ scores[change_type] = confidence
711
+
712
+ return scores
713
+
714
+ def _combine_scores(
715
+ self, semantic_scores: dict[str, float], file_scores: dict[str, float]
716
+ ) -> dict[str, float]:
717
+ """Combine semantic and file pattern scores.
718
+
719
+ Args:
720
+ semantic_scores: Scores from semantic analysis
721
+ file_scores: Scores from file pattern analysis
722
+
723
+ Returns:
724
+ Combined scores dictionary
725
+ """
726
+ combined = {}
727
+ all_types = set(semantic_scores.keys()) | set(file_scores.keys())
728
+
729
+ for change_type in all_types:
730
+ semantic_score = semantic_scores.get(change_type, 0.0)
731
+ file_score = file_scores.get(change_type, 0.0)
732
+
733
+ # Weighted combination
734
+ combined_score = (
735
+ semantic_score * self.config.semantic_weight
736
+ + file_score * self.config.file_pattern_weight
737
+ )
738
+
739
+ if combined_score > 0:
740
+ combined[change_type] = combined_score
741
+
742
+ return combined