gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4108 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +904 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +441 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1193 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -1,21 +1,113 @@
1
1
  """Ticket reference extraction for multiple platforms."""
2
2
 
3
+ import logging
3
4
  import re
4
5
  from collections import defaultdict
6
+ from datetime import timezone
5
7
  from typing import Any, Optional, cast
6
8
 
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def filter_git_artifacts(message: str) -> str:
    """Strip git-generated noise from a commit message before classification.

    WHY: Trailers such as Co-authored-by / Signed-off-by, separator lines and
    merge boilerplate say nothing about the kind of work a commit contains,
    so they are removed to keep them from influencing categorization.

    Args:
        message: Raw commit message that may contain git artifacts.

    Returns:
        The message with artifact lines removed, every remaining line
        stripped, and leading/trailing blank lines dropped. Blank lines
        between content lines are preserved. Returns "" when nothing
        meaningful is left (empty input, or a message made only of dots
        and whitespace that contains "...").
    """
    if not message or not message.strip():
        return ""

    multiline_ci = re.MULTILINE | re.IGNORECASE
    artifact_patterns = (
        # Trailer lines. A single alternation covers every trailer type; this
        # also covers Copilot co-authorship lines, which are ordinary
        # Co-authored-by trailers.
        (r"^(?:Co-authored-by|Signed-off-by|Reviewed-by|Tested-by):.*$", multiline_ci),
        # Separator lines made only of dashes, a lone star, or hashes.
        (r"^-+$", re.MULTILINE),
        (r"^\*\s*$", re.MULTILINE),
        (r"^#+$", re.MULTILINE),
        # Merge commit boilerplate.
        (r"^\s*Merge\s+(branch|pull request).*$", multiline_ci),
        (r"^\s*(into|from)\s+[a-zA-Z0-9/_-]+$", multiline_ci),
    )
    for pattern, flags in artifact_patterns:
        message = re.sub(pattern, "", message, flags=flags)

    # Strip each line, then trim the blank lines at both ends. Every non-empty
    # line already starts and ends with a non-whitespace character, so the
    # outer strip() can only remove leading/trailing newlines; interior blank
    # lines survive untouched.
    cleaned = "\n".join(line.strip() for line in message.split("\n")).strip()
    if not cleaned:
        return ""

    # Placeholder messages such as "..." carry no information.
    if "..." in cleaned and re.fullmatch(r"[.\s]+", cleaned):
        return ""

    return cleaned
86
+
7
87
 
8
88
  class TicketExtractor:
9
- """Extract ticket references from various issue tracking systems."""
89
+ """Extract ticket references from various issue tracking systems.
90
+
91
+ Enhanced to support detailed untracked commit analysis including:
92
+ - Commit categorization (maintenance, bug fix, refactor, docs, etc.)
93
+ - Configurable file change thresholds
94
+ - Extended untracked commit metadata collection
95
+ """
10
96
 
11
- def __init__(self, allowed_platforms: Optional[list[str]] = None) -> None:
97
+ def __init__(
98
+ self, allowed_platforms: Optional[list[str]] = None, untracked_file_threshold: int = 1
99
+ ) -> None:
12
100
  """Initialize with patterns for different platforms.
13
101
 
14
102
  Args:
15
103
  allowed_platforms: List of platforms to extract tickets from.
16
104
  If None, all platforms are allowed.
105
+ untracked_file_threshold: Minimum number of files changed to consider
106
+ a commit as 'significant' for untracked analysis.
107
+ Default is 1 (all commits), previously was 3.
17
108
  """
18
109
  self.allowed_platforms = allowed_platforms
110
+ self.untracked_file_threshold = untracked_file_threshold
19
111
  self.patterns = {
20
112
  "jira": [
21
113
  r"([A-Z]{2,10}-\d+)", # Standard JIRA format: PROJ-123
@@ -46,6 +138,205 @@ class TicketExtractor:
46
138
  for pattern in patterns
47
139
  ]
48
140
 
141
+ # Commit categorization patterns
142
+ self.category_patterns = {
143
+ "bug_fix": [
144
+ r"^fix:",
145
+ r"\b(fix|bug|error|issue|problem|crash|exception|failure)\b",
146
+ r"\b(resolve|solve|repair|correct|corrected|address)\b",
147
+ r"\b(hotfix|bugfix|patch|quickfix)\b",
148
+ r"\b(broken|failing|failed|fault|defect)\b",
149
+ r"\b(prevent|stop|avoid)\s+(error|bug|issue|crash)\b",
150
+ r"\b(fixes|resolves|solves)\s+(bug|issue|error|problem)\b",
151
+ r"\b(beacon|beacons)\b.*\b(fix|fixes|issue|problem)\b",
152
+ r"\bmissing\s+(space|field|data|property)\b",
153
+ r"\b(counting|allowing|episodes)\s+(was|not|issue)\b",
154
+ r"^fixes\s+\b(beacon|beacons|combo|issue|problem)\b",
155
+ ],
156
+ "feature": [
157
+ r"^(feat|feature):",
158
+ r"\b(add|new|feature|implement|create|build)\b",
159
+ r"\b(introduce|enhance|extend|expand)\b",
160
+ r"\b(functionality|capability|support|enable)\b",
161
+ r"\b(initial|first)\s+(implementation|version)\b",
162
+ r"\b(addition|initialize|prepare)\b",
163
+ r"added?\s+(new|feature|functionality|capability)\b",
164
+ r"added?\s+(column|field|property|thumbnail)\b",
165
+ r"\b(homilists?|homily|homilies)\b",
166
+ r"\b(sticky|column)\s+(feature|functionality)\b",
167
+ r"adds?\s+(data|localization|beacon)\b",
168
+ r"\b(episode|episodes|audio|video)\s+(feature|support|implementation)\b",
169
+ r"\b(beacon)\s+(implementation|for|tracking)\b",
170
+ r"\b(localization)\s+(data|structure)\b",
171
+ ],
172
+ "refactor": [
173
+ r"\b(refactor|restructure|reorganize|cleanup|clean up)\b",
174
+ r"\b(optimize|improve|simplify|streamline)\b",
175
+ r"\b(rename|move|extract|consolidate)\b",
176
+ r"\b(modernize|redesign|rework|rewrite)\b",
177
+ r"\b(code\s+quality|tech\s+debt|legacy)\b",
178
+ r"\b(refine|ensure|replace)\b",
179
+ r"improves?\s+(performance|efficiency|structure)\b",
180
+ r"improves?\s+(combo|box|focus|behavior)\b",
181
+ r"using\s+\w+\s+instead\s+of\s+\w+\b", # "using X instead of Y" pattern
182
+ ],
183
+ "documentation": [
184
+ r"\b(doc|docs|documentation|readme|comment|comments)\b",
185
+ r"\b(javadoc|jsdoc|docstring|sphinx)\b",
186
+ r"\b(manual|guide|tutorial|how-to|howto)\b",
187
+ r"\b(explain|clarify|describe)\b",
188
+ r"\b(changelog|notes|examples)\b",
189
+ ],
190
+ "deployment": [
191
+ r"^deploy:",
192
+ r"\b(deploy|deployment|publish|rollout)\b",
193
+ r"\b(production|prod|staging|live)\b",
194
+ r"\b(go\s+live|launch|ship)\b",
195
+ r"\b(promote|migration|migrate)\b",
196
+ r"\brelease\s+(v\d+\.\d+|\d+\.\d+\.\d+)?\s+(to|on)\s+(production|staging|live)\b",
197
+ ],
198
+ "configuration": [
199
+ r"\b(config|configure|configuration|setup|settings)\b",
200
+ r"\b(env|environment|parameter|option)\b",
201
+ r"\b(property|properties|yaml|json|xml)\b",
202
+ r"\b(database\s+config|db\s+config|connection)\b",
203
+ r"\.env|\.config|\.yaml|\.json",
204
+ r"\b(setup|configure)\s+(new|for)\b",
205
+ r"\b(user|role|permission|access)\s+(change|update|configuration)\b",
206
+ r"\b(api|service|system)\s+(config|configuration|setup)\b",
207
+ r"\b(role|permission|access)\s+(update|change|management)\b",
208
+ r"\b(schema|model)\s+(update|change|addition)\b",
209
+ r"changing\s+(user|role|permission)\s+(roles?|settings?)\b",
210
+ r"\b(schema)\b(?!.*\b(test|spec)\b)", # Schema but not test schemas
211
+ r"\bsanity\s+schema\b",
212
+ r"changing\s+(some)?\s*(user|role)\s+(roles?|permissions?)\b",
213
+ ],
214
+ "content": [
215
+ r"\b(content|copy|text|wording|messaging)\b",
216
+ r"\b(translation|i18n|l10n|locale|localize)\b",
217
+ r"\b(language|multilingual|international)\b",
218
+ r"\b(strings|labels|captions|titles)\b",
219
+ r"\b(typo|spelling|grammar|proofreading)\b",
220
+ r"\b(typo|spelling)\s+(in|on|for)\b",
221
+ r"\b(spanish|translations?)\b",
222
+ r"\b(blast|banner|video|media)\s+(content|update)\b",
223
+ r"added?\s+(spanish|translation|text|copy|label)\b",
224
+ r"\b(label|message)\s+(change|update|fix)\b",
225
+ ],
226
+ "ui": [
227
+ r"\b(ui|ux|design|layout|styling|visual)\b",
228
+ r"\b(css|scss|sass|less|style)\b",
229
+ r"\b(responsive|mobile|desktop|tablet)\b",
230
+ r"\b(theme|color|font|icon|image)\b",
231
+ r"\b(component|widget|element|button|form)\b",
232
+ r"\b(frontend|front-end|client-side)\b",
233
+ r"\b(sticky|column)\b(?!.*\b(database|table)\b)", # UI sticky, not database
234
+ r"\b(focus|behavior)\b.*\b(combo|box)\b",
235
+ ],
236
+ "infrastructure": [
237
+ r"\b(infra|infrastructure|aws|azure|gcp|cloud)\b",
238
+ r"\b(docker|k8s|kubernetes|container|pod)\b",
239
+ r"\b(terraform|ansible|chef|puppet)\b",
240
+ r"\b(server|hosting|network|load\s+balancer)\b",
241
+ r"\b(monitoring|logging|alerting|metrics)\b",
242
+ ],
243
+ "security": [
244
+ r"\b(security|vulnerability|cve|exploit)\b",
245
+ r"\b(auth|authentication|authorization|permission)\b",
246
+ r"\b(ssl|tls|https|certificate|cert)\b",
247
+ r"\b(encrypt|decrypt|hash|token|oauth)\b",
248
+ r"\b(access\s+control|rbac|cors|xss|csrf)\b",
249
+ r"\b(secure|safety|protect|prevent)\b",
250
+ ],
251
+ "performance": [
252
+ r"\b(perf|performance|optimize|speed|faster)\b",
253
+ r"\b(cache|caching|memory|cpu|disk)\b",
254
+ r"\b(slow|lag|delay|timeout|bottleneck)\b",
255
+ r"\b(efficient|efficiency|throughput|latency)\b",
256
+ r"\b(load\s+time|response\s+time|benchmark)\b",
257
+ r"\b(improve|better)\s+(load|performance|speed)\b",
258
+ ],
259
+ "chore": [
260
+ r"^chore:",
261
+ r"\b(chore|cleanup|housekeeping|maintenance)\b",
262
+ r"\b(routine|regular|scheduled)\b",
263
+ r"\b(lint|linting|format|formatting|prettier)\b",
264
+ r"\b(gitignore|ignore\s+file|artifacts)\b",
265
+ r"\b(console|debug|log|logging)\s+(removal?|clean)\b",
266
+ r"\b(sync|auto-sync)\b",
267
+ r"\b(script\s+update|merge\s+main)\b",
268
+ r"removes?\s+(console|debug|log)\b",
269
+ ],
270
+ "wip": [
271
+ r"\b(wip|work\s+in\s+progress|temp|temporary|tmp)\b",
272
+ r"\b(draft|unfinished|partial|incomplete)\b",
273
+ r"\b(placeholder|todo|fixme)\b",
274
+ r"^wip:",
275
+ r"\b(experiment|experimental|poc|proof\s+of\s+concept)\b",
276
+ r"\b(temporary|temp)\s+(fix|solution|workaround)\b",
277
+ ],
278
+ "version": [
279
+ r"\b(version|bump|tag)\b",
280
+ r"\b(v\d+\.\d+|version\s+\d+|\d+\.\d+\.\d+)\b",
281
+ r"\b(major|minor|patch)\s+(version|release|bump)\b",
282
+ r"^(version|bump):",
283
+ r"\b(prepare\s+for\s+release|pre-release)\b",
284
+ ],
285
+ "maintenance": [
286
+ r"\b(update|upgrade|bump|maintenance|maint)\b",
287
+ r"\b(dependency|dependencies|package|packages)\b",
288
+ r"\b(npm\s+update|pip\s+install|yarn\s+upgrade)\b",
289
+ r"\b(deprecated|obsolete|outdated)\b",
290
+ r"package\.json|requirements\.txt|pom\.xml|Gemfile",
291
+ r"\b(combo|beacon)\s+(hacking|fixes?)\b",
292
+ r"\b(temp|temporary|hack|hacking)\b",
293
+ r"\b(test|testing)\s+(change|update|fix)\b",
294
+ r"\b(more|only)\s+(combo|beacon)\s+(hacking|fires?)\b",
295
+ r"adds?\s+(console|debug|log)\b",
296
+ ],
297
+ "test": [
298
+ r"^test:",
299
+ r"\b(test|testing|spec|unit\s+test|integration\s+test)\b",
300
+ r"\b(junit|pytest|mocha|jest|cypress|selenium)\b",
301
+ r"\b(mock|stub|fixture|factory)\b",
302
+ r"\b(e2e|end-to-end|acceptance|smoke)\b",
303
+ r"\b(coverage|assert|expect|should)\b",
304
+ ],
305
+ "style": [
306
+ r"^style:",
307
+ r"\b(format|formatting|style|lint|linting)\b",
308
+ r"\b(prettier|eslint|black|autopep8|rubocop)\b",
309
+ r"\b(whitespace|indentation|spacing|tabs)\b",
310
+ r"\b(code\s+style|consistent|standardize)\b",
311
+ ],
312
+ "build": [
313
+ r"^build:",
314
+ r"\b(build|compile|bundle|webpack|rollup)\b",
315
+ r"\b(ci|cd|pipeline|workflow|github\s+actions)\b",
316
+ r"\b(docker|dockerfile|makefile|npm\s+scripts)\b",
317
+ r"\b(jenkins|travis|circleci|gitlab)\b",
318
+ r"\b(artifact|binary|executable|jar|war)\b",
319
+ ],
320
+ "integration": [
321
+ r"\b(integrate|integration)\s+(with|posthog|iubenda|auth0)\b",
322
+ r"\b(posthog|iubenda|auth0|oauth|third-party|external)\b",
323
+ r"\b(api|endpoint|service)\s+(integration|connection|setup)\b",
324
+ r"\b(connect|linking|sync)\s+(with|to)\s+[a-z]+(hog|enda|auth)\b",
325
+ r"implement\s+(posthog|iubenda|auth0|api)\b",
326
+ r"adding\s+(posthog|auth|integration)\b",
327
+ r"\b(third-party|external)\s+(service|integration|api)\b",
328
+ r"\bniveles\s+de\s+acceso\s+a\s+la\s+api\b", # Spanish: API access levels
329
+ r"\b(implementation|removing)\s+(iubenda|posthog|auth0)\b",
330
+ ],
331
+ }
332
+
333
+ # Compile categorization patterns
334
+ self.compiled_category_patterns = {}
335
+ for category, patterns in self.category_patterns.items():
336
+ self.compiled_category_patterns[category] = [
337
+ re.compile(pattern, re.IGNORECASE) for pattern in patterns
338
+ ]
339
+
49
340
  def extract_from_text(self, text: str) -> list[dict[str, str]]:
50
341
  """Extract all ticket references from text."""
51
342
  if not text:
@@ -107,9 +398,26 @@ class TicketExtractor:
107
398
  }
108
399
 
109
400
  # Analyze commits
401
+ commits_analyzed = 0
402
+ commits_with_ticket_refs = 0
403
+
110
404
  for commit in commits:
405
+ # Debug: check if commit is actually a dictionary
406
+ if not isinstance(commit, dict):
407
+ logger.error(f"Expected commit to be dict, got {type(commit)}: {commit}")
408
+ continue
409
+
410
+ commits_analyzed += 1
111
411
  ticket_refs = commit.get("ticket_references", [])
412
+
413
+ # Debug logging for the first few commits
414
+ if commits_analyzed <= 5:
415
+ logger.debug(
416
+ f"Commit {commits_analyzed}: hash={commit.get('hash', 'N/A')[:8]}, ticket_refs={ticket_refs}"
417
+ )
418
+
112
419
  if ticket_refs:
420
+ commits_with_ticket_refs += 1
113
421
  commits_with_tickets = cast(int, results["commits_with_tickets"])
114
422
  results["commits_with_tickets"] = commits_with_tickets + 1
115
423
  for ticket in ticket_refs:
@@ -125,15 +433,34 @@ class TicketExtractor:
125
433
  ticket_platforms[platform] = platform_count + 1
126
434
  ticket_summary[platform].add(ticket_id)
127
435
  else:
128
- # Track significant untracked commits
129
- if not commit.get("is_merge") and commit.get("files_changed", 0) > 3:
130
- untracked_commits.append(
131
- {
132
- "hash": commit.get("hash", "")[:7],
133
- "message": commit.get("message", "").split("\n")[0][:60],
134
- "files_changed": commit.get("files_changed", 0),
135
- }
136
- )
436
+ # Track untracked commits with configurable threshold and enhanced data
437
+ files_changed = self._get_files_changed_count(commit)
438
+ if not commit.get("is_merge") and files_changed >= self.untracked_file_threshold:
439
+ # Categorize the commit
440
+ category = self.categorize_commit(commit.get("message", ""))
441
+
442
+ # Extract enhanced commit data
443
+ commit_data = {
444
+ "hash": commit.get("hash", "")[:7],
445
+ "full_hash": commit.get("hash", ""),
446
+ "message": commit.get("message", "").split("\n")[0][
447
+ :100
448
+ ], # Increased from 60 to 100
449
+ "full_message": commit.get("message", ""),
450
+ "author": commit.get("author_name", "Unknown"),
451
+ "author_email": commit.get("author_email", ""),
452
+ "canonical_id": commit.get("canonical_id", commit.get("author_email", "")),
453
+ "timestamp": commit.get("timestamp"),
454
+ "project_key": commit.get("project_key", "UNKNOWN"),
455
+ "files_changed": files_changed,
456
+ "lines_added": commit.get("insertions", 0),
457
+ "lines_removed": commit.get("deletions", 0),
458
+ "lines_changed": (commit.get("insertions", 0) + commit.get("deletions", 0)),
459
+ "category": category,
460
+ "is_merge": commit.get("is_merge", False),
461
+ }
462
+
463
+ untracked_commits.append(commit_data)
137
464
 
138
465
  # Analyze PRs
139
466
  for pr in prs:
@@ -168,8 +495,419 @@ class TicketExtractor:
168
495
  platform: len(tickets) for platform, tickets in ticket_summary.items()
169
496
  }
170
497
 
498
+ # Sort untracked commits by timestamp (most recent first)
499
+ # Handle timezone-aware and timezone-naive datetimes
500
+ def safe_timestamp_key(commit):
501
+ ts = commit.get("timestamp")
502
+ if ts is None:
503
+ return ""
504
+ # If it's a datetime object, handle timezone issues
505
+ if hasattr(ts, "tzinfo") and ts.tzinfo is None:
506
+ # Make timezone-naive datetime UTC-aware for consistent comparison
507
+ ts = ts.replace(tzinfo=timezone.utc)
508
+ return ts
509
+
510
+ untracked_commits.sort(key=safe_timestamp_key, reverse=True)
511
+
512
+ # Debug logging for ticket coverage analysis
513
+ final_commits_with_tickets = cast(int, results["commits_with_tickets"])
514
+ logger.debug(
515
+ f"Ticket coverage analysis complete: {commits_analyzed} commits analyzed, {commits_with_ticket_refs} had ticket_refs, {final_commits_with_tickets} counted as with tickets"
516
+ )
517
+ if commits_analyzed > 0 and final_commits_with_tickets == 0:
518
+ logger.warning(
519
+ f"Zero commits with tickets found out of {commits_analyzed} commits analyzed"
520
+ )
521
+
171
522
  return results
172
523
 
524
+ def calculate_developer_ticket_coverage(
525
+ self, commits: list[dict[str, Any]]
526
+ ) -> dict[str, float]:
527
+ """Calculate ticket coverage percentage per developer.
528
+
529
+ WHY: Individual developer ticket coverage was hardcoded to 0.0, causing
530
+ reports to show contradictory information where total coverage was >0%
531
+ but all individual developers showed 0%. This method provides the missing
532
+ per-developer calculation.
533
+
534
+ DESIGN DECISION: Uses canonical_id when available (post-identity resolution)
535
+ or falls back to author_email for consistent developer identification.
536
+ The coverage calculation only considers commits that meet the untracked
537
+ file threshold to maintain consistency with the overall analysis.
538
+
539
+ Args:
540
+ commits: List of commit dictionaries with ticket_references and identity info
541
+
542
+ Returns:
543
+ Dictionary mapping canonical_id/author_email to coverage percentage
544
+ """
545
+ if not commits:
546
+ return {}
547
+
548
+ # Group commits by developer (canonical_id preferred, fallback to author_email)
549
+ developer_commits = {}
550
+ developer_with_tickets = {}
551
+
552
+ for commit in commits:
553
+ # Skip merge commits (consistent with main analysis)
554
+ if commit.get("is_merge"):
555
+ continue
556
+
557
+ # Only count commits that meet the file threshold (consistent with untracked analysis)
558
+ files_changed = self._get_files_changed_count(commit)
559
+ if files_changed < self.untracked_file_threshold:
560
+ continue
561
+
562
+ # Determine developer identifier (canonical_id preferred)
563
+ developer_id = commit.get("canonical_id") or commit.get("author_email", "unknown")
564
+
565
+ # Initialize counters for this developer
566
+ if developer_id not in developer_commits:
567
+ developer_commits[developer_id] = 0
568
+ developer_with_tickets[developer_id] = 0
569
+
570
+ # Count total commits for this developer
571
+ developer_commits[developer_id] += 1
572
+
573
+ # Count commits with ticket references
574
+ ticket_refs = commit.get("ticket_references", [])
575
+ if ticket_refs:
576
+ developer_with_tickets[developer_id] += 1
577
+
578
+ # Calculate coverage percentages
579
+ coverage_by_developer = {}
580
+ for developer_id in developer_commits:
581
+ total_commits = developer_commits[developer_id]
582
+ commits_with_tickets = developer_with_tickets[developer_id]
583
+
584
+ if total_commits > 0:
585
+ coverage_pct = (commits_with_tickets / total_commits) * 100
586
+ coverage_by_developer[developer_id] = round(coverage_pct, 1)
587
+ else:
588
+ coverage_by_developer[developer_id] = 0.0
589
+
590
+ logger.debug(f"Calculated ticket coverage for {len(coverage_by_developer)} developers")
591
+ return coverage_by_developer
592
+
593
+ def _get_files_changed_count(self, commit: dict[str, Any]) -> int:
594
+ """Extract the number of files changed from commit data.
595
+
596
+ WHY: Commit data can have files_changed as either an integer count
597
+ or a list of file paths. This method handles both cases correctly
598
+ and provides a consistent integer count for analysis.
599
+
600
+ DESIGN DECISION: Priority order is:
601
+ 1. files_changed_count (if present, use directly)
602
+ 2. files_changed as integer (use directly)
603
+ 3. files_changed as list (use length)
604
+ 4. Default to 0 if none available
605
+
606
+ Args:
607
+ commit: Commit data dictionary
608
+
609
+ Returns:
610
+ Integer count of files changed
611
+ """
612
+ # First priority: explicit count field
613
+ if "files_changed_count" in commit:
614
+ return commit["files_changed_count"]
615
+
616
+ # Second priority: files_changed field
617
+ files_changed = commit.get("files_changed")
618
+ if files_changed is not None:
619
+ if isinstance(files_changed, int):
620
+ return files_changed
621
+ elif isinstance(files_changed, list):
622
+ return len(files_changed)
623
+
624
+ # Default fallback
625
+ return 0
626
+
627
+ def categorize_commit(self, message: str) -> str:
628
+ """Categorize a commit based on its message.
629
+
630
+ WHY: Commit categorization helps identify patterns in untracked work,
631
+ enabling better insights into what types of work are not being tracked
632
+ through tickets. This supports improved process recommendations.
633
+
634
+ DESIGN DECISION: Categories are checked in priority order to ensure
635
+ more specific patterns match before general ones. For example,
636
+ "security" patterns are checked before "feature" patterns to prevent
637
+ "add authentication" from being classified as a feature instead of security.
638
+
639
+ Args:
640
+ message: The commit message to categorize
641
+
642
+ Returns:
643
+ String category (bug_fix, feature, refactor, documentation,
644
+ maintenance, test, style, build, or other)
645
+ """
646
+ if not message:
647
+ return "other"
648
+
649
+ # Filter git artifacts before categorization
650
+ cleaned_message = filter_git_artifacts(message)
651
+ if not cleaned_message:
652
+ return "other"
653
+
654
+ # Remove ticket references to focus on content analysis
655
+ # This helps classify commits with ticket references based on their actual content
656
+ message_without_tickets = self._remove_ticket_references(cleaned_message)
657
+ message_lower = message_without_tickets.lower()
658
+
659
+ # Define priority order - conventional commits first, then specific patterns
660
+ priority_order = [
661
+ # Conventional commit formats (start with specific prefixes)
662
+ "wip", # ^wip: prefix
663
+ "chore", # ^chore: prefix
664
+ "style", # ^style: prefix
665
+ "bug_fix", # ^fix: prefix
666
+ "feature", # ^feat: prefix
667
+ "test", # ^test: prefix
668
+ "build", # ^build: prefix
669
+ "deployment", # ^deploy: prefix and specific deployment terms
670
+ # Specific domain patterns (no conventional prefix conflicts)
671
+ "version", # Version-specific patterns
672
+ "security", # Security-specific terms
673
+ "performance", # Performance-specific terms
674
+ "infrastructure", # Infrastructure-specific terms
675
+ "integration", # Third-party integration terms
676
+ "configuration", # Configuration-specific terms
677
+ "content", # Content-specific terms
678
+ "ui", # UI-specific terms
679
+ "documentation", # Documentation terms
680
+ "refactor", # Refactoring terms
681
+ "maintenance", # General maintenance terms
682
+ ]
683
+
684
+ # First, check for conventional commit patterns (^prefix:) which have absolute priority
685
+ conventional_patterns = {
686
+ "chore": r"^chore:",
687
+ "style": r"^style:",
688
+ "bug_fix": r"^fix:",
689
+ "feature": r"^(feat|feature):",
690
+ "test": r"^test:",
691
+ "build": r"^build:",
692
+ "deployment": r"^deploy:",
693
+ "wip": r"^wip:",
694
+ "version": r"^(version|bump):",
695
+ }
696
+
697
+ for category, pattern in conventional_patterns.items():
698
+ if re.match(pattern, message_lower):
699
+ return category
700
+
701
+ # Then check categories in priority order for non-conventional patterns
702
+ for category in priority_order:
703
+ if category in self.compiled_category_patterns:
704
+ for pattern in self.compiled_category_patterns[category]:
705
+ if pattern.search(message_lower):
706
+ return category
707
+
708
+ return "other"
709
+
710
+ def _remove_ticket_references(self, message: str) -> str:
711
+ """Remove ticket references from commit message to focus on content analysis.
712
+
713
+ WHY: Ticket references like 'RMVP-941' or '[CNA-482]' don't indicate the type
714
+ of work being done. We need to analyze the actual description to properly
715
+ categorize commits with ticket references.
716
+
717
+ Args:
718
+ message: The commit message possibly containing ticket references
719
+
720
+ Returns:
721
+ Message with ticket references removed, focusing on the actual description
722
+ """
723
+ if not message:
724
+ return ""
725
+
726
+ # Remove common ticket patterns at the start of messages
727
+ patterns_to_remove = [
728
+ # JIRA-style patterns
729
+ r"^[A-Z]{2,10}-\d+:?\s*", # RMVP-941: or RMVP-941
730
+ r"^\[[A-Z]{2,10}-\d+\]\s*", # [CNA-482]
731
+ # GitHub issue patterns
732
+ r"^#\d+:?\s*", # #123: or #123
733
+ r"^GH-\d+:?\s*", # GH-123:
734
+ # ClickUp patterns
735
+ r"^CU-[a-z0-9]+:?\s*", # CU-abc123:
736
+ # Linear patterns
737
+ r"^[A-Z]{2,5}-\d+:?\s*", # ENG-123:
738
+ r"^LIN-\d+:?\s*", # LIN-123:
739
+ # GitHub PR patterns in messages
740
+ r"\(#\d+\)$", # (#115) at end
741
+ r"\(#\d+\)\s*\(#\d+\)*\s*$", # (#131) (#133) (#134) at end
742
+ # Other ticket-like patterns
743
+ r"^[A-Z]{2,10}\s+\d+\s*", # NEWS 206
744
+ ]
745
+
746
+ cleaned_message = message
747
+ for pattern in patterns_to_remove:
748
+ cleaned_message = re.sub(pattern, "", cleaned_message, flags=re.IGNORECASE).strip()
749
+
750
+ # If we removed everything, return the original message
751
+ # This handles cases where the entire message was just a ticket reference
752
+ if not cleaned_message.strip():
753
+ return message
754
+
755
+ return cleaned_message
756
+
757
def analyze_untracked_patterns(self, untracked_commits: list[dict[str, Any]]) -> dict[str, Any]:
    """Analyze patterns in untracked commits for insights.

    WHY: Understanding patterns in untracked work helps identify:
    - Common types of work that bypass ticket tracking
    - Developers who need process guidance
    - Categories of work that should be tracked vs. allowed to be untracked

    Args:
        untracked_commits: List of untracked commit data. Each commit dict is
            read with ``.get`` for the keys ``category``, ``lines_changed``,
            ``hash``, ``message``, ``author``, ``canonical_id``,
            ``author_email`` and ``project_key``; missing keys fall back to
            defaults.

    Returns:
        Dictionary with pattern analysis results:
        - ``total_untracked``: number of commits analyzed
        - ``categories``: per-category count, lines changed and up to three
          example commits
        - ``top_contributors``: up to five ``(author, count)`` tuples, highest
          count first
        - ``projects``: per-project count and sorted list of categories seen
        - ``avg_commit_size``: mean ``lines_changed`` rounded to one decimal
        - ``recommendations``: output of ``_generate_untracked_recommendations``
    """
    if not untracked_commits:
        return {
            "total_untracked": 0,
            "categories": {},
            "top_contributors": [],
            "projects": {},
            "avg_commit_size": 0,
            "recommendations": [],
        }

    categories: dict[Any, dict[str, Any]] = {}
    contributors: dict[Any, dict[str, Any]] = {}
    projects: dict[Any, dict[str, Any]] = {}
    total_lines = 0

    # Single pass: category, contributor and project tallies all read the same
    # commit fields, so there is no need to walk the list once per tally.
    for commit in untracked_commits:
        category = commit.get("category", "other")
        lines_changed = commit.get("lines_changed", 0)
        total_lines += lines_changed

        # Category analysis
        category_stats = categories.setdefault(
            category, {"count": 0, "lines_changed": 0, "examples": []}
        )
        category_stats["count"] += 1
        category_stats["lines_changed"] += lines_changed
        if len(category_stats["examples"]) < 3:
            category_stats["examples"].append(
                {
                    "hash": commit.get("hash", ""),
                    "message": commit.get("message", ""),
                    "author": commit.get("author", ""),
                }
            )

        # Contributor analysis (canonical identity preferred over raw email)
        author = commit.get("canonical_id", commit.get("author_email", "Unknown"))
        author_stats = contributors.setdefault(author, {"count": 0, "categories": set()})
        author_stats["count"] += 1
        author_stats["categories"].add(category)

        # Project analysis
        project = commit.get("project_key", "UNKNOWN")
        project_stats = projects.setdefault(project, {"count": 0, "categories": set()})
        project_stats["count"] += 1
        project_stats["categories"].add(category)

    # Convert sets to lists for JSON serialization. Sorting makes the output
    # reproducible: plain list(set) order changes between interpreter runs
    # because of string hash randomization. key=str keeps sorting safe even if
    # a commit carries a non-string category.
    for grouping in (contributors, projects):
        for stats in grouping.values():
            stats["categories"] = sorted(stats["categories"], key=str)

    # Top contributors (stable sort: ties keep first-seen order)
    top_contributors = sorted(
        [(author, data["count"]) for author, data in contributors.items()],
        key=lambda x: x[1],
        reverse=True,
    )[:5]

    # Average commit size; the list is known to be non-empty here
    avg_commit_size = total_lines / len(untracked_commits)

    # Generate recommendations
    recommendations = self._generate_untracked_recommendations(
        categories, contributors, projects, len(untracked_commits)
    )

    return {
        "total_untracked": len(untracked_commits),
        "categories": categories,
        "top_contributors": top_contributors,
        "projects": projects,
        "avg_commit_size": round(avg_commit_size, 1),
        "recommendations": recommendations,
    }
848
+
849
+ def _generate_untracked_recommendations(
850
+ self,
851
+ categories: dict[str, Any],
852
+ contributors: dict[str, Any],
853
+ projects: dict[str, Any],
854
+ total_untracked: int,
855
+ ) -> list[dict[str, str]]:
856
+ """Generate recommendations based on untracked commit patterns."""
857
+ recommendations = []
858
+
859
+ # Category-based recommendations
860
+ if categories.get("feature", {}).get("count", 0) > total_untracked * 0.2:
861
+ recommendations.append(
862
+ {
863
+ "type": "process",
864
+ "title": "Track Feature Development",
865
+ "description": "Many feature commits lack ticket references. Consider requiring tickets for new features.",
866
+ "priority": "high",
867
+ }
868
+ )
869
+
870
+ if categories.get("bug_fix", {}).get("count", 0) > total_untracked * 0.15:
871
+ recommendations.append(
872
+ {
873
+ "type": "process",
874
+ "title": "Improve Bug Tracking",
875
+ "description": "Bug fixes should be tracked through issue management systems.",
876
+ "priority": "high",
877
+ }
878
+ )
879
+
880
+ # Allow certain categories to be untracked
881
+ low_priority_categories = ["style", "documentation", "maintenance"]
882
+ low_priority_count = sum(
883
+ categories.get(cat, {}).get("count", 0) for cat in low_priority_categories
884
+ )
885
+
886
+ if low_priority_count > total_untracked * 0.6:
887
+ recommendations.append(
888
+ {
889
+ "type": "positive",
890
+ "title": "Appropriate Untracked Work",
891
+ "description": "Most untracked commits are maintenance/style/docs - this is acceptable.",
892
+ "priority": "low",
893
+ }
894
+ )
895
+
896
+ # Contributor-based recommendations
897
+ if len(contributors) > 1:
898
+ max_contributor_count = max(data["count"] for data in contributors.values())
899
+ if max_contributor_count > total_untracked * 0.5:
900
+ recommendations.append(
901
+ {
902
+ "type": "team",
903
+ "title": "Provide Process Training",
904
+ "description": "Some developers need guidance on ticket referencing practices.",
905
+ "priority": "medium",
906
+ }
907
+ )
908
+
909
+ return recommendations
910
+
173
911
  def _format_ticket_id(self, platform: str, ticket_id: str) -> str:
174
912
  """Format ticket ID for display."""
175
913
  if platform == "github":