gitflow-analytics 1.0.3__py3-none-any.whl → 1.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. gitflow_analytics/_version.py +1 -1
  2. gitflow_analytics/classification/__init__.py +31 -0
  3. gitflow_analytics/classification/batch_classifier.py +752 -0
  4. gitflow_analytics/classification/classifier.py +464 -0
  5. gitflow_analytics/classification/feature_extractor.py +725 -0
  6. gitflow_analytics/classification/linguist_analyzer.py +574 -0
  7. gitflow_analytics/classification/model.py +455 -0
  8. gitflow_analytics/cli.py +4108 -350
  9. gitflow_analytics/cli_rich.py +198 -48
  10. gitflow_analytics/config/__init__.py +43 -0
  11. gitflow_analytics/config/errors.py +261 -0
  12. gitflow_analytics/config/loader.py +904 -0
  13. gitflow_analytics/config/profiles.py +264 -0
  14. gitflow_analytics/config/repository.py +124 -0
  15. gitflow_analytics/config/schema.py +441 -0
  16. gitflow_analytics/config/validator.py +154 -0
  17. gitflow_analytics/config.py +44 -508
  18. gitflow_analytics/core/analyzer.py +1209 -98
  19. gitflow_analytics/core/cache.py +1337 -29
  20. gitflow_analytics/core/data_fetcher.py +1193 -0
  21. gitflow_analytics/core/identity.py +363 -14
  22. gitflow_analytics/core/metrics_storage.py +526 -0
  23. gitflow_analytics/core/progress.py +372 -0
  24. gitflow_analytics/core/schema_version.py +269 -0
  25. gitflow_analytics/extractors/ml_tickets.py +1100 -0
  26. gitflow_analytics/extractors/story_points.py +8 -1
  27. gitflow_analytics/extractors/tickets.py +749 -11
  28. gitflow_analytics/identity_llm/__init__.py +6 -0
  29. gitflow_analytics/identity_llm/analysis_pass.py +231 -0
  30. gitflow_analytics/identity_llm/analyzer.py +464 -0
  31. gitflow_analytics/identity_llm/models.py +76 -0
  32. gitflow_analytics/integrations/github_integration.py +175 -11
  33. gitflow_analytics/integrations/jira_integration.py +461 -24
  34. gitflow_analytics/integrations/orchestrator.py +124 -1
  35. gitflow_analytics/metrics/activity_scoring.py +322 -0
  36. gitflow_analytics/metrics/branch_health.py +470 -0
  37. gitflow_analytics/metrics/dora.py +379 -20
  38. gitflow_analytics/models/database.py +843 -53
  39. gitflow_analytics/pm_framework/__init__.py +115 -0
  40. gitflow_analytics/pm_framework/adapters/__init__.py +50 -0
  41. gitflow_analytics/pm_framework/adapters/jira_adapter.py +1845 -0
  42. gitflow_analytics/pm_framework/base.py +406 -0
  43. gitflow_analytics/pm_framework/models.py +211 -0
  44. gitflow_analytics/pm_framework/orchestrator.py +652 -0
  45. gitflow_analytics/pm_framework/registry.py +333 -0
  46. gitflow_analytics/qualitative/__init__.py +9 -10
  47. gitflow_analytics/qualitative/chatgpt_analyzer.py +259 -0
  48. gitflow_analytics/qualitative/classifiers/__init__.py +3 -3
  49. gitflow_analytics/qualitative/classifiers/change_type.py +518 -244
  50. gitflow_analytics/qualitative/classifiers/domain_classifier.py +272 -165
  51. gitflow_analytics/qualitative/classifiers/intent_analyzer.py +321 -222
  52. gitflow_analytics/qualitative/classifiers/llm/__init__.py +35 -0
  53. gitflow_analytics/qualitative/classifiers/llm/base.py +193 -0
  54. gitflow_analytics/qualitative/classifiers/llm/batch_processor.py +383 -0
  55. gitflow_analytics/qualitative/classifiers/llm/cache.py +479 -0
  56. gitflow_analytics/qualitative/classifiers/llm/cost_tracker.py +435 -0
  57. gitflow_analytics/qualitative/classifiers/llm/openai_client.py +403 -0
  58. gitflow_analytics/qualitative/classifiers/llm/prompts.py +373 -0
  59. gitflow_analytics/qualitative/classifiers/llm/response_parser.py +287 -0
  60. gitflow_analytics/qualitative/classifiers/llm_commit_classifier.py +607 -0
  61. gitflow_analytics/qualitative/classifiers/risk_analyzer.py +215 -189
  62. gitflow_analytics/qualitative/core/__init__.py +4 -4
  63. gitflow_analytics/qualitative/core/llm_fallback.py +239 -235
  64. gitflow_analytics/qualitative/core/nlp_engine.py +157 -148
  65. gitflow_analytics/qualitative/core/pattern_cache.py +214 -192
  66. gitflow_analytics/qualitative/core/processor.py +381 -248
  67. gitflow_analytics/qualitative/enhanced_analyzer.py +2236 -0
  68. gitflow_analytics/qualitative/example_enhanced_usage.py +420 -0
  69. gitflow_analytics/qualitative/models/__init__.py +7 -7
  70. gitflow_analytics/qualitative/models/schemas.py +155 -121
  71. gitflow_analytics/qualitative/utils/__init__.py +4 -4
  72. gitflow_analytics/qualitative/utils/batch_processor.py +136 -123
  73. gitflow_analytics/qualitative/utils/cost_tracker.py +142 -140
  74. gitflow_analytics/qualitative/utils/metrics.py +172 -158
  75. gitflow_analytics/qualitative/utils/text_processing.py +146 -104
  76. gitflow_analytics/reports/__init__.py +100 -0
  77. gitflow_analytics/reports/analytics_writer.py +539 -14
  78. gitflow_analytics/reports/base.py +648 -0
  79. gitflow_analytics/reports/branch_health_writer.py +322 -0
  80. gitflow_analytics/reports/classification_writer.py +924 -0
  81. gitflow_analytics/reports/cli_integration.py +427 -0
  82. gitflow_analytics/reports/csv_writer.py +1676 -212
  83. gitflow_analytics/reports/data_models.py +504 -0
  84. gitflow_analytics/reports/database_report_generator.py +427 -0
  85. gitflow_analytics/reports/example_usage.py +344 -0
  86. gitflow_analytics/reports/factory.py +499 -0
  87. gitflow_analytics/reports/formatters.py +698 -0
  88. gitflow_analytics/reports/html_generator.py +1116 -0
  89. gitflow_analytics/reports/interfaces.py +489 -0
  90. gitflow_analytics/reports/json_exporter.py +2770 -0
  91. gitflow_analytics/reports/narrative_writer.py +2287 -158
  92. gitflow_analytics/reports/story_point_correlation.py +1144 -0
  93. gitflow_analytics/reports/weekly_trends_writer.py +389 -0
  94. gitflow_analytics/training/__init__.py +5 -0
  95. gitflow_analytics/training/model_loader.py +377 -0
  96. gitflow_analytics/training/pipeline.py +550 -0
  97. gitflow_analytics/tui/__init__.py +1 -1
  98. gitflow_analytics/tui/app.py +129 -126
  99. gitflow_analytics/tui/screens/__init__.py +3 -3
  100. gitflow_analytics/tui/screens/analysis_progress_screen.py +188 -179
  101. gitflow_analytics/tui/screens/configuration_screen.py +154 -178
  102. gitflow_analytics/tui/screens/loading_screen.py +100 -110
  103. gitflow_analytics/tui/screens/main_screen.py +89 -72
  104. gitflow_analytics/tui/screens/results_screen.py +305 -281
  105. gitflow_analytics/tui/widgets/__init__.py +2 -2
  106. gitflow_analytics/tui/widgets/data_table.py +67 -69
  107. gitflow_analytics/tui/widgets/export_modal.py +76 -76
  108. gitflow_analytics/tui/widgets/progress_widget.py +41 -46
  109. gitflow_analytics-1.3.6.dist-info/METADATA +1015 -0
  110. gitflow_analytics-1.3.6.dist-info/RECORD +122 -0
  111. gitflow_analytics-1.0.3.dist-info/METADATA +0 -490
  112. gitflow_analytics-1.0.3.dist-info/RECORD +0 -62
  113. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/WHEEL +0 -0
  114. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/entry_points.txt +0 -0
  115. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/licenses/LICENSE +0 -0
  116. {gitflow_analytics-1.0.3.dist-info → gitflow_analytics-1.3.6.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,607 @@
1
+ """LLM-based commit classification orchestrator.
2
+
3
+ This module provides the main interface for LLM-based commit classification,
4
+ orchestrating the various components for a complete classification solution.
5
+
6
+ WHY: This refactored version separates concerns into focused modules while
7
+ maintaining backward compatibility with the existing interface.
8
+
9
+ DESIGN DECISIONS:
10
+ - Main orchestrator delegates to specialized components
11
+ - Maintains backward compatibility with existing code
12
+ - Supports multiple LLM providers through abstraction
13
+ - Provides enhanced rule-based fallback
14
+ - Comprehensive error handling and graceful degradation
15
+ """
16
+
17
+ import logging
18
+ import re
19
+ import time
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+ from typing import Any, Optional
23
+
24
+ from .llm.batch_processor import BatchConfig, BatchProcessor
25
+ from .llm.cache import LLMCache
26
+ from .llm.openai_client import OpenAIClassifier, OpenAIConfig
27
+ from .llm.prompts import PromptVersion
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
@dataclass
class LLMConfig:
    """Configuration for LLM-based commit classification.

    Maintains backward compatibility with existing configuration structure.
    When ``domain_terms`` is omitted, a default organization-oriented term map
    is populated in ``__post_init__``.
    """

    # OpenRouter API configuration
    api_key: Optional[str] = None
    api_base_url: str = "https://openrouter.ai/api/v1"
    model: str = "mistralai/mistral-7b-instruct"  # Fast, affordable model

    # Classification parameters
    confidence_threshold: float = 0.7  # Minimum confidence for LLM predictions
    max_tokens: int = 50  # Keep responses short
    temperature: float = 0.1  # Low temperature for consistent results
    timeout_seconds: float = 30.0  # API timeout

    # Caching configuration
    cache_duration_days: int = 90  # Long cache duration for cost optimization
    enable_caching: bool = True

    # Cost optimization
    batch_size: int = 1  # Process one at a time for simplicity
    max_daily_requests: int = 1000  # Rate limiting

    # Domain-specific terms for organization.
    # NOTE: annotated Optional because the default really is None; the default
    # mapping is filled in by __post_init__ (a mutable dict default would be
    # shared across instances, so None + post-init is the safe pattern).
    domain_terms: Optional[dict[str, list[str]]] = None

    def __post_init__(self) -> None:
        """Initialize default domain terms if not provided."""
        if self.domain_terms is None:
            self.domain_terms = {
                "media": [
                    "video",
                    "audio",
                    "streaming",
                    "player",
                    "media",
                    "content",
                    "broadcast",
                    "live",
                    "recording",
                    "episode",
                    "program",
                ],
                "localization": [
                    "translation",
                    "i18n",
                    "l10n",
                    "locale",
                    "language",
                    "spanish",
                    "french",
                    "german",
                    "italian",
                    "portuguese",
                    "multilingual",
                ],
                "integration": [
                    "api",
                    "webhook",
                    "third-party",
                    "external",
                    "service",
                    "integration",
                    "sync",
                    "import",
                    "export",
                    "connector",
                ],
            }
104
+
105
+
106
class LLMCommitClassifier:
    """LLM-based commit classifier with modular architecture.

    This refactored version delegates to specialized components (LLM client,
    prediction cache, batch processor) for better maintainability while
    preserving the original interface. When the LLM is unavailable,
    unconfigured, or rate limited, classification falls back to an enhanced
    rule-based approach so callers always get a result.
    """

    # Streamlined category definitions (same as original)
    CATEGORIES = {
        "feature": "New functionality, capabilities, enhancements, additions",
        "bugfix": "Fixes, errors, issues, crashes, bugs, corrections",
        "maintenance": "Configuration, chores, dependencies, cleanup, refactoring, updates",
        "integration": "Third-party services, APIs, webhooks, external systems",
        "content": "Text, copy, documentation, README updates, comments",
        "media": "Video, audio, streaming, players, visual assets, images",
        "localization": "Translations, i18n, l10n, regional adaptations",
    }

    def __init__(self, config: LLMConfig, cache_dir: Optional[Path] = None):
        """Initialize LLM commit classifier with modular components.

        Args:
            config: LLM configuration
            cache_dir: Directory for caching predictions (defaults to
                ".gitflow-cache" in the current working directory)
        """
        self.config = config
        self.cache_dir = cache_dir or Path(".gitflow-cache")
        self.cache_dir.mkdir(exist_ok=True)

        # Initialize components
        self._init_classifier()
        self._init_cache()
        self._init_batch_processor()
        self._init_rule_patterns()

        # Request tracking for rate limiting (backward compatibility)
        self._daily_requests = 0
        self._last_reset_date = None

        # Cost tracking (backward compatibility)
        self.total_tokens_used = 0
        self.total_cost = 0.0
        self.api_calls_made = 0

        logger.info(f"LLMCommitClassifier initialized with model: {self.config.model}")

    def _init_classifier(self) -> None:
        """Initialize the LLM classifier component.

        WHY: Modular initialization allows easy switching between providers.
        On ImportError (missing optional dependency) the classifier is set to
        None and rule-based fallback takes over.
        """
        # Convert config to OpenAI config
        openai_config = OpenAIConfig(
            api_key=self.config.api_key,
            api_base_url=self.config.api_base_url,
            model=self.config.model,
            temperature=self.config.temperature,
            max_tokens=self.config.max_tokens,
            timeout_seconds=self.config.timeout_seconds,
            max_daily_requests=self.config.max_daily_requests,
            use_openrouter=True,  # Default to OpenRouter
        )

        # Initialize classifier
        try:
            self.classifier = OpenAIClassifier(
                config=openai_config,
                cache_dir=self.cache_dir,
                prompt_version=PromptVersion.V3_CONTEXTUAL,
            )

            # Set domain terms in prompt generator
            self.classifier.prompt_generator.domain_terms = self.config.domain_terms

        except ImportError as e:
            logger.warning(f"Failed to initialize LLM classifier: {e}")
            self.classifier = None

    def _init_cache(self) -> None:
        """Initialize the caching component.

        WHY: Separate cache initialization for better error handling — a
        broken cache must not prevent classification from running.
        """
        self.cache: Optional[LLMCache] = None
        if self.config.enable_caching:
            try:
                cache_path = self.cache_dir / "llm_predictions.db"
                self.cache = LLMCache(
                    cache_path=cache_path, expiration_days=self.config.cache_duration_days
                )
            except Exception as e:
                logger.warning(f"Failed to initialize LLM cache: {e}")
                self.cache = None

    def _init_batch_processor(self) -> None:
        """Initialize the batch processing component.

        WHY: Batch processing improves efficiency for large-scale classification.
        """
        batch_config = BatchConfig(
            batch_size=self.config.batch_size, show_progress=True, continue_on_batch_failure=True
        )
        self.batch_processor = BatchProcessor(batch_config)

    def _init_rule_patterns(self) -> None:
        """Initialize rule-based patterns for fallback classification.

        WHY: Rule-based fallback ensures classification works even
        when LLM is unavailable. Patterns are checked in dict order, so
        conventional-commit prefixes win over looser substring matches.
        """
        self.rule_patterns = {
            "feature": [
                r"^(feat|feature)[\(\:]",
                r"^add[\(\:]",
                r"^implement[\(\:]",
                r"^create[\(\:]",
                r"add.*feature",
                r"implement.*feature",
                r"create.*feature",
                r"new.*feature",
                r"introduce.*feature",
                r"^enhancement[\(\:]",
            ],
            "bugfix": [
                r"^(fix|bug|hotfix|patch)[\(\:]",
                r"fix.*bug(?!.*format)",
                r"fix.*issue(?!.*format)",
                r"resolve.*bug",
                r"correct.*bug",
                r"repair.*",
                r"^hotfix[\(\:]",
                r"patch.*bug",
                r"debug.*",
            ],
            "maintenance": [
                r"^(chore|refactor|style|deps|build|ci|test)[\(\:]",
                r"^update[\(\:]",
                r"^bump[\(\:]",
                r"^upgrade[\(\:]",
                r"refactor.*",
                r"cleanup",
                r"update.*depend",
                r"bump.*version",
                r"configure.*",
                r"maintenance",
                r"organize.*",
                r"format.*",
                r"style.*",
                r"lint.*",
                r"improve.*performance",
                r"optimize.*",
            ],
            "content": [
                r"^(docs|doc|readme)[\(\:]",
                r"update.*readme",
                r"documentation",
                r"^comment[\(\:]",
                r"doc.*update",
                r"add.*comment",
                r"update.*doc",
                r"add.*documentation",
            ],
        }

    def classify_commit(
        self, message: str, files_changed: Optional[list[str]] = None
    ) -> dict[str, Any]:
        """Classify a commit message using LLM or fallback methods.

        Resolution order: cache hit -> LLM (if configured and under the daily
        rate limit) -> enhanced rule-based fallback.

        Args:
            message: Cleaned commit message
            files_changed: Optional list of changed files

        Returns:
            Classification result dictionary (backward compatible format)
        """
        start_time = time.time()

        # Check for empty message
        if not message or not message.strip():
            return self._create_result("maintenance", 0.3, "empty_message", start_time)

        # Try cache first
        if self.cache:
            cached_result = self.cache.get(message, files_changed)
            if cached_result:
                cached_result["processing_time_ms"] = (time.time() - start_time) * 1000
                return cached_result

        # Try LLM classification if available and configured
        if self.classifier and self.config.api_key:
            try:
                # Check rate limits
                if self._check_rate_limits():
                    result = self.classifier.classify_commit(message, files_changed)

                    # Check if LLM actually succeeded
                    if result.method == "llm":
                        # Update statistics for backward compatibility
                        self.api_calls_made += 1
                        self._daily_requests += 1

                        # Get cost information from classifier
                        stats = self.classifier.get_statistics()
                        self.total_tokens_used = stats.get("total_tokens_used", 0)
                        self.total_cost = stats.get("total_cost", 0.0)

                        # Convert to backward compatible format
                        result_dict = result.to_dict()

                        # Cache successful result
                        if self.cache:
                            self.cache.store(message, files_changed, result_dict)

                        return result_dict
                    # If method is not 'llm', fall through to rule-based
                else:
                    logger.debug("Rate limit exceeded, using rule-based fallback")
            except Exception as e:
                logger.debug(f"LLM classification not available: {e}")

        # Fall back to enhanced rule-based classification.
        # BUGFIX: forward start_time so processing_time_ms reflects elapsed
        # time rather than being recomputed from a fresh clock.
        return self._enhanced_rule_based_classification(message, files_changed or [], start_time)

    def classify_commits_batch(
        self,
        commits: list[dict[str, Any]],
        batch_id: Optional[str] = None,
        include_confidence: bool = True,
    ) -> list[dict[str, Any]]:
        """Classify a batch of commits.

        Args:
            commits: List of commit dictionaries
            batch_id: Optional batch identifier
            include_confidence: Whether to include confidence scores

        Returns:
            List of classification results (backward compatible format)
        """

        def classify_func(commit: dict[str, Any]) -> dict[str, Any]:
            """Classification function for batch processor."""
            message = commit.get("message", "")
            files_changed = []

            # Extract files from commit
            if "files_changed" in commit:
                fc = commit["files_changed"]
                if isinstance(fc, list):
                    files_changed = fc

            return self.classify_commit(message, files_changed)

        # Use batch processor
        results = self.batch_processor.process_commits(
            commits, classify_func, f"Classifying {len(commits)} commits"
        )

        # Add batch_id if provided
        if batch_id:
            for result in results:
                result["batch_id"] = batch_id

        logger.info(f"Batch {batch_id}: Classified {len(results)} commits")
        return results

    def _enhanced_rule_based_classification(
        self, message: str, files_changed: list[str], start_time: Optional[float] = None
    ) -> dict[str, Any]:
        """Enhanced rule-based classification as fallback.

        Checks, in order: explicit style/format prefixes, category regex
        patterns, file-path heuristics, semantic phrase indicators, then a
        length/urgency heuristic as the final default.

        Args:
            message: Commit message
            files_changed: List of changed files
            start_time: Optional perf-clock origin (time.time()) from the
                caller; defaults to now. BUGFIX: the previous version passed
                the literal 0.0 to _create_result, which made
                processing_time_ms report the Unix epoch time in milliseconds
                instead of the elapsed time.

        Returns:
            Classification result dictionary
        """
        if start_time is None:
            start_time = time.time()

        message_lower = message.lower()

        # Check style/formatting first
        if re.search(r"^(style|format)[\(\:]", message_lower):
            return self._create_result(
                "maintenance", 0.8, "rule_enhanced", start_time, "Style/formatting commit"
            )

        # Check other patterns
        for category, patterns in self.rule_patterns.items():
            for pattern in patterns:
                if re.search(pattern, message_lower):
                    return self._create_result(
                        category, 0.8, "rule_enhanced", start_time, f"Matched pattern: {pattern}"
                    )

        # File-based analysis
        if files_changed:
            category = self._analyze_files(files_changed)
            if category:
                return self._create_result(
                    category, 0.7, "rule_enhanced", start_time, "File-based classification"
                )

        # Semantic analysis
        category = self._semantic_analysis(message_lower)
        if category:
            return self._create_result(
                category, 0.6, "rule_enhanced", start_time, f"Semantic indicator for {category}"
            )

        # Default fallback
        if len(message.split()) >= 5:
            return self._create_result(
                "feature", 0.4, "rule_enhanced", start_time, "Detailed commit suggests feature"
            )
        elif any(term in message_lower for term in ["urgent", "critical", "!"]):
            return self._create_result(
                "bugfix", 0.5, "rule_enhanced", start_time, "Urgent language suggests bug fix"
            )
        else:
            return self._create_result(
                "maintenance", 0.3, "rule_enhanced", start_time, "General maintenance work"
            )

    def _analyze_files(self, files_changed: list[str]) -> Optional[str]:
        """Analyze files to determine category.

        Args:
            files_changed: List of changed files

        Returns:
            Category or None. Documentation outranks test/config, which
            outranks media, when multiple file kinds are present.
        """
        file_patterns = []

        for file_path in files_changed:
            file_lower = file_path.lower()
            ext = Path(file_path).suffix.lower()

            if any(term in file_lower for term in ["readme", "doc", "changelog", ".md"]):
                file_patterns.append("documentation")
            elif any(term in file_lower for term in ["test", "spec", "__test__"]):
                file_patterns.append("test")
            elif any(term in file_lower for term in ["config", "package.json", ".yml"]):
                file_patterns.append("configuration")
            elif ext in [".jpg", ".png", ".gif", ".mp4", ".mp3", ".svg"]:
                file_patterns.append("media")

        # Determine category from patterns
        if "documentation" in file_patterns:
            return "content"
        elif "test" in file_patterns or "configuration" in file_patterns:
            return "maintenance"
        elif "media" in file_patterns:
            return "media"

        return None

    def _semantic_analysis(self, message_lower: str) -> Optional[str]:
        """Perform semantic analysis on message.

        Args:
            message_lower: Lowercase commit message

        Returns:
            Category whose indicator phrase appears in the message, or None.
        """
        semantic_indicators = {
            "feature": ["implement new", "create new", "introduce new", "develop", "build new"],
            "bugfix": [
                "resolve error",
                "correct issue",
                "repair bug",
                "solve problem",
                "address bug",
            ],
            "maintenance": [
                "update config",
                "upgrade",
                "modify existing",
                "change setting",
                "improve performance",
            ],
            "content": ["document", "explain", "describe", "clarify", "write documentation"],
        }

        for category, indicators in semantic_indicators.items():
            if any(indicator in message_lower for indicator in indicators):
                return category

        return None

    def _check_rate_limits(self) -> bool:
        """Check if we're within daily rate limits.

        The per-day counter resets the first time this is called on a new
        calendar day.

        Returns:
            True if request is allowed
        """
        from datetime import datetime

        current_date = datetime.now().date()

        # Reset counter if new day
        if current_date != self._last_reset_date:
            self._daily_requests = 0
            self._last_reset_date = current_date

        return self._daily_requests < self.config.max_daily_requests

    def _create_result(
        self,
        category: str,
        confidence: float,
        method: str,
        start_time: float,
        reasoning: Optional[str] = None,
    ) -> dict[str, Any]:
        """Create a standardized result dictionary.

        Args:
            category: Classification category
            confidence: Confidence score
            method: Classification method
            start_time: Processing start time (time.time() value)
            reasoning: Optional reasoning text

        Returns:
            Result dictionary (backward compatible format)
        """
        return {
            "category": category,
            "confidence": confidence,
            "method": method,
            "reasoning": reasoning or f"Classified using {method}",
            "model": self.config.model if method == "llm" else "rule-based",
            "alternatives": [],
            "processing_time_ms": (time.time() - start_time) * 1000,
        }

    def get_statistics(self) -> dict[str, Any]:
        """Get classifier usage statistics.

        Returns:
            Dictionary with usage statistics (backward compatible)
        """
        stats = {
            "daily_requests": self._daily_requests,
            "max_daily_requests": self.config.max_daily_requests,
            "model": self.config.model,
            "cache_enabled": self.config.enable_caching,
            "api_configured": bool(self.config.api_key),
            "total_tokens_used": self.total_tokens_used,
            "total_cost": self.total_cost,
            "api_calls_made": self.api_calls_made,
            "average_tokens_per_call": (
                self.total_tokens_used / self.api_calls_made if self.api_calls_made > 0 else 0
            ),
        }

        # Add cache statistics
        if self.cache:
            stats["cache_statistics"] = self.cache.get_statistics()

        # Add batch processor statistics
        if self.batch_processor:
            stats["batch_statistics"] = self.batch_processor.get_statistics()

        # Add classifier statistics if available
        if self.classifier:
            stats["classifier_statistics"] = self.classifier.get_statistics()

        return stats
579
+
580
# Legacy class for backward compatibility
class LLMPredictionCache:
    """Thin legacy adapter over :class:`LLMCache`.

    Older call sites use the ``*_prediction`` method names; each call is
    forwarded unchanged to the wrapped modern cache.
    """

    def __init__(self, cache_path: Path, expiration_days: int = 90):
        """Create the wrapper around a new-style cache instance."""
        self.cache = LLMCache(cache_path=cache_path, expiration_days=expiration_days)

    def get_prediction(self, message: str, files_changed: list[str]) -> Optional[dict[str, Any]]:
        """Legacy name for a cache lookup; delegates to ``LLMCache.get``."""
        return self.cache.get(message, files_changed)

    def store_prediction(
        self, message: str, files_changed: list[str], result: dict[str, Any]
    ) -> None:
        """Legacy name for a cache write; delegates to ``LLMCache.store``."""
        self.cache.store(message, files_changed, result)

    def cleanup_expired(self) -> int:
        """Legacy name for expiry sweep; delegates to ``LLMCache.cleanup_expired``."""
        return self.cache.cleanup_expired()

    def get_statistics(self) -> dict[str, Any]:
        """Legacy name for stats; delegates to ``LLMCache.get_statistics``."""
        return self.cache.get_statistics()