gitflow-analytics 1.0.0__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff shows the content changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- gitflow_analytics/__init__.py +11 -9
- gitflow_analytics/_version.py +2 -2
- gitflow_analytics/cli.py +691 -243
- gitflow_analytics/cli_rich.py +353 -0
- gitflow_analytics/config.py +389 -96
- gitflow_analytics/core/analyzer.py +175 -78
- gitflow_analytics/core/branch_mapper.py +132 -132
- gitflow_analytics/core/cache.py +242 -173
- gitflow_analytics/core/identity.py +214 -178
- gitflow_analytics/extractors/base.py +13 -11
- gitflow_analytics/extractors/story_points.py +70 -59
- gitflow_analytics/extractors/tickets.py +111 -88
- gitflow_analytics/integrations/github_integration.py +91 -77
- gitflow_analytics/integrations/jira_integration.py +284 -0
- gitflow_analytics/integrations/orchestrator.py +99 -72
- gitflow_analytics/metrics/dora.py +183 -179
- gitflow_analytics/models/database.py +191 -54
- gitflow_analytics/qualitative/__init__.py +30 -0
- gitflow_analytics/qualitative/classifiers/__init__.py +13 -0
- gitflow_analytics/qualitative/classifiers/change_type.py +468 -0
- gitflow_analytics/qualitative/classifiers/domain_classifier.py +399 -0
- gitflow_analytics/qualitative/classifiers/intent_analyzer.py +436 -0
- gitflow_analytics/qualitative/classifiers/risk_analyzer.py +412 -0
- gitflow_analytics/qualitative/core/__init__.py +13 -0
- gitflow_analytics/qualitative/core/llm_fallback.py +653 -0
- gitflow_analytics/qualitative/core/nlp_engine.py +373 -0
- gitflow_analytics/qualitative/core/pattern_cache.py +457 -0
- gitflow_analytics/qualitative/core/processor.py +540 -0
- gitflow_analytics/qualitative/models/__init__.py +25 -0
- gitflow_analytics/qualitative/models/schemas.py +272 -0
- gitflow_analytics/qualitative/utils/__init__.py +13 -0
- gitflow_analytics/qualitative/utils/batch_processor.py +326 -0
- gitflow_analytics/qualitative/utils/cost_tracker.py +343 -0
- gitflow_analytics/qualitative/utils/metrics.py +347 -0
- gitflow_analytics/qualitative/utils/text_processing.py +243 -0
- gitflow_analytics/reports/analytics_writer.py +25 -8
- gitflow_analytics/reports/csv_writer.py +60 -32
- gitflow_analytics/reports/narrative_writer.py +21 -15
- gitflow_analytics/tui/__init__.py +5 -0
- gitflow_analytics/tui/app.py +721 -0
- gitflow_analytics/tui/screens/__init__.py +8 -0
- gitflow_analytics/tui/screens/analysis_progress_screen.py +487 -0
- gitflow_analytics/tui/screens/configuration_screen.py +547 -0
- gitflow_analytics/tui/screens/loading_screen.py +358 -0
- gitflow_analytics/tui/screens/main_screen.py +304 -0
- gitflow_analytics/tui/screens/results_screen.py +698 -0
- gitflow_analytics/tui/widgets/__init__.py +7 -0
- gitflow_analytics/tui/widgets/data_table.py +257 -0
- gitflow_analytics/tui/widgets/export_modal.py +301 -0
- gitflow_analytics/tui/widgets/progress_widget.py +192 -0
- gitflow_analytics-1.0.3.dist-info/METADATA +490 -0
- gitflow_analytics-1.0.3.dist-info/RECORD +62 -0
- gitflow_analytics-1.0.0.dist-info/METADATA +0 -201
- gitflow_analytics-1.0.0.dist-info/RECORD +0 -30
- {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/WHEEL +0 -0
- {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/entry_points.txt +0 -0
- {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/licenses/LICENSE +0 -0
- {gitflow_analytics-1.0.0.dist-info → gitflow_analytics-1.0.3.dist-info}/top_level.txt +0 -0
gitflow_analytics/qualitative/core/pattern_cache.py (new file)

@@ -0,0 +1,457 @@
"""Pattern caching system for qualitative analysis optimization."""

import hashlib
import json
import logging
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple
from collections import defaultdict

from sqlalchemy.orm import Session
from sqlalchemy import and_, desc

from ...models.database import Database, PatternCache as PatternCacheModel
from ..models.schemas import CacheConfig, QualitativeCommitData
from ..utils.text_processing import TextProcessor


class PatternCache:
    """Intelligent caching system for qualitative analysis patterns.

    This system learns from successful classifications to speed up future
    processing and improve accuracy through pattern recognition.

    Features:
    - Semantic fingerprinting for pattern matching
    - Hit count tracking for popular patterns
    - Automatic cache cleanup and optimization
    - Pattern learning from successful classifications
    """

    def __init__(self, config: CacheConfig, database: Database):
        """Initialize pattern cache.

        Args:
            config: Cache configuration
            database: Database instance for persistence
        """
        self.config = config
        self.database = database
        self.logger = logging.getLogger(__name__)

        # Initialize text processor for fingerprinting
        self.text_processor = TextProcessor()

        # In-memory cache for frequently accessed patterns
        self._memory_cache: Dict[str, Dict[str, Any]] = {}
        self._memory_cache_hits = defaultdict(int)

        # Statistics
        self.cache_hits = 0
        self.cache_misses = 0
        self.pattern_learning_count = 0

        # Initialize cache directory if using file-based caching
        cache_dir = Path(config.cache_dir)
        cache_dir.mkdir(exist_ok=True, parents=True)

        self.logger.info(f"Pattern cache initialized with TTL: {config.pattern_cache_ttl_hours}h")

    def lookup_pattern(self, message: str, files: List[str]) -> Optional[Dict[str, Any]]:
        """Look up cached classification for a commit pattern.

        Args:
            message: Commit message
            files: List of changed files

        Returns:
            Cached classification result or None if not found
        """
        # Generate semantic fingerprint
        fingerprint = self.text_processor.create_semantic_fingerprint(message, files)

        # Check in-memory cache first
        if fingerprint in self._memory_cache:
            self._memory_cache_hits[fingerprint] += 1
            self.cache_hits += 1
            self.logger.debug(f"Memory cache hit for pattern: {fingerprint[:8]}")
            return self._memory_cache[fingerprint]

        # Check database cache
        with self.database.get_session() as session:
            cached_pattern = session.query(PatternCacheModel).filter(
                PatternCacheModel.semantic_fingerprint == fingerprint
            ).first()

            if cached_pattern:
                # Check if pattern is still valid (not expired)
                cutoff_time = datetime.utcnow() - timedelta(hours=self.config.pattern_cache_ttl_hours)

                if cached_pattern.created_at > cutoff_time:
                    # Update hit count and last used
                    cached_pattern.hit_count += 1
                    cached_pattern.last_used = datetime.utcnow()
                    session.commit()

                    # Add to memory cache for faster future access
                    result = cached_pattern.classification_result
                    self._add_to_memory_cache(fingerprint, result)

                    self.cache_hits += 1
                    self.logger.debug(f"Database cache hit for pattern: {fingerprint[:8]}")
                    return result
                else:
                    # Pattern is expired, remove it
                    session.delete(cached_pattern)
                    session.commit()
                    self.logger.debug(f"Expired pattern removed: {fingerprint[:8]}")

        self.cache_misses += 1
        return None

    def store_pattern(self, message: str, files: List[str],
                      classification_result: Dict[str, Any],
                      confidence_score: float, source_method: str,
                      processing_time_ms: float = 0.0) -> None:
        """Store a new pattern in the cache.

        Args:
            message: Commit message
            files: List of changed files
            classification_result: Classification results to cache
            confidence_score: Confidence in the classification
            source_method: Method that produced this result ('nlp' or 'llm')
            processing_time_ms: Time taken to process
        """
        # Only cache high-confidence results
        if confidence_score < 0.6:
            return

        fingerprint = self.text_processor.create_semantic_fingerprint(message, files)
        message_hash = hashlib.md5(message.encode()).hexdigest()

        # Add to memory cache
        self._add_to_memory_cache(fingerprint, classification_result)

        # Store in database
        with self.database.get_session() as session:
            # Check if pattern already exists
            existing_pattern = session.query(PatternCacheModel).filter(
                PatternCacheModel.semantic_fingerprint == fingerprint
            ).first()

            if existing_pattern:
                # Update existing pattern with new data
                existing_pattern.hit_count += 1
                existing_pattern.last_used = datetime.utcnow()

                # Update confidence if new result is more confident
                if confidence_score > existing_pattern.confidence_score:
                    existing_pattern.classification_result = classification_result
                    existing_pattern.confidence_score = confidence_score
                    existing_pattern.source_method = source_method

                # Update average processing time
                if processing_time_ms > 0:
                    if existing_pattern.avg_processing_time_ms:
                        existing_pattern.avg_processing_time_ms = (
                            existing_pattern.avg_processing_time_ms + processing_time_ms
                        ) / 2
                    else:
                        existing_pattern.avg_processing_time_ms = processing_time_ms
            else:
                # Create new pattern
                new_pattern = PatternCacheModel(
                    message_hash=message_hash,
                    semantic_fingerprint=fingerprint,
                    classification_result=classification_result,
                    confidence_score=confidence_score,
                    source_method=source_method,
                    avg_processing_time_ms=processing_time_ms
                )
                session.add(new_pattern)
                self.pattern_learning_count += 1

            session.commit()

        self.logger.debug(
            f"Stored pattern: {fingerprint[:8]} "
            f"(confidence: {confidence_score:.2f}, method: {source_method})"
        )

    def learn_from_results(self, results: List[QualitativeCommitData]) -> None:
        """Learn patterns from successful classification results.

        Args:
            results: List of classification results to learn from
        """
        learned_patterns = 0

        for result in results:
            if result.confidence_score >= 0.7:  # Only learn from high-confidence results
                classification_data = {
                    'change_type': result.change_type,
                    'change_type_confidence': result.change_type_confidence,
                    'business_domain': result.business_domain,
                    'domain_confidence': result.domain_confidence,
                    'risk_level': result.risk_level,
                    'confidence_score': result.confidence_score
                }

                self.store_pattern(
                    message=result.message,
                    files=result.files_changed,
                    classification_result=classification_data,
                    confidence_score=result.confidence_score,
                    source_method=result.processing_method,
                    processing_time_ms=result.processing_time_ms
                )
                learned_patterns += 1

        if learned_patterns > 0:
            self.logger.info(f"Learned {learned_patterns} new patterns from results")

    def _add_to_memory_cache(self, fingerprint: str, result: Dict[str, Any]) -> None:
        """Add result to in-memory cache with size management.

        Args:
            fingerprint: Pattern fingerprint
            result: Classification result
        """
        # Manage memory cache size
        if len(self._memory_cache) >= self.config.semantic_cache_size:
            # Remove least recently used items
            sorted_items = sorted(
                self._memory_cache_hits.items(),
                key=lambda x: x[1]
            )

            # Remove bottom 20% of items
            items_to_remove = len(sorted_items) // 5
            for fingerprint_to_remove, _ in sorted_items[:items_to_remove]:
                self._memory_cache.pop(fingerprint_to_remove, None)
                self._memory_cache_hits.pop(fingerprint_to_remove, None)

        self._memory_cache[fingerprint] = result
        self._memory_cache_hits[fingerprint] = 1

    def cleanup_cache(self) -> Dict[str, int]:
        """Clean up expired and low-quality cache entries.

        Returns:
            Dictionary with cleanup statistics
        """
        stats = {
            'expired_removed': 0,
            'low_confidence_removed': 0,
            'low_usage_removed': 0,
            'total_remaining': 0
        }

        cutoff_time = datetime.utcnow() - timedelta(hours=self.config.pattern_cache_ttl_hours)

        with self.database.get_session() as session:
            # Remove expired patterns
            expired_patterns = session.query(PatternCacheModel).filter(
                PatternCacheModel.created_at < cutoff_time
            )
            stats['expired_removed'] = expired_patterns.count()
            expired_patterns.delete()

            # Remove very low confidence patterns (< 0.4)
            low_confidence_patterns = session.query(PatternCacheModel).filter(
                PatternCacheModel.confidence_score < 0.4
            )
            stats['low_confidence_removed'] = low_confidence_patterns.count()
            low_confidence_patterns.delete()

            # Remove patterns with very low usage (hit_count = 1 and older than 7 days)
            week_ago = datetime.utcnow() - timedelta(days=7)
            low_usage_patterns = session.query(PatternCacheModel).filter(
                and_(
                    PatternCacheModel.hit_count == 1,
                    PatternCacheModel.created_at < week_ago
                )
            )
            stats['low_usage_removed'] = low_usage_patterns.count()
            low_usage_patterns.delete()

            # Count remaining patterns
            stats['total_remaining'] = session.query(PatternCacheModel).count()

            session.commit()

        # Clear memory cache to force refresh
        self._memory_cache.clear()
        self._memory_cache_hits.clear()

        self.logger.info(
            f"Cache cleanup completed: {stats['expired_removed']} expired, "
            f"{stats['low_confidence_removed']} low-confidence, "
            f"{stats['low_usage_removed']} low-usage removed. "
            f"{stats['total_remaining']} patterns remaining."
        )

        return stats

    def get_cache_statistics(self) -> Dict[str, Any]:
        """Get comprehensive cache statistics.

        Returns:
            Dictionary with cache statistics
        """
        with self.database.get_session() as session:
            # Basic counts
            total_patterns = session.query(PatternCacheModel).count()

            # Method breakdown
            nlp_patterns = session.query(PatternCacheModel).filter(
                PatternCacheModel.source_method == 'nlp'
            ).count()

            llm_patterns = session.query(PatternCacheModel).filter(
                PatternCacheModel.source_method == 'llm'
            ).count()

            # Confidence distribution
            high_confidence = session.query(PatternCacheModel).filter(
                PatternCacheModel.confidence_score > 0.8
            ).count()

            medium_confidence = session.query(PatternCacheModel).filter(
                and_(
                    PatternCacheModel.confidence_score >= 0.6,
                    PatternCacheModel.confidence_score <= 0.8
                )
            ).count()

            # Usage statistics
            top_patterns = session.query(PatternCacheModel).order_by(
                desc(PatternCacheModel.hit_count)
            ).limit(10).all()

            # Age statistics
            week_ago = datetime.utcnow() - timedelta(days=7)
            recent_patterns = session.query(PatternCacheModel).filter(
                PatternCacheModel.created_at > week_ago
            ).count()

            # Calculate hit rate
            total_requests = self.cache_hits + self.cache_misses
            hit_rate = (self.cache_hits / total_requests) if total_requests > 0 else 0.0

            return {
                'total_patterns': total_patterns,
                'method_breakdown': {
                    'nlp_patterns': nlp_patterns,
                    'llm_patterns': llm_patterns
                },
                'confidence_distribution': {
                    'high_confidence': high_confidence,
                    'medium_confidence': medium_confidence,
                    'low_confidence': total_patterns - high_confidence - medium_confidence
                },
                'usage_stats': {
                    'cache_hits': self.cache_hits,
                    'cache_misses': self.cache_misses,
                    'hit_rate': hit_rate,
                    'memory_cache_size': len(self._memory_cache)
                },
                'top_patterns': [
                    {
                        'fingerprint': p.semantic_fingerprint[:8],
                        'hit_count': p.hit_count,
                        'confidence': p.confidence_score,
                        'method': p.source_method
                    }
                    for p in top_patterns
                ],
                'recent_patterns': recent_patterns,
                'patterns_learned': self.pattern_learning_count,
                'config': {
                    'ttl_hours': self.config.pattern_cache_ttl_hours,
                    'max_memory_size': self.config.semantic_cache_size,
                    'learning_enabled': self.config.enable_pattern_learning
                }
            }

    def optimize_cache(self) -> Dict[str, Any]:
        """Optimize cache for better performance.

        Returns:
            Dictionary with optimization results
        """
        optimization_stats = {}

        # Step 1: Cleanup expired and low-quality entries
        cleanup_stats = self.cleanup_cache()
        optimization_stats['cleanup'] = cleanup_stats

        # Step 2: Promote high-usage patterns to memory cache
        with self.database.get_session() as session:
            # Get top patterns by hit count
            popular_patterns = session.query(PatternCacheModel).filter(
                PatternCacheModel.hit_count >= 5
            ).order_by(desc(PatternCacheModel.hit_count)).limit(
                self.config.semantic_cache_size // 2  # Fill half of memory cache
            ).all()

            promoted_count = 0
            for pattern in popular_patterns:
                if pattern.semantic_fingerprint not in self._memory_cache:
                    self._add_to_memory_cache(
                        pattern.semantic_fingerprint,
                        pattern.classification_result
                    )
                    promoted_count += 1

        optimization_stats['promoted_to_memory'] = promoted_count

        # Step 3: Update learning threshold based on cache size
        total_patterns = cleanup_stats['total_remaining']
        if total_patterns > 1000:
            # Increase learning threshold for large caches
            self.config.learning_threshold = min(20, self.config.learning_threshold + 2)
        elif total_patterns < 100:
            # Decrease learning threshold for small caches
            self.config.learning_threshold = max(5, self.config.learning_threshold - 1)

        optimization_stats['learning_threshold'] = self.config.learning_threshold

        self.logger.info(f"Cache optimization completed: {optimization_stats}")
        return optimization_stats

    def export_patterns(self, output_path: Path, min_confidence: float = 0.8) -> int:
        """Export high-quality patterns for analysis or backup.

        Args:
            output_path: Path to export file
            min_confidence: Minimum confidence threshold for export

        Returns:
            Number of patterns exported
        """
        with self.database.get_session() as session:
            patterns = session.query(PatternCacheModel).filter(
                PatternCacheModel.confidence_score >= min_confidence
            ).order_by(desc(PatternCacheModel.hit_count)).all()

            export_data = []
            for pattern in patterns:
                export_data.append({
                    'semantic_fingerprint': pattern.semantic_fingerprint,
                    'classification_result': pattern.classification_result,
                    'confidence_score': pattern.confidence_score,
                    'hit_count': pattern.hit_count,
                    'source_method': pattern.source_method,
                    'created_at': pattern.created_at.isoformat(),
                    'last_used': pattern.last_used.isoformat()
                })

            with open(output_path, 'w') as f:
                json.dump(export_data, f, indent=2)

            self.logger.info(f"Exported {len(patterns)} patterns to {output_path}")
            return len(patterns)
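
Everything above keys off the semantic fingerprint that TextProcessor.create_semantic_fingerprint builds from the commit message and changed files: lookup_pattern checks the in-memory dict before the PatternCacheModel table, and store_pattern only persists results with confidence 0.6 or higher. A minimal usage sketch of that flow follows; the CacheConfig keyword arguments and the Database constructor are assumptions made for illustration (their schemas are not shown in this diff), and the classification dict is a made-up example rather than real classifier output.

from pathlib import Path

from gitflow_analytics.models.database import Database
from gitflow_analytics.qualitative.core.pattern_cache import PatternCache
from gitflow_analytics.qualitative.models.schemas import CacheConfig

# Hypothetical wiring: the CacheConfig fields and Database path argument are
# assumptions inferred from the attributes PatternCache reads (cache_dir,
# pattern_cache_ttl_hours, semantic_cache_size), not a documented API.
config = CacheConfig(cache_dir=".qualitative_cache", pattern_cache_ttl_hours=168)
database = Database(Path(".qualitative_cache/patterns.db"))
cache = PatternCache(config, database)

message = "fix: handle missing story points in JIRA sync"
files = ["gitflow_analytics/integrations/jira_integration.py"]

# First lookup misses; the caller classifies the commit elsewhere and stores
# the result so the next commit with the same fingerprint is a cache hit.
if cache.lookup_pattern(message, files) is None:
    example_result = {"change_type": "bugfix", "confidence_score": 0.82}  # illustrative only
    cache.store_pattern(message, files, example_result,
                        confidence_score=0.82, source_method="nlp")

print(cache.get_cache_statistics()["usage_stats"])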