pomera-ai-commander 0.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -21
- package/README.md +105 -680
- package/bin/pomera-ai-commander.js +62 -62
- package/core/__init__.py +65 -65
- package/core/app_context.py +482 -482
- package/core/async_text_processor.py +421 -421
- package/core/backup_manager.py +655 -655
- package/core/backup_recovery_manager.py +1033 -1033
- package/core/content_hash_cache.py +508 -508
- package/core/context_menu.py +313 -313
- package/core/data_validator.py +1066 -1066
- package/core/database_connection_manager.py +744 -744
- package/core/database_curl_settings_manager.py +608 -608
- package/core/database_promera_ai_settings_manager.py +446 -446
- package/core/database_schema.py +411 -411
- package/core/database_schema_manager.py +395 -395
- package/core/database_settings_manager.py +1507 -1507
- package/core/database_settings_manager_interface.py +456 -456
- package/core/dialog_manager.py +734 -734
- package/core/efficient_line_numbers.py +510 -510
- package/core/error_handler.py +746 -746
- package/core/error_service.py +431 -431
- package/core/event_consolidator.py +511 -511
- package/core/mcp/__init__.py +43 -43
- package/core/mcp/protocol.py +288 -288
- package/core/mcp/schema.py +251 -251
- package/core/mcp/server_stdio.py +299 -299
- package/core/mcp/tool_registry.py +2372 -2345
- package/core/memory_efficient_text_widget.py +711 -711
- package/core/migration_manager.py +914 -914
- package/core/migration_test_suite.py +1085 -1085
- package/core/migration_validator.py +1143 -1143
- package/core/optimized_find_replace.py +714 -714
- package/core/optimized_pattern_engine.py +424 -424
- package/core/optimized_search_highlighter.py +552 -552
- package/core/performance_monitor.py +674 -674
- package/core/persistence_manager.py +712 -712
- package/core/progressive_stats_calculator.py +632 -632
- package/core/regex_pattern_cache.py +529 -529
- package/core/regex_pattern_library.py +350 -350
- package/core/search_operation_manager.py +434 -434
- package/core/settings_defaults_registry.py +1087 -1087
- package/core/settings_integrity_validator.py +1111 -1111
- package/core/settings_serializer.py +557 -557
- package/core/settings_validator.py +1823 -1823
- package/core/smart_stats_calculator.py +709 -709
- package/core/statistics_update_manager.py +619 -619
- package/core/stats_config_manager.py +858 -858
- package/core/streaming_text_handler.py +723 -723
- package/core/task_scheduler.py +596 -596
- package/core/update_pattern_library.py +168 -168
- package/core/visibility_monitor.py +596 -596
- package/core/widget_cache.py +498 -498
- package/mcp.json +51 -61
- package/package.json +61 -57
- package/pomera.py +7482 -7482
- package/pomera_mcp_server.py +183 -144
- package/requirements.txt +32 -0
- package/tools/__init__.py +4 -4
- package/tools/ai_tools.py +2891 -2891
- package/tools/ascii_art_generator.py +352 -352
- package/tools/base64_tools.py +183 -183
- package/tools/base_tool.py +511 -511
- package/tools/case_tool.py +308 -308
- package/tools/column_tools.py +395 -395
- package/tools/cron_tool.py +884 -884
- package/tools/curl_history.py +600 -600
- package/tools/curl_processor.py +1207 -1207
- package/tools/curl_settings.py +502 -502
- package/tools/curl_tool.py +5467 -5467
- package/tools/diff_viewer.py +1071 -1071
- package/tools/email_extraction_tool.py +248 -248
- package/tools/email_header_analyzer.py +425 -425
- package/tools/extraction_tools.py +250 -250
- package/tools/find_replace.py +1750 -1750
- package/tools/folder_file_reporter.py +1463 -1463
- package/tools/folder_file_reporter_adapter.py +480 -480
- package/tools/generator_tools.py +1216 -1216
- package/tools/hash_generator.py +255 -255
- package/tools/html_tool.py +656 -656
- package/tools/jsonxml_tool.py +729 -729
- package/tools/line_tools.py +419 -419
- package/tools/markdown_tools.py +561 -561
- package/tools/mcp_widget.py +1417 -1417
- package/tools/notes_widget.py +973 -973
- package/tools/number_base_converter.py +372 -372
- package/tools/regex_extractor.py +571 -571
- package/tools/slug_generator.py +310 -310
- package/tools/sorter_tools.py +458 -458
- package/tools/string_escape_tool.py +392 -392
- package/tools/text_statistics_tool.py +365 -365
- package/tools/text_wrapper.py +430 -430
- package/tools/timestamp_converter.py +421 -421
- package/tools/tool_loader.py +710 -710
- package/tools/translator_tools.py +522 -522
- package/tools/url_link_extractor.py +261 -261
- package/tools/url_parser.py +204 -204
- package/tools/whitespace_tools.py +355 -355
- package/tools/word_frequency_counter.py +146 -146
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/__pycache__/app_context.cpython-313.pyc +0 -0
- package/core/__pycache__/async_text_processor.cpython-313.pyc +0 -0
- package/core/__pycache__/backup_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/backup_recovery_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/content_hash_cache.cpython-313.pyc +0 -0
- package/core/__pycache__/context_menu.cpython-313.pyc +0 -0
- package/core/__pycache__/data_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/database_connection_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_curl_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_promera_ai_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_schema.cpython-313.pyc +0 -0
- package/core/__pycache__/database_schema_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_settings_manager_interface.cpython-313.pyc +0 -0
- package/core/__pycache__/dialog_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/efficient_line_numbers.cpython-313.pyc +0 -0
- package/core/__pycache__/error_handler.cpython-313.pyc +0 -0
- package/core/__pycache__/error_service.cpython-313.pyc +0 -0
- package/core/__pycache__/event_consolidator.cpython-313.pyc +0 -0
- package/core/__pycache__/memory_efficient_text_widget.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_test_suite.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_find_replace.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_pattern_engine.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_search_highlighter.cpython-313.pyc +0 -0
- package/core/__pycache__/performance_monitor.cpython-313.pyc +0 -0
- package/core/__pycache__/persistence_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/progressive_stats_calculator.cpython-313.pyc +0 -0
- package/core/__pycache__/regex_pattern_cache.cpython-313.pyc +0 -0
- package/core/__pycache__/regex_pattern_library.cpython-313.pyc +0 -0
- package/core/__pycache__/search_operation_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_defaults_registry.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_integrity_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_serializer.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/smart_stats_calculator.cpython-313.pyc +0 -0
- package/core/__pycache__/statistics_update_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/stats_config_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/streaming_text_handler.cpython-313.pyc +0 -0
- package/core/__pycache__/task_scheduler.cpython-313.pyc +0 -0
- package/core/__pycache__/visibility_monitor.cpython-313.pyc +0 -0
- package/core/__pycache__/widget_cache.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/protocol.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/server_stdio.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/tool_registry.cpython-313.pyc +0 -0
- package/tools/__pycache__/__init__.cpython-313.pyc +0 -0
- package/tools/__pycache__/ai_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/ascii_art_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/base64_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/base_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/case_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/column_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/cron_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_history.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_processor.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_settings.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/diff_viewer.cpython-313.pyc +0 -0
- package/tools/__pycache__/email_extraction_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/email_header_analyzer.cpython-313.pyc +0 -0
- package/tools/__pycache__/extraction_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/find_replace.cpython-313.pyc +0 -0
- package/tools/__pycache__/folder_file_reporter.cpython-313.pyc +0 -0
- package/tools/__pycache__/folder_file_reporter_adapter.cpython-313.pyc +0 -0
- package/tools/__pycache__/generator_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/hash_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/html_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/huggingface_helper.cpython-313.pyc +0 -0
- package/tools/__pycache__/jsonxml_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/line_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/list_comparator.cpython-313.pyc +0 -0
- package/tools/__pycache__/markdown_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/mcp_widget.cpython-313.pyc +0 -0
- package/tools/__pycache__/notes_widget.cpython-313.pyc +0 -0
- package/tools/__pycache__/number_base_converter.cpython-313.pyc +0 -0
- package/tools/__pycache__/regex_extractor.cpython-313.pyc +0 -0
- package/tools/__pycache__/slug_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/sorter_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/string_escape_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/text_statistics_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/text_wrapper.cpython-313.pyc +0 -0
- package/tools/__pycache__/timestamp_converter.cpython-313.pyc +0 -0
- package/tools/__pycache__/tool_loader.cpython-313.pyc +0 -0
- package/tools/__pycache__/translator_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/url_link_extractor.cpython-313.pyc +0 -0
- package/tools/__pycache__/url_parser.cpython-313.pyc +0 -0
- package/tools/__pycache__/whitespace_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/word_frequency_counter.cpython-313.pyc +0 -0
|
@@ -1,710 +1,710 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Smart statistics calculator with intelligent caching for Promera AI Commander.
|
|
3
|
-
Provides efficient text statistics calculation with incremental updates and caching.
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
import re
|
|
7
|
-
import time
|
|
8
|
-
import hashlib
|
|
9
|
-
import threading
|
|
10
|
-
import sys
|
|
11
|
-
from typing import Dict, List, Optional, Any, Tuple
|
|
12
|
-
from dataclasses import dataclass, field
|
|
13
|
-
from collections import defaultdict
|
|
14
|
-
import weakref
|
|
15
|
-
|
|
16
|
-
# Import optimized pattern engine
|
|
17
|
-
from core.optimized_pattern_engine import get_pattern_engine, OptimizedPatternEngine
|
|
18
|
-
|
|
19
|
-
@dataclass
|
|
20
|
-
class ChangeInfo:
|
|
21
|
-
"""Information about text changes for incremental updates."""
|
|
22
|
-
start_pos: int
|
|
23
|
-
end_pos: int
|
|
24
|
-
inserted_text: str
|
|
25
|
-
deleted_length: int
|
|
26
|
-
change_type: str # insert, delete, replace
|
|
27
|
-
timestamp: float = field(default_factory=time.time)
|
|
28
|
-
|
|
29
|
-
def is_minor_change(self) -> bool:
|
|
30
|
-
"""Check if this is a minor change suitable for incremental update."""
|
|
31
|
-
# Minor changes: single character edits, small insertions/deletions
|
|
32
|
-
if self.change_type == "insert":
|
|
33
|
-
return len(self.inserted_text) <= 10
|
|
34
|
-
elif self.change_type == "delete":
|
|
35
|
-
return self.deleted_length <= 10
|
|
36
|
-
elif self.change_type == "replace":
|
|
37
|
-
return len(self.inserted_text) <= 10 and self.deleted_length <= 10
|
|
38
|
-
return False
|
|
39
|
-
|
|
40
|
-
def affects_statistics(self) -> bool:
|
|
41
|
-
"""Check if this change affects statistics significantly."""
|
|
42
|
-
# Changes that affect word/sentence boundaries
|
|
43
|
-
if self.change_type == "insert":
|
|
44
|
-
return any(c in self.inserted_text for c in [' ', '\n', '.', '!', '?'])
|
|
45
|
-
return True
|
|
46
|
-
|
|
47
|
-
@dataclass
|
|
48
|
-
class TextStats:
|
|
49
|
-
"""Comprehensive text statistics."""
|
|
50
|
-
char_count: int = 0
|
|
51
|
-
word_count: int = 0
|
|
52
|
-
sentence_count: int = 0
|
|
53
|
-
line_count: int = 0
|
|
54
|
-
paragraph_count: int = 0
|
|
55
|
-
token_count: int = 0
|
|
56
|
-
|
|
57
|
-
# Advanced statistics
|
|
58
|
-
unique_words: int = 0
|
|
59
|
-
average_word_length: float = 0.0
|
|
60
|
-
average_sentence_length: float = 0.0
|
|
61
|
-
reading_time_minutes: float = 0.0
|
|
62
|
-
|
|
63
|
-
# Metadata
|
|
64
|
-
content_hash: str = ""
|
|
65
|
-
calculation_time_ms: float = 0.0
|
|
66
|
-
timestamp: float = field(default_factory=time.time)
|
|
67
|
-
cache_hit: bool = False
|
|
68
|
-
incremental_update: bool = False
|
|
69
|
-
processing_method: str = "full" # full, incremental, cached
|
|
70
|
-
memory_usage_bytes: int = 0
|
|
71
|
-
|
|
72
|
-
def is_stale(self, max_age_seconds: int = 300) -> bool:
|
|
73
|
-
"""Check if statistics are stale (older than max_age_seconds)."""
|
|
74
|
-
return (time.time() - self.timestamp) > max_age_seconds
|
|
75
|
-
|
|
76
|
-
def to_status_string(self) -> str:
|
|
77
|
-
"""Convert to status bar string format."""
|
|
78
|
-
formatted_bytes = self._format_bytes(self.char_count)
|
|
79
|
-
return f"Bytes: {formatted_bytes} | Word: {self.word_count} | Sentence: {self.sentence_count} | Line: {self.line_count} | Tokens: {self.token_count}"
|
|
80
|
-
|
|
81
|
-
def _format_bytes(self, byte_count):
|
|
82
|
-
"""Format byte count with K/M suffixes for readability."""
|
|
83
|
-
if byte_count >= 1000000:
|
|
84
|
-
value = byte_count / 1000000
|
|
85
|
-
formatted = f"{value:.1f}M"
|
|
86
|
-
elif byte_count >= 1000:
|
|
87
|
-
value = byte_count / 1000
|
|
88
|
-
# Check if rounding to 1 decimal would make it >= 1000K
|
|
89
|
-
if round(value, 1) >= 1000:
|
|
90
|
-
formatted = f"{value / 1000:.1f}M"
|
|
91
|
-
else:
|
|
92
|
-
formatted = f"{value:.1f}K"
|
|
93
|
-
else:
|
|
94
|
-
return str(byte_count)
|
|
95
|
-
|
|
96
|
-
# Remove trailing zeros and decimal point if not needed
|
|
97
|
-
return formatted.rstrip('0').rstrip('.')
|
|
98
|
-
|
|
99
|
-
def to_detailed_dict(self) -> Dict[str, Any]:
|
|
100
|
-
"""Convert to detailed dictionary for analysis."""
|
|
101
|
-
return {
|
|
102
|
-
'basic': {
|
|
103
|
-
'characters': self.char_count,
|
|
104
|
-
'words': self.word_count,
|
|
105
|
-
'sentences': self.sentence_count,
|
|
106
|
-
'lines': self.line_count,
|
|
107
|
-
'paragraphs': self.paragraph_count,
|
|
108
|
-
'tokens': self.token_count
|
|
109
|
-
},
|
|
110
|
-
'advanced': {
|
|
111
|
-
'unique_words': self.unique_words,
|
|
112
|
-
'average_word_length': self.average_word_length,
|
|
113
|
-
'average_sentence_length': self.average_sentence_length,
|
|
114
|
-
'reading_time_minutes': self.reading_time_minutes
|
|
115
|
-
},
|
|
116
|
-
'metadata': {
|
|
117
|
-
'content_hash': self.content_hash,
|
|
118
|
-
'calculation_time_ms': self.calculation_time_ms,
|
|
119
|
-
'timestamp': self.timestamp
|
|
120
|
-
}
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
@dataclass
|
|
124
|
-
class CacheEntry:
|
|
125
|
-
"""Cache entry with metadata for intelligent cache management."""
|
|
126
|
-
stats: TextStats
|
|
127
|
-
access_count: int = 0
|
|
128
|
-
last_access: float = field(default_factory=time.time)
|
|
129
|
-
size_estimate: int = 0
|
|
130
|
-
widget_id: Optional[str] = None
|
|
131
|
-
|
|
132
|
-
@property
|
|
133
|
-
def age_seconds(self) -> float:
|
|
134
|
-
"""Age of the cache entry in seconds."""
|
|
135
|
-
return time.time() - self.last_access
|
|
136
|
-
|
|
137
|
-
@property
|
|
138
|
-
def access_frequency(self) -> float:
|
|
139
|
-
"""Access frequency (accesses per hour)."""
|
|
140
|
-
age_hours = max(self.age_seconds / 3600, 0.01) # Avoid division by zero
|
|
141
|
-
return self.access_count / age_hours
|
|
142
|
-
|
|
143
|
-
@property
|
|
144
|
-
def memory_usage(self) -> int:
|
|
145
|
-
"""Estimate memory usage of this cache entry."""
|
|
146
|
-
# Rough estimate: stats object + metadata
|
|
147
|
-
return self.size_estimate + sys.getsizeof(self.stats) + 200
|
|
148
|
-
|
|
149
|
-
class SmartStatsCalculator:
|
|
150
|
-
"""
|
|
151
|
-
Intelligent text statistics calculator with caching and incremental updates.
|
|
152
|
-
"""
|
|
153
|
-
|
|
154
|
-
# Memory limits
|
|
155
|
-
MAX_CACHE_MEMORY_BYTES = 50 * 1024 * 1024 # 50MB
|
|
156
|
-
CLEANUP_THRESHOLD_BYTES = 45 * 1024 * 1024 # Start cleanup at 45MB
|
|
157
|
-
|
|
158
|
-
    def __init__(self, cache_size_limit: int = 1000, enable_advanced_stats: bool = True):
        """Initialize the calculator.

        Args:
            cache_size_limit: Maximum number of cached stat entries before
                count-based eviction kicks in.
            enable_advanced_stats: When True, also compute unique-word counts,
                averages and reading time (requires an extra regex pass).
        """
        self.cache_size_limit = cache_size_limit
        self.enable_advanced_stats = enable_advanced_stats

        # Cache storage: content-hash -> CacheEntry, guarded by a re-entrant
        # lock so cache methods can nest on the same thread.
        self.stats_cache: Dict[str, CacheEntry] = {}
        self.cache_lock = threading.RLock()

        # Memory tracking used by the eviction heuristics.
        self.current_memory_usage = 0
        self.last_cleanup_time = time.time()
        self.cleanup_interval = 300  # 5 minutes

        # Reverse index widget_id -> [content hashes] so one widget's entries
        # can be dropped together when the user switches tools.
        self.widget_cache_map: Dict[str, List[str]] = defaultdict(list)

        # Shared optimized pattern engine performs the fast structural counts.
        self.pattern_engine = get_pattern_engine()

        # Fallback regex patterns (for advanced features only)
        self.word_pattern = re.compile(r'\b\w+\b')
        self.sentence_pattern = re.compile(r'[.!?]+')
        self.paragraph_pattern = re.compile(r'\n\s*\n')

        # Word frequency cache for advanced stats
        self.word_frequency_cache: Dict[str, Dict[str, int]] = {}

        # Kick off periodic cleanup — implementation defined later in the file.
        self._start_periodic_cleanup()
|
|
187
|
-
|
|
188
|
-
    def calculate_stats(self, text: str, widget_id: Optional[str] = None) -> TextStats:
        """
        Calculate text statistics with intelligent caching.

        Args:
            text: Text content to analyze
            widget_id: Optional widget identifier for cache optimization

        Returns:
            TextStats object with comprehensive statistics
        """
        # NOTE(review): whitespace-only text short-circuits to an all-zero
        # TextStats, so char_count is 0 even though the text has characters —
        # confirm this fast-path is intended for status-bar display.
        if not text.strip():
            return TextStats()

        # Generate content hash for caching
        content_hash = self._generate_content_hash(text)

        # Check cache first
        with self.cache_lock:
            if content_hash in self.stats_cache:
                entry = self.stats_cache[content_hash]
                entry.access_count += 1
                entry.last_access = time.time()

                # Update stats metadata.  NOTE(review): this mutates and
                # returns the shared cached TextStats object; callers holding
                # a previous reference will observe these flag changes.
                stats = entry.stats
                stats.cache_hit = True
                stats.processing_method = "cached"

                return stats

        # Cache miss - calculate stats outside the lock, then time it in ms.
        start_time = time.time()
        stats = self._calculate_stats_impl(text, content_hash)
        calculation_time = (time.time() - start_time) * 1000

        stats.calculation_time_ms = calculation_time
        stats.cache_hit = False
        stats.processing_method = "full"

        # Cache the result (re-acquires the lock inside _cache_stats)
        self._cache_stats(content_hash, stats, len(text), widget_id)

        # Check if periodic cleanup is needed
        self._check_periodic_cleanup()

        return stats
|
|
235
|
-
|
|
236
|
-
def calculate_stats_incremental(self,
|
|
237
|
-
text: str,
|
|
238
|
-
previous_stats: Optional[TextStats] = None,
|
|
239
|
-
change_info: Optional[ChangeInfo] = None,
|
|
240
|
-
widget_id: Optional[str] = None) -> TextStats:
|
|
241
|
-
"""
|
|
242
|
-
Calculate statistics incrementally when possible.
|
|
243
|
-
|
|
244
|
-
Args:
|
|
245
|
-
text: Current text content
|
|
246
|
-
previous_stats: Previous statistics for comparison
|
|
247
|
-
change_info: Information about what changed (position, length, etc.)
|
|
248
|
-
widget_id: Optional widget identifier
|
|
249
|
-
|
|
250
|
-
Returns:
|
|
251
|
-
Updated TextStats object
|
|
252
|
-
"""
|
|
253
|
-
# If no change info or previous stats, do full calculation
|
|
254
|
-
if not change_info or not previous_stats:
|
|
255
|
-
return self.calculate_stats(text, widget_id)
|
|
256
|
-
|
|
257
|
-
# Check if incremental update is beneficial
|
|
258
|
-
if not change_info.is_minor_change() or not change_info.affects_statistics():
|
|
259
|
-
# For non-minor changes or changes that don't affect stats, do full calculation
|
|
260
|
-
return self.calculate_stats(text, widget_id)
|
|
261
|
-
|
|
262
|
-
# Attempt incremental update for minor changes
|
|
263
|
-
start_time = time.time()
|
|
264
|
-
|
|
265
|
-
try:
|
|
266
|
-
stats = self._calculate_incremental_stats(text, previous_stats, change_info)
|
|
267
|
-
calculation_time = (time.time() - start_time) * 1000
|
|
268
|
-
|
|
269
|
-
stats.calculation_time_ms = calculation_time
|
|
270
|
-
stats.incremental_update = True
|
|
271
|
-
stats.processing_method = "incremental"
|
|
272
|
-
stats.cache_hit = False
|
|
273
|
-
|
|
274
|
-
# Cache the result
|
|
275
|
-
content_hash = self._generate_content_hash(text)
|
|
276
|
-
stats.content_hash = content_hash
|
|
277
|
-
self._cache_stats(content_hash, stats, len(text), widget_id)
|
|
278
|
-
|
|
279
|
-
return stats
|
|
280
|
-
|
|
281
|
-
except Exception:
|
|
282
|
-
# Fall back to full calculation on error
|
|
283
|
-
return self.calculate_stats(text, widget_id)
|
|
284
|
-
|
|
285
|
-
def _calculate_incremental_stats(self, text: str, previous_stats: TextStats, change_info: ChangeInfo) -> TextStats:
|
|
286
|
-
"""
|
|
287
|
-
Calculate statistics incrementally based on change information.
|
|
288
|
-
|
|
289
|
-
Args:
|
|
290
|
-
text: Current text content
|
|
291
|
-
previous_stats: Previous statistics
|
|
292
|
-
change_info: Information about the change
|
|
293
|
-
|
|
294
|
-
Returns:
|
|
295
|
-
Updated TextStats object
|
|
296
|
-
"""
|
|
297
|
-
stats = TextStats()
|
|
298
|
-
|
|
299
|
-
# Start with previous stats
|
|
300
|
-
stats.char_count = previous_stats.char_count
|
|
301
|
-
stats.word_count = previous_stats.word_count
|
|
302
|
-
stats.sentence_count = previous_stats.sentence_count
|
|
303
|
-
stats.line_count = previous_stats.line_count
|
|
304
|
-
stats.paragraph_count = previous_stats.paragraph_count
|
|
305
|
-
|
|
306
|
-
# Adjust based on change type
|
|
307
|
-
if change_info.change_type == "insert":
|
|
308
|
-
# Update character count
|
|
309
|
-
stats.char_count += len(change_info.inserted_text)
|
|
310
|
-
|
|
311
|
-
# Update line count
|
|
312
|
-
new_lines = change_info.inserted_text.count('\n')
|
|
313
|
-
stats.line_count += new_lines
|
|
314
|
-
|
|
315
|
-
# Update word count (approximate)
|
|
316
|
-
new_words = len(change_info.inserted_text.split())
|
|
317
|
-
stats.word_count += new_words
|
|
318
|
-
|
|
319
|
-
# Update sentence count (approximate)
|
|
320
|
-
new_sentences = sum(change_info.inserted_text.count(c) for c in ['.', '!', '?'])
|
|
321
|
-
stats.sentence_count += new_sentences
|
|
322
|
-
|
|
323
|
-
elif change_info.change_type == "delete":
|
|
324
|
-
# For deletions, we need to recalculate (can't reliably decrement)
|
|
325
|
-
# Fall back to full calculation
|
|
326
|
-
return self._calculate_stats_impl(text, "")
|
|
327
|
-
|
|
328
|
-
# Recalculate token count
|
|
329
|
-
stats.token_count = max(1, round(stats.char_count / 4))
|
|
330
|
-
|
|
331
|
-
# Advanced stats require full recalculation
|
|
332
|
-
if self.enable_advanced_stats:
|
|
333
|
-
# For incremental updates, skip advanced stats or recalculate
|
|
334
|
-
pass
|
|
335
|
-
|
|
336
|
-
return stats
|
|
337
|
-
|
|
338
|
-
def _calculate_stats_impl(self, text: str, content_hash: str) -> TextStats:
|
|
339
|
-
"""Internal implementation of statistics calculation using optimized pattern engine."""
|
|
340
|
-
stats = TextStats(content_hash=content_hash)
|
|
341
|
-
|
|
342
|
-
# Use optimized pattern engine for fast counting
|
|
343
|
-
text_structure = self.pattern_engine.analyze_text_structure(text)
|
|
344
|
-
|
|
345
|
-
# Basic statistics from optimized engine
|
|
346
|
-
stats.char_count = text_structure.char_count
|
|
347
|
-
stats.line_count = text_structure.line_count
|
|
348
|
-
stats.word_count = text_structure.word_count
|
|
349
|
-
stats.sentence_count = text_structure.sentence_count
|
|
350
|
-
stats.paragraph_count = text_structure.paragraph_count
|
|
351
|
-
|
|
352
|
-
# Token count (rough estimate: 1 token ≈ 4 characters)
|
|
353
|
-
stats.token_count = max(1, round(len(text) / 4))
|
|
354
|
-
|
|
355
|
-
# Advanced statistics (if enabled)
|
|
356
|
-
if self.enable_advanced_stats and stats.word_count > 0:
|
|
357
|
-
# Use regex for word extraction (needed for advanced stats)
|
|
358
|
-
words = self.word_pattern.findall(text)
|
|
359
|
-
|
|
360
|
-
if words:
|
|
361
|
-
stats.unique_words = len(set(word.lower() for word in words))
|
|
362
|
-
stats.average_word_length = sum(len(word) for word in words) / len(words)
|
|
363
|
-
|
|
364
|
-
if stats.sentence_count > 0:
|
|
365
|
-
stats.average_sentence_length = stats.word_count / stats.sentence_count
|
|
366
|
-
|
|
367
|
-
# Reading time estimate (average 200 words per minute)
|
|
368
|
-
stats.reading_time_minutes = stats.word_count / 200.0
|
|
369
|
-
|
|
370
|
-
return stats
|
|
371
|
-
|
|
372
|
-
def _generate_content_hash(self, text: str) -> str:
|
|
373
|
-
"""Generate a hash for content caching."""
|
|
374
|
-
# Use a combination of length and content hash for efficiency
|
|
375
|
-
content_sample = text[:100] + text[-100:] if len(text) > 200 else text
|
|
376
|
-
hash_input = f"{len(text)}_{content_sample}"
|
|
377
|
-
return hashlib.md5(hash_input.encode('utf-8')).hexdigest()[:16]
|
|
378
|
-
|
|
379
|
-
    def _cache_stats(self, content_hash: str, stats: TextStats, content_size: int, widget_id: Optional[str] = None):
        """Cache statistics with intelligent cache management.

        Args:
            content_hash: Key under which the stats are stored.
            stats: The computed statistics; its memory_usage_bytes is updated.
            content_size: Length of the analyzed text, used as a size estimate.
            widget_id: Optional owner widget for grouped cleanup.
        """
        with self.cache_lock:
            # Create cache entry
            entry = CacheEntry(
                stats=stats,
                access_count=1,
                size_estimate=content_size,
                widget_id=widget_id
            )

            # Track widget association
            if widget_id:
                self.widget_cache_map[widget_id].append(content_hash)

            # Check memory usage before adding; evict first so the new entry
            # never pushes usage past the hard cap.
            entry_memory = entry.memory_usage
            if self.current_memory_usage + entry_memory > self.MAX_CACHE_MEMORY_BYTES:
                self._evict_by_memory_pressure()

            # Check if cache is full by count
            if len(self.stats_cache) >= self.cache_size_limit:
                self._evict_cache_entries()

            # Store in cache.  NOTE(review): if content_hash already exists,
            # the replaced entry's memory is not subtracted, so usage can
            # over-count on overwrite — confirm whether overwrites occur.
            self.stats_cache[content_hash] = entry
            self.current_memory_usage += entry_memory

            # Update stats memory usage
            stats.memory_usage_bytes = entry_memory
|
|
409
|
-
|
|
410
|
-
def _evict_cache_entries(self):
|
|
411
|
-
"""Evict cache entries using intelligent LRU + frequency algorithm."""
|
|
412
|
-
if not self.stats_cache:
|
|
413
|
-
return
|
|
414
|
-
|
|
415
|
-
# Calculate eviction scores (lower score = more likely to evict)
|
|
416
|
-
entries_with_scores = []
|
|
417
|
-
current_time = time.time()
|
|
418
|
-
|
|
419
|
-
for hash_key, entry in self.stats_cache.items():
|
|
420
|
-
# Score based on recency, frequency, and size
|
|
421
|
-
recency_score = 1.0 / max(entry.age_seconds, 1.0)
|
|
422
|
-
frequency_score = entry.access_frequency
|
|
423
|
-
size_penalty = entry.size_estimate / 10000.0 # Penalize large entries
|
|
424
|
-
|
|
425
|
-
score = (recency_score * 0.4 + frequency_score * 0.5) - (size_penalty * 0.1)
|
|
426
|
-
entries_with_scores.append((score, hash_key))
|
|
427
|
-
|
|
428
|
-
# Sort by score (lowest first) and evict bottom 25%
|
|
429
|
-
entries_with_scores.sort()
|
|
430
|
-
evict_count = max(1, len(entries_with_scores) // 4)
|
|
431
|
-
|
|
432
|
-
for _, hash_key in entries_with_scores[:evict_count]:
|
|
433
|
-
entry = self.stats_cache.pop(hash_key, None)
|
|
434
|
-
if entry:
|
|
435
|
-
self.current_memory_usage -= entry.memory_usage
|
|
436
|
-
# Remove from widget map
|
|
437
|
-
if entry.widget_id and entry.widget_id in self.widget_cache_map:
|
|
438
|
-
try:
|
|
439
|
-
self.widget_cache_map[entry.widget_id].remove(hash_key)
|
|
440
|
-
except ValueError:
|
|
441
|
-
pass
|
|
442
|
-
|
|
443
|
-
def _evict_by_memory_pressure(self):
|
|
444
|
-
"""Evict cache entries when memory usage exceeds limits."""
|
|
445
|
-
if not self.stats_cache:
|
|
446
|
-
return
|
|
447
|
-
|
|
448
|
-
# Calculate how much memory we need to free
|
|
449
|
-
target_memory = self.CLEANUP_THRESHOLD_BYTES
|
|
450
|
-
memory_to_free = self.current_memory_usage - target_memory
|
|
451
|
-
|
|
452
|
-
if memory_to_free <= 0:
|
|
453
|
-
return
|
|
454
|
-
|
|
455
|
-
# Sort entries by eviction priority (oldest, least accessed, largest)
|
|
456
|
-
entries_with_priority = []
|
|
457
|
-
|
|
458
|
-
for hash_key, entry in self.stats_cache.items():
|
|
459
|
-
# Priority score (higher = more likely to evict)
|
|
460
|
-
age_score = entry.age_seconds
|
|
461
|
-
access_score = 1.0 / max(entry.access_count, 1)
|
|
462
|
-
size_score = entry.memory_usage / 1024.0 # KB
|
|
463
|
-
|
|
464
|
-
priority = (age_score * 0.4) + (access_score * 0.3) + (size_score * 0.3)
|
|
465
|
-
entries_with_priority.append((priority, hash_key, entry.memory_usage))
|
|
466
|
-
|
|
467
|
-
# Sort by priority (highest first) and evict until we free enough memory
|
|
468
|
-
entries_with_priority.sort(reverse=True)
|
|
469
|
-
|
|
470
|
-
freed_memory = 0
|
|
471
|
-
for priority, hash_key, entry_memory in entries_with_priority:
|
|
472
|
-
if freed_memory >= memory_to_free:
|
|
473
|
-
break
|
|
474
|
-
|
|
475
|
-
entry = self.stats_cache.pop(hash_key, None)
|
|
476
|
-
if entry:
|
|
477
|
-
self.current_memory_usage -= entry.memory_usage
|
|
478
|
-
freed_memory += entry.memory_usage
|
|
479
|
-
|
|
480
|
-
# Remove from widget map
|
|
481
|
-
if entry.widget_id and entry.widget_id in self.widget_cache_map:
|
|
482
|
-
try:
|
|
483
|
-
self.widget_cache_map[entry.widget_id].remove(hash_key)
|
|
484
|
-
except ValueError:
|
|
485
|
-
pass
|
|
486
|
-
|
|
487
|
-
def get_cached_stats(self, content_hash: str) -> Optional[TextStats]:
|
|
488
|
-
"""Get cached statistics by content hash."""
|
|
489
|
-
with self.cache_lock:
|
|
490
|
-
if content_hash in self.stats_cache:
|
|
491
|
-
entry = self.stats_cache[content_hash]
|
|
492
|
-
entry.access_count += 1
|
|
493
|
-
entry.last_access = time.time()
|
|
494
|
-
return entry.stats
|
|
495
|
-
return None
|
|
496
|
-
|
|
497
|
-
def clear_cache(self):
    """Drop every cached statistics entry and reset memory accounting."""
    with self.cache_lock:
        # Empty both the hash->entry store and the widget bookkeeping,
        # and zero the tracked memory footprint to match.
        self.stats_cache.clear()
        self.widget_cache_map.clear()
        self.current_memory_usage = 0
|
|
503
|
-
|
|
504
|
-
def clear_widget_cache(self, widget_id: str):
    """Evict every cache entry recorded for one widget (tool switching)."""
    with self.cache_lock:
        tracked_keys = self.widget_cache_map.get(widget_id)
        if tracked_keys is None:
            # Nothing was ever cached for this widget.
            return

        # Iterate over a snapshot so removal cannot disturb the list.
        for key in list(tracked_keys):
            removed = self.stats_cache.pop(key, None)
            if removed is not None:
                self.current_memory_usage -= removed.memory_usage

        # Forget this widget's bookkeeping entirely.
        self.widget_cache_map.pop(widget_id, None)
|
|
521
|
-
|
|
522
|
-
def _check_periodic_cleanup(self):
|
|
523
|
-
"""Check if periodic cleanup is needed and perform it."""
|
|
524
|
-
current_time = time.time()
|
|
525
|
-
|
|
526
|
-
# Check if cleanup interval has passed
|
|
527
|
-
if current_time - self.last_cleanup_time < self.cleanup_interval:
|
|
528
|
-
return
|
|
529
|
-
|
|
530
|
-
# Perform cleanup
|
|
531
|
-
self._perform_periodic_cleanup()
|
|
532
|
-
self.last_cleanup_time = current_time
|
|
533
|
-
|
|
534
|
-
def _perform_periodic_cleanup(self):
    """Perform periodic memory cleanup.

    Drops cache entries that have not been accessed for 10 minutes,
    keeps the memory accounting and widget map in sync, then falls
    back to pressure-based eviction if usage is still above the
    cleanup threshold. Called from both the foreground
    _check_periodic_cleanup() hook and the background cleanup thread.
    """
    with self.cache_lock:
        # Remove stale entries (older than 10 minutes)
        stale_keys = []

        for hash_key, entry in self.stats_cache.items():
            if entry.age_seconds > 600:  # 10 minutes
                stale_keys.append(hash_key)

        # Remove stale entries (collected first so the dict is not
        # mutated while iterating it above).
        for hash_key in stale_keys:
            entry = self.stats_cache.pop(hash_key, None)
            if entry:
                self.current_memory_usage -= entry.memory_usage

                # Remove from widget map so widget bookkeeping does not
                # reference an evicted hash.
                if entry.widget_id and entry.widget_id in self.widget_cache_map:
                    try:
                        self.widget_cache_map[entry.widget_id].remove(hash_key)
                    except ValueError:
                        pass

        # Check memory usage and evict if needed
        if self.current_memory_usage > self.CLEANUP_THRESHOLD_BYTES:
            self._evict_by_memory_pressure()
|
|
560
|
-
|
|
561
|
-
def _start_periodic_cleanup(self):
    """Start background thread for periodic cleanup.

    Spawns a daemon thread that runs _perform_periodic_cleanup() every
    self.cleanup_interval seconds for the lifetime of the process.
    NOTE(review): the thread object is not retained, so it cannot be
    stopped or joined, and exceptions are swallowed silently — confirm
    this fire-and-forget lifecycle is intended.
    """
    def cleanup_worker():
        while True:
            time.sleep(self.cleanup_interval)
            try:
                self._perform_periodic_cleanup()
            except Exception:
                pass  # Silently handle errors in background thread

    # Start daemon thread (daemon=True: will not block interpreter exit)
    cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
    cleanup_thread.start()
|
|
574
|
-
|
|
575
|
-
def get_cache_stats(self) -> Dict[str, Any]:
    """Return a snapshot of cache occupancy and memory accounting."""
    MB = 1024 * 1024
    with self.cache_lock:
        used_bytes = self.current_memory_usage
        limit_bytes = self.MAX_CACHE_MEMORY_BYTES
        # Sum of every entry's access counter across the whole cache.
        hits = sum(e.access_count for e in self.stats_cache.values())
        return {
            'cache_size': len(self.stats_cache),
            'cache_size_limit': self.cache_size_limit,
            'memory_usage_mb': round(used_bytes / MB, 2),
            'memory_limit_mb': round(limit_bytes / MB, 2),
            'memory_usage_percent': round(used_bytes / limit_bytes * 100, 2),
            'total_accesses': hits,
            'widget_count': len(self.widget_cache_map)
        }
|
|
594
|
-
|
|
595
|
-
def optimize_cache_size(self, target_cache_size: int = 1000):
    """Grow or shrink the entry-count limit toward the target occupancy."""
    occupancy = self.get_cache_stats()['cache_size']
    limit = self.cache_size_limit

    if occupancy < target_cache_size and limit < 2000:
        # Under-used: allow 20% more entries, capped at 2000.
        self.cache_size_limit = min(2000, int(limit * 1.2))
    elif occupancy > target_cache_size and limit > 100:
        # Over-full: trim the limit by 10%, but never below 100.
        self.cache_size_limit = max(100, int(limit * 0.9))
|
|
605
|
-
|
|
606
|
-
def precompute_stats(self, texts: List[str], widget_ids: Optional[List[str]] = None):
    """Warm the cache by computing stats for each text up front."""
    ids = widget_ids or []
    for index, content in enumerate(texts):
        # Pair each text with its widget id when one was supplied;
        # texts beyond the end of the id list fall back to None.
        paired_id = ids[index] if index < len(ids) else None
        self.calculate_stats(content, paired_id)
|
|
611
|
-
|
|
612
|
-
def get_word_frequency(self, text: str, top_n: int = 10) -> List[Tuple[str, int]]:
    """Return the top_n most frequent words in *text* as (word, count) pairs.

    Words are matched case-insensitively with self.word_pattern and the
    result is memoized per content hash in a small (100-entry) FIFO
    cache. NOTE(review): unlike stats_cache, this cache is not guarded
    by cache_lock — confirm callers are single-threaded.
    """
    from collections import Counter  # local: file-level import block only has defaultdict

    content_hash = self._generate_content_hash(text)

    if content_hash in self.word_frequency_cache:
        word_freq = self.word_frequency_cache[content_hash]
    else:
        # Counter performs the same tallying the previous manual
        # defaultdict loop did, in one C-level pass.
        words = self.word_pattern.findall(text.lower())
        word_freq = dict(Counter(words))

        # Cache the result (bounded FIFO: drop the oldest entry first).
        if len(self.word_frequency_cache) >= 100:
            oldest_key = next(iter(self.word_frequency_cache))
            self.word_frequency_cache.pop(oldest_key)

        self.word_frequency_cache[content_hash] = word_freq

    # Highest counts first; ties keep first-seen order (stable sort).
    return sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:top_n]
|
|
634
|
-
|
|
635
|
-
class CachedStatsManager:
    """
    Manager for multiple SmartStatsCalculator instances with global optimization.

    Keeps one calculator per tool/id and offers aggregate maintenance
    operations (cache clearing, size tuning, periodic cleanup) across
    all of them.
    """

    def __init__(self):
        # calculator_id -> SmartStatsCalculator
        self.calculators: Dict[str, SmartStatsCalculator] = {}
        # Identifier of the tool currently in the foreground, if any.
        self.active_tool: Optional[str] = None

    def get_calculator(self, calculator_id: str, **kwargs) -> SmartStatsCalculator:
        """Return the calculator for *calculator_id*, creating it on first use."""
        if calculator_id not in self.calculators:
            # Construct lazily: SmartStatsCalculator() starts a cleanup thread.
            self.calculators[calculator_id] = SmartStatsCalculator(**kwargs)
        return self.calculators[calculator_id]

    def on_tool_switch(self, old_tool: Optional[str], new_tool: str):
        """Handle tool switching - drop widget caches held for the old tool."""
        previous = self.calculators.get(old_tool) if old_tool else None
        if previous is not None:
            # Snapshot the keys: clear_widget_cache mutates the map.
            for widget_id in list(previous.widget_cache_map.keys()):
                previous.clear_widget_cache(widget_id)

        self.active_tool = new_tool

    def get_global_stats(self) -> Dict[str, Any]:
        """Aggregate cache statistics across every managed calculator."""
        snapshots = [calc.get_cache_stats() for calc in self.calculators.values()]
        return {
            'calculators_count': len(self.calculators),
            'total_cache_size': sum(snap['cache_size'] for snap in snapshots),
            'total_memory_mb': round(sum((snap['memory_usage_mb'] for snap in snapshots), 0.0), 2),
            'active_tool': self.active_tool
        }

    def optimize_all_caches(self):
        """Optimize all calculator caches."""
        for managed in self.calculators.values():
            managed.optimize_cache_size()

    def clear_all_caches(self):
        """Clear all caches."""
        for managed in self.calculators.values():
            managed.clear_cache()

    def perform_global_cleanup(self):
        """Perform cleanup across all calculators."""
        for managed in self.calculators.values():
            managed._perform_periodic_cleanup()
|
|
693
|
-
|
|
694
|
-
# Global instances
# Lazily created singletons; use the get_* accessor functions below
# instead of touching these module-level names directly.
_global_stats_calculator = None
_global_stats_manager = None
|
|
697
|
-
|
|
698
|
-
def get_smart_stats_calculator() -> SmartStatsCalculator:
    """Get the global smart stats calculator instance.

    Creates the singleton on first call. NOTE(review): creation is not
    lock-protected, so two threads racing here could each construct an
    instance — confirm callers only reach this from one thread at startup.
    """
    global _global_stats_calculator
    if _global_stats_calculator is None:
        _global_stats_calculator = SmartStatsCalculator()
    return _global_stats_calculator
|
|
704
|
-
|
|
705
|
-
def get_stats_manager() -> CachedStatsManager:
|
|
706
|
-
"""Get the global stats manager instance."""
|
|
707
|
-
global _global_stats_manager
|
|
708
|
-
if _global_stats_manager is None:
|
|
709
|
-
_global_stats_manager = CachedStatsManager()
|
|
1
|
+
"""
|
|
2
|
+
Smart statistics calculator with intelligent caching for Promera AI Commander.
|
|
3
|
+
Provides efficient text statistics calculation with incremental updates and caching.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
import hashlib
|
|
9
|
+
import threading
|
|
10
|
+
import sys
|
|
11
|
+
from typing import Dict, List, Optional, Any, Tuple
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from collections import defaultdict
|
|
14
|
+
import weakref
|
|
15
|
+
|
|
16
|
+
# Import optimized pattern engine
|
|
17
|
+
from core.optimized_pattern_engine import get_pattern_engine, OptimizedPatternEngine
|
|
18
|
+
|
|
19
|
+
@dataclass
class ChangeInfo:
    """Describes a single text edit so stats can be updated incrementally."""
    start_pos: int          # offset where the edit begins
    end_pos: int            # offset where the edit ends
    inserted_text: str      # text added by the edit ("" for pure deletes)
    deleted_length: int     # number of characters removed
    change_type: str        # insert, delete, replace
    timestamp: float = field(default_factory=time.time)

    def is_minor_change(self) -> bool:
        """True when the edit is small enough (<= 10 chars each way) for an incremental update."""
        small_insert = len(self.inserted_text) <= 10
        small_delete = self.deleted_length <= 10
        if self.change_type == "insert":
            return small_insert
        if self.change_type == "delete":
            return small_delete
        if self.change_type == "replace":
            return small_insert and small_delete
        # Unknown change kinds always force a full recalculation.
        return False

    def affects_statistics(self) -> bool:
        """True when the edit can move word/sentence/line boundaries."""
        if self.change_type == "insert":
            # Inserts only matter when they contain a boundary character.
            boundary_chars = (' ', '\n', '.', '!', '?')
            return any(ch in self.inserted_text for ch in boundary_chars)
        return True
|
|
46
|
+
|
|
47
|
+
@dataclass
class TextStats:
    """Comprehensive text statistics."""
    # Basic counts
    char_count: int = 0
    word_count: int = 0
    sentence_count: int = 0
    line_count: int = 0
    paragraph_count: int = 0
    token_count: int = 0

    # Advanced statistics
    unique_words: int = 0
    average_word_length: float = 0.0
    average_sentence_length: float = 0.0
    reading_time_minutes: float = 0.0

    # Metadata
    content_hash: str = ""
    calculation_time_ms: float = 0.0
    timestamp: float = field(default_factory=time.time)
    cache_hit: bool = False
    incremental_update: bool = False
    processing_method: str = "full"  # full, incremental, cached
    memory_usage_bytes: int = 0

    def is_stale(self, max_age_seconds: int = 300) -> bool:
        """Check if statistics are stale (older than max_age_seconds)."""
        return (time.time() - self.timestamp) > max_age_seconds

    def to_status_string(self) -> str:
        """Convert to status bar string format."""
        formatted_bytes = self._format_bytes(self.char_count)
        return f"Bytes: {formatted_bytes} | Word: {self.word_count} | Sentence: {self.sentence_count} | Line: {self.line_count} | Tokens: {self.token_count}"

    def _format_bytes(self, byte_count):
        """Format byte count with K/M suffixes for readability.

        Values below 1000 are returned unchanged; larger values get one
        decimal place with trailing ".0" removed (1000 -> "1K",
        1500 -> "1.5K", 2500000 -> "2.5M").
        """
        if byte_count >= 1000000:
            value = byte_count / 1000000
            suffix = "M"
        elif byte_count >= 1000:
            value = byte_count / 1000
            # Check if rounding to 1 decimal would make it >= 1000K
            if round(value, 1) >= 1000:
                value = value / 1000
                suffix = "M"
            else:
                suffix = "K"
        else:
            return str(byte_count)

        # Strip the trailing zero/decimal point from the NUMBER before
        # appending the suffix. (The previous code stripped after
        # appending, which never matched because the string always
        # ended in 'K' or 'M', so "1.0K" was emitted instead of "1K".)
        number = f"{value:.1f}".rstrip('0').rstrip('.')
        return f"{number}{suffix}"

    def to_detailed_dict(self) -> Dict[str, Any]:
        """Convert to detailed dictionary for analysis."""
        return {
            'basic': {
                'characters': self.char_count,
                'words': self.word_count,
                'sentences': self.sentence_count,
                'lines': self.line_count,
                'paragraphs': self.paragraph_count,
                'tokens': self.token_count
            },
            'advanced': {
                'unique_words': self.unique_words,
                'average_word_length': self.average_word_length,
                'average_sentence_length': self.average_sentence_length,
                'reading_time_minutes': self.reading_time_minutes
            },
            'metadata': {
                'content_hash': self.content_hash,
                'calculation_time_ms': self.calculation_time_ms,
                'timestamp': self.timestamp
            }
        }
|
|
122
|
+
|
|
123
|
+
@dataclass
class CacheEntry:
    """Cache entry with metadata for intelligent cache management."""
    stats: TextStats                 # the cached statistics payload
    access_count: int = 0            # how many times this entry was read
    last_access: float = field(default_factory=time.time)  # epoch seconds
    size_estimate: int = 0           # length of the source text, in chars
    widget_id: Optional[str] = None  # owning widget, for tool-switch cleanup

    @property
    def age_seconds(self) -> float:
        """Age of the cache entry in seconds."""
        # Measured from the last access, not creation, so frequently
        # read entries always look "young" to the eviction logic.
        return time.time() - self.last_access

    @property
    def access_frequency(self) -> float:
        """Access frequency (accesses per hour)."""
        age_hours = max(self.age_seconds / 3600, 0.01)  # Avoid division by zero
        return self.access_count / age_hours

    @property
    def memory_usage(self) -> int:
        """Estimate memory usage of this cache entry.

        Rough estimate: source-text size + shallow stats object size +
        200 bytes of fixed overhead. NOTE(review): getsizeof is shallow,
        so this undercounts for stats with large string fields.
        """
        # Rough estimate: stats object + metadata
        return self.size_estimate + sys.getsizeof(self.stats) + 200
|
|
148
|
+
|
|
149
|
+
class SmartStatsCalculator:
|
|
150
|
+
"""
|
|
151
|
+
Intelligent text statistics calculator with caching and incremental updates.
|
|
152
|
+
"""
|
|
153
|
+
|
|
154
|
+
# Memory limits
|
|
155
|
+
MAX_CACHE_MEMORY_BYTES = 50 * 1024 * 1024 # 50MB
|
|
156
|
+
CLEANUP_THRESHOLD_BYTES = 45 * 1024 * 1024 # Start cleanup at 45MB
|
|
157
|
+
|
|
158
|
+
def __init__(self, cache_size_limit: int = 1000, enable_advanced_stats: bool = True):
    """Set up caches, memory accounting, and the background cleanup thread.

    Args:
        cache_size_limit: Maximum number of entries kept in stats_cache
            before count-based eviction kicks in.
        enable_advanced_stats: Also compute unique-word / average-length /
            reading-time metrics during full calculations.
    """
    self.cache_size_limit = cache_size_limit
    self.enable_advanced_stats = enable_advanced_stats

    # Cache storage
    self.stats_cache: Dict[str, CacheEntry] = {}
    # RLock: cache helpers (_cache_stats -> eviction) run while the
    # lock is already held by the same thread.
    self.cache_lock = threading.RLock()

    # Memory tracking
    self.current_memory_usage = 0
    self.last_cleanup_time = time.time()
    self.cleanup_interval = 300  # 5 minutes

    # Widget tracking for tool switching cleanup
    self.widget_cache_map: Dict[str, List[str]] = defaultdict(list)

    # Use optimized pattern engine for fast text analysis
    self.pattern_engine = get_pattern_engine()

    # Fallback regex patterns (for advanced features only)
    self.word_pattern = re.compile(r'\b\w+\b')
    self.sentence_pattern = re.compile(r'[.!?]+')
    self.paragraph_pattern = re.compile(r'\n\s*\n')

    # Word frequency cache for advanced stats
    self.word_frequency_cache: Dict[str, Dict[str, int]] = {}

    # Periodic cleanup thread (daemon; runs for process lifetime)
    self._start_periodic_cleanup()
|
|
187
|
+
|
|
188
|
+
def calculate_stats(self, text: str, widget_id: Optional[str] = None) -> TextStats:
    """
    Calculate text statistics with intelligent caching.

    Args:
        text: Text content to analyze
        widget_id: Optional widget identifier for cache optimization

    Returns:
        TextStats object with comprehensive statistics. NOTE(review):
        on a cache hit the SAME TextStats object is returned to every
        caller (and mutated below) — confirm callers treat it as
        read-only.
    """
    # Whitespace-only input short-circuits to an all-zero result.
    if not text.strip():
        return TextStats()

    # Generate content hash for caching
    content_hash = self._generate_content_hash(text)

    # Check cache first
    with self.cache_lock:
        if content_hash in self.stats_cache:
            entry = self.stats_cache[content_hash]
            entry.access_count += 1
            entry.last_access = time.time()

            # Update stats metadata
            stats = entry.stats
            stats.cache_hit = True
            stats.processing_method = "cached"

            return stats

    # Cache miss - calculate stats
    start_time = time.time()
    stats = self._calculate_stats_impl(text, content_hash)
    calculation_time = (time.time() - start_time) * 1000

    stats.calculation_time_ms = calculation_time
    stats.cache_hit = False
    stats.processing_method = "full"

    # Cache the result
    self._cache_stats(content_hash, stats, len(text), widget_id)

    # Check if periodic cleanup is needed (foreground cleanup hook)
    self._check_periodic_cleanup()

    return stats
|
|
235
|
+
|
|
236
|
+
def calculate_stats_incremental(self,
                                text: str,
                                previous_stats: Optional[TextStats] = None,
                                change_info: Optional[ChangeInfo] = None,
                                widget_id: Optional[str] = None) -> TextStats:
    """
    Calculate statistics incrementally when possible.

    Args:
        text: Current text content
        previous_stats: Previous statistics for comparison
        change_info: Information about what changed (position, length, etc.)
        widget_id: Optional widget identifier

    Returns:
        Updated TextStats object
    """
    # If no change info or previous stats, do full calculation
    if not change_info or not previous_stats:
        return self.calculate_stats(text, widget_id)

    # Check if incremental update is beneficial
    if not change_info.is_minor_change() or not change_info.affects_statistics():
        # For non-minor changes or changes that don't affect stats, do full calculation
        return self.calculate_stats(text, widget_id)

    # Attempt incremental update for minor changes
    start_time = time.time()

    try:
        stats = self._calculate_incremental_stats(text, previous_stats, change_info)
        calculation_time = (time.time() - start_time) * 1000

        stats.calculation_time_ms = calculation_time
        stats.incremental_update = True
        stats.processing_method = "incremental"
        stats.cache_hit = False

        # Cache the result under the full text's hash
        content_hash = self._generate_content_hash(text)
        stats.content_hash = content_hash
        self._cache_stats(content_hash, stats, len(text), widget_id)

        return stats

    except Exception:
        # Broad catch is deliberate: ANY failure on the fast path falls
        # back to the safe full recalculation.
        return self.calculate_stats(text, widget_id)
|
|
284
|
+
|
|
285
|
+
def _calculate_incremental_stats(self, text: str, previous_stats: TextStats, change_info: ChangeInfo) -> TextStats:
    """
    Calculate statistics incrementally based on change information.

    Only small insertions can be applied as deltas on top of
    previous_stats. Deletions and replacements remove text we can no
    longer inspect, so their counts cannot be decremented reliably and
    they fall back to a full recalculation. (Previously "replace" —
    which is_minor_change() allows — matched neither branch and the
    stale previous counts were returned unchanged.)

    Args:
        text: Current text content
        previous_stats: Previous statistics
        change_info: Information about the change

    Returns:
        Updated TextStats object
    """
    if change_info.change_type != "insert":
        # delete, replace, or anything unexpected: recalculate fully.
        return self._calculate_stats_impl(text, "")

    stats = TextStats()
    inserted = change_info.inserted_text

    # Start from the previous totals and apply the insertion delta.
    stats.char_count = previous_stats.char_count + len(inserted)
    # Word/sentence deltas are approximations based on the inserted
    # fragment alone (splits/merges at the insertion point are missed).
    stats.word_count = previous_stats.word_count + len(inserted.split())
    stats.sentence_count = previous_stats.sentence_count + sum(
        inserted.count(c) for c in ['.', '!', '?'])
    stats.line_count = previous_stats.line_count + inserted.count('\n')
    stats.paragraph_count = previous_stats.paragraph_count

    # Recalculate token count (rough estimate: 1 token ~ 4 characters)
    stats.token_count = max(1, round(stats.char_count / 4))

    # Advanced stats (unique words, averages, reading time) need the
    # full text, so they are intentionally left at their defaults on
    # this fast path even when enable_advanced_stats is set.
    return stats
|
|
337
|
+
|
|
338
|
+
def _calculate_stats_impl(self, text: str, content_hash: str) -> TextStats:
    """Internal implementation of statistics calculation using optimized pattern engine.

    Computes the full set of counts for *text*; the caller supplies
    the pre-computed content hash (may be "" when caching is bypassed).
    """
    stats = TextStats(content_hash=content_hash)

    # Use optimized pattern engine for fast counting
    text_structure = self.pattern_engine.analyze_text_structure(text)

    # Basic statistics from optimized engine
    stats.char_count = text_structure.char_count
    stats.line_count = text_structure.line_count
    stats.word_count = text_structure.word_count
    stats.sentence_count = text_structure.sentence_count
    stats.paragraph_count = text_structure.paragraph_count

    # Token count (rough estimate: 1 token ≈ 4 characters)
    stats.token_count = max(1, round(len(text) / 4))

    # Advanced statistics (if enabled)
    if self.enable_advanced_stats and stats.word_count > 0:
        # Use regex for word extraction (needed for advanced stats);
        # note this word definition may differ from the engine's count.
        words = self.word_pattern.findall(text)

        if words:
            stats.unique_words = len(set(word.lower() for word in words))
            stats.average_word_length = sum(len(word) for word in words) / len(words)

            if stats.sentence_count > 0:
                stats.average_sentence_length = stats.word_count / stats.sentence_count

        # Reading time estimate (average 200 words per minute)
        stats.reading_time_minutes = stats.word_count / 200.0

    return stats
|
|
371
|
+
|
|
372
|
+
def _generate_content_hash(self, text: str) -> str:
|
|
373
|
+
"""Generate a hash for content caching."""
|
|
374
|
+
# Use a combination of length and content hash for efficiency
|
|
375
|
+
content_sample = text[:100] + text[-100:] if len(text) > 200 else text
|
|
376
|
+
hash_input = f"{len(text)}_{content_sample}"
|
|
377
|
+
return hashlib.md5(hash_input.encode('utf-8')).hexdigest()[:16]
|
|
378
|
+
|
|
379
|
+
def _cache_stats(self, content_hash: str, stats: TextStats, content_size: int, widget_id: Optional[str] = None):
    """Insert freshly computed stats into the cache.

    Keeps current_memory_usage and widget_cache_map consistent, and
    triggers eviction when the memory budget or the entry-count limit
    would be exceeded.
    """
    with self.cache_lock:
        # Create cache entry
        entry = CacheEntry(
            stats=stats,
            access_count=1,
            size_estimate=content_size,
            widget_id=widget_id
        )

        # If this hash is already cached we are replacing the entry:
        # pop it and release its memory first, otherwise the usage
        # counter grows on every re-cache of the same content, and the
        # widget map accumulates duplicate keys.
        old_entry = self.stats_cache.pop(content_hash, None)
        if old_entry is not None:
            self.current_memory_usage -= old_entry.memory_usage
            if old_entry.widget_id and old_entry.widget_id in self.widget_cache_map:
                try:
                    self.widget_cache_map[old_entry.widget_id].remove(content_hash)
                except ValueError:
                    pass

        # Track widget association (guard against duplicate keys)
        if widget_id and content_hash not in self.widget_cache_map[widget_id]:
            self.widget_cache_map[widget_id].append(content_hash)

        # Check memory usage before adding
        entry_memory = entry.memory_usage
        if self.current_memory_usage + entry_memory > self.MAX_CACHE_MEMORY_BYTES:
            self._evict_by_memory_pressure()

        # Check if cache is full by count
        if len(self.stats_cache) >= self.cache_size_limit:
            self._evict_cache_entries()

        # Store in cache
        self.stats_cache[content_hash] = entry
        self.current_memory_usage += entry_memory

        # Record the entry's estimated footprint on the stats object
        stats.memory_usage_bytes = entry_memory
|
|
409
|
+
|
|
410
|
+
def _evict_cache_entries(self):
|
|
411
|
+
"""Evict cache entries using intelligent LRU + frequency algorithm."""
|
|
412
|
+
if not self.stats_cache:
|
|
413
|
+
return
|
|
414
|
+
|
|
415
|
+
# Calculate eviction scores (lower score = more likely to evict)
|
|
416
|
+
entries_with_scores = []
|
|
417
|
+
current_time = time.time()
|
|
418
|
+
|
|
419
|
+
for hash_key, entry in self.stats_cache.items():
|
|
420
|
+
# Score based on recency, frequency, and size
|
|
421
|
+
recency_score = 1.0 / max(entry.age_seconds, 1.0)
|
|
422
|
+
frequency_score = entry.access_frequency
|
|
423
|
+
size_penalty = entry.size_estimate / 10000.0 # Penalize large entries
|
|
424
|
+
|
|
425
|
+
score = (recency_score * 0.4 + frequency_score * 0.5) - (size_penalty * 0.1)
|
|
426
|
+
entries_with_scores.append((score, hash_key))
|
|
427
|
+
|
|
428
|
+
# Sort by score (lowest first) and evict bottom 25%
|
|
429
|
+
entries_with_scores.sort()
|
|
430
|
+
evict_count = max(1, len(entries_with_scores) // 4)
|
|
431
|
+
|
|
432
|
+
for _, hash_key in entries_with_scores[:evict_count]:
|
|
433
|
+
entry = self.stats_cache.pop(hash_key, None)
|
|
434
|
+
if entry:
|
|
435
|
+
self.current_memory_usage -= entry.memory_usage
|
|
436
|
+
# Remove from widget map
|
|
437
|
+
if entry.widget_id and entry.widget_id in self.widget_cache_map:
|
|
438
|
+
try:
|
|
439
|
+
self.widget_cache_map[entry.widget_id].remove(hash_key)
|
|
440
|
+
except ValueError:
|
|
441
|
+
pass
|
|
442
|
+
|
|
443
|
+
def _evict_by_memory_pressure(self):
    """Evict cache entries when memory usage exceeds limits.

    Frees entries until tracked usage drops back to
    CLEANUP_THRESHOLD_BYTES, preferring the oldest, least-accessed,
    largest entries. Caller must hold cache_lock (as _cache_stats and
    _perform_periodic_cleanup do).
    """
    if not self.stats_cache:
        return

    # Calculate how much memory we need to free
    target_memory = self.CLEANUP_THRESHOLD_BYTES
    memory_to_free = self.current_memory_usage - target_memory

    if memory_to_free <= 0:
        return

    # Sort entries by eviction priority (oldest, least accessed, largest)
    entries_with_priority = []

    for hash_key, entry in self.stats_cache.items():
        # Priority score (higher = more likely to evict)
        age_score = entry.age_seconds
        access_score = 1.0 / max(entry.access_count, 1)
        size_score = entry.memory_usage / 1024.0  # KB

        priority = (age_score * 0.4) + (access_score * 0.3) + (size_score * 0.3)
        entries_with_priority.append((priority, hash_key, entry.memory_usage))

    # Sort by priority (highest first) and evict until we free enough memory
    entries_with_priority.sort(reverse=True)

    freed_memory = 0
    for priority, hash_key, entry_memory in entries_with_priority:
        if freed_memory >= memory_to_free:
            break

        entry = self.stats_cache.pop(hash_key, None)
        if entry:
            self.current_memory_usage -= entry.memory_usage
            freed_memory += entry.memory_usage

            # Remove from widget map so bookkeeping stays consistent
            if entry.widget_id and entry.widget_id in self.widget_cache_map:
                try:
                    self.widget_cache_map[entry.widget_id].remove(hash_key)
                except ValueError:
                    pass
|
|
486
|
+
|
|
487
|
+
def get_cached_stats(self, content_hash: str) -> Optional[TextStats]:
    """Get cached statistics by content hash.

    Returns the cached TextStats (bumping the entry's access count and
    last-access time) or None when the hash has not been cached.
    """
    with self.cache_lock:
        if content_hash in self.stats_cache:
            entry = self.stats_cache[content_hash]
            # Touch the entry so LRU/frequency eviction keeps it longer.
            entry.access_count += 1
            entry.last_access = time.time()
            return entry.stats
        return None
|
|
496
|
+
|
|
497
|
+
def clear_cache(self):
    """Clear all cached statistics.

    Also resets the tracked memory usage and widget bookkeeping so all
    three structures remain mutually consistent.
    """
    with self.cache_lock:
        self.stats_cache.clear()
        self.current_memory_usage = 0
        self.widget_cache_map.clear()
|
|
503
|
+
|
|
504
|
+
def clear_widget_cache(self, widget_id: str):
    """Clear cache entries associated with a specific widget (for tool switching)."""
    with self.cache_lock:
        if widget_id not in self.widget_cache_map:
            return

        # Get all cache keys for this widget (a snapshot copy, so the
        # loop below cannot be disturbed by mutation)
        cache_keys = self.widget_cache_map[widget_id].copy()

        # Remove each entry and release its tracked memory
        for cache_key in cache_keys:
            entry = self.stats_cache.pop(cache_key, None)
            if entry:
                self.current_memory_usage -= entry.memory_usage

        # Clear widget mapping
        self.widget_cache_map.pop(widget_id, None)
|
|
521
|
+
|
|
522
|
+
def _check_periodic_cleanup(self):
    """Check if periodic cleanup is needed and perform it.

    Foreground counterpart of the background cleanup thread: invoked
    from calculate_stats() so cleanup also happens under steady use.
    """
    current_time = time.time()

    # Check if cleanup interval has passed
    if current_time - self.last_cleanup_time < self.cleanup_interval:
        return

    # Perform cleanup and remember when we last did it
    self._perform_periodic_cleanup()
    self.last_cleanup_time = current_time
|
|
533
|
+
|
|
534
|
+
def _perform_periodic_cleanup(self):
|
|
535
|
+
"""Perform periodic memory cleanup."""
|
|
536
|
+
with self.cache_lock:
|
|
537
|
+
# Remove stale entries (older than 10 minutes)
|
|
538
|
+
stale_keys = []
|
|
539
|
+
|
|
540
|
+
for hash_key, entry in self.stats_cache.items():
|
|
541
|
+
if entry.age_seconds > 600: # 10 minutes
|
|
542
|
+
stale_keys.append(hash_key)
|
|
543
|
+
|
|
544
|
+
# Remove stale entries
|
|
545
|
+
for hash_key in stale_keys:
|
|
546
|
+
entry = self.stats_cache.pop(hash_key, None)
|
|
547
|
+
if entry:
|
|
548
|
+
self.current_memory_usage -= entry.memory_usage
|
|
549
|
+
|
|
550
|
+
# Remove from widget map
|
|
551
|
+
if entry.widget_id and entry.widget_id in self.widget_cache_map:
|
|
552
|
+
try:
|
|
553
|
+
self.widget_cache_map[entry.widget_id].remove(hash_key)
|
|
554
|
+
except ValueError:
|
|
555
|
+
pass
|
|
556
|
+
|
|
557
|
+
# Check memory usage and evict if needed
|
|
558
|
+
if self.current_memory_usage > self.CLEANUP_THRESHOLD_BYTES:
|
|
559
|
+
self._evict_by_memory_pressure()
|
|
560
|
+
|
|
561
|
+
def _start_periodic_cleanup(self):
|
|
562
|
+
"""Start background thread for periodic cleanup."""
|
|
563
|
+
def cleanup_worker():
|
|
564
|
+
while True:
|
|
565
|
+
time.sleep(self.cleanup_interval)
|
|
566
|
+
try:
|
|
567
|
+
self._perform_periodic_cleanup()
|
|
568
|
+
except Exception:
|
|
569
|
+
pass # Silently handle errors in background thread
|
|
570
|
+
|
|
571
|
+
# Start daemon thread
|
|
572
|
+
cleanup_thread = threading.Thread(target=cleanup_worker, daemon=True)
|
|
573
|
+
cleanup_thread.start()
|
|
574
|
+
|
|
575
|
+
def get_cache_stats(self) -> Dict[str, Any]:
    """Return a snapshot of cache occupancy and memory accounting."""
    with self.cache_lock:
        used_mb = self.current_memory_usage / (1024 * 1024)
        limit_mb = self.MAX_CACHE_MEMORY_BYTES / (1024 * 1024)
        used_pct = (self.current_memory_usage / self.MAX_CACHE_MEMORY_BYTES) * 100
        # Sum of per-entry access counts across the live cache.
        accesses = sum(entry.access_count for entry in self.stats_cache.values())
        return {
            'cache_size': len(self.stats_cache),
            'cache_size_limit': self.cache_size_limit,
            'memory_usage_mb': round(used_mb, 2),
            'memory_limit_mb': round(limit_mb, 2),
            'memory_usage_percent': round(used_pct, 2),
            'total_accesses': accesses,
            'widget_count': len(self.widget_cache_map)
        }
|
|
594
|
+
|
|
595
|
+
def optimize_cache_size(self, target_cache_size: int = 1000):
    """Nudge ``cache_size_limit`` toward usage patterns.

    Grows the limit by 20% (capped at 2000) while occupancy is below
    *target_cache_size*, and shrinks it by 10% (floored at 100) while
    occupancy is above it.
    """
    occupancy = self.get_cache_stats()['cache_size']
    limit = self.cache_size_limit
    if occupancy < target_cache_size and limit < 2000:
        self.cache_size_limit = min(2000, int(limit * 1.2))
    elif occupancy > target_cache_size and limit > 100:
        self.cache_size_limit = max(100, int(limit * 0.9))
|
|
605
|
+
|
|
606
|
+
def precompute_stats(self, texts: List[str], widget_ids: Optional[List[str]] = None):
    """Warm the cache by computing stats for each text up front.

    ``widget_ids`` may be shorter than ``texts`` (or ``None``); texts
    without a matching id are computed with no widget association.
    """
    ids = widget_ids or []
    for idx, content in enumerate(texts):
        owner = ids[idx] if idx < len(ids) else None
        self.calculate_stats(content, owner)
|
|
611
|
+
|
|
612
|
+
def get_word_frequency(self, text: str, top_n: int = 10) -> List[Tuple[str, int]]:
    """Return the *top_n* most frequent words in *text* as (word, count)
    pairs, memoizing the full frequency table by content hash.

    NOTE(review): unlike the stats cache, ``word_frequency_cache`` is
    accessed without ``cache_lock`` — confirm single-threaded use.
    """
    key = self._generate_content_hash(text)
    cached = self.word_frequency_cache.get(key)
    if cached is not None:
        word_freq = cached
    else:
        word_freq = {}
        for token in self.word_pattern.findall(text.lower()):
            word_freq[token] = word_freq.get(token, 0) + 1

        # Bounded memo: once 100 entries exist, evict the oldest
        # insertion before storing the new table.
        if len(self.word_frequency_cache) >= 100:
            oldest = next(iter(self.word_frequency_cache))
            self.word_frequency_cache.pop(oldest)
        self.word_frequency_cache[key] = dict(word_freq)

    return sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:top_n]
|
|
634
|
+
|
|
635
|
+
class CachedStatsManager:
    """Coordinates multiple SmartStatsCalculator instances.

    Creates calculators lazily, flushes widget caches on tool switches,
    and exposes aggregate statistics and maintenance operations across
    all managed calculators.
    """

    def __init__(self):
        # calculator_id -> calculator; populated lazily by get_calculator().
        self.calculators: Dict[str, SmartStatsCalculator] = {}
        # Identifier of the tool currently in the foreground, if any.
        self.active_tool: Optional[str] = None

    def get_calculator(self, calculator_id: str, **kwargs) -> "SmartStatsCalculator":
        """Return the calculator for *calculator_id*, creating it on first use."""
        calc = self.calculators.get(calculator_id)
        if calc is None:
            calc = SmartStatsCalculator(**kwargs)
            self.calculators[calculator_id] = calc
        return calc

    def on_tool_switch(self, old_tool: Optional[str], new_tool: str):
        """Record *new_tool* as active and flush caches held by *old_tool*."""
        previous = self.calculators.get(old_tool) if old_tool else None
        if previous is not None:
            # Snapshot the ids; clear_widget_cache mutates the mapping.
            for widget_id in list(previous.widget_cache_map):
                previous.clear_widget_cache(widget_id)
        self.active_tool = new_tool

    def get_global_stats(self) -> Dict[str, Any]:
        """Aggregate cache size and memory usage over every calculator."""
        size_total = 0
        memory_total = 0.0
        for calc in self.calculators.values():
            snapshot = calc.get_cache_stats()
            size_total += snapshot['cache_size']
            memory_total += snapshot['memory_usage_mb']
        return {
            'calculators_count': len(self.calculators),
            'total_cache_size': size_total,
            'total_memory_mb': round(memory_total, 2),
            'active_tool': self.active_tool
        }

    def optimize_all_caches(self):
        """Run size optimization on every calculator."""
        for calc in self.calculators.values():
            calc.optimize_cache_size()

    def clear_all_caches(self):
        """Empty every calculator's cache."""
        for calc in self.calculators.values():
            calc.clear_cache()

    def perform_global_cleanup(self):
        """Trigger an immediate stale-entry sweep in every calculator."""
        for calc in self.calculators.values():
            calc._perform_periodic_cleanup()
|
|
693
|
+
|
|
694
|
+
# Global instances
# Lazily-created module singletons. Use the accessor functions
# get_smart_stats_calculator() / get_stats_manager() rather than
# reading these names directly; they stay None until first access.
_global_stats_calculator = None
_global_stats_manager = None
|
|
697
|
+
|
|
698
|
+
def get_smart_stats_calculator() -> "SmartStatsCalculator":
    """Return the process-wide SmartStatsCalculator singleton.

    The instance is created on first call and reused thereafter.
    """
    global _global_stats_calculator
    if _global_stats_calculator is None:
        _global_stats_calculator = SmartStatsCalculator()
    return _global_stats_calculator
|
|
704
|
+
|
|
705
|
+
def get_stats_manager() -> "CachedStatsManager":
    """Return the process-wide CachedStatsManager singleton.

    The instance is created on first call and reused thereafter.
    """
    global _global_stats_manager
    if _global_stats_manager is None:
        _global_stats_manager = CachedStatsManager()
    return _global_stats_manager
|