pomera-ai-commander 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +680 -0
- package/bin/pomera-ai-commander.js +62 -0
- package/core/__init__.py +66 -0
- package/core/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/__pycache__/app_context.cpython-313.pyc +0 -0
- package/core/__pycache__/async_text_processor.cpython-313.pyc +0 -0
- package/core/__pycache__/backup_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/backup_recovery_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/content_hash_cache.cpython-313.pyc +0 -0
- package/core/__pycache__/context_menu.cpython-313.pyc +0 -0
- package/core/__pycache__/data_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/database_connection_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_curl_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_promera_ai_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_schema.cpython-313.pyc +0 -0
- package/core/__pycache__/database_schema_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_settings_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/database_settings_manager_interface.cpython-313.pyc +0 -0
- package/core/__pycache__/dialog_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/efficient_line_numbers.cpython-313.pyc +0 -0
- package/core/__pycache__/error_handler.cpython-313.pyc +0 -0
- package/core/__pycache__/error_service.cpython-313.pyc +0 -0
- package/core/__pycache__/event_consolidator.cpython-313.pyc +0 -0
- package/core/__pycache__/memory_efficient_text_widget.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_test_suite.cpython-313.pyc +0 -0
- package/core/__pycache__/migration_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_find_replace.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_pattern_engine.cpython-313.pyc +0 -0
- package/core/__pycache__/optimized_search_highlighter.cpython-313.pyc +0 -0
- package/core/__pycache__/performance_monitor.cpython-313.pyc +0 -0
- package/core/__pycache__/persistence_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/progressive_stats_calculator.cpython-313.pyc +0 -0
- package/core/__pycache__/regex_pattern_cache.cpython-313.pyc +0 -0
- package/core/__pycache__/regex_pattern_library.cpython-313.pyc +0 -0
- package/core/__pycache__/search_operation_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_defaults_registry.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_integrity_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_serializer.cpython-313.pyc +0 -0
- package/core/__pycache__/settings_validator.cpython-313.pyc +0 -0
- package/core/__pycache__/smart_stats_calculator.cpython-313.pyc +0 -0
- package/core/__pycache__/statistics_update_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/stats_config_manager.cpython-313.pyc +0 -0
- package/core/__pycache__/streaming_text_handler.cpython-313.pyc +0 -0
- package/core/__pycache__/task_scheduler.cpython-313.pyc +0 -0
- package/core/__pycache__/visibility_monitor.cpython-313.pyc +0 -0
- package/core/__pycache__/widget_cache.cpython-313.pyc +0 -0
- package/core/app_context.py +482 -0
- package/core/async_text_processor.py +422 -0
- package/core/backup_manager.py +656 -0
- package/core/backup_recovery_manager.py +1034 -0
- package/core/content_hash_cache.py +509 -0
- package/core/context_menu.py +313 -0
- package/core/data_validator.py +1067 -0
- package/core/database_connection_manager.py +745 -0
- package/core/database_curl_settings_manager.py +609 -0
- package/core/database_promera_ai_settings_manager.py +447 -0
- package/core/database_schema.py +412 -0
- package/core/database_schema_manager.py +396 -0
- package/core/database_settings_manager.py +1508 -0
- package/core/database_settings_manager_interface.py +457 -0
- package/core/dialog_manager.py +735 -0
- package/core/efficient_line_numbers.py +511 -0
- package/core/error_handler.py +747 -0
- package/core/error_service.py +431 -0
- package/core/event_consolidator.py +512 -0
- package/core/mcp/__init__.py +43 -0
- package/core/mcp/__pycache__/__init__.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/protocol.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/schema.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/server_stdio.cpython-313.pyc +0 -0
- package/core/mcp/__pycache__/tool_registry.cpython-313.pyc +0 -0
- package/core/mcp/protocol.py +288 -0
- package/core/mcp/schema.py +251 -0
- package/core/mcp/server_stdio.py +299 -0
- package/core/mcp/tool_registry.py +2345 -0
- package/core/memory_efficient_text_widget.py +712 -0
- package/core/migration_manager.py +915 -0
- package/core/migration_test_suite.py +1086 -0
- package/core/migration_validator.py +1144 -0
- package/core/optimized_find_replace.py +715 -0
- package/core/optimized_pattern_engine.py +424 -0
- package/core/optimized_search_highlighter.py +553 -0
- package/core/performance_monitor.py +675 -0
- package/core/persistence_manager.py +713 -0
- package/core/progressive_stats_calculator.py +632 -0
- package/core/regex_pattern_cache.py +530 -0
- package/core/regex_pattern_library.py +351 -0
- package/core/search_operation_manager.py +435 -0
- package/core/settings_defaults_registry.py +1087 -0
- package/core/settings_integrity_validator.py +1112 -0
- package/core/settings_serializer.py +558 -0
- package/core/settings_validator.py +1824 -0
- package/core/smart_stats_calculator.py +710 -0
- package/core/statistics_update_manager.py +619 -0
- package/core/stats_config_manager.py +858 -0
- package/core/streaming_text_handler.py +723 -0
- package/core/task_scheduler.py +596 -0
- package/core/update_pattern_library.py +169 -0
- package/core/visibility_monitor.py +596 -0
- package/core/widget_cache.py +498 -0
- package/mcp.json +61 -0
- package/package.json +57 -0
- package/pomera.py +7483 -0
- package/pomera_mcp_server.py +144 -0
- package/tools/__init__.py +5 -0
- package/tools/__pycache__/__init__.cpython-313.pyc +0 -0
- package/tools/__pycache__/ai_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/ascii_art_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/base64_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/base_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/case_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/column_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/cron_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_history.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_processor.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_settings.cpython-313.pyc +0 -0
- package/tools/__pycache__/curl_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/diff_viewer.cpython-313.pyc +0 -0
- package/tools/__pycache__/email_extraction_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/email_header_analyzer.cpython-313.pyc +0 -0
- package/tools/__pycache__/extraction_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/find_replace.cpython-313.pyc +0 -0
- package/tools/__pycache__/folder_file_reporter.cpython-313.pyc +0 -0
- package/tools/__pycache__/folder_file_reporter_adapter.cpython-313.pyc +0 -0
- package/tools/__pycache__/generator_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/hash_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/html_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/huggingface_helper.cpython-313.pyc +0 -0
- package/tools/__pycache__/jsonxml_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/line_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/list_comparator.cpython-313.pyc +0 -0
- package/tools/__pycache__/markdown_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/mcp_widget.cpython-313.pyc +0 -0
- package/tools/__pycache__/notes_widget.cpython-313.pyc +0 -0
- package/tools/__pycache__/number_base_converter.cpython-313.pyc +0 -0
- package/tools/__pycache__/regex_extractor.cpython-313.pyc +0 -0
- package/tools/__pycache__/slug_generator.cpython-313.pyc +0 -0
- package/tools/__pycache__/sorter_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/string_escape_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/text_statistics_tool.cpython-313.pyc +0 -0
- package/tools/__pycache__/text_wrapper.cpython-313.pyc +0 -0
- package/tools/__pycache__/timestamp_converter.cpython-313.pyc +0 -0
- package/tools/__pycache__/tool_loader.cpython-313.pyc +0 -0
- package/tools/__pycache__/translator_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/url_link_extractor.cpython-313.pyc +0 -0
- package/tools/__pycache__/url_parser.cpython-313.pyc +0 -0
- package/tools/__pycache__/whitespace_tools.cpython-313.pyc +0 -0
- package/tools/__pycache__/word_frequency_counter.cpython-313.pyc +0 -0
- package/tools/ai_tools.py +2892 -0
- package/tools/ascii_art_generator.py +353 -0
- package/tools/base64_tools.py +184 -0
- package/tools/base_tool.py +511 -0
- package/tools/case_tool.py +309 -0
- package/tools/column_tools.py +396 -0
- package/tools/cron_tool.py +885 -0
- package/tools/curl_history.py +601 -0
- package/tools/curl_processor.py +1208 -0
- package/tools/curl_settings.py +503 -0
- package/tools/curl_tool.py +5467 -0
- package/tools/diff_viewer.py +1072 -0
- package/tools/email_extraction_tool.py +249 -0
- package/tools/email_header_analyzer.py +426 -0
- package/tools/extraction_tools.py +250 -0
- package/tools/find_replace.py +1751 -0
- package/tools/folder_file_reporter.py +1463 -0
- package/tools/folder_file_reporter_adapter.py +480 -0
- package/tools/generator_tools.py +1217 -0
- package/tools/hash_generator.py +256 -0
- package/tools/html_tool.py +657 -0
- package/tools/huggingface_helper.py +449 -0
- package/tools/jsonxml_tool.py +730 -0
- package/tools/line_tools.py +419 -0
- package/tools/list_comparator.py +720 -0
- package/tools/markdown_tools.py +562 -0
- package/tools/mcp_widget.py +1417 -0
- package/tools/notes_widget.py +973 -0
- package/tools/number_base_converter.py +373 -0
- package/tools/regex_extractor.py +572 -0
- package/tools/slug_generator.py +311 -0
- package/tools/sorter_tools.py +459 -0
- package/tools/string_escape_tool.py +393 -0
- package/tools/text_statistics_tool.py +366 -0
- package/tools/text_wrapper.py +431 -0
- package/tools/timestamp_converter.py +422 -0
- package/tools/tool_loader.py +710 -0
- package/tools/translator_tools.py +523 -0
- package/tools/url_link_extractor.py +262 -0
- package/tools/url_parser.py +205 -0
- package/tools/whitespace_tools.py +356 -0
- package/tools/word_frequency_counter.py +147 -0
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Optimized Pattern Engine for fast text analysis with minimal regex usage.
|
|
3
|
+
Provides specialized algorithms optimized for different text sizes with Unicode awareness.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
import unicodedata
|
|
8
|
+
from typing import Dict, Optional, Tuple, List
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from functools import lru_cache
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class TextStructure:
    """Detailed text structure analysis."""
    # Result record filled in by OptimizedPatternEngine.analyze_text_structure.
    # Every field has a default, so TextStructure() is the "empty text" result.

    # Size of the text. NOTE: analyze_text_structure stores the UTF-8 byte
    # count here, not the number of code points.
    char_count: int = 0
    # Number of words found by the engine's word counter.
    word_count: int = 0
    # Number of sentence terminator runs found by the engine's sentence counter.
    sentence_count: int = 0
    # Number of lines (a trailing newline does not add an extra line).
    line_count: int = 0
    # Number of blocks of non-blank lines separated by blank lines.
    paragraph_count: int = 0
    # Number of characters for which str.isspace() is true.
    whitespace_count: int = 0
    # Number of characters whose Unicode category starts with "P".
    punctuation_count: int = 0

    # Performance metadata
    # Size bucket label chosen by analyze_text_structure.
    processing_method: str = "fast"  # fast, standard, regex
    # Elapsed analysis time in milliseconds.
    processing_time_ms: float = 0.0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OptimizedPatternEngine:
    """
    High-performance pattern engine that minimizes regex usage.

    Uses string-based counting methods where possible and compiled regex
    patterns with caching. All counters accept a ``str`` and return ``0`` for
    empty input.
    """

    # Sentence ending punctuation
    SENTENCE_ENDINGS = frozenset('.!?')

    # Common whitespace characters
    WHITESPACE_CHARS = frozenset(' \t\n\r\f\v')

    # Word boundary characters (optimized set)
    WORD_BOUNDARIES = frozenset(' \t\n\r\f\v.,;:!?()[]{}"\'-—–')

    def __init__(self):
        """Initialize the optimized pattern engine with compiled regex patterns."""
        # Patterns are compiled once per engine so the counters never pay the
        # compilation cost on the hot path.
        self._word_pattern = re.compile(r'\b\w+\b', re.UNICODE)
        self._sentence_pattern = re.compile(r'[.!?]+(?:\s|$)', re.UNICODE)
        self._paragraph_pattern = re.compile(r'\n\s*\n', re.UNICODE)
        self._whitespace_pattern = re.compile(r'\s+', re.UNICODE)

        # Complex sentence pattern for edge cases (not referenced by any
        # method of this class; kept for callers that may use it directly).
        self._complex_sentence_pattern = re.compile(
            r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$',
            re.UNICODE
        )

        # Per-instance FIFO cache for dynamically supplied patterns.
        self._pattern_cache: Dict[str, re.Pattern] = {}
        self._cache_max_size = 50

    def count_words_fast(self, text: str) -> int:
        """
        Count the words in ``text``.

        Text shorter than 500 characters is split on whitespace; longer text
        is scanned with the compiled ``\\b\\w+\\b`` pattern. The two strategies
        can disagree on tokens that contain punctuation (for example a
        hyphenated word is one whitespace token but two regex matches).

        Args:
            text: Text to analyze

        Returns:
            Word count (0 for empty text)
        """
        if not text:
            return 0

        # For small text, use simple split (fastest)
        if len(text) < 500:
            return self._count_words_simple(text)

        # For all other sizes, use regex
        return len(self._word_pattern.findall(text))

    def _count_words_simple(self, text: str) -> int:
        """Count whitespace-separated tokens in ``text``."""
        # str.split() without arguments never yields empty strings, so no
        # additional filtering is required.
        return len(text.split())

    def _count_words_optimized(self, text: str) -> int:
        """
        Count words with a single character scan.

        A word starts at an alphanumeric character or underscore and ends at
        the next whitespace character or member of ``WORD_BOUNDARIES``. Other
        characters neither start nor end a word.

        Args:
            text: Text to analyze

        Returns:
            Word count
        """
        word_count = 0
        in_word = False
        boundaries = self.WORD_BOUNDARIES

        for char in text:
            if char in boundaries or char.isspace():
                if in_word:
                    word_count += 1
                    in_word = False
            elif char.isalnum() or char == '_':
                in_word = True

        # Count last word if text ends with a word character
        if in_word:
            word_count += 1

        return word_count

    def count_sentences_fast(self, text: str) -> int:
        """
        Count sentences using the compiled terminator pattern.

        A sentence end is a run of ``.``, ``!`` or ``?`` followed by a
        whitespace character or the end of the text.

        Args:
            text: Text to analyze

        Returns:
            Sentence count (0 for empty text)
        """
        if not text:
            return 0

        return len(self._sentence_pattern.findall(text))

    def _count_sentences_simple(self, text: str) -> int:
        """
        Count sentences with a single character scan.

        A terminator (``.``, ``!`` or ``?``) counts when it is the last
        character of the text or is immediately followed by whitespace, so a
        run such as ``!!!`` is counted once, at its final character.

        Args:
            text: Text to analyze

        Returns:
            Sentence count; at least 1 when the text contains any
            non-whitespace character, 0 otherwise
        """
        endings = self.SENTENCE_ENDINGS
        last_index = len(text) - 1
        sentence_count = 0

        for index, char in enumerate(text):
            if char not in endings:
                continue
            # Only the terminator that closes a run can be followed by
            # whitespace or end of text, so no explicit run skipping is needed.
            if index == last_index or text[index + 1].isspace():
                sentence_count += 1

        # Text without any terminator still counts as one sentence.
        return max(sentence_count, 1 if text.strip() else 0)

    def count_lines_fast(self, text: str) -> int:
        """
        Count lines by counting newline characters.

        Args:
            text: Text to analyze

        Returns:
            Line count; a single trailing newline does not add a line
        """
        if not text:
            return 0

        # Simple and fast: count newlines and add 1
        line_count = text.count('\n') + 1

        # A trailing newline terminates the last line rather than starting a
        # new one.
        if text.endswith('\n'):
            line_count -= 1

        return max(line_count, 1 if text.strip() else 0)

    def count_paragraphs_fast(self, text: str) -> int:
        """
        Count paragraphs (blocks of text separated by blank lines).

        Args:
            text: Text to analyze

        Returns:
            Paragraph count (0 for empty or whitespace-only text)
        """
        # Same falsy guard as the other counters, then reject whitespace-only
        # text.
        if not text or not text.strip():
            return 0

        # For small text, use simple method
        if len(text) < 5000:
            return self._count_paragraphs_simple(text)

        # For larger text, use regex
        chunks = self._paragraph_pattern.split(text)
        return sum(1 for chunk in chunks if chunk.strip())

    def _count_paragraphs_simple(self, text: str) -> int:
        """Count runs of consecutive non-blank lines in ``text``."""
        paragraph_count = 0
        in_paragraph = False

        for line in text.split('\n'):
            if line.strip():
                if not in_paragraph:
                    paragraph_count += 1
                    in_paragraph = True
            else:
                in_paragraph = False

        return paragraph_count

    def count_characters_unicode_aware(self, text: str) -> Tuple[int, int]:
        """
        Count characters with Unicode awareness.

        Args:
            text: Text to analyze

        Returns:
            Tuple of (character_count, byte_count), where character_count is
            the number of code points and byte_count is the UTF-8 encoded size
        """
        if not text:
            return (0, 0)

        return (len(text), len(text.encode('utf-8')))

    def analyze_text_structure(self, text: str) -> "TextStructure":
        """
        Comprehensive text structure analysis using optimized methods.

        Args:
            text: Text to analyze

        Returns:
            TextStructure with detailed analysis; a default-valued
            TextStructure for empty text
        """
        import time

        # perf_counter is monotonic, so the elapsed time cannot go negative or
        # jump when the system clock is adjusted.
        start_time = time.perf_counter()

        structure = TextStructure()

        if not text:
            return structure

        # Label the size bucket of the input.
        text_size = len(text)
        if text_size < 1000:
            structure.processing_method = "fast"
        elif text_size < 50000:
            structure.processing_method = "standard"
        else:
            structure.processing_method = "regex"

        # Character counts
        _, byte_count = self.count_characters_unicode_aware(text)
        structure.char_count = byte_count  # Use byte count for consistency

        structure.line_count = self.count_lines_fast(text)
        structure.word_count = self.count_words_fast(text)
        structure.sentence_count = self.count_sentences_fast(text)
        structure.paragraph_count = self.count_paragraphs_fast(text)

        # Whitespace count (every character str.isspace() accepts)
        structure.whitespace_count = sum(1 for c in text if c.isspace())

        # Punctuation count (all Unicode "P*" categories)
        structure.punctuation_count = sum(
            1 for c in text
            if unicodedata.category(c).startswith('P')
        )

        # Record processing time
        structure.processing_time_ms = (time.perf_counter() - start_time) * 1000

        return structure

    # NOTE(review): lru_cache on an instance method lives on the function
    # object, so it is shared by every engine instance, keys on ``self`` and
    # keeps instances referenced while their entries are cached. It is kept
    # because ``cache_clear``/``cache_info`` are used below and may be used by
    # callers; consider a per-instance cache if multiple engines are created.
    @lru_cache(maxsize=100)
    def get_compiled_pattern(self, pattern: str, flags: int = 0) -> re.Pattern:
        """
        Get a compiled regex pattern with caching.

        Args:
            pattern: Regex pattern string
            flags: Regex flags

        Returns:
            Compiled regex pattern

        Raises:
            re.error: If ``pattern`` is not a valid regular expression
        """
        cache_key = f"{pattern}_{flags}"

        if cache_key not in self._pattern_cache:
            compiled = re.compile(pattern, flags)

            # Evict the oldest entry (dicts preserve insertion order) once the
            # per-instance cache is full.
            if len(self._pattern_cache) >= self._cache_max_size:
                oldest_key = next(iter(self._pattern_cache))
                del self._pattern_cache[oldest_key]

            self._pattern_cache[cache_key] = compiled

        return self._pattern_cache[cache_key]

    def find_all_optimized(self, pattern: str, text: str, flags: int = 0) -> List[str]:
        """
        Find all matches using cached compiled pattern.

        Args:
            pattern: Regex pattern
            text: Text to search
            flags: Regex flags

        Returns:
            List of matches, as returned by ``re.Pattern.findall``
        """
        return self.get_compiled_pattern(pattern, flags).findall(text)

    def count_pattern_optimized(self, pattern: str, text: str, flags: int = 0) -> int:
        """
        Count pattern matches using cached compiled pattern.

        Args:
            pattern: Regex pattern
            text: Text to search
            flags: Regex flags

        Returns:
            Match count
        """
        return len(self.find_all_optimized(pattern, text, flags))

    def is_unicode_text(self, text: str) -> bool:
        """
        Check if text contains non-ASCII Unicode characters.

        Args:
            text: Text to check

        Returns:
            True if text contains Unicode characters beyond ASCII
        """
        if not text:
            return False

        # Fast check: encoding to ASCII fails exactly when a non-ASCII
        # character is present.
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return True
        return False

    def normalize_unicode(self, text: str, form: str = 'NFC') -> str:
        """
        Normalize Unicode text for consistent processing.

        Args:
            text: Text to normalize
            form: Normalization form (NFC, NFD, NFKC, NFKD)

        Returns:
            Normalized text
        """
        return unicodedata.normalize(form, text)

    def clear_pattern_cache(self):
        """Clear the per-instance pattern cache and the shared LRU cache."""
        self._pattern_cache.clear()
        # Clear LRU cache
        self.get_compiled_pattern.cache_clear()

    def get_cache_info(self) -> Dict[str, int]:
        """
        Get pattern cache information.

        Returns:
            Dictionary with cache statistics for the per-instance pattern
            cache and the LRU cache on ``get_compiled_pattern``
        """
        lru_info = self.get_compiled_pattern.cache_info()

        return {
            'pattern_cache_size': len(self._pattern_cache),
            'pattern_cache_max_size': self._cache_max_size,
            'lru_cache_hits': lru_info.hits,
            'lru_cache_misses': lru_info.misses,
            'lru_cache_size': lru_info.currsize,
            'lru_cache_max_size': lru_info.maxsize
        }
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
# Process-wide engine, created on first request.
_global_pattern_engine: Optional[OptimizedPatternEngine] = None


def get_pattern_engine() -> OptimizedPatternEngine:
    """
    Return the shared OptimizedPatternEngine, creating it on first use.

    Returns:
        The module-level OptimizedPatternEngine instance
    """
    global _global_pattern_engine
    engine = _global_pattern_engine
    if engine is None:
        # First call: build the engine and remember it for later callers.
        engine = OptimizedPatternEngine()
        _global_pattern_engine = engine
    return engine
|