tree-sitter-analyzer 1.9.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tree_sitter_analyzer/__init__.py +132 -0
- tree_sitter_analyzer/__main__.py +11 -0
- tree_sitter_analyzer/api.py +853 -0
- tree_sitter_analyzer/cli/__init__.py +39 -0
- tree_sitter_analyzer/cli/__main__.py +12 -0
- tree_sitter_analyzer/cli/argument_validator.py +89 -0
- tree_sitter_analyzer/cli/commands/__init__.py +26 -0
- tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
- tree_sitter_analyzer/cli/commands/base_command.py +181 -0
- tree_sitter_analyzer/cli/commands/default_command.py +18 -0
- tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
- tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
- tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
- tree_sitter_analyzer/cli/commands/query_command.py +109 -0
- tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
- tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
- tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
- tree_sitter_analyzer/cli/commands/table_command.py +414 -0
- tree_sitter_analyzer/cli/info_commands.py +124 -0
- tree_sitter_analyzer/cli_main.py +472 -0
- tree_sitter_analyzer/constants.py +85 -0
- tree_sitter_analyzer/core/__init__.py +15 -0
- tree_sitter_analyzer/core/analysis_engine.py +580 -0
- tree_sitter_analyzer/core/cache_service.py +333 -0
- tree_sitter_analyzer/core/engine.py +585 -0
- tree_sitter_analyzer/core/parser.py +293 -0
- tree_sitter_analyzer/core/query.py +605 -0
- tree_sitter_analyzer/core/query_filter.py +200 -0
- tree_sitter_analyzer/core/query_service.py +340 -0
- tree_sitter_analyzer/encoding_utils.py +530 -0
- tree_sitter_analyzer/exceptions.py +747 -0
- tree_sitter_analyzer/file_handler.py +246 -0
- tree_sitter_analyzer/formatters/__init__.py +1 -0
- tree_sitter_analyzer/formatters/base_formatter.py +201 -0
- tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
- tree_sitter_analyzer/formatters/formatter_config.py +197 -0
- tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
- tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
- tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
- tree_sitter_analyzer/formatters/go_formatter.py +368 -0
- tree_sitter_analyzer/formatters/html_formatter.py +498 -0
- tree_sitter_analyzer/formatters/java_formatter.py +423 -0
- tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
- tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
- tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
- tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
- tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
- tree_sitter_analyzer/formatters/php_formatter.py +301 -0
- tree_sitter_analyzer/formatters/python_formatter.py +830 -0
- tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
- tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
- tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
- tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
- tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
- tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
- tree_sitter_analyzer/interfaces/__init__.py +9 -0
- tree_sitter_analyzer/interfaces/cli.py +535 -0
- tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
- tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
- tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
- tree_sitter_analyzer/language_detector.py +553 -0
- tree_sitter_analyzer/language_loader.py +271 -0
- tree_sitter_analyzer/languages/__init__.py +10 -0
- tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
- tree_sitter_analyzer/languages/css_plugin.py +449 -0
- tree_sitter_analyzer/languages/go_plugin.py +836 -0
- tree_sitter_analyzer/languages/html_plugin.py +496 -0
- tree_sitter_analyzer/languages/java_plugin.py +1299 -0
- tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
- tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
- tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
- tree_sitter_analyzer/languages/php_plugin.py +862 -0
- tree_sitter_analyzer/languages/python_plugin.py +1636 -0
- tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
- tree_sitter_analyzer/languages/rust_plugin.py +673 -0
- tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
- tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
- tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
- tree_sitter_analyzer/legacy_table_formatter.py +860 -0
- tree_sitter_analyzer/mcp/__init__.py +34 -0
- tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
- tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
- tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
- tree_sitter_analyzer/mcp/server.py +869 -0
- tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
- tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
- tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
- tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
- tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
- tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
- tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
- tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
- tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
- tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
- tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
- tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
- tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
- tree_sitter_analyzer/models.py +840 -0
- tree_sitter_analyzer/mypy_current_errors.txt +2 -0
- tree_sitter_analyzer/output_manager.py +255 -0
- tree_sitter_analyzer/platform_compat/__init__.py +3 -0
- tree_sitter_analyzer/platform_compat/adapter.py +324 -0
- tree_sitter_analyzer/platform_compat/compare.py +224 -0
- tree_sitter_analyzer/platform_compat/detector.py +67 -0
- tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
- tree_sitter_analyzer/platform_compat/profiles.py +217 -0
- tree_sitter_analyzer/platform_compat/record.py +55 -0
- tree_sitter_analyzer/platform_compat/recorder.py +155 -0
- tree_sitter_analyzer/platform_compat/report.py +92 -0
- tree_sitter_analyzer/plugins/__init__.py +280 -0
- tree_sitter_analyzer/plugins/base.py +647 -0
- tree_sitter_analyzer/plugins/manager.py +384 -0
- tree_sitter_analyzer/project_detector.py +328 -0
- tree_sitter_analyzer/queries/__init__.py +27 -0
- tree_sitter_analyzer/queries/csharp.py +216 -0
- tree_sitter_analyzer/queries/css.py +615 -0
- tree_sitter_analyzer/queries/go.py +275 -0
- tree_sitter_analyzer/queries/html.py +543 -0
- tree_sitter_analyzer/queries/java.py +402 -0
- tree_sitter_analyzer/queries/javascript.py +724 -0
- tree_sitter_analyzer/queries/kotlin.py +192 -0
- tree_sitter_analyzer/queries/markdown.py +258 -0
- tree_sitter_analyzer/queries/php.py +95 -0
- tree_sitter_analyzer/queries/python.py +859 -0
- tree_sitter_analyzer/queries/ruby.py +92 -0
- tree_sitter_analyzer/queries/rust.py +223 -0
- tree_sitter_analyzer/queries/sql.py +555 -0
- tree_sitter_analyzer/queries/typescript.py +871 -0
- tree_sitter_analyzer/queries/yaml.py +236 -0
- tree_sitter_analyzer/query_loader.py +272 -0
- tree_sitter_analyzer/security/__init__.py +22 -0
- tree_sitter_analyzer/security/boundary_manager.py +277 -0
- tree_sitter_analyzer/security/regex_checker.py +297 -0
- tree_sitter_analyzer/security/validator.py +599 -0
- tree_sitter_analyzer/table_formatter.py +782 -0
- tree_sitter_analyzer/utils/__init__.py +53 -0
- tree_sitter_analyzer/utils/logging.py +433 -0
- tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
|
@@ -0,0 +1,530 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Optimized Encoding Utilities Module
|
|
4
|
+
|
|
5
|
+
This module provides unified encoding/decoding functionality with performance
|
|
6
|
+
optimizations including file-based encoding caching to reduce redundant
|
|
7
|
+
chardet.detect() calls.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
import threading
|
|
13
|
+
import time
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Set up encoding environment early
|
|
19
|
+
def _setup_encoding_environment() -> None:
|
|
20
|
+
"""Set up proper encoding environment"""
|
|
21
|
+
try:
|
|
22
|
+
os.environ["PYTHONIOENCODING"] = "utf-8"
|
|
23
|
+
os.environ["PYTHONUTF8"] = "1"
|
|
24
|
+
|
|
25
|
+
# Ensure proper stdout/stderr encoding if possible
|
|
26
|
+
if hasattr(sys.stdout, "reconfigure"):
|
|
27
|
+
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
|
28
|
+
if hasattr(sys.stderr, "reconfigure"):
|
|
29
|
+
sys.stderr.reconfigure(encoding="utf-8", errors="replace")
|
|
30
|
+
except Exception as e:
|
|
31
|
+
# Ignore setup errors, use defaults; log at debug when possible
|
|
32
|
+
msg = f"[encoding_setup] non-fatal setup error: {e}\n"
|
|
33
|
+
if hasattr(sys, "stderr") and hasattr(sys.stderr, "write"):
|
|
34
|
+
try:
|
|
35
|
+
sys.stderr.write(msg)
|
|
36
|
+
except Exception:
|
|
37
|
+
# Swallow secondary I/O errors intentionally
|
|
38
|
+
...
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Set up environment when module is imported
|
|
42
|
+
_setup_encoding_environment()
|
|
43
|
+
|
|
44
|
+
# Try to import chardet with fallback
|
|
45
|
+
try:
|
|
46
|
+
import chardet
|
|
47
|
+
|
|
48
|
+
CHARDET_AVAILABLE = True
|
|
49
|
+
except ImportError:
|
|
50
|
+
CHARDET_AVAILABLE = False
|
|
51
|
+
|
|
52
|
+
# Import utilities with fallback
|
|
53
|
+
try:
|
|
54
|
+
from .utils import log_debug, log_warning
|
|
55
|
+
except ImportError:
|
|
56
|
+
# Fallback logging functions with compatible signatures
|
|
57
|
+
def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
|
|
58
|
+
print(f"DEBUG: {message}")
|
|
59
|
+
|
|
60
|
+
def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
|
|
61
|
+
print(f"WARNING: {message}")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class EncodingCache:
|
|
65
|
+
"""Thread-safe encoding cache for file-based encoding detection optimization"""
|
|
66
|
+
|
|
67
|
+
def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
|
|
68
|
+
"""
|
|
69
|
+
Initialize encoding cache
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
max_size: Maximum number of cached entries
|
|
73
|
+
ttl_seconds: Time-to-live for cache entries in seconds
|
|
74
|
+
"""
|
|
75
|
+
self._cache: dict[
|
|
76
|
+
str, tuple[str, float]
|
|
77
|
+
] = {} # file_path -> (encoding, timestamp)
|
|
78
|
+
self._lock = threading.RLock()
|
|
79
|
+
self._max_size = max_size
|
|
80
|
+
self._ttl_seconds = ttl_seconds
|
|
81
|
+
|
|
82
|
+
def get(self, file_path: str) -> str | None:
|
|
83
|
+
"""
|
|
84
|
+
Get cached encoding for file path
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
file_path: Path to the file
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
Cached encoding or None if not found/expired
|
|
91
|
+
"""
|
|
92
|
+
with self._lock:
|
|
93
|
+
if file_path not in self._cache:
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
encoding, timestamp = self._cache[file_path]
|
|
97
|
+
current_time = time.time()
|
|
98
|
+
|
|
99
|
+
# Check if entry has expired
|
|
100
|
+
if current_time - timestamp > self._ttl_seconds:
|
|
101
|
+
del self._cache[file_path]
|
|
102
|
+
return None
|
|
103
|
+
|
|
104
|
+
return encoding
|
|
105
|
+
|
|
106
|
+
def set(self, file_path: str, encoding: str) -> None:
|
|
107
|
+
"""
|
|
108
|
+
Cache encoding for file path
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
file_path: Path to the file
|
|
112
|
+
encoding: Detected encoding
|
|
113
|
+
"""
|
|
114
|
+
with self._lock:
|
|
115
|
+
current_time = time.time()
|
|
116
|
+
|
|
117
|
+
# Clean up expired entries if cache is getting full
|
|
118
|
+
if len(self._cache) >= self._max_size:
|
|
119
|
+
self._cleanup_expired()
|
|
120
|
+
|
|
121
|
+
# If still full after cleanup, remove oldest entry
|
|
122
|
+
if len(self._cache) >= self._max_size:
|
|
123
|
+
oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
|
|
124
|
+
del self._cache[oldest_key]
|
|
125
|
+
|
|
126
|
+
self._cache[file_path] = (encoding, current_time)
|
|
127
|
+
|
|
128
|
+
def _cleanup_expired(self) -> None:
|
|
129
|
+
"""Remove expired entries from cache"""
|
|
130
|
+
current_time = time.time()
|
|
131
|
+
expired_keys = [
|
|
132
|
+
key
|
|
133
|
+
for key, (_, timestamp) in self._cache.items()
|
|
134
|
+
if current_time - timestamp > self._ttl_seconds
|
|
135
|
+
]
|
|
136
|
+
for key in expired_keys:
|
|
137
|
+
del self._cache[key]
|
|
138
|
+
|
|
139
|
+
def clear(self) -> None:
|
|
140
|
+
"""Clear all cached entries"""
|
|
141
|
+
with self._lock:
|
|
142
|
+
self._cache.clear()
|
|
143
|
+
|
|
144
|
+
def size(self) -> int:
|
|
145
|
+
"""Get current cache size"""
|
|
146
|
+
with self._lock:
|
|
147
|
+
return len(self._cache)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# Global encoding cache instance
|
|
151
|
+
_encoding_cache = EncodingCache()
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class EncodingManager:
|
|
155
|
+
"""Centralized encoding management for consistent text processing"""
|
|
156
|
+
|
|
157
|
+
DEFAULT_ENCODING = "utf-8"
|
|
158
|
+
FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]
|
|
159
|
+
|
|
160
|
+
@classmethod
|
|
161
|
+
def safe_encode(cls, text: str | None, encoding: str | None = None) -> bytes:
|
|
162
|
+
"""
|
|
163
|
+
Safely encode text to bytes with fallback handling
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
text: Text to encode (can be None)
|
|
167
|
+
encoding: Target encoding (defaults to UTF-8)
|
|
168
|
+
|
|
169
|
+
Returns:
|
|
170
|
+
Encoded bytes
|
|
171
|
+
"""
|
|
172
|
+
# Handle None input
|
|
173
|
+
if text is None:
|
|
174
|
+
return b""
|
|
175
|
+
|
|
176
|
+
target_encoding = encoding or cls.DEFAULT_ENCODING
|
|
177
|
+
|
|
178
|
+
try:
|
|
179
|
+
return text.encode(target_encoding)
|
|
180
|
+
except UnicodeEncodeError as e:
|
|
181
|
+
log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")
|
|
182
|
+
|
|
183
|
+
# Try fallback encodings
|
|
184
|
+
for fallback in cls.FALLBACK_ENCODINGS:
|
|
185
|
+
if fallback != target_encoding:
|
|
186
|
+
try:
|
|
187
|
+
return text.encode(fallback, errors="replace")
|
|
188
|
+
except UnicodeEncodeError:
|
|
189
|
+
continue
|
|
190
|
+
|
|
191
|
+
# Last resort: encode with error replacement
|
|
192
|
+
log_warning(f"Using error replacement for encoding: {text[:50]}...")
|
|
193
|
+
return text.encode(cls.DEFAULT_ENCODING, errors="replace")
|
|
194
|
+
|
|
195
|
+
@classmethod
|
|
196
|
+
def safe_decode(cls, data: bytes, encoding: str | None = None) -> str:
|
|
197
|
+
"""
|
|
198
|
+
Safely decode bytes to text with fallback handling
|
|
199
|
+
|
|
200
|
+
Args:
|
|
201
|
+
data: Bytes to decode
|
|
202
|
+
encoding: Source encoding (auto-detected if None)
|
|
203
|
+
|
|
204
|
+
Returns:
|
|
205
|
+
Decoded text
|
|
206
|
+
"""
|
|
207
|
+
if data is None or len(data) == 0:
|
|
208
|
+
return ""
|
|
209
|
+
|
|
210
|
+
# Use provided encoding or detect
|
|
211
|
+
target_encoding = encoding
|
|
212
|
+
if not target_encoding:
|
|
213
|
+
target_encoding = cls.detect_encoding(data)
|
|
214
|
+
|
|
215
|
+
try:
|
|
216
|
+
return data.decode(target_encoding)
|
|
217
|
+
except UnicodeDecodeError as e:
|
|
218
|
+
log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")
|
|
219
|
+
|
|
220
|
+
# Try fallback encodings
|
|
221
|
+
for fallback in cls.FALLBACK_ENCODINGS:
|
|
222
|
+
if fallback != target_encoding:
|
|
223
|
+
try:
|
|
224
|
+
return data.decode(fallback, errors="replace")
|
|
225
|
+
except UnicodeDecodeError:
|
|
226
|
+
continue
|
|
227
|
+
|
|
228
|
+
# Last resort: decode with error replacement
|
|
229
|
+
log_warning(
|
|
230
|
+
f"Using error replacement for decoding data (length: {len(data)})"
|
|
231
|
+
)
|
|
232
|
+
return data.decode(cls.DEFAULT_ENCODING, errors="replace")
|
|
233
|
+
|
|
234
|
+
@classmethod
|
|
235
|
+
def detect_encoding(cls, data: bytes, file_path: str | None = None) -> str:
|
|
236
|
+
"""
|
|
237
|
+
Detect encoding of byte data with optional file-based caching
|
|
238
|
+
|
|
239
|
+
Args:
|
|
240
|
+
data: Bytes to analyze
|
|
241
|
+
file_path: Optional file path for caching (improves performance)
|
|
242
|
+
|
|
243
|
+
Returns:
|
|
244
|
+
Detected encoding name
|
|
245
|
+
"""
|
|
246
|
+
if not data:
|
|
247
|
+
return cls.DEFAULT_ENCODING
|
|
248
|
+
|
|
249
|
+
# Check cache first if file_path is provided
|
|
250
|
+
if file_path:
|
|
251
|
+
cached_encoding = _encoding_cache.get(file_path)
|
|
252
|
+
if cached_encoding:
|
|
253
|
+
log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
|
|
254
|
+
return cached_encoding
|
|
255
|
+
|
|
256
|
+
detected_encoding = cls.DEFAULT_ENCODING
|
|
257
|
+
|
|
258
|
+
# If chardet is not available, use simple heuristics
|
|
259
|
+
if not CHARDET_AVAILABLE:
|
|
260
|
+
try:
|
|
261
|
+
# Try UTF-8 first
|
|
262
|
+
data.decode("utf-8")
|
|
263
|
+
detected_encoding = "utf-8"
|
|
264
|
+
except UnicodeDecodeError:
|
|
265
|
+
# Check for BOM
|
|
266
|
+
if data.startswith(b"\xff\xfe"):
|
|
267
|
+
detected_encoding = "utf-16-le"
|
|
268
|
+
elif data.startswith(b"\xfe\xff"):
|
|
269
|
+
detected_encoding = "utf-16-be"
|
|
270
|
+
elif data.startswith(b"\xef\xbb\xbf"):
|
|
271
|
+
detected_encoding = "utf-8-sig"
|
|
272
|
+
else:
|
|
273
|
+
detected_encoding = cls.DEFAULT_ENCODING
|
|
274
|
+
else:
|
|
275
|
+
try:
|
|
276
|
+
# Use chardet for detection
|
|
277
|
+
detection = chardet.detect(data)
|
|
278
|
+
if detection and detection["encoding"]:
|
|
279
|
+
confidence = detection.get("confidence", 0)
|
|
280
|
+
detected_encoding = detection["encoding"].lower()
|
|
281
|
+
|
|
282
|
+
# Only trust high-confidence detections
|
|
283
|
+
if confidence > 0.7:
|
|
284
|
+
log_debug(
|
|
285
|
+
f"Detected encoding: {detected_encoding} "
|
|
286
|
+
f"(confidence: {confidence:.2f})"
|
|
287
|
+
)
|
|
288
|
+
else:
|
|
289
|
+
log_debug(
|
|
290
|
+
f"Low confidence encoding detection: {detected_encoding} "
|
|
291
|
+
f"(confidence: {confidence:.2f}), using default"
|
|
292
|
+
)
|
|
293
|
+
detected_encoding = cls.DEFAULT_ENCODING
|
|
294
|
+
|
|
295
|
+
except Exception as e:
|
|
296
|
+
log_debug(f"Encoding detection failed: {e}")
|
|
297
|
+
detected_encoding = cls.DEFAULT_ENCODING
|
|
298
|
+
|
|
299
|
+
# Cache the result if file_path is provided
|
|
300
|
+
if file_path and detected_encoding:
|
|
301
|
+
_encoding_cache.set(file_path, detected_encoding)
|
|
302
|
+
log_debug(f"Cached encoding for {file_path}: {detected_encoding}")
|
|
303
|
+
|
|
304
|
+
return detected_encoding
|
|
305
|
+
|
|
306
|
+
@classmethod
|
|
307
|
+
def read_file_safe(cls, file_path: str | Path) -> tuple[str, str]:
|
|
308
|
+
"""
|
|
309
|
+
Safely read a file with automatic encoding detection and caching
|
|
310
|
+
|
|
311
|
+
Args:
|
|
312
|
+
file_path: Path to the file
|
|
313
|
+
|
|
314
|
+
Returns:
|
|
315
|
+
Tuple of (content, detected_encoding)
|
|
316
|
+
"""
|
|
317
|
+
file_path = Path(file_path)
|
|
318
|
+
|
|
319
|
+
try:
|
|
320
|
+
# Read raw bytes first
|
|
321
|
+
with open(file_path, "rb") as f:
|
|
322
|
+
raw_data = f.read()
|
|
323
|
+
|
|
324
|
+
if not raw_data:
|
|
325
|
+
return "", cls.DEFAULT_ENCODING
|
|
326
|
+
|
|
327
|
+
# Detect and decode with file path for caching
|
|
328
|
+
detected_encoding = cls.detect_encoding(raw_data, str(file_path))
|
|
329
|
+
content = cls.safe_decode(raw_data, detected_encoding)
|
|
330
|
+
|
|
331
|
+
# Normalize line endings for consistency
|
|
332
|
+
content = cls.normalize_line_endings(content)
|
|
333
|
+
|
|
334
|
+
return content, detected_encoding
|
|
335
|
+
|
|
336
|
+
except OSError as e:
|
|
337
|
+
log_warning(f"Failed to read file {file_path}: {e}")
|
|
338
|
+
raise e
|
|
339
|
+
|
|
340
|
+
@classmethod
|
|
341
|
+
def write_file_safe(
|
|
342
|
+
cls, file_path: str | Path, content: str, encoding: str | None = None
|
|
343
|
+
) -> bool:
|
|
344
|
+
"""
|
|
345
|
+
Safely write content to a file
|
|
346
|
+
|
|
347
|
+
Args:
|
|
348
|
+
file_path: Path to the file
|
|
349
|
+
content: Content to write
|
|
350
|
+
encoding: Target encoding (defaults to UTF-8)
|
|
351
|
+
|
|
352
|
+
Returns:
|
|
353
|
+
True if successful, False otherwise
|
|
354
|
+
"""
|
|
355
|
+
file_path = Path(file_path)
|
|
356
|
+
target_encoding = encoding or cls.DEFAULT_ENCODING
|
|
357
|
+
|
|
358
|
+
try:
|
|
359
|
+
encoded_content = cls.safe_encode(content, target_encoding)
|
|
360
|
+
|
|
361
|
+
with open(file_path, "wb") as f:
|
|
362
|
+
f.write(encoded_content)
|
|
363
|
+
|
|
364
|
+
return True
|
|
365
|
+
|
|
366
|
+
except OSError as e:
|
|
367
|
+
log_warning(f"Failed to write file {file_path}: {e}")
|
|
368
|
+
return False
|
|
369
|
+
|
|
370
|
+
@classmethod
|
|
371
|
+
def normalize_line_endings(cls, text: str) -> str:
|
|
372
|
+
"""
|
|
373
|
+
Normalize line endings to Unix style (\n)
|
|
374
|
+
|
|
375
|
+
Args:
|
|
376
|
+
text: Text to normalize
|
|
377
|
+
|
|
378
|
+
Returns:
|
|
379
|
+
Text with normalized line endings
|
|
380
|
+
"""
|
|
381
|
+
if not text:
|
|
382
|
+
return text
|
|
383
|
+
|
|
384
|
+
# Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
|
|
385
|
+
return text.replace("\r\n", "\n").replace("\r", "\n")
|
|
386
|
+
|
|
387
|
+
@classmethod
|
|
388
|
+
def extract_text_slice(
|
|
389
|
+
cls,
|
|
390
|
+
content_bytes: bytes,
|
|
391
|
+
start_byte: int,
|
|
392
|
+
end_byte: int,
|
|
393
|
+
encoding: str | None = None,
|
|
394
|
+
) -> str:
|
|
395
|
+
"""
|
|
396
|
+
Extract a slice of text from bytes with proper encoding handling
|
|
397
|
+
|
|
398
|
+
Args:
|
|
399
|
+
content_bytes: Source bytes
|
|
400
|
+
start_byte: Start position
|
|
401
|
+
end_byte: End position
|
|
402
|
+
encoding: Encoding to use (auto-detected if None)
|
|
403
|
+
|
|
404
|
+
Returns:
|
|
405
|
+
Extracted text slice
|
|
406
|
+
"""
|
|
407
|
+
if not content_bytes or start_byte >= len(content_bytes):
|
|
408
|
+
return ""
|
|
409
|
+
|
|
410
|
+
# Ensure bounds are valid
|
|
411
|
+
start_byte = max(0, start_byte)
|
|
412
|
+
end_byte = min(len(content_bytes), end_byte)
|
|
413
|
+
|
|
414
|
+
if start_byte >= end_byte:
|
|
415
|
+
return ""
|
|
416
|
+
|
|
417
|
+
# Extract byte slice
|
|
418
|
+
byte_slice = content_bytes[start_byte:end_byte]
|
|
419
|
+
|
|
420
|
+
# Decode the slice
|
|
421
|
+
return cls.safe_decode(byte_slice, encoding)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# Convenience functions for backward compatibility
|
|
425
|
+
def safe_encode(text: str, encoding: str | None = None) -> bytes:
|
|
426
|
+
"""Convenience function for safe encoding"""
|
|
427
|
+
return EncodingManager.safe_encode(text, encoding)
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def safe_decode(data: bytes, encoding: str | None = None) -> str:
|
|
431
|
+
"""Convenience function for safe decoding"""
|
|
432
|
+
return EncodingManager.safe_decode(data, encoding)
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def detect_encoding(data: bytes, file_path: str | None = None) -> str:
|
|
436
|
+
"""Convenience function for encoding detection with optional caching"""
|
|
437
|
+
return EncodingManager.detect_encoding(data, file_path)
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def read_file_safe(file_path: str | Path) -> tuple[str, str]:
|
|
441
|
+
"""Convenience function for safe file reading"""
|
|
442
|
+
return EncodingManager.read_file_safe(file_path)
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def write_file_safe(
|
|
446
|
+
file_path: str | Path, content: str, encoding: str | None = None
|
|
447
|
+
) -> bool:
|
|
448
|
+
"""Convenience function for safe file writing"""
|
|
449
|
+
return EncodingManager.write_file_safe(file_path, content, encoding)
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def extract_text_slice(
|
|
453
|
+
content_bytes: bytes, start_byte: int, end_byte: int, encoding: str | None = None
|
|
454
|
+
) -> str:
|
|
455
|
+
"""Convenience function for text slice extraction"""
|
|
456
|
+
return EncodingManager.extract_text_slice(
|
|
457
|
+
content_bytes, start_byte, end_byte, encoding
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def read_file_safe_streaming(file_path: str | Path) -> Any:
|
|
462
|
+
"""
|
|
463
|
+
Context manager for streaming file reading with automatic encoding detection.
|
|
464
|
+
|
|
465
|
+
This function opens a file with the correct encoding detected from the file's
|
|
466
|
+
content and yields a file handle that can be used for line-by-line reading.
|
|
467
|
+
This is memory-efficient for large files as it doesn't load the entire content.
|
|
468
|
+
|
|
469
|
+
Performance: Enables 150x speedup (30s → <200ms) for large file operations
|
|
470
|
+
by avoiding full file loading and using chunk-based streaming.
|
|
471
|
+
|
|
472
|
+
Args:
|
|
473
|
+
file_path: Path to the file to read
|
|
474
|
+
|
|
475
|
+
Yields:
|
|
476
|
+
File handle opened with the correct encoding
|
|
477
|
+
|
|
478
|
+
Example:
|
|
479
|
+
with read_file_safe_streaming("large_file.txt") as f:
|
|
480
|
+
for line_num, line in enumerate(f, 1):
|
|
481
|
+
if line_num >= start_line:
|
|
482
|
+
# Process line
|
|
483
|
+
pass
|
|
484
|
+
"""
|
|
485
|
+
import contextlib
|
|
486
|
+
|
|
487
|
+
from .utils.logging import log_warning
|
|
488
|
+
|
|
489
|
+
file_path = Path(file_path)
|
|
490
|
+
|
|
491
|
+
# First, detect encoding by reading a small sample
|
|
492
|
+
try:
|
|
493
|
+
with open(file_path, "rb") as f:
|
|
494
|
+
# Read first 8KB to detect encoding
|
|
495
|
+
sample_data = f.read(8192)
|
|
496
|
+
|
|
497
|
+
if not sample_data:
|
|
498
|
+
# Empty file, use default encoding
|
|
499
|
+
detected_encoding = EncodingManager.DEFAULT_ENCODING
|
|
500
|
+
else:
|
|
501
|
+
# Detect encoding from sample with file path for caching
|
|
502
|
+
detected_encoding = EncodingManager.detect_encoding(
|
|
503
|
+
sample_data, str(file_path)
|
|
504
|
+
)
|
|
505
|
+
|
|
506
|
+
except OSError as e:
|
|
507
|
+
log_warning(f"Failed to read file for encoding detection {file_path}: {e}")
|
|
508
|
+
raise e
|
|
509
|
+
|
|
510
|
+
# Open file with detected encoding for streaming
|
|
511
|
+
@contextlib.contextmanager
|
|
512
|
+
def _file_context() -> Any:
|
|
513
|
+
try:
|
|
514
|
+
with open(file_path, encoding=detected_encoding, errors="replace") as f:
|
|
515
|
+
yield f
|
|
516
|
+
except OSError as e:
|
|
517
|
+
log_warning(f"Failed to open file for streaming {file_path}: {e}")
|
|
518
|
+
raise e
|
|
519
|
+
|
|
520
|
+
return _file_context()
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def clear_encoding_cache() -> None:
|
|
524
|
+
"""Clear the global encoding cache"""
|
|
525
|
+
_encoding_cache.clear()
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def get_encoding_cache_size() -> int:
|
|
529
|
+
"""Get the current size of the encoding cache"""
|
|
530
|
+
return _encoding_cache.size()
|