tree-sitter-analyzer 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tree-sitter-analyzer might be problematic. Click here for more details.

Files changed (78) hide show
  1. tree_sitter_analyzer/__init__.py +134 -121
  2. tree_sitter_analyzer/__main__.py +11 -12
  3. tree_sitter_analyzer/api.py +533 -539
  4. tree_sitter_analyzer/cli/__init__.py +39 -39
  5. tree_sitter_analyzer/cli/__main__.py +12 -13
  6. tree_sitter_analyzer/cli/commands/__init__.py +26 -27
  7. tree_sitter_analyzer/cli/commands/advanced_command.py +88 -88
  8. tree_sitter_analyzer/cli/commands/base_command.py +160 -155
  9. tree_sitter_analyzer/cli/commands/default_command.py +18 -19
  10. tree_sitter_analyzer/cli/commands/partial_read_command.py +141 -133
  11. tree_sitter_analyzer/cli/commands/query_command.py +81 -82
  12. tree_sitter_analyzer/cli/commands/structure_command.py +138 -121
  13. tree_sitter_analyzer/cli/commands/summary_command.py +101 -93
  14. tree_sitter_analyzer/cli/commands/table_command.py +235 -233
  15. tree_sitter_analyzer/cli/info_commands.py +120 -121
  16. tree_sitter_analyzer/cli_main.py +278 -276
  17. tree_sitter_analyzer/core/__init__.py +15 -20
  18. tree_sitter_analyzer/core/analysis_engine.py +555 -574
  19. tree_sitter_analyzer/core/cache_service.py +320 -330
  20. tree_sitter_analyzer/core/engine.py +559 -560
  21. tree_sitter_analyzer/core/parser.py +293 -288
  22. tree_sitter_analyzer/core/query.py +502 -502
  23. tree_sitter_analyzer/encoding_utils.py +456 -460
  24. tree_sitter_analyzer/exceptions.py +337 -340
  25. tree_sitter_analyzer/file_handler.py +210 -222
  26. tree_sitter_analyzer/formatters/__init__.py +1 -1
  27. tree_sitter_analyzer/formatters/base_formatter.py +167 -168
  28. tree_sitter_analyzer/formatters/formatter_factory.py +78 -74
  29. tree_sitter_analyzer/formatters/java_formatter.py +291 -270
  30. tree_sitter_analyzer/formatters/python_formatter.py +259 -235
  31. tree_sitter_analyzer/interfaces/__init__.py +9 -10
  32. tree_sitter_analyzer/interfaces/cli.py +528 -557
  33. tree_sitter_analyzer/interfaces/cli_adapter.py +343 -319
  34. tree_sitter_analyzer/interfaces/mcp_adapter.py +206 -170
  35. tree_sitter_analyzer/interfaces/mcp_server.py +405 -416
  36. tree_sitter_analyzer/java_analyzer.py +187 -219
  37. tree_sitter_analyzer/language_detector.py +398 -400
  38. tree_sitter_analyzer/language_loader.py +224 -228
  39. tree_sitter_analyzer/languages/__init__.py +10 -11
  40. tree_sitter_analyzer/languages/java_plugin.py +1174 -1113
  41. tree_sitter_analyzer/{plugins → languages}/javascript_plugin.py +446 -439
  42. tree_sitter_analyzer/languages/python_plugin.py +747 -712
  43. tree_sitter_analyzer/mcp/__init__.py +31 -32
  44. tree_sitter_analyzer/mcp/resources/__init__.py +44 -47
  45. tree_sitter_analyzer/mcp/resources/code_file_resource.py +209 -213
  46. tree_sitter_analyzer/mcp/resources/project_stats_resource.py +555 -550
  47. tree_sitter_analyzer/mcp/server.py +333 -345
  48. tree_sitter_analyzer/mcp/tools/__init__.py +30 -31
  49. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +654 -557
  50. tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +247 -245
  51. tree_sitter_analyzer/mcp/tools/base_tool.py +54 -55
  52. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +300 -302
  53. tree_sitter_analyzer/mcp/tools/table_format_tool.py +362 -359
  54. tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +543 -476
  55. tree_sitter_analyzer/mcp/utils/__init__.py +107 -106
  56. tree_sitter_analyzer/mcp/utils/error_handler.py +549 -549
  57. tree_sitter_analyzer/models.py +470 -481
  58. tree_sitter_analyzer/output_manager.py +255 -264
  59. tree_sitter_analyzer/plugins/__init__.py +280 -334
  60. tree_sitter_analyzer/plugins/base.py +496 -446
  61. tree_sitter_analyzer/plugins/manager.py +379 -355
  62. tree_sitter_analyzer/queries/__init__.py +26 -27
  63. tree_sitter_analyzer/queries/java.py +391 -394
  64. tree_sitter_analyzer/queries/javascript.py +148 -149
  65. tree_sitter_analyzer/queries/python.py +285 -286
  66. tree_sitter_analyzer/queries/typescript.py +229 -230
  67. tree_sitter_analyzer/query_loader.py +257 -260
  68. tree_sitter_analyzer/table_formatter.py +471 -448
  69. tree_sitter_analyzer/utils.py +277 -277
  70. {tree_sitter_analyzer-0.2.0.dist-info → tree_sitter_analyzer-0.4.0.dist-info}/METADATA +23 -8
  71. tree_sitter_analyzer-0.4.0.dist-info/RECORD +73 -0
  72. {tree_sitter_analyzer-0.2.0.dist-info → tree_sitter_analyzer-0.4.0.dist-info}/entry_points.txt +2 -1
  73. tree_sitter_analyzer/plugins/java_plugin.py +0 -625
  74. tree_sitter_analyzer/plugins/plugin_loader.py +0 -83
  75. tree_sitter_analyzer/plugins/python_plugin.py +0 -598
  76. tree_sitter_analyzer/plugins/registry.py +0 -366
  77. tree_sitter_analyzer-0.2.0.dist-info/RECORD +0 -77
  78. {tree_sitter_analyzer-0.2.0.dist-info → tree_sitter_analyzer-0.4.0.dist-info}/WHEEL +0 -0
@@ -1,460 +1,456 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Optimized Encoding Utilities Module
5
-
6
- This module provides unified encoding/decoding functionality with performance
7
- optimizations including file-based encoding caching to reduce redundant
8
- chardet.detect() calls.
9
- """
10
-
11
- import locale
12
- import os
13
- import sys
14
- import threading
15
- import time
16
- from pathlib import Path
17
- from typing import Any, Dict, Optional, Tuple, Union
18
-
19
-
20
- # Set up encoding environment early
21
- def _setup_encoding_environment() -> None:
22
- """Set up proper encoding environment"""
23
- try:
24
- os.environ["PYTHONIOENCODING"] = "utf-8"
25
- os.environ["PYTHONUTF8"] = "1"
26
-
27
- # Ensure proper stdout/stderr encoding if possible
28
- if hasattr(sys.stdout, "reconfigure"):
29
- sys.stdout.reconfigure(encoding="utf-8", errors="replace")
30
- if hasattr(sys.stderr, "reconfigure"):
31
- sys.stderr.reconfigure(encoding="utf-8", errors="replace")
32
- except Exception:
33
- pass # Ignore setup errors, use defaults
34
-
35
-
36
- # Set up environment when module is imported
37
- _setup_encoding_environment()
38
-
39
- # Try to import chardet with fallback
40
- try:
41
- import chardet
42
-
43
- CHARDET_AVAILABLE = True
44
- except ImportError:
45
- CHARDET_AVAILABLE = False
46
-
47
- # Import utilities with fallback
48
- try:
49
- from .utils import log_debug, log_warning
50
- except ImportError:
51
- # Fallback logging functions with compatible signatures
52
- def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
53
- print(f"DEBUG: {message}")
54
-
55
- def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
56
- print(f"WARNING: {message}")
57
-
58
-
59
- class EncodingCache:
60
- """Thread-safe encoding cache for file-based encoding detection optimization"""
61
-
62
- def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
63
- """
64
- Initialize encoding cache
65
-
66
- Args:
67
- max_size: Maximum number of cached entries
68
- ttl_seconds: Time-to-live for cache entries in seconds
69
- """
70
- self._cache: Dict[str, Tuple[str, float]] = (
71
- {}
72
- ) # file_path -> (encoding, timestamp)
73
- self._lock = threading.RLock()
74
- self._max_size = max_size
75
- self._ttl_seconds = ttl_seconds
76
-
77
- def get(self, file_path: str) -> Optional[str]:
78
- """
79
- Get cached encoding for file path
80
-
81
- Args:
82
- file_path: Path to the file
83
-
84
- Returns:
85
- Cached encoding or None if not found/expired
86
- """
87
- with self._lock:
88
- if file_path not in self._cache:
89
- return None
90
-
91
- encoding, timestamp = self._cache[file_path]
92
- current_time = time.time()
93
-
94
- # Check if entry has expired
95
- if current_time - timestamp > self._ttl_seconds:
96
- del self._cache[file_path]
97
- return None
98
-
99
- return encoding
100
-
101
- def set(self, file_path: str, encoding: str) -> None:
102
- """
103
- Cache encoding for file path
104
-
105
- Args:
106
- file_path: Path to the file
107
- encoding: Detected encoding
108
- """
109
- with self._lock:
110
- current_time = time.time()
111
-
112
- # Clean up expired entries if cache is getting full
113
- if len(self._cache) >= self._max_size:
114
- self._cleanup_expired()
115
-
116
- # If still full after cleanup, remove oldest entry
117
- if len(self._cache) >= self._max_size:
118
- oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
119
- del self._cache[oldest_key]
120
-
121
- self._cache[file_path] = (encoding, current_time)
122
-
123
- def _cleanup_expired(self) -> None:
124
- """Remove expired entries from cache"""
125
- current_time = time.time()
126
- expired_keys = [
127
- key
128
- for key, (_, timestamp) in self._cache.items()
129
- if current_time - timestamp > self._ttl_seconds
130
- ]
131
- for key in expired_keys:
132
- del self._cache[key]
133
-
134
- def clear(self) -> None:
135
- """Clear all cached entries"""
136
- with self._lock:
137
- self._cache.clear()
138
-
139
- def size(self) -> int:
140
- """Get current cache size"""
141
- with self._lock:
142
- return len(self._cache)
143
-
144
-
145
- # Global encoding cache instance
146
- _encoding_cache = EncodingCache()
147
-
148
-
149
- class EncodingManager:
150
- """Centralized encoding management for consistent text processing"""
151
-
152
- DEFAULT_ENCODING = "utf-8"
153
- FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]
154
-
155
- @classmethod
156
- def safe_encode(cls, text: str, encoding: Optional[str] = None) -> bytes:
157
- """
158
- Safely encode text to bytes with fallback handling
159
-
160
- Args:
161
- text: Text to encode
162
- encoding: Target encoding (defaults to UTF-8)
163
-
164
- Returns:
165
- Encoded bytes
166
- """
167
- if text is None:
168
- return b""
169
-
170
- target_encoding = encoding or cls.DEFAULT_ENCODING
171
-
172
- try:
173
- return text.encode(target_encoding)
174
- except UnicodeEncodeError as e:
175
- log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")
176
-
177
- # Try fallback encodings
178
- for fallback in cls.FALLBACK_ENCODINGS:
179
- if fallback != target_encoding:
180
- try:
181
- return text.encode(fallback, errors="replace")
182
- except UnicodeEncodeError:
183
- continue
184
-
185
- # Last resort: encode with error replacement
186
- log_warning(f"Using error replacement for encoding: {text[:50]}...")
187
- return text.encode(cls.DEFAULT_ENCODING, errors="replace")
188
-
189
- @classmethod
190
- def safe_decode(cls, data: bytes, encoding: Optional[str] = None) -> str:
191
- """
192
- Safely decode bytes to text with fallback handling
193
-
194
- Args:
195
- data: Bytes to decode
196
- encoding: Source encoding (auto-detected if None)
197
-
198
- Returns:
199
- Decoded text
200
- """
201
- if data is None or len(data) == 0:
202
- return ""
203
-
204
- # Use provided encoding or detect
205
- target_encoding = encoding
206
- if not target_encoding:
207
- target_encoding = cls.detect_encoding(data)
208
-
209
- try:
210
- return data.decode(target_encoding)
211
- except UnicodeDecodeError as e:
212
- log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")
213
-
214
- # Try fallback encodings
215
- for fallback in cls.FALLBACK_ENCODINGS:
216
- if fallback != target_encoding:
217
- try:
218
- return data.decode(fallback, errors="replace")
219
- except UnicodeDecodeError:
220
- continue
221
-
222
- # Last resort: decode with error replacement
223
- log_warning(
224
- f"Using error replacement for decoding data (length: {len(data)})"
225
- )
226
- return data.decode(cls.DEFAULT_ENCODING, errors="replace")
227
-
228
- @classmethod
229
- def detect_encoding(cls, data: bytes, file_path: Optional[str] = None) -> str:
230
- """
231
- Detect encoding of byte data with optional file-based caching
232
-
233
- Args:
234
- data: Bytes to analyze
235
- file_path: Optional file path for caching (improves performance)
236
-
237
- Returns:
238
- Detected encoding name
239
- """
240
- if not data:
241
- return cls.DEFAULT_ENCODING
242
-
243
- # Check cache first if file_path is provided
244
- if file_path:
245
- cached_encoding = _encoding_cache.get(file_path)
246
- if cached_encoding:
247
- log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
248
- return cached_encoding
249
-
250
- detected_encoding = cls.DEFAULT_ENCODING
251
-
252
- # If chardet is not available, use simple heuristics
253
- if not CHARDET_AVAILABLE:
254
- try:
255
- # Try UTF-8 first
256
- data.decode("utf-8")
257
- detected_encoding = "utf-8"
258
- except UnicodeDecodeError:
259
- # Check for BOM
260
- if data.startswith(b"\xff\xfe"):
261
- detected_encoding = "utf-16-le"
262
- elif data.startswith(b"\xfe\xff"):
263
- detected_encoding = "utf-16-be"
264
- elif data.startswith(b"\xef\xbb\xbf"):
265
- detected_encoding = "utf-8-sig"
266
- else:
267
- detected_encoding = cls.DEFAULT_ENCODING
268
- else:
269
- try:
270
- # Use chardet for detection
271
- detection = chardet.detect(data)
272
- if detection and detection["encoding"]:
273
- confidence = detection.get("confidence", 0)
274
- detected_encoding = detection["encoding"].lower()
275
-
276
- # Only trust high-confidence detections
277
- if confidence > 0.7:
278
- log_debug(
279
- f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})"
280
- )
281
- else:
282
- log_debug(
283
- f"Low confidence encoding detection: {detected_encoding} (confidence: {confidence:.2f}), using default"
284
- )
285
- detected_encoding = cls.DEFAULT_ENCODING
286
-
287
- except Exception as e:
288
- log_debug(f"Encoding detection failed: {e}")
289
- detected_encoding = cls.DEFAULT_ENCODING
290
-
291
- # Cache the result if file_path is provided
292
- if file_path and detected_encoding:
293
- _encoding_cache.set(file_path, detected_encoding)
294
- log_debug(f"Cached encoding for {file_path}: {detected_encoding}")
295
-
296
- return detected_encoding
297
-
298
- @classmethod
299
- def read_file_safe(cls, file_path: Union[str, Path]) -> Tuple[str, str]:
300
- """
301
- Safely read a file with automatic encoding detection and caching
302
-
303
- Args:
304
- file_path: Path to the file
305
-
306
- Returns:
307
- Tuple of (content, detected_encoding)
308
- """
309
- file_path = Path(file_path)
310
-
311
- try:
312
- # Read raw bytes first
313
- with open(file_path, "rb") as f:
314
- raw_data = f.read()
315
-
316
- if not raw_data:
317
- return "", cls.DEFAULT_ENCODING
318
-
319
- # Detect and decode with file path for caching
320
- detected_encoding = cls.detect_encoding(raw_data, str(file_path))
321
- content = cls.safe_decode(raw_data, detected_encoding)
322
-
323
- # Normalize line endings for consistency
324
- content = cls.normalize_line_endings(content)
325
-
326
- return content, detected_encoding
327
-
328
- except IOError as e:
329
- log_warning(f"Failed to read file {file_path}: {e}")
330
- raise e
331
-
332
- @classmethod
333
- def write_file_safe(
334
- cls, file_path: Union[str, Path], content: str, encoding: Optional[str] = None
335
- ) -> bool:
336
- """
337
- Safely write content to a file
338
-
339
- Args:
340
- file_path: Path to the file
341
- content: Content to write
342
- encoding: Target encoding (defaults to UTF-8)
343
-
344
- Returns:
345
- True if successful, False otherwise
346
- """
347
- file_path = Path(file_path)
348
- target_encoding = encoding or cls.DEFAULT_ENCODING
349
-
350
- try:
351
- encoded_content = cls.safe_encode(content, target_encoding)
352
-
353
- with open(file_path, "wb") as f:
354
- f.write(encoded_content)
355
-
356
- return True
357
-
358
- except IOError as e:
359
- log_warning(f"Failed to write file {file_path}: {e}")
360
- return False
361
-
362
- @classmethod
363
- def normalize_line_endings(cls, text: str) -> str:
364
- """
365
- Normalize line endings to Unix style (\n)
366
-
367
- Args:
368
- text: Text to normalize
369
-
370
- Returns:
371
- Text with normalized line endings
372
- """
373
- if not text:
374
- return text
375
-
376
- # Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
377
- return text.replace("\r\n", "\n").replace("\r", "\n")
378
-
379
- @classmethod
380
- def extract_text_slice(
381
- cls,
382
- content_bytes: bytes,
383
- start_byte: int,
384
- end_byte: int,
385
- encoding: Optional[str] = None,
386
- ) -> str:
387
- """
388
- Extract a slice of text from bytes with proper encoding handling
389
-
390
- Args:
391
- content_bytes: Source bytes
392
- start_byte: Start position
393
- end_byte: End position
394
- encoding: Encoding to use (auto-detected if None)
395
-
396
- Returns:
397
- Extracted text slice
398
- """
399
- if not content_bytes or start_byte >= len(content_bytes):
400
- return ""
401
-
402
- # Ensure bounds are valid
403
- start_byte = max(0, start_byte)
404
- end_byte = min(len(content_bytes), end_byte)
405
-
406
- if start_byte >= end_byte:
407
- return ""
408
-
409
- # Extract byte slice
410
- byte_slice = content_bytes[start_byte:end_byte]
411
-
412
- # Decode the slice
413
- return cls.safe_decode(byte_slice, encoding)
414
-
415
-
416
- # Convenience functions for backward compatibility
417
- def safe_encode(text: str, encoding: Optional[str] = None) -> bytes:
418
- """Convenience function for safe encoding"""
419
- return EncodingManager.safe_encode(text, encoding)
420
-
421
-
422
- def safe_decode(data: bytes, encoding: Optional[str] = None) -> str:
423
- """Convenience function for safe decoding"""
424
- return EncodingManager.safe_decode(data, encoding)
425
-
426
-
427
- def detect_encoding(data: bytes, file_path: Optional[str] = None) -> str:
428
- """Convenience function for encoding detection with optional caching"""
429
- return EncodingManager.detect_encoding(data, file_path)
430
-
431
-
432
- def read_file_safe(file_path: Union[str, Path]) -> Tuple[str, str]:
433
- """Convenience function for safe file reading"""
434
- return EncodingManager.read_file_safe(file_path)
435
-
436
-
437
- def write_file_safe(
438
- file_path: Union[str, Path], content: str, encoding: Optional[str] = None
439
- ) -> bool:
440
- """Convenience function for safe file writing"""
441
- return EncodingManager.write_file_safe(file_path, content, encoding)
442
-
443
-
444
- def extract_text_slice(
445
- content_bytes: bytes, start_byte: int, end_byte: int, encoding: Optional[str] = None
446
- ) -> str:
447
- """Convenience function for text slice extraction"""
448
- return EncodingManager.extract_text_slice(
449
- content_bytes, start_byte, end_byte, encoding
450
- )
451
-
452
-
453
- def clear_encoding_cache() -> None:
454
- """Clear the global encoding cache"""
455
- _encoding_cache.clear()
456
-
457
-
458
- def get_encoding_cache_size() -> int:
459
- """Get the current size of the encoding cache"""
460
- return _encoding_cache.size()
1
+ #!/usr/bin/env python3
2
+ """
3
+ Optimized Encoding Utilities Module
4
+
5
+ This module provides unified encoding/decoding functionality with performance
6
+ optimizations including file-based encoding caching to reduce redundant
7
+ chardet.detect() calls.
8
+ """
9
+
10
+ import os
11
+ import sys
12
+ import threading
13
+ import time
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+
18
+ # Set up encoding environment early
19
+ def _setup_encoding_environment() -> None:
20
+ """Set up proper encoding environment"""
21
+ try:
22
+ os.environ["PYTHONIOENCODING"] = "utf-8"
23
+ os.environ["PYTHONUTF8"] = "1"
24
+
25
+ # Ensure proper stdout/stderr encoding if possible
26
+ if hasattr(sys.stdout, "reconfigure"):
27
+ sys.stdout.reconfigure(encoding="utf-8", errors="replace")
28
+ if hasattr(sys.stderr, "reconfigure"):
29
+ sys.stderr.reconfigure(encoding="utf-8", errors="replace")
30
+ except Exception:
31
+ pass # Ignore setup errors, use defaults
32
+
33
+
34
+ # Set up environment when module is imported
35
+ _setup_encoding_environment()
36
+
37
+ # Try to import chardet with fallback
38
+ try:
39
+ import chardet
40
+
41
+ CHARDET_AVAILABLE = True
42
+ except ImportError:
43
+ CHARDET_AVAILABLE = False
44
+
45
+ # Import utilities with fallback
46
+ try:
47
+ from .utils import log_debug, log_warning
48
+ except ImportError:
49
+ # Fallback logging functions with compatible signatures
50
+ def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
51
+ print(f"DEBUG: {message}")
52
+
53
+ def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
54
+ print(f"WARNING: {message}")
55
+
56
+
57
+ class EncodingCache:
58
+ """Thread-safe encoding cache for file-based encoding detection optimization"""
59
+
60
+ def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
61
+ """
62
+ Initialize encoding cache
63
+
64
+ Args:
65
+ max_size: Maximum number of cached entries
66
+ ttl_seconds: Time-to-live for cache entries in seconds
67
+ """
68
+ self._cache: dict[
69
+ str, tuple[str, float]
70
+ ] = {} # file_path -> (encoding, timestamp)
71
+ self._lock = threading.RLock()
72
+ self._max_size = max_size
73
+ self._ttl_seconds = ttl_seconds
74
+
75
+ def get(self, file_path: str) -> str | None:
76
+ """
77
+ Get cached encoding for file path
78
+
79
+ Args:
80
+ file_path: Path to the file
81
+
82
+ Returns:
83
+ Cached encoding or None if not found/expired
84
+ """
85
+ with self._lock:
86
+ if file_path not in self._cache:
87
+ return None
88
+
89
+ encoding, timestamp = self._cache[file_path]
90
+ current_time = time.time()
91
+
92
+ # Check if entry has expired
93
+ if current_time - timestamp > self._ttl_seconds:
94
+ del self._cache[file_path]
95
+ return None
96
+
97
+ return encoding
98
+
99
+ def set(self, file_path: str, encoding: str) -> None:
100
+ """
101
+ Cache encoding for file path
102
+
103
+ Args:
104
+ file_path: Path to the file
105
+ encoding: Detected encoding
106
+ """
107
+ with self._lock:
108
+ current_time = time.time()
109
+
110
+ # Clean up expired entries if cache is getting full
111
+ if len(self._cache) >= self._max_size:
112
+ self._cleanup_expired()
113
+
114
+ # If still full after cleanup, remove oldest entry
115
+ if len(self._cache) >= self._max_size:
116
+ oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
117
+ del self._cache[oldest_key]
118
+
119
+ self._cache[file_path] = (encoding, current_time)
120
+
121
+ def _cleanup_expired(self) -> None:
122
+ """Remove expired entries from cache"""
123
+ current_time = time.time()
124
+ expired_keys = [
125
+ key
126
+ for key, (_, timestamp) in self._cache.items()
127
+ if current_time - timestamp > self._ttl_seconds
128
+ ]
129
+ for key in expired_keys:
130
+ del self._cache[key]
131
+
132
+ def clear(self) -> None:
133
+ """Clear all cached entries"""
134
+ with self._lock:
135
+ self._cache.clear()
136
+
137
+ def size(self) -> int:
138
+ """Get current cache size"""
139
+ with self._lock:
140
+ return len(self._cache)
141
+
142
+
143
+ # Global encoding cache instance
144
+ _encoding_cache = EncodingCache()
145
+
146
+
147
+ class EncodingManager:
148
+ """Centralized encoding management for consistent text processing"""
149
+
150
+ DEFAULT_ENCODING = "utf-8"
151
+ FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]
152
+
153
+ @classmethod
154
+ def safe_encode(cls, text: str, encoding: str | None = None) -> bytes:
155
+ """
156
+ Safely encode text to bytes with fallback handling
157
+
158
+ Args:
159
+ text: Text to encode
160
+ encoding: Target encoding (defaults to UTF-8)
161
+
162
+ Returns:
163
+ Encoded bytes
164
+ """
165
+
166
+ target_encoding = encoding or cls.DEFAULT_ENCODING
167
+
168
+ try:
169
+ return text.encode(target_encoding)
170
+ except UnicodeEncodeError as e:
171
+ log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")
172
+
173
+ # Try fallback encodings
174
+ for fallback in cls.FALLBACK_ENCODINGS:
175
+ if fallback != target_encoding:
176
+ try:
177
+ return text.encode(fallback, errors="replace")
178
+ except UnicodeEncodeError:
179
+ continue
180
+
181
+ # Last resort: encode with error replacement
182
+ log_warning(f"Using error replacement for encoding: {text[:50]}...")
183
+ return text.encode(cls.DEFAULT_ENCODING, errors="replace")
184
+
185
+ @classmethod
186
+ def safe_decode(cls, data: bytes, encoding: str | None = None) -> str:
187
+ """
188
+ Safely decode bytes to text with fallback handling
189
+
190
+ Args:
191
+ data: Bytes to decode
192
+ encoding: Source encoding (auto-detected if None)
193
+
194
+ Returns:
195
+ Decoded text
196
+ """
197
+ if data is None or len(data) == 0:
198
+ return ""
199
+
200
+ # Use provided encoding or detect
201
+ target_encoding = encoding
202
+ if not target_encoding:
203
+ target_encoding = cls.detect_encoding(data)
204
+
205
+ try:
206
+ return data.decode(target_encoding)
207
+ except UnicodeDecodeError as e:
208
+ log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")
209
+
210
+ # Try fallback encodings
211
+ for fallback in cls.FALLBACK_ENCODINGS:
212
+ if fallback != target_encoding:
213
+ try:
214
+ return data.decode(fallback, errors="replace")
215
+ except UnicodeDecodeError:
216
+ continue
217
+
218
+ # Last resort: decode with error replacement
219
+ log_warning(
220
+ f"Using error replacement for decoding data (length: {len(data)})"
221
+ )
222
+ return data.decode(cls.DEFAULT_ENCODING, errors="replace")
223
+
224
+ @classmethod
225
+ def detect_encoding(cls, data: bytes, file_path: str | None = None) -> str:
226
+ """
227
+ Detect encoding of byte data with optional file-based caching
228
+
229
+ Args:
230
+ data: Bytes to analyze
231
+ file_path: Optional file path for caching (improves performance)
232
+
233
+ Returns:
234
+ Detected encoding name
235
+ """
236
+ if not data:
237
+ return cls.DEFAULT_ENCODING
238
+
239
+ # Check cache first if file_path is provided
240
+ if file_path:
241
+ cached_encoding = _encoding_cache.get(file_path)
242
+ if cached_encoding:
243
+ log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
244
+ return cached_encoding
245
+
246
+ detected_encoding = cls.DEFAULT_ENCODING
247
+
248
+ # If chardet is not available, use simple heuristics
249
+ if not CHARDET_AVAILABLE:
250
+ try:
251
+ # Try UTF-8 first
252
+ data.decode("utf-8")
253
+ detected_encoding = "utf-8"
254
+ except UnicodeDecodeError:
255
+ # Check for BOM
256
+ if data.startswith(b"\xff\xfe"):
257
+ detected_encoding = "utf-16-le"
258
+ elif data.startswith(b"\xfe\xff"):
259
+ detected_encoding = "utf-16-be"
260
+ elif data.startswith(b"\xef\xbb\xbf"):
261
+ detected_encoding = "utf-8-sig"
262
+ else:
263
+ detected_encoding = cls.DEFAULT_ENCODING
264
+ else:
265
+ try:
266
+ # Use chardet for detection
267
+ detection = chardet.detect(data)
268
+ if detection and detection["encoding"]:
269
+ confidence = detection.get("confidence", 0)
270
+ detected_encoding = detection["encoding"].lower()
271
+
272
+ # Only trust high-confidence detections
273
+ if confidence > 0.7:
274
+ log_debug(
275
+ f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})"
276
+ )
277
+ else:
278
+ log_debug(
279
+ f"Low confidence encoding detection: {detected_encoding} (confidence: {confidence:.2f}), using default"
280
+ )
281
+ detected_encoding = cls.DEFAULT_ENCODING
282
+
283
+ except Exception as e:
284
+ log_debug(f"Encoding detection failed: {e}")
285
+ detected_encoding = cls.DEFAULT_ENCODING
286
+
287
+ # Cache the result if file_path is provided
288
+ if file_path and detected_encoding:
289
+ _encoding_cache.set(file_path, detected_encoding)
290
+ log_debug(f"Cached encoding for {file_path}: {detected_encoding}")
291
+
292
+ return detected_encoding
293
+
294
+ @classmethod
295
+ def read_file_safe(cls, file_path: str | Path) -> tuple[str, str]:
296
+ """
297
+ Safely read a file with automatic encoding detection and caching
298
+
299
+ Args:
300
+ file_path: Path to the file
301
+
302
+ Returns:
303
+ Tuple of (content, detected_encoding)
304
+ """
305
+ file_path = Path(file_path)
306
+
307
+ try:
308
+ # Read raw bytes first
309
+ with open(file_path, "rb") as f:
310
+ raw_data = f.read()
311
+
312
+ if not raw_data:
313
+ return "", cls.DEFAULT_ENCODING
314
+
315
+ # Detect and decode with file path for caching
316
+ detected_encoding = cls.detect_encoding(raw_data, str(file_path))
317
+ content = cls.safe_decode(raw_data, detected_encoding)
318
+
319
+ # Normalize line endings for consistency
320
+ content = cls.normalize_line_endings(content)
321
+
322
+ return content, detected_encoding
323
+
324
+ except OSError as e:
325
+ log_warning(f"Failed to read file {file_path}: {e}")
326
+ raise e
327
+
328
+ @classmethod
329
+ def write_file_safe(
330
+ cls, file_path: str | Path, content: str, encoding: str | None = None
331
+ ) -> bool:
332
+ """
333
+ Safely write content to a file
334
+
335
+ Args:
336
+ file_path: Path to the file
337
+ content: Content to write
338
+ encoding: Target encoding (defaults to UTF-8)
339
+
340
+ Returns:
341
+ True if successful, False otherwise
342
+ """
343
+ file_path = Path(file_path)
344
+ target_encoding = encoding or cls.DEFAULT_ENCODING
345
+
346
+ try:
347
+ encoded_content = cls.safe_encode(content, target_encoding)
348
+
349
+ with open(file_path, "wb") as f:
350
+ f.write(encoded_content)
351
+
352
+ return True
353
+
354
+ except OSError as e:
355
+ log_warning(f"Failed to write file {file_path}: {e}")
356
+ return False
357
+
358
+ @classmethod
359
+ def normalize_line_endings(cls, text: str) -> str:
360
+ """
361
+ Normalize line endings to Unix style (\n)
362
+
363
+ Args:
364
+ text: Text to normalize
365
+
366
+ Returns:
367
+ Text with normalized line endings
368
+ """
369
+ if not text:
370
+ return text
371
+
372
+ # Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
373
+ return text.replace("\r\n", "\n").replace("\r", "\n")
374
+
375
+ @classmethod
376
+ def extract_text_slice(
377
+ cls,
378
+ content_bytes: bytes,
379
+ start_byte: int,
380
+ end_byte: int,
381
+ encoding: str | None = None,
382
+ ) -> str:
383
+ """
384
+ Extract a slice of text from bytes with proper encoding handling
385
+
386
+ Args:
387
+ content_bytes: Source bytes
388
+ start_byte: Start position
389
+ end_byte: End position
390
+ encoding: Encoding to use (auto-detected if None)
391
+
392
+ Returns:
393
+ Extracted text slice
394
+ """
395
+ if not content_bytes or start_byte >= len(content_bytes):
396
+ return ""
397
+
398
+ # Ensure bounds are valid
399
+ start_byte = max(0, start_byte)
400
+ end_byte = min(len(content_bytes), end_byte)
401
+
402
+ if start_byte >= end_byte:
403
+ return ""
404
+
405
+ # Extract byte slice
406
+ byte_slice = content_bytes[start_byte:end_byte]
407
+
408
+ # Decode the slice
409
+ return cls.safe_decode(byte_slice, encoding)
410
+
411
+
412
+ # Convenience functions for backward compatibility
413
def safe_encode(text: str, encoding: str | None = None) -> bytes:
    """Module-level wrapper around :meth:`EncodingManager.safe_encode`."""
    return EncodingManager.safe_encode(text, encoding)
416
+
417
+
418
def safe_decode(data: bytes, encoding: str | None = None) -> str:
    """Module-level wrapper around :meth:`EncodingManager.safe_decode`."""
    return EncodingManager.safe_decode(data, encoding)
421
+
422
+
423
def detect_encoding(data: bytes, file_path: str | None = None) -> str:
    """Module-level wrapper around :meth:`EncodingManager.detect_encoding`.

    Passing ``file_path`` enables the module's encoding cache.
    """
    return EncodingManager.detect_encoding(data, file_path)
426
+
427
+
428
def read_file_safe(file_path: str | Path) -> tuple[str, str]:
    """Module-level wrapper around :meth:`EncodingManager.read_file_safe`."""
    return EncodingManager.read_file_safe(file_path)
431
+
432
+
433
def write_file_safe(
    file_path: str | Path, content: str, encoding: str | None = None
) -> bool:
    """Module-level wrapper around :meth:`EncodingManager.write_file_safe`."""
    return EncodingManager.write_file_safe(file_path, content, encoding)
438
+
439
+
440
def extract_text_slice(
    content_bytes: bytes, start_byte: int, end_byte: int, encoding: str | None = None
) -> str:
    """Module-level wrapper around :meth:`EncodingManager.extract_text_slice`."""
    return EncodingManager.extract_text_slice(
        content_bytes, start_byte, end_byte, encoding
    )
447
+
448
+
449
def clear_encoding_cache() -> None:
    """Empty the module-wide encoding cache."""
    _encoding_cache.clear()
452
+
453
+
454
def get_encoding_cache_size() -> int:
    """Return the number of entries in the module-wide encoding cache."""
    return _encoding_cache.size()