tree-sitter-analyzer 0.9.1__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tree-sitter-analyzer might be problematic; see the registry advisory for details.

Files changed (64)
  1. tree_sitter_analyzer/__init__.py +132 -132
  2. tree_sitter_analyzer/__main__.py +11 -11
  3. tree_sitter_analyzer/api.py +533 -533
  4. tree_sitter_analyzer/cli/__init__.py +39 -39
  5. tree_sitter_analyzer/cli/__main__.py +12 -12
  6. tree_sitter_analyzer/cli/commands/__init__.py +26 -26
  7. tree_sitter_analyzer/cli/commands/advanced_command.py +88 -88
  8. tree_sitter_analyzer/cli/commands/base_command.py +181 -178
  9. tree_sitter_analyzer/cli/commands/structure_command.py +138 -138
  10. tree_sitter_analyzer/cli/commands/summary_command.py +101 -101
  11. tree_sitter_analyzer/cli_main.py +7 -3
  12. tree_sitter_analyzer/core/__init__.py +15 -15
  13. tree_sitter_analyzer/core/analysis_engine.py +91 -87
  14. tree_sitter_analyzer/core/cache_service.py +320 -320
  15. tree_sitter_analyzer/core/engine.py +566 -566
  16. tree_sitter_analyzer/core/parser.py +293 -293
  17. tree_sitter_analyzer/encoding_utils.py +459 -459
  18. tree_sitter_analyzer/file_handler.py +210 -210
  19. tree_sitter_analyzer/formatters/__init__.py +1 -1
  20. tree_sitter_analyzer/formatters/base_formatter.py +167 -167
  21. tree_sitter_analyzer/formatters/formatter_factory.py +78 -78
  22. tree_sitter_analyzer/formatters/java_formatter.py +18 -18
  23. tree_sitter_analyzer/formatters/python_formatter.py +19 -19
  24. tree_sitter_analyzer/interfaces/__init__.py +9 -9
  25. tree_sitter_analyzer/interfaces/cli.py +528 -528
  26. tree_sitter_analyzer/interfaces/cli_adapter.py +344 -343
  27. tree_sitter_analyzer/interfaces/mcp_adapter.py +206 -206
  28. tree_sitter_analyzer/language_detector.py +53 -53
  29. tree_sitter_analyzer/languages/__init__.py +10 -10
  30. tree_sitter_analyzer/languages/java_plugin.py +1 -1
  31. tree_sitter_analyzer/languages/javascript_plugin.py +446 -446
  32. tree_sitter_analyzer/languages/python_plugin.py +755 -755
  33. tree_sitter_analyzer/mcp/__init__.py +34 -45
  34. tree_sitter_analyzer/mcp/resources/__init__.py +44 -44
  35. tree_sitter_analyzer/mcp/resources/code_file_resource.py +209 -209
  36. tree_sitter_analyzer/mcp/server.py +623 -568
  37. tree_sitter_analyzer/mcp/tools/__init__.py +30 -30
  38. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +681 -673
  39. tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +247 -247
  40. tree_sitter_analyzer/mcp/tools/base_tool.py +54 -54
  41. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +310 -308
  42. tree_sitter_analyzer/mcp/tools/table_format_tool.py +386 -379
  43. tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +563 -559
  44. tree_sitter_analyzer/mcp/utils/__init__.py +107 -107
  45. tree_sitter_analyzer/models.py +10 -10
  46. tree_sitter_analyzer/output_manager.py +253 -253
  47. tree_sitter_analyzer/plugins/__init__.py +280 -280
  48. tree_sitter_analyzer/plugins/base.py +529 -529
  49. tree_sitter_analyzer/plugins/manager.py +379 -379
  50. tree_sitter_analyzer/project_detector.py +330 -317
  51. tree_sitter_analyzer/queries/__init__.py +26 -26
  52. tree_sitter_analyzer/queries/java.py +391 -391
  53. tree_sitter_analyzer/queries/javascript.py +148 -148
  54. tree_sitter_analyzer/queries/python.py +285 -285
  55. tree_sitter_analyzer/queries/typescript.py +229 -229
  56. tree_sitter_analyzer/query_loader.py +257 -257
  57. tree_sitter_analyzer/security/boundary_manager.py +57 -51
  58. tree_sitter_analyzer/security/validator.py +246 -241
  59. tree_sitter_analyzer/utils.py +294 -277
  60. {tree_sitter_analyzer-0.9.1.dist-info → tree_sitter_analyzer-0.9.3.dist-info}/METADATA +13 -13
  61. tree_sitter_analyzer-0.9.3.dist-info/RECORD +77 -0
  62. {tree_sitter_analyzer-0.9.1.dist-info → tree_sitter_analyzer-0.9.3.dist-info}/entry_points.txt +1 -0
  63. tree_sitter_analyzer-0.9.1.dist-info/RECORD +0 -77
  64. {tree_sitter_analyzer-0.9.1.dist-info → tree_sitter_analyzer-0.9.3.dist-info}/WHEEL +0 -0
@@ -1,459 +1,459 @@
1
- #!/usr/bin/env python3
2
- """
3
- Optimized Encoding Utilities Module
4
-
5
- This module provides unified encoding/decoding functionality with performance
6
- optimizations including file-based encoding caching to reduce redundant
7
- chardet.detect() calls.
8
- """
9
-
10
- import os
11
- import sys
12
- import threading
13
- import time
14
- from pathlib import Path
15
- from typing import Any
16
-
17
-
18
- # Set up encoding environment early
19
- def _setup_encoding_environment() -> None:
20
- """Set up proper encoding environment"""
21
- try:
22
- os.environ["PYTHONIOENCODING"] = "utf-8"
23
- os.environ["PYTHONUTF8"] = "1"
24
-
25
- # Ensure proper stdout/stderr encoding if possible
26
- if hasattr(sys.stdout, "reconfigure"):
27
- sys.stdout.reconfigure(encoding="utf-8", errors="replace")
28
- if hasattr(sys.stderr, "reconfigure"):
29
- sys.stderr.reconfigure(encoding="utf-8", errors="replace")
30
- except Exception:
31
- pass # Ignore setup errors, use defaults
32
-
33
-
34
- # Set up environment when module is imported
35
- _setup_encoding_environment()
36
-
37
- # Try to import chardet with fallback
38
- try:
39
- import chardet
40
-
41
- CHARDET_AVAILABLE = True
42
- except ImportError:
43
- CHARDET_AVAILABLE = False
44
-
45
- # Import utilities with fallback
46
- try:
47
- from .utils import log_debug, log_warning
48
- except ImportError:
49
- # Fallback logging functions with compatible signatures
50
- def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
51
- print(f"DEBUG: {message}")
52
-
53
- def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
54
- print(f"WARNING: {message}")
55
-
56
-
57
- class EncodingCache:
58
- """Thread-safe encoding cache for file-based encoding detection optimization"""
59
-
60
- def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
61
- """
62
- Initialize encoding cache
63
-
64
- Args:
65
- max_size: Maximum number of cached entries
66
- ttl_seconds: Time-to-live for cache entries in seconds
67
- """
68
- self._cache: dict[
69
- str, tuple[str, float]
70
- ] = {} # file_path -> (encoding, timestamp)
71
- self._lock = threading.RLock()
72
- self._max_size = max_size
73
- self._ttl_seconds = ttl_seconds
74
-
75
- def get(self, file_path: str) -> str | None:
76
- """
77
- Get cached encoding for file path
78
-
79
- Args:
80
- file_path: Path to the file
81
-
82
- Returns:
83
- Cached encoding or None if not found/expired
84
- """
85
- with self._lock:
86
- if file_path not in self._cache:
87
- return None
88
-
89
- encoding, timestamp = self._cache[file_path]
90
- current_time = time.time()
91
-
92
- # Check if entry has expired
93
- if current_time - timestamp > self._ttl_seconds:
94
- del self._cache[file_path]
95
- return None
96
-
97
- return encoding
98
-
99
- def set(self, file_path: str, encoding: str) -> None:
100
- """
101
- Cache encoding for file path
102
-
103
- Args:
104
- file_path: Path to the file
105
- encoding: Detected encoding
106
- """
107
- with self._lock:
108
- current_time = time.time()
109
-
110
- # Clean up expired entries if cache is getting full
111
- if len(self._cache) >= self._max_size:
112
- self._cleanup_expired()
113
-
114
- # If still full after cleanup, remove oldest entry
115
- if len(self._cache) >= self._max_size:
116
- oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
117
- del self._cache[oldest_key]
118
-
119
- self._cache[file_path] = (encoding, current_time)
120
-
121
- def _cleanup_expired(self) -> None:
122
- """Remove expired entries from cache"""
123
- current_time = time.time()
124
- expired_keys = [
125
- key
126
- for key, (_, timestamp) in self._cache.items()
127
- if current_time - timestamp > self._ttl_seconds
128
- ]
129
- for key in expired_keys:
130
- del self._cache[key]
131
-
132
- def clear(self) -> None:
133
- """Clear all cached entries"""
134
- with self._lock:
135
- self._cache.clear()
136
-
137
- def size(self) -> int:
138
- """Get current cache size"""
139
- with self._lock:
140
- return len(self._cache)
141
-
142
-
143
- # Global encoding cache instance
144
- _encoding_cache = EncodingCache()
145
-
146
-
147
- class EncodingManager:
148
- """Centralized encoding management for consistent text processing"""
149
-
150
- DEFAULT_ENCODING = "utf-8"
151
- FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]
152
-
153
- @classmethod
154
- def safe_encode(cls, text: str | None, encoding: str | None = None) -> bytes:
155
- """
156
- Safely encode text to bytes with fallback handling
157
-
158
- Args:
159
- text: Text to encode (can be None)
160
- encoding: Target encoding (defaults to UTF-8)
161
-
162
- Returns:
163
- Encoded bytes
164
- """
165
- # Handle None input
166
- if text is None:
167
- return b""
168
-
169
- target_encoding = encoding or cls.DEFAULT_ENCODING
170
-
171
- try:
172
- return text.encode(target_encoding)
173
- except UnicodeEncodeError as e:
174
- log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")
175
-
176
- # Try fallback encodings
177
- for fallback in cls.FALLBACK_ENCODINGS:
178
- if fallback != target_encoding:
179
- try:
180
- return text.encode(fallback, errors="replace")
181
- except UnicodeEncodeError:
182
- continue
183
-
184
- # Last resort: encode with error replacement
185
- log_warning(f"Using error replacement for encoding: {text[:50]}...")
186
- return text.encode(cls.DEFAULT_ENCODING, errors="replace")
187
-
188
- @classmethod
189
- def safe_decode(cls, data: bytes, encoding: str | None = None) -> str:
190
- """
191
- Safely decode bytes to text with fallback handling
192
-
193
- Args:
194
- data: Bytes to decode
195
- encoding: Source encoding (auto-detected if None)
196
-
197
- Returns:
198
- Decoded text
199
- """
200
- if data is None or len(data) == 0:
201
- return ""
202
-
203
- # Use provided encoding or detect
204
- target_encoding = encoding
205
- if not target_encoding:
206
- target_encoding = cls.detect_encoding(data)
207
-
208
- try:
209
- return data.decode(target_encoding)
210
- except UnicodeDecodeError as e:
211
- log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")
212
-
213
- # Try fallback encodings
214
- for fallback in cls.FALLBACK_ENCODINGS:
215
- if fallback != target_encoding:
216
- try:
217
- return data.decode(fallback, errors="replace")
218
- except UnicodeDecodeError:
219
- continue
220
-
221
- # Last resort: decode with error replacement
222
- log_warning(
223
- f"Using error replacement for decoding data (length: {len(data)})"
224
- )
225
- return data.decode(cls.DEFAULT_ENCODING, errors="replace")
226
-
227
- @classmethod
228
- def detect_encoding(cls, data: bytes, file_path: str | None = None) -> str:
229
- """
230
- Detect encoding of byte data with optional file-based caching
231
-
232
- Args:
233
- data: Bytes to analyze
234
- file_path: Optional file path for caching (improves performance)
235
-
236
- Returns:
237
- Detected encoding name
238
- """
239
- if not data:
240
- return cls.DEFAULT_ENCODING
241
-
242
- # Check cache first if file_path is provided
243
- if file_path:
244
- cached_encoding = _encoding_cache.get(file_path)
245
- if cached_encoding:
246
- log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
247
- return cached_encoding
248
-
249
- detected_encoding = cls.DEFAULT_ENCODING
250
-
251
- # If chardet is not available, use simple heuristics
252
- if not CHARDET_AVAILABLE:
253
- try:
254
- # Try UTF-8 first
255
- data.decode("utf-8")
256
- detected_encoding = "utf-8"
257
- except UnicodeDecodeError:
258
- # Check for BOM
259
- if data.startswith(b"\xff\xfe"):
260
- detected_encoding = "utf-16-le"
261
- elif data.startswith(b"\xfe\xff"):
262
- detected_encoding = "utf-16-be"
263
- elif data.startswith(b"\xef\xbb\xbf"):
264
- detected_encoding = "utf-8-sig"
265
- else:
266
- detected_encoding = cls.DEFAULT_ENCODING
267
- else:
268
- try:
269
- # Use chardet for detection
270
- detection = chardet.detect(data)
271
- if detection and detection["encoding"]:
272
- confidence = detection.get("confidence", 0)
273
- detected_encoding = detection["encoding"].lower()
274
-
275
- # Only trust high-confidence detections
276
- if confidence > 0.7:
277
- log_debug(
278
- f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})"
279
- )
280
- else:
281
- log_debug(
282
- f"Low confidence encoding detection: {detected_encoding} (confidence: {confidence:.2f}), using default"
283
- )
284
- detected_encoding = cls.DEFAULT_ENCODING
285
-
286
- except Exception as e:
287
- log_debug(f"Encoding detection failed: {e}")
288
- detected_encoding = cls.DEFAULT_ENCODING
289
-
290
- # Cache the result if file_path is provided
291
- if file_path and detected_encoding:
292
- _encoding_cache.set(file_path, detected_encoding)
293
- log_debug(f"Cached encoding for {file_path}: {detected_encoding}")
294
-
295
- return detected_encoding
296
-
297
- @classmethod
298
- def read_file_safe(cls, file_path: str | Path) -> tuple[str, str]:
299
- """
300
- Safely read a file with automatic encoding detection and caching
301
-
302
- Args:
303
- file_path: Path to the file
304
-
305
- Returns:
306
- Tuple of (content, detected_encoding)
307
- """
308
- file_path = Path(file_path)
309
-
310
- try:
311
- # Read raw bytes first
312
- with open(file_path, "rb") as f:
313
- raw_data = f.read()
314
-
315
- if not raw_data:
316
- return "", cls.DEFAULT_ENCODING
317
-
318
- # Detect and decode with file path for caching
319
- detected_encoding = cls.detect_encoding(raw_data, str(file_path))
320
- content = cls.safe_decode(raw_data, detected_encoding)
321
-
322
- # Normalize line endings for consistency
323
- content = cls.normalize_line_endings(content)
324
-
325
- return content, detected_encoding
326
-
327
- except OSError as e:
328
- log_warning(f"Failed to read file {file_path}: {e}")
329
- raise e
330
-
331
- @classmethod
332
- def write_file_safe(
333
- cls, file_path: str | Path, content: str, encoding: str | None = None
334
- ) -> bool:
335
- """
336
- Safely write content to a file
337
-
338
- Args:
339
- file_path: Path to the file
340
- content: Content to write
341
- encoding: Target encoding (defaults to UTF-8)
342
-
343
- Returns:
344
- True if successful, False otherwise
345
- """
346
- file_path = Path(file_path)
347
- target_encoding = encoding or cls.DEFAULT_ENCODING
348
-
349
- try:
350
- encoded_content = cls.safe_encode(content, target_encoding)
351
-
352
- with open(file_path, "wb") as f:
353
- f.write(encoded_content)
354
-
355
- return True
356
-
357
- except OSError as e:
358
- log_warning(f"Failed to write file {file_path}: {e}")
359
- return False
360
-
361
- @classmethod
362
- def normalize_line_endings(cls, text: str) -> str:
363
- """
364
- Normalize line endings to Unix style (\n)
365
-
366
- Args:
367
- text: Text to normalize
368
-
369
- Returns:
370
- Text with normalized line endings
371
- """
372
- if not text:
373
- return text
374
-
375
- # Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
376
- return text.replace("\r\n", "\n").replace("\r", "\n")
377
-
378
- @classmethod
379
- def extract_text_slice(
380
- cls,
381
- content_bytes: bytes,
382
- start_byte: int,
383
- end_byte: int,
384
- encoding: str | None = None,
385
- ) -> str:
386
- """
387
- Extract a slice of text from bytes with proper encoding handling
388
-
389
- Args:
390
- content_bytes: Source bytes
391
- start_byte: Start position
392
- end_byte: End position
393
- encoding: Encoding to use (auto-detected if None)
394
-
395
- Returns:
396
- Extracted text slice
397
- """
398
- if not content_bytes or start_byte >= len(content_bytes):
399
- return ""
400
-
401
- # Ensure bounds are valid
402
- start_byte = max(0, start_byte)
403
- end_byte = min(len(content_bytes), end_byte)
404
-
405
- if start_byte >= end_byte:
406
- return ""
407
-
408
- # Extract byte slice
409
- byte_slice = content_bytes[start_byte:end_byte]
410
-
411
- # Decode the slice
412
- return cls.safe_decode(byte_slice, encoding)
413
-
414
-
415
- # Convenience functions for backward compatibility
416
- def safe_encode(text: str, encoding: str | None = None) -> bytes:
417
- """Convenience function for safe encoding"""
418
- return EncodingManager.safe_encode(text, encoding)
419
-
420
-
421
- def safe_decode(data: bytes, encoding: str | None = None) -> str:
422
- """Convenience function for safe decoding"""
423
- return EncodingManager.safe_decode(data, encoding)
424
-
425
-
426
- def detect_encoding(data: bytes, file_path: str | None = None) -> str:
427
- """Convenience function for encoding detection with optional caching"""
428
- return EncodingManager.detect_encoding(data, file_path)
429
-
430
-
431
- def read_file_safe(file_path: str | Path) -> tuple[str, str]:
432
- """Convenience function for safe file reading"""
433
- return EncodingManager.read_file_safe(file_path)
434
-
435
-
436
- def write_file_safe(
437
- file_path: str | Path, content: str, encoding: str | None = None
438
- ) -> bool:
439
- """Convenience function for safe file writing"""
440
- return EncodingManager.write_file_safe(file_path, content, encoding)
441
-
442
-
443
- def extract_text_slice(
444
- content_bytes: bytes, start_byte: int, end_byte: int, encoding: str | None = None
445
- ) -> str:
446
- """Convenience function for text slice extraction"""
447
- return EncodingManager.extract_text_slice(
448
- content_bytes, start_byte, end_byte, encoding
449
- )
450
-
451
-
452
- def clear_encoding_cache() -> None:
453
- """Clear the global encoding cache"""
454
- _encoding_cache.clear()
455
-
456
-
457
- def get_encoding_cache_size() -> int:
458
- """Get the current size of the encoding cache"""
459
- return _encoding_cache.size()
1
+ #!/usr/bin/env python3
2
+ """
3
+ Optimized Encoding Utilities Module
4
+
5
+ This module provides unified encoding/decoding functionality with performance
6
+ optimizations including file-based encoding caching to reduce redundant
7
+ chardet.detect() calls.
8
+ """
9
+
10
+ import os
11
+ import sys
12
+ import threading
13
+ import time
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+
18
# Set up encoding environment early
def _setup_encoding_environment() -> None:
    """Force UTF-8 for Python I/O, best-effort.

    Sets ``PYTHONIOENCODING``/``PYTHONUTF8`` and, where the streams support
    it, reconfigures stdout/stderr to UTF-8 with replacement errors.  Any
    failure is swallowed so importing this module never breaks on platforms
    or stream replacements that do not cooperate.
    """
    try:
        os.environ["PYTHONIOENCODING"] = "utf-8"
        os.environ["PYTHONUTF8"] = "1"

        # reconfigure() only exists on real text-IO streams; guard so
        # redirected or replaced stdout/stderr objects are tolerated.
        for stream in (sys.stdout, sys.stderr):
            if hasattr(stream, "reconfigure"):
                stream.reconfigure(encoding="utf-8", errors="replace")
    except Exception:
        # Best effort only -- fall back to whatever the platform provides.
        pass


# Set up environment when module is imported
_setup_encoding_environment()
36
+
37
+ # Try to import chardet with fallback
38
+ try:
39
+ import chardet
40
+
41
+ CHARDET_AVAILABLE = True
42
+ except ImportError:
43
+ CHARDET_AVAILABLE = False
44
+
45
+ # Import utilities with fallback
46
+ try:
47
+ from .utils import log_debug, log_warning
48
+ except ImportError:
49
+ # Fallback logging functions with compatible signatures
50
+ def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
51
+ print(f"DEBUG: {message}")
52
+
53
+ def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
54
+ print(f"WARNING: {message}")
55
+
56
+
57
+ class EncodingCache:
58
+ """Thread-safe encoding cache for file-based encoding detection optimization"""
59
+
60
+ def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
61
+ """
62
+ Initialize encoding cache
63
+
64
+ Args:
65
+ max_size: Maximum number of cached entries
66
+ ttl_seconds: Time-to-live for cache entries in seconds
67
+ """
68
+ self._cache: dict[
69
+ str, tuple[str, float]
70
+ ] = {} # file_path -> (encoding, timestamp)
71
+ self._lock = threading.RLock()
72
+ self._max_size = max_size
73
+ self._ttl_seconds = ttl_seconds
74
+
75
+ def get(self, file_path: str) -> str | None:
76
+ """
77
+ Get cached encoding for file path
78
+
79
+ Args:
80
+ file_path: Path to the file
81
+
82
+ Returns:
83
+ Cached encoding or None if not found/expired
84
+ """
85
+ with self._lock:
86
+ if file_path not in self._cache:
87
+ return None
88
+
89
+ encoding, timestamp = self._cache[file_path]
90
+ current_time = time.time()
91
+
92
+ # Check if entry has expired
93
+ if current_time - timestamp > self._ttl_seconds:
94
+ del self._cache[file_path]
95
+ return None
96
+
97
+ return encoding
98
+
99
+ def set(self, file_path: str, encoding: str) -> None:
100
+ """
101
+ Cache encoding for file path
102
+
103
+ Args:
104
+ file_path: Path to the file
105
+ encoding: Detected encoding
106
+ """
107
+ with self._lock:
108
+ current_time = time.time()
109
+
110
+ # Clean up expired entries if cache is getting full
111
+ if len(self._cache) >= self._max_size:
112
+ self._cleanup_expired()
113
+
114
+ # If still full after cleanup, remove oldest entry
115
+ if len(self._cache) >= self._max_size:
116
+ oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
117
+ del self._cache[oldest_key]
118
+
119
+ self._cache[file_path] = (encoding, current_time)
120
+
121
+ def _cleanup_expired(self) -> None:
122
+ """Remove expired entries from cache"""
123
+ current_time = time.time()
124
+ expired_keys = [
125
+ key
126
+ for key, (_, timestamp) in self._cache.items()
127
+ if current_time - timestamp > self._ttl_seconds
128
+ ]
129
+ for key in expired_keys:
130
+ del self._cache[key]
131
+
132
+ def clear(self) -> None:
133
+ """Clear all cached entries"""
134
+ with self._lock:
135
+ self._cache.clear()
136
+
137
+ def size(self) -> int:
138
+ """Get current cache size"""
139
+ with self._lock:
140
+ return len(self._cache)
141
+
142
+
143
+ # Global encoding cache instance
144
+ _encoding_cache = EncodingCache()
145
+
146
+
147
+ class EncodingManager:
148
+ """Centralized encoding management for consistent text processing"""
149
+
150
+ DEFAULT_ENCODING = "utf-8"
151
+ FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]
152
+
153
+ @classmethod
154
+ def safe_encode(cls, text: str | None, encoding: str | None = None) -> bytes:
155
+ """
156
+ Safely encode text to bytes with fallback handling
157
+
158
+ Args:
159
+ text: Text to encode (can be None)
160
+ encoding: Target encoding (defaults to UTF-8)
161
+
162
+ Returns:
163
+ Encoded bytes
164
+ """
165
+ # Handle None input
166
+ if text is None:
167
+ return b""
168
+
169
+ target_encoding = encoding or cls.DEFAULT_ENCODING
170
+
171
+ try:
172
+ return text.encode(target_encoding)
173
+ except UnicodeEncodeError as e:
174
+ log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")
175
+
176
+ # Try fallback encodings
177
+ for fallback in cls.FALLBACK_ENCODINGS:
178
+ if fallback != target_encoding:
179
+ try:
180
+ return text.encode(fallback, errors="replace")
181
+ except UnicodeEncodeError:
182
+ continue
183
+
184
+ # Last resort: encode with error replacement
185
+ log_warning(f"Using error replacement for encoding: {text[:50]}...")
186
+ return text.encode(cls.DEFAULT_ENCODING, errors="replace")
187
+
188
+ @classmethod
189
+ def safe_decode(cls, data: bytes, encoding: str | None = None) -> str:
190
+ """
191
+ Safely decode bytes to text with fallback handling
192
+
193
+ Args:
194
+ data: Bytes to decode
195
+ encoding: Source encoding (auto-detected if None)
196
+
197
+ Returns:
198
+ Decoded text
199
+ """
200
+ if data is None or len(data) == 0:
201
+ return ""
202
+
203
+ # Use provided encoding or detect
204
+ target_encoding = encoding
205
+ if not target_encoding:
206
+ target_encoding = cls.detect_encoding(data)
207
+
208
+ try:
209
+ return data.decode(target_encoding)
210
+ except UnicodeDecodeError as e:
211
+ log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")
212
+
213
+ # Try fallback encodings
214
+ for fallback in cls.FALLBACK_ENCODINGS:
215
+ if fallback != target_encoding:
216
+ try:
217
+ return data.decode(fallback, errors="replace")
218
+ except UnicodeDecodeError:
219
+ continue
220
+
221
+ # Last resort: decode with error replacement
222
+ log_warning(
223
+ f"Using error replacement for decoding data (length: {len(data)})"
224
+ )
225
+ return data.decode(cls.DEFAULT_ENCODING, errors="replace")
226
+
227
+ @classmethod
228
+ def detect_encoding(cls, data: bytes, file_path: str | None = None) -> str:
229
+ """
230
+ Detect encoding of byte data with optional file-based caching
231
+
232
+ Args:
233
+ data: Bytes to analyze
234
+ file_path: Optional file path for caching (improves performance)
235
+
236
+ Returns:
237
+ Detected encoding name
238
+ """
239
+ if not data:
240
+ return cls.DEFAULT_ENCODING
241
+
242
+ # Check cache first if file_path is provided
243
+ if file_path:
244
+ cached_encoding = _encoding_cache.get(file_path)
245
+ if cached_encoding:
246
+ log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
247
+ return cached_encoding
248
+
249
+ detected_encoding = cls.DEFAULT_ENCODING
250
+
251
+ # If chardet is not available, use simple heuristics
252
+ if not CHARDET_AVAILABLE:
253
+ try:
254
+ # Try UTF-8 first
255
+ data.decode("utf-8")
256
+ detected_encoding = "utf-8"
257
+ except UnicodeDecodeError:
258
+ # Check for BOM
259
+ if data.startswith(b"\xff\xfe"):
260
+ detected_encoding = "utf-16-le"
261
+ elif data.startswith(b"\xfe\xff"):
262
+ detected_encoding = "utf-16-be"
263
+ elif data.startswith(b"\xef\xbb\xbf"):
264
+ detected_encoding = "utf-8-sig"
265
+ else:
266
+ detected_encoding = cls.DEFAULT_ENCODING
267
+ else:
268
+ try:
269
+ # Use chardet for detection
270
+ detection = chardet.detect(data)
271
+ if detection and detection["encoding"]:
272
+ confidence = detection.get("confidence", 0)
273
+ detected_encoding = detection["encoding"].lower()
274
+
275
+ # Only trust high-confidence detections
276
+ if confidence > 0.7:
277
+ log_debug(
278
+ f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})"
279
+ )
280
+ else:
281
+ log_debug(
282
+ f"Low confidence encoding detection: {detected_encoding} (confidence: {confidence:.2f}), using default"
283
+ )
284
+ detected_encoding = cls.DEFAULT_ENCODING
285
+
286
+ except Exception as e:
287
+ log_debug(f"Encoding detection failed: {e}")
288
+ detected_encoding = cls.DEFAULT_ENCODING
289
+
290
+ # Cache the result if file_path is provided
291
+ if file_path and detected_encoding:
292
+ _encoding_cache.set(file_path, detected_encoding)
293
+ log_debug(f"Cached encoding for {file_path}: {detected_encoding}")
294
+
295
+ return detected_encoding
296
+
297
+ @classmethod
298
+ def read_file_safe(cls, file_path: str | Path) -> tuple[str, str]:
299
+ """
300
+ Safely read a file with automatic encoding detection and caching
301
+
302
+ Args:
303
+ file_path: Path to the file
304
+
305
+ Returns:
306
+ Tuple of (content, detected_encoding)
307
+ """
308
+ file_path = Path(file_path)
309
+
310
+ try:
311
+ # Read raw bytes first
312
+ with open(file_path, "rb") as f:
313
+ raw_data = f.read()
314
+
315
+ if not raw_data:
316
+ return "", cls.DEFAULT_ENCODING
317
+
318
+ # Detect and decode with file path for caching
319
+ detected_encoding = cls.detect_encoding(raw_data, str(file_path))
320
+ content = cls.safe_decode(raw_data, detected_encoding)
321
+
322
+ # Normalize line endings for consistency
323
+ content = cls.normalize_line_endings(content)
324
+
325
+ return content, detected_encoding
326
+
327
+ except OSError as e:
328
+ log_warning(f"Failed to read file {file_path}: {e}")
329
+ raise e
330
+
331
+ @classmethod
332
+ def write_file_safe(
333
+ cls, file_path: str | Path, content: str, encoding: str | None = None
334
+ ) -> bool:
335
+ """
336
+ Safely write content to a file
337
+
338
+ Args:
339
+ file_path: Path to the file
340
+ content: Content to write
341
+ encoding: Target encoding (defaults to UTF-8)
342
+
343
+ Returns:
344
+ True if successful, False otherwise
345
+ """
346
+ file_path = Path(file_path)
347
+ target_encoding = encoding or cls.DEFAULT_ENCODING
348
+
349
+ try:
350
+ encoded_content = cls.safe_encode(content, target_encoding)
351
+
352
+ with open(file_path, "wb") as f:
353
+ f.write(encoded_content)
354
+
355
+ return True
356
+
357
+ except OSError as e:
358
+ log_warning(f"Failed to write file {file_path}: {e}")
359
+ return False
360
+
361
+ @classmethod
362
+ def normalize_line_endings(cls, text: str) -> str:
363
+ """
364
+ Normalize line endings to Unix style (\n)
365
+
366
+ Args:
367
+ text: Text to normalize
368
+
369
+ Returns:
370
+ Text with normalized line endings
371
+ """
372
+ if not text:
373
+ return text
374
+
375
+ # Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
376
+ return text.replace("\r\n", "\n").replace("\r", "\n")
377
+
378
+ @classmethod
379
+ def extract_text_slice(
380
+ cls,
381
+ content_bytes: bytes,
382
+ start_byte: int,
383
+ end_byte: int,
384
+ encoding: str | None = None,
385
+ ) -> str:
386
+ """
387
+ Extract a slice of text from bytes with proper encoding handling
388
+
389
+ Args:
390
+ content_bytes: Source bytes
391
+ start_byte: Start position
392
+ end_byte: End position
393
+ encoding: Encoding to use (auto-detected if None)
394
+
395
+ Returns:
396
+ Extracted text slice
397
+ """
398
+ if not content_bytes or start_byte >= len(content_bytes):
399
+ return ""
400
+
401
+ # Ensure bounds are valid
402
+ start_byte = max(0, start_byte)
403
+ end_byte = min(len(content_bytes), end_byte)
404
+
405
+ if start_byte >= end_byte:
406
+ return ""
407
+
408
+ # Extract byte slice
409
+ byte_slice = content_bytes[start_byte:end_byte]
410
+
411
+ # Decode the slice
412
+ return cls.safe_decode(byte_slice, encoding)
413
+
414
+
415
# Convenience functions for backward compatibility


def safe_encode(text: str | None, encoding: str | None = None) -> bytes:
    """Convenience wrapper for :meth:`EncodingManager.safe_encode`.

    FIX: annotation widened to ``str | None`` -- the underlying method
    explicitly accepts None (returning ``b""``), and the old ``str``
    annotation contradicted that contract.
    """
    return EncodingManager.safe_encode(text, encoding)


def safe_decode(data: bytes, encoding: str | None = None) -> str:
    """Convenience wrapper for :meth:`EncodingManager.safe_decode`."""
    return EncodingManager.safe_decode(data, encoding)


def detect_encoding(data: bytes, file_path: str | None = None) -> str:
    """Convenience wrapper for :meth:`EncodingManager.detect_encoding`.

    Passing *file_path* enables caching of the detection result.
    """
    return EncodingManager.detect_encoding(data, file_path)


def read_file_safe(file_path: str | Path) -> tuple[str, str]:
    """Convenience wrapper for :meth:`EncodingManager.read_file_safe`."""
    return EncodingManager.read_file_safe(file_path)


def write_file_safe(
    file_path: str | Path, content: str, encoding: str | None = None
) -> bool:
    """Convenience wrapper for :meth:`EncodingManager.write_file_safe`."""
    return EncodingManager.write_file_safe(file_path, content, encoding)


def extract_text_slice(
    content_bytes: bytes, start_byte: int, end_byte: int, encoding: str | None = None
) -> str:
    """Convenience wrapper for :meth:`EncodingManager.extract_text_slice`."""
    return EncodingManager.extract_text_slice(
        content_bytes, start_byte, end_byte, encoding
    )


def clear_encoding_cache() -> None:
    """Clear the module-global encoding cache."""
    _encoding_cache.clear()


def get_encoding_cache_size() -> int:
    """Return the number of entries in the module-global encoding cache."""
    return _encoding_cache.size()