tree-sitter-analyzer 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tree-sitter-analyzer might be problematic. See the registry's advisory page for more details.

Files changed (78):
  1. tree_sitter_analyzer/__init__.py +133 -121
  2. tree_sitter_analyzer/__main__.py +11 -12
  3. tree_sitter_analyzer/api.py +531 -539
  4. tree_sitter_analyzer/cli/__init__.py +39 -39
  5. tree_sitter_analyzer/cli/__main__.py +12 -13
  6. tree_sitter_analyzer/cli/commands/__init__.py +26 -27
  7. tree_sitter_analyzer/cli/commands/advanced_command.py +88 -88
  8. tree_sitter_analyzer/cli/commands/base_command.py +160 -155
  9. tree_sitter_analyzer/cli/commands/default_command.py +18 -19
  10. tree_sitter_analyzer/cli/commands/partial_read_command.py +141 -133
  11. tree_sitter_analyzer/cli/commands/query_command.py +81 -82
  12. tree_sitter_analyzer/cli/commands/structure_command.py +138 -121
  13. tree_sitter_analyzer/cli/commands/summary_command.py +101 -93
  14. tree_sitter_analyzer/cli/commands/table_command.py +232 -233
  15. tree_sitter_analyzer/cli/info_commands.py +120 -121
  16. tree_sitter_analyzer/cli_main.py +277 -276
  17. tree_sitter_analyzer/core/__init__.py +15 -20
  18. tree_sitter_analyzer/core/analysis_engine.py +591 -574
  19. tree_sitter_analyzer/core/cache_service.py +320 -330
  20. tree_sitter_analyzer/core/engine.py +557 -560
  21. tree_sitter_analyzer/core/parser.py +293 -288
  22. tree_sitter_analyzer/core/query.py +494 -502
  23. tree_sitter_analyzer/encoding_utils.py +458 -460
  24. tree_sitter_analyzer/exceptions.py +337 -340
  25. tree_sitter_analyzer/file_handler.py +217 -222
  26. tree_sitter_analyzer/formatters/__init__.py +1 -1
  27. tree_sitter_analyzer/formatters/base_formatter.py +167 -168
  28. tree_sitter_analyzer/formatters/formatter_factory.py +78 -74
  29. tree_sitter_analyzer/formatters/java_formatter.py +287 -270
  30. tree_sitter_analyzer/formatters/python_formatter.py +255 -235
  31. tree_sitter_analyzer/interfaces/__init__.py +9 -10
  32. tree_sitter_analyzer/interfaces/cli.py +528 -557
  33. tree_sitter_analyzer/interfaces/cli_adapter.py +322 -319
  34. tree_sitter_analyzer/interfaces/mcp_adapter.py +180 -170
  35. tree_sitter_analyzer/interfaces/mcp_server.py +405 -416
  36. tree_sitter_analyzer/java_analyzer.py +218 -219
  37. tree_sitter_analyzer/language_detector.py +398 -400
  38. tree_sitter_analyzer/language_loader.py +224 -228
  39. tree_sitter_analyzer/languages/__init__.py +10 -11
  40. tree_sitter_analyzer/languages/java_plugin.py +1129 -1113
  41. tree_sitter_analyzer/languages/python_plugin.py +737 -712
  42. tree_sitter_analyzer/mcp/__init__.py +31 -32
  43. tree_sitter_analyzer/mcp/resources/__init__.py +44 -47
  44. tree_sitter_analyzer/mcp/resources/code_file_resource.py +212 -213
  45. tree_sitter_analyzer/mcp/resources/project_stats_resource.py +560 -550
  46. tree_sitter_analyzer/mcp/server.py +333 -345
  47. tree_sitter_analyzer/mcp/tools/__init__.py +30 -31
  48. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +621 -557
  49. tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +242 -245
  50. tree_sitter_analyzer/mcp/tools/base_tool.py +54 -55
  51. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +300 -302
  52. tree_sitter_analyzer/mcp/tools/table_format_tool.py +362 -359
  53. tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +543 -476
  54. tree_sitter_analyzer/mcp/utils/__init__.py +105 -106
  55. tree_sitter_analyzer/mcp/utils/error_handler.py +549 -549
  56. tree_sitter_analyzer/models.py +470 -481
  57. tree_sitter_analyzer/output_manager.py +261 -264
  58. tree_sitter_analyzer/plugins/__init__.py +333 -334
  59. tree_sitter_analyzer/plugins/base.py +477 -446
  60. tree_sitter_analyzer/plugins/java_plugin.py +608 -625
  61. tree_sitter_analyzer/plugins/javascript_plugin.py +446 -439
  62. tree_sitter_analyzer/plugins/manager.py +362 -355
  63. tree_sitter_analyzer/plugins/plugin_loader.py +85 -83
  64. tree_sitter_analyzer/plugins/python_plugin.py +606 -598
  65. tree_sitter_analyzer/plugins/registry.py +374 -366
  66. tree_sitter_analyzer/queries/__init__.py +26 -27
  67. tree_sitter_analyzer/queries/java.py +391 -394
  68. tree_sitter_analyzer/queries/javascript.py +148 -149
  69. tree_sitter_analyzer/queries/python.py +285 -286
  70. tree_sitter_analyzer/queries/typescript.py +229 -230
  71. tree_sitter_analyzer/query_loader.py +254 -260
  72. tree_sitter_analyzer/table_formatter.py +468 -448
  73. tree_sitter_analyzer/utils.py +277 -277
  74. {tree_sitter_analyzer-0.2.0.dist-info → tree_sitter_analyzer-0.3.0.dist-info}/METADATA +21 -6
  75. tree_sitter_analyzer-0.3.0.dist-info/RECORD +77 -0
  76. tree_sitter_analyzer-0.2.0.dist-info/RECORD +0 -77
  77. {tree_sitter_analyzer-0.2.0.dist-info → tree_sitter_analyzer-0.3.0.dist-info}/WHEEL +0 -0
  78. {tree_sitter_analyzer-0.2.0.dist-info → tree_sitter_analyzer-0.3.0.dist-info}/entry_points.txt +0 -0
@@ -1,460 +1,458 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Optimized Encoding Utilities Module
5
-
6
- This module provides unified encoding/decoding functionality with performance
7
- optimizations including file-based encoding caching to reduce redundant
8
- chardet.detect() calls.
9
- """
10
-
11
- import locale
12
- import os
13
- import sys
14
- import threading
15
- import time
16
- from pathlib import Path
17
- from typing import Any, Dict, Optional, Tuple, Union
18
-
19
-
20
- # Set up encoding environment early
21
- def _setup_encoding_environment() -> None:
22
- """Set up proper encoding environment"""
23
- try:
24
- os.environ["PYTHONIOENCODING"] = "utf-8"
25
- os.environ["PYTHONUTF8"] = "1"
26
-
27
- # Ensure proper stdout/stderr encoding if possible
28
- if hasattr(sys.stdout, "reconfigure"):
29
- sys.stdout.reconfigure(encoding="utf-8", errors="replace")
30
- if hasattr(sys.stderr, "reconfigure"):
31
- sys.stderr.reconfigure(encoding="utf-8", errors="replace")
32
- except Exception:
33
- pass # Ignore setup errors, use defaults
34
-
35
-
36
- # Set up environment when module is imported
37
- _setup_encoding_environment()
38
-
39
- # Try to import chardet with fallback
40
- try:
41
- import chardet
42
-
43
- CHARDET_AVAILABLE = True
44
- except ImportError:
45
- CHARDET_AVAILABLE = False
46
-
47
- # Import utilities with fallback
48
- try:
49
- from .utils import log_debug, log_warning
50
- except ImportError:
51
- # Fallback logging functions with compatible signatures
52
- def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
53
- print(f"DEBUG: {message}")
54
-
55
- def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
56
- print(f"WARNING: {message}")
57
-
58
-
59
- class EncodingCache:
60
- """Thread-safe encoding cache for file-based encoding detection optimization"""
61
-
62
- def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
63
- """
64
- Initialize encoding cache
65
-
66
- Args:
67
- max_size: Maximum number of cached entries
68
- ttl_seconds: Time-to-live for cache entries in seconds
69
- """
70
- self._cache: Dict[str, Tuple[str, float]] = (
71
- {}
72
- ) # file_path -> (encoding, timestamp)
73
- self._lock = threading.RLock()
74
- self._max_size = max_size
75
- self._ttl_seconds = ttl_seconds
76
-
77
- def get(self, file_path: str) -> Optional[str]:
78
- """
79
- Get cached encoding for file path
80
-
81
- Args:
82
- file_path: Path to the file
83
-
84
- Returns:
85
- Cached encoding or None if not found/expired
86
- """
87
- with self._lock:
88
- if file_path not in self._cache:
89
- return None
90
-
91
- encoding, timestamp = self._cache[file_path]
92
- current_time = time.time()
93
-
94
- # Check if entry has expired
95
- if current_time - timestamp > self._ttl_seconds:
96
- del self._cache[file_path]
97
- return None
98
-
99
- return encoding
100
-
101
- def set(self, file_path: str, encoding: str) -> None:
102
- """
103
- Cache encoding for file path
104
-
105
- Args:
106
- file_path: Path to the file
107
- encoding: Detected encoding
108
- """
109
- with self._lock:
110
- current_time = time.time()
111
-
112
- # Clean up expired entries if cache is getting full
113
- if len(self._cache) >= self._max_size:
114
- self._cleanup_expired()
115
-
116
- # If still full after cleanup, remove oldest entry
117
- if len(self._cache) >= self._max_size:
118
- oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
119
- del self._cache[oldest_key]
120
-
121
- self._cache[file_path] = (encoding, current_time)
122
-
123
- def _cleanup_expired(self) -> None:
124
- """Remove expired entries from cache"""
125
- current_time = time.time()
126
- expired_keys = [
127
- key
128
- for key, (_, timestamp) in self._cache.items()
129
- if current_time - timestamp > self._ttl_seconds
130
- ]
131
- for key in expired_keys:
132
- del self._cache[key]
133
-
134
- def clear(self) -> None:
135
- """Clear all cached entries"""
136
- with self._lock:
137
- self._cache.clear()
138
-
139
- def size(self) -> int:
140
- """Get current cache size"""
141
- with self._lock:
142
- return len(self._cache)
143
-
144
-
145
- # Global encoding cache instance
146
- _encoding_cache = EncodingCache()
147
-
148
-
149
- class EncodingManager:
150
- """Centralized encoding management for consistent text processing"""
151
-
152
- DEFAULT_ENCODING = "utf-8"
153
- FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]
154
-
155
- @classmethod
156
- def safe_encode(cls, text: str, encoding: Optional[str] = None) -> bytes:
157
- """
158
- Safely encode text to bytes with fallback handling
159
-
160
- Args:
161
- text: Text to encode
162
- encoding: Target encoding (defaults to UTF-8)
163
-
164
- Returns:
165
- Encoded bytes
166
- """
167
- if text is None:
168
- return b""
169
-
170
- target_encoding = encoding or cls.DEFAULT_ENCODING
171
-
172
- try:
173
- return text.encode(target_encoding)
174
- except UnicodeEncodeError as e:
175
- log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")
176
-
177
- # Try fallback encodings
178
- for fallback in cls.FALLBACK_ENCODINGS:
179
- if fallback != target_encoding:
180
- try:
181
- return text.encode(fallback, errors="replace")
182
- except UnicodeEncodeError:
183
- continue
184
-
185
- # Last resort: encode with error replacement
186
- log_warning(f"Using error replacement for encoding: {text[:50]}...")
187
- return text.encode(cls.DEFAULT_ENCODING, errors="replace")
188
-
189
- @classmethod
190
- def safe_decode(cls, data: bytes, encoding: Optional[str] = None) -> str:
191
- """
192
- Safely decode bytes to text with fallback handling
193
-
194
- Args:
195
- data: Bytes to decode
196
- encoding: Source encoding (auto-detected if None)
197
-
198
- Returns:
199
- Decoded text
200
- """
201
- if data is None or len(data) == 0:
202
- return ""
203
-
204
- # Use provided encoding or detect
205
- target_encoding = encoding
206
- if not target_encoding:
207
- target_encoding = cls.detect_encoding(data)
208
-
209
- try:
210
- return data.decode(target_encoding)
211
- except UnicodeDecodeError as e:
212
- log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")
213
-
214
- # Try fallback encodings
215
- for fallback in cls.FALLBACK_ENCODINGS:
216
- if fallback != target_encoding:
217
- try:
218
- return data.decode(fallback, errors="replace")
219
- except UnicodeDecodeError:
220
- continue
221
-
222
- # Last resort: decode with error replacement
223
- log_warning(
224
- f"Using error replacement for decoding data (length: {len(data)})"
225
- )
226
- return data.decode(cls.DEFAULT_ENCODING, errors="replace")
227
-
228
- @classmethod
229
- def detect_encoding(cls, data: bytes, file_path: Optional[str] = None) -> str:
230
- """
231
- Detect encoding of byte data with optional file-based caching
232
-
233
- Args:
234
- data: Bytes to analyze
235
- file_path: Optional file path for caching (improves performance)
236
-
237
- Returns:
238
- Detected encoding name
239
- """
240
- if not data:
241
- return cls.DEFAULT_ENCODING
242
-
243
- # Check cache first if file_path is provided
244
- if file_path:
245
- cached_encoding = _encoding_cache.get(file_path)
246
- if cached_encoding:
247
- log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
248
- return cached_encoding
249
-
250
- detected_encoding = cls.DEFAULT_ENCODING
251
-
252
- # If chardet is not available, use simple heuristics
253
- if not CHARDET_AVAILABLE:
254
- try:
255
- # Try UTF-8 first
256
- data.decode("utf-8")
257
- detected_encoding = "utf-8"
258
- except UnicodeDecodeError:
259
- # Check for BOM
260
- if data.startswith(b"\xff\xfe"):
261
- detected_encoding = "utf-16-le"
262
- elif data.startswith(b"\xfe\xff"):
263
- detected_encoding = "utf-16-be"
264
- elif data.startswith(b"\xef\xbb\xbf"):
265
- detected_encoding = "utf-8-sig"
266
- else:
267
- detected_encoding = cls.DEFAULT_ENCODING
268
- else:
269
- try:
270
- # Use chardet for detection
271
- detection = chardet.detect(data)
272
- if detection and detection["encoding"]:
273
- confidence = detection.get("confidence", 0)
274
- detected_encoding = detection["encoding"].lower()
275
-
276
- # Only trust high-confidence detections
277
- if confidence > 0.7:
278
- log_debug(
279
- f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})"
280
- )
281
- else:
282
- log_debug(
283
- f"Low confidence encoding detection: {detected_encoding} (confidence: {confidence:.2f}), using default"
284
- )
285
- detected_encoding = cls.DEFAULT_ENCODING
286
-
287
- except Exception as e:
288
- log_debug(f"Encoding detection failed: {e}")
289
- detected_encoding = cls.DEFAULT_ENCODING
290
-
291
- # Cache the result if file_path is provided
292
- if file_path and detected_encoding:
293
- _encoding_cache.set(file_path, detected_encoding)
294
- log_debug(f"Cached encoding for {file_path}: {detected_encoding}")
295
-
296
- return detected_encoding
297
-
298
- @classmethod
299
- def read_file_safe(cls, file_path: Union[str, Path]) -> Tuple[str, str]:
300
- """
301
- Safely read a file with automatic encoding detection and caching
302
-
303
- Args:
304
- file_path: Path to the file
305
-
306
- Returns:
307
- Tuple of (content, detected_encoding)
308
- """
309
- file_path = Path(file_path)
310
-
311
- try:
312
- # Read raw bytes first
313
- with open(file_path, "rb") as f:
314
- raw_data = f.read()
315
-
316
- if not raw_data:
317
- return "", cls.DEFAULT_ENCODING
318
-
319
- # Detect and decode with file path for caching
320
- detected_encoding = cls.detect_encoding(raw_data, str(file_path))
321
- content = cls.safe_decode(raw_data, detected_encoding)
322
-
323
- # Normalize line endings for consistency
324
- content = cls.normalize_line_endings(content)
325
-
326
- return content, detected_encoding
327
-
328
- except IOError as e:
329
- log_warning(f"Failed to read file {file_path}: {e}")
330
- raise e
331
-
332
- @classmethod
333
- def write_file_safe(
334
- cls, file_path: Union[str, Path], content: str, encoding: Optional[str] = None
335
- ) -> bool:
336
- """
337
- Safely write content to a file
338
-
339
- Args:
340
- file_path: Path to the file
341
- content: Content to write
342
- encoding: Target encoding (defaults to UTF-8)
343
-
344
- Returns:
345
- True if successful, False otherwise
346
- """
347
- file_path = Path(file_path)
348
- target_encoding = encoding or cls.DEFAULT_ENCODING
349
-
350
- try:
351
- encoded_content = cls.safe_encode(content, target_encoding)
352
-
353
- with open(file_path, "wb") as f:
354
- f.write(encoded_content)
355
-
356
- return True
357
-
358
- except IOError as e:
359
- log_warning(f"Failed to write file {file_path}: {e}")
360
- return False
361
-
362
- @classmethod
363
- def normalize_line_endings(cls, text: str) -> str:
364
- """
365
- Normalize line endings to Unix style (\n)
366
-
367
- Args:
368
- text: Text to normalize
369
-
370
- Returns:
371
- Text with normalized line endings
372
- """
373
- if not text:
374
- return text
375
-
376
- # Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
377
- return text.replace("\r\n", "\n").replace("\r", "\n")
378
-
379
- @classmethod
380
- def extract_text_slice(
381
- cls,
382
- content_bytes: bytes,
383
- start_byte: int,
384
- end_byte: int,
385
- encoding: Optional[str] = None,
386
- ) -> str:
387
- """
388
- Extract a slice of text from bytes with proper encoding handling
389
-
390
- Args:
391
- content_bytes: Source bytes
392
- start_byte: Start position
393
- end_byte: End position
394
- encoding: Encoding to use (auto-detected if None)
395
-
396
- Returns:
397
- Extracted text slice
398
- """
399
- if not content_bytes or start_byte >= len(content_bytes):
400
- return ""
401
-
402
- # Ensure bounds are valid
403
- start_byte = max(0, start_byte)
404
- end_byte = min(len(content_bytes), end_byte)
405
-
406
- if start_byte >= end_byte:
407
- return ""
408
-
409
- # Extract byte slice
410
- byte_slice = content_bytes[start_byte:end_byte]
411
-
412
- # Decode the slice
413
- return cls.safe_decode(byte_slice, encoding)
414
-
415
-
416
- # Convenience functions for backward compatibility
417
- def safe_encode(text: str, encoding: Optional[str] = None) -> bytes:
418
- """Convenience function for safe encoding"""
419
- return EncodingManager.safe_encode(text, encoding)
420
-
421
-
422
- def safe_decode(data: bytes, encoding: Optional[str] = None) -> str:
423
- """Convenience function for safe decoding"""
424
- return EncodingManager.safe_decode(data, encoding)
425
-
426
-
427
- def detect_encoding(data: bytes, file_path: Optional[str] = None) -> str:
428
- """Convenience function for encoding detection with optional caching"""
429
- return EncodingManager.detect_encoding(data, file_path)
430
-
431
-
432
- def read_file_safe(file_path: Union[str, Path]) -> Tuple[str, str]:
433
- """Convenience function for safe file reading"""
434
- return EncodingManager.read_file_safe(file_path)
435
-
436
-
437
- def write_file_safe(
438
- file_path: Union[str, Path], content: str, encoding: Optional[str] = None
439
- ) -> bool:
440
- """Convenience function for safe file writing"""
441
- return EncodingManager.write_file_safe(file_path, content, encoding)
442
-
443
-
444
- def extract_text_slice(
445
- content_bytes: bytes, start_byte: int, end_byte: int, encoding: Optional[str] = None
446
- ) -> str:
447
- """Convenience function for text slice extraction"""
448
- return EncodingManager.extract_text_slice(
449
- content_bytes, start_byte, end_byte, encoding
450
- )
451
-
452
-
453
- def clear_encoding_cache() -> None:
454
- """Clear the global encoding cache"""
455
- _encoding_cache.clear()
456
-
457
-
458
- def get_encoding_cache_size() -> int:
459
- """Get the current size of the encoding cache"""
460
- return _encoding_cache.size()
1
+ #!/usr/bin/env python3
2
+ """
3
+ Optimized Encoding Utilities Module
4
+
5
+ This module provides unified encoding/decoding functionality with performance
6
+ optimizations including file-based encoding caching to reduce redundant
7
+ chardet.detect() calls.
8
+ """
9
+
10
+ import os
11
+ import sys
12
+ import threading
13
+ import time
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+
18
+ # Set up encoding environment early
19
+ def _setup_encoding_environment() -> None:
20
+ """Set up proper encoding environment"""
21
+ try:
22
+ os.environ["PYTHONIOENCODING"] = "utf-8"
23
+ os.environ["PYTHONUTF8"] = "1"
24
+
25
+ # Ensure proper stdout/stderr encoding if possible
26
+ if hasattr(sys.stdout, "reconfigure"):
27
+ sys.stdout.reconfigure(encoding="utf-8", errors="replace")
28
+ if hasattr(sys.stderr, "reconfigure"):
29
+ sys.stderr.reconfigure(encoding="utf-8", errors="replace")
30
+ except Exception:
31
+ pass # Ignore setup errors, use defaults
32
+
33
+
34
+ # Set up environment when module is imported
35
+ _setup_encoding_environment()
36
+
37
+ # Try to import chardet with fallback
38
+ try:
39
+ import chardet
40
+
41
+ CHARDET_AVAILABLE = True
42
+ except ImportError:
43
+ CHARDET_AVAILABLE = False
44
+
45
+ # Import utilities with fallback
46
+ try:
47
+ from .utils import log_debug, log_warning
48
+ except ImportError:
49
+ # Fallback logging functions with compatible signatures
50
+ def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
51
+ print(f"DEBUG: {message}")
52
+
53
+ def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
54
+ print(f"WARNING: {message}")
55
+
56
+
57
+ class EncodingCache:
58
+ """Thread-safe encoding cache for file-based encoding detection optimization"""
59
+
60
+ def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
61
+ """
62
+ Initialize encoding cache
63
+
64
+ Args:
65
+ max_size: Maximum number of cached entries
66
+ ttl_seconds: Time-to-live for cache entries in seconds
67
+ """
68
+ self._cache: dict[
69
+ str, tuple[str, float]
70
+ ] = {} # file_path -> (encoding, timestamp)
71
+ self._lock = threading.RLock()
72
+ self._max_size = max_size
73
+ self._ttl_seconds = ttl_seconds
74
+
75
+ def get(self, file_path: str) -> str | None:
76
+ """
77
+ Get cached encoding for file path
78
+
79
+ Args:
80
+ file_path: Path to the file
81
+
82
+ Returns:
83
+ Cached encoding or None if not found/expired
84
+ """
85
+ with self._lock:
86
+ if file_path not in self._cache:
87
+ return None
88
+
89
+ encoding, timestamp = self._cache[file_path]
90
+ current_time = time.time()
91
+
92
+ # Check if entry has expired
93
+ if current_time - timestamp > self._ttl_seconds:
94
+ del self._cache[file_path]
95
+ return None
96
+
97
+ return encoding
98
+
99
+ def set(self, file_path: str, encoding: str) -> None:
100
+ """
101
+ Cache encoding for file path
102
+
103
+ Args:
104
+ file_path: Path to the file
105
+ encoding: Detected encoding
106
+ """
107
+ with self._lock:
108
+ current_time = time.time()
109
+
110
+ # Clean up expired entries if cache is getting full
111
+ if len(self._cache) >= self._max_size:
112
+ self._cleanup_expired()
113
+
114
+ # If still full after cleanup, remove oldest entry
115
+ if len(self._cache) >= self._max_size:
116
+ oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
117
+ del self._cache[oldest_key]
118
+
119
+ self._cache[file_path] = (encoding, current_time)
120
+
121
+ def _cleanup_expired(self) -> None:
122
+ """Remove expired entries from cache"""
123
+ current_time = time.time()
124
+ expired_keys = [
125
+ key
126
+ for key, (_, timestamp) in self._cache.items()
127
+ if current_time - timestamp > self._ttl_seconds
128
+ ]
129
+ for key in expired_keys:
130
+ del self._cache[key]
131
+
132
+ def clear(self) -> None:
133
+ """Clear all cached entries"""
134
+ with self._lock:
135
+ self._cache.clear()
136
+
137
+ def size(self) -> int:
138
+ """Get current cache size"""
139
+ with self._lock:
140
+ return len(self._cache)
141
+
142
+
143
+ # Global encoding cache instance
144
+ _encoding_cache = EncodingCache()
145
+
146
+
147
+ class EncodingManager:
148
+ """Centralized encoding management for consistent text processing"""
149
+
150
+ DEFAULT_ENCODING = "utf-8"
151
+ FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]
152
+
153
+ @classmethod
154
+ def safe_encode(cls, text: str, encoding: str | None = None) -> bytes:
155
+ """
156
+ Safely encode text to bytes with fallback handling
157
+
158
+ Args:
159
+ text: Text to encode
160
+ encoding: Target encoding (defaults to UTF-8)
161
+
162
+ Returns:
163
+ Encoded bytes
164
+ """
165
+ if text is None:
166
+ return b""
167
+
168
+ target_encoding = encoding or cls.DEFAULT_ENCODING
169
+
170
+ try:
171
+ return text.encode(target_encoding)
172
+ except UnicodeEncodeError as e:
173
+ log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")
174
+
175
+ # Try fallback encodings
176
+ for fallback in cls.FALLBACK_ENCODINGS:
177
+ if fallback != target_encoding:
178
+ try:
179
+ return text.encode(fallback, errors="replace")
180
+ except UnicodeEncodeError:
181
+ continue
182
+
183
+ # Last resort: encode with error replacement
184
+ log_warning(f"Using error replacement for encoding: {text[:50]}...")
185
+ return text.encode(cls.DEFAULT_ENCODING, errors="replace")
186
+
187
+ @classmethod
188
+ def safe_decode(cls, data: bytes, encoding: str | None = None) -> str:
189
+ """
190
+ Safely decode bytes to text with fallback handling
191
+
192
+ Args:
193
+ data: Bytes to decode
194
+ encoding: Source encoding (auto-detected if None)
195
+
196
+ Returns:
197
+ Decoded text
198
+ """
199
+ if data is None or len(data) == 0:
200
+ return ""
201
+
202
+ # Use provided encoding or detect
203
+ target_encoding = encoding
204
+ if not target_encoding:
205
+ target_encoding = cls.detect_encoding(data)
206
+
207
+ try:
208
+ return data.decode(target_encoding)
209
+ except UnicodeDecodeError as e:
210
+ log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")
211
+
212
+ # Try fallback encodings
213
+ for fallback in cls.FALLBACK_ENCODINGS:
214
+ if fallback != target_encoding:
215
+ try:
216
+ return data.decode(fallback, errors="replace")
217
+ except UnicodeDecodeError:
218
+ continue
219
+
220
+ # Last resort: decode with error replacement
221
+ log_warning(
222
+ f"Using error replacement for decoding data (length: {len(data)})"
223
+ )
224
+ return data.decode(cls.DEFAULT_ENCODING, errors="replace")
225
+
226
+ @classmethod
227
+ def detect_encoding(cls, data: bytes, file_path: str | None = None) -> str:
228
+ """
229
+ Detect encoding of byte data with optional file-based caching
230
+
231
+ Args:
232
+ data: Bytes to analyze
233
+ file_path: Optional file path for caching (improves performance)
234
+
235
+ Returns:
236
+ Detected encoding name
237
+ """
238
+ if not data:
239
+ return cls.DEFAULT_ENCODING
240
+
241
+ # Check cache first if file_path is provided
242
+ if file_path:
243
+ cached_encoding = _encoding_cache.get(file_path)
244
+ if cached_encoding:
245
+ log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
246
+ return cached_encoding
247
+
248
+ detected_encoding = cls.DEFAULT_ENCODING
249
+
250
+ # If chardet is not available, use simple heuristics
251
+ if not CHARDET_AVAILABLE:
252
+ try:
253
+ # Try UTF-8 first
254
+ data.decode("utf-8")
255
+ detected_encoding = "utf-8"
256
+ except UnicodeDecodeError:
257
+ # Check for BOM
258
+ if data.startswith(b"\xff\xfe"):
259
+ detected_encoding = "utf-16-le"
260
+ elif data.startswith(b"\xfe\xff"):
261
+ detected_encoding = "utf-16-be"
262
+ elif data.startswith(b"\xef\xbb\xbf"):
263
+ detected_encoding = "utf-8-sig"
264
+ else:
265
+ detected_encoding = cls.DEFAULT_ENCODING
266
+ else:
267
+ try:
268
+ # Use chardet for detection
269
+ detection = chardet.detect(data)
270
+ if detection and detection["encoding"]:
271
+ confidence = detection.get("confidence", 0)
272
+ detected_encoding = detection["encoding"].lower()
273
+
274
+ # Only trust high-confidence detections
275
+ if confidence > 0.7:
276
+ log_debug(
277
+ f"Detected encoding: {detected_encoding} (confidence: {confidence:.2f})"
278
+ )
279
+ else:
280
+ log_debug(
281
+ f"Low confidence encoding detection: {detected_encoding} (confidence: {confidence:.2f}), using default"
282
+ )
283
+ detected_encoding = cls.DEFAULT_ENCODING
284
+
285
+ except Exception as e:
286
+ log_debug(f"Encoding detection failed: {e}")
287
+ detected_encoding = cls.DEFAULT_ENCODING
288
+
289
+ # Cache the result if file_path is provided
290
+ if file_path and detected_encoding:
291
+ _encoding_cache.set(file_path, detected_encoding)
292
+ log_debug(f"Cached encoding for {file_path}: {detected_encoding}")
293
+
294
+ return detected_encoding
295
+
296
+ @classmethod
297
+ def read_file_safe(cls, file_path: str | Path) -> tuple[str, str]:
298
+ """
299
+ Safely read a file with automatic encoding detection and caching
300
+
301
+ Args:
302
+ file_path: Path to the file
303
+
304
+ Returns:
305
+ Tuple of (content, detected_encoding)
306
+ """
307
+ file_path = Path(file_path)
308
+
309
+ try:
310
+ # Read raw bytes first
311
+ with open(file_path, "rb") as f:
312
+ raw_data = f.read()
313
+
314
+ if not raw_data:
315
+ return "", cls.DEFAULT_ENCODING
316
+
317
+ # Detect and decode with file path for caching
318
+ detected_encoding = cls.detect_encoding(raw_data, str(file_path))
319
+ content = cls.safe_decode(raw_data, detected_encoding)
320
+
321
+ # Normalize line endings for consistency
322
+ content = cls.normalize_line_endings(content)
323
+
324
+ return content, detected_encoding
325
+
326
+ except OSError as e:
327
+ log_warning(f"Failed to read file {file_path}: {e}")
328
+ raise e
329
+
330
+ @classmethod
331
+ def write_file_safe(
332
+ cls, file_path: str | Path, content: str, encoding: str | None = None
333
+ ) -> bool:
334
+ """
335
+ Safely write content to a file
336
+
337
+ Args:
338
+ file_path: Path to the file
339
+ content: Content to write
340
+ encoding: Target encoding (defaults to UTF-8)
341
+
342
+ Returns:
343
+ True if successful, False otherwise
344
+ """
345
+ file_path = Path(file_path)
346
+ target_encoding = encoding or cls.DEFAULT_ENCODING
347
+
348
+ try:
349
+ encoded_content = cls.safe_encode(content, target_encoding)
350
+
351
+ with open(file_path, "wb") as f:
352
+ f.write(encoded_content)
353
+
354
+ return True
355
+
356
+ except OSError as e:
357
+ log_warning(f"Failed to write file {file_path}: {e}")
358
+ return False
359
+
360
+ @classmethod
361
+ def normalize_line_endings(cls, text: str) -> str:
362
+ """
363
+ Normalize line endings to Unix style (\n)
364
+
365
+ Args:
366
+ text: Text to normalize
367
+
368
+ Returns:
369
+ Text with normalized line endings
370
+ """
371
+ if not text:
372
+ return text
373
+
374
+ # Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
375
+ return text.replace("\r\n", "\n").replace("\r", "\n")
376
+
377
+ @classmethod
378
+ def extract_text_slice(
379
+ cls,
380
+ content_bytes: bytes,
381
+ start_byte: int,
382
+ end_byte: int,
383
+ encoding: str | None = None,
384
+ ) -> str:
385
+ """
386
+ Extract a slice of text from bytes with proper encoding handling
387
+
388
+ Args:
389
+ content_bytes: Source bytes
390
+ start_byte: Start position
391
+ end_byte: End position
392
+ encoding: Encoding to use (auto-detected if None)
393
+
394
+ Returns:
395
+ Extracted text slice
396
+ """
397
+ if not content_bytes or start_byte >= len(content_bytes):
398
+ return ""
399
+
400
+ # Ensure bounds are valid
401
+ start_byte = max(0, start_byte)
402
+ end_byte = min(len(content_bytes), end_byte)
403
+
404
+ if start_byte >= end_byte:
405
+ return ""
406
+
407
+ # Extract byte slice
408
+ byte_slice = content_bytes[start_byte:end_byte]
409
+
410
+ # Decode the slice
411
+ return cls.safe_decode(byte_slice, encoding)
412
+
413
+
414
+ # Convenience functions for backward compatibility
415
+ def safe_encode(text: str, encoding: str | None = None) -> bytes:
416
+ """Convenience function for safe encoding"""
417
+ return EncodingManager.safe_encode(text, encoding)
418
+
419
+
420
+ def safe_decode(data: bytes, encoding: str | None = None) -> str:
421
+ """Convenience function for safe decoding"""
422
+ return EncodingManager.safe_decode(data, encoding)
423
+
424
+
425
+ def detect_encoding(data: bytes, file_path: str | None = None) -> str:
426
+ """Convenience function for encoding detection with optional caching"""
427
+ return EncodingManager.detect_encoding(data, file_path)
428
+
429
+
430
+ def read_file_safe(file_path: str | Path) -> tuple[str, str]:
431
+ """Convenience function for safe file reading"""
432
+ return EncodingManager.read_file_safe(file_path)
433
+
434
+
435
+ def write_file_safe(
436
+ file_path: str | Path, content: str, encoding: str | None = None
437
+ ) -> bool:
438
+ """Convenience function for safe file writing"""
439
+ return EncodingManager.write_file_safe(file_path, content, encoding)
440
+
441
+
442
+ def extract_text_slice(
443
+ content_bytes: bytes, start_byte: int, end_byte: int, encoding: str | None = None
444
+ ) -> str:
445
+ """Convenience function for text slice extraction"""
446
+ return EncodingManager.extract_text_slice(
447
+ content_bytes, start_byte, end_byte, encoding
448
+ )
449
+
450
+
451
+ def clear_encoding_cache() -> None:
452
+ """Clear the global encoding cache"""
453
+ _encoding_cache.clear()
454
+
455
+
456
+ def get_encoding_cache_size() -> int:
457
+ """Get the current size of the encoding cache"""
458
+ return _encoding_cache.size()