tree-sitter-analyzer 1.9.17.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. tree_sitter_analyzer/__init__.py +132 -0
  2. tree_sitter_analyzer/__main__.py +11 -0
  3. tree_sitter_analyzer/api.py +853 -0
  4. tree_sitter_analyzer/cli/__init__.py +39 -0
  5. tree_sitter_analyzer/cli/__main__.py +12 -0
  6. tree_sitter_analyzer/cli/argument_validator.py +89 -0
  7. tree_sitter_analyzer/cli/commands/__init__.py +26 -0
  8. tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
  9. tree_sitter_analyzer/cli/commands/base_command.py +181 -0
  10. tree_sitter_analyzer/cli/commands/default_command.py +18 -0
  11. tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
  12. tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
  13. tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
  14. tree_sitter_analyzer/cli/commands/query_command.py +109 -0
  15. tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
  16. tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
  17. tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
  18. tree_sitter_analyzer/cli/commands/table_command.py +414 -0
  19. tree_sitter_analyzer/cli/info_commands.py +124 -0
  20. tree_sitter_analyzer/cli_main.py +472 -0
  21. tree_sitter_analyzer/constants.py +85 -0
  22. tree_sitter_analyzer/core/__init__.py +15 -0
  23. tree_sitter_analyzer/core/analysis_engine.py +580 -0
  24. tree_sitter_analyzer/core/cache_service.py +333 -0
  25. tree_sitter_analyzer/core/engine.py +585 -0
  26. tree_sitter_analyzer/core/parser.py +293 -0
  27. tree_sitter_analyzer/core/query.py +605 -0
  28. tree_sitter_analyzer/core/query_filter.py +200 -0
  29. tree_sitter_analyzer/core/query_service.py +340 -0
  30. tree_sitter_analyzer/encoding_utils.py +530 -0
  31. tree_sitter_analyzer/exceptions.py +747 -0
  32. tree_sitter_analyzer/file_handler.py +246 -0
  33. tree_sitter_analyzer/formatters/__init__.py +1 -0
  34. tree_sitter_analyzer/formatters/base_formatter.py +201 -0
  35. tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
  36. tree_sitter_analyzer/formatters/formatter_config.py +197 -0
  37. tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
  38. tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
  39. tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
  40. tree_sitter_analyzer/formatters/go_formatter.py +368 -0
  41. tree_sitter_analyzer/formatters/html_formatter.py +498 -0
  42. tree_sitter_analyzer/formatters/java_formatter.py +423 -0
  43. tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
  44. tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
  45. tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
  46. tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
  47. tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
  48. tree_sitter_analyzer/formatters/php_formatter.py +301 -0
  49. tree_sitter_analyzer/formatters/python_formatter.py +830 -0
  50. tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
  51. tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
  52. tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
  53. tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
  54. tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
  55. tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
  56. tree_sitter_analyzer/interfaces/__init__.py +9 -0
  57. tree_sitter_analyzer/interfaces/cli.py +535 -0
  58. tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
  59. tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
  60. tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
  61. tree_sitter_analyzer/language_detector.py +553 -0
  62. tree_sitter_analyzer/language_loader.py +271 -0
  63. tree_sitter_analyzer/languages/__init__.py +10 -0
  64. tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
  65. tree_sitter_analyzer/languages/css_plugin.py +449 -0
  66. tree_sitter_analyzer/languages/go_plugin.py +836 -0
  67. tree_sitter_analyzer/languages/html_plugin.py +496 -0
  68. tree_sitter_analyzer/languages/java_plugin.py +1299 -0
  69. tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
  70. tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
  71. tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
  72. tree_sitter_analyzer/languages/php_plugin.py +862 -0
  73. tree_sitter_analyzer/languages/python_plugin.py +1636 -0
  74. tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
  75. tree_sitter_analyzer/languages/rust_plugin.py +673 -0
  76. tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
  77. tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
  78. tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
  79. tree_sitter_analyzer/legacy_table_formatter.py +860 -0
  80. tree_sitter_analyzer/mcp/__init__.py +34 -0
  81. tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
  82. tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
  83. tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
  84. tree_sitter_analyzer/mcp/server.py +869 -0
  85. tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
  86. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
  87. tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
  88. tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
  89. tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
  90. tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
  91. tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
  92. tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
  93. tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
  94. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
  95. tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
  96. tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
  97. tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
  98. tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
  99. tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
  100. tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
  101. tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
  102. tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
  103. tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
  104. tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
  105. tree_sitter_analyzer/models.py +840 -0
  106. tree_sitter_analyzer/mypy_current_errors.txt +2 -0
  107. tree_sitter_analyzer/output_manager.py +255 -0
  108. tree_sitter_analyzer/platform_compat/__init__.py +3 -0
  109. tree_sitter_analyzer/platform_compat/adapter.py +324 -0
  110. tree_sitter_analyzer/platform_compat/compare.py +224 -0
  111. tree_sitter_analyzer/platform_compat/detector.py +67 -0
  112. tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
  113. tree_sitter_analyzer/platform_compat/profiles.py +217 -0
  114. tree_sitter_analyzer/platform_compat/record.py +55 -0
  115. tree_sitter_analyzer/platform_compat/recorder.py +155 -0
  116. tree_sitter_analyzer/platform_compat/report.py +92 -0
  117. tree_sitter_analyzer/plugins/__init__.py +280 -0
  118. tree_sitter_analyzer/plugins/base.py +647 -0
  119. tree_sitter_analyzer/plugins/manager.py +384 -0
  120. tree_sitter_analyzer/project_detector.py +328 -0
  121. tree_sitter_analyzer/queries/__init__.py +27 -0
  122. tree_sitter_analyzer/queries/csharp.py +216 -0
  123. tree_sitter_analyzer/queries/css.py +615 -0
  124. tree_sitter_analyzer/queries/go.py +275 -0
  125. tree_sitter_analyzer/queries/html.py +543 -0
  126. tree_sitter_analyzer/queries/java.py +402 -0
  127. tree_sitter_analyzer/queries/javascript.py +724 -0
  128. tree_sitter_analyzer/queries/kotlin.py +192 -0
  129. tree_sitter_analyzer/queries/markdown.py +258 -0
  130. tree_sitter_analyzer/queries/php.py +95 -0
  131. tree_sitter_analyzer/queries/python.py +859 -0
  132. tree_sitter_analyzer/queries/ruby.py +92 -0
  133. tree_sitter_analyzer/queries/rust.py +223 -0
  134. tree_sitter_analyzer/queries/sql.py +555 -0
  135. tree_sitter_analyzer/queries/typescript.py +871 -0
  136. tree_sitter_analyzer/queries/yaml.py +236 -0
  137. tree_sitter_analyzer/query_loader.py +272 -0
  138. tree_sitter_analyzer/security/__init__.py +22 -0
  139. tree_sitter_analyzer/security/boundary_manager.py +277 -0
  140. tree_sitter_analyzer/security/regex_checker.py +297 -0
  141. tree_sitter_analyzer/security/validator.py +599 -0
  142. tree_sitter_analyzer/table_formatter.py +782 -0
  143. tree_sitter_analyzer/utils/__init__.py +53 -0
  144. tree_sitter_analyzer/utils/logging.py +433 -0
  145. tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
  146. tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
  147. tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
  148. tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
  149. tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
tree_sitter_analyzer/encoding_utils.py
@@ -0,0 +1,530 @@
#!/usr/bin/env python3
"""
Optimized Encoding Utilities Module

This module provides unified encoding/decoding functionality with performance
optimizations including file-based encoding caching to reduce redundant
chardet.detect() calls.
"""

import os
import sys
import threading
import time
from pathlib import Path
from typing import Any


# Set up encoding environment early
def _setup_encoding_environment() -> None:
    """Set up proper encoding environment"""
    try:
        os.environ["PYTHONIOENCODING"] = "utf-8"
        os.environ["PYTHONUTF8"] = "1"

        # Ensure proper stdout/stderr encoding if possible
        if hasattr(sys.stdout, "reconfigure"):
            sys.stdout.reconfigure(encoding="utf-8", errors="replace")
        if hasattr(sys.stderr, "reconfigure"):
            sys.stderr.reconfigure(encoding="utf-8", errors="replace")
    except Exception as e:
        # Ignore setup errors, use defaults; log at debug when possible
        msg = f"[encoding_setup] non-fatal setup error: {e}\n"
        if hasattr(sys, "stderr") and hasattr(sys.stderr, "write"):
            try:
                sys.stderr.write(msg)
            except Exception:
                # Swallow secondary I/O errors intentionally
                ...


# Set up environment when module is imported
_setup_encoding_environment()

# Try to import chardet with fallback
try:
    import chardet

    CHARDET_AVAILABLE = True
except ImportError:
    CHARDET_AVAILABLE = False

# Import utilities with fallback
try:
    from .utils import log_debug, log_warning
except ImportError:
    # Fallback logging functions with compatible signatures
    def log_debug(message: str, *args: Any, **kwargs: Any) -> None:
        print(f"DEBUG: {message}")

    def log_warning(message: str, *args: Any, **kwargs: Any) -> None:
        print(f"WARNING: {message}")


class EncodingCache:
    """Thread-safe encoding cache for file-based encoding detection optimization"""

    def __init__(self, max_size: int = 1000, ttl_seconds: int = 3600):
        """
        Initialize encoding cache

        Args:
            max_size: Maximum number of cached entries
            ttl_seconds: Time-to-live for cache entries in seconds
        """
        self._cache: dict[
            str, tuple[str, float]
        ] = {}  # file_path -> (encoding, timestamp)
        self._lock = threading.RLock()
        self._max_size = max_size
        self._ttl_seconds = ttl_seconds

    def get(self, file_path: str) -> str | None:
        """
        Get cached encoding for file path

        Args:
            file_path: Path to the file

        Returns:
            Cached encoding or None if not found/expired
        """
        with self._lock:
            if file_path not in self._cache:
                return None

            encoding, timestamp = self._cache[file_path]
            current_time = time.time()

            # Check if entry has expired
            if current_time - timestamp > self._ttl_seconds:
                del self._cache[file_path]
                return None

            return encoding

    def set(self, file_path: str, encoding: str) -> None:
        """
        Cache encoding for file path

        Args:
            file_path: Path to the file
            encoding: Detected encoding
        """
        with self._lock:
            current_time = time.time()

            # Clean up expired entries if cache is getting full
            if len(self._cache) >= self._max_size:
                self._cleanup_expired()

            # If still full after cleanup, remove oldest entry
            if len(self._cache) >= self._max_size:
                oldest_key = min(self._cache.keys(), key=lambda k: self._cache[k][1])
                del self._cache[oldest_key]

            self._cache[file_path] = (encoding, current_time)

    def _cleanup_expired(self) -> None:
        """Remove expired entries from cache"""
        current_time = time.time()
        expired_keys = [
            key
            for key, (_, timestamp) in self._cache.items()
            if current_time - timestamp > self._ttl_seconds
        ]
        for key in expired_keys:
            del self._cache[key]

    def clear(self) -> None:
        """Clear all cached entries"""
        with self._lock:
            self._cache.clear()

    def size(self) -> int:
        """Get current cache size"""
        with self._lock:
            return len(self._cache)


# Global encoding cache instance
_encoding_cache = EncodingCache()

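The cache above is what detect_encoding consults before falling back to chardet. A minimal usage sketch (not part of the packaged file) of its size-capped, TTL-based behavior; the file names are made up:

# Illustrative usage only -- not part of encoding_utils.py.
from tree_sitter_analyzer.encoding_utils import EncodingCache

cache = EncodingCache(max_size=2, ttl_seconds=3600)
cache.set("a.py", "utf-8")
cache.set("b.py", "shift_jis")
print(cache.get("a.py"))   # "utf-8" while the entry is fresh
print(cache.get("c.py"))   # None: never cached

# Inserting a third entry triggers cleanup and evicts the oldest one.
cache.set("c.py", "utf-8")
print(cache.size())        # 2 (capped at max_size)
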
class EncodingManager:
    """Centralized encoding management for consistent text processing"""

    DEFAULT_ENCODING = "utf-8"
    FALLBACK_ENCODINGS = ["utf-8", "cp1252", "iso-8859-1", "shift_jis", "gbk"]

    @classmethod
    def safe_encode(cls, text: str | None, encoding: str | None = None) -> bytes:
        """
        Safely encode text to bytes with fallback handling

        Args:
            text: Text to encode (can be None)
            encoding: Target encoding (defaults to UTF-8)

        Returns:
            Encoded bytes
        """
        # Handle None input
        if text is None:
            return b""

        target_encoding = encoding or cls.DEFAULT_ENCODING

        try:
            return text.encode(target_encoding)
        except UnicodeEncodeError as e:
            log_debug(f"Failed to encode with {target_encoding}, trying fallbacks: {e}")

            # Try fallback encodings
            for fallback in cls.FALLBACK_ENCODINGS:
                if fallback != target_encoding:
                    try:
                        return text.encode(fallback, errors="replace")
                    except UnicodeEncodeError:
                        continue

            # Last resort: encode with error replacement
            log_warning(f"Using error replacement for encoding: {text[:50]}...")
            return text.encode(cls.DEFAULT_ENCODING, errors="replace")

    @classmethod
    def safe_decode(cls, data: bytes, encoding: str | None = None) -> str:
        """
        Safely decode bytes to text with fallback handling

        Args:
            data: Bytes to decode
            encoding: Source encoding (auto-detected if None)

        Returns:
            Decoded text
        """
        if data is None or len(data) == 0:
            return ""

        # Use provided encoding or detect
        target_encoding = encoding
        if not target_encoding:
            target_encoding = cls.detect_encoding(data)

        try:
            return data.decode(target_encoding)
        except UnicodeDecodeError as e:
            log_debug(f"Failed to decode with {target_encoding}, trying fallbacks: {e}")

            # Try fallback encodings
            for fallback in cls.FALLBACK_ENCODINGS:
                if fallback != target_encoding:
                    try:
                        return data.decode(fallback, errors="replace")
                    except UnicodeDecodeError:
                        continue

            # Last resort: decode with error replacement
            log_warning(
                f"Using error replacement for decoding data (length: {len(data)})"
            )
            return data.decode(cls.DEFAULT_ENCODING, errors="replace")

    @classmethod
    def detect_encoding(cls, data: bytes, file_path: str | None = None) -> str:
        """
        Detect encoding of byte data with optional file-based caching

        Args:
            data: Bytes to analyze
            file_path: Optional file path for caching (improves performance)

        Returns:
            Detected encoding name
        """
        if not data:
            return cls.DEFAULT_ENCODING

        # Check cache first if file_path is provided
        if file_path:
            cached_encoding = _encoding_cache.get(file_path)
            if cached_encoding:
                log_debug(f"Using cached encoding for {file_path}: {cached_encoding}")
                return cached_encoding

        detected_encoding = cls.DEFAULT_ENCODING

        # If chardet is not available, use simple heuristics
        if not CHARDET_AVAILABLE:
            try:
                # Try UTF-8 first
                data.decode("utf-8")
                detected_encoding = "utf-8"
            except UnicodeDecodeError:
                # Check for BOM
                if data.startswith(b"\xff\xfe"):
                    detected_encoding = "utf-16-le"
                elif data.startswith(b"\xfe\xff"):
                    detected_encoding = "utf-16-be"
                elif data.startswith(b"\xef\xbb\xbf"):
                    detected_encoding = "utf-8-sig"
                else:
                    detected_encoding = cls.DEFAULT_ENCODING
        else:
            try:
                # Use chardet for detection
                detection = chardet.detect(data)
                if detection and detection["encoding"]:
                    confidence = detection.get("confidence", 0)
                    detected_encoding = detection["encoding"].lower()

                    # Only trust high-confidence detections
                    if confidence > 0.7:
                        log_debug(
                            f"Detected encoding: {detected_encoding} "
                            f"(confidence: {confidence:.2f})"
                        )
                    else:
                        log_debug(
                            f"Low confidence encoding detection: {detected_encoding} "
                            f"(confidence: {confidence:.2f}), using default"
                        )
                        detected_encoding = cls.DEFAULT_ENCODING

            except Exception as e:
                log_debug(f"Encoding detection failed: {e}")
                detected_encoding = cls.DEFAULT_ENCODING

        # Cache the result if file_path is provided
        if file_path and detected_encoding:
            _encoding_cache.set(file_path, detected_encoding)
            log_debug(f"Cached encoding for {file_path}: {detected_encoding}")

        return detected_encoding

    @classmethod
    def read_file_safe(cls, file_path: str | Path) -> tuple[str, str]:
        """
        Safely read a file with automatic encoding detection and caching

        Args:
            file_path: Path to the file

        Returns:
            Tuple of (content, detected_encoding)
        """
        file_path = Path(file_path)

        try:
            # Read raw bytes first
            with open(file_path, "rb") as f:
                raw_data = f.read()

            if not raw_data:
                return "", cls.DEFAULT_ENCODING

            # Detect and decode with file path for caching
            detected_encoding = cls.detect_encoding(raw_data, str(file_path))
            content = cls.safe_decode(raw_data, detected_encoding)

            # Normalize line endings for consistency
            content = cls.normalize_line_endings(content)

            return content, detected_encoding

        except OSError as e:
            log_warning(f"Failed to read file {file_path}: {e}")
            raise e

    @classmethod
    def write_file_safe(
        cls, file_path: str | Path, content: str, encoding: str | None = None
    ) -> bool:
        """
        Safely write content to a file

        Args:
            file_path: Path to the file
            content: Content to write
            encoding: Target encoding (defaults to UTF-8)

        Returns:
            True if successful, False otherwise
        """
        file_path = Path(file_path)
        target_encoding = encoding or cls.DEFAULT_ENCODING

        try:
            encoded_content = cls.safe_encode(content, target_encoding)

            with open(file_path, "wb") as f:
                f.write(encoded_content)

            return True

        except OSError as e:
            log_warning(f"Failed to write file {file_path}: {e}")
            return False

    @classmethod
    def normalize_line_endings(cls, text: str) -> str:
        """
        Normalize line endings to Unix style (\n)

        Args:
            text: Text to normalize

        Returns:
            Text with normalized line endings
        """
        if not text:
            return text

        # Replace Windows (\r\n) and Mac (\r) line endings with Unix (\n)
        return text.replace("\r\n", "\n").replace("\r", "\n")

    @classmethod
    def extract_text_slice(
        cls,
        content_bytes: bytes,
        start_byte: int,
        end_byte: int,
        encoding: str | None = None,
    ) -> str:
        """
        Extract a slice of text from bytes with proper encoding handling

        Args:
            content_bytes: Source bytes
            start_byte: Start position
            end_byte: End position
            encoding: Encoding to use (auto-detected if None)

        Returns:
            Extracted text slice
        """
        if not content_bytes or start_byte >= len(content_bytes):
            return ""

        # Ensure bounds are valid
        start_byte = max(0, start_byte)
        end_byte = min(len(content_bytes), end_byte)

        if start_byte >= end_byte:
            return ""

        # Extract byte slice
        byte_slice = content_bytes[start_byte:end_byte]

        # Decode the slice
        return cls.safe_decode(byte_slice, encoding)

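A short usage sketch of EncodingManager (illustrative, not shipped in the wheel); it exercises the detect/decode fallback chain described in the docstrings above. The byte strings and the commented-out path are assumptions:

# Illustrative usage only -- not part of encoding_utils.py.
from tree_sitter_analyzer.encoding_utils import EncodingManager

valid = "greeting = 'こんにちは'".encode("utf-8")
print(EncodingManager.detect_encoding(valid))       # typically "utf-8"
print(EncodingManager.safe_decode(valid))           # round-trips the text

invalid = b"\xff\xfe\x00invalid-for-utf-8"
# Decoding never raises: utf-8 fails, so the fallback chain
# (cp1252, iso-8859-1, ...) is tried with errors="replace".
print(EncodingManager.safe_decode(invalid, "utf-8"))

# read_file_safe returns both the text and the encoding it settled on:
# content, used_encoding = EncodingManager.read_file_safe("some_file.py")
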
# Convenience functions for backward compatibility
def safe_encode(text: str, encoding: str | None = None) -> bytes:
    """Convenience function for safe encoding"""
    return EncodingManager.safe_encode(text, encoding)


def safe_decode(data: bytes, encoding: str | None = None) -> str:
    """Convenience function for safe decoding"""
    return EncodingManager.safe_decode(data, encoding)


def detect_encoding(data: bytes, file_path: str | None = None) -> str:
    """Convenience function for encoding detection with optional caching"""
    return EncodingManager.detect_encoding(data, file_path)


def read_file_safe(file_path: str | Path) -> tuple[str, str]:
    """Convenience function for safe file reading"""
    return EncodingManager.read_file_safe(file_path)


def write_file_safe(
    file_path: str | Path, content: str, encoding: str | None = None
) -> bool:
    """Convenience function for safe file writing"""
    return EncodingManager.write_file_safe(file_path, content, encoding)


def extract_text_slice(
    content_bytes: bytes, start_byte: int, end_byte: int, encoding: str | None = None
) -> str:
    """Convenience function for text slice extraction"""
    return EncodingManager.extract_text_slice(
        content_bytes, start_byte, end_byte, encoding
    )

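The wrappers above keep the older module-level API. A sketch (not part of the package) of the byte-offset slicing they are typically used for, since tree-sitter reports node ranges as byte offsets:

# Illustrative usage only -- not part of encoding_utils.py.
from tree_sitter_analyzer.encoding_utils import extract_text_slice, safe_encode

source = "greeting = 'こんにちは'\nprint(greeting)\n"
source_bytes = safe_encode(source)

# Tree-sitter nodes carry byte offsets, so slicing is done on the raw
# bytes and only the requested range is decoded back to text.
start = source_bytes.index(b"print")
print(extract_text_slice(source_bytes, start, len(source_bytes), "utf-8"))
# -> print(greeting)
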
def read_file_safe_streaming(file_path: str | Path) -> Any:
    """
    Context manager for streaming file reading with automatic encoding detection.

    This function opens a file with the correct encoding detected from the file's
    content and yields a file handle that can be used for line-by-line reading.
    This is memory-efficient for large files as it doesn't load the entire content.

    Performance: Enables 150x speedup (30s → <200ms) for large file operations
    by avoiding full file loading and using chunk-based streaming.

    Args:
        file_path: Path to the file to read

    Yields:
        File handle opened with the correct encoding

    Example:
        with read_file_safe_streaming("large_file.txt") as f:
            for line_num, line in enumerate(f, 1):
                if line_num >= start_line:
                    # Process line
                    pass
    """
    import contextlib

    from .utils.logging import log_warning

    file_path = Path(file_path)

    # First, detect encoding by reading a small sample
    try:
        with open(file_path, "rb") as f:
            # Read first 8KB to detect encoding
            sample_data = f.read(8192)

        if not sample_data:
            # Empty file, use default encoding
            detected_encoding = EncodingManager.DEFAULT_ENCODING
        else:
            # Detect encoding from sample with file path for caching
            detected_encoding = EncodingManager.detect_encoding(
                sample_data, str(file_path)
            )

    except OSError as e:
        log_warning(f"Failed to read file for encoding detection {file_path}: {e}")
        raise e

    # Open file with detected encoding for streaming
    @contextlib.contextmanager
    def _file_context() -> Any:
        try:
            with open(file_path, encoding=detected_encoding, errors="replace") as f:
                yield f
        except OSError as e:
            log_warning(f"Failed to open file for streaming {file_path}: {e}")
            raise e

    return _file_context()

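A usage sketch for the streaming reader (illustrative only; the path and line range are assumptions), following the pattern in its own docstring:

# Illustrative usage only -- not part of encoding_utils.py; the path is made up.
from tree_sitter_analyzer.encoding_utils import read_file_safe_streaming

# Only an 8 KB sample is read up front to pick the encoding; the rest of the
# file is streamed line by line through the returned context manager.
with read_file_safe_streaming("big_generated_file.sql") as f:
    for line_num, line in enumerate(f, 1):
        if 100 <= line_num <= 120:
            print(line, end="")
        elif line_num > 120:
            break
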
def clear_encoding_cache() -> None:
    """Clear the global encoding cache"""
    _encoding_cache.clear()


def get_encoding_cache_size() -> int:
    """Get the current size of the encoding cache"""
    return _encoding_cache.size()
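
Finally, a sketch of the cache-management helpers (illustrative; the path is an assumption and the file is assumed to be non-empty):

# Illustrative usage only -- not part of encoding_utils.py; the path is made up.
from tree_sitter_analyzer.encoding_utils import (
    clear_encoding_cache,
    get_encoding_cache_size,
    read_file_safe,
)

content, used_encoding = read_file_safe("README.md")  # caches the detection
print(get_encoding_cache_size())                      # >= 1 for a non-empty file
clear_encoding_cache()                                # e.g. after files change on disk
print(get_encoding_cache_size())                      # 0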