tree-sitter-analyzer: tree_sitter_analyzer-0.9.3-py3-none-any.whl → tree_sitter_analyzer-0.9.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tree-sitter-analyzer might be problematic. Click here for more details.
- tree_sitter_analyzer/cli/commands/default_command.py +18 -18
- tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -141
- tree_sitter_analyzer/cli/commands/query_command.py +92 -88
- tree_sitter_analyzer/cli/commands/table_command.py +235 -235
- tree_sitter_analyzer/cli/info_commands.py +121 -121
- tree_sitter_analyzer/cli_main.py +307 -307
- tree_sitter_analyzer/core/analysis_engine.py +584 -584
- tree_sitter_analyzer/core/cache_service.py +5 -4
- tree_sitter_analyzer/core/query.py +502 -502
- tree_sitter_analyzer/encoding_utils.py +6 -2
- tree_sitter_analyzer/exceptions.py +400 -406
- tree_sitter_analyzer/formatters/java_formatter.py +291 -291
- tree_sitter_analyzer/formatters/python_formatter.py +259 -259
- tree_sitter_analyzer/interfaces/mcp_server.py +426 -425
- tree_sitter_analyzer/language_detector.py +398 -398
- tree_sitter_analyzer/language_loader.py +224 -224
- tree_sitter_analyzer/languages/java_plugin.py +1202 -1202
- tree_sitter_analyzer/mcp/resources/project_stats_resource.py +559 -555
- tree_sitter_analyzer/mcp/server.py +30 -9
- tree_sitter_analyzer/mcp/tools/read_partial_tool.py +21 -4
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +22 -4
- tree_sitter_analyzer/mcp/utils/error_handler.py +569 -567
- tree_sitter_analyzer/models.py +470 -470
- tree_sitter_analyzer/security/__init__.py +22 -22
- tree_sitter_analyzer/security/boundary_manager.py +243 -243
- tree_sitter_analyzer/security/regex_checker.py +297 -292
- tree_sitter_analyzer/table_formatter.py +703 -652
- tree_sitter_analyzer/utils.py +50 -19
- {tree_sitter_analyzer-0.9.3.dist-info → tree_sitter_analyzer-0.9.4.dist-info}/METADATA +1 -1
- {tree_sitter_analyzer-0.9.3.dist-info → tree_sitter_analyzer-0.9.4.dist-info}/RECORD +32 -32
- {tree_sitter_analyzer-0.9.3.dist-info → tree_sitter_analyzer-0.9.4.dist-info}/WHEEL +0 -0
- {tree_sitter_analyzer-0.9.3.dist-info → tree_sitter_analyzer-0.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -1,398 +1,398 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Language Detection System
|
|
4
|
-
|
|
5
|
-
Automatically detects programming language from file extensions and content.
|
|
6
|
-
Supports multiple languages with extensible configuration.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class LanguageDetector:
|
|
14
|
-
"""Automatic programming language detector"""
|
|
15
|
-
|
|
16
|
-
# Basic extension mapping
|
|
17
|
-
EXTENSION_MAPPING: dict[str, str] = {
|
|
18
|
-
# Java系
|
|
19
|
-
".java": "java",
|
|
20
|
-
".jsp": "jsp",
|
|
21
|
-
".jspx": "jsp",
|
|
22
|
-
# JavaScript/TypeScript系
|
|
23
|
-
".js": "javascript",
|
|
24
|
-
".jsx": "jsx",
|
|
25
|
-
".ts": "typescript",
|
|
26
|
-
".tsx": "tsx",
|
|
27
|
-
".mjs": "javascript",
|
|
28
|
-
".cjs": "javascript",
|
|
29
|
-
# Python系
|
|
30
|
-
".py": "python",
|
|
31
|
-
".pyx": "python",
|
|
32
|
-
".pyi": "python",
|
|
33
|
-
".pyw": "python",
|
|
34
|
-
# C/C++系
|
|
35
|
-
".c": "c",
|
|
36
|
-
".cpp": "cpp",
|
|
37
|
-
".cxx": "cpp",
|
|
38
|
-
".cc": "cpp",
|
|
39
|
-
".h": "c", # Ambiguous
|
|
40
|
-
".hpp": "cpp",
|
|
41
|
-
".hxx": "cpp",
|
|
42
|
-
# その他の言語
|
|
43
|
-
".rs": "rust",
|
|
44
|
-
".go": "go",
|
|
45
|
-
".rb": "ruby",
|
|
46
|
-
".php": "php",
|
|
47
|
-
".kt": "kotlin",
|
|
48
|
-
".swift": "swift",
|
|
49
|
-
".cs": "csharp",
|
|
50
|
-
".vb": "vbnet",
|
|
51
|
-
".fs": "fsharp",
|
|
52
|
-
".scala": "scala",
|
|
53
|
-
".clj": "clojure",
|
|
54
|
-
".hs": "haskell",
|
|
55
|
-
".ml": "ocaml",
|
|
56
|
-
".lua": "lua",
|
|
57
|
-
".pl": "perl",
|
|
58
|
-
".r": "r",
|
|
59
|
-
".m": "objc", # Ambiguous (MATLAB as well)
|
|
60
|
-
".dart": "dart",
|
|
61
|
-
".elm": "elm",
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
# Ambiguous extensions (map to multiple languages)
|
|
65
|
-
AMBIGUOUS_EXTENSIONS: dict[str, list[str]] = {
|
|
66
|
-
".h": ["c", "cpp", "objc"],
|
|
67
|
-
".m": ["objc", "matlab"],
|
|
68
|
-
".sql": ["sql", "plsql", "mysql"],
|
|
69
|
-
".xml": ["xml", "html", "jsp"],
|
|
70
|
-
".json": ["json", "jsonc"],
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
# Content-based detection patterns
|
|
74
|
-
CONTENT_PATTERNS: dict[str, dict[str, list[str]]] = {
|
|
75
|
-
"c_vs_cpp": {
|
|
76
|
-
"cpp": ["#include <iostream>", "std::", "namespace", "class ", "template<"],
|
|
77
|
-
"c": ["#include <stdio.h>", "printf(", "malloc(", "typedef struct"],
|
|
78
|
-
},
|
|
79
|
-
"objc_vs_matlab": {
|
|
80
|
-
"objc": ["#import", "@interface", "@implementation", "NSString", "alloc]"],
|
|
81
|
-
"matlab": ["function ", "end;", "disp(", "clc;", "clear all"],
|
|
82
|
-
},
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
# Tree-sitter supported languages
|
|
86
|
-
SUPPORTED_LANGUAGES = {
|
|
87
|
-
"java",
|
|
88
|
-
"javascript",
|
|
89
|
-
"typescript",
|
|
90
|
-
"python",
|
|
91
|
-
"c",
|
|
92
|
-
"cpp",
|
|
93
|
-
"rust",
|
|
94
|
-
"go",
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
def __init__(self) -> None:
|
|
98
|
-
"""Initialize detector"""
|
|
99
|
-
self.extension_map = {
|
|
100
|
-
".java": ("java", 0.9),
|
|
101
|
-
".js": ("javascript", 0.9),
|
|
102
|
-
".jsx": ("javascript", 0.8),
|
|
103
|
-
".ts": ("typescript", 0.9),
|
|
104
|
-
".tsx": ("typescript", 0.8),
|
|
105
|
-
".py": ("python", 0.9),
|
|
106
|
-
".pyw": ("python", 0.8),
|
|
107
|
-
".c": ("c", 0.9),
|
|
108
|
-
".h": ("c", 0.7),
|
|
109
|
-
".cpp": ("cpp", 0.9),
|
|
110
|
-
".cxx": ("cpp", 0.9),
|
|
111
|
-
".cc": ("cpp", 0.9),
|
|
112
|
-
".hpp": ("cpp", 0.8),
|
|
113
|
-
".rs": ("rust", 0.9),
|
|
114
|
-
".go": ("go", 0.9),
|
|
115
|
-
".cs": ("csharp", 0.9),
|
|
116
|
-
".php": ("php", 0.9),
|
|
117
|
-
".rb": ("ruby", 0.9),
|
|
118
|
-
".swift": ("swift", 0.9),
|
|
119
|
-
".kt": ("kotlin", 0.9),
|
|
120
|
-
".scala": ("scala", 0.9),
|
|
121
|
-
".clj": ("clojure", 0.9),
|
|
122
|
-
".hs": ("haskell", 0.9),
|
|
123
|
-
".ml": ("ocaml", 0.9),
|
|
124
|
-
".fs": ("fsharp", 0.9),
|
|
125
|
-
".elm": ("elm", 0.9),
|
|
126
|
-
".dart": ("dart", 0.9),
|
|
127
|
-
".lua": ("lua", 0.9),
|
|
128
|
-
".r": ("r", 0.9),
|
|
129
|
-
".m": ("objectivec", 0.7),
|
|
130
|
-
".mm": ("objectivec", 0.8),
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
# Content-based detection patterns
|
|
134
|
-
self.content_patterns = {
|
|
135
|
-
"java": [
|
|
136
|
-
(r"package\s+[\w\.]+\s*;", 0.3),
|
|
137
|
-
(r"public\s+class\s+\w+", 0.3),
|
|
138
|
-
(r"import\s+[\w\.]+\s*;", 0.2),
|
|
139
|
-
(r"@\w+\s*\(", 0.2), # Annotations
|
|
140
|
-
],
|
|
141
|
-
"python": [
|
|
142
|
-
(r"def\s+\w+\s*\(", 0.3),
|
|
143
|
-
(r"import\s+\w+", 0.2),
|
|
144
|
-
(r"from\s+\w+\s+import", 0.2),
|
|
145
|
-
(r'if\s+__name__\s*==\s*["\']__main__["\']', 0.3),
|
|
146
|
-
],
|
|
147
|
-
"javascript": [
|
|
148
|
-
(r"function\s+\w+\s*\(", 0.3),
|
|
149
|
-
(r"var\s+\w+\s*=", 0.2),
|
|
150
|
-
(r"let\s+\w+\s*=", 0.2),
|
|
151
|
-
(r"const\s+\w+\s*=", 0.2),
|
|
152
|
-
(r"console\.log\s*\(", 0.1),
|
|
153
|
-
],
|
|
154
|
-
"typescript": [
|
|
155
|
-
(r"interface\s+\w+", 0.3),
|
|
156
|
-
(r"type\s+\w+\s*=", 0.2),
|
|
157
|
-
(r":\s*\w+\s*=", 0.2), # Type annotations
|
|
158
|
-
(r"export\s+(interface|type|class)", 0.2),
|
|
159
|
-
],
|
|
160
|
-
"c": [
|
|
161
|
-
(r"#include\s*<[\w\.]+>", 0.3),
|
|
162
|
-
(r"int\s+main\s*\(", 0.3),
|
|
163
|
-
(r"printf\s*\(", 0.2),
|
|
164
|
-
(r"#define\s+\w+", 0.2),
|
|
165
|
-
],
|
|
166
|
-
"cpp": [
|
|
167
|
-
(r"#include\s*<[\w\.]+>", 0.2),
|
|
168
|
-
(r"using\s+namespace\s+\w+", 0.3),
|
|
169
|
-
(r"std::\w+", 0.2),
|
|
170
|
-
(r"class\s+\w+\s*{", 0.3),
|
|
171
|
-
],
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
from .utils import log_debug, log_warning
|
|
175
|
-
|
|
176
|
-
self._log_debug = log_debug
|
|
177
|
-
self._log_warning = log_warning
|
|
178
|
-
|
|
179
|
-
def detect_language(
|
|
180
|
-
self, file_path: str, content: str | None = None
|
|
181
|
-
) -> tuple[str, float]:
|
|
182
|
-
"""
|
|
183
|
-
ファイルパスとコンテンツから言語を判定
|
|
184
|
-
|
|
185
|
-
Args:
|
|
186
|
-
file_path: ファイルパス
|
|
187
|
-
content: ファイルコンテンツ(任意、曖昧性解決用)
|
|
188
|
-
|
|
189
|
-
Returns:
|
|
190
|
-
(言語名, 信頼度) のタプル
|
|
191
|
-
"""
|
|
192
|
-
path = Path(file_path)
|
|
193
|
-
extension = path.suffix.lower()
|
|
194
|
-
|
|
195
|
-
# Direct mapping by extension
|
|
196
|
-
if extension in self.EXTENSION_MAPPING:
|
|
197
|
-
language = self.EXTENSION_MAPPING[extension]
|
|
198
|
-
|
|
199
|
-
# No ambiguity -> high confidence
|
|
200
|
-
if extension not in self.AMBIGUOUS_EXTENSIONS:
|
|
201
|
-
return language, 1.0
|
|
202
|
-
|
|
203
|
-
# Resolve ambiguity using content
|
|
204
|
-
if content:
|
|
205
|
-
refined_language = self._resolve_ambiguity(extension, content)
|
|
206
|
-
return refined_language, 0.9 if refined_language != language else 0.7
|
|
207
|
-
else:
|
|
208
|
-
return language, 0.7 # Lower confidence without content
|
|
209
|
-
|
|
210
|
-
# Unknown extension
|
|
211
|
-
return "unknown", 0.0
|
|
212
|
-
|
|
213
|
-
def detect_from_extension(self, file_path: str) -> str:
|
|
214
|
-
"""
|
|
215
|
-
Quick detection using extension only
|
|
216
|
-
|
|
217
|
-
Args:
|
|
218
|
-
file_path: File path
|
|
219
|
-
|
|
220
|
-
Returns:
|
|
221
|
-
Detected language name
|
|
222
|
-
"""
|
|
223
|
-
language, _ = self.detect_language(file_path)
|
|
224
|
-
return language
|
|
225
|
-
|
|
226
|
-
def is_supported(self, language: str) -> bool:
|
|
227
|
-
"""
|
|
228
|
-
Check if language is supported by Tree-sitter
|
|
229
|
-
|
|
230
|
-
Args:
|
|
231
|
-
language: Language name
|
|
232
|
-
|
|
233
|
-
Returns:
|
|
234
|
-
Support status
|
|
235
|
-
"""
|
|
236
|
-
return language in self.SUPPORTED_LANGUAGES
|
|
237
|
-
|
|
238
|
-
def get_supported_extensions(self) -> list[str]:
|
|
239
|
-
"""
|
|
240
|
-
Get list of supported extensions
|
|
241
|
-
|
|
242
|
-
Returns:
|
|
243
|
-
List of extensions
|
|
244
|
-
"""
|
|
245
|
-
return sorted(self.EXTENSION_MAPPING.keys())
|
|
246
|
-
|
|
247
|
-
def get_supported_languages(self) -> list[str]:
|
|
248
|
-
"""
|
|
249
|
-
Get list of supported languages
|
|
250
|
-
|
|
251
|
-
Returns:
|
|
252
|
-
List of languages
|
|
253
|
-
"""
|
|
254
|
-
return sorted(self.SUPPORTED_LANGUAGES)
|
|
255
|
-
|
|
256
|
-
def _resolve_ambiguity(self, extension: str, content: str) -> str:
|
|
257
|
-
"""
|
|
258
|
-
Resolve ambiguous extension using content
|
|
259
|
-
|
|
260
|
-
Args:
|
|
261
|
-
extension: File extension
|
|
262
|
-
content: File content
|
|
263
|
-
|
|
264
|
-
Returns:
|
|
265
|
-
Resolved language name
|
|
266
|
-
"""
|
|
267
|
-
if extension not in self.AMBIGUOUS_EXTENSIONS:
|
|
268
|
-
return self.EXTENSION_MAPPING.get(extension, "unknown")
|
|
269
|
-
|
|
270
|
-
candidates = self.AMBIGUOUS_EXTENSIONS[extension]
|
|
271
|
-
|
|
272
|
-
# .h: C vs C++ vs Objective-C
|
|
273
|
-
if extension == ".h":
|
|
274
|
-
return self._detect_c_family(content, candidates)
|
|
275
|
-
|
|
276
|
-
# .m: Objective-C vs MATLAB
|
|
277
|
-
elif extension == ".m":
|
|
278
|
-
return self._detect_objc_vs_matlab(content, candidates)
|
|
279
|
-
|
|
280
|
-
# Fallback to first candidate
|
|
281
|
-
return candidates[0]
|
|
282
|
-
|
|
283
|
-
def _detect_c_family(self, content: str, candidates: list[str]) -> str:
|
|
284
|
-
"""Detect among C-family languages"""
|
|
285
|
-
cpp_score = 0
|
|
286
|
-
c_score = 0
|
|
287
|
-
objc_score = 0
|
|
288
|
-
|
|
289
|
-
# C++ features
|
|
290
|
-
cpp_patterns = self.CONTENT_PATTERNS["c_vs_cpp"]["cpp"]
|
|
291
|
-
for pattern in cpp_patterns:
|
|
292
|
-
if pattern in content:
|
|
293
|
-
cpp_score += 1
|
|
294
|
-
|
|
295
|
-
# C features
|
|
296
|
-
c_patterns = self.CONTENT_PATTERNS["c_vs_cpp"]["c"]
|
|
297
|
-
for pattern in c_patterns:
|
|
298
|
-
if pattern in content:
|
|
299
|
-
c_score += 1
|
|
300
|
-
|
|
301
|
-
# Objective-C features
|
|
302
|
-
objc_patterns = self.CONTENT_PATTERNS["objc_vs_matlab"]["objc"]
|
|
303
|
-
for pattern in objc_patterns:
|
|
304
|
-
if pattern in content:
|
|
305
|
-
objc_score += 3 # 強い指標なので重み大
|
|
306
|
-
|
|
307
|
-
# Select best-scoring language
|
|
308
|
-
scores = {"cpp": cpp_score, "c": c_score, "objc": objc_score}
|
|
309
|
-
best_language = max(scores, key=lambda x: scores[x])
|
|
310
|
-
|
|
311
|
-
# If objc not a candidate, fallback to C/C++
|
|
312
|
-
if best_language == "objc" and "objc" not in candidates:
|
|
313
|
-
best_language = "cpp" if cpp_score > c_score else "c"
|
|
314
|
-
|
|
315
|
-
return best_language if scores[best_language] > 0 else candidates[0]
|
|
316
|
-
|
|
317
|
-
def _detect_objc_vs_matlab(self, content: str, candidates: list[str]) -> str:
|
|
318
|
-
"""Detect between Objective-C and MATLAB"""
|
|
319
|
-
objc_score = 0
|
|
320
|
-
matlab_score = 0
|
|
321
|
-
|
|
322
|
-
# Objective-C patterns
|
|
323
|
-
for pattern in self.CONTENT_PATTERNS["objc_vs_matlab"]["objc"]:
|
|
324
|
-
if pattern in content:
|
|
325
|
-
objc_score += 1
|
|
326
|
-
|
|
327
|
-
# MATLAB patterns
|
|
328
|
-
for pattern in self.CONTENT_PATTERNS["objc_vs_matlab"]["matlab"]:
|
|
329
|
-
if pattern in content:
|
|
330
|
-
matlab_score += 1
|
|
331
|
-
|
|
332
|
-
if objc_score > matlab_score:
|
|
333
|
-
return "objc"
|
|
334
|
-
elif matlab_score > objc_score:
|
|
335
|
-
return "matlab"
|
|
336
|
-
else:
|
|
337
|
-
return candidates[0] # default
|
|
338
|
-
|
|
339
|
-
def add_extension_mapping(self, extension: str, language: str) -> None:
|
|
340
|
-
"""
|
|
341
|
-
Add custom extension mapping
|
|
342
|
-
|
|
343
|
-
Args:
|
|
344
|
-
extension: File extension (with dot)
|
|
345
|
-
language: Language name
|
|
346
|
-
"""
|
|
347
|
-
self.EXTENSION_MAPPING[extension.lower()] = language
|
|
348
|
-
|
|
349
|
-
def get_language_info(self, language: str) -> dict[str, Any]:
|
|
350
|
-
"""
|
|
351
|
-
Get language information
|
|
352
|
-
|
|
353
|
-
Args:
|
|
354
|
-
language: Language name
|
|
355
|
-
|
|
356
|
-
Returns:
|
|
357
|
-
Language info dictionary
|
|
358
|
-
"""
|
|
359
|
-
extensions = [
|
|
360
|
-
ext for ext, lang in self.EXTENSION_MAPPING.items() if lang == language
|
|
361
|
-
]
|
|
362
|
-
|
|
363
|
-
return {
|
|
364
|
-
"name": language,
|
|
365
|
-
"extensions": extensions,
|
|
366
|
-
"supported": self.is_supported(language),
|
|
367
|
-
"tree_sitter_available": language in self.SUPPORTED_LANGUAGES,
|
|
368
|
-
}
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
# Global instance: module-level detector shared by detect_language_from_file() and is_language_supported()
|
|
372
|
-
detector = LanguageDetector()
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
def detect_language_from_file(file_path: str) -> str:
    """
    Detect a file's language from its path (simple API).

    Delegates to the module-level ``detector`` instance.

    Args:
        file_path: File path

    Returns:
        Detected language name, as returned by
        ``detector.detect_from_extension``
    """
    language = detector.detect_from_extension(file_path)
    return language
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
def is_language_supported(language: str) -> bool:
    """
    Report whether a language is supported (simple API).

    Delegates to the module-level ``detector`` instance.

    Args:
        language: Language name

    Returns:
        Support status, as returned by ``detector.is_supported``
    """
    supported = detector.is_supported(language)
    return supported
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Language Detection System
|
|
4
|
+
|
|
5
|
+
Automatically detects programming language from file extensions and content.
|
|
6
|
+
Supports multiple languages with extensible configuration.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class LanguageDetector:
|
|
14
|
+
"""Automatic programming language detector"""
|
|
15
|
+
|
|
16
|
+
# Basic extension mapping
|
|
17
|
+
EXTENSION_MAPPING: dict[str, str] = {
|
|
18
|
+
# Java系
|
|
19
|
+
".java": "java",
|
|
20
|
+
".jsp": "jsp",
|
|
21
|
+
".jspx": "jsp",
|
|
22
|
+
# JavaScript/TypeScript系
|
|
23
|
+
".js": "javascript",
|
|
24
|
+
".jsx": "jsx",
|
|
25
|
+
".ts": "typescript",
|
|
26
|
+
".tsx": "tsx",
|
|
27
|
+
".mjs": "javascript",
|
|
28
|
+
".cjs": "javascript",
|
|
29
|
+
# Python系
|
|
30
|
+
".py": "python",
|
|
31
|
+
".pyx": "python",
|
|
32
|
+
".pyi": "python",
|
|
33
|
+
".pyw": "python",
|
|
34
|
+
# C/C++系
|
|
35
|
+
".c": "c",
|
|
36
|
+
".cpp": "cpp",
|
|
37
|
+
".cxx": "cpp",
|
|
38
|
+
".cc": "cpp",
|
|
39
|
+
".h": "c", # Ambiguous
|
|
40
|
+
".hpp": "cpp",
|
|
41
|
+
".hxx": "cpp",
|
|
42
|
+
# その他の言語
|
|
43
|
+
".rs": "rust",
|
|
44
|
+
".go": "go",
|
|
45
|
+
".rb": "ruby",
|
|
46
|
+
".php": "php",
|
|
47
|
+
".kt": "kotlin",
|
|
48
|
+
".swift": "swift",
|
|
49
|
+
".cs": "csharp",
|
|
50
|
+
".vb": "vbnet",
|
|
51
|
+
".fs": "fsharp",
|
|
52
|
+
".scala": "scala",
|
|
53
|
+
".clj": "clojure",
|
|
54
|
+
".hs": "haskell",
|
|
55
|
+
".ml": "ocaml",
|
|
56
|
+
".lua": "lua",
|
|
57
|
+
".pl": "perl",
|
|
58
|
+
".r": "r",
|
|
59
|
+
".m": "objc", # Ambiguous (MATLAB as well)
|
|
60
|
+
".dart": "dart",
|
|
61
|
+
".elm": "elm",
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
# Ambiguous extensions (map to multiple languages)
|
|
65
|
+
AMBIGUOUS_EXTENSIONS: dict[str, list[str]] = {
|
|
66
|
+
".h": ["c", "cpp", "objc"],
|
|
67
|
+
".m": ["objc", "matlab"],
|
|
68
|
+
".sql": ["sql", "plsql", "mysql"],
|
|
69
|
+
".xml": ["xml", "html", "jsp"],
|
|
70
|
+
".json": ["json", "jsonc"],
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
# Content-based detection patterns
|
|
74
|
+
CONTENT_PATTERNS: dict[str, dict[str, list[str]]] = {
|
|
75
|
+
"c_vs_cpp": {
|
|
76
|
+
"cpp": ["#include <iostream>", "std::", "namespace", "class ", "template<"],
|
|
77
|
+
"c": ["#include <stdio.h>", "printf(", "malloc(", "typedef struct"],
|
|
78
|
+
},
|
|
79
|
+
"objc_vs_matlab": {
|
|
80
|
+
"objc": ["#import", "@interface", "@implementation", "NSString", "alloc]"],
|
|
81
|
+
"matlab": ["function ", "end;", "disp(", "clc;", "clear all"],
|
|
82
|
+
},
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
# Tree-sitter supported languages
|
|
86
|
+
SUPPORTED_LANGUAGES = {
|
|
87
|
+
"java",
|
|
88
|
+
"javascript",
|
|
89
|
+
"typescript",
|
|
90
|
+
"python",
|
|
91
|
+
"c",
|
|
92
|
+
"cpp",
|
|
93
|
+
"rust",
|
|
94
|
+
"go",
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
def __init__(self) -> None:
|
|
98
|
+
"""Initialize detector"""
|
|
99
|
+
self.extension_map = {
|
|
100
|
+
".java": ("java", 0.9),
|
|
101
|
+
".js": ("javascript", 0.9),
|
|
102
|
+
".jsx": ("javascript", 0.8),
|
|
103
|
+
".ts": ("typescript", 0.9),
|
|
104
|
+
".tsx": ("typescript", 0.8),
|
|
105
|
+
".py": ("python", 0.9),
|
|
106
|
+
".pyw": ("python", 0.8),
|
|
107
|
+
".c": ("c", 0.9),
|
|
108
|
+
".h": ("c", 0.7),
|
|
109
|
+
".cpp": ("cpp", 0.9),
|
|
110
|
+
".cxx": ("cpp", 0.9),
|
|
111
|
+
".cc": ("cpp", 0.9),
|
|
112
|
+
".hpp": ("cpp", 0.8),
|
|
113
|
+
".rs": ("rust", 0.9),
|
|
114
|
+
".go": ("go", 0.9),
|
|
115
|
+
".cs": ("csharp", 0.9),
|
|
116
|
+
".php": ("php", 0.9),
|
|
117
|
+
".rb": ("ruby", 0.9),
|
|
118
|
+
".swift": ("swift", 0.9),
|
|
119
|
+
".kt": ("kotlin", 0.9),
|
|
120
|
+
".scala": ("scala", 0.9),
|
|
121
|
+
".clj": ("clojure", 0.9),
|
|
122
|
+
".hs": ("haskell", 0.9),
|
|
123
|
+
".ml": ("ocaml", 0.9),
|
|
124
|
+
".fs": ("fsharp", 0.9),
|
|
125
|
+
".elm": ("elm", 0.9),
|
|
126
|
+
".dart": ("dart", 0.9),
|
|
127
|
+
".lua": ("lua", 0.9),
|
|
128
|
+
".r": ("r", 0.9),
|
|
129
|
+
".m": ("objectivec", 0.7),
|
|
130
|
+
".mm": ("objectivec", 0.8),
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
# Content-based detection patterns
|
|
134
|
+
self.content_patterns = {
|
|
135
|
+
"java": [
|
|
136
|
+
(r"package\s+[\w\.]+\s*;", 0.3),
|
|
137
|
+
(r"public\s+class\s+\w+", 0.3),
|
|
138
|
+
(r"import\s+[\w\.]+\s*;", 0.2),
|
|
139
|
+
(r"@\w+\s*\(", 0.2), # Annotations
|
|
140
|
+
],
|
|
141
|
+
"python": [
|
|
142
|
+
(r"def\s+\w+\s*\(", 0.3),
|
|
143
|
+
(r"import\s+\w+", 0.2),
|
|
144
|
+
(r"from\s+\w+\s+import", 0.2),
|
|
145
|
+
(r'if\s+__name__\s*==\s*["\']__main__["\']', 0.3),
|
|
146
|
+
],
|
|
147
|
+
"javascript": [
|
|
148
|
+
(r"function\s+\w+\s*\(", 0.3),
|
|
149
|
+
(r"var\s+\w+\s*=", 0.2),
|
|
150
|
+
(r"let\s+\w+\s*=", 0.2),
|
|
151
|
+
(r"const\s+\w+\s*=", 0.2),
|
|
152
|
+
(r"console\.log\s*\(", 0.1),
|
|
153
|
+
],
|
|
154
|
+
"typescript": [
|
|
155
|
+
(r"interface\s+\w+", 0.3),
|
|
156
|
+
(r"type\s+\w+\s*=", 0.2),
|
|
157
|
+
(r":\s*\w+\s*=", 0.2), # Type annotations
|
|
158
|
+
(r"export\s+(interface|type|class)", 0.2),
|
|
159
|
+
],
|
|
160
|
+
"c": [
|
|
161
|
+
(r"#include\s*<[\w\.]+>", 0.3),
|
|
162
|
+
(r"int\s+main\s*\(", 0.3),
|
|
163
|
+
(r"printf\s*\(", 0.2),
|
|
164
|
+
(r"#define\s+\w+", 0.2),
|
|
165
|
+
],
|
|
166
|
+
"cpp": [
|
|
167
|
+
(r"#include\s*<[\w\.]+>", 0.2),
|
|
168
|
+
(r"using\s+namespace\s+\w+", 0.3),
|
|
169
|
+
(r"std::\w+", 0.2),
|
|
170
|
+
(r"class\s+\w+\s*{", 0.3),
|
|
171
|
+
],
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
from .utils import log_debug, log_warning
|
|
175
|
+
|
|
176
|
+
self._log_debug = log_debug
|
|
177
|
+
self._log_warning = log_warning
|
|
178
|
+
|
|
179
|
+
def detect_language(
|
|
180
|
+
self, file_path: str, content: str | None = None
|
|
181
|
+
) -> tuple[str, float]:
|
|
182
|
+
"""
|
|
183
|
+
ファイルパスとコンテンツから言語を判定
|
|
184
|
+
|
|
185
|
+
Args:
|
|
186
|
+
file_path: ファイルパス
|
|
187
|
+
content: ファイルコンテンツ(任意、曖昧性解決用)
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
(言語名, 信頼度) のタプル
|
|
191
|
+
"""
|
|
192
|
+
path = Path(file_path)
|
|
193
|
+
extension = path.suffix.lower()
|
|
194
|
+
|
|
195
|
+
# Direct mapping by extension
|
|
196
|
+
if extension in self.EXTENSION_MAPPING:
|
|
197
|
+
language = self.EXTENSION_MAPPING[extension]
|
|
198
|
+
|
|
199
|
+
# No ambiguity -> high confidence
|
|
200
|
+
if extension not in self.AMBIGUOUS_EXTENSIONS:
|
|
201
|
+
return language, 1.0
|
|
202
|
+
|
|
203
|
+
# Resolve ambiguity using content
|
|
204
|
+
if content:
|
|
205
|
+
refined_language = self._resolve_ambiguity(extension, content)
|
|
206
|
+
return refined_language, 0.9 if refined_language != language else 0.7
|
|
207
|
+
else:
|
|
208
|
+
return language, 0.7 # Lower confidence without content
|
|
209
|
+
|
|
210
|
+
# Unknown extension
|
|
211
|
+
return "unknown", 0.0
|
|
212
|
+
|
|
213
|
+
def detect_from_extension(self, file_path: str) -> str:
|
|
214
|
+
"""
|
|
215
|
+
Quick detection using extension only
|
|
216
|
+
|
|
217
|
+
Args:
|
|
218
|
+
file_path: File path
|
|
219
|
+
|
|
220
|
+
Returns:
|
|
221
|
+
Detected language name
|
|
222
|
+
"""
|
|
223
|
+
language, _ = self.detect_language(file_path)
|
|
224
|
+
return language
|
|
225
|
+
|
|
226
|
+
def is_supported(self, language: str) -> bool:
|
|
227
|
+
"""
|
|
228
|
+
Check if language is supported by Tree-sitter
|
|
229
|
+
|
|
230
|
+
Args:
|
|
231
|
+
language: Language name
|
|
232
|
+
|
|
233
|
+
Returns:
|
|
234
|
+
Support status
|
|
235
|
+
"""
|
|
236
|
+
return language in self.SUPPORTED_LANGUAGES
|
|
237
|
+
|
|
238
|
+
def get_supported_extensions(self) -> list[str]:
|
|
239
|
+
"""
|
|
240
|
+
Get list of supported extensions
|
|
241
|
+
|
|
242
|
+
Returns:
|
|
243
|
+
List of extensions
|
|
244
|
+
"""
|
|
245
|
+
return sorted(self.EXTENSION_MAPPING.keys())
|
|
246
|
+
|
|
247
|
+
def get_supported_languages(self) -> list[str]:
|
|
248
|
+
"""
|
|
249
|
+
Get list of supported languages
|
|
250
|
+
|
|
251
|
+
Returns:
|
|
252
|
+
List of languages
|
|
253
|
+
"""
|
|
254
|
+
return sorted(self.SUPPORTED_LANGUAGES)
|
|
255
|
+
|
|
256
|
+
def _resolve_ambiguity(self, extension: str, content: str) -> str:
|
|
257
|
+
"""
|
|
258
|
+
Resolve ambiguous extension using content
|
|
259
|
+
|
|
260
|
+
Args:
|
|
261
|
+
extension: File extension
|
|
262
|
+
content: File content
|
|
263
|
+
|
|
264
|
+
Returns:
|
|
265
|
+
Resolved language name
|
|
266
|
+
"""
|
|
267
|
+
if extension not in self.AMBIGUOUS_EXTENSIONS:
|
|
268
|
+
return self.EXTENSION_MAPPING.get(extension, "unknown")
|
|
269
|
+
|
|
270
|
+
candidates = self.AMBIGUOUS_EXTENSIONS[extension]
|
|
271
|
+
|
|
272
|
+
# .h: C vs C++ vs Objective-C
|
|
273
|
+
if extension == ".h":
|
|
274
|
+
return self._detect_c_family(content, candidates)
|
|
275
|
+
|
|
276
|
+
# .m: Objective-C vs MATLAB
|
|
277
|
+
elif extension == ".m":
|
|
278
|
+
return self._detect_objc_vs_matlab(content, candidates)
|
|
279
|
+
|
|
280
|
+
# Fallback to first candidate
|
|
281
|
+
return candidates[0]
|
|
282
|
+
|
|
283
|
+
def _detect_c_family(self, content: str, candidates: list[str]) -> str:
|
|
284
|
+
"""Detect among C-family languages"""
|
|
285
|
+
cpp_score = 0
|
|
286
|
+
c_score = 0
|
|
287
|
+
objc_score = 0
|
|
288
|
+
|
|
289
|
+
# C++ features
|
|
290
|
+
cpp_patterns = self.CONTENT_PATTERNS["c_vs_cpp"]["cpp"]
|
|
291
|
+
for pattern in cpp_patterns:
|
|
292
|
+
if pattern in content:
|
|
293
|
+
cpp_score += 1
|
|
294
|
+
|
|
295
|
+
# C features
|
|
296
|
+
c_patterns = self.CONTENT_PATTERNS["c_vs_cpp"]["c"]
|
|
297
|
+
for pattern in c_patterns:
|
|
298
|
+
if pattern in content:
|
|
299
|
+
c_score += 1
|
|
300
|
+
|
|
301
|
+
# Objective-C features
|
|
302
|
+
objc_patterns = self.CONTENT_PATTERNS["objc_vs_matlab"]["objc"]
|
|
303
|
+
for pattern in objc_patterns:
|
|
304
|
+
if pattern in content:
|
|
305
|
+
objc_score += 3 # 強い指標なので重み大
|
|
306
|
+
|
|
307
|
+
# Select best-scoring language
|
|
308
|
+
scores = {"cpp": cpp_score, "c": c_score, "objc": objc_score}
|
|
309
|
+
best_language = max(scores, key=lambda x: scores[x])
|
|
310
|
+
|
|
311
|
+
# If objc not a candidate, fallback to C/C++
|
|
312
|
+
if best_language == "objc" and "objc" not in candidates:
|
|
313
|
+
best_language = "cpp" if cpp_score > c_score else "c"
|
|
314
|
+
|
|
315
|
+
return best_language if scores[best_language] > 0 else candidates[0]
|
|
316
|
+
|
|
317
|
+
def _detect_objc_vs_matlab(self, content: str, candidates: list[str]) -> str:
|
|
318
|
+
"""Detect between Objective-C and MATLAB"""
|
|
319
|
+
objc_score = 0
|
|
320
|
+
matlab_score = 0
|
|
321
|
+
|
|
322
|
+
# Objective-C patterns
|
|
323
|
+
for pattern in self.CONTENT_PATTERNS["objc_vs_matlab"]["objc"]:
|
|
324
|
+
if pattern in content:
|
|
325
|
+
objc_score += 1
|
|
326
|
+
|
|
327
|
+
# MATLAB patterns
|
|
328
|
+
for pattern in self.CONTENT_PATTERNS["objc_vs_matlab"]["matlab"]:
|
|
329
|
+
if pattern in content:
|
|
330
|
+
matlab_score += 1
|
|
331
|
+
|
|
332
|
+
if objc_score > matlab_score:
|
|
333
|
+
return "objc"
|
|
334
|
+
elif matlab_score > objc_score:
|
|
335
|
+
return "matlab"
|
|
336
|
+
else:
|
|
337
|
+
return candidates[0] # default
|
|
338
|
+
|
|
339
|
+
def add_extension_mapping(self, extension: str, language: str) -> None:
|
|
340
|
+
"""
|
|
341
|
+
Add custom extension mapping
|
|
342
|
+
|
|
343
|
+
Args:
|
|
344
|
+
extension: File extension (with dot)
|
|
345
|
+
language: Language name
|
|
346
|
+
"""
|
|
347
|
+
self.EXTENSION_MAPPING[extension.lower()] = language
|
|
348
|
+
|
|
349
|
+
def get_language_info(self, language: str) -> dict[str, Any]:
|
|
350
|
+
"""
|
|
351
|
+
Get language information
|
|
352
|
+
|
|
353
|
+
Args:
|
|
354
|
+
language: Language name
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
Language info dictionary
|
|
358
|
+
"""
|
|
359
|
+
extensions = [
|
|
360
|
+
ext for ext, lang in self.EXTENSION_MAPPING.items() if lang == language
|
|
361
|
+
]
|
|
362
|
+
|
|
363
|
+
return {
|
|
364
|
+
"name": language,
|
|
365
|
+
"extensions": extensions,
|
|
366
|
+
"supported": self.is_supported(language),
|
|
367
|
+
"tree_sitter_available": language in self.SUPPORTED_LANGUAGES,
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
# Global instance: module-level detector shared by detect_language_from_file() and is_language_supported()
|
|
372
|
+
detector = LanguageDetector()
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def detect_language_from_file(file_path: str) -> str:
    """
    Detect a file's language from its path (simple API).

    Delegates to the module-level ``detector`` instance.

    Args:
        file_path: File path

    Returns:
        Detected language name, as returned by
        ``detector.detect_from_extension``
    """
    language = detector.detect_from_extension(file_path)
    return language
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def is_language_supported(language: str) -> bool:
    """
    Report whether a language is supported (simple API).

    Delegates to the module-level ``detector`` instance.

    Args:
        language: Language name

    Returns:
        Support status, as returned by ``detector.is_supported``
    """
    supported = detector.is_supported(language)
    return supported
|