tree-sitter-analyzer 1.7.5__py3-none-any.whl → 1.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tree-sitter-analyzer might be problematic. Click here for more details.

Files changed (47)
  1. tree_sitter_analyzer/__init__.py +1 -1
  2. tree_sitter_analyzer/api.py +26 -32
  3. tree_sitter_analyzer/cli/argument_validator.py +77 -0
  4. tree_sitter_analyzer/cli/commands/table_command.py +7 -2
  5. tree_sitter_analyzer/cli_main.py +17 -3
  6. tree_sitter_analyzer/core/cache_service.py +15 -5
  7. tree_sitter_analyzer/core/query.py +33 -22
  8. tree_sitter_analyzer/core/query_service.py +179 -154
  9. tree_sitter_analyzer/exceptions.py +334 -0
  10. tree_sitter_analyzer/file_handler.py +16 -1
  11. tree_sitter_analyzer/formatters/formatter_registry.py +355 -0
  12. tree_sitter_analyzer/formatters/html_formatter.py +462 -0
  13. tree_sitter_analyzer/formatters/language_formatter_factory.py +3 -0
  14. tree_sitter_analyzer/formatters/markdown_formatter.py +1 -1
  15. tree_sitter_analyzer/interfaces/mcp_server.py +3 -1
  16. tree_sitter_analyzer/language_detector.py +91 -7
  17. tree_sitter_analyzer/languages/css_plugin.py +390 -0
  18. tree_sitter_analyzer/languages/html_plugin.py +395 -0
  19. tree_sitter_analyzer/languages/java_plugin.py +116 -0
  20. tree_sitter_analyzer/languages/javascript_plugin.py +113 -0
  21. tree_sitter_analyzer/languages/markdown_plugin.py +266 -46
  22. tree_sitter_analyzer/languages/python_plugin.py +176 -33
  23. tree_sitter_analyzer/languages/typescript_plugin.py +130 -1
  24. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +68 -3
  25. tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +32 -7
  26. tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +10 -0
  27. tree_sitter_analyzer/mcp/tools/list_files_tool.py +9 -0
  28. tree_sitter_analyzer/mcp/tools/query_tool.py +100 -52
  29. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +98 -14
  30. tree_sitter_analyzer/mcp/tools/search_content_tool.py +9 -0
  31. tree_sitter_analyzer/mcp/tools/table_format_tool.py +37 -13
  32. tree_sitter_analyzer/models.py +53 -0
  33. tree_sitter_analyzer/output_manager.py +1 -1
  34. tree_sitter_analyzer/plugins/base.py +50 -0
  35. tree_sitter_analyzer/plugins/manager.py +5 -1
  36. tree_sitter_analyzer/queries/css.py +634 -0
  37. tree_sitter_analyzer/queries/html.py +556 -0
  38. tree_sitter_analyzer/queries/markdown.py +54 -164
  39. tree_sitter_analyzer/query_loader.py +16 -3
  40. tree_sitter_analyzer/security/validator.py +343 -46
  41. tree_sitter_analyzer/utils/__init__.py +113 -0
  42. tree_sitter_analyzer/utils/tree_sitter_compat.py +282 -0
  43. tree_sitter_analyzer/utils.py +62 -24
  44. {tree_sitter_analyzer-1.7.5.dist-info → tree_sitter_analyzer-1.8.2.dist-info}/METADATA +136 -14
  45. {tree_sitter_analyzer-1.7.5.dist-info → tree_sitter_analyzer-1.8.2.dist-info}/RECORD +47 -38
  46. {tree_sitter_analyzer-1.7.5.dist-info → tree_sitter_analyzer-1.8.2.dist-info}/entry_points.txt +2 -0
  47. {tree_sitter_analyzer-1.7.5.dist-info → tree_sitter_analyzer-1.8.2.dist-info}/WHEEL +0 -0
@@ -0,0 +1,462 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ HTML Formatter
4
+
5
+ Specialized formatter for HTML/CSS code elements including MarkupElement and StyleElement.
6
+ Provides HTML-specific formatting with element classification and hierarchy display.
7
+ """
8
+
9
+ import json
10
+ from typing import Any
11
+
12
+ from ..models import CodeElement, MarkupElement, StyleElement
13
+ from .formatter_registry import IFormatter
14
+ from .base_formatter import BaseFormatter
15
+
16
+
17
class HtmlFormatter(BaseFormatter, IFormatter):
    """Markdown formatter for HTML/CSS analysis results.

    Accepts ``MarkupElement`` and ``StyleElement`` objects as well as plain
    dictionaries describing such elements, and renders them as grouped
    Markdown tables followed by an element hierarchy.
    """

    class _MockMarkupElement:
        """Attribute-style view over a dict that describes a markup element."""

        def __init__(self, data: dict) -> None:
            self.name = data.get("name", "unknown")
            self.tag_name = data.get("tag_name", data.get("name", "unknown"))
            self.element_class = data.get("element_class", "unknown")
            self.start_line = data.get("start_line", 0)
            self.end_line = data.get("end_line", 0)
            # An explicit ``None`` is normalised so callers can iterate safely.
            self.attributes = data.get("attributes") or {}
            # Dict input carries no object graph: no children, no parent.
            self.children = []
            self.parent = None
            self.language = data.get("language", "html")

    class _MockStyleElement:
        """Attribute-style view over a dict that describes a style rule."""

        def __init__(self, data: dict) -> None:
            self.name = data.get("name", "unknown")
            self.selector = data.get("selector", data.get("name", "unknown"))
            self.element_class = data.get("element_class", "unknown")
            self.start_line = data.get("start_line", 0)
            self.end_line = data.get("end_line", 0)
            self.properties = data.get("properties") or {}
            self.language = data.get("language", "css")

    @staticmethod
    def get_format_name() -> str:
        """Return the name this formatter is registered under."""
        return "html"

    def format(self, elements: list[CodeElement]) -> str:
        """Render elements as a Markdown report.

        Args:
            elements: Element objects and/or dictionaries describing them.

        Returns:
            The Markdown report, or a fixed message when ``elements`` is empty.
        """
        if not elements:
            return "No HTML elements found."

        markup_elements, style_elements, other_elements = self._partition_elements(
            elements
        )

        lines = ["# HTML Structure Analysis", ""]
        if markup_elements:
            lines.extend(self._format_markup_elements(markup_elements))
        if style_elements:
            lines.extend(self._format_style_elements(style_elements))
        if other_elements:
            lines.extend(self._format_other_elements(other_elements))

        return "\n".join(lines)

    def format_summary(self, analysis_result: dict[str, Any]) -> str:
        """Render element counts per category.

        Dictionaries are classified with the same rules as :meth:`format`, so
        the counts agree with the sections that the full report would show.

        Args:
            analysis_result: Mapping whose ``"elements"`` entry holds the elements.

        Returns:
            A short Markdown summary, or a fixed message when there are no elements.
        """
        elements = analysis_result.get("elements", [])
        if not elements:
            return "No HTML elements found."

        markup_elements, style_elements, other_elements = self._partition_elements(
            elements
        )

        lines = [
            "# HTML Analysis Summary",
            "",
            f"**Total Elements:** {len(elements)}",
            f"- Markup Elements: {len(markup_elements)}",
            f"- Style Elements: {len(style_elements)}",
            f"- Other Elements: {len(other_elements)}",
        ]
        return "\n".join(lines)

    def format_structure(self, analysis_result: dict[str, Any]) -> str:
        """Render the ``"elements"`` entry of ``analysis_result`` via :meth:`format`."""
        return self.format(analysis_result.get("elements", []))

    def format_advanced(
        self, analysis_result: dict[str, Any], output_format: str = "json"
    ) -> str:
        """Render advanced output.

        Args:
            analysis_result: Mapping whose ``"elements"`` entry holds the elements.
            output_format: ``"json"`` selects ``HtmlJsonFormatter``; any other
                value falls back to the Markdown report.
        """
        elements = analysis_result.get("elements", [])
        if output_format == "json":
            return HtmlJsonFormatter().format(elements)
        return self.format(elements)

    def format_table(
        self, analysis_result: dict[str, Any], table_type: str = "full"
    ) -> str:
        """Render table output.

        Args:
            analysis_result: Mapping whose ``"elements"`` entry holds the elements.
            table_type: ``"compact"`` or ``"json"`` select the matching
                formatter; every other value (including ``"html"`` and
                ``"full"``) produces the full Markdown report.
        """
        elements = analysis_result.get("elements", [])
        if table_type == "compact":
            return HtmlCompactFormatter().format(elements)
        if table_type == "json":
            return HtmlJsonFormatter().format(elements)
        return self.format(elements)

    def _partition_elements(self, elements: list) -> tuple[list, list, list]:
        """Split elements into (markup, style, other) lists, preserving order.

        Dictionaries that look like markup or style elements are wrapped in
        attribute-style stand-ins; everything unrecognised lands in ``other``.
        """
        markup_elements: list = []
        style_elements: list = []
        other_elements: list = []

        for item in elements:
            if isinstance(item, MarkupElement):
                markup_elements.append(item)
            elif isinstance(item, StyleElement):
                style_elements.append(item)
            elif isinstance(item, dict):
                element_type = item.get("type", item.get("element_type", "unknown"))
                if "tag_name" in item or element_type in ("tag", "element", "markup"):
                    markup_elements.append(self._dict_to_markup_element(item))
                elif "selector" in item or element_type in ("rule", "style"):
                    style_elements.append(self._dict_to_style_element(item))
                else:
                    other_elements.append(item)
            else:
                other_elements.append(item)

        return markup_elements, style_elements, other_elements

    @staticmethod
    def _escape_cell(value: Any) -> str:
        """Make ``value`` safe for a Markdown table cell.

        Line breaks would end the table row and an unescaped ``|`` would start
        a new column, so the former are collapsed to spaces and the latter is
        backslash-escaped.
        """
        return " ".join(str(value).splitlines()).replace("|", "\\|")

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Shorten ``text`` to ``limit`` characters, ending in ``...`` when cut."""
        if len(text) > limit:
            return text[: limit - 3] + "..."
        return text

    @staticmethod
    def _group_by_class(elements: list) -> dict:
        """Group elements by ``element_class`` in first-seen order."""
        groups: dict = {}
        for element in elements:
            groups.setdefault(element.element_class or "unknown", []).append(element)
        return groups

    def _format_markup_elements(self, elements: list[MarkupElement]) -> list[str]:
        """Render markup elements as one table per element class plus a hierarchy."""
        cell = self._escape_cell
        lines = ["## HTML Elements", ""]

        for element_class, group_elements in self._group_by_class(elements).items():
            lines.append(f"### {element_class.title()} Elements ({len(group_elements)})")
            lines.append("")
            lines.append("| Tag | Name | Lines | Attributes | Children |")
            lines.append("|-----|------|-------|------------|----------|")

            for element in group_elements:
                tag_name = element.tag_name or "unknown"
                name = element.name or tag_name
                lines_str = f"{element.start_line}-{element.end_line}"

                # Valueless attributes (e.g. ``disabled``) are shown by key only.
                attrs = [
                    f'{key}="{value}"' if value else key
                    for key, value in (element.attributes or {}).items()
                ]
                attrs_str = self._truncate(", ".join(attrs) if attrs else "-", 30)

                children_count = len(element.children or [])

                lines.append(
                    f"| `{cell(tag_name)}` | {cell(name)} | {lines_str} "
                    f"| {cell(attrs_str)} | {children_count} |"
                )

            lines.append("")

        # The hierarchy is only shown when at least one element is nested.
        root_elements = [e for e in elements if e.parent is None]
        if root_elements and len(root_elements) < len(elements):
            lines.append("### Element Hierarchy")
            lines.append("")
            for root in root_elements:
                lines.extend(self._format_element_tree(root, 0))
            lines.append("")

        return lines

    def _format_element_tree(self, element: MarkupElement, depth: int) -> list[str]:
        """Render ``element`` and its descendants as a nested Markdown list."""
        # Two spaces per level: a child of a "- " item must be indented to the
        # parent's content column to render as a nested list.
        indent = "  " * depth
        tag_name = element.tag_name or "unknown"

        # Only identifying attributes are shown to keep each line short.
        key_attrs = [
            f'{key}="{value}"' if value else key
            for key, value in (element.attributes or {}).items()
            if key in ("id", "class", "name")
        ]
        attrs_info = f" ({', '.join(key_attrs)})" if key_attrs else ""

        lines = [
            f"{indent}- `{tag_name}`{attrs_info} "
            f"[{element.start_line}-{element.end_line}]"
        ]
        for child in element.children or []:
            lines.extend(self._format_element_tree(child, depth + 1))

        return lines

    def _format_style_elements(self, elements: list[StyleElement]) -> list[str]:
        """Render style rules as one table per element class."""
        cell = self._escape_cell
        lines = ["## CSS Rules", ""]

        for element_class, group_elements in self._group_by_class(elements).items():
            lines.append(f"### {element_class.title()} Rules ({len(group_elements)})")
            lines.append("")
            lines.append("| Selector | Properties | Lines |")
            lines.append("|----------|------------|-------|")

            for element in group_elements:
                selector = element.selector or element.name
                lines_str = f"{element.start_line}-{element.end_line}"

                props = [
                    f"{key}: {value}"
                    for key, value in (element.properties or {}).items()
                ]
                props_str = self._truncate("; ".join(props) if props else "-", 40)

                lines.append(
                    f"| `{cell(selector)}` | {cell(props_str)} | {lines_str} |"
                )

            lines.append("")

        return lines

    def _format_other_elements(self, elements: list) -> list[str]:
        """Render unclassified elements (objects or dicts) as a single table."""
        cell = self._escape_cell
        lines = [
            "## Other Elements",
            "",
            "| Type | Name | Lines | Language |",
            "|------|------|-------|----------|",
        ]

        for element in elements:
            if isinstance(element, dict):
                element_type = element.get(
                    "element_type", element.get("type", "unknown")
                )
                name = element.get("name", "unknown")
                start_line = element.get("start_line", 0)
                end_line = element.get("end_line", 0)
                language = element.get("language", "unknown")
            else:
                element_type = getattr(element, "element_type", "unknown")
                name = getattr(element, "name", "unknown")
                start_line = getattr(element, "start_line", 0)
                end_line = getattr(element, "end_line", 0)
                language = getattr(element, "language", "unknown")

            lines_str = f"{start_line}-{end_line}"
            lines.append(
                f"| {cell(element_type)} | {cell(name)} | {lines_str} "
                f"| {cell(language)} |"
            )

        lines.append("")
        return lines

    def _dict_to_markup_element(self, data: dict):
        """Wrap a markup-describing dict in a MarkupElement-like object."""
        return self._MockMarkupElement(data)

    def _dict_to_style_element(self, data: dict):
        """Wrap a style-describing dict in a StyleElement-like object."""
        return self._MockStyleElement(data)
298
+
299
+
300
class HtmlJsonFormatter(IFormatter):
    """JSON formatter for HTML/CSS analysis results.

    Produces a single ``html_analysis`` object with the elements split into
    markup, style and other lists.
    """

    @staticmethod
    def get_format_name() -> str:
        """Return the name this formatter is registered under."""
        return "html_json"

    def format(self, elements: list[CodeElement]) -> str:
        """Serialise elements to an indented JSON document.

        Args:
            elements: Element objects and/or dictionaries describing them.
                Dictionaries are passed through unchanged into the list that
                matches their classification.

        Returns:
            The JSON text (2-space indent, non-ASCII characters kept as-is).
        """
        markup_elements: list = []
        style_elements: list = []
        other_elements: list = []

        for element in elements:
            if isinstance(element, MarkupElement):
                markup_elements.append(self._markup_to_dict(element))
            elif isinstance(element, StyleElement):
                style_elements.append(self._style_to_dict(element))
            elif isinstance(element, dict):
                element_type = element.get(
                    "element_type", element.get("type", "unknown")
                )
                if "tag_name" in element or element_type in ("tag", "element", "markup"):
                    markup_elements.append(element)
                elif "selector" in element or element_type in ("rule", "style"):
                    style_elements.append(element)
                else:
                    other_elements.append(element)
            else:
                other_elements.append(self._element_to_dict(element))

        result = {
            "html_analysis": {
                "total_elements": len(elements),
                "markup_elements": markup_elements,
                "style_elements": style_elements,
                "other_elements": other_elements,
            }
        }
        return json.dumps(result, indent=2, ensure_ascii=False)

    def _markup_to_dict(self, element: MarkupElement) -> dict[str, Any]:
        """Convert a MarkupElement and, recursively, its children to a dict."""
        # A missing child list is treated as "no children" instead of failing.
        children = element.children or []
        return {
            "name": element.name,
            "tag_name": element.tag_name,
            "element_class": element.element_class,
            "start_line": element.start_line,
            "end_line": element.end_line,
            "attributes": element.attributes,
            "children_count": len(children),
            "children": [self._markup_to_dict(child) for child in children],
            "language": element.language,
        }

    def _style_to_dict(self, element: StyleElement) -> dict[str, Any]:
        """Convert a StyleElement to a dict."""
        return {
            "name": element.name,
            "selector": element.selector,
            "element_class": element.element_class,
            "start_line": element.start_line,
            "end_line": element.end_line,
            "properties": element.properties,
            "language": element.language,
        }

    def _element_to_dict(self, element: CodeElement) -> dict[str, Any]:
        """Convert any other element object to a dict.

        Missing attributes fall back to ``"unknown"`` / ``0`` so that an
        unexpected object type degrades gracefully instead of raising.
        """
        return {
            "name": getattr(element, "name", "unknown"),
            "type": getattr(element, "element_type", "unknown"),
            "start_line": getattr(element, "start_line", 0),
            "end_line": getattr(element, "end_line", 0),
            "language": getattr(element, "language", "unknown"),
        }
372
+
373
+
374
class HtmlCompactFormatter(IFormatter):
    """Compact plain-text formatter: a count header plus one line per element."""

    @staticmethod
    def get_format_name() -> str:
        """Return the name this formatter is registered under."""
        return "html_compact"

    def format(self, elements: list[CodeElement]) -> str:
        """Render elements as a short text listing.

        Each element is classified exactly once; the header counts are derived
        from that same classification, so dictionaries shown with the markup
        or style symbol are also counted as markup or style.

        Args:
            elements: Element objects and/or dictionaries describing them.

        Returns:
            The listing, or a fixed message when ``elements`` is empty.
        """
        if not elements:
            return "No HTML elements found."

        counts = {"markup": 0, "style": 0, "other": 0}
        rows = []
        for element in elements:
            category, row = self._describe(element)
            counts[category] += 1
            rows.append(row)

        lines = [
            "HTML ELEMENTS",
            "-" * 20,
            f"Total: {len(elements)} elements",
            f" Markup: {counts['markup']}",
            f" Style: {counts['style']}",
            f" Other: {counts['other']}",
            "",
        ]
        lines.extend(rows)
        return "\n".join(lines)

    @staticmethod
    def _markup_info(tag_name: Any, attributes: Any) -> str:
        """Build the ``<tag> #id .class`` summary for a markup element."""
        # Attributes may be absent or None; treat both as "no attributes".
        attributes = attributes or {}
        info = f"<{tag_name}>"
        if attributes.get("id"):
            info += f" #{attributes['id']}"
        if attributes.get("class"):
            info += f" .{attributes['class']}"
        return info

    def _describe(self, element: Any) -> tuple[str, str]:
        """Classify one element and render its output line.

        Returns:
            ``(category, line)`` where category is ``"markup"``, ``"style"``
            or ``"other"``.
        """
        if isinstance(element, MarkupElement):
            category, symbol = "markup", "🏷️"
            info = self._markup_info(element.tag_name, element.attributes)
            name = element.name
            start_line = element.start_line
            end_line = element.end_line
        elif isinstance(element, StyleElement):
            category, symbol = "style", "🎨"
            info = element.selector
            name = element.name
            start_line = element.start_line
            end_line = element.end_line
        elif isinstance(element, dict):
            element_type = element.get("element_type", element.get("type", "unknown"))
            name = element.get("name", "unknown")
            start_line = element.get("start_line", 0)
            end_line = element.get("end_line", 0)

            if "tag_name" in element or element_type in ("tag", "element", "markup"):
                category, symbol = "markup", "🏷️"
                info = self._markup_info(
                    element.get("tag_name", name), element.get("attributes")
                )
            elif "selector" in element or element_type in ("rule", "style"):
                category, symbol = "style", "🎨"
                info = element.get("selector", name)
            else:
                category, symbol = "other", "📄"
                info = element_type
        else:
            category, symbol = "other", "📄"
            info = getattr(element, "element_type", "unknown")
            name = getattr(element, "name", "unknown")
            start_line = getattr(element, "start_line", 0)
            end_line = getattr(element, "end_line", 0)

        return category, f"{symbol} {name} {info} [{start_line}-{end_line}]"
449
+
450
+
451
+ # Register HTML formatters
452
def register_html_formatters() -> None:
    """Register the three HTML formatter classes with ``FormatterRegistry``."""
    # Imported here rather than at module top, as in the original code.
    from .formatter_registry import FormatterRegistry

    for formatter_cls in (HtmlFormatter, HtmlJsonFormatter, HtmlCompactFormatter):
        FormatterRegistry.register_formatter(formatter_cls)


# Auto-register when module is imported
register_html_formatters()
@@ -6,6 +6,7 @@ Factory for creating language-specific formatters for different output types.
6
6
  from typing import Dict, Type, Any
7
7
  from .base_formatter import BaseFormatter
8
8
  from .markdown_formatter import MarkdownFormatter
9
+ from .html_formatter import HtmlFormatter
9
10
 
10
11
 
11
12
  class LanguageFormatterFactory:
@@ -14,6 +15,8 @@ class LanguageFormatterFactory:
14
15
  _formatters: Dict[str, Type[BaseFormatter]] = {
15
16
  "markdown": MarkdownFormatter,
16
17
  "md": MarkdownFormatter, # Alias
18
+ "html": HtmlFormatter,
19
+ "css": HtmlFormatter, # CSS files also use HTML formatter
17
20
  }
18
21
 
19
22
  @classmethod
@@ -442,7 +442,7 @@ class MarkdownFormatter(BaseFormatter):
442
442
  """Format advanced analysis in text format"""
443
443
  output = ["--- Advanced Analysis Results ---"]
444
444
 
445
- # Basic info
445
+ # Basic info - format with quotes to match expected output
446
446
  output.append(f'"File: {data["file_path"]}"')
447
447
  output.append(f'"Language: {data["language"]}"')
448
448
  output.append(f'"Lines: {data["line_count"]}"')
@@ -12,6 +12,8 @@ import logging
12
12
  import sys
13
13
  from typing import Any
14
14
 
15
+ from .. import __version__
16
+
15
17
  try:
16
18
  from mcp.server import Server
17
19
  from mcp.server.models import InitializationOptions
@@ -68,7 +70,7 @@ class TreeSitterAnalyzerMCPServer:
68
70
 
69
71
  self.server: Server | None = None
70
72
  self.name = "tree-sitter-analyzer"
71
- self.version = "2.0.0"
73
+ self.version = __version__
72
74
 
73
75
  log_info(f"Initializing {self.name} v{self.version}")
74
76
 
@@ -66,6 +66,19 @@ class LanguageDetector:
66
66
  ".mkd": "markdown",
67
67
  ".mkdn": "markdown",
68
68
  ".mdx": "markdown",
69
+ # HTML系
70
+ ".html": "html",
71
+ ".htm": "html",
72
+ ".xhtml": "html",
73
+ # CSS系
74
+ ".css": "css",
75
+ ".scss": "css",
76
+ ".sass": "css",
77
+ ".less": "css",
78
+ # JSON系
79
+ ".json": "json",
80
+ ".jsonc": "json",
81
+ ".json5": "json",
69
82
  }
70
83
 
71
84
  # Ambiguous extensions (map to multiple languages)
@@ -100,6 +113,9 @@ class LanguageDetector:
100
113
  "rust",
101
114
  "go",
102
115
  "markdown",
116
+ "html",
117
+ "css",
118
+ "json",
103
119
  }
104
120
 
105
121
  def __init__(self) -> None:
@@ -143,6 +159,19 @@ class LanguageDetector:
143
159
  ".mkd": ("markdown", 0.8),
144
160
  ".mkdn": ("markdown", 0.8),
145
161
  ".mdx": ("markdown", 0.7), # MDX might be mixed with JSX
162
+ # HTML extensions
163
+ ".html": ("html", 0.9),
164
+ ".htm": ("html", 0.9),
165
+ ".xhtml": ("html", 0.8),
166
+ # CSS extensions
167
+ ".css": ("css", 0.9),
168
+ ".scss": ("css", 0.8), # Sass/SCSS
169
+ ".sass": ("css", 0.8), # Sass
170
+ ".less": ("css", 0.8), # Less
171
+ # JSON extensions
172
+ ".json": ("json", 0.9),
173
+ ".jsonc": ("json", 0.8), # JSON with comments
174
+ ".json5": ("json", 0.8), # JSON5 format
146
175
  }
147
176
 
148
177
  # Content-based detection patterns
@@ -194,6 +223,26 @@ class LanguageDetector:
194
223
  (r"^\s*\|.*\|", 0.2), # Tables
195
224
  (r"^[-=]{3,}$", 0.2), # Setext headers or horizontal rules
196
225
  ],
226
+ "html": [
227
+ (r"<!DOCTYPE\s+html", 0.4), # HTML5 doctype
228
+ (r"<html[^>]*>", 0.3), # HTML tag
229
+ (r"<head[^>]*>", 0.3), # Head tag
230
+ (r"<body[^>]*>", 0.3), # Body tag
231
+ (r"<div[^>]*>", 0.2), # Div tag
232
+ (r"<p[^>]*>", 0.2), # Paragraph tag
233
+ (r"<a\s+href=", 0.2), # Link tag with href
234
+ (r"<img\s+src=", 0.2), # Image tag with src
235
+ ],
236
+ "css": [
237
+ (r"[.#][\w-]+\s*{", 0.4), # CSS selectors
238
+ (r"@media\s+", 0.3), # Media queries
239
+ (r"@import\s+", 0.3), # Import statements
240
+ (r"@keyframes\s+", 0.3), # Keyframes
241
+ (r":\s*[\w-]+\s*;", 0.2), # Property declarations
242
+ (r"color\s*:", 0.2), # Color property
243
+ (r"font-", 0.2), # Font properties
244
+ (r"margin\s*:", 0.2), # Margin property
245
+ ],
197
246
  }
198
247
 
199
248
  from .utils import log_debug, log_warning
@@ -212,14 +261,22 @@ class LanguageDetector:
212
261
  content: ファイルコンテンツ(任意、曖昧性解決用)
213
262
 
214
263
  Returns:
215
- (言語名, 信頼度) のタプル
264
+ (言語名, 信頼度) のタプル - 常に有効な言語名を返す
216
265
  """
266
+ # Handle invalid input
267
+ if not file_path or not isinstance(file_path, str):
268
+ return "unknown", 0.0
269
+
217
270
  path = Path(file_path)
218
271
  extension = path.suffix.lower()
219
272
 
220
273
  # Direct mapping by extension
221
274
  if extension in self.EXTENSION_MAPPING:
222
275
  language = self.EXTENSION_MAPPING[extension]
276
+
277
+ # Ensure language is valid
278
+ if not language or language.strip() == "":
279
+ return "unknown", 0.0
223
280
 
224
281
  # Use confidence from extension_map if available
225
282
  if extension in self.extension_map:
@@ -233,11 +290,14 @@ class LanguageDetector:
233
290
  # Resolve ambiguity using content
234
291
  if content:
235
292
  refined_language = self._resolve_ambiguity(extension, content)
293
+ # Ensure refined language is valid
294
+ if not refined_language or refined_language.strip() == "":
295
+ refined_language = "unknown"
236
296
  return refined_language, 0.9 if refined_language != language else 0.7
237
297
  else:
238
298
  return language, 0.7 # Lower confidence without content
239
299
 
240
- # Unknown extension
300
+ # Unknown extension - always return "unknown" instead of None
241
301
  return "unknown", 0.0
242
302
 
243
303
  def detect_from_extension(self, file_path: str) -> str:
@@ -248,10 +308,22 @@ class LanguageDetector:
248
308
  file_path: File path
249
309
 
250
310
  Returns:
251
- Detected language name
311
+ Detected language name - 常に有効な文字列を返す
252
312
  """
253
- language, _ = self.detect_language(file_path)
254
- return language
313
+ # Handle invalid input
314
+ if not file_path or not isinstance(file_path, str):
315
+ return "unknown"
316
+
317
+ result = self.detect_language(file_path)
318
+ if isinstance(result, tuple):
319
+ language, _ = result
320
+ # Ensure language is valid
321
+ if not language or language.strip() == "":
322
+ return "unknown"
323
+ return language
324
+ else:
325
+ # Fallback for unexpected result format
326
+ return "unknown"
255
327
 
256
328
  def is_supported(self, language: str) -> bool:
257
329
  """
@@ -410,9 +482,21 @@ def detect_language_from_file(file_path: str) -> str:
410
482
  file_path: File path
411
483
 
412
484
  Returns:
413
- Detected language name
485
+ Detected language name - 常に有効な文字列を返す
414
486
  """
415
- return detector.detect_from_extension(file_path)
487
+ # Handle invalid input
488
+ if not file_path or not isinstance(file_path, str):
489
+ return "unknown"
490
+
491
+ # Create a fresh instance to ensure latest configuration
492
+ fresh_detector = LanguageDetector()
493
+ result = fresh_detector.detect_from_extension(file_path)
494
+
495
+ # Ensure result is valid
496
+ if not result or result.strip() == "":
497
+ return "unknown"
498
+
499
+ return result
416
500
 
417
501
 
418
502
  def is_language_supported(language: str) -> bool: