tree-sitter-analyzer 1.9.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. tree_sitter_analyzer/__init__.py +132 -0
  2. tree_sitter_analyzer/__main__.py +11 -0
  3. tree_sitter_analyzer/api.py +853 -0
  4. tree_sitter_analyzer/cli/__init__.py +39 -0
  5. tree_sitter_analyzer/cli/__main__.py +12 -0
  6. tree_sitter_analyzer/cli/argument_validator.py +89 -0
  7. tree_sitter_analyzer/cli/commands/__init__.py +26 -0
  8. tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
  9. tree_sitter_analyzer/cli/commands/base_command.py +181 -0
  10. tree_sitter_analyzer/cli/commands/default_command.py +18 -0
  11. tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
  12. tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
  13. tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
  14. tree_sitter_analyzer/cli/commands/query_command.py +109 -0
  15. tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
  16. tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
  17. tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
  18. tree_sitter_analyzer/cli/commands/table_command.py +414 -0
  19. tree_sitter_analyzer/cli/info_commands.py +124 -0
  20. tree_sitter_analyzer/cli_main.py +472 -0
  21. tree_sitter_analyzer/constants.py +85 -0
  22. tree_sitter_analyzer/core/__init__.py +15 -0
  23. tree_sitter_analyzer/core/analysis_engine.py +580 -0
  24. tree_sitter_analyzer/core/cache_service.py +333 -0
  25. tree_sitter_analyzer/core/engine.py +585 -0
  26. tree_sitter_analyzer/core/parser.py +293 -0
  27. tree_sitter_analyzer/core/query.py +605 -0
  28. tree_sitter_analyzer/core/query_filter.py +200 -0
  29. tree_sitter_analyzer/core/query_service.py +340 -0
  30. tree_sitter_analyzer/encoding_utils.py +530 -0
  31. tree_sitter_analyzer/exceptions.py +747 -0
  32. tree_sitter_analyzer/file_handler.py +246 -0
  33. tree_sitter_analyzer/formatters/__init__.py +1 -0
  34. tree_sitter_analyzer/formatters/base_formatter.py +201 -0
  35. tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
  36. tree_sitter_analyzer/formatters/formatter_config.py +197 -0
  37. tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
  38. tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
  39. tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
  40. tree_sitter_analyzer/formatters/go_formatter.py +368 -0
  41. tree_sitter_analyzer/formatters/html_formatter.py +498 -0
  42. tree_sitter_analyzer/formatters/java_formatter.py +423 -0
  43. tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
  44. tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
  45. tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
  46. tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
  47. tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
  48. tree_sitter_analyzer/formatters/php_formatter.py +301 -0
  49. tree_sitter_analyzer/formatters/python_formatter.py +830 -0
  50. tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
  51. tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
  52. tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
  53. tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
  54. tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
  55. tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
  56. tree_sitter_analyzer/interfaces/__init__.py +9 -0
  57. tree_sitter_analyzer/interfaces/cli.py +535 -0
  58. tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
  59. tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
  60. tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
  61. tree_sitter_analyzer/language_detector.py +553 -0
  62. tree_sitter_analyzer/language_loader.py +271 -0
  63. tree_sitter_analyzer/languages/__init__.py +10 -0
  64. tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
  65. tree_sitter_analyzer/languages/css_plugin.py +449 -0
  66. tree_sitter_analyzer/languages/go_plugin.py +836 -0
  67. tree_sitter_analyzer/languages/html_plugin.py +496 -0
  68. tree_sitter_analyzer/languages/java_plugin.py +1299 -0
  69. tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
  70. tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
  71. tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
  72. tree_sitter_analyzer/languages/php_plugin.py +862 -0
  73. tree_sitter_analyzer/languages/python_plugin.py +1636 -0
  74. tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
  75. tree_sitter_analyzer/languages/rust_plugin.py +673 -0
  76. tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
  77. tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
  78. tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
  79. tree_sitter_analyzer/legacy_table_formatter.py +860 -0
  80. tree_sitter_analyzer/mcp/__init__.py +34 -0
  81. tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
  82. tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
  83. tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
  84. tree_sitter_analyzer/mcp/server.py +869 -0
  85. tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
  86. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
  87. tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
  88. tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
  89. tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
  90. tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
  91. tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
  92. tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
  93. tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
  94. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
  95. tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
  96. tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
  97. tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
  98. tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
  99. tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
  100. tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
  101. tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
  102. tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
  103. tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
  104. tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
  105. tree_sitter_analyzer/models.py +840 -0
  106. tree_sitter_analyzer/mypy_current_errors.txt +2 -0
  107. tree_sitter_analyzer/output_manager.py +255 -0
  108. tree_sitter_analyzer/platform_compat/__init__.py +3 -0
  109. tree_sitter_analyzer/platform_compat/adapter.py +324 -0
  110. tree_sitter_analyzer/platform_compat/compare.py +224 -0
  111. tree_sitter_analyzer/platform_compat/detector.py +67 -0
  112. tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
  113. tree_sitter_analyzer/platform_compat/profiles.py +217 -0
  114. tree_sitter_analyzer/platform_compat/record.py +55 -0
  115. tree_sitter_analyzer/platform_compat/recorder.py +155 -0
  116. tree_sitter_analyzer/platform_compat/report.py +92 -0
  117. tree_sitter_analyzer/plugins/__init__.py +280 -0
  118. tree_sitter_analyzer/plugins/base.py +647 -0
  119. tree_sitter_analyzer/plugins/manager.py +384 -0
  120. tree_sitter_analyzer/project_detector.py +328 -0
  121. tree_sitter_analyzer/queries/__init__.py +27 -0
  122. tree_sitter_analyzer/queries/csharp.py +216 -0
  123. tree_sitter_analyzer/queries/css.py +615 -0
  124. tree_sitter_analyzer/queries/go.py +275 -0
  125. tree_sitter_analyzer/queries/html.py +543 -0
  126. tree_sitter_analyzer/queries/java.py +402 -0
  127. tree_sitter_analyzer/queries/javascript.py +724 -0
  128. tree_sitter_analyzer/queries/kotlin.py +192 -0
  129. tree_sitter_analyzer/queries/markdown.py +258 -0
  130. tree_sitter_analyzer/queries/php.py +95 -0
  131. tree_sitter_analyzer/queries/python.py +859 -0
  132. tree_sitter_analyzer/queries/ruby.py +92 -0
  133. tree_sitter_analyzer/queries/rust.py +223 -0
  134. tree_sitter_analyzer/queries/sql.py +555 -0
  135. tree_sitter_analyzer/queries/typescript.py +871 -0
  136. tree_sitter_analyzer/queries/yaml.py +236 -0
  137. tree_sitter_analyzer/query_loader.py +272 -0
  138. tree_sitter_analyzer/security/__init__.py +22 -0
  139. tree_sitter_analyzer/security/boundary_manager.py +277 -0
  140. tree_sitter_analyzer/security/regex_checker.py +297 -0
  141. tree_sitter_analyzer/security/validator.py +599 -0
  142. tree_sitter_analyzer/table_formatter.py +782 -0
  143. tree_sitter_analyzer/utils/__init__.py +53 -0
  144. tree_sitter_analyzer/utils/logging.py +433 -0
  145. tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
  146. tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
  147. tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
  148. tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
  149. tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
@@ -0,0 +1,496 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ HTML Language Plugin
4
+
5
+ True HTML parser using tree-sitter-html for comprehensive HTML analysis.
6
+ Provides HTML-specific analysis capabilities including element extraction,
7
+ attribute parsing, and document structure analysis.
8
+ """
9
+
10
+ import logging
11
+ from typing import TYPE_CHECKING
12
+
13
+ from ..models import AnalysisResult, MarkupElement
14
+ from ..plugins.base import ElementExtractor, LanguagePlugin
15
+ from ..utils import log_debug, log_error, log_info
16
+
17
+ if TYPE_CHECKING:
18
+ import tree_sitter
19
+
20
+ from ..core.analysis_engine import AnalysisRequest
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class HtmlElementExtractor(ElementExtractor):
    """HTML-specific element extractor using tree-sitter-html.

    Walks a parsed tree and converts element nodes into ``MarkupElement``
    objects that are linked to each other through ``parent`` / ``children``.
    Function, class, variable and import extraction do not apply to HTML, so
    those methods always return an empty list.
    """

    def __init__(self) -> None:
        """Create the tag classification table used by ``_classify_element``."""
        self.element_categories = {
            # HTML element classification system: category -> lower-case tags
            "structure": [
                "html",
                "body",
                "div",
                "span",
                "section",
                "article",
                "aside",
                "nav",
                "main",
                "header",
                "footer",
            ],
            "heading": ["h1", "h2", "h3", "h4", "h5", "h6"],
            "text": [
                "p",
                "a",
                "strong",
                "em",
                "b",
                "i",
                "u",
                "small",
                "mark",
                "del",
                "ins",
                "sub",
                "sup",
            ],
            "list": ["ul", "ol", "li", "dl", "dt", "dd"],
            "media": [
                "img",
                "video",
                "audio",
                "source",
                "track",
                "canvas",
                "svg",
                "picture",
            ],
            "form": [
                "form",
                "input",
                "textarea",
                "button",
                "select",
                "option",
                "optgroup",
                "label",
                "fieldset",
                "legend",
            ],
            "table": [
                "table",
                "thead",
                "tbody",
                "tfoot",
                "tr",
                "td",
                "th",
                "caption",
                "colgroup",
                "col",
            ],
            "metadata": [
                "head",
                "title",
                "meta",
                "link",
                "style",
                "script",
                "noscript",
                "base",
            ],
        }
        # (source string, its UTF-8 bytes) for the source most recently sliced
        # by ``_extract_node_text``; avoids re-encoding the document per node.
        self._encoded_source: tuple[str, bytes] | None = None

    def extract_functions(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have functions, return empty list"""
        return []

    def extract_classes(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have classes in the traditional sense, return empty list"""
        return []

    def extract_variables(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have variables, return empty list"""
        return []

    def extract_imports(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have imports, return empty list"""
        return []

    def extract_html_elements(
        self, tree: "tree_sitter.Tree", source_code: str
    ) -> list[MarkupElement]:
        """Extract HTML elements using the tree-sitter-html parser.

        Args:
            tree: Parsed tree; objects without a ``root_node`` attribute
                yield an empty result.
            source_code: The text the tree was parsed from.

        Returns:
            Every recorded element in document (pre-order) order. Errors are
            logged and whatever was collected so far is returned.
        """
        elements: list[MarkupElement] = []

        try:
            if hasattr(tree, "root_node"):
                self._traverse_for_html_elements(
                    tree.root_node, elements, source_code, None
                )
        except Exception as e:
            log_error(f"Error in HTML element extraction: {e}")
        finally:
            # Do not keep the encoded document alive after the walk.
            self._encoded_source = None

        return elements

    def _traverse_for_html_elements(
        self,
        node: "tree_sitter.Node",
        elements: list[MarkupElement],
        source_code: str,
        parent: MarkupElement | None,
    ) -> None:
        """Collect elements below ``node`` into ``elements`` in document order.

        The walk is iterative (explicit stack) so deeply nested markup cannot
        hit the interpreter recursion limit.

        Args:
            node: Root of the subtree to walk.
            elements: Output list; recorded elements are appended to it.
            source_code: The text the tree was parsed from.
            parent: Element that becomes the parent of the top-most elements
                found in this subtree, or ``None``.
        """
        # Stack entries: (node, nearest recorded ancestor element, whether the
        # node's direct parent is an ``element`` node that was recorded).
        pending = [(node, parent, False)]

        while pending:
            current, nearest, inside_recorded_element = pending.pop()
            node_type = getattr(current, "type", None)
            child_parent = nearest
            recorded = False

            # In the tree-sitter-html grammar a ``self_closing_tag`` is the
            # child of an ``element`` node; once that wrapper is recorded the
            # tag itself must not be recorded again.
            already_represented = (
                inside_recorded_element and node_type == "self_closing_tag"
            )

            if (
                node_type is not None
                and not already_represented
                and self._is_html_element_node(node_type)
            ):
                try:
                    element = self._create_markup_element(
                        current, source_code, nearest
                    )
                except Exception as e:
                    log_debug(f"Failed to extract HTML element: {e}")
                    element = None

                if element is not None:
                    elements.append(element)
                    child_parent = element
                    recorded = True
                # When no element could be built, descendants stay attached
                # to the nearest ancestor that was recorded.

            children = list(getattr(current, "children", None) or ())
            wraps_children = recorded and node_type == "element"
            # Push in reverse so that popping visits children left to right.
            pending.extend(
                (child, child_parent, wraps_children)
                for child in reversed(children)
            )

    def _is_html_element_node(self, node_type: str) -> bool:
        """Return True if ``node_type`` is one of the element-like node types.

        ``start_tag`` / ``end_tag`` are deliberately absent so that an element
        is not counted once per tag. ``self_closing_tag`` is accepted here;
        the traversal skips it when its wrapping ``element`` was recorded.
        """
        html_element_types = [
            "element",
            "self_closing_tag",
            "script_element",
            "style_element",
        ]
        return node_type in html_element_types

    def _create_markup_element(
        self,
        node: "tree_sitter.Node",
        source_code: str,
        parent: MarkupElement | None,
    ) -> MarkupElement | None:
        """Build a ``MarkupElement`` for ``node`` and link it to ``parent``.

        Returns:
            The new element, or ``None`` when no tag name could be determined
            or construction failed (the failure is logged at debug level).
        """
        try:
            tag_name = self._extract_tag_name(node, source_code)
            if not tag_name:
                return None

            attributes = self._extract_attributes(node, source_code)
            element_class = self._classify_element(tag_name)
            raw_text = self._extract_node_text(node, source_code)

            # Node points are zero-based rows; reported lines are one-based.
            # 0 marks "position unknown".
            start_line = node.start_point[0] + 1 if hasattr(node, "start_point") else 0
            end_line = node.end_point[0] + 1 if hasattr(node, "end_point") else 0

            element = MarkupElement(
                name=tag_name,
                start_line=start_line,
                end_line=end_line,
                raw_text=raw_text,
                language="html",
                tag_name=tag_name,
                attributes=attributes,
                parent=parent,
                children=[],
                element_class=element_class,
            )

            # Add to parent's children if parent exists
            if parent:
                parent.children.append(element)

            return element

        except Exception as e:
            log_debug(f"Failed to create MarkupElement: {e}")
            return None

    def _extract_tag_name(self, node: "tree_sitter.Node", source_code: str) -> str:
        """Return the tag name of an element-like node.

        Looks for a ``tag_name`` child, either directly or inside a
        ``start_tag`` / ``self_closing_tag`` child. If none is found the name
        is derived from the node text when it starts with ``<``; otherwise,
        and on any error, ``"unknown"`` is returned.
        """
        try:
            for child in getattr(node, "children", None) or ():
                child_type = getattr(child, "type", None)
                if child_type == "tag_name":
                    return self._extract_node_text(child, source_code).strip()
                if child_type in ("start_tag", "self_closing_tag"):
                    # Look for tag_name within start_tag or self_closing_tag
                    for grandchild in child.children:
                        if getattr(grandchild, "type", None) == "tag_name":
                            return self._extract_node_text(
                                grandchild, source_code
                            ).strip()

            # Fallback: try to extract from node text (<tagname ...> pattern)
            node_text = self._extract_node_text(node, source_code)
            if node_text.startswith("<"):
                tag_part = node_text.split(">")[0].split()[0]
                return tag_part.lstrip("<").rstrip(">")

            return "unknown"
        except Exception:
            return "unknown"

    def _extract_attributes(
        self, node: "tree_sitter.Node", source_code: str
    ) -> dict[str, str]:
        """Return the attributes of an element-like node as ``{name: value}``.

        ``attribute`` nodes are read from the node itself and from its
        ``start_tag`` / ``self_closing_tag`` children. A repeated name keeps
        the value seen last. Errors are logged and the attributes gathered so
        far are returned.
        """
        attributes: dict[str, str] = {}

        try:
            for child in getattr(node, "children", None) or ():
                child_type = getattr(child, "type", None)
                if child_type == "attribute":
                    attribute_nodes = [child]
                elif child_type in ("start_tag", "self_closing_tag"):
                    attribute_nodes = [
                        grandchild
                        for grandchild in child.children
                        if getattr(grandchild, "type", None) == "attribute"
                    ]
                else:
                    continue

                for attribute_node in attribute_nodes:
                    attr_name, attr_value = self._parse_attribute(
                        attribute_node, source_code
                    )
                    if attr_name:
                        attributes[attr_name] = attr_value
        except Exception as e:
            log_debug(f"Failed to extract attributes: {e}")

        return attributes

    @staticmethod
    def _strip_matching_quotes(text: str) -> str:
        """Remove one pair of identical surrounding quotes, if present.

        Only the outermost matching pair is removed, so quote characters that
        are part of the value itself (``"it's"``, ``"'x'"``) are preserved.
        """
        if len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'"):
            return text[1:-1]
        return text

    def _parse_attribute(
        self, attr_node: "tree_sitter.Node", source_code: str
    ) -> tuple[str, str]:
        """Split an ``attribute`` node into ``(name, value)``.

        The name comes from the ``attribute_name`` child and the value from a
        ``quoted_attribute_value`` (surrounding quotes removed) or
        ``attribute_value`` child. If no name child exists the raw attribute
        text is split on the first ``=``; text without ``=`` is treated as a
        boolean attribute with an empty value.

        Returns:
            ``(name, value)``, or ``("", "")`` if parsing raised.
        """
        try:
            attr_name = ""
            attr_value = ""

            for child in getattr(attr_node, "children", None) or ():
                child_type = getattr(child, "type", None)
                if child_type == "attribute_name":
                    attr_name = self._extract_node_text(child, source_code).strip()
                elif child_type == "quoted_attribute_value":
                    attr_value = self._strip_matching_quotes(
                        self._extract_node_text(child, source_code).strip()
                    )
                elif child_type == "attribute_value":
                    attr_value = self._extract_node_text(child, source_code).strip()

            # Fallback to simple parsing
            if not attr_name:
                attr_text = self._extract_node_text(attr_node, source_code)
                if "=" in attr_text:
                    name, value = attr_text.split("=", 1)
                    attr_name = name.strip()
                    attr_value = self._strip_matching_quotes(value.strip())
                else:
                    # Boolean attribute
                    attr_name = attr_text.strip()
                    attr_value = ""

            return attr_name, attr_value
        except Exception:
            return "", ""

    def _classify_element(self, tag_name: str) -> str:
        """Return the category of ``tag_name`` (case-insensitive).

        Categories come from ``self.element_categories``; tags not listed
        there are classified as ``"unknown"``.
        """
        tag_name_lower = tag_name.lower()

        for category, tags in self.element_categories.items():
            if tag_name_lower in tags:
                return category

        return "unknown"

    def _extract_node_text(self, node: "tree_sitter.Node", source_code: str) -> str:
        """Return the source text covered by ``node``.

        Node offsets are byte offsets, so the slice is taken from the UTF-8
        encoding of ``source_code``. The encoded form is cached per source
        string object so a whole-tree walk encodes the document only once.

        Returns:
            The decoded slice (undecodable bytes replaced), or ``""`` when the
            node has no byte offsets or slicing failed.
        """
        try:
            if hasattr(node, "start_byte") and hasattr(node, "end_byte"):
                # getattr: tolerate instances created without __init__.
                cached = getattr(self, "_encoded_source", None)
                if cached is None or cached[0] is not source_code:
                    cached = (source_code, source_code.encode("utf-8"))
                    self._encoded_source = cached
                node_bytes = cached[1][node.start_byte : node.end_byte]
                return node_bytes.decode("utf-8", errors="replace")
            return ""
        except Exception as e:
            log_debug(f"Failed to extract node text: {e}")
            return ""
360
+
361
+
362
class HtmlPlugin(LanguagePlugin):
    """Language plugin that analyzes HTML files with tree-sitter-html."""

    def get_language_name(self) -> str:
        """Return the language identifier of this plugin."""
        return "html"

    def get_file_extensions(self) -> list[str]:
        """Return the file extensions handled by this plugin."""
        return [".html", ".htm", ".xhtml"]

    def create_extractor(self) -> ElementExtractor:
        """Return a new ``HtmlElementExtractor``."""
        return HtmlElementExtractor()

    def get_supported_element_types(self) -> list[str]:
        """Return the element type names this plugin produces."""
        return ["html_element"]

    def get_queries(self) -> dict[str, str]:
        """Return the HTML query table from ``..queries.html``."""
        # Imported on demand, as in the rest of this plugin.
        from ..queries.html import HTML_QUERIES

        return HTML_QUERIES

    def execute_query_strategy(
        self, query_key: str | None, language: str
    ) -> str | None:
        """Look up the query registered under ``query_key``.

        Returns ``None`` when ``language`` is not ``"html"``, when
        ``query_key`` is empty/``None``, or when the key is not registered.
        """
        if language != "html":
            return None

        available = self.get_queries()
        if not query_key:
            return None
        return available.get(query_key)

    def get_element_categories(self) -> dict[str, list[str]]:
        """Map every HTML category name to the node types queried for it."""
        category_names = (
            "structure",
            "heading",
            "text",
            "list",
            "media",
            "form",
            "table",
            "metadata",
        )
        # Each category gets its own list object.
        return {category: ["element"] for category in category_names}

    async def analyze_file(
        self, file_path: str, request: "AnalysisRequest"
    ) -> "AnalysisResult":
        """Analyze an HTML file and return its elements.

        The file is read with ``read_file_safe`` and parsed with
        tree-sitter-html. If importing the parser raises ``ImportError`` the
        result instead holds one placeholder ``html`` element spanning the
        whole file. Any other failure yields a result with ``success=False``
        and the error text in ``error_message``.

        Args:
            file_path: Path of the file to analyze.
            request: Analysis request; not read by this method.
        """
        from ..encoding_utils import read_file_safe

        try:
            content, _encoding = read_file_safe(file_path)

            try:
                import tree_sitter
                import tree_sitter_html as ts_html

                html_language = tree_sitter.Language(ts_html.language())

                html_parser = tree_sitter.Parser()
                html_parser.language = html_language

                # The parser consumes bytes, so hand it the UTF-8 form.
                parsed_tree = html_parser.parse(content.encode("utf-8"))

                found = self.create_extractor().extract_html_elements(
                    parsed_tree, content
                )

                log_info(f"Extracted {len(found)} HTML elements from {file_path}")

                total_lines = len(content.splitlines())
                return AnalysisResult(
                    file_path=file_path,
                    language="html",
                    line_count=total_lines,
                    elements=found,
                    node_count=len(found),
                    query_results={},
                    source_code=content,
                    success=True,
                    error_message=None,
                )

            except ImportError:
                log_error(
                    "tree-sitter-html not available, falling back to basic parsing"
                )
                total_lines = len(content.splitlines())

                # Preview of the document: first 200 characters plus an
                # ellipsis when the content is longer than that.
                if len(content) > 200:
                    preview = content[:200] + "..."
                else:
                    preview = content

                # Single placeholder element standing in for the document.
                placeholder = MarkupElement(
                    name="html",
                    start_line=1,
                    end_line=total_lines,
                    raw_text=preview,
                    language="html",
                    tag_name="html",
                    attributes={},
                    parent=None,
                    children=[],
                    element_class="structure",
                )
                found = [placeholder]

                return AnalysisResult(
                    file_path=file_path,
                    language="html",
                    line_count=total_lines,
                    elements=found,
                    node_count=len(found),
                    query_results={},
                    source_code=content,
                    success=True,
                    error_message=None,
                )

        except Exception as e:
            log_error(f"Failed to analyze HTML file {file_path}: {e}")
            return AnalysisResult(
                file_path=file_path,
                language="html",
                line_count=0,
                elements=[],
                node_count=0,
                query_results={},
                source_code="",
                success=False,
                error_message=str(e),
            )