tree-sitter-analyzer 1.9.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tree_sitter_analyzer/__init__.py +132 -0
- tree_sitter_analyzer/__main__.py +11 -0
- tree_sitter_analyzer/api.py +853 -0
- tree_sitter_analyzer/cli/__init__.py +39 -0
- tree_sitter_analyzer/cli/__main__.py +12 -0
- tree_sitter_analyzer/cli/argument_validator.py +89 -0
- tree_sitter_analyzer/cli/commands/__init__.py +26 -0
- tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
- tree_sitter_analyzer/cli/commands/base_command.py +181 -0
- tree_sitter_analyzer/cli/commands/default_command.py +18 -0
- tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
- tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
- tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
- tree_sitter_analyzer/cli/commands/query_command.py +109 -0
- tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
- tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
- tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
- tree_sitter_analyzer/cli/commands/table_command.py +414 -0
- tree_sitter_analyzer/cli/info_commands.py +124 -0
- tree_sitter_analyzer/cli_main.py +472 -0
- tree_sitter_analyzer/constants.py +85 -0
- tree_sitter_analyzer/core/__init__.py +15 -0
- tree_sitter_analyzer/core/analysis_engine.py +580 -0
- tree_sitter_analyzer/core/cache_service.py +333 -0
- tree_sitter_analyzer/core/engine.py +585 -0
- tree_sitter_analyzer/core/parser.py +293 -0
- tree_sitter_analyzer/core/query.py +605 -0
- tree_sitter_analyzer/core/query_filter.py +200 -0
- tree_sitter_analyzer/core/query_service.py +340 -0
- tree_sitter_analyzer/encoding_utils.py +530 -0
- tree_sitter_analyzer/exceptions.py +747 -0
- tree_sitter_analyzer/file_handler.py +246 -0
- tree_sitter_analyzer/formatters/__init__.py +1 -0
- tree_sitter_analyzer/formatters/base_formatter.py +201 -0
- tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
- tree_sitter_analyzer/formatters/formatter_config.py +197 -0
- tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
- tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
- tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
- tree_sitter_analyzer/formatters/go_formatter.py +368 -0
- tree_sitter_analyzer/formatters/html_formatter.py +498 -0
- tree_sitter_analyzer/formatters/java_formatter.py +423 -0
- tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
- tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
- tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
- tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
- tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
- tree_sitter_analyzer/formatters/php_formatter.py +301 -0
- tree_sitter_analyzer/formatters/python_formatter.py +830 -0
- tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
- tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
- tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
- tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
- tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
- tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
- tree_sitter_analyzer/interfaces/__init__.py +9 -0
- tree_sitter_analyzer/interfaces/cli.py +535 -0
- tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
- tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
- tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
- tree_sitter_analyzer/language_detector.py +553 -0
- tree_sitter_analyzer/language_loader.py +271 -0
- tree_sitter_analyzer/languages/__init__.py +10 -0
- tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
- tree_sitter_analyzer/languages/css_plugin.py +449 -0
- tree_sitter_analyzer/languages/go_plugin.py +836 -0
- tree_sitter_analyzer/languages/html_plugin.py +496 -0
- tree_sitter_analyzer/languages/java_plugin.py +1299 -0
- tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
- tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
- tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
- tree_sitter_analyzer/languages/php_plugin.py +862 -0
- tree_sitter_analyzer/languages/python_plugin.py +1636 -0
- tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
- tree_sitter_analyzer/languages/rust_plugin.py +673 -0
- tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
- tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
- tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
- tree_sitter_analyzer/legacy_table_formatter.py +860 -0
- tree_sitter_analyzer/mcp/__init__.py +34 -0
- tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
- tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
- tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
- tree_sitter_analyzer/mcp/server.py +869 -0
- tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
- tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
- tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
- tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
- tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
- tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
- tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
- tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
- tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
- tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
- tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
- tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
- tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
- tree_sitter_analyzer/models.py +840 -0
- tree_sitter_analyzer/mypy_current_errors.txt +2 -0
- tree_sitter_analyzer/output_manager.py +255 -0
- tree_sitter_analyzer/platform_compat/__init__.py +3 -0
- tree_sitter_analyzer/platform_compat/adapter.py +324 -0
- tree_sitter_analyzer/platform_compat/compare.py +224 -0
- tree_sitter_analyzer/platform_compat/detector.py +67 -0
- tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
- tree_sitter_analyzer/platform_compat/profiles.py +217 -0
- tree_sitter_analyzer/platform_compat/record.py +55 -0
- tree_sitter_analyzer/platform_compat/recorder.py +155 -0
- tree_sitter_analyzer/platform_compat/report.py +92 -0
- tree_sitter_analyzer/plugins/__init__.py +280 -0
- tree_sitter_analyzer/plugins/base.py +647 -0
- tree_sitter_analyzer/plugins/manager.py +384 -0
- tree_sitter_analyzer/project_detector.py +328 -0
- tree_sitter_analyzer/queries/__init__.py +27 -0
- tree_sitter_analyzer/queries/csharp.py +216 -0
- tree_sitter_analyzer/queries/css.py +615 -0
- tree_sitter_analyzer/queries/go.py +275 -0
- tree_sitter_analyzer/queries/html.py +543 -0
- tree_sitter_analyzer/queries/java.py +402 -0
- tree_sitter_analyzer/queries/javascript.py +724 -0
- tree_sitter_analyzer/queries/kotlin.py +192 -0
- tree_sitter_analyzer/queries/markdown.py +258 -0
- tree_sitter_analyzer/queries/php.py +95 -0
- tree_sitter_analyzer/queries/python.py +859 -0
- tree_sitter_analyzer/queries/ruby.py +92 -0
- tree_sitter_analyzer/queries/rust.py +223 -0
- tree_sitter_analyzer/queries/sql.py +555 -0
- tree_sitter_analyzer/queries/typescript.py +871 -0
- tree_sitter_analyzer/queries/yaml.py +236 -0
- tree_sitter_analyzer/query_loader.py +272 -0
- tree_sitter_analyzer/security/__init__.py +22 -0
- tree_sitter_analyzer/security/boundary_manager.py +277 -0
- tree_sitter_analyzer/security/regex_checker.py +297 -0
- tree_sitter_analyzer/security/validator.py +599 -0
- tree_sitter_analyzer/table_formatter.py +782 -0
- tree_sitter_analyzer/utils/__init__.py +53 -0
- tree_sitter_analyzer/utils/logging.py +433 -0
- tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
HTML Language Plugin
|
|
4
|
+
|
|
5
|
+
True HTML parser using tree-sitter-html for comprehensive HTML analysis.
|
|
6
|
+
Provides HTML-specific analysis capabilities including element extraction,
|
|
7
|
+
attribute parsing, and document structure analysis.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import TYPE_CHECKING
|
|
12
|
+
|
|
13
|
+
from ..models import AnalysisResult, MarkupElement
|
|
14
|
+
from ..plugins.base import ElementExtractor, LanguagePlugin
|
|
15
|
+
from ..utils import log_debug, log_error, log_info
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
import tree_sitter
|
|
19
|
+
|
|
20
|
+
from ..core.analysis_engine import AnalysisRequest
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class HtmlElementExtractor(ElementExtractor):
    """HTML-specific element extractor using tree-sitter-html.

    Walks a parsed tree and builds one ``MarkupElement`` per HTML element,
    linking each element to its nearest enclosing extracted element through
    ``parent`` / ``children``.  The function, class, variable and import
    extraction hooks always return empty lists.
    """

    def __init__(self) -> None:
        # Most recently encoded source as a (text, utf8_bytes) pair.  Lets
        # _extract_node_text slice node byte ranges without re-encoding the
        # whole document once per node.
        self._source_cache: tuple[str, bytes] | None = None

        # HTML element classification system: category name -> tag names.
        # Consulted by _classify_element.
        self.element_categories = {
            "structure": [
                "html",
                "body",
                "div",
                "span",
                "section",
                "article",
                "aside",
                "nav",
                "main",
                "header",
                "footer",
            ],
            "heading": ["h1", "h2", "h3", "h4", "h5", "h6"],
            "text": [
                "p",
                "a",
                "strong",
                "em",
                "b",
                "i",
                "u",
                "small",
                "mark",
                "del",
                "ins",
                "sub",
                "sup",
            ],
            "list": ["ul", "ol", "li", "dl", "dt", "dd"],
            "media": [
                "img",
                "video",
                "audio",
                "source",
                "track",
                "canvas",
                "svg",
                "picture",
            ],
            "form": [
                "form",
                "input",
                "textarea",
                "button",
                "select",
                "option",
                "optgroup",
                "label",
                "fieldset",
                "legend",
            ],
            "table": [
                "table",
                "thead",
                "tbody",
                "tfoot",
                "tr",
                "td",
                "th",
                "caption",
                "colgroup",
                "col",
            ],
            "metadata": [
                "head",
                "title",
                "meta",
                "link",
                "style",
                "script",
                "noscript",
                "base",
            ],
        }

    def extract_functions(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have functions, return empty list"""
        return []

    def extract_classes(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have classes in the traditional sense, return empty list"""
        return []

    def extract_variables(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have variables, return empty list"""
        return []

    def extract_imports(self, tree: "tree_sitter.Tree", source_code: str) -> list:
        """HTML doesn't have imports, return empty list"""
        return []

    def extract_html_elements(
        self, tree: "tree_sitter.Tree", source_code: str
    ) -> list[MarkupElement]:
        """Extract HTML elements using tree-sitter-html parser.

        Args:
            tree: Parsed tree.  An object without a ``root_node`` attribute
                yields no elements.
            source_code: Text the tree was parsed from.

        Returns:
            The extracted elements in pre-order (an element precedes its
            descendants).  An exception raised during traversal is logged
            and whatever was collected up to that point is returned.
        """
        elements: list[MarkupElement] = []

        try:
            if hasattr(tree, "root_node"):
                self._traverse_for_html_elements(
                    tree.root_node, elements, source_code, None
                )
        except Exception as e:
            log_error(f"Error in HTML element extraction: {e}")

        return elements

    def _traverse_for_html_elements(
        self,
        node: "tree_sitter.Node",
        elements: list[MarkupElement],
        source_code: str,
        parent: MarkupElement | None,
    ) -> None:
        """Traverse tree to find HTML elements using tree-sitter-html grammar.

        The walk is iterative (explicit stack) rather than recursive, so
        nesting depth is not bounded by the interpreter recursion limit and
        an error while handling one node can never cause its children to be
        visited twice.

        Args:
            node: Node at which the walk starts.
            elements: Output list; extracted elements are appended in
                pre-order.
            source_code: Text the tree was parsed from.
            parent: Element to record as parent of the top-most elements
                found under ``node`` (``None`` for the document root).
        """
        # Each entry: (node, nearest extracted ancestor element, whether the
        # direct parent node was an "element" node that was extracted).
        stack: list[tuple] = [(node, parent, False)]

        while stack:
            current, current_parent, wrapper_extracted = stack.pop()
            child_parent = current_parent
            extracted_element_node = False

            node_type = getattr(current, "type", None)

            # The grammar wraps every self_closing_tag in an element node;
            # once the wrapper has been extracted, extracting the inner tag
            # as well would report the same HTML element twice.
            is_duplicate = wrapper_extracted and node_type == "self_closing_tag"

            if (
                not is_duplicate
                and node_type is not None
                and self._is_html_element_node(node_type)
            ):
                try:
                    element = self._create_markup_element(
                        current, source_code, current_parent
                    )
                except Exception as e:
                    log_debug(f"Failed to extract HTML element: {e}")
                    element = None

                if element is not None:
                    elements.append(element)
                    child_parent = element
                    extracted_element_node = node_type == "element"
                # When creation failed, descendants keep the nearest
                # enclosing element as parent instead of losing the link.

            children = getattr(current, "children", None)
            if children:
                # Push in reverse so children are popped in document order.
                stack.extend(
                    (child, child_parent, extracted_element_node)
                    for child in reversed(children)
                )

    def _is_html_element_node(self, node_type: str) -> bool:
        """Check if a node type represents an HTML element in tree-sitter-html grammar"""
        # tree-sitter-html structure: element contains start_tag/end_tag.
        # start_tag/end_tag are deliberately absent so a tag pair is not
        # counted separately from its enclosing element.
        html_element_types = [
            "element",
            "self_closing_tag",
            "script_element",
            "style_element",
        ]
        return node_type in html_element_types

    def _create_markup_element(
        self,
        node: "tree_sitter.Node",
        source_code: str,
        parent: MarkupElement | None,
    ) -> MarkupElement | None:
        """Create MarkupElement from tree-sitter node using tree-sitter-html grammar.

        Args:
            node: Element node to convert.
            source_code: Text the tree was parsed from.
            parent: Enclosing element, or ``None``.  When given, the new
                element is appended to ``parent.children``.

        Returns:
            The new element, or ``None`` when no tag name could be extracted
            or any exception occurred (logged at debug level).
        """
        try:
            tag_name = self._extract_tag_name(node, source_code)
            if not tag_name:
                return None

            attributes = self._extract_attributes(node, source_code)
            element_class = self._classify_element(tag_name)
            raw_text = self._extract_node_text(node, source_code)

            # Points are zero-based (row, column); reported lines are
            # one-based.  0 marks a node without position information.
            start_line = node.start_point[0] + 1 if hasattr(node, "start_point") else 0
            end_line = node.end_point[0] + 1 if hasattr(node, "end_point") else 0

            element = MarkupElement(
                name=tag_name,
                start_line=start_line,
                end_line=end_line,
                raw_text=raw_text,
                language="html",
                tag_name=tag_name,
                attributes=attributes,
                parent=parent,
                children=[],
                element_class=element_class,
            )

            # Add to parent's children if parent exists
            if parent is not None:
                parent.children.append(element)

            return element

        except Exception as e:
            log_debug(f"Failed to create MarkupElement: {e}")
            return None

    def _extract_tag_name(self, node: "tree_sitter.Node", source_code: str) -> str:
        """Extract tag name from HTML element node using tree-sitter-html grammar.

        Looks for a ``tag_name`` node among the direct children, or inside a
        ``start_tag`` / ``self_closing_tag`` child.  If none is found, falls
        back to parsing the leading ``<name`` out of the node text.

        Returns:
            The tag name, or ``"unknown"`` when it cannot be determined or
            an exception occurs.
        """
        try:
            for child in getattr(node, "children", None) or ():
                child_type = getattr(child, "type", None)
                if child_type == "tag_name":
                    return self._extract_node_text(child, source_code).strip()
                if child_type in ("start_tag", "self_closing_tag"):
                    # Look for tag_name within start_tag or self_closing_tag
                    for grandchild in child.children:
                        if getattr(grandchild, "type", None) == "tag_name":
                            return self._extract_node_text(
                                grandchild, source_code
                            ).strip()

            # Fallback: try to extract from node text
            node_text = self._extract_node_text(node, source_code)
            if node_text.startswith("<"):
                # Extract tag name from <tagname ...> pattern
                tag_part = node_text.split(">")[0].split()[0]
                return tag_part.lstrip("<").rstrip(">")

            return "unknown"
        except Exception:
            return "unknown"

    def _extract_attributes(
        self, node: "tree_sitter.Node", source_code: str
    ) -> dict[str, str]:
        """Extract attributes from HTML element node using tree-sitter-html grammar.

        Collects ``attribute`` nodes that are direct children of ``node`` or
        children of its ``start_tag`` / ``self_closing_tag`` child.

        Returns:
            Mapping of attribute name to value.  A repeated name keeps the
            last value seen.  On error the attributes gathered so far are
            returned and the error is logged at debug level.
        """
        attributes: dict[str, str] = {}

        try:
            for attr_node in self._iter_attribute_nodes(node):
                attr_name, attr_value = self._parse_attribute(attr_node, source_code)
                if attr_name:
                    attributes[attr_name] = attr_value
        except Exception as e:
            log_debug(f"Failed to extract attributes: {e}")

        return attributes

    @staticmethod
    def _iter_attribute_nodes(node: "tree_sitter.Node"):
        """Yield the ``attribute`` nodes belonging to ``node`` in document order."""
        for child in getattr(node, "children", None) or ():
            child_type = getattr(child, "type", None)
            if child_type == "attribute":
                yield child
            elif child_type in ("start_tag", "self_closing_tag"):
                for grandchild in child.children:
                    if getattr(grandchild, "type", None) == "attribute":
                        yield grandchild

    def _parse_attribute(
        self, attr_node: "tree_sitter.Node", source_code: str
    ) -> tuple[str, str]:
        """Parse individual attribute node using tree-sitter-html grammar.

        Returns:
            ``(name, value)``.  The value is ``""`` for a boolean attribute.
            ``("", "")`` is returned if an exception occurs.
        """
        try:
            attr_name = ""
            attr_value = ""

            for child in getattr(attr_node, "children", None) or ():
                child_type = getattr(child, "type", None)
                if child_type == "attribute_name":
                    attr_name = self._extract_node_text(child, source_code).strip()
                elif child_type == "quoted_attribute_value":
                    attr_value = self._strip_quotes(
                        self._extract_node_text(child, source_code).strip()
                    )
                elif child_type == "attribute_value":
                    attr_value = self._extract_node_text(child, source_code).strip()

            # Fallback to simple parsing
            if not attr_name:
                attr_text = self._extract_node_text(attr_node, source_code)
                if "=" in attr_text:
                    name, value = attr_text.split("=", 1)
                    attr_name = name.strip()
                    attr_value = self._strip_quotes(value.strip())
                else:
                    # Boolean attribute
                    attr_name = attr_text.strip()
                    attr_value = ""

            return attr_name, attr_value
        except Exception:
            return "", ""

    @staticmethod
    def _strip_quotes(value: str) -> str:
        """Remove the quotes delimiting an attribute value.

        Exactly one matching outer pair is removed, so quote characters that
        belong to the value (``"'x'"`` -> ``'x'``) are preserved.  Text
        without a matching pair has stray leading/trailing quotes stripped.
        """
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
            return value[1:-1]
        return value.strip('"').strip("'")

    def _classify_element(self, tag_name: str) -> str:
        """Classify HTML element based on tag name.

        Returns:
            The first category in ``element_categories`` listing the
            lower-cased tag name, or ``"unknown"`` if none does.
        """
        tag_name_lower = tag_name.lower()

        for category, tags in self.element_categories.items():
            if tag_name_lower in tags:
                return category

        return "unknown"

    def _encoded_source(self, source_code: str) -> bytes:
        """Return ``source_code`` encoded as UTF-8, reusing the last encoding.

        The cache is matched by object identity, so repeated lookups for the
        same document cost nothing and a different document simply replaces
        the cached entry.
        """
        # getattr keeps this working for instances created without __init__.
        cached = getattr(self, "_source_cache", None)
        if cached is not None and cached[0] is source_code:
            return cached[1]

        encoded = source_code.encode("utf-8")
        # Single assignment so text and bytes can never be mismatched.
        self._source_cache = (source_code, encoded)
        return encoded

    def _extract_node_text(self, node: "tree_sitter.Node", source_code: str) -> str:
        """Extract text content from a tree-sitter node.

        The node's ``start_byte`` / ``end_byte`` range is sliced from the
        UTF-8 encoding of ``source_code`` (byte offsets differ from string
        indices whenever the text contains non-ASCII characters).

        Returns:
            The decoded text, or ``""`` when the node has no byte offsets or
            an exception occurs (logged at debug level).
        """
        try:
            if hasattr(node, "start_byte") and hasattr(node, "end_byte"):
                source_bytes = self._encoded_source(source_code)
                node_bytes = source_bytes[node.start_byte : node.end_byte]
                return node_bytes.decode("utf-8", errors="replace")
            return ""
        except Exception as e:
            log_debug(f"Failed to extract node text: {e}")
            return ""
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
class HtmlPlugin(LanguagePlugin):
    """Language plugin providing HTML analysis through tree-sitter-html."""

    def get_language_name(self) -> str:
        """Return the language identifier served by this plugin."""
        return "html"

    def get_file_extensions(self) -> list[str]:
        """Return the file extensions treated as HTML."""
        return [".html", ".htm", ".xhtml"]

    def create_extractor(self) -> ElementExtractor:
        """Return a new HTML element extractor."""
        return HtmlElementExtractor()

    def get_supported_element_types(self) -> list[str]:
        """Return the element type names this plugin produces."""
        return ["html_element"]

    def get_queries(self) -> dict[str, str]:
        """Return HTML-specific tree-sitter queries"""
        # Imported on demand rather than at module import time.
        from ..queries.html import HTML_QUERIES

        return HTML_QUERIES

    def execute_query_strategy(
        self, query_key: str | None, language: str
    ) -> str | None:
        """Look up the query registered under ``query_key`` for HTML.

        Returns ``None`` when ``language`` is not ``"html"``, when
        ``query_key`` is empty/``None``, or when no such query exists.
        """
        if language == "html":
            available = self.get_queries()
            if query_key:
                return available.get(query_key)
        return None

    def get_element_categories(self) -> dict[str, list[str]]:
        """Return HTML element categories for query execution"""
        category_names = (
            "structure",
            "heading",
            "text",
            "list",
            "media",
            "form",
            "table",
            "metadata",
        )
        # Every category maps to its own fresh ["element"] list.
        return {category: ["element"] for category in category_names}

    @staticmethod
    def _successful_result(
        file_path: str,
        content: str,
        line_count: int,
        elements: list,
    ) -> "AnalysisResult":
        """Build the ``AnalysisResult`` reported for a successful analysis."""
        return AnalysisResult(
            file_path=file_path,
            language="html",
            line_count=line_count,
            elements=elements,
            node_count=len(elements),
            query_results={},
            source_code=content,
            success=True,
            error_message=None,
        )

    async def analyze_file(
        self, file_path: str, request: "AnalysisRequest"
    ) -> "AnalysisResult":
        """Analyze HTML file using tree-sitter-html parser.

        Args:
            file_path: Path of the file to analyze.
            request: Analysis request; not consulted by this implementation.

        Returns:
            A successful result holding the extracted elements.  If
            importing the parser packages raises ``ImportError``, a
            successful result containing a single document-level ``html``
            element is returned instead.  Any other exception is logged and
            reported as an unsuccessful result carrying the error message.
        """
        from ..encoding_utils import read_file_safe

        try:
            content, _encoding = read_file_safe(file_path)

            try:
                import tree_sitter
                import tree_sitter_html as ts_html

                # Build a parser bound to the HTML grammar and parse the file.
                html_language = tree_sitter.Language(ts_html.language())
                html_parser = tree_sitter.Parser()
                html_parser.language = html_language
                parsed_tree = html_parser.parse(content.encode("utf-8"))

                found = self.create_extractor().extract_html_elements(
                    parsed_tree, content
                )

                log_info(f"Extracted {len(found)} HTML elements from {file_path}")

                return self._successful_result(
                    file_path, content, len(content.splitlines()), found
                )

            except ImportError:
                log_error(
                    "tree-sitter-html not available, falling back to basic parsing"
                )
                total_lines = len(content.splitlines())

                # Keep only a short preview of long documents as raw text.
                if len(content) > 200:
                    preview = content[:200] + "..."
                else:
                    preview = content

                # One element standing in for the whole document.
                document = MarkupElement(
                    name="html",
                    start_line=1,
                    end_line=total_lines,
                    raw_text=preview,
                    language="html",
                    tag_name="html",
                    attributes={},
                    parent=None,
                    children=[],
                    element_class="structure",
                )

                return self._successful_result(
                    file_path, content, total_lines, [document]
                )

        except Exception as e:
            log_error(f"Failed to analyze HTML file {file_path}: {e}")
            return AnalysisResult(
                file_path=file_path,
                language="html",
                line_count=0,
                elements=[],
                node_count=0,
                query_results={},
                source_code="",
                success=False,
                error_message=str(e),
            )
|