tree-sitter-analyzer 1.9.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tree_sitter_analyzer/__init__.py +132 -0
- tree_sitter_analyzer/__main__.py +11 -0
- tree_sitter_analyzer/api.py +853 -0
- tree_sitter_analyzer/cli/__init__.py +39 -0
- tree_sitter_analyzer/cli/__main__.py +12 -0
- tree_sitter_analyzer/cli/argument_validator.py +89 -0
- tree_sitter_analyzer/cli/commands/__init__.py +26 -0
- tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
- tree_sitter_analyzer/cli/commands/base_command.py +181 -0
- tree_sitter_analyzer/cli/commands/default_command.py +18 -0
- tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
- tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
- tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
- tree_sitter_analyzer/cli/commands/query_command.py +109 -0
- tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
- tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
- tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
- tree_sitter_analyzer/cli/commands/table_command.py +414 -0
- tree_sitter_analyzer/cli/info_commands.py +124 -0
- tree_sitter_analyzer/cli_main.py +472 -0
- tree_sitter_analyzer/constants.py +85 -0
- tree_sitter_analyzer/core/__init__.py +15 -0
- tree_sitter_analyzer/core/analysis_engine.py +580 -0
- tree_sitter_analyzer/core/cache_service.py +333 -0
- tree_sitter_analyzer/core/engine.py +585 -0
- tree_sitter_analyzer/core/parser.py +293 -0
- tree_sitter_analyzer/core/query.py +605 -0
- tree_sitter_analyzer/core/query_filter.py +200 -0
- tree_sitter_analyzer/core/query_service.py +340 -0
- tree_sitter_analyzer/encoding_utils.py +530 -0
- tree_sitter_analyzer/exceptions.py +747 -0
- tree_sitter_analyzer/file_handler.py +246 -0
- tree_sitter_analyzer/formatters/__init__.py +1 -0
- tree_sitter_analyzer/formatters/base_formatter.py +201 -0
- tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
- tree_sitter_analyzer/formatters/formatter_config.py +197 -0
- tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
- tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
- tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
- tree_sitter_analyzer/formatters/go_formatter.py +368 -0
- tree_sitter_analyzer/formatters/html_formatter.py +498 -0
- tree_sitter_analyzer/formatters/java_formatter.py +423 -0
- tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
- tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
- tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
- tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
- tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
- tree_sitter_analyzer/formatters/php_formatter.py +301 -0
- tree_sitter_analyzer/formatters/python_formatter.py +830 -0
- tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
- tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
- tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
- tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
- tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
- tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
- tree_sitter_analyzer/interfaces/__init__.py +9 -0
- tree_sitter_analyzer/interfaces/cli.py +535 -0
- tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
- tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
- tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
- tree_sitter_analyzer/language_detector.py +553 -0
- tree_sitter_analyzer/language_loader.py +271 -0
- tree_sitter_analyzer/languages/__init__.py +10 -0
- tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
- tree_sitter_analyzer/languages/css_plugin.py +449 -0
- tree_sitter_analyzer/languages/go_plugin.py +836 -0
- tree_sitter_analyzer/languages/html_plugin.py +496 -0
- tree_sitter_analyzer/languages/java_plugin.py +1299 -0
- tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
- tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
- tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
- tree_sitter_analyzer/languages/php_plugin.py +862 -0
- tree_sitter_analyzer/languages/python_plugin.py +1636 -0
- tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
- tree_sitter_analyzer/languages/rust_plugin.py +673 -0
- tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
- tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
- tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
- tree_sitter_analyzer/legacy_table_formatter.py +860 -0
- tree_sitter_analyzer/mcp/__init__.py +34 -0
- tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
- tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
- tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
- tree_sitter_analyzer/mcp/server.py +869 -0
- tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
- tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
- tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
- tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
- tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
- tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
- tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
- tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
- tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
- tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
- tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
- tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
- tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
- tree_sitter_analyzer/models.py +840 -0
- tree_sitter_analyzer/mypy_current_errors.txt +2 -0
- tree_sitter_analyzer/output_manager.py +255 -0
- tree_sitter_analyzer/platform_compat/__init__.py +3 -0
- tree_sitter_analyzer/platform_compat/adapter.py +324 -0
- tree_sitter_analyzer/platform_compat/compare.py +224 -0
- tree_sitter_analyzer/platform_compat/detector.py +67 -0
- tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
- tree_sitter_analyzer/platform_compat/profiles.py +217 -0
- tree_sitter_analyzer/platform_compat/record.py +55 -0
- tree_sitter_analyzer/platform_compat/recorder.py +155 -0
- tree_sitter_analyzer/platform_compat/report.py +92 -0
- tree_sitter_analyzer/plugins/__init__.py +280 -0
- tree_sitter_analyzer/plugins/base.py +647 -0
- tree_sitter_analyzer/plugins/manager.py +384 -0
- tree_sitter_analyzer/project_detector.py +328 -0
- tree_sitter_analyzer/queries/__init__.py +27 -0
- tree_sitter_analyzer/queries/csharp.py +216 -0
- tree_sitter_analyzer/queries/css.py +615 -0
- tree_sitter_analyzer/queries/go.py +275 -0
- tree_sitter_analyzer/queries/html.py +543 -0
- tree_sitter_analyzer/queries/java.py +402 -0
- tree_sitter_analyzer/queries/javascript.py +724 -0
- tree_sitter_analyzer/queries/kotlin.py +192 -0
- tree_sitter_analyzer/queries/markdown.py +258 -0
- tree_sitter_analyzer/queries/php.py +95 -0
- tree_sitter_analyzer/queries/python.py +859 -0
- tree_sitter_analyzer/queries/ruby.py +92 -0
- tree_sitter_analyzer/queries/rust.py +223 -0
- tree_sitter_analyzer/queries/sql.py +555 -0
- tree_sitter_analyzer/queries/typescript.py +871 -0
- tree_sitter_analyzer/queries/yaml.py +236 -0
- tree_sitter_analyzer/query_loader.py +272 -0
- tree_sitter_analyzer/security/__init__.py +22 -0
- tree_sitter_analyzer/security/boundary_manager.py +277 -0
- tree_sitter_analyzer/security/regex_checker.py +297 -0
- tree_sitter_analyzer/security/validator.py +599 -0
- tree_sitter_analyzer/table_formatter.py +782 -0
- tree_sitter_analyzer/utils/__init__.py +53 -0
- tree_sitter_analyzer/utils/logging.py +433 -0
- tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
|
@@ -0,0 +1,1928 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Markdown Language Plugin
|
|
4
|
+
|
|
5
|
+
Enhanced Markdown-specific parsing and element extraction functionality.
|
|
6
|
+
Provides comprehensive support for Markdown elements including headers,
|
|
7
|
+
links, code blocks, lists, tables, and other structural elements.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
import tree_sitter
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
import tree_sitter
|
|
17
|
+
|
|
18
|
+
TREE_SITTER_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
TREE_SITTER_AVAILABLE = False
|
|
21
|
+
|
|
22
|
+
from ..core.analysis_engine import AnalysisRequest
|
|
23
|
+
from ..encoding_utils import extract_text_slice, safe_encode
|
|
24
|
+
from ..models import AnalysisResult, CodeElement
|
|
25
|
+
from ..models import Class as ModelClass
|
|
26
|
+
from ..models import Function as ModelFunction
|
|
27
|
+
from ..models import Import as ModelImport
|
|
28
|
+
from ..models import Variable as ModelVariable
|
|
29
|
+
from ..plugins.base import ElementExtractor, LanguagePlugin
|
|
30
|
+
from ..utils import log_debug, log_error
|
|
31
|
+
from ..utils.tree_sitter_compat import TreeSitterQueryCompat
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class MarkdownElement(CodeElement):
    """Code element specialised for Markdown constructs.

    Extends the generic ``CodeElement`` with optional per-kind metadata:
    header level, link/image URL and title, fenced-code language, and
    task-list checked state.  A second group of attributes is populated
    lazily by the extractor for consumption by the output formatters.
    """

    def __init__(
        self,
        name: str,
        start_line: int,
        end_line: int,
        raw_text: str,
        language: str = "markdown",
        element_type: str = "markdown",
        level: int | None = None,
        url: str | None = None,
        alt_text: str | None = None,
        title: str | None = None,
        language_info: str | None = None,
        is_checked: bool | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            name=name,
            start_line=start_line,
            end_line=end_line,
            raw_text=raw_text,
            language=language,
            **kwargs,
        )
        # Kind discriminator plus per-kind metadata.
        self.element_type = element_type
        self.level = level                    # headers: 1-6
        self.url = url                        # links and images
        self.alt_text = alt_text              # images
        self.title = title                    # links and images
        self.language_info = language_info    # fenced code blocks
        self.is_checked = is_checked          # task-list items

        # Formatter-facing attributes; filled in by the extractor.
        self.text: str | None = None          # plain text content
        self.type: str | None = None          # element type used by formatters
        self.line_count: int | None = None    # code blocks
        self.alt: str | None = None           # image alternative text
        self.list_type: str | None = None     # ordered / unordered / task
        self.item_count: int | None = None    # lists
        self.row_count: int | None = None     # tables
        self.column_count: int | None = None  # tables
|
|
79
|
+
|
|
80
|
+
class MarkdownElementExtractor(ElementExtractor):
|
|
81
|
+
"""Markdown-specific element extractor with comprehensive feature support"""
|
|
82
|
+
|
|
83
|
+
def __init__(self) -> None:
    """Initialise per-file extraction state and performance caches."""
    # Per-file state, refreshed by every extract_* entry point.
    self.current_file: str = ""
    self.source_code: str = ""
    self.content_lines: list[str] = []

    # Caches keyed by node identity; cleared between passes via
    # _reset_caches() so stale nodes from a previous tree never match.
    self._node_text_cache: dict[int, str] = {}
    self._processed_nodes: set[int] = set()
    self._element_cache: dict[tuple[int, str], Any] = {}
    self._file_encoding: str | None = None
|
|
94
|
+
|
|
95
|
+
def extract_functions(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[ModelFunction]:
    """Extract Markdown elements (headers act as 'functions')."""
    return [
        ModelFunction(
            name=header.name,
            start_line=header.start_line,
            end_line=header.end_line,
            raw_text=header.raw_text,
            language=header.language,
        )
        for header in self.extract_headers(tree, source_code)
    ]
|
|
111
|
+
|
|
112
|
+
def extract_classes(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[ModelClass]:
    """Extract Markdown sections (code blocks act as 'classes')."""
    return [
        ModelClass(
            name=block.name,
            start_line=block.start_line,
            end_line=block.end_line,
            raw_text=block.raw_text,
            language=block.language,
        )
        for block in self.extract_code_blocks(tree, source_code)
    ]
|
|
128
|
+
|
|
129
|
+
def extract_variables(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[ModelVariable]:
    """Extract Markdown links and images (act as 'variables')."""
    # Links first, then images, matching the historical ordering.
    elements = [
        *self.extract_links(tree, source_code),
        *self.extract_images(tree, source_code),
    ]
    return [
        ModelVariable(
            name=element.name,
            start_line=element.start_line,
            end_line=element.end_line,
            raw_text=element.raw_text,
            language=element.language,
        )
        for element in elements
    ]
|
|
148
|
+
|
|
149
|
+
def extract_imports(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[ModelImport]:
    """Extract Markdown references and definitions."""
    return [
        ModelImport(
            name=ref.name,
            start_line=ref.start_line,
            end_line=ref.end_line,
            raw_text=ref.raw_text,
            language=ref.language,
        )
        for ref in self.extract_references(tree, source_code)
    ]
|
|
165
|
+
|
|
166
|
+
def extract_headers(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown headers (H1-H6), both ATX and Setext styles."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    headers: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            # ATX headers (# ## ### ...) and Setext headers (underlined).
            self._extract_atx_headers(root, headers)
            self._extract_setext_headers(root, headers)
        except Exception as e:
            log_debug(f"Error during header extraction: {e}")

    log_debug(f"Extracted {len(headers)} Markdown headers")
    return headers
|
|
187
|
+
|
|
188
|
+
def extract_code_blocks(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown code blocks (fenced and indented)."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    code_blocks: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_fenced_code_blocks(root, code_blocks)
            self._extract_indented_code_blocks(root, code_blocks)
        except Exception as e:
            log_debug(f"Error during code block extraction: {e}")

    log_debug(f"Extracted {len(code_blocks)} Markdown code blocks")
    return code_blocks
|
|
207
|
+
|
|
208
|
+
def extract_links(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown links (inline, reference-style, and autolinks).

    Returns a de-duplicated list: links sharing the same (text, url)
    pair are reported once, keeping the first occurrence.
    """
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    links: list[MarkdownElement] = []

    if tree is not None and tree.root_node is not None:
        # Track extracted links to prevent global duplicates (ensure reset)
        self._extracted_links = set()
        try:
            self._extract_inline_links(tree.root_node, links)
            self._extract_reference_links(tree.root_node, links)
            self._extract_autolinks(tree.root_node, links)
        except Exception as e:
            log_debug(f"Error during link extraction: {e}")
        finally:
            # BUG FIX: the original deleted this attribute inside the
            # try block, so any exception raised by an extractor leaked
            # the stale tracking set into the next extraction pass.
            # finally guarantees cleanup on every exit path.
            if hasattr(self, "_extracted_links"):
                delattr(self, "_extracted_links")

    # De-duplicate: drop elements sharing the same text and url.
    seen = set()
    unique_links = []
    for link in links:
        key = (getattr(link, "text", "") or "", getattr(link, "url", "") or "")
        if key not in seen:
            seen.add(key)
            unique_links.append(link)

    links = unique_links

    log_debug(f"Extracted {len(links)} Markdown links")
    return links
|
|
247
|
+
|
|
248
|
+
def extract_images(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown images (inline, reference, and definitions).

    Images sharing the same alt text and URL are reported once.
    """
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    images: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_inline_images(root, images)
            self._extract_reference_images(root, images)
            self._extract_image_reference_definitions(root, images)
        except Exception as e:
            log_debug(f"Error during image extraction: {e}")

    # De-duplicate: keep the first occurrence of each (alt_text, url) pair.
    seen: set[tuple[str, str]] = set()
    deduped: list[MarkdownElement] = []
    for img in images:
        key = (img.alt_text or "", img.url or "")
        if key not in seen:
            seen.add(key)
            deduped.append(img)

    images = deduped

    log_debug(f"Extracted {len(images)} Markdown images")
    return images
|
|
279
|
+
|
|
280
|
+
def extract_references(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown link-reference definitions."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    references: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_link_reference_definitions(root, references)
        except Exception as e:
            log_debug(f"Error during reference extraction: {e}")

    log_debug(f"Extracted {len(references)} Markdown references")
    return references
|
|
298
|
+
|
|
299
|
+
def extract_blockquotes(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown blockquote elements."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    blockquotes: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_block_quotes(root, blockquotes)
        except Exception as e:
            log_debug(f"Error during blockquote extraction: {e}")

    log_debug(f"Extracted {len(blockquotes)} Markdown blockquotes")
    return blockquotes
|
|
317
|
+
|
|
318
|
+
def extract_horizontal_rules(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown horizontal rules (thematic breaks)."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    horizontal_rules: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_thematic_breaks(root, horizontal_rules)
        except Exception as e:
            log_debug(f"Error during horizontal rule extraction: {e}")

    log_debug(f"Extracted {len(horizontal_rules)} Markdown horizontal rules")
    return horizontal_rules
|
|
336
|
+
|
|
337
|
+
def extract_html_elements(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract embedded HTML (block-level and inline)."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    html_elements: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_html_blocks(root, html_elements)
            self._extract_inline_html(root, html_elements)
        except Exception as e:
            log_debug(f"Error during HTML element extraction: {e}")

    log_debug(f"Extracted {len(html_elements)} HTML elements")
    return html_elements
|
|
356
|
+
|
|
357
|
+
def extract_text_formatting(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract text formatting: bold, italic, strikethrough, inline code."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    formatting_elements: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_emphasis_elements(root, formatting_elements)
            self._extract_inline_code_spans(root, formatting_elements)
            self._extract_strikethrough_elements(root, formatting_elements)
        except Exception as e:
            log_debug(f"Error during text formatting extraction: {e}")

    log_debug(f"Extracted {len(formatting_elements)} text formatting elements")
    return formatting_elements
|
|
379
|
+
|
|
380
|
+
def extract_footnotes(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract footnote elements."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    footnotes: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_footnote_elements(root, footnotes)
        except Exception as e:
            log_debug(f"Error during footnote extraction: {e}")

    log_debug(f"Extracted {len(footnotes)} footnotes")
    return footnotes
|
|
398
|
+
|
|
399
|
+
def extract_lists(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown list items."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    lists: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_list_items(root, lists)
        except Exception as e:
            log_debug(f"Error during list extraction: {e}")

    log_debug(f"Extracted {len(lists)} Markdown list items")
    return lists
|
|
417
|
+
|
|
418
|
+
def extract_tables(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[MarkdownElement]:
    """Extract Markdown pipe tables."""
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    tables: list[MarkdownElement] = []
    root = tree.root_node if tree is not None else None
    if root is not None:
        try:
            self._extract_pipe_tables(root, tables)
        except Exception as e:
            log_debug(f"Error during table extraction: {e}")

    log_debug(f"Extracted {len(tables)} Markdown tables")
    return tables
|
|
436
|
+
|
|
437
|
+
def _reset_caches(self) -> None:
    """Drop every per-file performance cache before a new pass."""
    for cache in (
        self._node_text_cache,
        self._processed_nodes,
        self._element_cache,
    ):
        cache.clear()
|
|
442
|
+
|
|
443
|
+
def _get_node_text_optimized(self, node: "tree_sitter.Node") -> str:
    """Get node text with optimized caching.

    Resolves the source text covered by *node*: first via byte offsets
    into the encoded document, then — if that fails — via a row/column
    slice of ``self.content_lines``.  Results are memoised per node
    identity in ``self._node_text_cache``.  Returns "" when the node's
    coordinates fall outside the known content.
    """
    # Cache key is the node's id(); safe because nodes live for the
    # duration of one extraction pass and _reset_caches() clears this
    # dict between passes.
    node_id = id(node)

    if node_id in self._node_text_cache:
        return self._node_text_cache[node_id]

    try:
        start_byte = node.start_byte
        end_byte = node.end_byte

        encoding = self._file_encoding or "utf-8"
        # NOTE(review): the whole document is re-joined and re-encoded
        # on every cache miss — O(document size) per node.  Consider
        # caching the encoded bytes alongside content_lines.
        content_bytes = safe_encode("\n".join(self.content_lines), encoding)
        text = extract_text_slice(content_bytes, start_byte, end_byte, encoding)

        # Empty results deliberately fall through to the line-based
        # fallback below instead of being cached.
        if text:
            self._node_text_cache[node_id] = text
            return text
    except Exception as e:
        log_error(f"Error in _get_node_text_optimized: {e}")

    # Fallback to simple text extraction
    try:
        # tree-sitter points are 0-based (row, column) pairs.
        start_point = node.start_point
        end_point = node.end_point

        if start_point[0] < 0 or start_point[0] >= len(self.content_lines):
            return ""

        if end_point[0] < 0 or end_point[0] >= len(self.content_lines):
            return ""

        if start_point[0] == end_point[0]:
            # Single-line node: clamp columns into the line's bounds.
            line = self.content_lines[start_point[0]]
            start_col = max(0, min(start_point[1], len(line)))
            end_col = max(start_col, min(end_point[1], len(line)))
            result: str = line[start_col:end_col]
            self._node_text_cache[node_id] = result
            return result
        else:
            # Multi-line node: partial first/last lines, whole middle lines.
            lines = []
            for i in range(
                start_point[0], min(end_point[0] + 1, len(self.content_lines))
            ):
                if i < len(self.content_lines):
                    line = self.content_lines[i]
                    if i == start_point[0] and i == end_point[0]:
                        # Single line case
                        start_col = max(0, min(start_point[1], len(line)))
                        end_col = max(start_col, min(end_point[1], len(line)))
                        lines.append(line[start_col:end_col])
                    elif i == start_point[0]:
                        start_col = max(0, min(start_point[1], len(line)))
                        lines.append(line[start_col:])
                    elif i == end_point[0]:
                        end_col = max(0, min(end_point[1], len(line)))
                        lines.append(line[:end_col])
                    else:
                        lines.append(line)
            result = "\n".join(lines)
            self._node_text_cache[node_id] = result
            return result
    except Exception as fallback_error:
        log_error(f"Fallback text extraction also failed: {fallback_error}")
        return ""
|
|
508
|
+
|
|
509
|
+
def _extract_atx_headers(
    self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]
) -> None:
    """Extract ATX-style headers (# ## ### etc.).

    The header level is the count of leading '#' characters; the rest
    of the line (whitespace-trimmed) becomes the element name.
    """
    for node in self._traverse_nodes(root_node):
        if node.type == "atx_heading":
            try:
                start_line = node.start_point[0] + 1
                end_line = node.end_point[0] + 1
                raw_text = self._get_node_text_optimized(node)

                # Extract header level and content
                level = 1
                content = raw_text.strip()

                # Count # symbols to determine level
                if content.startswith("#"):
                    level = len(content) - len(content.lstrip("#"))
                    # BUG FIX: the original used content.lstrip("# "),
                    # which also removed '#' characters belonging to the
                    # header text itself (e.g. "# #1 item" became
                    # "1 item").  Strip exactly the marker hashes, then
                    # surrounding whitespace.
                    content = content[level:].strip()

                header = MarkdownElement(
                    name=content or f"Header Level {level}",
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=raw_text,
                    element_type="heading",
                    level=level,
                )
                # Add additional attributes for formatter
                header.text = content or f"Header Level {level}"
                header.type = "heading"
                headers.append(header)
            except Exception as e:
                log_debug(f"Failed to extract ATX header: {e}")
|
|
543
|
+
|
|
544
|
+
def _extract_setext_headers(
    self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]
) -> None:
    """Extract Setext-style headers (text underlined with '=' or '-')."""
    for node in self._traverse_nodes(root_node):
        if node.type != "setext_heading":
            continue
        try:
            raw_text = self._get_node_text_optimized(node)
            lines = raw_text.strip().split("\n")

            # '=' underline means H1, '-' means H2; default to H2.
            level = 2
            if len(lines) >= 2:
                underline = lines[1].strip()
                if underline.startswith("="):
                    level = 1
                elif underline.startswith("-"):
                    level = 2
                content = lines[0].strip()
            else:
                content = raw_text.strip()

            display = content or f"Header Level {level}"
            header = MarkdownElement(
                name=display,
                start_line=node.start_point[0] + 1,
                end_line=node.end_point[0] + 1,
                raw_text=raw_text,
                element_type="heading",
                level=level,
            )
            # Mirror the name into the formatter-facing attributes.
            header.text = display
            header.type = "heading"
            headers.append(header)
        except Exception as e:
            log_debug(f"Failed to extract Setext header: {e}")
|
|
582
|
+
|
|
583
|
+
def _extract_fenced_code_blocks(
    self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]
) -> None:
    """Extract fenced code blocks.

    Appends one MarkdownElement per ``fenced_code_block`` node to
    *code_blocks*, recording the info-string language and the number of
    content lines between the fences.

    Fix: tree-sitter's ``fenced_code_block`` node matches both backtick
    (```` ``` ````) and tilde (``~~~``) fences, but the previous version
    only recognized backticks — tilde-fenced blocks got no language and a
    line_count of 0.
    """
    # Both CommonMark fence styles produce a "fenced_code_block" node.
    fence_markers = ("```", "~~~")
    for node in self._traverse_nodes(root_node):
        if node.type == "fenced_code_block":
            try:
                start_line = node.start_point[0] + 1
                end_line = node.end_point[0] + 1
                raw_text = self._get_node_text_optimized(node)

                # Extract language from the opening fence's info string.
                language_info = None
                lines = raw_text.strip().split("\n")
                if lines and lines[0].startswith(fence_markers):
                    # Fences may be longer than 3 chars; strip all fence
                    # characters rather than a fixed prefix of 3.
                    language_info = lines[0].lstrip("`~").strip()

                # Extract content (excluding fence markers).
                content_lines = []
                in_content = False
                for line in lines:
                    if line.startswith(fence_markers):
                        if not in_content:
                            in_content = True  # opening fence
                            continue
                        else:
                            break  # closing fence
                    if in_content:
                        content_lines.append(line)

                name = f"Code Block ({language_info or 'unknown'})"

                code_block = MarkdownElement(
                    name=name,
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=raw_text,
                    element_type="code_block",
                    language_info=language_info,
                )
                # Add additional attributes for formatter
                code_block.language = language_info or "text"
                code_block.line_count = len(content_lines)
                code_block.type = "code_block"
                code_blocks.append(code_block)
            except Exception as e:
                log_debug(f"Failed to extract fenced code block: {e}")
|
|
630
|
+
|
|
631
|
+
def _extract_indented_code_blocks(
    self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]
) -> None:
    """Collect four-space-indented code blocks into *code_blocks*.

    Each ``indented_code_block`` node becomes one MarkdownElement with
    language fixed to "text" and line_count derived from the node span.
    """
    candidates = (
        n
        for n in self._traverse_nodes(root_node)
        if n.type == "indented_code_block"
    )
    for node in candidates:
        try:
            first = node.start_point[0] + 1
            last = node.end_point[0] + 1
            text = self._get_node_text_optimized(node)

            element = MarkdownElement(
                name="Indented Code Block",
                start_line=first,
                end_line=last,
                raw_text=text,
                element_type="code_block",
                language_info="indented",
            )
            # Extra attributes consumed by the table formatter.
            element.language = "text"
            # line_count is the node's line span, not trimmed content.
            element.line_count = last - first + 1
            element.type = "code_block"
            code_blocks.append(element)
        except Exception as e:
            log_debug(f"Failed to extract indented code block: {e}")
|
|
657
|
+
|
|
658
|
+
def _extract_inline_links(
    self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
) -> None:
    """Extract inline links.

    Scans the text of ``inline`` nodes for ``[text](url "title")`` patterns
    (images excluded via a negative lookbehind on ``!``) and appends one
    MarkdownElement per unique (text, url) pair to *links*.
    """
    import re

    # Extract links from text within inline nodes using regular expressions
    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Inline link pattern: [text](url "title") (excluding images)
                inline_pattern = r'(?<!\!)\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
                matches = re.finditer(inline_pattern, raw_text)

                for match in matches:
                    text = match.group(1) or ""
                    url = match.group(2) or ""
                    title = match.group(3) or ""

                    # Global duplicate check: process same text and URL combination only once
                    # NOTE(review): _extracted_links is only used when present
                    # on self — presumably set up by the caller; confirm.
                    link_signature = f"{text}|{url}"
                    if (
                        hasattr(self, "_extracted_links")
                        and link_signature in self._extracted_links
                    ):
                        continue

                    if hasattr(self, "_extracted_links"):
                        self._extracted_links.add(link_signature)

                    # Line numbers come from the enclosing inline node, not
                    # the match offset within it.
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    link = MarkdownElement(
                        name=text or "Link",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="link",
                        url=url,
                        title=title,
                    )
                    # Add additional attributes for formatter
                    link.text = text or "Link"
                    link.type = "link"
                    links.append(link)

            except Exception as e:
                log_debug(f"Failed to extract inline link: {e}")
|
|
711
|
+
|
|
712
|
+
def _extract_reference_links(
    self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
) -> None:
    """Extract reference links.

    Scans ``inline`` nodes for ``[text][ref]`` patterns, skips image
    references (preceded by ``!``), de-duplicates by (text, ref, line),
    and appends one MarkdownElement per match to *links*.
    """
    import re

    # Reference links also need to be extracted from inline nodes
    # Track already processed reference links to avoid duplicates
    processed_ref_links = set()

    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Reference link pattern: [text][ref]
                ref_pattern = r"\[([^\]]*)\]\[([^\]]*)\]"
                matches = re.finditer(ref_pattern, raw_text)

                for match in matches:
                    text = match.group(1) or ""
                    ref = match.group(2) or ""

                    # Skip image references (starting with !)
                    if match.start() > 0 and raw_text[match.start() - 1] == "!":
                        continue

                    # Duplicate check: process same text and reference combination only once
                    start_line = node.start_point[0] + 1
                    ref_link_key = (text, ref, start_line)

                    if ref_link_key in processed_ref_links:
                        continue
                    processed_ref_links.add(ref_link_key)

                    end_line = node.end_point[0] + 1

                    link = MarkdownElement(
                        name=text or "Reference Link",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="reference_link",
                    )
                    # Add additional attributes for formatter
                    link.text = text or "Reference Link"
                    link.type = "reference_link"
                    links.append(link)

            except Exception as e:
                log_debug(f"Failed to extract reference link: {e}")
|
|
765
|
+
|
|
766
|
+
def _extract_autolinks(
    self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
) -> None:
    """Extract autolinks.

    Scans ``inline`` nodes for ``<url>`` / ``<mailto:...>`` / ``<email>``
    patterns and appends one MarkdownElement per unique URL to *links*.
    """
    import re

    # Extract autolinks from text within inline nodes using regular expressions
    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Autolink pattern: <url> or <email>
                autolink_pattern = (
                    r"<(https?://[^>]+|mailto:[^>]+|[^@\s]+@[^@\s]+\.[^@\s]+)>"
                )
                matches = re.finditer(autolink_pattern, raw_text)

                for match in matches:
                    url = match.group(1) or ""
                    full_match = match.group(0)

                    # Global duplicate check: process same URL for autolinks only once
                    # NOTE(review): shares the optional _extracted_links set
                    # with inline-link extraction — confirm caller sets it up.
                    autolink_signature = f"autolink|{url}"
                    if (
                        hasattr(self, "_extracted_links")
                        and autolink_signature in self._extracted_links
                    ):
                        continue

                    if hasattr(self, "_extracted_links"):
                        self._extracted_links.add(autolink_signature)

                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    link = MarkdownElement(
                        name=url or "Autolink",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=full_match,
                        element_type="autolink",
                        url=url,
                    )
                    # Add additional attributes for formatter
                    link.text = url or "Autolink"
                    link.type = "autolink"
                    links.append(link)

            except Exception as e:
                log_debug(f"Failed to extract autolink: {e}")
|
|
819
|
+
|
|
820
|
+
def _extract_inline_images(
    self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
) -> None:
    """Extract inline images.

    Scans ``inline`` nodes for ``![alt](url "title")`` patterns and
    appends one MarkdownElement per match to *images*. Unlike links,
    no cross-node de-duplication is performed here.
    """
    import re

    # Extract images from text within inline nodes using regular expressions
    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Inline image pattern: ![alt](url "title")
                image_pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
                matches = re.finditer(image_pattern, raw_text)

                for match in matches:
                    alt_text = match.group(1) or ""
                    url = match.group(2) or ""
                    title = match.group(3) or ""

                    # Calculate line number from matched position
                    # (actually taken from the enclosing inline node's span).
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    image = MarkdownElement(
                        name=alt_text or "Image",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="image",
                        url=url,
                        alt_text=alt_text,
                        title=title,
                    )
                    # Add additional attributes for formatter
                    image.alt = alt_text or ""
                    image.type = "image"
                    images.append(image)

            except Exception as e:
                log_debug(f"Failed to extract inline image: {e}")
|
|
864
|
+
|
|
865
|
+
def _extract_reference_images(
    self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
) -> None:
    """Extract reference images.

    Scans ``inline`` nodes for ``![alt][ref]`` patterns and appends one
    MarkdownElement per match to *images*. The reference label itself is
    matched but not stored on the element.
    """
    import re

    # Reference images also need to be extracted from inline nodes
    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Reference image pattern: ![alt][ref]
                ref_image_pattern = r"!\[([^\]]*)\]\[([^\]]*)\]"
                matches = re.finditer(ref_image_pattern, raw_text)

                for match in matches:
                    alt_text = match.group(1) or ""
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    image = MarkdownElement(
                        name=alt_text or "Reference Image",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="reference_image",
                    )
                    # Add additional attributes for formatter
                    image.alt = alt_text or ""
                    image.type = "reference_image"
                    images.append(image)

            except Exception as e:
                log_debug(f"Failed to extract reference image: {e}")
|
|
902
|
+
|
|
903
|
+
def _extract_image_reference_definitions(
    self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
) -> None:
    """Extract image reference definitions.

    Two passes: first collect every reference label used by an image
    (``![alt][ref]``), then emit a MarkdownElement for each
    ``link_reference_definition`` node whose label is used by an image
    OR whose URL ends in a known image extension.
    """
    import re

    # Extract all reference definitions that could be used for images
    # We check if the URL points to an image file or if it's used by an image reference
    # First, collect all image references used in the document
    image_refs_used = set()
    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Find image references: ![alt][ref]
                ref_image_pattern = r"!\[([^\]]*)\]\[([^\]]*)\]"
                matches = re.finditer(ref_image_pattern, raw_text)

                for match in matches:
                    ref = match.group(2) or ""
                    if ref:
                        # Labels are matched case-insensitively, per CommonMark.
                        image_refs_used.add(ref.lower())

            except Exception as e:
                log_debug(f"Failed to scan for image references: {e}")

    # Now extract reference definitions that are used by images OR point to image files
    for node in self._traverse_nodes(root_node):
        if node.type == "link_reference_definition":
            try:
                start_line = node.start_point[0] + 1
                end_line = node.end_point[0] + 1
                raw_text = self._get_node_text_optimized(node)

                # Pattern: [label]: url "title"
                ref_pattern = r'^\[([^\]]+)\]:\s*([^\s]+)(?:\s+"([^"]*)")?'
                ref_match: re.Match[str] | None = re.match(
                    ref_pattern, raw_text.strip()
                )

                if ref_match:
                    label = ref_match.group(1) or ""
                    url = ref_match.group(2) or ""
                    title = ref_match.group(3) or ""

                    # Include if this reference is used by an image OR if URL looks like an image
                    is_used_by_image = label.lower() in image_refs_used
                    is_image_url = any(
                        url.lower().endswith(ext)
                        for ext in [
                            ".png",
                            ".jpg",
                            ".jpeg",
                            ".gif",
                            ".svg",
                            ".webp",
                            ".bmp",
                        ]
                    )

                    if is_used_by_image or is_image_url:
                        image_ref = MarkdownElement(
                            name=f"Image Reference Definition: {label}",
                            start_line=start_line,
                            end_line=end_line,
                            raw_text=raw_text,
                            element_type="image_reference_definition",
                            url=url,
                            alt_text=label,
                            title=title,
                        )
                        # Add additional attributes for formatter
                        image_ref.alt = label
                        image_ref.type = "image_reference_definition"
                        images.append(image_ref)

            except Exception as e:
                log_debug(f"Failed to extract image reference definition: {e}")
|
|
984
|
+
|
|
985
|
+
def _extract_link_reference_definitions(
    self, root_node: "tree_sitter.Node", references: list[MarkdownElement]
) -> None:
    """Collect ``[label]: url`` reference definitions into *references*.

    Each ``link_reference_definition`` node becomes one MarkdownElement
    whose name is the raw definition text.
    """
    definitions = (
        n
        for n in self._traverse_nodes(root_node)
        if n.type == "link_reference_definition"
    )
    for node in definitions:
        try:
            text = self._get_node_text_optimized(node)
            references.append(
                MarkdownElement(
                    name=text or "Reference Definition",
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    raw_text=text,
                    element_type="reference_definition",
                )
            )
        except Exception as e:
            log_debug(f"Failed to extract reference definition: {e}")
|
|
1006
|
+
|
|
1007
|
+
def _extract_list_items(
    self, root_node: "tree_sitter.Node", lists: list[MarkdownElement]
) -> None:
    """Extract lists (not individual items).

    Emits one MarkdownElement per ``list`` node, classified as task,
    ordered, or unordered by inspecting its direct ``list_item`` children.
    """
    for node in self._traverse_nodes(root_node):
        if node.type == "list":
            try:
                start_line = node.start_point[0] + 1
                end_line = node.end_point[0] + 1
                raw_text = self._get_node_text_optimized(node)

                # Count list items in this list
                item_count = 0
                is_task_list = False
                is_ordered = False

                for child in node.children:
                    if child.type == "list_item":
                        item_count += 1
                        item_text = self._get_node_text_optimized(child)

                        # Check if it's a task list item
                        if (
                            "[ ]" in item_text
                            or "[x]" in item_text
                            or "[X]" in item_text
                        ):
                            is_task_list = True

                        # Check if it's an ordered list (starts with number)
                        # NOTE(review): heuristic — any single item starting
                        # with a digit marks the whole list as ordered.
                        if item_text.strip() and item_text.strip()[0].isdigit():
                            is_ordered = True

                # Determine list type; task classification wins over ordered.
                if is_task_list:
                    list_type = "task"
                    element_type = "task_list"
                elif is_ordered:
                    list_type = "ordered"
                    element_type = "list"
                else:
                    list_type = "unordered"
                    element_type = "list"

                name = f"{list_type.title()} List ({item_count} items)"

                list_element = MarkdownElement(
                    name=name,
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=raw_text,
                    element_type=element_type,
                )
                # Add additional attributes for formatter
                list_element.list_type = list_type
                list_element.item_count = item_count
                list_element.type = list_type
                lists.append(list_element)
            except Exception as e:
                log_debug(f"Failed to extract list: {e}")
|
|
1067
|
+
|
|
1068
|
+
def _extract_pipe_tables(
    self, root_node: "tree_sitter.Node", tables: list[MarkdownElement]
) -> None:
    """Extract pipe tables.

    Emits one MarkdownElement per ``pipe_table`` node with row and column
    counts (the header row is included in row_count; the delimiter row is
    excluded).

    Fix: the previous delimiter-row check (``startswith("|---")``) missed
    common forms such as ``| --- |``, ``|:---:|`` and ``--- | ---``, which
    were then counted as data rows and inflated row_count.
    """

    def _is_delimiter_row(line: str) -> bool:
        # A GFM delimiter row consists solely of '|', '-', ':' and
        # whitespace, and contains at least one dash.
        stripped = line.strip()
        return (
            bool(stripped)
            and "-" in stripped
            and set(stripped) <= set("|-: \t")
        )

    for node in self._traverse_nodes(root_node):
        if node.type == "pipe_table":
            try:
                start_line = node.start_point[0] + 1
                end_line = node.end_point[0] + 1
                raw_text = self._get_node_text_optimized(node)

                # Count rows, excluding blank lines and the delimiter row.
                lines = raw_text.strip().split("\n")
                row_count = len(
                    [
                        line
                        for line in lines
                        if line.strip() and not _is_delimiter_row(line)
                    ]
                )

                # Count columns from first row
                column_count = 0
                if lines:
                    first_row = lines[0]
                    column_count = len(
                        [col for col in first_row.split("|") if col.strip()]
                    )

                table = MarkdownElement(
                    name=f"Table ({row_count} rows, {column_count} columns)",
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=raw_text,
                    element_type="table",
                )
                # Add additional attributes for formatter
                table.row_count = row_count
                table.column_count = column_count
                table.type = "table"
                tables.append(table)
            except Exception as e:
                log_debug(f"Failed to extract pipe table: {e}")
|
|
1111
|
+
|
|
1112
|
+
def _extract_block_quotes(
    self, root_node: "tree_sitter.Node", blockquotes: list[MarkdownElement]
) -> None:
    """Extract blockquotes.

    Emits one MarkdownElement per ``block_quote`` node; the element name
    embeds the quote content (truncated to 50 chars) with ``>`` markers
    stripped.
    """
    import re

    # Blockquotes are often represented as paragraphs starting with >
    for node in self._traverse_nodes(root_node):
        if node.type == "block_quote":
            try:
                start_line = node.start_point[0] + 1
                end_line = node.end_point[0] + 1
                raw_text = self._get_node_text_optimized(node)

                # Extract content without > markers
                lines = raw_text.strip().split("\n")
                content_lines = []
                for line in lines:
                    # Remove > marker and optional space
                    cleaned = re.sub(r"^>\s?", "", line)
                    content_lines.append(cleaned)
                content = "\n".join(content_lines).strip()

                blockquote = MarkdownElement(
                    name=(
                        f"Blockquote: {content[:50]}..."
                        if len(content) > 50
                        else f"Blockquote: {content}"
                    ),
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=raw_text,
                    element_type="blockquote",
                )
                # Extra attributes consumed by the formatter.
                blockquote.type = "blockquote"
                blockquote.text = content
                blockquotes.append(blockquote)
            except Exception as e:
                log_debug(f"Failed to extract blockquote: {e}")
|
|
1151
|
+
|
|
1152
|
+
def _extract_thematic_breaks(
    self, root_node: "tree_sitter.Node", horizontal_rules: list[MarkdownElement]
) -> None:
    """Collect thematic breaks (``---`` / ``***`` rules) into *horizontal_rules*."""
    breaks = (
        n for n in self._traverse_nodes(root_node) if n.type == "thematic_break"
    )
    for node in breaks:
        try:
            element = MarkdownElement(
                name="Horizontal Rule",
                start_line=node.start_point[0] + 1,
                end_line=node.end_point[0] + 1,
                raw_text=self._get_node_text_optimized(node),
                element_type="horizontal_rule",
            )
            # Extra attribute consumed by the formatter.
            element.type = "horizontal_rule"
            horizontal_rules.append(element)
        except Exception as e:
            log_debug(f"Failed to extract horizontal rule: {e}")
|
|
1174
|
+
|
|
1175
|
+
def _extract_html_blocks(
    self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]
) -> None:
    """Extract HTML block elements.

    Emits one MarkdownElement per ``html_block`` node, named after the
    first opening tag found in the block (or "HTML" if none is found).
    """
    for node in self._traverse_nodes(root_node):
        if node.type == "html_block":
            try:
                start_line = node.start_point[0] + 1
                end_line = node.end_point[0] + 1
                raw_text = self._get_node_text_optimized(node)

                # Extract tag name if possible
                import re

                tag_match = re.search(r"<(\w+)", raw_text)
                tag_name = tag_match.group(1) if tag_match else "HTML"

                html_element = MarkdownElement(
                    name=f"HTML Block: {tag_name}",
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=raw_text,
                    element_type="html_block",
                )
                # Extra attribute consumed by the formatter.
                html_element.type = "html_block"
                html_elements.append(html_element)
            except Exception as e:
                log_debug(f"Failed to extract HTML block: {e}")
|
|
1203
|
+
|
|
1204
|
+
def _extract_inline_html(
    self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]
) -> None:
    """Extract inline HTML elements.

    Scans ``inline`` nodes for HTML-looking tags, excluding autolink
    forms (``<url>`` / ``<email>``), and emits one MarkdownElement per
    match named after the tag.
    """
    import re

    # Look for HTML tags in inline content
    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Pattern for HTML tags (excluding autolinks)
                # Exclude autolink patterns: <url> or <email>
                html_pattern = r"<(?!(?:https?://|mailto:|[^@\s]+@[^@\s]+\.[^@\s]+)[^>]*>)[^>]+>"
                matches = re.finditer(html_pattern, raw_text)

                for match in matches:
                    tag_text = match.group(0)

                    # Extract tag name
                    tag_match = re.search(r"<(\w+)", tag_text)
                    tag_name = tag_match.group(1) if tag_match else "HTML"

                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    html_element = MarkdownElement(
                        name=f"HTML Tag: {tag_name}",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=tag_text,
                        element_type="html_inline",
                    )
                    html_element.type = "html_inline"
                    # NOTE(review): this overwrites the "HTML Tag: ..." name
                    # set above with the bare tag name — intentional for the
                    # formatter, per the original comment.
                    html_element.name = tag_name  # Set name attribute for formatter
                    html_elements.append(html_element)

            except Exception as e:
                log_debug(f"Failed to extract inline HTML: {e}")
|
|
1246
|
+
|
|
1247
|
+
def _extract_emphasis_elements(
    self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
) -> None:
    """Extract emphasis and strong emphasis elements.

    Scans ``inline`` nodes with regexes: ``**text**`` / ``__text__``
    become strong_emphasis elements, ``*text*`` / ``_text_`` (not part of
    a double marker) become emphasis elements.
    """
    import re

    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Pattern for bold text: **text** or __text__
                bold_pattern = r"\*\*([^*]+)\*\*|__([^_]+)__"
                bold_matches = re.finditer(bold_pattern, raw_text)

                for match in bold_matches:
                    # Exactly one of the two alternation groups matched.
                    content = match.group(1) or match.group(2) or ""
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    bold_element = MarkdownElement(
                        name=f"Bold: {content}",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="strong_emphasis",
                    )
                    bold_element.type = "strong_emphasis"
                    bold_element.text = content
                    formatting_elements.append(bold_element)

                # Pattern for italic text: *text* or _text_ (but not **text** or __text__)
                italic_pattern = r"(?<!\*)\*([^*]+)\*(?!\*)|(?<!_)_([^_]+)_(?!_)"
                italic_matches = re.finditer(italic_pattern, raw_text)

                for match in italic_matches:
                    content = match.group(1) or match.group(2) or ""
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    italic_element = MarkdownElement(
                        name=f"Italic: {content}",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="emphasis",
                    )
                    italic_element.type = "emphasis"
                    italic_element.text = content
                    formatting_elements.append(italic_element)

            except Exception as e:
                log_debug(f"Failed to extract emphasis elements: {e}")
|
|
1302
|
+
|
|
1303
|
+
def _extract_inline_code_spans(
    self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
) -> None:
    """Extract inline code spans.

    Scans ``inline`` nodes for single-backtick ``` `code` ``` spans and
    appends one MarkdownElement per match to *formatting_elements*.
    """
    import re

    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Pattern for inline code: `code`
                code_pattern = r"`([^`]+)`"
                matches = re.finditer(code_pattern, raw_text)

                for match in matches:
                    content = match.group(1) or ""
                    # Line numbers come from the enclosing inline node.
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    code_element = MarkdownElement(
                        name=f"Inline Code: {content}",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="inline_code",
                    )
                    code_element.type = "inline_code"
                    code_element.text = content
                    formatting_elements.append(code_element)

            except Exception as e:
                log_debug(f"Failed to extract inline code: {e}")
|
|
1338
|
+
|
|
1339
|
+
def _extract_strikethrough_elements(
    self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
) -> None:
    """Extract strikethrough elements.

    Scans ``inline`` nodes for GFM ``~~text~~`` spans and appends one
    MarkdownElement per match to *formatting_elements*.
    """
    import re

    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Pattern for strikethrough: ~~text~~
                strike_pattern = r"~~([^~]+)~~"
                matches = re.finditer(strike_pattern, raw_text)

                for match in matches:
                    content = match.group(1) or ""
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    strike_element = MarkdownElement(
                        name=f"Strikethrough: {content}",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="strikethrough",
                    )
                    strike_element.type = "strikethrough"
                    strike_element.text = content
                    formatting_elements.append(strike_element)

            except Exception as e:
                log_debug(f"Failed to extract strikethrough: {e}")
|
|
1374
|
+
|
|
1375
|
+
def _extract_footnote_elements(
    self, root_node: "tree_sitter.Node", footnotes: list[MarkdownElement]
) -> None:
    """Extract footnote elements.

    References (``[^id]``) are pulled from ``inline`` nodes; definitions
    (``[^id]: content``) are pulled from ``paragraph`` nodes.
    """
    import re

    for node in self._traverse_nodes(root_node):
        if node.type == "inline":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Pattern for footnote references: [^1]
                footnote_ref_pattern = r"\[\^([^\]]+)\]"
                matches = re.finditer(footnote_ref_pattern, raw_text)

                for match in matches:
                    ref_id = match.group(1) or ""
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    footnote_element = MarkdownElement(
                        name=f"Footnote Reference: {ref_id}",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=match.group(0),
                        element_type="footnote_reference",
                    )
                    footnote_element.type = "footnote_reference"
                    footnote_element.text = ref_id
                    footnotes.append(footnote_element)

            except Exception as e:
                log_debug(f"Failed to extract footnote reference: {e}")

        # Look for footnote definitions
        elif node.type == "paragraph":
            try:
                raw_text = self._get_node_text_optimized(node)
                if not raw_text:
                    continue

                # Pattern for footnote definitions: [^1]: content
                # NOTE(review): re.match anchors at the start, so only the
                # first definition in a paragraph is found even with
                # re.MULTILINE — confirm whether multiple per paragraph
                # should be supported.
                footnote_def_pattern = r"^\[\^([^\]]+)\]:\s*(.+)$"
                footnote_match: re.Match[str] | None = re.match(
                    footnote_def_pattern, raw_text.strip(), re.MULTILINE
                )

                if footnote_match:
                    ref_id = footnote_match.group(1) or ""
                    content = footnote_match.group(2) or ""
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    footnote_element = MarkdownElement(
                        name=f"Footnote Definition: {ref_id}",
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=raw_text,
                        element_type="footnote_definition",
                    )
                    footnote_element.type = "footnote_definition"
                    footnote_element.text = content
                    footnotes.append(footnote_element)

            except Exception as e:
                log_debug(f"Failed to extract footnote definition: {e}")
|
|
1443
|
+
|
|
1444
|
+
def _traverse_nodes(self, node: "tree_sitter.Node") -> Any:
|
|
1445
|
+
"""Traverse all nodes in the tree"""
|
|
1446
|
+
yield node
|
|
1447
|
+
for child in node.children:
|
|
1448
|
+
yield from self._traverse_nodes(child)
|
|
1449
|
+
|
|
1450
|
+
def _parse_link_components(self, raw_text: str) -> tuple[str, str, str]:
|
|
1451
|
+
"""Parse link components from raw text"""
|
|
1452
|
+
import re
|
|
1453
|
+
|
|
1454
|
+
# Pattern for [text](url "title")
|
|
1455
|
+
pattern = r'\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
|
|
1456
|
+
match = re.search(pattern, raw_text)
|
|
1457
|
+
|
|
1458
|
+
if match:
|
|
1459
|
+
text = match.group(1) or ""
|
|
1460
|
+
url = match.group(2) or ""
|
|
1461
|
+
title = match.group(3) or ""
|
|
1462
|
+
return text, url, title
|
|
1463
|
+
|
|
1464
|
+
return "", "", ""
|
|
1465
|
+
|
|
1466
|
+
def _parse_image_components(self, raw_text: str) -> tuple[str, str, str]:
|
|
1467
|
+
"""Parse image components from raw text"""
|
|
1468
|
+
import re
|
|
1469
|
+
|
|
1470
|
+
# Pattern for 
|
|
1471
|
+
pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
|
|
1472
|
+
match = re.search(pattern, raw_text)
|
|
1473
|
+
|
|
1474
|
+
if match:
|
|
1475
|
+
alt_text = match.group(1) or ""
|
|
1476
|
+
url = match.group(2) or ""
|
|
1477
|
+
title = match.group(3) or ""
|
|
1478
|
+
return alt_text, url, title
|
|
1479
|
+
|
|
1480
|
+
return "", "", ""
|
|
1481
|
+
|
|
1482
|
+
|
|
1483
|
+
class MarkdownPlugin(LanguagePlugin):
|
|
1484
|
+
"""Markdown language plugin for the tree-sitter analyzer"""
|
|
1485
|
+
|
|
1486
|
+
def __init__(self) -> None:
    """Initialize plugin state and the shared Markdown extractor."""
    super().__init__()
    # The Tree-sitter language object is loaded lazily on first use.
    self._language_cache: tree_sitter.Language | None = None
    shared_extractor = MarkdownElementExtractor()
    self._extractor: MarkdownElementExtractor = shared_extractor
    # Public aliases kept for backwards compatibility with older tests.
    self.language = "markdown"
    self.extractor = shared_extractor
def get_language_name(self) -> str:
    """Identify the language this plugin supports."""
    return "markdown"
def get_file_extensions(self) -> list[str]:
    """List the Markdown file extensions recognised by this plugin."""
    return [".md", ".markdown", ".mdown", ".mkd", ".mkdn", ".mdx"]
def create_extractor(self) -> ElementExtractor:
    """Build a fresh, independent Markdown element extractor."""
    return MarkdownElementExtractor()
def get_extractor(self) -> ElementExtractor:
|
|
1509
|
+
"""Get the cached extractor instance, creating it if necessary"""
|
|
1510
|
+
return self._extractor
|
|
1511
|
+
|
|
1512
|
+
def get_language(self) -> str:
    """Return the language identifier (retained for legacy callers)."""
    return "markdown"
def extract_functions(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[CodeElement]:
    """Re-wrap extractor 'function' elements as plain CodeElement objects.

    Legacy compatibility shim: only name, line span, raw text and
    language are carried over.
    """
    converted: list[CodeElement] = []
    for element in self.get_extractor().extract_functions(tree, source_code):
        converted.append(
            CodeElement(
                name=element.name,
                start_line=element.start_line,
                end_line=element.end_line,
                raw_text=element.raw_text,
                language=element.language,
            )
        )
    return converted
def extract_classes(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[CodeElement]:
    """Re-wrap extractor 'class' elements as plain CodeElement objects.

    Legacy compatibility shim: only name, line span, raw text and
    language are carried over.
    """
    wrapped: list[CodeElement] = []
    for item in self.get_extractor().extract_classes(tree, source_code):
        wrapped.append(
            CodeElement(
                name=item.name,
                start_line=item.start_line,
                end_line=item.end_line,
                raw_text=item.raw_text,
                language=item.language,
            )
        )
    return wrapped
def extract_variables(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[CodeElement]:
    """Re-wrap extractor 'variable' elements as plain CodeElement objects.

    Legacy compatibility shim: only name, line span, raw text and
    language are carried over.
    """
    results: list[CodeElement] = []
    for var in self.get_extractor().extract_variables(tree, source_code):
        results.append(
            CodeElement(
                name=var.name,
                start_line=var.start_line,
                end_line=var.end_line,
                raw_text=var.raw_text,
                language=var.language,
            )
        )
    return results
def extract_imports(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[CodeElement]:
    """Re-wrap extractor 'import' elements as plain CodeElement objects.

    Legacy compatibility shim: only name, line span, raw text and
    language are carried over.
    """
    copies: list[CodeElement] = []
    for imp in self.get_extractor().extract_imports(tree, source_code):
        copies.append(
            CodeElement(
                name=imp.name,
                start_line=imp.start_line,
                end_line=imp.end_line,
                raw_text=imp.raw_text,
                language=imp.language,
            )
        )
    return copies
def get_tree_sitter_language(self) -> Optional["tree_sitter.Language"]:
    """Return the Tree-sitter Markdown language, caching it on first load.

    Returns None when the tree-sitter-markdown bindings are missing or
    fail to initialise.
    """
    if self._language_cache is not None:
        return self._language_cache
    try:
        import tree_sitter
        import tree_sitter_markdown as tsmarkdown

        # Modern tree-sitter-markdown exposes the grammar via language().
        self._language_cache = tree_sitter.Language(tsmarkdown.language())
    except ImportError:
        log_error("tree-sitter-markdown not available")
        return None
    except Exception as e:
        log_error(f"Failed to load Markdown language: {e}")
        return None
    return self._language_cache
def get_supported_queries(self) -> list[str]:
    """Name every query this plugin can execute."""
    return [
        "headers", "code_blocks", "links", "images",
        "lists", "tables", "blockquotes", "emphasis",
        "inline_code", "references", "task_lists", "horizontal_rules",
        "html_blocks", "strikethrough", "footnotes", "text_content",
        "all_elements",
    ]
def is_applicable(self, file_path: str) -> bool:
    """True when *file_path* ends with a supported Markdown extension (case-insensitive)."""
    lowered = file_path.lower()
    return any(
        lowered.endswith(ext.lower()) for ext in self.get_file_extensions()
    )
def get_plugin_info(self) -> dict:
    """Describe this plugin: identity, extensions, queries and features."""
    feature_list = [
        "ATX headers (# ## ###)",
        "Setext headers (underlined)",
        "Fenced code blocks",
        "Indented code blocks",
        "Inline code spans",
        "Inline links",
        "Reference links",
        "Autolinks",
        "Email autolinks",
        "Images (inline and reference)",
        "Lists (ordered and unordered)",
        "Task lists (checkboxes)",
        "Blockquotes",
        "Tables",
        "Emphasis and strong emphasis",
        "Strikethrough text",
        "Horizontal rules",
        "HTML blocks and inline HTML",
        "Footnotes (references and definitions)",
        "Reference definitions",
        "Text formatting extraction",
        "CommonMark compliance",
    ]
    return {
        "name": "Markdown Plugin",
        "language": self.get_language_name(),
        "extensions": self.get_file_extensions(),
        "version": "1.0.0",
        "supported_queries": self.get_supported_queries(),
        "features": feature_list,
    }
async def analyze_file(
    self, file_path: str, request: AnalysisRequest
) -> AnalysisResult:
    """Parse a Markdown file and collect every recognised element.

    Returns a failed AnalysisResult (success=False) when tree-sitter
    or the Markdown grammar is unavailable, or when any step raises.
    """
    if not TREE_SITTER_AVAILABLE:
        return AnalysisResult(
            file_path=file_path,
            language=self.get_language_name(),
            success=False,
            error_message="Tree-sitter library not available.",
        )

    language = self.get_tree_sitter_language()
    if not language:
        return AnalysisResult(
            file_path=file_path,
            language=self.get_language_name(),
            success=False,
            error_message="Could not load Markdown language for parsing.",
        )

    try:
        from ..encoding_utils import read_file_safe

        source_code, _ = read_file_safe(file_path)

        parser = tree_sitter.Parser()
        parser.language = language
        tree = parser.parse(source_code.encode("utf-8"))

        extractor = self.create_extractor()
        extractor.current_file = file_path  # Give the extractor file context

        elements: list[CodeElement] = []
        if isinstance(extractor, MarkdownElementExtractor):
            # Run every markdown-specific extraction pass, in order.
            extraction_passes = (
                extractor.extract_headers,
                extractor.extract_code_blocks,
                extractor.extract_links,
                extractor.extract_images,
                extractor.extract_references,
                extractor.extract_lists,
                extractor.extract_tables,
                extractor.extract_blockquotes,
                extractor.extract_horizontal_rules,
                extractor.extract_html_elements,
                extractor.extract_text_formatting,
                extractor.extract_footnotes,
            )
            for extraction_pass in extraction_passes:
                elements.extend(extraction_pass(tree, source_code))
        # A base ElementExtractor contributes no markdown elements.

        def count_nodes(node: "tree_sitter.Node") -> int:
            # Total nodes in the subtree rooted at *node* (inclusive).
            total = 1
            for child in node.children:
                total += count_nodes(child)
            return total

        return AnalysisResult(
            file_path=file_path,
            language=self.get_language_name(),
            success=True,
            elements=elements,
            line_count=len(source_code.splitlines()),
            node_count=count_nodes(tree.root_node),
        )
    except Exception as e:
        log_error(f"Error analyzing Markdown file {file_path}: {e}")
        return AnalysisResult(
            file_path=file_path,
            language=self.get_language_name(),
            success=False,
            error_message=str(e),
        )
def execute_query(self, tree: "tree_sitter.Tree", query_name: str) -> dict:
    """Run the named query against *tree*.

    Returns {"captures", "query", "matches"} on success and
    {"error": ...} on any failure.
    """
    try:
        language = self.get_tree_sitter_language()
        if not language:
            return {"error": "Language not available"}

        # Query strings live in the shared markdown query registry.
        from ..queries.markdown import get_query

        try:
            query_string = get_query(query_name)
        except KeyError:
            return {"error": f"Unknown query: {query_name}"}

        # Compatibility layer hides tree-sitter API differences.
        captures = TreeSitterQueryCompat.safe_execute_query(
            language, query_string, tree.root_node, fallback_result=[]
        )
        return {
            "captures": captures,
            "query": query_string,
            "matches": len(captures),
        }
    except Exception as e:
        log_error(f"Query execution failed: {e}")
        return {"error": str(e)}
def extract_elements(self, tree: "tree_sitter.Tree", source_code: str) -> list:
    """Collect every Markdown element the shared extractor can produce.

    Returns an empty (or partial) list if extraction raises; the error
    is logged rather than propagated.
    """
    extractor = self.get_extractor()
    collected: list = []

    try:
        if isinstance(extractor, MarkdownElementExtractor):
            # Run each extraction pass in the canonical order.
            for extraction_pass in (
                extractor.extract_headers,
                extractor.extract_code_blocks,
                extractor.extract_links,
                extractor.extract_images,
                extractor.extract_references,
                extractor.extract_lists,
                extractor.extract_tables,
                extractor.extract_blockquotes,
                extractor.extract_horizontal_rules,
                extractor.extract_html_elements,
                extractor.extract_text_formatting,
                extractor.extract_footnotes,
            ):
                collected.extend(extraction_pass(tree, source_code))
    except Exception as e:
        log_error(f"Failed to extract elements: {e}")

    return collected
def execute_query_strategy(
|
|
1821
|
+
self, query_key: str | None, language: str
|
|
1822
|
+
) -> str | None:
|
|
1823
|
+
"""Execute query strategy for Markdown language"""
|
|
1824
|
+
if not query_key:
|
|
1825
|
+
return None
|
|
1826
|
+
|
|
1827
|
+
# Use markdown-specific element categories instead of base queries
|
|
1828
|
+
element_categories = self.get_element_categories()
|
|
1829
|
+
if query_key in element_categories:
|
|
1830
|
+
# Return a simple query string for the category
|
|
1831
|
+
node_types = element_categories[query_key]
|
|
1832
|
+
if node_types:
|
|
1833
|
+
# Create a basic query for the first node type
|
|
1834
|
+
return f"({node_types[0]}) @{query_key}"
|
|
1835
|
+
|
|
1836
|
+
# Fallback to base implementation
|
|
1837
|
+
queries = self.get_queries()
|
|
1838
|
+
return queries.get(query_key) if queries else None
|
|
1839
|
+
|
|
1840
|
+
def get_element_categories(self) -> dict[str, list[str]]:
    """Map each supported query key onto the Markdown node types it targets.

    Generic keys ("function", "class", "variable", "import") alias the
    markdown-specific categories so base-engine queries still resolve.
    Each value is an independent list (no shared references).
    """
    heading_nodes = ["atx_heading", "setext_heading"]
    code_nodes = ["fenced_code_block", "indented_code_block"]
    link_nodes = ["inline", "link", "autolink", "reference_link"]
    image_nodes = ["inline", "image"]
    reference_nodes = ["link_reference_definition"]
    list_nodes = ["list", "list_item"]
    table_nodes = ["pipe_table", "table"]
    html_nodes = ["html_block", "inline"]
    footnote_nodes = ["inline", "paragraph"]

    categories: dict[str, list[str]] = {}
    # Headers double as the generic "function" bucket.
    for key in ("function", "headers", "heading"):
        categories[key] = list(heading_nodes)
    # Code blocks double as the generic "class" bucket.
    for key in ("class", "code_blocks", "code_block"):
        categories[key] = list(code_nodes)
    # Links plus images fill the generic "variable" bucket.
    categories["variable"] = list(link_nodes) + ["image"]
    for key in ("links", "link"):
        categories[key] = list(link_nodes)
    for key in ("images", "image"):
        categories[key] = list(image_nodes)
    # Reference definitions double as the generic "import" bucket.
    for key in ("import", "references", "reference"):
        categories[key] = list(reference_nodes)
    for key in ("lists", "list", "task_lists"):
        categories[key] = list(list_nodes)
    for key in ("tables", "table"):
        categories[key] = list(table_nodes)
    for key in ("blockquotes", "blockquote"):
        categories[key] = ["block_quote"]
    for key in ("horizontal_rules", "horizontal_rule"):
        categories[key] = ["thematic_break"]
    for key in ("html_blocks", "html_block", "html"):
        categories[key] = list(html_nodes)
    # Inline spans carry emphasis, inline code and strikethrough.
    for key in (
        "emphasis",
        "formatting",
        "text_formatting",
        "inline_code",
        "strikethrough",
    ):
        categories[key] = ["inline"]
    # Footnote refs live in inline spans, definitions in paragraphs.
    for key in ("footnotes", "footnote"):
        categories[key] = list(footnote_nodes)
    categories["all_elements"] = [
        "atx_heading",
        "setext_heading",
        "fenced_code_block",
        "indented_code_block",
        "inline",
        "link",
        "autolink",
        "reference_link",
        "image",
        "link_reference_definition",
        "list",
        "list_item",
        "pipe_table",
        "table",
        "block_quote",
        "thematic_break",
        "html_block",
        "paragraph",
    ]
    categories["text_content"] = [
        "atx_heading",
        "setext_heading",
        "inline",
        "paragraph",
    ]
    return categories