tree-sitter-analyzer 1.9.2__py3-none-any.whl → 1.9.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tree-sitter-analyzer might be problematic. Click here for more details.
- tree_sitter_analyzer/__init__.py +1 -1
- tree_sitter_analyzer/api.py +216 -8
- tree_sitter_analyzer/cli/argument_validator.py +1 -1
- tree_sitter_analyzer/cli/commands/advanced_command.py +3 -6
- tree_sitter_analyzer/cli/commands/query_command.py +3 -1
- tree_sitter_analyzer/cli/commands/table_command.py +3 -3
- tree_sitter_analyzer/constants.py +5 -3
- tree_sitter_analyzer/core/analysis_engine.py +1 -1
- tree_sitter_analyzer/core/cache_service.py +1 -1
- tree_sitter_analyzer/core/engine.py +34 -10
- tree_sitter_analyzer/core/query.py +82 -2
- tree_sitter_analyzer/encoding_utils.py +64 -0
- tree_sitter_analyzer/exceptions.py +1 -1
- tree_sitter_analyzer/file_handler.py +49 -33
- tree_sitter_analyzer/formatters/base_formatter.py +1 -1
- tree_sitter_analyzer/formatters/html_formatter.py +24 -14
- tree_sitter_analyzer/formatters/javascript_formatter.py +28 -21
- tree_sitter_analyzer/formatters/language_formatter_factory.py +7 -4
- tree_sitter_analyzer/formatters/markdown_formatter.py +4 -4
- tree_sitter_analyzer/formatters/python_formatter.py +4 -4
- tree_sitter_analyzer/formatters/typescript_formatter.py +1 -1
- tree_sitter_analyzer/interfaces/mcp_adapter.py +4 -2
- tree_sitter_analyzer/interfaces/mcp_server.py +10 -10
- tree_sitter_analyzer/language_detector.py +30 -5
- tree_sitter_analyzer/language_loader.py +46 -26
- tree_sitter_analyzer/languages/css_plugin.py +6 -6
- tree_sitter_analyzer/languages/html_plugin.py +12 -8
- tree_sitter_analyzer/languages/java_plugin.py +330 -520
- tree_sitter_analyzer/languages/javascript_plugin.py +22 -78
- tree_sitter_analyzer/languages/markdown_plugin.py +277 -297
- tree_sitter_analyzer/languages/python_plugin.py +47 -85
- tree_sitter_analyzer/languages/typescript_plugin.py +48 -123
- tree_sitter_analyzer/mcp/resources/project_stats_resource.py +14 -8
- tree_sitter_analyzer/mcp/server.py +38 -23
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +10 -7
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +51 -7
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +11 -7
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +8 -6
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +6 -6
- tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +48 -15
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +13 -8
- tree_sitter_analyzer/mcp/utils/file_output_manager.py +8 -3
- tree_sitter_analyzer/mcp/utils/gitignore_detector.py +24 -12
- tree_sitter_analyzer/mcp/utils/path_resolver.py +2 -2
- tree_sitter_analyzer/models.py +16 -0
- tree_sitter_analyzer/mypy_current_errors.txt +2 -0
- tree_sitter_analyzer/plugins/base.py +66 -0
- tree_sitter_analyzer/queries/java.py +9 -3
- tree_sitter_analyzer/queries/javascript.py +3 -8
- tree_sitter_analyzer/queries/markdown.py +1 -1
- tree_sitter_analyzer/queries/python.py +2 -2
- tree_sitter_analyzer/security/boundary_manager.py +2 -5
- tree_sitter_analyzer/security/regex_checker.py +2 -2
- tree_sitter_analyzer/security/validator.py +5 -1
- tree_sitter_analyzer/table_formatter.py +4 -4
- tree_sitter_analyzer/utils/__init__.py +27 -116
- tree_sitter_analyzer/{utils.py → utils/logging.py} +2 -2
- tree_sitter_analyzer/utils/tree_sitter_compat.py +2 -2
- {tree_sitter_analyzer-1.9.2.dist-info → tree_sitter_analyzer-1.9.4.dist-info}/METADATA +87 -45
- tree_sitter_analyzer-1.9.4.dist-info/RECORD +111 -0
- tree_sitter_analyzer-1.9.2.dist-info/RECORD +0 -109
- {tree_sitter_analyzer-1.9.2.dist-info → tree_sitter_analyzer-1.9.4.dist-info}/WHEEL +0 -0
- {tree_sitter_analyzer-1.9.2.dist-info → tree_sitter_analyzer-1.9.4.dist-info}/entry_points.txt +0 -0
|
@@ -21,9 +21,14 @@ except ImportError:
|
|
|
21
21
|
|
|
22
22
|
from ..core.analysis_engine import AnalysisRequest
|
|
23
23
|
from ..encoding_utils import extract_text_slice, safe_encode
|
|
24
|
-
from ..models import AnalysisResult
|
|
24
|
+
from ..models import AnalysisResult
|
|
25
|
+
from ..models import Class as ModelClass
|
|
26
|
+
from ..models import CodeElement
|
|
27
|
+
from ..models import Function as ModelFunction
|
|
28
|
+
from ..models import Import as ModelImport
|
|
29
|
+
from ..models import Variable as ModelVariable
|
|
25
30
|
from ..plugins.base import ElementExtractor, LanguagePlugin
|
|
26
|
-
from ..utils import log_debug, log_error
|
|
31
|
+
from ..utils import log_debug, log_error
|
|
27
32
|
from ..utils.tree_sitter_compat import TreeSitterQueryCompat
|
|
28
33
|
|
|
29
34
|
|
|
@@ -44,8 +49,8 @@ class MarkdownElement(CodeElement):
|
|
|
44
49
|
title: str | None = None,
|
|
45
50
|
language_info: str | None = None,
|
|
46
51
|
is_checked: bool | None = None,
|
|
47
|
-
**kwargs,
|
|
48
|
-
):
|
|
52
|
+
**kwargs: Any,
|
|
53
|
+
) -> None:
|
|
49
54
|
super().__init__(
|
|
50
55
|
name=name,
|
|
51
56
|
start_line=start_line,
|
|
@@ -62,6 +67,16 @@ class MarkdownElement(CodeElement):
|
|
|
62
67
|
self.language_info = language_info # For code blocks
|
|
63
68
|
self.is_checked = is_checked # For task list items
|
|
64
69
|
|
|
70
|
+
# Additional attributes used by formatters
|
|
71
|
+
self.text: str | None = None # Text content
|
|
72
|
+
self.type: str | None = None # Element type for formatters
|
|
73
|
+
self.line_count: int | None = None # For code blocks
|
|
74
|
+
self.alt: str | None = None # Alternative text for images
|
|
75
|
+
self.list_type: str | None = None # For lists (ordered/unordered/task)
|
|
76
|
+
self.item_count: int | None = None # For lists
|
|
77
|
+
self.row_count: int | None = None # For tables
|
|
78
|
+
self.column_count: int | None = None # For tables
|
|
79
|
+
|
|
65
80
|
|
|
66
81
|
class MarkdownElementExtractor(ElementExtractor):
|
|
67
82
|
"""Markdown-specific element extractor with comprehensive feature support"""
|
|
@@ -80,30 +95,74 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
80
95
|
|
|
81
96
|
def extract_functions(
|
|
82
97
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
83
|
-
) -> list[
|
|
98
|
+
) -> list[ModelFunction]:
|
|
84
99
|
"""Extract Markdown elements (headers act as 'functions')"""
|
|
85
|
-
|
|
100
|
+
headers = self.extract_headers(tree, source_code)
|
|
101
|
+
functions = []
|
|
102
|
+
for header in headers:
|
|
103
|
+
func = ModelFunction(
|
|
104
|
+
name=header.name,
|
|
105
|
+
start_line=header.start_line,
|
|
106
|
+
end_line=header.end_line,
|
|
107
|
+
raw_text=header.raw_text,
|
|
108
|
+
language=header.language,
|
|
109
|
+
)
|
|
110
|
+
functions.append(func)
|
|
111
|
+
return functions
|
|
86
112
|
|
|
87
113
|
def extract_classes(
|
|
88
114
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
89
|
-
) -> list[
|
|
115
|
+
) -> list[ModelClass]:
|
|
90
116
|
"""Extract Markdown sections (code blocks act as 'classes')"""
|
|
91
|
-
|
|
117
|
+
code_blocks = self.extract_code_blocks(tree, source_code)
|
|
118
|
+
classes = []
|
|
119
|
+
for block in code_blocks:
|
|
120
|
+
cls = ModelClass(
|
|
121
|
+
name=block.name,
|
|
122
|
+
start_line=block.start_line,
|
|
123
|
+
end_line=block.end_line,
|
|
124
|
+
raw_text=block.raw_text,
|
|
125
|
+
language=block.language,
|
|
126
|
+
)
|
|
127
|
+
classes.append(cls)
|
|
128
|
+
return classes
|
|
92
129
|
|
|
93
130
|
def extract_variables(
|
|
94
131
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
95
|
-
) -> list[
|
|
132
|
+
) -> list[ModelVariable]:
|
|
96
133
|
"""Extract Markdown links and images (act as 'variables')"""
|
|
97
134
|
elements = []
|
|
98
135
|
elements.extend(self.extract_links(tree, source_code))
|
|
99
136
|
elements.extend(self.extract_images(tree, source_code))
|
|
100
|
-
|
|
137
|
+
|
|
138
|
+
variables = []
|
|
139
|
+
for element in elements:
|
|
140
|
+
var = ModelVariable(
|
|
141
|
+
name=element.name,
|
|
142
|
+
start_line=element.start_line,
|
|
143
|
+
end_line=element.end_line,
|
|
144
|
+
raw_text=element.raw_text,
|
|
145
|
+
language=element.language,
|
|
146
|
+
)
|
|
147
|
+
variables.append(var)
|
|
148
|
+
return variables
|
|
101
149
|
|
|
102
150
|
def extract_imports(
|
|
103
151
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
104
|
-
) -> list[
|
|
152
|
+
) -> list[ModelImport]:
|
|
105
153
|
"""Extract Markdown references and definitions"""
|
|
106
|
-
|
|
154
|
+
references = self.extract_references(tree, source_code)
|
|
155
|
+
imports = []
|
|
156
|
+
for ref in references:
|
|
157
|
+
imp = ModelImport(
|
|
158
|
+
name=ref.name,
|
|
159
|
+
start_line=ref.start_line,
|
|
160
|
+
end_line=ref.end_line,
|
|
161
|
+
raw_text=ref.raw_text,
|
|
162
|
+
language=ref.language,
|
|
163
|
+
)
|
|
164
|
+
imports.append(imp)
|
|
165
|
+
return imports
|
|
107
166
|
|
|
108
167
|
def extract_headers(
|
|
109
168
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
@@ -115,18 +174,14 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
115
174
|
|
|
116
175
|
headers: list[MarkdownElement] = []
|
|
117
176
|
|
|
118
|
-
if tree is None
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
self._extract_setext_headers(tree.root_node, headers)
|
|
127
|
-
except Exception as e:
|
|
128
|
-
log_debug(f"Error during header extraction: {e}")
|
|
129
|
-
return []
|
|
177
|
+
if tree is not None and tree.root_node is not None:
|
|
178
|
+
try:
|
|
179
|
+
# Extract ATX headers (# ## ### etc.)
|
|
180
|
+
self._extract_atx_headers(tree.root_node, headers)
|
|
181
|
+
# Extract Setext headers (underlined)
|
|
182
|
+
self._extract_setext_headers(tree.root_node, headers)
|
|
183
|
+
except Exception as e:
|
|
184
|
+
log_debug(f"Error during header extraction: {e}")
|
|
130
185
|
|
|
131
186
|
log_debug(f"Extracted {len(headers)} Markdown headers")
|
|
132
187
|
return headers
|
|
@@ -141,16 +196,12 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
141
196
|
|
|
142
197
|
code_blocks: list[MarkdownElement] = []
|
|
143
198
|
|
|
144
|
-
if tree is None
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
self._extract_indented_code_blocks(tree.root_node, code_blocks)
|
|
151
|
-
except Exception as e:
|
|
152
|
-
log_debug(f"Error during code block extraction: {e}")
|
|
153
|
-
return []
|
|
199
|
+
if tree is not None and tree.root_node is not None:
|
|
200
|
+
try:
|
|
201
|
+
self._extract_fenced_code_blocks(tree.root_node, code_blocks)
|
|
202
|
+
self._extract_indented_code_blocks(tree.root_node, code_blocks)
|
|
203
|
+
except Exception as e:
|
|
204
|
+
log_debug(f"Error during code block extraction: {e}")
|
|
154
205
|
|
|
155
206
|
log_debug(f"Extracted {len(code_blocks)} Markdown code blocks")
|
|
156
207
|
return code_blocks
|
|
@@ -165,25 +216,21 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
165
216
|
|
|
166
217
|
links: list[MarkdownElement] = []
|
|
167
218
|
|
|
168
|
-
if tree is None
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
try:
|
|
173
|
-
# Track extracted links to prevent global duplicates (ensure reset)
|
|
174
|
-
self._extracted_links = set()
|
|
219
|
+
if tree is not None and tree.root_node is not None:
|
|
220
|
+
try:
|
|
221
|
+
# Track extracted links to prevent global duplicates (ensure reset)
|
|
222
|
+
self._extracted_links = set()
|
|
175
223
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
224
|
+
self._extract_inline_links(tree.root_node, links)
|
|
225
|
+
self._extract_reference_links(tree.root_node, links)
|
|
226
|
+
self._extract_autolinks(tree.root_node, links)
|
|
179
227
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
228
|
+
# Clean up after extraction is complete
|
|
229
|
+
if hasattr(self, "_extracted_links"):
|
|
230
|
+
delattr(self, "_extracted_links")
|
|
183
231
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
return []
|
|
232
|
+
except Exception as e:
|
|
233
|
+
log_debug(f"Error during link extraction: {e}")
|
|
187
234
|
|
|
188
235
|
# 重複除去: 同じtextとurlを持つ要素を除去
|
|
189
236
|
seen = set()
|
|
@@ -209,17 +256,13 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
209
256
|
|
|
210
257
|
images: list[MarkdownElement] = []
|
|
211
258
|
|
|
212
|
-
if tree is None
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
self._extract_image_reference_definitions(tree.root_node, images)
|
|
220
|
-
except Exception as e:
|
|
221
|
-
log_debug(f"Error during image extraction: {e}")
|
|
222
|
-
return []
|
|
259
|
+
if tree is not None and tree.root_node is not None:
|
|
260
|
+
try:
|
|
261
|
+
self._extract_inline_images(tree.root_node, images)
|
|
262
|
+
self._extract_reference_images(tree.root_node, images)
|
|
263
|
+
self._extract_image_reference_definitions(tree.root_node, images)
|
|
264
|
+
except Exception as e:
|
|
265
|
+
log_debug(f"Error during image extraction: {e}")
|
|
223
266
|
|
|
224
267
|
# 重複除去: 同じalt_textとurlを持つ要素を除去
|
|
225
268
|
seen = set()
|
|
@@ -245,15 +288,11 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
245
288
|
|
|
246
289
|
references: list[MarkdownElement] = []
|
|
247
290
|
|
|
248
|
-
if tree is None
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
self._extract_link_reference_definitions(tree.root_node, references)
|
|
254
|
-
except Exception as e:
|
|
255
|
-
log_debug(f"Error during reference extraction: {e}")
|
|
256
|
-
return []
|
|
291
|
+
if tree is not None and tree.root_node is not None:
|
|
292
|
+
try:
|
|
293
|
+
self._extract_link_reference_definitions(tree.root_node, references)
|
|
294
|
+
except Exception as e:
|
|
295
|
+
log_debug(f"Error during reference extraction: {e}")
|
|
257
296
|
|
|
258
297
|
log_debug(f"Extracted {len(references)} Markdown references")
|
|
259
298
|
return references
|
|
@@ -268,15 +307,11 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
268
307
|
|
|
269
308
|
blockquotes: list[MarkdownElement] = []
|
|
270
309
|
|
|
271
|
-
if tree is None
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
self._extract_block_quotes(tree.root_node, blockquotes)
|
|
277
|
-
except Exception as e:
|
|
278
|
-
log_debug(f"Error during blockquote extraction: {e}")
|
|
279
|
-
return []
|
|
310
|
+
if tree is not None and tree.root_node is not None:
|
|
311
|
+
try:
|
|
312
|
+
self._extract_block_quotes(tree.root_node, blockquotes)
|
|
313
|
+
except Exception as e:
|
|
314
|
+
log_debug(f"Error during blockquote extraction: {e}")
|
|
280
315
|
|
|
281
316
|
log_debug(f"Extracted {len(blockquotes)} Markdown blockquotes")
|
|
282
317
|
return blockquotes
|
|
@@ -291,17 +326,11 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
291
326
|
|
|
292
327
|
horizontal_rules: list[MarkdownElement] = []
|
|
293
328
|
|
|
294
|
-
if tree is None
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
try:
|
|
301
|
-
self._extract_thematic_breaks(tree.root_node, horizontal_rules)
|
|
302
|
-
except Exception as e:
|
|
303
|
-
log_debug(f"Error during horizontal rule extraction: {e}")
|
|
304
|
-
return []
|
|
329
|
+
if tree is not None and tree.root_node is not None:
|
|
330
|
+
try:
|
|
331
|
+
self._extract_thematic_breaks(tree.root_node, horizontal_rules)
|
|
332
|
+
except Exception as e:
|
|
333
|
+
log_debug(f"Error during horizontal rule extraction: {e}")
|
|
305
334
|
|
|
306
335
|
log_debug(f"Extracted {len(horizontal_rules)} Markdown horizontal rules")
|
|
307
336
|
return horizontal_rules
|
|
@@ -316,16 +345,12 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
316
345
|
|
|
317
346
|
html_elements: list[MarkdownElement] = []
|
|
318
347
|
|
|
319
|
-
if tree is None
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
self._extract_inline_html(tree.root_node, html_elements)
|
|
326
|
-
except Exception as e:
|
|
327
|
-
log_debug(f"Error during HTML element extraction: {e}")
|
|
328
|
-
return []
|
|
348
|
+
if tree is not None and tree.root_node is not None:
|
|
349
|
+
try:
|
|
350
|
+
self._extract_html_blocks(tree.root_node, html_elements)
|
|
351
|
+
self._extract_inline_html(tree.root_node, html_elements)
|
|
352
|
+
except Exception as e:
|
|
353
|
+
log_debug(f"Error during HTML element extraction: {e}")
|
|
329
354
|
|
|
330
355
|
log_debug(f"Extracted {len(html_elements)} HTML elements")
|
|
331
356
|
return html_elements
|
|
@@ -340,19 +365,15 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
340
365
|
|
|
341
366
|
formatting_elements: list[MarkdownElement] = []
|
|
342
367
|
|
|
343
|
-
if tree is None
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
self._extract_strikethrough_elements(tree.root_node, formatting_elements)
|
|
353
|
-
except Exception as e:
|
|
354
|
-
log_debug(f"Error during text formatting extraction: {e}")
|
|
355
|
-
return []
|
|
368
|
+
if tree is not None and tree.root_node is not None:
|
|
369
|
+
try:
|
|
370
|
+
self._extract_emphasis_elements(tree.root_node, formatting_elements)
|
|
371
|
+
self._extract_inline_code_spans(tree.root_node, formatting_elements)
|
|
372
|
+
self._extract_strikethrough_elements(
|
|
373
|
+
tree.root_node, formatting_elements
|
|
374
|
+
)
|
|
375
|
+
except Exception as e:
|
|
376
|
+
log_debug(f"Error during text formatting extraction: {e}")
|
|
356
377
|
|
|
357
378
|
log_debug(f"Extracted {len(formatting_elements)} text formatting elements")
|
|
358
379
|
return formatting_elements
|
|
@@ -367,15 +388,11 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
367
388
|
|
|
368
389
|
footnotes: list[MarkdownElement] = []
|
|
369
390
|
|
|
370
|
-
if tree is None
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
self._extract_footnote_elements(tree.root_node, footnotes)
|
|
376
|
-
except Exception as e:
|
|
377
|
-
log_debug(f"Error during footnote extraction: {e}")
|
|
378
|
-
return []
|
|
391
|
+
if tree is not None and tree.root_node is not None:
|
|
392
|
+
try:
|
|
393
|
+
self._extract_footnote_elements(tree.root_node, footnotes)
|
|
394
|
+
except Exception as e:
|
|
395
|
+
log_debug(f"Error during footnote extraction: {e}")
|
|
379
396
|
|
|
380
397
|
log_debug(f"Extracted {len(footnotes)} footnotes")
|
|
381
398
|
return footnotes
|
|
@@ -390,15 +407,11 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
390
407
|
|
|
391
408
|
lists: list[MarkdownElement] = []
|
|
392
409
|
|
|
393
|
-
if tree is None
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
self._extract_list_items(tree.root_node, lists)
|
|
399
|
-
except Exception as e:
|
|
400
|
-
log_debug(f"Error during list extraction: {e}")
|
|
401
|
-
return []
|
|
410
|
+
if tree is not None and tree.root_node is not None:
|
|
411
|
+
try:
|
|
412
|
+
self._extract_list_items(tree.root_node, lists)
|
|
413
|
+
except Exception as e:
|
|
414
|
+
log_debug(f"Error during list extraction: {e}")
|
|
402
415
|
|
|
403
416
|
log_debug(f"Extracted {len(lists)} Markdown list items")
|
|
404
417
|
return lists
|
|
@@ -413,15 +426,11 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
413
426
|
|
|
414
427
|
tables: list[MarkdownElement] = []
|
|
415
428
|
|
|
416
|
-
if tree is None
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
self._extract_pipe_tables(tree.root_node, tables)
|
|
422
|
-
except Exception as e:
|
|
423
|
-
log_debug(f"Error during table extraction: {e}")
|
|
424
|
-
return []
|
|
429
|
+
if tree is not None and tree.root_node is not None:
|
|
430
|
+
try:
|
|
431
|
+
self._extract_pipe_tables(tree.root_node, tables)
|
|
432
|
+
except Exception as e:
|
|
433
|
+
log_debug(f"Error during table extraction: {e}")
|
|
425
434
|
|
|
426
435
|
log_debug(f"Extracted {len(tables)} Markdown tables")
|
|
427
436
|
return tables
|
|
@@ -468,7 +477,7 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
468
477
|
line = self.content_lines[start_point[0]]
|
|
469
478
|
start_col = max(0, min(start_point[1], len(line)))
|
|
470
479
|
end_col = max(start_col, min(end_point[1], len(line)))
|
|
471
|
-
result = line[start_col:end_col]
|
|
480
|
+
result: str = line[start_col:end_col]
|
|
472
481
|
self._node_text_cache[node_id] = result
|
|
473
482
|
return result
|
|
474
483
|
else:
|
|
@@ -900,7 +909,6 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
900
909
|
|
|
901
910
|
# Extract all reference definitions that could be used for images
|
|
902
911
|
# We check if the URL points to an image file or if it's used by an image reference
|
|
903
|
-
|
|
904
912
|
# First, collect all image references used in the document
|
|
905
913
|
image_refs_used = set()
|
|
906
914
|
for node in self._traverse_nodes(root_node):
|
|
@@ -932,12 +940,14 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
932
940
|
|
|
933
941
|
# Pattern: [label]: url "title"
|
|
934
942
|
ref_pattern = r'^\[([^\]]+)\]:\s*([^\s]+)(?:\s+"([^"]*)")?'
|
|
935
|
-
|
|
943
|
+
ref_match: re.Match[str] | None = re.match(
|
|
944
|
+
ref_pattern, raw_text.strip()
|
|
945
|
+
)
|
|
936
946
|
|
|
937
|
-
if
|
|
938
|
-
label =
|
|
939
|
-
url =
|
|
940
|
-
title =
|
|
947
|
+
if ref_match:
|
|
948
|
+
label = ref_match.group(1) or ""
|
|
949
|
+
url = ref_match.group(2) or ""
|
|
950
|
+
title = ref_match.group(3) or ""
|
|
941
951
|
|
|
942
952
|
# Include if this reference is used by an image OR if URL looks like an image
|
|
943
953
|
is_used_by_image = label.lower() in image_refs_used
|
|
@@ -1124,9 +1134,11 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
1124
1134
|
content = "\n".join(content_lines).strip()
|
|
1125
1135
|
|
|
1126
1136
|
blockquote = MarkdownElement(
|
|
1127
|
-
name=
|
|
1128
|
-
|
|
1129
|
-
|
|
1137
|
+
name=(
|
|
1138
|
+
f"Blockquote: {content[:50]}..."
|
|
1139
|
+
if len(content) > 50
|
|
1140
|
+
else f"Blockquote: {content}"
|
|
1141
|
+
),
|
|
1130
1142
|
start_line=start_line,
|
|
1131
1143
|
end_line=end_line,
|
|
1132
1144
|
raw_text=raw_text,
|
|
@@ -1406,13 +1418,13 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
1406
1418
|
|
|
1407
1419
|
# Pattern for footnote definitions: [^1]: content
|
|
1408
1420
|
footnote_def_pattern = r"^\[\^([^\]]+)\]:\s*(.+)$"
|
|
1409
|
-
|
|
1421
|
+
footnote_match: re.Match[str] | None = re.match(
|
|
1410
1422
|
footnote_def_pattern, raw_text.strip(), re.MULTILINE
|
|
1411
1423
|
)
|
|
1412
1424
|
|
|
1413
|
-
if
|
|
1414
|
-
ref_id =
|
|
1415
|
-
content =
|
|
1425
|
+
if footnote_match:
|
|
1426
|
+
ref_id = footnote_match.group(1) or ""
|
|
1427
|
+
content = footnote_match.group(2) or ""
|
|
1416
1428
|
start_line = node.start_point[0] + 1
|
|
1417
1429
|
end_line = node.end_point[0] + 1
|
|
1418
1430
|
|
|
@@ -1430,7 +1442,7 @@ class MarkdownElementExtractor(ElementExtractor):
|
|
|
1430
1442
|
except Exception as e:
|
|
1431
1443
|
log_debug(f"Failed to extract footnote definition: {e}")
|
|
1432
1444
|
|
|
1433
|
-
def _traverse_nodes(self, node: "tree_sitter.Node"):
|
|
1445
|
+
def _traverse_nodes(self, node: "tree_sitter.Node") -> Any:
|
|
1434
1446
|
"""Traverse all nodes in the tree"""
|
|
1435
1447
|
yield node
|
|
1436
1448
|
for child in node.children:
|
|
@@ -1507,28 +1519,68 @@ class MarkdownPlugin(LanguagePlugin):
|
|
|
1507
1519
|
) -> list[CodeElement]:
|
|
1508
1520
|
"""Extract functions from the tree (legacy compatibility)"""
|
|
1509
1521
|
extractor = self.get_extractor()
|
|
1510
|
-
|
|
1522
|
+
functions = extractor.extract_functions(tree, source_code)
|
|
1523
|
+
return [
|
|
1524
|
+
CodeElement(
|
|
1525
|
+
name=f.name,
|
|
1526
|
+
start_line=f.start_line,
|
|
1527
|
+
end_line=f.end_line,
|
|
1528
|
+
raw_text=f.raw_text,
|
|
1529
|
+
language=f.language,
|
|
1530
|
+
)
|
|
1531
|
+
for f in functions
|
|
1532
|
+
]
|
|
1511
1533
|
|
|
1512
1534
|
def extract_classes(
|
|
1513
1535
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
1514
1536
|
) -> list[CodeElement]:
|
|
1515
1537
|
"""Extract classes from the tree (legacy compatibility)"""
|
|
1516
1538
|
extractor = self.get_extractor()
|
|
1517
|
-
|
|
1539
|
+
classes = extractor.extract_classes(tree, source_code)
|
|
1540
|
+
return [
|
|
1541
|
+
CodeElement(
|
|
1542
|
+
name=c.name,
|
|
1543
|
+
start_line=c.start_line,
|
|
1544
|
+
end_line=c.end_line,
|
|
1545
|
+
raw_text=c.raw_text,
|
|
1546
|
+
language=c.language,
|
|
1547
|
+
)
|
|
1548
|
+
for c in classes
|
|
1549
|
+
]
|
|
1518
1550
|
|
|
1519
1551
|
def extract_variables(
|
|
1520
1552
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
1521
1553
|
) -> list[CodeElement]:
|
|
1522
1554
|
"""Extract variables from the tree (legacy compatibility)"""
|
|
1523
1555
|
extractor = self.get_extractor()
|
|
1524
|
-
|
|
1556
|
+
variables = extractor.extract_variables(tree, source_code)
|
|
1557
|
+
return [
|
|
1558
|
+
CodeElement(
|
|
1559
|
+
name=v.name,
|
|
1560
|
+
start_line=v.start_line,
|
|
1561
|
+
end_line=v.end_line,
|
|
1562
|
+
raw_text=v.raw_text,
|
|
1563
|
+
language=v.language,
|
|
1564
|
+
)
|
|
1565
|
+
for v in variables
|
|
1566
|
+
]
|
|
1525
1567
|
|
|
1526
1568
|
def extract_imports(
|
|
1527
1569
|
self, tree: "tree_sitter.Tree", source_code: str
|
|
1528
1570
|
) -> list[CodeElement]:
|
|
1529
1571
|
"""Extract imports from the tree (legacy compatibility)"""
|
|
1530
1572
|
extractor = self.get_extractor()
|
|
1531
|
-
|
|
1573
|
+
imports = extractor.extract_imports(tree, source_code)
|
|
1574
|
+
return [
|
|
1575
|
+
CodeElement(
|
|
1576
|
+
name=i.name,
|
|
1577
|
+
start_line=i.start_line,
|
|
1578
|
+
end_line=i.end_line,
|
|
1579
|
+
raw_text=i.raw_text,
|
|
1580
|
+
language=i.language,
|
|
1581
|
+
)
|
|
1582
|
+
for i in imports
|
|
1583
|
+
]
|
|
1532
1584
|
|
|
1533
1585
|
def get_tree_sitter_language(self) -> Optional["tree_sitter.Language"]:
|
|
1534
1586
|
"""Get the Tree-sitter language object for Markdown"""
|
|
@@ -1633,33 +1685,49 @@ class MarkdownPlugin(LanguagePlugin):
|
|
|
1633
1685
|
)
|
|
1634
1686
|
|
|
1635
1687
|
try:
|
|
1636
|
-
|
|
1637
|
-
|
|
1688
|
+
from ..encoding_utils import read_file_safe
|
|
1689
|
+
|
|
1690
|
+
source_code, _ = read_file_safe(file_path)
|
|
1638
1691
|
|
|
1639
1692
|
parser = tree_sitter.Parser()
|
|
1640
1693
|
parser.language = language
|
|
1641
|
-
tree = parser.parse(
|
|
1694
|
+
tree = parser.parse(source_code.encode("utf-8"))
|
|
1642
1695
|
|
|
1643
1696
|
extractor = self.create_extractor()
|
|
1644
1697
|
extractor.current_file = file_path # Set current file for context
|
|
1645
1698
|
|
|
1646
1699
|
elements: list[CodeElement] = []
|
|
1647
1700
|
|
|
1648
|
-
# Extract all element types
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
1661
|
-
|
|
1662
|
-
|
|
1701
|
+
# Extract all element types using the markdown-specific extractor
|
|
1702
|
+
if isinstance(extractor, MarkdownElementExtractor):
|
|
1703
|
+
headers = extractor.extract_headers(tree, source_code)
|
|
1704
|
+
code_blocks = extractor.extract_code_blocks(tree, source_code)
|
|
1705
|
+
links = extractor.extract_links(tree, source_code)
|
|
1706
|
+
images = extractor.extract_images(tree, source_code)
|
|
1707
|
+
references = extractor.extract_references(tree, source_code)
|
|
1708
|
+
lists = extractor.extract_lists(tree, source_code)
|
|
1709
|
+
tables = extractor.extract_tables(tree, source_code)
|
|
1710
|
+
|
|
1711
|
+
# Extract new element types
|
|
1712
|
+
blockquotes = extractor.extract_blockquotes(tree, source_code)
|
|
1713
|
+
horizontal_rules = extractor.extract_horizontal_rules(tree, source_code)
|
|
1714
|
+
html_elements = extractor.extract_html_elements(tree, source_code)
|
|
1715
|
+
text_formatting = extractor.extract_text_formatting(tree, source_code)
|
|
1716
|
+
footnotes = extractor.extract_footnotes(tree, source_code)
|
|
1717
|
+
else:
|
|
1718
|
+
# Fallback for base ElementExtractor
|
|
1719
|
+
headers = []
|
|
1720
|
+
code_blocks = []
|
|
1721
|
+
links = []
|
|
1722
|
+
images = []
|
|
1723
|
+
references = []
|
|
1724
|
+
lists = []
|
|
1725
|
+
tables = []
|
|
1726
|
+
blockquotes = []
|
|
1727
|
+
horizontal_rules = []
|
|
1728
|
+
html_elements = []
|
|
1729
|
+
text_formatting = []
|
|
1730
|
+
footnotes = []
|
|
1663
1731
|
|
|
1664
1732
|
elements.extend(headers)
|
|
1665
1733
|
elements.extend(code_blocks)
|
|
@@ -1732,129 +1800,43 @@ class MarkdownPlugin(LanguagePlugin):
|
|
|
1732
1800
|
elements = []
|
|
1733
1801
|
|
|
1734
1802
|
try:
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1803
|
+
if isinstance(extractor, MarkdownElementExtractor):
|
|
1804
|
+
elements.extend(extractor.extract_headers(tree, source_code))
|
|
1805
|
+
elements.extend(extractor.extract_code_blocks(tree, source_code))
|
|
1806
|
+
elements.extend(extractor.extract_links(tree, source_code))
|
|
1807
|
+
elements.extend(extractor.extract_images(tree, source_code))
|
|
1808
|
+
elements.extend(extractor.extract_references(tree, source_code))
|
|
1809
|
+
elements.extend(extractor.extract_lists(tree, source_code))
|
|
1810
|
+
elements.extend(extractor.extract_tables(tree, source_code))
|
|
1811
|
+
elements.extend(extractor.extract_blockquotes(tree, source_code))
|
|
1812
|
+
elements.extend(extractor.extract_horizontal_rules(tree, source_code))
|
|
1813
|
+
elements.extend(extractor.extract_html_elements(tree, source_code))
|
|
1814
|
+
elements.extend(extractor.extract_text_formatting(tree, source_code))
|
|
1815
|
+
elements.extend(extractor.extract_footnotes(tree, source_code))
|
|
1747
1816
|
except Exception as e:
|
|
1748
1817
|
log_error(f"Failed to extract elements: {e}")
|
|
1749
1818
|
|
|
1750
1819
|
return elements
|
|
1751
1820
|
|
|
1752
1821
|
def execute_query_strategy(
|
|
1753
|
-
self,
|
|
1754
|
-
) ->
|
|
1755
|
-
"""Execute
|
|
1756
|
-
if not
|
|
1757
|
-
return
|
|
1758
|
-
|
|
1759
|
-
#
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
"class": lambda: self._extractor.extract_code_blocks(tree, source_code),
|
|
1772
|
-
"code_blocks": lambda: self._extractor.extract_code_blocks(
|
|
1773
|
-
tree, source_code
|
|
1774
|
-
),
|
|
1775
|
-
"code_block": lambda: self._extractor.extract_code_blocks(
|
|
1776
|
-
tree, source_code
|
|
1777
|
-
),
|
|
1778
|
-
# Link and image queries (mapped to variables)
|
|
1779
|
-
"variable": lambda: self._extractor.extract_links(tree, source_code)
|
|
1780
|
-
+ self._extractor.extract_images(tree, source_code),
|
|
1781
|
-
"links": lambda: self._extractor.extract_links(tree, source_code),
|
|
1782
|
-
"link": lambda: self._extractor.extract_links(tree, source_code),
|
|
1783
|
-
"images": lambda: self._extractor.extract_images(tree, source_code),
|
|
1784
|
-
"image": lambda: self._extractor.extract_images(tree, source_code),
|
|
1785
|
-
# Reference queries (mapped to imports)
|
|
1786
|
-
"import": lambda: self._extractor.extract_references(tree, source_code),
|
|
1787
|
-
"references": lambda: self._extractor.extract_references(tree, source_code),
|
|
1788
|
-
"reference": lambda: self._extractor.extract_references(tree, source_code),
|
|
1789
|
-
# List and table queries
|
|
1790
|
-
"lists": lambda: self._extractor.extract_lists(tree, source_code),
|
|
1791
|
-
"list": lambda: self._extractor.extract_lists(tree, source_code),
|
|
1792
|
-
"task_lists": lambda: [
|
|
1793
|
-
lst
|
|
1794
|
-
for lst in self._extractor.extract_lists(tree, source_code)
|
|
1795
|
-
if getattr(lst, "element_type", "") == "task_list"
|
|
1796
|
-
],
|
|
1797
|
-
"tables": lambda: self._extractor.extract_tables(tree, source_code),
|
|
1798
|
-
"table": lambda: self._extractor.extract_tables(tree, source_code),
|
|
1799
|
-
# Content structure queries
|
|
1800
|
-
"blockquotes": lambda: self._extractor.extract_blockquotes(
|
|
1801
|
-
tree, source_code
|
|
1802
|
-
),
|
|
1803
|
-
"blockquote": lambda: self._extractor.extract_blockquotes(
|
|
1804
|
-
tree, source_code
|
|
1805
|
-
),
|
|
1806
|
-
"horizontal_rules": lambda: self._extractor.extract_horizontal_rules(
|
|
1807
|
-
tree, source_code
|
|
1808
|
-
),
|
|
1809
|
-
"horizontal_rule": lambda: self._extractor.extract_horizontal_rules(
|
|
1810
|
-
tree, source_code
|
|
1811
|
-
),
|
|
1812
|
-
# HTML and formatting queries
|
|
1813
|
-
"html_blocks": lambda: self._extractor.extract_html_elements(
|
|
1814
|
-
tree, source_code
|
|
1815
|
-
),
|
|
1816
|
-
"html_block": lambda: self._extractor.extract_html_elements(
|
|
1817
|
-
tree, source_code
|
|
1818
|
-
),
|
|
1819
|
-
"html": lambda: self._extractor.extract_html_elements(tree, source_code),
|
|
1820
|
-
"emphasis": lambda: self._extractor.extract_text_formatting(
|
|
1821
|
-
tree, source_code
|
|
1822
|
-
),
|
|
1823
|
-
"formatting": lambda: self._extractor.extract_text_formatting(
|
|
1824
|
-
tree, source_code
|
|
1825
|
-
),
|
|
1826
|
-
"text_formatting": lambda: self._extractor.extract_text_formatting(
|
|
1827
|
-
tree, source_code
|
|
1828
|
-
),
|
|
1829
|
-
"inline_code": lambda: [
|
|
1830
|
-
f
|
|
1831
|
-
for f in self._extractor.extract_text_formatting(tree, source_code)
|
|
1832
|
-
if getattr(f, "element_type", "") == "inline_code"
|
|
1833
|
-
],
|
|
1834
|
-
"strikethrough": lambda: [
|
|
1835
|
-
f
|
|
1836
|
-
for f in self._extractor.extract_text_formatting(tree, source_code)
|
|
1837
|
-
if getattr(f, "element_type", "") == "strikethrough"
|
|
1838
|
-
],
|
|
1839
|
-
# Footnote queries
|
|
1840
|
-
"footnotes": lambda: self._extractor.extract_footnotes(tree, source_code),
|
|
1841
|
-
"footnote": lambda: self._extractor.extract_footnotes(tree, source_code),
|
|
1842
|
-
# Comprehensive queries
|
|
1843
|
-
"all_elements": lambda: self.extract_elements(tree, source_code),
|
|
1844
|
-
"text_content": lambda: self._extractor.extract_headers(tree, source_code)
|
|
1845
|
-
+ self._extractor.extract_text_formatting(tree, source_code),
|
|
1846
|
-
}
|
|
1847
|
-
|
|
1848
|
-
# Execute the appropriate extraction method
|
|
1849
|
-
if query_key in query_mapping:
|
|
1850
|
-
try:
|
|
1851
|
-
return query_mapping[query_key]()
|
|
1852
|
-
except Exception as e:
|
|
1853
|
-
log_error(f"Error executing Markdown query '{query_key}': {e}")
|
|
1854
|
-
return []
|
|
1855
|
-
else:
|
|
1856
|
-
log_warning(f"Unsupported Markdown query key: {query_key}")
|
|
1857
|
-
return []
|
|
1822
|
+
self, query_key: str | None, language: str
|
|
1823
|
+
) -> str | None:
|
|
1824
|
+
"""Execute query strategy for Markdown language"""
|
|
1825
|
+
if not query_key:
|
|
1826
|
+
return None
|
|
1827
|
+
|
|
1828
|
+
# Use markdown-specific element categories instead of base queries
|
|
1829
|
+
element_categories = self.get_element_categories()
|
|
1830
|
+
if query_key in element_categories:
|
|
1831
|
+
# Return a simple query string for the category
|
|
1832
|
+
node_types = element_categories[query_key]
|
|
1833
|
+
if node_types:
|
|
1834
|
+
# Create a basic query for the first node type
|
|
1835
|
+
return f"({node_types[0]}) @{query_key}"
|
|
1836
|
+
|
|
1837
|
+
# Fallback to base implementation
|
|
1838
|
+
queries = self.get_queries()
|
|
1839
|
+
return queries.get(query_key) if queries else None
|
|
1858
1840
|
|
|
1859
1841
|
def get_element_categories(self) -> dict[str, list[str]]:
|
|
1860
1842
|
"""Get Markdown element categories mapping query_key to node_types"""
|
|
@@ -1911,9 +1893,7 @@ class MarkdownPlugin(LanguagePlugin):
|
|
|
1911
1893
|
"html_block": ["html_block", "inline"],
|
|
1912
1894
|
"html": ["html_block", "inline"],
|
|
1913
1895
|
# Text formatting categories
|
|
1914
|
-
"emphasis": [
|
|
1915
|
-
"inline" # Contains emphasis elements
|
|
1916
|
-
],
|
|
1896
|
+
"emphasis": ["inline"], # Contains emphasis elements
|
|
1917
1897
|
"formatting": ["inline"],
|
|
1918
1898
|
"text_formatting": ["inline"],
|
|
1919
1899
|
"inline_code": ["inline"],
|