tree-sitter-analyzer 1.9.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. tree_sitter_analyzer/__init__.py +132 -0
  2. tree_sitter_analyzer/__main__.py +11 -0
  3. tree_sitter_analyzer/api.py +853 -0
  4. tree_sitter_analyzer/cli/__init__.py +39 -0
  5. tree_sitter_analyzer/cli/__main__.py +12 -0
  6. tree_sitter_analyzer/cli/argument_validator.py +89 -0
  7. tree_sitter_analyzer/cli/commands/__init__.py +26 -0
  8. tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
  9. tree_sitter_analyzer/cli/commands/base_command.py +181 -0
  10. tree_sitter_analyzer/cli/commands/default_command.py +18 -0
  11. tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
  12. tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
  13. tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
  14. tree_sitter_analyzer/cli/commands/query_command.py +109 -0
  15. tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
  16. tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
  17. tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
  18. tree_sitter_analyzer/cli/commands/table_command.py +414 -0
  19. tree_sitter_analyzer/cli/info_commands.py +124 -0
  20. tree_sitter_analyzer/cli_main.py +472 -0
  21. tree_sitter_analyzer/constants.py +85 -0
  22. tree_sitter_analyzer/core/__init__.py +15 -0
  23. tree_sitter_analyzer/core/analysis_engine.py +580 -0
  24. tree_sitter_analyzer/core/cache_service.py +333 -0
  25. tree_sitter_analyzer/core/engine.py +585 -0
  26. tree_sitter_analyzer/core/parser.py +293 -0
  27. tree_sitter_analyzer/core/query.py +605 -0
  28. tree_sitter_analyzer/core/query_filter.py +200 -0
  29. tree_sitter_analyzer/core/query_service.py +340 -0
  30. tree_sitter_analyzer/encoding_utils.py +530 -0
  31. tree_sitter_analyzer/exceptions.py +747 -0
  32. tree_sitter_analyzer/file_handler.py +246 -0
  33. tree_sitter_analyzer/formatters/__init__.py +1 -0
  34. tree_sitter_analyzer/formatters/base_formatter.py +201 -0
  35. tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
  36. tree_sitter_analyzer/formatters/formatter_config.py +197 -0
  37. tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
  38. tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
  39. tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
  40. tree_sitter_analyzer/formatters/go_formatter.py +368 -0
  41. tree_sitter_analyzer/formatters/html_formatter.py +498 -0
  42. tree_sitter_analyzer/formatters/java_formatter.py +423 -0
  43. tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
  44. tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
  45. tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
  46. tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
  47. tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
  48. tree_sitter_analyzer/formatters/php_formatter.py +301 -0
  49. tree_sitter_analyzer/formatters/python_formatter.py +830 -0
  50. tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
  51. tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
  52. tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
  53. tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
  54. tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
  55. tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
  56. tree_sitter_analyzer/interfaces/__init__.py +9 -0
  57. tree_sitter_analyzer/interfaces/cli.py +535 -0
  58. tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
  59. tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
  60. tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
  61. tree_sitter_analyzer/language_detector.py +553 -0
  62. tree_sitter_analyzer/language_loader.py +271 -0
  63. tree_sitter_analyzer/languages/__init__.py +10 -0
  64. tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
  65. tree_sitter_analyzer/languages/css_plugin.py +449 -0
  66. tree_sitter_analyzer/languages/go_plugin.py +836 -0
  67. tree_sitter_analyzer/languages/html_plugin.py +496 -0
  68. tree_sitter_analyzer/languages/java_plugin.py +1299 -0
  69. tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
  70. tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
  71. tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
  72. tree_sitter_analyzer/languages/php_plugin.py +862 -0
  73. tree_sitter_analyzer/languages/python_plugin.py +1636 -0
  74. tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
  75. tree_sitter_analyzer/languages/rust_plugin.py +673 -0
  76. tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
  77. tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
  78. tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
  79. tree_sitter_analyzer/legacy_table_formatter.py +860 -0
  80. tree_sitter_analyzer/mcp/__init__.py +34 -0
  81. tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
  82. tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
  83. tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
  84. tree_sitter_analyzer/mcp/server.py +869 -0
  85. tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
  86. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
  87. tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
  88. tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
  89. tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
  90. tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
  91. tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
  92. tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
  93. tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
  94. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
  95. tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
  96. tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
  97. tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
  98. tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
  99. tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
  100. tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
  101. tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
  102. tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
  103. tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
  104. tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
  105. tree_sitter_analyzer/models.py +840 -0
  106. tree_sitter_analyzer/mypy_current_errors.txt +2 -0
  107. tree_sitter_analyzer/output_manager.py +255 -0
  108. tree_sitter_analyzer/platform_compat/__init__.py +3 -0
  109. tree_sitter_analyzer/platform_compat/adapter.py +324 -0
  110. tree_sitter_analyzer/platform_compat/compare.py +224 -0
  111. tree_sitter_analyzer/platform_compat/detector.py +67 -0
  112. tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
  113. tree_sitter_analyzer/platform_compat/profiles.py +217 -0
  114. tree_sitter_analyzer/platform_compat/record.py +55 -0
  115. tree_sitter_analyzer/platform_compat/recorder.py +155 -0
  116. tree_sitter_analyzer/platform_compat/report.py +92 -0
  117. tree_sitter_analyzer/plugins/__init__.py +280 -0
  118. tree_sitter_analyzer/plugins/base.py +647 -0
  119. tree_sitter_analyzer/plugins/manager.py +384 -0
  120. tree_sitter_analyzer/project_detector.py +328 -0
  121. tree_sitter_analyzer/queries/__init__.py +27 -0
  122. tree_sitter_analyzer/queries/csharp.py +216 -0
  123. tree_sitter_analyzer/queries/css.py +615 -0
  124. tree_sitter_analyzer/queries/go.py +275 -0
  125. tree_sitter_analyzer/queries/html.py +543 -0
  126. tree_sitter_analyzer/queries/java.py +402 -0
  127. tree_sitter_analyzer/queries/javascript.py +724 -0
  128. tree_sitter_analyzer/queries/kotlin.py +192 -0
  129. tree_sitter_analyzer/queries/markdown.py +258 -0
  130. tree_sitter_analyzer/queries/php.py +95 -0
  131. tree_sitter_analyzer/queries/python.py +859 -0
  132. tree_sitter_analyzer/queries/ruby.py +92 -0
  133. tree_sitter_analyzer/queries/rust.py +223 -0
  134. tree_sitter_analyzer/queries/sql.py +555 -0
  135. tree_sitter_analyzer/queries/typescript.py +871 -0
  136. tree_sitter_analyzer/queries/yaml.py +236 -0
  137. tree_sitter_analyzer/query_loader.py +272 -0
  138. tree_sitter_analyzer/security/__init__.py +22 -0
  139. tree_sitter_analyzer/security/boundary_manager.py +277 -0
  140. tree_sitter_analyzer/security/regex_checker.py +297 -0
  141. tree_sitter_analyzer/security/validator.py +599 -0
  142. tree_sitter_analyzer/table_formatter.py +782 -0
  143. tree_sitter_analyzer/utils/__init__.py +53 -0
  144. tree_sitter_analyzer/utils/logging.py +433 -0
  145. tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
  146. tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
  147. tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
  148. tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
  149. tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
@@ -0,0 +1,1928 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Markdown Language Plugin
4
+
5
+ Enhanced Markdown-specific parsing and element extraction functionality.
6
+ Provides comprehensive support for Markdown elements including headers,
7
+ links, code blocks, lists, tables, and other structural elements.
8
+ """
9
+
10
+ from typing import TYPE_CHECKING, Any, Optional
11
+
12
+ if TYPE_CHECKING:
13
+ import tree_sitter
14
+
15
+ try:
16
+ import tree_sitter
17
+
18
+ TREE_SITTER_AVAILABLE = True
19
+ except ImportError:
20
+ TREE_SITTER_AVAILABLE = False
21
+
22
+ from ..core.analysis_engine import AnalysisRequest
23
+ from ..encoding_utils import extract_text_slice, safe_encode
24
+ from ..models import AnalysisResult, CodeElement
25
+ from ..models import Class as ModelClass
26
+ from ..models import Function as ModelFunction
27
+ from ..models import Import as ModelImport
28
+ from ..models import Variable as ModelVariable
29
+ from ..plugins.base import ElementExtractor, LanguagePlugin
30
+ from ..utils import log_debug, log_error
31
+ from ..utils.tree_sitter_compat import TreeSitterQueryCompat
32
+
33
+
34
class MarkdownElement(CodeElement):
    """Markdown-specific code element.

    Extends the generic CodeElement with the optional metadata that
    Markdown constructs carry: heading level, link/image URL and title,
    image alt text, fenced code-block info string, and task-list state.
    Fields that do not apply to a given element stay None.
    """

    def __init__(
        self,
        name: str,
        start_line: int,
        end_line: int,
        raw_text: str,
        language: str = "markdown",
        element_type: str = "markdown",
        level: int | None = None,
        url: str | None = None,
        alt_text: str | None = None,
        title: str | None = None,
        language_info: str | None = None,
        is_checked: bool | None = None,
        **kwargs: Any,
    ) -> None:
        # Generic fields are handled by CodeElement; extra kwargs pass through.
        super().__init__(
            name=name,
            start_line=start_line,
            end_line=end_line,
            raw_text=raw_text,
            language=language,
            **kwargs,
        )
        self.element_type = element_type
        self.level = level  # For headers (1-6)
        self.url = url  # For links and images
        self.alt_text = alt_text  # For images
        self.title = title  # For links and images
        self.language_info = language_info  # For code blocks
        self.is_checked = is_checked  # For task list items

        # Additional attributes used by formatters.
        # NOTE(review): these are populated by the extractor *after*
        # construction, not via __init__ arguments.
        self.text: str | None = None  # Text content
        self.type: str | None = None  # Element type for formatters
        self.line_count: int | None = None  # For code blocks
        self.alt: str | None = None  # Alternative text for images
        self.list_type: str | None = None  # For lists (ordered/unordered/task)
        self.item_count: int | None = None  # For lists
        self.row_count: int | None = None  # For tables
        self.column_count: int | None = None  # For tables
78
+
79
+
80
+ class MarkdownElementExtractor(ElementExtractor):
81
+ """Markdown-specific element extractor with comprehensive feature support"""
82
+
83
    def __init__(self) -> None:
        """Initialize the Markdown element extractor."""
        # Per-call analysis state: set by each extract_* entry point so the
        # private _extract_* helpers can reach the text being analyzed.
        self.current_file: str = ""
        self.source_code: str = ""
        self.content_lines: list[str] = []

        # Performance optimization caches (cleared via _reset_caches()).
        # Node text memoized by id(node) for the lifetime of one extract call.
        self._node_text_cache: dict[int, str] = {}
        self._processed_nodes: set[int] = set()
        self._element_cache: dict[tuple[int, str], Any] = {}
        # Encoding used when byte-slicing node text; None falls back to utf-8.
        self._file_encoding: str | None = None
94
+
95
+ def extract_functions(
96
+ self, tree: "tree_sitter.Tree", source_code: str
97
+ ) -> list[ModelFunction]:
98
+ """Extract Markdown elements (headers act as 'functions')"""
99
+ headers = self.extract_headers(tree, source_code)
100
+ functions = []
101
+ for header in headers:
102
+ func = ModelFunction(
103
+ name=header.name,
104
+ start_line=header.start_line,
105
+ end_line=header.end_line,
106
+ raw_text=header.raw_text,
107
+ language=header.language,
108
+ )
109
+ functions.append(func)
110
+ return functions
111
+
112
+ def extract_classes(
113
+ self, tree: "tree_sitter.Tree", source_code: str
114
+ ) -> list[ModelClass]:
115
+ """Extract Markdown sections (code blocks act as 'classes')"""
116
+ code_blocks = self.extract_code_blocks(tree, source_code)
117
+ classes = []
118
+ for block in code_blocks:
119
+ cls = ModelClass(
120
+ name=block.name,
121
+ start_line=block.start_line,
122
+ end_line=block.end_line,
123
+ raw_text=block.raw_text,
124
+ language=block.language,
125
+ )
126
+ classes.append(cls)
127
+ return classes
128
+
129
+ def extract_variables(
130
+ self, tree: "tree_sitter.Tree", source_code: str
131
+ ) -> list[ModelVariable]:
132
+ """Extract Markdown links and images (act as 'variables')"""
133
+ elements = []
134
+ elements.extend(self.extract_links(tree, source_code))
135
+ elements.extend(self.extract_images(tree, source_code))
136
+
137
+ variables = []
138
+ for element in elements:
139
+ var = ModelVariable(
140
+ name=element.name,
141
+ start_line=element.start_line,
142
+ end_line=element.end_line,
143
+ raw_text=element.raw_text,
144
+ language=element.language,
145
+ )
146
+ variables.append(var)
147
+ return variables
148
+
149
+ def extract_imports(
150
+ self, tree: "tree_sitter.Tree", source_code: str
151
+ ) -> list[ModelImport]:
152
+ """Extract Markdown references and definitions"""
153
+ references = self.extract_references(tree, source_code)
154
+ imports = []
155
+ for ref in references:
156
+ imp = ModelImport(
157
+ name=ref.name,
158
+ start_line=ref.start_line,
159
+ end_line=ref.end_line,
160
+ raw_text=ref.raw_text,
161
+ language=ref.language,
162
+ )
163
+ imports.append(imp)
164
+ return imports
165
+
166
+ def extract_headers(
167
+ self, tree: "tree_sitter.Tree", source_code: str
168
+ ) -> list[MarkdownElement]:
169
+ """Extract Markdown headers (H1-H6)"""
170
+ self.source_code = source_code or ""
171
+ self.content_lines = self.source_code.split("\n")
172
+ self._reset_caches()
173
+
174
+ headers: list[MarkdownElement] = []
175
+
176
+ if tree is not None and tree.root_node is not None:
177
+ try:
178
+ # Extract ATX headers (# ## ### etc.)
179
+ self._extract_atx_headers(tree.root_node, headers)
180
+ # Extract Setext headers (underlined)
181
+ self._extract_setext_headers(tree.root_node, headers)
182
+ except Exception as e:
183
+ log_debug(f"Error during header extraction: {e}")
184
+
185
+ log_debug(f"Extracted {len(headers)} Markdown headers")
186
+ return headers
187
+
188
+ def extract_code_blocks(
189
+ self, tree: "tree_sitter.Tree", source_code: str
190
+ ) -> list[MarkdownElement]:
191
+ """Extract Markdown code blocks"""
192
+ self.source_code = source_code or ""
193
+ self.content_lines = self.source_code.split("\n")
194
+ self._reset_caches()
195
+
196
+ code_blocks: list[MarkdownElement] = []
197
+
198
+ if tree is not None and tree.root_node is not None:
199
+ try:
200
+ self._extract_fenced_code_blocks(tree.root_node, code_blocks)
201
+ self._extract_indented_code_blocks(tree.root_node, code_blocks)
202
+ except Exception as e:
203
+ log_debug(f"Error during code block extraction: {e}")
204
+
205
+ log_debug(f"Extracted {len(code_blocks)} Markdown code blocks")
206
+ return code_blocks
207
+
208
+ def extract_links(
209
+ self, tree: "tree_sitter.Tree", source_code: str
210
+ ) -> list[MarkdownElement]:
211
+ """Extract Markdown links"""
212
+ self.source_code = source_code or ""
213
+ self.content_lines = self.source_code.split("\n")
214
+ self._reset_caches()
215
+
216
+ links: list[MarkdownElement] = []
217
+
218
+ if tree is not None and tree.root_node is not None:
219
+ try:
220
+ # Track extracted links to prevent global duplicates (ensure reset)
221
+ self._extracted_links = set()
222
+
223
+ self._extract_inline_links(tree.root_node, links)
224
+ self._extract_reference_links(tree.root_node, links)
225
+ self._extract_autolinks(tree.root_node, links)
226
+
227
+ # Clean up after extraction is complete
228
+ if hasattr(self, "_extracted_links"):
229
+ delattr(self, "_extracted_links")
230
+
231
+ except Exception as e:
232
+ log_debug(f"Error during link extraction: {e}")
233
+
234
+ # 重複除去: 同じtextとurlを持つ要素を除去
235
+ seen = set()
236
+ unique_links = []
237
+ for link in links:
238
+ key = (getattr(link, "text", "") or "", getattr(link, "url", "") or "")
239
+ if key not in seen:
240
+ seen.add(key)
241
+ unique_links.append(link)
242
+
243
+ links = unique_links
244
+
245
+ log_debug(f"Extracted {len(links)} Markdown links")
246
+ return links
247
+
248
+ def extract_images(
249
+ self, tree: "tree_sitter.Tree", source_code: str
250
+ ) -> list[MarkdownElement]:
251
+ """Extract Markdown images"""
252
+ self.source_code = source_code or ""
253
+ self.content_lines = self.source_code.split("\n")
254
+ self._reset_caches()
255
+
256
+ images: list[MarkdownElement] = []
257
+
258
+ if tree is not None and tree.root_node is not None:
259
+ try:
260
+ self._extract_inline_images(tree.root_node, images)
261
+ self._extract_reference_images(tree.root_node, images)
262
+ self._extract_image_reference_definitions(tree.root_node, images)
263
+ except Exception as e:
264
+ log_debug(f"Error during image extraction: {e}")
265
+
266
+ # 重複除去: 同じalt_textとurlを持つ要素を除去
267
+ seen = set()
268
+ unique_images = []
269
+ for img in images:
270
+ key = (img.alt_text or "", img.url or "")
271
+ if key not in seen:
272
+ seen.add(key)
273
+ unique_images.append(img)
274
+
275
+ images = unique_images
276
+
277
+ log_debug(f"Extracted {len(images)} Markdown images")
278
+ return images
279
+
280
+ def extract_references(
281
+ self, tree: "tree_sitter.Tree", source_code: str
282
+ ) -> list[MarkdownElement]:
283
+ """Extract Markdown reference definitions"""
284
+ self.source_code = source_code or ""
285
+ self.content_lines = self.source_code.split("\n")
286
+ self._reset_caches()
287
+
288
+ references: list[MarkdownElement] = []
289
+
290
+ if tree is not None and tree.root_node is not None:
291
+ try:
292
+ self._extract_link_reference_definitions(tree.root_node, references)
293
+ except Exception as e:
294
+ log_debug(f"Error during reference extraction: {e}")
295
+
296
+ log_debug(f"Extracted {len(references)} Markdown references")
297
+ return references
298
+
299
+ def extract_blockquotes(
300
+ self, tree: "tree_sitter.Tree", source_code: str
301
+ ) -> list[MarkdownElement]:
302
+ """Extract Markdown blockquotes"""
303
+ self.source_code = source_code or ""
304
+ self.content_lines = self.source_code.split("\n")
305
+ self._reset_caches()
306
+
307
+ blockquotes: list[MarkdownElement] = []
308
+
309
+ if tree is not None and tree.root_node is not None:
310
+ try:
311
+ self._extract_block_quotes(tree.root_node, blockquotes)
312
+ except Exception as e:
313
+ log_debug(f"Error during blockquote extraction: {e}")
314
+
315
+ log_debug(f"Extracted {len(blockquotes)} Markdown blockquotes")
316
+ return blockquotes
317
+
318
+ def extract_horizontal_rules(
319
+ self, tree: "tree_sitter.Tree", source_code: str
320
+ ) -> list[MarkdownElement]:
321
+ """Extract Markdown horizontal rules"""
322
+ self.source_code = source_code or ""
323
+ self.content_lines = self.source_code.split("\n")
324
+ self._reset_caches()
325
+
326
+ horizontal_rules: list[MarkdownElement] = []
327
+
328
+ if tree is not None and tree.root_node is not None:
329
+ try:
330
+ self._extract_thematic_breaks(tree.root_node, horizontal_rules)
331
+ except Exception as e:
332
+ log_debug(f"Error during horizontal rule extraction: {e}")
333
+
334
+ log_debug(f"Extracted {len(horizontal_rules)} Markdown horizontal rules")
335
+ return horizontal_rules
336
+
337
+ def extract_html_elements(
338
+ self, tree: "tree_sitter.Tree", source_code: str
339
+ ) -> list[MarkdownElement]:
340
+ """Extract HTML elements"""
341
+ self.source_code = source_code or ""
342
+ self.content_lines = self.source_code.split("\n")
343
+ self._reset_caches()
344
+
345
+ html_elements: list[MarkdownElement] = []
346
+
347
+ if tree is not None and tree.root_node is not None:
348
+ try:
349
+ self._extract_html_blocks(tree.root_node, html_elements)
350
+ self._extract_inline_html(tree.root_node, html_elements)
351
+ except Exception as e:
352
+ log_debug(f"Error during HTML element extraction: {e}")
353
+
354
+ log_debug(f"Extracted {len(html_elements)} HTML elements")
355
+ return html_elements
356
+
357
+ def extract_text_formatting(
358
+ self, tree: "tree_sitter.Tree", source_code: str
359
+ ) -> list[MarkdownElement]:
360
+ """Extract text formatting elements (bold, italic, strikethrough, inline code)"""
361
+ self.source_code = source_code or ""
362
+ self.content_lines = self.source_code.split("\n")
363
+ self._reset_caches()
364
+
365
+ formatting_elements: list[MarkdownElement] = []
366
+
367
+ if tree is not None and tree.root_node is not None:
368
+ try:
369
+ self._extract_emphasis_elements(tree.root_node, formatting_elements)
370
+ self._extract_inline_code_spans(tree.root_node, formatting_elements)
371
+ self._extract_strikethrough_elements(
372
+ tree.root_node, formatting_elements
373
+ )
374
+ except Exception as e:
375
+ log_debug(f"Error during text formatting extraction: {e}")
376
+
377
+ log_debug(f"Extracted {len(formatting_elements)} text formatting elements")
378
+ return formatting_elements
379
+
380
+ def extract_footnotes(
381
+ self, tree: "tree_sitter.Tree", source_code: str
382
+ ) -> list[MarkdownElement]:
383
+ """Extract footnotes"""
384
+ self.source_code = source_code or ""
385
+ self.content_lines = self.source_code.split("\n")
386
+ self._reset_caches()
387
+
388
+ footnotes: list[MarkdownElement] = []
389
+
390
+ if tree is not None and tree.root_node is not None:
391
+ try:
392
+ self._extract_footnote_elements(tree.root_node, footnotes)
393
+ except Exception as e:
394
+ log_debug(f"Error during footnote extraction: {e}")
395
+
396
+ log_debug(f"Extracted {len(footnotes)} footnotes")
397
+ return footnotes
398
+
399
+ def extract_lists(
400
+ self, tree: "tree_sitter.Tree", source_code: str
401
+ ) -> list[MarkdownElement]:
402
+ """Extract Markdown lists"""
403
+ self.source_code = source_code or ""
404
+ self.content_lines = self.source_code.split("\n")
405
+ self._reset_caches()
406
+
407
+ lists: list[MarkdownElement] = []
408
+
409
+ if tree is not None and tree.root_node is not None:
410
+ try:
411
+ self._extract_list_items(tree.root_node, lists)
412
+ except Exception as e:
413
+ log_debug(f"Error during list extraction: {e}")
414
+
415
+ log_debug(f"Extracted {len(lists)} Markdown list items")
416
+ return lists
417
+
418
+ def extract_tables(
419
+ self, tree: "tree_sitter.Tree", source_code: str
420
+ ) -> list[MarkdownElement]:
421
+ """Extract Markdown tables"""
422
+ self.source_code = source_code or ""
423
+ self.content_lines = self.source_code.split("\n")
424
+ self._reset_caches()
425
+
426
+ tables: list[MarkdownElement] = []
427
+
428
+ if tree is not None and tree.root_node is not None:
429
+ try:
430
+ self._extract_pipe_tables(tree.root_node, tables)
431
+ except Exception as e:
432
+ log_debug(f"Error during table extraction: {e}")
433
+
434
+ log_debug(f"Extracted {len(tables)} Markdown tables")
435
+ return tables
436
+
437
+ def _reset_caches(self) -> None:
438
+ """Reset performance caches"""
439
+ self._node_text_cache.clear()
440
+ self._processed_nodes.clear()
441
+ self._element_cache.clear()
442
+
443
+ def _get_node_text_optimized(self, node: "tree_sitter.Node") -> str:
444
+ """Get node text with optimized caching"""
445
+ node_id = id(node)
446
+
447
+ if node_id in self._node_text_cache:
448
+ return self._node_text_cache[node_id]
449
+
450
+ try:
451
+ start_byte = node.start_byte
452
+ end_byte = node.end_byte
453
+
454
+ encoding = self._file_encoding or "utf-8"
455
+ content_bytes = safe_encode("\n".join(self.content_lines), encoding)
456
+ text = extract_text_slice(content_bytes, start_byte, end_byte, encoding)
457
+
458
+ if text:
459
+ self._node_text_cache[node_id] = text
460
+ return text
461
+ except Exception as e:
462
+ log_error(f"Error in _get_node_text_optimized: {e}")
463
+
464
+ # Fallback to simple text extraction
465
+ try:
466
+ start_point = node.start_point
467
+ end_point = node.end_point
468
+
469
+ if start_point[0] < 0 or start_point[0] >= len(self.content_lines):
470
+ return ""
471
+
472
+ if end_point[0] < 0 or end_point[0] >= len(self.content_lines):
473
+ return ""
474
+
475
+ if start_point[0] == end_point[0]:
476
+ line = self.content_lines[start_point[0]]
477
+ start_col = max(0, min(start_point[1], len(line)))
478
+ end_col = max(start_col, min(end_point[1], len(line)))
479
+ result: str = line[start_col:end_col]
480
+ self._node_text_cache[node_id] = result
481
+ return result
482
+ else:
483
+ lines = []
484
+ for i in range(
485
+ start_point[0], min(end_point[0] + 1, len(self.content_lines))
486
+ ):
487
+ if i < len(self.content_lines):
488
+ line = self.content_lines[i]
489
+ if i == start_point[0] and i == end_point[0]:
490
+ # Single line case
491
+ start_col = max(0, min(start_point[1], len(line)))
492
+ end_col = max(start_col, min(end_point[1], len(line)))
493
+ lines.append(line[start_col:end_col])
494
+ elif i == start_point[0]:
495
+ start_col = max(0, min(start_point[1], len(line)))
496
+ lines.append(line[start_col:])
497
+ elif i == end_point[0]:
498
+ end_col = max(0, min(end_point[1], len(line)))
499
+ lines.append(line[:end_col])
500
+ else:
501
+ lines.append(line)
502
+ result = "\n".join(lines)
503
+ self._node_text_cache[node_id] = result
504
+ return result
505
+ except Exception as fallback_error:
506
+ log_error(f"Fallback text extraction also failed: {fallback_error}")
507
+ return ""
508
+
509
+ def _extract_atx_headers(
510
+ self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]
511
+ ) -> None:
512
+ """Extract ATX-style headers (# ## ### etc.)"""
513
+ for node in self._traverse_nodes(root_node):
514
+ if node.type == "atx_heading":
515
+ try:
516
+ start_line = node.start_point[0] + 1
517
+ end_line = node.end_point[0] + 1
518
+ raw_text = self._get_node_text_optimized(node)
519
+
520
+ # Extract header level and content
521
+ level = 1
522
+ content = raw_text.strip()
523
+
524
+ # Count # symbols to determine level
525
+ if content.startswith("#"):
526
+ level = len(content) - len(content.lstrip("#"))
527
+ content = content.lstrip("# ").rstrip()
528
+
529
+ header = MarkdownElement(
530
+ name=content or f"Header Level {level}",
531
+ start_line=start_line,
532
+ end_line=end_line,
533
+ raw_text=raw_text,
534
+ element_type="heading",
535
+ level=level,
536
+ )
537
+ # Add additional attributes for formatter
538
+ header.text = content or f"Header Level {level}"
539
+ header.type = "heading"
540
+ headers.append(header)
541
+ except Exception as e:
542
+ log_debug(f"Failed to extract ATX header: {e}")
543
+
544
+ def _extract_setext_headers(
545
+ self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]
546
+ ) -> None:
547
+ """Extract Setext-style headers (underlined)"""
548
+ for node in self._traverse_nodes(root_node):
549
+ if node.type == "setext_heading":
550
+ try:
551
+ start_line = node.start_point[0] + 1
552
+ end_line = node.end_point[0] + 1
553
+ raw_text = self._get_node_text_optimized(node)
554
+
555
+ # Determine level based on underline character
556
+ level = 2 # Default to H2
557
+ lines = raw_text.strip().split("\n")
558
+ if len(lines) >= 2:
559
+ underline = lines[1].strip()
560
+ if underline.startswith("="):
561
+ level = 1 # H1
562
+ elif underline.startswith("-"):
563
+ level = 2 # H2
564
+ content = lines[0].strip()
565
+ else:
566
+ content = raw_text.strip()
567
+
568
+ header = MarkdownElement(
569
+ name=content or f"Header Level {level}",
570
+ start_line=start_line,
571
+ end_line=end_line,
572
+ raw_text=raw_text,
573
+ element_type="heading",
574
+ level=level,
575
+ )
576
+ # Add additional attributes for formatter
577
+ header.text = content or f"Header Level {level}"
578
+ header.type = "heading"
579
+ headers.append(header)
580
+ except Exception as e:
581
+ log_debug(f"Failed to extract Setext header: {e}")
582
+
583
+ def _extract_fenced_code_blocks(
584
+ self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]
585
+ ) -> None:
586
+ """Extract fenced code blocks"""
587
+ for node in self._traverse_nodes(root_node):
588
+ if node.type == "fenced_code_block":
589
+ try:
590
+ start_line = node.start_point[0] + 1
591
+ end_line = node.end_point[0] + 1
592
+ raw_text = self._get_node_text_optimized(node)
593
+
594
+ # Extract language info
595
+ language_info = None
596
+ lines = raw_text.strip().split("\n")
597
+ if lines and lines[0].startswith("```"):
598
+ language_info = lines[0][3:].strip()
599
+
600
+ # Extract content (excluding fence markers)
601
+ content_lines = []
602
+ in_content = False
603
+ for line in lines:
604
+ if line.startswith("```"):
605
+ if not in_content:
606
+ in_content = True
607
+ continue
608
+ else:
609
+ break
610
+ if in_content:
611
+ content_lines.append(line)
612
+
613
+ name = f"Code Block ({language_info or 'unknown'})"
614
+
615
+ code_block = MarkdownElement(
616
+ name=name,
617
+ start_line=start_line,
618
+ end_line=end_line,
619
+ raw_text=raw_text,
620
+ element_type="code_block",
621
+ language_info=language_info,
622
+ )
623
+ # Add additional attributes for formatter
624
+ code_block.language = language_info or "text"
625
+ code_block.line_count = len(content_lines)
626
+ code_block.type = "code_block"
627
+ code_blocks.append(code_block)
628
+ except Exception as e:
629
+ log_debug(f"Failed to extract fenced code block: {e}")
630
+
631
+ def _extract_indented_code_blocks(
632
+ self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]
633
+ ) -> None:
634
+ """Extract indented code blocks"""
635
+ for node in self._traverse_nodes(root_node):
636
+ if node.type == "indented_code_block":
637
+ try:
638
+ start_line = node.start_point[0] + 1
639
+ end_line = node.end_point[0] + 1
640
+ raw_text = self._get_node_text_optimized(node)
641
+
642
+ code_block = MarkdownElement(
643
+ name="Indented Code Block",
644
+ start_line=start_line,
645
+ end_line=end_line,
646
+ raw_text=raw_text,
647
+ element_type="code_block",
648
+ language_info="indented",
649
+ )
650
+ # Add additional attributes for formatter
651
+ code_block.language = "text"
652
+ code_block.line_count = end_line - start_line + 1
653
+ code_block.type = "code_block"
654
+ code_blocks.append(code_block)
655
+ except Exception as e:
656
+ log_debug(f"Failed to extract indented code block: {e}")
657
+
658
+ def _extract_inline_links(
659
+ self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
660
+ ) -> None:
661
+ """Extract inline links"""
662
+ import re
663
+
664
+ # Extract links from text within inline nodes using regular expressions
665
+ for node in self._traverse_nodes(root_node):
666
+ if node.type == "inline":
667
+ try:
668
+ raw_text = self._get_node_text_optimized(node)
669
+ if not raw_text:
670
+ continue
671
+
672
+ # Inline link pattern: [text](url "title") (excluding images)
673
+ inline_pattern = r'(?<!\!)\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
674
+ matches = re.finditer(inline_pattern, raw_text)
675
+
676
+ for match in matches:
677
+ text = match.group(1) or ""
678
+ url = match.group(2) or ""
679
+ title = match.group(3) or ""
680
+
681
+ # Global duplicate check: process same text and URL combination only once
682
+ link_signature = f"{text}|{url}"
683
+ if (
684
+ hasattr(self, "_extracted_links")
685
+ and link_signature in self._extracted_links
686
+ ):
687
+ continue
688
+
689
+ if hasattr(self, "_extracted_links"):
690
+ self._extracted_links.add(link_signature)
691
+
692
+ start_line = node.start_point[0] + 1
693
+ end_line = node.end_point[0] + 1
694
+
695
+ link = MarkdownElement(
696
+ name=text or "Link",
697
+ start_line=start_line,
698
+ end_line=end_line,
699
+ raw_text=match.group(0),
700
+ element_type="link",
701
+ url=url,
702
+ title=title,
703
+ )
704
+ # Add additional attributes for formatter
705
+ link.text = text or "Link"
706
+ link.type = "link"
707
+ links.append(link)
708
+
709
+ except Exception as e:
710
+ log_debug(f"Failed to extract inline link: {e}")
711
+
712
+ def _extract_reference_links(
713
+ self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
714
+ ) -> None:
715
+ """Extract reference links"""
716
+ import re
717
+
718
+ # Reference links also need to be extracted from inline nodes
719
+ # Track already processed reference links to avoid duplicates
720
+ processed_ref_links = set()
721
+
722
+ for node in self._traverse_nodes(root_node):
723
+ if node.type == "inline":
724
+ try:
725
+ raw_text = self._get_node_text_optimized(node)
726
+ if not raw_text:
727
+ continue
728
+
729
+ # Reference link pattern: [text][ref]
730
+ ref_pattern = r"\[([^\]]*)\]\[([^\]]*)\]"
731
+ matches = re.finditer(ref_pattern, raw_text)
732
+
733
+ for match in matches:
734
+ text = match.group(1) or ""
735
+ ref = match.group(2) or ""
736
+
737
+ # Skip image references (starting with !)
738
+ if match.start() > 0 and raw_text[match.start() - 1] == "!":
739
+ continue
740
+
741
+ # Duplicate check: process same text and reference combination only once
742
+ start_line = node.start_point[0] + 1
743
+ ref_link_key = (text, ref, start_line)
744
+
745
+ if ref_link_key in processed_ref_links:
746
+ continue
747
+ processed_ref_links.add(ref_link_key)
748
+
749
+ end_line = node.end_point[0] + 1
750
+
751
+ link = MarkdownElement(
752
+ name=text or "Reference Link",
753
+ start_line=start_line,
754
+ end_line=end_line,
755
+ raw_text=match.group(0),
756
+ element_type="reference_link",
757
+ )
758
+ # Add additional attributes for formatter
759
+ link.text = text or "Reference Link"
760
+ link.type = "reference_link"
761
+ links.append(link)
762
+
763
+ except Exception as e:
764
+ log_debug(f"Failed to extract reference link: {e}")
765
+
766
+ def _extract_autolinks(
767
+ self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
768
+ ) -> None:
769
+ """Extract autolinks"""
770
+ import re
771
+
772
+ # Extract autolinks from text within inline nodes using regular expressions
773
+ for node in self._traverse_nodes(root_node):
774
+ if node.type == "inline":
775
+ try:
776
+ raw_text = self._get_node_text_optimized(node)
777
+ if not raw_text:
778
+ continue
779
+
780
+ # Autolink pattern: <url> or <email>
781
+ autolink_pattern = (
782
+ r"<(https?://[^>]+|mailto:[^>]+|[^@\s]+@[^@\s]+\.[^@\s]+)>"
783
+ )
784
+ matches = re.finditer(autolink_pattern, raw_text)
785
+
786
+ for match in matches:
787
+ url = match.group(1) or ""
788
+ full_match = match.group(0)
789
+
790
+ # Global duplicate check: process same URL for autolinks only once
791
+ autolink_signature = f"autolink|{url}"
792
+ if (
793
+ hasattr(self, "_extracted_links")
794
+ and autolink_signature in self._extracted_links
795
+ ):
796
+ continue
797
+
798
+ if hasattr(self, "_extracted_links"):
799
+ self._extracted_links.add(autolink_signature)
800
+
801
+ start_line = node.start_point[0] + 1
802
+ end_line = node.end_point[0] + 1
803
+
804
+ link = MarkdownElement(
805
+ name=url or "Autolink",
806
+ start_line=start_line,
807
+ end_line=end_line,
808
+ raw_text=full_match,
809
+ element_type="autolink",
810
+ url=url,
811
+ )
812
+ # Add additional attributes for formatter
813
+ link.text = url or "Autolink"
814
+ link.type = "autolink"
815
+ links.append(link)
816
+
817
+ except Exception as e:
818
+ log_debug(f"Failed to extract autolink: {e}")
819
+
820
+ def _extract_inline_images(
821
+ self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
822
+ ) -> None:
823
+ """Extract inline images"""
824
+ import re
825
+
826
+ # Extract images from text within inline nodes using regular expressions
827
+ for node in self._traverse_nodes(root_node):
828
+ if node.type == "inline":
829
+ try:
830
+ raw_text = self._get_node_text_optimized(node)
831
+ if not raw_text:
832
+ continue
833
+
834
+ # Inline image pattern: ![alt](url "title")
835
+ image_pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
836
+ matches = re.finditer(image_pattern, raw_text)
837
+
838
+ for match in matches:
839
+ alt_text = match.group(1) or ""
840
+ url = match.group(2) or ""
841
+ title = match.group(3) or ""
842
+
843
+ # Calculate line number from matched position
844
+ start_line = node.start_point[0] + 1
845
+ end_line = node.end_point[0] + 1
846
+
847
+ image = MarkdownElement(
848
+ name=alt_text or "Image",
849
+ start_line=start_line,
850
+ end_line=end_line,
851
+ raw_text=match.group(0),
852
+ element_type="image",
853
+ url=url,
854
+ alt_text=alt_text,
855
+ title=title,
856
+ )
857
+ # Add additional attributes for formatter
858
+ image.alt = alt_text or ""
859
+ image.type = "image"
860
+ images.append(image)
861
+
862
+ except Exception as e:
863
+ log_debug(f"Failed to extract inline image: {e}")
864
+
865
+ def _extract_reference_images(
866
+ self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
867
+ ) -> None:
868
+ """Extract reference images"""
869
+ import re
870
+
871
+ # Reference images also need to be extracted from inline nodes
872
+ for node in self._traverse_nodes(root_node):
873
+ if node.type == "inline":
874
+ try:
875
+ raw_text = self._get_node_text_optimized(node)
876
+ if not raw_text:
877
+ continue
878
+
879
+ # Reference image pattern: ![alt][ref]
880
+ ref_image_pattern = r"!\[([^\]]*)\]\[([^\]]*)\]"
881
+ matches = re.finditer(ref_image_pattern, raw_text)
882
+
883
+ for match in matches:
884
+ alt_text = match.group(1) or ""
885
+ start_line = node.start_point[0] + 1
886
+ end_line = node.end_point[0] + 1
887
+
888
+ image = MarkdownElement(
889
+ name=alt_text or "Reference Image",
890
+ start_line=start_line,
891
+ end_line=end_line,
892
+ raw_text=match.group(0),
893
+ element_type="reference_image",
894
+ )
895
+ # Add additional attributes for formatter
896
+ image.alt = alt_text or ""
897
+ image.type = "reference_image"
898
+ images.append(image)
899
+
900
+ except Exception as e:
901
+ log_debug(f"Failed to extract reference image: {e}")
902
+
903
+ def _extract_image_reference_definitions(
904
+ self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
905
+ ) -> None:
906
+ """Extract image reference definitions"""
907
+ import re
908
+
909
+ # Extract all reference definitions that could be used for images
910
+ # We check if the URL points to an image file or if it's used by an image reference
911
+ # First, collect all image references used in the document
912
+ image_refs_used = set()
913
+ for node in self._traverse_nodes(root_node):
914
+ if node.type == "inline":
915
+ try:
916
+ raw_text = self._get_node_text_optimized(node)
917
+ if not raw_text:
918
+ continue
919
+
920
+ # Find image references: ![alt][ref]
921
+ ref_image_pattern = r"!\[([^\]]*)\]\[([^\]]*)\]"
922
+ matches = re.finditer(ref_image_pattern, raw_text)
923
+
924
+ for match in matches:
925
+ ref = match.group(2) or ""
926
+ if ref:
927
+ image_refs_used.add(ref.lower())
928
+
929
+ except Exception as e:
930
+ log_debug(f"Failed to scan for image references: {e}")
931
+
932
+ # Now extract reference definitions that are used by images OR point to image files
933
+ for node in self._traverse_nodes(root_node):
934
+ if node.type == "link_reference_definition":
935
+ try:
936
+ start_line = node.start_point[0] + 1
937
+ end_line = node.end_point[0] + 1
938
+ raw_text = self._get_node_text_optimized(node)
939
+
940
+ # Pattern: [label]: url "title"
941
+ ref_pattern = r'^\[([^\]]+)\]:\s*([^\s]+)(?:\s+"([^"]*)")?'
942
+ ref_match: re.Match[str] | None = re.match(
943
+ ref_pattern, raw_text.strip()
944
+ )
945
+
946
+ if ref_match:
947
+ label = ref_match.group(1) or ""
948
+ url = ref_match.group(2) or ""
949
+ title = ref_match.group(3) or ""
950
+
951
+ # Include if this reference is used by an image OR if URL looks like an image
952
+ is_used_by_image = label.lower() in image_refs_used
953
+ is_image_url = any(
954
+ url.lower().endswith(ext)
955
+ for ext in [
956
+ ".png",
957
+ ".jpg",
958
+ ".jpeg",
959
+ ".gif",
960
+ ".svg",
961
+ ".webp",
962
+ ".bmp",
963
+ ]
964
+ )
965
+
966
+ if is_used_by_image or is_image_url:
967
+ image_ref = MarkdownElement(
968
+ name=f"Image Reference Definition: {label}",
969
+ start_line=start_line,
970
+ end_line=end_line,
971
+ raw_text=raw_text,
972
+ element_type="image_reference_definition",
973
+ url=url,
974
+ alt_text=label,
975
+ title=title,
976
+ )
977
+ # Add additional attributes for formatter
978
+ image_ref.alt = label
979
+ image_ref.type = "image_reference_definition"
980
+ images.append(image_ref)
981
+
982
+ except Exception as e:
983
+ log_debug(f"Failed to extract image reference definition: {e}")
984
+
985
+ def _extract_link_reference_definitions(
986
+ self, root_node: "tree_sitter.Node", references: list[MarkdownElement]
987
+ ) -> None:
988
+ """Extract link reference definitions"""
989
+ for node in self._traverse_nodes(root_node):
990
+ if node.type == "link_reference_definition":
991
+ try:
992
+ start_line = node.start_point[0] + 1
993
+ end_line = node.end_point[0] + 1
994
+ raw_text = self._get_node_text_optimized(node)
995
+
996
+ reference = MarkdownElement(
997
+ name=raw_text or "Reference Definition",
998
+ start_line=start_line,
999
+ end_line=end_line,
1000
+ raw_text=raw_text,
1001
+ element_type="reference_definition",
1002
+ )
1003
+ references.append(reference)
1004
+ except Exception as e:
1005
+ log_debug(f"Failed to extract reference definition: {e}")
1006
+
1007
+ def _extract_list_items(
1008
+ self, root_node: "tree_sitter.Node", lists: list[MarkdownElement]
1009
+ ) -> None:
1010
+ """Extract lists (not individual items)"""
1011
+ for node in self._traverse_nodes(root_node):
1012
+ if node.type == "list":
1013
+ try:
1014
+ start_line = node.start_point[0] + 1
1015
+ end_line = node.end_point[0] + 1
1016
+ raw_text = self._get_node_text_optimized(node)
1017
+
1018
+ # Count list items in this list
1019
+ item_count = 0
1020
+ is_task_list = False
1021
+ is_ordered = False
1022
+
1023
+ for child in node.children:
1024
+ if child.type == "list_item":
1025
+ item_count += 1
1026
+ item_text = self._get_node_text_optimized(child)
1027
+
1028
+ # Check if it's a task list item
1029
+ if (
1030
+ "[ ]" in item_text
1031
+ or "[x]" in item_text
1032
+ or "[X]" in item_text
1033
+ ):
1034
+ is_task_list = True
1035
+
1036
+ # Check if it's an ordered list (starts with number)
1037
+ if item_text.strip() and item_text.strip()[0].isdigit():
1038
+ is_ordered = True
1039
+
1040
+ # Determine list type
1041
+ if is_task_list:
1042
+ list_type = "task"
1043
+ element_type = "task_list"
1044
+ elif is_ordered:
1045
+ list_type = "ordered"
1046
+ element_type = "list"
1047
+ else:
1048
+ list_type = "unordered"
1049
+ element_type = "list"
1050
+
1051
+ name = f"{list_type.title()} List ({item_count} items)"
1052
+
1053
+ list_element = MarkdownElement(
1054
+ name=name,
1055
+ start_line=start_line,
1056
+ end_line=end_line,
1057
+ raw_text=raw_text,
1058
+ element_type=element_type,
1059
+ )
1060
+ # Add additional attributes for formatter
1061
+ list_element.list_type = list_type
1062
+ list_element.item_count = item_count
1063
+ list_element.type = list_type
1064
+ lists.append(list_element)
1065
+ except Exception as e:
1066
+ log_debug(f"Failed to extract list: {e}")
1067
+
1068
+ def _extract_pipe_tables(
1069
+ self, root_node: "tree_sitter.Node", tables: list[MarkdownElement]
1070
+ ) -> None:
1071
+ """Extract pipe tables"""
1072
+ for node in self._traverse_nodes(root_node):
1073
+ if node.type == "pipe_table":
1074
+ try:
1075
+ start_line = node.start_point[0] + 1
1076
+ end_line = node.end_point[0] + 1
1077
+ raw_text = self._get_node_text_optimized(node)
1078
+
1079
+ # Count rows and columns
1080
+ lines = raw_text.strip().split("\n")
1081
+ row_count = len(
1082
+ [
1083
+ line
1084
+ for line in lines
1085
+ if line.strip() and not line.strip().startswith("|---")
1086
+ ]
1087
+ )
1088
+
1089
+ # Count columns from first row
1090
+ column_count = 0
1091
+ if lines:
1092
+ first_row = lines[0]
1093
+ column_count = len(
1094
+ [col for col in first_row.split("|") if col.strip()]
1095
+ )
1096
+
1097
+ table = MarkdownElement(
1098
+ name=f"Table ({row_count} rows, {column_count} columns)",
1099
+ start_line=start_line,
1100
+ end_line=end_line,
1101
+ raw_text=raw_text,
1102
+ element_type="table",
1103
+ )
1104
+ # Add additional attributes for formatter
1105
+ table.row_count = row_count
1106
+ table.column_count = column_count
1107
+ table.type = "table"
1108
+ tables.append(table)
1109
+ except Exception as e:
1110
+ log_debug(f"Failed to extract pipe table: {e}")
1111
+
1112
+ def _extract_block_quotes(
1113
+ self, root_node: "tree_sitter.Node", blockquotes: list[MarkdownElement]
1114
+ ) -> None:
1115
+ """Extract blockquotes"""
1116
+ import re
1117
+
1118
+ # Blockquotes are often represented as paragraphs starting with >
1119
+ for node in self._traverse_nodes(root_node):
1120
+ if node.type == "block_quote":
1121
+ try:
1122
+ start_line = node.start_point[0] + 1
1123
+ end_line = node.end_point[0] + 1
1124
+ raw_text = self._get_node_text_optimized(node)
1125
+
1126
+ # Extract content without > markers
1127
+ lines = raw_text.strip().split("\n")
1128
+ content_lines = []
1129
+ for line in lines:
1130
+ # Remove > marker and optional space
1131
+ cleaned = re.sub(r"^>\s?", "", line)
1132
+ content_lines.append(cleaned)
1133
+ content = "\n".join(content_lines).strip()
1134
+
1135
+ blockquote = MarkdownElement(
1136
+ name=(
1137
+ f"Blockquote: {content[:50]}..."
1138
+ if len(content) > 50
1139
+ else f"Blockquote: {content}"
1140
+ ),
1141
+ start_line=start_line,
1142
+ end_line=end_line,
1143
+ raw_text=raw_text,
1144
+ element_type="blockquote",
1145
+ )
1146
+ blockquote.type = "blockquote"
1147
+ blockquote.text = content
1148
+ blockquotes.append(blockquote)
1149
+ except Exception as e:
1150
+ log_debug(f"Failed to extract blockquote: {e}")
1151
+
1152
+ def _extract_thematic_breaks(
1153
+ self, root_node: "tree_sitter.Node", horizontal_rules: list[MarkdownElement]
1154
+ ) -> None:
1155
+ """Extract thematic breaks (horizontal rules)"""
1156
+ for node in self._traverse_nodes(root_node):
1157
+ if node.type == "thematic_break":
1158
+ try:
1159
+ start_line = node.start_point[0] + 1
1160
+ end_line = node.end_point[0] + 1
1161
+ raw_text = self._get_node_text_optimized(node)
1162
+
1163
+ hr = MarkdownElement(
1164
+ name="Horizontal Rule",
1165
+ start_line=start_line,
1166
+ end_line=end_line,
1167
+ raw_text=raw_text,
1168
+ element_type="horizontal_rule",
1169
+ )
1170
+ hr.type = "horizontal_rule"
1171
+ horizontal_rules.append(hr)
1172
+ except Exception as e:
1173
+ log_debug(f"Failed to extract horizontal rule: {e}")
1174
+
1175
+ def _extract_html_blocks(
1176
+ self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]
1177
+ ) -> None:
1178
+ """Extract HTML block elements"""
1179
+ for node in self._traverse_nodes(root_node):
1180
+ if node.type == "html_block":
1181
+ try:
1182
+ start_line = node.start_point[0] + 1
1183
+ end_line = node.end_point[0] + 1
1184
+ raw_text = self._get_node_text_optimized(node)
1185
+
1186
+ # Extract tag name if possible
1187
+ import re
1188
+
1189
+ tag_match = re.search(r"<(\w+)", raw_text)
1190
+ tag_name = tag_match.group(1) if tag_match else "HTML"
1191
+
1192
+ html_element = MarkdownElement(
1193
+ name=f"HTML Block: {tag_name}",
1194
+ start_line=start_line,
1195
+ end_line=end_line,
1196
+ raw_text=raw_text,
1197
+ element_type="html_block",
1198
+ )
1199
+ html_element.type = "html_block"
1200
+ html_elements.append(html_element)
1201
+ except Exception as e:
1202
+ log_debug(f"Failed to extract HTML block: {e}")
1203
+
1204
+ def _extract_inline_html(
1205
+ self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]
1206
+ ) -> None:
1207
+ """Extract inline HTML elements"""
1208
+ import re
1209
+
1210
+ # Look for HTML tags in inline content
1211
+ for node in self._traverse_nodes(root_node):
1212
+ if node.type == "inline":
1213
+ try:
1214
+ raw_text = self._get_node_text_optimized(node)
1215
+ if not raw_text:
1216
+ continue
1217
+
1218
+ # Pattern for HTML tags (excluding autolinks)
1219
+ # Exclude autolink patterns: <url> or <email>
1220
+ html_pattern = r"<(?!(?:https?://|mailto:|[^@\s]+@[^@\s]+\.[^@\s]+)[^>]*>)[^>]+>"
1221
+ matches = re.finditer(html_pattern, raw_text)
1222
+
1223
+ for match in matches:
1224
+ tag_text = match.group(0)
1225
+
1226
+ # Extract tag name
1227
+ tag_match = re.search(r"<(\w+)", tag_text)
1228
+ tag_name = tag_match.group(1) if tag_match else "HTML"
1229
+
1230
+ start_line = node.start_point[0] + 1
1231
+ end_line = node.end_point[0] + 1
1232
+
1233
+ html_element = MarkdownElement(
1234
+ name=f"HTML Tag: {tag_name}",
1235
+ start_line=start_line,
1236
+ end_line=end_line,
1237
+ raw_text=tag_text,
1238
+ element_type="html_inline",
1239
+ )
1240
+ html_element.type = "html_inline"
1241
+ html_element.name = tag_name # Set name attribute for formatter
1242
+ html_elements.append(html_element)
1243
+
1244
+ except Exception as e:
1245
+ log_debug(f"Failed to extract inline HTML: {e}")
1246
+
1247
+ def _extract_emphasis_elements(
1248
+ self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
1249
+ ) -> None:
1250
+ """Extract emphasis and strong emphasis elements"""
1251
+ import re
1252
+
1253
+ for node in self._traverse_nodes(root_node):
1254
+ if node.type == "inline":
1255
+ try:
1256
+ raw_text = self._get_node_text_optimized(node)
1257
+ if not raw_text:
1258
+ continue
1259
+
1260
+ # Pattern for bold text: **text** or __text__
1261
+ bold_pattern = r"\*\*([^*]+)\*\*|__([^_]+)__"
1262
+ bold_matches = re.finditer(bold_pattern, raw_text)
1263
+
1264
+ for match in bold_matches:
1265
+ content = match.group(1) or match.group(2) or ""
1266
+ start_line = node.start_point[0] + 1
1267
+ end_line = node.end_point[0] + 1
1268
+
1269
+ bold_element = MarkdownElement(
1270
+ name=f"Bold: {content}",
1271
+ start_line=start_line,
1272
+ end_line=end_line,
1273
+ raw_text=match.group(0),
1274
+ element_type="strong_emphasis",
1275
+ )
1276
+ bold_element.type = "strong_emphasis"
1277
+ bold_element.text = content
1278
+ formatting_elements.append(bold_element)
1279
+
1280
+ # Pattern for italic text: *text* or _text_ (but not **text** or __text__)
1281
+ italic_pattern = r"(?<!\*)\*([^*]+)\*(?!\*)|(?<!_)_([^_]+)_(?!_)"
1282
+ italic_matches = re.finditer(italic_pattern, raw_text)
1283
+
1284
+ for match in italic_matches:
1285
+ content = match.group(1) or match.group(2) or ""
1286
+ start_line = node.start_point[0] + 1
1287
+ end_line = node.end_point[0] + 1
1288
+
1289
+ italic_element = MarkdownElement(
1290
+ name=f"Italic: {content}",
1291
+ start_line=start_line,
1292
+ end_line=end_line,
1293
+ raw_text=match.group(0),
1294
+ element_type="emphasis",
1295
+ )
1296
+ italic_element.type = "emphasis"
1297
+ italic_element.text = content
1298
+ formatting_elements.append(italic_element)
1299
+
1300
+ except Exception as e:
1301
+ log_debug(f"Failed to extract emphasis elements: {e}")
1302
+
1303
+ def _extract_inline_code_spans(
1304
+ self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
1305
+ ) -> None:
1306
+ """Extract inline code spans"""
1307
+ import re
1308
+
1309
+ for node in self._traverse_nodes(root_node):
1310
+ if node.type == "inline":
1311
+ try:
1312
+ raw_text = self._get_node_text_optimized(node)
1313
+ if not raw_text:
1314
+ continue
1315
+
1316
+ # Pattern for inline code: `code`
1317
+ code_pattern = r"`([^`]+)`"
1318
+ matches = re.finditer(code_pattern, raw_text)
1319
+
1320
+ for match in matches:
1321
+ content = match.group(1) or ""
1322
+ start_line = node.start_point[0] + 1
1323
+ end_line = node.end_point[0] + 1
1324
+
1325
+ code_element = MarkdownElement(
1326
+ name=f"Inline Code: {content}",
1327
+ start_line=start_line,
1328
+ end_line=end_line,
1329
+ raw_text=match.group(0),
1330
+ element_type="inline_code",
1331
+ )
1332
+ code_element.type = "inline_code"
1333
+ code_element.text = content
1334
+ formatting_elements.append(code_element)
1335
+
1336
+ except Exception as e:
1337
+ log_debug(f"Failed to extract inline code: {e}")
1338
+
1339
+ def _extract_strikethrough_elements(
1340
+ self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
1341
+ ) -> None:
1342
+ """Extract strikethrough elements"""
1343
+ import re
1344
+
1345
+ for node in self._traverse_nodes(root_node):
1346
+ if node.type == "inline":
1347
+ try:
1348
+ raw_text = self._get_node_text_optimized(node)
1349
+ if not raw_text:
1350
+ continue
1351
+
1352
+ # Pattern for strikethrough: ~~text~~
1353
+ strike_pattern = r"~~([^~]+)~~"
1354
+ matches = re.finditer(strike_pattern, raw_text)
1355
+
1356
+ for match in matches:
1357
+ content = match.group(1) or ""
1358
+ start_line = node.start_point[0] + 1
1359
+ end_line = node.end_point[0] + 1
1360
+
1361
+ strike_element = MarkdownElement(
1362
+ name=f"Strikethrough: {content}",
1363
+ start_line=start_line,
1364
+ end_line=end_line,
1365
+ raw_text=match.group(0),
1366
+ element_type="strikethrough",
1367
+ )
1368
+ strike_element.type = "strikethrough"
1369
+ strike_element.text = content
1370
+ formatting_elements.append(strike_element)
1371
+
1372
+ except Exception as e:
1373
+ log_debug(f"Failed to extract strikethrough: {e}")
1374
+
1375
+ def _extract_footnote_elements(
1376
+ self, root_node: "tree_sitter.Node", footnotes: list[MarkdownElement]
1377
+ ) -> None:
1378
+ """Extract footnote elements"""
1379
+ import re
1380
+
1381
+ for node in self._traverse_nodes(root_node):
1382
+ if node.type == "inline":
1383
+ try:
1384
+ raw_text = self._get_node_text_optimized(node)
1385
+ if not raw_text:
1386
+ continue
1387
+
1388
+ # Pattern for footnote references: [^1]
1389
+ footnote_ref_pattern = r"\[\^([^\]]+)\]"
1390
+ matches = re.finditer(footnote_ref_pattern, raw_text)
1391
+
1392
+ for match in matches:
1393
+ ref_id = match.group(1) or ""
1394
+ start_line = node.start_point[0] + 1
1395
+ end_line = node.end_point[0] + 1
1396
+
1397
+ footnote_element = MarkdownElement(
1398
+ name=f"Footnote Reference: {ref_id}",
1399
+ start_line=start_line,
1400
+ end_line=end_line,
1401
+ raw_text=match.group(0),
1402
+ element_type="footnote_reference",
1403
+ )
1404
+ footnote_element.type = "footnote_reference"
1405
+ footnote_element.text = ref_id
1406
+ footnotes.append(footnote_element)
1407
+
1408
+ except Exception as e:
1409
+ log_debug(f"Failed to extract footnote reference: {e}")
1410
+
1411
+ # Look for footnote definitions
1412
+ elif node.type == "paragraph":
1413
+ try:
1414
+ raw_text = self._get_node_text_optimized(node)
1415
+ if not raw_text:
1416
+ continue
1417
+
1418
+ # Pattern for footnote definitions: [^1]: content
1419
+ footnote_def_pattern = r"^\[\^([^\]]+)\]:\s*(.+)$"
1420
+ footnote_match: re.Match[str] | None = re.match(
1421
+ footnote_def_pattern, raw_text.strip(), re.MULTILINE
1422
+ )
1423
+
1424
+ if footnote_match:
1425
+ ref_id = footnote_match.group(1) or ""
1426
+ content = footnote_match.group(2) or ""
1427
+ start_line = node.start_point[0] + 1
1428
+ end_line = node.end_point[0] + 1
1429
+
1430
+ footnote_element = MarkdownElement(
1431
+ name=f"Footnote Definition: {ref_id}",
1432
+ start_line=start_line,
1433
+ end_line=end_line,
1434
+ raw_text=raw_text,
1435
+ element_type="footnote_definition",
1436
+ )
1437
+ footnote_element.type = "footnote_definition"
1438
+ footnote_element.text = content
1439
+ footnotes.append(footnote_element)
1440
+
1441
+ except Exception as e:
1442
+ log_debug(f"Failed to extract footnote definition: {e}")
1443
+
1444
+ def _traverse_nodes(self, node: "tree_sitter.Node") -> Any:
1445
+ """Traverse all nodes in the tree"""
1446
+ yield node
1447
+ for child in node.children:
1448
+ yield from self._traverse_nodes(child)
1449
+
1450
+ def _parse_link_components(self, raw_text: str) -> tuple[str, str, str]:
1451
+ """Parse link components from raw text"""
1452
+ import re
1453
+
1454
+ # Pattern for [text](url "title")
1455
+ pattern = r'\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
1456
+ match = re.search(pattern, raw_text)
1457
+
1458
+ if match:
1459
+ text = match.group(1) or ""
1460
+ url = match.group(2) or ""
1461
+ title = match.group(3) or ""
1462
+ return text, url, title
1463
+
1464
+ return "", "", ""
1465
+
1466
+ def _parse_image_components(self, raw_text: str) -> tuple[str, str, str]:
1467
+ """Parse image components from raw text"""
1468
+ import re
1469
+
1470
+ # Pattern for ![alt](url "title")
1471
+ pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
1472
+ match = re.search(pattern, raw_text)
1473
+
1474
+ if match:
1475
+ alt_text = match.group(1) or ""
1476
+ url = match.group(2) or ""
1477
+ title = match.group(3) or ""
1478
+ return alt_text, url, title
1479
+
1480
+ return "", "", ""
1481
+
1482
+
1483
+ class MarkdownPlugin(LanguagePlugin):
1484
+ """Markdown language plugin for the tree-sitter analyzer"""
1485
+
1486
+ def __init__(self) -> None:
1487
+ """Initialize the Markdown plugin"""
1488
+ super().__init__()
1489
+ self._language_cache: tree_sitter.Language | None = None
1490
+ self._extractor: MarkdownElementExtractor = MarkdownElementExtractor()
1491
+
1492
+ # Legacy compatibility attributes for tests
1493
+ self.language = "markdown"
1494
+ self.extractor = self._extractor
1495
+
1496
+ def get_language_name(self) -> str:
1497
+ """Return the name of the programming language this plugin supports"""
1498
+ return "markdown"
1499
+
1500
+ def get_file_extensions(self) -> list[str]:
1501
+ """Return list of file extensions this plugin supports"""
1502
+ return [".md", ".markdown", ".mdown", ".mkd", ".mkdn", ".mdx"]
1503
+
1504
+ def create_extractor(self) -> ElementExtractor:
1505
+ """Create and return an element extractor for this language"""
1506
+ return MarkdownElementExtractor()
1507
+
1508
+ def get_extractor(self) -> ElementExtractor:
1509
+ """Get the cached extractor instance, creating it if necessary"""
1510
+ return self._extractor
1511
+
1512
+ def get_language(self) -> str:
1513
+ """Get the language name for Markdown (legacy compatibility)"""
1514
+ return "markdown"
1515
+
1516
+ def extract_functions(
1517
+ self, tree: "tree_sitter.Tree", source_code: str
1518
+ ) -> list[CodeElement]:
1519
+ """Extract functions from the tree (legacy compatibility)"""
1520
+ extractor = self.get_extractor()
1521
+ functions = extractor.extract_functions(tree, source_code)
1522
+ return [
1523
+ CodeElement(
1524
+ name=f.name,
1525
+ start_line=f.start_line,
1526
+ end_line=f.end_line,
1527
+ raw_text=f.raw_text,
1528
+ language=f.language,
1529
+ )
1530
+ for f in functions
1531
+ ]
1532
+
1533
+ def extract_classes(
1534
+ self, tree: "tree_sitter.Tree", source_code: str
1535
+ ) -> list[CodeElement]:
1536
+ """Extract classes from the tree (legacy compatibility)"""
1537
+ extractor = self.get_extractor()
1538
+ classes = extractor.extract_classes(tree, source_code)
1539
+ return [
1540
+ CodeElement(
1541
+ name=c.name,
1542
+ start_line=c.start_line,
1543
+ end_line=c.end_line,
1544
+ raw_text=c.raw_text,
1545
+ language=c.language,
1546
+ )
1547
+ for c in classes
1548
+ ]
1549
+
1550
+ def extract_variables(
1551
+ self, tree: "tree_sitter.Tree", source_code: str
1552
+ ) -> list[CodeElement]:
1553
+ """Extract variables from the tree (legacy compatibility)"""
1554
+ extractor = self.get_extractor()
1555
+ variables = extractor.extract_variables(tree, source_code)
1556
+ return [
1557
+ CodeElement(
1558
+ name=v.name,
1559
+ start_line=v.start_line,
1560
+ end_line=v.end_line,
1561
+ raw_text=v.raw_text,
1562
+ language=v.language,
1563
+ )
1564
+ for v in variables
1565
+ ]
1566
+
1567
+ def extract_imports(
1568
+ self, tree: "tree_sitter.Tree", source_code: str
1569
+ ) -> list[CodeElement]:
1570
+ """Extract imports from the tree (legacy compatibility)"""
1571
+ extractor = self.get_extractor()
1572
+ imports = extractor.extract_imports(tree, source_code)
1573
+ return [
1574
+ CodeElement(
1575
+ name=i.name,
1576
+ start_line=i.start_line,
1577
+ end_line=i.end_line,
1578
+ raw_text=i.raw_text,
1579
+ language=i.language,
1580
+ )
1581
+ for i in imports
1582
+ ]
1583
+
1584
+ def get_tree_sitter_language(self) -> Optional["tree_sitter.Language"]:
1585
+ """Get the Tree-sitter language object for Markdown"""
1586
+ if self._language_cache is None:
1587
+ try:
1588
+ import tree_sitter
1589
+ import tree_sitter_markdown as tsmarkdown
1590
+
1591
+ # Use modern tree-sitter-markdown API
1592
+ language_capsule = tsmarkdown.language()
1593
+ self._language_cache = tree_sitter.Language(language_capsule)
1594
+ except ImportError:
1595
+ log_error("tree-sitter-markdown not available")
1596
+ return None
1597
+ except Exception as e:
1598
+ log_error(f"Failed to load Markdown language: {e}")
1599
+ return None
1600
+ return self._language_cache
1601
+
1602
+ def get_supported_queries(self) -> list[str]:
1603
+ """Get list of supported query names for this language"""
1604
+ return [
1605
+ "headers",
1606
+ "code_blocks",
1607
+ "links",
1608
+ "images",
1609
+ "lists",
1610
+ "tables",
1611
+ "blockquotes",
1612
+ "emphasis",
1613
+ "inline_code",
1614
+ "references",
1615
+ "task_lists",
1616
+ "horizontal_rules",
1617
+ "html_blocks",
1618
+ "strikethrough",
1619
+ "footnotes",
1620
+ "text_content",
1621
+ "all_elements",
1622
+ ]
1623
+
1624
+ def is_applicable(self, file_path: str) -> bool:
1625
+ """Check if this plugin is applicable for the given file"""
1626
+ return any(
1627
+ file_path.lower().endswith(ext.lower())
1628
+ for ext in self.get_file_extensions()
1629
+ )
1630
+
1631
+ def get_plugin_info(self) -> dict:
1632
+ """Get information about this plugin"""
1633
+ return {
1634
+ "name": "Markdown Plugin",
1635
+ "language": self.get_language_name(),
1636
+ "extensions": self.get_file_extensions(),
1637
+ "version": "1.0.0",
1638
+ "supported_queries": self.get_supported_queries(),
1639
+ "features": [
1640
+ "ATX headers (# ## ###)",
1641
+ "Setext headers (underlined)",
1642
+ "Fenced code blocks",
1643
+ "Indented code blocks",
1644
+ "Inline code spans",
1645
+ "Inline links",
1646
+ "Reference links",
1647
+ "Autolinks",
1648
+ "Email autolinks",
1649
+ "Images (inline and reference)",
1650
+ "Lists (ordered and unordered)",
1651
+ "Task lists (checkboxes)",
1652
+ "Blockquotes",
1653
+ "Tables",
1654
+ "Emphasis and strong emphasis",
1655
+ "Strikethrough text",
1656
+ "Horizontal rules",
1657
+ "HTML blocks and inline HTML",
1658
+ "Footnotes (references and definitions)",
1659
+ "Reference definitions",
1660
+ "Text formatting extraction",
1661
+ "CommonMark compliance",
1662
+ ],
1663
+ }
1664
+
1665
+ async def analyze_file(
1666
+ self, file_path: str, request: AnalysisRequest
1667
+ ) -> AnalysisResult:
1668
+ """Analyze a Markdown file and return the analysis results."""
1669
+ if not TREE_SITTER_AVAILABLE:
1670
+ return AnalysisResult(
1671
+ file_path=file_path,
1672
+ language=self.get_language_name(),
1673
+ success=False,
1674
+ error_message="Tree-sitter library not available.",
1675
+ )
1676
+
1677
+ language = self.get_tree_sitter_language()
1678
+ if not language:
1679
+ return AnalysisResult(
1680
+ file_path=file_path,
1681
+ language=self.get_language_name(),
1682
+ success=False,
1683
+ error_message="Could not load Markdown language for parsing.",
1684
+ )
1685
+
1686
+ try:
1687
+ from ..encoding_utils import read_file_safe
1688
+
1689
+ source_code, _ = read_file_safe(file_path)
1690
+
1691
+ parser = tree_sitter.Parser()
1692
+ parser.language = language
1693
+ tree = parser.parse(source_code.encode("utf-8"))
1694
+
1695
+ extractor = self.create_extractor()
1696
+ extractor.current_file = file_path # Set current file for context
1697
+
1698
+ elements: list[CodeElement] = []
1699
+
1700
+ # Extract all element types using the markdown-specific extractor
1701
+ if isinstance(extractor, MarkdownElementExtractor):
1702
+ headers = extractor.extract_headers(tree, source_code)
1703
+ code_blocks = extractor.extract_code_blocks(tree, source_code)
1704
+ links = extractor.extract_links(tree, source_code)
1705
+ images = extractor.extract_images(tree, source_code)
1706
+ references = extractor.extract_references(tree, source_code)
1707
+ lists = extractor.extract_lists(tree, source_code)
1708
+ tables = extractor.extract_tables(tree, source_code)
1709
+
1710
+ # Extract new element types
1711
+ blockquotes = extractor.extract_blockquotes(tree, source_code)
1712
+ horizontal_rules = extractor.extract_horizontal_rules(tree, source_code)
1713
+ html_elements = extractor.extract_html_elements(tree, source_code)
1714
+ text_formatting = extractor.extract_text_formatting(tree, source_code)
1715
+ footnotes = extractor.extract_footnotes(tree, source_code)
1716
+ else:
1717
+ # Fallback for base ElementExtractor
1718
+ headers = []
1719
+ code_blocks = []
1720
+ links = []
1721
+ images = []
1722
+ references = []
1723
+ lists = []
1724
+ tables = []
1725
+ blockquotes = []
1726
+ horizontal_rules = []
1727
+ html_elements = []
1728
+ text_formatting = []
1729
+ footnotes = []
1730
+
1731
+ elements.extend(headers)
1732
+ elements.extend(code_blocks)
1733
+ elements.extend(links)
1734
+ elements.extend(images)
1735
+ elements.extend(references)
1736
+ elements.extend(lists)
1737
+ elements.extend(tables)
1738
+ elements.extend(blockquotes)
1739
+ elements.extend(horizontal_rules)
1740
+ elements.extend(html_elements)
1741
+ elements.extend(text_formatting)
1742
+ elements.extend(footnotes)
1743
+
1744
+ def count_nodes(node: "tree_sitter.Node") -> int:
1745
+ count = 1
1746
+ for child in node.children:
1747
+ count += count_nodes(child)
1748
+ return count
1749
+
1750
+ return AnalysisResult(
1751
+ file_path=file_path,
1752
+ language=self.get_language_name(),
1753
+ success=True,
1754
+ elements=elements,
1755
+ line_count=len(source_code.splitlines()),
1756
+ node_count=count_nodes(tree.root_node),
1757
+ )
1758
+ except Exception as e:
1759
+ log_error(f"Error analyzing Markdown file {file_path}: {e}")
1760
+ return AnalysisResult(
1761
+ file_path=file_path,
1762
+ language=self.get_language_name(),
1763
+ success=False,
1764
+ error_message=str(e),
1765
+ )
1766
+
1767
+ def execute_query(self, tree: "tree_sitter.Tree", query_name: str) -> dict:
1768
+ """Execute a specific query on the tree"""
1769
+ try:
1770
+ language = self.get_tree_sitter_language()
1771
+ if not language:
1772
+ return {"error": "Language not available"}
1773
+
1774
+ # Import query definitions
1775
+ from ..queries.markdown import get_query
1776
+
1777
+ try:
1778
+ query_string = get_query(query_name)
1779
+ except KeyError:
1780
+ return {"error": f"Unknown query: {query_name}"}
1781
+
1782
+ # Use tree-sitter API with modern handling
1783
+ captures = TreeSitterQueryCompat.safe_execute_query(
1784
+ language, query_string, tree.root_node, fallback_result=[]
1785
+ )
1786
+ return {
1787
+ "captures": captures,
1788
+ "query": query_string,
1789
+ "matches": len(captures),
1790
+ }
1791
+
1792
+ except Exception as e:
1793
+ log_error(f"Query execution failed: {e}")
1794
+ return {"error": str(e)}
1795
+
1796
+ def extract_elements(self, tree: "tree_sitter.Tree", source_code: str) -> list:
1797
+ """Extract elements from source code using tree-sitter AST"""
1798
+ extractor = self.get_extractor()
1799
+ elements = []
1800
+
1801
+ try:
1802
+ if isinstance(extractor, MarkdownElementExtractor):
1803
+ elements.extend(extractor.extract_headers(tree, source_code))
1804
+ elements.extend(extractor.extract_code_blocks(tree, source_code))
1805
+ elements.extend(extractor.extract_links(tree, source_code))
1806
+ elements.extend(extractor.extract_images(tree, source_code))
1807
+ elements.extend(extractor.extract_references(tree, source_code))
1808
+ elements.extend(extractor.extract_lists(tree, source_code))
1809
+ elements.extend(extractor.extract_tables(tree, source_code))
1810
+ elements.extend(extractor.extract_blockquotes(tree, source_code))
1811
+ elements.extend(extractor.extract_horizontal_rules(tree, source_code))
1812
+ elements.extend(extractor.extract_html_elements(tree, source_code))
1813
+ elements.extend(extractor.extract_text_formatting(tree, source_code))
1814
+ elements.extend(extractor.extract_footnotes(tree, source_code))
1815
+ except Exception as e:
1816
+ log_error(f"Failed to extract elements: {e}")
1817
+
1818
+ return elements
1819
+
1820
+ def execute_query_strategy(
1821
+ self, query_key: str | None, language: str
1822
+ ) -> str | None:
1823
+ """Execute query strategy for Markdown language"""
1824
+ if not query_key:
1825
+ return None
1826
+
1827
+ # Use markdown-specific element categories instead of base queries
1828
+ element_categories = self.get_element_categories()
1829
+ if query_key in element_categories:
1830
+ # Return a simple query string for the category
1831
+ node_types = element_categories[query_key]
1832
+ if node_types:
1833
+ # Create a basic query for the first node type
1834
+ return f"({node_types[0]}) @{query_key}"
1835
+
1836
+ # Fallback to base implementation
1837
+ queries = self.get_queries()
1838
+ return queries.get(query_key) if queries else None
1839
+
1840
+ def get_element_categories(self) -> dict[str, list[str]]:
1841
+ """Get Markdown element categories mapping query_key to node_types"""
1842
+ return {
1843
+ # Header categories (function-like)
1844
+ "function": ["atx_heading", "setext_heading"],
1845
+ "headers": ["atx_heading", "setext_heading"],
1846
+ "heading": ["atx_heading", "setext_heading"],
1847
+ # Code block categories (class-like)
1848
+ "class": ["fenced_code_block", "indented_code_block"],
1849
+ "code_blocks": ["fenced_code_block", "indented_code_block"],
1850
+ "code_block": ["fenced_code_block", "indented_code_block"],
1851
+ # Link and image categories (variable-like)
1852
+ "variable": [
1853
+ "inline", # Contains links and images
1854
+ "link",
1855
+ "autolink",
1856
+ "reference_link",
1857
+ "image",
1858
+ ],
1859
+ "links": [
1860
+ "inline", # Contains inline links
1861
+ "link",
1862
+ "autolink",
1863
+ "reference_link",
1864
+ ],
1865
+ "link": ["inline", "link", "autolink", "reference_link"],
1866
+ "images": [
1867
+ "inline", # Contains inline images
1868
+ "image",
1869
+ ],
1870
+ "image": ["inline", "image"],
1871
+ # Reference categories (import-like)
1872
+ "import": ["link_reference_definition"],
1873
+ "references": ["link_reference_definition"],
1874
+ "reference": ["link_reference_definition"],
1875
+ # List categories
1876
+ "lists": ["list", "list_item"],
1877
+ "list": ["list", "list_item"],
1878
+ "task_lists": ["list", "list_item"],
1879
+ # Table categories
1880
+ "tables": ["pipe_table", "table"],
1881
+ "table": ["pipe_table", "table"],
1882
+ # Content structure categories
1883
+ "blockquotes": ["block_quote"],
1884
+ "blockquote": ["block_quote"],
1885
+ "horizontal_rules": ["thematic_break"],
1886
+ "horizontal_rule": ["thematic_break"],
1887
+ # HTML categories
1888
+ "html_blocks": [
1889
+ "html_block",
1890
+ "inline", # Contains inline HTML
1891
+ ],
1892
+ "html_block": ["html_block", "inline"],
1893
+ "html": ["html_block", "inline"],
1894
+ # Text formatting categories
1895
+ "emphasis": ["inline"], # Contains emphasis elements
1896
+ "formatting": ["inline"],
1897
+ "text_formatting": ["inline"],
1898
+ "inline_code": ["inline"],
1899
+ "strikethrough": ["inline"],
1900
+ # Footnote categories
1901
+ "footnotes": [
1902
+ "inline", # Contains footnote references
1903
+ "paragraph", # Contains footnote definitions
1904
+ ],
1905
+ "footnote": ["inline", "paragraph"],
1906
+ # Comprehensive categories
1907
+ "all_elements": [
1908
+ "atx_heading",
1909
+ "setext_heading",
1910
+ "fenced_code_block",
1911
+ "indented_code_block",
1912
+ "inline",
1913
+ "link",
1914
+ "autolink",
1915
+ "reference_link",
1916
+ "image",
1917
+ "link_reference_definition",
1918
+ "list",
1919
+ "list_item",
1920
+ "pipe_table",
1921
+ "table",
1922
+ "block_quote",
1923
+ "thematic_break",
1924
+ "html_block",
1925
+ "paragraph",
1926
+ ],
1927
+ "text_content": ["atx_heading", "setext_heading", "inline", "paragraph"],
1928
+ }