tree-sitter-analyzer 1.7.2__py3-none-any.whl → 1.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of tree-sitter-analyzer might be problematic. Click here for more details.

Files changed (29)
  1. tree_sitter_analyzer/__init__.py +1 -1
  2. tree_sitter_analyzer/cli/commands/advanced_command.py +52 -0
  3. tree_sitter_analyzer/cli/commands/structure_command.py +50 -1
  4. tree_sitter_analyzer/cli/commands/summary_command.py +49 -0
  5. tree_sitter_analyzer/cli/commands/table_command.py +48 -0
  6. tree_sitter_analyzer/core/query_service.py +155 -5
  7. tree_sitter_analyzer/formatters/base_formatter.py +29 -2
  8. tree_sitter_analyzer/formatters/language_formatter_factory.py +83 -0
  9. tree_sitter_analyzer/formatters/markdown_formatter.py +557 -0
  10. tree_sitter_analyzer/language_detector.py +30 -0
  11. tree_sitter_analyzer/language_loader.py +1 -0
  12. tree_sitter_analyzer/languages/markdown_plugin.py +1673 -0
  13. tree_sitter_analyzer/languages/python_plugin.py +75 -16
  14. tree_sitter_analyzer/mcp/server.py +5 -74
  15. tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +8 -18
  16. tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +1 -1
  17. tree_sitter_analyzer/mcp/tools/list_files_tool.py +1 -1
  18. tree_sitter_analyzer/mcp/tools/query_tool.py +86 -3
  19. tree_sitter_analyzer/mcp/tools/read_partial_tool.py +91 -23
  20. tree_sitter_analyzer/mcp/tools/search_content_tool.py +1 -1
  21. tree_sitter_analyzer/mcp/tools/table_format_tool.py +7 -17
  22. tree_sitter_analyzer/queries/javascript.py +20 -0
  23. tree_sitter_analyzer/queries/markdown.py +379 -0
  24. tree_sitter_analyzer/queries/typescript.py +22 -0
  25. tree_sitter_analyzer/query_loader.py +1 -0
  26. {tree_sitter_analyzer-1.7.2.dist-info → tree_sitter_analyzer-1.7.4.dist-info}/METADATA +45 -20
  27. {tree_sitter_analyzer-1.7.2.dist-info → tree_sitter_analyzer-1.7.4.dist-info}/RECORD +29 -25
  28. {tree_sitter_analyzer-1.7.2.dist-info → tree_sitter_analyzer-1.7.4.dist-info}/entry_points.txt +1 -0
  29. {tree_sitter_analyzer-1.7.2.dist-info → tree_sitter_analyzer-1.7.4.dist-info}/WHEEL +0 -0
@@ -0,0 +1,1673 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Markdown Language Plugin
4
+
5
+ Enhanced Markdown-specific parsing and element extraction functionality.
6
+ Provides comprehensive support for Markdown elements including headers,
7
+ links, code blocks, lists, tables, and other structural elements.
8
+ """
9
+
10
+ from typing import TYPE_CHECKING, Any, Optional
11
+
12
+ if TYPE_CHECKING:
13
+ import tree_sitter
14
+
15
+ try:
16
+ import tree_sitter
17
+
18
+ TREE_SITTER_AVAILABLE = True
19
+ except ImportError:
20
+ TREE_SITTER_AVAILABLE = False
21
+
22
+ from ..core.analysis_engine import AnalysisRequest
23
+ from ..encoding_utils import extract_text_slice, safe_encode
24
+ from ..models import AnalysisResult, CodeElement
25
+ from ..plugins.base import ElementExtractor, LanguagePlugin
26
+ from ..utils import log_debug, log_error, log_warning
27
+
28
+
29
class MarkdownElement(CodeElement):
    """Code element specialised for Markdown constructs.

    Carries optional metadata used by headers (``level``), links and
    images (``url`` / ``title`` / ``alt_text``), fenced code blocks
    (``language_info``) and task-list items (``is_checked``); fields
    that do not apply to a given element stay ``None``.
    """

    def __init__(
        self,
        name: str,
        start_line: int,
        end_line: int,
        raw_text: str,
        language: str = "markdown",
        element_type: str = "markdown",
        level: Optional[int] = None,
        url: Optional[str] = None,
        alt_text: Optional[str] = None,
        title: Optional[str] = None,
        language_info: Optional[str] = None,
        is_checked: Optional[bool] = None,
        **kwargs
    ):
        # Generic element fields are handled by the base class.
        super().__init__(
            name=name,
            start_line=start_line,
            end_line=end_line,
            raw_text=raw_text,
            language=language,
            **kwargs
        )
        # Markdown-specific metadata.
        self.element_type = element_type
        self.level = level                  # header depth (1-6)
        self.url = url                      # links and images
        self.alt_text = alt_text            # images
        self.title = title                  # links and images
        self.language_info = language_info  # fenced code blocks
        self.is_checked = is_checked        # task-list items
63
+
64
+
65
+ class MarkdownElementExtractor(ElementExtractor):
66
+ """Markdown-specific element extractor with comprehensive feature support"""
67
+
68
+ def __init__(self) -> None:
69
+ """Initialize the Markdown element extractor."""
70
+ self.current_file: str = ""
71
+ self.source_code: str = ""
72
+ self.content_lines: list[str] = []
73
+
74
+ # Performance optimization caches
75
+ self._node_text_cache: dict[int, str] = {}
76
+ self._processed_nodes: set[int] = set()
77
+ self._element_cache: dict[tuple[int, str], Any] = {}
78
+ self._file_encoding: str | None = None
79
+
80
+ def extract_functions(
81
+ self, tree: "tree_sitter.Tree", source_code: str
82
+ ) -> list[CodeElement]:
83
+ """Extract Markdown elements (headers act as 'functions')"""
84
+ return self.extract_headers(tree, source_code)
85
+
86
+ def extract_classes(
87
+ self, tree: "tree_sitter.Tree", source_code: str
88
+ ) -> list[CodeElement]:
89
+ """Extract Markdown sections (code blocks act as 'classes')"""
90
+ return self.extract_code_blocks(tree, source_code)
91
+
92
+ def extract_variables(
93
+ self, tree: "tree_sitter.Tree", source_code: str
94
+ ) -> list[CodeElement]:
95
+ """Extract Markdown links and images (act as 'variables')"""
96
+ elements = []
97
+ elements.extend(self.extract_links(tree, source_code))
98
+ elements.extend(self.extract_images(tree, source_code))
99
+ return elements
100
+
101
+ def extract_imports(
102
+ self, tree: "tree_sitter.Tree", source_code: str
103
+ ) -> list[CodeElement]:
104
+ """Extract Markdown references and definitions"""
105
+ return self.extract_references(tree, source_code)
106
+
107
+ def extract_headers(
108
+ self, tree: "tree_sitter.Tree", source_code: str
109
+ ) -> list[MarkdownElement]:
110
+ """Extract Markdown headers (H1-H6)"""
111
+ self.source_code = source_code or ""
112
+ self.content_lines = self.source_code.split("\n")
113
+ self._reset_caches()
114
+
115
+ headers: list[MarkdownElement] = []
116
+
117
+ if tree is None or tree.root_node is None:
118
+ log_debug("Tree or root_node is None, returning empty headers list")
119
+ return headers
120
+
121
+ try:
122
+ # Extract ATX headers (# ## ### etc.)
123
+ self._extract_atx_headers(tree.root_node, headers)
124
+ # Extract Setext headers (underlined)
125
+ self._extract_setext_headers(tree.root_node, headers)
126
+ except Exception as e:
127
+ log_debug(f"Error during header extraction: {e}")
128
+ return []
129
+
130
+ log_debug(f"Extracted {len(headers)} Markdown headers")
131
+ return headers
132
+
133
+ def extract_code_blocks(
134
+ self, tree: "tree_sitter.Tree", source_code: str
135
+ ) -> list[MarkdownElement]:
136
+ """Extract Markdown code blocks"""
137
+ self.source_code = source_code or ""
138
+ self.content_lines = self.source_code.split("\n")
139
+ self._reset_caches()
140
+
141
+ code_blocks: list[MarkdownElement] = []
142
+
143
+ if tree is None or tree.root_node is None:
144
+ log_debug("Tree or root_node is None, returning empty code blocks list")
145
+ return code_blocks
146
+
147
+ try:
148
+ self._extract_fenced_code_blocks(tree.root_node, code_blocks)
149
+ self._extract_indented_code_blocks(tree.root_node, code_blocks)
150
+ except Exception as e:
151
+ log_debug(f"Error during code block extraction: {e}")
152
+ return []
153
+
154
+ log_debug(f"Extracted {len(code_blocks)} Markdown code blocks")
155
+ return code_blocks
156
+
157
+ def extract_links(
158
+ self, tree: "tree_sitter.Tree", source_code: str
159
+ ) -> list[MarkdownElement]:
160
+ """Extract Markdown links"""
161
+ self.source_code = source_code or ""
162
+ self.content_lines = self.source_code.split("\n")
163
+ self._reset_caches()
164
+
165
+ links: list[MarkdownElement] = []
166
+
167
+ if tree is None or tree.root_node is None:
168
+ log_debug("Tree or root_node is None, returning empty links list")
169
+ return links
170
+
171
+ try:
172
+ # Track extracted links to prevent global duplicates (ensure reset)
173
+ self._extracted_links = set()
174
+
175
+ self._extract_inline_links(tree.root_node, links)
176
+ self._extract_reference_links(tree.root_node, links)
177
+ self._extract_autolinks(tree.root_node, links)
178
+
179
+ # Clean up after extraction is complete
180
+ if hasattr(self, '_extracted_links'):
181
+ delattr(self, '_extracted_links')
182
+
183
+ except Exception as e:
184
+ log_debug(f"Error during link extraction: {e}")
185
+ return []
186
+
187
+ log_debug(f"Extracted {len(links)} Markdown links")
188
+ return links
189
+
190
+ def extract_images(
191
+ self, tree: "tree_sitter.Tree", source_code: str
192
+ ) -> list[MarkdownElement]:
193
+ """Extract Markdown images"""
194
+ self.source_code = source_code or ""
195
+ self.content_lines = self.source_code.split("\n")
196
+ self._reset_caches()
197
+
198
+ images: list[MarkdownElement] = []
199
+
200
+ if tree is None or tree.root_node is None:
201
+ log_debug("Tree or root_node is None, returning empty images list")
202
+ return images
203
+
204
+ try:
205
+ self._extract_inline_images(tree.root_node, images)
206
+ self._extract_reference_images(tree.root_node, images)
207
+ self._extract_image_reference_definitions(tree.root_node, images)
208
+ except Exception as e:
209
+ log_debug(f"Error during image extraction: {e}")
210
+ return []
211
+
212
+ log_debug(f"Extracted {len(images)} Markdown images")
213
+ return images
214
+
215
+ def extract_references(
216
+ self, tree: "tree_sitter.Tree", source_code: str
217
+ ) -> list[MarkdownElement]:
218
+ """Extract Markdown reference definitions"""
219
+ self.source_code = source_code or ""
220
+ self.content_lines = self.source_code.split("\n")
221
+ self._reset_caches()
222
+
223
+ references: list[MarkdownElement] = []
224
+
225
+ if tree is None or tree.root_node is None:
226
+ log_debug("Tree or root_node is None, returning empty references list")
227
+ return references
228
+
229
+ try:
230
+ self._extract_link_reference_definitions(tree.root_node, references)
231
+ except Exception as e:
232
+ log_debug(f"Error during reference extraction: {e}")
233
+ return []
234
+
235
+ log_debug(f"Extracted {len(references)} Markdown references")
236
+ return references
237
+
238
+ def extract_blockquotes(
239
+ self, tree: "tree_sitter.Tree", source_code: str
240
+ ) -> list[MarkdownElement]:
241
+ """Extract Markdown blockquotes"""
242
+ self.source_code = source_code or ""
243
+ self.content_lines = self.source_code.split("\n")
244
+ self._reset_caches()
245
+
246
+ blockquotes: list[MarkdownElement] = []
247
+
248
+ if tree is None or tree.root_node is None:
249
+ log_debug("Tree or root_node is None, returning empty blockquotes list")
250
+ return blockquotes
251
+
252
+ try:
253
+ self._extract_block_quotes(tree.root_node, blockquotes)
254
+ except Exception as e:
255
+ log_debug(f"Error during blockquote extraction: {e}")
256
+ return []
257
+
258
+ log_debug(f"Extracted {len(blockquotes)} Markdown blockquotes")
259
+ return blockquotes
260
+
261
+ def extract_horizontal_rules(
262
+ self, tree: "tree_sitter.Tree", source_code: str
263
+ ) -> list[MarkdownElement]:
264
+ """Extract Markdown horizontal rules"""
265
+ self.source_code = source_code or ""
266
+ self.content_lines = self.source_code.split("\n")
267
+ self._reset_caches()
268
+
269
+ horizontal_rules: list[MarkdownElement] = []
270
+
271
+ if tree is None or tree.root_node is None:
272
+ log_debug("Tree or root_node is None, returning empty horizontal rules list")
273
+ return horizontal_rules
274
+
275
+ try:
276
+ self._extract_thematic_breaks(tree.root_node, horizontal_rules)
277
+ except Exception as e:
278
+ log_debug(f"Error during horizontal rule extraction: {e}")
279
+ return []
280
+
281
+ log_debug(f"Extracted {len(horizontal_rules)} Markdown horizontal rules")
282
+ return horizontal_rules
283
+
284
+ def extract_html_elements(
285
+ self, tree: "tree_sitter.Tree", source_code: str
286
+ ) -> list[MarkdownElement]:
287
+ """Extract HTML elements"""
288
+ self.source_code = source_code or ""
289
+ self.content_lines = self.source_code.split("\n")
290
+ self._reset_caches()
291
+
292
+ html_elements: list[MarkdownElement] = []
293
+
294
+ if tree is None or tree.root_node is None:
295
+ log_debug("Tree or root_node is None, returning empty HTML elements list")
296
+ return html_elements
297
+
298
+ try:
299
+ self._extract_html_blocks(tree.root_node, html_elements)
300
+ self._extract_inline_html(tree.root_node, html_elements)
301
+ except Exception as e:
302
+ log_debug(f"Error during HTML element extraction: {e}")
303
+ return []
304
+
305
+ log_debug(f"Extracted {len(html_elements)} HTML elements")
306
+ return html_elements
307
+
308
+ def extract_text_formatting(
309
+ self, tree: "tree_sitter.Tree", source_code: str
310
+ ) -> list[MarkdownElement]:
311
+ """Extract text formatting elements (bold, italic, strikethrough, inline code)"""
312
+ self.source_code = source_code or ""
313
+ self.content_lines = self.source_code.split("\n")
314
+ self._reset_caches()
315
+
316
+ formatting_elements: list[MarkdownElement] = []
317
+
318
+ if tree is None or tree.root_node is None:
319
+ log_debug("Tree or root_node is None, returning empty formatting elements list")
320
+ return formatting_elements
321
+
322
+ try:
323
+ self._extract_emphasis_elements(tree.root_node, formatting_elements)
324
+ self._extract_inline_code_spans(tree.root_node, formatting_elements)
325
+ self._extract_strikethrough_elements(tree.root_node, formatting_elements)
326
+ except Exception as e:
327
+ log_debug(f"Error during text formatting extraction: {e}")
328
+ return []
329
+
330
+ log_debug(f"Extracted {len(formatting_elements)} text formatting elements")
331
+ return formatting_elements
332
+
333
+ def extract_footnotes(
334
+ self, tree: "tree_sitter.Tree", source_code: str
335
+ ) -> list[MarkdownElement]:
336
+ """Extract footnotes"""
337
+ self.source_code = source_code or ""
338
+ self.content_lines = self.source_code.split("\n")
339
+ self._reset_caches()
340
+
341
+ footnotes: list[MarkdownElement] = []
342
+
343
+ if tree is None or tree.root_node is None:
344
+ log_debug("Tree or root_node is None, returning empty footnotes list")
345
+ return footnotes
346
+
347
+ try:
348
+ self._extract_footnote_elements(tree.root_node, footnotes)
349
+ except Exception as e:
350
+ log_debug(f"Error during footnote extraction: {e}")
351
+ return []
352
+
353
+ log_debug(f"Extracted {len(footnotes)} footnotes")
354
+ return footnotes
355
+
356
+ def extract_lists(
357
+ self, tree: "tree_sitter.Tree", source_code: str
358
+ ) -> list[MarkdownElement]:
359
+ """Extract Markdown lists"""
360
+ self.source_code = source_code or ""
361
+ self.content_lines = self.source_code.split("\n")
362
+ self._reset_caches()
363
+
364
+ lists: list[MarkdownElement] = []
365
+
366
+ if tree is None or tree.root_node is None:
367
+ log_debug("Tree or root_node is None, returning empty lists list")
368
+ return lists
369
+
370
+ try:
371
+ self._extract_list_items(tree.root_node, lists)
372
+ except Exception as e:
373
+ log_debug(f"Error during list extraction: {e}")
374
+ return []
375
+
376
+ log_debug(f"Extracted {len(lists)} Markdown list items")
377
+ return lists
378
+
379
+ def extract_tables(
380
+ self, tree: "tree_sitter.Tree", source_code: str
381
+ ) -> list[MarkdownElement]:
382
+ """Extract Markdown tables"""
383
+ self.source_code = source_code or ""
384
+ self.content_lines = self.source_code.split("\n")
385
+ self._reset_caches()
386
+
387
+ tables: list[MarkdownElement] = []
388
+
389
+ if tree is None or tree.root_node is None:
390
+ log_debug("Tree or root_node is None, returning empty tables list")
391
+ return tables
392
+
393
+ try:
394
+ self._extract_pipe_tables(tree.root_node, tables)
395
+ except Exception as e:
396
+ log_debug(f"Error during table extraction: {e}")
397
+ return []
398
+
399
+ log_debug(f"Extracted {len(tables)} Markdown tables")
400
+ return tables
401
+
402
+ def _reset_caches(self) -> None:
403
+ """Reset performance caches"""
404
+ self._node_text_cache.clear()
405
+ self._processed_nodes.clear()
406
+ self._element_cache.clear()
407
+
408
+ def _get_node_text_optimized(self, node: "tree_sitter.Node") -> str:
409
+ """Get node text with optimized caching"""
410
+ node_id = id(node)
411
+
412
+ if node_id in self._node_text_cache:
413
+ return self._node_text_cache[node_id]
414
+
415
+ try:
416
+ start_byte = node.start_byte
417
+ end_byte = node.end_byte
418
+
419
+ encoding = self._file_encoding or "utf-8"
420
+ content_bytes = safe_encode("\n".join(self.content_lines), encoding)
421
+ text = extract_text_slice(content_bytes, start_byte, end_byte, encoding)
422
+
423
+ if text:
424
+ self._node_text_cache[node_id] = text
425
+ return text
426
+ except Exception as e:
427
+ log_error(f"Error in _get_node_text_optimized: {e}")
428
+
429
+ # Fallback to simple text extraction
430
+ try:
431
+ start_point = node.start_point
432
+ end_point = node.end_point
433
+
434
+ if (start_point[0] < 0 or start_point[0] >= len(self.content_lines)):
435
+ return ""
436
+
437
+ if (end_point[0] < 0 or end_point[0] >= len(self.content_lines)):
438
+ return ""
439
+
440
+ if start_point[0] == end_point[0]:
441
+ line = self.content_lines[start_point[0]]
442
+ start_col = max(0, min(start_point[1], len(line)))
443
+ end_col = max(start_col, min(end_point[1], len(line)))
444
+ result = line[start_col:end_col]
445
+ self._node_text_cache[node_id] = result
446
+ return result
447
+ else:
448
+ lines = []
449
+ for i in range(start_point[0], min(end_point[0] + 1, len(self.content_lines))):
450
+ if i < len(self.content_lines):
451
+ line = self.content_lines[i]
452
+ if i == start_point[0] and i == end_point[0]:
453
+ # Single line case
454
+ start_col = max(0, min(start_point[1], len(line)))
455
+ end_col = max(start_col, min(end_point[1], len(line)))
456
+ lines.append(line[start_col:end_col])
457
+ elif i == start_point[0]:
458
+ start_col = max(0, min(start_point[1], len(line)))
459
+ lines.append(line[start_col:])
460
+ elif i == end_point[0]:
461
+ end_col = max(0, min(end_point[1], len(line)))
462
+ lines.append(line[:end_col])
463
+ else:
464
+ lines.append(line)
465
+ result = "\n".join(lines)
466
+ self._node_text_cache[node_id] = result
467
+ return result
468
+ except Exception as fallback_error:
469
+ log_error(f"Fallback text extraction also failed: {fallback_error}")
470
+ return ""
471
+
472
+ def _extract_atx_headers(self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]) -> None:
473
+ """Extract ATX-style headers (# ## ### etc.)"""
474
+ for node in self._traverse_nodes(root_node):
475
+ if node.type == "atx_heading":
476
+ try:
477
+ start_line = node.start_point[0] + 1
478
+ end_line = node.end_point[0] + 1
479
+ raw_text = self._get_node_text_optimized(node)
480
+
481
+ # Extract header level and content
482
+ level = 1
483
+ content = raw_text.strip()
484
+
485
+ # Count # symbols to determine level
486
+ if content.startswith("#"):
487
+ level = len(content) - len(content.lstrip("#"))
488
+ content = content.lstrip("# ").rstrip()
489
+
490
+ header = MarkdownElement(
491
+ name=content or f"Header Level {level}",
492
+ start_line=start_line,
493
+ end_line=end_line,
494
+ raw_text=raw_text,
495
+ element_type="heading",
496
+ level=level
497
+ )
498
+ # Add additional attributes for formatter
499
+ header.text = content or f"Header Level {level}"
500
+ header.type = "heading"
501
+ headers.append(header)
502
+ except Exception as e:
503
+ log_debug(f"Failed to extract ATX header: {e}")
504
+
505
+ def _extract_setext_headers(self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]) -> None:
506
+ """Extract Setext-style headers (underlined)"""
507
+ for node in self._traverse_nodes(root_node):
508
+ if node.type == "setext_heading":
509
+ try:
510
+ start_line = node.start_point[0] + 1
511
+ end_line = node.end_point[0] + 1
512
+ raw_text = self._get_node_text_optimized(node)
513
+
514
+ # Determine level based on underline character
515
+ level = 2 # Default to H2
516
+ lines = raw_text.strip().split("\n")
517
+ if len(lines) >= 2:
518
+ underline = lines[1].strip()
519
+ if underline.startswith("="):
520
+ level = 1 # H1
521
+ elif underline.startswith("-"):
522
+ level = 2 # H2
523
+ content = lines[0].strip()
524
+ else:
525
+ content = raw_text.strip()
526
+
527
+ header = MarkdownElement(
528
+ name=content or f"Header Level {level}",
529
+ start_line=start_line,
530
+ end_line=end_line,
531
+ raw_text=raw_text,
532
+ element_type="heading",
533
+ level=level
534
+ )
535
+ # Add additional attributes for formatter
536
+ header.text = content or f"Header Level {level}"
537
+ header.type = "heading"
538
+ headers.append(header)
539
+ except Exception as e:
540
+ log_debug(f"Failed to extract Setext header: {e}")
541
+
542
+ def _extract_fenced_code_blocks(self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]) -> None:
543
+ """Extract fenced code blocks"""
544
+ for node in self._traverse_nodes(root_node):
545
+ if node.type == "fenced_code_block":
546
+ try:
547
+ start_line = node.start_point[0] + 1
548
+ end_line = node.end_point[0] + 1
549
+ raw_text = self._get_node_text_optimized(node)
550
+
551
+ # Extract language info
552
+ language_info = None
553
+ lines = raw_text.strip().split("\n")
554
+ if lines and lines[0].startswith("```"):
555
+ language_info = lines[0][3:].strip()
556
+
557
+ # Extract content (excluding fence markers)
558
+ content_lines = []
559
+ in_content = False
560
+ for line in lines:
561
+ if line.startswith("```"):
562
+ if not in_content:
563
+ in_content = True
564
+ continue
565
+ else:
566
+ break
567
+ if in_content:
568
+ content_lines.append(line)
569
+
570
+ content = "\n".join(content_lines)
571
+ name = f"Code Block ({language_info or 'unknown'})"
572
+
573
+ code_block = MarkdownElement(
574
+ name=name,
575
+ start_line=start_line,
576
+ end_line=end_line,
577
+ raw_text=raw_text,
578
+ element_type="code_block",
579
+ language_info=language_info
580
+ )
581
+ # Add additional attributes for formatter
582
+ code_block.language = language_info or "text"
583
+ code_block.line_count = len(content_lines)
584
+ code_block.type = "code_block"
585
+ code_blocks.append(code_block)
586
+ except Exception as e:
587
+ log_debug(f"Failed to extract fenced code block: {e}")
588
+
589
+ def _extract_indented_code_blocks(self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]) -> None:
590
+ """Extract indented code blocks"""
591
+ for node in self._traverse_nodes(root_node):
592
+ if node.type == "indented_code_block":
593
+ try:
594
+ start_line = node.start_point[0] + 1
595
+ end_line = node.end_point[0] + 1
596
+ raw_text = self._get_node_text_optimized(node)
597
+
598
+ code_block = MarkdownElement(
599
+ name="Indented Code Block",
600
+ start_line=start_line,
601
+ end_line=end_line,
602
+ raw_text=raw_text,
603
+ element_type="code_block",
604
+ language_info="indented"
605
+ )
606
+ # Add additional attributes for formatter
607
+ code_block.language = "text"
608
+ code_block.line_count = end_line - start_line + 1
609
+ code_block.type = "code_block"
610
+ code_blocks.append(code_block)
611
+ except Exception as e:
612
+ log_debug(f"Failed to extract indented code block: {e}")
613
+
614
+ def _extract_inline_links(self, root_node: "tree_sitter.Node", links: list[MarkdownElement]) -> None:
615
+ """Extract inline links"""
616
+ import re
617
+
618
+ # Extract links from text within inline nodes using regular expressions
619
+ for node in self._traverse_nodes(root_node):
620
+ if node.type == "inline":
621
+ try:
622
+ raw_text = self._get_node_text_optimized(node)
623
+ if not raw_text:
624
+ continue
625
+
626
+ # Inline link pattern: [text](url "title") (excluding images)
627
+ inline_pattern = r'(?<!\!)\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
628
+ matches = re.finditer(inline_pattern, raw_text)
629
+
630
+ for match in matches:
631
+ text = match.group(1) or ""
632
+ url = match.group(2) or ""
633
+ title = match.group(3) or ""
634
+
635
+ # Global duplicate check: process same text and URL combination only once
636
+ link_signature = f"{text}|{url}"
637
+ if hasattr(self, '_extracted_links') and link_signature in self._extracted_links:
638
+ continue
639
+
640
+ if hasattr(self, '_extracted_links'):
641
+ self._extracted_links.add(link_signature)
642
+
643
+ start_line = node.start_point[0] + 1
644
+ end_line = node.end_point[0] + 1
645
+
646
+ link = MarkdownElement(
647
+ name=text or "Link",
648
+ start_line=start_line,
649
+ end_line=end_line,
650
+ raw_text=match.group(0),
651
+ element_type="link",
652
+ url=url,
653
+ title=title
654
+ )
655
+ # Add additional attributes for formatter
656
+ link.text = text or "Link"
657
+ link.type = "link"
658
+ links.append(link)
659
+
660
+ except Exception as e:
661
+ log_debug(f"Failed to extract inline link: {e}")
662
+
663
+ def _extract_reference_links(self, root_node: "tree_sitter.Node", links: list[MarkdownElement]) -> None:
664
+ """Extract reference links"""
665
+ import re
666
+
667
+ # Reference links also need to be extracted from inline nodes
668
+ # Track already processed reference links to avoid duplicates
669
+ processed_ref_links = set()
670
+
671
+ for node in self._traverse_nodes(root_node):
672
+ if node.type == "inline":
673
+ try:
674
+ raw_text = self._get_node_text_optimized(node)
675
+ if not raw_text:
676
+ continue
677
+
678
+ # Reference link pattern: [text][ref]
679
+ ref_pattern = r'\[([^\]]*)\]\[([^\]]*)\]'
680
+ matches = re.finditer(ref_pattern, raw_text)
681
+
682
+ for match in matches:
683
+ text = match.group(1) or ""
684
+ ref = match.group(2) or ""
685
+
686
+ # Skip image references (starting with !)
687
+ if match.start() > 0 and raw_text[match.start()-1] == '!':
688
+ continue
689
+
690
+ # Duplicate check: process same text and reference combination only once
691
+ start_line = node.start_point[0] + 1
692
+ ref_link_key = (text, ref, start_line)
693
+
694
+ if ref_link_key in processed_ref_links:
695
+ continue
696
+ processed_ref_links.add(ref_link_key)
697
+
698
+ end_line = node.end_point[0] + 1
699
+
700
+ link = MarkdownElement(
701
+ name=text or "Reference Link",
702
+ start_line=start_line,
703
+ end_line=end_line,
704
+ raw_text=match.group(0),
705
+ element_type="reference_link"
706
+ )
707
+ # Add additional attributes for formatter
708
+ link.text = text or "Reference Link"
709
+ link.type = "reference_link"
710
+ links.append(link)
711
+
712
+ except Exception as e:
713
+ log_debug(f"Failed to extract reference link: {e}")
714
+
715
+ def _extract_autolinks(self, root_node: "tree_sitter.Node", links: list[MarkdownElement]) -> None:
716
+ """Extract autolinks"""
717
+ import re
718
+
719
+ # Extract autolinks from text within inline nodes using regular expressions
720
+ for node in self._traverse_nodes(root_node):
721
+ if node.type == "inline":
722
+ try:
723
+ raw_text = self._get_node_text_optimized(node)
724
+ if not raw_text:
725
+ continue
726
+
727
+ # Autolink pattern: <url> or <email>
728
+ autolink_pattern = r'<(https?://[^>]+|mailto:[^>]+|[^@\s]+@[^@\s]+\.[^@\s]+)>'
729
+ matches = re.finditer(autolink_pattern, raw_text)
730
+
731
+ for match in matches:
732
+ url = match.group(1) or ""
733
+ full_match = match.group(0)
734
+
735
+ # Global duplicate check: process same URL for autolinks only once
736
+ autolink_signature = f"autolink|{url}"
737
+ if hasattr(self, '_extracted_links') and autolink_signature in self._extracted_links:
738
+ continue
739
+
740
+ if hasattr(self, '_extracted_links'):
741
+ self._extracted_links.add(autolink_signature)
742
+
743
+ start_line = node.start_point[0] + 1
744
+ end_line = node.end_point[0] + 1
745
+
746
+ link = MarkdownElement(
747
+ name=url or "Autolink",
748
+ start_line=start_line,
749
+ end_line=end_line,
750
+ raw_text=full_match,
751
+ element_type="autolink",
752
+ url=url
753
+ )
754
+ # Add additional attributes for formatter
755
+ link.text = url or "Autolink"
756
+ link.type = "autolink"
757
+ links.append(link)
758
+
759
+ except Exception as e:
760
+ log_debug(f"Failed to extract autolink: {e}")
761
+
762
+ def _extract_inline_images(self, root_node: "tree_sitter.Node", images: list[MarkdownElement]) -> None:
763
+ """Extract inline images"""
764
+ import re
765
+
766
+ # Extract images from text within inline nodes using regular expressions
767
+ for node in self._traverse_nodes(root_node):
768
+ if node.type == "inline":
769
+ try:
770
+ raw_text = self._get_node_text_optimized(node)
771
+ if not raw_text:
772
+ continue
773
+
774
+ # Inline image pattern: ![alt](url "title")
775
+ image_pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
776
+ matches = re.finditer(image_pattern, raw_text)
777
+
778
+ for match in matches:
779
+ alt_text = match.group(1) or ""
780
+ url = match.group(2) or ""
781
+ title = match.group(3) or ""
782
+
783
+ # Calculate line number from matched position
784
+ start_line = node.start_point[0] + 1
785
+ end_line = node.end_point[0] + 1
786
+
787
+ image = MarkdownElement(
788
+ name=alt_text or "Image",
789
+ start_line=start_line,
790
+ end_line=end_line,
791
+ raw_text=match.group(0),
792
+ element_type="image",
793
+ url=url,
794
+ alt_text=alt_text,
795
+ title=title
796
+ )
797
+ # Add additional attributes for formatter
798
+ image.alt = alt_text or ""
799
+ image.type = "image"
800
+ images.append(image)
801
+
802
+ except Exception as e:
803
+ log_debug(f"Failed to extract inline image: {e}")
804
+
805
+ def _extract_reference_images(self, root_node: "tree_sitter.Node", images: list[MarkdownElement]) -> None:
806
+ """Extract reference images"""
807
+ import re
808
+
809
+ # Reference images also need to be extracted from inline nodes
810
+ for node in self._traverse_nodes(root_node):
811
+ if node.type == "inline":
812
+ try:
813
+ raw_text = self._get_node_text_optimized(node)
814
+ if not raw_text:
815
+ continue
816
+
817
+ # Reference image pattern: ![alt][ref]
818
+ ref_image_pattern = r'!\[([^\]]*)\]\[([^\]]*)\]'
819
+ matches = re.finditer(ref_image_pattern, raw_text)
820
+
821
+ for match in matches:
822
+ alt_text = match.group(1) or ""
823
+ ref = match.group(2) or ""
824
+
825
+ start_line = node.start_point[0] + 1
826
+ end_line = node.end_point[0] + 1
827
+
828
+ image = MarkdownElement(
829
+ name=alt_text or "Reference Image",
830
+ start_line=start_line,
831
+ end_line=end_line,
832
+ raw_text=match.group(0),
833
+ element_type="reference_image"
834
+ )
835
+ # Add additional attributes for formatter
836
+ image.alt = alt_text or ""
837
+ image.type = "reference_image"
838
+ images.append(image)
839
+
840
+ except Exception as e:
841
+ log_debug(f"Failed to extract reference image: {e}")
842
+
843
    def _extract_image_reference_definitions(self, root_node: "tree_sitter.Node", images: list[MarkdownElement]) -> None:
        """Extract image reference definitions into *images*.

        Two-pass scan: first collect every reference label used by an image
        (``![alt][ref]``), then keep only the ``link_reference_definition``
        nodes whose label is used by an image OR whose URL has an image file
        extension.
        """
        import re

        # Extract all reference definitions that could be used for images
        # We check if the URL points to an image file or if it's used by an image reference

        # First, collect all image references used in the document
        # (labels are lower-cased so the later membership test is case-insensitive).
        image_refs_used = set()
        for node in self._traverse_nodes(root_node):
            if node.type == "inline":
                try:
                    raw_text = self._get_node_text_optimized(node)
                    if not raw_text:
                        continue

                    # Find image references: ![alt][ref]
                    ref_image_pattern = r'!\[([^\]]*)\]\[([^\]]*)\]'
                    matches = re.finditer(ref_image_pattern, raw_text)

                    for match in matches:
                        ref = match.group(2) or ""
                        if ref:
                            image_refs_used.add(ref.lower())

                except Exception as e:
                    log_debug(f"Failed to scan for image references: {e}")

        # Now extract reference definitions that are used by images OR point to image files
        for node in self._traverse_nodes(root_node):
            if node.type == "link_reference_definition":
                try:
                    # 0-based tree-sitter points -> 1-based line numbers.
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1
                    raw_text = self._get_node_text_optimized(node)

                    # Pattern: [label]: url "title"  (title is optional)
                    ref_pattern = r'^\[([^\]]+)\]:\s*([^\s]+)(?:\s+"([^"]*)")?'
                    match = re.match(ref_pattern, raw_text.strip())

                    if match:
                        label = match.group(1) or ""
                        url = match.group(2) or ""
                        title = match.group(3) or ""

                        # Include if this reference is used by an image OR if URL looks like an image
                        is_used_by_image = label.lower() in image_refs_used
                        is_image_url = any(url.lower().endswith(ext) for ext in ['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.bmp'])

                        if is_used_by_image or is_image_url:
                            image_ref = MarkdownElement(
                                name=f"Image Reference Definition: {label}",
                                start_line=start_line,
                                end_line=end_line,
                                raw_text=raw_text,
                                element_type="image_reference_definition",
                                url=url,
                                alt_text=label,
                                title=title
                            )
                            # Add additional attributes for formatter
                            image_ref.alt = label
                            image_ref.type = "image_reference_definition"
                            images.append(image_ref)

                except Exception as e:
                    log_debug(f"Failed to extract image reference definition: {e}")
+
911
+ def _extract_link_reference_definitions(self, root_node: "tree_sitter.Node", references: list[MarkdownElement]) -> None:
912
+ """Extract link reference definitions"""
913
+ for node in self._traverse_nodes(root_node):
914
+ if node.type == "link_reference_definition":
915
+ try:
916
+ start_line = node.start_point[0] + 1
917
+ end_line = node.end_point[0] + 1
918
+ raw_text = self._get_node_text_optimized(node)
919
+
920
+ reference = MarkdownElement(
921
+ name=raw_text or "Reference Definition",
922
+ start_line=start_line,
923
+ end_line=end_line,
924
+ raw_text=raw_text,
925
+ element_type="reference_definition"
926
+ )
927
+ references.append(reference)
928
+ except Exception as e:
929
+ log_debug(f"Failed to extract reference definition: {e}")
930
+
931
+ def _extract_list_items(self, root_node: "tree_sitter.Node", lists: list[MarkdownElement]) -> None:
932
+ """Extract lists (not individual items)"""
933
+ for node in self._traverse_nodes(root_node):
934
+ if node.type == "list":
935
+ try:
936
+ start_line = node.start_point[0] + 1
937
+ end_line = node.end_point[0] + 1
938
+ raw_text = self._get_node_text_optimized(node)
939
+
940
+ # Count list items in this list
941
+ item_count = 0
942
+ is_task_list = False
943
+ is_ordered = False
944
+
945
+ for child in node.children:
946
+ if child.type == "list_item":
947
+ item_count += 1
948
+ item_text = self._get_node_text_optimized(child)
949
+
950
+ # Check if it's a task list item
951
+ if "[ ]" in item_text or "[x]" in item_text or "[X]" in item_text:
952
+ is_task_list = True
953
+
954
+ # Check if it's an ordered list (starts with number)
955
+ if item_text.strip() and item_text.strip()[0].isdigit():
956
+ is_ordered = True
957
+
958
+ # Determine list type
959
+ if is_task_list:
960
+ list_type = "task"
961
+ element_type = "task_list"
962
+ elif is_ordered:
963
+ list_type = "ordered"
964
+ element_type = "list"
965
+ else:
966
+ list_type = "unordered"
967
+ element_type = "list"
968
+
969
+ name = f"{list_type.title()} List ({item_count} items)"
970
+
971
+ list_element = MarkdownElement(
972
+ name=name,
973
+ start_line=start_line,
974
+ end_line=end_line,
975
+ raw_text=raw_text,
976
+ element_type=element_type
977
+ )
978
+ # Add additional attributes for formatter
979
+ list_element.list_type = list_type
980
+ list_element.item_count = item_count
981
+ list_element.type = list_type
982
+ lists.append(list_element)
983
+ except Exception as e:
984
+ log_debug(f"Failed to extract list: {e}")
985
+
986
+ def _extract_pipe_tables(self, root_node: "tree_sitter.Node", tables: list[MarkdownElement]) -> None:
987
+ """Extract pipe tables"""
988
+ for node in self._traverse_nodes(root_node):
989
+ if node.type == "pipe_table":
990
+ try:
991
+ start_line = node.start_point[0] + 1
992
+ end_line = node.end_point[0] + 1
993
+ raw_text = self._get_node_text_optimized(node)
994
+
995
+ # Count rows and columns
996
+ lines = raw_text.strip().split("\n")
997
+ row_count = len([line for line in lines if line.strip() and not line.strip().startswith("|---")])
998
+
999
+ # Count columns from first row
1000
+ column_count = 0
1001
+ if lines:
1002
+ first_row = lines[0]
1003
+ column_count = len([col for col in first_row.split("|") if col.strip()])
1004
+
1005
+ table = MarkdownElement(
1006
+ name=f"Table ({row_count} rows, {column_count} columns)",
1007
+ start_line=start_line,
1008
+ end_line=end_line,
1009
+ raw_text=raw_text,
1010
+ element_type="table"
1011
+ )
1012
+ # Add additional attributes for formatter
1013
+ table.row_count = row_count
1014
+ table.column_count = column_count
1015
+ table.type = "table"
1016
+ tables.append(table)
1017
+ except Exception as e:
1018
+ log_debug(f"Failed to extract pipe table: {e}")
1019
+
1020
+ def _extract_block_quotes(self, root_node: "tree_sitter.Node", blockquotes: list[MarkdownElement]) -> None:
1021
+ """Extract blockquotes"""
1022
+ import re
1023
+
1024
+ # Blockquotes are often represented as paragraphs starting with >
1025
+ for node in self._traverse_nodes(root_node):
1026
+ if node.type == "block_quote":
1027
+ try:
1028
+ start_line = node.start_point[0] + 1
1029
+ end_line = node.end_point[0] + 1
1030
+ raw_text = self._get_node_text_optimized(node)
1031
+
1032
+ # Extract content without > markers
1033
+ lines = raw_text.strip().split("\n")
1034
+ content_lines = []
1035
+ for line in lines:
1036
+ # Remove > marker and optional space
1037
+ cleaned = re.sub(r'^>\s?', '', line)
1038
+ content_lines.append(cleaned)
1039
+ content = "\n".join(content_lines).strip()
1040
+
1041
+ blockquote = MarkdownElement(
1042
+ name=f"Blockquote: {content[:50]}..." if len(content) > 50 else f"Blockquote: {content}",
1043
+ start_line=start_line,
1044
+ end_line=end_line,
1045
+ raw_text=raw_text,
1046
+ element_type="blockquote"
1047
+ )
1048
+ blockquote.type = "blockquote"
1049
+ blockquote.text = content
1050
+ blockquotes.append(blockquote)
1051
+ except Exception as e:
1052
+ log_debug(f"Failed to extract blockquote: {e}")
1053
+
1054
+ def _extract_thematic_breaks(self, root_node: "tree_sitter.Node", horizontal_rules: list[MarkdownElement]) -> None:
1055
+ """Extract thematic breaks (horizontal rules)"""
1056
+ for node in self._traverse_nodes(root_node):
1057
+ if node.type == "thematic_break":
1058
+ try:
1059
+ start_line = node.start_point[0] + 1
1060
+ end_line = node.end_point[0] + 1
1061
+ raw_text = self._get_node_text_optimized(node)
1062
+
1063
+ hr = MarkdownElement(
1064
+ name="Horizontal Rule",
1065
+ start_line=start_line,
1066
+ end_line=end_line,
1067
+ raw_text=raw_text,
1068
+ element_type="horizontal_rule"
1069
+ )
1070
+ hr.type = "horizontal_rule"
1071
+ horizontal_rules.append(hr)
1072
+ except Exception as e:
1073
+ log_debug(f"Failed to extract horizontal rule: {e}")
1074
+
1075
+ def _extract_html_blocks(self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]) -> None:
1076
+ """Extract HTML block elements"""
1077
+ for node in self._traverse_nodes(root_node):
1078
+ if node.type == "html_block":
1079
+ try:
1080
+ start_line = node.start_point[0] + 1
1081
+ end_line = node.end_point[0] + 1
1082
+ raw_text = self._get_node_text_optimized(node)
1083
+
1084
+ # Extract tag name if possible
1085
+ import re
1086
+ tag_match = re.search(r'<(\w+)', raw_text)
1087
+ tag_name = tag_match.group(1) if tag_match else "HTML"
1088
+
1089
+ html_element = MarkdownElement(
1090
+ name=f"HTML Block: {tag_name}",
1091
+ start_line=start_line,
1092
+ end_line=end_line,
1093
+ raw_text=raw_text,
1094
+ element_type="html_block"
1095
+ )
1096
+ html_element.type = "html_block"
1097
+ html_elements.append(html_element)
1098
+ except Exception as e:
1099
+ log_debug(f"Failed to extract HTML block: {e}")
1100
+
1101
    def _extract_inline_html(self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]) -> None:
        """Extract inline HTML tags found inside ``inline`` nodes.

        Tags are found by regex on the raw text, not from the AST, so the
        reported line span is that of the whole inline node.
        """
        import re

        # Look for HTML tags in inline content
        for node in self._traverse_nodes(root_node):
            if node.type == "inline":
                try:
                    raw_text = self._get_node_text_optimized(node)
                    if not raw_text:
                        continue

                    # Pattern for HTML tags (excluding autolinks)
                    # Exclude autolink patterns: <url> or <email>
                    # The negative lookahead rejects a '<' whose content up to
                    # the closing '>' starts like a URL scheme or an e-mail.
                    html_pattern = r'<(?!(?:https?://|mailto:|[^@\s]+@[^@\s]+\.[^@\s]+)[^>]*>)[^>]+>'
                    matches = re.finditer(html_pattern, raw_text)

                    for match in matches:
                        tag_text = match.group(0)

                        # Extract tag name (closing tags like </b> fall back to "HTML")
                        tag_match = re.search(r'<(\w+)', tag_text)
                        tag_name = tag_match.group(1) if tag_match else "HTML"

                        # 0-based tree-sitter points -> 1-based line numbers.
                        start_line = node.start_point[0] + 1
                        end_line = node.end_point[0] + 1

                        html_element = MarkdownElement(
                            name=f"HTML Tag: {tag_name}",
                            start_line=start_line,
                            end_line=end_line,
                            raw_text=tag_text,
                            element_type="html_inline"
                        )
                        html_element.type = "html_inline"
                        # NOTE(review): this deliberately overwrites the
                        # "HTML Tag: ..." name passed to the constructor above;
                        # the formatter expects the bare tag name here.
                        html_element.name = tag_name  # Set name attribute for formatter
                        html_elements.append(html_element)

                except Exception as e:
                    log_debug(f"Failed to extract inline HTML: {e}")
+
1142
+ def _extract_emphasis_elements(self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]) -> None:
1143
+ """Extract emphasis and strong emphasis elements"""
1144
+ import re
1145
+
1146
+ for node in self._traverse_nodes(root_node):
1147
+ if node.type == "inline":
1148
+ try:
1149
+ raw_text = self._get_node_text_optimized(node)
1150
+ if not raw_text:
1151
+ continue
1152
+
1153
+ # Pattern for bold text: **text** or __text__
1154
+ bold_pattern = r'\*\*([^*]+)\*\*|__([^_]+)__'
1155
+ bold_matches = re.finditer(bold_pattern, raw_text)
1156
+
1157
+ for match in bold_matches:
1158
+ content = match.group(1) or match.group(2) or ""
1159
+ start_line = node.start_point[0] + 1
1160
+ end_line = node.end_point[0] + 1
1161
+
1162
+ bold_element = MarkdownElement(
1163
+ name=f"Bold: {content}",
1164
+ start_line=start_line,
1165
+ end_line=end_line,
1166
+ raw_text=match.group(0),
1167
+ element_type="strong_emphasis"
1168
+ )
1169
+ bold_element.type = "strong_emphasis"
1170
+ bold_element.text = content
1171
+ formatting_elements.append(bold_element)
1172
+
1173
+ # Pattern for italic text: *text* or _text_ (but not **text** or __text__)
1174
+ italic_pattern = r'(?<!\*)\*([^*]+)\*(?!\*)|(?<!_)_([^_]+)_(?!_)'
1175
+ italic_matches = re.finditer(italic_pattern, raw_text)
1176
+
1177
+ for match in italic_matches:
1178
+ content = match.group(1) or match.group(2) or ""
1179
+ start_line = node.start_point[0] + 1
1180
+ end_line = node.end_point[0] + 1
1181
+
1182
+ italic_element = MarkdownElement(
1183
+ name=f"Italic: {content}",
1184
+ start_line=start_line,
1185
+ end_line=end_line,
1186
+ raw_text=match.group(0),
1187
+ element_type="emphasis"
1188
+ )
1189
+ italic_element.type = "emphasis"
1190
+ italic_element.text = content
1191
+ formatting_elements.append(italic_element)
1192
+
1193
+ except Exception as e:
1194
+ log_debug(f"Failed to extract emphasis elements: {e}")
1195
+
1196
+ def _extract_inline_code_spans(self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]) -> None:
1197
+ """Extract inline code spans"""
1198
+ import re
1199
+
1200
+ for node in self._traverse_nodes(root_node):
1201
+ if node.type == "inline":
1202
+ try:
1203
+ raw_text = self._get_node_text_optimized(node)
1204
+ if not raw_text:
1205
+ continue
1206
+
1207
+ # Pattern for inline code: `code`
1208
+ code_pattern = r'`([^`]+)`'
1209
+ matches = re.finditer(code_pattern, raw_text)
1210
+
1211
+ for match in matches:
1212
+ content = match.group(1) or ""
1213
+ start_line = node.start_point[0] + 1
1214
+ end_line = node.end_point[0] + 1
1215
+
1216
+ code_element = MarkdownElement(
1217
+ name=f"Inline Code: {content}",
1218
+ start_line=start_line,
1219
+ end_line=end_line,
1220
+ raw_text=match.group(0),
1221
+ element_type="inline_code"
1222
+ )
1223
+ code_element.type = "inline_code"
1224
+ code_element.text = content
1225
+ formatting_elements.append(code_element)
1226
+
1227
+ except Exception as e:
1228
+ log_debug(f"Failed to extract inline code: {e}")
1229
+
1230
+ def _extract_strikethrough_elements(self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]) -> None:
1231
+ """Extract strikethrough elements"""
1232
+ import re
1233
+
1234
+ for node in self._traverse_nodes(root_node):
1235
+ if node.type == "inline":
1236
+ try:
1237
+ raw_text = self._get_node_text_optimized(node)
1238
+ if not raw_text:
1239
+ continue
1240
+
1241
+ # Pattern for strikethrough: ~~text~~
1242
+ strike_pattern = r'~~([^~]+)~~'
1243
+ matches = re.finditer(strike_pattern, raw_text)
1244
+
1245
+ for match in matches:
1246
+ content = match.group(1) or ""
1247
+ start_line = node.start_point[0] + 1
1248
+ end_line = node.end_point[0] + 1
1249
+
1250
+ strike_element = MarkdownElement(
1251
+ name=f"Strikethrough: {content}",
1252
+ start_line=start_line,
1253
+ end_line=end_line,
1254
+ raw_text=match.group(0),
1255
+ element_type="strikethrough"
1256
+ )
1257
+ strike_element.type = "strikethrough"
1258
+ strike_element.text = content
1259
+ formatting_elements.append(strike_element)
1260
+
1261
+ except Exception as e:
1262
+ log_debug(f"Failed to extract strikethrough: {e}")
1263
+
1264
    def _extract_footnote_elements(self, root_node: "tree_sitter.Node", footnotes: list[MarkdownElement]) -> None:
        """Extract footnote references and definitions into *footnotes*.

        References (``[^id]``) are found inside ``inline`` nodes; definitions
        (``[^id]: content``) are found in ``paragraph`` nodes.
        """
        import re

        for node in self._traverse_nodes(root_node):
            if node.type == "inline":
                try:
                    raw_text = self._get_node_text_optimized(node)
                    if not raw_text:
                        continue

                    # Pattern for footnote references: [^1]
                    footnote_ref_pattern = r'\[\^([^\]]+)\]'
                    matches = re.finditer(footnote_ref_pattern, raw_text)

                    for match in matches:
                        ref_id = match.group(1) or ""
                        # 0-based tree-sitter points -> 1-based line numbers.
                        start_line = node.start_point[0] + 1
                        end_line = node.end_point[0] + 1

                        footnote_element = MarkdownElement(
                            name=f"Footnote Reference: {ref_id}",
                            start_line=start_line,
                            end_line=end_line,
                            raw_text=match.group(0),
                            element_type="footnote_reference"
                        )
                        footnote_element.type = "footnote_reference"
                        footnote_element.text = ref_id
                        footnotes.append(footnote_element)

                except Exception as e:
                    log_debug(f"Failed to extract footnote reference: {e}")

            # Look for footnote definitions
            elif node.type == "paragraph":
                try:
                    raw_text = self._get_node_text_optimized(node)
                    if not raw_text:
                        continue

                    # Pattern for footnote definitions: [^1]: content
                    # re.match anchors at the start, so only a paragraph that
                    # begins with the definition marker qualifies.
                    footnote_def_pattern = r'^\[\^([^\]]+)\]:\s*(.+)$'
                    match = re.match(footnote_def_pattern, raw_text.strip(), re.MULTILINE)

                    if match:
                        ref_id = match.group(1) or ""
                        content = match.group(2) or ""
                        start_line = node.start_point[0] + 1
                        end_line = node.end_point[0] + 1

                        footnote_element = MarkdownElement(
                            name=f"Footnote Definition: {ref_id}",
                            start_line=start_line,
                            end_line=end_line,
                            raw_text=raw_text,
                            element_type="footnote_definition"
                        )
                        footnote_element.type = "footnote_definition"
                        footnote_element.text = content
                        footnotes.append(footnote_element)

                except Exception as e:
                    log_debug(f"Failed to extract footnote definition: {e}")
+
1329
+ def _traverse_nodes(self, node: "tree_sitter.Node"):
1330
+ """Traverse all nodes in the tree"""
1331
+ yield node
1332
+ for child in node.children:
1333
+ yield from self._traverse_nodes(child)
1334
+
1335
+ def _parse_link_components(self, raw_text: str) -> tuple[str, str, str]:
1336
+ """Parse link components from raw text"""
1337
+ import re
1338
+
1339
+ # Pattern for [text](url "title")
1340
+ pattern = r'\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
1341
+ match = re.search(pattern, raw_text)
1342
+
1343
+ if match:
1344
+ text = match.group(1) or ""
1345
+ url = match.group(2) or ""
1346
+ title = match.group(3) or ""
1347
+ return text, url, title
1348
+
1349
+ return "", "", ""
1350
+
1351
+ def _parse_image_components(self, raw_text: str) -> tuple[str, str, str]:
1352
+ """Parse image components from raw text"""
1353
+ import re
1354
+
1355
+ # Pattern for ![alt](url "title")
1356
+ pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
1357
+ match = re.search(pattern, raw_text)
1358
+
1359
+ if match:
1360
+ alt_text = match.group(1) or ""
1361
+ url = match.group(2) or ""
1362
+ title = match.group(3) or ""
1363
+ return alt_text, url, title
1364
+
1365
+ return "", "", ""
1366
+
1367
+
1368
class MarkdownPlugin(LanguagePlugin):
    """Markdown language plugin for the tree-sitter analyzer.

    Provides language metadata, a shared element extractor, and the
    analysis/query entry points for Markdown documents.
    """

    def __init__(self) -> None:
        """Initialize the Markdown plugin."""
        super().__init__()
        # Lazily populated tree-sitter Language handle; filled on first call
        # to get_tree_sitter_language().
        self._language_cache: tree_sitter.Language | None = None
        # Shared extractor instance returned by get_extractor().
        self._extractor: MarkdownElementExtractor = MarkdownElementExtractor()

        # Legacy compatibility attributes for tests
        self.language = "markdown"
        self.extractor = self._extractor
+
1381
+ def get_language_name(self) -> str:
1382
+ """Return the name of the programming language this plugin supports"""
1383
+ return "markdown"
1384
+
1385
+ def get_file_extensions(self) -> list[str]:
1386
+ """Return list of file extensions this plugin supports"""
1387
+ return [".md", ".markdown", ".mdown", ".mkd", ".mkdn", ".mdx"]
1388
+
1389
+ def create_extractor(self) -> ElementExtractor:
1390
+ """Create and return an element extractor for this language"""
1391
+ return MarkdownElementExtractor()
1392
+
1393
+ def get_extractor(self) -> ElementExtractor:
1394
+ """Get the cached extractor instance, creating it if necessary"""
1395
+ return self._extractor
1396
+
1397
+ def get_language(self) -> str:
1398
+ """Get the language name for Markdown (legacy compatibility)"""
1399
+ return "markdown"
1400
+
1401
+ def extract_functions(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1402
+ """Extract functions from the tree (legacy compatibility)"""
1403
+ extractor = self.get_extractor()
1404
+ return extractor.extract_functions(tree, source_code)
1405
+
1406
+ def extract_classes(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1407
+ """Extract classes from the tree (legacy compatibility)"""
1408
+ extractor = self.get_extractor()
1409
+ return extractor.extract_classes(tree, source_code)
1410
+
1411
+ def extract_variables(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1412
+ """Extract variables from the tree (legacy compatibility)"""
1413
+ extractor = self.get_extractor()
1414
+ return extractor.extract_variables(tree, source_code)
1415
+
1416
+ def extract_imports(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1417
+ """Extract imports from the tree (legacy compatibility)"""
1418
+ extractor = self.get_extractor()
1419
+ return extractor.extract_imports(tree, source_code)
1420
+
1421
    def get_tree_sitter_language(self) -> Optional["tree_sitter.Language"]:
        """Get the Tree-sitter language object for Markdown.

        The result is cached on the instance; on failure the cache stays
        ``None`` and ``None`` is returned (errors are logged, not raised).
        """
        if self._language_cache is None:
            try:
                import tree_sitter
                import tree_sitter_markdown as tsmarkdown

                # Support for newer versions of tree-sitter-markdown
                try:
                    # New API (0.3.1+): language() returns a PyCapsule that
                    # must be wrapped in a tree_sitter.Language.
                    language_capsule = tsmarkdown.language()
                    self._language_cache = tree_sitter.Language(language_capsule)
                except (AttributeError, TypeError):
                    # For older API or different format
                    try:
                        # Get Language object directly (older bindings return
                        # a ready-to-use Language instance).
                        self._language_cache = tsmarkdown.language()
                    except Exception:
                        # Last resort: get directly from module
                        if hasattr(tsmarkdown, 'LANGUAGE'):
                            self._language_cache = tree_sitter.Language(tsmarkdown.LANGUAGE)
                        else:
                            raise ImportError("Cannot access markdown language")
            except ImportError:
                log_error("tree-sitter-markdown not available")
                return None
            except Exception as e:
                log_error(f"Failed to load Markdown language: {e}")
                return None
        return self._language_cache
+
1452
+ def get_supported_queries(self) -> list[str]:
1453
+ """Get list of supported query names for this language"""
1454
+ return [
1455
+ "headers",
1456
+ "code_blocks",
1457
+ "links",
1458
+ "images",
1459
+ "lists",
1460
+ "tables",
1461
+ "blockquotes",
1462
+ "emphasis",
1463
+ "inline_code",
1464
+ "references",
1465
+ "task_lists",
1466
+ "horizontal_rules",
1467
+ "html_blocks",
1468
+ "strikethrough",
1469
+ "footnotes",
1470
+ "text_content",
1471
+ "all_elements",
1472
+ ]
1473
+
1474
+ def is_applicable(self, file_path: str) -> bool:
1475
+ """Check if this plugin is applicable for the given file"""
1476
+ return any(
1477
+ file_path.lower().endswith(ext.lower())
1478
+ for ext in self.get_file_extensions()
1479
+ )
1480
+
1481
+ def get_plugin_info(self) -> dict:
1482
+ """Get information about this plugin"""
1483
+ return {
1484
+ "name": "Markdown Plugin",
1485
+ "language": self.get_language_name(),
1486
+ "extensions": self.get_file_extensions(),
1487
+ "version": "1.0.0",
1488
+ "supported_queries": self.get_supported_queries(),
1489
+ "features": [
1490
+ "ATX headers (# ## ###)",
1491
+ "Setext headers (underlined)",
1492
+ "Fenced code blocks",
1493
+ "Indented code blocks",
1494
+ "Inline code spans",
1495
+ "Inline links",
1496
+ "Reference links",
1497
+ "Autolinks",
1498
+ "Email autolinks",
1499
+ "Images (inline and reference)",
1500
+ "Lists (ordered and unordered)",
1501
+ "Task lists (checkboxes)",
1502
+ "Blockquotes",
1503
+ "Tables",
1504
+ "Emphasis and strong emphasis",
1505
+ "Strikethrough text",
1506
+ "Horizontal rules",
1507
+ "HTML blocks and inline HTML",
1508
+ "Footnotes (references and definitions)",
1509
+ "Reference definitions",
1510
+ "Text formatting extraction",
1511
+ "CommonMark compliance",
1512
+ ],
1513
+ }
1514
+
1515
    async def analyze_file(
        self, file_path: str, request: AnalysisRequest
    ) -> AnalysisResult:
        """Analyze a Markdown file and return the analysis results.

        Reads the file as UTF-8, parses it with tree-sitter, runs every
        extractor category, and returns an AnalysisResult. Failures are
        reported via ``success=False`` with an error message, never raised.
        """
        # Guard: the tree_sitter package itself may be missing.
        if not TREE_SITTER_AVAILABLE:
            return AnalysisResult(
                file_path=file_path,
                language=self.get_language_name(),
                success=False,
                error_message="Tree-sitter library not available.",
            )

        # Guard: the markdown grammar may fail to load even if tree_sitter exists.
        language = self.get_tree_sitter_language()
        if not language:
            return AnalysisResult(
                file_path=file_path,
                language=self.get_language_name(),
                success=False,
                error_message="Could not load Markdown language for parsing.",
            )

        try:
            with open(file_path, encoding="utf-8") as f:
                source_code = f.read()

            parser = tree_sitter.Parser()
            parser.language = language
            tree = parser.parse(bytes(source_code, "utf8"))

            # A fresh extractor per analysis keeps per-file state isolated.
            extractor = self.create_extractor()
            extractor.current_file = file_path  # Set current file for context

            elements: list[CodeElement] = []

            # Extract all element types
            headers = extractor.extract_headers(tree, source_code)
            code_blocks = extractor.extract_code_blocks(tree, source_code)
            links = extractor.extract_links(tree, source_code)
            images = extractor.extract_images(tree, source_code)
            references = extractor.extract_references(tree, source_code)
            lists = extractor.extract_lists(tree, source_code)
            tables = extractor.extract_tables(tree, source_code)

            # Extract new element types
            blockquotes = extractor.extract_blockquotes(tree, source_code)
            horizontal_rules = extractor.extract_horizontal_rules(tree, source_code)
            html_elements = extractor.extract_html_elements(tree, source_code)
            text_formatting = extractor.extract_text_formatting(tree, source_code)
            footnotes = extractor.extract_footnotes(tree, source_code)

            # Order matters: downstream formatters rely on this grouping.
            elements.extend(headers)
            elements.extend(code_blocks)
            elements.extend(links)
            elements.extend(images)
            elements.extend(references)
            elements.extend(lists)
            elements.extend(tables)
            elements.extend(blockquotes)
            elements.extend(horizontal_rules)
            elements.extend(html_elements)
            elements.extend(text_formatting)
            elements.extend(footnotes)

            # Recursively count every node in the syntax tree for metrics.
            def count_nodes(node: "tree_sitter.Node") -> int:
                count = 1
                for child in node.children:
                    count += count_nodes(child)
                return count

            return AnalysisResult(
                file_path=file_path,
                language=self.get_language_name(),
                success=True,
                elements=elements,
                line_count=len(source_code.splitlines()),
                node_count=count_nodes(tree.root_node),
            )
        except Exception as e:
            log_error(f"Error analyzing Markdown file {file_path}: {e}")
            return AnalysisResult(
                file_path=file_path,
                language=self.get_language_name(),
                success=False,
                error_message=str(e),
            )
+
1601
    def execute_query(self, tree: "tree_sitter.Tree", query_name: str) -> dict:
        """Execute a specific query on the tree.

        Returns a dict with ``captures``, ``query`` and ``matches`` on
        success, or ``{"error": ...}`` on failure.
        """
        try:
            import tree_sitter

            language = self.get_tree_sitter_language()
            if not language:
                return {"error": "Language not available"}

            # Import query definitions
            from ..queries.markdown import get_query

            try:
                query_string = get_query(query_name)
            except KeyError:
                return {"error": f"Unknown query: {query_name}"}

            # Use new tree-sitter 0.25.x API
            # NOTE(review): the Query object is constructed (which validates
            # the query string against the grammar) but never executed below —
            # matching is done by the manual walk instead.
            query = tree_sitter.Query(language, query_string)

            # Execute query using the new API
            # In tree-sitter 0.25.x, we need to use a different approach
            matches = []
            captures = []

            # Walk through the tree and find matches manually
            def walk_tree(node):
                # This is a simplified approach - in practice, you'd want to use
                # the proper query execution method when it becomes available
                # Only three query names are handled here; others return zero
                # matches even though the query string itself was valid.
                if query_name == "headers" and node.type in ["atx_heading", "setext_heading"]:
                    matches.append(node)
                elif query_name == "code_blocks" and node.type in ["fenced_code_block", "indented_code_block"]:
                    matches.append(node)
                elif query_name == "links" and node.type in ["link", "autolink", "reference_link"]:
                    matches.append(node)

                for child in node.children:
                    walk_tree(child)

            walk_tree(tree.root_node)

            # Convert matches to capture format
            for match in matches:
                captures.append((match, query_name))

            return {"captures": captures, "query": query_string, "matches": len(matches)}

        except Exception as e:
            log_error(f"Query execution failed: {e}")
            return {"error": str(e)}
+
1652
+ def extract_elements(self, tree: "tree_sitter.Tree", source_code: str) -> list:
1653
+ """Extract elements from source code using tree-sitter AST"""
1654
+ extractor = self.get_extractor()
1655
+ elements = []
1656
+
1657
+ try:
1658
+ elements.extend(extractor.extract_headers(tree, source_code))
1659
+ elements.extend(extractor.extract_code_blocks(tree, source_code))
1660
+ elements.extend(extractor.extract_links(tree, source_code))
1661
+ elements.extend(extractor.extract_images(tree, source_code))
1662
+ elements.extend(extractor.extract_references(tree, source_code))
1663
+ elements.extend(extractor.extract_lists(tree, source_code))
1664
+ elements.extend(extractor.extract_tables(tree, source_code))
1665
+ elements.extend(extractor.extract_blockquotes(tree, source_code))
1666
+ elements.extend(extractor.extract_horizontal_rules(tree, source_code))
1667
+ elements.extend(extractor.extract_html_elements(tree, source_code))
1668
+ elements.extend(extractor.extract_text_formatting(tree, source_code))
1669
+ elements.extend(extractor.extract_footnotes(tree, source_code))
1670
+ except Exception as e:
1671
+ log_error(f"Failed to extract elements: {e}")
1672
+
1673
+ return elements