tree-sitter-analyzer 1.8.4__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tree-sitter-analyzer might be problematic.
- tree_sitter_analyzer/__init__.py +1 -1
- tree_sitter_analyzer/api.py +4 -4
- tree_sitter_analyzer/cli/argument_validator.py +29 -17
- tree_sitter_analyzer/cli/commands/advanced_command.py +7 -5
- tree_sitter_analyzer/cli/commands/structure_command.py +7 -5
- tree_sitter_analyzer/cli/commands/summary_command.py +10 -6
- tree_sitter_analyzer/cli/commands/table_command.py +8 -7
- tree_sitter_analyzer/cli/info_commands.py +1 -1
- tree_sitter_analyzer/cli_main.py +3 -2
- tree_sitter_analyzer/core/analysis_engine.py +5 -5
- tree_sitter_analyzer/core/cache_service.py +3 -1
- tree_sitter_analyzer/core/query.py +17 -5
- tree_sitter_analyzer/core/query_service.py +1 -1
- tree_sitter_analyzer/encoding_utils.py +3 -3
- tree_sitter_analyzer/exceptions.py +61 -50
- tree_sitter_analyzer/file_handler.py +3 -0
- tree_sitter_analyzer/formatters/base_formatter.py +10 -5
- tree_sitter_analyzer/formatters/formatter_registry.py +83 -68
- tree_sitter_analyzer/formatters/html_formatter.py +90 -64
- tree_sitter_analyzer/formatters/javascript_formatter.py +21 -16
- tree_sitter_analyzer/formatters/language_formatter_factory.py +7 -6
- tree_sitter_analyzer/formatters/markdown_formatter.py +247 -124
- tree_sitter_analyzer/formatters/python_formatter.py +61 -38
- tree_sitter_analyzer/formatters/typescript_formatter.py +113 -45
- tree_sitter_analyzer/interfaces/mcp_server.py +2 -2
- tree_sitter_analyzer/language_detector.py +6 -6
- tree_sitter_analyzer/language_loader.py +3 -1
- tree_sitter_analyzer/languages/css_plugin.py +120 -61
- tree_sitter_analyzer/languages/html_plugin.py +159 -62
- tree_sitter_analyzer/languages/java_plugin.py +42 -34
- tree_sitter_analyzer/languages/javascript_plugin.py +59 -30
- tree_sitter_analyzer/languages/markdown_plugin.py +402 -368
- tree_sitter_analyzer/languages/python_plugin.py +111 -64
- tree_sitter_analyzer/languages/typescript_plugin.py +241 -132
- tree_sitter_analyzer/mcp/server.py +22 -18
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +13 -8
- tree_sitter_analyzer/mcp/tools/base_tool.py +2 -2
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +232 -26
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +31 -23
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +21 -19
- tree_sitter_analyzer/mcp/tools/query_tool.py +17 -18
- tree_sitter_analyzer/mcp/tools/read_partial_tool.py +30 -31
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +131 -77
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +29 -16
- tree_sitter_analyzer/mcp/utils/file_output_factory.py +64 -51
- tree_sitter_analyzer/mcp/utils/file_output_manager.py +34 -24
- tree_sitter_analyzer/mcp/utils/gitignore_detector.py +8 -4
- tree_sitter_analyzer/models.py +7 -5
- tree_sitter_analyzer/plugins/base.py +9 -7
- tree_sitter_analyzer/plugins/manager.py +1 -0
- tree_sitter_analyzer/queries/css.py +2 -21
- tree_sitter_analyzer/queries/html.py +2 -15
- tree_sitter_analyzer/queries/markdown.py +30 -41
- tree_sitter_analyzer/queries/python.py +20 -5
- tree_sitter_analyzer/query_loader.py +5 -5
- tree_sitter_analyzer/security/validator.py +114 -86
- tree_sitter_analyzer/utils/__init__.py +58 -28
- tree_sitter_analyzer/utils/tree_sitter_compat.py +72 -65
- tree_sitter_analyzer/utils.py +26 -15
- {tree_sitter_analyzer-1.8.4.dist-info → tree_sitter_analyzer-1.9.1.dist-info}/METADATA +23 -6
- tree_sitter_analyzer-1.9.1.dist-info/RECORD +109 -0
- tree_sitter_analyzer-1.8.4.dist-info/RECORD +0 -109
- {tree_sitter_analyzer-1.8.4.dist-info → tree_sitter_analyzer-1.9.1.dist-info}/WHEEL +0 -0
- {tree_sitter_analyzer-1.8.4.dist-info → tree_sitter_analyzer-1.9.1.dist-info}/entry_points.txt +0 -0
--- a/tree_sitter_analyzer/languages/markdown_plugin.py
+++ b/tree_sitter_analyzer/languages/markdown_plugin.py
@@ -24,12 +24,12 @@ from ..encoding_utils import extract_text_slice, safe_encode
 from ..models import AnalysisResult, CodeElement
 from ..plugins.base import ElementExtractor, LanguagePlugin
 from ..utils import log_debug, log_error, log_warning
-from ..utils.tree_sitter_compat import TreeSitterQueryCompat
+from ..utils.tree_sitter_compat import TreeSitterQueryCompat
 
 
 class MarkdownElement(CodeElement):
     """Markdown-specific code element"""
-
+
     def __init__(
         self,
         name: str,
@@ -38,13 +38,13 @@ class MarkdownElement(CodeElement):
         raw_text: str,
         language: str = "markdown",
         element_type: str = "markdown",
-        level:
-        url:
-        alt_text:
-        title:
-        language_info:
-        is_checked:
-        **kwargs
+        level: int | None = None,
+        url: str | None = None,
+        alt_text: str | None = None,
+        title: str | None = None,
+        language_info: str | None = None,
+        is_checked: bool | None = None,
+        **kwargs,
     ):
         super().__init__(
             name=name,
@@ -52,7 +52,7 @@ class MarkdownElement(CodeElement):
             end_line=end_line,
             raw_text=raw_text,
             language=language,
-            **kwargs
+            **kwargs,
         )
         self.element_type = element_type
         self.level = level  # For headers (1-6)
@@ -172,15 +172,15 @@ class MarkdownElementExtractor(ElementExtractor):
         try:
             # Track extracted links to prevent global duplicates (ensure reset)
             self._extracted_links = set()
-
+
             self._extract_inline_links(tree.root_node, links)
             self._extract_reference_links(tree.root_node, links)
             self._extract_autolinks(tree.root_node, links)
-
+
             # Clean up after extraction is complete
-            if hasattr(self,
-                delattr(self,
-
+            if hasattr(self, "_extracted_links"):
+                delattr(self, "_extracted_links")
+
         except Exception as e:
             log_debug(f"Error during link extraction: {e}")
             return []
@@ -189,7 +189,7 @@ class MarkdownElementExtractor(ElementExtractor):
         seen = set()
         unique_links = []
         for link in links:
-            key = (getattr(link,
+            key = (getattr(link, "text", "") or "", getattr(link, "url", "") or "")
             if key not in seen:
                 seen.add(key)
                 unique_links.append(link)
@@ -292,7 +292,9 @@ class MarkdownElementExtractor(ElementExtractor):
         horizontal_rules: list[MarkdownElement] = []
 
         if tree is None or tree.root_node is None:
-            log_debug(
+            log_debug(
+                "Tree or root_node is None, returning empty horizontal rules list"
+            )
             return horizontal_rules
 
         try:
@@ -339,7 +341,9 @@ class MarkdownElementExtractor(ElementExtractor):
         formatting_elements: list[MarkdownElement] = []
 
         if tree is None or tree.root_node is None:
-            log_debug(
+            log_debug(
+                "Tree or root_node is None, returning empty formatting elements list"
+            )
             return formatting_elements
 
         try:
@@ -454,10 +458,10 @@ class MarkdownElementExtractor(ElementExtractor):
         start_point = node.start_point
         end_point = node.end_point
 
-        if
+        if start_point[0] < 0 or start_point[0] >= len(self.content_lines):
             return ""
-
-        if
+
+        if end_point[0] < 0 or end_point[0] >= len(self.content_lines):
             return ""
 
         if start_point[0] == end_point[0]:
@@ -469,7 +473,9 @@ class MarkdownElementExtractor(ElementExtractor):
             return result
         else:
             lines = []
-            for i in range(
+            for i in range(
+                start_point[0], min(end_point[0] + 1, len(self.content_lines))
+            ):
                 if i < len(self.content_lines):
                     line = self.content_lines[i]
                     if i == start_point[0] and i == end_point[0]:
@@ -492,7 +498,9 @@ class MarkdownElementExtractor(ElementExtractor):
             log_error(f"Fallback text extraction also failed: {fallback_error}")
             return ""
 
-    def _extract_atx_headers(
+    def _extract_atx_headers(
+        self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]
+    ) -> None:
         """Extract ATX-style headers (# ## ### etc.)"""
         for node in self._traverse_nodes(root_node):
             if node.type == "atx_heading":
@@ -500,23 +508,23 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Extract header level and content
                     level = 1
                     content = raw_text.strip()
-
+
                     # Count # symbols to determine level
                     if content.startswith("#"):
                         level = len(content) - len(content.lstrip("#"))
                         content = content.lstrip("# ").rstrip()
-
+
                     header = MarkdownElement(
                         name=content or f"Header Level {level}",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
                         element_type="heading",
-                        level=level
+                        level=level,
                     )
                     # Add additional attributes for formatter
                     header.text = content or f"Header Level {level}"
@@ -525,7 +533,9 @@ class MarkdownElementExtractor(ElementExtractor):
                 except Exception as e:
                     log_debug(f"Failed to extract ATX header: {e}")
 
-    def _extract_setext_headers(
+    def _extract_setext_headers(
+        self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]
+    ) -> None:
         """Extract Setext-style headers (underlined)"""
         for node in self._traverse_nodes(root_node):
             if node.type == "setext_heading":
@@ -533,7 +543,7 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Determine level based on underline character
                     level = 2  # Default to H2
                     lines = raw_text.strip().split("\n")
@@ -546,14 +556,14 @@ class MarkdownElementExtractor(ElementExtractor):
                         content = lines[0].strip()
                     else:
                         content = raw_text.strip()
-
+
                     header = MarkdownElement(
                         name=content or f"Header Level {level}",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
                         element_type="heading",
-                        level=level
+                        level=level,
                     )
                     # Add additional attributes for formatter
                     header.text = content or f"Header Level {level}"
@@ -562,7 +572,9 @@ class MarkdownElementExtractor(ElementExtractor):
                 except Exception as e:
                     log_debug(f"Failed to extract Setext header: {e}")
 
-    def _extract_fenced_code_blocks(
+    def _extract_fenced_code_blocks(
+        self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]
+    ) -> None:
         """Extract fenced code blocks"""
         for node in self._traverse_nodes(root_node):
             if node.type == "fenced_code_block":
@@ -570,13 +582,13 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Extract language info
                     language_info = None
                     lines = raw_text.strip().split("\n")
                     if lines and lines[0].startswith("```"):
                         language_info = lines[0][3:].strip()
-
+
                     # Extract content (excluding fence markers)
                     content_lines = []
                     in_content = False
@@ -589,17 +601,16 @@ class MarkdownElementExtractor(ElementExtractor):
                             break
                         if in_content:
                             content_lines.append(line)
-
-                    content = "\n".join(content_lines)
+
                     name = f"Code Block ({language_info or 'unknown'})"
-
+
                     code_block = MarkdownElement(
                         name=name,
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
                         element_type="code_block",
-                        language_info=language_info
+                        language_info=language_info,
                     )
                     # Add additional attributes for formatter
                     code_block.language = language_info or "text"
@@ -609,7 +620,9 @@ class MarkdownElementExtractor(ElementExtractor):
                 except Exception as e:
                     log_debug(f"Failed to extract fenced code block: {e}")
 
-    def _extract_indented_code_blocks(
+    def _extract_indented_code_blocks(
+        self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]
+    ) -> None:
         """Extract indented code blocks"""
         for node in self._traverse_nodes(root_node):
             if node.type == "indented_code_block":
@@ -617,14 +630,14 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     code_block = MarkdownElement(
                         name="Indented Code Block",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
                         element_type="code_block",
-                        language_info="indented"
+                        language_info="indented",
                     )
                     # Add additional attributes for formatter
                     code_block.language = "text"
@@ -634,10 +647,12 @@ class MarkdownElementExtractor(ElementExtractor):
                 except Exception as e:
                     log_debug(f"Failed to extract indented code block: {e}")
 
-    def _extract_inline_links(
+    def _extract_inline_links(
+        self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
+    ) -> None:
         """Extract inline links"""
         import re
-
+
         # Extract links from text within inline nodes using regular expressions
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
@@ -645,27 +660,30 @@ class MarkdownElementExtractor(ElementExtractor):
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Inline link pattern: [text](url "title") (excluding images)
                     inline_pattern = r'(?<!\!)\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
                     matches = re.finditer(inline_pattern, raw_text)
-
+
                     for match in matches:
                         text = match.group(1) or ""
                         url = match.group(2) or ""
                         title = match.group(3) or ""
-
+
                         # Global duplicate check: process same text and URL combination only once
                         link_signature = f"{text}|{url}"
-                        if
+                        if (
+                            hasattr(self, "_extracted_links")
+                            and link_signature in self._extracted_links
+                        ):
                             continue
-
-                        if hasattr(self,
+
+                        if hasattr(self, "_extracted_links"):
                             self._extracted_links.add(link_signature)
-
+
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         link = MarkdownElement(
                             name=text or "Link",
                             start_line=start_line,
@@ -673,72 +691,76 @@ class MarkdownElementExtractor(ElementExtractor):
                             raw_text=match.group(0),
                             element_type="link",
                             url=url,
-                            title=title
+                            title=title,
                         )
                         # Add additional attributes for formatter
                         link.text = text or "Link"
                         link.type = "link"
                         links.append(link)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract inline link: {e}")
 
-    def _extract_reference_links(
+    def _extract_reference_links(
+        self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
+    ) -> None:
         """Extract reference links"""
         import re
-
+
         # Reference links also need to be extracted from inline nodes
         # Track already processed reference links to avoid duplicates
         processed_ref_links = set()
-
+
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
                 try:
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Reference link pattern: [text][ref]
-                    ref_pattern = r
+                    ref_pattern = r"\[([^\]]*)\]\[([^\]]*)\]"
                     matches = re.finditer(ref_pattern, raw_text)
-
+
                     for match in matches:
                         text = match.group(1) or ""
                         ref = match.group(2) or ""
-
+
                         # Skip image references (starting with !)
-                        if match.start() > 0 and raw_text[match.start()-1] ==
+                        if match.start() > 0 and raw_text[match.start() - 1] == "!":
                             continue
-
+
                         # Duplicate check: process same text and reference combination only once
                         start_line = node.start_point[0] + 1
                         ref_link_key = (text, ref, start_line)
-
+
                         if ref_link_key in processed_ref_links:
                             continue
                         processed_ref_links.add(ref_link_key)
-
+
                         end_line = node.end_point[0] + 1
-
+
                         link = MarkdownElement(
                             name=text or "Reference Link",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=match.group(0),
-                            element_type="reference_link"
+                            element_type="reference_link",
                         )
                         # Add additional attributes for formatter
                         link.text = text or "Reference Link"
                         link.type = "reference_link"
                         links.append(link)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract reference link: {e}")
 
-    def _extract_autolinks(
+    def _extract_autolinks(
+        self, root_node: "tree_sitter.Node", links: list[MarkdownElement]
+    ) -> None:
         """Extract autolinks"""
         import re
-
+
         # Extract autolinks from text within inline nodes using regular expressions
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
@@ -746,46 +768,53 @@ class MarkdownElementExtractor(ElementExtractor):
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Autolink pattern: <url> or <email>
-                    autolink_pattern =
+                    autolink_pattern = (
+                        r"<(https?://[^>]+|mailto:[^>]+|[^@\s]+@[^@\s]+\.[^@\s]+)>"
+                    )
                     matches = re.finditer(autolink_pattern, raw_text)
-
+
                     for match in matches:
                         url = match.group(1) or ""
                         full_match = match.group(0)
-
+
                         # Global duplicate check: process same URL for autolinks only once
                         autolink_signature = f"autolink|{url}"
-                        if
+                        if (
+                            hasattr(self, "_extracted_links")
+                            and autolink_signature in self._extracted_links
+                        ):
                             continue
-
-                        if hasattr(self,
+
+                        if hasattr(self, "_extracted_links"):
                             self._extracted_links.add(autolink_signature)
-
+
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         link = MarkdownElement(
                             name=url or "Autolink",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=full_match,
                             element_type="autolink",
-                            url=url
+                            url=url,
                         )
                         # Add additional attributes for formatter
                         link.text = url or "Autolink"
                         link.type = "autolink"
                         links.append(link)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract autolink: {e}")
 
-    def _extract_inline_images(
+    def _extract_inline_images(
+        self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
+    ) -> None:
         """Extract inline images"""
         import re
-
+
         # Extract images from text within inline nodes using regular expressions
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
@@ -793,20 +822,20 @@ class MarkdownElementExtractor(ElementExtractor):
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Inline image pattern: ![alt](url "title")
                     image_pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
                     matches = re.finditer(image_pattern, raw_text)
-
+
                     for match in matches:
                         alt_text = match.group(1) or ""
                         url = match.group(2) or ""
                         title = match.group(3) or ""
-
+
                         # Calculate line number from matched position
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         image = MarkdownElement(
                             name=alt_text or "Image",
                             start_line=start_line,
@@ -815,20 +844,22 @@ class MarkdownElementExtractor(ElementExtractor):
                             element_type="image",
                             url=url,
                             alt_text=alt_text,
-                            title=title
+                            title=title,
                         )
                         # Add additional attributes for formatter
                         image.alt = alt_text or ""
                         image.type = "image"
                         images.append(image)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract inline image: {e}")
 
-    def _extract_reference_images(
+    def _extract_reference_images(
+        self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
+    ) -> None:
         """Extract reference images"""
         import re
-
+
         # Reference images also need to be extracted from inline nodes
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
@@ -836,40 +867,40 @@ class MarkdownElementExtractor(ElementExtractor):
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Reference image pattern: ![alt][ref]
-                    ref_image_pattern = r
+                    ref_image_pattern = r"!\[([^\]]*)\]\[([^\]]*)\]"
                     matches = re.finditer(ref_image_pattern, raw_text)
-
+
                     for match in matches:
                         alt_text = match.group(1) or ""
-                        ref = match.group(2) or ""
-
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         image = MarkdownElement(
                             name=alt_text or "Reference Image",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=match.group(0),
-                            element_type="reference_image"
+                            element_type="reference_image",
                         )
                         # Add additional attributes for formatter
                         image.alt = alt_text or ""
                         image.type = "reference_image"
                         images.append(image)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract reference image: {e}")
 
-    def _extract_image_reference_definitions(
+    def _extract_image_reference_definitions(
+        self, root_node: "tree_sitter.Node", images: list[MarkdownElement]
+    ) -> None:
         """Extract image reference definitions"""
         import re
-
+
         # Extract all reference definitions that could be used for images
         # We check if the URL points to an image file or if it's used by an image reference
-
+
         # First, collect all image references used in the document
         image_refs_used = set()
         for node in self._traverse_nodes(root_node):
@@ -878,19 +909,19 @@ class MarkdownElementExtractor(ElementExtractor):
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Find image references: ![alt][ref]
-                    ref_image_pattern = r
+                    ref_image_pattern = r"!\[([^\]]*)\]\[([^\]]*)\]"
                     matches = re.finditer(ref_image_pattern, raw_text)
-
+
                     for match in matches:
                         ref = match.group(2) or ""
                         if ref:
                             image_refs_used.add(ref.lower())
-
+
                 except Exception as e:
                     log_debug(f"Failed to scan for image references: {e}")
-
+
         # Now extract reference definitions that are used by images OR point to image files
         for node in self._traverse_nodes(root_node):
             if node.type == "link_reference_definition":
@@ -898,20 +929,31 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Pattern: [label]: url "title"
                     ref_pattern = r'^\[([^\]]+)\]:\s*([^\s]+)(?:\s+"([^"]*)")?'
                     match = re.match(ref_pattern, raw_text.strip())
-
+
                     if match:
                         label = match.group(1) or ""
                         url = match.group(2) or ""
                         title = match.group(3) or ""
-
+
                         # Include if this reference is used by an image OR if URL looks like an image
                         is_used_by_image = label.lower() in image_refs_used
-                        is_image_url = any(
-
+                        is_image_url = any(
+                            url.lower().endswith(ext)
+                            for ext in [
+                                ".png",
+                                ".jpg",
+                                ".jpeg",
+                                ".gif",
+                                ".svg",
+                                ".webp",
+                                ".bmp",
+                            ]
+                        )
+
                         if is_used_by_image or is_image_url:
                             image_ref = MarkdownElement(
                                 name=f"Image Reference Definition: {label}",
@@ -921,17 +963,19 @@ class MarkdownElementExtractor(ElementExtractor):
                                 element_type="image_reference_definition",
                                 url=url,
                                 alt_text=label,
-                                title=title
+                                title=title,
                             )
                             # Add additional attributes for formatter
                             image_ref.alt = label
                             image_ref.type = "image_reference_definition"
                             images.append(image_ref)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract image reference definition: {e}")
 
-    def _extract_link_reference_definitions(
+    def _extract_link_reference_definitions(
+        self, root_node: "tree_sitter.Node", references: list[MarkdownElement]
+    ) -> None:
         """Extract link reference definitions"""
         for node in self._traverse_nodes(root_node):
             if node.type == "link_reference_definition":
@@ -939,19 +983,21 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     reference = MarkdownElement(
                         name=raw_text or "Reference Definition",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
-                        element_type="reference_definition"
+                        element_type="reference_definition",
                     )
                     references.append(reference)
                 except Exception as e:
                     log_debug(f"Failed to extract reference definition: {e}")
 
-    def _extract_list_items(
+    def _extract_list_items(
+        self, root_node: "tree_sitter.Node", lists: list[MarkdownElement]
+    ) -> None:
         """Extract lists (not individual items)"""
         for node in self._traverse_nodes(root_node):
             if node.type == "list":
@@ -959,25 +1005,29 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Count list items in this list
                     item_count = 0
                     is_task_list = False
                     is_ordered = False
-
+
                     for child in node.children:
                         if child.type == "list_item":
                             item_count += 1
                             item_text = self._get_node_text_optimized(child)
-
+
                             # Check if it's a task list item
-                            if
+                            if (
+                                "[ ]" in item_text
+                                or "[x]" in item_text
+                                or "[X]" in item_text
+                            ):
                                 is_task_list = True
-
+
                             # Check if it's an ordered list (starts with number)
                             if item_text.strip() and item_text.strip()[0].isdigit():
                                 is_ordered = True
-
+
                     # Determine list type
                     if is_task_list:
                         list_type = "task"
@@ -988,15 +1038,15 @@ class MarkdownElementExtractor(ElementExtractor):
                     else:
                         list_type = "unordered"
                         element_type = "list"
-
+
                     name = f"{list_type.title()} List ({item_count} items)"
-
+
                     list_element = MarkdownElement(
                         name=name,
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
-                        element_type=element_type
+                        element_type=element_type,
                     )
                     # Add additional attributes for formatter
                     list_element.list_type = list_type
@@ -1006,7 +1056,9 @@ class MarkdownElementExtractor(ElementExtractor):
                 except Exception as e:
                     log_debug(f"Failed to extract list: {e}")
 
-    def _extract_pipe_tables(
+    def _extract_pipe_tables(
+        self, root_node: "tree_sitter.Node", tables: list[MarkdownElement]
+    ) -> None:
         """Extract pipe tables"""
         for node in self._traverse_nodes(root_node):
             if node.type == "pipe_table":
@@ -1014,23 +1066,31 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Count rows and columns
                     lines = raw_text.strip().split("\n")
-                    row_count = len(
-
+                    row_count = len(
+                        [
+                            line
+                            for line in lines
+                            if line.strip() and not line.strip().startswith("|---")
+                        ]
+                    )
+
                     # Count columns from first row
                     column_count = 0
                     if lines:
                         first_row = lines[0]
-                        column_count = len(
-
+                        column_count = len(
+                            [col for col in first_row.split("|") if col.strip()]
+                        )
+
                     table = MarkdownElement(
                         name=f"Table ({row_count} rows, {column_count} columns)",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
-                        element_type="table"
+                        element_type="table",
                     )
                     # Add additional attributes for formatter
                     table.row_count = row_count
@@ -1040,10 +1100,12 @@ class MarkdownElementExtractor(ElementExtractor):
                 except Exception as e:
                     log_debug(f"Failed to extract pipe table: {e}")
 
-    def _extract_block_quotes(
+    def _extract_block_quotes(
+        self, root_node: "tree_sitter.Node", blockquotes: list[MarkdownElement]
+    ) -> None:
         """Extract blockquotes"""
         import re
-
+
         # Blockquotes are often represented as paragraphs starting with >
         for node in self._traverse_nodes(root_node):
             if node.type == "block_quote":
@@ -1051,22 +1113,24 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Extract content without > markers
                     lines = raw_text.strip().split("\n")
                     content_lines = []
                     for line in lines:
                         # Remove > marker and optional space
-                        cleaned = re.sub(r
+                        cleaned = re.sub(r"^>\s?", "", line)
                         content_lines.append(cleaned)
                     content = "\n".join(content_lines).strip()
-
+
                     blockquote = MarkdownElement(
-                        name=f"Blockquote: {content[:50]}..."
+                        name=f"Blockquote: {content[:50]}..."
+                        if len(content) > 50
+                        else f"Blockquote: {content}",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
-                        element_type="blockquote"
+                        element_type="blockquote",
                     )
                     blockquote.type = "blockquote"
                     blockquote.text = content
@@ -1074,7 +1138,9 @@ class MarkdownElementExtractor(ElementExtractor):
                 except Exception as e:
                     log_debug(f"Failed to extract blockquote: {e}")
 
-    def _extract_thematic_breaks(
+    def _extract_thematic_breaks(
+        self, root_node: "tree_sitter.Node", horizontal_rules: list[MarkdownElement]
+    ) -> None:
         """Extract thematic breaks (horizontal rules)"""
         for node in self._traverse_nodes(root_node):
             if node.type == "thematic_break":
@@ -1082,20 +1148,22 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     hr = MarkdownElement(
                         name="Horizontal Rule",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
-                        element_type="horizontal_rule"
+                        element_type="horizontal_rule",
                     )
                     hr.type = "horizontal_rule"
                     horizontal_rules.append(hr)
                 except Exception as e:
                     log_debug(f"Failed to extract horizontal rule: {e}")
 
-    def _extract_html_blocks(
+    def _extract_html_blocks(
+        self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]
+    ) -> None:
         """Extract HTML block elements"""
         for node in self._traverse_nodes(root_node):
             if node.type == "html_block":
@@ -1103,28 +1171,31 @@ class MarkdownElementExtractor(ElementExtractor):
                     start_line = node.start_point[0] + 1
                     end_line = node.end_point[0] + 1
                     raw_text = self._get_node_text_optimized(node)
-
+
                     # Extract tag name if possible
                     import re
-
+
+                    tag_match = re.search(r"<(\w+)", raw_text)
                     tag_name = tag_match.group(1) if tag_match else "HTML"
-
+
                     html_element = MarkdownElement(
                         name=f"HTML Block: {tag_name}",
                         start_line=start_line,
                         end_line=end_line,
                         raw_text=raw_text,
-                        element_type="html_block"
+                        element_type="html_block",
                     )
                     html_element.type = "html_block"
                     html_elements.append(html_element)
                 except Exception as e:
                     log_debug(f"Failed to extract HTML block: {e}")
 
-    def _extract_inline_html(
+    def _extract_inline_html(
+        self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]
+    ) -> None:
         """Extract inline HTML elements"""
         import re
-
+
         # Look for HTML tags in inline content
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
@@ -1132,220 +1203,230 @@ class MarkdownElementExtractor(ElementExtractor):
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Pattern for HTML tags (excluding autolinks)
                     # Exclude autolink patterns: <url> or <email>
-                    html_pattern = r
+                    html_pattern = r"<(?!(?:https?://|mailto:|[^@\s]+@[^@\s]+\.[^@\s]+)[^>]*>)[^>]+>"
                     matches = re.finditer(html_pattern, raw_text)
-
+
                     for match in matches:
                         tag_text = match.group(0)
-
+
                         # Extract tag name
-                        tag_match = re.search(r
+                        tag_match = re.search(r"<(\w+)", tag_text)
                         tag_name = tag_match.group(1) if tag_match else "HTML"
-
+
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         html_element = MarkdownElement(
                             name=f"HTML Tag: {tag_name}",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=tag_text,
-                            element_type="html_inline"
+                            element_type="html_inline",
                         )
                         html_element.type = "html_inline"
                         html_element.name = tag_name  # Set name attribute for formatter
                         html_elements.append(html_element)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract inline HTML: {e}")
 
-    def _extract_emphasis_elements(
+    def _extract_emphasis_elements(
+        self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
+    ) -> None:
         """Extract emphasis and strong emphasis elements"""
         import re
-
+
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
                 try:
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Pattern for bold text: **text** or __text__
-                    bold_pattern = r
+                    bold_pattern = r"\*\*([^*]+)\*\*|__([^_]+)__"
                     bold_matches = re.finditer(bold_pattern, raw_text)
-
+
                     for match in bold_matches:
                         content = match.group(1) or match.group(2) or ""
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         bold_element = MarkdownElement(
                             name=f"Bold: {content}",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=match.group(0),
-                            element_type="strong_emphasis"
+                            element_type="strong_emphasis",
                         )
                         bold_element.type = "strong_emphasis"
                         bold_element.text = content
                         formatting_elements.append(bold_element)
-
+
                     # Pattern for italic text: *text* or _text_ (but not **text** or __text__)
-                    italic_pattern = r
+                    italic_pattern = r"(?<!\*)\*([^*]+)\*(?!\*)|(?<!_)_([^_]+)_(?!_)"
                     italic_matches = re.finditer(italic_pattern, raw_text)
-
+
                     for match in italic_matches:
                         content = match.group(1) or match.group(2) or ""
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         italic_element = MarkdownElement(
                             name=f"Italic: {content}",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=match.group(0),
-                            element_type="emphasis"
+                            element_type="emphasis",
                         )
                         italic_element.type = "emphasis"
                         italic_element.text = content
                         formatting_elements.append(italic_element)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract emphasis elements: {e}")
 
-    def _extract_inline_code_spans(
+    def _extract_inline_code_spans(
+        self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
+    ) -> None:
         """Extract inline code spans"""
         import re
-
+
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
                 try:
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Pattern for inline code: `code`
-                    code_pattern = r
+                    code_pattern = r"`([^`]+)`"
                     matches = re.finditer(code_pattern, raw_text)
-
+
                     for match in matches:
                         content = match.group(1) or ""
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         code_element = MarkdownElement(
                             name=f"Inline Code: {content}",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=match.group(0),
-                            element_type="inline_code"
+                            element_type="inline_code",
                         )
                         code_element.type = "inline_code"
                         code_element.text = content
                         formatting_elements.append(code_element)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract inline code: {e}")
 
-    def _extract_strikethrough_elements(
+    def _extract_strikethrough_elements(
+        self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]
+    ) -> None:
         """Extract strikethrough elements"""
         import re
-
+
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
                 try:
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Pattern for strikethrough: ~~text~~
-                    strike_pattern = r
+                    strike_pattern = r"~~([^~]+)~~"
                     matches = re.finditer(strike_pattern, raw_text)
-
+
                     for match in matches:
                         content = match.group(1) or ""
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         strike_element = MarkdownElement(
                             name=f"Strikethrough: {content}",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=match.group(0),
-                            element_type="strikethrough"
+                            element_type="strikethrough",
                         )
                         strike_element.type = "strikethrough"
                         strike_element.text = content
                         formatting_elements.append(strike_element)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract strikethrough: {e}")
 
-    def _extract_footnote_elements(
+    def _extract_footnote_elements(
+        self, root_node: "tree_sitter.Node", footnotes: list[MarkdownElement]
+    ) -> None:
         """Extract footnote elements"""
         import re
-
+
         for node in self._traverse_nodes(root_node):
             if node.type == "inline":
                 try:
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Pattern for footnote references: [^1]
-                    footnote_ref_pattern = r
+                    footnote_ref_pattern = r"\[\^([^\]]+)\]"
                     matches = re.finditer(footnote_ref_pattern, raw_text)
-
+
                     for match in matches:
                         ref_id = match.group(1) or ""
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         footnote_element = MarkdownElement(
                             name=f"Footnote Reference: {ref_id}",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=match.group(0),
-                            element_type="footnote_reference"
+                            element_type="footnote_reference",
                         )
                         footnote_element.type = "footnote_reference"
                         footnote_element.text = ref_id
                         footnotes.append(footnote_element)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract footnote reference: {e}")
-
+
                 # Look for footnote definitions
             elif node.type == "paragraph":
                 try:
                     raw_text = self._get_node_text_optimized(node)
                     if not raw_text:
                         continue
-
+
                     # Pattern for footnote definitions: [^1]: content
-                    footnote_def_pattern = r
-                    match = re.match(
-
+                    footnote_def_pattern = r"^\[\^([^\]]+)\]:\s*(.+)$"
+                    match = re.match(
+                        footnote_def_pattern, raw_text.strip(), re.MULTILINE
+                    )
+
                     if match:
                         ref_id = match.group(1) or ""
                         content = match.group(2) or ""
                         start_line = node.start_point[0] + 1
                         end_line = node.end_point[0] + 1
-
+
                         footnote_element = MarkdownElement(
                             name=f"Footnote Definition: {ref_id}",
                             start_line=start_line,
                             end_line=end_line,
                             raw_text=raw_text,
-                            element_type="footnote_definition"
+                            element_type="footnote_definition",
                         )
                         footnote_element.type = "footnote_definition"
                         footnote_element.text = content
                         footnotes.append(footnote_element)
-
+
                 except Exception as e:
                     log_debug(f"Failed to extract footnote definition: {e}")
 
@@ -1358,33 +1439,33 @@ class MarkdownElementExtractor(ElementExtractor):
     def _parse_link_components(self, raw_text: str) -> tuple[str, str, str]:
         """Parse link components from raw text"""
         import re
-
+
         # Pattern for [text](url "title")
         pattern = r'\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
         match = re.search(pattern, raw_text)
-
+
         if match:
             text = match.group(1) or ""
             url = match.group(2) or ""
             title = match.group(3) or ""
             return text, url, title
-
+
         return "", "", ""
 
     def _parse_image_components(self, raw_text: str) -> tuple[str, str, str]:
         """Parse image components from raw text"""
         import re
-
+
        # Pattern for ![alt](url "title")
         pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
         match = re.search(pattern, raw_text)
-
+
         if match:
             alt_text = match.group(1) or ""
             url = match.group(2) or ""
             title = match.group(3) or ""
             return alt_text, url, title
-
+
         return "", "", ""
 
 
@@ -1396,7 +1477,7 @@ class MarkdownPlugin(LanguagePlugin):
         super().__init__()
         self._language_cache: tree_sitter.Language | None = None
         self._extractor: MarkdownElementExtractor = MarkdownElementExtractor()
-
+
         # Legacy compatibility attributes for tests
         self.language = "markdown"
         self.extractor = self._extractor
@@ -1421,22 +1502,30 @@ class MarkdownPlugin(LanguagePlugin):
         """Get the language name for Markdown (legacy compatibility)"""
         return "markdown"
 
-    def extract_functions(
+    def extract_functions(
+        self, tree: "tree_sitter.Tree", source_code: str
+    ) -> list[CodeElement]:
         """Extract functions from the tree (legacy compatibility)"""
         extractor = self.get_extractor()
         return extractor.extract_functions(tree, source_code)
 
-    def extract_classes(
+    def extract_classes(
+        self, tree: "tree_sitter.Tree", source_code: str
+    ) -> list[CodeElement]:
         """Extract classes from the tree (legacy compatibility)"""
         extractor = self.get_extractor()
         return extractor.extract_classes(tree, source_code)
 
-    def extract_variables(
+    def extract_variables(
+        self, tree: "tree_sitter.Tree", source_code: str
+    ) -> list[CodeElement]:
         """Extract variables from the tree (legacy compatibility)"""
         extractor = self.get_extractor()
         return extractor.extract_variables(tree, source_code)
 
-    def extract_imports(
+    def extract_imports(
+        self, tree: "tree_sitter.Tree", source_code: str
+    ) -> list[CodeElement]:
         """Extract imports from the tree (legacy compatibility)"""
         extractor = self.get_extractor()
         return extractor.extract_imports(tree, source_code)
@@ -1564,7 +1653,7 @@ class MarkdownPlugin(LanguagePlugin):
         references = extractor.extract_references(tree, source_code)
         lists = extractor.extract_lists(tree, source_code)
         tables = extractor.extract_tables(tree, source_code)
-
+
         # Extract new element types
         blockquotes = extractor.extract_blockquotes(tree, source_code)
         horizontal_rules = extractor.extract_horizontal_rules(tree, source_code)
@@ -1611,8 +1700,6 @@ class MarkdownPlugin(LanguagePlugin):
     def execute_query(self, tree: "tree_sitter.Tree", query_name: str) -> dict:
         """Execute a specific query on the tree"""
         try:
-            import tree_sitter
-
             language = self.get_tree_sitter_language()
             if not language:
                 return {"error": "Language not available"}
@@ -1629,7 +1716,11 @@ class MarkdownPlugin(LanguagePlugin):
            captures = TreeSitterQueryCompat.safe_execute_query(
                 language, query_string, tree.root_node, fallback_result=[]
             )
-            return {
+            return {
+                "captures": captures,
+                "query": query_string,
+                "matches": len(captures),
+            }
 
         except Exception as e:
             log_error(f"Query execution failed: {e}")
@@ -1639,7 +1730,7 @@ class MarkdownPlugin(LanguagePlugin):
         """Extract elements from source code using tree-sitter AST"""
         extractor = self.get_extractor()
         elements = []
-
+
         try:
             elements.extend(extractor.extract_headers(tree, source_code))
             elements.extend(extractor.extract_code_blocks(tree, source_code))
@@ -1655,10 +1746,12 @@ class MarkdownPlugin(LanguagePlugin):
             elements.extend(extractor.extract_footnotes(tree, source_code))
         except Exception as e:
             log_error(f"Failed to extract elements: {e}")
-
+
         return elements
 
-    def execute_query_strategy(
+    def execute_query_strategy(
+        self, tree: "tree_sitter.Tree", source_code: str, query_key: str
+    ) -> list[CodeElement]:
         """Execute Markdown-specific query strategy based on query_key"""
         if not tree or not source_code:
             return []
@@ -1674,54 +1767,82 @@ class MarkdownPlugin(LanguagePlugin):
             "function": lambda: self._extractor.extract_headers(tree, source_code),
             "headers": lambda: self._extractor.extract_headers(tree, source_code),
             "heading": lambda: self._extractor.extract_headers(tree, source_code),
-
             # Code block-related queries (mapped to classes)
             "class": lambda: self._extractor.extract_code_blocks(tree, source_code),
-            "code_blocks": lambda: self._extractor.extract_code_blocks(
-                tree, source_code
-            ),
+            "code_blocks": lambda: self._extractor.extract_code_blocks(
+                tree, source_code
+            ),
+            "code_block": lambda: self._extractor.extract_code_blocks(
+                tree, source_code
+            ),
             # Link and image queries (mapped to variables)
-            "variable": lambda: self._extractor.extract_links(tree, source_code) + self._extractor.extract_images(tree, source_code),
+            "variable": lambda: self._extractor.extract_links(tree, source_code)
+            + self._extractor.extract_images(tree, source_code),
             "links": lambda: self._extractor.extract_links(tree, source_code),
             "link": lambda: self._extractor.extract_links(tree, source_code),
             "images": lambda: self._extractor.extract_images(tree, source_code),
             "image": lambda: self._extractor.extract_images(tree, source_code),
-
             # Reference queries (mapped to imports)
             "import": lambda: self._extractor.extract_references(tree, source_code),
             "references": lambda: self._extractor.extract_references(tree, source_code),
             "reference": lambda: self._extractor.extract_references(tree, source_code),
-
             # List and table queries
             "lists": lambda: self._extractor.extract_lists(tree, source_code),
             "list": lambda: self._extractor.extract_lists(tree, source_code),
-            "task_lists": lambda: [lst for lst in self._extractor.extract_lists(tree, source_code) if getattr(lst, "element_type", "") == "task_list"],
+            "task_lists": lambda: [
+                lst
+                for lst in self._extractor.extract_lists(tree, source_code)
+                if getattr(lst, "element_type", "") == "task_list"
+            ],
             "tables": lambda: self._extractor.extract_tables(tree, source_code),
             "table": lambda: self._extractor.extract_tables(tree, source_code),
-
             # Content structure queries
-            "blockquotes": lambda: self._extractor.extract_blockquotes(
-                tree, source_code
-            ),
-            "horizontal_rules": lambda: self._extractor.extract_horizontal_rules(tree, source_code),
-
+            "blockquotes": lambda: self._extractor.extract_blockquotes(
+                tree, source_code
+            ),
+            "blockquote": lambda: self._extractor.extract_blockquotes(
+                tree, source_code
+            ),
+            "horizontal_rules": lambda: self._extractor.extract_horizontal_rules(
+                tree, source_code
+            ),
+            "horizontal_rule": lambda: self._extractor.extract_horizontal_rules(
+                tree, source_code
+            ),
             # HTML and formatting queries
-            "html_blocks": lambda: self._extractor.extract_html_elements(
-                tree, source_code),
+            "html_blocks": lambda: self._extractor.extract_html_elements(
+                tree, source_code
+            ),
+            "html_block": lambda: self._extractor.extract_html_elements(
+                tree, source_code
+            ),
             "html": lambda: self._extractor.extract_html_elements(tree, source_code),
-            "emphasis": lambda: self._extractor.extract_text_formatting(
-                tree, source_code
-            ),
-            "formatting": lambda: self._extractor.extract_text_formatting(tree, source_code),
-            "text_formatting": lambda: self._extractor.extract_text_formatting(tree, source_code),
-
+            "emphasis": lambda: self._extractor.extract_text_formatting(
+                tree, source_code
+            ),
+            "formatting": lambda: self._extractor.extract_text_formatting(
+                tree, source_code
+            ),
+            "text_formatting": lambda: self._extractor.extract_text_formatting(
+                tree, source_code
+            ),
+            "inline_code": lambda: [
+                f
+                for f in self._extractor.extract_text_formatting(tree, source_code)
+                if getattr(f, "element_type", "") == "inline_code"
+            ],
+            "strikethrough": lambda: [
+                f
+                for f in self._extractor.extract_text_formatting(tree, source_code)
+                if getattr(f, "element_type", "") == "strikethrough"
+            ],
             # Footnote queries
             "footnotes": lambda: self._extractor.extract_footnotes(tree, source_code),
             "footnote": lambda: self._extractor.extract_footnotes(tree, source_code),
-
             # Comprehensive queries
             "all_elements": lambda: self.extract_elements(tree, source_code),
-            "text_content": lambda: self._extractor.extract_headers(tree, source_code) + self._extractor.extract_text_formatting(tree, source_code),
+            "text_content": lambda: self._extractor.extract_headers(tree, source_code)
+            + self._extractor.extract_text_formatting(tree, source_code),
         }

         # Execute the appropriate extraction method
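The hunk above grows the strategy table: aliases such as "code_block", "blockquote", and "horizontal_rule" now sit beside their plural forms, and "inline_code"/"strikethrough" filter extract_text_formatting() output by element_type. A standalone sketch of the same dict-of-lambdas dispatch, reusing `plugin`, `tree`, and `source` from the first sketch; run_strategy is a hypothetical helper, not the shipped code:

# Sketch: table dispatch as used by execute_query_strategy() above.
from collections.abc import Callable

def run_strategy(query_key: str, strategies: dict[str, Callable[[], list]]) -> list:
    strategy = strategies.get(query_key)
    return strategy() if strategy is not None else []  # unknown keys -> empty list

extractor = plugin.get_extractor()
strategies = {
    "lists": lambda: extractor.extract_lists(tree, source),
    "task_lists": lambda: [
        lst
        for lst in extractor.extract_lists(tree, source)
        if getattr(lst, "element_type", "") == "task_list"
    ],
}
task_lists = run_strategy("task_lists", strategies)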
@@ -1739,152 +1860,70 @@ class MarkdownPlugin(LanguagePlugin):
         """Get Markdown element categories mapping query_key to node_types"""
         return {
             # Header categories (function-like)
-            "function": [
-                "atx_heading",
-                "setext_heading"
-            ],
-            "headers": [
-                "atx_heading",
-                "setext_heading"
-            ],
-            "heading": [
-                "atx_heading",
-                "setext_heading"
-            ],
-
+            "function": ["atx_heading", "setext_heading"],
+            "headers": ["atx_heading", "setext_heading"],
+            "heading": ["atx_heading", "setext_heading"],
             # Code block categories (class-like)
-            "class": [
-                "fenced_code_block",
-                "indented_code_block"
-            ],
-            "code_blocks": [
-                "fenced_code_block",
-                "indented_code_block"
-            ],
-            "code_block": [
-                "fenced_code_block",
-                "indented_code_block"
-            ],
-
+            "class": ["fenced_code_block", "indented_code_block"],
+            "code_blocks": ["fenced_code_block", "indented_code_block"],
+            "code_block": ["fenced_code_block", "indented_code_block"],
             # Link and image categories (variable-like)
             "variable": [
                 "inline",  # Contains links and images
                 "link",
                 "autolink",
                 "reference_link",
-                "image"
+                "image",
             ],
             "links": [
                 "inline",  # Contains inline links
                 "link",
                 "autolink",
-                "reference_link"
-            ],
-            "link": [
-                "inline",
-                "link",
-                "autolink",
-                "reference_link"
+                "reference_link",
             ],
+            "link": ["inline", "link", "autolink", "reference_link"],
             "images": [
                 "inline",  # Contains inline images
-                "image"
-            ],
-            "image": [
-                "inline",
-                "image"
+                "image",
             ],
-
+            "image": ["inline", "image"],
             # Reference categories (import-like)
-            "import": [
-                "link_reference_definition"
-            ],
-            "references": [
-                "link_reference_definition"
-            ],
-            "reference": [
-                "link_reference_definition"
-            ],
-
+            "import": ["link_reference_definition"],
+            "references": ["link_reference_definition"],
+            "reference": ["link_reference_definition"],
             # List categories
-            "lists": [
-                "list",
-                "list_item"
-            ],
-            "list": [
-                "list",
-                "list_item"
-            ],
-            "task_lists": [
-                "list",
-                "list_item"
-            ],
-
+            "lists": ["list", "list_item"],
+            "list": ["list", "list_item"],
+            "task_lists": ["list", "list_item"],
             # Table categories
-            "tables": [
-                "pipe_table",
-                "table"
-            ],
-            "table": [
-                "pipe_table",
-                "table"
-            ],
-
+            "tables": ["pipe_table", "table"],
+            "table": ["pipe_table", "table"],
             # Content structure categories
-            "blockquotes": [
-                "block_quote"
-            ],
-            "blockquote": [
-                "block_quote"
-            ],
-            "horizontal_rules": [
-                "thematic_break"
-            ],
-            "horizontal_rule": [
-                "thematic_break"
-            ],
-
+            "blockquotes": ["block_quote"],
+            "blockquote": ["block_quote"],
+            "horizontal_rules": ["thematic_break"],
+            "horizontal_rule": ["thematic_break"],
             # HTML categories
             "html_blocks": [
                 "html_block",
-                "inline"  # Contains inline HTML
-            ],
-            "html_block": [
-                "html_block",
-                "inline"
+                "inline",  # Contains inline HTML
             ],
-            "html": [
-                "html_block",
-                "inline"
-            ],
-
+            "html_block": ["html_block", "inline"],
+            "html": ["html_block", "inline"],
             # Text formatting categories
             "emphasis": [
                 "inline"  # Contains emphasis elements
             ],
-            "formatting": [
-                "inline"
-            ],
-            "text_formatting": [
-                "inline"
-            ],
-            "inline_code": [
-                "inline"
-            ],
-            "strikethrough": [
-                "inline"
-            ],
-
+            "formatting": ["inline"],
+            "text_formatting": ["inline"],
+            "inline_code": ["inline"],
+            "strikethrough": ["inline"],
             # Footnote categories
             "footnotes": [
                 "inline",  # Contains footnote references
-                "paragraph"  # Contains footnote definitions
+                "paragraph",  # Contains footnote definitions
            ],
-            "footnote": [
-                "inline",
-                "paragraph"
-            ],
-
+            "footnote": ["inline", "paragraph"],
             # Comprehensive categories
             "all_elements": [
                 "atx_heading",
@@ -1904,12 +1943,7 @@ class MarkdownPlugin(LanguagePlugin):
             "block_quote",
             "thematic_break",
             "html_block",
-            "paragraph"
+            "paragraph",
         ],
-        "text_content": [
-            "atx_heading",
-            "setext_heading",
-            "inline",
-            "paragraph"
-        ]
-    }
+        "text_content": ["atx_heading", "setext_heading", "inline", "paragraph"],
+    }
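Read together, the two hunks above collapse the multi-line node-type lists into one-liners with trailing commas without changing their contents. A sketch of putting this query_key -> node_types mapping to work, reusing `plugin` and `tree` from the first sketch — the accessor name get_element_categories() and the recursive counter are assumptions for illustration:

# Sketch: counting AST nodes whose type falls in a category from the mapping above.
def count_nodes(node, node_types: list[str]) -> int:
    hit = 1 if node.type in node_types else 0
    return hit + sum(count_nodes(child, node_types) for child in node.children)

categories = plugin.get_element_categories()  # accessor name assumed
print(count_nodes(tree.root_node, categories["headers"]))       # atx/setext headings
print(count_nodes(tree.root_node, categories["text_content"]))  # headings, inline, paragraphs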