tree-sitter-analyzer 1.7.2__py3-none-any.whl → 1.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,1569 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Markdown Language Plugin
4
+
5
+ Enhanced Markdown-specific parsing and element extraction functionality.
6
+ Provides comprehensive support for Markdown elements including headers,
7
+ links, code blocks, lists, tables, and other structural elements.
8
+ """
9
+
10
+ from typing import TYPE_CHECKING, Any, Optional
11
+
12
+ if TYPE_CHECKING:
13
+ import tree_sitter
14
+
15
+ try:
16
+ import tree_sitter
17
+
18
+ TREE_SITTER_AVAILABLE = True
19
+ except ImportError:
20
+ TREE_SITTER_AVAILABLE = False
21
+
22
+ from ..core.analysis_engine import AnalysisRequest
23
+ from ..encoding_utils import extract_text_slice, safe_encode
24
+ from ..models import AnalysisResult, CodeElement
25
+ from ..plugins.base import ElementExtractor, LanguagePlugin
26
+ from ..utils import log_debug, log_error, log_warning
27
+
28
+
29
+ class MarkdownElement(CodeElement):
30
+ """Markdown-specific code element"""
31
+
32
+ def __init__(
33
+ self,
34
+ name: str,
35
+ start_line: int,
36
+ end_line: int,
37
+ raw_text: str,
38
+ language: str = "markdown",
39
+ element_type: str = "markdown",
40
+ level: Optional[int] = None,
41
+ url: Optional[str] = None,
42
+ alt_text: Optional[str] = None,
43
+ title: Optional[str] = None,
44
+ language_info: Optional[str] = None,
45
+ is_checked: Optional[bool] = None,
46
+ **kwargs
47
+ ):
48
+ super().__init__(
49
+ name=name,
50
+ start_line=start_line,
51
+ end_line=end_line,
52
+ raw_text=raw_text,
53
+ language=language,
54
+ **kwargs
55
+ )
56
+ self.element_type = element_type
57
+ self.level = level # For headers (1-6)
58
+ self.url = url # For links and images
59
+ self.alt_text = alt_text # For images
60
+ self.title = title # For links and images
61
+ self.language_info = language_info # For code blocks
62
+ self.is_checked = is_checked # For task list items
63
+
64
+
65
+ class MarkdownElementExtractor(ElementExtractor):
66
+ """Markdown-specific element extractor with comprehensive feature support"""
67
+
68
+ def __init__(self) -> None:
69
+ """Initialize the Markdown element extractor."""
70
+ self.current_file: str = ""
71
+ self.source_code: str = ""
72
+ self.content_lines: list[str] = []
73
+
74
+ # Performance optimization caches
75
+ self._node_text_cache: dict[int, str] = {}
76
+ self._processed_nodes: set[int] = set()
77
+ self._element_cache: dict[tuple[int, str], Any] = {}
78
+ self._file_encoding: str | None = None
79
+
80
+ def extract_functions(
81
+ self, tree: "tree_sitter.Tree", source_code: str
82
+ ) -> list[CodeElement]:
83
+ """Extract Markdown elements (headers act as 'functions')"""
84
+ return self.extract_headers(tree, source_code)
85
+
86
+ def extract_classes(
87
+ self, tree: "tree_sitter.Tree", source_code: str
88
+ ) -> list[CodeElement]:
89
+ """Extract Markdown sections (code blocks act as 'classes')"""
90
+ return self.extract_code_blocks(tree, source_code)
91
+
92
+ def extract_variables(
93
+ self, tree: "tree_sitter.Tree", source_code: str
94
+ ) -> list[CodeElement]:
95
+ """Extract Markdown links and images (act as 'variables')"""
96
+ elements = []
97
+ elements.extend(self.extract_links(tree, source_code))
98
+ elements.extend(self.extract_images(tree, source_code))
99
+ return elements
100
+
101
+ def extract_imports(
102
+ self, tree: "tree_sitter.Tree", source_code: str
103
+ ) -> list[CodeElement]:
104
+ """Extract Markdown references and definitions"""
105
+ return self.extract_references(tree, source_code)
106
+
107
+ def extract_headers(
108
+ self, tree: "tree_sitter.Tree", source_code: str
109
+ ) -> list[MarkdownElement]:
110
+ """Extract Markdown headers (H1-H6)"""
111
+ self.source_code = source_code or ""
112
+ self.content_lines = self.source_code.split("\n")
113
+ self._reset_caches()
114
+
115
+ headers: list[MarkdownElement] = []
116
+
117
+ if tree is None or tree.root_node is None:
118
+ log_debug("Tree or root_node is None, returning empty headers list")
119
+ return headers
120
+
121
+ try:
122
+ # Extract ATX headers (# ## ### etc.)
123
+ self._extract_atx_headers(tree.root_node, headers)
124
+ # Extract Setext headers (underlined)
125
+ self._extract_setext_headers(tree.root_node, headers)
126
+ except Exception as e:
127
+ log_debug(f"Error during header extraction: {e}")
128
+ return []
129
+
130
+ log_debug(f"Extracted {len(headers)} Markdown headers")
131
+ return headers
132
+
133
+ def extract_code_blocks(
134
+ self, tree: "tree_sitter.Tree", source_code: str
135
+ ) -> list[MarkdownElement]:
136
+ """Extract Markdown code blocks"""
137
+ self.source_code = source_code or ""
138
+ self.content_lines = self.source_code.split("\n")
139
+ self._reset_caches()
140
+
141
+ code_blocks: list[MarkdownElement] = []
142
+
143
+ if tree is None or tree.root_node is None:
144
+ log_debug("Tree or root_node is None, returning empty code blocks list")
145
+ return code_blocks
146
+
147
+ try:
148
+ self._extract_fenced_code_blocks(tree.root_node, code_blocks)
149
+ self._extract_indented_code_blocks(tree.root_node, code_blocks)
150
+ except Exception as e:
151
+ log_debug(f"Error during code block extraction: {e}")
152
+ return []
153
+
154
+ log_debug(f"Extracted {len(code_blocks)} Markdown code blocks")
155
+ return code_blocks
156
+
157
+ def extract_links(
158
+ self, tree: "tree_sitter.Tree", source_code: str
159
+ ) -> list[MarkdownElement]:
160
+ """Extract Markdown links"""
161
+ self.source_code = source_code or ""
162
+ self.content_lines = self.source_code.split("\n")
163
+ self._reset_caches()
164
+
165
+ links: list[MarkdownElement] = []
166
+
167
+ if tree is None or tree.root_node is None:
168
+ log_debug("Tree or root_node is None, returning empty links list")
169
+ return links
170
+
171
+ try:
172
+ self._extract_inline_links(tree.root_node, links)
173
+ self._extract_reference_links(tree.root_node, links)
174
+ self._extract_autolinks(tree.root_node, links)
175
+ except Exception as e:
176
+ log_debug(f"Error during link extraction: {e}")
177
+ return []
178
+
179
+ log_debug(f"Extracted {len(links)} Markdown links")
180
+ return links
181
+
182
+ def extract_images(
183
+ self, tree: "tree_sitter.Tree", source_code: str
184
+ ) -> list[MarkdownElement]:
185
+ """Extract Markdown images"""
186
+ self.source_code = source_code or ""
187
+ self.content_lines = self.source_code.split("\n")
188
+ self._reset_caches()
189
+
190
+ images: list[MarkdownElement] = []
191
+
192
+ if tree is None or tree.root_node is None:
193
+ log_debug("Tree or root_node is None, returning empty images list")
194
+ return images
195
+
196
+ try:
197
+ self._extract_inline_images(tree.root_node, images)
198
+ self._extract_reference_images(tree.root_node, images)
199
+ except Exception as e:
200
+ log_debug(f"Error during image extraction: {e}")
201
+ return []
202
+
203
+ log_debug(f"Extracted {len(images)} Markdown images")
204
+ return images
205
+
206
+ def extract_references(
207
+ self, tree: "tree_sitter.Tree", source_code: str
208
+ ) -> list[MarkdownElement]:
209
+ """Extract Markdown reference definitions"""
210
+ self.source_code = source_code or ""
211
+ self.content_lines = self.source_code.split("\n")
212
+ self._reset_caches()
213
+
214
+ references: list[MarkdownElement] = []
215
+
216
+ if tree is None or tree.root_node is None:
217
+ log_debug("Tree or root_node is None, returning empty references list")
218
+ return references
219
+
220
+ try:
221
+ self._extract_link_reference_definitions(tree.root_node, references)
222
+ except Exception as e:
223
+ log_debug(f"Error during reference extraction: {e}")
224
+ return []
225
+
226
+ log_debug(f"Extracted {len(references)} Markdown references")
227
+ return references
228
+
229
+ def extract_blockquotes(
230
+ self, tree: "tree_sitter.Tree", source_code: str
231
+ ) -> list[MarkdownElement]:
232
+ """Extract Markdown blockquotes"""
233
+ self.source_code = source_code or ""
234
+ self.content_lines = self.source_code.split("\n")
235
+ self._reset_caches()
236
+
237
+ blockquotes: list[MarkdownElement] = []
238
+
239
+ if tree is None or tree.root_node is None:
240
+ log_debug("Tree or root_node is None, returning empty blockquotes list")
241
+ return blockquotes
242
+
243
+ try:
244
+ self._extract_block_quotes(tree.root_node, blockquotes)
245
+ except Exception as e:
246
+ log_debug(f"Error during blockquote extraction: {e}")
247
+ return []
248
+
249
+ log_debug(f"Extracted {len(blockquotes)} Markdown blockquotes")
250
+ return blockquotes
251
+
252
+ def extract_horizontal_rules(
253
+ self, tree: "tree_sitter.Tree", source_code: str
254
+ ) -> list[MarkdownElement]:
255
+ """Extract Markdown horizontal rules"""
256
+ self.source_code = source_code or ""
257
+ self.content_lines = self.source_code.split("\n")
258
+ self._reset_caches()
259
+
260
+ horizontal_rules: list[MarkdownElement] = []
261
+
262
+ if tree is None or tree.root_node is None:
263
+ log_debug("Tree or root_node is None, returning empty horizontal rules list")
264
+ return horizontal_rules
265
+
266
+ try:
267
+ self._extract_thematic_breaks(tree.root_node, horizontal_rules)
268
+ except Exception as e:
269
+ log_debug(f"Error during horizontal rule extraction: {e}")
270
+ return []
271
+
272
+ log_debug(f"Extracted {len(horizontal_rules)} Markdown horizontal rules")
273
+ return horizontal_rules
274
+
275
+ def extract_html_elements(
276
+ self, tree: "tree_sitter.Tree", source_code: str
277
+ ) -> list[MarkdownElement]:
278
+ """Extract HTML elements"""
279
+ self.source_code = source_code or ""
280
+ self.content_lines = self.source_code.split("\n")
281
+ self._reset_caches()
282
+
283
+ html_elements: list[MarkdownElement] = []
284
+
285
+ if tree is None or tree.root_node is None:
286
+ log_debug("Tree or root_node is None, returning empty HTML elements list")
287
+ return html_elements
288
+
289
+ try:
290
+ self._extract_html_blocks(tree.root_node, html_elements)
291
+ self._extract_inline_html(tree.root_node, html_elements)
292
+ except Exception as e:
293
+ log_debug(f"Error during HTML element extraction: {e}")
294
+ return []
295
+
296
+ log_debug(f"Extracted {len(html_elements)} HTML elements")
297
+ return html_elements
298
+
299
+ def extract_text_formatting(
300
+ self, tree: "tree_sitter.Tree", source_code: str
301
+ ) -> list[MarkdownElement]:
302
+ """Extract text formatting elements (bold, italic, strikethrough, inline code)"""
303
+ self.source_code = source_code or ""
304
+ self.content_lines = self.source_code.split("\n")
305
+ self._reset_caches()
306
+
307
+ formatting_elements: list[MarkdownElement] = []
308
+
309
+ if tree is None or tree.root_node is None:
310
+ log_debug("Tree or root_node is None, returning empty formatting elements list")
311
+ return formatting_elements
312
+
313
+ try:
314
+ self._extract_emphasis_elements(tree.root_node, formatting_elements)
315
+ self._extract_inline_code_spans(tree.root_node, formatting_elements)
316
+ self._extract_strikethrough_elements(tree.root_node, formatting_elements)
317
+ except Exception as e:
318
+ log_debug(f"Error during text formatting extraction: {e}")
319
+ return []
320
+
321
+ log_debug(f"Extracted {len(formatting_elements)} text formatting elements")
322
+ return formatting_elements
323
+
324
+ def extract_footnotes(
325
+ self, tree: "tree_sitter.Tree", source_code: str
326
+ ) -> list[MarkdownElement]:
327
+ """Extract footnotes"""
328
+ self.source_code = source_code or ""
329
+ self.content_lines = self.source_code.split("\n")
330
+ self._reset_caches()
331
+
332
+ footnotes: list[MarkdownElement] = []
333
+
334
+ if tree is None or tree.root_node is None:
335
+ log_debug("Tree or root_node is None, returning empty footnotes list")
336
+ return footnotes
337
+
338
+ try:
339
+ self._extract_footnote_elements(tree.root_node, footnotes)
340
+ except Exception as e:
341
+ log_debug(f"Error during footnote extraction: {e}")
342
+ return []
343
+
344
+ log_debug(f"Extracted {len(footnotes)} footnotes")
345
+ return footnotes
346
+
347
+ def extract_lists(
348
+ self, tree: "tree_sitter.Tree", source_code: str
349
+ ) -> list[MarkdownElement]:
350
+ """Extract Markdown lists"""
351
+ self.source_code = source_code or ""
352
+ self.content_lines = self.source_code.split("\n")
353
+ self._reset_caches()
354
+
355
+ lists: list[MarkdownElement] = []
356
+
357
+ if tree is None or tree.root_node is None:
358
+ log_debug("Tree or root_node is None, returning empty lists list")
359
+ return lists
360
+
361
+ try:
362
+ self._extract_list_items(tree.root_node, lists)
363
+ except Exception as e:
364
+ log_debug(f"Error during list extraction: {e}")
365
+ return []
366
+
367
+ log_debug(f"Extracted {len(lists)} Markdown list items")
368
+ return lists
369
+
370
+ def extract_tables(
371
+ self, tree: "tree_sitter.Tree", source_code: str
372
+ ) -> list[MarkdownElement]:
373
+ """Extract Markdown tables"""
374
+ self.source_code = source_code or ""
375
+ self.content_lines = self.source_code.split("\n")
376
+ self._reset_caches()
377
+
378
+ tables: list[MarkdownElement] = []
379
+
380
+ if tree is None or tree.root_node is None:
381
+ log_debug("Tree or root_node is None, returning empty tables list")
382
+ return tables
383
+
384
+ try:
385
+ self._extract_pipe_tables(tree.root_node, tables)
386
+ except Exception as e:
387
+ log_debug(f"Error during table extraction: {e}")
388
+ return []
389
+
390
+ log_debug(f"Extracted {len(tables)} Markdown tables")
391
+ return tables
392
+
393
+ def _reset_caches(self) -> None:
394
+ """Reset performance caches"""
395
+ self._node_text_cache.clear()
396
+ self._processed_nodes.clear()
397
+ self._element_cache.clear()
398
+
399
+ def _get_node_text_optimized(self, node: "tree_sitter.Node") -> str:
400
+ """Get node text with optimized caching"""
401
+ node_id = id(node)
402
+
403
+ if node_id in self._node_text_cache:
404
+ return self._node_text_cache[node_id]
405
+
406
+ try:
407
+ start_byte = node.start_byte
408
+ end_byte = node.end_byte
409
+
410
+ encoding = self._file_encoding or "utf-8"
411
+ content_bytes = safe_encode("\n".join(self.content_lines), encoding)
412
+ text = extract_text_slice(content_bytes, start_byte, end_byte, encoding)
413
+
414
+ if text:
415
+ self._node_text_cache[node_id] = text
416
+ return text
417
+ except Exception as e:
418
+ log_error(f"Error in _get_node_text_optimized: {e}")
419
+
420
+ # Fallback to simple text extraction
421
+ try:
422
+ start_point = node.start_point
423
+ end_point = node.end_point
424
+
425
+ if (start_point[0] < 0 or start_point[0] >= len(self.content_lines)):
426
+ return ""
427
+
428
+ if (end_point[0] < 0 or end_point[0] >= len(self.content_lines)):
429
+ return ""
430
+
431
+ if start_point[0] == end_point[0]:
432
+ line = self.content_lines[start_point[0]]
433
+ start_col = max(0, min(start_point[1], len(line)))
434
+ end_col = max(start_col, min(end_point[1], len(line)))
435
+ result = line[start_col:end_col]
436
+ self._node_text_cache[node_id] = result
437
+ return result
438
+ else:
439
+ lines = []
440
+ for i in range(start_point[0], min(end_point[0] + 1, len(self.content_lines))):
441
+ if i < len(self.content_lines):
442
+ line = self.content_lines[i]
443
+ if i == start_point[0] and i == end_point[0]:
444
+ # Single line case
445
+ start_col = max(0, min(start_point[1], len(line)))
446
+ end_col = max(start_col, min(end_point[1], len(line)))
447
+ lines.append(line[start_col:end_col])
448
+ elif i == start_point[0]:
449
+ start_col = max(0, min(start_point[1], len(line)))
450
+ lines.append(line[start_col:])
451
+ elif i == end_point[0]:
452
+ end_col = max(0, min(end_point[1], len(line)))
453
+ lines.append(line[:end_col])
454
+ else:
455
+ lines.append(line)
456
+ result = "\n".join(lines)
457
+ self._node_text_cache[node_id] = result
458
+ return result
459
+ except Exception as fallback_error:
460
+ log_error(f"Fallback text extraction also failed: {fallback_error}")
461
+ return ""
462
+
463
+ def _extract_atx_headers(self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]) -> None:
464
+ """Extract ATX-style headers (# ## ### etc.)"""
465
+ for node in self._traverse_nodes(root_node):
466
+ if node.type == "atx_heading":
467
+ try:
468
+ start_line = node.start_point[0] + 1
469
+ end_line = node.end_point[0] + 1
470
+ raw_text = self._get_node_text_optimized(node)
471
+
472
+ # Extract header level and content
473
+ level = 1
474
+ content = raw_text.strip()
475
+
476
+ # Count # symbols to determine level
477
+ if content.startswith("#"):
478
+ level = len(content) - len(content.lstrip("#"))
479
+ content = content.lstrip("# ").rstrip()
480
+
481
+ header = MarkdownElement(
482
+ name=content or f"Header Level {level}",
483
+ start_line=start_line,
484
+ end_line=end_line,
485
+ raw_text=raw_text,
486
+ element_type="heading",
487
+ level=level
488
+ )
489
+ # Add additional attributes for formatter
490
+ header.text = content or f"Header Level {level}"
491
+ header.type = "heading"
492
+ headers.append(header)
493
+ except Exception as e:
494
+ log_debug(f"Failed to extract ATX header: {e}")
495
+
496
+ def _extract_setext_headers(self, root_node: "tree_sitter.Node", headers: list[MarkdownElement]) -> None:
497
+ """Extract Setext-style headers (underlined)"""
498
+ for node in self._traverse_nodes(root_node):
499
+ if node.type == "setext_heading":
500
+ try:
501
+ start_line = node.start_point[0] + 1
502
+ end_line = node.end_point[0] + 1
503
+ raw_text = self._get_node_text_optimized(node)
504
+
505
+ # Determine level based on underline character
506
+ level = 2 # Default to H2
507
+ lines = raw_text.strip().split("\n")
508
+ if len(lines) >= 2:
509
+ underline = lines[1].strip()
510
+ if underline.startswith("="):
511
+ level = 1 # H1
512
+ elif underline.startswith("-"):
513
+ level = 2 # H2
514
+ content = lines[0].strip()
515
+ else:
516
+ content = raw_text.strip()
517
+
518
+ header = MarkdownElement(
519
+ name=content or f"Header Level {level}",
520
+ start_line=start_line,
521
+ end_line=end_line,
522
+ raw_text=raw_text,
523
+ element_type="heading",
524
+ level=level
525
+ )
526
+ # Add additional attributes for formatter
527
+ header.text = content or f"Header Level {level}"
528
+ header.type = "heading"
529
+ headers.append(header)
530
+ except Exception as e:
531
+ log_debug(f"Failed to extract Setext header: {e}")
532
+
533
+ def _extract_fenced_code_blocks(self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]) -> None:
534
+ """Extract fenced code blocks"""
535
+ for node in self._traverse_nodes(root_node):
536
+ if node.type == "fenced_code_block":
537
+ try:
538
+ start_line = node.start_point[0] + 1
539
+ end_line = node.end_point[0] + 1
540
+ raw_text = self._get_node_text_optimized(node)
541
+
542
+ # Extract language info
543
+ language_info = None
544
+ lines = raw_text.strip().split("\n")
545
+ if lines and lines[0].startswith("```"):
546
+ language_info = lines[0][3:].strip()
547
+
548
+ # Extract content (excluding fence markers)
549
+ content_lines = []
550
+ in_content = False
551
+ for line in lines:
552
+ if line.startswith("```"):
553
+ if not in_content:
554
+ in_content = True
555
+ continue
556
+ else:
557
+ break
558
+ if in_content:
559
+ content_lines.append(line)
560
+
561
+ content = "\n".join(content_lines)
562
+ name = f"Code Block ({language_info or 'unknown'})"
563
+
564
+ code_block = MarkdownElement(
565
+ name=name,
566
+ start_line=start_line,
567
+ end_line=end_line,
568
+ raw_text=raw_text,
569
+ element_type="code_block",
570
+ language_info=language_info
571
+ )
572
+ # Add additional attributes for formatter
573
+ code_block.language = language_info or "text"
574
+ code_block.line_count = len(content_lines)
575
+ code_block.type = "code_block"
576
+ code_blocks.append(code_block)
577
+ except Exception as e:
578
+ log_debug(f"Failed to extract fenced code block: {e}")
579
+
580
+ def _extract_indented_code_blocks(self, root_node: "tree_sitter.Node", code_blocks: list[MarkdownElement]) -> None:
581
+ """Extract indented code blocks"""
582
+ for node in self._traverse_nodes(root_node):
583
+ if node.type == "indented_code_block":
584
+ try:
585
+ start_line = node.start_point[0] + 1
586
+ end_line = node.end_point[0] + 1
587
+ raw_text = self._get_node_text_optimized(node)
588
+
589
+ code_block = MarkdownElement(
590
+ name="Indented Code Block",
591
+ start_line=start_line,
592
+ end_line=end_line,
593
+ raw_text=raw_text,
594
+ element_type="code_block",
595
+ language_info="indented"
596
+ )
597
+ # Add additional attributes for formatter
598
+ code_block.language = "text"
599
+ code_block.line_count = end_line - start_line + 1
600
+ code_block.type = "code_block"
601
+ code_blocks.append(code_block)
602
+ except Exception as e:
603
+ log_debug(f"Failed to extract indented code block: {e}")
604
+
605
+ def _extract_inline_links(self, root_node: "tree_sitter.Node", links: list[MarkdownElement]) -> None:
606
+ """Extract inline links"""
607
+ import re
608
+
609
+ # Links are extracted from the text of inline nodes using regular expressions
610
+ for node in self._traverse_nodes(root_node):
611
+ if node.type == "inline":
612
+ try:
613
+ raw_text = self._get_node_text_optimized(node)
614
+ if not raw_text:
615
+ continue
616
+
617
+ # Inline link pattern: [text](url "title") (images excluded)
618
+ inline_pattern = r'(?<!\!)\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
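+ # e.g. [docs](https://example.com "Docs") -> text="docs", url="https://example.com", title="Docs"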
619
+ matches = re.finditer(inline_pattern, raw_text)
620
+
621
+ for match in matches:
622
+ text = match.group(1) or ""
623
+ url = match.group(2) or ""
624
+ title = match.group(3) or ""
625
+
626
+ # Compute line numbers from the matched position
627
+ start_line = node.start_point[0] + 1
628
+ end_line = node.end_point[0] + 1
629
+
630
+ link = MarkdownElement(
631
+ name=text or "Link",
632
+ start_line=start_line,
633
+ end_line=end_line,
634
+ raw_text=match.group(0),
635
+ element_type="link",
636
+ url=url,
637
+ title=title
638
+ )
639
+ # Add additional attributes for formatter
640
+ link.text = text or "Link"
641
+ link.type = "link"
642
+ links.append(link)
643
+
644
+ except Exception as e:
645
+ log_debug(f"Failed to extract inline link: {e}")
646
+
647
+ def _extract_reference_links(self, root_node: "tree_sitter.Node", links: list[MarkdownElement]) -> None:
648
+ """Extract reference links"""
649
+ import re
650
+
651
+ # Reference links are also extracted from inline nodes
652
+ for node in self._traverse_nodes(root_node):
653
+ if node.type == "inline":
654
+ try:
655
+ raw_text = self._get_node_text_optimized(node)
656
+ if not raw_text:
657
+ continue
658
+
659
+ # Reference link pattern: [text][ref]
660
+ ref_pattern = r'\[([^\]]*)\]\[([^\]]*)\]'
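+ # e.g. [docs][1] -> text="docs", ref="1"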
661
+ matches = re.finditer(ref_pattern, raw_text)
662
+
663
+ for match in matches:
664
+ text = match.group(1) or ""
665
+ ref = match.group(2) or ""
666
+
667
+ # Skip image references (those starting with !)
668
+ if match.start() > 0 and raw_text[match.start()-1] == '!':
669
+ continue
670
+
671
+ start_line = node.start_point[0] + 1
672
+ end_line = node.end_point[0] + 1
673
+
674
+ link = MarkdownElement(
675
+ name=text or "Reference Link",
676
+ start_line=start_line,
677
+ end_line=end_line,
678
+ raw_text=match.group(0),
679
+ element_type="reference_link"
680
+ )
681
+ # Add additional attributes for formatter
682
+ link.text = text or "Reference Link"
683
+ link.type = "reference_link"
684
+ links.append(link)
685
+
686
+ except Exception as e:
687
+ log_debug(f"Failed to extract reference link: {e}")
688
+
689
+ def _extract_autolinks(self, root_node: "tree_sitter.Node", links: list[MarkdownElement]) -> None:
690
+ """Extract autolinks"""
691
+ import re
692
+
693
+ # Autolinks are extracted from the text of inline nodes using regular expressions
694
+ for node in self._traverse_nodes(root_node):
695
+ if node.type == "inline":
696
+ try:
697
+ raw_text = self._get_node_text_optimized(node)
698
+ if not raw_text:
699
+ continue
700
+
701
+ # Autolink pattern: <url> or <email>
702
+ autolink_pattern = r'<(https?://[^>]+|mailto:[^>]+|[^@\s]+@[^@\s]+\.[^@\s]+)>'
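+ # e.g. <https://example.com> or <user@example.com>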
703
+ matches = re.finditer(autolink_pattern, raw_text)
704
+
705
+ for match in matches:
706
+ url = match.group(1) or ""
707
+
708
+ # Compute line numbers from the matched position
709
+ start_line = node.start_point[0] + 1
710
+ end_line = node.end_point[0] + 1
711
+
712
+ link = MarkdownElement(
713
+ name=url or "Autolink",
714
+ start_line=start_line,
715
+ end_line=end_line,
716
+ raw_text=match.group(0),
717
+ element_type="autolink",
718
+ url=url
719
+ )
720
+ # Add additional attributes for formatter
721
+ link.text = url or "Autolink"
722
+ link.type = "autolink"
723
+ links.append(link)
724
+
725
+ except Exception as e:
726
+ log_debug(f"Failed to extract autolink: {e}")
727
+
728
+ def _extract_inline_images(self, root_node: "tree_sitter.Node", images: list[MarkdownElement]) -> None:
729
+ """Extract inline images"""
730
+ import re
731
+
732
+ # Images are extracted from the text of inline nodes using regular expressions
733
+ for node in self._traverse_nodes(root_node):
734
+ if node.type == "inline":
735
+ try:
736
+ raw_text = self._get_node_text_optimized(node)
737
+ if not raw_text:
738
+ continue
739
+
740
+ # Inline image pattern: ![alt](url "title")
741
+ image_pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
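+ # e.g. ![logo](logo.png "Logo") -> alt="logo", url="logo.png", title="Logo"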
742
+ matches = re.finditer(image_pattern, raw_text)
743
+
744
+ for match in matches:
745
+ alt_text = match.group(1) or ""
746
+ url = match.group(2) or ""
747
+ title = match.group(3) or ""
748
+
749
+ # Compute line numbers from the matched position
750
+ start_line = node.start_point[0] + 1
751
+ end_line = node.end_point[0] + 1
752
+
753
+ image = MarkdownElement(
754
+ name=alt_text or "Image",
755
+ start_line=start_line,
756
+ end_line=end_line,
757
+ raw_text=match.group(0),
758
+ element_type="image",
759
+ url=url,
760
+ alt_text=alt_text,
761
+ title=title
762
+ )
763
+ # Add additional attributes for formatter
764
+ image.alt = alt_text or ""
765
+ image.type = "image"
766
+ images.append(image)
767
+
768
+ except Exception as e:
769
+ log_debug(f"Failed to extract inline image: {e}")
770
+
771
+ def _extract_reference_images(self, root_node: "tree_sitter.Node", images: list[MarkdownElement]) -> None:
772
+ """Extract reference images"""
773
+ import re
774
+
775
+ # Reference images are also extracted from inline nodes
776
+ for node in self._traverse_nodes(root_node):
777
+ if node.type == "inline":
778
+ try:
779
+ raw_text = self._get_node_text_optimized(node)
780
+ if not raw_text:
781
+ continue
782
+
783
+ # Reference image pattern: ![alt][ref]
784
+ ref_image_pattern = r'!\[([^\]]*)\]\[([^\]]*)\]'
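+ # e.g. ![logo][img1] -> alt="logo", ref="img1"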
785
+ matches = re.finditer(ref_image_pattern, raw_text)
786
+
787
+ for match in matches:
788
+ alt_text = match.group(1) or ""
789
+ ref = match.group(2) or ""
790
+
791
+ start_line = node.start_point[0] + 1
792
+ end_line = node.end_point[0] + 1
793
+
794
+ image = MarkdownElement(
795
+ name=alt_text or "Reference Image",
796
+ start_line=start_line,
797
+ end_line=end_line,
798
+ raw_text=match.group(0),
799
+ element_type="reference_image"
800
+ )
801
+ # Add additional attributes for formatter
802
+ image.alt = alt_text or ""
803
+ image.type = "reference_image"
804
+ images.append(image)
805
+
806
+ except Exception as e:
807
+ log_debug(f"Failed to extract reference image: {e}")
808
+
809
+ def _extract_link_reference_definitions(self, root_node: "tree_sitter.Node", references: list[MarkdownElement]) -> None:
810
+ """Extract link reference definitions"""
811
+ for node in self._traverse_nodes(root_node):
812
+ if node.type == "link_reference_definition":
813
+ try:
814
+ start_line = node.start_point[0] + 1
815
+ end_line = node.end_point[0] + 1
816
+ raw_text = self._get_node_text_optimized(node)
817
+
818
+ reference = MarkdownElement(
819
+ name=raw_text or "Reference Definition",
820
+ start_line=start_line,
821
+ end_line=end_line,
822
+ raw_text=raw_text,
823
+ element_type="reference_definition"
824
+ )
825
+ references.append(reference)
826
+ except Exception as e:
827
+ log_debug(f"Failed to extract reference definition: {e}")
828
+
829
+ def _extract_list_items(self, root_node: "tree_sitter.Node", lists: list[MarkdownElement]) -> None:
830
+ """Extract lists (not individual items)"""
831
+ for node in self._traverse_nodes(root_node):
832
+ if node.type == "list":
833
+ try:
834
+ start_line = node.start_point[0] + 1
835
+ end_line = node.end_point[0] + 1
836
+ raw_text = self._get_node_text_optimized(node)
837
+
838
+ # Count list items in this list
839
+ item_count = 0
840
+ is_task_list = False
841
+ is_ordered = False
842
+
843
+ for child in node.children:
844
+ if child.type == "list_item":
845
+ item_count += 1
846
+ item_text = self._get_node_text_optimized(child)
847
+
848
+ # Check if it's a task list item
849
+ if "[ ]" in item_text or "[x]" in item_text or "[X]" in item_text:
850
+ is_task_list = True
851
+
852
+ # Check if it's an ordered list (starts with number)
853
+ if item_text.strip() and item_text.strip()[0].isdigit():
854
+ is_ordered = True
855
+
856
+ # Determine list type
857
+ if is_task_list:
858
+ list_type = "task"
859
+ element_type = "task_list"
860
+ elif is_ordered:
861
+ list_type = "ordered"
862
+ element_type = "list"
863
+ else:
864
+ list_type = "unordered"
865
+ element_type = "list"
866
+
867
+ name = f"{list_type.title()} List ({item_count} items)"
868
+
869
+ list_element = MarkdownElement(
870
+ name=name,
871
+ start_line=start_line,
872
+ end_line=end_line,
873
+ raw_text=raw_text,
874
+ element_type=element_type
875
+ )
876
+ # Add additional attributes for formatter
877
+ list_element.list_type = list_type
878
+ list_element.item_count = item_count
879
+ list_element.type = list_type
880
+ lists.append(list_element)
881
+ except Exception as e:
882
+ log_debug(f"Failed to extract list: {e}")
883
+
884
+ def _extract_pipe_tables(self, root_node: "tree_sitter.Node", tables: list[MarkdownElement]) -> None:
885
+ """Extract pipe tables"""
886
+ for node in self._traverse_nodes(root_node):
887
+ if node.type == "pipe_table":
888
+ try:
889
+ start_line = node.start_point[0] + 1
890
+ end_line = node.end_point[0] + 1
891
+ raw_text = self._get_node_text_optimized(node)
892
+
893
+ # Count rows and columns
894
+ lines = raw_text.strip().split("\n")
895
+ row_count = len([line for line in lines if line.strip() and set(line.strip()) - set("|:- ")])  # skip separator rows like "| --- | --- |"
896
+
897
+ # Count columns from first row
898
+ column_count = 0
899
+ if lines:
900
+ first_row = lines[0]
901
+ column_count = len([col for col in first_row.split("|") if col.strip()])
902
+
903
+ table = MarkdownElement(
904
+ name=f"Table ({row_count} rows, {column_count} columns)",
905
+ start_line=start_line,
906
+ end_line=end_line,
907
+ raw_text=raw_text,
908
+ element_type="table"
909
+ )
910
+ # Add additional attributes for formatter
911
+ table.row_count = row_count
912
+ table.column_count = column_count
913
+ table.type = "table"
914
+ tables.append(table)
915
+ except Exception as e:
916
+ log_debug(f"Failed to extract pipe table: {e}")
917
+
918
+ def _extract_block_quotes(self, root_node: "tree_sitter.Node", blockquotes: list[MarkdownElement]) -> None:
919
+ """Extract blockquotes"""
920
+ import re
921
+
922
+ # Blockquotes are often represented as paragraphs starting with >
923
+ for node in self._traverse_nodes(root_node):
924
+ if node.type == "block_quote":
925
+ try:
926
+ start_line = node.start_point[0] + 1
927
+ end_line = node.end_point[0] + 1
928
+ raw_text = self._get_node_text_optimized(node)
929
+
930
+ # Extract content without > markers
931
+ lines = raw_text.strip().split("\n")
932
+ content_lines = []
933
+ for line in lines:
934
+ # Remove > marker and optional space
935
+ cleaned = re.sub(r'^>\s?', '', line)
936
+ content_lines.append(cleaned)
937
+ content = "\n".join(content_lines).strip()
938
+
939
+ blockquote = MarkdownElement(
940
+ name=f"Blockquote: {content[:50]}..." if len(content) > 50 else f"Blockquote: {content}",
941
+ start_line=start_line,
942
+ end_line=end_line,
943
+ raw_text=raw_text,
944
+ element_type="blockquote"
945
+ )
946
+ blockquote.type = "blockquote"
947
+ blockquote.text = content
948
+ blockquotes.append(blockquote)
949
+ except Exception as e:
950
+ log_debug(f"Failed to extract blockquote: {e}")
951
+
952
+ def _extract_thematic_breaks(self, root_node: "tree_sitter.Node", horizontal_rules: list[MarkdownElement]) -> None:
953
+ """Extract thematic breaks (horizontal rules)"""
954
+ for node in self._traverse_nodes(root_node):
955
+ if node.type == "thematic_break":
956
+ try:
957
+ start_line = node.start_point[0] + 1
958
+ end_line = node.end_point[0] + 1
959
+ raw_text = self._get_node_text_optimized(node)
960
+
961
+ hr = MarkdownElement(
962
+ name="Horizontal Rule",
963
+ start_line=start_line,
964
+ end_line=end_line,
965
+ raw_text=raw_text,
966
+ element_type="horizontal_rule"
967
+ )
968
+ hr.type = "horizontal_rule"
969
+ horizontal_rules.append(hr)
970
+ except Exception as e:
971
+ log_debug(f"Failed to extract horizontal rule: {e}")
972
+
973
+ def _extract_html_blocks(self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]) -> None:
974
+ """Extract HTML block elements"""
975
+ for node in self._traverse_nodes(root_node):
976
+ if node.type == "html_block":
977
+ try:
978
+ start_line = node.start_point[0] + 1
979
+ end_line = node.end_point[0] + 1
980
+ raw_text = self._get_node_text_optimized(node)
981
+
982
+ # Extract tag name if possible
983
+ import re
984
+ tag_match = re.search(r'<(\w+)', raw_text)
985
+ tag_name = tag_match.group(1) if tag_match else "HTML"
986
+
987
+ html_element = MarkdownElement(
988
+ name=f"HTML Block: {tag_name}",
989
+ start_line=start_line,
990
+ end_line=end_line,
991
+ raw_text=raw_text,
992
+ element_type="html_block"
993
+ )
994
+ html_element.type = "html_block"
995
+ html_elements.append(html_element)
996
+ except Exception as e:
997
+ log_debug(f"Failed to extract HTML block: {e}")
998
+
999
+ def _extract_inline_html(self, root_node: "tree_sitter.Node", html_elements: list[MarkdownElement]) -> None:
1000
+ """Extract inline HTML elements"""
1001
+ import re
1002
+
1003
+ # Look for HTML tags in inline content
1004
+ for node in self._traverse_nodes(root_node):
1005
+ if node.type == "inline":
1006
+ try:
1007
+ raw_text = self._get_node_text_optimized(node)
1008
+ if not raw_text:
1009
+ continue
1010
+
1011
+ # Pattern for HTML tags
1012
+ html_pattern = r'<[^>]+>'
1013
+ matches = re.finditer(html_pattern, raw_text)
1014
+
1015
+ for match in matches:
1016
+ tag_text = match.group(0)
1017
+
1018
+ # Extract tag name
1019
+ tag_match = re.search(r'<(\w+)', tag_text)
1020
+ tag_name = tag_match.group(1) if tag_match else "HTML"
1021
+
1022
+ start_line = node.start_point[0] + 1
1023
+ end_line = node.end_point[0] + 1
1024
+
1025
+ html_element = MarkdownElement(
1026
+ name=f"HTML Tag: {tag_name}",
1027
+ start_line=start_line,
1028
+ end_line=end_line,
1029
+ raw_text=tag_text,
1030
+ element_type="html_inline"
1031
+ )
1032
+ html_element.type = "html_inline"
1033
+ html_elements.append(html_element)
1034
+
1035
+ except Exception as e:
1036
+ log_debug(f"Failed to extract inline HTML: {e}")
1037
+
1038
+ def _extract_emphasis_elements(self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]) -> None:
1039
+ """Extract emphasis and strong emphasis elements"""
1040
+ import re
1041
+
1042
+ for node in self._traverse_nodes(root_node):
1043
+ if node.type == "inline":
1044
+ try:
1045
+ raw_text = self._get_node_text_optimized(node)
1046
+ if not raw_text:
1047
+ continue
1048
+
1049
+ # Pattern for bold text: **text** or __text__
1050
+ bold_pattern = r'\*\*([^*]+)\*\*|__([^_]+)__'
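+ # e.g. **bold** and __bold__ both capture "bold"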
1051
+ bold_matches = re.finditer(bold_pattern, raw_text)
1052
+
1053
+ for match in bold_matches:
1054
+ content = match.group(1) or match.group(2) or ""
1055
+ start_line = node.start_point[0] + 1
1056
+ end_line = node.end_point[0] + 1
1057
+
1058
+ bold_element = MarkdownElement(
1059
+ name=f"Bold: {content}",
1060
+ start_line=start_line,
1061
+ end_line=end_line,
1062
+ raw_text=match.group(0),
1063
+ element_type="strong_emphasis"
1064
+ )
1065
+ bold_element.type = "strong_emphasis"
1066
+ bold_element.text = content
1067
+ formatting_elements.append(bold_element)
1068
+
1069
+ # Pattern for italic text: *text* or _text_ (but not **text** or __text__)
1070
+ italic_pattern = r'(?<!\*)\*([^*]+)\*(?!\*)|(?<!_)_([^_]+)_(?!_)'
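+ # e.g. *word* and _word_ match; **word** and __word__ do not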
1071
+ italic_matches = re.finditer(italic_pattern, raw_text)
1072
+
1073
+ for match in italic_matches:
1074
+ content = match.group(1) or match.group(2) or ""
1075
+ start_line = node.start_point[0] + 1
1076
+ end_line = node.end_point[0] + 1
1077
+
1078
+ italic_element = MarkdownElement(
1079
+ name=f"Italic: {content}",
1080
+ start_line=start_line,
1081
+ end_line=end_line,
1082
+ raw_text=match.group(0),
1083
+ element_type="emphasis"
1084
+ )
1085
+ italic_element.type = "emphasis"
1086
+ italic_element.text = content
1087
+ formatting_elements.append(italic_element)
1088
+
1089
+ except Exception as e:
1090
+ log_debug(f"Failed to extract emphasis elements: {e}")
1091
+
1092
+ def _extract_inline_code_spans(self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]) -> None:
1093
+ """Extract inline code spans"""
1094
+ import re
1095
+
1096
+ for node in self._traverse_nodes(root_node):
1097
+ if node.type == "inline":
1098
+ try:
1099
+ raw_text = self._get_node_text_optimized(node)
1100
+ if not raw_text:
1101
+ continue
1102
+
1103
+ # Pattern for inline code: `code`
1104
+ code_pattern = r'`([^`]+)`'
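+ # e.g. `x = 1` -> content="x = 1"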
1105
+ matches = re.finditer(code_pattern, raw_text)
1106
+
1107
+ for match in matches:
1108
+ content = match.group(1) or ""
1109
+ start_line = node.start_point[0] + 1
1110
+ end_line = node.end_point[0] + 1
1111
+
1112
+ code_element = MarkdownElement(
1113
+ name=f"Inline Code: {content}",
1114
+ start_line=start_line,
1115
+ end_line=end_line,
1116
+ raw_text=match.group(0),
1117
+ element_type="inline_code"
1118
+ )
1119
+ code_element.type = "inline_code"
1120
+ code_element.text = content
1121
+ formatting_elements.append(code_element)
1122
+
1123
+ except Exception as e:
1124
+ log_debug(f"Failed to extract inline code: {e}")
1125
+
1126
+ def _extract_strikethrough_elements(self, root_node: "tree_sitter.Node", formatting_elements: list[MarkdownElement]) -> None:
1127
+ """Extract strikethrough elements"""
1128
+ import re
1129
+
1130
+ for node in self._traverse_nodes(root_node):
1131
+ if node.type == "inline":
1132
+ try:
1133
+ raw_text = self._get_node_text_optimized(node)
1134
+ if not raw_text:
1135
+ continue
1136
+
1137
+ # Pattern for strikethrough: ~~text~~
1138
+ strike_pattern = r'~~([^~]+)~~'
1139
+ matches = re.finditer(strike_pattern, raw_text)
1140
+
1141
+ for match in matches:
1142
+ content = match.group(1) or ""
1143
+ start_line = node.start_point[0] + 1
1144
+ end_line = node.end_point[0] + 1
1145
+
1146
+ strike_element = MarkdownElement(
1147
+ name=f"Strikethrough: {content}",
1148
+ start_line=start_line,
1149
+ end_line=end_line,
1150
+ raw_text=match.group(0),
1151
+ element_type="strikethrough"
1152
+ )
1153
+ strike_element.type = "strikethrough"
1154
+ strike_element.text = content
1155
+ formatting_elements.append(strike_element)
1156
+
1157
+ except Exception as e:
1158
+ log_debug(f"Failed to extract strikethrough: {e}")
1159
+
1160
+ def _extract_footnote_elements(self, root_node: "tree_sitter.Node", footnotes: list[MarkdownElement]) -> None:
1161
+ """Extract footnote elements"""
1162
+ import re
1163
+
1164
+ for node in self._traverse_nodes(root_node):
1165
+ if node.type == "inline":
1166
+ try:
1167
+ raw_text = self._get_node_text_optimized(node)
1168
+ if not raw_text:
1169
+ continue
1170
+
1171
+ # Pattern for footnote references: [^1]
1172
+ footnote_ref_pattern = r'\[\^([^\]]+)\]'
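+ # e.g. [^1] -> ref_id="1"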
1173
+ matches = re.finditer(footnote_ref_pattern, raw_text)
1174
+
1175
+ for match in matches:
1176
+ ref_id = match.group(1) or ""
1177
+ start_line = node.start_point[0] + 1
1178
+ end_line = node.end_point[0] + 1
1179
+
1180
+ footnote_element = MarkdownElement(
1181
+ name=f"Footnote Reference: {ref_id}",
1182
+ start_line=start_line,
1183
+ end_line=end_line,
1184
+ raw_text=match.group(0),
1185
+ element_type="footnote_reference"
1186
+ )
1187
+ footnote_element.type = "footnote_reference"
1188
+ footnote_element.text = ref_id
1189
+ footnotes.append(footnote_element)
1190
+
1191
+ except Exception as e:
1192
+ log_debug(f"Failed to extract footnote reference: {e}")
1193
+
1194
+ # Look for footnote definitions
1195
+ elif node.type == "paragraph":
1196
+ try:
1197
+ raw_text = self._get_node_text_optimized(node)
1198
+ if not raw_text:
1199
+ continue
1200
+
1201
+ # Pattern for footnote definitions: [^1]: content
1202
+ footnote_def_pattern = r'^\[\^([^\]]+)\]:\s*(.+)$'
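+ # e.g. [^1]: See the appendix. -> ref_id="1", content="See the appendix."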
1203
+ match = re.match(footnote_def_pattern, raw_text.strip(), re.MULTILINE)
1204
+
1205
+ if match:
1206
+ ref_id = match.group(1) or ""
1207
+ content = match.group(2) or ""
1208
+ start_line = node.start_point[0] + 1
1209
+ end_line = node.end_point[0] + 1
1210
+
1211
+ footnote_element = MarkdownElement(
1212
+ name=f"Footnote Definition: {ref_id}",
1213
+ start_line=start_line,
1214
+ end_line=end_line,
1215
+ raw_text=raw_text,
1216
+ element_type="footnote_definition"
1217
+ )
1218
+ footnote_element.type = "footnote_definition"
1219
+ footnote_element.text = content
1220
+ footnotes.append(footnote_element)
1221
+
1222
+ except Exception as e:
1223
+ log_debug(f"Failed to extract footnote definition: {e}")
1224
+
1225
+ def _traverse_nodes(self, node: "tree_sitter.Node"):
1226
+ """Traverse all nodes in the tree"""
1227
+ yield node
1228
+ for child in node.children:
1229
+ yield from self._traverse_nodes(child)
1230
+
1231
+ def _parse_link_components(self, raw_text: str) -> tuple[str, str, str]:
1232
+ """Parse link components from raw text"""
1233
+ import re
1234
+
1235
+ # Pattern for [text](url "title")
1236
+ pattern = r'\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
1237
+ match = re.search(pattern, raw_text)
1238
+
1239
+ if match:
1240
+ text = match.group(1) or ""
1241
+ url = match.group(2) or ""
1242
+ title = match.group(3) or ""
1243
+ return text, url, title
1244
+
1245
+ return "", "", ""
1246
+
1247
+ def _parse_image_components(self, raw_text: str) -> tuple[str, str, str]:
1248
+ """Parse image components from raw text"""
1249
+ import re
1250
+
1251
+ # Pattern for ![alt](url "title")
1252
+ pattern = r'!\[([^\]]*)\]\(([^)]*?)(?:\s+"([^"]*)")?\)'
1253
+ match = re.search(pattern, raw_text)
1254
+
1255
+ if match:
1256
+ alt_text = match.group(1) or ""
1257
+ url = match.group(2) or ""
1258
+ title = match.group(3) or ""
1259
+ return alt_text, url, title
1260
+
1261
+ return "", "", ""
1262
+
1263
+
1264
+ class MarkdownPlugin(LanguagePlugin):
1265
+ """Markdown language plugin for the tree-sitter analyzer"""
1266
+
1267
+ def __init__(self) -> None:
1268
+ """Initialize the Markdown plugin"""
1269
+ super().__init__()
1270
+ self._language_cache: tree_sitter.Language | None = None
1271
+ self._extractor: MarkdownElementExtractor = MarkdownElementExtractor()
1272
+
1273
+ # Legacy compatibility attributes for tests
1274
+ self.language = "markdown"
1275
+ self.extractor = self._extractor
1276
+
1277
+ def get_language_name(self) -> str:
1278
+ """Return the name of the programming language this plugin supports"""
1279
+ return "markdown"
1280
+
1281
+ def get_file_extensions(self) -> list[str]:
1282
+ """Return list of file extensions this plugin supports"""
1283
+ return [".md", ".markdown", ".mdown", ".mkd", ".mkdn", ".mdx"]
1284
+
1285
+ def create_extractor(self) -> ElementExtractor:
1286
+ """Create and return an element extractor for this language"""
1287
+ return MarkdownElementExtractor()
1288
+
1289
+ def get_extractor(self) -> ElementExtractor:
1290
+ """Get the cached extractor instance, creating it if necessary"""
1291
+ return self._extractor
1292
+
1293
+ def get_language(self) -> str:
1294
+ """Get the language name for Markdown (legacy compatibility)"""
1295
+ return "markdown"
1296
+
1297
+ def extract_functions(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1298
+ """Extract functions from the tree (legacy compatibility)"""
1299
+ extractor = self.get_extractor()
1300
+ return extractor.extract_functions(tree, source_code)
1301
+
1302
+ def extract_classes(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1303
+ """Extract classes from the tree (legacy compatibility)"""
1304
+ extractor = self.get_extractor()
1305
+ return extractor.extract_classes(tree, source_code)
1306
+
1307
+ def extract_variables(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1308
+ """Extract variables from the tree (legacy compatibility)"""
1309
+ extractor = self.get_extractor()
1310
+ return extractor.extract_variables(tree, source_code)
1311
+
1312
+ def extract_imports(self, tree: "tree_sitter.Tree", source_code: str) -> list[CodeElement]:
1313
+ """Extract imports from the tree (legacy compatibility)"""
1314
+ extractor = self.get_extractor()
1315
+ return extractor.extract_imports(tree, source_code)
1316
+
1317
+ def get_tree_sitter_language(self) -> Optional["tree_sitter.Language"]:
1318
+ """Get the Tree-sitter language object for Markdown"""
1319
+ if self._language_cache is None:
1320
+ try:
1321
+ import tree_sitter
1322
+ import tree_sitter_markdown as tsmarkdown
1323
+
1324
+ # Handle newer versions of tree-sitter-markdown
1325
+ try:
1326
+ # New API (0.3.1+)
1327
+ language_capsule = tsmarkdown.language()
1328
+ self._language_cache = tree_sitter.Language(language_capsule)
1329
+ except (AttributeError, TypeError):
1330
+ # Older API or a different return format
1331
+ try:
1332
+ # Get the Language object directly
1333
+ self._language_cache = tsmarkdown.language()
1334
+ except Exception:
1335
+ # Last resort: get it directly from the module
1336
+ if hasattr(tsmarkdown, 'LANGUAGE'):
1337
+ self._language_cache = tree_sitter.Language(tsmarkdown.LANGUAGE)
1338
+ else:
1339
+ raise ImportError("Cannot access markdown language")
1340
+ except ImportError:
1341
+ log_error("tree-sitter-markdown not available")
1342
+ return None
1343
+ except Exception as e:
1344
+ log_error(f"Failed to load Markdown language: {e}")
1345
+ return None
1346
+ return self._language_cache
1347
+
1348
+ def get_supported_queries(self) -> list[str]:
1349
+ """Get list of supported query names for this language"""
1350
+ return [
1351
+ "headers",
1352
+ "code_blocks",
1353
+ "links",
1354
+ "images",
1355
+ "lists",
1356
+ "tables",
1357
+ "blockquotes",
1358
+ "emphasis",
1359
+ "inline_code",
1360
+ "references",
1361
+ "task_lists",
1362
+ "horizontal_rules",
1363
+ "html_blocks",
1364
+ "strikethrough",
1365
+ "footnotes",
1366
+ "text_content",
1367
+ "all_elements",
1368
+ ]
1369
+
1370
+ def is_applicable(self, file_path: str) -> bool:
1371
+ """Check if this plugin is applicable for the given file"""
1372
+ return any(
1373
+ file_path.lower().endswith(ext.lower())
1374
+ for ext in self.get_file_extensions()
1375
+ )
1376
+
1377
+ def get_plugin_info(self) -> dict:
1378
+ """Get information about this plugin"""
1379
+ return {
1380
+ "name": "Markdown Plugin",
1381
+ "language": self.get_language_name(),
1382
+ "extensions": self.get_file_extensions(),
1383
+ "version": "1.0.0",
1384
+ "supported_queries": self.get_supported_queries(),
1385
+ "features": [
1386
+ "ATX headers (# ## ###)",
1387
+ "Setext headers (underlined)",
1388
+ "Fenced code blocks",
1389
+ "Indented code blocks",
1390
+ "Inline code spans",
1391
+ "Inline links",
1392
+ "Reference links",
1393
+ "Autolinks",
1394
+ "Email autolinks",
1395
+ "Images (inline and reference)",
1396
+ "Lists (ordered and unordered)",
1397
+ "Task lists (checkboxes)",
1398
+ "Blockquotes",
1399
+ "Tables",
1400
+ "Emphasis and strong emphasis",
1401
+ "Strikethrough text",
1402
+ "Horizontal rules",
1403
+ "HTML blocks and inline HTML",
1404
+ "Footnotes (references and definitions)",
1405
+ "Reference definitions",
1406
+ "Text formatting extraction",
1407
+ "CommonMark compliance",
1408
+ ],
1409
+ }
1410
+
1411
+ async def analyze_file(
1412
+ self, file_path: str, request: AnalysisRequest
1413
+ ) -> AnalysisResult:
1414
+ """Analyze a Markdown file and return the analysis results."""
1415
+ if not TREE_SITTER_AVAILABLE:
1416
+ return AnalysisResult(
1417
+ file_path=file_path,
1418
+ language=self.get_language_name(),
1419
+ success=False,
1420
+ error_message="Tree-sitter library not available.",
1421
+ )
1422
+
1423
+ language = self.get_tree_sitter_language()
1424
+ if not language:
1425
+ return AnalysisResult(
1426
+ file_path=file_path,
1427
+ language=self.get_language_name(),
1428
+ success=False,
1429
+ error_message="Could not load Markdown language for parsing.",
1430
+ )
1431
+
1432
+ try:
1433
+ with open(file_path, encoding="utf-8") as f:
1434
+ source_code = f.read()
1435
+
1436
+ parser = tree_sitter.Parser()
1437
+ parser.language = language
1438
+ tree = parser.parse(bytes(source_code, "utf8"))
1439
+
1440
+ extractor = self.create_extractor()
1441
+ extractor.current_file = file_path # Set current file for context
1442
+
1443
+ elements: list[CodeElement] = []
1444
+
1445
+ # Extract all element types
1446
+ headers = extractor.extract_headers(tree, source_code)
1447
+ code_blocks = extractor.extract_code_blocks(tree, source_code)
1448
+ links = extractor.extract_links(tree, source_code)
1449
+ images = extractor.extract_images(tree, source_code)
1450
+ references = extractor.extract_references(tree, source_code)
1451
+ lists = extractor.extract_lists(tree, source_code)
1452
+ tables = extractor.extract_tables(tree, source_code)
1453
+
1454
+ # Extract new element types
1455
+ blockquotes = extractor.extract_blockquotes(tree, source_code)
1456
+ horizontal_rules = extractor.extract_horizontal_rules(tree, source_code)
1457
+ html_elements = extractor.extract_html_elements(tree, source_code)
1458
+ text_formatting = extractor.extract_text_formatting(tree, source_code)
1459
+ footnotes = extractor.extract_footnotes(tree, source_code)
1460
+
1461
+ elements.extend(headers)
1462
+ elements.extend(code_blocks)
1463
+ elements.extend(links)
1464
+ elements.extend(images)
1465
+ elements.extend(references)
1466
+ elements.extend(lists)
1467
+ elements.extend(tables)
1468
+ elements.extend(blockquotes)
1469
+ elements.extend(horizontal_rules)
1470
+ elements.extend(html_elements)
1471
+ elements.extend(text_formatting)
1472
+ elements.extend(footnotes)
1473
+
1474
+ def count_nodes(node: "tree_sitter.Node") -> int:
1475
+ count = 1
1476
+ for child in node.children:
1477
+ count += count_nodes(child)
1478
+ return count
1479
+
1480
+ return AnalysisResult(
1481
+ file_path=file_path,
1482
+ language=self.get_language_name(),
1483
+ success=True,
1484
+ elements=elements,
1485
+ line_count=len(source_code.splitlines()),
1486
+ node_count=count_nodes(tree.root_node),
1487
+ )
1488
+ except Exception as e:
1489
+ log_error(f"Error analyzing Markdown file {file_path}: {e}")
1490
+ return AnalysisResult(
1491
+ file_path=file_path,
1492
+ language=self.get_language_name(),
1493
+ success=False,
1494
+ error_message=str(e),
1495
+ )
1496
+
1497
+ def execute_query(self, tree: "tree_sitter.Tree", query_name: str) -> dict:
1498
+ """Execute a specific query on the tree"""
1499
+ try:
1500
+ import tree_sitter
1501
+
1502
+ language = self.get_tree_sitter_language()
1503
+ if not language:
1504
+ return {"error": "Language not available"}
1505
+
1506
+ # Import query definitions
1507
+ from ..queries.markdown import get_query
1508
+
1509
+ try:
1510
+ query_string = get_query(query_name)
1511
+ except KeyError:
1512
+ return {"error": f"Unknown query: {query_name}"}
1513
+
1514
+ # Use new tree-sitter 0.25.x API
1515
+ query = tree_sitter.Query(language, query_string)
1516
+
1517
+ # Execute query using the new API
1518
+ # In tree-sitter 0.25.x, we need to use a different approach
1519
+ matches = []
1520
+ captures = []
1521
+
1522
+ # Walk through the tree and find matches manually
1523
+ def walk_tree(node):
1524
+ # This is a simplified approach - in practice, you'd want to use
1525
+ # the proper query execution method when it becomes available
1526
+ if query_name == "headers" and node.type in ["atx_heading", "setext_heading"]:
1527
+ matches.append(node)
1528
+ elif query_name == "code_blocks" and node.type in ["fenced_code_block", "indented_code_block"]:
1529
+ matches.append(node)
1530
+ elif query_name == "links" and node.type in ["link", "autolink", "reference_link"]:
1531
+ matches.append(node)
1532
+
1533
+ for child in node.children:
1534
+ walk_tree(child)
1535
+
1536
+ walk_tree(tree.root_node)
1537
+
1538
+ # Convert matches to capture format
1539
+ for match in matches:
1540
+ captures.append((match, query_name))
1541
+
1542
+ return {"captures": captures, "query": query_string, "matches": len(matches)}
1543
+
1544
+ except Exception as e:
1545
+ log_error(f"Query execution failed: {e}")
1546
+ return {"error": str(e)}
1547
+
1548
+ def extract_elements(self, tree: "tree_sitter.Tree", source_code: str) -> list:
1549
+ """Extract elements from source code using tree-sitter AST"""
1550
+ extractor = self.get_extractor()
1551
+ elements = []
1552
+
1553
+ try:
1554
+ elements.extend(extractor.extract_headers(tree, source_code))
1555
+ elements.extend(extractor.extract_code_blocks(tree, source_code))
1556
+ elements.extend(extractor.extract_links(tree, source_code))
1557
+ elements.extend(extractor.extract_images(tree, source_code))
1558
+ elements.extend(extractor.extract_references(tree, source_code))
1559
+ elements.extend(extractor.extract_lists(tree, source_code))
1560
+ elements.extend(extractor.extract_tables(tree, source_code))
1561
+ elements.extend(extractor.extract_blockquotes(tree, source_code))
1562
+ elements.extend(extractor.extract_horizontal_rules(tree, source_code))
1563
+ elements.extend(extractor.extract_html_elements(tree, source_code))
1564
+ elements.extend(extractor.extract_text_formatting(tree, source_code))
1565
+ elements.extend(extractor.extract_footnotes(tree, source_code))
1566
+ except Exception as e:
1567
+ log_error(f"Failed to extract elements: {e}")
1568
+
1569
+ return elements
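
A minimal usage sketch for the plugin added in this release, assuming the file ships as tree_sitter_analyzer.languages.markdown_plugin (the import path is not shown in this diff and is an assumption); it follows the same parsing steps analyze_file uses:

import tree_sitter

from tree_sitter_analyzer.languages.markdown_plugin import MarkdownPlugin  # import path assumed

plugin = MarkdownPlugin()
language = plugin.get_tree_sitter_language()  # None when tree-sitter-markdown is not installed
if language is not None:
    parser = tree_sitter.Parser()
    parser.language = language  # tree-sitter 0.25.x style, as in analyze_file
    source = '# Title\n\nSee the [docs](https://example.com "Docs").\n'
    tree = parser.parse(source.encode("utf-8"))
    for element in plugin.extract_elements(tree, source):
        print(element.element_type, element.start_line, element.name)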