tree-sitter-analyzer 1.7.1__py3-none-any.whl → 1.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tree-sitter-analyzer might be problematic. Click here for more details.
- tree_sitter_analyzer/__init__.py +1 -1
- tree_sitter_analyzer/cli/commands/advanced_command.py +52 -0
- tree_sitter_analyzer/cli/commands/structure_command.py +50 -1
- tree_sitter_analyzer/cli/commands/summary_command.py +49 -0
- tree_sitter_analyzer/cli/commands/table_command.py +48 -0
- tree_sitter_analyzer/core/query_service.py +145 -5
- tree_sitter_analyzer/formatters/base_formatter.py +29 -2
- tree_sitter_analyzer/formatters/language_formatter_factory.py +83 -0
- tree_sitter_analyzer/formatters/markdown_formatter.py +426 -0
- tree_sitter_analyzer/language_detector.py +30 -0
- tree_sitter_analyzer/language_loader.py +1 -0
- tree_sitter_analyzer/languages/markdown_plugin.py +1569 -0
- tree_sitter_analyzer/languages/python_plugin.py +75 -16
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +184 -11
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +112 -2
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +210 -10
- tree_sitter_analyzer/queries/markdown.py +379 -0
- tree_sitter_analyzer/query_loader.py +1 -0
- {tree_sitter_analyzer-1.7.1.dist-info → tree_sitter_analyzer-1.7.3.dist-info}/METADATA +54 -18
- {tree_sitter_analyzer-1.7.1.dist-info → tree_sitter_analyzer-1.7.3.dist-info}/RECORD +22 -18
- {tree_sitter_analyzer-1.7.1.dist-info → tree_sitter_analyzer-1.7.3.dist-info}/entry_points.txt +1 -0
- {tree_sitter_analyzer-1.7.1.dist-info → tree_sitter_analyzer-1.7.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Markdown Formatter
|
|
4
|
+
|
|
5
|
+
Provides specialized formatting for Markdown files, focusing on document structure
|
|
6
|
+
rather than programming constructs like classes and methods.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Dict, List, Any, Optional
|
|
10
|
+
from .base_formatter import BaseFormatter
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MarkdownFormatter(BaseFormatter):
    """Formatter specialized for Markdown documents.

    Unlike the code-oriented formatters, this one reports document
    structure — headings, links, images, code blocks, lists and tables —
    rather than programming constructs such as classes and methods.
    """

    # Shared element-type groupings.  Every formatting method filters the
    # same analysis elements, so the groupings live in one place instead of
    # being repeated inline (keeping them from drifting apart over time).
    _LINK_TYPES = ("link", "autolink", "reference_link")
    _IMAGE_TYPES = ("image", "reference_image")
    _LIST_TYPES = ("list", "task_list")

    def __init__(self) -> None:
        super().__init__()
        self.language = "markdown"

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _by_type(elements: List[Dict[str, Any]], *types: str) -> List[Dict[str, Any]]:
        """Return the elements whose ``"type"`` is one of *types*."""
        wanted = set(types)
        return [e for e in elements if e.get("type") in wanted]

    @staticmethod
    def _is_external_url(url: Optional[str]) -> bool:
        """Return True when *url* is a non-empty absolute http(s) URL."""
        return bool(url) and url.startswith(("http://", "https://"))

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Shorten *text* to *limit* characters, appending ``...`` when cut."""
        return text[:limit] + "..." if len(text) > limit else text

    # ------------------------------------------------------------------
    # Public formatting API
    # ------------------------------------------------------------------

    def format_summary(self, analysis_result: Dict[str, Any]) -> str:
        """Format a compact summary of a Markdown analysis result.

        Args:
            analysis_result: Analyzer output containing at least
                ``"file_path"`` and a list of ``"elements"`` dicts.

        Returns:
            A titled JSON string describing headers, links, images,
            code blocks and lists.
        """
        file_path = analysis_result.get("file_path", "")
        elements = analysis_result.get("elements", [])

        # Count different types of Markdown elements.
        headers = self._by_type(elements, "heading")
        links = self._by_type(elements, *self._LINK_TYPES)
        images = self._by_type(elements, *self._IMAGE_TYPES)
        code_blocks = self._by_type(elements, "code_block")
        lists = self._by_type(elements, *self._LIST_TYPES)

        summary = {
            "headers": [{"name": h.get("text", "").strip(), "level": h.get("level", 1)} for h in headers],
            "links": [{"text": l.get("text", ""), "url": l.get("url", "")} for l in links],
            "images": [{"alt": i.get("alt", ""), "url": i.get("url", "")} for i in images],
            "code_blocks": [{"language": cb.get("language", ""), "lines": cb.get("line_count", 0)} for cb in code_blocks],
            "lists": [{"type": l.get("list_type", ""), "items": l.get("item_count", 0)} for l in lists]
        }

        result = {
            "file_path": file_path,
            "language": "markdown",
            "summary": summary
        }

        return self._format_json_output("Summary Results", result)

    def format_structure(self, analysis_result: Dict[str, Any]) -> str:
        """Format a detailed structure analysis for a Markdown file.

        Returns a titled JSON string that lists every structural element
        with its line range, plus aggregate statistics.
        """
        file_path = analysis_result.get("file_path", "")
        elements = analysis_result.get("elements", [])
        line_count = analysis_result.get("line_count", 0)

        # Organize elements by type.
        headers = self._by_type(elements, "heading")
        links = self._by_type(elements, *self._LINK_TYPES)
        images = self._by_type(elements, *self._IMAGE_TYPES)
        code_blocks = self._by_type(elements, "code_block")
        lists = self._by_type(elements, *self._LIST_TYPES)
        tables = self._by_type(elements, "table")

        structure = {
            "file_path": file_path,
            "language": "markdown",
            "headers": [
                {
                    "text": h.get("text", "").strip(),
                    "level": h.get("level", 1),
                    "line_range": h.get("line_range", {})
                } for h in headers
            ],
            "links": [
                {
                    "text": l.get("text", ""),
                    "url": l.get("url", ""),
                    "line_range": l.get("line_range", {})
                } for l in links
            ],
            "images": [
                {
                    "alt": i.get("alt", ""),
                    "url": i.get("url", ""),
                    "line_range": i.get("line_range", {})
                } for i in images
            ],
            "code_blocks": [
                {
                    "language": cb.get("language", ""),
                    "line_count": cb.get("line_count", 0),
                    "line_range": cb.get("line_range", {})
                } for cb in code_blocks
            ],
            "lists": [
                {
                    "type": l.get("list_type", ""),
                    "item_count": l.get("item_count", 0),
                    "line_range": l.get("line_range", {})
                } for l in lists
            ],
            "tables": [
                {
                    "columns": t.get("column_count", 0),
                    "rows": t.get("row_count", 0),
                    "line_range": t.get("line_range", {})
                } for t in tables
            ],
            "statistics": {
                "header_count": len(headers),
                "link_count": len(links),
                "image_count": len(images),
                "code_block_count": len(code_blocks),
                "list_count": len(lists),
                "table_count": len(tables),
                "total_lines": line_count
            },
            "analysis_metadata": analysis_result.get("analysis_metadata", {})
        }

        return self._format_json_output("Structure Analysis Results", structure)

    def format_advanced(self, analysis_result: Dict[str, Any], output_format: str = "json") -> str:
        """Format an advanced analysis with document metrics.

        Args:
            analysis_result: Analyzer output dict.
            output_format: ``"json"`` (default) or ``"text"``.

        Returns:
            The formatted advanced-analysis report.
        """
        file_path = analysis_result.get("file_path", "")
        elements = analysis_result.get("elements", [])
        line_count = analysis_result.get("line_count", 0)
        element_count = len(elements)

        # Calculate Markdown-specific metrics.
        headers = self._by_type(elements, "heading")
        links = self._by_type(elements, *self._LINK_TYPES)
        images = self._by_type(elements, *self._IMAGE_TYPES)
        code_blocks = self._by_type(elements, "code_block")
        lists = self._by_type(elements, *self._LIST_TYPES)
        tables = self._by_type(elements, "table")

        # Document structure metrics.
        header_levels = [h.get("level", 1) for h in headers]
        max_header_level = max(header_levels) if header_levels else 0
        avg_header_level = sum(header_levels) / len(header_levels) if header_levels else 0

        # Content metrics.
        total_code_lines = sum(cb.get("line_count", 0) for cb in code_blocks)
        total_list_items = sum(l.get("item_count", 0) for l in lists)

        # External vs internal links.
        external_links = [l for l in links if self._is_external_url(l.get("url"))]
        internal_links = [l for l in links if not self._is_external_url(l.get("url"))]

        advanced_data = {
            "file_path": file_path,
            "language": "markdown",
            "line_count": line_count,
            "element_count": element_count,
            "success": True,
            "elements": elements,
            "document_metrics": {
                "header_count": len(headers),
                "max_header_level": max_header_level,
                "avg_header_level": round(avg_header_level, 2),
                "link_count": len(links),
                "external_link_count": len(external_links),
                "internal_link_count": len(internal_links),
                "image_count": len(images),
                "code_block_count": len(code_blocks),
                "total_code_lines": total_code_lines,
                "list_count": len(lists),
                "total_list_items": total_list_items,
                "table_count": len(tables)
            },
            "content_analysis": {
                "has_toc": any("table of contents" in h.get("text", "").lower() for h in headers),
                "has_code_examples": len(code_blocks) > 0,
                "has_images": len(images) > 0,
                "has_external_links": len(external_links) > 0,
                "document_complexity": self._calculate_document_complexity(headers, links, code_blocks, tables)
            }
        }

        if output_format == "text":
            return self._format_advanced_text(advanced_data)
        else:
            return self._format_json_output("Advanced Analysis Results", advanced_data)

    def format_table(self, analysis_result: Dict[str, Any], table_type: str = "full") -> str:
        """Format the analysis result as a set of Markdown tables.

        Args:
            analysis_result: Analyzer output dict.
            table_type: Present for interface compatibility; the full
                report is always produced.

        Returns:
            A Markdown document with one section per element category.
        """
        file_path = analysis_result.get("file_path", "")
        elements = analysis_result.get("elements", [])

        # Document title: first header text, falling back to the file name.
        headers = self._by_type(elements, "heading")
        title = headers[0].get("text", "").strip() if headers else file_path.split("/")[-1]

        output = [f"# {title}\n"]

        # Document Overview
        output.append("## Document Overview\n")
        output.append("| Property | Value |")
        output.append("|----------|-------|")
        output.append(f"| File | {file_path} |")
        output.append("| Language | markdown |")
        output.append(f"| Total Lines | {analysis_result.get('line_count', 0)} |")
        output.append(f"| Total Elements | {len(elements)} |")
        output.append("")

        # Headers Section
        if headers:
            output.append("## Document Structure\n")
            output.append("| Level | Header | Line |")
            output.append("|-------|--------|------|")
            for header in headers:
                level = "#" * header.get("level", 1)
                text = header.get("text", "").strip()
                line = header.get("line_range", {}).get("start", "")
                output.append(f"| {level} | {text} | {line} |")
            output.append("")

        # Links Section
        links = self._by_type(elements, *self._LINK_TYPES)
        if links:
            output.append("## Links\n")
            output.append("| Text | URL | Type | Line |")
            output.append("|------|-----|------|------|")
            for link in links:
                text = link.get("text", "")
                url = link.get("url", "") or ""
                link_type = "External" if self._is_external_url(url) else "Internal"
                line = link.get("line_range", {}).get("start", "")
                output.append(f"| {text} | {url} | {link_type} | {line} |")
            output.append("")

        # Images Section
        images = self._by_type(elements, *self._IMAGE_TYPES)
        if images:
            output.append("## Images\n")
            output.append("| Alt Text | URL | Line |")
            output.append("|----------|-----|------|")
            for image in images:
                alt = image.get("alt", "")
                url = image.get("url", "")
                line = image.get("line_range", {}).get("start", "")
                output.append(f"| {alt} | {url} | {line} |")
            output.append("")

        # Code Blocks Section
        code_blocks = self._by_type(elements, "code_block")
        if code_blocks:
            output.append("## Code Blocks\n")
            output.append("| Language | Lines | Line Range |")
            output.append("|----------|-------|------------|")
            for cb in code_blocks:
                language = cb.get("language", "text")
                lines = cb.get("line_count", 0)
                line_range = cb.get("line_range", {})
                start = line_range.get("start", "")
                end = line_range.get("end", "")
                range_str = f"{start}-{end}" if start and end else str(start)
                output.append(f"| {language} | {lines} | {range_str} |")
            output.append("")

        # Lists Section
        lists = self._by_type(elements, *self._LIST_TYPES)
        if lists:
            output.append("## Lists\n")
            output.append("| Type | Items | Line |")
            output.append("|------|-------|------|")
            for lst in lists:
                list_type = lst.get("list_type", "unordered")
                items = lst.get("item_count", 0)
                line = lst.get("line_range", {}).get("start", "")
                output.append(f"| {list_type} | {items} | {line} |")
            output.append("")

        # Tables Section
        tables = self._by_type(elements, "table")
        if tables:
            output.append("## Tables\n")
            output.append("| Columns | Rows | Line |")
            output.append("|---------|------|------|")
            for table in tables:
                columns = table.get("column_count", 0)
                rows = table.get("row_count", 0)
                line = table.get("line_range", {}).get("start", "")
                output.append(f"| {columns} | {rows} | {line} |")
            output.append("")

        # Blockquotes Section
        blockquotes = self._by_type(elements, "blockquote")
        if blockquotes:
            output.append("## Blockquotes\n")
            output.append("| Content | Line |")
            output.append("|---------|------|")
            for bq in blockquotes:
                content = self._truncate(bq.get("text", ""), 50)
                line = bq.get("line_range", {}).get("start", "")
                output.append(f"| {content} | {line} |")
            output.append("")

        # Horizontal Rules Section
        horizontal_rules = self._by_type(elements, "horizontal_rule")
        if horizontal_rules:
            output.append("## Horizontal Rules\n")
            output.append("| Type | Line |")
            output.append("|------|------|")
            for hr in horizontal_rules:
                line = hr.get("line_range", {}).get("start", "")
                output.append(f"| Horizontal Rule | {line} |")
            output.append("")

        # HTML Elements Section
        html_elements = self._by_type(elements, "html_block", "html_inline")
        if html_elements:
            output.append("## HTML Elements\n")
            output.append("| Type | Content | Line |")
            output.append("|------|---------|------|")
            for html in html_elements:
                element_type = html.get("type", "")
                content = self._truncate(html.get("name", ""), 30)
                line = html.get("line_range", {}).get("start", "")
                output.append(f"| {element_type} | {content} | {line} |")
            output.append("")

        # Text Formatting Section
        formatting_elements = self._by_type(
            elements, "strong_emphasis", "emphasis", "inline_code", "strikethrough"
        )
        if formatting_elements:
            output.append("## Text Formatting\n")
            output.append("| Type | Content | Line |")
            output.append("|------|---------|------|")
            for fmt in formatting_elements:
                format_type = fmt.get("type", "")
                content = self._truncate(fmt.get("text", ""), 30)
                line = fmt.get("line_range", {}).get("start", "")
                output.append(f"| {format_type} | {content} | {line} |")
            output.append("")

        # Footnotes Section
        footnotes = self._by_type(elements, "footnote_reference", "footnote_definition")
        if footnotes:
            output.append("## Footnotes\n")
            output.append("| Type | Content | Line |")
            output.append("|------|---------|------|")
            for fn in footnotes:
                footnote_type = fn.get("type", "")
                content = self._truncate(fn.get("text", ""), 30)
                line = fn.get("line_range", {}).get("start", "")
                output.append(f"| {footnote_type} | {content} | {line} |")
            output.append("")

        # Reference Definitions Section
        references = self._by_type(elements, "reference_definition")
        if references:
            output.append("## Reference Definitions\n")
            output.append("| Content | Line |")
            output.append("|---------|------|")
            for ref in references:
                content = self._truncate(ref.get("name", ""), 50)
                line = ref.get("line_range", {}).get("start", "")
                output.append(f"| {content} | {line} |")
            output.append("")

        return "\n".join(output)

    # ------------------------------------------------------------------
    # Private formatting helpers
    # ------------------------------------------------------------------

    def _format_advanced_text(self, data: Dict[str, Any]) -> str:
        """Render the advanced-analysis dict as a plain-text report."""
        output = ["--- Advanced Analysis Results ---"]

        # Basic info
        output.append(f'"File: {data["file_path"]}"')
        output.append(f'"Language: {data["language"]}"')
        output.append(f'"Lines: {data["line_count"]}"')
        output.append(f'"Elements: {data["element_count"]}"')

        # Document metrics
        metrics = data["document_metrics"]
        output.append(f'"Headers: {metrics["header_count"]}"')
        output.append(f'"Max Header Level: {metrics["max_header_level"]}"')
        output.append(f'"Links: {metrics["link_count"]}"')
        output.append(f'"External Links: {metrics["external_link_count"]}"')
        output.append(f'"Images: {metrics["image_count"]}"')
        output.append(f'"Code Blocks: {metrics["code_block_count"]}"')
        output.append(f'"Code Lines: {metrics["total_code_lines"]}"')
        output.append(f'"Lists: {metrics["list_count"]}"')
        output.append(f'"Tables: {metrics["table_count"]}"')

        # Content analysis
        content = data["content_analysis"]
        output.append(f'"Has TOC: {content["has_toc"]}"')
        output.append(f'"Has Code: {content["has_code_examples"]}"')
        output.append(f'"Has Images: {content["has_images"]}"')
        output.append(f'"Has External Links: {content["has_external_links"]}"')
        output.append(f'"Document Complexity: {content["document_complexity"]}"')

        return "\n".join(output)

    def _calculate_document_complexity(self, headers: List[Dict], links: List[Dict],
                                       code_blocks: List[Dict], tables: List[Dict]) -> str:
        """Classify document complexity from a weighted element score.

        Weights (headers x2 plus deepest level x3, links x1, code blocks x5,
        tables x3) are heuristic; thresholds map the score to one of
        "Simple" / "Moderate" / "Complex" / "Very Complex".
        """
        score = 0

        # Header complexity: count plus nesting depth.
        if headers:
            header_levels = [h.get("level", 1) for h in headers]
            max_level = max(header_levels)
            score += len(headers) * 2  # Base score for headers
            score += max_level * 3  # Deeper nesting increases complexity

        # Content complexity.
        score += len(links) * 1  # Links add moderate complexity
        score += len(code_blocks) * 5  # Code blocks add significant complexity
        score += len(tables) * 3  # Tables add moderate complexity

        # Classify complexity.
        if score < 20:
            return "Simple"
        elif score < 50:
            return "Moderate"
        elif score < 100:
            return "Complex"
        else:
            return "Very Complex"

    def _format_json_output(self, title: str, data: Dict[str, Any]) -> str:
        """Serialize *data* as indented JSON under a ``--- title ---`` banner."""
        import json
        output = [f"--- {title} ---"]
        output.append(json.dumps(data, indent=2, ensure_ascii=False))
        return "\n".join(output)
|
|
@@ -59,6 +59,13 @@ class LanguageDetector:
|
|
|
59
59
|
".m": "objc", # Ambiguous (MATLAB as well)
|
|
60
60
|
".dart": "dart",
|
|
61
61
|
".elm": "elm",
|
|
62
|
+
# Markdown系
|
|
63
|
+
".md": "markdown",
|
|
64
|
+
".markdown": "markdown",
|
|
65
|
+
".mdown": "markdown",
|
|
66
|
+
".mkd": "markdown",
|
|
67
|
+
".mkdn": "markdown",
|
|
68
|
+
".mdx": "markdown",
|
|
62
69
|
}
|
|
63
70
|
|
|
64
71
|
# Ambiguous extensions (map to multiple languages)
|
|
@@ -92,6 +99,7 @@ class LanguageDetector:
|
|
|
92
99
|
"cpp",
|
|
93
100
|
"rust",
|
|
94
101
|
"go",
|
|
102
|
+
"markdown",
|
|
95
103
|
}
|
|
96
104
|
|
|
97
105
|
def __init__(self) -> None:
|
|
@@ -128,6 +136,13 @@ class LanguageDetector:
|
|
|
128
136
|
".r": ("r", 0.9),
|
|
129
137
|
".m": ("objectivec", 0.7),
|
|
130
138
|
".mm": ("objectivec", 0.8),
|
|
139
|
+
# Markdown extensions
|
|
140
|
+
".md": ("markdown", 0.9),
|
|
141
|
+
".markdown": ("markdown", 0.9),
|
|
142
|
+
".mdown": ("markdown", 0.8),
|
|
143
|
+
".mkd": ("markdown", 0.8),
|
|
144
|
+
".mkdn": ("markdown", 0.8),
|
|
145
|
+
".mdx": ("markdown", 0.7), # MDX might be mixed with JSX
|
|
131
146
|
}
|
|
132
147
|
|
|
133
148
|
# Content-based detection patterns
|
|
@@ -169,6 +184,16 @@ class LanguageDetector:
|
|
|
169
184
|
(r"std::\w+", 0.2),
|
|
170
185
|
(r"class\s+\w+\s*{", 0.3),
|
|
171
186
|
],
|
|
187
|
+
"markdown": [
|
|
188
|
+
(r"^#{1,6}\s+", 0.4), # ATX headers
|
|
189
|
+
(r"^\s*[-*+]\s+", 0.3), # List items
|
|
190
|
+
(r"```[\w]*", 0.3), # Fenced code blocks
|
|
191
|
+
(r"\[.*\]\(.*\)", 0.2), # Links
|
|
192
|
+
(r"!\[.*\]\(.*\)", 0.2), # Images
|
|
193
|
+
(r"^\s*>\s+", 0.2), # Blockquotes
|
|
194
|
+
(r"^\s*\|.*\|", 0.2), # Tables
|
|
195
|
+
(r"^[-=]{3,}$", 0.2), # Setext headers or horizontal rules
|
|
196
|
+
],
|
|
172
197
|
}
|
|
173
198
|
|
|
174
199
|
from .utils import log_debug, log_warning
|
|
@@ -196,6 +221,11 @@ class LanguageDetector:
|
|
|
196
221
|
if extension in self.EXTENSION_MAPPING:
|
|
197
222
|
language = self.EXTENSION_MAPPING[extension]
|
|
198
223
|
|
|
224
|
+
# Use confidence from extension_map if available
|
|
225
|
+
if extension in self.extension_map:
|
|
226
|
+
_, confidence = self.extension_map[extension]
|
|
227
|
+
return language, confidence
|
|
228
|
+
|
|
199
229
|
# No ambiguity -> high confidence
|
|
200
230
|
if extension not in self.AMBIGUOUS_EXTENSIONS:
|
|
201
231
|
return language, 1.0
|