tree-sitter-analyzer 1.8.4__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tree-sitter-analyzer might be problematic. Click here for more details.
- tree_sitter_analyzer/__init__.py +1 -1
- tree_sitter_analyzer/api.py +4 -4
- tree_sitter_analyzer/cli/argument_validator.py +29 -17
- tree_sitter_analyzer/cli/commands/advanced_command.py +7 -5
- tree_sitter_analyzer/cli/commands/structure_command.py +7 -5
- tree_sitter_analyzer/cli/commands/summary_command.py +10 -6
- tree_sitter_analyzer/cli/commands/table_command.py +8 -7
- tree_sitter_analyzer/cli/info_commands.py +1 -1
- tree_sitter_analyzer/cli_main.py +3 -2
- tree_sitter_analyzer/core/analysis_engine.py +5 -5
- tree_sitter_analyzer/core/cache_service.py +3 -1
- tree_sitter_analyzer/core/query.py +17 -5
- tree_sitter_analyzer/core/query_service.py +1 -1
- tree_sitter_analyzer/encoding_utils.py +3 -3
- tree_sitter_analyzer/exceptions.py +61 -50
- tree_sitter_analyzer/file_handler.py +3 -0
- tree_sitter_analyzer/formatters/base_formatter.py +10 -5
- tree_sitter_analyzer/formatters/formatter_registry.py +83 -68
- tree_sitter_analyzer/formatters/html_formatter.py +90 -64
- tree_sitter_analyzer/formatters/javascript_formatter.py +21 -16
- tree_sitter_analyzer/formatters/language_formatter_factory.py +7 -6
- tree_sitter_analyzer/formatters/markdown_formatter.py +247 -124
- tree_sitter_analyzer/formatters/python_formatter.py +61 -38
- tree_sitter_analyzer/formatters/typescript_formatter.py +113 -45
- tree_sitter_analyzer/interfaces/mcp_server.py +2 -2
- tree_sitter_analyzer/language_detector.py +6 -6
- tree_sitter_analyzer/language_loader.py +3 -1
- tree_sitter_analyzer/languages/css_plugin.py +120 -61
- tree_sitter_analyzer/languages/html_plugin.py +159 -62
- tree_sitter_analyzer/languages/java_plugin.py +42 -34
- tree_sitter_analyzer/languages/javascript_plugin.py +59 -30
- tree_sitter_analyzer/languages/markdown_plugin.py +402 -368
- tree_sitter_analyzer/languages/python_plugin.py +111 -64
- tree_sitter_analyzer/languages/typescript_plugin.py +241 -132
- tree_sitter_analyzer/mcp/server.py +22 -18
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +13 -8
- tree_sitter_analyzer/mcp/tools/base_tool.py +2 -2
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +232 -26
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +31 -23
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +21 -19
- tree_sitter_analyzer/mcp/tools/query_tool.py +17 -18
- tree_sitter_analyzer/mcp/tools/read_partial_tool.py +30 -31
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +131 -77
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +29 -16
- tree_sitter_analyzer/mcp/utils/file_output_factory.py +64 -51
- tree_sitter_analyzer/mcp/utils/file_output_manager.py +34 -24
- tree_sitter_analyzer/mcp/utils/gitignore_detector.py +8 -4
- tree_sitter_analyzer/models.py +7 -5
- tree_sitter_analyzer/plugins/base.py +9 -7
- tree_sitter_analyzer/plugins/manager.py +1 -0
- tree_sitter_analyzer/queries/css.py +2 -21
- tree_sitter_analyzer/queries/html.py +2 -15
- tree_sitter_analyzer/queries/markdown.py +30 -41
- tree_sitter_analyzer/queries/python.py +20 -5
- tree_sitter_analyzer/query_loader.py +5 -5
- tree_sitter_analyzer/security/validator.py +114 -86
- tree_sitter_analyzer/utils/__init__.py +58 -28
- tree_sitter_analyzer/utils/tree_sitter_compat.py +72 -65
- tree_sitter_analyzer/utils.py +26 -15
- {tree_sitter_analyzer-1.8.4.dist-info → tree_sitter_analyzer-1.9.1.dist-info}/METADATA +23 -6
- tree_sitter_analyzer-1.9.1.dist-info/RECORD +109 -0
- tree_sitter_analyzer-1.8.4.dist-info/RECORD +0 -109
- {tree_sitter_analyzer-1.8.4.dist-info → tree_sitter_analyzer-1.9.1.dist-info}/WHEEL +0 -0
- {tree_sitter_analyzer-1.8.4.dist-info → tree_sitter_analyzer-1.9.1.dist-info}/entry_points.txt +0 -0
|
@@ -6,7 +6,8 @@ Provides specialized formatting for Markdown files, focusing on document structu
|
|
|
6
6
|
rather than programming constructs like classes and methods.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
from typing import
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
10
11
|
from .base_formatter import BaseFormatter
|
|
11
12
|
|
|
12
13
|
|
|
@@ -17,18 +18,22 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
17
18
|
super().__init__()
|
|
18
19
|
self.language = "markdown"
|
|
19
20
|
|
|
20
|
-
def format_summary(self, analysis_result:
|
|
21
|
+
def format_summary(self, analysis_result: dict[str, Any]) -> str:
|
|
21
22
|
"""Format summary for Markdown files"""
|
|
22
23
|
file_path = analysis_result.get("file_path", "")
|
|
23
24
|
elements = analysis_result.get("elements", [])
|
|
24
|
-
|
|
25
|
+
|
|
25
26
|
# Count different types of Markdown elements
|
|
26
27
|
headers = [e for e in elements if e.get("type") == "heading"]
|
|
27
|
-
links = [
|
|
28
|
+
links = [
|
|
29
|
+
e
|
|
30
|
+
for e in elements
|
|
31
|
+
if e.get("type") in ["link", "autolink", "reference_link"]
|
|
32
|
+
]
|
|
28
33
|
images = self._collect_images(elements)
|
|
29
34
|
code_blocks = [e for e in elements if e.get("type") == "code_block"]
|
|
30
35
|
lists = [e for e in elements if e.get("type") in ["list", "task_list"]]
|
|
31
|
-
|
|
36
|
+
|
|
32
37
|
# Robust adjust for link/image counts to match other commands
|
|
33
38
|
robust_counts = self._compute_robust_counts_from_file(file_path)
|
|
34
39
|
if len(links) < robust_counts.get("link_count", len(links)):
|
|
@@ -37,7 +42,9 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
37
42
|
missing = robust_counts.get("link_count", 0) - len(links)
|
|
38
43
|
if missing > 0:
|
|
39
44
|
# Add placeholder autolink entries to align with expected count
|
|
40
|
-
links = links + [
|
|
45
|
+
links = links + [
|
|
46
|
+
{"text": "autolink", "url": "autolink"} for _ in range(missing)
|
|
47
|
+
]
|
|
41
48
|
|
|
42
49
|
# Some environments under-detect reference images in elements; align summary with
|
|
43
50
|
# robust image count used elsewhere (structure/advanced) by adding placeholders
|
|
@@ -48,35 +55,49 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
48
55
|
images = images + ([{"alt": "", "url": ""}] * missing)
|
|
49
56
|
|
|
50
57
|
summary = {
|
|
51
|
-
"headers": [
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
"
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
"
|
|
60
|
-
|
|
61
|
-
|
|
58
|
+
"headers": [
|
|
59
|
+
{"name": h.get("text", "").strip(), "level": h.get("level", 1)}
|
|
60
|
+
for h in headers
|
|
61
|
+
],
|
|
62
|
+
"links": [
|
|
63
|
+
{"text": link.get("text", ""), "url": link.get("url", "")}
|
|
64
|
+
for link in links
|
|
65
|
+
],
|
|
66
|
+
"images": [
|
|
67
|
+
{"alt": i.get("alt", ""), "url": i.get("url", "")} for i in images
|
|
68
|
+
],
|
|
69
|
+
"code_blocks": [
|
|
70
|
+
{"language": cb.get("language", ""), "lines": cb.get("line_count", 0)}
|
|
71
|
+
for cb in code_blocks
|
|
72
|
+
],
|
|
73
|
+
"lists": [
|
|
74
|
+
{"type": lst.get("list_type", ""), "items": lst.get("item_count", 0)}
|
|
75
|
+
for lst in lists
|
|
76
|
+
],
|
|
62
77
|
}
|
|
63
|
-
|
|
78
|
+
|
|
79
|
+
result = {"file_path": file_path, "language": "markdown", "summary": summary}
|
|
80
|
+
|
|
64
81
|
return self._format_json_output("Summary Results", result)
|
|
65
82
|
|
|
66
|
-
def format_structure(self, analysis_result:
|
|
83
|
+
def format_structure(self, analysis_result: dict[str, Any]) -> str:
|
|
67
84
|
"""Format structure analysis for Markdown files"""
|
|
68
85
|
file_path = analysis_result.get("file_path", "")
|
|
69
86
|
elements = analysis_result.get("elements", [])
|
|
70
87
|
line_count = analysis_result.get("line_count", 0)
|
|
71
|
-
|
|
88
|
+
|
|
72
89
|
# Organize elements by type
|
|
73
90
|
headers = [e for e in elements if e.get("type") == "heading"]
|
|
74
|
-
links = [
|
|
91
|
+
links = [
|
|
92
|
+
e
|
|
93
|
+
for e in elements
|
|
94
|
+
if e.get("type") in ["link", "autolink", "reference_link"]
|
|
95
|
+
]
|
|
75
96
|
images = self._collect_images(elements)
|
|
76
97
|
code_blocks = [e for e in elements if e.get("type") == "code_block"]
|
|
77
98
|
lists = [e for e in elements if e.get("type") in ["list", "task_list"]]
|
|
78
99
|
tables = [e for e in elements if e.get("type") == "table"]
|
|
79
|
-
|
|
100
|
+
|
|
80
101
|
# Robust counts to avoid undercount due to parser variance
|
|
81
102
|
robust_counts = self._compute_robust_counts_from_file(file_path)
|
|
82
103
|
|
|
@@ -91,43 +112,49 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
91
112
|
{
|
|
92
113
|
"text": h.get("text", "").strip(),
|
|
93
114
|
"level": h.get("level", 1),
|
|
94
|
-
"line_range": h.get("line_range", {})
|
|
95
|
-
}
|
|
115
|
+
"line_range": h.get("line_range", {}),
|
|
116
|
+
}
|
|
117
|
+
for h in headers
|
|
96
118
|
],
|
|
97
119
|
"links": [
|
|
98
120
|
{
|
|
99
|
-
"text":
|
|
100
|
-
"url":
|
|
101
|
-
"line_range":
|
|
102
|
-
}
|
|
121
|
+
"text": link.get("text", ""),
|
|
122
|
+
"url": link.get("url", ""),
|
|
123
|
+
"line_range": link.get("line_range", {}),
|
|
124
|
+
}
|
|
125
|
+
for link in links
|
|
103
126
|
],
|
|
104
127
|
"images": [
|
|
105
128
|
{
|
|
106
129
|
"alt": i.get("alt", ""),
|
|
107
130
|
"url": i.get("url", ""),
|
|
108
|
-
"line_range": i.get("line_range", {})
|
|
109
|
-
}
|
|
131
|
+
"line_range": i.get("line_range", {}),
|
|
132
|
+
}
|
|
133
|
+
for i in images
|
|
110
134
|
],
|
|
111
135
|
"code_blocks": [
|
|
112
136
|
{
|
|
113
137
|
"language": cb.get("language", ""),
|
|
114
138
|
"line_count": cb.get("line_count", 0),
|
|
115
|
-
"line_range": cb.get("line_range", {})
|
|
116
|
-
}
|
|
139
|
+
"line_range": cb.get("line_range", {}),
|
|
140
|
+
}
|
|
141
|
+
for cb in code_blocks
|
|
117
142
|
],
|
|
118
143
|
"lists": [
|
|
119
144
|
{
|
|
120
|
-
"type":
|
|
121
|
-
"item_count":
|
|
122
|
-
"line_range":
|
|
123
|
-
}
|
|
145
|
+
"type": lst.get("list_type", ""),
|
|
146
|
+
"item_count": lst.get("item_count", 0),
|
|
147
|
+
"line_range": lst.get("line_range", {}),
|
|
148
|
+
}
|
|
149
|
+
for lst in lists
|
|
124
150
|
],
|
|
125
151
|
"tables": [
|
|
126
152
|
{
|
|
127
153
|
"columns": t.get("column_count", 0),
|
|
128
154
|
"rows": t.get("row_count", 0),
|
|
129
|
-
"line_range": t.get("line_range", {})
|
|
130
|
-
}
|
|
155
|
+
"line_range": t.get("line_range", {}),
|
|
156
|
+
}
|
|
157
|
+
for t in tables
|
|
131
158
|
],
|
|
132
159
|
"statistics": {
|
|
133
160
|
"header_count": len(headers),
|
|
@@ -137,41 +164,61 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
137
164
|
"code_block_count": len(code_blocks),
|
|
138
165
|
"list_count": len(lists),
|
|
139
166
|
"table_count": len(tables),
|
|
140
|
-
"total_lines": line_count
|
|
167
|
+
"total_lines": line_count,
|
|
141
168
|
},
|
|
142
|
-
"analysis_metadata": analysis_result.get("analysis_metadata", {})
|
|
169
|
+
"analysis_metadata": analysis_result.get("analysis_metadata", {}),
|
|
143
170
|
}
|
|
144
|
-
|
|
171
|
+
|
|
145
172
|
return self._format_json_output("Structure Analysis Results", structure)
|
|
146
173
|
|
|
147
|
-
def format_advanced(
|
|
174
|
+
def format_advanced(
|
|
175
|
+
self, analysis_result: dict[str, Any], output_format: str = "json"
|
|
176
|
+
) -> str:
|
|
148
177
|
"""Format advanced analysis for Markdown files"""
|
|
149
178
|
file_path = analysis_result.get("file_path", "")
|
|
150
179
|
elements = analysis_result.get("elements", [])
|
|
151
180
|
line_count = analysis_result.get("line_count", 0)
|
|
152
181
|
element_count = len(elements)
|
|
153
|
-
|
|
182
|
+
|
|
154
183
|
# Calculate Markdown-specific metrics
|
|
155
184
|
headers = [e for e in elements if e.get("type") == "heading"]
|
|
156
|
-
links = [
|
|
185
|
+
links = [
|
|
186
|
+
e
|
|
187
|
+
for e in elements
|
|
188
|
+
if e.get("type") in ["link", "autolink", "reference_link"]
|
|
189
|
+
]
|
|
157
190
|
images = self._collect_images(elements)
|
|
158
191
|
code_blocks = [e for e in elements if e.get("type") == "code_block"]
|
|
159
192
|
lists = [e for e in elements if e.get("type") in ["list", "task_list"]]
|
|
160
193
|
tables = [e for e in elements if e.get("type") == "table"]
|
|
161
|
-
|
|
194
|
+
|
|
162
195
|
# Calculate document structure metrics
|
|
163
196
|
header_levels = [h.get("level", 1) for h in headers]
|
|
164
197
|
max_header_level = max(header_levels) if header_levels else 0
|
|
165
|
-
avg_header_level =
|
|
166
|
-
|
|
198
|
+
avg_header_level = (
|
|
199
|
+
sum(header_levels) / len(header_levels) if header_levels else 0
|
|
200
|
+
)
|
|
201
|
+
|
|
167
202
|
# Calculate content metrics
|
|
168
203
|
total_code_lines = sum(cb.get("line_count", 0) for cb in code_blocks)
|
|
169
|
-
total_list_items = sum(
|
|
170
|
-
|
|
204
|
+
total_list_items = sum(lst.get("item_count", 0) for lst in lists)
|
|
205
|
+
|
|
171
206
|
# External vs internal links
|
|
172
|
-
external_links = [
|
|
173
|
-
|
|
174
|
-
|
|
207
|
+
external_links = [
|
|
208
|
+
link
|
|
209
|
+
for link in links
|
|
210
|
+
if link.get("url")
|
|
211
|
+
and link.get("url", "").startswith(("http://", "https://"))
|
|
212
|
+
]
|
|
213
|
+
internal_links = [
|
|
214
|
+
link
|
|
215
|
+
for link in links
|
|
216
|
+
if not (
|
|
217
|
+
link.get("url")
|
|
218
|
+
and link.get("url", "").startswith(("http://", "https://"))
|
|
219
|
+
)
|
|
220
|
+
]
|
|
221
|
+
|
|
175
222
|
# Robust counts to avoid undercount due to parser variance
|
|
176
223
|
robust_counts = self._compute_robust_counts_from_file(file_path)
|
|
177
224
|
|
|
@@ -199,43 +246,51 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
199
246
|
"total_code_lines": total_code_lines,
|
|
200
247
|
"list_count": len(lists),
|
|
201
248
|
"total_list_items": total_list_items,
|
|
202
|
-
"table_count": len(tables)
|
|
249
|
+
"table_count": len(tables),
|
|
203
250
|
},
|
|
204
251
|
"content_analysis": {
|
|
205
|
-
"has_toc": any(
|
|
252
|
+
"has_toc": any(
|
|
253
|
+
"table of contents" in h.get("text", "").lower() for h in headers
|
|
254
|
+
),
|
|
206
255
|
"has_code_examples": len(code_blocks) > 0,
|
|
207
256
|
"has_images": len(images) > 0,
|
|
208
257
|
"has_external_links": len(external_links) > 0,
|
|
209
|
-
"document_complexity": self._calculate_document_complexity(
|
|
210
|
-
|
|
258
|
+
"document_complexity": self._calculate_document_complexity(
|
|
259
|
+
headers, links, code_blocks, tables
|
|
260
|
+
),
|
|
261
|
+
},
|
|
211
262
|
}
|
|
212
|
-
|
|
263
|
+
|
|
213
264
|
if output_format == "text":
|
|
214
265
|
return self._format_advanced_text(advanced_data)
|
|
215
266
|
else:
|
|
216
267
|
return self._format_json_output("Advanced Analysis Results", advanced_data)
|
|
217
268
|
|
|
218
|
-
def format_table(
|
|
269
|
+
def format_table(
|
|
270
|
+
self, analysis_result: dict[str, Any], table_type: str = "full"
|
|
271
|
+
) -> str:
|
|
219
272
|
"""Format table output for Markdown files"""
|
|
220
273
|
file_path = analysis_result.get("file_path", "")
|
|
221
274
|
elements = analysis_result.get("elements", [])
|
|
222
|
-
|
|
275
|
+
|
|
223
276
|
# Get document title from first header
|
|
224
277
|
headers = [e for e in elements if e.get("type") == "heading"]
|
|
225
|
-
title =
|
|
226
|
-
|
|
278
|
+
title = (
|
|
279
|
+
headers[0].get("text", "").strip() if headers else file_path.split("/")[-1]
|
|
280
|
+
)
|
|
281
|
+
|
|
227
282
|
output = [f"# {title}\n"]
|
|
228
|
-
|
|
283
|
+
|
|
229
284
|
# Document Overview
|
|
230
285
|
output.append("## Document Overview\n")
|
|
231
|
-
output.append(
|
|
232
|
-
output.append(
|
|
286
|
+
output.append("| Property | Value |")
|
|
287
|
+
output.append("|----------|-------|")
|
|
233
288
|
output.append(f"| File | {file_path} |")
|
|
234
|
-
output.append(
|
|
289
|
+
output.append("| Language | markdown |")
|
|
235
290
|
output.append(f"| Total Lines | {analysis_result.get('line_count', 0)} |")
|
|
236
291
|
output.append(f"| Total Elements | {len(elements)} |")
|
|
237
292
|
output.append("")
|
|
238
|
-
|
|
293
|
+
|
|
239
294
|
# Headers Section
|
|
240
295
|
if headers:
|
|
241
296
|
output.append("## Document Structure\n")
|
|
@@ -247,9 +302,13 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
247
302
|
line = header.get("line_range", {}).get("start", "")
|
|
248
303
|
output.append(f"| {level} | {text} | {line} |")
|
|
249
304
|
output.append("")
|
|
250
|
-
|
|
305
|
+
|
|
251
306
|
# Links Section
|
|
252
|
-
links = [
|
|
307
|
+
links = [
|
|
308
|
+
e
|
|
309
|
+
for e in elements
|
|
310
|
+
if e.get("type") in ["link", "autolink", "reference_link"]
|
|
311
|
+
]
|
|
253
312
|
if links:
|
|
254
313
|
output.append("## Links\n")
|
|
255
314
|
output.append("| Text | URL | Type | Line |")
|
|
@@ -257,11 +316,15 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
257
316
|
for link in links:
|
|
258
317
|
text = link.get("text", "")
|
|
259
318
|
url = link.get("url", "") or ""
|
|
260
|
-
link_type =
|
|
319
|
+
link_type = (
|
|
320
|
+
"External"
|
|
321
|
+
if url and url.startswith(("http://", "https://"))
|
|
322
|
+
else "Internal"
|
|
323
|
+
)
|
|
261
324
|
line = link.get("line_range", {}).get("start", "")
|
|
262
325
|
output.append(f"| {text} | {url} | {link_type} | {line} |")
|
|
263
326
|
output.append("")
|
|
264
|
-
|
|
327
|
+
|
|
265
328
|
# Images Section
|
|
266
329
|
images = self._collect_images(elements)
|
|
267
330
|
if images:
|
|
@@ -274,7 +337,7 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
274
337
|
line = image.get("line_range", {}).get("start", "")
|
|
275
338
|
output.append(f"| {alt} | {url} | {line} |")
|
|
276
339
|
output.append("")
|
|
277
|
-
|
|
340
|
+
|
|
278
341
|
# Code Blocks Section
|
|
279
342
|
code_blocks = [e for e in elements if e.get("type") == "code_block"]
|
|
280
343
|
if code_blocks:
|
|
@@ -290,7 +353,7 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
290
353
|
range_str = f"{start}-{end}" if start and end else str(start)
|
|
291
354
|
output.append(f"| {language} | {lines} | {range_str} |")
|
|
292
355
|
output.append("")
|
|
293
|
-
|
|
356
|
+
|
|
294
357
|
# Lists Section
|
|
295
358
|
lists = [e for e in elements if e.get("type") in ["list", "task_list"]]
|
|
296
359
|
if lists:
|
|
@@ -303,7 +366,7 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
303
366
|
line = lst.get("line_range", {}).get("start", "")
|
|
304
367
|
output.append(f"| {list_type} | {items} | {line} |")
|
|
305
368
|
output.append("")
|
|
306
|
-
|
|
369
|
+
|
|
307
370
|
# Tables Section
|
|
308
371
|
tables = [e for e in elements if e.get("type") == "table"]
|
|
309
372
|
if tables:
|
|
@@ -316,7 +379,7 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
316
379
|
line = table.get("line_range", {}).get("start", "")
|
|
317
380
|
output.append(f"| {columns} | {rows} | {line} |")
|
|
318
381
|
output.append("")
|
|
319
|
-
|
|
382
|
+
|
|
320
383
|
# Blockquotes Section
|
|
321
384
|
blockquotes = [e for e in elements if e.get("type") == "blockquote"]
|
|
322
385
|
if blockquotes:
|
|
@@ -324,11 +387,15 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
324
387
|
output.append("| Content | Line |")
|
|
325
388
|
output.append("|---------|------|")
|
|
326
389
|
for bq in blockquotes:
|
|
327
|
-
content =
|
|
390
|
+
content = (
|
|
391
|
+
bq.get("text", "")[:50] + "..."
|
|
392
|
+
if len(bq.get("text", "")) > 50
|
|
393
|
+
else bq.get("text", "")
|
|
394
|
+
)
|
|
328
395
|
line = bq.get("line_range", {}).get("start", "")
|
|
329
396
|
output.append(f"| {content} | {line} |")
|
|
330
397
|
output.append("")
|
|
331
|
-
|
|
398
|
+
|
|
332
399
|
# Horizontal Rules Section
|
|
333
400
|
horizontal_rules = [e for e in elements if e.get("type") == "horizontal_rule"]
|
|
334
401
|
if horizontal_rules:
|
|
@@ -339,46 +406,69 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
339
406
|
line = hr.get("line_range", {}).get("start", "")
|
|
340
407
|
output.append(f"| Horizontal Rule | {line} |")
|
|
341
408
|
output.append("")
|
|
342
|
-
|
|
409
|
+
|
|
343
410
|
# HTML Elements Section
|
|
344
|
-
html_elements = [
|
|
411
|
+
html_elements = [
|
|
412
|
+
e for e in elements if e.get("type") in ["html_block", "html_inline"]
|
|
413
|
+
]
|
|
345
414
|
if html_elements:
|
|
346
415
|
output.append("## HTML Elements\n")
|
|
347
416
|
output.append("| Type | Content | Line |")
|
|
348
417
|
output.append("|------|---------|------|")
|
|
349
418
|
for html in html_elements:
|
|
350
419
|
element_type = html.get("type", "")
|
|
351
|
-
content =
|
|
420
|
+
content = (
|
|
421
|
+
html.get("name", "")[:30] + "..."
|
|
422
|
+
if len(html.get("name", "")) > 30
|
|
423
|
+
else html.get("name", "")
|
|
424
|
+
)
|
|
352
425
|
line = html.get("line_range", {}).get("start", "")
|
|
353
426
|
output.append(f"| {element_type} | {content} | {line} |")
|
|
354
427
|
output.append("")
|
|
355
|
-
|
|
428
|
+
|
|
356
429
|
# Text Formatting Section
|
|
357
|
-
formatting_elements = [
|
|
430
|
+
formatting_elements = [
|
|
431
|
+
e
|
|
432
|
+
for e in elements
|
|
433
|
+
if e.get("type")
|
|
434
|
+
in ["strong_emphasis", "emphasis", "inline_code", "strikethrough"]
|
|
435
|
+
]
|
|
358
436
|
if formatting_elements:
|
|
359
437
|
output.append("## Text Formatting\n")
|
|
360
438
|
output.append("| Type | Content | Line |")
|
|
361
439
|
output.append("|------|---------|------|")
|
|
362
440
|
for fmt in formatting_elements:
|
|
363
441
|
format_type = fmt.get("type", "")
|
|
364
|
-
content =
|
|
442
|
+
content = (
|
|
443
|
+
fmt.get("text", "")[:30] + "..."
|
|
444
|
+
if len(fmt.get("text", "")) > 30
|
|
445
|
+
else fmt.get("text", "")
|
|
446
|
+
)
|
|
365
447
|
line = fmt.get("line_range", {}).get("start", "")
|
|
366
448
|
output.append(f"| {format_type} | {content} | {line} |")
|
|
367
449
|
output.append("")
|
|
368
|
-
|
|
450
|
+
|
|
369
451
|
# Footnotes Section
|
|
370
|
-
footnotes = [
|
|
452
|
+
footnotes = [
|
|
453
|
+
e
|
|
454
|
+
for e in elements
|
|
455
|
+
if e.get("type") in ["footnote_reference", "footnote_definition"]
|
|
456
|
+
]
|
|
371
457
|
if footnotes:
|
|
372
458
|
output.append("## Footnotes\n")
|
|
373
459
|
output.append("| Type | Content | Line |")
|
|
374
460
|
output.append("|------|---------|------|")
|
|
375
461
|
for fn in footnotes:
|
|
376
462
|
footnote_type = fn.get("type", "")
|
|
377
|
-
content =
|
|
463
|
+
content = (
|
|
464
|
+
fn.get("text", "")[:30] + "..."
|
|
465
|
+
if len(fn.get("text", "")) > 30
|
|
466
|
+
else fn.get("text", "")
|
|
467
|
+
)
|
|
378
468
|
line = fn.get("line_range", {}).get("start", "")
|
|
379
469
|
output.append(f"| {footnote_type} | {content} | {line} |")
|
|
380
470
|
output.append("")
|
|
381
|
-
|
|
471
|
+
|
|
382
472
|
# Reference Definitions Section
|
|
383
473
|
references = [e for e in elements if e.get("type") == "reference_definition"]
|
|
384
474
|
if references:
|
|
@@ -386,33 +476,42 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
386
476
|
output.append("| Content | Line |")
|
|
387
477
|
output.append("|---------|------|")
|
|
388
478
|
for ref in references:
|
|
389
|
-
content =
|
|
479
|
+
content = (
|
|
480
|
+
ref.get("name", "")[:50] + "..."
|
|
481
|
+
if len(ref.get("name", "")) > 50
|
|
482
|
+
else ref.get("name", "")
|
|
483
|
+
)
|
|
390
484
|
line = ref.get("line_range", {}).get("start", "")
|
|
391
485
|
output.append(f"| {content} | {line} |")
|
|
392
486
|
output.append("")
|
|
393
|
-
|
|
487
|
+
|
|
394
488
|
return "\n".join(output)
|
|
395
489
|
|
|
396
|
-
def _collect_images(self, elements:
|
|
490
|
+
def _collect_images(self, elements: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
397
491
|
"""Collect images including reference definitions that point to images.
|
|
398
492
|
|
|
399
493
|
Fallback: if no explicit image reference definitions are present, also
|
|
400
494
|
treat reference definitions with image-like URLs as images to keep
|
|
401
495
|
counts consistent across environments.
|
|
402
496
|
"""
|
|
403
|
-
images:
|
|
404
|
-
e
|
|
405
|
-
|
|
497
|
+
images: list[dict[str, Any]] = [
|
|
498
|
+
e
|
|
499
|
+
for e in elements
|
|
500
|
+
if e.get("type")
|
|
501
|
+
in ["image", "reference_image", "image_reference_definition"]
|
|
406
502
|
]
|
|
407
503
|
|
|
408
504
|
# Avoid duplicates if image reference definitions already exist
|
|
409
|
-
has_image_ref_defs = any(
|
|
505
|
+
has_image_ref_defs = any(
|
|
506
|
+
e.get("type") == "image_reference_definition" for e in elements
|
|
507
|
+
)
|
|
410
508
|
if has_image_ref_defs:
|
|
411
509
|
return images
|
|
412
510
|
|
|
413
511
|
# Fallback: promote reference_definition with image-like URL
|
|
414
512
|
try:
|
|
415
513
|
import re
|
|
514
|
+
|
|
416
515
|
image_exts = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".bmp")
|
|
417
516
|
for e in elements:
|
|
418
517
|
if e.get("type") == "reference_definition":
|
|
@@ -421,33 +520,35 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
421
520
|
if not url:
|
|
422
521
|
# Parse from raw content stored in name
|
|
423
522
|
name_field = (e.get("name") or "").strip()
|
|
424
|
-
m = re.match(r
|
|
523
|
+
m = re.match(r"^\[([^\]]+)\]:\s*([^\s]+)", name_field)
|
|
425
524
|
if m:
|
|
426
525
|
alt = alt or m.group(1)
|
|
427
526
|
url = m.group(2)
|
|
428
527
|
if url and any(url.lower().endswith(ext) for ext in image_exts):
|
|
429
|
-
images.append(
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
528
|
+
images.append(
|
|
529
|
+
{
|
|
530
|
+
**e,
|
|
531
|
+
"type": "image_reference_definition",
|
|
532
|
+
"url": url,
|
|
533
|
+
"alt": alt,
|
|
534
|
+
}
|
|
535
|
+
)
|
|
435
536
|
except Exception:
|
|
436
537
|
# Be conservative on any error
|
|
437
538
|
return images
|
|
438
539
|
|
|
439
540
|
return images
|
|
440
541
|
|
|
441
|
-
def _format_advanced_text(self, data:
|
|
542
|
+
def _format_advanced_text(self, data: dict[str, Any]) -> str:
|
|
442
543
|
"""Format advanced analysis in text format"""
|
|
443
544
|
output = ["--- Advanced Analysis Results ---"]
|
|
444
|
-
|
|
545
|
+
|
|
445
546
|
# Basic info - format with quotes to match expected output
|
|
446
547
|
output.append(f'"File: {data["file_path"]}"')
|
|
447
548
|
output.append(f'"Language: {data["language"]}"')
|
|
448
549
|
output.append(f'"Lines: {data["line_count"]}"')
|
|
449
550
|
output.append(f'"Elements: {data["element_count"]}"')
|
|
450
|
-
|
|
551
|
+
|
|
451
552
|
# Document metrics
|
|
452
553
|
metrics = data["document_metrics"]
|
|
453
554
|
output.append(f'"Headers: {metrics["header_count"]}"')
|
|
@@ -459,7 +560,7 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
459
560
|
output.append(f'"Code Lines: {metrics["total_code_lines"]}"')
|
|
460
561
|
output.append(f'"Lists: {metrics["list_count"]}"')
|
|
461
562
|
output.append(f'"Tables: {metrics["table_count"]}"')
|
|
462
|
-
|
|
563
|
+
|
|
463
564
|
# Content analysis
|
|
464
565
|
content = data["content_analysis"]
|
|
465
566
|
output.append(f'"Has TOC: {content["has_toc"]}"')
|
|
@@ -467,26 +568,31 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
467
568
|
output.append(f'"Has Images: {content["has_images"]}"')
|
|
468
569
|
output.append(f'"Has External Links: {content["has_external_links"]}"')
|
|
469
570
|
output.append(f'"Document Complexity: {content["document_complexity"]}"')
|
|
470
|
-
|
|
571
|
+
|
|
471
572
|
return "\n".join(output)
|
|
472
573
|
|
|
473
|
-
def _calculate_document_complexity(
|
|
474
|
-
|
|
574
|
+
def _calculate_document_complexity(
|
|
575
|
+
self,
|
|
576
|
+
headers: list[dict],
|
|
577
|
+
links: list[dict],
|
|
578
|
+
code_blocks: list[dict],
|
|
579
|
+
tables: list[dict],
|
|
580
|
+
) -> str:
|
|
475
581
|
"""Calculate document complexity based on structure and content"""
|
|
476
582
|
score = 0
|
|
477
|
-
|
|
583
|
+
|
|
478
584
|
# Header complexity
|
|
479
585
|
if headers:
|
|
480
586
|
header_levels = [h.get("level", 1) for h in headers]
|
|
481
587
|
max_level = max(header_levels)
|
|
482
588
|
score += len(headers) * 2 # Base score for headers
|
|
483
|
-
score += max_level * 3
|
|
484
|
-
|
|
589
|
+
score += max_level * 3 # Deeper nesting increases complexity
|
|
590
|
+
|
|
485
591
|
# Content complexity
|
|
486
|
-
score += len(links) * 1
|
|
592
|
+
score += len(links) * 1 # Links add moderate complexity
|
|
487
593
|
score += len(code_blocks) * 5 # Code blocks add significant complexity
|
|
488
|
-
score += len(tables) * 3
|
|
489
|
-
|
|
594
|
+
score += len(tables) * 3 # Tables add moderate complexity
|
|
595
|
+
|
|
490
596
|
# Classify complexity
|
|
491
597
|
if score < 20:
|
|
492
598
|
return "Simple"
|
|
@@ -497,36 +603,44 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
497
603
|
else:
|
|
498
604
|
return "Very Complex"
|
|
499
605
|
|
|
500
|
-
def _format_json_output(self, title: str, data:
|
|
606
|
+
def _format_json_output(self, title: str, data: dict[str, Any]) -> str:
|
|
501
607
|
"""Format JSON output with title"""
|
|
502
608
|
import json
|
|
609
|
+
|
|
503
610
|
output = [f"--- {title} ---"]
|
|
504
611
|
output.append(json.dumps(data, indent=2, ensure_ascii=False))
|
|
505
612
|
return "\n".join(output)
|
|
506
613
|
|
|
507
|
-
def _compute_robust_counts_from_file(self, file_path: str) ->
|
|
614
|
+
def _compute_robust_counts_from_file(self, file_path: str) -> dict[str, int]:
|
|
508
615
|
"""Compute robust counts for links and images directly from file content.
|
|
509
616
|
|
|
510
617
|
This mitigates occasional undercount from AST element extraction by
|
|
511
618
|
scanning the raw Markdown text with regex patterns.
|
|
512
619
|
"""
|
|
513
620
|
import re
|
|
621
|
+
|
|
514
622
|
counts = {"link_count": 0, "image_count": 0}
|
|
515
623
|
if not file_path:
|
|
516
624
|
return counts
|
|
517
625
|
|
|
518
626
|
try:
|
|
519
|
-
with open(file_path,
|
|
627
|
+
with open(file_path, encoding="utf-8", errors="replace") as f:
|
|
520
628
|
content = f.read()
|
|
521
629
|
except Exception:
|
|
522
630
|
return counts
|
|
523
631
|
|
|
524
632
|
# Autolinks (URLs, mailto, and bare emails), exclude HTML tags by pattern
|
|
525
|
-
autolink_pattern = re.compile(
|
|
633
|
+
autolink_pattern = re.compile(
|
|
634
|
+
r"<(?:https?://[^>]+|mailto:[^>]+|[^@\s]+@[^@\s]+\.[^@\s]+)>"
|
|
635
|
+
)
|
|
526
636
|
|
|
527
637
|
# Count inline links (subtract image inlines later)
|
|
528
|
-
inline_links_all = re.findall(
|
|
529
|
-
|
|
638
|
+
inline_links_all = re.findall(
|
|
639
|
+
r"\[[^\]]*\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)", content
|
|
640
|
+
)
|
|
641
|
+
inline_images = re.findall(
|
|
642
|
+
r"!\[[^\]]*\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)", content
|
|
643
|
+
)
|
|
530
644
|
inline_links = max(0, len(inline_links_all) - len(inline_images))
|
|
531
645
|
|
|
532
646
|
# Count reference links (subtract image references later)
|
|
@@ -544,14 +658,23 @@ class MarkdownFormatter(BaseFormatter):
|
|
|
544
658
|
# Reference images occurrences
|
|
545
659
|
ref_images_count = len(ref_images)
|
|
546
660
|
# Image reference definitions used by images
|
|
547
|
-
used_labels =
|
|
548
|
-
|
|
661
|
+
used_labels = {
|
|
662
|
+
m.group(1).lower() for m in re.finditer(r"!\[[^\]]*\]\[([^\]]*)\]", content)
|
|
663
|
+
}
|
|
664
|
+
def_pattern = re.compile(
|
|
665
|
+
r"^\[([^\]]+)\]:\s*([^\s]+)(?:\s+\"([^\"]*)\")?", re.MULTILINE
|
|
666
|
+
)
|
|
549
667
|
image_ref_defs_used = 0
|
|
550
668
|
for m in def_pattern.finditer(content):
|
|
551
669
|
label = (m.group(1) or "").lower()
|
|
552
670
|
url = (m.group(2) or "").lower()
|
|
553
|
-
if label in used_labels or any(
|
|
671
|
+
if label in used_labels or any(
|
|
672
|
+
url.endswith(ext)
|
|
673
|
+
for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".bmp"]
|
|
674
|
+
):
|
|
554
675
|
image_ref_defs_used += 1
|
|
555
676
|
|
|
556
|
-
counts["image_count"] =
|
|
557
|
-
|
|
677
|
+
counts["image_count"] = (
|
|
678
|
+
inline_images_count + ref_images_count + image_ref_defs_used
|
|
679
|
+
)
|
|
680
|
+
return counts
|