kash-shell 0.3.28__py3-none-any.whl → 0.3.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/core/chat.py +1 -0
- kash/actions/core/markdownify_html.py +4 -5
- kash/actions/core/minify_html.py +4 -5
- kash/actions/core/readability.py +1 -4
- kash/actions/core/render_as_html.py +10 -7
- kash/actions/core/save_sidematter_meta.py +47 -0
- kash/actions/core/show_webpage.py +2 -0
- kash/actions/core/zip_sidematter.py +47 -0
- kash/commands/base/basic_file_commands.py +7 -4
- kash/commands/base/diff_commands.py +6 -4
- kash/commands/base/files_command.py +31 -30
- kash/commands/base/general_commands.py +3 -2
- kash/commands/base/logs_commands.py +6 -4
- kash/commands/base/reformat_command.py +3 -2
- kash/commands/base/search_command.py +4 -3
- kash/commands/base/show_command.py +9 -7
- kash/commands/help/assistant_commands.py +6 -4
- kash/commands/help/help_commands.py +7 -4
- kash/commands/workspace/selection_commands.py +18 -16
- kash/commands/workspace/workspace_commands.py +39 -26
- kash/config/logger.py +1 -1
- kash/config/setup.py +2 -27
- kash/config/text_styles.py +1 -1
- kash/docs/markdown/topics/a1_what_is_kash.md +26 -18
- kash/docs/markdown/topics/a2_installation.md +3 -2
- kash/exec/action_decorators.py +7 -5
- kash/exec/action_exec.py +104 -53
- kash/exec/fetch_url_items.py +40 -11
- kash/exec/llm_transforms.py +14 -5
- kash/exec/preconditions.py +2 -2
- kash/exec/resolve_args.py +4 -1
- kash/exec/runtime_settings.py +3 -0
- kash/file_storage/file_store.py +108 -114
- kash/file_storage/item_file_format.py +91 -26
- kash/file_storage/item_id_index.py +128 -0
- kash/help/help_types.py +1 -1
- kash/llm_utils/llms.py +6 -1
- kash/local_server/local_server_commands.py +2 -1
- kash/mcp/mcp_server_commands.py +3 -2
- kash/mcp/mcp_server_routes.py +42 -12
- kash/model/actions_model.py +44 -32
- kash/model/compound_actions_model.py +4 -3
- kash/model/exec_model.py +33 -3
- kash/model/items_model.py +150 -60
- kash/model/params_model.py +4 -4
- kash/shell/output/shell_output.py +1 -2
- kash/utils/api_utils/gather_limited.py +2 -0
- kash/utils/api_utils/multitask_gather.py +74 -0
- kash/utils/common/s3_utils.py +108 -0
- kash/utils/common/url.py +16 -4
- kash/utils/file_formats/chat_format.py +7 -4
- kash/utils/file_utils/file_ext.py +1 -0
- kash/utils/file_utils/file_formats.py +4 -2
- kash/utils/file_utils/file_formats_model.py +12 -0
- kash/utils/text_handling/doc_normalization.py +1 -1
- kash/utils/text_handling/markdown_footnotes.py +224 -0
- kash/utils/text_handling/markdown_utils.py +532 -41
- kash/utils/text_handling/markdownify_utils.py +2 -1
- kash/web_content/web_fetch.py +2 -1
- kash/web_gen/templates/components/tooltip_scripts.js.jinja +186 -1
- kash/web_gen/templates/components/youtube_popover_scripts.js.jinja +223 -0
- kash/web_gen/templates/components/youtube_popover_styles.css.jinja +150 -0
- kash/web_gen/templates/content_styles.css.jinja +53 -1
- kash/web_gen/templates/youtube_webpage.html.jinja +47 -0
- kash/web_gen/webpage_render.py +103 -0
- kash/workspaces/workspaces.py +0 -5
- kash/xonsh_custom/custom_shell.py +4 -3
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/METADATA +35 -26
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/RECORD +72 -64
- kash/llm_utils/llm_features.py +0 -72
- kash/web_gen/simple_webpage.py +0 -55
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/entry_points.txt +0 -0
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,21 +1,34 @@
|
|
|
1
1
|
import re
|
|
2
|
+
from collections.abc import Callable
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
from textwrap import dedent
|
|
4
5
|
from typing import Any, TypeAlias
|
|
5
6
|
|
|
6
|
-
import marko
|
|
7
7
|
import regex
|
|
8
|
-
from
|
|
9
|
-
from
|
|
8
|
+
from chopdiff.html import rewrite_html_img_urls
|
|
9
|
+
from flowmark import flowmark_markdown, line_wrap_by_sentence
|
|
10
|
+
from marko.block import Heading, LinkRefDef, ListItem
|
|
11
|
+
from marko.inline import AutoLink, Image, Link
|
|
10
12
|
|
|
11
13
|
from kash.utils.common.url import Url
|
|
12
14
|
|
|
13
15
|
HTag: TypeAlias = str
|
|
14
16
|
|
|
17
|
+
|
|
18
|
+
UrlRewriter: TypeAlias = Callable[[str], str | None]
|
|
19
|
+
"""
|
|
20
|
+
A URL rewriter function takes a URL string and returns a new URL or
|
|
21
|
+
None to skip rewriting.
|
|
22
|
+
"""
|
|
23
|
+
|
|
15
24
|
# Characters that commonly need escaping in Markdown inline text.
|
|
16
25
|
MARKDOWN_ESCAPE_CHARS = r"([\\`*_{}\[\]()#+.!-])"
|
|
17
26
|
MARKDOWN_ESCAPE_RE = re.compile(MARKDOWN_ESCAPE_CHARS)
|
|
18
27
|
|
|
28
|
+
# Use flowmark for Markdown parsing and rendering.
|
|
29
|
+
# Replaces the single shared marko Markdown object.
|
|
30
|
+
MARKDOWN = flowmark_markdown(line_wrap_by_sentence(is_markdown=True))
|
|
31
|
+
|
|
19
32
|
|
|
20
33
|
def escape_markdown(text: str) -> str:
|
|
21
34
|
"""
|
|
@@ -56,23 +69,33 @@ def is_markdown_header(markdown: str) -> bool:
|
|
|
56
69
|
return regex.match(r"^#+ ", markdown) is not None
|
|
57
70
|
|
|
58
71
|
|
|
72
|
+
def comprehensive_transform_tree(element: Any, transformer: Callable[[Any], None]) -> None:
|
|
73
|
+
"""
|
|
74
|
+
Enhanced tree traversal that handles all marko element types including GFM tables.
|
|
75
|
+
|
|
76
|
+
This extends flowmark's transform_tree to handle table elements that are not
|
|
77
|
+
included in flowmark's ContainerElement tuple.
|
|
78
|
+
"""
|
|
79
|
+
transformer(element)
|
|
80
|
+
|
|
81
|
+
# Handle all types that can contain children
|
|
82
|
+
if hasattr(element, "children") and element.children is not None:
|
|
83
|
+
if isinstance(element.children, list):
|
|
84
|
+
# Create a copy for safe iteration if modification occurs
|
|
85
|
+
current_children = list(element.children)
|
|
86
|
+
for child in current_children:
|
|
87
|
+
comprehensive_transform_tree(child, transformer)
|
|
88
|
+
|
|
89
|
+
|
|
59
90
|
def _tree_links(element, include_internal=False):
|
|
60
91
|
links = []
|
|
61
92
|
|
|
62
93
|
def _find_links(element):
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
if include_internal or not element.dest.startswith("#"):
|
|
69
|
-
links.append(element.dest)
|
|
70
|
-
case _:
|
|
71
|
-
if hasattr(element, "children"):
|
|
72
|
-
for child in element.children:
|
|
73
|
-
_find_links(child)
|
|
74
|
-
|
|
75
|
-
_find_links(element)
|
|
94
|
+
if isinstance(element, (Link, AutoLink)):
|
|
95
|
+
if include_internal or not element.dest.startswith("#"):
|
|
96
|
+
links.append(element.dest)
|
|
97
|
+
|
|
98
|
+
comprehensive_transform_tree(element, _find_links)
|
|
76
99
|
return links
|
|
77
100
|
|
|
78
101
|
|
|
@@ -84,7 +107,7 @@ def extract_links(content: str, include_internal=False) -> list[str]:
|
|
|
84
107
|
Raises:
|
|
85
108
|
marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
|
|
86
109
|
"""
|
|
87
|
-
document =
|
|
110
|
+
document = MARKDOWN.parse(content)
|
|
88
111
|
all_links = _tree_links(document, include_internal)
|
|
89
112
|
|
|
90
113
|
# Deduplicate while preserving order
|
|
@@ -113,6 +136,94 @@ def extract_file_links(file_path: Path, include_internal=False) -> list[str]:
|
|
|
113
136
|
return []
|
|
114
137
|
|
|
115
138
|
|
|
139
|
+
def rewrite_urls(
|
|
140
|
+
content: str,
|
|
141
|
+
url_rewriter: UrlRewriter,
|
|
142
|
+
element_types: tuple[type, ...] = (Image, Link, AutoLink, LinkRefDef),
|
|
143
|
+
) -> str:
|
|
144
|
+
"""
|
|
145
|
+
Rewrite URLs in markdown content using the provided rewriter function.
|
|
146
|
+
|
|
147
|
+
Args:
|
|
148
|
+
content: The markdown content to process
|
|
149
|
+
url_rewriter: A function of type UrlRewriter that takes a URL string and returns
|
|
150
|
+
a new URL string to replace it, or None to skip rewriting that URL
|
|
151
|
+
element_types: Tuple of element types to process (default: all URL-containing types)
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
The markdown content with rewritten URLs
|
|
155
|
+
|
|
156
|
+
Raises:
|
|
157
|
+
marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
|
|
158
|
+
"""
|
|
159
|
+
document = MARKDOWN.parse(content)
|
|
160
|
+
_rewrite_tree_urls(document, url_rewriter, element_types)
|
|
161
|
+
|
|
162
|
+
return MARKDOWN.render(document)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def rewrite_image_urls(
|
|
166
|
+
content: str, from_prefix: str, to_prefix: str, *, include_img_tags: bool = True
|
|
167
|
+
) -> str:
|
|
168
|
+
"""
|
|
169
|
+
Rewrite image paths in markdown content by replacing matching prefixes.
|
|
170
|
+
|
|
171
|
+
This works with URLs, relative paths, or absolute paths. Optionally also
|
|
172
|
+
processes HTML img tags within the markdown content.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
content: The markdown content to process
|
|
176
|
+
from_prefix: The prefix to match and replace
|
|
177
|
+
to_prefix: The prefix to replace the from_prefix with
|
|
178
|
+
include_img_tags: If True, also rewrite src attributes in HTML img tags
|
|
179
|
+
|
|
180
|
+
Returns:
|
|
181
|
+
The markdown content with rewritten image paths
|
|
182
|
+
|
|
183
|
+
Raises:
|
|
184
|
+
marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
|
|
185
|
+
"""
|
|
186
|
+
|
|
187
|
+
def prefix_rewriter(url: str) -> str | None:
|
|
188
|
+
if url.startswith(from_prefix):
|
|
189
|
+
return url.replace(from_prefix, to_prefix, 1)
|
|
190
|
+
return None # Skip URLs that don't match the prefix
|
|
191
|
+
|
|
192
|
+
# First rewrite markdown image syntax
|
|
193
|
+
result = rewrite_urls(content, prefix_rewriter, element_types=(Image,))
|
|
194
|
+
|
|
195
|
+
# Then optionally rewrite HTML img tags
|
|
196
|
+
if include_img_tags:
|
|
197
|
+
result = rewrite_html_img_urls(result, from_prefix=from_prefix, to_prefix=to_prefix)
|
|
198
|
+
return result
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _rewrite_tree_urls(
|
|
202
|
+
element: Any,
|
|
203
|
+
url_rewriter: UrlRewriter,
|
|
204
|
+
element_types: tuple[type, ...],
|
|
205
|
+
) -> None:
|
|
206
|
+
"""
|
|
207
|
+
Recursively traverse the markdown AST and rewrite URLs in specified element types.
|
|
208
|
+
"""
|
|
209
|
+
|
|
210
|
+
def _rewrite_url(element: Any) -> None:
|
|
211
|
+
if isinstance(element, element_types) and hasattr(element, "dest"):
|
|
212
|
+
url = element.dest
|
|
213
|
+
new_url = url_rewriter(url)
|
|
214
|
+
if new_url is not None:
|
|
215
|
+
element.dest = new_url
|
|
216
|
+
|
|
217
|
+
comprehensive_transform_tree(element, _rewrite_url)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _is_remote_url(url: str) -> bool:
|
|
221
|
+
"""
|
|
222
|
+
Check if a URL is a remote URL (starts with http:// or https://)
|
|
223
|
+
"""
|
|
224
|
+
return url.startswith(("http://", "https://"))
|
|
225
|
+
|
|
226
|
+
|
|
116
227
|
def extract_first_header(content: str) -> str | None:
|
|
117
228
|
"""
|
|
118
229
|
Extract the first header from markdown content if present.
|
|
@@ -121,7 +232,7 @@ def extract_first_header(content: str) -> str | None:
|
|
|
121
232
|
Raises:
|
|
122
233
|
marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
|
|
123
234
|
"""
|
|
124
|
-
document =
|
|
235
|
+
document = MARKDOWN.parse(content)
|
|
125
236
|
|
|
126
237
|
if document.children and isinstance(document.children[0], Heading):
|
|
127
238
|
return _extract_text(document.children[0]).strip()
|
|
@@ -183,22 +294,15 @@ def extract_bullet_points(content: str, *, strict: bool = False) -> list[str]:
|
|
|
183
294
|
ValueError: If `strict` is True and no bullet points are found.
|
|
184
295
|
marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
|
|
185
296
|
"""
|
|
186
|
-
document =
|
|
297
|
+
document = MARKDOWN.parse(content)
|
|
187
298
|
bullet_points: list[str] = []
|
|
188
299
|
|
|
189
|
-
def
|
|
300
|
+
def _collect_bullet_point(element):
|
|
190
301
|
if isinstance(element, ListItem):
|
|
191
302
|
# Extract markdown from this list item, preserving formatting
|
|
192
303
|
bullet_points.append(_extract_list_item_markdown(element).strip())
|
|
193
|
-
# Then recursively process any nested lists within this item
|
|
194
|
-
if hasattr(element, "children"):
|
|
195
|
-
for child in element.children:
|
|
196
|
-
_find_bullet_points(child)
|
|
197
|
-
elif hasattr(element, "children"):
|
|
198
|
-
for child in element.children:
|
|
199
|
-
_find_bullet_points(child)
|
|
200
304
|
|
|
201
|
-
|
|
305
|
+
comprehensive_transform_tree(document, _collect_bullet_point)
|
|
202
306
|
|
|
203
307
|
# If no bullet points found
|
|
204
308
|
if not bullet_points:
|
|
@@ -268,20 +372,16 @@ def extract_headings(text: str) -> list[tuple[HTag, str]]:
|
|
|
268
372
|
marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
|
|
269
373
|
ValueError: If a heading with an unsupported level is encountered.
|
|
270
374
|
"""
|
|
271
|
-
document =
|
|
375
|
+
document = MARKDOWN.parse(text)
|
|
272
376
|
headings_list: list[tuple[HTag, str]] = []
|
|
273
377
|
|
|
274
|
-
def
|
|
378
|
+
def _collect_heading(element: Any) -> None:
|
|
275
379
|
if isinstance(element, Heading):
|
|
276
380
|
tag = _type_from_heading(element)
|
|
277
381
|
content = _extract_text(element).strip()
|
|
278
382
|
headings_list.append((tag, content))
|
|
279
383
|
|
|
280
|
-
|
|
281
|
-
for child in element.children:
|
|
282
|
-
_collect_headings_recursive(child)
|
|
283
|
-
|
|
284
|
-
_collect_headings_recursive(document)
|
|
384
|
+
comprehensive_transform_tree(document, _collect_heading)
|
|
285
385
|
|
|
286
386
|
return headings_list
|
|
287
387
|
|
|
@@ -742,12 +842,12 @@ def test_extract_links_comprehensive() -> None:
|
|
|
742
842
|
assert "https://github.com" in result_bare
|
|
743
843
|
assert len(result_bare) == 2
|
|
744
844
|
|
|
745
|
-
# Test autolinks without brackets (
|
|
845
|
+
# Test autolinks without brackets (GFM extension enables auto-linking of plain URLs)
|
|
746
846
|
auto_links = "Visit https://stackoverflow.com or http://reddit.com"
|
|
747
847
|
result_auto = extract_links(auto_links)
|
|
748
|
-
assert
|
|
749
|
-
|
|
750
|
-
) #
|
|
848
|
+
assert "https://stackoverflow.com" in result_auto
|
|
849
|
+
assert "http://reddit.com" in result_auto
|
|
850
|
+
assert len(result_auto) == 2 # GFM auto-links plain URLs
|
|
751
851
|
|
|
752
852
|
# Test GFM footnotes (the original issue)
|
|
753
853
|
footnote_content = """
|
|
@@ -777,13 +877,12 @@ Auto link: https://auto-link.com
|
|
|
777
877
|
expected_links = [
|
|
778
878
|
"https://example.com", # Regular link
|
|
779
879
|
"https://bare-link.com", # Bare link
|
|
880
|
+
"https://auto-link.com", # Plain auto link (GFM extension)
|
|
780
881
|
"https://footnote-regular.com", # Link in footnote
|
|
781
882
|
"https://footnote-bare.com", # Bare link in footnote
|
|
782
883
|
]
|
|
783
884
|
for link in expected_links:
|
|
784
885
|
assert link in result_mixed, f"Missing expected link: {link}"
|
|
785
|
-
# Should not include plain auto link (https://auto-link.com) as it's not in angle brackets
|
|
786
|
-
assert "https://auto-link.com" not in result_mixed
|
|
787
886
|
assert len(result_mixed) == len(expected_links)
|
|
788
887
|
|
|
789
888
|
|
|
@@ -878,3 +977,395 @@ def test_extract_links_mixed_real_world() -> None:
|
|
|
878
977
|
for link in expected_links:
|
|
879
978
|
assert link in result, f"Missing expected link: {link}"
|
|
880
979
|
assert len(result) == len(expected_links)
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
def test_rewrite_image_paths() -> None:
|
|
983
|
+
"""Test rewriting image paths in markdown content."""
|
|
984
|
+
|
|
985
|
+
# Test content with various image types
|
|
986
|
+
content = dedent("""
|
|
987
|
+
# Document with Images
|
|
988
|
+
|
|
989
|
+
Here's a local image: 
|
|
990
|
+
|
|
991
|
+
And a remote image: 
|
|
992
|
+
|
|
993
|
+
Another local one: 
|
|
994
|
+
|
|
995
|
+
More content here.
|
|
996
|
+
""")
|
|
997
|
+
|
|
998
|
+
# Test rewriting ./images/ prefix (default include_img_tags=True)
|
|
999
|
+
result1 = rewrite_image_urls(content, "./images/", "./new-images/")
|
|
1000
|
+
assert "./new-images/local.png" in result1
|
|
1001
|
+
assert "./images/local.png" not in result1
|
|
1002
|
+
assert "https://example.com/remote.jpg" in result1 # Remote unchanged
|
|
1003
|
+
assert "../assets/photo.jpeg" in result1 # Other local unchanged
|
|
1004
|
+
|
|
1005
|
+
# Test rewriting ../assets/ prefix
|
|
1006
|
+
result2 = rewrite_image_urls(content, "../assets/", "./new-assets/")
|
|
1007
|
+
assert "./new-assets/photo.jpeg" in result2
|
|
1008
|
+
assert "../assets/photo.jpeg" not in result2
|
|
1009
|
+
assert "./images/local.png" in result2 # Other local unchanged
|
|
1010
|
+
assert "https://example.com/remote.jpg" in result2 # Remote unchanged
|
|
1011
|
+
|
|
1012
|
+
# Test rewriting remote URLs
|
|
1013
|
+
result3 = rewrite_image_urls(content, "https://example.com/", "https://cdn.example.com/")
|
|
1014
|
+
assert "https://cdn.example.com/remote.jpg" in result3
|
|
1015
|
+
assert "https://example.com/remote.jpg" not in result3
|
|
1016
|
+
assert "./images/local.png" in result3 # Local unchanged
|
|
1017
|
+
assert "../assets/photo.jpeg" in result3 # Local unchanged
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
def test_rewrite_image_paths_no_images() -> None:
|
|
1021
|
+
"""Test rewriting on content with no images."""
|
|
1022
|
+
content = dedent("""
|
|
1023
|
+
# No Images Here
|
|
1024
|
+
|
|
1025
|
+
Just some regular text and [a link](https://example.com).
|
|
1026
|
+
|
|
1027
|
+
And a list:
|
|
1028
|
+
- Item 1
|
|
1029
|
+
- Item 2
|
|
1030
|
+
""")
|
|
1031
|
+
|
|
1032
|
+
result = rewrite_image_urls(content, "./", "rewritten-")
|
|
1033
|
+
|
|
1034
|
+
# Content should be essentially unchanged (except possible minor formatting)
|
|
1035
|
+
assert "# No Images Here" in result
|
|
1036
|
+
assert "[a link](https://example.com)" in result
|
|
1037
|
+
assert "- Item 1" in result
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
def test_rewrite_image_paths_only_remote() -> None:
|
|
1041
|
+
"""Test rewriting on content with only remote images."""
|
|
1042
|
+
content = dedent("""
|
|
1043
|
+
# Remote Images Only
|
|
1044
|
+
|
|
1045
|
+

|
|
1046
|
+

|
|
1047
|
+
""")
|
|
1048
|
+
|
|
1049
|
+
# Test rewriting https:// prefix
|
|
1050
|
+
result1 = rewrite_image_urls(content, "https://example.com/", "https://cdn.example.com/")
|
|
1051
|
+
assert "https://cdn.example.com/image1.png" in result1
|
|
1052
|
+
assert "https://example.com/image1.png" not in result1
|
|
1053
|
+
assert "http://test.com/image2.jpg" in result1 # Other protocol unchanged
|
|
1054
|
+
|
|
1055
|
+
# Test rewriting http:// prefix
|
|
1056
|
+
result2 = rewrite_image_urls(content, "http://test.com/", "https://secure.test.com/")
|
|
1057
|
+
assert "https://secure.test.com/image2.jpg" in result2
|
|
1058
|
+
assert "http://test.com/image2.jpg" not in result2
|
|
1059
|
+
assert "https://example.com/image1.png" in result2 # Other URL unchanged
|
|
1060
|
+
|
|
1061
|
+
|
|
1062
|
+
def test_rewrite_image_paths_complex() -> None:
|
|
1063
|
+
"""Test rewriting with complex markdown structure."""
|
|
1064
|
+
content = dedent("""
|
|
1065
|
+
# Main Title
|
|
1066
|
+
|
|
1067
|
+
## Section with Images
|
|
1068
|
+
|
|
1069
|
+
Here's an image in a paragraph: 
|
|
1070
|
+
|
|
1071
|
+
> This is a blockquote with an image: 
|
|
1072
|
+
|
|
1073
|
+
1. List item with image: 
|
|
1074
|
+
2. Another item
|
|
1075
|
+
|
|
1076
|
+
| Table | With |
|
|
1077
|
+
|-------|------|
|
|
1078
|
+
|  | Cell |
|
|
1079
|
+
|
|
1080
|
+
And a remote one: 
|
|
1081
|
+
""")
|
|
1082
|
+
|
|
1083
|
+
# Test rewriting relative paths with ./ prefix
|
|
1084
|
+
result1 = rewrite_image_urls(content, "./", "assets/")
|
|
1085
|
+
assert "assets/local.png" in result1
|
|
1086
|
+
assert "assets/list.png" in result1
|
|
1087
|
+
assert "./local.png" not in result1
|
|
1088
|
+
assert "./list.png" not in result1
|
|
1089
|
+
assert "images/quote.jpg" in result1 # No ./ prefix, unchanged
|
|
1090
|
+
assert "table.png" in result1 # No ./ prefix, unchanged
|
|
1091
|
+
assert "https://remote.com/image.png" in result1 # Remote unchanged
|
|
1092
|
+
|
|
1093
|
+
# Test rewriting paths without prefix
|
|
1094
|
+
result2 = rewrite_image_urls(content, "images/", "new-images/")
|
|
1095
|
+
assert "new-images/quote.jpg" in result2
|
|
1096
|
+
assert "" not in result2 # Check full image syntax
|
|
1097
|
+
assert "./local.png" in result2 # Different prefix, unchanged
|
|
1098
|
+
|
|
1099
|
+
# Test rewriting absolute URLs
|
|
1100
|
+
result3 = rewrite_image_urls(content, "https://remote.com/", "https://cdn.remote.com/")
|
|
1101
|
+
assert "https://cdn.remote.com/image.png" in result3
|
|
1102
|
+
assert "https://remote.com/image.png" not in result3
|
|
1103
|
+
|
|
1104
|
+
|
|
1105
|
+
def test_rewrite_urls_all_types() -> None:
|
|
1106
|
+
"""Test the generalized URL rewriter with all element types."""
|
|
1107
|
+
content = dedent("""
|
|
1108
|
+
# Document with Various URL Types
|
|
1109
|
+
|
|
1110
|
+
Regular link: [Example](https://example.com/page)
|
|
1111
|
+
|
|
1112
|
+
Auto link: <https://autolink.com>
|
|
1113
|
+
|
|
1114
|
+
Image: 
|
|
1115
|
+
|
|
1116
|
+
Reference link: [Ref link][ref]
|
|
1117
|
+
|
|
1118
|
+
[ref]: https://reference.com/target
|
|
1119
|
+
""")
|
|
1120
|
+
|
|
1121
|
+
def add_prefix(url: str) -> str | None:
|
|
1122
|
+
if url.startswith("https://example.com"):
|
|
1123
|
+
return url.replace("https://example.com", "https://newsite.com")
|
|
1124
|
+
elif url.startswith("./"):
|
|
1125
|
+
return f"assets/{url[2:]}"
|
|
1126
|
+
return None # Skip other URLs
|
|
1127
|
+
|
|
1128
|
+
result = rewrite_urls(content, add_prefix)
|
|
1129
|
+
|
|
1130
|
+
# Check rewritten URLs
|
|
1131
|
+
assert "https://newsite.com/page" in result
|
|
1132
|
+
assert "assets/image.png" in result
|
|
1133
|
+
|
|
1134
|
+
# Check unchanged URLs
|
|
1135
|
+
assert "https://autolink.com" in result
|
|
1136
|
+
assert "https://reference.com/target" in result
|
|
1137
|
+
|
|
1138
|
+
|
|
1139
|
+
def test_rewrite_urls_element_type_filter() -> None:
|
|
1140
|
+
"""Test filtering by element type."""
|
|
1141
|
+
content = dedent("""
|
|
1142
|
+
# Links and Images
|
|
1143
|
+
|
|
1144
|
+
Link: [Example](./local-link.html)
|
|
1145
|
+
Image: 
|
|
1146
|
+
Auto: <./auto-link.html>
|
|
1147
|
+
""")
|
|
1148
|
+
|
|
1149
|
+
def prefix_local(url: str) -> str | None:
|
|
1150
|
+
if url.startswith("./"):
|
|
1151
|
+
return f"new/{url[2:]}"
|
|
1152
|
+
return None
|
|
1153
|
+
|
|
1154
|
+
# Only rewrite images
|
|
1155
|
+
result_images = rewrite_urls(content, prefix_local, element_types=(Image,))
|
|
1156
|
+
assert "new/local-image.png" in result_images
|
|
1157
|
+
assert "./local-link.html" in result_images # Link unchanged
|
|
1158
|
+
assert "./auto-link.html" in result_images # AutoLink unchanged
|
|
1159
|
+
|
|
1160
|
+
# Only rewrite regular links
|
|
1161
|
+
result_links = rewrite_urls(content, prefix_local, element_types=(Link,))
|
|
1162
|
+
assert "new/local-link.html" in result_links
|
|
1163
|
+
assert "./local-image.png" in result_links # Image unchanged
|
|
1164
|
+
assert "./auto-link.html" in result_links # AutoLink unchanged
|
|
1165
|
+
|
|
1166
|
+
# Rewrite both links and images
|
|
1167
|
+
result_both = rewrite_urls(content, prefix_local, element_types=(Link, Image))
|
|
1168
|
+
assert "new/local-link.html" in result_both
|
|
1169
|
+
assert "new/local-image.png" in result_both
|
|
1170
|
+
assert "./auto-link.html" in result_both # AutoLink unchanged
|
|
1171
|
+
|
|
1172
|
+
|
|
1173
|
+
def test_rewrite_urls_unified_filter() -> None:
|
|
1174
|
+
"""Test unified filtering and rewriting in the rewriter function."""
|
|
1175
|
+
content = dedent("""
|
|
1176
|
+
# Mixed Local and Remote
|
|
1177
|
+
|
|
1178
|
+
Local link: [Local](./local.html)
|
|
1179
|
+
Remote link: [Remote](https://example.com/remote.html)
|
|
1180
|
+
Local image: 
|
|
1181
|
+
Remote image: 
|
|
1182
|
+
""")
|
|
1183
|
+
|
|
1184
|
+
def make_absolute_if_local(url: str) -> str | None:
|
|
1185
|
+
# Only rewrite local URLs, skip remote ones
|
|
1186
|
+
if url.startswith("./"):
|
|
1187
|
+
return f"https://mysite.com/{url[2:]}"
|
|
1188
|
+
return None # Skip remote URLs
|
|
1189
|
+
|
|
1190
|
+
result = rewrite_urls(content, make_absolute_if_local)
|
|
1191
|
+
|
|
1192
|
+
# Local URLs should be rewritten
|
|
1193
|
+
assert "https://mysite.com/local.html" in result
|
|
1194
|
+
assert "https://mysite.com/image.png" in result
|
|
1195
|
+
|
|
1196
|
+
# Remote URLs should be unchanged
|
|
1197
|
+
assert "https://example.com/remote.html" in result
|
|
1198
|
+
assert "https://example.com/image.jpg" in result
|
|
1199
|
+
|
|
1200
|
+
|
|
1201
|
+
def test_rewrite_urls_none_return() -> None:
|
|
1202
|
+
"""Test that returning None skips rewriting."""
|
|
1203
|
+
content = dedent("""
|
|
1204
|
+
# Test Selective Rewriting
|
|
1205
|
+
|
|
1206
|
+
Keep this: [Keep](./keep.html)
|
|
1207
|
+
Change this: [Change](./change.html)
|
|
1208
|
+
""")
|
|
1209
|
+
|
|
1210
|
+
def selective_rewriter(url: str) -> str | None:
|
|
1211
|
+
if "change" in url:
|
|
1212
|
+
return url.replace("./change.html", "./modified.html")
|
|
1213
|
+
return None # Skip everything else
|
|
1214
|
+
|
|
1215
|
+
result = rewrite_urls(content, selective_rewriter)
|
|
1216
|
+
|
|
1217
|
+
assert "./modified.html" in result
|
|
1218
|
+
assert "./keep.html" in result # Unchanged
|
|
1219
|
+
|
|
1220
|
+
|
|
1221
|
+
def test_rewrite_urls_reference_links() -> None:
|
|
1222
|
+
"""Test rewriting reference link definitions."""
|
|
1223
|
+
content = dedent("""
|
|
1224
|
+
# Reference Links
|
|
1225
|
+
|
|
1226
|
+
Here's a [reference link][ref1] and [another][ref2].
|
|
1227
|
+
|
|
1228
|
+
[ref1]: ./local-ref.html "Local Reference"
|
|
1229
|
+
[ref2]: https://example.com/remote-ref.html "Remote Reference"
|
|
1230
|
+
""")
|
|
1231
|
+
|
|
1232
|
+
def update_local_refs(url: str) -> str | None:
|
|
1233
|
+
if url.startswith("./"):
|
|
1234
|
+
return url.replace("./", "./updated/")
|
|
1235
|
+
return None
|
|
1236
|
+
|
|
1237
|
+
result = rewrite_urls(content, update_local_refs, element_types=(LinkRefDef,))
|
|
1238
|
+
|
|
1239
|
+
# Reference definition should be updated
|
|
1240
|
+
assert "./updated/local-ref.html" in result
|
|
1241
|
+
|
|
1242
|
+
# Remote reference should be unchanged
|
|
1243
|
+
assert "https://example.com/remote-ref.html" in result
|
|
1244
|
+
|
|
1245
|
+
|
|
1246
|
+
def test_rewrite_urls_complex_scenario() -> None:
|
|
1247
|
+
"""Test complex scenario with multiple filters and rewriters."""
|
|
1248
|
+
content = dedent("""
|
|
1249
|
+
# Complex Document
|
|
1250
|
+
|
|
1251
|
+
## Links Section
|
|
1252
|
+
- [Internal page](./pages/about.html)
|
|
1253
|
+
- [External site](https://external.com)
|
|
1254
|
+
- <./contact.html>
|
|
1255
|
+
|
|
1256
|
+
## Images Section
|
|
1257
|
+

|
|
1258
|
+

|
|
1259
|
+
|
|
1260
|
+
## References
|
|
1261
|
+
[About page][about]
|
|
1262
|
+
[Contact][contact]
|
|
1263
|
+
|
|
1264
|
+
[about]: ./pages/about.html
|
|
1265
|
+
[contact]: ./contact.html
|
|
1266
|
+
""")
|
|
1267
|
+
|
|
1268
|
+
def comprehensive_rewriter(url: str) -> str | None:
|
|
1269
|
+
# Move local pages to new structure
|
|
1270
|
+
if url.startswith("./pages/"):
|
|
1271
|
+
return url.replace("./pages/", "./new-pages/")
|
|
1272
|
+
# Move assets to CDN
|
|
1273
|
+
elif url.startswith("./assets/"):
|
|
1274
|
+
return url.replace("./assets/", "https://cdn.mysite.com/")
|
|
1275
|
+
# Update contact page
|
|
1276
|
+
elif url == "./contact.html":
|
|
1277
|
+
return "./new-contact.html"
|
|
1278
|
+
return None
|
|
1279
|
+
|
|
1280
|
+
result = rewrite_urls(content, comprehensive_rewriter)
|
|
1281
|
+
|
|
1282
|
+
# Check all expected rewrites
|
|
1283
|
+
assert "./new-pages/about.html" in result
|
|
1284
|
+
assert "https://cdn.mysite.com/logo.png" in result
|
|
1285
|
+
assert "./new-contact.html" in result
|
|
1286
|
+
|
|
1287
|
+
# Check unchanged URLs
|
|
1288
|
+
assert "https://external.com" in result
|
|
1289
|
+
assert "https://cdn.example.com/image.jpg" in result
|
|
1290
|
+
|
|
1291
|
+
|
|
1292
|
+
def test_rewrite_urls_simplified_api() -> None:
|
|
1293
|
+
"""Test the simplified unified API with various rewriting scenarios."""
|
|
1294
|
+
content = dedent("""
|
|
1295
|
+
# Website Migration
|
|
1296
|
+
|
|
1297
|
+
## Local Content
|
|
1298
|
+
- [About](./about.html)
|
|
1299
|
+
- [Help](./help/faq.html)
|
|
1300
|
+
- 
|
|
1301
|
+
- <./contact.html>
|
|
1302
|
+
|
|
1303
|
+
## External Content
|
|
1304
|
+
- [Partner](https://partner.com)
|
|
1305
|
+
- 
|
|
1306
|
+
- <https://external-service.com>
|
|
1307
|
+
|
|
1308
|
+
## Reference Links
|
|
1309
|
+
[Privacy Policy][privacy]
|
|
1310
|
+
[Terms][terms]
|
|
1311
|
+
|
|
1312
|
+
[privacy]: ./legal/privacy.html
|
|
1313
|
+
[terms]: https://example.com/terms
|
|
1314
|
+
""")
|
|
1315
|
+
|
|
1316
|
+
def migration_rewriter(url: str) -> str | None:
|
|
1317
|
+
"""
|
|
1318
|
+
Unified rewriter that handles both filtering and rewriting:
|
|
1319
|
+
- Migrates local pages to new site structure
|
|
1320
|
+
- Moves images to CDN
|
|
1321
|
+
- Updates specific domains
|
|
1322
|
+
- Skips other URLs unchanged
|
|
1323
|
+
"""
|
|
1324
|
+
# Local HTML pages -> new site structure
|
|
1325
|
+
if url.startswith("./") and url.endswith(".html"):
|
|
1326
|
+
if "/help/" in url:
|
|
1327
|
+
# Move help pages to support section
|
|
1328
|
+
filename = url.split("/")[-1]
|
|
1329
|
+
return f"https://newsite.com/support/{filename}"
|
|
1330
|
+
elif "/legal/" in url:
|
|
1331
|
+
# Move legal pages to main site
|
|
1332
|
+
filename = url.split("/")[-1]
|
|
1333
|
+
return f"https://newsite.com/legal/{filename}"
|
|
1334
|
+
else:
|
|
1335
|
+
# Root level pages
|
|
1336
|
+
filename = url[2:] # Remove "./"
|
|
1337
|
+
return f"https://newsite.com/{filename}"
|
|
1338
|
+
|
|
1339
|
+
# Local images -> CDN
|
|
1340
|
+
elif url.startswith("./images/"):
|
|
1341
|
+
filename = url.split("/")[-1]
|
|
1342
|
+
return f"https://cdn.newsite.com/{filename}"
|
|
1343
|
+
|
|
1344
|
+
# Domain migration for external links
|
|
1345
|
+
elif url.startswith("https://example.com"):
|
|
1346
|
+
return url.replace("example.com", "newsite.com")
|
|
1347
|
+
|
|
1348
|
+
# Skip all other URLs (external services, CDNs, etc.)
|
|
1349
|
+
return None
|
|
1350
|
+
|
|
1351
|
+
result = rewrite_urls(content, migration_rewriter)
|
|
1352
|
+
|
|
1353
|
+
# Verify local page migrations
|
|
1354
|
+
assert "https://newsite.com/about.html" in result
|
|
1355
|
+
assert "https://newsite.com/support/faq.html" in result
|
|
1356
|
+
assert "https://newsite.com/legal/privacy.html" in result
|
|
1357
|
+
|
|
1358
|
+
# Verify image migration to CDN
|
|
1359
|
+
assert "https://cdn.newsite.com/logo.png" in result
|
|
1360
|
+
|
|
1361
|
+
# Verify domain migration
|
|
1362
|
+
assert "https://newsite.com/terms" in result
|
|
1363
|
+
|
|
1364
|
+
# Verify unchanged external URLs
|
|
1365
|
+
assert "https://partner.com" in result
|
|
1366
|
+
assert "https://cdn.example.com/img.jpg" in result
|
|
1367
|
+
assert "https://external-service.com" in result
|
|
1368
|
+
|
|
1369
|
+
# Verify that relative URLs in angle brackets remain unchanged
|
|
1370
|
+
# (marko doesn't parse them as URL elements)
|
|
1371
|
+
assert "<./contact.html>" in result
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
+
from typing import Any
|
|
4
5
|
|
|
5
6
|
from kash.utils.text_handling.escape_html_tags import escape_html_tags
|
|
6
7
|
|
|
@@ -35,7 +36,7 @@ def markdownify_preprocess(html: str) -> str:
|
|
|
35
36
|
|
|
36
37
|
# Good options for markdownify. Without setting sup_symbol and sub_symbol, that
|
|
37
38
|
# info is typically lost.
|
|
38
|
-
MARKDOWNIFY_OPTIONS = {
|
|
39
|
+
MARKDOWNIFY_OPTIONS: dict[str, Any] = {
|
|
39
40
|
"sup_symbol": "<__sup>",
|
|
40
41
|
"sub_symbol": "<__sub>",
|
|
41
42
|
"escape_underscores": True,
|