kash-shell 0.3.28__py3-none-any.whl → 0.3.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. kash/actions/core/chat.py +1 -0
  2. kash/actions/core/markdownify_html.py +4 -5
  3. kash/actions/core/minify_html.py +4 -5
  4. kash/actions/core/readability.py +1 -4
  5. kash/actions/core/render_as_html.py +10 -7
  6. kash/actions/core/save_sidematter_meta.py +47 -0
  7. kash/actions/core/show_webpage.py +2 -0
  8. kash/actions/core/zip_sidematter.py +47 -0
  9. kash/commands/base/basic_file_commands.py +7 -4
  10. kash/commands/base/diff_commands.py +6 -4
  11. kash/commands/base/files_command.py +31 -30
  12. kash/commands/base/general_commands.py +3 -2
  13. kash/commands/base/logs_commands.py +6 -4
  14. kash/commands/base/reformat_command.py +3 -2
  15. kash/commands/base/search_command.py +4 -3
  16. kash/commands/base/show_command.py +9 -7
  17. kash/commands/help/assistant_commands.py +6 -4
  18. kash/commands/help/help_commands.py +7 -4
  19. kash/commands/workspace/selection_commands.py +18 -16
  20. kash/commands/workspace/workspace_commands.py +39 -26
  21. kash/config/logger.py +1 -1
  22. kash/config/setup.py +2 -27
  23. kash/config/text_styles.py +1 -1
  24. kash/docs/markdown/topics/a1_what_is_kash.md +26 -18
  25. kash/docs/markdown/topics/a2_installation.md +3 -2
  26. kash/exec/action_decorators.py +7 -5
  27. kash/exec/action_exec.py +104 -53
  28. kash/exec/fetch_url_items.py +40 -11
  29. kash/exec/llm_transforms.py +14 -5
  30. kash/exec/preconditions.py +2 -2
  31. kash/exec/resolve_args.py +4 -1
  32. kash/exec/runtime_settings.py +3 -0
  33. kash/file_storage/file_store.py +108 -114
  34. kash/file_storage/item_file_format.py +91 -26
  35. kash/file_storage/item_id_index.py +128 -0
  36. kash/help/help_types.py +1 -1
  37. kash/llm_utils/llms.py +6 -1
  38. kash/local_server/local_server_commands.py +2 -1
  39. kash/mcp/mcp_server_commands.py +3 -2
  40. kash/mcp/mcp_server_routes.py +42 -12
  41. kash/model/actions_model.py +44 -32
  42. kash/model/compound_actions_model.py +4 -3
  43. kash/model/exec_model.py +33 -3
  44. kash/model/items_model.py +150 -60
  45. kash/model/params_model.py +4 -4
  46. kash/shell/output/shell_output.py +1 -2
  47. kash/utils/api_utils/gather_limited.py +2 -0
  48. kash/utils/api_utils/multitask_gather.py +74 -0
  49. kash/utils/common/s3_utils.py +108 -0
  50. kash/utils/common/url.py +16 -4
  51. kash/utils/file_formats/chat_format.py +7 -4
  52. kash/utils/file_utils/file_ext.py +1 -0
  53. kash/utils/file_utils/file_formats.py +4 -2
  54. kash/utils/file_utils/file_formats_model.py +12 -0
  55. kash/utils/text_handling/doc_normalization.py +1 -1
  56. kash/utils/text_handling/markdown_footnotes.py +224 -0
  57. kash/utils/text_handling/markdown_utils.py +532 -41
  58. kash/utils/text_handling/markdownify_utils.py +2 -1
  59. kash/web_content/web_fetch.py +2 -1
  60. kash/web_gen/templates/components/tooltip_scripts.js.jinja +186 -1
  61. kash/web_gen/templates/components/youtube_popover_scripts.js.jinja +223 -0
  62. kash/web_gen/templates/components/youtube_popover_styles.css.jinja +150 -0
  63. kash/web_gen/templates/content_styles.css.jinja +53 -1
  64. kash/web_gen/templates/youtube_webpage.html.jinja +47 -0
  65. kash/web_gen/webpage_render.py +103 -0
  66. kash/workspaces/workspaces.py +0 -5
  67. kash/xonsh_custom/custom_shell.py +4 -3
  68. {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/METADATA +35 -26
  69. {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/RECORD +72 -64
  70. kash/llm_utils/llm_features.py +0 -72
  71. kash/web_gen/simple_webpage.py +0 -55
  72. {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/WHEEL +0 -0
  73. {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/entry_points.txt +0 -0
  74. {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/licenses/LICENSE +0 -0
@@ -1,21 +1,34 @@
1
1
  import re
2
+ from collections.abc import Callable
2
3
  from pathlib import Path
3
4
  from textwrap import dedent
4
5
  from typing import Any, TypeAlias
5
6
 
6
- import marko
7
7
  import regex
8
- from marko.block import Heading, ListItem
9
- from marko.inline import AutoLink, Link
8
+ from chopdiff.html import rewrite_html_img_urls
9
+ from flowmark import flowmark_markdown, line_wrap_by_sentence
10
+ from marko.block import Heading, LinkRefDef, ListItem
11
+ from marko.inline import AutoLink, Image, Link
10
12
 
11
13
  from kash.utils.common.url import Url
12
14
 
13
15
  HTag: TypeAlias = str
14
16
 
17
+
18
+ UrlRewriter: TypeAlias = Callable[[str], str | None]
19
+ """
20
+ A URL rewriter function takes a URL string and returns a new URL or
21
+ None to skip rewriting.
22
+ """
23
+
15
24
  # Characters that commonly need escaping in Markdown inline text.
16
25
  MARKDOWN_ESCAPE_CHARS = r"([\\`*_{}\[\]()#+.!-])"
17
26
  MARKDOWN_ESCAPE_RE = re.compile(MARKDOWN_ESCAPE_CHARS)
18
27
 
28
+ # Use flowmark for Markdown parsing and rendering.
29
+ # Replaces the single shared marko Markdown object.
30
+ MARKDOWN = flowmark_markdown(line_wrap_by_sentence(is_markdown=True))
31
+
19
32
 
20
33
  def escape_markdown(text: str) -> str:
21
34
  """
@@ -56,23 +69,33 @@ def is_markdown_header(markdown: str) -> bool:
56
69
  return regex.match(r"^#+ ", markdown) is not None
57
70
 
58
71
 
72
+ def comprehensive_transform_tree(element: Any, transformer: Callable[[Any], None]) -> None:
73
+ """
74
+ Enhanced tree traversal that handles all marko element types including GFM tables.
75
+
76
+ This extends flowmark's transform_tree to handle table elements that are not
77
+ included in flowmark's ContainerElement tuple.
78
+ """
79
+ transformer(element)
80
+
81
+ # Handle all types that can contain children
82
+ if hasattr(element, "children") and element.children is not None:
83
+ if isinstance(element.children, list):
84
+ # Create a copy for safe iteration if modification occurs
85
+ current_children = list(element.children)
86
+ for child in current_children:
87
+ comprehensive_transform_tree(child, transformer)
88
+
89
+
59
90
  def _tree_links(element, include_internal=False):
60
91
  links = []
61
92
 
62
93
  def _find_links(element):
63
- match element:
64
- case Link():
65
- if include_internal or not element.dest.startswith("#"):
66
- links.append(element.dest)
67
- case AutoLink():
68
- if include_internal or not element.dest.startswith("#"):
69
- links.append(element.dest)
70
- case _:
71
- if hasattr(element, "children"):
72
- for child in element.children:
73
- _find_links(child)
74
-
75
- _find_links(element)
94
+ if isinstance(element, (Link, AutoLink)):
95
+ if include_internal or not element.dest.startswith("#"):
96
+ links.append(element.dest)
97
+
98
+ comprehensive_transform_tree(element, _find_links)
76
99
  return links
77
100
 
78
101
 
@@ -84,7 +107,7 @@ def extract_links(content: str, include_internal=False) -> list[str]:
84
107
  Raises:
85
108
  marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
86
109
  """
87
- document = marko.parse(content)
110
+ document = MARKDOWN.parse(content)
88
111
  all_links = _tree_links(document, include_internal)
89
112
 
90
113
  # Deduplicate while preserving order
@@ -113,6 +136,94 @@ def extract_file_links(file_path: Path, include_internal=False) -> list[str]:
113
136
  return []
114
137
 
115
138
 
139
+ def rewrite_urls(
140
+ content: str,
141
+ url_rewriter: UrlRewriter,
142
+ element_types: tuple[type, ...] = (Image, Link, AutoLink, LinkRefDef),
143
+ ) -> str:
144
+ """
145
+ Rewrite URLs in markdown content using the provided rewriter function.
146
+
147
+ Args:
148
+ content: The markdown content to process
149
+ url_rewriter: A function of type UrlRewriter that takes a URL string and returns
150
+ a new URL string to replace it, or None to skip rewriting that URL
151
+ element_types: Tuple of element types to process (default: all URL-containing types)
152
+
153
+ Returns:
154
+ The markdown content with rewritten URLs
155
+
156
+ Raises:
157
+ marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
158
+ """
159
+ document = MARKDOWN.parse(content)
160
+ _rewrite_tree_urls(document, url_rewriter, element_types)
161
+
162
+ return MARKDOWN.render(document)
163
+
164
+
165
+ def rewrite_image_urls(
166
+ content: str, from_prefix: str, to_prefix: str, *, include_img_tags: bool = True
167
+ ) -> str:
168
+ """
169
+ Rewrite image paths in markdown content by replacing matching prefixes.
170
+
171
+ This works with URLs, relative paths, or absolute paths. Optionally also
172
+ processes HTML img tags within the markdown content.
173
+
174
+ Args:
175
+ content: The markdown content to process
176
+ from_prefix: The prefix to match and replace
177
+ to_prefix: The prefix to replace the from_prefix with
178
+ include_img_tags: If True, also rewrite src attributes in HTML img tags
179
+
180
+ Returns:
181
+ The markdown content with rewritten image paths
182
+
183
+ Raises:
184
+ marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
185
+ """
186
+
187
+ def prefix_rewriter(url: str) -> str | None:
188
+ if url.startswith(from_prefix):
189
+ return url.replace(from_prefix, to_prefix, 1)
190
+ return None # Skip URLs that don't match the prefix
191
+
192
+ # First rewrite markdown image syntax
193
+ result = rewrite_urls(content, prefix_rewriter, element_types=(Image,))
194
+
195
+ # Then optionally rewrite HTML img tags
196
+ if include_img_tags:
197
+ result = rewrite_html_img_urls(result, from_prefix=from_prefix, to_prefix=to_prefix)
198
+ return result
199
+
200
+
201
+ def _rewrite_tree_urls(
202
+ element: Any,
203
+ url_rewriter: UrlRewriter,
204
+ element_types: tuple[type, ...],
205
+ ) -> None:
206
+ """
207
+ Recursively traverse the markdown AST and rewrite URLs in specified element types.
208
+ """
209
+
210
+ def _rewrite_url(element: Any) -> None:
211
+ if isinstance(element, element_types) and hasattr(element, "dest"):
212
+ url = element.dest
213
+ new_url = url_rewriter(url)
214
+ if new_url is not None:
215
+ element.dest = new_url
216
+
217
+ comprehensive_transform_tree(element, _rewrite_url)
218
+
219
+
220
+ def _is_remote_url(url: str) -> bool:
221
+ """
222
+ Check if a URL is a remote URL (starts with http:// or https://)
223
+ """
224
+ return url.startswith(("http://", "https://"))
225
+
226
+
116
227
  def extract_first_header(content: str) -> str | None:
117
228
  """
118
229
  Extract the first header from markdown content if present.
@@ -121,7 +232,7 @@ def extract_first_header(content: str) -> str | None:
121
232
  Raises:
122
233
  marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
123
234
  """
124
- document = marko.parse(content)
235
+ document = MARKDOWN.parse(content)
125
236
 
126
237
  if document.children and isinstance(document.children[0], Heading):
127
238
  return _extract_text(document.children[0]).strip()
@@ -183,22 +294,15 @@ def extract_bullet_points(content: str, *, strict: bool = False) -> list[str]:
183
294
  ValueError: If `strict` is True and no bullet points are found.
184
295
  marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
185
296
  """
186
- document = marko.parse(content)
297
+ document = MARKDOWN.parse(content)
187
298
  bullet_points: list[str] = []
188
299
 
189
- def _find_bullet_points(element):
300
+ def _collect_bullet_point(element):
190
301
  if isinstance(element, ListItem):
191
302
  # Extract markdown from this list item, preserving formatting
192
303
  bullet_points.append(_extract_list_item_markdown(element).strip())
193
- # Then recursively process any nested lists within this item
194
- if hasattr(element, "children"):
195
- for child in element.children:
196
- _find_bullet_points(child)
197
- elif hasattr(element, "children"):
198
- for child in element.children:
199
- _find_bullet_points(child)
200
304
 
201
- _find_bullet_points(document)
305
+ comprehensive_transform_tree(document, _collect_bullet_point)
202
306
 
203
307
  # If no bullet points found
204
308
  if not bullet_points:
@@ -268,20 +372,16 @@ def extract_headings(text: str) -> list[tuple[HTag, str]]:
268
372
  marko.ParseError: If the markdown content contains invalid syntax that cannot be parsed.
269
373
  ValueError: If a heading with an unsupported level is encountered.
270
374
  """
271
- document = marko.parse(text)
375
+ document = MARKDOWN.parse(text)
272
376
  headings_list: list[tuple[HTag, str]] = []
273
377
 
274
- def _collect_headings_recursive(element: Any) -> None:
378
+ def _collect_heading(element: Any) -> None:
275
379
  if isinstance(element, Heading):
276
380
  tag = _type_from_heading(element)
277
381
  content = _extract_text(element).strip()
278
382
  headings_list.append((tag, content))
279
383
 
280
- if hasattr(element, "children"):
281
- for child in element.children:
282
- _collect_headings_recursive(child)
283
-
284
- _collect_headings_recursive(document)
384
+ comprehensive_transform_tree(document, _collect_heading)
285
385
 
286
386
  return headings_list
287
387
 
@@ -742,12 +842,12 @@ def test_extract_links_comprehensive() -> None:
742
842
  assert "https://github.com" in result_bare
743
843
  assert len(result_bare) == 2
744
844
 
745
- # Test autolinks without brackets (expected to not work with standard markdown)
845
+ # Test autolinks without brackets (GFM extension enables auto-linking of plain URLs)
746
846
  auto_links = "Visit https://stackoverflow.com or http://reddit.com"
747
847
  result_auto = extract_links(auto_links)
748
- assert (
749
- result_auto == []
750
- ) # Plain URLs without brackets aren't parsed as links in standard markdown
848
+ assert "https://stackoverflow.com" in result_auto
849
+ assert "http://reddit.com" in result_auto
850
+ assert len(result_auto) == 2 # GFM auto-links plain URLs
751
851
 
752
852
  # Test GFM footnotes (the original issue)
753
853
  footnote_content = """
@@ -777,13 +877,12 @@ Auto link: https://auto-link.com
777
877
  expected_links = [
778
878
  "https://example.com", # Regular link
779
879
  "https://bare-link.com", # Bare link
880
+ "https://auto-link.com", # Plain auto link (GFM extension)
780
881
  "https://footnote-regular.com", # Link in footnote
781
882
  "https://footnote-bare.com", # Bare link in footnote
782
883
  ]
783
884
  for link in expected_links:
784
885
  assert link in result_mixed, f"Missing expected link: {link}"
785
- # Should not include plain auto link (https://auto-link.com) as it's not in angle brackets
786
- assert "https://auto-link.com" not in result_mixed
787
886
  assert len(result_mixed) == len(expected_links)
788
887
 
789
888
 
@@ -878,3 +977,395 @@ def test_extract_links_mixed_real_world() -> None:
878
977
  for link in expected_links:
879
978
  assert link in result, f"Missing expected link: {link}"
880
979
  assert len(result) == len(expected_links)
980
+
981
+
982
+ def test_rewrite_image_paths() -> None:
983
+ """Test rewriting image paths in markdown content."""
984
+
985
+ # Test content with various image types
986
+ content = dedent("""
987
+ # Document with Images
988
+
989
+ Here's a local image: ![Alt text](./images/local.png)
990
+
991
+ And a remote image: ![Remote](https://example.com/remote.jpg)
992
+
993
+ Another local one: ![Another](../assets/photo.jpeg "Title")
994
+
995
+ More content here.
996
+ """)
997
+
998
+ # Test rewriting ./images/ prefix (default include_img_tags=True)
999
+ result1 = rewrite_image_urls(content, "./images/", "./new-images/")
1000
+ assert "./new-images/local.png" in result1
1001
+ assert "./images/local.png" not in result1
1002
+ assert "https://example.com/remote.jpg" in result1 # Remote unchanged
1003
+ assert "../assets/photo.jpeg" in result1 # Other local unchanged
1004
+
1005
+ # Test rewriting ../assets/ prefix
1006
+ result2 = rewrite_image_urls(content, "../assets/", "./new-assets/")
1007
+ assert "./new-assets/photo.jpeg" in result2
1008
+ assert "../assets/photo.jpeg" not in result2
1009
+ assert "./images/local.png" in result2 # Other local unchanged
1010
+ assert "https://example.com/remote.jpg" in result2 # Remote unchanged
1011
+
1012
+ # Test rewriting remote URLs
1013
+ result3 = rewrite_image_urls(content, "https://example.com/", "https://cdn.example.com/")
1014
+ assert "https://cdn.example.com/remote.jpg" in result3
1015
+ assert "https://example.com/remote.jpg" not in result3
1016
+ assert "./images/local.png" in result3 # Local unchanged
1017
+ assert "../assets/photo.jpeg" in result3 # Local unchanged
1018
+
1019
+
1020
+ def test_rewrite_image_paths_no_images() -> None:
1021
+ """Test rewriting on content with no images."""
1022
+ content = dedent("""
1023
+ # No Images Here
1024
+
1025
+ Just some regular text and [a link](https://example.com).
1026
+
1027
+ And a list:
1028
+ - Item 1
1029
+ - Item 2
1030
+ """)
1031
+
1032
+ result = rewrite_image_urls(content, "./", "rewritten-")
1033
+
1034
+ # Content should be essentially unchanged (except possible minor formatting)
1035
+ assert "# No Images Here" in result
1036
+ assert "[a link](https://example.com)" in result
1037
+ assert "- Item 1" in result
1038
+
1039
+
1040
+ def test_rewrite_image_paths_only_remote() -> None:
1041
+ """Test rewriting on content with only remote images."""
1042
+ content = dedent("""
1043
+ # Remote Images Only
1044
+
1045
+ ![Image 1](https://example.com/image1.png)
1046
+ ![Image 2](http://test.com/image2.jpg)
1047
+ """)
1048
+
1049
+ # Test rewriting https:// prefix
1050
+ result1 = rewrite_image_urls(content, "https://example.com/", "https://cdn.example.com/")
1051
+ assert "https://cdn.example.com/image1.png" in result1
1052
+ assert "https://example.com/image1.png" not in result1
1053
+ assert "http://test.com/image2.jpg" in result1 # Other protocol unchanged
1054
+
1055
+ # Test rewriting http:// prefix
1056
+ result2 = rewrite_image_urls(content, "http://test.com/", "https://secure.test.com/")
1057
+ assert "https://secure.test.com/image2.jpg" in result2
1058
+ assert "http://test.com/image2.jpg" not in result2
1059
+ assert "https://example.com/image1.png" in result2 # Other URL unchanged
1060
+
1061
+
1062
+ def test_rewrite_image_paths_complex() -> None:
1063
+ """Test rewriting with complex markdown structure."""
1064
+ content = dedent("""
1065
+ # Main Title
1066
+
1067
+ ## Section with Images
1068
+
1069
+ Here's an image in a paragraph: ![Local](./local.png)
1070
+
1071
+ > This is a blockquote with an image: ![Quote image](images/quote.jpg)
1072
+
1073
+ 1. List item with image: ![List image](./list.png)
1074
+ 2. Another item
1075
+
1076
+ | Table | With |
1077
+ |-------|------|
1078
+ | ![Table image](table.png) | Cell |
1079
+
1080
+ And a remote one: ![Remote](https://remote.com/image.png)
1081
+ """)
1082
+
1083
+ # Test rewriting relative paths with ./ prefix
1084
+ result1 = rewrite_image_urls(content, "./", "assets/")
1085
+ assert "assets/local.png" in result1
1086
+ assert "assets/list.png" in result1
1087
+ assert "./local.png" not in result1
1088
+ assert "./list.png" not in result1
1089
+ assert "images/quote.jpg" in result1 # No ./ prefix, unchanged
1090
+ assert "table.png" in result1 # No ./ prefix, unchanged
1091
+ assert "https://remote.com/image.png" in result1 # Remote unchanged
1092
+
1093
+ # Test rewriting paths without prefix
1094
+ result2 = rewrite_image_urls(content, "images/", "new-images/")
1095
+ assert "new-images/quote.jpg" in result2
1096
+ assert "![Quote image](images/quote.jpg)" not in result2 # Check full image syntax
1097
+ assert "./local.png" in result2 # Different prefix, unchanged
1098
+
1099
+ # Test rewriting absolute URLs
1100
+ result3 = rewrite_image_urls(content, "https://remote.com/", "https://cdn.remote.com/")
1101
+ assert "https://cdn.remote.com/image.png" in result3
1102
+ assert "https://remote.com/image.png" not in result3
1103
+
1104
+
1105
+ def test_rewrite_urls_all_types() -> None:
1106
+ """Test the generalized URL rewriter with all element types."""
1107
+ content = dedent("""
1108
+ # Document with Various URL Types
1109
+
1110
+ Regular link: [Example](https://example.com/page)
1111
+
1112
+ Auto link: <https://autolink.com>
1113
+
1114
+ Image: ![Alt text](./image.png)
1115
+
1116
+ Reference link: [Ref link][ref]
1117
+
1118
+ [ref]: https://reference.com/target
1119
+ """)
1120
+
1121
+ def add_prefix(url: str) -> str | None:
1122
+ if url.startswith("https://example.com"):
1123
+ return url.replace("https://example.com", "https://newsite.com")
1124
+ elif url.startswith("./"):
1125
+ return f"assets/{url[2:]}"
1126
+ return None # Skip other URLs
1127
+
1128
+ result = rewrite_urls(content, add_prefix)
1129
+
1130
+ # Check rewritten URLs
1131
+ assert "https://newsite.com/page" in result
1132
+ assert "assets/image.png" in result
1133
+
1134
+ # Check unchanged URLs
1135
+ assert "https://autolink.com" in result
1136
+ assert "https://reference.com/target" in result
1137
+
1138
+
1139
+ def test_rewrite_urls_element_type_filter() -> None:
1140
+ """Test filtering by element type."""
1141
+ content = dedent("""
1142
+ # Links and Images
1143
+
1144
+ Link: [Example](./local-link.html)
1145
+ Image: ![Alt](./local-image.png)
1146
+ Auto: <./auto-link.html>
1147
+ """)
1148
+
1149
+ def prefix_local(url: str) -> str | None:
1150
+ if url.startswith("./"):
1151
+ return f"new/{url[2:]}"
1152
+ return None
1153
+
1154
+ # Only rewrite images
1155
+ result_images = rewrite_urls(content, prefix_local, element_types=(Image,))
1156
+ assert "new/local-image.png" in result_images
1157
+ assert "./local-link.html" in result_images # Link unchanged
1158
+ assert "./auto-link.html" in result_images # AutoLink unchanged
1159
+
1160
+ # Only rewrite regular links
1161
+ result_links = rewrite_urls(content, prefix_local, element_types=(Link,))
1162
+ assert "new/local-link.html" in result_links
1163
+ assert "./local-image.png" in result_links # Image unchanged
1164
+ assert "./auto-link.html" in result_links # AutoLink unchanged
1165
+
1166
+ # Rewrite both links and images
1167
+ result_both = rewrite_urls(content, prefix_local, element_types=(Link, Image))
1168
+ assert "new/local-link.html" in result_both
1169
+ assert "new/local-image.png" in result_both
1170
+ assert "./auto-link.html" in result_both # AutoLink unchanged
1171
+
1172
+
1173
+ def test_rewrite_urls_unified_filter() -> None:
1174
+ """Test unified filtering and rewriting in the rewriter function."""
1175
+ content = dedent("""
1176
+ # Mixed Local and Remote
1177
+
1178
+ Local link: [Local](./local.html)
1179
+ Remote link: [Remote](https://example.com/remote.html)
1180
+ Local image: ![Local](./image.png)
1181
+ Remote image: ![Remote](https://example.com/image.jpg)
1182
+ """)
1183
+
1184
+ def make_absolute_if_local(url: str) -> str | None:
1185
+ # Only rewrite local URLs, skip remote ones
1186
+ if url.startswith("./"):
1187
+ return f"https://mysite.com/{url[2:]}"
1188
+ return None # Skip remote URLs
1189
+
1190
+ result = rewrite_urls(content, make_absolute_if_local)
1191
+
1192
+ # Local URLs should be rewritten
1193
+ assert "https://mysite.com/local.html" in result
1194
+ assert "https://mysite.com/image.png" in result
1195
+
1196
+ # Remote URLs should be unchanged
1197
+ assert "https://example.com/remote.html" in result
1198
+ assert "https://example.com/image.jpg" in result
1199
+
1200
+
1201
+ def test_rewrite_urls_none_return() -> None:
1202
+ """Test that returning None skips rewriting."""
1203
+ content = dedent("""
1204
+ # Test Selective Rewriting
1205
+
1206
+ Keep this: [Keep](./keep.html)
1207
+ Change this: [Change](./change.html)
1208
+ """)
1209
+
1210
+ def selective_rewriter(url: str) -> str | None:
1211
+ if "change" in url:
1212
+ return url.replace("./change.html", "./modified.html")
1213
+ return None # Skip everything else
1214
+
1215
+ result = rewrite_urls(content, selective_rewriter)
1216
+
1217
+ assert "./modified.html" in result
1218
+ assert "./keep.html" in result # Unchanged
1219
+
1220
+
1221
+ def test_rewrite_urls_reference_links() -> None:
1222
+ """Test rewriting reference link definitions."""
1223
+ content = dedent("""
1224
+ # Reference Links
1225
+
1226
+ Here's a [reference link][ref1] and [another][ref2].
1227
+
1228
+ [ref1]: ./local-ref.html "Local Reference"
1229
+ [ref2]: https://example.com/remote-ref.html "Remote Reference"
1230
+ """)
1231
+
1232
+ def update_local_refs(url: str) -> str | None:
1233
+ if url.startswith("./"):
1234
+ return url.replace("./", "./updated/")
1235
+ return None
1236
+
1237
+ result = rewrite_urls(content, update_local_refs, element_types=(LinkRefDef,))
1238
+
1239
+ # Reference definition should be updated
1240
+ assert "./updated/local-ref.html" in result
1241
+
1242
+ # Remote reference should be unchanged
1243
+ assert "https://example.com/remote-ref.html" in result
1244
+
1245
+
1246
+ def test_rewrite_urls_complex_scenario() -> None:
1247
+ """Test complex scenario with multiple filters and rewriters."""
1248
+ content = dedent("""
1249
+ # Complex Document
1250
+
1251
+ ## Links Section
1252
+ - [Internal page](./pages/about.html)
1253
+ - [External site](https://external.com)
1254
+ - <./contact.html>
1255
+
1256
+ ## Images Section
1257
+ ![Logo](./assets/logo.png)
1258
+ ![External](https://cdn.example.com/image.jpg)
1259
+
1260
+ ## References
1261
+ [About page][about]
1262
+ [Contact][contact]
1263
+
1264
+ [about]: ./pages/about.html
1265
+ [contact]: ./contact.html
1266
+ """)
1267
+
1268
+ def comprehensive_rewriter(url: str) -> str | None:
1269
+ # Move local pages to new structure
1270
+ if url.startswith("./pages/"):
1271
+ return url.replace("./pages/", "./new-pages/")
1272
+ # Move assets to CDN
1273
+ elif url.startswith("./assets/"):
1274
+ return url.replace("./assets/", "https://cdn.mysite.com/")
1275
+ # Update contact page
1276
+ elif url == "./contact.html":
1277
+ return "./new-contact.html"
1278
+ return None
1279
+
1280
+ result = rewrite_urls(content, comprehensive_rewriter)
1281
+
1282
+ # Check all expected rewrites
1283
+ assert "./new-pages/about.html" in result
1284
+ assert "https://cdn.mysite.com/logo.png" in result
1285
+ assert "./new-contact.html" in result
1286
+
1287
+ # Check unchanged URLs
1288
+ assert "https://external.com" in result
1289
+ assert "https://cdn.example.com/image.jpg" in result
1290
+
1291
+
1292
+ def test_rewrite_urls_simplified_api() -> None:
1293
+ """Test the simplified unified API with various rewriting scenarios."""
1294
+ content = dedent("""
1295
+ # Website Migration
1296
+
1297
+ ## Local Content
1298
+ - [About](./about.html)
1299
+ - [Help](./help/faq.html)
1300
+ - ![Logo](./images/logo.png)
1301
+ - <./contact.html>
1302
+
1303
+ ## External Content
1304
+ - [Partner](https://partner.com)
1305
+ - ![CDN Image](https://cdn.example.com/img.jpg)
1306
+ - <https://external-service.com>
1307
+
1308
+ ## Reference Links
1309
+ [Privacy Policy][privacy]
1310
+ [Terms][terms]
1311
+
1312
+ [privacy]: ./legal/privacy.html
1313
+ [terms]: https://example.com/terms
1314
+ """)
1315
+
1316
+ def migration_rewriter(url: str) -> str | None:
1317
+ """
1318
+ Unified rewriter that handles both filtering and rewriting:
1319
+ - Migrates local pages to new site structure
1320
+ - Moves images to CDN
1321
+ - Updates specific domains
1322
+ - Skips other URLs unchanged
1323
+ """
1324
+ # Local HTML pages -> new site structure
1325
+ if url.startswith("./") and url.endswith(".html"):
1326
+ if "/help/" in url:
1327
+ # Move help pages to support section
1328
+ filename = url.split("/")[-1]
1329
+ return f"https://newsite.com/support/{filename}"
1330
+ elif "/legal/" in url:
1331
+ # Move legal pages to main site
1332
+ filename = url.split("/")[-1]
1333
+ return f"https://newsite.com/legal/{filename}"
1334
+ else:
1335
+ # Root level pages
1336
+ filename = url[2:] # Remove "./"
1337
+ return f"https://newsite.com/{filename}"
1338
+
1339
+ # Local images -> CDN
1340
+ elif url.startswith("./images/"):
1341
+ filename = url.split("/")[-1]
1342
+ return f"https://cdn.newsite.com/{filename}"
1343
+
1344
+ # Domain migration for external links
1345
+ elif url.startswith("https://example.com"):
1346
+ return url.replace("example.com", "newsite.com")
1347
+
1348
+ # Skip all other URLs (external services, CDNs, etc.)
1349
+ return None
1350
+
1351
+ result = rewrite_urls(content, migration_rewriter)
1352
+
1353
+ # Verify local page migrations
1354
+ assert "https://newsite.com/about.html" in result
1355
+ assert "https://newsite.com/support/faq.html" in result
1356
+ assert "https://newsite.com/legal/privacy.html" in result
1357
+
1358
+ # Verify image migration to CDN
1359
+ assert "https://cdn.newsite.com/logo.png" in result
1360
+
1361
+ # Verify domain migration
1362
+ assert "https://newsite.com/terms" in result
1363
+
1364
+ # Verify unchanged external URLs
1365
+ assert "https://partner.com" in result
1366
+ assert "https://cdn.example.com/img.jpg" in result
1367
+ assert "https://external-service.com" in result
1368
+
1369
+ # Verify that relative URLs in angle brackets remain unchanged
1370
+ # (marko doesn't parse them as URL elements)
1371
+ assert "<./contact.html>" in result
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
+ from typing import Any
4
5
 
5
6
  from kash.utils.text_handling.escape_html_tags import escape_html_tags
6
7
 
@@ -35,7 +36,7 @@ def markdownify_preprocess(html: str) -> str:
35
36
 
36
37
  # Good options for markdownify. Without setting sup_symbol and sub_symbol, that
37
38
  # info is typically lost.
38
- MARKDOWNIFY_OPTIONS = {
39
+ MARKDOWNIFY_OPTIONS: dict[str, Any] = {
39
40
  "sup_symbol": "<__sup>",
40
41
  "sub_symbol": "<__sub>",
41
42
  "escape_underscores": True,