kash-shell 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. kash/actions/core/markdownify.py +12 -8
  2. kash/actions/core/readability.py +8 -7
  3. kash/actions/core/render_as_html.py +8 -6
  4. kash/actions/core/show_webpage.py +2 -2
  5. kash/commands/base/basic_file_commands.py +3 -0
  6. kash/commands/base/diff_commands.py +38 -3
  7. kash/commands/base/reformat_command.py +1 -1
  8. kash/commands/base/show_command.py +1 -1
  9. kash/commands/workspace/selection_commands.py +1 -1
  10. kash/commands/workspace/workspace_commands.py +92 -29
  11. kash/docs/load_source_code.py +1 -1
  12. kash/exec/action_exec.py +6 -8
  13. kash/exec/fetch_url_metadata.py +8 -5
  14. kash/exec/importing.py +4 -4
  15. kash/exec/llm_transforms.py +1 -1
  16. kash/exec/preconditions.py +30 -10
  17. kash/file_storage/file_store.py +105 -43
  18. kash/file_storage/item_file_format.py +1 -1
  19. kash/file_storage/store_filenames.py +2 -1
  20. kash/help/help_embeddings.py +2 -2
  21. kash/llm_utils/clean_headings.py +1 -1
  22. kash/{text_handling → llm_utils}/custom_sliding_transforms.py +0 -3
  23. kash/llm_utils/llm_completion.py +1 -1
  24. kash/local_server/__init__.py +1 -1
  25. kash/local_server/local_server_commands.py +2 -1
  26. kash/mcp/__init__.py +1 -1
  27. kash/mcp/mcp_server_commands.py +8 -2
  28. kash/media_base/media_cache.py +10 -3
  29. kash/model/actions_model.py +3 -0
  30. kash/model/items_model.py +78 -44
  31. kash/model/operations_model.py +14 -0
  32. kash/shell/ui/shell_results.py +2 -1
  33. kash/shell/utils/native_utils.py +2 -2
  34. kash/utils/common/format_utils.py +0 -8
  35. kash/utils/common/import_utils.py +46 -18
  36. kash/utils/common/url.py +80 -3
  37. kash/utils/file_utils/file_formats.py +3 -2
  38. kash/utils/file_utils/file_formats_model.py +47 -45
  39. kash/utils/file_utils/filename_parsing.py +41 -16
  40. kash/{text_handling → utils/text_handling}/doc_normalization.py +10 -8
  41. kash/utils/text_handling/escape_html_tags.py +156 -0
  42. kash/{text_handling → utils/text_handling}/markdown_utils.py +0 -3
  43. kash/utils/text_handling/markdownify_utils.py +87 -0
  44. kash/{text_handling → utils/text_handling}/unified_diffs.py +1 -44
  45. kash/web_content/file_cache_utils.py +42 -34
  46. kash/web_content/local_file_cache.py +53 -13
  47. kash/web_content/web_extract.py +1 -1
  48. kash/web_content/web_extract_readabilipy.py +4 -2
  49. kash/web_content/web_fetch.py +42 -7
  50. kash/web_content/web_page_model.py +2 -1
  51. kash/web_gen/simple_webpage.py +1 -1
  52. kash/web_gen/templates/base_styles.css.jinja +134 -16
  53. kash/web_gen/templates/simple_webpage.html.jinja +1 -1
  54. kash/workspaces/selections.py +2 -2
  55. kash/workspaces/workspace_output.py +2 -2
  56. kash/xonsh_custom/load_into_xonsh.py +4 -2
  57. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/METADATA +1 -1
  58. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/RECORD +62 -62
  59. kash/utils/common/inflection.py +0 -22
  60. kash/workspaces/workspace_importing.py +0 -56
  61. /kash/{text_handling → utils/text_handling}/markdown_render.py +0 -0
  62. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/WHEEL +0 -0
  63. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/entry_points.txt +0 -0
  64. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/licenses/LICENSE +0 -0
@@ -4,7 +4,7 @@ from dataclasses import dataclass
4
4
  from enum import Enum
5
5
  from pathlib import Path
6
6
 
7
- from kash.utils.common.url import Url, is_file_url, parse_file_url
7
+ from kash.utils.common.url import is_valid_path
8
8
  from kash.utils.file_utils.file_ext import FileExt
9
9
  from kash.utils.file_utils.file_formats import (
10
10
  MIME_EMPTY,
@@ -112,6 +112,9 @@ class Format(Enum):
112
112
 
113
113
  @property
114
114
  def is_doc(self) -> bool:
115
+ """
116
+ Is this a textual document of some kind?
117
+ """
115
118
  return self in [
116
119
  self.markdown,
117
120
  self.md_html,
@@ -119,6 +122,7 @@ class Format(Enum):
119
122
  self.pdf,
120
123
  self.docx,
121
124
  self.pptx,
125
+ self.epub,
122
126
  ]
123
127
 
124
128
  @property
@@ -139,7 +143,13 @@ class Format(Enum):
139
143
 
140
144
  @property
141
145
  def is_markdown(self) -> bool:
142
- return self in [self.markdown, self.md_html]
146
+ """Is in pure Markdown (no HTML)."""
147
+ return self in [self.markdown]
148
+
149
+ @property
150
+ def is_markdown_with_html(self) -> bool:
151
+ """Is in Markdown with HTML."""
152
+ return self in [self.md_html]
143
153
 
144
154
  @property
145
155
  def is_html(self) -> bool:
@@ -340,8 +350,8 @@ Format._init_mime_type_map()
340
350
 
341
351
  @dataclass(frozen=True)
342
352
  class FileFormatInfo:
343
- file_ext: FileExt | None
344
- """File extension, if recognized."""
353
+ current_file_ext: FileExt | None
354
+ """File extension, if recognized and in the current filename."""
345
355
 
346
356
  format: Format | None
347
357
  """Format, if recognized."""
@@ -349,11 +359,18 @@ class FileFormatInfo:
349
359
  mime_type: MimeType | None
350
360
  """Raw mime type, which may include more formats than the ones above."""
351
361
 
362
+ @property
363
+ def suggested_file_ext(self) -> FileExt | None:
364
+ """
365
+ Suggested file extension based on detected format.
366
+ """
367
+ return self.format.file_ext if self.format else self.current_file_ext
368
+
352
369
  @property
353
370
  def is_text(self) -> bool:
354
371
  return bool(
355
- self.file_ext
356
- and self.file_ext.is_text
372
+ self.current_file_ext
373
+ and self.current_file_ext.is_text
357
374
  or self.format
358
375
  and self.format.is_text
359
376
  or self.mime_type
@@ -373,8 +390,8 @@ class FileFormatInfo:
373
390
  @property
374
391
  def is_image(self) -> bool:
375
392
  return bool(
376
- self.file_ext
377
- and self.file_ext.is_image
393
+ self.current_file_ext
394
+ and self.current_file_ext.is_image
378
395
  or self.format
379
396
  and self.format.is_image
380
397
  or self.mime_type
@@ -395,15 +412,6 @@ class FileFormatInfo:
395
412
  return self.as_str()
396
413
 
397
414
 
398
- def _guess_format(file_ext: FileExt | None, mime_type: MimeType | None) -> Format | None:
399
- format = None
400
- if file_ext:
401
- format = Format.guess_by_file_ext(file_ext)
402
- if not format and mime_type:
403
- format = Format.from_mime_type(mime_type)
404
- return format
405
-
406
-
407
415
  def guess_format_by_name(path: str | Path) -> Format | None:
408
416
  """
409
417
  Fast guess of file format by the file name only.
@@ -412,22 +420,39 @@ def guess_format_by_name(path: str | Path) -> Format | None:
412
420
  return Format.guess_by_file_ext(file_ext) if file_ext else None
413
421
 
414
422
 
415
def file_format_info(
    path: str | Path,
    suggested_mime_type: MimeType | None = None,
) -> FileFormatInfo:
    """
    Get info on a file's format from its path and, when necessary, its content.

    The file extension is consulted first; the file content is only inspected
    when there is no recognized extension. If `suggested_mime_type` is given,
    it is taken as the detected mime type and the content is never read.

    Raises ValueError if `path` does not look like a valid file path.
    """
    if not is_valid_path(path):
        raise ValueError(f"Expected a file path but got: {path!r}")

    path = Path(path)
    ext = parse_file_ext(path)

    # Decide the mime type: prefer the caller's suggestion; otherwise sniff
    # the content, but only when the extension tells us nothing.
    if suggested_mime_type:
        detected: MimeType | None = suggested_mime_type
    elif not ext:
        # Look at the file content.
        detected = detect_mime_type(path)
    else:
        detected = None

    # Resolve the format: file extension wins, detected mime type is the fallback.
    fmt = Format.guess_by_file_ext(ext) if ext else None
    if not fmt and detected:
        fmt = Format.from_mime_type(detected)

    # Attempt to canonicalize the mime type to match the resolved format.
    final_mime = fmt.mime_type if fmt else detected

    return FileFormatInfo(ext, fmt, final_mime)
432
457
 
433
458
 
@@ -445,26 +470,3 @@ def detect_media_type(filename: str | Path) -> MediaType:
445
470
  fmt = detect_file_format(filename)
446
471
  media_type = fmt.media_type if fmt else MediaType.binary
447
472
  return media_type
448
-
449
-
450
- def choose_file_ext(url_or_path: Url | Path | str) -> FileExt | None:
451
- """
452
- Pick a suffix to reflect the type of the content. Recognizes known file
453
- extensions, then tries libmagic, then gives up.
454
- """
455
-
456
- def file_ext_for(path: Path) -> FileExt | None:
457
- fmt = detect_file_format(path)
458
- return fmt.file_ext if fmt else None
459
-
460
- ext = None
461
- if isinstance(url_or_path, Path):
462
- ext = parse_file_ext(url_or_path) or file_ext_for(url_or_path)
463
- elif is_file_url(url_or_path):
464
- path = parse_file_url(url_or_path)
465
- if path:
466
- ext = parse_file_ext(path) or file_ext_for(path)
467
- else:
468
- ext = parse_file_ext(url_or_path)
469
-
470
- return ext
@@ -1,39 +1,46 @@
1
1
  import os
2
+ import re
2
3
  from pathlib import Path
3
4
 
4
5
  from kash.config.logger import get_logger
5
6
  from kash.utils.common.url import Url, check_if_url
6
- from kash.utils.errors import InvalidFilename
7
7
  from kash.utils.file_utils.file_ext import FileExt, canonicalize_file_ext
8
8
 
9
9
  log = get_logger(__name__)
10
10
 
11
+ _valid_ext_re = re.compile(r"^[a-z0-9]*[a-z][a-z0-9]*$", re.IGNORECASE)
11
12
 
12
- def split_filename(path: str | Path, require_type_ext: bool = False) -> tuple[str, str, str, str]:
13
+
14
+ def split_filename(path: str | Path) -> tuple[str, str, str, str]:
13
15
  """
14
- Parse a filename into its path, name, (optional) type, and extension parts:
16
+ Parse a filename into its path, name, (optional) type, and extension parts.
17
+ Type and extension are optional but must be only letters/numbers and not
18
+ all numbers.
15
19
 
16
20
  folder/file.name.type.ext -> ("folder", "file.name", "type", "ext")
17
21
  filename.doc.txt -> ("", "filename", "note", "txt")
18
22
  filename.txt -> ("", "filename", "", "txt")
19
23
  filename -> ("", "filename", "", "")
24
+ filename.123.txt -> ("", "filename.123", "", "txt")
25
+ filename.123.456 -> ("", "filename.123.456", "", "")
20
26
  """
21
27
  path_str = str(path)
22
28
 
23
29
  dirname = os.path.dirname(path_str)
24
30
  parts = os.path.basename(path_str).rsplit(".", 2)
25
- if len(parts) == 3:
31
+ if len(parts) == 3 and _valid_ext_re.match(parts[1]) and _valid_ext_re.match(parts[2]):
26
32
  name, item_type, ext = parts
27
- elif len(parts) == 2 and not require_type_ext:
33
+ elif len(parts) == 3 and _valid_ext_re.match(parts[2]):
34
+ name = f"{parts[0]}.{parts[1]}"
35
+ item_type = ""
36
+ ext = parts[2]
37
+ elif len(parts) == 2 and _valid_ext_re.match(parts[1]):
28
38
  name, ext = parts
29
39
  item_type = ""
30
- elif len(parts) == 1 and not require_type_ext:
31
- name = parts[0]
32
- item_type = ext = ""
33
40
  else:
34
- raise InvalidFilename(
35
- f"Filename does not match file store convention (name.type.ext): {path_str}"
36
- )
41
+ name = os.path.basename(path_str)
42
+ item_type = ext = ""
43
+
37
44
  return dirname, name, item_type, ext
38
45
 
39
46
 
@@ -67,8 +74,6 @@ def parse_file_ext(url_or_path: str | Url | Path) -> FileExt | None:
67
74
 
68
75
 
69
76
  def test_parse_filename():
70
- import pytest
71
-
72
77
  filename = "foo/bar/test_file.1.type.ext"
73
78
  dirname, name, item_type, ext = split_filename(filename)
74
79
  assert dirname == "foo/bar"
@@ -90,9 +95,29 @@ def test_parse_filename():
90
95
  assert item_type == ""
91
96
  assert ext == ""
92
97
 
93
- filename = "missing_type.ext"
94
- with pytest.raises(InvalidFilename):
95
- split_filename(filename, require_type_ext=True)
98
+ # Numeric extensions not allowed.
99
+ dirname, name, item_type, ext = split_filename("test.abc")
100
+ assert name == "test"
101
+ assert ext == "abc"
102
+
103
+ dirname, name, item_type, ext = split_filename("test.123")
104
+ assert name == "test.123"
105
+ assert ext == ""
106
+
107
+ dirname, name, item_type, ext = split_filename("test.type.123")
108
+ assert name == "test.type.123"
109
+ assert item_type == ""
110
+ assert ext == ""
111
+
112
+ dirname, name, item_type, ext = split_filename("test.valid.123")
113
+ assert name == "test.valid.123"
114
+ assert item_type == ""
115
+ assert ext == ""
116
+
117
+ dirname, name, item_type, ext = split_filename("test.123.txt")
118
+ assert name == "test.123"
119
+ assert item_type == ""
120
+ assert ext == "txt"
96
121
 
97
122
 
98
123
  def test_parse_file_ext():
@@ -1,8 +1,6 @@
1
1
  from pathlib import Path
2
2
 
3
- from flowmark import fill_markdown, fill_text, line_wrap_by_sentence
4
- from flowmark.text_filling import DEFAULT_WRAP_WIDTH
5
- from flowmark.text_wrapping import simple_word_splitter
3
+ from flowmark import fill_markdown, line_wrap_by_sentence
6
4
  from frontmatter_format import fmf_read, fmf_write
7
5
 
8
6
  from kash.utils.common.format_utils import fmt_loc
@@ -14,24 +12,28 @@ from kash.utils.rich_custom.ansi_cell_len import ansi_cell_len
14
12
  def normalize_formatting(
15
13
  text: str,
16
14
  format: Format | None,
17
- width=DEFAULT_WRAP_WIDTH,
18
15
  support_ansi: bool = True,
19
16
  cleanups: bool = True,
20
17
  ) -> str:
21
18
  """
22
- Normalize text formatting by wrapping lines and normalizing Markdown.
19
+ Normalize formatting. Currently only normalizes Markdown and leaves plaintext
20
+ and HTML intact.
21
+
23
22
  This only does "safe" normalizations that cannot break the text.
24
23
  Enables ANSI support so ANSI codes and OSC-8 links are correctly handled.
25
24
  """
26
25
  len_fn = ansi_cell_len if support_ansi else len
27
- if format == Format.plaintext:
28
- return fill_text(text, width=width, word_splitter=simple_word_splitter, len_fn=len_fn)
29
- elif format == Format.markdown or format == Format.md_html:
26
+ if format == Format.markdown or format == Format.md_html:
30
27
  return fill_markdown(
31
28
  text,
32
29
  line_wrapper=line_wrap_by_sentence(len_fn=len_fn, is_markdown=True),
33
30
  cleanups=cleanups,
34
31
  )
32
+ elif format == Format.plaintext:
33
+ # Consider plaintext a raw format and don't normalize.
34
+ # We could add support for formatted plaintext as well?
35
+ # Then do: fill_text(text, width=width, word_splitter=simple_word_splitter, len_fn=len_fn)
36
+ return text
35
37
  elif format == Format.html:
36
38
  # We don't currently auto-format HTML as we sometimes use HTML with specifically chosen line breaks.
37
39
  return text
@@ -0,0 +1,156 @@
1
import re
from collections.abc import Set

HTML_IN_MD_TAGS = frozenset(["div", "span", "sup", "sub", "br", "details", "summary"])
"""These are tags that have reasonable usage in Markdown so typically would be preserved."""

ALLOWED_BARE_PROTOS = frozenset(["http://", "https://", "file://"])


def escape_html_tags(
    html_content: str,
    whitelist_tags: Set[str] = HTML_IN_MD_TAGS,
    allow_bare_md_urls: bool = False,
) -> str:
    """
    Escapes HTML tags by replacing '<' with '&lt;', except for whitelisted tags
    and (optionally) markdown-style URLs like <https://example.com>. The
    whitelist defaults to a few tags commonly used in Markdown. It can also be
    empty to escape all tags.
    """
    result = []
    last_pos = 0

    # Match <, optional spaces, optional /, optional spaces, a whitelisted tag,
    # then optional attributes, an optional trailing /, optional spaces, then >.
    # An empty whitelist must mean "nothing is whitelisted": an empty regex
    # alternation would otherwise wrongly match degenerate forms like "<>".
    whitelist_pattern = None
    if whitelist_tags:
        whitelist_pattern = re.compile(
            r"< *(/?) *(" + "|".join(whitelist_tags) + r")(?:\s+[^>]*)? *(/?) *>",
            re.IGNORECASE,
        )

    url_pattern = None
    if allow_bare_md_urls:
        url_pattern = re.compile(
            r"<(?:" + "|".join(re.escape(proto) for proto in ALLOWED_BARE_PROTOS) + r")[^>\s]+>"
        )

    # Walk every '<' and decide whether to keep it or escape it.
    for match in re.finditer(r"<", html_content):
        start_pos = match.start()

        # Add text before this '<'.
        result.append(html_content[last_pos:start_pos])

        # Try to match the allowed patterns at this position.
        substring = html_content[start_pos:]
        whitelist_match = whitelist_pattern.match(substring) if whitelist_pattern else None
        url_match = url_pattern.match(substring) if url_pattern else None

        if whitelist_match:
            result.append(whitelist_match.group(0))
            last_pos = start_pos + len(whitelist_match.group(0))
        elif url_match:
            result.append(url_match.group(0))
            last_pos = start_pos + len(url_match.group(0))
        else:
            # Not a whitelisted tag or allowed bare URL: escape this '<'.
            result.append("&lt;")
            last_pos = start_pos + 1

    # Add remaining text.
    result.append(html_content[last_pos:])

    return "".join(result)
63
+
64
+
65
## Tests


def test_escape_html_tags():
    """
    Tests for `escape_html_tags`: default whitelist, escaping of non-whitelisted
    tags, case insensitivity, self-closing tags, attributes, bare Markdown URLs,
    malformed input, and edge cases.
    """

    # 1. Basic Whitelist Check (Default)
    assert escape_html_tags("<div>Test</div>") == "<div>Test</div>"
    assert escape_html_tags("<span>Test</span>") == "<span>Test</span>"
    assert escape_html_tags("<br>") == "<br>"
    assert (
        escape_html_tags("<details><summary>Sum</summary>Det</details>")
        == "<details><summary>Sum</summary>Det</details>"
    )

    # 2. Basic Escape Check
    assert escape_html_tags("<p>Test</p>") == "&lt;p>Test&lt;/p>"
    assert escape_html_tags("<script>alert('x');</script>") == "&lt;script>alert('x');&lt;/script>"
    assert escape_html_tags("<img>") == "&lt;img>"

    # 3. Case Insensitivity
    assert escape_html_tags("<DiV>Case</DiV>") == "<DiV>Case</DiV>"  # Whitelisted
    assert escape_html_tags("<P>Test</P>") == "&lt;P>Test&lt;/P>"  # Escaped

    # 4. Self-closing tags
    assert escape_html_tags("<br/>") == "<br/>"  # Whitelisted
    assert escape_html_tags("<br />") == "<br />"  # Whitelisted
    assert escape_html_tags("<img/>") == "&lt;img/>"  # Escaped

    # 5. Tags with Attributes
    assert (
        escape_html_tags('<div class="foo">Test</div>') == '<div class="foo">Test</div>'
    )  # Whitelisted
    assert (
        escape_html_tags('<span id="bar" data-val="x">Test</span>')
        == '<span id="bar" data-val="x">Test</span>'
    )  # Whitelisted
    assert escape_html_tags('<p class="foo">Test</p>') == '&lt;p class="foo">Test&lt;/p>'  # Escaped
    assert escape_html_tags('<img src="a.jpg"/>') == '&lt;img src="a.jpg"/>'  # Escaped

    # 6. Markdown URL Handling
    url_md = "Check <https://example.com> and <http://test.org/path>"
    assert escape_html_tags(url_md, allow_bare_md_urls=True) == url_md
    assert (
        escape_html_tags(url_md, allow_bare_md_urls=False)
        == "Check &lt;https://example.com> and &lt;http://test.org/path>"
    )

    # URLs nested inside whitelisted tags; only the url_pattern controls them.
    url_mixed = "<div>Link: <https://ok.com></div> <script>no</script>"
    expected_mixed_urls_allowed = "<div>Link: <https://ok.com></div> &lt;script>no&lt;/script>"
    expected_mixed_urls_disallowed = (
        "<div>Link: &lt;https://ok.com></div> &lt;script>no&lt;/script>"
    )
    assert escape_html_tags(url_mixed, allow_bare_md_urls=True) == expected_mixed_urls_allowed
    assert escape_html_tags(url_mixed, allow_bare_md_urls=False) == expected_mixed_urls_disallowed

    # URLs containing whitespace are not valid bare URLs.
    assert (
        escape_html_tags("<http://malformed url>", allow_bare_md_urls=True)
        == "&lt;http://malformed url>"
    )
    assert (
        escape_html_tags("</https://example.com>", allow_bare_md_urls=True)
        == "&lt;/https://example.com>"
    )  # Closing URL-like is escaped

    # 7. Nested/Malformed '<' and Edge Cases
    assert escape_html_tags("<<script>>") == "&lt;&lt;script>>"  # Escaped non-tag <
    assert escape_html_tags("<div><p>nested</p></div>") == "<div>&lt;p>nested&lt;/p></div>"
    assert escape_html_tags("<div<span") == "&lt;div&lt;span"  # Incomplete tags are escaped
    assert (
        escape_html_tags("Text < with > inside") == "Text &lt; with > inside"
    )  # Escape < even if > exists later
    assert escape_html_tags("<") == "&lt;"
    assert escape_html_tags(">") == ">"
    assert escape_html_tags("<>") == "&lt;>"
    assert escape_html_tags("< >") == "&lt; >"
    assert escape_html_tags("< / div >") == "< / div >"  # Whitelisted closing tag with spaces

    # 8. Mixed Content Combination
    complex_html = "<DiV class='A'>Hello <Br/> <p>World</p> <https://link.com> </DiV>"
    expected_complex_allowed = (
        "<DiV class='A'>Hello <Br/> &lt;p>World&lt;/p> <https://link.com> </DiV>"
    )
    expected_complex_disallowed = (
        "<DiV class='A'>Hello <Br/> &lt;p>World&lt;/p> &lt;https://link.com> </DiV>"
    )
    assert escape_html_tags(complex_html, allow_bare_md_urls=True) == expected_complex_allowed
    assert escape_html_tags(complex_html, allow_bare_md_urls=False) == expected_complex_disallowed

    # 9. Empty/No Tags
    assert escape_html_tags("") == ""
    assert escape_html_tags("Just plain text, no tags.") == "Just plain text, no tags."
@@ -7,11 +7,8 @@ import regex
7
7
  from marko.block import Heading, ListItem
8
8
  from marko.inline import Link
9
9
 
10
- from kash.config.logger import get_logger
11
10
  from kash.utils.common.url import Url
12
11
 
13
- log = get_logger(__name__)
14
-
15
12
  HTag: TypeAlias = str
16
13
 
17
14
  # Characters that commonly need escaping in Markdown inline text.
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from kash.utils.text_handling.escape_html_tags import escape_html_tags
6
+
7
# A single ~ that is not part of a ~~ pair.
_single_tilde_pat = re.compile(r"(?<!~)~(?!~)")
# NOTE(review): this renders identically to ASCII "~" here; presumably it was a
# distinct Unicode tilde that extraction normalized — confirm against upstream.
_alt_tilde = "~"


def _fix_single_tilde(html: str) -> str:
    """
    Replace standalone ~ characters that have whitespace before or after with an
    alternate tilde character, so markdownify does not misread them as
    strikethrough markers. A substitute character is used because escaping ~ in
    a way that markdownify respects is hard.
    """

    def _substitute(m: re.Match[str]) -> str:
        i, j = m.start(), m.end()
        space_before = i > 0 and html[i - 1].isspace()
        space_after = j < len(html) and html[j].isspace()
        if space_before or space_after:
            return _alt_tilde
        return "~"

    return _single_tilde_pat.sub(_substitute, html)


def markdownify_preprocess(html: str) -> str:
    """Preprocess HTML before handing it to markdownify."""
    return _fix_single_tilde(html)
34
# Good options for markdownify. Without setting sup_symbol and sub_symbol,
# superscript/subscript information is typically lost.
MARKDOWNIFY_OPTIONS = {
    "sup_symbol": "<__sup>",
    "sub_symbol": "<__sub>",
    "escape_underscores": True,
    "escape_asterisks": True,
    "escape_misc": False,  # This suppresses gratuitous escaping of -, ., etc.
    "newline_style": "BACKSLASH",
}


def _escape_html_in_md(md_text: str, whitelist_tags: set[str] | None = None) -> str:
    """
    Escape stray HTML tags left in Markdown output. Tags that were originally
    entity-escaped can get parsed and reappear unescaped in the Markdown, so a
    full escaping is done — except for our private sup/sub placeholder tags and
    any extra tags the caller whitelists.
    """
    # Output from markdownify (especially from docx or other conversions) should
    # not contain HTML tags other than the placeholder sup/sub tags we added.
    allowed = {"__sup", "__sub"}
    if whitelist_tags:
        allowed |= whitelist_tags
    return escape_html_tags(md_text, allow_bare_md_urls=True, whitelist_tags=allowed)


def markdownify_postprocess(md_text: str) -> str:
    """
    Postprocess markdownify output: escape leftover HTML, then swap the private
    <__sup>/<__sub> placeholders back to the standard <sup>/<sub> tags.
    """
    escaped = _escape_html_in_md(md_text)
    # The private placeholders avoid conflicts with real tags in a document;
    # once escaping is done, restore the standard tags.
    for placeholder, standard in (
        ("<__sup>", "<sup>"),
        ("</__sup>", "</sup>"),
        ("<__sub>", "<sub>"),
        ("</__sub>", "</sub>"),
    ):
        escaped = escaped.replace(placeholder, standard)
    return escaped
76
+
77
def markdownify_custom(html: str) -> str:
    """
    Customized version of `markdownify_convert` that is more robust than the
    default settings: preprocesses tildes, converts with `MARKDOWNIFY_OPTIONS`,
    then postprocesses to escape stray HTML and restore sup/sub tags.
    """
    # Lazy import — presumably to keep markdownify out of module import time;
    # confirm before hoisting to the top of the file.
    from markdownify import markdownify as markdownify_convert

    preprocessed_html = markdownify_preprocess(html)
    md_text = markdownify_convert(preprocessed_html, **MARKDOWNIFY_OPTIONS)
    return markdownify_postprocess(md_text)
@@ -6,15 +6,6 @@ from funlog import abbreviate_arg
6
6
  from patch_ng import PatchSet
7
7
  from pydantic.dataclasses import dataclass
8
8
 
9
- from kash.config.logger import get_logger
10
- from kash.model.items_model import Item, ItemRelations, ItemType
11
- from kash.model.paths_model import StorePath
12
- from kash.utils.errors import ContentError
13
- from kash.utils.file_utils.file_formats_model import Format
14
-
15
- log = get_logger(__name__)
16
-
17
-
18
9
  # TODO: Support diffs of path lists as well, including renames and moves.
19
10
 
20
11
 
@@ -77,7 +68,7 @@ def unified_diff(
77
68
 
78
69
  patch_set = PatchSet(BytesIO(diff_text.encode("utf-8")))
79
70
  if patch_set.errors > 0:
80
- raise ContentError(
71
+ raise ValueError(
81
72
  f"Had {patch_set.errors} errors parsing diff of `{from_name}` and `{to_name}`: {abbreviate_arg(diff_text)}"
82
73
  )
83
74
 
@@ -102,37 +93,3 @@ def unified_diff_files(from_file: str | Path, to_file: str | Path) -> UnifiedDif
102
93
  content2 = f2.read()
103
94
 
104
95
  return unified_diff(content1, content2, from_name, to_name)
105
-
106
-
107
- def unified_diff_items(from_item: Item, to_item: Item, strict: bool = True) -> Item:
108
- """
109
- Generate a unified diff between two items. If `strict` is true, will raise
110
- an error if the items are of different formats.
111
- """
112
- if not from_item.body and not to_item.body:
113
- raise ContentError(f"No body to diff for {from_item} and {to_item}")
114
- if not from_item.store_path or not to_item.store_path:
115
- raise ContentError("No store path on items; save before diffing")
116
- diff_items = [item for item in [from_item, to_item] if item.format == Format.diff]
117
- if len(diff_items) == 1:
118
- raise ContentError(
119
- f"Cannot compare diffs to non-diffs: {from_item.format}, {to_item.format}"
120
- )
121
- if len(diff_items) > 0 or from_item.format != to_item.format:
122
- msg = f"Diffing items of incompatible format: {from_item.format}, {to_item.format}"
123
- if strict:
124
- raise ContentError(msg)
125
- else:
126
- log.warning("%s", msg)
127
-
128
- from_path, to_path = StorePath(from_item.store_path), StorePath(to_item.store_path)
129
-
130
- diff = unified_diff(from_item.body, to_item.body, str(from_path), str(to_path))
131
-
132
- return Item(
133
- type=ItemType.doc,
134
- title=f"Diff of {from_path} and {to_path}",
135
- format=Format.diff,
136
- relations=ItemRelations(diff_of=[from_path, to_path]),
137
- body=diff.patch_text,
138
- )