kash_shell-0.3.12-py3-none-any.whl → kash_shell-0.3.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/core/markdownify.py +5 -4
- kash/actions/core/readability.py +4 -4
- kash/actions/core/render_as_html.py +6 -4
- kash/commands/base/basic_file_commands.py +3 -0
- kash/commands/base/diff_commands.py +38 -3
- kash/commands/base/reformat_command.py +1 -1
- kash/commands/base/show_command.py +1 -1
- kash/commands/workspace/selection_commands.py +1 -1
- kash/commands/workspace/workspace_commands.py +62 -16
- kash/docs/load_source_code.py +1 -1
- kash/exec/action_exec.py +4 -5
- kash/exec/fetch_url_metadata.py +8 -5
- kash/exec/importing.py +4 -4
- kash/exec/llm_transforms.py +1 -1
- kash/exec/preconditions.py +7 -7
- kash/file_storage/file_store.py +73 -32
- kash/file_storage/item_file_format.py +1 -1
- kash/file_storage/store_filenames.py +2 -1
- kash/help/help_embeddings.py +2 -2
- kash/llm_utils/clean_headings.py +1 -1
- kash/{text_handling → llm_utils}/custom_sliding_transforms.py +0 -3
- kash/llm_utils/llm_completion.py +1 -1
- kash/local_server/__init__.py +1 -1
- kash/local_server/local_server_commands.py +2 -1
- kash/mcp/__init__.py +1 -1
- kash/mcp/mcp_server_commands.py +8 -2
- kash/media_base/media_cache.py +10 -3
- kash/model/actions_model.py +3 -0
- kash/model/items_model.py +71 -42
- kash/shell/ui/shell_results.py +2 -1
- kash/utils/common/format_utils.py +0 -8
- kash/utils/common/import_utils.py +46 -18
- kash/utils/file_utils/file_formats_model.py +46 -26
- kash/utils/file_utils/filename_parsing.py +41 -16
- kash/{text_handling → utils/text_handling}/doc_normalization.py +10 -8
- kash/utils/text_handling/escape_html_tags.py +156 -0
- kash/{text_handling → utils/text_handling}/markdown_utils.py +0 -3
- kash/utils/text_handling/markdownify_utils.py +87 -0
- kash/{text_handling → utils/text_handling}/unified_diffs.py +1 -44
- kash/web_content/file_cache_utils.py +42 -34
- kash/web_content/local_file_cache.py +29 -12
- kash/web_content/web_extract.py +1 -1
- kash/web_content/web_extract_readabilipy.py +4 -2
- kash/web_content/web_fetch.py +42 -7
- kash/web_content/web_page_model.py +2 -1
- kash/web_gen/simple_webpage.py +1 -1
- kash/web_gen/templates/base_styles.css.jinja +134 -16
- kash/web_gen/templates/simple_webpage.html.jinja +1 -1
- kash/workspaces/selections.py +2 -2
- kash/workspaces/workspace_importing.py +1 -1
- kash/workspaces/workspace_output.py +2 -2
- kash/xonsh_custom/load_into_xonsh.py +4 -2
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.13.dist-info}/METADATA +1 -1
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.13.dist-info}/RECORD +58 -57
- kash/utils/common/inflection.py +0 -22
- /kash/{text_handling → utils/text_handling}/markdown_render.py +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.13.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.13.dist-info}/entry_points.txt +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.13.dist-info}/licenses/LICENSE +0 -0
--- a/kash/utils/common/import_utils.py
+++ b/kash/utils/common/import_utils.py
@@ -12,36 +12,64 @@ log = logging.getLogger(__name__)
 Tallies: TypeAlias = dict[str, int]
 
 
-def …
+def import_recursive(
     parent_package_name: str,
     parent_dir: Path,
-    …
+    resource_names: list[str] | None = None,
     tallies: Tallies | None = None,
 ):
     """
-    Import …
-    …
-    …
+    Import modules from subdirectories or individual Python modules within a parent package.
+
+    Each resource in `resource_names` can be:
+    - A directory name (all modules within it will be imported)
+    - A module name with or without '.py' extension (a single module will be imported)
+    - "." to import all modules in the parent_dir
+
+    If `resource_names` is `None`, imports all modules directly in parent_dir.
+
+    Simply a convenience wrapper for `importlib.import_module` and
+    `pkgutil.iter_modules` to iterate over all modules in the subdirectories.
+
+    If `tallies` is provided, it will be updated with the number of modules imported
+    for each package.
     """
     if tallies is None:
         tallies = {}
-    if not …
-        …
+    if not resource_names:
+        resource_names = ["."]
 
-    for …
-        if …
+    for name in resource_names:
+        if name == ".":
             full_path = parent_dir
             package_name = parent_package_name
         else:
-            full_path = parent_dir / …
-            package_name = f"{parent_package_name}.{…
-
-            if …
-            … (remaining old lines truncated in the source diff view)
+            full_path = parent_dir / name
+            package_name = f"{parent_package_name}.{name}"
+
+        # Check if it's a directory
+        if full_path.is_dir():
+            # Import all modules in the directory
+            for _, module_name, _ in pkgutil.iter_modules(path=[str(full_path)]):
+                importlib.import_module(f"{package_name}.{module_name}")
+                tallies[package_name] = tallies.get(package_name, 0) + 1
+        else:
+            # Not a directory, try as a module file
+            module_path = full_path
+            module_name = name
+
+            # Handle with or without .py extension
+            if not module_path.is_file() and module_path.suffix != ".py":
+                module_path = parent_dir / f"{name}.py"
+                module_name = name
+            elif module_path.suffix == ".py":
+                module_name = module_path.stem
+
+            if module_path.is_file() and module_name != "__init__":
+                importlib.import_module(f"{parent_package_name}.{module_name}")
+                tallies[parent_package_name] = tallies.get(parent_package_name, 0) + 1
+            else:
+                raise FileNotFoundError(f"Path not found or not importable: {full_path}")
 
     return tallies
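For orientation, a minimal sketch of how the new `import_recursive` signature might be called; the package and directory names here are hypothetical, not taken from this diff:

# Hypothetical usage; "kash.actions" and the paths are illustrative only.
from pathlib import Path

from kash.utils.common.import_utils import import_recursive

tallies = import_recursive(
    parent_package_name="kash.actions",
    parent_dir=Path("src/kash/actions"),
    resource_names=["core", "my_action.py"],  # a subdirectory plus a single module
)
# tallies maps package names to counts of imported modules, e.g.
# {"kash.actions.core": 12, "kash.actions": 1}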
--- a/kash/utils/file_utils/file_formats_model.py
+++ b/kash/utils/file_utils/file_formats_model.py
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
 
-from kash.utils.common.url import Url, is_file_url, parse_file_url
+from kash.utils.common.url import Url, is_file_url, is_url, parse_file_url
 from kash.utils.file_utils.file_ext import FileExt
 from kash.utils.file_utils.file_formats import (
     MIME_EMPTY,
@@ -112,6 +112,9 @@ class Format(Enum):
 
     @property
     def is_doc(self) -> bool:
+        """
+        Is this a textual document of some kind?
+        """
         return self in [
             self.markdown,
             self.md_html,
@@ -119,6 +122,7 @@
             self.pdf,
             self.docx,
             self.pptx,
+            self.epub,
         ]
 
     @property
@@ -340,8 +344,8 @@ Format._init_mime_type_map()
 
 @dataclass(frozen=True)
 class FileFormatInfo:
-    …
-    """File extension, if recognized."""
+    current_file_ext: FileExt | None
+    """File extension, if recognized and in the current filename."""
 
     format: Format | None
     """Format, if recognized."""
@@ -349,11 +353,18 @@
     mime_type: MimeType | None
     """Raw mime type, which may include more formats than the ones above."""
 
+    @property
+    def suggested_file_ext(self) -> FileExt | None:
+        """
+        Suggested file extension based on detected format.
+        """
+        return self.format.file_ext if self.format else self.current_file_ext
+
     @property
     def is_text(self) -> bool:
         return bool(
-            self.…
-            and self.…
+            self.current_file_ext
+            and self.current_file_ext.is_text
             or self.format
             and self.format.is_text
             or self.mime_type
@@ -373,8 +384,8 @@
     @property
     def is_image(self) -> bool:
         return bool(
-            self.…
-            and self.…
+            self.current_file_ext
+            and self.current_file_ext.is_image
             or self.format
             and self.format.is_image
             or self.mime_type
@@ -447,24 +458,33 @@ def detect_media_type(filename: str | Path) -> MediaType:
     return media_type
 
 
-def choose_file_ext(…
+def choose_file_ext(
+    url_or_path: Url | Path | str, mime_type: MimeType | None = None
+) -> FileExt | None:
     """
-    Pick a …
-    …
+    Pick a file extension to reflect the type of the content. First tries from any
+    provided content type (e.g. if this item was just downloaded). Then
+    recognizes known file extensions on the filename or URL, then tries looking
+    at the content with libmagic and heuristics, then gives up.
     """
-    …
-    …
-    fmt …
-    … (remaining old implementation truncated in the source diff view)
+    if mime_type:
+        fmt = Format.from_mime_type(mime_type)
+        if fmt:
+            return fmt.file_ext
+
+    # First check if it's a known standard extension.
+    filename_ext = parse_file_ext(url_or_path)
+    if filename_ext:
+        return filename_ext
+
+    local_path = None
+    if isinstance(url_or_path, str) and is_file_url(url_or_path):
+        local_path = parse_file_url(url_or_path)
+    elif not is_url(url_or_path):
+        local_path = Path(url_or_path)
+
+    # If it's local, base the extension on the file content.
+    if local_path:
+        return file_format_info(local_path).suggested_file_ext
+
+    return None
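A sketch of the fallback chain the new `choose_file_ext` implements; the inputs and results below are hypothetical, assuming MimeType is a str-compatible type:

from kash.utils.file_utils.file_formats_model import choose_file_ext

# 1. An explicit MIME type wins, regardless of the URL or filename.
choose_file_ext("https://example.com/download", mime_type="application/pdf")  # -> pdf ext

# 2. Otherwise a recognized extension on the path or URL is used.
choose_file_ext("notes/report.md")  # -> md ext

# 3. Otherwise, for local paths and file:// URLs, the content is sniffed
#    (libmagic and heuristics) via file_format_info(); may return None.
choose_file_ext("file:///tmp/unknown_blob")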
--- a/kash/utils/file_utils/filename_parsing.py
+++ b/kash/utils/file_utils/filename_parsing.py
@@ -1,39 +1,46 @@
 import os
+import re
 from pathlib import Path
 
 from kash.config.logger import get_logger
 from kash.utils.common.url import Url, check_if_url
-from kash.utils.errors import InvalidFilename
 from kash.utils.file_utils.file_ext import FileExt, canonicalize_file_ext
 
 log = get_logger(__name__)
 
+_valid_ext_re = re.compile(r"^[a-z0-9]*[a-z][a-z0-9]*$", re.IGNORECASE)
 
-…
+
+def split_filename(path: str | Path) -> tuple[str, str, str, str]:
     """
-    Parse a filename into its path, name, (optional) type, and extension parts
+    Parse a filename into its path, name, (optional) type, and extension parts.
+    Type and extension are optional but must be only letters/numbers and not
+    all numbers.
 
     folder/file.name.type.ext -> ("folder", "file.name", "type", "ext")
     filename.doc.txt -> ("", "filename", "note", "txt")
     filename.txt -> ("", "filename", "", "txt")
     filename -> ("", "filename", "", "")
+    filename.123.txt -> ("", "filename.123", "", "txt")
+    filename.123.456 -> ("", "filename.123.456", "", "")
     """
     path_str = str(path)
 
     dirname = os.path.dirname(path_str)
     parts = os.path.basename(path_str).rsplit(".", 2)
-    if len(parts) == 3:
+    if len(parts) == 3 and _valid_ext_re.match(parts[1]) and _valid_ext_re.match(parts[2]):
         name, item_type, ext = parts
-    elif len(parts) == …
+    elif len(parts) == 3 and _valid_ext_re.match(parts[2]):
+        name = f"{parts[0]}.{parts[1]}"
+        item_type = ""
+        ext = parts[2]
+    elif len(parts) == 2 and _valid_ext_re.match(parts[1]):
         name, ext = parts
         item_type = ""
-    elif len(parts) == 1 and not require_type_ext:
-        name = parts[0]
-        item_type = ext = ""
     else:
-        …
-        …
-        …
+        name = os.path.basename(path_str)
+        item_type = ext = ""
 
     return dirname, name, item_type, ext
@@ -67,8 +74,6 @@ def parse_file_ext(url_or_path: str | Url | Path) -> FileExt | None:
 
 
 def test_parse_filename():
-    import pytest
-
     filename = "foo/bar/test_file.1.type.ext"
     dirname, name, item_type, ext = split_filename(filename)
     assert dirname == "foo/bar"
@@ -90,9 +95,29 @@ def test_parse_filename():
     assert item_type == ""
     assert ext == ""
 
-    …
-    …
-    …
+    # Numeric extensions not allowed.
+    dirname, name, item_type, ext = split_filename("test.abc")
+    assert name == "test"
+    assert ext == "abc"
+
+    dirname, name, item_type, ext = split_filename("test.123")
+    assert name == "test.123"
+    assert ext == ""
+
+    dirname, name, item_type, ext = split_filename("test.type.123")
+    assert name == "test.type.123"
+    assert item_type == ""
+    assert ext == ""
+
+    dirname, name, item_type, ext = split_filename("test.valid.123")
+    assert name == "test.valid.123"
+    assert item_type == ""
+    assert ext == ""
+
+    dirname, name, item_type, ext = split_filename("test.123.txt")
+    assert name == "test.123"
+    assert item_type == ""
+    assert ext == "txt"
 
 
 def test_parse_file_ext():
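As an illustration of why the new numeric-extension rule matters, a version-like suffix is no longer misread as an item type or extension (hypothetical filename):

from kash.utils.file_utils.filename_parsing import split_filename

# "2024" is all digits, so it folds back into the name rather than being
# treated as a type; "md" is still recognized as the extension.
split_filename("archive/report.2024.md")
# -> ("archive", "report.2024", "", "md")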
--- a/kash/text_handling/doc_normalization.py
+++ b/kash/utils/text_handling/doc_normalization.py
@@ -1,8 +1,6 @@
 from pathlib import Path
 
-from flowmark import fill_markdown, …
-from flowmark.text_filling import DEFAULT_WRAP_WIDTH
-from flowmark.text_wrapping import simple_word_splitter
+from flowmark import fill_markdown, line_wrap_by_sentence
 from frontmatter_format import fmf_read, fmf_write
 
 from kash.utils.common.format_utils import fmt_loc
@@ -14,24 +12,28 @@ from kash.utils.rich_custom.ansi_cell_len import ansi_cell_len
 def normalize_formatting(
     text: str,
     format: Format | None,
-    width=DEFAULT_WRAP_WIDTH,
     support_ansi: bool = True,
     cleanups: bool = True,
 ) -> str:
     """
-    Normalize …
+    Normalize formatting. Currently only normalizes Markdown and leaves plaintext
+    and HTML intact.
+
     This only does "safe" normalizations that cannot break the text.
     Enables ANSI support so ANSI codes and OSC-8 links are correctly handled.
     """
     len_fn = ansi_cell_len if support_ansi else len
-    if format == Format.…
-        return fill_text(text, width=width, word_splitter=simple_word_splitter, len_fn=len_fn)
-    elif format == Format.markdown or format == Format.md_html:
+    if format == Format.markdown or format == Format.md_html:
         return fill_markdown(
             text,
             line_wrapper=line_wrap_by_sentence(len_fn=len_fn, is_markdown=True),
             cleanups=cleanups,
         )
+    elif format == Format.plaintext:
+        # Consider plaintext a raw format and don't normalize.
+        # We could add support for formatted plaintext as well?
+        # Then do: fill_text(text, width=width, word_splitter=simple_word_splitter, len_fn=len_fn)
+        return text
     elif format == Format.html:
         # We don't currently auto-format HTML as we sometimes use HTML with specifically chosen line breaks.
         return text
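A small sketch of the resulting behavior of `normalize_formatting` (hypothetical inputs):

from kash.utils.file_utils.file_formats_model import Format
from kash.utils.text_handling.doc_normalization import normalize_formatting

md = "Some *Markdown* text that will be re-wrapped by sentence."
normalize_formatting(md, format=Format.markdown)  # filled via fill_markdown()

plain = "Plaintext is now\nreturned exactly as-is."
assert normalize_formatting(plain, format=Format.plaintext) == plain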
--- /dev/null
+++ b/kash/utils/text_handling/escape_html_tags.py
@@ -0,0 +1,156 @@
+import re
+from collections.abc import Set
+
+HTML_IN_MD_TAGS = frozenset(["div", "span", "sup", "sub", "br", "details", "summary"])
+"""These are tags that have reasonable usage in Markdown so typically would be preserved."""
+
+ALLOWED_BARE_PROTOS = frozenset(["http://", "https://", "file://"])
+
+
+def escape_html_tags(
+    html_content: str,
+    whitelist_tags: Set[str] = HTML_IN_MD_TAGS,
+    allow_bare_md_urls: bool = False,
+) -> str:
+    """
+    Escapes HTML tags by replacing '<' with '&lt;', except for whitelisted tags and
+    markdown-style URLs like <https://example.com>. Whitelist defaults to only a
+    few common tags. But it can also be empty to escape all tags.
+    """
+    result = []
+    last_pos = 0
+
+    # Compile patterns for matching at each '<' position
+    # Match <, optional spaces, optional /, optional spaces, whitelisted tag, then optional attributes, then optional /, optional spaces, then >
+    whitelist_pattern = re.compile(
+        r"< *(/?) *(" + "|".join(whitelist_tags) + r")(?:\s+[^>]*)? *(/?) *>",
+        re.IGNORECASE,
+    )
+
+    url_pattern = None
+    if allow_bare_md_urls:
+        url_pattern = re.compile(
+            r"<(?:" + "|".join(re.escape(proto) for proto in ALLOWED_BARE_PROTOS) + r")[^>\s]+>"
+        )
+
+    # Find all '<' characters
+    for match in re.finditer(r"<", html_content):
+        start_pos = match.start()
+
+        # Add text before this '<'
+        result.append(html_content[last_pos:start_pos])
+
+        # Try to match patterns at this position
+        substring = html_content[start_pos:]
+        whitelist_match = whitelist_pattern.match(substring)
+        url_match = url_pattern and url_pattern.match(substring)
+
+        if whitelist_match:
+            result.append(whitelist_match.group(0))
+            last_pos = start_pos + len(whitelist_match.group(0))
+        elif url_match:
+            result.append(url_match.group(0))
+            last_pos = start_pos + len(url_match.group(0))
+        else:
+            # No match, escape this '<'
+            result.append("&lt;")
+            last_pos = start_pos + 1
+
+    # Add remaining text
+    result.append(html_content[last_pos:])
+
+    return "".join(result)
+
+
+## Tests
+
+
+def test_escape_html_tags():
+    """Tests the escape_html_tags function with various cases."""
+
+    # 1. Basic Whitelist Check (Default)
+    assert escape_html_tags("<div>Test</div>") == "<div>Test</div>"
+    assert escape_html_tags("<span>Test</span>") == "<span>Test</span>"
+    assert escape_html_tags("<br>") == "<br>"
+    assert (
+        escape_html_tags("<details><summary>Sum</summary>Det</details>")
+        == "<details><summary>Sum</summary>Det</details>"
+    )
+
+    # 2. Basic Escape Check
+    assert escape_html_tags("<p>Test</p>") == "&lt;p>Test&lt;/p>"
+    assert escape_html_tags("<script>alert('x');</script>") == "&lt;script>alert('x');&lt;/script>"
+    assert escape_html_tags("<img>") == "&lt;img>"
+
+    # 3. Case Insensitivity
+    assert escape_html_tags("<DiV>Case</DiV>") == "<DiV>Case</DiV>"  # Whitelisted
+    assert escape_html_tags("<P>Test</P>") == "&lt;P>Test&lt;/P>"  # Escaped
+
+    # 4. Self-closing tags
+    assert escape_html_tags("<br/>") == "<br/>"  # Whitelisted
+    assert escape_html_tags("<br />") == "<br />"  # Whitelisted
+    assert escape_html_tags("<img/>") == "&lt;img/>"  # Escaped
+
+    # 5. Tags with Attributes
+    assert (
+        escape_html_tags('<div class="foo">Test</div>') == '<div class="foo">Test</div>'
+    )  # Whitelisted
+    assert (
+        escape_html_tags('<span id="bar" data-val="x">Test</span>')
+        == '<span id="bar" data-val="x">Test</span>'
+    )  # Whitelisted
+    assert escape_html_tags('<p class="foo">Test</p>') == '&lt;p class="foo">Test&lt;/p>'  # Escaped
+    assert escape_html_tags('<img src="a.jpg"/>') == '&lt;img src="a.jpg"/>'  # Escaped
+
+    # 6. Markdown URL Handling
+    url_md = "Check <https://example.com> and <http://test.org/path>"
+    assert escape_html_tags(url_md, allow_bare_md_urls=True) == url_md
+    assert (
+        escape_html_tags(url_md, allow_bare_md_urls=False)
+        == "Check &lt;https://example.com> and &lt;http://test.org/path>"
+    )
+
+    url_mixed = "<div>Link: <https://ok.com></div> <script>no</script>"
+    expected_mixed_urls_allowed = "<div>Link: <https://ok.com></div> &lt;script>no&lt;/script>"
+    expected_mixed_urls_disallowed = (
+        "<div>Link: &lt;https://ok.com></div> &lt;script>no&lt;/script>"
+    )
+    assert escape_html_tags(url_mixed, allow_bare_md_urls=True) == expected_mixed_urls_allowed
+    assert escape_html_tags(url_mixed, allow_bare_md_urls=False) == expected_mixed_urls_disallowed
+
+    assert (
+        escape_html_tags("<http://malformed url>", allow_bare_md_urls=True)
+        == "&lt;http://malformed url>"
+    )
+    assert (
+        escape_html_tags("</https://example.com>", allow_bare_md_urls=True)
+        == "&lt;/https://example.com>"
+    )  # Closing URL-like is escaped
+
+    # 7. Nested/Malformed '<' and Edge Cases
+    assert escape_html_tags("<<script>>") == "&lt;&lt;script>>"  # Escaped non-tag <
+    assert escape_html_tags("<div><p>nested</p></div>") == "<div>&lt;p>nested&lt;/p></div>"
+    assert escape_html_tags("<div<span") == "&lt;div&lt;span"  # Incomplete tags are escaped
+    assert (
+        escape_html_tags("Text < with > inside") == "Text &lt; with > inside"
+    )  # Escape < even if > exists later
+    assert escape_html_tags("<") == "&lt;"
+    assert escape_html_tags(">") == ">"
+    assert escape_html_tags("<>") == "&lt;>"
+    assert escape_html_tags("< >") == "&lt; >"
+    assert escape_html_tags("< / div >") == "< / div >"  # Whitelisted closing tag with spaces
+
+    # 8. Mixed Content Combination
+    complex_html = "<DiV class='A'>Hello <Br/> <p>World</p> <https://link.com> </DiV>"
+    expected_complex_allowed = (
+        "<DiV class='A'>Hello <Br/> &lt;p>World&lt;/p> <https://link.com> </DiV>"
+    )
+    expected_complex_disallowed = (
+        "<DiV class='A'>Hello <Br/> &lt;p>World&lt;/p> &lt;https://link.com> </DiV>"
+    )
+    assert escape_html_tags(complex_html, allow_bare_md_urls=True) == expected_complex_allowed
+    assert escape_html_tags(complex_html, allow_bare_md_urls=False) == expected_complex_disallowed
+
+    # 9. Empty/No Tags
+    assert escape_html_tags("") == ""
+    assert escape_html_tags("Just plain text, no tags.") == "Just plain text, no tags."
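One case the tests above don't show: the docstring notes the whitelist can be empty to escape every tag (hypothetical call):

from kash.utils.text_handling.escape_html_tags import escape_html_tags

# With an empty whitelist, even normally-preserved tags like <div> are escaped.
escape_html_tags("<div>hi</div>", whitelist_tags=frozenset())
# -> "&lt;div>hi&lt;/div>"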
--- a/kash/text_handling/markdown_utils.py
+++ b/kash/utils/text_handling/markdown_utils.py
@@ -7,11 +7,8 @@ import regex
 from marko.block import Heading, ListItem
 from marko.inline import Link
 
-from kash.config.logger import get_logger
 from kash.utils.common.url import Url
 
-log = get_logger(__name__)
-
 HTag: TypeAlias = str
 
 # Characters that commonly need escaping in Markdown inline text.
--- /dev/null
+++ b/kash/utils/text_handling/markdownify_utils.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import re
+
+from kash.utils.text_handling.escape_html_tags import escape_html_tags
+
+_single_tilde_pat = re.compile(r"(?<!~)~(?!~)")
+_alt_tilde = "&#126;"
+
+
+def _fix_single_tilde(html: str) -> str:
+    """
+    Escape standalone ~ characters with spaces before/after to avoid
+    misinterpretation by markdownify as strikethrough. Using &#126; because it's
+    hard to properly escape ~ in a way that markdownify will respect.
+    """
+
+    def replace_tilde(match: re.Match[str]) -> str:
+        start = match.start()
+        end = match.end()
+        # Check for space before or after
+        has_space_before = start > 0 and html[start - 1].isspace()
+        has_space_after = end < len(html) and html[end].isspace()
+        return _alt_tilde if has_space_before or has_space_after else "~"
+
+    return _single_tilde_pat.sub(replace_tilde, html)
+
+
+def markdownify_preprocess(html: str) -> str:
+    """
+    Preprocess HTML before passing it to markdownify.
+    """
+    return _fix_single_tilde(html)
+
+
+# Good options for markdownify. Without setting sup_symbol and sub_symbol, that
+# info is typically lost.
+MARKDOWNIFY_OPTIONS = {
+    "sup_symbol": "<__sup>",
+    "sub_symbol": "<__sub>",
+    "escape_underscores": True,
+    "escape_asterisks": True,
+    "escape_misc": False,  # This suppresses gratuitous escaping of -, ., etc.
+    "newline_style": "BACKSLASH",
+}
+
+
+def _escape_html_in_md(md_text: str, whitelist_tags: set[str] | None = None) -> str:
+    """
+    HTML tags originally escaped with entities can get parsed and appear unescaped
+    in the Markdown, so it usually makes sense to do a full escaping (except for our
+    custom sup/sub tags).
+    """
+    # Output from markdownify (especially from docx or other conversions) should
+    # not have any HTML tags except for the custom sup/sub tags we've added.
+    return escape_html_tags(
+        md_text,
+        allow_bare_md_urls=True,
+        whitelist_tags={"__sup", "__sub"} | (whitelist_tags or set()),
+    )
+
+
+def markdownify_postprocess(md_text: str) -> str:
+    """
+    Postprocess Markdown after markdownify has converted HTML to Markdown.
+    """
+    md_text = _escape_html_in_md(md_text)
+    # We use our own custom tags for sup/sub to avoid possible conflicts with other
+    # tags in a doc. But when done we should replace them with the standard ones.
+    return (
+        md_text.replace("<__sup>", "<sup>")
+        .replace("</__sup>", "</sup>")
+        .replace("<__sub>", "<sub>")
+        .replace("</__sub>", "</sub>")
+    )
+
+
+def markdownify_custom(html: str) -> str:
+    """
+    Customized version of `markdownify_convert` to be more robust than the default settings.
+    """
+
+    from markdownify import markdownify as markdownify_convert
+
+    preprocessed_html = markdownify_preprocess(html)
+    md_text = markdownify_convert(preprocessed_html, **MARKDOWNIFY_OPTIONS)
+    return markdownify_postprocess(md_text)
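A hypothetical round trip through `markdownify_custom`, showing the sup/sub placeholder handling described above:

from kash.utils.text_handling.markdownify_utils import markdownify_custom

markdownify_custom("<p>Water is H<sub>2</sub>O.</p>")
# -> "Water is H<sub>2</sub>O." (modulo surrounding newlines): the <sub> tag
#    survives via the temporary <__sub> placeholder, while any other stray
#    HTML in the output would be escaped.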
--- a/kash/text_handling/unified_diffs.py
+++ b/kash/utils/text_handling/unified_diffs.py
@@ -6,15 +6,6 @@ from funlog import abbreviate_arg
 from patch_ng import PatchSet
 from pydantic.dataclasses import dataclass
 
-from kash.config.logger import get_logger
-from kash.model.items_model import Item, ItemRelations, ItemType
-from kash.model.paths_model import StorePath
-from kash.utils.errors import ContentError
-from kash.utils.file_utils.file_formats_model import Format
-
-log = get_logger(__name__)
-
-
 # TODO: Support diffs of path lists as well, including renames and moves.
 
 
@@ -77,7 +68,7 @@ def unified_diff(
 
     patch_set = PatchSet(BytesIO(diff_text.encode("utf-8")))
     if patch_set.errors > 0:
-        raise …
+        raise ValueError(
             f"Had {patch_set.errors} errors parsing diff of `{from_name}` and `{to_name}`: {abbreviate_arg(diff_text)}"
         )
 
@@ -102,37 +93,3 @@ def unified_diff_files(from_file: str | Path, to_file: str | Path) -> UnifiedDiff:
         content2 = f2.read()
 
     return unified_diff(content1, content2, from_name, to_name)
-
-
-def unified_diff_items(from_item: Item, to_item: Item, strict: bool = True) -> Item:
-    """
-    Generate a unified diff between two items. If `strict` is true, will raise
-    an error if the items are of different formats.
-    """
-    if not from_item.body and not to_item.body:
-        raise ContentError(f"No body to diff for {from_item} and {to_item}")
-    if not from_item.store_path or not to_item.store_path:
-        raise ContentError("No store path on items; save before diffing")
-    diff_items = [item for item in [from_item, to_item] if item.format == Format.diff]
-    if len(diff_items) == 1:
-        raise ContentError(
-            f"Cannot compare diffs to non-diffs: {from_item.format}, {to_item.format}"
-        )
-    if len(diff_items) > 0 or from_item.format != to_item.format:
-        msg = f"Diffing items of incompatible format: {from_item.format}, {to_item.format}"
-        if strict:
-            raise ContentError(msg)
-        else:
-            log.warning("%s", msg)
-
-    from_path, to_path = StorePath(from_item.store_path), StorePath(to_item.store_path)
-
-    diff = unified_diff(from_item.body, to_item.body, str(from_path), str(to_path))
-
-    return Item(
-        type=ItemType.doc,
-        title=f"Diff of {from_path} and {to_path}",
-        format=Format.diff,
-        relations=ItemRelations(diff_of=[from_path, to_path]),
-        body=diff.patch_text,
-    )
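With `unified_diff_items` removed, the file-level helper remains; a minimal sketch of its use (hypothetical paths):

from kash.utils.text_handling.unified_diffs import unified_diff_files

diff = unified_diff_files("old/notes.md", "new/notes.md")
print(diff.patch_text)  # standard unified diff text; parsing errors raise ValueError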