kash-shell 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/__init__.py +4 -4
- kash/actions/core/markdownify.py +5 -2
- kash/actions/core/readability.py +5 -2
- kash/actions/core/render_as_html.py +18 -0
- kash/actions/core/webpage_config.py +12 -4
- kash/commands/__init__.py +8 -20
- kash/commands/base/basic_file_commands.py +15 -0
- kash/commands/base/debug_commands.py +15 -2
- kash/commands/base/general_commands.py +27 -18
- kash/commands/base/logs_commands.py +1 -4
- kash/commands/base/model_commands.py +8 -8
- kash/commands/base/search_command.py +3 -2
- kash/commands/base/show_command.py +5 -3
- kash/commands/extras/parse_uv_lock.py +186 -0
- kash/commands/help/doc_commands.py +2 -31
- kash/commands/help/welcome.py +33 -0
- kash/commands/workspace/selection_commands.py +11 -6
- kash/commands/workspace/workspace_commands.py +19 -16
- kash/config/colors.py +2 -0
- kash/config/env_settings.py +72 -0
- kash/config/init.py +2 -2
- kash/config/logger.py +61 -59
- kash/config/logger_basic.py +12 -5
- kash/config/server_config.py +6 -6
- kash/config/settings.py +117 -67
- kash/config/setup.py +35 -9
- kash/config/suppress_warnings.py +30 -12
- kash/config/text_styles.py +3 -13
- kash/docs/load_api_docs.py +2 -1
- kash/docs/markdown/topics/a2_installation.md +7 -3
- kash/docs/markdown/topics/a3_getting_started.md +3 -2
- kash/docs/markdown/warning.md +3 -8
- kash/docs/markdown/welcome.md +4 -0
- kash/docs_base/load_recipe_snippets.py +1 -1
- kash/docs_base/recipes/{general_system_commands.ksh → general_system_commands.sh} +1 -1
- kash/{concepts → embeddings}/cosine.py +2 -1
- kash/embeddings/text_similarity.py +57 -0
- kash/exec/__init__.py +20 -3
- kash/exec/action_decorators.py +18 -4
- kash/exec/action_exec.py +41 -23
- kash/exec/action_registry.py +13 -48
- kash/exec/command_registry.py +2 -1
- kash/exec/fetch_url_metadata.py +4 -6
- kash/exec/importing.py +56 -0
- kash/exec/llm_transforms.py +6 -6
- kash/exec/precondition_registry.py +2 -1
- kash/exec/preconditions.py +16 -1
- kash/exec/shell_callable_action.py +33 -19
- kash/file_storage/file_store.py +23 -14
- kash/file_storage/item_file_format.py +13 -3
- kash/file_storage/metadata_dirs.py +11 -2
- kash/help/assistant.py +2 -2
- kash/help/assistant_instructions.py +2 -1
- kash/help/help_embeddings.py +2 -2
- kash/help/help_printing.py +14 -10
- kash/help/tldr_help.py +5 -3
- kash/llm_utils/clean_headings.py +1 -1
- kash/llm_utils/llm_api_keys.py +4 -4
- kash/llm_utils/llm_completion.py +2 -2
- kash/llm_utils/llm_features.py +68 -0
- kash/llm_utils/llm_messages.py +1 -2
- kash/llm_utils/llm_names.py +1 -1
- kash/llm_utils/llms.py +17 -12
- kash/local_server/__init__.py +5 -2
- kash/local_server/local_server.py +56 -46
- kash/local_server/local_server_commands.py +15 -15
- kash/local_server/local_server_routes.py +2 -2
- kash/local_server/local_url_formatters.py +1 -1
- kash/mcp/__init__.py +5 -2
- kash/mcp/mcp_cli.py +54 -17
- kash/mcp/mcp_server_commands.py +5 -6
- kash/mcp/mcp_server_routes.py +14 -11
- kash/mcp/mcp_server_sse.py +61 -34
- kash/mcp/mcp_server_stdio.py +0 -8
- kash/media_base/audio_processing.py +81 -7
- kash/media_base/media_cache.py +18 -18
- kash/media_base/media_services.py +1 -1
- kash/media_base/media_tools.py +6 -6
- kash/media_base/services/local_file_media.py +2 -2
- kash/media_base/{speech_transcription.py → transcription_deepgram.py} +25 -109
- kash/media_base/transcription_format.py +73 -0
- kash/media_base/transcription_whisper.py +38 -0
- kash/model/__init__.py +73 -5
- kash/model/actions_model.py +38 -4
- kash/model/concept_model.py +30 -0
- kash/model/items_model.py +56 -13
- kash/model/params_model.py +24 -0
- kash/shell/completions/completion_scoring.py +37 -5
- kash/shell/output/kerm_codes.py +1 -2
- kash/shell/output/shell_formatting.py +14 -4
- kash/shell/shell_main.py +2 -2
- kash/shell/utils/exception_printing.py +6 -0
- kash/shell/utils/native_utils.py +26 -20
- kash/text_handling/custom_sliding_transforms.py +12 -4
- kash/text_handling/doc_normalization.py +6 -2
- kash/text_handling/markdown_render.py +117 -0
- kash/text_handling/markdown_utils.py +204 -0
- kash/utils/common/import_utils.py +12 -3
- kash/utils/common/type_utils.py +0 -29
- kash/utils/common/url.py +80 -28
- kash/utils/errors.py +6 -0
- kash/utils/file_utils/{dir_size.py → dir_info.py} +25 -4
- kash/utils/file_utils/file_ext.py +2 -3
- kash/utils/file_utils/file_formats.py +28 -2
- kash/utils/file_utils/file_formats_model.py +50 -19
- kash/utils/file_utils/filename_parsing.py +10 -4
- kash/web_content/dir_store.py +1 -2
- kash/web_content/file_cache_utils.py +37 -10
- kash/web_content/file_processing.py +68 -0
- kash/web_content/local_file_cache.py +12 -9
- kash/web_content/web_extract.py +8 -3
- kash/web_content/web_fetch.py +12 -4
- kash/web_gen/tabbed_webpage.py +5 -2
- kash/web_gen/templates/base_styles.css.jinja +120 -14
- kash/web_gen/templates/base_webpage.html.jinja +60 -13
- kash/web_gen/templates/content_styles.css.jinja +4 -2
- kash/web_gen/templates/item_view.html.jinja +2 -2
- kash/web_gen/templates/tabbed_webpage.html.jinja +1 -2
- kash/workspaces/__init__.py +15 -2
- kash/workspaces/selections.py +18 -3
- kash/workspaces/source_items.py +4 -2
- kash/workspaces/workspace_output.py +11 -4
- kash/workspaces/workspaces.py +5 -11
- kash/xonsh_custom/command_nl_utils.py +40 -19
- kash/xonsh_custom/custom_shell.py +44 -12
- kash/xonsh_custom/customize_prompt.py +39 -21
- kash/xonsh_custom/load_into_xonsh.py +26 -27
- kash/xonsh_custom/shell_load_commands.py +2 -2
- kash/xonsh_custom/xonsh_completers.py +2 -249
- kash/xonsh_custom/xonsh_keybindings.py +282 -0
- kash/xonsh_custom/xonsh_modern_tools.py +3 -3
- kash/xontrib/kash_extension.py +5 -6
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/METADATA +26 -12
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/RECORD +140 -140
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/entry_points.txt +1 -1
- kash/concepts/concept_formats.py +0 -23
- kash/concepts/text_similarity.py +0 -112
- kash/shell/clideps/api_keys.py +0 -99
- kash/shell/clideps/dotenv_setup.py +0 -114
- kash/shell/clideps/dotenv_utils.py +0 -89
- kash/shell/clideps/pkg_deps.py +0 -232
- kash/shell/clideps/platforms.py +0 -11
- kash/shell/clideps/terminal_features.py +0 -56
- kash/shell/utils/osc_utils.py +0 -95
- kash/shell/utils/terminal_images.py +0 -133
- kash/text_handling/markdown_util.py +0 -167
- kash/utils/common/atomic_var.py +0 -158
- kash/utils/common/string_replace.py +0 -93
- kash/utils/common/string_template.py +0 -101
- /kash/docs_base/recipes/{python_dev_commands.ksh → python_dev_commands.sh} +0 -0
- /kash/docs_base/recipes/{tldr_standard_commands.ksh → tldr_standard_commands.sh} +0 -0
- /kash/{concepts → embeddings}/embeddings.py +0 -0
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
from textwrap import dedent
|
|
2
|
+
|
|
3
|
+
import marko
|
|
4
|
+
import regex
|
|
5
|
+
from marko.block import HTMLBlock
|
|
6
|
+
from marko.ext.gfm import GFM
|
|
7
|
+
from marko.helpers import MarkoExtension
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# When we use divs in Markdown we usually want them to be standalone paragraphs,
|
|
11
|
+
# so it doesn't break other wrapping with flowmark etc. This handles that.
|
|
12
|
+
class CustomHTMLBlockMixin:
    """
    Renderer mixin that makes `<div>` HTML blocks standalone: the rendered block
    is surrounded by newlines. Other HTML blocks are passed through as rendered
    by the next renderer in the MRO.
    """

    div_pattern = regex.compile(r"^\s*<div\b", regex.IGNORECASE)

    def render_html_block(self, element: HTMLBlock) -> str:
        """
        Render an HTML block, wrapping it in newlines when the source block
        starts with a `<div` tag.
        """
        # Let the next renderer in the MRO (GFM) do its filtering first.
        rendered = super().render_html_block(element)  # pyright: ignore

        # The decision is made on the original block body, not the filtered output.
        starts_with_div = self.div_pattern.match(element.body.strip())
        if not starts_with_div:
            return rendered

        # Divs get the *filtered* output, trimmed and wrapped in newlines.
        return f"\n{rendered.strip()}\n"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# GFM first, adding our custom override as an extension to handle divs our way.
|
|
29
|
+
# Extensions later in this list are earlier in MRO.
|
|
30
|
+
MARKO_GFM = marko.Markdown(
|
|
31
|
+
extensions=["footnote", GFM, MarkoExtension(renderer_mixins=[CustomHTMLBlockMixin])]
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
FOOTNOTE_UP_ARROW = " ↑ "
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def html_postprocess(html: str) -> str:
    """
    Final tweaks to the HTML.

    Replaces the back-reference arrow inside footnote links (anchors ending in
    `class="footnote">`) with `FOOTNOTE_UP_ARROW`. The arrow is recognized both
    as the literal "↩" character and as the numeric entity "&#8617;".
    """
    replacement = f"""class="footnote">{FOOTNOTE_UP_ARROW}</a>"""
    # NOTE(review): the footnote renderer is expected to emit the arrow as the
    # numeric character reference rather than the literal character, in which
    # case a literal-only match never fires. Handling both forms is harmless
    # either way; confirm against the installed marko version.
    for back_arrow in ("↩", "&#8617;"):
        html = html.replace(f"""class="footnote">{back_arrow}</a>""", replacement)
    return html
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def markdown_to_html(markdown: str, converter: marko.Markdown = MARKO_GFM) -> str:
    """
    Convert Markdown to HTML using `converter`, then apply `html_postprocess`.

    With the default converter, div blocks are wrapped with newlines for better
    Markdown compatibility.

    Output passes through raw HTML! Note per GFM, unsafe script tags etc
    are [allowed in some cases](https://github.github.com/gfm/#example-140) so
    additional sanitization is needed if input isn't trusted.
    """
    html = converter.convert(markdown)
    # Single exit point; the original had an unreachable second `return html` here.
    return html_postprocess(html)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
## Tests
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_markdown_to_html():
|
|
67
|
+
markdown = dedent(
|
|
68
|
+
"""
|
|
69
|
+
# Heading
|
|
70
|
+
|
|
71
|
+
This is a paragraph and a [link](https://example.com).
|
|
72
|
+
|
|
73
|
+
- Item 1
|
|
74
|
+
- Item 2
|
|
75
|
+
|
|
76
|
+
## Subheading
|
|
77
|
+
|
|
78
|
+
This is a paragraph with a <span>span</span> tag.
|
|
79
|
+
This is a paragraph with a <div>div</div> tag.
|
|
80
|
+
This is a paragraph with an <a href='https://example.com'>example link</a>.
|
|
81
|
+
|
|
82
|
+
<div class="div1">This is a div.</div>
|
|
83
|
+
|
|
84
|
+
<div class="div2">This is a second div.
|
|
85
|
+
<iframe src="https://example.com">Inline iframe, note this is sanitized</iframe>
|
|
86
|
+
</div>
|
|
87
|
+
|
|
88
|
+
<!-- Script tag in a block, note this isn't sanitized -->
|
|
89
|
+
<script>console.log("Javascript block!");</script>
|
|
90
|
+
"""
|
|
91
|
+
)
|
|
92
|
+
print(markdown_to_html(markdown))
|
|
93
|
+
|
|
94
|
+
expected_html = dedent(
|
|
95
|
+
"""
|
|
96
|
+
<h1>Heading</h1>
|
|
97
|
+
<p>This is a paragraph and a <a href="https://example.com">link</a>.</p>
|
|
98
|
+
<ul>
|
|
99
|
+
<li>Item 1</li>
|
|
100
|
+
<li>Item 2</li>
|
|
101
|
+
</ul>
|
|
102
|
+
<h2>Subheading</h2>
|
|
103
|
+
<p>This is a paragraph with a <span>span</span> tag.
|
|
104
|
+
This is a paragraph with a <div>div</div> tag.
|
|
105
|
+
This is a paragraph with an <a href='https://example.com'>example link</a>.</p>
|
|
106
|
+
|
|
107
|
+
<div class="div1">This is a div.</div>
|
|
108
|
+
|
|
109
|
+
<div class="div2">This is a second div.
|
|
110
|
+
<iframe src="https://example.com">Inline iframe, note this is sanitized</iframe>
|
|
111
|
+
</div>
|
|
112
|
+
<!-- Script tag in a block, note this isn't sanitized -->
|
|
113
|
+
<script>console.log("Javascript block!");</script>
|
|
114
|
+
"""
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
assert markdown_to_html(markdown).strip() == expected_html.strip()
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
import marko
|
|
5
|
+
import regex
|
|
6
|
+
from marko.block import Heading, ListItem
|
|
7
|
+
from marko.inline import Link
|
|
8
|
+
|
|
9
|
+
from kash.config.logger import get_logger
|
|
10
|
+
from kash.utils.common.url import Url
|
|
11
|
+
|
|
12
|
+
log = get_logger(__name__)
|
|
13
|
+
|
|
14
|
+
# Characters that commonly need escaping in Markdown inline text.
MARKDOWN_ESCAPE_CHARS = r"([\\`*_{}\[\]()#+.!-])"
MARKDOWN_ESCAPE_RE = re.compile(MARKDOWN_ESCAPE_CHARS)


def escape_markdown(text: str) -> str:
    """
    Return `text` with a backslash inserted before every character matched by
    `MARKDOWN_ESCAPE_RE` (characters with special meaning in Markdown).
    """

    def _with_backslash(match: re.Match[str]) -> str:
        # Prefix the matched special character with a single backslash.
        return "\\" + match.group(1)

    return MARKDOWN_ESCAPE_RE.sub(_with_backslash, text)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def as_bullet_points(values: list[Any]) -> str:
    """
    Convert a list of values to a Markdown bullet-point list. If a value is a string,
    it is treated like Markdown. If it's something else it's converted to a string
    and also escaped for Markdown.

    Newlines within a value are replaced by spaces and surrounding whitespace is
    stripped, so every value occupies exactly one bullet. Bullets are separated by
    blank lines. An empty list yields an empty string.
    """
    points: list[str] = []
    for value in values:
        # Decide on the type *before* calling any string method: the original
        # called `.replace()` first, which raised AttributeError for non-strings.
        if isinstance(value, str):
            point = value
        else:
            point = escape_markdown(str(value))
        points.append(point.replace("\n", " ").strip())

    return "\n\n".join(f"- {point}" for point in points)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def markdown_link(text: str, url: str | Url) -> str:
    """
    Build a Markdown link `[text](url)`, backslash-escaping any square brackets
    in the link text. The URL is inserted as-is.
    """
    bracket_escapes = str.maketrans({"[": "\\[", "]": "\\]"})
    label = text.translate(bracket_escapes)
    return f"[{label}]({url})"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def is_markdown_header(markdown: str) -> bool:
    """
    Does this content begin with a Markdown header, i.e. one or more `#`
    characters followed by a space?
    """
    header_start = regex.match(r"^#+ ", markdown)
    return header_start is not None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _tree_links(element, include_internal=False):
    """
    Collect the destinations of all `Link` elements under `element`, in
    traversal order. Destinations starting with "#" are skipped unless
    `include_internal` is set. The children of a `Link` are not searched.
    """

    def _walk(node):
        if isinstance(node, Link):
            if include_internal or not node.dest.startswith("#"):
                yield node.dest
        elif hasattr(node, "children"):
            for child in node.children:
                yield from _walk(child)

    return list(_walk(element))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def extract_links(file_path: str, include_internal=False) -> list[str]:
    """
    Extract all links from a Markdown file. Future: Include textual and section context.

    The file is read as UTF-8. Link destinations starting with "#" are omitted
    unless `include_internal` is true.
    """

    # Explicit encoding: the platform default is locale-dependent and can
    # mis-decode (or fail on) UTF-8 Markdown files.
    with open(file_path, encoding="utf-8") as file:
        content = file.read()
    document = marko.parse(content)
    return _tree_links(document, include_internal)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _extract_text(element: Any) -> str:
|
|
87
|
+
if isinstance(element, str):
|
|
88
|
+
return element
|
|
89
|
+
elif hasattr(element, "children"):
|
|
90
|
+
return "".join(_extract_text(child) for child in element.children)
|
|
91
|
+
else:
|
|
92
|
+
return ""
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _tree_bullet_points(element: marko.block.Document) -> list[str]:
    """
    Return the stripped text of every `ListItem` found under `element`, in
    traversal order. The search does not descend into a `ListItem` once found;
    its full text is taken via `_extract_text`.
    """

    def _walk(node):
        if isinstance(node, ListItem):
            yield _extract_text(node).strip()
        elif hasattr(node, "children"):
            for child in node.children:
                yield from _walk(child)

    return list(_walk(element))
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def extract_bullet_points(content: str) -> list[str]:
    """
    Parse `content` as Markdown and return the text of its list items.
    """
    return _tree_bullet_points(marko.parse(content))
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _type_from_heading(heading: Heading) -> str:
|
|
119
|
+
if heading.level in [1, 2, 3, 4, 5, 6]:
|
|
120
|
+
return f"h{heading.level}"
|
|
121
|
+
else:
|
|
122
|
+
raise ValueError(f"Unsupported heading: {heading}: level {heading.level}")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _last_unescaped_bracket(text: str, index: int) -> str | None:
|
|
126
|
+
escaped = False
|
|
127
|
+
for i in range(index - 1, -1, -1):
|
|
128
|
+
ch = text[i]
|
|
129
|
+
if ch == "\\":
|
|
130
|
+
escaped = not escaped # Toggle escaping chain
|
|
131
|
+
continue
|
|
132
|
+
if ch in "[]":
|
|
133
|
+
if not escaped:
|
|
134
|
+
return ch
|
|
135
|
+
# Reset escape status after any non‑backslash char
|
|
136
|
+
escaped = False
|
|
137
|
+
return None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def find_markdown_text(
    pattern: re.Pattern[str], text: str, *, start_pos: int = 0
) -> re.Match[str] | None:
    """
    Return first regex `pattern` match in `text` not inside an existing link,
    searching from `start_pos`. Returns `None` if there is no such match.

    A match is considered inside a link when the most recent unescaped square
    bracket preceding the match start is an opening bracket "[".
    """

    pos = start_pos
    while True:
        match = pattern.search(text, pos)
        if match is None:
            return None

        last_bracket = _last_unescaped_bracket(text, match.start())
        if last_bracket != "[":
            return match

        # Skip this match and continue searching. A zero-width match would
        # otherwise be found again at the same position forever, so always
        # advance by at least one character.
        next_pos = match.end() if match.end() > match.start() else match.end() + 1
        if next_pos > len(text):
            return None
        pos = next_pos
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
## Tests
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_escape_markdown() -> None:
    # (input, expected) pairs, one per special character plus a combined case.
    cases = [
        ("", ""),
        ("Hello world", "Hello world"),
        ("`code`", "\\`code\\`"),
        ("*italic*", "\\*italic\\*"),
        ("_bold_", "\\_bold\\_"),
        ("{braces}", "\\{braces\\}"),
        ("# header", "\\# header"),
        ("1. item", "1\\. item"),
        ("line+break", "line\\+break"),
        ("dash-", "dash\\-"),
        ("!bang", "\\!bang"),
        ("backslash\\", "backslash\\\\"),
        (
            "Multiple *special* chars [here](#anchor).",
            "Multiple \\*special\\* chars \\[here\\]\\(\\#anchor\\)\\.",
        ),
    ]
    for raw, expected in cases:
        assert escape_markdown(raw) == expected
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def test_find_markdown_text() -> None:  # pragma: no cover
    def _search(term: str, haystack: str) -> re.Match[str] | None:
        return find_markdown_text(re.compile(term, re.IGNORECASE), haystack)

    # A term outside any link is matched.
    found = _search("Foo Bar", "Foo bar baz")
    assert found is not None and found.group(0) == "Foo bar"

    # The occurrence inside the link is skipped; the one after it is returned.
    linked_then_plain = "[Foo](http://example.com) something Foo"
    found = _search("Foo", linked_then_plain)
    assert found is not None
    assert found.start() > linked_then_plain.index(") ")
    assert linked_then_plain[found.start() : found.end()] == "Foo"

    # Nothing is returned when the term only occurs inside a link.
    assert _search("bar", "prefix [bar](http://example.com) suffix") is None
|
|
@@ -15,21 +15,30 @@ Tallies: TypeAlias = dict[str, int]
|
|
|
15
15
|
def import_subdirs(
|
|
16
16
|
parent_package_name: str,
|
|
17
17
|
parent_dir: Path,
|
|
18
|
-
subdir_names: list[str],
|
|
18
|
+
subdir_names: list[str] | None = None,
|
|
19
19
|
tallies: Tallies | None = None,
|
|
20
20
|
):
|
|
21
21
|
"""
|
|
22
22
|
Import all files in the given subdirectories of a single parent directory.
|
|
23
|
+
Wraps `pkgutil.iter_modules` to iterate over all modules in the subdirectories.
|
|
24
|
+
If `subdir_names` is `None`, will import all subdirectories.
|
|
23
25
|
"""
|
|
24
26
|
if tallies is None:
|
|
25
27
|
tallies = {}
|
|
28
|
+
if not subdir_names:
|
|
29
|
+
subdir_names = ["."]
|
|
26
30
|
|
|
27
31
|
for subdir_name in subdir_names:
|
|
28
|
-
|
|
32
|
+
if subdir_name == ".":
|
|
33
|
+
full_path = parent_dir
|
|
34
|
+
package_name = parent_package_name
|
|
35
|
+
else:
|
|
36
|
+
full_path = parent_dir / subdir_name
|
|
37
|
+
package_name = f"{parent_package_name}.{subdir_name}"
|
|
38
|
+
|
|
29
39
|
if not full_path.is_dir():
|
|
30
40
|
raise FileNotFoundError(f"Subdirectory not found: {full_path}")
|
|
31
41
|
|
|
32
|
-
package_name = f"{parent_package_name}.{subdir_name}"
|
|
33
42
|
for _module_finder, module_name, _is_pkg in pkgutil.iter_modules(path=[str(full_path)]):
|
|
34
43
|
importlib.import_module(f"{package_name}.{module_name}") # Propagate import errors
|
|
35
44
|
tallies[package_name] = tallies.get(package_name, 0) + 1
|
kash/utils/common/type_utils.py
CHANGED
|
@@ -15,35 +15,6 @@ def not_none(value: T | None, message: str | None = None) -> T:
|
|
|
15
15
|
return value
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
def is_truthy(value: Any, strict: bool = True) -> bool:
|
|
19
|
-
"""
|
|
20
|
-
True for all common string and non-string values for true. Useful for parsing
|
|
21
|
-
string values or command line arguments.
|
|
22
|
-
"""
|
|
23
|
-
truthy_values = {"true", "1", "yes", "on", "y"}
|
|
24
|
-
falsy_values = {"false", "0", "no", "off", "n", ""}
|
|
25
|
-
|
|
26
|
-
if value is None:
|
|
27
|
-
return False
|
|
28
|
-
elif isinstance(value, str):
|
|
29
|
-
value = value.strip().lower()
|
|
30
|
-
if value in truthy_values:
|
|
31
|
-
return True
|
|
32
|
-
elif value in falsy_values:
|
|
33
|
-
return False
|
|
34
|
-
elif isinstance(value, (int, float)):
|
|
35
|
-
return value != 0
|
|
36
|
-
elif isinstance(value, bool):
|
|
37
|
-
return value
|
|
38
|
-
elif isinstance(value, (list, tuple, set, dict)):
|
|
39
|
-
return len(value) > 0
|
|
40
|
-
|
|
41
|
-
if strict:
|
|
42
|
-
raise ValueError(f"Could not convert type {type(value)} to boolean: {repr(value)}")
|
|
43
|
-
|
|
44
|
-
return bool(value)
|
|
45
|
-
|
|
46
|
-
|
|
47
18
|
def as_dataclass(dict_data: dict[str, Any], dataclass_type: type[T]) -> T:
|
|
48
19
|
"""
|
|
49
20
|
Convert a dict recursively to dataclass object, raising an error if the data does
|
kash/utils/common/url.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
"""
|
|
2
2
|
A simple `Url` type and basic URL handling with no dependencies.
|
|
3
|
+
Simply a few convenience types and functions around `urllib`.
|
|
3
4
|
"""
|
|
4
5
|
|
|
5
6
|
import re
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import NewType
|
|
8
|
-
from urllib.parse import urlparse, urlsplit, urlunsplit
|
|
9
|
+
from urllib.parse import ParseResult, urlparse, urlsplit, urlunsplit
|
|
9
10
|
|
|
10
11
|
Url = NewType("Url", str)
|
|
11
12
|
"""
|
|
@@ -23,41 +24,82 @@ UnresolvedLocator = str | Locator
|
|
|
23
24
|
A string that may not be resolved to a URL or path.
|
|
24
25
|
"""
|
|
25
26
|
|
|
27
|
+
HTTP_ONLY = ["http", "https"]
|
|
28
|
+
HTTP_OR_FILE = HTTP_ONLY + ["file"]
|
|
26
29
|
|
|
27
|
-
|
|
30
|
+
|
|
31
|
+
def check_if_url(
    text: UnresolvedLocator, only_schemes: list[str] | None = None
) -> ParseResult | None:
    """
    Convenience function to check if a string or Path is a URL and if so return
    the `urlparse.ParseResult`; otherwise return `None`.

    Always returns `None` for `Path` objects, so that it's easy to use local
    paths and URLs (`Locator`s) interchangeably. Can provide `HTTP_ONLY` or
    `HTTP_OR_FILE` to restrict to only certain schemes; without a restriction,
    any non-empty scheme counts as a URL.
    """
    if isinstance(text, Path):
        return None

    raw = str(text)  # Handle paths or anything else unexpected.
    try:
        parsed = urlparse(raw)
    except ValueError:
        return None

    if only_schemes:
        is_match = parsed.scheme in only_schemes
    else:
        is_match = parsed.scheme != ""
    return parsed if is_match else None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def is_url(text: UnresolvedLocator, only_schemes: list[str] | None = None) -> bool:
    """
    Is `text` a URL (optionally restricted to `only_schemes`)? Boolean form of
    `check_if_url`, so it is also false for Paths, making it easy to use local
    paths and URLs interchangeably.
    """
    parsed = check_if_url(text, only_schemes)
    return parsed is not None
|
|
43
61
|
|
|
44
62
|
|
|
45
63
|
def is_file_url(url: str | Url) -> bool:
    """
    Does the URL begin with the `file://` prefix? Plain local file paths are
    not recognized.
    """
    file_prefix = "file://"
    return url.startswith(file_prefix)
|
|
50
68
|
|
|
51
69
|
|
|
52
|
-
def
|
|
70
|
+
def parse_http_url(url: str | Url) -> ParseResult:
    """
    Parse `url` and return the `ParseResult` if its scheme is http or https.
    Raises `ValueError` for any other scheme.
    """
    parts = urlparse(url)
    if parts.scheme not in ("http", "https"):
        raise ValueError(f"Not an http/https URL: {url}")
    return parts
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def parse_file_url(url: str | Url) -> Path:
    """
    Return the path component of a `file` URL as a `Path`. Raises `ValueError`
    if the URL's scheme is not `file`.
    """
    parts = urlparse(url)
    if parts.scheme != "file":
        raise ValueError(f"Not a file URL: {url}")
    return Path(parts.path)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def parse_s3_url(url: str | Url) -> tuple[str, str]:
    """
    Split an `s3` URL into `(bucket, key)`: the bucket is the URL's network
    location and the key is its path without leading slashes. Raises
    `ValueError` if the URL's scheme is not `s3`.
    """
    parts = urlparse(url)
    if parts.scheme != "s3":
        raise ValueError(f"Not an S3 URL: {url}")
    bucket = parts.netloc
    key = parts.path.lstrip("/")
    return bucket, key
|
|
61
103
|
|
|
62
104
|
|
|
63
105
|
def as_file_url(path: str | Path) -> Url:
|
|
@@ -73,24 +115,24 @@ def as_file_url(path: str | Path) -> Url:
|
|
|
73
115
|
|
|
74
116
|
|
|
75
117
|
def normalize_url(
|
|
76
|
-
url: Url,
|
|
118
|
+
url: Url,
|
|
119
|
+
check_schemes: list[str] | None = HTTP_OR_FILE,
|
|
120
|
+
drop_fragment: bool = True,
|
|
121
|
+
resolve_local_paths: bool = True,
|
|
77
122
|
) -> Url:
|
|
78
123
|
"""
|
|
79
124
|
Minimal URL normalization. By default also enforces http/https/file URLs and
|
|
80
|
-
removes fragment.
|
|
125
|
+
removes fragment. The set of allowed schemes can be
|
|
126
|
+
adjusted with `check_schemes`.
|
|
81
127
|
"""
|
|
82
|
-
# urlsplit is too forgiving.
|
|
83
|
-
if (
|
|
84
|
-
http_or_file_only
|
|
85
|
-
and not url.startswith("http://")
|
|
86
|
-
and not url.startswith("https://")
|
|
87
|
-
and not is_file_url(url)
|
|
88
|
-
):
|
|
89
|
-
raise ValueError(f"Expected http:// or https:// or file:// URL but found: {url}")
|
|
90
128
|
|
|
91
129
|
fragment: str | None
|
|
92
130
|
scheme, netloc, path, query, fragment = urlsplit(url)
|
|
93
131
|
|
|
132
|
+
# urlsplit is too forgiving.
|
|
133
|
+
if check_schemes and scheme not in check_schemes:
|
|
134
|
+
raise ValueError(f"Scheme {scheme!r} not in allowed schemes: {check_schemes!r}: {url}")
|
|
135
|
+
|
|
94
136
|
if drop_fragment:
|
|
95
137
|
fragment = None
|
|
96
138
|
if path == "/":
|
|
@@ -115,10 +157,10 @@ def test_is_url():
|
|
|
115
157
|
assert is_url("file://hostname/path/to/file") == True
|
|
116
158
|
assert is_url("invalid-url") == False
|
|
117
159
|
assert is_url("www.example.com") == False
|
|
118
|
-
assert is_url("http://example.com",
|
|
119
|
-
assert is_url("https://example.com",
|
|
120
|
-
assert is_url("ftp://example.com",
|
|
121
|
-
assert is_url("file:///path/to/file",
|
|
160
|
+
assert is_url("http://example.com", only_schemes=HTTP_ONLY) == True
|
|
161
|
+
assert is_url("https://example.com", only_schemes=HTTP_ONLY) == True
|
|
162
|
+
assert is_url("ftp://example.com", only_schemes=HTTP_ONLY) == False
|
|
163
|
+
assert is_url("file:///path/to/file", only_schemes=HTTP_ONLY) == False
|
|
122
164
|
|
|
123
165
|
|
|
124
166
|
def test_as_file_url():
|
|
@@ -148,8 +190,18 @@ def test_normalize_url():
|
|
|
148
190
|
normalize_url(Url("file:///path/to/file#fragment"), drop_fragment=False)
|
|
149
191
|
== "file:///path/to/file#fragment"
|
|
150
192
|
)
|
|
193
|
+
|
|
194
|
+
try:
|
|
195
|
+
normalize_url(url=Url("/not/a/URL"))
|
|
196
|
+
raise AssertionError()
|
|
197
|
+
except ValueError as e:
|
|
198
|
+
assert str(e) == "Scheme '' not in allowed schemes: ['http', 'https', 'file']: /not/a/URL"
|
|
199
|
+
|
|
151
200
|
try:
|
|
152
201
|
normalize_url(Url("ftp://example.com"))
|
|
153
202
|
raise AssertionError()
|
|
154
203
|
except ValueError as e:
|
|
155
|
-
assert
|
|
204
|
+
assert (
|
|
205
|
+
str(e)
|
|
206
|
+
== "Scheme 'ftp' not in allowed schemes: ['http', 'https', 'file']: ftp://example.com"
|
|
207
|
+
)
|
kash/utils/errors.py
CHANGED
|
@@ -139,6 +139,12 @@ class FileFormatError(ContentError):
|
|
|
139
139
|
pass
|
|
140
140
|
|
|
141
141
|
|
|
142
|
+
class ApiError(KashRuntimeError):
    """
    Raised when an API call returns something unexpected.
    """

    # Adds no state or behavior of its own; it only provides a distinct
    # exception type under `KashRuntimeError`.
    pass
|
|
146
|
+
|
|
147
|
+
|
|
142
148
|
def _nonfatal_exceptions() -> tuple[type[Exception], ...]:
|
|
143
149
|
exceptions: list[type[Exception]] = [SelfExplanatoryError, FileNotFoundError, IOError]
|
|
144
150
|
try:
|
|
@@ -1,14 +1,18 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
1
2
|
from dataclasses import dataclass
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
|
|
5
|
+
from kash.utils.file_utils.file_formats_model import file_format_info
|
|
6
|
+
|
|
4
7
|
|
|
5
8
|
@dataclass(frozen=True)
|
|
6
|
-
class
|
|
9
|
+
class DirInfo:
|
|
7
10
|
total_size: int
|
|
8
11
|
file_count: int
|
|
9
12
|
dir_count: int
|
|
10
13
|
symlink_count: int
|
|
11
14
|
other_count: int
|
|
15
|
+
format_tallies: dict[str, int] | None = None
|
|
12
16
|
|
|
13
17
|
@property
|
|
14
18
|
def total_count(self) -> int:
|
|
@@ -18,7 +22,7 @@ class SizeInfo:
|
|
|
18
22
|
return self.file_count == 0 and self.dir_count == 0 and self.other_count == 0
|
|
19
23
|
|
|
20
24
|
|
|
21
|
-
def
|
|
25
|
+
def get_dir_info(path: Path, tally_formats: bool = False) -> DirInfo:
|
|
22
26
|
"""
|
|
23
27
|
Get tallies of all files, directories, and other items in the given directory.
|
|
24
28
|
"""
|
|
@@ -29,10 +33,15 @@ def get_dir_size(path: Path) -> SizeInfo:
|
|
|
29
33
|
symlink_count = 0
|
|
30
34
|
other_count = 0
|
|
31
35
|
|
|
36
|
+
format_tallies: dict[str, int] = defaultdict(int)
|
|
37
|
+
|
|
32
38
|
for file_path in path.rglob("*"):
|
|
33
39
|
if file_path.is_file():
|
|
34
40
|
file_count += 1
|
|
35
41
|
total_size += file_path.stat().st_size
|
|
42
|
+
if tally_formats:
|
|
43
|
+
file_info = file_format_info(file_path)
|
|
44
|
+
format_tallies[file_info.as_str()] += 1
|
|
36
45
|
elif file_path.is_dir():
|
|
37
46
|
dir_count += 1
|
|
38
47
|
elif file_path.is_symlink():
|
|
@@ -40,9 +49,21 @@ def get_dir_size(path: Path) -> SizeInfo:
|
|
|
40
49
|
else:
|
|
41
50
|
other_count += 1
|
|
42
51
|
|
|
43
|
-
|
|
52
|
+
if format_tallies:
|
|
53
|
+
sorted_format_tallies = {k: format_tallies[k] for k in sorted(format_tallies)}
|
|
54
|
+
else:
|
|
55
|
+
sorted_format_tallies = None
|
|
56
|
+
|
|
57
|
+
return DirInfo(
|
|
58
|
+
total_size,
|
|
59
|
+
file_count,
|
|
60
|
+
dir_count,
|
|
61
|
+
symlink_count,
|
|
62
|
+
other_count,
|
|
63
|
+
sorted_format_tallies,
|
|
64
|
+
)
|
|
44
65
|
|
|
45
66
|
|
|
46
67
|
def is_nonempty_dir(path: str | Path) -> bool:
|
|
47
68
|
path = Path(path)
|
|
48
|
-
return path.is_dir() and
|
|
69
|
+
return path.is_dir() and get_dir_info(path).file_count > 0
|
|
@@ -24,12 +24,12 @@ class FileExt(Enum):
|
|
|
24
24
|
log = "log"
|
|
25
25
|
py = "py"
|
|
26
26
|
sh = "sh"
|
|
27
|
-
ksh = "ksh"
|
|
28
27
|
xsh = "xsh"
|
|
29
28
|
pdf = "pdf"
|
|
30
29
|
docx = "docx"
|
|
31
30
|
jpg = "jpg"
|
|
32
31
|
png = "png"
|
|
32
|
+
gif = "gif"
|
|
33
33
|
svg = "svg"
|
|
34
34
|
mp3 = "mp3"
|
|
35
35
|
m4a = "m4a"
|
|
@@ -49,13 +49,12 @@ class FileExt(Enum):
|
|
|
49
49
|
self.json,
|
|
50
50
|
self.py,
|
|
51
51
|
self.sh,
|
|
52
|
-
self.ksh,
|
|
53
52
|
self.xsh,
|
|
54
53
|
]
|
|
55
54
|
|
|
56
55
|
@property
|
|
57
56
|
def is_image(self) -> bool:
|
|
58
|
-
return self in [self.jpg, self.png]
|
|
57
|
+
return self in [self.jpg, self.png, self.gif, self.svg]
|
|
59
58
|
|
|
60
59
|
@classmethod
|
|
61
60
|
def parse(cls, ext_str: str) -> FileExt | None:
|