kash-shell 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. kash/actions/__init__.py +4 -4
  2. kash/actions/core/markdownify.py +5 -2
  3. kash/actions/core/readability.py +5 -2
  4. kash/actions/core/render_as_html.py +18 -0
  5. kash/actions/core/webpage_config.py +12 -4
  6. kash/commands/__init__.py +8 -20
  7. kash/commands/base/basic_file_commands.py +15 -0
  8. kash/commands/base/debug_commands.py +15 -2
  9. kash/commands/base/general_commands.py +27 -18
  10. kash/commands/base/logs_commands.py +1 -4
  11. kash/commands/base/model_commands.py +8 -8
  12. kash/commands/base/search_command.py +3 -2
  13. kash/commands/base/show_command.py +5 -3
  14. kash/commands/extras/parse_uv_lock.py +186 -0
  15. kash/commands/help/doc_commands.py +2 -31
  16. kash/commands/help/welcome.py +33 -0
  17. kash/commands/workspace/selection_commands.py +11 -6
  18. kash/commands/workspace/workspace_commands.py +19 -16
  19. kash/config/colors.py +2 -0
  20. kash/config/env_settings.py +72 -0
  21. kash/config/init.py +2 -2
  22. kash/config/logger.py +61 -59
  23. kash/config/logger_basic.py +12 -5
  24. kash/config/server_config.py +6 -6
  25. kash/config/settings.py +117 -67
  26. kash/config/setup.py +35 -9
  27. kash/config/suppress_warnings.py +30 -12
  28. kash/config/text_styles.py +3 -13
  29. kash/docs/load_api_docs.py +2 -1
  30. kash/docs/markdown/topics/a2_installation.md +7 -3
  31. kash/docs/markdown/topics/a3_getting_started.md +3 -2
  32. kash/docs/markdown/warning.md +3 -8
  33. kash/docs/markdown/welcome.md +4 -0
  34. kash/docs_base/load_recipe_snippets.py +1 -1
  35. kash/docs_base/recipes/{general_system_commands.ksh → general_system_commands.sh} +1 -1
  36. kash/{concepts → embeddings}/cosine.py +2 -1
  37. kash/embeddings/text_similarity.py +57 -0
  38. kash/exec/__init__.py +20 -3
  39. kash/exec/action_decorators.py +18 -4
  40. kash/exec/action_exec.py +41 -23
  41. kash/exec/action_registry.py +13 -48
  42. kash/exec/command_registry.py +2 -1
  43. kash/exec/fetch_url_metadata.py +4 -6
  44. kash/exec/importing.py +56 -0
  45. kash/exec/llm_transforms.py +6 -6
  46. kash/exec/precondition_registry.py +2 -1
  47. kash/exec/preconditions.py +16 -1
  48. kash/exec/shell_callable_action.py +33 -19
  49. kash/file_storage/file_store.py +23 -14
  50. kash/file_storage/item_file_format.py +13 -3
  51. kash/file_storage/metadata_dirs.py +11 -2
  52. kash/help/assistant.py +2 -2
  53. kash/help/assistant_instructions.py +2 -1
  54. kash/help/help_embeddings.py +2 -2
  55. kash/help/help_printing.py +14 -10
  56. kash/help/tldr_help.py +5 -3
  57. kash/llm_utils/clean_headings.py +1 -1
  58. kash/llm_utils/llm_api_keys.py +4 -4
  59. kash/llm_utils/llm_completion.py +2 -2
  60. kash/llm_utils/llm_features.py +68 -0
  61. kash/llm_utils/llm_messages.py +1 -2
  62. kash/llm_utils/llm_names.py +1 -1
  63. kash/llm_utils/llms.py +17 -12
  64. kash/local_server/__init__.py +5 -2
  65. kash/local_server/local_server.py +56 -46
  66. kash/local_server/local_server_commands.py +15 -15
  67. kash/local_server/local_server_routes.py +2 -2
  68. kash/local_server/local_url_formatters.py +1 -1
  69. kash/mcp/__init__.py +5 -2
  70. kash/mcp/mcp_cli.py +54 -17
  71. kash/mcp/mcp_server_commands.py +5 -6
  72. kash/mcp/mcp_server_routes.py +14 -11
  73. kash/mcp/mcp_server_sse.py +61 -34
  74. kash/mcp/mcp_server_stdio.py +0 -8
  75. kash/media_base/audio_processing.py +81 -7
  76. kash/media_base/media_cache.py +18 -18
  77. kash/media_base/media_services.py +1 -1
  78. kash/media_base/media_tools.py +6 -6
  79. kash/media_base/services/local_file_media.py +2 -2
  80. kash/media_base/{speech_transcription.py → transcription_deepgram.py} +25 -109
  81. kash/media_base/transcription_format.py +73 -0
  82. kash/media_base/transcription_whisper.py +38 -0
  83. kash/model/__init__.py +73 -5
  84. kash/model/actions_model.py +38 -4
  85. kash/model/concept_model.py +30 -0
  86. kash/model/items_model.py +56 -13
  87. kash/model/params_model.py +24 -0
  88. kash/shell/completions/completion_scoring.py +37 -5
  89. kash/shell/output/kerm_codes.py +1 -2
  90. kash/shell/output/shell_formatting.py +14 -4
  91. kash/shell/shell_main.py +2 -2
  92. kash/shell/utils/exception_printing.py +6 -0
  93. kash/shell/utils/native_utils.py +26 -20
  94. kash/text_handling/custom_sliding_transforms.py +12 -4
  95. kash/text_handling/doc_normalization.py +6 -2
  96. kash/text_handling/markdown_render.py +117 -0
  97. kash/text_handling/markdown_utils.py +204 -0
  98. kash/utils/common/import_utils.py +12 -3
  99. kash/utils/common/type_utils.py +0 -29
  100. kash/utils/common/url.py +80 -28
  101. kash/utils/errors.py +6 -0
  102. kash/utils/file_utils/{dir_size.py → dir_info.py} +25 -4
  103. kash/utils/file_utils/file_ext.py +2 -3
  104. kash/utils/file_utils/file_formats.py +28 -2
  105. kash/utils/file_utils/file_formats_model.py +50 -19
  106. kash/utils/file_utils/filename_parsing.py +10 -4
  107. kash/web_content/dir_store.py +1 -2
  108. kash/web_content/file_cache_utils.py +37 -10
  109. kash/web_content/file_processing.py +68 -0
  110. kash/web_content/local_file_cache.py +12 -9
  111. kash/web_content/web_extract.py +8 -3
  112. kash/web_content/web_fetch.py +12 -4
  113. kash/web_gen/tabbed_webpage.py +5 -2
  114. kash/web_gen/templates/base_styles.css.jinja +120 -14
  115. kash/web_gen/templates/base_webpage.html.jinja +60 -13
  116. kash/web_gen/templates/content_styles.css.jinja +4 -2
  117. kash/web_gen/templates/item_view.html.jinja +2 -2
  118. kash/web_gen/templates/tabbed_webpage.html.jinja +1 -2
  119. kash/workspaces/__init__.py +15 -2
  120. kash/workspaces/selections.py +18 -3
  121. kash/workspaces/source_items.py +4 -2
  122. kash/workspaces/workspace_output.py +11 -4
  123. kash/workspaces/workspaces.py +5 -11
  124. kash/xonsh_custom/command_nl_utils.py +40 -19
  125. kash/xonsh_custom/custom_shell.py +44 -12
  126. kash/xonsh_custom/customize_prompt.py +39 -21
  127. kash/xonsh_custom/load_into_xonsh.py +26 -27
  128. kash/xonsh_custom/shell_load_commands.py +2 -2
  129. kash/xonsh_custom/xonsh_completers.py +2 -249
  130. kash/xonsh_custom/xonsh_keybindings.py +282 -0
  131. kash/xonsh_custom/xonsh_modern_tools.py +3 -3
  132. kash/xontrib/kash_extension.py +5 -6
  133. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/METADATA +26 -12
  134. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/RECORD +140 -140
  135. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/entry_points.txt +1 -1
  136. kash/concepts/concept_formats.py +0 -23
  137. kash/concepts/text_similarity.py +0 -112
  138. kash/shell/clideps/api_keys.py +0 -99
  139. kash/shell/clideps/dotenv_setup.py +0 -114
  140. kash/shell/clideps/dotenv_utils.py +0 -89
  141. kash/shell/clideps/pkg_deps.py +0 -232
  142. kash/shell/clideps/platforms.py +0 -11
  143. kash/shell/clideps/terminal_features.py +0 -56
  144. kash/shell/utils/osc_utils.py +0 -95
  145. kash/shell/utils/terminal_images.py +0 -133
  146. kash/text_handling/markdown_util.py +0 -167
  147. kash/utils/common/atomic_var.py +0 -158
  148. kash/utils/common/string_replace.py +0 -93
  149. kash/utils/common/string_template.py +0 -101
  150. /kash/docs_base/recipes/{python_dev_commands.ksh → python_dev_commands.sh} +0 -0
  151. /kash/docs_base/recipes/{tldr_standard_commands.ksh → tldr_standard_commands.sh} +0 -0
  152. /kash/{concepts → embeddings}/embeddings.py +0 -0
  153. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/WHEEL +0 -0
  154. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,117 @@
1
+ from textwrap import dedent
2
+
3
+ import marko
4
+ import regex
5
+ from marko.block import HTMLBlock
6
+ from marko.ext.gfm import GFM
7
+ from marko.helpers import MarkoExtension
8
+
9
+
10
# Renderer mixin that makes HTML <div> blocks render as standalone paragraphs,
# surrounded by blank lines, so downstream wrapping (flowmark etc.) is not broken.
class CustomHTMLBlockMixin:
    div_pattern = regex.compile(r"^\s*<div\b", regex.IGNORECASE)

    def render_html_block(self, element: HTMLBlock) -> str:
        # Delegate to the next renderer in the MRO first (GFM filtering).
        rendered = super().render_html_block(element)  # pyright: ignore

        if not self.div_pattern.match(element.body.strip()):
            # Not a div: pass the GFM-filtered body through unchanged.
            return rendered

        # A div: isolate the filtered result on its own lines.
        return f"\n{rendered.strip()}\n"
26
+
27
+
28
# Shared converter: GFM plus footnotes, with our div-handling mixin layered on
# top. Extensions later in this list are earlier in the MRO.
_div_extension = MarkoExtension(renderer_mixins=[CustomHTMLBlockMixin])
MARKO_GFM = marko.Markdown(extensions=["footnote", GFM, _div_extension])
33
+
34
+
35
# Glyph used to replace the default footnote backref arrow (&#8617;).
FOOTNOTE_UP_ARROW = "&nbsp;↑&nbsp;"

# Old and new footnote backref markup, precomputed once.
_BACKREF_OLD = """class="footnote">&#8617;</a>"""
_BACKREF_NEW = f"""class="footnote">{FOOTNOTE_UP_ARROW}</a>"""


def html_postprocess(html: str) -> str:
    """
    Final tweaks to the HTML: swap the default footnote backref arrow for a
    padded up arrow.
    """
    return html.replace(_BACKREF_OLD, _BACKREF_NEW)
46
+
47
+
48
def markdown_to_html(markdown: str, converter: marko.Markdown = MARKO_GFM) -> str:
    """
    Convert Markdown to HTML.

    Wraps div blocks with newlines for better Markdown compatibility.

    Output passes through raw HTML! Note per GFM, unsafe script tags etc
    are [allowed in some cases](https://github.github.com/gfm/#example-140) so
    additional sanitization is needed if input isn't trusted.
    """
    html = converter.convert(markdown)
    # Apply final HTML tweaks (footnote backref arrow replacement).
    # (Removed an unreachable duplicate `return html` that followed this return.)
    return html_postprocess(html)
61
+
62
+
63
## Tests


def test_markdown_to_html():
    source_markdown = dedent(
        """
        # Heading

        This is a paragraph and a [link](https://example.com).

        - Item 1
        - Item 2

        ## Subheading

        This is a paragraph with a <span>span</span> tag.
        This is a paragraph with a <div>div</div> tag.
        This is a paragraph with an <a href='https://example.com'>example link</a>.

        <div class="div1">This is a div.</div>

        <div class="div2">This is a second div.
        <iframe src="https://example.com">Inline iframe, note this is sanitized</iframe>
        </div>

        <!-- Script tag in a block, note this isn't sanitized -->
        <script>console.log("Javascript block!");</script>
        """
    )

    expected_html = dedent(
        """
        <h1>Heading</h1>
        <p>This is a paragraph and a <a href="https://example.com">link</a>.</p>
        <ul>
        <li>Item 1</li>
        <li>Item 2</li>
        </ul>
        <h2>Subheading</h2>
        <p>This is a paragraph with a <span>span</span> tag.
        This is a paragraph with a <div>div</div> tag.
        This is a paragraph with an <a href='https://example.com'>example link</a>.</p>

        <div class="div1">This is a div.</div>

        <div class="div2">This is a second div.
        &lt;iframe src="https://example.com">Inline iframe, note this is sanitized</iframe>
        </div>
        <!-- Script tag in a block, note this isn't sanitized -->
        <script>console.log("Javascript block!");</script>
        """
    )

    # Convert once, echo the output for debugging, then compare.
    actual_html = markdown_to_html(source_markdown)
    print(actual_html)
    assert actual_html.strip() == expected_html.strip()
@@ -0,0 +1,204 @@
1
+ import re
2
+ from typing import Any
3
+
4
+ import marko
5
+ import regex
6
+ from marko.block import Heading, ListItem
7
+ from marko.inline import Link
8
+
9
+ from kash.config.logger import get_logger
10
+ from kash.utils.common.url import Url
11
+
12
+ log = get_logger(__name__)
13
+
14
# Characters that commonly need escaping in Markdown inline text.
MARKDOWN_ESCAPE_CHARS = r"([\\`*_{}\[\]()#+.!-])"
MARKDOWN_ESCAPE_RE = re.compile(MARKDOWN_ESCAPE_CHARS)


def escape_markdown(text: str) -> str:
    """
    Backslash-escape every character with special meaning in Markdown.
    """
    return MARKDOWN_ESCAPE_RE.sub(lambda m: "\\" + m.group(1), text)
24
+
25
+
26
def as_bullet_points(values: list[Any]) -> str:
    """
    Convert a list of values to a Markdown bullet-point list. If a value is a string,
    it is treated like Markdown. If it's something else it's converted to a string
    and also escaped for Markdown.
    """
    points: list[str] = []
    for value in values:
        # Branch on type *before* touching string methods: the previous code
        # called value.replace() first, crashing with AttributeError on any
        # non-string value and leaving the escape branch unreachable.
        if isinstance(value, str):
            text = value
        else:
            text = escape_markdown(str(value))
        # Bullet items must be single-line, so collapse newlines.
        points.append(text.replace("\n", " ").strip())

    return "\n\n".join(f"- {point}" for point in points)
41
+
42
+
43
def markdown_link(text: str, url: str | Url) -> str:
    """
    Build a Markdown inline link, escaping square brackets in the link text.
    """
    escaped_text = text.replace("[", "\\[").replace("]", "\\]")
    return f"[{escaped_text}]({url})"
49
+
50
+
51
def is_markdown_header(markdown: str) -> bool:
    """
    Is the start of this content a Markdown header?
    """
    # One or more '#' followed by a space at the very start of the content.
    return re.match(r"^#+ ", markdown) is not None
56
+
57
+
58
def _tree_links(element, include_internal=False):
    """
    Collect link destinations from a parsed Markdown tree, depth-first.
    Internal anchor links ("#...") are skipped unless `include_internal` is set.
    """
    found: list = []

    def _walk(node):
        if isinstance(node, Link):
            # Links are leaves for our purposes; record the destination.
            if include_internal or not node.dest.startswith("#"):
                found.append(node.dest)
        elif hasattr(node, "children"):
            for child in node.children:
                _walk(child)

    _walk(element)
    return found
73
+
74
+
75
def extract_links(file_path: str, include_internal=False) -> list[str]:
    """
    Extract all links from a Markdown file. Future: Include textual and section context.
    """
    with open(file_path) as fh:
        content = fh.read()
    return _tree_links(marko.parse(content), include_internal)
84
+
85
+
86
+ def _extract_text(element: Any) -> str:
87
+ if isinstance(element, str):
88
+ return element
89
+ elif hasattr(element, "children"):
90
+ return "".join(_extract_text(child) for child in element.children)
91
+ else:
92
+ return ""
93
+
94
+
95
def _tree_bullet_points(element: marko.block.Document) -> list[str]:
    """
    Collect the stripped text of every list item in a parsed Markdown tree,
    in document order.
    """
    items: list[str] = []

    def _walk(node) -> None:
        if isinstance(node, ListItem):
            # List items are recorded whole; no recursion into their children.
            items.append(_extract_text(node).strip())
            return
        for child in getattr(node, "children", []):
            _walk(child)

    _walk(element)
    return items
107
+
108
+
109
def extract_bullet_points(content: str) -> list[str]:
    """
    Extract list item values from a Markdown file.
    """
    return _tree_bullet_points(marko.parse(content))
116
+
117
+
118
def _type_from_heading(heading: Heading) -> str:
    """
    Map a Markdown heading node to its HTML tag name ("h1".."h6").
    Raises ValueError for any other level.
    """
    level = heading.level
    if level not in (1, 2, 3, 4, 5, 6):
        raise ValueError(f"Unsupported heading: {heading}: level {heading.level}")
    return f"h{level}"
123
+
124
+
125
+ def _last_unescaped_bracket(text: str, index: int) -> str | None:
126
+ escaped = False
127
+ for i in range(index - 1, -1, -1):
128
+ ch = text[i]
129
+ if ch == "\\":
130
+ escaped = not escaped # Toggle escaping chain
131
+ continue
132
+ if ch in "[]":
133
+ if not escaped:
134
+ return ch
135
+ # Reset escape status after any non‑backslash char
136
+ escaped = False
137
+ return None
138
+
139
+
140
def find_markdown_text(
    pattern: re.Pattern[str], text: str, *, start_pos: int = 0
) -> re.Match[str] | None:
    """
    Return first regex `pattern` match in `text` not inside an existing link.

    A match is considered inside a link when the most recent unescaped square
    bracket preceding the match start is an opening bracket "[".
    """
    pos = start_pos
    while (match := pattern.search(text, pos)) is not None:
        if _last_unescaped_bracket(text, match.start()) != "[":
            return match
        # Inside link text: resume the search just past this occurrence.
        pos = match.end()
    return None
162
+
163
+
164
## Tests


def test_escape_markdown() -> None:
    # Table-driven: each pair is (raw input, expected escaped output).
    cases = [
        ("", ""),
        ("Hello world", "Hello world"),
        ("`code`", "\\`code\\`"),
        ("*italic*", "\\*italic\\*"),
        ("_bold_", "\\_bold\\_"),
        ("{braces}", "\\{braces\\}"),
        ("# header", "\\# header"),
        ("1. item", "1\\. item"),
        ("line+break", "line\\+break"),
        ("dash-", "dash\\-"),
        ("!bang", "\\!bang"),
        ("backslash\\", "backslash\\\\"),
        (
            "Multiple *special* chars [here](#anchor).",
            "Multiple \\*special\\* chars \\[here\\]\\(\\#anchor\\)\\.",
        ),
    ]
    for raw, expected in cases:
        assert escape_markdown(raw) == expected
183
+
184
+
185
def test_find_markdown_text() -> None:  # pragma: no cover
    # A plain occurrence outside any link is matched directly.
    match = find_markdown_text(re.compile("Foo Bar", re.IGNORECASE), "Foo bar baz")
    assert match is not None and match.group(0) == "Foo bar"

    # An occurrence inside link text is skipped; the next one outside is returned.
    text = "[Foo](http://example.com) something Foo"
    match = find_markdown_text(re.compile("Foo", re.IGNORECASE), text)
    assert match is not None
    assert match.start() > text.index(") ")
    assert text[match.start() : match.end()] == "Foo"

    # When every occurrence is inside a link, nothing is returned.
    text = "prefix [bar](http://example.com) suffix"
    match = find_markdown_text(re.compile("bar", re.IGNORECASE), text)
    assert match is None
@@ -15,21 +15,30 @@ Tallies: TypeAlias = dict[str, int]
15
15
  def import_subdirs(
16
16
  parent_package_name: str,
17
17
  parent_dir: Path,
18
- subdir_names: list[str],
18
+ subdir_names: list[str] | None = None,
19
19
  tallies: Tallies | None = None,
20
20
  ):
21
21
  """
22
22
  Import all files in the given subdirectories of a single parent directory.
23
+ Wraps `pkgutil.iter_modules` to iterate over all modules in the subdirectories.
24
+ If `subdir_names` is `None`, will import all subdirectories.
23
25
  """
24
26
  if tallies is None:
25
27
  tallies = {}
28
+ if not subdir_names:
29
+ subdir_names = ["."]
26
30
 
27
31
  for subdir_name in subdir_names:
28
- full_path = parent_dir / subdir_name
32
+ if subdir_name == ".":
33
+ full_path = parent_dir
34
+ package_name = parent_package_name
35
+ else:
36
+ full_path = parent_dir / subdir_name
37
+ package_name = f"{parent_package_name}.{subdir_name}"
38
+
29
39
  if not full_path.is_dir():
30
40
  raise FileNotFoundError(f"Subdirectory not found: {full_path}")
31
41
 
32
- package_name = f"{parent_package_name}.{subdir_name}"
33
42
  for _module_finder, module_name, _is_pkg in pkgutil.iter_modules(path=[str(full_path)]):
34
43
  importlib.import_module(f"{package_name}.{module_name}") # Propagate import errors
35
44
  tallies[package_name] = tallies.get(package_name, 0) + 1
@@ -15,35 +15,6 @@ def not_none(value: T | None, message: str | None = None) -> T:
15
15
  return value
16
16
 
17
17
 
18
- def is_truthy(value: Any, strict: bool = True) -> bool:
19
- """
20
- True for all common string and non-string values for true. Useful for parsing
21
- string values or command line arguments.
22
- """
23
- truthy_values = {"true", "1", "yes", "on", "y"}
24
- falsy_values = {"false", "0", "no", "off", "n", ""}
25
-
26
- if value is None:
27
- return False
28
- elif isinstance(value, str):
29
- value = value.strip().lower()
30
- if value in truthy_values:
31
- return True
32
- elif value in falsy_values:
33
- return False
34
- elif isinstance(value, (int, float)):
35
- return value != 0
36
- elif isinstance(value, bool):
37
- return value
38
- elif isinstance(value, (list, tuple, set, dict)):
39
- return len(value) > 0
40
-
41
- if strict:
42
- raise ValueError(f"Could not convert type {type(value)} to boolean: {repr(value)}")
43
-
44
- return bool(value)
45
-
46
-
47
18
  def as_dataclass(dict_data: dict[str, Any], dataclass_type: type[T]) -> T:
48
19
  """
49
20
  Convert a dict recursively to dataclass object, raising an error if the data does
kash/utils/common/url.py CHANGED
@@ -1,11 +1,12 @@
1
1
  """
2
2
  A simple `Url` type and basic URL handling with no dependencies.
3
+ Simply a few convenience types and functions around `urllib`.
3
4
  """
4
5
 
5
6
  import re
6
7
  from pathlib import Path
7
8
  from typing import NewType
8
- from urllib.parse import urlparse, urlsplit, urlunsplit
9
+ from urllib.parse import ParseResult, urlparse, urlsplit, urlunsplit
9
10
 
10
11
  Url = NewType("Url", str)
11
12
  """
@@ -23,41 +24,82 @@ UnresolvedLocator = str | Locator
23
24
  A string that may not be resolved to a URL or path.
24
25
  """
25
26
 
27
+ HTTP_ONLY = ["http", "https"]
28
+ HTTP_OR_FILE = HTTP_ONLY + ["file"]
26
29
 
27
- def is_url(text: UnresolvedLocator, http_only: bool = False) -> bool:
30
+
31
+ def check_if_url(
32
+ text: UnresolvedLocator, only_schemes: list[str] | None = None
33
+ ) -> ParseResult | None:
28
34
  """
29
- Check if a string is a URL. For convenience, also returns false for
30
- Paths, so that it's easy to use local paths and URLs interchangeably.
35
+ Convenience function to check if a string or Path is a URL and if so return
36
+ the `urlparse.ParseResult`.
37
+
38
+ Also returns false for Paths, so that it's easy to use local paths and URLs
39
+ (`Locator`s) interchangeably. Can provide `HTTP_ONLY` or `HTTP_OR_FILE` to
40
+ restrict to only certain schemes.
31
41
  """
32
42
  if isinstance(text, Path):
33
- return False
43
+ return None
34
44
  text = str(text) # Handle paths or anything else unexpected.
35
45
  try:
36
46
  result = urlparse(text)
37
- if http_only:
38
- return result.scheme in ["http", "https"]
47
+ if only_schemes:
48
+ return result if result.scheme in only_schemes else None
39
49
  else:
40
- return result.scheme != ""
50
+ return result if result.scheme != "" else None
41
51
  except ValueError:
42
- return False
52
+ return None
53
+
54
+
55
+ def is_url(text: UnresolvedLocator, only_schemes: list[str] | None = None) -> bool:
56
+ """
57
+ Check if a string is a URL. For convenience, also returns false for
58
+ Paths, so that it's easy to use local paths and URLs interchangeably.
59
+ """
60
+ return check_if_url(text, only_schemes) is not None
43
61
 
44
62
 
45
63
  def is_file_url(url: str | Url) -> bool:
46
64
  """
47
- Is URL a file:// URL?
65
+ Is URL a file:// URL? Does not check for local file paths.
48
66
  """
49
67
  return url.startswith("file://")
50
68
 
51
69
 
52
- def parse_file_url(url: str | Url) -> Path | None:
70
+ def parse_http_url(url: str | Url) -> ParseResult:
53
71
  """
54
- Parse a file URL and return the path, or None if not a file URL.
72
+ Parse an http/https URL and return the parsed result, raising ValueError if
73
+ not an http/https URL.
74
+ """
75
+ parsed_url = urlparse(url)
76
+ if parsed_url.scheme in ("http", "https"):
77
+ return parsed_url
78
+ else:
79
+ raise ValueError(f"Not an http/https URL: {url}")
80
+
81
+
82
+ def parse_file_url(url: str | Url) -> Path:
83
+ """
84
+ Parse a file URL and return the path, raising ValueError if not a file URL.
55
85
  """
56
86
  parsed_url = urlparse(url)
57
87
  if parsed_url.scheme == "file":
58
88
  return Path(parsed_url.path)
59
89
  else:
60
- return None
90
+ raise ValueError(f"Not a file URL: {url}")
91
+
92
+
93
+ def parse_s3_url(url: str | Url) -> tuple[str, str]:
94
+ """
95
+ Parse an S3 URL and return the bucket and key, raising ValueError if not an
96
+ S3 URL.
97
+ """
98
+ parsed_url = urlparse(url)
99
+ if parsed_url.scheme == "s3":
100
+ return parsed_url.netloc, parsed_url.path.lstrip("/")
101
+ else:
102
+ raise ValueError(f"Not an S3 URL: {url}")
61
103
 
62
104
 
63
105
  def as_file_url(path: str | Path) -> Url:
@@ -73,24 +115,24 @@ def as_file_url(path: str | Path) -> Url:
73
115
 
74
116
 
75
117
  def normalize_url(
76
- url: Url, http_or_file_only=True, drop_fragment=True, resolve_local_paths=True
118
+ url: Url,
119
+ check_schemes: list[str] | None = HTTP_OR_FILE,
120
+ drop_fragment: bool = True,
121
+ resolve_local_paths: bool = True,
77
122
  ) -> Url:
78
123
  """
79
124
  Minimal URL normalization. By default also enforces http/https/file URLs and
80
- removes fragment.
125
+ removes fragment. By default enforces http/https/file URLs but this can be
126
+ adjusted with `check_schemes`.
81
127
  """
82
- # urlsplit is too forgiving.
83
- if (
84
- http_or_file_only
85
- and not url.startswith("http://")
86
- and not url.startswith("https://")
87
- and not is_file_url(url)
88
- ):
89
- raise ValueError(f"Expected http:// or https:// or file:// URL but found: {url}")
90
128
 
91
129
  fragment: str | None
92
130
  scheme, netloc, path, query, fragment = urlsplit(url)
93
131
 
132
+ # urlsplit is too forgiving.
133
+ if check_schemes and scheme not in check_schemes:
134
+ raise ValueError(f"Scheme {scheme!r} not in allowed schemes: {check_schemes!r}: {url}")
135
+
94
136
  if drop_fragment:
95
137
  fragment = None
96
138
  if path == "/":
@@ -115,10 +157,10 @@ def test_is_url():
115
157
  assert is_url("file://hostname/path/to/file") == True
116
158
  assert is_url("invalid-url") == False
117
159
  assert is_url("www.example.com") == False
118
- assert is_url("http://example.com", http_only=True) == True
119
- assert is_url("https://example.com", http_only=True) == True
120
- assert is_url("ftp://example.com", http_only=True) == False
121
- assert is_url("file:///path/to/file", http_only=True) == False
160
+ assert is_url("http://example.com", only_schemes=HTTP_ONLY) == True
161
+ assert is_url("https://example.com", only_schemes=HTTP_ONLY) == True
162
+ assert is_url("ftp://example.com", only_schemes=HTTP_ONLY) == False
163
+ assert is_url("file:///path/to/file", only_schemes=HTTP_ONLY) == False
122
164
 
123
165
 
124
166
  def test_as_file_url():
@@ -148,8 +190,18 @@ def test_normalize_url():
148
190
  normalize_url(Url("file:///path/to/file#fragment"), drop_fragment=False)
149
191
  == "file:///path/to/file#fragment"
150
192
  )
193
+
194
+ try:
195
+ normalize_url(url=Url("/not/a/URL"))
196
+ raise AssertionError()
197
+ except ValueError as e:
198
+ assert str(e) == "Scheme '' not in allowed schemes: ['http', 'https', 'file']: /not/a/URL"
199
+
151
200
  try:
152
201
  normalize_url(Url("ftp://example.com"))
153
202
  raise AssertionError()
154
203
  except ValueError as e:
155
- assert str(e) == "Expected http:// or https:// or file:// URL but found: ftp://example.com"
204
+ assert (
205
+ str(e)
206
+ == "Scheme 'ftp' not in allowed schemes: ['http', 'https', 'file']: ftp://example.com"
207
+ )
kash/utils/errors.py CHANGED
@@ -139,6 +139,12 @@ class FileFormatError(ContentError):
139
139
  pass
140
140
 
141
141
 
142
+ class ApiError(KashRuntimeError):
143
+ """Raised when an API call returns something unexpected."""
144
+
145
+ pass
146
+
147
+
142
148
  def _nonfatal_exceptions() -> tuple[type[Exception], ...]:
143
149
  exceptions: list[type[Exception]] = [SelfExplanatoryError, FileNotFoundError, IOError]
144
150
  try:
@@ -1,14 +1,18 @@
1
+ from collections import defaultdict
1
2
  from dataclasses import dataclass
2
3
  from pathlib import Path
3
4
 
5
+ from kash.utils.file_utils.file_formats_model import file_format_info
6
+
4
7
 
5
8
  @dataclass(frozen=True)
6
- class SizeInfo:
9
+ class DirInfo:
7
10
  total_size: int
8
11
  file_count: int
9
12
  dir_count: int
10
13
  symlink_count: int
11
14
  other_count: int
15
+ format_tallies: dict[str, int] | None = None
12
16
 
13
17
  @property
14
18
  def total_count(self) -> int:
@@ -18,7 +22,7 @@ class SizeInfo:
18
22
  return self.file_count == 0 and self.dir_count == 0 and self.other_count == 0
19
23
 
20
24
 
21
- def get_dir_size(path: Path) -> SizeInfo:
25
+ def get_dir_info(path: Path, tally_formats: bool = False) -> DirInfo:
22
26
  """
23
27
  Get tallies of all files, directories, and other items in the given directory.
24
28
  """
@@ -29,10 +33,15 @@ def get_dir_size(path: Path) -> SizeInfo:
29
33
  symlink_count = 0
30
34
  other_count = 0
31
35
 
36
+ format_tallies: dict[str, int] = defaultdict(int)
37
+
32
38
  for file_path in path.rglob("*"):
33
39
  if file_path.is_file():
34
40
  file_count += 1
35
41
  total_size += file_path.stat().st_size
42
+ if tally_formats:
43
+ file_info = file_format_info(file_path)
44
+ format_tallies[file_info.as_str()] += 1
36
45
  elif file_path.is_dir():
37
46
  dir_count += 1
38
47
  elif file_path.is_symlink():
@@ -40,9 +49,21 @@ def get_dir_size(path: Path) -> SizeInfo:
40
49
  else:
41
50
  other_count += 1
42
51
 
43
- return SizeInfo(total_size, file_count, dir_count, symlink_count, other_count)
52
+ if format_tallies:
53
+ sorted_format_tallies = {k: format_tallies[k] for k in sorted(format_tallies)}
54
+ else:
55
+ sorted_format_tallies = None
56
+
57
+ return DirInfo(
58
+ total_size,
59
+ file_count,
60
+ dir_count,
61
+ symlink_count,
62
+ other_count,
63
+ sorted_format_tallies,
64
+ )
44
65
 
45
66
 
46
67
  def is_nonempty_dir(path: str | Path) -> bool:
47
68
  path = Path(path)
48
- return path.is_dir() and get_dir_size(path).file_count > 0
69
+ return path.is_dir() and get_dir_info(path).file_count > 0
@@ -24,12 +24,12 @@ class FileExt(Enum):
24
24
  log = "log"
25
25
  py = "py"
26
26
  sh = "sh"
27
- ksh = "ksh"
28
27
  xsh = "xsh"
29
28
  pdf = "pdf"
30
29
  docx = "docx"
31
30
  jpg = "jpg"
32
31
  png = "png"
32
+ gif = "gif"
33
33
  svg = "svg"
34
34
  mp3 = "mp3"
35
35
  m4a = "m4a"
@@ -49,13 +49,12 @@ class FileExt(Enum):
49
49
  self.json,
50
50
  self.py,
51
51
  self.sh,
52
- self.ksh,
53
52
  self.xsh,
54
53
  ]
55
54
 
56
55
  @property
57
56
  def is_image(self) -> bool:
58
- return self in [self.jpg, self.png]
57
+ return self in [self.jpg, self.png, self.gif, self.svg]
59
58
 
60
59
  @classmethod
61
60
  def parse(cls, ext_str: str) -> FileExt | None: