kash-shell 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. kash/actions/__init__.py +4 -4
  2. kash/actions/core/markdownify.py +5 -2
  3. kash/actions/core/readability.py +5 -2
  4. kash/actions/core/render_as_html.py +18 -0
  5. kash/actions/core/webpage_config.py +12 -4
  6. kash/commands/__init__.py +8 -20
  7. kash/commands/base/basic_file_commands.py +15 -0
  8. kash/commands/base/debug_commands.py +15 -2
  9. kash/commands/base/general_commands.py +27 -18
  10. kash/commands/base/logs_commands.py +1 -4
  11. kash/commands/base/model_commands.py +8 -8
  12. kash/commands/base/search_command.py +3 -2
  13. kash/commands/base/show_command.py +5 -3
  14. kash/commands/extras/parse_uv_lock.py +186 -0
  15. kash/commands/help/doc_commands.py +2 -31
  16. kash/commands/help/welcome.py +33 -0
  17. kash/commands/workspace/selection_commands.py +11 -6
  18. kash/commands/workspace/workspace_commands.py +19 -16
  19. kash/config/colors.py +2 -0
  20. kash/config/env_settings.py +72 -0
  21. kash/config/init.py +2 -2
  22. kash/config/logger.py +61 -59
  23. kash/config/logger_basic.py +12 -5
  24. kash/config/server_config.py +6 -6
  25. kash/config/settings.py +117 -67
  26. kash/config/setup.py +35 -9
  27. kash/config/suppress_warnings.py +30 -12
  28. kash/config/text_styles.py +3 -13
  29. kash/docs/load_api_docs.py +2 -1
  30. kash/docs/markdown/topics/a2_installation.md +7 -3
  31. kash/docs/markdown/topics/a3_getting_started.md +3 -2
  32. kash/docs/markdown/warning.md +3 -8
  33. kash/docs/markdown/welcome.md +4 -0
  34. kash/docs_base/load_recipe_snippets.py +1 -1
  35. kash/docs_base/recipes/{general_system_commands.ksh → general_system_commands.sh} +1 -1
  36. kash/{concepts → embeddings}/cosine.py +2 -1
  37. kash/embeddings/text_similarity.py +57 -0
  38. kash/exec/__init__.py +20 -3
  39. kash/exec/action_decorators.py +18 -4
  40. kash/exec/action_exec.py +41 -23
  41. kash/exec/action_registry.py +13 -48
  42. kash/exec/command_registry.py +2 -1
  43. kash/exec/fetch_url_metadata.py +4 -6
  44. kash/exec/importing.py +56 -0
  45. kash/exec/llm_transforms.py +6 -6
  46. kash/exec/precondition_registry.py +2 -1
  47. kash/exec/preconditions.py +16 -1
  48. kash/exec/shell_callable_action.py +33 -19
  49. kash/file_storage/file_store.py +23 -14
  50. kash/file_storage/item_file_format.py +13 -3
  51. kash/file_storage/metadata_dirs.py +11 -2
  52. kash/help/assistant.py +2 -2
  53. kash/help/assistant_instructions.py +2 -1
  54. kash/help/help_embeddings.py +2 -2
  55. kash/help/help_printing.py +14 -10
  56. kash/help/tldr_help.py +5 -3
  57. kash/llm_utils/clean_headings.py +1 -1
  58. kash/llm_utils/llm_api_keys.py +4 -4
  59. kash/llm_utils/llm_completion.py +2 -2
  60. kash/llm_utils/llm_features.py +68 -0
  61. kash/llm_utils/llm_messages.py +1 -2
  62. kash/llm_utils/llm_names.py +1 -1
  63. kash/llm_utils/llms.py +17 -12
  64. kash/local_server/__init__.py +5 -2
  65. kash/local_server/local_server.py +56 -46
  66. kash/local_server/local_server_commands.py +15 -15
  67. kash/local_server/local_server_routes.py +2 -2
  68. kash/local_server/local_url_formatters.py +1 -1
  69. kash/mcp/__init__.py +5 -2
  70. kash/mcp/mcp_cli.py +54 -17
  71. kash/mcp/mcp_server_commands.py +5 -6
  72. kash/mcp/mcp_server_routes.py +14 -11
  73. kash/mcp/mcp_server_sse.py +61 -34
  74. kash/mcp/mcp_server_stdio.py +0 -8
  75. kash/media_base/audio_processing.py +81 -7
  76. kash/media_base/media_cache.py +18 -18
  77. kash/media_base/media_services.py +1 -1
  78. kash/media_base/media_tools.py +6 -6
  79. kash/media_base/services/local_file_media.py +2 -2
  80. kash/media_base/{speech_transcription.py → transcription_deepgram.py} +25 -109
  81. kash/media_base/transcription_format.py +73 -0
  82. kash/media_base/transcription_whisper.py +38 -0
  83. kash/model/__init__.py +73 -5
  84. kash/model/actions_model.py +38 -4
  85. kash/model/concept_model.py +30 -0
  86. kash/model/items_model.py +56 -13
  87. kash/model/params_model.py +24 -0
  88. kash/shell/completions/completion_scoring.py +37 -5
  89. kash/shell/output/kerm_codes.py +1 -2
  90. kash/shell/output/shell_formatting.py +14 -4
  91. kash/shell/shell_main.py +2 -2
  92. kash/shell/utils/exception_printing.py +6 -0
  93. kash/shell/utils/native_utils.py +26 -20
  94. kash/text_handling/custom_sliding_transforms.py +12 -4
  95. kash/text_handling/doc_normalization.py +6 -2
  96. kash/text_handling/markdown_render.py +117 -0
  97. kash/text_handling/markdown_utils.py +204 -0
  98. kash/utils/common/import_utils.py +12 -3
  99. kash/utils/common/type_utils.py +0 -29
  100. kash/utils/common/url.py +80 -28
  101. kash/utils/errors.py +6 -0
  102. kash/utils/file_utils/{dir_size.py → dir_info.py} +25 -4
  103. kash/utils/file_utils/file_ext.py +2 -3
  104. kash/utils/file_utils/file_formats.py +28 -2
  105. kash/utils/file_utils/file_formats_model.py +50 -19
  106. kash/utils/file_utils/filename_parsing.py +10 -4
  107. kash/web_content/dir_store.py +1 -2
  108. kash/web_content/file_cache_utils.py +37 -10
  109. kash/web_content/file_processing.py +68 -0
  110. kash/web_content/local_file_cache.py +12 -9
  111. kash/web_content/web_extract.py +8 -3
  112. kash/web_content/web_fetch.py +12 -4
  113. kash/web_gen/tabbed_webpage.py +5 -2
  114. kash/web_gen/templates/base_styles.css.jinja +120 -14
  115. kash/web_gen/templates/base_webpage.html.jinja +60 -13
  116. kash/web_gen/templates/content_styles.css.jinja +4 -2
  117. kash/web_gen/templates/item_view.html.jinja +2 -2
  118. kash/web_gen/templates/tabbed_webpage.html.jinja +1 -2
  119. kash/workspaces/__init__.py +15 -2
  120. kash/workspaces/selections.py +18 -3
  121. kash/workspaces/source_items.py +4 -2
  122. kash/workspaces/workspace_output.py +11 -4
  123. kash/workspaces/workspaces.py +5 -11
  124. kash/xonsh_custom/command_nl_utils.py +40 -19
  125. kash/xonsh_custom/custom_shell.py +44 -12
  126. kash/xonsh_custom/customize_prompt.py +39 -21
  127. kash/xonsh_custom/load_into_xonsh.py +26 -27
  128. kash/xonsh_custom/shell_load_commands.py +2 -2
  129. kash/xonsh_custom/xonsh_completers.py +2 -249
  130. kash/xonsh_custom/xonsh_keybindings.py +282 -0
  131. kash/xonsh_custom/xonsh_modern_tools.py +3 -3
  132. kash/xontrib/kash_extension.py +5 -6
  133. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/METADATA +26 -12
  134. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/RECORD +140 -140
  135. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/entry_points.txt +1 -1
  136. kash/concepts/concept_formats.py +0 -23
  137. kash/concepts/text_similarity.py +0 -112
  138. kash/shell/clideps/api_keys.py +0 -99
  139. kash/shell/clideps/dotenv_setup.py +0 -114
  140. kash/shell/clideps/dotenv_utils.py +0 -89
  141. kash/shell/clideps/pkg_deps.py +0 -232
  142. kash/shell/clideps/platforms.py +0 -11
  143. kash/shell/clideps/terminal_features.py +0 -56
  144. kash/shell/utils/osc_utils.py +0 -95
  145. kash/shell/utils/terminal_images.py +0 -133
  146. kash/text_handling/markdown_util.py +0 -167
  147. kash/utils/common/atomic_var.py +0 -158
  148. kash/utils/common/string_replace.py +0 -93
  149. kash/utils/common/string_template.py +0 -101
  150. /kash/docs_base/recipes/{python_dev_commands.ksh → python_dev_commands.sh} +0 -0
  151. /kash/docs_base/recipes/{tldr_standard_commands.ksh → tldr_standard_commands.sh} +0 -0
  152. /kash/{concepts → embeddings}/embeddings.py +0 -0
  153. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/WHEEL +0 -0
  154. {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,12 @@
1
1
  import re
2
+ import tempfile
2
3
  from pathlib import Path
3
4
  from typing import NewType
4
5
 
5
6
  import regex
7
+ from clideps.pkgs.pkg_check import pkg_check
6
8
 
7
9
  from kash.config.logger import get_logger
8
- from kash.shell.clideps.pkg_deps import Pkg, pkg_check
9
10
 
10
11
  log = get_logger(__name__)
11
12
 
@@ -77,13 +78,15 @@ def read_partial_text(
77
78
 
78
79
  MimeType = NewType("MimeType", str)
79
80
 
81
+ MIME_EMPTY = MimeType("inode/x-empty")
82
+
80
83
 
81
84
  def detect_mime_type(filename: str | Path) -> MimeType | None:
82
85
  """
83
86
  Get the mime type of a file using libmagic heuristics plus more careful
84
87
  detection of HTML, Markdown, and multipart YAML.
85
88
  """
86
- pkg_check().require(Pkg.libmagic)
89
+ pkg_check().require("libmagic")
87
90
  import magic
88
91
 
89
92
  mime = magic.Magic(mime=True)
@@ -132,3 +135,26 @@ def mime_type_is_text(mime_type: MimeType) -> bool:
132
135
  "application/rtf",
133
136
  }
134
137
  )
138
+
139
+
140
+ ## Tests
141
+
142
+
143
+ def test_detect_mime_type():
144
+ with tempfile.TemporaryDirectory() as tmpdir:
145
+ tmpdir_path = Path(tmpdir)
146
+
147
+ empty_file = tmpdir_path / "empty.txt"
148
+ empty_file.touch()
149
+
150
+ html_file = tmpdir_path / "example.html"
151
+ with open(html_file, "w") as f:
152
+ f.write("<!DOCTYPE html>\n<html><body><h1>Test</h1></body></html>")
153
+
154
+ text_file = tmpdir_path / "example.txt"
155
+ with open(text_file, "w") as f:
156
+ f.write("This is a simple text file with some content.")
157
+
158
+ assert detect_mime_type(empty_file) == MIME_EMPTY
159
+ assert detect_mime_type(html_file) == "text/html"
160
+ assert detect_mime_type(text_file) == "text/plain"
@@ -6,7 +6,12 @@ from pathlib import Path
6
6
 
7
7
  from kash.utils.common.url import Url, is_file_url, parse_file_url
8
8
  from kash.utils.file_utils.file_ext import FileExt
9
- from kash.utils.file_utils.file_formats import MimeType, detect_mime_type, mime_type_is_text
9
+ from kash.utils.file_utils.file_formats import (
10
+ MIME_EMPTY,
11
+ MimeType,
12
+ detect_mime_type,
13
+ mime_type_is_text,
14
+ )
10
15
  from kash.utils.file_utils.filename_parsing import parse_file_ext
11
16
 
12
17
 
@@ -31,6 +36,8 @@ class Format(Enum):
31
36
  it is the format of the resource (url, media, etc.).
32
37
  """
33
38
 
39
+ # TODO: Be more thorough, pulling in relevant extensions and types from the `mimetypes` module.
40
+
34
41
  # Formats with no body (content is in frontmatter).
35
42
  url = "url"
36
43
 
@@ -44,8 +51,9 @@ class Format(Enum):
44
51
  yaml = "yaml"
45
52
  diff = "diff"
46
53
  python = "python"
47
- kash_script = "kash_script"
48
- """Our own format for kash scripts."""
54
+ shellscript = "shellscript"
55
+ """Covers sh, bash, and similar shell scripts."""
56
+ xonsh = "xonsh"
49
57
  json = "json"
50
58
  csv = "csv"
51
59
  npz = "npz"
@@ -56,6 +64,7 @@ class Format(Enum):
56
64
  docx = "docx"
57
65
  jpeg = "jpeg"
58
66
  png = "png"
67
+ gif = "gif"
59
68
  svg = "svg"
60
69
  mp3 = "mp3"
61
70
  m4a = "m4a"
@@ -85,7 +94,8 @@ class Format(Enum):
85
94
  self.diff,
86
95
  self.python,
87
96
  self.json,
88
- self.kash_script,
97
+ self.shellscript,
98
+ self.xonsh,
89
99
  self.csv,
90
100
  self.log,
91
101
  ]
@@ -102,7 +112,7 @@ class Format(Enum):
102
112
 
103
113
  @property
104
114
  def is_image(self) -> bool:
105
- return self in [self.jpeg, self.png, self.svg]
115
+ return self in [self.jpeg, self.png, self.gif, self.svg]
106
116
 
107
117
  @property
108
118
  def is_audio(self) -> bool:
@@ -114,7 +124,7 @@ class Format(Enum):
114
124
 
115
125
  @property
116
126
  def is_code(self) -> bool:
117
- return self in [self.python, self.kash_script, self.json, self.yaml]
127
+ return self in [self.python, self.shellscript, self.xonsh, self.json, self.yaml]
118
128
 
119
129
  @property
120
130
  def is_data(self) -> bool:
@@ -138,10 +148,12 @@ class Format(Enum):
138
148
  self.markdown,
139
149
  self.md_html,
140
150
  self.html,
151
+ self.json, # Not strictly true but we encourage use of comments.
141
152
  self.yaml,
142
153
  self.diff,
143
154
  self.python,
144
- self.kash_script,
155
+ self.shellscript,
156
+ self.xonsh,
145
157
  self.csv,
146
158
  self.log,
147
159
  ]
@@ -157,13 +169,15 @@ class Format(Enum):
157
169
  Format.yaml: MediaType.text,
158
170
  Format.diff: MediaType.text,
159
171
  Format.python: MediaType.text,
160
- Format.kash_script: MediaType.text,
172
+ Format.shellscript: MediaType.text,
173
+ Format.xonsh: MediaType.text,
161
174
  Format.json: MediaType.text,
162
175
  Format.csv: MediaType.text,
163
176
  Format.log: MediaType.text,
164
177
  Format.pdf: MediaType.text,
165
178
  Format.jpeg: MediaType.image,
166
179
  Format.png: MediaType.image,
180
+ Format.gif: MediaType.image,
167
181
  Format.svg: MediaType.image,
168
182
  Format.docx: MediaType.text,
169
183
  Format.mp3: MediaType.audio,
@@ -189,11 +203,13 @@ class Format(Enum):
189
203
  FileExt.npz.value: Format.npz,
190
204
  FileExt.log.value: Format.log,
191
205
  FileExt.py.value: Format.python,
192
- FileExt.ksh.value: Format.kash_script,
206
+ FileExt.sh.value: Format.shellscript,
207
+ FileExt.xsh.value: Format.xonsh,
193
208
  FileExt.pdf.value: Format.pdf,
194
209
  FileExt.docx.value: Format.docx,
195
210
  FileExt.jpg.value: Format.jpeg,
196
211
  FileExt.png.value: Format.png,
212
+ FileExt.gif.value: Format.gif,
197
213
  FileExt.svg.value: Format.svg,
198
214
  FileExt.mp3.value: Format.mp3,
199
215
  FileExt.m4a.value: Format.m4a,
@@ -219,10 +235,13 @@ class Format(Enum):
219
235
  Format.npz: FileExt.npz,
220
236
  Format.log: FileExt.log,
221
237
  Format.python: FileExt.py,
238
+ Format.shellscript: FileExt.sh,
239
+ Format.xonsh: FileExt.xsh,
222
240
  Format.pdf: FileExt.pdf,
223
241
  Format.docx: FileExt.docx,
224
242
  Format.jpeg: FileExt.jpg,
225
243
  Format.png: FileExt.png,
244
+ Format.gif: FileExt.gif,
226
245
  Format.svg: FileExt.svg,
227
246
  Format.mp3: FileExt.mp3,
228
247
  Format.m4a: FileExt.m4a,
@@ -244,6 +263,10 @@ class Format(Enum):
244
263
  "application/yaml": Format.yaml,
245
264
  "application/x-yaml": Format.yaml,
246
265
  "text/x-python": Format.python,
266
+ "text/x-script.python": Format.python,
267
+ "text/x-sh": Format.shellscript,
268
+ "text/x-shellscript": Format.shellscript,
269
+ "text/x-xonsh": Format.xonsh,
247
270
  "application/json": Format.json,
248
271
  "text/csv": Format.csv,
249
272
  "application/x-npz": Format.npz,
@@ -251,6 +274,7 @@ class Format(Enum):
251
274
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document": Format.docx,
252
275
  "image/jpeg": Format.jpeg,
253
276
  "image/png": Format.png,
277
+ "image/gif": Format.gif,
254
278
  "image/svg+xml": Format.svg,
255
279
  "audio/mpeg": Format.mp3,
256
280
  "audio/mp3": Format.mp3,
@@ -326,15 +350,15 @@ class FileFormatInfo:
326
350
  and self.mime_type.startswith("image")
327
351
  )
328
352
 
329
- def as_str(self) -> str:
330
- if self.format and self.mime_type:
331
- return f"{self.format.value} ({self.mime_type})"
332
- elif self.format:
353
+ def as_str(self, mime_only: bool = False) -> str:
354
+ if self.format and not mime_only:
333
355
  return self.format.value
356
+ elif self.mime_type == MIME_EMPTY:
357
+ return "empty"
334
358
  elif self.mime_type:
335
359
  return self.mime_type
336
360
  else:
337
- return "unrecognized"
361
+ return "unrecognized format"
338
362
 
339
363
  def __str__(self) -> str:
340
364
  return self.as_str()
@@ -357,15 +381,22 @@ def guess_format_by_name(path: str | Path) -> Format | None:
357
381
  return Format.guess_by_file_ext(file_ext) if file_ext else None
358
382
 
359
383
 
360
- def file_format_info(path: str | Path) -> FileFormatInfo:
384
+ def file_format_info(path: str | Path, always_check_content: bool = False) -> FileFormatInfo:
361
385
  """
362
- Full info on the file format path and content (file extension and file content).
386
+ Get info on the file format path and content (file extension and file content).
387
+ Looks at the file extension first and then the file content if needed.
388
+ If `always_check_content` is True, look at the file content even if we
389
+ recognize the file extension.
363
390
  """
364
391
  path = Path(path)
365
392
  file_ext = parse_file_ext(path)
366
- mime_type = detect_mime_type(path)
367
- format = _guess_format(file_ext, mime_type)
368
- final_mime_type = format.mime_type if format else mime_type
393
+ if always_check_content or not file_ext:
394
+ # Look at the file content.
395
+ detected_mime_type = detect_mime_type(path)
396
+ else:
397
+ detected_mime_type = None
398
+ format = _guess_format(file_ext, detected_mime_type)
399
+ final_mime_type = format.mime_type if format else detected_mime_type
369
400
  return FileFormatInfo(file_ext, format, final_mime_type)
370
401
 
371
402
 
@@ -2,7 +2,7 @@ import os
2
2
  from pathlib import Path
3
3
 
4
4
  from kash.config.logger import get_logger
5
- from kash.utils.common.url import Url
5
+ from kash.utils.common.url import Url, check_if_url
6
6
  from kash.utils.errors import InvalidFilename
7
7
  from kash.utils.file_utils.file_ext import FileExt, canonicalize_file_ext
8
8
 
@@ -48,11 +48,17 @@ def join_filename(dirname: str | Path, name: str, item_type: str | None, ext: st
48
48
 
49
49
  def parse_file_ext(url_or_path: str | Url | Path) -> FileExt | None:
50
50
  """
51
- Parse a known, canonical file extension from a path, a URL, or even just a
52
- raw file extension (like "csv" or ".csv").
51
+ Parse a known, canonical file extension from a path or URL. Also accepts
52
+ raw file extensions (like "csv" or ".csv").
53
53
  """
54
- front, ext = os.path.splitext(str(url_or_path).split("/")[-1])
54
+ parsed_url = check_if_url(url_or_path)
55
+ if parsed_url:
56
+ path = parsed_url.path
57
+ else:
58
+ path = str(url_or_path)
59
+ front, ext = os.path.splitext(path.split("/")[-1])
55
60
  if not ext:
61
+ # Handle bare file extensions too.
56
62
  ext = front
57
63
  return FileExt.parse(canonicalize_file_ext(ext))
58
64
 
@@ -87,8 +87,7 @@ class DirStore:
87
87
  self, keys: list[str | Path], folder: str | None = None, suffix: str | None = None
88
88
  ) -> dict[str | Path, Path | None]:
89
89
  """
90
- Look up all existing cached results for the set of keys. This should work fine but could
91
- be optimized for large batches.
90
+ Look up all existing cached results for the set of keys.
92
91
  """
93
92
  return {key: self.find(key, folder=folder, suffix=suffix) for key in keys}
94
93
 
@@ -1,4 +1,7 @@
1
+ import json
2
+ from collections.abc import Callable
1
3
  from pathlib import Path
4
+ from typing import Any
2
5
 
3
6
  from prettyfmt import fmt_lines, fmt_path
4
7
 
@@ -35,18 +38,40 @@ def reset_content_cache_dir(path: Path):
35
38
  log.info("Using web cache: %s", fmt_path(path))
36
39
 
37
40
 
38
- def cache_file(source: Url | Path | Loadable, global_cache: bool = False) -> tuple[Path, bool]:
41
+ def cache_file(
42
+ source: Url | Path | Loadable, global_cache: bool = False, expiration_sec: float | None = None
43
+ ) -> tuple[Path, bool]:
39
44
  """
40
45
  Return a local cached copy of the item. If it is an URL, content is fetched.
41
- Raises requests.HTTPError if the URL is not reachable. If it is a Path or
42
- a Loadable, a cached copy is returned.
46
+ If it is a Path or a Loadable, a cached copy is returned.
47
+ LocalFileCache uses httpx so httpx.HTTPError is raised for non-2xx responses.
48
+
49
+ Uses the current content cache unless there is no current cache or `global_cache` is True,
50
+ in which case the global cache is used.
43
51
  """
44
52
  cache = _global_content_cache if global_cache else _content_cache
45
- path, was_cached = cache.cache(source)
53
+ path, was_cached = cache.cache(source, expiration_sec)
46
54
  return path, was_cached
47
55
 
48
56
 
49
- def cache_resource(item: Item) -> dict[MediaType, Path]:
57
+ def cache_api_response(
58
+ url: Url,
59
+ global_cache: bool = False,
60
+ expiration_sec: float | None = None,
61
+ parser: Callable[[str], Any] = json.loads,
62
+ ) -> tuple[Any, bool]:
63
+ """
64
+ Cache an API response. By default parse the response as JSON.
65
+ """
66
+ cache = _global_content_cache if global_cache else _content_cache
67
+ path, was_cached = cache.cache(url, expiration_sec)
68
+ result = parser(path.read_text())
69
+ return result, was_cached
70
+
71
+
72
+ def cache_resource(
73
+ item: Item, global_cache: bool = False, expiration_sec: float | None = None
74
+ ) -> dict[MediaType, Path]:
50
75
  """
51
76
  Cache a resource item for an external local path or a URL, fetching or
52
77
  copying as needed. For media this may yield more than one format.
@@ -64,17 +89,17 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
64
89
  if is_media_url(item.url):
65
90
  result = cache_media(item.url)
66
91
  else:
67
- path, _was_cached = cache_file(item.url)
92
+ path, _was_cached = cache_file(item.url, global_cache, expiration_sec)
68
93
  elif item.external_path:
69
94
  path = Path(item.external_path)
70
95
  if not path.is_file():
71
96
  raise FileNotFound(f"External path not found: {path}")
72
- path, _was_cached = cache_file(path)
97
+ path, _was_cached = cache_file(path, global_cache, expiration_sec)
73
98
  elif item.original_filename:
74
99
  path = Path(item.original_filename)
75
100
  if not path.is_file():
76
101
  raise FileNotFound(f"Original filename not found: {path}")
77
- path, _was_cached = cache_file(path)
102
+ path, _was_cached = cache_file(path, global_cache, expiration_sec)
78
103
  else:
79
104
  raise ValueError(f"Item has no URL or external path: {item}")
80
105
 
@@ -94,7 +119,9 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
94
119
  return result
95
120
 
96
121
 
97
- def get_url_html(item: Item) -> tuple[Url, str]:
122
+ def get_url_html(
123
+ item: Item, global_cache: bool = False, expiration_sec: float | None = None
124
+ ) -> tuple[Url, str]:
98
125
  """
99
126
  Returns the HTML content of an URL item, using the content cache,
100
127
  or the body of the item if it has a URL and HTML body.
@@ -106,7 +133,7 @@ def get_url_html(item: Item) -> tuple[Url, str]:
106
133
  url = Url(canonicalize_url(item.url))
107
134
 
108
135
  if is_url_item(item):
109
- path, _was_cached = cache_file(url)
136
+ path, _was_cached = cache_file(url, global_cache, expiration_sec)
110
137
  with open(path) as file:
111
138
  html_content = file.read()
112
139
  else:
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Mapping
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import TypeAlias
7
+
8
+ from kash.web_content.local_file_cache import read_mtime
9
+
10
+
11
+ @dataclass(frozen=True)
12
+ class OutputType:
13
+ """
14
+ A type of output file, represented by the filename suffix, e.g. '.mp3', '.txt', etc.
15
+ """
16
+
17
+ suffix: str
18
+
19
+ def output_path(self, src: Path) -> Path:
20
+ """
21
+ Resolve the output path. Will be next to the source file, e.g.
22
+ some-dir/video.mp4 -> some-dir/video.mp3
23
+ """
24
+ return src.with_suffix(self.suffix)
25
+
26
+
27
+ Processor: TypeAlias = Callable[[Path, Mapping[OutputType, Path]], None]
28
+ """
29
+ A function that takes a source file and a mapping with one or more output paths.
30
+ """
31
+
32
+
33
+ @dataclass(frozen=True)
34
+ class FileProcess:
35
+ """
36
+ Process a file and produce one or more outputs.
37
+ """
38
+
39
+ processor: Processor
40
+ outputs: list[OutputType]
41
+
42
+ def is_outdated(self, src: Path) -> bool:
43
+ """
44
+ True when any output is missing or older (earliest mtime) than `src`.
45
+ """
46
+ dests = {o.output_path(src) for o in self.outputs}
47
+ if any(not p.exists() for p in dests):
48
+ return True
49
+ earliest = min(read_mtime(p) for p in dests)
50
+ return read_mtime(src) > earliest
51
+
52
+ def run(self, src: Path) -> dict[OutputType, Path]:
53
+ """
54
+ Run unconditionally and return a mapping of outputs to paths.
55
+ """
56
+ dests = {o: o.output_path(src) for o in self.outputs}
57
+ self.processor(src, dests)
58
+ return dests
59
+
60
+ def run_if_needed(self, src: Path) -> dict[OutputType, Path]:
61
+ """
62
+ Run only if any output is missing or outdated.
63
+ """
64
+ return (
65
+ self.run(src)
66
+ if self.is_outdated(src)
67
+ else {o: o.output_path(src) for o in self.outputs}
68
+ )
@@ -11,7 +11,7 @@ from prettyfmt import fmt_path
11
11
  from strif import atomic_output_file, copyfile_atomic
12
12
 
13
13
  from kash.utils.common.url import Url, is_file_url, is_url, normalize_url, parse_file_url
14
- from kash.utils.errors import FileNotFound, InvalidInput
14
+ from kash.utils.errors import FileNotFound
15
15
  from kash.utils.file_utils.file_formats_model import choose_file_ext
16
16
  from kash.web_content.dir_store import DirStore
17
17
  from kash.web_content.web_fetch import download_url
@@ -56,19 +56,21 @@ class Loadable:
56
56
 
57
57
  key: str
58
58
  """
59
- The unique identifier for the item. If it ends in a recognized file extension,
60
- both the key and the extension will be used when creating unique cache filenames.
59
+ The unique identifier for the item. Used when creating unique cache filenames,
60
+ as is or with added suffixes.
61
61
  """
62
62
 
63
63
  save: Callable[[Path], None]
64
64
  """
65
65
  Method that saves the item to the given path. Caller will handle path selection
66
- and atomicity of file creation.
66
+ and atomicity of file creation. Raise an exception if the item cannot be saved.
67
67
  """
68
68
 
69
69
 
70
70
  Cacheable = Url | Path | Loadable
71
- """An item that can be cached as a file."""
71
+ """
72
+ An item that can be cached as a file.
73
+ """
72
74
 
73
75
 
74
76
  def _suffix_for(cacheable: Cacheable) -> str | None:
@@ -151,9 +153,7 @@ class LocalFileCache(DirStore):
151
153
  if isinstance(url_or_path, Path):
152
154
  file_path = url_or_path
153
155
  else:
154
- parsed = parse_file_url(url_or_path)
155
- if not parsed:
156
- raise InvalidInput(f"Not a file URL: {url_or_path}")
156
+ parsed = parse_file_url(url_or_path) # Raises ValueError if not a file URL.
157
157
  file_path = parsed
158
158
  if not file_path.exists():
159
159
  raise FileNotFound(f"File not found: {file_path}")
@@ -173,7 +173,10 @@ class LocalFileCache(DirStore):
173
173
  ) as tmp_path:
174
174
  source.save(tmp_path)
175
175
  if not cache_path.exists():
176
- raise InvalidCacheState(f"Failed to save to cache: {source}: {cache_path}")
176
+ # The source should have raised an exception if it failed to save.
177
+ raise InvalidCacheState(
178
+ f"Loadable source failed to save to cache: {source}: {cache_path}"
179
+ )
177
180
  else:
178
181
  raise ValueError(f"Invalid source: {source}")
179
182
 
@@ -10,14 +10,19 @@ from kash.web_content.web_page_model import PageExtractor, WebPageData
10
10
 
11
11
  @log_calls(level="message")
12
12
  def fetch_extract(
13
- url: Url, use_cache: bool = True, extractor: PageExtractor = extract_text_justext
13
+ url: Url,
14
+ refetch: bool = False,
15
+ use_cache: bool = True,
16
+ extractor: PageExtractor = extract_text_justext,
14
17
  ) -> WebPageData:
15
18
  """
16
19
  Fetches a URL and extracts the title, description, and content.
20
+ By default, uses the content cache if available. Can force re-fetching and
21
+ updating the cache by setting `refetch` to true.
17
22
  """
18
-
23
+ expiration_sec = 0 if refetch else None
19
24
  if use_cache:
20
- path, _was_cached = cache_file(url)
25
+ path, _was_cached = cache_file(url, expiration_sec=expiration_sec)
21
26
  with open(path, "rb") as file:
22
27
  content = file.read()
23
28
  page_data = extractor(url, content)
@@ -7,17 +7,22 @@ import httpx
7
7
  from strif import atomic_output_file, copyfile_atomic
8
8
  from tqdm import tqdm
9
9
 
10
+ from kash.config.env_settings import KashEnv
10
11
  from kash.utils.common.url import Url
11
12
 
12
13
  log = logging.getLogger(__name__)
13
14
 
14
- USER_AGENT = "Mozilla/5.0 (Compatible)"
15
15
 
16
16
  DEFAULT_TIMEOUT = 30
17
17
 
18
18
 
19
+ DEFAULT_USER_AGENT = (
20
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0"
21
+ )
22
+
23
+
19
24
  def default_headers() -> dict[str, str]:
20
- return {"User-Agent": USER_AGENT}
25
+ return {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=DEFAULT_USER_AGENT)}
21
26
 
22
27
 
23
28
  def fetch_url(
@@ -36,6 +41,7 @@ def fetch_url(
36
41
  auth=auth,
37
42
  headers=headers or default_headers(),
38
43
  ) as client:
44
+ log.debug("fetch_url: using headers: %s", client.headers)
39
45
  response = client.get(url)
40
46
  log.info("Fetched: %s (%s bytes): %s", response.status_code, len(response.content), url)
41
47
  response.raise_for_status()
@@ -52,7 +58,7 @@ def download_url(
52
58
  headers: dict[str, str] | None = None,
53
59
  ) -> None:
54
60
  """
55
- Download given file, optionally with progress bar.
61
+ Download given file, optionally with progress bar, streaming to a target file.
56
62
  Also handles file:// and s3:// URLs. Output file is created atomically.
57
63
  Raise httpx.HTTPError for non-2xx responses.
58
64
  """
@@ -73,13 +79,15 @@ def download_url(
73
79
  client = session or httpx.Client(follow_redirects=True, timeout=timeout)
74
80
  response: httpx.Response | None = None
75
81
  try:
82
+ headers = headers or default_headers()
83
+ log.debug("download_url: using headers: %s", headers)
76
84
  with client.stream(
77
85
  "GET",
78
86
  url,
79
87
  follow_redirects=True,
80
88
  timeout=timeout,
81
89
  auth=auth,
82
- headers=headers or default_headers(),
90
+ headers=headers,
83
91
  ) as response:
84
92
  response.raise_for_status()
85
93
  total_size = int(response.headers.get("content-length", "0"))
@@ -2,6 +2,7 @@ import os
2
2
  from dataclasses import asdict, dataclass
3
3
 
4
4
  from frontmatter_format import read_yaml_file, to_yaml_string, write_yaml_file
5
+ from prettyfmt import sanitize_title
5
6
 
6
7
  from kash.config.logger import get_logger
7
8
  from kash.exec.preconditions import has_thumbnail_url
@@ -41,7 +42,7 @@ def _fill_in_ids(tabs: list[TabInfo]):
41
42
  tab.id = f"tab_{i}"
42
43
 
43
44
 
44
- def webpage_config(items: list[Item]) -> Item:
45
+ def webpage_config(items: list[Item], clean_headings: bool = False) -> Item:
45
46
  """
46
47
  Get an item with the config for a tabbed web page.
47
48
  """
@@ -57,9 +58,11 @@ def webpage_config(items: list[Item]) -> Item:
57
58
  log.warning("Item has no thumbnail URL: %s", item)
58
59
  return None
59
60
 
61
+ clean = clean_heading if clean_headings else sanitize_title
62
+
60
63
  tabs = [
61
64
  TabInfo(
62
- label=clean_heading(item.abbrev_title()),
65
+ label=clean(item.abbrev_title()),
63
66
  store_path=item.store_path,
64
67
  thumbnail_url=get_thumbnail_url(item),
65
68
  )