kash-shell 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. kash/actions/__init__.py +4 -4
  2. kash/actions/core/format_markdown_template.py +2 -5
  3. kash/actions/core/markdownify.py +7 -6
  4. kash/actions/core/readability.py +7 -6
  5. kash/actions/core/render_as_html.py +37 -0
  6. kash/actions/core/show_webpage.py +6 -11
  7. kash/actions/core/strip_html.py +2 -6
  8. kash/actions/core/tabbed_webpage_config.py +31 -0
  9. kash/actions/core/{webpage_generate.py → tabbed_webpage_generate.py} +5 -4
  10. kash/commands/__init__.py +8 -20
  11. kash/commands/base/basic_file_commands.py +15 -0
  12. kash/commands/base/debug_commands.py +13 -0
  13. kash/commands/base/files_command.py +28 -10
  14. kash/commands/base/general_commands.py +21 -16
  15. kash/commands/base/logs_commands.py +4 -2
  16. kash/commands/base/model_commands.py +8 -8
  17. kash/commands/base/search_command.py +3 -2
  18. kash/commands/base/show_command.py +5 -3
  19. kash/commands/extras/parse_uv_lock.py +186 -0
  20. kash/commands/help/doc_commands.py +2 -31
  21. kash/commands/help/welcome.py +33 -0
  22. kash/commands/workspace/selection_commands.py +11 -6
  23. kash/commands/workspace/workspace_commands.py +19 -17
  24. kash/config/colors.py +3 -1
  25. kash/config/env_settings.py +14 -1
  26. kash/config/init.py +2 -2
  27. kash/config/logger.py +59 -56
  28. kash/config/logger_basic.py +3 -3
  29. kash/config/settings.py +116 -57
  30. kash/config/setup.py +28 -12
  31. kash/config/text_styles.py +3 -13
  32. kash/docs/load_api_docs.py +2 -1
  33. kash/docs/markdown/topics/a3_getting_started.md +3 -2
  34. kash/{concepts → embeddings}/text_similarity.py +2 -2
  35. kash/exec/__init__.py +20 -3
  36. kash/exec/action_decorators.py +24 -10
  37. kash/exec/action_exec.py +41 -23
  38. kash/exec/action_registry.py +13 -48
  39. kash/exec/command_registry.py +2 -1
  40. kash/exec/fetch_url_metadata.py +4 -6
  41. kash/exec/importing.py +56 -0
  42. kash/exec/llm_transforms.py +12 -10
  43. kash/exec/precondition_registry.py +2 -1
  44. kash/exec/preconditions.py +22 -1
  45. kash/exec/resolve_args.py +4 -0
  46. kash/exec/shell_callable_action.py +33 -19
  47. kash/file_storage/file_store.py +42 -27
  48. kash/file_storage/item_file_format.py +5 -2
  49. kash/file_storage/metadata_dirs.py +11 -2
  50. kash/help/assistant.py +1 -1
  51. kash/help/assistant_instructions.py +2 -1
  52. kash/help/function_param_info.py +1 -1
  53. kash/help/help_embeddings.py +2 -2
  54. kash/help/help_printing.py +7 -11
  55. kash/llm_utils/clean_headings.py +1 -1
  56. kash/llm_utils/llm_api_keys.py +4 -4
  57. kash/llm_utils/llm_features.py +68 -0
  58. kash/llm_utils/llm_messages.py +1 -2
  59. kash/llm_utils/llm_names.py +1 -1
  60. kash/llm_utils/llms.py +8 -3
  61. kash/local_server/__init__.py +5 -2
  62. kash/local_server/local_server.py +8 -5
  63. kash/local_server/local_server_commands.py +2 -2
  64. kash/local_server/local_server_routes.py +1 -7
  65. kash/local_server/local_url_formatters.py +1 -1
  66. kash/mcp/__init__.py +5 -2
  67. kash/mcp/mcp_cli.py +5 -5
  68. kash/mcp/mcp_server_commands.py +5 -5
  69. kash/mcp/mcp_server_routes.py +5 -5
  70. kash/mcp/mcp_server_sse.py +4 -2
  71. kash/media_base/media_cache.py +8 -8
  72. kash/media_base/media_services.py +1 -1
  73. kash/media_base/media_tools.py +6 -6
  74. kash/media_base/services/local_file_media.py +2 -2
  75. kash/media_base/{speech_transcription.py → transcription_deepgram.py} +25 -110
  76. kash/media_base/transcription_format.py +73 -0
  77. kash/media_base/transcription_whisper.py +38 -0
  78. kash/model/__init__.py +73 -5
  79. kash/model/actions_model.py +38 -4
  80. kash/model/concept_model.py +30 -0
  81. kash/model/items_model.py +115 -32
  82. kash/model/params_model.py +24 -0
  83. kash/shell/completions/completion_scoring.py +37 -5
  84. kash/shell/output/kerm_codes.py +1 -2
  85. kash/shell/output/shell_formatting.py +14 -4
  86. kash/shell/shell_main.py +2 -2
  87. kash/shell/utils/exception_printing.py +6 -0
  88. kash/shell/utils/native_utils.py +26 -20
  89. kash/shell/utils/shell_function_wrapper.py +15 -15
  90. kash/text_handling/custom_sliding_transforms.py +12 -4
  91. kash/text_handling/doc_normalization.py +6 -2
  92. kash/text_handling/markdown_render.py +118 -0
  93. kash/text_handling/markdown_utils.py +226 -0
  94. kash/utils/common/function_inspect.py +360 -110
  95. kash/utils/common/import_utils.py +12 -3
  96. kash/utils/common/type_utils.py +0 -29
  97. kash/utils/common/url.py +27 -3
  98. kash/utils/errors.py +6 -0
  99. kash/utils/file_utils/file_ext.py +4 -0
  100. kash/utils/file_utils/file_formats.py +2 -2
  101. kash/utils/file_utils/file_formats_model.py +20 -1
  102. kash/web_content/dir_store.py +1 -2
  103. kash/web_content/file_cache_utils.py +37 -10
  104. kash/web_content/file_processing.py +68 -0
  105. kash/web_content/local_file_cache.py +12 -9
  106. kash/web_content/web_extract.py +8 -3
  107. kash/web_content/web_fetch.py +12 -4
  108. kash/web_gen/__init__.py +0 -4
  109. kash/web_gen/simple_webpage.py +52 -0
  110. kash/web_gen/tabbed_webpage.py +24 -14
  111. kash/web_gen/template_render.py +37 -2
  112. kash/web_gen/templates/base_styles.css.jinja +169 -43
  113. kash/web_gen/templates/base_webpage.html.jinja +110 -45
  114. kash/web_gen/templates/content_styles.css.jinja +4 -2
  115. kash/web_gen/templates/item_view.html.jinja +49 -39
  116. kash/web_gen/templates/simple_webpage.html.jinja +24 -0
  117. kash/web_gen/templates/tabbed_webpage.html.jinja +42 -33
  118. kash/workspaces/__init__.py +15 -2
  119. kash/workspaces/selections.py +18 -3
  120. kash/workspaces/source_items.py +0 -1
  121. kash/workspaces/workspaces.py +5 -11
  122. kash/xonsh_custom/command_nl_utils.py +40 -19
  123. kash/xonsh_custom/custom_shell.py +43 -11
  124. kash/xonsh_custom/customize_prompt.py +39 -21
  125. kash/xonsh_custom/load_into_xonsh.py +22 -25
  126. kash/xonsh_custom/shell_load_commands.py +2 -2
  127. kash/xonsh_custom/xonsh_completers.py +2 -249
  128. kash/xonsh_custom/xonsh_keybindings.py +282 -0
  129. kash/xonsh_custom/xonsh_modern_tools.py +3 -3
  130. kash/xontrib/kash_extension.py +5 -6
  131. {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/METADATA +10 -8
  132. {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/RECORD +137 -136
  133. kash/actions/core/webpage_config.py +0 -21
  134. kash/concepts/concept_formats.py +0 -23
  135. kash/shell/clideps/api_keys.py +0 -100
  136. kash/shell/clideps/dotenv_setup.py +0 -115
  137. kash/shell/clideps/dotenv_utils.py +0 -98
  138. kash/shell/clideps/pkg_deps.py +0 -257
  139. kash/shell/clideps/platforms.py +0 -11
  140. kash/shell/clideps/terminal_features.py +0 -56
  141. kash/shell/utils/osc_utils.py +0 -95
  142. kash/shell/utils/terminal_images.py +0 -133
  143. kash/text_handling/markdown_util.py +0 -167
  144. kash/utils/common/atomic_var.py +0 -171
  145. kash/utils/common/string_replace.py +0 -93
  146. kash/utils/common/string_template.py +0 -101
  147. /kash/{concepts → embeddings}/cosine.py +0 -0
  148. /kash/{concepts → embeddings}/embeddings.py +0 -0
  149. {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/WHEEL +0 -0
  150. {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/entry_points.txt +0 -0
  151. {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/licenses/LICENSE +0 -0
@@ -4,9 +4,9 @@ from pathlib import Path
4
4
  from typing import NewType
5
5
 
6
6
  import regex
7
+ from clideps.pkgs.pkg_check import pkg_check
7
8
 
8
9
  from kash.config.logger import get_logger
9
- from kash.shell.clideps.pkg_deps import Pkg, pkg_check
10
10
 
11
11
  log = get_logger(__name__)
12
12
 
@@ -86,7 +86,7 @@ def detect_mime_type(filename: str | Path) -> MimeType | None:
86
86
  Get the mime type of a file using libmagic heuristics plus more careful
87
87
  detection of HTML, Markdown, and multipart YAML.
88
88
  """
89
- pkg_check().require(Pkg.libmagic)
89
+ pkg_check().require("libmagic")
90
90
  import magic
91
91
 
92
92
  mime = magic.Magic(mime=True)
@@ -36,6 +36,8 @@ class Format(Enum):
36
36
  it is the format of the resource (url, media, etc.).
37
37
  """
38
38
 
39
+ # TODO: Be more thorough, pulling in relevant extensions and types from the `mimetypes` module.
40
+
39
41
  # Formats with no body (content is in frontmatter).
40
42
  url = "url"
41
43
 
@@ -46,6 +48,7 @@ class Format(Enum):
46
48
  """`md_html` is Markdown with HTML, used for example when we structure Markdown with divs."""
47
49
  html = "html"
48
50
  """`markdown` should be simple and clean Markdown that we can use with LLMs."""
51
+ epub = "epub"
49
52
  yaml = "yaml"
50
53
  diff = "diff"
51
54
  python = "python"
@@ -54,12 +57,14 @@ class Format(Enum):
54
57
  xonsh = "xonsh"
55
58
  json = "json"
56
59
  csv = "csv"
60
+ xlsx = "xlsx"
57
61
  npz = "npz"
58
62
  log = "log"
59
63
 
60
64
  # Media formats.
61
65
  pdf = "pdf"
62
66
  docx = "docx"
67
+ pptx = "pptx"
63
68
  jpeg = "jpeg"
64
69
  png = "png"
65
70
  gif = "gif"
@@ -106,6 +111,7 @@ class Format(Enum):
106
111
  self.html,
107
112
  self.pdf,
108
113
  self.docx,
114
+ self.pptx,
109
115
  ]
110
116
 
111
117
  @property
@@ -126,7 +132,7 @@ class Format(Enum):
126
132
 
127
133
  @property
128
134
  def is_data(self) -> bool:
129
- return self in [self.csv, self.npz]
135
+ return self in [self.csv, self.xlsx, self.npz]
130
136
 
131
137
  @property
132
138
  def is_binary(self) -> bool:
@@ -146,6 +152,7 @@ class Format(Enum):
146
152
  self.markdown,
147
153
  self.md_html,
148
154
  self.html,
155
+ self.json, # Not strictly true but we encourage use of comments.
149
156
  self.yaml,
150
157
  self.diff,
151
158
  self.python,
@@ -163,6 +170,7 @@ class Format(Enum):
163
170
  Format.markdown: MediaType.text,
164
171
  Format.md_html: MediaType.text,
165
172
  Format.html: MediaType.webpage,
173
+ Format.epub: MediaType.text,
166
174
  Format.yaml: MediaType.text,
167
175
  Format.diff: MediaType.text,
168
176
  Format.python: MediaType.text,
@@ -172,11 +180,13 @@ class Format(Enum):
172
180
  Format.csv: MediaType.text,
173
181
  Format.log: MediaType.text,
174
182
  Format.pdf: MediaType.text,
183
+ Format.xlsx: MediaType.text,
175
184
  Format.jpeg: MediaType.image,
176
185
  Format.png: MediaType.image,
177
186
  Format.gif: MediaType.image,
178
187
  Format.svg: MediaType.image,
179
188
  Format.docx: MediaType.text,
189
+ Format.pptx: MediaType.text,
180
190
  Format.mp3: MediaType.audio,
181
191
  Format.m4a: MediaType.audio,
182
192
  Format.mp4: MediaType.video,
@@ -197,6 +207,7 @@ class Format(Enum):
197
207
  FileExt.diff.value: Format.diff,
198
208
  FileExt.json.value: Format.json,
199
209
  FileExt.csv.value: Format.csv,
210
+ FileExt.xlsx.value: Format.xlsx,
200
211
  FileExt.npz.value: Format.npz,
201
212
  FileExt.log.value: Format.log,
202
213
  FileExt.py.value: Format.python,
@@ -204,6 +215,7 @@ class Format(Enum):
204
215
  FileExt.xsh.value: Format.xonsh,
205
216
  FileExt.pdf.value: Format.pdf,
206
217
  FileExt.docx.value: Format.docx,
218
+ FileExt.pptx.value: Format.pptx,
207
219
  FileExt.jpg.value: Format.jpeg,
208
220
  FileExt.png.value: Format.png,
209
221
  FileExt.gif.value: Format.gif,
@@ -211,6 +223,7 @@ class Format(Enum):
211
223
  FileExt.mp3.value: Format.mp3,
212
224
  FileExt.m4a.value: Format.m4a,
213
225
  FileExt.mp4.value: Format.mp4,
226
+ FileExt.epub.value: Format.epub,
214
227
  }
215
228
  return ext_to_format.get(file_ext.value, None)
216
229
 
@@ -225,10 +238,12 @@ class Format(Enum):
225
238
  Format.md_html: FileExt.md,
226
239
  Format.html: FileExt.html,
227
240
  Format.plaintext: FileExt.txt,
241
+ Format.epub: FileExt.epub,
228
242
  Format.yaml: FileExt.yml,
229
243
  Format.diff: FileExt.diff,
230
244
  Format.json: FileExt.json,
231
245
  Format.csv: FileExt.csv,
246
+ Format.xlsx: FileExt.xlsx,
232
247
  Format.npz: FileExt.npz,
233
248
  Format.log: FileExt.log,
234
249
  Format.python: FileExt.py,
@@ -236,6 +251,7 @@ class Format(Enum):
236
251
  Format.xonsh: FileExt.xsh,
237
252
  Format.pdf: FileExt.pdf,
238
253
  Format.docx: FileExt.docx,
254
+ Format.pptx: FileExt.pptx,
239
255
  Format.jpeg: FileExt.jpg,
240
256
  Format.png: FileExt.png,
241
257
  Format.gif: FileExt.gif,
@@ -257,6 +273,7 @@ class Format(Enum):
257
273
  "text/html": Format.html,
258
274
  "text/diff": Format.diff,
259
275
  "text/x-diff": Format.diff,
276
+ "application/epub+zip": Format.epub,
260
277
  "application/yaml": Format.yaml,
261
278
  "application/x-yaml": Format.yaml,
262
279
  "text/x-python": Format.python,
@@ -266,9 +283,11 @@ class Format(Enum):
266
283
  "text/x-xonsh": Format.xonsh,
267
284
  "application/json": Format.json,
268
285
  "text/csv": Format.csv,
286
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": Format.xlsx,
269
287
  "application/x-npz": Format.npz,
270
288
  "application/pdf": Format.pdf,
271
289
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document": Format.docx,
290
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation": Format.pptx,
272
291
  "image/jpeg": Format.jpeg,
273
292
  "image/png": Format.png,
274
293
  "image/gif": Format.gif,
@@ -87,8 +87,7 @@ class DirStore:
87
87
  self, keys: list[str | Path], folder: str | None = None, suffix: str | None = None
88
88
  ) -> dict[str | Path, Path | None]:
89
89
  """
90
- Look up all existing cached results for the set of keys. This should work fine but could
91
- be optimized for large batches.
90
+ Look up all existing cached results for the set of keys.
92
91
  """
93
92
  return {key: self.find(key, folder=folder, suffix=suffix) for key in keys}
94
93
 
@@ -1,4 +1,7 @@
1
+ import json
2
+ from collections.abc import Callable
1
3
  from pathlib import Path
4
+ from typing import Any
2
5
 
3
6
  from prettyfmt import fmt_lines, fmt_path
4
7
 
@@ -35,18 +38,40 @@ def reset_content_cache_dir(path: Path):
35
38
  log.info("Using web cache: %s", fmt_path(path))
36
39
 
37
40
 
38
- def cache_file(source: Url | Path | Loadable, global_cache: bool = False) -> tuple[Path, bool]:
41
+ def cache_file(
42
+ source: Url | Path | Loadable, global_cache: bool = False, expiration_sec: float | None = None
43
+ ) -> tuple[Path, bool]:
39
44
  """
40
45
  Return a local cached copy of the item. If it is an URL, content is fetched.
41
- Raises requests.HTTPError if the URL is not reachable. If it is a Path or
42
- a Loadable, a cached copy is returned.
46
+ If it is a Path or a Loadable, a cached copy is returned.
47
+ LocalFileCache uses httpx so httpx.HTTPError is raised for non-2xx responses.
48
+
49
+ Uses the current content cache unless there is no current cache or `global_cache` is True,
50
+ in which case the global cache is used.
43
51
  """
44
52
  cache = _global_content_cache if global_cache else _content_cache
45
- path, was_cached = cache.cache(source)
53
+ path, was_cached = cache.cache(source, expiration_sec)
46
54
  return path, was_cached
47
55
 
48
56
 
49
- def cache_resource(item: Item) -> dict[MediaType, Path]:
57
+ def cache_api_response(
58
+ url: Url,
59
+ global_cache: bool = False,
60
+ expiration_sec: float | None = None,
61
+ parser: Callable[[str], Any] = json.loads,
62
+ ) -> tuple[Any, bool]:
63
+ """
64
+ Cache an API response. By default parse the response as JSON.
65
+ """
66
+ cache = _global_content_cache if global_cache else _content_cache
67
+ path, was_cached = cache.cache(url, expiration_sec)
68
+ result = parser(path.read_text())
69
+ return result, was_cached
70
+
71
+
72
+ def cache_resource(
73
+ item: Item, global_cache: bool = False, expiration_sec: float | None = None
74
+ ) -> dict[MediaType, Path]:
50
75
  """
51
76
  Cache a resource item for an external local path or a URL, fetching or
52
77
  copying as needed. For media this may yield more than one format.
@@ -64,17 +89,17 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
64
89
  if is_media_url(item.url):
65
90
  result = cache_media(item.url)
66
91
  else:
67
- path, _was_cached = cache_file(item.url)
92
+ path, _was_cached = cache_file(item.url, global_cache, expiration_sec)
68
93
  elif item.external_path:
69
94
  path = Path(item.external_path)
70
95
  if not path.is_file():
71
96
  raise FileNotFound(f"External path not found: {path}")
72
- path, _was_cached = cache_file(path)
97
+ path, _was_cached = cache_file(path, global_cache, expiration_sec)
73
98
  elif item.original_filename:
74
99
  path = Path(item.original_filename)
75
100
  if not path.is_file():
76
101
  raise FileNotFound(f"Original filename not found: {path}")
77
- path, _was_cached = cache_file(path)
102
+ path, _was_cached = cache_file(path, global_cache, expiration_sec)
78
103
  else:
79
104
  raise ValueError(f"Item has no URL or external path: {item}")
80
105
 
@@ -94,7 +119,9 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
94
119
  return result
95
120
 
96
121
 
97
- def get_url_html(item: Item) -> tuple[Url, str]:
122
+ def get_url_html(
123
+ item: Item, global_cache: bool = False, expiration_sec: float | None = None
124
+ ) -> tuple[Url, str]:
98
125
  """
99
126
  Returns the HTML content of an URL item, using the content cache,
100
127
  or the body of the item if it has a URL and HTML body.
@@ -106,7 +133,7 @@ def get_url_html(item: Item) -> tuple[Url, str]:
106
133
  url = Url(canonicalize_url(item.url))
107
134
 
108
135
  if is_url_item(item):
109
- path, _was_cached = cache_file(url)
136
+ path, _was_cached = cache_file(url, global_cache, expiration_sec)
110
137
  with open(path) as file:
111
138
  html_content = file.read()
112
139
  else:
@@ -0,0 +1,68 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Mapping
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import TypeAlias
7
+
8
+ from kash.web_content.local_file_cache import read_mtime
9
+
10
+
11
+ @dataclass(frozen=True)
12
+ class OutputType:
13
+ """
14
+ A type of output file, represented by the filename suffix, e.g. '.mp3', '.txt', etc.
15
+ """
16
+
17
+ suffix: str
18
+
19
+ def output_path(self, src: Path) -> Path:
20
+ """
21
+ Resolve the output path. Will be next to the source file, e.g.
22
+ some-dir/video.mp4 -> some-dir/video.mp3
23
+ """
24
+ return src.with_suffix(self.suffix)
25
+
26
+
27
+ Processor: TypeAlias = Callable[[Path, Mapping[OutputType, Path]], None]
28
+ """
29
+ A function that takes a source file and a mapping with one or more output paths.
30
+ """
31
+
32
+
33
+ @dataclass(frozen=True)
34
+ class FileProcess:
35
+ """
36
+ Process a file and produce one or more outputs.
37
+ """
38
+
39
+ processor: Processor
40
+ outputs: list[OutputType]
41
+
42
+ def is_outdated(self, src: Path) -> bool:
43
+ """
44
+ True when any output is missing or older (earliest mtime) than `src`.
45
+ """
46
+ dests = {o.output_path(src) for o in self.outputs}
47
+ if any(not p.exists() for p in dests):
48
+ return True
49
+ earliest = min(read_mtime(p) for p in dests)
50
+ return read_mtime(src) > earliest
51
+
52
+ def run(self, src: Path) -> dict[OutputType, Path]:
53
+ """
54
+ Run unconditionally and return a mapping of outputs to paths.
55
+ """
56
+ dests = {o: o.output_path(src) for o in self.outputs}
57
+ self.processor(src, dests)
58
+ return dests
59
+
60
+ def run_if_needed(self, src: Path) -> dict[OutputType, Path]:
61
+ """
62
+ Run only if any output is missing or outdated.
63
+ """
64
+ return (
65
+ self.run(src)
66
+ if self.is_outdated(src)
67
+ else {o: o.output_path(src) for o in self.outputs}
68
+ )
@@ -11,7 +11,7 @@ from prettyfmt import fmt_path
11
11
  from strif import atomic_output_file, copyfile_atomic
12
12
 
13
13
  from kash.utils.common.url import Url, is_file_url, is_url, normalize_url, parse_file_url
14
- from kash.utils.errors import FileNotFound, InvalidInput
14
+ from kash.utils.errors import FileNotFound
15
15
  from kash.utils.file_utils.file_formats_model import choose_file_ext
16
16
  from kash.web_content.dir_store import DirStore
17
17
  from kash.web_content.web_fetch import download_url
@@ -56,19 +56,21 @@ class Loadable:
56
56
 
57
57
  key: str
58
58
  """
59
- The unique identifier for the item. If it ends in a recognized file extension,
60
- both the key and the extension will be used when creating unique cache filenames.
59
+ The unique identifier for the item. Used when creating unique cache filenames,
60
+ as is or with added suffixes.
61
61
  """
62
62
 
63
63
  save: Callable[[Path], None]
64
64
  """
65
65
  Method that saves the item to the given path. Caller will handle path selection
66
- and atomicity of file creation.
66
+ and atomicity of file creation. Raise an exception if the item cannot be saved.
67
67
  """
68
68
 
69
69
 
70
70
  Cacheable = Url | Path | Loadable
71
- """An item that can be cached as a file."""
71
+ """
72
+ An item that can be cached as a file.
73
+ """
72
74
 
73
75
 
74
76
  def _suffix_for(cacheable: Cacheable) -> str | None:
@@ -151,9 +153,7 @@ class LocalFileCache(DirStore):
151
153
  if isinstance(url_or_path, Path):
152
154
  file_path = url_or_path
153
155
  else:
154
- parsed = parse_file_url(url_or_path)
155
- if not parsed:
156
- raise InvalidInput(f"Not a file URL: {url_or_path}")
156
+ parsed = parse_file_url(url_or_path) # Raises ValueError if not a file URL.
157
157
  file_path = parsed
158
158
  if not file_path.exists():
159
159
  raise FileNotFound(f"File not found: {file_path}")
@@ -173,7 +173,10 @@ class LocalFileCache(DirStore):
173
173
  ) as tmp_path:
174
174
  source.save(tmp_path)
175
175
  if not cache_path.exists():
176
- raise InvalidCacheState(f"Failed to save to cache: {source}: {cache_path}")
176
+ # The source should have raised an exception if it failed to save.
177
+ raise InvalidCacheState(
178
+ f"Loadable source failed to save to cache: {source}: {cache_path}"
179
+ )
177
180
  else:
178
181
  raise ValueError(f"Invalid source: {source}")
179
182
 
@@ -10,14 +10,19 @@ from kash.web_content.web_page_model import PageExtractor, WebPageData
10
10
 
11
11
  @log_calls(level="message")
12
12
  def fetch_extract(
13
- url: Url, use_cache: bool = True, extractor: PageExtractor = extract_text_justext
13
+ url: Url,
14
+ refetch: bool = False,
15
+ use_cache: bool = True,
16
+ extractor: PageExtractor = extract_text_justext,
14
17
  ) -> WebPageData:
15
18
  """
16
19
  Fetches a URL and extracts the title, description, and content.
20
+ By default, uses the content cache if available. Can force re-fetching and
21
+ updating the cache by setting `refetch` to true.
17
22
  """
18
-
23
+ expiration_sec = 0 if refetch else None
19
24
  if use_cache:
20
- path, _was_cached = cache_file(url)
25
+ path, _was_cached = cache_file(url, expiration_sec=expiration_sec)
21
26
  with open(path, "rb") as file:
22
27
  content = file.read()
23
28
  page_data = extractor(url, content)
@@ -7,17 +7,22 @@ import httpx
7
7
  from strif import atomic_output_file, copyfile_atomic
8
8
  from tqdm import tqdm
9
9
 
10
+ from kash.config.env_settings import KashEnv
10
11
  from kash.utils.common.url import Url
11
12
 
12
13
  log = logging.getLogger(__name__)
13
14
 
14
- USER_AGENT = "Mozilla/5.0 (Compatible)"
15
15
 
16
16
  DEFAULT_TIMEOUT = 30
17
17
 
18
18
 
19
+ DEFAULT_USER_AGENT = (
20
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0"
21
+ )
22
+
23
+
19
24
  def default_headers() -> dict[str, str]:
20
- return {"User-Agent": USER_AGENT}
25
+ return {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=DEFAULT_USER_AGENT)}
21
26
 
22
27
 
23
28
  def fetch_url(
@@ -36,6 +41,7 @@ def fetch_url(
36
41
  auth=auth,
37
42
  headers=headers or default_headers(),
38
43
  ) as client:
44
+ log.debug("fetch_url: using headers: %s", client.headers)
39
45
  response = client.get(url)
40
46
  log.info("Fetched: %s (%s bytes): %s", response.status_code, len(response.content), url)
41
47
  response.raise_for_status()
@@ -52,7 +58,7 @@ def download_url(
52
58
  headers: dict[str, str] | None = None,
53
59
  ) -> None:
54
60
  """
55
- Download given file, optionally with progress bar.
61
+ Download given file, optionally with progress bar, streaming to a target file.
56
62
  Also handles file:// and s3:// URLs. Output file is created atomically.
57
63
  Raise httpx.HTTPError for non-2xx responses.
58
64
  """
@@ -73,13 +79,15 @@ def download_url(
73
79
  client = session or httpx.Client(follow_redirects=True, timeout=timeout)
74
80
  response: httpx.Response | None = None
75
81
  try:
82
+ headers = headers or default_headers()
83
+ log.debug("download_url: using headers: %s", headers)
76
84
  with client.stream(
77
85
  "GET",
78
86
  url,
79
87
  follow_redirects=True,
80
88
  timeout=timeout,
81
89
  auth=auth,
82
- headers=headers or default_headers(),
90
+ headers=headers,
83
91
  ) as response:
84
92
  response.raise_for_status()
85
93
  total_size = int(response.headers.get("content-length", "0"))
kash/web_gen/__init__.py CHANGED
@@ -1,4 +0,0 @@
1
- from pathlib import Path
2
-
3
- base_templates_dir = Path(__file__).parent / "templates"
4
- """Common base web page templates."""
@@ -0,0 +1,52 @@
1
+ from kash.model.items_model import Item
2
+ from kash.utils.file_utils.file_formats_model import Format
3
+ from kash.web_gen.template_render import render_web_template
4
+
5
+
6
+ def simple_webpage_render(
7
+ item: Item,
8
+ page_template: str = "simple_webpage.html.jinja",
9
+ add_title_h1: bool = True,
10
+ ) -> str:
11
+ """
12
+ Generate a simple web page from a single item.
13
+ If `add_title_h1` is True, the title will be inserted as an h1 heading above the body.
14
+ """
15
+ return render_web_template(
16
+ template_filename=page_template,
17
+ data={
18
+ "title": item.title,
19
+ "add_title_h1": add_title_h1,
20
+ "content_html": item.body_as_html(),
21
+ "thumbnail_url": item.thumbnail_url,
22
+ },
23
+ )
24
+
25
+
26
+ ## Tests
27
+
28
+
29
+ def test_render():
30
+ import os
31
+
32
+ from kash.model.items_model import ItemType
33
+
34
+ # Create a test item
35
+ item = Item(
36
+ type=ItemType.doc,
37
+ format=Format.html,
38
+ title="A Simple Web Page",
39
+ body="<p>This is a simple web page with <b>HTML content</b>.</p>",
40
+ )
41
+
42
+ # Generate HTML
43
+ html = simple_webpage_render(item)
44
+
45
+ os.makedirs("tmp", exist_ok=True)
46
+ with open("tmp/simple_webpage.html", "w") as f:
47
+ f.write(html)
48
+ print("Rendered simple webpage to tmp/simple_webpage.html")
49
+
50
+ # Basic validation
51
+ assert item.title and item.title in html
52
+ assert "<b>HTML content</b>" in html
@@ -2,6 +2,7 @@ import os
2
2
  from dataclasses import asdict, dataclass
3
3
 
4
4
  from frontmatter_format import read_yaml_file, to_yaml_string, write_yaml_file
5
+ from prettyfmt import abbrev_on_words, sanitize_title
5
6
 
6
7
  from kash.config.logger import get_logger
7
8
  from kash.exec.preconditions import has_thumbnail_url
@@ -11,7 +12,6 @@ from kash.model.paths_model import StorePath
11
12
  from kash.utils.common.type_utils import as_dataclass, not_none
12
13
  from kash.utils.errors import NoMatch
13
14
  from kash.utils.file_utils.file_formats_model import Format
14
- from kash.web_gen import base_templates_dir
15
15
  from kash.web_gen.template_render import render_web_template
16
16
  from kash.workspaces import current_ws
17
17
  from kash.workspaces.source_items import find_upstream_item
@@ -33,6 +33,7 @@ class TabbedWebpage:
33
33
  title: str
34
34
  tabs: list[TabInfo]
35
35
  show_tabs: bool = True
36
+ add_title_h1: bool = True
36
37
 
37
38
 
38
39
  def _fill_in_ids(tabs: list[TabInfo]):
@@ -41,7 +42,9 @@ def _fill_in_ids(tabs: list[TabInfo]):
41
42
  tab.id = f"tab_{i}"
42
43
 
43
44
 
44
- def webpage_config(items: list[Item]) -> Item:
45
+ def tabbed_webpage_config(
46
+ items: list[Item], clean_headings: bool = False, add_title_h1: bool = True
47
+ ) -> Item:
45
48
  """
46
49
  Get an item with the config for a tabbed web page.
47
50
  """
@@ -57,9 +60,15 @@ def webpage_config(items: list[Item]) -> Item:
57
60
  log.warning("Item has no thumbnail URL: %s", item)
58
61
  return None
59
62
 
63
+ def clean_label(label: str) -> str:
64
+ if clean_headings:
65
+ return clean_heading(label)
66
+ else:
67
+ return abbrev_on_words(sanitize_title(label), max_len=40)
68
+
60
69
  tabs = [
61
70
  TabInfo(
62
- label=clean_heading(item.abbrev_title()),
71
+ label=clean_label(item.abbrev_title()),
63
72
  store_path=item.store_path,
64
73
  thumbnail_url=get_thumbnail_url(item),
65
74
  )
@@ -67,7 +76,9 @@ def webpage_config(items: list[Item]) -> Item:
67
76
  ]
68
77
  _fill_in_ids(tabs)
69
78
  title = summary_heading([item.abbrev_title() for item in items])
70
- config = TabbedWebpage(title=title, tabs=tabs, show_tabs=len(tabs) > 1)
79
+ config = TabbedWebpage(
80
+ title=title, tabs=tabs, show_tabs=len(tabs) > 1, add_title_h1=add_title_h1
81
+ )
71
82
 
72
83
  config_item = Item(
73
84
  title=f"{title} (config)",
@@ -88,7 +99,9 @@ def _load_tab_content(config: TabbedWebpage):
88
99
  tab.content_html = html
89
100
 
90
101
 
91
- def webpage_generate(config_item: Item) -> str:
102
+ def tabbed_webpage_generate(
103
+ config_item: Item, page_template: str = "base_webpage.html.jinja", add_title_h1: bool = True
104
+ ) -> str:
92
105
  """
93
106
  Generate a web page using the supplied config.
94
107
  """
@@ -98,14 +111,15 @@ def webpage_generate(config_item: Item) -> str:
98
111
  _load_tab_content(tabbed_webpage)
99
112
 
100
113
  content = render_web_template(
101
- base_templates_dir, "tabbed_webpage.html.jinja", asdict(tabbed_webpage)
114
+ template_filename="tabbed_webpage.html.jinja",
115
+ data=asdict(tabbed_webpage),
102
116
  )
103
117
 
104
118
  return render_web_template(
105
- base_templates_dir,
106
- "base_webpage.html.jinja",
107
- {
119
+ page_template,
120
+ data={
108
121
  "title": tabbed_webpage.title,
122
+ "add_title_h1": add_title_h1,
109
123
  "content": content,
110
124
  },
111
125
  )
@@ -135,11 +149,7 @@ def test_render():
135
149
  new_config = as_dataclass(read_yaml_file("tmp/webpage_config.yaml"), TabbedWebpage)
136
150
  assert new_config == config
137
151
 
138
- html = render_web_template(
139
- base_templates_dir,
140
- "tabbed_webpage.html.jinja",
141
- asdict(config),
142
- )
152
+ html = render_web_template(template_filename="tabbed_webpage.html.jinja", data=asdict(config))
143
153
  with open("tmp/webpage.html", "w") as f:
144
154
  f.write(html)
145
155
  print("Rendered tabbed webpage to tmp/webpage.html")