kash-shell 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/__init__.py +4 -4
- kash/actions/core/format_markdown_template.py +2 -5
- kash/actions/core/markdownify.py +7 -6
- kash/actions/core/readability.py +7 -6
- kash/actions/core/render_as_html.py +37 -0
- kash/actions/core/show_webpage.py +6 -11
- kash/actions/core/strip_html.py +2 -6
- kash/actions/core/tabbed_webpage_config.py +31 -0
- kash/actions/core/{webpage_generate.py → tabbed_webpage_generate.py} +5 -4
- kash/commands/__init__.py +8 -20
- kash/commands/base/basic_file_commands.py +15 -0
- kash/commands/base/debug_commands.py +13 -0
- kash/commands/base/files_command.py +28 -10
- kash/commands/base/general_commands.py +21 -16
- kash/commands/base/logs_commands.py +4 -2
- kash/commands/base/model_commands.py +8 -8
- kash/commands/base/search_command.py +3 -2
- kash/commands/base/show_command.py +5 -3
- kash/commands/extras/parse_uv_lock.py +186 -0
- kash/commands/help/doc_commands.py +2 -31
- kash/commands/help/welcome.py +33 -0
- kash/commands/workspace/selection_commands.py +11 -6
- kash/commands/workspace/workspace_commands.py +19 -17
- kash/config/colors.py +3 -1
- kash/config/env_settings.py +14 -1
- kash/config/init.py +2 -2
- kash/config/logger.py +59 -56
- kash/config/logger_basic.py +3 -3
- kash/config/settings.py +116 -57
- kash/config/setup.py +28 -12
- kash/config/text_styles.py +3 -13
- kash/docs/load_api_docs.py +2 -1
- kash/docs/markdown/topics/a3_getting_started.md +3 -2
- kash/{concepts → embeddings}/text_similarity.py +2 -2
- kash/exec/__init__.py +20 -3
- kash/exec/action_decorators.py +24 -10
- kash/exec/action_exec.py +41 -23
- kash/exec/action_registry.py +13 -48
- kash/exec/command_registry.py +2 -1
- kash/exec/fetch_url_metadata.py +4 -6
- kash/exec/importing.py +56 -0
- kash/exec/llm_transforms.py +12 -10
- kash/exec/precondition_registry.py +2 -1
- kash/exec/preconditions.py +22 -1
- kash/exec/resolve_args.py +4 -0
- kash/exec/shell_callable_action.py +33 -19
- kash/file_storage/file_store.py +42 -27
- kash/file_storage/item_file_format.py +5 -2
- kash/file_storage/metadata_dirs.py +11 -2
- kash/help/assistant.py +1 -1
- kash/help/assistant_instructions.py +2 -1
- kash/help/function_param_info.py +1 -1
- kash/help/help_embeddings.py +2 -2
- kash/help/help_printing.py +7 -11
- kash/llm_utils/clean_headings.py +1 -1
- kash/llm_utils/llm_api_keys.py +4 -4
- kash/llm_utils/llm_features.py +68 -0
- kash/llm_utils/llm_messages.py +1 -2
- kash/llm_utils/llm_names.py +1 -1
- kash/llm_utils/llms.py +8 -3
- kash/local_server/__init__.py +5 -2
- kash/local_server/local_server.py +8 -5
- kash/local_server/local_server_commands.py +2 -2
- kash/local_server/local_server_routes.py +1 -7
- kash/local_server/local_url_formatters.py +1 -1
- kash/mcp/__init__.py +5 -2
- kash/mcp/mcp_cli.py +5 -5
- kash/mcp/mcp_server_commands.py +5 -5
- kash/mcp/mcp_server_routes.py +5 -5
- kash/mcp/mcp_server_sse.py +4 -2
- kash/media_base/media_cache.py +8 -8
- kash/media_base/media_services.py +1 -1
- kash/media_base/media_tools.py +6 -6
- kash/media_base/services/local_file_media.py +2 -2
- kash/media_base/{speech_transcription.py → transcription_deepgram.py} +25 -110
- kash/media_base/transcription_format.py +73 -0
- kash/media_base/transcription_whisper.py +38 -0
- kash/model/__init__.py +73 -5
- kash/model/actions_model.py +38 -4
- kash/model/concept_model.py +30 -0
- kash/model/items_model.py +115 -32
- kash/model/params_model.py +24 -0
- kash/shell/completions/completion_scoring.py +37 -5
- kash/shell/output/kerm_codes.py +1 -2
- kash/shell/output/shell_formatting.py +14 -4
- kash/shell/shell_main.py +2 -2
- kash/shell/utils/exception_printing.py +6 -0
- kash/shell/utils/native_utils.py +26 -20
- kash/shell/utils/shell_function_wrapper.py +15 -15
- kash/text_handling/custom_sliding_transforms.py +12 -4
- kash/text_handling/doc_normalization.py +6 -2
- kash/text_handling/markdown_render.py +118 -0
- kash/text_handling/markdown_utils.py +226 -0
- kash/utils/common/function_inspect.py +360 -110
- kash/utils/common/import_utils.py +12 -3
- kash/utils/common/type_utils.py +0 -29
- kash/utils/common/url.py +27 -3
- kash/utils/errors.py +6 -0
- kash/utils/file_utils/file_ext.py +4 -0
- kash/utils/file_utils/file_formats.py +2 -2
- kash/utils/file_utils/file_formats_model.py +20 -1
- kash/web_content/dir_store.py +1 -2
- kash/web_content/file_cache_utils.py +37 -10
- kash/web_content/file_processing.py +68 -0
- kash/web_content/local_file_cache.py +12 -9
- kash/web_content/web_extract.py +8 -3
- kash/web_content/web_fetch.py +12 -4
- kash/web_gen/__init__.py +0 -4
- kash/web_gen/simple_webpage.py +52 -0
- kash/web_gen/tabbed_webpage.py +24 -14
- kash/web_gen/template_render.py +37 -2
- kash/web_gen/templates/base_styles.css.jinja +169 -43
- kash/web_gen/templates/base_webpage.html.jinja +110 -45
- kash/web_gen/templates/content_styles.css.jinja +4 -2
- kash/web_gen/templates/item_view.html.jinja +49 -39
- kash/web_gen/templates/simple_webpage.html.jinja +24 -0
- kash/web_gen/templates/tabbed_webpage.html.jinja +42 -33
- kash/workspaces/__init__.py +15 -2
- kash/workspaces/selections.py +18 -3
- kash/workspaces/source_items.py +0 -1
- kash/workspaces/workspaces.py +5 -11
- kash/xonsh_custom/command_nl_utils.py +40 -19
- kash/xonsh_custom/custom_shell.py +43 -11
- kash/xonsh_custom/customize_prompt.py +39 -21
- kash/xonsh_custom/load_into_xonsh.py +22 -25
- kash/xonsh_custom/shell_load_commands.py +2 -2
- kash/xonsh_custom/xonsh_completers.py +2 -249
- kash/xonsh_custom/xonsh_keybindings.py +282 -0
- kash/xonsh_custom/xonsh_modern_tools.py +3 -3
- kash/xontrib/kash_extension.py +5 -6
- {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/METADATA +10 -8
- {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/RECORD +137 -136
- kash/actions/core/webpage_config.py +0 -21
- kash/concepts/concept_formats.py +0 -23
- kash/shell/clideps/api_keys.py +0 -100
- kash/shell/clideps/dotenv_setup.py +0 -115
- kash/shell/clideps/dotenv_utils.py +0 -98
- kash/shell/clideps/pkg_deps.py +0 -257
- kash/shell/clideps/platforms.py +0 -11
- kash/shell/clideps/terminal_features.py +0 -56
- kash/shell/utils/osc_utils.py +0 -95
- kash/shell/utils/terminal_images.py +0 -133
- kash/text_handling/markdown_util.py +0 -167
- kash/utils/common/atomic_var.py +0 -171
- kash/utils/common/string_replace.py +0 -93
- kash/utils/common/string_template.py +0 -101
- /kash/{concepts → embeddings}/cosine.py +0 -0
- /kash/{concepts → embeddings}/embeddings.py +0 -0
- {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/entry_points.txt +0 -0
- {kash_shell-0.3.9.dist-info → kash_shell-0.3.11.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,9 +4,9 @@ from pathlib import Path
|
|
|
4
4
|
from typing import NewType
|
|
5
5
|
|
|
6
6
|
import regex
|
|
7
|
+
from clideps.pkgs.pkg_check import pkg_check
|
|
7
8
|
|
|
8
9
|
from kash.config.logger import get_logger
|
|
9
|
-
from kash.shell.clideps.pkg_deps import Pkg, pkg_check
|
|
10
10
|
|
|
11
11
|
log = get_logger(__name__)
|
|
12
12
|
|
|
@@ -86,7 +86,7 @@ def detect_mime_type(filename: str | Path) -> MimeType | None:
|
|
|
86
86
|
Get the mime type of a file using libmagic heuristics plus more careful
|
|
87
87
|
detection of HTML, Markdown, and multipart YAML.
|
|
88
88
|
"""
|
|
89
|
-
pkg_check().require(
|
|
89
|
+
pkg_check().require("libmagic")
|
|
90
90
|
import magic
|
|
91
91
|
|
|
92
92
|
mime = magic.Magic(mime=True)
|
|
@@ -36,6 +36,8 @@ class Format(Enum):
|
|
|
36
36
|
it is the format of the resource (url, media, etc.).
|
|
37
37
|
"""
|
|
38
38
|
|
|
39
|
+
# TODO: Be more thorough, pulling in relevant extensions and types from the `mimetypes` module.
|
|
40
|
+
|
|
39
41
|
# Formats with no body (content is in frontmatter).
|
|
40
42
|
url = "url"
|
|
41
43
|
|
|
@@ -46,6 +48,7 @@ class Format(Enum):
|
|
|
46
48
|
"""`md_html` is Markdown with HTML, used for example when we structure Markdown with divs."""
|
|
47
49
|
html = "html"
|
|
48
50
|
"""`markdown` should be simple and clean Markdown that we can use with LLMs."""
|
|
51
|
+
epub = "epub"
|
|
49
52
|
yaml = "yaml"
|
|
50
53
|
diff = "diff"
|
|
51
54
|
python = "python"
|
|
@@ -54,12 +57,14 @@ class Format(Enum):
|
|
|
54
57
|
xonsh = "xonsh"
|
|
55
58
|
json = "json"
|
|
56
59
|
csv = "csv"
|
|
60
|
+
xlsx = "xlsx"
|
|
57
61
|
npz = "npz"
|
|
58
62
|
log = "log"
|
|
59
63
|
|
|
60
64
|
# Media formats.
|
|
61
65
|
pdf = "pdf"
|
|
62
66
|
docx = "docx"
|
|
67
|
+
pptx = "pptx"
|
|
63
68
|
jpeg = "jpeg"
|
|
64
69
|
png = "png"
|
|
65
70
|
gif = "gif"
|
|
@@ -106,6 +111,7 @@ class Format(Enum):
|
|
|
106
111
|
self.html,
|
|
107
112
|
self.pdf,
|
|
108
113
|
self.docx,
|
|
114
|
+
self.pptx,
|
|
109
115
|
]
|
|
110
116
|
|
|
111
117
|
@property
|
|
@@ -126,7 +132,7 @@ class Format(Enum):
|
|
|
126
132
|
|
|
127
133
|
@property
|
|
128
134
|
def is_data(self) -> bool:
|
|
129
|
-
return self in [self.csv, self.npz]
|
|
135
|
+
return self in [self.csv, self.xlsx, self.npz]
|
|
130
136
|
|
|
131
137
|
@property
|
|
132
138
|
def is_binary(self) -> bool:
|
|
@@ -146,6 +152,7 @@ class Format(Enum):
|
|
|
146
152
|
self.markdown,
|
|
147
153
|
self.md_html,
|
|
148
154
|
self.html,
|
|
155
|
+
self.json, # Not strictly true but we encourage use of comments.
|
|
149
156
|
self.yaml,
|
|
150
157
|
self.diff,
|
|
151
158
|
self.python,
|
|
@@ -163,6 +170,7 @@ class Format(Enum):
|
|
|
163
170
|
Format.markdown: MediaType.text,
|
|
164
171
|
Format.md_html: MediaType.text,
|
|
165
172
|
Format.html: MediaType.webpage,
|
|
173
|
+
Format.epub: MediaType.text,
|
|
166
174
|
Format.yaml: MediaType.text,
|
|
167
175
|
Format.diff: MediaType.text,
|
|
168
176
|
Format.python: MediaType.text,
|
|
@@ -172,11 +180,13 @@ class Format(Enum):
|
|
|
172
180
|
Format.csv: MediaType.text,
|
|
173
181
|
Format.log: MediaType.text,
|
|
174
182
|
Format.pdf: MediaType.text,
|
|
183
|
+
Format.xlsx: MediaType.text,
|
|
175
184
|
Format.jpeg: MediaType.image,
|
|
176
185
|
Format.png: MediaType.image,
|
|
177
186
|
Format.gif: MediaType.image,
|
|
178
187
|
Format.svg: MediaType.image,
|
|
179
188
|
Format.docx: MediaType.text,
|
|
189
|
+
Format.pptx: MediaType.text,
|
|
180
190
|
Format.mp3: MediaType.audio,
|
|
181
191
|
Format.m4a: MediaType.audio,
|
|
182
192
|
Format.mp4: MediaType.video,
|
|
@@ -197,6 +207,7 @@ class Format(Enum):
|
|
|
197
207
|
FileExt.diff.value: Format.diff,
|
|
198
208
|
FileExt.json.value: Format.json,
|
|
199
209
|
FileExt.csv.value: Format.csv,
|
|
210
|
+
FileExt.xlsx.value: Format.xlsx,
|
|
200
211
|
FileExt.npz.value: Format.npz,
|
|
201
212
|
FileExt.log.value: Format.log,
|
|
202
213
|
FileExt.py.value: Format.python,
|
|
@@ -204,6 +215,7 @@ class Format(Enum):
|
|
|
204
215
|
FileExt.xsh.value: Format.xonsh,
|
|
205
216
|
FileExt.pdf.value: Format.pdf,
|
|
206
217
|
FileExt.docx.value: Format.docx,
|
|
218
|
+
FileExt.pptx.value: Format.pptx,
|
|
207
219
|
FileExt.jpg.value: Format.jpeg,
|
|
208
220
|
FileExt.png.value: Format.png,
|
|
209
221
|
FileExt.gif.value: Format.gif,
|
|
@@ -211,6 +223,7 @@ class Format(Enum):
|
|
|
211
223
|
FileExt.mp3.value: Format.mp3,
|
|
212
224
|
FileExt.m4a.value: Format.m4a,
|
|
213
225
|
FileExt.mp4.value: Format.mp4,
|
|
226
|
+
FileExt.epub.value: Format.epub,
|
|
214
227
|
}
|
|
215
228
|
return ext_to_format.get(file_ext.value, None)
|
|
216
229
|
|
|
@@ -225,10 +238,12 @@ class Format(Enum):
|
|
|
225
238
|
Format.md_html: FileExt.md,
|
|
226
239
|
Format.html: FileExt.html,
|
|
227
240
|
Format.plaintext: FileExt.txt,
|
|
241
|
+
Format.epub: FileExt.epub,
|
|
228
242
|
Format.yaml: FileExt.yml,
|
|
229
243
|
Format.diff: FileExt.diff,
|
|
230
244
|
Format.json: FileExt.json,
|
|
231
245
|
Format.csv: FileExt.csv,
|
|
246
|
+
Format.xlsx: FileExt.xlsx,
|
|
232
247
|
Format.npz: FileExt.npz,
|
|
233
248
|
Format.log: FileExt.log,
|
|
234
249
|
Format.python: FileExt.py,
|
|
@@ -236,6 +251,7 @@ class Format(Enum):
|
|
|
236
251
|
Format.xonsh: FileExt.xsh,
|
|
237
252
|
Format.pdf: FileExt.pdf,
|
|
238
253
|
Format.docx: FileExt.docx,
|
|
254
|
+
Format.pptx: FileExt.pptx,
|
|
239
255
|
Format.jpeg: FileExt.jpg,
|
|
240
256
|
Format.png: FileExt.png,
|
|
241
257
|
Format.gif: FileExt.gif,
|
|
@@ -257,6 +273,7 @@ class Format(Enum):
|
|
|
257
273
|
"text/html": Format.html,
|
|
258
274
|
"text/diff": Format.diff,
|
|
259
275
|
"text/x-diff": Format.diff,
|
|
276
|
+
"application/epub+zip": Format.epub,
|
|
260
277
|
"application/yaml": Format.yaml,
|
|
261
278
|
"application/x-yaml": Format.yaml,
|
|
262
279
|
"text/x-python": Format.python,
|
|
@@ -266,9 +283,11 @@ class Format(Enum):
|
|
|
266
283
|
"text/x-xonsh": Format.xonsh,
|
|
267
284
|
"application/json": Format.json,
|
|
268
285
|
"text/csv": Format.csv,
|
|
286
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": Format.xlsx,
|
|
269
287
|
"application/x-npz": Format.npz,
|
|
270
288
|
"application/pdf": Format.pdf,
|
|
271
289
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": Format.docx,
|
|
290
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": Format.pptx,
|
|
272
291
|
"image/jpeg": Format.jpeg,
|
|
273
292
|
"image/png": Format.png,
|
|
274
293
|
"image/gif": Format.gif,
|
kash/web_content/dir_store.py
CHANGED
|
@@ -87,8 +87,7 @@ class DirStore:
|
|
|
87
87
|
self, keys: list[str | Path], folder: str | None = None, suffix: str | None = None
|
|
88
88
|
) -> dict[str | Path, Path | None]:
|
|
89
89
|
"""
|
|
90
|
-
Look up all existing cached results for the set of keys.
|
|
91
|
-
be optimized for large batches.
|
|
90
|
+
Look up all existing cached results for the set of keys.
|
|
92
91
|
"""
|
|
93
92
|
return {key: self.find(key, folder=folder, suffix=suffix) for key in keys}
|
|
94
93
|
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from collections.abc import Callable
|
|
1
3
|
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
2
5
|
|
|
3
6
|
from prettyfmt import fmt_lines, fmt_path
|
|
4
7
|
|
|
@@ -35,18 +38,40 @@ def reset_content_cache_dir(path: Path):
|
|
|
35
38
|
log.info("Using web cache: %s", fmt_path(path))
|
|
36
39
|
|
|
37
40
|
|
|
38
|
-
def cache_file(
|
|
41
|
+
def cache_file(
|
|
42
|
+
source: Url | Path | Loadable, global_cache: bool = False, expiration_sec: float | None = None
|
|
43
|
+
) -> tuple[Path, bool]:
|
|
39
44
|
"""
|
|
40
45
|
Return a local cached copy of the item. If it is an URL, content is fetched.
|
|
41
|
-
|
|
42
|
-
|
|
46
|
+
If it is a Path or a Loadable, a cached copy is returned.
|
|
47
|
+
LocalFileCache uses httpx so httpx.HTTPError is raised for non-2xx responses.
|
|
48
|
+
|
|
49
|
+
Uses the current content cache unless there is no current cache or `global_cache` is True,
|
|
50
|
+
in which case the global cache is used.
|
|
43
51
|
"""
|
|
44
52
|
cache = _global_content_cache if global_cache else _content_cache
|
|
45
|
-
path, was_cached = cache.cache(source)
|
|
53
|
+
path, was_cached = cache.cache(source, expiration_sec)
|
|
46
54
|
return path, was_cached
|
|
47
55
|
|
|
48
56
|
|
|
49
|
-
def
|
|
57
|
+
def cache_api_response(
|
|
58
|
+
url: Url,
|
|
59
|
+
global_cache: bool = False,
|
|
60
|
+
expiration_sec: float | None = None,
|
|
61
|
+
parser: Callable[[str], Any] = json.loads,
|
|
62
|
+
) -> tuple[Any, bool]:
|
|
63
|
+
"""
|
|
64
|
+
Cache an API response. By default parse the response as JSON.
|
|
65
|
+
"""
|
|
66
|
+
cache = _global_content_cache if global_cache else _content_cache
|
|
67
|
+
path, was_cached = cache.cache(url, expiration_sec)
|
|
68
|
+
result = parser(path.read_text())
|
|
69
|
+
return result, was_cached
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def cache_resource(
|
|
73
|
+
item: Item, global_cache: bool = False, expiration_sec: float | None = None
|
|
74
|
+
) -> dict[MediaType, Path]:
|
|
50
75
|
"""
|
|
51
76
|
Cache a resource item for an external local path or a URL, fetching or
|
|
52
77
|
copying as needed. For media this may yield more than one format.
|
|
@@ -64,17 +89,17 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
|
|
|
64
89
|
if is_media_url(item.url):
|
|
65
90
|
result = cache_media(item.url)
|
|
66
91
|
else:
|
|
67
|
-
path, _was_cached = cache_file(item.url)
|
|
92
|
+
path, _was_cached = cache_file(item.url, global_cache, expiration_sec)
|
|
68
93
|
elif item.external_path:
|
|
69
94
|
path = Path(item.external_path)
|
|
70
95
|
if not path.is_file():
|
|
71
96
|
raise FileNotFound(f"External path not found: {path}")
|
|
72
|
-
path, _was_cached = cache_file(path)
|
|
97
|
+
path, _was_cached = cache_file(path, global_cache, expiration_sec)
|
|
73
98
|
elif item.original_filename:
|
|
74
99
|
path = Path(item.original_filename)
|
|
75
100
|
if not path.is_file():
|
|
76
101
|
raise FileNotFound(f"Original filename not found: {path}")
|
|
77
|
-
path, _was_cached = cache_file(path)
|
|
102
|
+
path, _was_cached = cache_file(path, global_cache, expiration_sec)
|
|
78
103
|
else:
|
|
79
104
|
raise ValueError(f"Item has no URL or external path: {item}")
|
|
80
105
|
|
|
@@ -94,7 +119,9 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
|
|
|
94
119
|
return result
|
|
95
120
|
|
|
96
121
|
|
|
97
|
-
def get_url_html(
|
|
122
|
+
def get_url_html(
|
|
123
|
+
item: Item, global_cache: bool = False, expiration_sec: float | None = None
|
|
124
|
+
) -> tuple[Url, str]:
|
|
98
125
|
"""
|
|
99
126
|
Returns the HTML content of an URL item, using the content cache,
|
|
100
127
|
or the body of the item if it has a URL and HTML body.
|
|
@@ -106,7 +133,7 @@ def get_url_html(item: Item) -> tuple[Url, str]:
|
|
|
106
133
|
url = Url(canonicalize_url(item.url))
|
|
107
134
|
|
|
108
135
|
if is_url_item(item):
|
|
109
|
-
path, _was_cached = cache_file(url)
|
|
136
|
+
path, _was_cached = cache_file(url, global_cache, expiration_sec)
|
|
110
137
|
with open(path) as file:
|
|
111
138
|
html_content = file.read()
|
|
112
139
|
else:
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Mapping
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TypeAlias
|
|
7
|
+
|
|
8
|
+
from kash.web_content.local_file_cache import read_mtime
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
|
|
12
|
+
class OutputType:
|
|
13
|
+
"""
|
|
14
|
+
A type of output file, represented by the filename suffix, e.g. '.mp3', '.txt', etc.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
suffix: str
|
|
18
|
+
|
|
19
|
+
def output_path(self, src: Path) -> Path:
|
|
20
|
+
"""
|
|
21
|
+
Resolve the output path. Will be next to the source file, e.g.
|
|
22
|
+
some-dir/video.mp4 -> some-dir/video.mp3
|
|
23
|
+
"""
|
|
24
|
+
return src.with_suffix(self.suffix)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
Processor: TypeAlias = Callable[[Path, Mapping[OutputType, Path]], None]
|
|
28
|
+
"""
|
|
29
|
+
A function that takes a source file and a mapping with one or more output paths.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
|
|
34
|
+
class FileProcess:
|
|
35
|
+
"""
|
|
36
|
+
Process a file and produce one or more outputs.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
processor: Processor
|
|
40
|
+
outputs: list[OutputType]
|
|
41
|
+
|
|
42
|
+
def is_outdated(self, src: Path) -> bool:
|
|
43
|
+
"""
|
|
44
|
+
True when any output is missing or older (earliest mtime) than `src`.
|
|
45
|
+
"""
|
|
46
|
+
dests = {o.output_path(src) for o in self.outputs}
|
|
47
|
+
if any(not p.exists() for p in dests):
|
|
48
|
+
return True
|
|
49
|
+
earliest = min(read_mtime(p) for p in dests)
|
|
50
|
+
return read_mtime(src) > earliest
|
|
51
|
+
|
|
52
|
+
def run(self, src: Path) -> dict[OutputType, Path]:
|
|
53
|
+
"""
|
|
54
|
+
Run unconditionally and return a mapping of outputs to paths.
|
|
55
|
+
"""
|
|
56
|
+
dests = {o: o.output_path(src) for o in self.outputs}
|
|
57
|
+
self.processor(src, dests)
|
|
58
|
+
return dests
|
|
59
|
+
|
|
60
|
+
def run_if_needed(self, src: Path) -> dict[OutputType, Path]:
|
|
61
|
+
"""
|
|
62
|
+
Run only if any output is missing or outdated.
|
|
63
|
+
"""
|
|
64
|
+
return (
|
|
65
|
+
self.run(src)
|
|
66
|
+
if self.is_outdated(src)
|
|
67
|
+
else {o: o.output_path(src) for o in self.outputs}
|
|
68
|
+
)
|
|
@@ -11,7 +11,7 @@ from prettyfmt import fmt_path
|
|
|
11
11
|
from strif import atomic_output_file, copyfile_atomic
|
|
12
12
|
|
|
13
13
|
from kash.utils.common.url import Url, is_file_url, is_url, normalize_url, parse_file_url
|
|
14
|
-
from kash.utils.errors import FileNotFound
|
|
14
|
+
from kash.utils.errors import FileNotFound
|
|
15
15
|
from kash.utils.file_utils.file_formats_model import choose_file_ext
|
|
16
16
|
from kash.web_content.dir_store import DirStore
|
|
17
17
|
from kash.web_content.web_fetch import download_url
|
|
@@ -56,19 +56,21 @@ class Loadable:
|
|
|
56
56
|
|
|
57
57
|
key: str
|
|
58
58
|
"""
|
|
59
|
-
The unique identifier for the item.
|
|
60
|
-
|
|
59
|
+
The unique identifier for the item. Used when creating unique cache filenames,
|
|
60
|
+
as is or with added suffixes.
|
|
61
61
|
"""
|
|
62
62
|
|
|
63
63
|
save: Callable[[Path], None]
|
|
64
64
|
"""
|
|
65
65
|
Method that saves the item to the given path. Caller will handle path selection
|
|
66
|
-
and atomicity of file creation.
|
|
66
|
+
and atomicity of file creation. Raise an exception if the item cannot be saved.
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
69
|
|
|
70
70
|
Cacheable = Url | Path | Loadable
|
|
71
|
-
"""
|
|
71
|
+
"""
|
|
72
|
+
An item that can be cached as a file.
|
|
73
|
+
"""
|
|
72
74
|
|
|
73
75
|
|
|
74
76
|
def _suffix_for(cacheable: Cacheable) -> str | None:
|
|
@@ -151,9 +153,7 @@ class LocalFileCache(DirStore):
|
|
|
151
153
|
if isinstance(url_or_path, Path):
|
|
152
154
|
file_path = url_or_path
|
|
153
155
|
else:
|
|
154
|
-
parsed = parse_file_url(url_or_path)
|
|
155
|
-
if not parsed:
|
|
156
|
-
raise InvalidInput(f"Not a file URL: {url_or_path}")
|
|
156
|
+
parsed = parse_file_url(url_or_path) # Raises ValueError if not a file URL.
|
|
157
157
|
file_path = parsed
|
|
158
158
|
if not file_path.exists():
|
|
159
159
|
raise FileNotFound(f"File not found: {file_path}")
|
|
@@ -173,7 +173,10 @@ class LocalFileCache(DirStore):
|
|
|
173
173
|
) as tmp_path:
|
|
174
174
|
source.save(tmp_path)
|
|
175
175
|
if not cache_path.exists():
|
|
176
|
-
|
|
176
|
+
# The source should have raised an exception if it failed to save.
|
|
177
|
+
raise InvalidCacheState(
|
|
178
|
+
f"Loadable source failed to save to cache: {source}: {cache_path}"
|
|
179
|
+
)
|
|
177
180
|
else:
|
|
178
181
|
raise ValueError(f"Invalid source: {source}")
|
|
179
182
|
|
kash/web_content/web_extract.py
CHANGED
|
@@ -10,14 +10,19 @@ from kash.web_content.web_page_model import PageExtractor, WebPageData
|
|
|
10
10
|
|
|
11
11
|
@log_calls(level="message")
|
|
12
12
|
def fetch_extract(
|
|
13
|
-
url: Url,
|
|
13
|
+
url: Url,
|
|
14
|
+
refetch: bool = False,
|
|
15
|
+
use_cache: bool = True,
|
|
16
|
+
extractor: PageExtractor = extract_text_justext,
|
|
14
17
|
) -> WebPageData:
|
|
15
18
|
"""
|
|
16
19
|
Fetches a URL and extracts the title, description, and content.
|
|
20
|
+
By default, uses the content cache if available. Can force re-fetching and
|
|
21
|
+
updating the cache by setting `refetch` to true.
|
|
17
22
|
"""
|
|
18
|
-
|
|
23
|
+
expiration_sec = 0 if refetch else None
|
|
19
24
|
if use_cache:
|
|
20
|
-
path, _was_cached = cache_file(url)
|
|
25
|
+
path, _was_cached = cache_file(url, expiration_sec=expiration_sec)
|
|
21
26
|
with open(path, "rb") as file:
|
|
22
27
|
content = file.read()
|
|
23
28
|
page_data = extractor(url, content)
|
kash/web_content/web_fetch.py
CHANGED
|
@@ -7,17 +7,22 @@ import httpx
|
|
|
7
7
|
from strif import atomic_output_file, copyfile_atomic
|
|
8
8
|
from tqdm import tqdm
|
|
9
9
|
|
|
10
|
+
from kash.config.env_settings import KashEnv
|
|
10
11
|
from kash.utils.common.url import Url
|
|
11
12
|
|
|
12
13
|
log = logging.getLogger(__name__)
|
|
13
14
|
|
|
14
|
-
USER_AGENT = "Mozilla/5.0 (Compatible)"
|
|
15
15
|
|
|
16
16
|
DEFAULT_TIMEOUT = 30
|
|
17
17
|
|
|
18
18
|
|
|
19
|
+
DEFAULT_USER_AGENT = (
|
|
20
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0"
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
19
24
|
def default_headers() -> dict[str, str]:
|
|
20
|
-
return {"User-Agent":
|
|
25
|
+
return {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=DEFAULT_USER_AGENT)}
|
|
21
26
|
|
|
22
27
|
|
|
23
28
|
def fetch_url(
|
|
@@ -36,6 +41,7 @@ def fetch_url(
|
|
|
36
41
|
auth=auth,
|
|
37
42
|
headers=headers or default_headers(),
|
|
38
43
|
) as client:
|
|
44
|
+
log.debug("fetch_url: using headers: %s", client.headers)
|
|
39
45
|
response = client.get(url)
|
|
40
46
|
log.info("Fetched: %s (%s bytes): %s", response.status_code, len(response.content), url)
|
|
41
47
|
response.raise_for_status()
|
|
@@ -52,7 +58,7 @@ def download_url(
|
|
|
52
58
|
headers: dict[str, str] | None = None,
|
|
53
59
|
) -> None:
|
|
54
60
|
"""
|
|
55
|
-
Download given file, optionally with progress bar.
|
|
61
|
+
Download given file, optionally with progress bar, streaming to a target file.
|
|
56
62
|
Also handles file:// and s3:// URLs. Output file is created atomically.
|
|
57
63
|
Raise httpx.HTTPError for non-2xx responses.
|
|
58
64
|
"""
|
|
@@ -73,13 +79,15 @@ def download_url(
|
|
|
73
79
|
client = session or httpx.Client(follow_redirects=True, timeout=timeout)
|
|
74
80
|
response: httpx.Response | None = None
|
|
75
81
|
try:
|
|
82
|
+
headers = headers or default_headers()
|
|
83
|
+
log.debug("download_url: using headers: %s", headers)
|
|
76
84
|
with client.stream(
|
|
77
85
|
"GET",
|
|
78
86
|
url,
|
|
79
87
|
follow_redirects=True,
|
|
80
88
|
timeout=timeout,
|
|
81
89
|
auth=auth,
|
|
82
|
-
headers=headers
|
|
90
|
+
headers=headers,
|
|
83
91
|
) as response:
|
|
84
92
|
response.raise_for_status()
|
|
85
93
|
total_size = int(response.headers.get("content-length", "0"))
|
kash/web_gen/__init__.py
CHANGED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from kash.model.items_model import Item
|
|
2
|
+
from kash.utils.file_utils.file_formats_model import Format
|
|
3
|
+
from kash.web_gen.template_render import render_web_template
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def simple_webpage_render(
|
|
7
|
+
item: Item,
|
|
8
|
+
page_template: str = "simple_webpage.html.jinja",
|
|
9
|
+
add_title_h1: bool = True,
|
|
10
|
+
) -> str:
|
|
11
|
+
"""
|
|
12
|
+
Generate a simple web page from a single item.
|
|
13
|
+
If `add_title_h1` is True, the title will be inserted as an h1 heading above the body.
|
|
14
|
+
"""
|
|
15
|
+
return render_web_template(
|
|
16
|
+
template_filename=page_template,
|
|
17
|
+
data={
|
|
18
|
+
"title": item.title,
|
|
19
|
+
"add_title_h1": add_title_h1,
|
|
20
|
+
"content_html": item.body_as_html(),
|
|
21
|
+
"thumbnail_url": item.thumbnail_url,
|
|
22
|
+
},
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
## Tests
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_render():
|
|
30
|
+
import os
|
|
31
|
+
|
|
32
|
+
from kash.model.items_model import ItemType
|
|
33
|
+
|
|
34
|
+
# Create a test item
|
|
35
|
+
item = Item(
|
|
36
|
+
type=ItemType.doc,
|
|
37
|
+
format=Format.html,
|
|
38
|
+
title="A Simple Web Page",
|
|
39
|
+
body="<p>This is a simple web page with <b>HTML content</b>.</p>",
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# Generate HTML
|
|
43
|
+
html = simple_webpage_render(item)
|
|
44
|
+
|
|
45
|
+
os.makedirs("tmp", exist_ok=True)
|
|
46
|
+
with open("tmp/simple_webpage.html", "w") as f:
|
|
47
|
+
f.write(html)
|
|
48
|
+
print("Rendered simple webpage to tmp/simple_webpage.html")
|
|
49
|
+
|
|
50
|
+
# Basic validation
|
|
51
|
+
assert item.title and item.title in html
|
|
52
|
+
assert "<b>HTML content</b>" in html
|
kash/web_gen/tabbed_webpage.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
|
2
2
|
from dataclasses import asdict, dataclass
|
|
3
3
|
|
|
4
4
|
from frontmatter_format import read_yaml_file, to_yaml_string, write_yaml_file
|
|
5
|
+
from prettyfmt import abbrev_on_words, sanitize_title
|
|
5
6
|
|
|
6
7
|
from kash.config.logger import get_logger
|
|
7
8
|
from kash.exec.preconditions import has_thumbnail_url
|
|
@@ -11,7 +12,6 @@ from kash.model.paths_model import StorePath
|
|
|
11
12
|
from kash.utils.common.type_utils import as_dataclass, not_none
|
|
12
13
|
from kash.utils.errors import NoMatch
|
|
13
14
|
from kash.utils.file_utils.file_formats_model import Format
|
|
14
|
-
from kash.web_gen import base_templates_dir
|
|
15
15
|
from kash.web_gen.template_render import render_web_template
|
|
16
16
|
from kash.workspaces import current_ws
|
|
17
17
|
from kash.workspaces.source_items import find_upstream_item
|
|
@@ -33,6 +33,7 @@ class TabbedWebpage:
|
|
|
33
33
|
title: str
|
|
34
34
|
tabs: list[TabInfo]
|
|
35
35
|
show_tabs: bool = True
|
|
36
|
+
add_title_h1: bool = True
|
|
36
37
|
|
|
37
38
|
|
|
38
39
|
def _fill_in_ids(tabs: list[TabInfo]):
|
|
@@ -41,7 +42,9 @@ def _fill_in_ids(tabs: list[TabInfo]):
|
|
|
41
42
|
tab.id = f"tab_{i}"
|
|
42
43
|
|
|
43
44
|
|
|
44
|
-
def
|
|
45
|
+
def tabbed_webpage_config(
|
|
46
|
+
items: list[Item], clean_headings: bool = False, add_title_h1: bool = True
|
|
47
|
+
) -> Item:
|
|
45
48
|
"""
|
|
46
49
|
Get an item with the config for a tabbed web page.
|
|
47
50
|
"""
|
|
@@ -57,9 +60,15 @@ def webpage_config(items: list[Item]) -> Item:
|
|
|
57
60
|
log.warning("Item has no thumbnail URL: %s", item)
|
|
58
61
|
return None
|
|
59
62
|
|
|
63
|
+
def clean_label(label: str) -> str:
|
|
64
|
+
if clean_headings:
|
|
65
|
+
return clean_heading(label)
|
|
66
|
+
else:
|
|
67
|
+
return abbrev_on_words(sanitize_title(label), max_len=40)
|
|
68
|
+
|
|
60
69
|
tabs = [
|
|
61
70
|
TabInfo(
|
|
62
|
-
label=
|
|
71
|
+
label=clean_label(item.abbrev_title()),
|
|
63
72
|
store_path=item.store_path,
|
|
64
73
|
thumbnail_url=get_thumbnail_url(item),
|
|
65
74
|
)
|
|
@@ -67,7 +76,9 @@ def webpage_config(items: list[Item]) -> Item:
|
|
|
67
76
|
]
|
|
68
77
|
_fill_in_ids(tabs)
|
|
69
78
|
title = summary_heading([item.abbrev_title() for item in items])
|
|
70
|
-
config = TabbedWebpage(
|
|
79
|
+
config = TabbedWebpage(
|
|
80
|
+
title=title, tabs=tabs, show_tabs=len(tabs) > 1, add_title_h1=add_title_h1
|
|
81
|
+
)
|
|
71
82
|
|
|
72
83
|
config_item = Item(
|
|
73
84
|
title=f"{title} (config)",
|
|
@@ -88,7 +99,9 @@ def _load_tab_content(config: TabbedWebpage):
|
|
|
88
99
|
tab.content_html = html
|
|
89
100
|
|
|
90
101
|
|
|
91
|
-
def
|
|
102
|
+
def tabbed_webpage_generate(
|
|
103
|
+
config_item: Item, page_template: str = "base_webpage.html.jinja", add_title_h1: bool = True
|
|
104
|
+
) -> str:
|
|
92
105
|
"""
|
|
93
106
|
Generate a web page using the supplied config.
|
|
94
107
|
"""
|
|
@@ -98,14 +111,15 @@ def webpage_generate(config_item: Item) -> str:
|
|
|
98
111
|
_load_tab_content(tabbed_webpage)
|
|
99
112
|
|
|
100
113
|
content = render_web_template(
|
|
101
|
-
|
|
114
|
+
template_filename="tabbed_webpage.html.jinja",
|
|
115
|
+
data=asdict(tabbed_webpage),
|
|
102
116
|
)
|
|
103
117
|
|
|
104
118
|
return render_web_template(
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
{
|
|
119
|
+
page_template,
|
|
120
|
+
data={
|
|
108
121
|
"title": tabbed_webpage.title,
|
|
122
|
+
"add_title_h1": add_title_h1,
|
|
109
123
|
"content": content,
|
|
110
124
|
},
|
|
111
125
|
)
|
|
@@ -135,11 +149,7 @@ def test_render():
|
|
|
135
149
|
new_config = as_dataclass(read_yaml_file("tmp/webpage_config.yaml"), TabbedWebpage)
|
|
136
150
|
assert new_config == config
|
|
137
151
|
|
|
138
|
-
html = render_web_template(
|
|
139
|
-
base_templates_dir,
|
|
140
|
-
"tabbed_webpage.html.jinja",
|
|
141
|
-
asdict(config),
|
|
142
|
-
)
|
|
152
|
+
html = render_web_template(template_filename="tabbed_webpage.html.jinja", data=asdict(config))
|
|
143
153
|
with open("tmp/webpage.html", "w") as f:
|
|
144
154
|
f.write(html)
|
|
145
155
|
print("Rendered tabbed webpage to tmp/webpage.html")
|