kash-shell 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/__init__.py +4 -4
- kash/actions/core/markdownify.py +5 -2
- kash/actions/core/readability.py +5 -2
- kash/actions/core/render_as_html.py +18 -0
- kash/actions/core/webpage_config.py +12 -4
- kash/commands/__init__.py +8 -20
- kash/commands/base/basic_file_commands.py +15 -0
- kash/commands/base/debug_commands.py +15 -2
- kash/commands/base/general_commands.py +27 -18
- kash/commands/base/logs_commands.py +1 -4
- kash/commands/base/model_commands.py +8 -8
- kash/commands/base/search_command.py +3 -2
- kash/commands/base/show_command.py +5 -3
- kash/commands/extras/parse_uv_lock.py +186 -0
- kash/commands/help/doc_commands.py +2 -31
- kash/commands/help/welcome.py +33 -0
- kash/commands/workspace/selection_commands.py +11 -6
- kash/commands/workspace/workspace_commands.py +19 -16
- kash/config/colors.py +2 -0
- kash/config/env_settings.py +72 -0
- kash/config/init.py +2 -2
- kash/config/logger.py +61 -59
- kash/config/logger_basic.py +12 -5
- kash/config/server_config.py +6 -6
- kash/config/settings.py +117 -67
- kash/config/setup.py +35 -9
- kash/config/suppress_warnings.py +30 -12
- kash/config/text_styles.py +3 -13
- kash/docs/load_api_docs.py +2 -1
- kash/docs/markdown/topics/a2_installation.md +7 -3
- kash/docs/markdown/topics/a3_getting_started.md +3 -2
- kash/docs/markdown/warning.md +3 -8
- kash/docs/markdown/welcome.md +4 -0
- kash/docs_base/load_recipe_snippets.py +1 -1
- kash/docs_base/recipes/{general_system_commands.ksh → general_system_commands.sh} +1 -1
- kash/{concepts → embeddings}/cosine.py +2 -1
- kash/embeddings/text_similarity.py +57 -0
- kash/exec/__init__.py +20 -3
- kash/exec/action_decorators.py +18 -4
- kash/exec/action_exec.py +41 -23
- kash/exec/action_registry.py +13 -48
- kash/exec/command_registry.py +2 -1
- kash/exec/fetch_url_metadata.py +4 -6
- kash/exec/importing.py +56 -0
- kash/exec/llm_transforms.py +6 -6
- kash/exec/precondition_registry.py +2 -1
- kash/exec/preconditions.py +16 -1
- kash/exec/shell_callable_action.py +33 -19
- kash/file_storage/file_store.py +23 -14
- kash/file_storage/item_file_format.py +13 -3
- kash/file_storage/metadata_dirs.py +11 -2
- kash/help/assistant.py +2 -2
- kash/help/assistant_instructions.py +2 -1
- kash/help/help_embeddings.py +2 -2
- kash/help/help_printing.py +14 -10
- kash/help/tldr_help.py +5 -3
- kash/llm_utils/clean_headings.py +1 -1
- kash/llm_utils/llm_api_keys.py +4 -4
- kash/llm_utils/llm_completion.py +2 -2
- kash/llm_utils/llm_features.py +68 -0
- kash/llm_utils/llm_messages.py +1 -2
- kash/llm_utils/llm_names.py +1 -1
- kash/llm_utils/llms.py +17 -12
- kash/local_server/__init__.py +5 -2
- kash/local_server/local_server.py +56 -46
- kash/local_server/local_server_commands.py +15 -15
- kash/local_server/local_server_routes.py +2 -2
- kash/local_server/local_url_formatters.py +1 -1
- kash/mcp/__init__.py +5 -2
- kash/mcp/mcp_cli.py +54 -17
- kash/mcp/mcp_server_commands.py +5 -6
- kash/mcp/mcp_server_routes.py +14 -11
- kash/mcp/mcp_server_sse.py +61 -34
- kash/mcp/mcp_server_stdio.py +0 -8
- kash/media_base/audio_processing.py +81 -7
- kash/media_base/media_cache.py +18 -18
- kash/media_base/media_services.py +1 -1
- kash/media_base/media_tools.py +6 -6
- kash/media_base/services/local_file_media.py +2 -2
- kash/media_base/{speech_transcription.py → transcription_deepgram.py} +25 -109
- kash/media_base/transcription_format.py +73 -0
- kash/media_base/transcription_whisper.py +38 -0
- kash/model/__init__.py +73 -5
- kash/model/actions_model.py +38 -4
- kash/model/concept_model.py +30 -0
- kash/model/items_model.py +56 -13
- kash/model/params_model.py +24 -0
- kash/shell/completions/completion_scoring.py +37 -5
- kash/shell/output/kerm_codes.py +1 -2
- kash/shell/output/shell_formatting.py +14 -4
- kash/shell/shell_main.py +2 -2
- kash/shell/utils/exception_printing.py +6 -0
- kash/shell/utils/native_utils.py +26 -20
- kash/text_handling/custom_sliding_transforms.py +12 -4
- kash/text_handling/doc_normalization.py +6 -2
- kash/text_handling/markdown_render.py +117 -0
- kash/text_handling/markdown_utils.py +204 -0
- kash/utils/common/import_utils.py +12 -3
- kash/utils/common/type_utils.py +0 -29
- kash/utils/common/url.py +80 -28
- kash/utils/errors.py +6 -0
- kash/utils/file_utils/{dir_size.py → dir_info.py} +25 -4
- kash/utils/file_utils/file_ext.py +2 -3
- kash/utils/file_utils/file_formats.py +28 -2
- kash/utils/file_utils/file_formats_model.py +50 -19
- kash/utils/file_utils/filename_parsing.py +10 -4
- kash/web_content/dir_store.py +1 -2
- kash/web_content/file_cache_utils.py +37 -10
- kash/web_content/file_processing.py +68 -0
- kash/web_content/local_file_cache.py +12 -9
- kash/web_content/web_extract.py +8 -3
- kash/web_content/web_fetch.py +12 -4
- kash/web_gen/tabbed_webpage.py +5 -2
- kash/web_gen/templates/base_styles.css.jinja +120 -14
- kash/web_gen/templates/base_webpage.html.jinja +60 -13
- kash/web_gen/templates/content_styles.css.jinja +4 -2
- kash/web_gen/templates/item_view.html.jinja +2 -2
- kash/web_gen/templates/tabbed_webpage.html.jinja +1 -2
- kash/workspaces/__init__.py +15 -2
- kash/workspaces/selections.py +18 -3
- kash/workspaces/source_items.py +4 -2
- kash/workspaces/workspace_output.py +11 -4
- kash/workspaces/workspaces.py +5 -11
- kash/xonsh_custom/command_nl_utils.py +40 -19
- kash/xonsh_custom/custom_shell.py +44 -12
- kash/xonsh_custom/customize_prompt.py +39 -21
- kash/xonsh_custom/load_into_xonsh.py +26 -27
- kash/xonsh_custom/shell_load_commands.py +2 -2
- kash/xonsh_custom/xonsh_completers.py +2 -249
- kash/xonsh_custom/xonsh_keybindings.py +282 -0
- kash/xonsh_custom/xonsh_modern_tools.py +3 -3
- kash/xontrib/kash_extension.py +5 -6
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/METADATA +26 -12
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/RECORD +140 -140
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/entry_points.txt +1 -1
- kash/concepts/concept_formats.py +0 -23
- kash/concepts/text_similarity.py +0 -112
- kash/shell/clideps/api_keys.py +0 -99
- kash/shell/clideps/dotenv_setup.py +0 -114
- kash/shell/clideps/dotenv_utils.py +0 -89
- kash/shell/clideps/pkg_deps.py +0 -232
- kash/shell/clideps/platforms.py +0 -11
- kash/shell/clideps/terminal_features.py +0 -56
- kash/shell/utils/osc_utils.py +0 -95
- kash/shell/utils/terminal_images.py +0 -133
- kash/text_handling/markdown_util.py +0 -167
- kash/utils/common/atomic_var.py +0 -158
- kash/utils/common/string_replace.py +0 -93
- kash/utils/common/string_template.py +0 -101
- /kash/docs_base/recipes/{python_dev_commands.ksh → python_dev_commands.sh} +0 -0
- /kash/docs_base/recipes/{tldr_standard_commands.ksh → tldr_standard_commands.sh} +0 -0
- /kash/{concepts → embeddings}/embeddings.py +0 -0
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.8.dist-info → kash_shell-0.3.10.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import re
|
|
2
|
+
import tempfile
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
from typing import NewType
|
|
4
5
|
|
|
5
6
|
import regex
|
|
7
|
+
from clideps.pkgs.pkg_check import pkg_check
|
|
6
8
|
|
|
7
9
|
from kash.config.logger import get_logger
|
|
8
|
-
from kash.shell.clideps.pkg_deps import Pkg, pkg_check
|
|
9
10
|
|
|
10
11
|
log = get_logger(__name__)
|
|
11
12
|
|
|
@@ -77,13 +78,15 @@ def read_partial_text(
|
|
|
77
78
|
|
|
78
79
|
MimeType = NewType("MimeType", str)
|
|
79
80
|
|
|
81
|
+
MIME_EMPTY = MimeType("inode/x-empty")
|
|
82
|
+
|
|
80
83
|
|
|
81
84
|
def detect_mime_type(filename: str | Path) -> MimeType | None:
|
|
82
85
|
"""
|
|
83
86
|
Get the mime type of a file using libmagic heuristics plus more careful
|
|
84
87
|
detection of HTML, Markdown, and multipart YAML.
|
|
85
88
|
"""
|
|
86
|
-
pkg_check().require(
|
|
89
|
+
pkg_check().require("libmagic")
|
|
87
90
|
import magic
|
|
88
91
|
|
|
89
92
|
mime = magic.Magic(mime=True)
|
|
@@ -132,3 +135,26 @@ def mime_type_is_text(mime_type: MimeType) -> bool:
|
|
|
132
135
|
"application/rtf",
|
|
133
136
|
}
|
|
134
137
|
)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
## Tests
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def test_detect_mime_type():
|
|
144
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
145
|
+
tmpdir_path = Path(tmpdir)
|
|
146
|
+
|
|
147
|
+
empty_file = tmpdir_path / "empty.txt"
|
|
148
|
+
empty_file.touch()
|
|
149
|
+
|
|
150
|
+
html_file = tmpdir_path / "example.html"
|
|
151
|
+
with open(html_file, "w") as f:
|
|
152
|
+
f.write("<!DOCTYPE html>\n<html><body><h1>Test</h1></body></html>")
|
|
153
|
+
|
|
154
|
+
text_file = tmpdir_path / "example.txt"
|
|
155
|
+
with open(text_file, "w") as f:
|
|
156
|
+
f.write("This is a simple text file with some content.")
|
|
157
|
+
|
|
158
|
+
assert detect_mime_type(empty_file) == MIME_EMPTY
|
|
159
|
+
assert detect_mime_type(html_file) == "text/html"
|
|
160
|
+
assert detect_mime_type(text_file) == "text/plain"
|
|
@@ -6,7 +6,12 @@ from pathlib import Path
|
|
|
6
6
|
|
|
7
7
|
from kash.utils.common.url import Url, is_file_url, parse_file_url
|
|
8
8
|
from kash.utils.file_utils.file_ext import FileExt
|
|
9
|
-
from kash.utils.file_utils.file_formats import
|
|
9
|
+
from kash.utils.file_utils.file_formats import (
|
|
10
|
+
MIME_EMPTY,
|
|
11
|
+
MimeType,
|
|
12
|
+
detect_mime_type,
|
|
13
|
+
mime_type_is_text,
|
|
14
|
+
)
|
|
10
15
|
from kash.utils.file_utils.filename_parsing import parse_file_ext
|
|
11
16
|
|
|
12
17
|
|
|
@@ -31,6 +36,8 @@ class Format(Enum):
|
|
|
31
36
|
it is the format of the resource (url, media, etc.).
|
|
32
37
|
"""
|
|
33
38
|
|
|
39
|
+
# TODO: Be more thorough, pulling in relevant extensions and types from the `mimetypes` module.
|
|
40
|
+
|
|
34
41
|
# Formats with no body (content is in frontmatter).
|
|
35
42
|
url = "url"
|
|
36
43
|
|
|
@@ -44,8 +51,9 @@ class Format(Enum):
|
|
|
44
51
|
yaml = "yaml"
|
|
45
52
|
diff = "diff"
|
|
46
53
|
python = "python"
|
|
47
|
-
|
|
48
|
-
"""
|
|
54
|
+
shellscript = "shellscript"
|
|
55
|
+
"""Covers sh, bash, and similar shell scripts."""
|
|
56
|
+
xonsh = "xonsh"
|
|
49
57
|
json = "json"
|
|
50
58
|
csv = "csv"
|
|
51
59
|
npz = "npz"
|
|
@@ -56,6 +64,7 @@ class Format(Enum):
|
|
|
56
64
|
docx = "docx"
|
|
57
65
|
jpeg = "jpeg"
|
|
58
66
|
png = "png"
|
|
67
|
+
gif = "gif"
|
|
59
68
|
svg = "svg"
|
|
60
69
|
mp3 = "mp3"
|
|
61
70
|
m4a = "m4a"
|
|
@@ -85,7 +94,8 @@ class Format(Enum):
|
|
|
85
94
|
self.diff,
|
|
86
95
|
self.python,
|
|
87
96
|
self.json,
|
|
88
|
-
self.
|
|
97
|
+
self.shellscript,
|
|
98
|
+
self.xonsh,
|
|
89
99
|
self.csv,
|
|
90
100
|
self.log,
|
|
91
101
|
]
|
|
@@ -102,7 +112,7 @@ class Format(Enum):
|
|
|
102
112
|
|
|
103
113
|
@property
|
|
104
114
|
def is_image(self) -> bool:
|
|
105
|
-
return self in [self.jpeg, self.png, self.svg]
|
|
115
|
+
return self in [self.jpeg, self.png, self.gif, self.svg]
|
|
106
116
|
|
|
107
117
|
@property
|
|
108
118
|
def is_audio(self) -> bool:
|
|
@@ -114,7 +124,7 @@ class Format(Enum):
|
|
|
114
124
|
|
|
115
125
|
@property
|
|
116
126
|
def is_code(self) -> bool:
|
|
117
|
-
return self in [self.python, self.
|
|
127
|
+
return self in [self.python, self.shellscript, self.xonsh, self.json, self.yaml]
|
|
118
128
|
|
|
119
129
|
@property
|
|
120
130
|
def is_data(self) -> bool:
|
|
@@ -138,10 +148,12 @@ class Format(Enum):
|
|
|
138
148
|
self.markdown,
|
|
139
149
|
self.md_html,
|
|
140
150
|
self.html,
|
|
151
|
+
self.json, # Not strictly true but we encourage use of comments.
|
|
141
152
|
self.yaml,
|
|
142
153
|
self.diff,
|
|
143
154
|
self.python,
|
|
144
|
-
self.
|
|
155
|
+
self.shellscript,
|
|
156
|
+
self.xonsh,
|
|
145
157
|
self.csv,
|
|
146
158
|
self.log,
|
|
147
159
|
]
|
|
@@ -157,13 +169,15 @@ class Format(Enum):
|
|
|
157
169
|
Format.yaml: MediaType.text,
|
|
158
170
|
Format.diff: MediaType.text,
|
|
159
171
|
Format.python: MediaType.text,
|
|
160
|
-
Format.
|
|
172
|
+
Format.shellscript: MediaType.text,
|
|
173
|
+
Format.xonsh: MediaType.text,
|
|
161
174
|
Format.json: MediaType.text,
|
|
162
175
|
Format.csv: MediaType.text,
|
|
163
176
|
Format.log: MediaType.text,
|
|
164
177
|
Format.pdf: MediaType.text,
|
|
165
178
|
Format.jpeg: MediaType.image,
|
|
166
179
|
Format.png: MediaType.image,
|
|
180
|
+
Format.gif: MediaType.image,
|
|
167
181
|
Format.svg: MediaType.image,
|
|
168
182
|
Format.docx: MediaType.text,
|
|
169
183
|
Format.mp3: MediaType.audio,
|
|
@@ -189,11 +203,13 @@ class Format(Enum):
|
|
|
189
203
|
FileExt.npz.value: Format.npz,
|
|
190
204
|
FileExt.log.value: Format.log,
|
|
191
205
|
FileExt.py.value: Format.python,
|
|
192
|
-
FileExt.
|
|
206
|
+
FileExt.sh.value: Format.shellscript,
|
|
207
|
+
FileExt.xsh.value: Format.xonsh,
|
|
193
208
|
FileExt.pdf.value: Format.pdf,
|
|
194
209
|
FileExt.docx.value: Format.docx,
|
|
195
210
|
FileExt.jpg.value: Format.jpeg,
|
|
196
211
|
FileExt.png.value: Format.png,
|
|
212
|
+
FileExt.gif.value: Format.gif,
|
|
197
213
|
FileExt.svg.value: Format.svg,
|
|
198
214
|
FileExt.mp3.value: Format.mp3,
|
|
199
215
|
FileExt.m4a.value: Format.m4a,
|
|
@@ -219,10 +235,13 @@ class Format(Enum):
|
|
|
219
235
|
Format.npz: FileExt.npz,
|
|
220
236
|
Format.log: FileExt.log,
|
|
221
237
|
Format.python: FileExt.py,
|
|
238
|
+
Format.shellscript: FileExt.sh,
|
|
239
|
+
Format.xonsh: FileExt.xsh,
|
|
222
240
|
Format.pdf: FileExt.pdf,
|
|
223
241
|
Format.docx: FileExt.docx,
|
|
224
242
|
Format.jpeg: FileExt.jpg,
|
|
225
243
|
Format.png: FileExt.png,
|
|
244
|
+
Format.gif: FileExt.gif,
|
|
226
245
|
Format.svg: FileExt.svg,
|
|
227
246
|
Format.mp3: FileExt.mp3,
|
|
228
247
|
Format.m4a: FileExt.m4a,
|
|
@@ -244,6 +263,10 @@ class Format(Enum):
|
|
|
244
263
|
"application/yaml": Format.yaml,
|
|
245
264
|
"application/x-yaml": Format.yaml,
|
|
246
265
|
"text/x-python": Format.python,
|
|
266
|
+
"text/x-script.python": Format.python,
|
|
267
|
+
"text/x-sh": Format.shellscript,
|
|
268
|
+
"text/x-shellscript": Format.shellscript,
|
|
269
|
+
"text/x-xonsh": Format.xonsh,
|
|
247
270
|
"application/json": Format.json,
|
|
248
271
|
"text/csv": Format.csv,
|
|
249
272
|
"application/x-npz": Format.npz,
|
|
@@ -251,6 +274,7 @@ class Format(Enum):
|
|
|
251
274
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": Format.docx,
|
|
252
275
|
"image/jpeg": Format.jpeg,
|
|
253
276
|
"image/png": Format.png,
|
|
277
|
+
"image/gif": Format.gif,
|
|
254
278
|
"image/svg+xml": Format.svg,
|
|
255
279
|
"audio/mpeg": Format.mp3,
|
|
256
280
|
"audio/mp3": Format.mp3,
|
|
@@ -326,15 +350,15 @@ class FileFormatInfo:
|
|
|
326
350
|
and self.mime_type.startswith("image")
|
|
327
351
|
)
|
|
328
352
|
|
|
329
|
-
def as_str(self) -> str:
|
|
330
|
-
if self.format and
|
|
331
|
-
return f"{self.format.value} ({self.mime_type})"
|
|
332
|
-
elif self.format:
|
|
353
|
+
def as_str(self, mime_only: bool = False) -> str:
|
|
354
|
+
if self.format and not mime_only:
|
|
333
355
|
return self.format.value
|
|
356
|
+
elif self.mime_type == MIME_EMPTY:
|
|
357
|
+
return "empty"
|
|
334
358
|
elif self.mime_type:
|
|
335
359
|
return self.mime_type
|
|
336
360
|
else:
|
|
337
|
-
return "unrecognized"
|
|
361
|
+
return "unrecognized format"
|
|
338
362
|
|
|
339
363
|
def __str__(self) -> str:
|
|
340
364
|
return self.as_str()
|
|
@@ -357,15 +381,22 @@ def guess_format_by_name(path: str | Path) -> Format | None:
|
|
|
357
381
|
return Format.guess_by_file_ext(file_ext) if file_ext else None
|
|
358
382
|
|
|
359
383
|
|
|
360
|
-
def file_format_info(path: str | Path) -> FileFormatInfo:
|
|
384
|
+
def file_format_info(path: str | Path, always_check_content: bool = False) -> FileFormatInfo:
|
|
361
385
|
"""
|
|
362
|
-
|
|
386
|
+
Get info on the file format path and content (file extension and file content).
|
|
387
|
+
Looks at the file extension first and then the file content if needed.
|
|
388
|
+
If `always_check_content` is True, look at the file content even if we
|
|
389
|
+
recognize the file extension.
|
|
363
390
|
"""
|
|
364
391
|
path = Path(path)
|
|
365
392
|
file_ext = parse_file_ext(path)
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
393
|
+
if always_check_content or not file_ext:
|
|
394
|
+
# Look at the file content.
|
|
395
|
+
detected_mime_type = detect_mime_type(path)
|
|
396
|
+
else:
|
|
397
|
+
detected_mime_type = None
|
|
398
|
+
format = _guess_format(file_ext, detected_mime_type)
|
|
399
|
+
final_mime_type = format.mime_type if format else detected_mime_type
|
|
369
400
|
return FileFormatInfo(file_ext, format, final_mime_type)
|
|
370
401
|
|
|
371
402
|
|
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
|
|
4
4
|
from kash.config.logger import get_logger
|
|
5
|
-
from kash.utils.common.url import Url
|
|
5
|
+
from kash.utils.common.url import Url, check_if_url
|
|
6
6
|
from kash.utils.errors import InvalidFilename
|
|
7
7
|
from kash.utils.file_utils.file_ext import FileExt, canonicalize_file_ext
|
|
8
8
|
|
|
@@ -48,11 +48,17 @@ def join_filename(dirname: str | Path, name: str, item_type: str | None, ext: st
|
|
|
48
48
|
|
|
49
49
|
def parse_file_ext(url_or_path: str | Url | Path) -> FileExt | None:
|
|
50
50
|
"""
|
|
51
|
-
Parse a known, canonical file extension from a path
|
|
52
|
-
raw file
|
|
51
|
+
Parse a known, canonical file extension from a path or URL. Also accepts
|
|
52
|
+
raw file extensions (like "csv" or ".csv").
|
|
53
53
|
"""
|
|
54
|
-
|
|
54
|
+
parsed_url = check_if_url(url_or_path)
|
|
55
|
+
if parsed_url:
|
|
56
|
+
path = parsed_url.path
|
|
57
|
+
else:
|
|
58
|
+
path = str(url_or_path)
|
|
59
|
+
front, ext = os.path.splitext(path.split("/")[-1])
|
|
55
60
|
if not ext:
|
|
61
|
+
# Handle bare file extensions too.
|
|
56
62
|
ext = front
|
|
57
63
|
return FileExt.parse(canonicalize_file_ext(ext))
|
|
58
64
|
|
kash/web_content/dir_store.py
CHANGED
|
@@ -87,8 +87,7 @@ class DirStore:
|
|
|
87
87
|
self, keys: list[str | Path], folder: str | None = None, suffix: str | None = None
|
|
88
88
|
) -> dict[str | Path, Path | None]:
|
|
89
89
|
"""
|
|
90
|
-
Look up all existing cached results for the set of keys.
|
|
91
|
-
be optimized for large batches.
|
|
90
|
+
Look up all existing cached results for the set of keys.
|
|
92
91
|
"""
|
|
93
92
|
return {key: self.find(key, folder=folder, suffix=suffix) for key in keys}
|
|
94
93
|
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from collections.abc import Callable
|
|
1
3
|
from pathlib import Path
|
|
4
|
+
from typing import Any
|
|
2
5
|
|
|
3
6
|
from prettyfmt import fmt_lines, fmt_path
|
|
4
7
|
|
|
@@ -35,18 +38,40 @@ def reset_content_cache_dir(path: Path):
|
|
|
35
38
|
log.info("Using web cache: %s", fmt_path(path))
|
|
36
39
|
|
|
37
40
|
|
|
38
|
-
def cache_file(
|
|
41
|
+
def cache_file(
|
|
42
|
+
source: Url | Path | Loadable, global_cache: bool = False, expiration_sec: float | None = None
|
|
43
|
+
) -> tuple[Path, bool]:
|
|
39
44
|
"""
|
|
40
45
|
Return a local cached copy of the item. If it is an URL, content is fetched.
|
|
41
|
-
|
|
42
|
-
|
|
46
|
+
If it is a Path or a Loadable, a cached copy is returned.
|
|
47
|
+
LocalFileCache uses httpx so httpx.HTTPError is raised for non-2xx responses.
|
|
48
|
+
|
|
49
|
+
Uses the current content cache unless there is no current cache or `global_cache` is True,
|
|
50
|
+
in which case the global cache is used.
|
|
43
51
|
"""
|
|
44
52
|
cache = _global_content_cache if global_cache else _content_cache
|
|
45
|
-
path, was_cached = cache.cache(source)
|
|
53
|
+
path, was_cached = cache.cache(source, expiration_sec)
|
|
46
54
|
return path, was_cached
|
|
47
55
|
|
|
48
56
|
|
|
49
|
-
def
|
|
57
|
+
def cache_api_response(
|
|
58
|
+
url: Url,
|
|
59
|
+
global_cache: bool = False,
|
|
60
|
+
expiration_sec: float | None = None,
|
|
61
|
+
parser: Callable[[str], Any] = json.loads,
|
|
62
|
+
) -> tuple[Any, bool]:
|
|
63
|
+
"""
|
|
64
|
+
Cache an API response. By default parse the response as JSON.
|
|
65
|
+
"""
|
|
66
|
+
cache = _global_content_cache if global_cache else _content_cache
|
|
67
|
+
path, was_cached = cache.cache(url, expiration_sec)
|
|
68
|
+
result = parser(path.read_text())
|
|
69
|
+
return result, was_cached
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def cache_resource(
|
|
73
|
+
item: Item, global_cache: bool = False, expiration_sec: float | None = None
|
|
74
|
+
) -> dict[MediaType, Path]:
|
|
50
75
|
"""
|
|
51
76
|
Cache a resource item for an external local path or a URL, fetching or
|
|
52
77
|
copying as needed. For media this may yield more than one format.
|
|
@@ -64,17 +89,17 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
|
|
|
64
89
|
if is_media_url(item.url):
|
|
65
90
|
result = cache_media(item.url)
|
|
66
91
|
else:
|
|
67
|
-
path, _was_cached = cache_file(item.url)
|
|
92
|
+
path, _was_cached = cache_file(item.url, global_cache, expiration_sec)
|
|
68
93
|
elif item.external_path:
|
|
69
94
|
path = Path(item.external_path)
|
|
70
95
|
if not path.is_file():
|
|
71
96
|
raise FileNotFound(f"External path not found: {path}")
|
|
72
|
-
path, _was_cached = cache_file(path)
|
|
97
|
+
path, _was_cached = cache_file(path, global_cache, expiration_sec)
|
|
73
98
|
elif item.original_filename:
|
|
74
99
|
path = Path(item.original_filename)
|
|
75
100
|
if not path.is_file():
|
|
76
101
|
raise FileNotFound(f"Original filename not found: {path}")
|
|
77
|
-
path, _was_cached = cache_file(path)
|
|
102
|
+
path, _was_cached = cache_file(path, global_cache, expiration_sec)
|
|
78
103
|
else:
|
|
79
104
|
raise ValueError(f"Item has no URL or external path: {item}")
|
|
80
105
|
|
|
@@ -94,7 +119,9 @@ def cache_resource(item: Item) -> dict[MediaType, Path]:
|
|
|
94
119
|
return result
|
|
95
120
|
|
|
96
121
|
|
|
97
|
-
def get_url_html(
|
|
122
|
+
def get_url_html(
|
|
123
|
+
item: Item, global_cache: bool = False, expiration_sec: float | None = None
|
|
124
|
+
) -> tuple[Url, str]:
|
|
98
125
|
"""
|
|
99
126
|
Returns the HTML content of an URL item, using the content cache,
|
|
100
127
|
or the body of the item if it has a URL and HTML body.
|
|
@@ -106,7 +133,7 @@ def get_url_html(item: Item) -> tuple[Url, str]:
|
|
|
106
133
|
url = Url(canonicalize_url(item.url))
|
|
107
134
|
|
|
108
135
|
if is_url_item(item):
|
|
109
|
-
path, _was_cached = cache_file(url)
|
|
136
|
+
path, _was_cached = cache_file(url, global_cache, expiration_sec)
|
|
110
137
|
with open(path) as file:
|
|
111
138
|
html_content = file.read()
|
|
112
139
|
else:
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Mapping
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TypeAlias
|
|
7
|
+
|
|
8
|
+
from kash.web_content.local_file_cache import read_mtime
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
|
|
12
|
+
class OutputType:
|
|
13
|
+
"""
|
|
14
|
+
A type of output file, represented by the filename suffix, e.g. '.mp3', '.txt', etc.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
suffix: str
|
|
18
|
+
|
|
19
|
+
def output_path(self, src: Path) -> Path:
|
|
20
|
+
"""
|
|
21
|
+
Resolve the output path. Will be next to the source file, e.g.
|
|
22
|
+
some-dir/video.mp4 -> some-dir/video.mp3
|
|
23
|
+
"""
|
|
24
|
+
return src.with_suffix(self.suffix)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
Processor: TypeAlias = Callable[[Path, Mapping[OutputType, Path]], None]
|
|
28
|
+
"""
|
|
29
|
+
A function that takes a source file and a mapping with one or more output paths.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
|
|
34
|
+
class FileProcess:
|
|
35
|
+
"""
|
|
36
|
+
Process a file and produce one or more outputs.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
processor: Processor
|
|
40
|
+
outputs: list[OutputType]
|
|
41
|
+
|
|
42
|
+
def is_outdated(self, src: Path) -> bool:
|
|
43
|
+
"""
|
|
44
|
+
True when any output is missing or older (earliest mtime) than `src`.
|
|
45
|
+
"""
|
|
46
|
+
dests = {o.output_path(src) for o in self.outputs}
|
|
47
|
+
if any(not p.exists() for p in dests):
|
|
48
|
+
return True
|
|
49
|
+
earliest = min(read_mtime(p) for p in dests)
|
|
50
|
+
return read_mtime(src) > earliest
|
|
51
|
+
|
|
52
|
+
def run(self, src: Path) -> dict[OutputType, Path]:
|
|
53
|
+
"""
|
|
54
|
+
Run unconditionally and return a mapping of outputs to paths.
|
|
55
|
+
"""
|
|
56
|
+
dests = {o: o.output_path(src) for o in self.outputs}
|
|
57
|
+
self.processor(src, dests)
|
|
58
|
+
return dests
|
|
59
|
+
|
|
60
|
+
def run_if_needed(self, src: Path) -> dict[OutputType, Path]:
|
|
61
|
+
"""
|
|
62
|
+
Run only if any output is missing or outdated.
|
|
63
|
+
"""
|
|
64
|
+
return (
|
|
65
|
+
self.run(src)
|
|
66
|
+
if self.is_outdated(src)
|
|
67
|
+
else {o: o.output_path(src) for o in self.outputs}
|
|
68
|
+
)
|
|
@@ -11,7 +11,7 @@ from prettyfmt import fmt_path
|
|
|
11
11
|
from strif import atomic_output_file, copyfile_atomic
|
|
12
12
|
|
|
13
13
|
from kash.utils.common.url import Url, is_file_url, is_url, normalize_url, parse_file_url
|
|
14
|
-
from kash.utils.errors import FileNotFound
|
|
14
|
+
from kash.utils.errors import FileNotFound
|
|
15
15
|
from kash.utils.file_utils.file_formats_model import choose_file_ext
|
|
16
16
|
from kash.web_content.dir_store import DirStore
|
|
17
17
|
from kash.web_content.web_fetch import download_url
|
|
@@ -56,19 +56,21 @@ class Loadable:
|
|
|
56
56
|
|
|
57
57
|
key: str
|
|
58
58
|
"""
|
|
59
|
-
The unique identifier for the item.
|
|
60
|
-
|
|
59
|
+
The unique identifier for the item. Used when creating unique cache filenames,
|
|
60
|
+
as is or with added suffixes.
|
|
61
61
|
"""
|
|
62
62
|
|
|
63
63
|
save: Callable[[Path], None]
|
|
64
64
|
"""
|
|
65
65
|
Method that saves the item to the given path. Caller will handle path selection
|
|
66
|
-
and atomicity of file creation.
|
|
66
|
+
and atomicity of file creation. Raise an exception if the item cannot be saved.
|
|
67
67
|
"""
|
|
68
68
|
|
|
69
69
|
|
|
70
70
|
Cacheable = Url | Path | Loadable
|
|
71
|
-
"""
|
|
71
|
+
"""
|
|
72
|
+
An item that can be cached as a file.
|
|
73
|
+
"""
|
|
72
74
|
|
|
73
75
|
|
|
74
76
|
def _suffix_for(cacheable: Cacheable) -> str | None:
|
|
@@ -151,9 +153,7 @@ class LocalFileCache(DirStore):
|
|
|
151
153
|
if isinstance(url_or_path, Path):
|
|
152
154
|
file_path = url_or_path
|
|
153
155
|
else:
|
|
154
|
-
parsed = parse_file_url(url_or_path)
|
|
155
|
-
if not parsed:
|
|
156
|
-
raise InvalidInput(f"Not a file URL: {url_or_path}")
|
|
156
|
+
parsed = parse_file_url(url_or_path) # Raises ValueError if not a file URL.
|
|
157
157
|
file_path = parsed
|
|
158
158
|
if not file_path.exists():
|
|
159
159
|
raise FileNotFound(f"File not found: {file_path}")
|
|
@@ -173,7 +173,10 @@ class LocalFileCache(DirStore):
|
|
|
173
173
|
) as tmp_path:
|
|
174
174
|
source.save(tmp_path)
|
|
175
175
|
if not cache_path.exists():
|
|
176
|
-
|
|
176
|
+
# The source should have raised an exception if it failed to save.
|
|
177
|
+
raise InvalidCacheState(
|
|
178
|
+
f"Loadable source failed to save to cache: {source}: {cache_path}"
|
|
179
|
+
)
|
|
177
180
|
else:
|
|
178
181
|
raise ValueError(f"Invalid source: {source}")
|
|
179
182
|
|
kash/web_content/web_extract.py
CHANGED
|
@@ -10,14 +10,19 @@ from kash.web_content.web_page_model import PageExtractor, WebPageData
|
|
|
10
10
|
|
|
11
11
|
@log_calls(level="message")
|
|
12
12
|
def fetch_extract(
|
|
13
|
-
url: Url,
|
|
13
|
+
url: Url,
|
|
14
|
+
refetch: bool = False,
|
|
15
|
+
use_cache: bool = True,
|
|
16
|
+
extractor: PageExtractor = extract_text_justext,
|
|
14
17
|
) -> WebPageData:
|
|
15
18
|
"""
|
|
16
19
|
Fetches a URL and extracts the title, description, and content.
|
|
20
|
+
By default, uses the content cache if available. Can force re-fetching and
|
|
21
|
+
updating the cache by setting `refetch` to true.
|
|
17
22
|
"""
|
|
18
|
-
|
|
23
|
+
expiration_sec = 0 if refetch else None
|
|
19
24
|
if use_cache:
|
|
20
|
-
path, _was_cached = cache_file(url)
|
|
25
|
+
path, _was_cached = cache_file(url, expiration_sec=expiration_sec)
|
|
21
26
|
with open(path, "rb") as file:
|
|
22
27
|
content = file.read()
|
|
23
28
|
page_data = extractor(url, content)
|
kash/web_content/web_fetch.py
CHANGED
|
@@ -7,17 +7,22 @@ import httpx
|
|
|
7
7
|
from strif import atomic_output_file, copyfile_atomic
|
|
8
8
|
from tqdm import tqdm
|
|
9
9
|
|
|
10
|
+
from kash.config.env_settings import KashEnv
|
|
10
11
|
from kash.utils.common.url import Url
|
|
11
12
|
|
|
12
13
|
log = logging.getLogger(__name__)
|
|
13
14
|
|
|
14
|
-
USER_AGENT = "Mozilla/5.0 (Compatible)"
|
|
15
15
|
|
|
16
16
|
DEFAULT_TIMEOUT = 30
|
|
17
17
|
|
|
18
18
|
|
|
19
|
+
DEFAULT_USER_AGENT = (
|
|
20
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:126.0) Gecko/20100101 Firefox/126.0"
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
19
24
|
def default_headers() -> dict[str, str]:
|
|
20
|
-
return {"User-Agent":
|
|
25
|
+
return {"User-Agent": KashEnv.KASH_USER_AGENT.read_str(default=DEFAULT_USER_AGENT)}
|
|
21
26
|
|
|
22
27
|
|
|
23
28
|
def fetch_url(
|
|
@@ -36,6 +41,7 @@ def fetch_url(
|
|
|
36
41
|
auth=auth,
|
|
37
42
|
headers=headers or default_headers(),
|
|
38
43
|
) as client:
|
|
44
|
+
log.debug("fetch_url: using headers: %s", client.headers)
|
|
39
45
|
response = client.get(url)
|
|
40
46
|
log.info("Fetched: %s (%s bytes): %s", response.status_code, len(response.content), url)
|
|
41
47
|
response.raise_for_status()
|
|
@@ -52,7 +58,7 @@ def download_url(
|
|
|
52
58
|
headers: dict[str, str] | None = None,
|
|
53
59
|
) -> None:
|
|
54
60
|
"""
|
|
55
|
-
Download given file, optionally with progress bar.
|
|
61
|
+
Download given file, optionally with progress bar, streaming to a target file.
|
|
56
62
|
Also handles file:// and s3:// URLs. Output file is created atomically.
|
|
57
63
|
Raise httpx.HTTPError for non-2xx responses.
|
|
58
64
|
"""
|
|
@@ -73,13 +79,15 @@ def download_url(
|
|
|
73
79
|
client = session or httpx.Client(follow_redirects=True, timeout=timeout)
|
|
74
80
|
response: httpx.Response | None = None
|
|
75
81
|
try:
|
|
82
|
+
headers = headers or default_headers()
|
|
83
|
+
log.debug("download_url: using headers: %s", headers)
|
|
76
84
|
with client.stream(
|
|
77
85
|
"GET",
|
|
78
86
|
url,
|
|
79
87
|
follow_redirects=True,
|
|
80
88
|
timeout=timeout,
|
|
81
89
|
auth=auth,
|
|
82
|
-
headers=headers
|
|
90
|
+
headers=headers,
|
|
83
91
|
) as response:
|
|
84
92
|
response.raise_for_status()
|
|
85
93
|
total_size = int(response.headers.get("content-length", "0"))
|
kash/web_gen/tabbed_webpage.py
CHANGED
|
@@ -2,6 +2,7 @@ import os
|
|
|
2
2
|
from dataclasses import asdict, dataclass
|
|
3
3
|
|
|
4
4
|
from frontmatter_format import read_yaml_file, to_yaml_string, write_yaml_file
|
|
5
|
+
from prettyfmt import sanitize_title
|
|
5
6
|
|
|
6
7
|
from kash.config.logger import get_logger
|
|
7
8
|
from kash.exec.preconditions import has_thumbnail_url
|
|
@@ -41,7 +42,7 @@ def _fill_in_ids(tabs: list[TabInfo]):
|
|
|
41
42
|
tab.id = f"tab_{i}"
|
|
42
43
|
|
|
43
44
|
|
|
44
|
-
def webpage_config(items: list[Item]) -> Item:
|
|
45
|
+
def webpage_config(items: list[Item], clean_headings: bool = False) -> Item:
|
|
45
46
|
"""
|
|
46
47
|
Get an item with the config for a tabbed web page.
|
|
47
48
|
"""
|
|
@@ -57,9 +58,11 @@ def webpage_config(items: list[Item]) -> Item:
|
|
|
57
58
|
log.warning("Item has no thumbnail URL: %s", item)
|
|
58
59
|
return None
|
|
59
60
|
|
|
61
|
+
clean = clean_heading if clean_headings else sanitize_title
|
|
62
|
+
|
|
60
63
|
tabs = [
|
|
61
64
|
TabInfo(
|
|
62
|
-
label=
|
|
65
|
+
label=clean(item.abbrev_title()),
|
|
63
66
|
store_path=item.store_path,
|
|
64
67
|
thumbnail_url=get_thumbnail_url(item),
|
|
65
68
|
)
|