kash-shell 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/core/markdownify.py +12 -8
- kash/actions/core/readability.py +8 -7
- kash/actions/core/render_as_html.py +8 -6
- kash/actions/core/show_webpage.py +2 -2
- kash/commands/base/basic_file_commands.py +3 -0
- kash/commands/base/diff_commands.py +38 -3
- kash/commands/base/reformat_command.py +1 -1
- kash/commands/base/show_command.py +1 -1
- kash/commands/workspace/selection_commands.py +1 -1
- kash/commands/workspace/workspace_commands.py +92 -29
- kash/docs/load_source_code.py +1 -1
- kash/exec/action_exec.py +6 -8
- kash/exec/fetch_url_metadata.py +8 -5
- kash/exec/importing.py +4 -4
- kash/exec/llm_transforms.py +1 -1
- kash/exec/preconditions.py +30 -10
- kash/file_storage/file_store.py +105 -43
- kash/file_storage/item_file_format.py +1 -1
- kash/file_storage/store_filenames.py +2 -1
- kash/help/help_embeddings.py +2 -2
- kash/llm_utils/clean_headings.py +1 -1
- kash/{text_handling → llm_utils}/custom_sliding_transforms.py +0 -3
- kash/llm_utils/llm_completion.py +1 -1
- kash/local_server/__init__.py +1 -1
- kash/local_server/local_server_commands.py +2 -1
- kash/mcp/__init__.py +1 -1
- kash/mcp/mcp_server_commands.py +8 -2
- kash/media_base/media_cache.py +10 -3
- kash/model/actions_model.py +3 -0
- kash/model/items_model.py +78 -44
- kash/model/operations_model.py +14 -0
- kash/shell/ui/shell_results.py +2 -1
- kash/shell/utils/native_utils.py +2 -2
- kash/utils/common/format_utils.py +0 -8
- kash/utils/common/import_utils.py +46 -18
- kash/utils/common/url.py +80 -3
- kash/utils/file_utils/file_formats.py +3 -2
- kash/utils/file_utils/file_formats_model.py +47 -45
- kash/utils/file_utils/filename_parsing.py +41 -16
- kash/{text_handling → utils/text_handling}/doc_normalization.py +10 -8
- kash/utils/text_handling/escape_html_tags.py +156 -0
- kash/{text_handling → utils/text_handling}/markdown_utils.py +0 -3
- kash/utils/text_handling/markdownify_utils.py +87 -0
- kash/{text_handling → utils/text_handling}/unified_diffs.py +1 -44
- kash/web_content/file_cache_utils.py +42 -34
- kash/web_content/local_file_cache.py +53 -13
- kash/web_content/web_extract.py +1 -1
- kash/web_content/web_extract_readabilipy.py +4 -2
- kash/web_content/web_fetch.py +42 -7
- kash/web_content/web_page_model.py +2 -1
- kash/web_gen/simple_webpage.py +1 -1
- kash/web_gen/templates/base_styles.css.jinja +134 -16
- kash/web_gen/templates/simple_webpage.html.jinja +1 -1
- kash/workspaces/selections.py +2 -2
- kash/workspaces/workspace_output.py +2 -2
- kash/xonsh_custom/load_into_xonsh.py +4 -2
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/METADATA +1 -1
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/RECORD +62 -62
- kash/utils/common/inflection.py +0 -22
- kash/workspaces/workspace_importing.py +0 -56
- /kash/{text_handling → utils/text_handling}/markdown_render.py +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/entry_points.txt +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/licenses/LICENSE +0 -0
|
@@ -9,11 +9,12 @@ from kash.config.logger import get_logger
|
|
|
9
9
|
from kash.config.settings import atomic_global_settings, global_settings
|
|
10
10
|
from kash.model.items_model import Item
|
|
11
11
|
from kash.model.media_model import MediaType
|
|
12
|
+
from kash.model.paths_model import StorePath
|
|
12
13
|
from kash.utils.common.url import Url
|
|
13
14
|
from kash.utils.errors import FileNotFound, InvalidInput
|
|
14
15
|
from kash.utils.file_utils.file_formats_model import detect_media_type
|
|
15
16
|
from kash.web_content.canon_url import canonicalize_url
|
|
16
|
-
from kash.web_content.local_file_cache import Loadable, LocalFileCache
|
|
17
|
+
from kash.web_content.local_file_cache import CacheResult, Loadable, LocalFileCache
|
|
17
18
|
|
|
18
19
|
log = get_logger(__name__)
|
|
19
20
|
|
|
@@ -40,7 +41,7 @@ def reset_content_cache_dir(path: Path):
|
|
|
40
41
|
|
|
41
42
|
def cache_file(
|
|
42
43
|
source: Url | Path | Loadable, global_cache: bool = False, expiration_sec: float | None = None
|
|
43
|
-
) ->
|
|
44
|
+
) -> CacheResult:
|
|
44
45
|
"""
|
|
45
46
|
Return a local cached copy of the item. If it is an URL, content is fetched.
|
|
46
47
|
If it is a Path or a Loadable, a cached copy is returned.
|
|
@@ -50,8 +51,7 @@ def cache_file(
|
|
|
50
51
|
in which case the global cache is used.
|
|
51
52
|
"""
|
|
52
53
|
cache = _global_content_cache if global_cache else _content_cache
|
|
53
|
-
|
|
54
|
-
return path, was_cached
|
|
54
|
+
return cache.cache(source, expiration_sec)
|
|
55
55
|
|
|
56
56
|
|
|
57
57
|
def cache_api_response(
|
|
@@ -64,9 +64,9 @@ def cache_api_response(
|
|
|
64
64
|
Cache an API response. By default parse the response as JSON.
|
|
65
65
|
"""
|
|
66
66
|
cache = _global_content_cache if global_cache else _content_cache
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
return
|
|
67
|
+
result = cache.cache(url, expiration_sec)
|
|
68
|
+
parsed_result = parser(result.content.path.read_text())
|
|
69
|
+
return parsed_result, result.was_cached
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
def cache_resource(
|
|
@@ -74,7 +74,8 @@ def cache_resource(
|
|
|
74
74
|
) -> dict[MediaType, Path]:
|
|
75
75
|
"""
|
|
76
76
|
Cache a resource item for an external local path or a URL, fetching or
|
|
77
|
-
copying as needed
|
|
77
|
+
copying as needed and returning direct paths to the cached content.
|
|
78
|
+
For media this may yield more than one format.
|
|
78
79
|
"""
|
|
79
80
|
from kash.exec.preconditions import is_resource
|
|
80
81
|
from kash.media_base.media_services import is_media_url
|
|
@@ -83,62 +84,69 @@ def cache_resource(
|
|
|
83
84
|
if not is_resource(item):
|
|
84
85
|
raise ValueError(f"Item is not a resource: {item}")
|
|
85
86
|
|
|
86
|
-
path = None
|
|
87
|
-
|
|
87
|
+
path: Path | None = None
|
|
88
|
+
results: dict[MediaType, Path] = {}
|
|
89
|
+
cache_result: CacheResult | None = None
|
|
90
|
+
|
|
91
|
+
# Cache the content using media or content cache.
|
|
88
92
|
if item.url:
|
|
89
93
|
if is_media_url(item.url):
|
|
90
|
-
|
|
94
|
+
results = cache_media(item.url)
|
|
91
95
|
else:
|
|
92
|
-
|
|
96
|
+
cache_result = cache_file(item.url, global_cache, expiration_sec)
|
|
93
97
|
elif item.external_path:
|
|
94
|
-
|
|
95
|
-
if not
|
|
96
|
-
raise FileNotFound(f"External path not found: {
|
|
97
|
-
|
|
98
|
+
ext_path = Path(item.external_path)
|
|
99
|
+
if not ext_path.is_file():
|
|
100
|
+
raise FileNotFound(f"External path not found: {ext_path}")
|
|
101
|
+
cache_result = cache_file(ext_path, global_cache, expiration_sec)
|
|
98
102
|
elif item.original_filename:
|
|
99
|
-
|
|
100
|
-
if not
|
|
101
|
-
raise FileNotFound(f"Original filename not found: {
|
|
102
|
-
|
|
103
|
+
orig_path = Path(item.original_filename)
|
|
104
|
+
if not orig_path.is_file():
|
|
105
|
+
raise FileNotFound(f"Original filename not found: {orig_path}")
|
|
106
|
+
cache_result = cache_file(orig_path, global_cache, expiration_sec)
|
|
103
107
|
else:
|
|
104
108
|
raise ValueError(f"Item has no URL or external path: {item}")
|
|
105
109
|
|
|
110
|
+
if cache_result:
|
|
111
|
+
path = cache_result.content.path
|
|
112
|
+
|
|
106
113
|
# If we just have the local file path, determine its format.
|
|
107
|
-
if not
|
|
108
|
-
|
|
114
|
+
if not results and path:
|
|
115
|
+
results = {detect_media_type(path): path}
|
|
109
116
|
|
|
110
117
|
log.message(
|
|
111
118
|
"Cached resource %s:\n%s",
|
|
112
119
|
item.as_str_brief(),
|
|
113
120
|
fmt_lines(
|
|
114
121
|
f"{media_type.value}: {fmt_path(media_path)}"
|
|
115
|
-
for media_type, media_path in
|
|
122
|
+
for media_type, media_path in results.items()
|
|
116
123
|
),
|
|
117
124
|
)
|
|
118
125
|
|
|
119
|
-
return
|
|
126
|
+
return results
|
|
120
127
|
|
|
121
128
|
|
|
122
129
|
def get_url_html(
|
|
123
130
|
item: Item, global_cache: bool = False, expiration_sec: float | None = None
|
|
124
|
-
) -> tuple[Url, str]:
|
|
131
|
+
) -> tuple[Url | StorePath, str]:
|
|
125
132
|
"""
|
|
126
133
|
Returns the HTML content of an URL item, using the content cache,
|
|
127
134
|
or the body of the item if it has a URL and HTML body.
|
|
128
135
|
"""
|
|
129
|
-
from kash.exec.preconditions import has_html_body,
|
|
130
|
-
|
|
131
|
-
if not item.url:
|
|
132
|
-
raise InvalidInput("Item must have a URL or an HTML body")
|
|
133
|
-
url = Url(canonicalize_url(item.url))
|
|
136
|
+
from kash.exec.preconditions import has_html_body, is_url_resource
|
|
134
137
|
|
|
135
|
-
if
|
|
136
|
-
|
|
138
|
+
if is_url_resource(item) and item.url and not item.has_body:
|
|
139
|
+
# Need to fetch the content.
|
|
140
|
+
locator = Url(canonicalize_url(item.url))
|
|
141
|
+
path = cache_file(locator, global_cache, expiration_sec).content.path
|
|
137
142
|
with open(path) as file:
|
|
138
143
|
html_content = file.read()
|
|
139
144
|
else:
|
|
140
145
|
if not item.body or not has_html_body(item):
|
|
141
|
-
raise InvalidInput("Item must
|
|
146
|
+
raise InvalidInput("Item must be a URL resource or have an HTML body")
|
|
147
|
+
if not item.store_path:
|
|
148
|
+
raise InvalidInput("Item missing store path")
|
|
142
149
|
html_content = item.body
|
|
150
|
+
locator = StorePath(item.store_path)
|
|
143
151
|
|
|
144
|
-
return
|
|
152
|
+
return locator, html_content
|
|
@@ -10,11 +10,19 @@ from funlog import log_if_modifies
|
|
|
10
10
|
from prettyfmt import fmt_path
|
|
11
11
|
from strif import atomic_output_file, copyfile_atomic
|
|
12
12
|
|
|
13
|
-
from kash.utils.common.url import
|
|
13
|
+
from kash.utils.common.url import (
|
|
14
|
+
Url,
|
|
15
|
+
is_file_url,
|
|
16
|
+
is_url,
|
|
17
|
+
is_valid_path,
|
|
18
|
+
normalize_url,
|
|
19
|
+
parse_file_url,
|
|
20
|
+
)
|
|
14
21
|
from kash.utils.errors import FileNotFound
|
|
15
|
-
from kash.utils.file_utils.file_formats_model import
|
|
22
|
+
from kash.utils.file_utils.file_formats_model import file_format_info
|
|
23
|
+
from kash.utils.file_utils.filename_parsing import parse_file_ext
|
|
16
24
|
from kash.web_content.dir_store import DirStore
|
|
17
|
-
from kash.web_content.web_fetch import download_url
|
|
25
|
+
from kash.web_content.web_fetch import HttpHeaders, download_url
|
|
18
26
|
|
|
19
27
|
log = logging.getLogger(__name__)
|
|
20
28
|
|
|
@@ -73,9 +81,42 @@ An item that can be cached as a file.
|
|
|
73
81
|
"""
|
|
74
82
|
|
|
75
83
|
|
|
84
|
+
@dataclass(frozen=True)
|
|
85
|
+
class CacheContent:
|
|
86
|
+
"""
|
|
87
|
+
An item in the local file cache. If it was a cache miss for a web-fetched URL,
|
|
88
|
+
also has HTTP headers.
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
path: Path
|
|
92
|
+
headers: HttpHeaders | None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@dataclass(frozen=True)
|
|
96
|
+
class CacheResult:
|
|
97
|
+
content: CacheContent
|
|
98
|
+
was_cached: bool
|
|
99
|
+
|
|
100
|
+
|
|
76
101
|
def _suffix_for(cacheable: Cacheable) -> str | None:
|
|
77
102
|
key = cacheable.key if isinstance(cacheable, Loadable) else cacheable
|
|
78
|
-
|
|
103
|
+
|
|
104
|
+
# Check for recognized file extensions on URLs and Paths.
|
|
105
|
+
filename_ext = parse_file_ext(str(key))
|
|
106
|
+
if filename_ext:
|
|
107
|
+
return filename_ext.dot_ext
|
|
108
|
+
|
|
109
|
+
# Handle local paths
|
|
110
|
+
if is_file_url(str(key)):
|
|
111
|
+
path = parse_file_url(str(key))
|
|
112
|
+
elif is_valid_path(str(key)):
|
|
113
|
+
path = Path(str(key))
|
|
114
|
+
else:
|
|
115
|
+
# A non-local path with no recognized extension.
|
|
116
|
+
return None
|
|
117
|
+
|
|
118
|
+
# If it's a local file, check the file content too.
|
|
119
|
+
file_ext = file_format_info(path).suggested_file_ext
|
|
79
120
|
return file_ext.dot_ext if file_ext else None
|
|
80
121
|
|
|
81
122
|
|
|
@@ -135,7 +176,7 @@ class LocalFileCache(DirStore):
|
|
|
135
176
|
if backup_url and mode in (WebCacheMode.TEST, WebCacheMode.UPDATE):
|
|
136
177
|
self._restore(backup_url)
|
|
137
178
|
|
|
138
|
-
def _load_source(self, source: Cacheable) ->
|
|
179
|
+
def _load_source(self, source: Cacheable) -> CacheContent:
|
|
139
180
|
"""
|
|
140
181
|
Load or compute the given source and save it to the cache.
|
|
141
182
|
"""
|
|
@@ -147,6 +188,7 @@ class LocalFileCache(DirStore):
|
|
|
147
188
|
suffix = _suffix_for(source)
|
|
148
189
|
cache_path = self.path_for(key, folder=self.folder, suffix=_suffix_for(source))
|
|
149
190
|
|
|
191
|
+
headers = None
|
|
150
192
|
if isinstance(source, Path) or (isinstance(source, str) and is_file_url(source)):
|
|
151
193
|
# Local file or file:// URL.
|
|
152
194
|
url_or_path = source
|
|
@@ -165,7 +207,8 @@ class LocalFileCache(DirStore):
|
|
|
165
207
|
# URL.
|
|
166
208
|
url = _normalize_url(source)
|
|
167
209
|
log.info("Downloading to cache: %s -> %s", url, fmt_path(cache_path))
|
|
168
|
-
download_url(url, cache_path)
|
|
210
|
+
headers = download_url(url, cache_path)
|
|
211
|
+
log.debug("Response headers: %s", headers)
|
|
169
212
|
elif isinstance(source, Loadable):
|
|
170
213
|
# Arbitrary loadable. Load and save (atomically).
|
|
171
214
|
with atomic_output_file(
|
|
@@ -180,7 +223,7 @@ class LocalFileCache(DirStore):
|
|
|
180
223
|
else:
|
|
181
224
|
raise ValueError(f"Invalid source: {source}")
|
|
182
225
|
|
|
183
|
-
return cache_path
|
|
226
|
+
return CacheContent(cache_path, headers)
|
|
184
227
|
|
|
185
228
|
def _age_in_sec(self, cache_path: Path) -> float:
|
|
186
229
|
now = time.time()
|
|
@@ -210,7 +253,7 @@ class LocalFileCache(DirStore):
|
|
|
210
253
|
|
|
211
254
|
return cache_path is not None and not self._is_expired(cache_path, expiration_sec)
|
|
212
255
|
|
|
213
|
-
def cache(self, source: Cacheable, expiration_sec: float | None = None) ->
|
|
256
|
+
def cache(self, source: Cacheable, expiration_sec: float | None = None) -> CacheResult:
|
|
214
257
|
"""
|
|
215
258
|
Returns cached download path of given URL and whether it was previously cached.
|
|
216
259
|
For file:// URLs does a copy.
|
|
@@ -221,13 +264,10 @@ class LocalFileCache(DirStore):
|
|
|
221
264
|
|
|
222
265
|
if cache_path and not self._is_expired(cache_path, expiration_sec):
|
|
223
266
|
log.info("URL in cache, not fetching: %s: %s", key, fmt_path(cache_path))
|
|
224
|
-
return cache_path, True
|
|
267
|
+
return CacheResult(CacheContent(cache_path, None), True)
|
|
225
268
|
else:
|
|
226
269
|
log.info("Caching new copy: %s", key)
|
|
227
|
-
return (
|
|
228
|
-
self._load_source(source),
|
|
229
|
-
False,
|
|
230
|
-
)
|
|
270
|
+
return CacheResult(self._load_source(source), False)
|
|
231
271
|
|
|
232
272
|
def backup(self) -> None:
|
|
233
273
|
if not self.backup_url:
|
kash/web_content/web_extract.py
CHANGED
|
@@ -22,7 +22,7 @@ def fetch_extract(
|
|
|
22
22
|
"""
|
|
23
23
|
expiration_sec = 0 if refetch else None
|
|
24
24
|
if use_cache:
|
|
25
|
-
path
|
|
25
|
+
path = cache_file(url, expiration_sec=expiration_sec).content.path
|
|
26
26
|
with open(path, "rb") as file:
|
|
27
27
|
content = file.read()
|
|
28
28
|
page_data = extractor(url, content)
|
|
@@ -1,9 +1,11 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
1
3
|
from kash.utils.common.url import Url
|
|
2
4
|
from kash.utils.errors import InvalidInput
|
|
3
5
|
from kash.web_content.web_page_model import WebPageData
|
|
4
6
|
|
|
5
7
|
|
|
6
|
-
def extract_text_readabilipy(
|
|
8
|
+
def extract_text_readabilipy(locator: Url | Path, html: str) -> WebPageData:
|
|
7
9
|
"""
|
|
8
10
|
Extracts text from HTML using readability.
|
|
9
11
|
This requires Node readability. Justext is an alternative and seems good for
|
|
@@ -16,7 +18,7 @@ def extract_text_readabilipy(url: Url, html: str) -> WebPageData:
|
|
|
16
18
|
raise InvalidInput("No clean HTML found")
|
|
17
19
|
|
|
18
20
|
return WebPageData(
|
|
19
|
-
|
|
21
|
+
locator=locator,
|
|
20
22
|
title=result["title"],
|
|
21
23
|
byline=result["byline"],
|
|
22
24
|
clean_html=result["content"],
|
kash/web_content/web_fetch.py
CHANGED
|
@@ -1,14 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
import logging
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from functools import cached_property
|
|
2
6
|
from pathlib import Path
|
|
3
|
-
from typing import Any
|
|
7
|
+
from typing import TYPE_CHECKING, Any
|
|
4
8
|
from urllib.parse import urlparse
|
|
5
9
|
|
|
6
|
-
import httpx
|
|
7
10
|
from strif import atomic_output_file, copyfile_atomic
|
|
8
|
-
from tqdm import tqdm
|
|
9
11
|
|
|
10
12
|
from kash.config.env_settings import KashEnv
|
|
11
13
|
from kash.utils.common.url import Url
|
|
14
|
+
from kash.utils.file_utils.file_formats import MimeType
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from httpx import Client, Response
|
|
12
18
|
|
|
13
19
|
log = logging.getLogger(__name__)
|
|
14
20
|
|
|
@@ -30,11 +36,13 @@ def fetch_url(
|
|
|
30
36
|
timeout: int = DEFAULT_TIMEOUT,
|
|
31
37
|
auth: Any | None = None,
|
|
32
38
|
headers: dict[str, str] | None = None,
|
|
33
|
-
) ->
|
|
39
|
+
) -> Response:
|
|
34
40
|
"""
|
|
35
41
|
Fetch a URL using httpx with logging and reasonable defaults.
|
|
36
42
|
Raise httpx.HTTPError for non-2xx responses.
|
|
37
43
|
"""
|
|
44
|
+
import httpx
|
|
45
|
+
|
|
38
46
|
with httpx.Client(
|
|
39
47
|
follow_redirects=True,
|
|
40
48
|
timeout=timeout,
|
|
@@ -48,36 +56,60 @@ def fetch_url(
|
|
|
48
56
|
return response
|
|
49
57
|
|
|
50
58
|
|
|
59
|
+
@dataclass(frozen=True)
|
|
60
|
+
class HttpHeaders:
|
|
61
|
+
"""
|
|
62
|
+
HTTP response headers.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
headers: dict[str, str]
|
|
66
|
+
|
|
67
|
+
@cached_property
|
|
68
|
+
def mime_type(self) -> MimeType | None:
|
|
69
|
+
"""Get content type header, if available."""
|
|
70
|
+
for key, value in self.headers.items():
|
|
71
|
+
if key.lower() == "content-type":
|
|
72
|
+
return MimeType(value)
|
|
73
|
+
return None
|
|
74
|
+
|
|
75
|
+
|
|
51
76
|
def download_url(
|
|
52
77
|
url: Url,
|
|
53
78
|
target_filename: str | Path,
|
|
54
|
-
session:
|
|
79
|
+
session: Client | None = None,
|
|
55
80
|
show_progress: bool = False,
|
|
56
81
|
timeout: int = DEFAULT_TIMEOUT,
|
|
57
82
|
auth: Any | None = None,
|
|
58
83
|
headers: dict[str, str] | None = None,
|
|
59
|
-
) -> None:
|
|
84
|
+
) -> HttpHeaders | None:
|
|
60
85
|
"""
|
|
61
86
|
Download given file, optionally with progress bar, streaming to a target file.
|
|
62
87
|
Also handles file:// and s3:// URLs. Output file is created atomically.
|
|
63
88
|
Raise httpx.HTTPError for non-2xx responses.
|
|
89
|
+
Returns response headers for HTTP/HTTPS requests, None for other URL types.
|
|
64
90
|
"""
|
|
91
|
+
import httpx
|
|
92
|
+
from tqdm import tqdm
|
|
93
|
+
|
|
65
94
|
target_filename = str(target_filename)
|
|
66
95
|
parsed_url = urlparse(url)
|
|
67
96
|
if show_progress:
|
|
68
97
|
log.info("%s", url)
|
|
69
98
|
|
|
70
99
|
if parsed_url.scheme == "file" or parsed_url.scheme == "":
|
|
71
|
-
copyfile_atomic(parsed_url.netloc + parsed_url.path, target_filename)
|
|
100
|
+
copyfile_atomic(parsed_url.netloc + parsed_url.path, target_filename, make_parents=True)
|
|
101
|
+
return None
|
|
72
102
|
elif parsed_url.scheme == "s3":
|
|
73
103
|
import boto3 # pyright: ignore
|
|
74
104
|
|
|
75
105
|
s3 = boto3.resource("s3")
|
|
76
106
|
s3_path = parsed_url.path.lstrip("/")
|
|
77
107
|
s3.Bucket(parsed_url.netloc).download_file(s3_path, target_filename)
|
|
108
|
+
return None
|
|
78
109
|
else:
|
|
79
110
|
client = session or httpx.Client(follow_redirects=True, timeout=timeout)
|
|
80
111
|
response: httpx.Response | None = None
|
|
112
|
+
response_headers: dict[str, str] | None = None
|
|
81
113
|
try:
|
|
82
114
|
headers = headers or default_headers()
|
|
83
115
|
log.debug("download_url: using headers: %s", headers)
|
|
@@ -90,6 +122,7 @@ def download_url(
|
|
|
90
122
|
headers=headers,
|
|
91
123
|
) as response:
|
|
92
124
|
response.raise_for_status()
|
|
125
|
+
response_headers = dict(response.headers)
|
|
93
126
|
total_size = int(response.headers.get("content-length", "0"))
|
|
94
127
|
|
|
95
128
|
with atomic_output_file(target_filename, make_parents=True) as temp_filename:
|
|
@@ -107,3 +140,5 @@ def download_url(
|
|
|
107
140
|
client.close()
|
|
108
141
|
if response:
|
|
109
142
|
response.raise_for_status() # In case of errors during streaming
|
|
143
|
+
|
|
144
|
+
return HttpHeaders(response_headers) if response_headers else None
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from typing import Protocol
|
|
2
3
|
|
|
3
4
|
from prettyfmt import abbrev_obj
|
|
@@ -12,7 +13,7 @@ class WebPageData:
|
|
|
12
13
|
Data about a web page, including URL, title and optionally description and extracted content.
|
|
13
14
|
"""
|
|
14
15
|
|
|
15
|
-
|
|
16
|
+
locator: Url | Path
|
|
16
17
|
title: str | None = None
|
|
17
18
|
byline: str | None = None
|
|
18
19
|
description: str | None = None
|
kash/web_gen/simple_webpage.py
CHANGED
|
@@ -15,7 +15,7 @@ def simple_webpage_render(
|
|
|
15
15
|
return render_web_template(
|
|
16
16
|
template_filename=page_template,
|
|
17
17
|
data={
|
|
18
|
-
"title": item.
|
|
18
|
+
"title": item.abbrev_title(),
|
|
19
19
|
"add_title_h1": add_title_h1,
|
|
20
20
|
"content_html": item.body_as_html(),
|
|
21
21
|
"thumbnail_url": item.thumbnail_url,
|