kash-shell 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. kash/actions/core/markdownify.py +12 -8
  2. kash/actions/core/readability.py +8 -7
  3. kash/actions/core/render_as_html.py +8 -6
  4. kash/actions/core/show_webpage.py +2 -2
  5. kash/commands/base/basic_file_commands.py +3 -0
  6. kash/commands/base/diff_commands.py +38 -3
  7. kash/commands/base/reformat_command.py +1 -1
  8. kash/commands/base/show_command.py +1 -1
  9. kash/commands/workspace/selection_commands.py +1 -1
  10. kash/commands/workspace/workspace_commands.py +92 -29
  11. kash/docs/load_source_code.py +1 -1
  12. kash/exec/action_exec.py +6 -8
  13. kash/exec/fetch_url_metadata.py +8 -5
  14. kash/exec/importing.py +4 -4
  15. kash/exec/llm_transforms.py +1 -1
  16. kash/exec/preconditions.py +30 -10
  17. kash/file_storage/file_store.py +105 -43
  18. kash/file_storage/item_file_format.py +1 -1
  19. kash/file_storage/store_filenames.py +2 -1
  20. kash/help/help_embeddings.py +2 -2
  21. kash/llm_utils/clean_headings.py +1 -1
  22. kash/{text_handling → llm_utils}/custom_sliding_transforms.py +0 -3
  23. kash/llm_utils/llm_completion.py +1 -1
  24. kash/local_server/__init__.py +1 -1
  25. kash/local_server/local_server_commands.py +2 -1
  26. kash/mcp/__init__.py +1 -1
  27. kash/mcp/mcp_server_commands.py +8 -2
  28. kash/media_base/media_cache.py +10 -3
  29. kash/model/actions_model.py +3 -0
  30. kash/model/items_model.py +78 -44
  31. kash/model/operations_model.py +14 -0
  32. kash/shell/ui/shell_results.py +2 -1
  33. kash/shell/utils/native_utils.py +2 -2
  34. kash/utils/common/format_utils.py +0 -8
  35. kash/utils/common/import_utils.py +46 -18
  36. kash/utils/common/url.py +80 -3
  37. kash/utils/file_utils/file_formats.py +3 -2
  38. kash/utils/file_utils/file_formats_model.py +47 -45
  39. kash/utils/file_utils/filename_parsing.py +41 -16
  40. kash/{text_handling → utils/text_handling}/doc_normalization.py +10 -8
  41. kash/utils/text_handling/escape_html_tags.py +156 -0
  42. kash/{text_handling → utils/text_handling}/markdown_utils.py +0 -3
  43. kash/utils/text_handling/markdownify_utils.py +87 -0
  44. kash/{text_handling → utils/text_handling}/unified_diffs.py +1 -44
  45. kash/web_content/file_cache_utils.py +42 -34
  46. kash/web_content/local_file_cache.py +53 -13
  47. kash/web_content/web_extract.py +1 -1
  48. kash/web_content/web_extract_readabilipy.py +4 -2
  49. kash/web_content/web_fetch.py +42 -7
  50. kash/web_content/web_page_model.py +2 -1
  51. kash/web_gen/simple_webpage.py +1 -1
  52. kash/web_gen/templates/base_styles.css.jinja +134 -16
  53. kash/web_gen/templates/simple_webpage.html.jinja +1 -1
  54. kash/workspaces/selections.py +2 -2
  55. kash/workspaces/workspace_output.py +2 -2
  56. kash/xonsh_custom/load_into_xonsh.py +4 -2
  57. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/METADATA +1 -1
  58. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/RECORD +62 -62
  59. kash/utils/common/inflection.py +0 -22
  60. kash/workspaces/workspace_importing.py +0 -56
  61. /kash/{text_handling → utils/text_handling}/markdown_render.py +0 -0
  62. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/WHEEL +0 -0
  63. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/entry_points.txt +0 -0
  64. {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/licenses/LICENSE +0 -0
@@ -9,11 +9,12 @@ from kash.config.logger import get_logger
9
9
  from kash.config.settings import atomic_global_settings, global_settings
10
10
  from kash.model.items_model import Item
11
11
  from kash.model.media_model import MediaType
12
+ from kash.model.paths_model import StorePath
12
13
  from kash.utils.common.url import Url
13
14
  from kash.utils.errors import FileNotFound, InvalidInput
14
15
  from kash.utils.file_utils.file_formats_model import detect_media_type
15
16
  from kash.web_content.canon_url import canonicalize_url
16
- from kash.web_content.local_file_cache import Loadable, LocalFileCache
17
+ from kash.web_content.local_file_cache import CacheResult, Loadable, LocalFileCache
17
18
 
18
19
  log = get_logger(__name__)
19
20
 
@@ -40,7 +41,7 @@ def reset_content_cache_dir(path: Path):
40
41
 
41
42
  def cache_file(
42
43
  source: Url | Path | Loadable, global_cache: bool = False, expiration_sec: float | None = None
43
- ) -> tuple[Path, bool]:
44
+ ) -> CacheResult:
44
45
  """
45
46
  Return a local cached copy of the item. If it is an URL, content is fetched.
46
47
  If it is a Path or a Loadable, a cached copy is returned.
@@ -50,8 +51,7 @@ def cache_file(
50
51
  in which case the global cache is used.
51
52
  """
52
53
  cache = _global_content_cache if global_cache else _content_cache
53
- path, was_cached = cache.cache(source, expiration_sec)
54
- return path, was_cached
54
+ return cache.cache(source, expiration_sec)
55
55
 
56
56
 
57
57
  def cache_api_response(
@@ -64,9 +64,9 @@ def cache_api_response(
64
64
  Cache an API response. By default parse the response as JSON.
65
65
  """
66
66
  cache = _global_content_cache if global_cache else _content_cache
67
- path, was_cached = cache.cache(url, expiration_sec)
68
- result = parser(path.read_text())
69
- return result, was_cached
67
+ result = cache.cache(url, expiration_sec)
68
+ parsed_result = parser(result.content.path.read_text())
69
+ return parsed_result, result.was_cached
70
70
 
71
71
 
72
72
  def cache_resource(
@@ -74,7 +74,8 @@ def cache_resource(
74
74
  ) -> dict[MediaType, Path]:
75
75
  """
76
76
  Cache a resource item for an external local path or a URL, fetching or
77
- copying as needed. For media this may yield more than one format.
77
+ copying as needed and returning direct paths to the cached content.
78
+ For media this may yield more than one format.
78
79
  """
79
80
  from kash.exec.preconditions import is_resource
80
81
  from kash.media_base.media_services import is_media_url
@@ -83,62 +84,69 @@ def cache_resource(
83
84
  if not is_resource(item):
84
85
  raise ValueError(f"Item is not a resource: {item}")
85
86
 
86
- path = None
87
- result: dict[MediaType, Path] = {}
87
+ path: Path | None = None
88
+ results: dict[MediaType, Path] = {}
89
+ cache_result: CacheResult | None = None
90
+
91
+ # Cache the content using media or content cache.
88
92
  if item.url:
89
93
  if is_media_url(item.url):
90
- result = cache_media(item.url)
94
+ results = cache_media(item.url)
91
95
  else:
92
- path, _was_cached = cache_file(item.url, global_cache, expiration_sec)
96
+ cache_result = cache_file(item.url, global_cache, expiration_sec)
93
97
  elif item.external_path:
94
- path = Path(item.external_path)
95
- if not path.is_file():
96
- raise FileNotFound(f"External path not found: {path}")
97
- path, _was_cached = cache_file(path, global_cache, expiration_sec)
98
+ ext_path = Path(item.external_path)
99
+ if not ext_path.is_file():
100
+ raise FileNotFound(f"External path not found: {ext_path}")
101
+ cache_result = cache_file(ext_path, global_cache, expiration_sec)
98
102
  elif item.original_filename:
99
- path = Path(item.original_filename)
100
- if not path.is_file():
101
- raise FileNotFound(f"Original filename not found: {path}")
102
- path, _was_cached = cache_file(path, global_cache, expiration_sec)
103
+ orig_path = Path(item.original_filename)
104
+ if not orig_path.is_file():
105
+ raise FileNotFound(f"Original filename not found: {orig_path}")
106
+ cache_result = cache_file(orig_path, global_cache, expiration_sec)
103
107
  else:
104
108
  raise ValueError(f"Item has no URL or external path: {item}")
105
109
 
110
+ if cache_result:
111
+ path = cache_result.content.path
112
+
106
113
  # If we just have the local file path, determine its format.
107
- if not result and path:
108
- result = {detect_media_type(path): path}
114
+ if not results and path:
115
+ results = {detect_media_type(path): path}
109
116
 
110
117
  log.message(
111
118
  "Cached resource %s:\n%s",
112
119
  item.as_str_brief(),
113
120
  fmt_lines(
114
121
  f"{media_type.value}: {fmt_path(media_path)}"
115
- for media_type, media_path in result.items()
122
+ for media_type, media_path in results.items()
116
123
  ),
117
124
  )
118
125
 
119
- return result
126
+ return results
120
127
 
121
128
 
122
129
  def get_url_html(
123
130
  item: Item, global_cache: bool = False, expiration_sec: float | None = None
124
- ) -> tuple[Url, str]:
131
+ ) -> tuple[Url | StorePath, str]:
125
132
  """
126
133
  Returns the HTML content of an URL item, using the content cache,
127
134
  or the body of the item if it has a URL and HTML body.
128
135
  """
129
- from kash.exec.preconditions import has_html_body, is_url_item
130
-
131
- if not item.url:
132
- raise InvalidInput("Item must have a URL or an HTML body")
133
- url = Url(canonicalize_url(item.url))
136
+ from kash.exec.preconditions import has_html_body, is_url_resource
134
137
 
135
- if is_url_item(item):
136
- path, _was_cached = cache_file(url, global_cache, expiration_sec)
138
+ if is_url_resource(item) and item.url and not item.has_body:
139
+ # Need to fetch the content.
140
+ locator = Url(canonicalize_url(item.url))
141
+ path = cache_file(locator, global_cache, expiration_sec).content.path
137
142
  with open(path) as file:
138
143
  html_content = file.read()
139
144
  else:
140
145
  if not item.body or not has_html_body(item):
141
- raise InvalidInput("Item must have a URL or an HTML body")
146
+ raise InvalidInput("Item must be a URL resource or have an HTML body")
147
+ if not item.store_path:
148
+ raise InvalidInput("Item missing store path")
142
149
  html_content = item.body
150
+ locator = StorePath(item.store_path)
143
151
 
144
- return url, html_content
152
+ return locator, html_content
@@ -10,11 +10,19 @@ from funlog import log_if_modifies
10
10
  from prettyfmt import fmt_path
11
11
  from strif import atomic_output_file, copyfile_atomic
12
12
 
13
- from kash.utils.common.url import Url, is_file_url, is_url, normalize_url, parse_file_url
13
+ from kash.utils.common.url import (
14
+ Url,
15
+ is_file_url,
16
+ is_url,
17
+ is_valid_path,
18
+ normalize_url,
19
+ parse_file_url,
20
+ )
14
21
  from kash.utils.errors import FileNotFound
15
- from kash.utils.file_utils.file_formats_model import choose_file_ext
22
+ from kash.utils.file_utils.file_formats_model import file_format_info
23
+ from kash.utils.file_utils.filename_parsing import parse_file_ext
16
24
  from kash.web_content.dir_store import DirStore
17
- from kash.web_content.web_fetch import download_url
25
+ from kash.web_content.web_fetch import HttpHeaders, download_url
18
26
 
19
27
  log = logging.getLogger(__name__)
20
28
 
@@ -73,9 +81,42 @@ An item that can be cached as a file.
73
81
  """
74
82
 
75
83
 
84
+ @dataclass(frozen=True)
85
+ class CacheContent:
86
+ """
87
+ An item in the local file cache. If it was a cache miss for a web-fetched URL,
88
+ also has HTTP headers.
89
+ """
90
+
91
+ path: Path
92
+ headers: HttpHeaders | None
93
+
94
+
95
+ @dataclass(frozen=True)
96
+ class CacheResult:
97
+ content: CacheContent
98
+ was_cached: bool
99
+
100
+
76
101
  def _suffix_for(cacheable: Cacheable) -> str | None:
77
102
  key = cacheable.key if isinstance(cacheable, Loadable) else cacheable
78
- file_ext = choose_file_ext(key)
103
+
104
+ # Check for recognized file extensions on URLs and Paths.
105
+ filename_ext = parse_file_ext(str(key))
106
+ if filename_ext:
107
+ return filename_ext.dot_ext
108
+
109
+ # Handle local paths
110
+ if is_file_url(str(key)):
111
+ path = parse_file_url(str(key))
112
+ elif is_valid_path(str(key)):
113
+ path = Path(str(key))
114
+ else:
115
+ # A non-local path with no recognized extension.
116
+ return None
117
+
118
+ # If it's a local file, check the file content too.
119
+ file_ext = file_format_info(path).suggested_file_ext
79
120
  return file_ext.dot_ext if file_ext else None
80
121
 
81
122
 
@@ -135,7 +176,7 @@ class LocalFileCache(DirStore):
135
176
  if backup_url and mode in (WebCacheMode.TEST, WebCacheMode.UPDATE):
136
177
  self._restore(backup_url)
137
178
 
138
- def _load_source(self, source: Cacheable) -> Path:
179
+ def _load_source(self, source: Cacheable) -> CacheContent:
139
180
  """
140
181
  Load or compute the given source and save it to the cache.
141
182
  """
@@ -147,6 +188,7 @@ class LocalFileCache(DirStore):
147
188
  suffix = _suffix_for(source)
148
189
  cache_path = self.path_for(key, folder=self.folder, suffix=_suffix_for(source))
149
190
 
191
+ headers = None
150
192
  if isinstance(source, Path) or (isinstance(source, str) and is_file_url(source)):
151
193
  # Local file or file:// URL.
152
194
  url_or_path = source
@@ -165,7 +207,8 @@ class LocalFileCache(DirStore):
165
207
  # URL.
166
208
  url = _normalize_url(source)
167
209
  log.info("Downloading to cache: %s -> %s", url, fmt_path(cache_path))
168
- download_url(url, cache_path)
210
+ headers = download_url(url, cache_path)
211
+ log.debug("Response headers: %s", headers)
169
212
  elif isinstance(source, Loadable):
170
213
  # Arbitrary loadable. Load and save (atomically).
171
214
  with atomic_output_file(
@@ -180,7 +223,7 @@ class LocalFileCache(DirStore):
180
223
  else:
181
224
  raise ValueError(f"Invalid source: {source}")
182
225
 
183
- return cache_path
226
+ return CacheContent(cache_path, headers)
184
227
 
185
228
  def _age_in_sec(self, cache_path: Path) -> float:
186
229
  now = time.time()
@@ -210,7 +253,7 @@ class LocalFileCache(DirStore):
210
253
 
211
254
  return cache_path is not None and not self._is_expired(cache_path, expiration_sec)
212
255
 
213
- def cache(self, source: Cacheable, expiration_sec: float | None = None) -> tuple[Path, bool]:
256
+ def cache(self, source: Cacheable, expiration_sec: float | None = None) -> CacheResult:
214
257
  """
215
258
  Returns cached download path of given URL and whether it was previously cached.
216
259
  For file:// URLs does a copy.
@@ -221,13 +264,10 @@ class LocalFileCache(DirStore):
221
264
 
222
265
  if cache_path and not self._is_expired(cache_path, expiration_sec):
223
266
  log.info("URL in cache, not fetching: %s: %s", key, fmt_path(cache_path))
224
- return cache_path, True
267
+ return CacheResult(CacheContent(cache_path, None), True)
225
268
  else:
226
269
  log.info("Caching new copy: %s", key)
227
- return (
228
- self._load_source(source),
229
- False,
230
- )
270
+ return CacheResult(self._load_source(source), False)
231
271
 
232
272
  def backup(self) -> None:
233
273
  if not self.backup_url:
@@ -22,7 +22,7 @@ def fetch_extract(
22
22
  """
23
23
  expiration_sec = 0 if refetch else None
24
24
  if use_cache:
25
- path, _was_cached = cache_file(url, expiration_sec=expiration_sec)
25
+ path = cache_file(url, expiration_sec=expiration_sec).content.path
26
26
  with open(path, "rb") as file:
27
27
  content = file.read()
28
28
  page_data = extractor(url, content)
@@ -1,9 +1,11 @@
1
+ from pathlib import Path
2
+
1
3
  from kash.utils.common.url import Url
2
4
  from kash.utils.errors import InvalidInput
3
5
  from kash.web_content.web_page_model import WebPageData
4
6
 
5
7
 
6
- def extract_text_readabilipy(url: Url, html: str) -> WebPageData:
8
+ def extract_text_readabilipy(locator: Url | Path, html: str) -> WebPageData:
7
9
  """
8
10
  Extracts text from HTML using readability.
9
11
  This requires Node readability. Justext is an alternative and seems good for
@@ -16,7 +18,7 @@ def extract_text_readabilipy(url: Url, html: str) -> WebPageData:
16
18
  raise InvalidInput("No clean HTML found")
17
19
 
18
20
  return WebPageData(
19
- url=url,
21
+ locator=locator,
20
22
  title=result["title"],
21
23
  byline=result["byline"],
22
24
  clean_html=result["content"],
@@ -1,14 +1,20 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
4
+ from dataclasses import dataclass
5
+ from functools import cached_property
2
6
  from pathlib import Path
3
- from typing import Any
7
+ from typing import TYPE_CHECKING, Any
4
8
  from urllib.parse import urlparse
5
9
 
6
- import httpx
7
10
  from strif import atomic_output_file, copyfile_atomic
8
- from tqdm import tqdm
9
11
 
10
12
  from kash.config.env_settings import KashEnv
11
13
  from kash.utils.common.url import Url
14
+ from kash.utils.file_utils.file_formats import MimeType
15
+
16
+ if TYPE_CHECKING:
17
+ from httpx import Client, Response
12
18
 
13
19
  log = logging.getLogger(__name__)
14
20
 
@@ -30,11 +36,13 @@ def fetch_url(
30
36
  timeout: int = DEFAULT_TIMEOUT,
31
37
  auth: Any | None = None,
32
38
  headers: dict[str, str] | None = None,
33
- ) -> httpx.Response:
39
+ ) -> Response:
34
40
  """
35
41
  Fetch a URL using httpx with logging and reasonable defaults.
36
42
  Raise httpx.HTTPError for non-2xx responses.
37
43
  """
44
+ import httpx
45
+
38
46
  with httpx.Client(
39
47
  follow_redirects=True,
40
48
  timeout=timeout,
@@ -48,36 +56,60 @@ def fetch_url(
48
56
  return response
49
57
 
50
58
 
59
+ @dataclass(frozen=True)
60
+ class HttpHeaders:
61
+ """
62
+ HTTP response headers.
63
+ """
64
+
65
+ headers: dict[str, str]
66
+
67
+ @cached_property
68
+ def mime_type(self) -> MimeType | None:
69
+ """Get content type header, if available."""
70
+ for key, value in self.headers.items():
71
+ if key.lower() == "content-type":
72
+ return MimeType(value)
73
+ return None
74
+
75
+
51
76
  def download_url(
52
77
  url: Url,
53
78
  target_filename: str | Path,
54
- session: httpx.Client | None = None,
79
+ session: Client | None = None,
55
80
  show_progress: bool = False,
56
81
  timeout: int = DEFAULT_TIMEOUT,
57
82
  auth: Any | None = None,
58
83
  headers: dict[str, str] | None = None,
59
- ) -> None:
84
+ ) -> HttpHeaders | None:
60
85
  """
61
86
  Download given file, optionally with progress bar, streaming to a target file.
62
87
  Also handles file:// and s3:// URLs. Output file is created atomically.
63
88
  Raise httpx.HTTPError for non-2xx responses.
89
+ Returns response headers for HTTP/HTTPS requests, None for other URL types.
64
90
  """
91
+ import httpx
92
+ from tqdm import tqdm
93
+
65
94
  target_filename = str(target_filename)
66
95
  parsed_url = urlparse(url)
67
96
  if show_progress:
68
97
  log.info("%s", url)
69
98
 
70
99
  if parsed_url.scheme == "file" or parsed_url.scheme == "":
71
- copyfile_atomic(parsed_url.netloc + parsed_url.path, target_filename)
100
+ copyfile_atomic(parsed_url.netloc + parsed_url.path, target_filename, make_parents=True)
101
+ return None
72
102
  elif parsed_url.scheme == "s3":
73
103
  import boto3 # pyright: ignore
74
104
 
75
105
  s3 = boto3.resource("s3")
76
106
  s3_path = parsed_url.path.lstrip("/")
77
107
  s3.Bucket(parsed_url.netloc).download_file(s3_path, target_filename)
108
+ return None
78
109
  else:
79
110
  client = session or httpx.Client(follow_redirects=True, timeout=timeout)
80
111
  response: httpx.Response | None = None
112
+ response_headers: dict[str, str] | None = None
81
113
  try:
82
114
  headers = headers or default_headers()
83
115
  log.debug("download_url: using headers: %s", headers)
@@ -90,6 +122,7 @@ def download_url(
90
122
  headers=headers,
91
123
  ) as response:
92
124
  response.raise_for_status()
125
+ response_headers = dict(response.headers)
93
126
  total_size = int(response.headers.get("content-length", "0"))
94
127
 
95
128
  with atomic_output_file(target_filename, make_parents=True) as temp_filename:
@@ -107,3 +140,5 @@ def download_url(
107
140
  client.close()
108
141
  if response:
109
142
  response.raise_for_status() # In case of errors during streaming
143
+
144
+ return HttpHeaders(response_headers) if response_headers else None
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from typing import Protocol
2
3
 
3
4
  from prettyfmt import abbrev_obj
@@ -12,7 +13,7 @@ class WebPageData:
12
13
  Data about a web page, including URL, title and optionally description and extracted content.
13
14
  """
14
15
 
15
- url: Url
16
+ locator: Url | Path
16
17
  title: str | None = None
17
18
  byline: str | None = None
18
19
  description: str | None = None
@@ -15,7 +15,7 @@ def simple_webpage_render(
15
15
  return render_web_template(
16
16
  template_filename=page_template,
17
17
  data={
18
- "title": item.title,
18
+ "title": item.abbrev_title(),
19
19
  "add_title_h1": add_title_h1,
20
20
  "content_html": item.body_as_html(),
21
21
  "thumbnail_url": item.thumbnail_url,