PyPI - kash-shell - Versions diffs - 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl - Mend

kash-shell 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

kash/actions/core/markdownify.py +12 -8
kash/actions/core/readability.py +8 -7
kash/actions/core/render_as_html.py +8 -6
kash/actions/core/show_webpage.py +2 -2
kash/commands/base/basic_file_commands.py +3 -0
kash/commands/base/diff_commands.py +38 -3
kash/commands/base/reformat_command.py +1 -1
kash/commands/base/show_command.py +1 -1
kash/commands/workspace/selection_commands.py +1 -1
kash/commands/workspace/workspace_commands.py +92 -29
kash/docs/load_source_code.py +1 -1
kash/exec/action_exec.py +6 -8
kash/exec/fetch_url_metadata.py +8 -5
kash/exec/importing.py +4 -4
kash/exec/llm_transforms.py +1 -1
kash/exec/preconditions.py +30 -10
kash/file_storage/file_store.py +105 -43
kash/file_storage/item_file_format.py +1 -1
kash/file_storage/store_filenames.py +2 -1
kash/help/help_embeddings.py +2 -2
kash/llm_utils/clean_headings.py +1 -1
kash/{text_handling → llm_utils}/custom_sliding_transforms.py +0 -3
kash/llm_utils/llm_completion.py +1 -1
kash/local_server/__init__.py +1 -1
kash/local_server/local_server_commands.py +2 -1
kash/mcp/__init__.py +1 -1
kash/mcp/mcp_server_commands.py +8 -2
kash/media_base/media_cache.py +10 -3
kash/model/actions_model.py +3 -0
kash/model/items_model.py +78 -44
kash/model/operations_model.py +14 -0
kash/shell/ui/shell_results.py +2 -1
kash/shell/utils/native_utils.py +2 -2
kash/utils/common/format_utils.py +0 -8
kash/utils/common/import_utils.py +46 -18
kash/utils/common/url.py +80 -3
kash/utils/file_utils/file_formats.py +3 -2
kash/utils/file_utils/file_formats_model.py +47 -45
kash/utils/file_utils/filename_parsing.py +41 -16
kash/{text_handling → utils/text_handling}/doc_normalization.py +10 -8
kash/utils/text_handling/escape_html_tags.py +156 -0
kash/{text_handling → utils/text_handling}/markdown_utils.py +0 -3
kash/utils/text_handling/markdownify_utils.py +87 -0
kash/{text_handling → utils/text_handling}/unified_diffs.py +1 -44
kash/web_content/file_cache_utils.py +42 -34
kash/web_content/local_file_cache.py +53 -13
kash/web_content/web_extract.py +1 -1
kash/web_content/web_extract_readabilipy.py +4 -2
kash/web_content/web_fetch.py +42 -7
kash/web_content/web_page_model.py +2 -1
kash/web_gen/simple_webpage.py +1 -1
kash/web_gen/templates/base_styles.css.jinja +134 -16
kash/web_gen/templates/simple_webpage.html.jinja +1 -1
kash/workspaces/selections.py +2 -2
kash/workspaces/workspace_output.py +2 -2
kash/xonsh_custom/load_into_xonsh.py +4 -2
{kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/METADATA +1 -1
{kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/RECORD +62 -62
kash/utils/common/inflection.py +0 -22
kash/workspaces/workspace_importing.py +0 -56
kash/{text_handling → utils/text_handling}/markdown_render.py +0 -0
{kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/WHEEL +0 -0
{kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/entry_points.txt +0 -0
{kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/licenses/LICENSE +0 -0

kash/web_content/file_cache_utils.py CHANGED Viewed

@@ -9,11 +9,12 @@ from kash.config.logger import get_logger
 from kash.config.settings import atomic_global_settings, global_settings
 from kash.model.items_model import Item
 from kash.model.media_model import MediaType
+from kash.model.paths_model import StorePath
 from kash.utils.common.url import Url
 from kash.utils.errors import FileNotFound, InvalidInput
 from kash.utils.file_utils.file_formats_model import detect_media_type
 from kash.web_content.canon_url import canonicalize_url
-from kash.web_content.local_file_cache import Loadable, LocalFileCache
+from kash.web_content.local_file_cache import CacheResult, Loadable, LocalFileCache
 log = get_logger(__name__)
@@ -40,7 +41,7 @@ def reset_content_cache_dir(path: Path):
 def cache_file(
     source: Url | Path | Loadable, global_cache: bool = False, expiration_sec: float | None = None
-) -> tuple[Path, bool]:
+) -> CacheResult:
     """
     Return a local cached copy of the item. If it is an URL, content is fetched.
     If it is a Path or a Loadable, a cached copy is returned.
@@ -50,8 +51,7 @@ def cache_file(
     in which case the global cache is used.
     """
     cache = _global_content_cache if global_cache else _content_cache
-    path, was_cached = cache.cache(source, expiration_sec)
-    return path, was_cached
+    return cache.cache(source, expiration_sec)
 def cache_api_response(
@@ -64,9 +64,9 @@ def cache_api_response(
     Cache an API response. By default parse the response as JSON.
     """
     cache = _global_content_cache if global_cache else _content_cache
-    path, was_cached = cache.cache(url, expiration_sec)
-    result = parser(path.read_text())
-    return result, was_cached
+    result = cache.cache(url, expiration_sec)
+    parsed_result = parser(result.content.path.read_text())
+    return parsed_result, result.was_cached
 def cache_resource(
@@ -74,7 +74,8 @@ def cache_resource(
 ) -> dict[MediaType, Path]:
     """
     Cache a resource item for an external local path or a URL, fetching or
-    copying as needed. For media this may yield more than one format.
+    copying as needed and returning direct paths to the cached content.
+    For media this may yield more than one format.
     """
     from kash.exec.preconditions import is_resource
     from kash.media_base.media_services import is_media_url
@@ -83,62 +84,69 @@ def cache_resource(
     if not is_resource(item):
         raise ValueError(f"Item is not a resource: {item}")
-    path = None
-    result: dict[MediaType, Path] = {}
+    path: Path | None = None
+    results: dict[MediaType, Path] = {}
+    cache_result: CacheResult | None = None
+    # Cache the content using media or content cache.
     if item.url:
         if is_media_url(item.url):
-            result = cache_media(item.url)
+            results = cache_media(item.url)
         else:
-            path, _was_cached = cache_file(item.url, global_cache, expiration_sec)
+            cache_result = cache_file(item.url, global_cache, expiration_sec)
     elif item.external_path:
-        path = Path(item.external_path)
-        if not path.is_file():
-            raise FileNotFound(f"External path not found: {path}")
-        path, _was_cached = cache_file(path, global_cache, expiration_sec)
+        ext_path = Path(item.external_path)
+        if not ext_path.is_file():
+            raise FileNotFound(f"External path not found: {ext_path}")
+        cache_result = cache_file(ext_path, global_cache, expiration_sec)
     elif item.original_filename:
-        path = Path(item.original_filename)
-        if not path.is_file():
-            raise FileNotFound(f"Original filename not found: {path}")
-        path, _was_cached = cache_file(path, global_cache, expiration_sec)
+        orig_path = Path(item.original_filename)
+        if not orig_path.is_file():
+            raise FileNotFound(f"Original filename not found: {orig_path}")
+        cache_result = cache_file(orig_path, global_cache, expiration_sec)
     else:
         raise ValueError(f"Item has no URL or external path: {item}")
+    if cache_result:
+        path = cache_result.content.path
     # If we just have the local file path, determine its format.
-    if not result and path:
-        result = {detect_media_type(path): path}
+    if not results and path:
+        results = {detect_media_type(path): path}
     log.message(
         "Cached resource %s:\n%s",
         item.as_str_brief(),
         fmt_lines(
             f"{media_type.value}: {fmt_path(media_path)}"
-            for media_type, media_path in result.items()
+            for media_type, media_path in results.items()
         ),
     )
-    return result
+    return results
 def get_url_html(
     item: Item, global_cache: bool = False, expiration_sec: float | None = None
-) -> tuple[Url, str]:
+) -> tuple[Url | StorePath, str]:
     """
     Returns the HTML content of an URL item, using the content cache,
     or the body of the item if it has a URL and HTML body.
     """
-    from kash.exec.preconditions import has_html_body, is_url_item
-    if not item.url:
-        raise InvalidInput("Item must have a URL or an HTML body")
-    url = Url(canonicalize_url(item.url))
+    from kash.exec.preconditions import has_html_body, is_url_resource
-    if is_url_item(item):
-        path, _was_cached = cache_file(url, global_cache, expiration_sec)
+    if is_url_resource(item) and item.url and not item.has_body:
+        # Need to fetch the content.
+        locator = Url(canonicalize_url(item.url))
+        path = cache_file(locator, global_cache, expiration_sec).content.path
         with open(path) as file:
             html_content = file.read()
     else:
         if not item.body or not has_html_body(item):
-            raise InvalidInput("Item must have a URL or an HTML body")
+            raise InvalidInput("Item must be a URL resource or have an HTML body")
+        if not item.store_path:
+            raise InvalidInput("Item missing store path")
         html_content = item.body
+        locator = StorePath(item.store_path)
-    return url, html_content
+    return locator, html_content

kash/web_content/local_file_cache.py CHANGED Viewed

@@ -10,11 +10,19 @@ from funlog import log_if_modifies
 from prettyfmt import fmt_path
 from strif import atomic_output_file, copyfile_atomic
-from kash.utils.common.url import Url, is_file_url, is_url, normalize_url, parse_file_url
+from kash.utils.common.url import (
+    Url,
+    is_file_url,
+    is_url,
+    is_valid_path,
+    normalize_url,
+    parse_file_url,
+)
 from kash.utils.errors import FileNotFound
-from kash.utils.file_utils.file_formats_model import choose_file_ext
+from kash.utils.file_utils.file_formats_model import file_format_info
+from kash.utils.file_utils.filename_parsing import parse_file_ext
 from kash.web_content.dir_store import DirStore
-from kash.web_content.web_fetch import download_url
+from kash.web_content.web_fetch import HttpHeaders, download_url
 log = logging.getLogger(__name__)
@@ -73,9 +81,42 @@ An item that can be cached as a file.
 """
+@dataclass(frozen=True)
+class CacheContent:
+    """
+    An item in the local file cache. If it was a cache miss for a web-fetched URL,
+    also has HTTP headers.
+    """
+    path: Path
+    headers: HttpHeaders | None
+@dataclass(frozen=True)
+class CacheResult:
+    content: CacheContent
+    was_cached: bool
 def _suffix_for(cacheable: Cacheable) -> str | None:
     key = cacheable.key if isinstance(cacheable, Loadable) else cacheable
-    file_ext = choose_file_ext(key)
+    # Check for recognized file extensions on URLs and Paths.
+    filename_ext = parse_file_ext(str(key))
+    if filename_ext:
+        return filename_ext.dot_ext
+    # Handle local paths
+    if is_file_url(str(key)):
+        path = parse_file_url(str(key))
+    elif is_valid_path(str(key)):
+        path = Path(str(key))
+    else:
+        # A non-local path with no recognized extension.
+        return None
+    # If it's a local file, check the file content too.
+    file_ext = file_format_info(path).suggested_file_ext
     return file_ext.dot_ext if file_ext else None
@@ -135,7 +176,7 @@ class LocalFileCache(DirStore):
         if backup_url and mode in (WebCacheMode.TEST, WebCacheMode.UPDATE):
             self._restore(backup_url)
-    def _load_source(self, source: Cacheable) -> Path:
+    def _load_source(self, source: Cacheable) -> CacheContent:
         """
         Load or compute the given source and save it to the cache.
         """
@@ -147,6 +188,7 @@ class LocalFileCache(DirStore):
         suffix = _suffix_for(source)
         cache_path = self.path_for(key, folder=self.folder, suffix=_suffix_for(source))
+        headers = None
         if isinstance(source, Path) or (isinstance(source, str) and is_file_url(source)):
             # Local file or file:// URL.
             url_or_path = source
@@ -165,7 +207,8 @@ class LocalFileCache(DirStore):
             # URL.
             url = _normalize_url(source)
             log.info("Downloading to cache: %s -> %s", url, fmt_path(cache_path))
-            download_url(url, cache_path)
+            headers = download_url(url, cache_path)
+            log.debug("Response headers: %s", headers)
         elif isinstance(source, Loadable):
             # Arbitrary loadable. Load and save (atomically).
             with atomic_output_file(
@@ -180,7 +223,7 @@ class LocalFileCache(DirStore):
         else:
             raise ValueError(f"Invalid source: {source}")
-        return cache_path
+        return CacheContent(cache_path, headers)
     def _age_in_sec(self, cache_path: Path) -> float:
         now = time.time()
@@ -210,7 +253,7 @@ class LocalFileCache(DirStore):
         return cache_path is not None and not self._is_expired(cache_path, expiration_sec)
-    def cache(self, source: Cacheable, expiration_sec: float | None = None) -> tuple[Path, bool]:
+    def cache(self, source: Cacheable, expiration_sec: float | None = None) -> CacheResult:
         """
         Returns cached download path of given URL and whether it was previously cached.
         For file:// URLs does a copy.
@@ -221,13 +264,10 @@ class LocalFileCache(DirStore):
         if cache_path and not self._is_expired(cache_path, expiration_sec):
             log.info("URL in cache, not fetching: %s: %s", key, fmt_path(cache_path))
-            return cache_path, True
+            return CacheResult(CacheContent(cache_path, None), True)
         else:
             log.info("Caching new copy: %s", key)
-            return (
-                self._load_source(source),
-                False,
-            )
+            return CacheResult(self._load_source(source), False)
     def backup(self) -> None:
         if not self.backup_url:

kash/web_content/web_extract.py CHANGED Viewed

@@ -22,7 +22,7 @@ def fetch_extract(
     """
     expiration_sec = 0 if refetch else None
     if use_cache:
-        path, _was_cached = cache_file(url, expiration_sec=expiration_sec)
+        path = cache_file(url, expiration_sec=expiration_sec).content.path
         with open(path, "rb") as file:
             content = file.read()
         page_data = extractor(url, content)

kash/web_content/web_extract_readabilipy.py CHANGED Viewed

@@ -1,9 +1,11 @@
+from pathlib import Path
 from kash.utils.common.url import Url
 from kash.utils.errors import InvalidInput
 from kash.web_content.web_page_model import WebPageData
-def extract_text_readabilipy(url: Url, html: str) -> WebPageData:
+def extract_text_readabilipy(locator: Url | Path, html: str) -> WebPageData:
     """
     Extracts text from HTML using readability.
     This requires Node readability. Justext is an alternative and seems good for
@@ -16,7 +18,7 @@ def extract_text_readabilipy(url: Url, html: str) -> WebPageData:
         raise InvalidInput("No clean HTML found")
     return WebPageData(
-        url=url,
+        locator=locator,
         title=result["title"],
         byline=result["byline"],
         clean_html=result["content"],

kash/web_content/web_fetch.py CHANGED Viewed

@@ -1,14 +1,20 @@
+from __future__ import annotations
 import logging
+from dataclasses import dataclass
+from functools import cached_property
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse
-import httpx
 from strif import atomic_output_file, copyfile_atomic
-from tqdm import tqdm
 from kash.config.env_settings import KashEnv
 from kash.utils.common.url import Url
+from kash.utils.file_utils.file_formats import MimeType
+if TYPE_CHECKING:
+    from httpx import Client, Response
 log = logging.getLogger(__name__)
@@ -30,11 +36,13 @@ def fetch_url(
     timeout: int = DEFAULT_TIMEOUT,
     auth: Any | None = None,
     headers: dict[str, str] | None = None,
-) -> httpx.Response:
+) -> Response:
     """
     Fetch a URL using httpx with logging and reasonable defaults.
     Raise httpx.HTTPError for non-2xx responses.
     """
+    import httpx
     with httpx.Client(
         follow_redirects=True,
         timeout=timeout,
@@ -48,36 +56,60 @@ def fetch_url(
         return response
+@dataclass(frozen=True)
+class HttpHeaders:
+    """
+    HTTP response headers.
+    """
+    headers: dict[str, str]
+    @cached_property
+    def mime_type(self) -> MimeType | None:
+        """Get content type header, if available."""
+        for key, value in self.headers.items():
+            if key.lower() == "content-type":
+                return MimeType(value)
+        return None
 def download_url(
     url: Url,
     target_filename: str | Path,
-    session: httpx.Client | None = None,
+    session: Client | None = None,
     show_progress: bool = False,
     timeout: int = DEFAULT_TIMEOUT,
     auth: Any | None = None,
     headers: dict[str, str] | None = None,
-) -> None:
+) -> HttpHeaders | None:
     """
     Download given file, optionally with progress bar, streaming to a target file.
     Also handles file:// and s3:// URLs. Output file is created atomically.
     Raise httpx.HTTPError for non-2xx responses.
+    Returns response headers for HTTP/HTTPS requests, None for other URL types.
     """
+    import httpx
+    from tqdm import tqdm
     target_filename = str(target_filename)
     parsed_url = urlparse(url)
     if show_progress:
         log.info("%s", url)
     if parsed_url.scheme == "file" or parsed_url.scheme == "":
-        copyfile_atomic(parsed_url.netloc + parsed_url.path, target_filename)
+        copyfile_atomic(parsed_url.netloc + parsed_url.path, target_filename, make_parents=True)
+        return None
     elif parsed_url.scheme == "s3":
         import boto3  # pyright: ignore
         s3 = boto3.resource("s3")
         s3_path = parsed_url.path.lstrip("/")
         s3.Bucket(parsed_url.netloc).download_file(s3_path, target_filename)
+        return None
     else:
         client = session or httpx.Client(follow_redirects=True, timeout=timeout)
         response: httpx.Response | None = None
+        response_headers: dict[str, str] | None = None
         try:
             headers = headers or default_headers()
             log.debug("download_url: using headers: %s", headers)
@@ -90,6 +122,7 @@ def download_url(
                 headers=headers,
             ) as response:
                 response.raise_for_status()
+                response_headers = dict(response.headers)
                 total_size = int(response.headers.get("content-length", "0"))
                 with atomic_output_file(target_filename, make_parents=True) as temp_filename:
@@ -107,3 +140,5 @@ def download_url(
                 client.close()
             if response:
                 response.raise_for_status()  # In case of errors during streaming
+        return HttpHeaders(response_headers) if response_headers else None

kash/web_content/web_page_model.py CHANGED Viewed

@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Protocol
 from prettyfmt import abbrev_obj
@@ -12,7 +13,7 @@ class WebPageData:
     Data about a web page, including URL, title and optionally description and extracted content.
     """
-    url: Url
+    locator: Url | Path
     title: str | None = None
     byline: str | None = None
     description: str | None = None

kash/web_gen/simple_webpage.py CHANGED Viewed

@@ -15,7 +15,7 @@ def simple_webpage_render(
     return render_web_template(
         template_filename=page_template,
         data={
-            "title": item.title,
+            "title": item.abbrev_title(),
             "add_title_h1": add_title_h1,
             "content_html": item.body_as_html(),
             "thumbnail_url": item.thumbnail_url,

kash-shell 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

kash-shell 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl