kash-shell 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/core/markdownify.py +12 -8
- kash/actions/core/readability.py +8 -7
- kash/actions/core/render_as_html.py +8 -6
- kash/actions/core/show_webpage.py +2 -2
- kash/commands/base/basic_file_commands.py +3 -0
- kash/commands/base/diff_commands.py +38 -3
- kash/commands/base/reformat_command.py +1 -1
- kash/commands/base/show_command.py +1 -1
- kash/commands/workspace/selection_commands.py +1 -1
- kash/commands/workspace/workspace_commands.py +92 -29
- kash/docs/load_source_code.py +1 -1
- kash/exec/action_exec.py +6 -8
- kash/exec/fetch_url_metadata.py +8 -5
- kash/exec/importing.py +4 -4
- kash/exec/llm_transforms.py +1 -1
- kash/exec/preconditions.py +30 -10
- kash/file_storage/file_store.py +105 -43
- kash/file_storage/item_file_format.py +1 -1
- kash/file_storage/store_filenames.py +2 -1
- kash/help/help_embeddings.py +2 -2
- kash/llm_utils/clean_headings.py +1 -1
- kash/{text_handling → llm_utils}/custom_sliding_transforms.py +0 -3
- kash/llm_utils/llm_completion.py +1 -1
- kash/local_server/__init__.py +1 -1
- kash/local_server/local_server_commands.py +2 -1
- kash/mcp/__init__.py +1 -1
- kash/mcp/mcp_server_commands.py +8 -2
- kash/media_base/media_cache.py +10 -3
- kash/model/actions_model.py +3 -0
- kash/model/items_model.py +78 -44
- kash/model/operations_model.py +14 -0
- kash/shell/ui/shell_results.py +2 -1
- kash/shell/utils/native_utils.py +2 -2
- kash/utils/common/format_utils.py +0 -8
- kash/utils/common/import_utils.py +46 -18
- kash/utils/common/url.py +80 -3
- kash/utils/file_utils/file_formats.py +3 -2
- kash/utils/file_utils/file_formats_model.py +47 -45
- kash/utils/file_utils/filename_parsing.py +41 -16
- kash/{text_handling → utils/text_handling}/doc_normalization.py +10 -8
- kash/utils/text_handling/escape_html_tags.py +156 -0
- kash/{text_handling → utils/text_handling}/markdown_utils.py +0 -3
- kash/utils/text_handling/markdownify_utils.py +87 -0
- kash/{text_handling → utils/text_handling}/unified_diffs.py +1 -44
- kash/web_content/file_cache_utils.py +42 -34
- kash/web_content/local_file_cache.py +53 -13
- kash/web_content/web_extract.py +1 -1
- kash/web_content/web_extract_readabilipy.py +4 -2
- kash/web_content/web_fetch.py +42 -7
- kash/web_content/web_page_model.py +2 -1
- kash/web_gen/simple_webpage.py +1 -1
- kash/web_gen/templates/base_styles.css.jinja +134 -16
- kash/web_gen/templates/simple_webpage.html.jinja +1 -1
- kash/workspaces/selections.py +2 -2
- kash/workspaces/workspace_output.py +2 -2
- kash/xonsh_custom/load_into_xonsh.py +4 -2
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/METADATA +1 -1
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/RECORD +62 -62
- kash/utils/common/inflection.py +0 -22
- kash/workspaces/workspace_importing.py +0 -56
- /kash/{text_handling → utils/text_handling}/markdown_render.py +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/entry_points.txt +0 -0
- {kash_shell-0.3.12.dist-info → kash_shell-0.3.14.dist-info}/licenses/LICENSE +0 -0
kash/model/items_model.py
CHANGED
|
@@ -24,13 +24,14 @@ from kash.model.concept_model import canonicalize_concept
|
|
|
24
24
|
from kash.model.media_model import MediaMetadata
|
|
25
25
|
from kash.model.operations_model import OperationSummary, Source
|
|
26
26
|
from kash.model.paths_model import StorePath, fmt_store_path
|
|
27
|
-
from kash.text_handling.markdown_render import markdown_to_html
|
|
28
|
-
from kash.text_handling.markdown_utils import first_heading
|
|
29
27
|
from kash.utils.common.format_utils import fmt_loc, html_to_plaintext, plaintext_to_html
|
|
30
28
|
from kash.utils.common.url import Locator, Url
|
|
31
29
|
from kash.utils.errors import FileFormatError
|
|
32
30
|
from kash.utils.file_formats.chat_format import ChatHistory
|
|
31
|
+
from kash.utils.file_utils.file_formats import MimeType
|
|
33
32
|
from kash.utils.file_utils.file_formats_model import FileExt, Format
|
|
33
|
+
from kash.utils.text_handling.markdown_render import markdown_to_html
|
|
34
|
+
from kash.utils.text_handling.markdown_utils import first_heading
|
|
34
35
|
|
|
35
36
|
if TYPE_CHECKING:
|
|
36
37
|
from kash.model.exec_model import ExecContext
|
|
@@ -180,7 +181,7 @@ class ItemId:
|
|
|
180
181
|
item_id = ItemId(item.type, IdType.url, canonicalize_url(item.url))
|
|
181
182
|
elif item.type == ItemType.concept and item.title:
|
|
182
183
|
item_id = ItemId(item.type, IdType.concept, canonicalize_concept(item.title))
|
|
183
|
-
elif item.source and item.source.cacheable:
|
|
184
|
+
elif item.source and item.source.cacheable and item.source.operation.has_known_inputs:
|
|
184
185
|
# We know the source of this and if the action was cacheable, we can create
|
|
185
186
|
# an identity based on the source.
|
|
186
187
|
item_id = ItemId(item.type, IdType.source, item.source.as_str())
|
|
@@ -359,20 +360,27 @@ class Item:
|
|
|
359
360
|
cls,
|
|
360
361
|
path: Path | str,
|
|
361
362
|
item_type: ItemType | None = None,
|
|
363
|
+
*,
|
|
362
364
|
title: str | None = None,
|
|
365
|
+
original_filename: str | None = None,
|
|
366
|
+
url: Url | None = None,
|
|
367
|
+
mime_type: MimeType | None = None,
|
|
363
368
|
) -> Item:
|
|
364
369
|
"""
|
|
365
370
|
Create a resource Item for a file with a format inferred from the file extension
|
|
366
371
|
or the content. Only sets basic metadata. Does not read the content. Will set
|
|
367
372
|
`format` and `file_ext` if possible but will leave them as None if unrecognized.
|
|
373
|
+
If `mime_type` is provided, it can help determine the file extension if the
|
|
374
|
+
extension isn't recognized from the filename or URL.
|
|
368
375
|
"""
|
|
369
376
|
from kash.file_storage.store_filenames import parse_item_filename
|
|
370
|
-
from kash.utils.file_utils.file_formats_model import
|
|
377
|
+
from kash.utils.file_utils.file_formats_model import file_format_info
|
|
371
378
|
|
|
372
379
|
# Will raise error for unrecognized file ext.
|
|
373
380
|
_name, filename_item_type, format, file_ext = parse_item_filename(path)
|
|
381
|
+
format_info = file_format_info(path, suggested_mime_type=mime_type)
|
|
374
382
|
if not format:
|
|
375
|
-
format =
|
|
383
|
+
format = format_info.format
|
|
376
384
|
if not item_type and filename_item_type:
|
|
377
385
|
item_type = filename_item_type
|
|
378
386
|
if not item_type:
|
|
@@ -380,12 +388,19 @@ class Item:
|
|
|
380
388
|
item_type = (
|
|
381
389
|
ItemType.doc if format and format.supports_frontmatter else ItemType.resource
|
|
382
390
|
)
|
|
391
|
+
|
|
392
|
+
# Try to determine a good file extension if it's not already on the filename.
|
|
393
|
+
if not file_ext:
|
|
394
|
+
file_ext = format_info.suggested_file_ext
|
|
395
|
+
|
|
383
396
|
item = cls(
|
|
384
397
|
type=item_type,
|
|
385
398
|
title=title,
|
|
386
399
|
file_ext=file_ext,
|
|
387
400
|
format=format,
|
|
388
401
|
external_path=str(path),
|
|
402
|
+
original_filename=original_filename,
|
|
403
|
+
url=url,
|
|
389
404
|
)
|
|
390
405
|
|
|
391
406
|
# Update modified time from the file system.
|
|
@@ -507,17 +522,43 @@ class Item:
|
|
|
507
522
|
|
|
508
523
|
return item_dict
|
|
509
524
|
|
|
510
|
-
def
|
|
525
|
+
def filename_stem(self) -> str | None:
|
|
511
526
|
"""
|
|
512
|
-
|
|
513
|
-
|
|
527
|
+
If the item has an existing or previous filename, return its stem,
|
|
528
|
+
for use in picking new filenames.
|
|
514
529
|
"""
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
530
|
+
from kash.file_storage.store_filenames import parse_item_filename
|
|
531
|
+
|
|
532
|
+
# Prefer original to external, e.g. if we know the original but the external might
|
|
533
|
+
# be a cache filename.
|
|
534
|
+
path = self.store_path or self.original_filename or self.external_path
|
|
535
|
+
if path:
|
|
536
|
+
path_name, _item_type, _format, _file_ext = parse_item_filename(Path(path).name)
|
|
537
|
+
else:
|
|
538
|
+
path_name = None
|
|
539
|
+
return path_name
|
|
540
|
+
|
|
541
|
+
def slug_name(self, max_len: int = SLUG_MAX_LEN, prefer_title: bool = False) -> str:
|
|
542
|
+
"""
|
|
543
|
+
Get a readable slugified name for this item, either from a previous filename
|
|
544
|
+
or from slugifying the title or content. May not be unique.
|
|
545
|
+
"""
|
|
546
|
+
filename_stem = self.filename_stem()
|
|
547
|
+
if filename_stem and not prefer_title:
|
|
548
|
+
return slugify_snake(filename_stem)
|
|
549
|
+
else:
|
|
550
|
+
return slugify_snake(self.abbrev_title(max_len=max_len, add_ops_suffix=True))
|
|
551
|
+
|
|
552
|
+
def default_filename(self) -> str:
|
|
553
|
+
"""
|
|
554
|
+
Get the default filename for an item based on slugifying its title or other
|
|
555
|
+
metadata. May not be unique.
|
|
556
|
+
"""
|
|
557
|
+
from kash.file_storage.store_filenames import join_suffix
|
|
558
|
+
|
|
559
|
+
slug = self.slug_name()
|
|
560
|
+
full_suffix = self.get_full_suffix()
|
|
561
|
+
return join_suffix(slug, full_suffix)
|
|
521
562
|
|
|
522
563
|
def abbrev_title(
|
|
523
564
|
self,
|
|
@@ -527,12 +568,10 @@ class Item:
|
|
|
527
568
|
pull_body_heading: bool = False,
|
|
528
569
|
) -> str:
|
|
529
570
|
"""
|
|
530
|
-
Get or infer a title for this item, falling back to the filename, URL,
|
|
531
|
-
|
|
532
|
-
|
|
571
|
+
Get or infer a title for this item, falling back to the filename, URL, description, or
|
|
572
|
+
finally body text. Optionally, include the last operation as a parenthetical at the end
|
|
573
|
+
of the title. Will use "Untitled" if all else fails.
|
|
533
574
|
"""
|
|
534
|
-
from kash.file_storage.store_filenames import parse_item_filename
|
|
535
|
-
|
|
536
575
|
# First special case: if we are pulling the title from the body header, check
|
|
537
576
|
# that.
|
|
538
577
|
if not self.title and pull_body_heading:
|
|
@@ -544,18 +583,12 @@ class Item:
|
|
|
544
583
|
if not self.title and self.url:
|
|
545
584
|
return abbrev_str(self.url, max_len)
|
|
546
585
|
|
|
547
|
-
|
|
548
|
-
# Use stem to drop suffix like .resource.docx etc in a title.
|
|
549
|
-
path = self.store_path or self.external_path or self.original_filename
|
|
550
|
-
if path:
|
|
551
|
-
path_name, _item_type, _format, _file_ext = parse_item_filename(Path(path).name)
|
|
552
|
-
else:
|
|
553
|
-
path_name = None
|
|
586
|
+
filename_stem = self.filename_stem()
|
|
554
587
|
|
|
555
588
|
# Use the title or the path if possible, falling back to description or even body text.
|
|
556
589
|
title_raw_text = (
|
|
557
590
|
self.title
|
|
558
|
-
or
|
|
591
|
+
or filename_stem
|
|
559
592
|
or self.description
|
|
560
593
|
or (not self.is_binary and self.abbrev_body(max_len))
|
|
561
594
|
or UNTITLED
|
|
@@ -586,6 +619,24 @@ class Item:
|
|
|
586
619
|
|
|
587
620
|
return final_text
|
|
588
621
|
|
|
622
|
+
def display_title(self) -> str:
|
|
623
|
+
"""
|
|
624
|
+
A display title for this item. Same as abbrev_title() but will fall back
|
|
625
|
+
to the filename if it is available.
|
|
626
|
+
"""
|
|
627
|
+
display_title = self.title
|
|
628
|
+
if not display_title and self.store_path:
|
|
629
|
+
display_title = Path(self.store_path).name
|
|
630
|
+
if not display_title:
|
|
631
|
+
display_title = self.abbrev_title()
|
|
632
|
+
return display_title
|
|
633
|
+
|
|
634
|
+
def abbrev_description(self, max_len: int = 1000) -> str:
|
|
635
|
+
"""
|
|
636
|
+
Get or infer description.
|
|
637
|
+
"""
|
|
638
|
+
return abbrev_on_words(html_to_plaintext(self.description or self.body or ""), max_len)
|
|
639
|
+
|
|
589
640
|
def body_heading(self) -> str | None:
|
|
590
641
|
"""
|
|
591
642
|
Get the first h1 or h2 heading from the body text, if present.
|
|
@@ -620,21 +671,6 @@ class Item:
|
|
|
620
671
|
"""
|
|
621
672
|
return bool(self.body and self.body.strip())
|
|
622
673
|
|
|
623
|
-
def slug_name(self, max_len: int = SLUG_MAX_LEN) -> str:
|
|
624
|
-
"""
|
|
625
|
-
Get a readable slugified version of the title or filename or content
|
|
626
|
-
appropriate for this item. May not be unique.
|
|
627
|
-
"""
|
|
628
|
-
title = self.abbrev_title(max_len=max_len, add_ops_suffix=True)
|
|
629
|
-
slug = slugify_snake(title)
|
|
630
|
-
return slug
|
|
631
|
-
|
|
632
|
-
def abbrev_description(self, max_len: int = 1000) -> str:
|
|
633
|
-
"""
|
|
634
|
-
Get or infer description.
|
|
635
|
-
"""
|
|
636
|
-
return abbrev_on_words(html_to_plaintext(self.description or self.body or ""), max_len)
|
|
637
|
-
|
|
638
674
|
def read_as_config(self) -> Any:
|
|
639
675
|
"""
|
|
640
676
|
If it is a config Item, return the parsed YAML.
|
|
@@ -653,8 +689,6 @@ class Item:
|
|
|
653
689
|
"""
|
|
654
690
|
if self.file_ext:
|
|
655
691
|
return self.file_ext
|
|
656
|
-
if self.is_binary and not self.file_ext:
|
|
657
|
-
raise ValueError(f"Binary Items must have a file extension: {self}")
|
|
658
692
|
inferred_ext = self.format and self.format.file_ext
|
|
659
693
|
if not inferred_ext:
|
|
660
694
|
raise ValueError(f"Cannot infer file extension for Item: {self}")
|
kash/model/operations_model.py
CHANGED
|
@@ -66,6 +66,13 @@ class Input:
|
|
|
66
66
|
else:
|
|
67
67
|
return "[input info missing]"
|
|
68
68
|
|
|
69
|
+
@property
|
|
70
|
+
def is_known(self) -> bool:
|
|
71
|
+
"""
|
|
72
|
+
Whether the input is known, i.e. we had saved inputs with hashes.
|
|
73
|
+
"""
|
|
74
|
+
return bool(self.path and self.hash)
|
|
75
|
+
|
|
69
76
|
# Inputs are equal if the hashes match (even if the paths have changed).
|
|
70
77
|
|
|
71
78
|
def __hash__(self):
|
|
@@ -117,6 +124,13 @@ class Operation:
|
|
|
117
124
|
|
|
118
125
|
return d
|
|
119
126
|
|
|
127
|
+
@property
|
|
128
|
+
def has_known_inputs(self) -> bool:
|
|
129
|
+
"""
|
|
130
|
+
Whether the operation has known inputs, i.e. all inputs have hashes.
|
|
131
|
+
"""
|
|
132
|
+
return all(arg.is_known for arg in self.arguments)
|
|
133
|
+
|
|
120
134
|
def summary(self) -> OperationSummary:
|
|
121
135
|
return OperationSummary(self.action_name)
|
|
122
136
|
|
kash/shell/ui/shell_results.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
|
+
from prettyfmt import fmt_count_items
|
|
3
4
|
from rich.box import SQUARE
|
|
4
5
|
from rich.panel import Panel
|
|
5
6
|
from rich.table import Table
|
|
@@ -10,7 +11,7 @@ from kash.config.text_styles import COLOR_SELECTION, STYLE_HINT
|
|
|
10
11
|
from kash.exec.command_exec import run_command_or_action
|
|
11
12
|
from kash.exec_model.shell_model import ShellResult
|
|
12
13
|
from kash.shell.output.shell_output import PrintHooks, console_pager, cprint, print_result
|
|
13
|
-
from kash.utils.common.format_utils import
|
|
14
|
+
from kash.utils.common.format_utils import fmt_loc
|
|
14
15
|
from kash.utils.errors import is_fatal
|
|
15
16
|
from kash.workspaces import SelectionHistory
|
|
16
17
|
|
kash/shell/utils/native_utils.py
CHANGED
|
@@ -23,7 +23,7 @@ from kash.shell.output.shell_output import cprint
|
|
|
23
23
|
from kash.utils.common.format_utils import fmt_loc
|
|
24
24
|
from kash.utils.common.url import as_file_url, is_file_url, is_url
|
|
25
25
|
from kash.utils.errors import FileNotFound, SetupError
|
|
26
|
-
from kash.utils.file_utils.file_formats import
|
|
26
|
+
from kash.utils.file_utils.file_formats import is_fullpage_html, read_partial_text
|
|
27
27
|
from kash.utils.file_utils.file_formats_model import file_format_info
|
|
28
28
|
|
|
29
29
|
log = get_logger(__name__)
|
|
@@ -88,7 +88,7 @@ def _detect_view_mode(file_or_url: str) -> ViewMode:
|
|
|
88
88
|
path = Path(file_or_url)
|
|
89
89
|
if path.is_file(): # File or symlink.
|
|
90
90
|
content = read_partial_text(path)
|
|
91
|
-
if content and
|
|
91
|
+
if content and is_fullpage_html(content):
|
|
92
92
|
return ViewMode.browser
|
|
93
93
|
|
|
94
94
|
info = file_format_info(path)
|
|
@@ -4,7 +4,6 @@ from pathlib import Path
|
|
|
4
4
|
|
|
5
5
|
from prettyfmt import fmt_path
|
|
6
6
|
|
|
7
|
-
from kash.utils.common.inflection import plural
|
|
8
7
|
from kash.utils.common.url import Locator, is_url
|
|
9
8
|
|
|
10
9
|
|
|
@@ -44,13 +43,6 @@ def fmt_loc(locator: str | Locator, resolve: bool = True) -> str:
|
|
|
44
43
|
return fmt_path(locator, resolve=resolve)
|
|
45
44
|
|
|
46
45
|
|
|
47
|
-
def fmt_count_items(count: int, name: str = "item") -> str:
|
|
48
|
-
"""
|
|
49
|
-
Format a count and a name as a pluralized phrase, e.g. "1 item" or "2 items".
|
|
50
|
-
"""
|
|
51
|
-
return f"{count} {plural(name, count)}" # pyright: ignore
|
|
52
|
-
|
|
53
|
-
|
|
54
46
|
## Tests
|
|
55
47
|
|
|
56
48
|
|
|
@@ -12,36 +12,64 @@ log = logging.getLogger(__name__)
|
|
|
12
12
|
Tallies: TypeAlias = dict[str, int]
|
|
13
13
|
|
|
14
14
|
|
|
15
|
-
def
|
|
15
|
+
def import_recursive(
|
|
16
16
|
parent_package_name: str,
|
|
17
17
|
parent_dir: Path,
|
|
18
|
-
|
|
18
|
+
resource_names: list[str] | None = None,
|
|
19
19
|
tallies: Tallies | None = None,
|
|
20
20
|
):
|
|
21
21
|
"""
|
|
22
|
-
Import
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
Import modules from subdirectories or individual Python modules within a parent package.
|
|
23
|
+
|
|
24
|
+
Each resource in `resource_names` can be:
|
|
25
|
+
- A directory name (all modules within it will be imported)
|
|
26
|
+
- A module name with or without '.py' extension (a single module will be imported)
|
|
27
|
+
- "." to import all modules in the parent_dir
|
|
28
|
+
|
|
29
|
+
If `resource_names` is `None`, imports all modules directly in parent_dir.
|
|
30
|
+
|
|
31
|
+
Simply a convenience wrapper for `importlib.import_module` and
|
|
32
|
+
`pkgutil.iter_modules` to iterate over all modules in the subdirectories.
|
|
33
|
+
|
|
34
|
+
If `tallies` is provided, it will be updated with the number of modules imported
|
|
35
|
+
for each package.
|
|
25
36
|
"""
|
|
26
37
|
if tallies is None:
|
|
27
38
|
tallies = {}
|
|
28
|
-
if not
|
|
29
|
-
|
|
39
|
+
if not resource_names:
|
|
40
|
+
resource_names = ["."]
|
|
30
41
|
|
|
31
|
-
for
|
|
32
|
-
if
|
|
42
|
+
for name in resource_names:
|
|
43
|
+
if name == ".":
|
|
33
44
|
full_path = parent_dir
|
|
34
45
|
package_name = parent_package_name
|
|
35
46
|
else:
|
|
36
|
-
full_path = parent_dir /
|
|
37
|
-
package_name = f"{parent_package_name}.{
|
|
38
|
-
|
|
39
|
-
if
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
47
|
+
full_path = parent_dir / name
|
|
48
|
+
package_name = f"{parent_package_name}.{name}"
|
|
49
|
+
|
|
50
|
+
# Check if it's a directory
|
|
51
|
+
if full_path.is_dir():
|
|
52
|
+
# Import all modules in the directory
|
|
53
|
+
for _, module_name, _ in pkgutil.iter_modules(path=[str(full_path)]):
|
|
54
|
+
importlib.import_module(f"{package_name}.{module_name}")
|
|
55
|
+
tallies[package_name] = tallies.get(package_name, 0) + 1
|
|
56
|
+
else:
|
|
57
|
+
# Not a directory, try as a module file
|
|
58
|
+
module_path = full_path
|
|
59
|
+
module_name = name
|
|
60
|
+
|
|
61
|
+
# Handle with or without .py extension
|
|
62
|
+
if not module_path.is_file() and module_path.suffix != ".py":
|
|
63
|
+
module_path = parent_dir / f"{name}.py"
|
|
64
|
+
module_name = name
|
|
65
|
+
elif module_path.suffix == ".py":
|
|
66
|
+
module_name = module_path.stem
|
|
67
|
+
|
|
68
|
+
if module_path.is_file() and module_name != "__init__":
|
|
69
|
+
importlib.import_module(f"{parent_package_name}.{module_name}")
|
|
70
|
+
tallies[parent_package_name] = tallies.get(parent_package_name, 0) + 1
|
|
71
|
+
else:
|
|
72
|
+
raise FileNotFoundError(f"Path not found or not importable: {full_path}")
|
|
45
73
|
|
|
46
74
|
return tallies
|
|
47
75
|
|
kash/utils/common/url.py
CHANGED
|
@@ -47,7 +47,9 @@ def check_if_url(
|
|
|
47
47
|
if only_schemes:
|
|
48
48
|
return result if result.scheme in only_schemes else None
|
|
49
49
|
else:
|
|
50
|
-
|
|
50
|
+
# Consider it a URL if the scheme is present and longer than a single character.
|
|
51
|
+
# This helps avoid misinterpreting Windows drive letters (e.g., "C:\foo") as schemes.
|
|
52
|
+
return result if result.scheme and len(result.scheme) > 1 else None
|
|
51
53
|
except ValueError:
|
|
52
54
|
return None
|
|
53
55
|
|
|
@@ -145,6 +147,41 @@ def normalize_url(
|
|
|
145
147
|
return Url(normalized_url)
|
|
146
148
|
|
|
147
149
|
|
|
150
|
+
def is_valid_path(text: UnresolvedLocator) -> bool:
|
|
151
|
+
"""
|
|
152
|
+
Sanity check if the input is plausibly a file path, i.e. not a URL or malformed in
|
|
153
|
+
an obvious way. Does not check for existence or OS-specific naming restrictions.
|
|
154
|
+
For a more thorough check there are other more complex options like:
|
|
155
|
+
https://github.com/thombashi/pathvalidate
|
|
156
|
+
"""
|
|
157
|
+
if isinstance(text, Path):
|
|
158
|
+
return True
|
|
159
|
+
elif isinstance(text, str):
|
|
160
|
+
path_str = text
|
|
161
|
+
else:
|
|
162
|
+
return False
|
|
163
|
+
|
|
164
|
+
# Check for empty or whitespace-only strings or null characters
|
|
165
|
+
# (never acceptable paths).
|
|
166
|
+
if not path_str or path_str.isspace():
|
|
167
|
+
return False
|
|
168
|
+
if "\0" in path_str:
|
|
169
|
+
return False
|
|
170
|
+
|
|
171
|
+
# Explicitly disallow URLs.
|
|
172
|
+
if is_url(path_str):
|
|
173
|
+
return False
|
|
174
|
+
|
|
175
|
+
# As a final lightweight check, ensure it can be instantiated as a Path object
|
|
176
|
+
# This doesn't validate existence or character restrictions.
|
|
177
|
+
try:
|
|
178
|
+
_ = Path(path_str)
|
|
179
|
+
except (TypeError, ValueError):
|
|
180
|
+
return False
|
|
181
|
+
|
|
182
|
+
return True
|
|
183
|
+
|
|
184
|
+
|
|
148
185
|
## Tests
|
|
149
186
|
|
|
150
187
|
|
|
@@ -155,13 +192,19 @@ def test_is_url():
|
|
|
155
192
|
assert is_url("ftp://example.com") == True
|
|
156
193
|
assert is_url("file:///path/to/file") == True
|
|
157
194
|
assert is_url("file://hostname/path/to/file") == True
|
|
158
|
-
assert is_url("invalid-url") == False
|
|
159
|
-
assert is_url("www.example.com") == False
|
|
160
195
|
assert is_url("http://example.com", only_schemes=HTTP_ONLY) == True
|
|
161
196
|
assert is_url("https://example.com", only_schemes=HTTP_ONLY) == True
|
|
197
|
+
|
|
198
|
+
assert is_url("invalid-url") == False
|
|
199
|
+
assert is_url("www.example.com") == False
|
|
162
200
|
assert is_url("ftp://example.com", only_schemes=HTTP_ONLY) == False
|
|
163
201
|
assert is_url("file:///path/to/file", only_schemes=HTTP_ONLY) == False
|
|
164
202
|
|
|
203
|
+
assert is_url("www.example.com") is False
|
|
204
|
+
assert is_url("c:\\path\\to\\file") is False
|
|
205
|
+
assert is_url("/foo/bar") is False
|
|
206
|
+
assert is_url("//foo") is False
|
|
207
|
+
|
|
165
208
|
|
|
166
209
|
def test_as_file_url():
|
|
167
210
|
assert as_file_url("file:///path/to/file") == "file:///path/to/file"
|
|
@@ -205,3 +248,37 @@ def test_normalize_url():
|
|
|
205
248
|
str(e)
|
|
206
249
|
== "Scheme 'ftp' not in allowed schemes: ['http', 'https', 'file']: ftp://example.com"
|
|
207
250
|
)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def test_is_path():
|
|
254
|
+
assert is_valid_path("foo/bar") is True
|
|
255
|
+
assert is_valid_path("/foo/bar") is True
|
|
256
|
+
assert is_valid_path("./foo/bar") is True
|
|
257
|
+
assert is_valid_path("../foo/bar") is True
|
|
258
|
+
assert is_valid_path("foo.txt") is True
|
|
259
|
+
assert is_valid_path(Path("foo/bar")) is True
|
|
260
|
+
assert is_valid_path(Path()) is True
|
|
261
|
+
assert is_valid_path(".") is True
|
|
262
|
+
assert is_valid_path("..") is True
|
|
263
|
+
assert is_valid_path("C:\\Users\\name") is True # Windows-style
|
|
264
|
+
assert is_valid_path("file_with:colon.txt") is True # Valid on POSIX
|
|
265
|
+
assert is_valid_path(Url("relative/path")) is True # Url type with relative content
|
|
266
|
+
|
|
267
|
+
assert is_valid_path("http://example.com") is False
|
|
268
|
+
assert is_valid_path("https://example.com/path") is False
|
|
269
|
+
assert is_valid_path("file:///path/to/file") is False
|
|
270
|
+
assert is_valid_path(Url("http://example.com")) is False
|
|
271
|
+
assert is_valid_path("") is False
|
|
272
|
+
assert is_valid_path(" ") is False
|
|
273
|
+
assert is_valid_path("foo\0bar.txt") is False
|
|
274
|
+
assert is_valid_path(None) is False # pyright: ignore
|
|
275
|
+
assert is_valid_path(123) is False # pyright: ignore
|
|
276
|
+
|
|
277
|
+
# Edge cases
|
|
278
|
+
assert is_valid_path("www.example.com") is True # No scheme
|
|
279
|
+
assert str(Path("")) == "."
|
|
280
|
+
assert str(Path(" ")) == " "
|
|
281
|
+
assert is_valid_path(Path(" ")) is True # A bad idea but allowed
|
|
282
|
+
assert is_valid_path(Path("")) is True
|
|
283
|
+
assert is_valid_path(" ") is False
|
|
284
|
+
assert is_valid_path("") is False
|
|
@@ -11,9 +11,10 @@ from kash.config.logger import get_logger
|
|
|
11
11
|
log = get_logger(__name__)
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
def
|
|
14
|
+
def is_fullpage_html(content: str) -> bool:
|
|
15
15
|
"""
|
|
16
|
-
A full HTML document that is
|
|
16
|
+
A full HTML document that is a full page (headers, footers, etc.) and
|
|
17
|
+
so probably best rendered in a browser.
|
|
17
18
|
"""
|
|
18
19
|
return bool(re.search(r"<!DOCTYPE html>|<html>|<body>|<head>", content[:2048], re.IGNORECASE))
|
|
19
20
|
|