kash-shell 0.3.28__py3-none-any.whl → 0.3.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kash/actions/core/chat.py +1 -0
- kash/actions/core/markdownify_html.py +4 -5
- kash/actions/core/minify_html.py +4 -5
- kash/actions/core/readability.py +1 -4
- kash/actions/core/render_as_html.py +10 -7
- kash/actions/core/save_sidematter_meta.py +47 -0
- kash/actions/core/show_webpage.py +2 -0
- kash/actions/core/zip_sidematter.py +47 -0
- kash/commands/base/basic_file_commands.py +7 -4
- kash/commands/base/diff_commands.py +6 -4
- kash/commands/base/files_command.py +31 -30
- kash/commands/base/general_commands.py +3 -2
- kash/commands/base/logs_commands.py +6 -4
- kash/commands/base/reformat_command.py +3 -2
- kash/commands/base/search_command.py +4 -3
- kash/commands/base/show_command.py +9 -7
- kash/commands/help/assistant_commands.py +6 -4
- kash/commands/help/help_commands.py +7 -4
- kash/commands/workspace/selection_commands.py +18 -16
- kash/commands/workspace/workspace_commands.py +39 -26
- kash/config/logger.py +1 -1
- kash/config/setup.py +2 -27
- kash/config/text_styles.py +1 -1
- kash/docs/markdown/topics/a1_what_is_kash.md +26 -18
- kash/docs/markdown/topics/a2_installation.md +3 -2
- kash/exec/action_decorators.py +7 -5
- kash/exec/action_exec.py +104 -53
- kash/exec/fetch_url_items.py +40 -11
- kash/exec/llm_transforms.py +14 -5
- kash/exec/preconditions.py +2 -2
- kash/exec/resolve_args.py +4 -1
- kash/exec/runtime_settings.py +3 -0
- kash/file_storage/file_store.py +108 -114
- kash/file_storage/item_file_format.py +91 -26
- kash/file_storage/item_id_index.py +128 -0
- kash/help/help_types.py +1 -1
- kash/llm_utils/llms.py +6 -1
- kash/local_server/local_server_commands.py +2 -1
- kash/mcp/mcp_server_commands.py +3 -2
- kash/mcp/mcp_server_routes.py +42 -12
- kash/model/actions_model.py +44 -32
- kash/model/compound_actions_model.py +4 -3
- kash/model/exec_model.py +33 -3
- kash/model/items_model.py +150 -60
- kash/model/params_model.py +4 -4
- kash/shell/output/shell_output.py +1 -2
- kash/utils/api_utils/gather_limited.py +2 -0
- kash/utils/api_utils/multitask_gather.py +74 -0
- kash/utils/common/s3_utils.py +108 -0
- kash/utils/common/url.py +16 -4
- kash/utils/file_formats/chat_format.py +7 -4
- kash/utils/file_utils/file_ext.py +1 -0
- kash/utils/file_utils/file_formats.py +4 -2
- kash/utils/file_utils/file_formats_model.py +12 -0
- kash/utils/text_handling/doc_normalization.py +1 -1
- kash/utils/text_handling/markdown_footnotes.py +224 -0
- kash/utils/text_handling/markdown_utils.py +532 -41
- kash/utils/text_handling/markdownify_utils.py +2 -1
- kash/web_content/web_fetch.py +2 -1
- kash/web_gen/templates/components/tooltip_scripts.js.jinja +186 -1
- kash/web_gen/templates/components/youtube_popover_scripts.js.jinja +223 -0
- kash/web_gen/templates/components/youtube_popover_styles.css.jinja +150 -0
- kash/web_gen/templates/content_styles.css.jinja +53 -1
- kash/web_gen/templates/youtube_webpage.html.jinja +47 -0
- kash/web_gen/webpage_render.py +103 -0
- kash/workspaces/workspaces.py +0 -5
- kash/xonsh_custom/custom_shell.py +4 -3
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/METADATA +35 -26
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/RECORD +72 -64
- kash/llm_utils/llm_features.py +0 -72
- kash/web_gen/simple_webpage.py +0 -55
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/WHEEL +0 -0
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/entry_points.txt +0 -0
- {kash_shell-0.3.28.dist-info → kash_shell-0.3.33.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from sidematter_format.sidematter_format import Sidematter
|
|
8
|
+
|
|
9
|
+
from kash.utils.common.url import Url, is_s3_url, parse_s3_url
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def check_aws_cli() -> None:
    """
    Verify that the `aws` executable is discoverable on PATH.

    Raises:
        RuntimeError: If no `aws` binary can be found.
    """
    aws_binary = shutil.which("aws")
    if aws_binary is not None:
        return
    raise RuntimeError(
        "AWS CLI not found in PATH. Please install 'awscli' and ensure 'aws' is available."
    )
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_s3_parent_folder(url: Url) -> Url | None:
    """
    Get the parent folder of an S3 URL, or None if not an S3 URL.
    """
    if not is_s3_url(url):
        return None

    s3_bucket, s3_key = parse_s3_url(url)
    s3_parent_folder = Path(s3_key).parent
    return Url(f"s3://{s3_bucket}/{s3_parent_folder}")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def s3_sync_to_folder(
    src_path: str | Path,
    s3_dest_parent: Url,
    *,
    include_sidematter: bool = False,
) -> list[Url]:
    """
    Sync a local file or directory to an S3 "parent" folder using the AWS CLI.
    Set `include_sidematter` to include sidematter files alongside the source files.

    Returns a list of S3 URLs that were the top-level sync targets:
    - For a single file: the file URL (and sidematter file/dir URLs if included).
    - For a directory: the destination parent prefix URL (non-recursive reporting).

    Raises:
        ValueError: If the source path does not exist or the destination is not s3://.
        RuntimeError: If the AWS CLI is not installed.
        subprocess.CalledProcessError: If an `aws s3 sync` invocation fails.
    """
    src_path = Path(src_path)
    if not src_path.exists():
        raise ValueError(f"Source path does not exist: {src_path}")
    if not is_s3_url(s3_dest_parent):
        raise ValueError(f"Destination must be an s3:// URL: {s3_dest_parent}")

    check_aws_cli()

    dest_prefix = str(s3_dest_parent).rstrip("/") + "/"
    targets: list[Url] = []

    if not src_path.is_file():
        # Directory mode: sync the whole directory under the destination prefix.
        subprocess.run(
            ["aws", "s3", "sync", str(src_path), dest_prefix],
            check=True,
        )
        targets.append(Url(dest_prefix))
        return targets

    # File mode: optionally expand to the sidematter path list for the file.
    if include_sidematter:
        resolved = Sidematter(src_path).resolve(parse_meta=False, use_frontmatter=False)
        sync_paths: list[Path] = resolved.path_list
    else:
        sync_paths = [src_path]

    for path in sync_paths:
        if path.is_file():
            # Sync the parent with exclude-all/include-one so `aws s3 sync`
            # can apply its default short-circuiting for unchanged files.
            subprocess.run(
                [
                    "aws",
                    "s3",
                    "sync",
                    str(path.parent),
                    dest_prefix,
                    "--exclude",
                    "*",
                    "--include",
                    path.name,
                ],
                check=True,
            )
            targets.append(Url(dest_prefix + path.name))
        elif path.is_dir():
            dest_dir = dest_prefix + path.name + "/"
            subprocess.run(["aws", "s3", "sync", str(path), dest_dir], check=True)
            targets.append(Url(dest_dir))

    return targets
|
kash/utils/common/url.py
CHANGED
|
@@ -26,6 +26,7 @@ A string that may not be resolved to a URL or path.
|
|
|
26
26
|
|
|
27
27
|
HTTP_ONLY = ["http", "https"]
|
|
28
28
|
HTTP_OR_FILE = HTTP_ONLY + ["file"]
|
|
29
|
+
HTTP_OR_FILE_OR_S3 = HTTP_OR_FILE + ["s3"]
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
def check_if_url(
|
|
@@ -36,7 +37,8 @@ def check_if_url(
|
|
|
36
37
|
the `urlparse.ParseResult`.
|
|
37
38
|
|
|
38
39
|
Also returns false for Paths, so that it's easy to use local paths and URLs
|
|
39
|
-
(`Locator`s) interchangeably. Can provide `HTTP_ONLY` or `HTTP_OR_FILE`
|
|
40
|
+
(`Locator`s) interchangeably. Can provide `HTTP_ONLY` or `HTTP_OR_FILE`
|
|
41
|
+
or `HTTP_OR_FILE_OR_S3` to restrict to only certain schemes.
|
|
40
42
|
restrict to only certain schemes.
|
|
41
43
|
"""
|
|
42
44
|
if isinstance(text, Path):
|
|
@@ -69,6 +71,13 @@ def is_file_url(url: str | Url) -> bool:
|
|
|
69
71
|
return url.startswith("file://")
|
|
70
72
|
|
|
71
73
|
|
|
74
|
+
def is_s3_url(url: str | Url) -> bool:
    """
    Is URL an S3 URL?
    """
    s3_scheme_prefix = "s3://"
    return url.startswith(s3_scheme_prefix)
|
|
79
|
+
|
|
80
|
+
|
|
72
81
|
def parse_http_url(url: str | Url) -> ParseResult:
|
|
73
82
|
"""
|
|
74
83
|
Parse an http/https URL and return the parsed result, raising ValueError if
|
|
@@ -118,7 +127,7 @@ def as_file_url(path: str | Path) -> Url:
|
|
|
118
127
|
|
|
119
128
|
def normalize_url(
|
|
120
129
|
url: Url,
|
|
121
|
-
check_schemes: list[str] | None =
|
|
130
|
+
check_schemes: list[str] | None = HTTP_OR_FILE_OR_S3,
|
|
122
131
|
drop_fragment: bool = True,
|
|
123
132
|
resolve_local_paths: bool = True,
|
|
124
133
|
) -> Url:
|
|
@@ -238,7 +247,10 @@ def test_normalize_url():
|
|
|
238
247
|
normalize_url(url=Url("/not/a/URL"))
|
|
239
248
|
raise AssertionError()
|
|
240
249
|
except ValueError as e:
|
|
241
|
-
assert
|
|
250
|
+
assert (
|
|
251
|
+
str(e)
|
|
252
|
+
== "Scheme '' not in allowed schemes: ['http', 'https', 'file', 's3']: /not/a/URL"
|
|
253
|
+
)
|
|
242
254
|
|
|
243
255
|
try:
|
|
244
256
|
normalize_url(Url("ftp://example.com"))
|
|
@@ -246,7 +258,7 @@ def test_normalize_url():
|
|
|
246
258
|
except ValueError as e:
|
|
247
259
|
assert (
|
|
248
260
|
str(e)
|
|
249
|
-
== "Scheme 'ftp' not in allowed schemes: ['http', 'https', 'file']: ftp://example.com"
|
|
261
|
+
== "Scheme 'ftp' not in allowed schemes: ['http', 'https', 'file', 's3']: ftp://example.com"
|
|
250
262
|
)
|
|
251
263
|
|
|
252
264
|
|
|
@@ -93,7 +93,6 @@ content: |
|
|
|
93
93
|
|
|
94
94
|
from __future__ import annotations
|
|
95
95
|
|
|
96
|
-
import json
|
|
97
96
|
from dataclasses import field
|
|
98
97
|
from enum import Enum
|
|
99
98
|
from io import StringIO
|
|
@@ -104,6 +103,7 @@ from typing import Any
|
|
|
104
103
|
from frontmatter_format import from_yaml_string, new_yaml, to_yaml_string
|
|
105
104
|
from prettyfmt import abbrev_obj, custom_key_sort, fmt_size_human
|
|
106
105
|
from pydantic.dataclasses import dataclass
|
|
106
|
+
from sidematter_format import to_json_string
|
|
107
107
|
|
|
108
108
|
|
|
109
109
|
class ChatRole(str, Enum):
|
|
@@ -161,9 +161,12 @@ class ChatMessage:
|
|
|
161
161
|
Convert to a format that can be used as a standard chat completion, with
|
|
162
162
|
the content field holding JSON-serialized data if it is structured.
|
|
163
163
|
"""
|
|
164
|
+
|
|
164
165
|
return {
|
|
165
166
|
"role": self.role.value,
|
|
166
|
-
"content":
|
|
167
|
+
"content": to_json_string(self.content)
|
|
168
|
+
if isinstance(self.content, dict)
|
|
169
|
+
else self.content,
|
|
167
170
|
}
|
|
168
171
|
|
|
169
172
|
@classmethod
|
|
@@ -174,7 +177,7 @@ class ChatMessage:
|
|
|
174
177
|
return to_yaml_string(self.as_dict(), key_sort=_custom_key_sort)
|
|
175
178
|
|
|
176
179
|
def to_json(self) -> str:
|
|
177
|
-
return
|
|
180
|
+
return to_json_string(self.as_dict())
|
|
178
181
|
|
|
179
182
|
def as_str(self) -> str:
|
|
180
183
|
return self.to_yaml()
|
|
@@ -222,7 +225,7 @@ class ChatHistory:
|
|
|
222
225
|
return stream.getvalue()
|
|
223
226
|
|
|
224
227
|
def to_json(self) -> str:
|
|
225
|
-
return
|
|
228
|
+
return to_json_string([message.as_dict() for message in self.messages], indent=None)
|
|
226
229
|
|
|
227
230
|
def size_summary(self) -> str:
|
|
228
231
|
role_counts = {}
|
|
@@ -16,7 +16,7 @@ def is_fullpage_html(content: str) -> bool:
|
|
|
16
16
|
A full HTML document that is a full page (headers, footers, etc.) and
|
|
17
17
|
so probably best rendered in a browser.
|
|
18
18
|
"""
|
|
19
|
-
return bool(re.search(r"<!DOCTYPE html>|<html
|
|
19
|
+
return bool(re.search(r"<!DOCTYPE html>|<html.*?>|<body>|<head>", content, re.IGNORECASE))
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
_yaml_header_pattern = re.compile(r"^---\n\w+:", re.MULTILINE)
|
|
@@ -35,7 +35,9 @@ def is_html(content: str) -> bool:
|
|
|
35
35
|
"""
|
|
36
36
|
return bool(
|
|
37
37
|
re.search(
|
|
38
|
-
r"<!DOCTYPE html>|<html
|
|
38
|
+
r"<!DOCTYPE html>|<html.*?>|<body>|<head>|<div>|<p>|<img |<a href",
|
|
39
|
+
content,
|
|
40
|
+
re.IGNORECASE,
|
|
39
41
|
)
|
|
40
42
|
)
|
|
41
43
|
|
|
@@ -72,6 +72,9 @@ class Format(Enum):
|
|
|
72
72
|
mp3 = "mp3"
|
|
73
73
|
m4a = "m4a"
|
|
74
74
|
mp4 = "mp4"
|
|
75
|
+
|
|
76
|
+
# Binary formats.
|
|
77
|
+
zip = "zip"
|
|
75
78
|
binary = "binary"
|
|
76
79
|
"""Catch-all format for binary files that are unrecognized."""
|
|
77
80
|
|
|
@@ -167,6 +170,10 @@ class Format(Enum):
|
|
|
167
170
|
def is_data(self) -> bool:
|
|
168
171
|
return self in [self.csv, self.xlsx, self.npz]
|
|
169
172
|
|
|
173
|
+
@property
|
|
174
|
+
def is_zip(self) -> bool:
|
|
175
|
+
return self in [self.zip]
|
|
176
|
+
|
|
170
177
|
@property
|
|
171
178
|
def is_binary(self) -> bool:
|
|
172
179
|
return self.has_body and not self.is_text
|
|
@@ -257,6 +264,7 @@ class Format(Enum):
|
|
|
257
264
|
FileExt.m4a.value: Format.m4a,
|
|
258
265
|
FileExt.mp4.value: Format.mp4,
|
|
259
266
|
FileExt.epub.value: Format.epub,
|
|
267
|
+
FileExt.zip.value: Format.zip,
|
|
260
268
|
}
|
|
261
269
|
return ext_to_format.get(file_ext.value, None)
|
|
262
270
|
|
|
@@ -292,6 +300,7 @@ class Format(Enum):
|
|
|
292
300
|
Format.mp3: FileExt.mp3,
|
|
293
301
|
Format.m4a: FileExt.m4a,
|
|
294
302
|
Format.mp4: FileExt.mp4,
|
|
303
|
+
Format.zip: FileExt.zip,
|
|
295
304
|
}
|
|
296
305
|
|
|
297
306
|
return format_to_file_ext.get(self, None)
|
|
@@ -329,6 +338,9 @@ class Format(Enum):
|
|
|
329
338
|
"audio/mp3": Format.mp3,
|
|
330
339
|
"audio/mp4": Format.m4a,
|
|
331
340
|
"video/mp4": Format.mp4,
|
|
341
|
+
"application/zip": Format.zip,
|
|
342
|
+
"application/x-zip": Format.zip,
|
|
343
|
+
"application/x-zip-compressed": Format.zip,
|
|
332
344
|
"application/octet-stream": Format.binary,
|
|
333
345
|
}
|
|
334
346
|
|
|
@@ -75,7 +75,7 @@ def normalize_text_file(
|
|
|
75
75
|
|
|
76
76
|
def test_osc8_link():
|
|
77
77
|
from clideps.terminal.osc_utils import osc8_link
|
|
78
|
-
from flowmark
|
|
78
|
+
from flowmark import wrap_paragraph
|
|
79
79
|
|
|
80
80
|
link = osc8_link("https://example.com/" + "x" * 50, "Example")
|
|
81
81
|
assert ansi_cell_len(link) == 7
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from flowmark import flowmark_markdown, line_wrap_by_sentence
|
|
8
|
+
from marko import Markdown
|
|
9
|
+
from marko.ext import footnote
|
|
10
|
+
|
|
11
|
+
from kash.utils.text_handling.markdown_utils import comprehensive_transform_tree
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _normalize_footnotes_in_markdown(content: str) -> str:
|
|
15
|
+
"""
|
|
16
|
+
Ensure blank lines between consecutive footnote definitions.
|
|
17
|
+
|
|
18
|
+
Marko has a bug where consecutive footnotes without blank lines are parsed
|
|
19
|
+
as a single footnote. This adds blank lines where needed.
|
|
20
|
+
"""
|
|
21
|
+
lines = content.split("\n")
|
|
22
|
+
result = []
|
|
23
|
+
i = 0
|
|
24
|
+
|
|
25
|
+
while i < len(lines):
|
|
26
|
+
line = lines[i]
|
|
27
|
+
result.append(line)
|
|
28
|
+
|
|
29
|
+
# Check if this is a footnote definition
|
|
30
|
+
if re.match(r"^\[\^[^\]]+\]:", line):
|
|
31
|
+
# Look ahead to see if the next non-empty line is also a footnote
|
|
32
|
+
j = i + 1
|
|
33
|
+
while j < len(lines) and not lines[j].strip():
|
|
34
|
+
result.append(lines[j])
|
|
35
|
+
j += 1
|
|
36
|
+
|
|
37
|
+
if j < len(lines) and re.match(r"^\[\^[^\]]+\]:", lines[j]):
|
|
38
|
+
# Next non-empty line is also a footnote, add blank line
|
|
39
|
+
result.append("")
|
|
40
|
+
|
|
41
|
+
i = j
|
|
42
|
+
else:
|
|
43
|
+
i += 1
|
|
44
|
+
|
|
45
|
+
return "\n".join(result)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class FootnoteInfo:
    """
    Information about a single footnote definition.

    Holds the caret-prefixed ID, the footnote body as rendered by the parser's
    renderer, and the original marko AST node for callers needing structural access.
    """

    footnote_id: str  # The footnote ID with caret (e.g., "^123", "^foo")
    content: str  # The rendered markdown content of the footnote
    raw_element: footnote.FootnoteDef  # The original marko element
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class MarkdownFootnotes:
    """
    Container for all footnotes in a markdown document with fast lookup.

    Provides efficient access to footnote definitions by their IDs.
    IDs are stored with the leading caret (^) to avoid collisions.
    """

    # Mapping from caret-prefixed footnote IDs to FootnoteInfo objects.
    footnotes: dict[str, FootnoteInfo] = field(default_factory=dict)

    @staticmethod
    def _canonical_id(footnote_id: str) -> str:
        """Normalize an ID so it always carries the leading caret."""
        return footnote_id if footnote_id.startswith("^") else f"^{footnote_id}"

    @staticmethod
    def from_markdown(content: str, markdown_parser: Markdown | None = None) -> MarkdownFootnotes:
        """
        Extract all footnotes from markdown content.

        Args:
            content: The markdown content to parse
            markdown_parser: Optional custom markdown parser. If None, uses default flowmark setup.

        Returns:
            MarkdownFootnotes instance with all footnotes indexed by ID
        """
        if markdown_parser is None:
            markdown_parser = flowmark_markdown(line_wrap_by_sentence(is_markdown=True))

        # Work around a marko bug: consecutive footnote definitions without a
        # blank line between them are parsed as one footnote.
        document = markdown_parser.parse(_normalize_footnotes_in_markdown(content))
        return MarkdownFootnotes.from_document(document, markdown_parser)

    @staticmethod
    def from_document(document: Any, markdown_parser: Markdown | None = None) -> MarkdownFootnotes:
        """
        Extract all footnotes from a parsed markdown document.

        Args:
            document: A parsed marko document object
            markdown_parser: The markdown parser used (needed for rendering).
                If None, uses default flowmark setup.

        Returns:
            MarkdownFootnotes instance with all footnotes indexed by ID
        """
        if markdown_parser is None:
            markdown_parser = flowmark_markdown(line_wrap_by_sentence(is_markdown=True))

        collected: dict[str, FootnoteInfo] = {}

        def visit(element: Any) -> None:
            # Only footnote definitions contribute entries.
            if not isinstance(element, footnote.FootnoteDef):
                return

            # Render each child of the definition back to markdown text.
            pieces: list[str] = []
            if hasattr(element, "children") and element.children:
                pieces = [markdown_parser.renderer.render(child) for child in element.children]

            fid = f"^{element.label}"
            collected[fid] = FootnoteInfo(
                footnote_id=fid,
                content="".join(pieces).strip(),
                raw_element=element,
            )

        comprehensive_transform_tree(document, visit)

        return MarkdownFootnotes(footnotes=collected)

    def get(self, footnote_id: str, default: FootnoteInfo | None = None) -> FootnoteInfo | None:
        """
        Get a footnote by its ID (leading ^ optional), returning `default` if absent.
        """
        return self.footnotes.get(self._canonical_id(footnote_id), default)

    def __getitem__(self, footnote_id: str) -> FootnoteInfo:
        """
        Dictionary-style access by ID (leading ^ optional).

        Raises:
            KeyError: If the footnote ID is not found
        """
        return self.footnotes[self._canonical_id(footnote_id)]

    def __contains__(self, footnote_id: str) -> bool:
        """
        Check if a footnote exists (leading ^ optional).
        """
        return self._canonical_id(footnote_id) in self.footnotes

    def __len__(self) -> int:
        """Return the number of footnotes."""
        return len(self.footnotes)

    def __iter__(self):
        """Iterate over footnote IDs (with carets)."""
        return iter(self.footnotes)

    def items(self):
        """Return (footnote_id, FootnoteInfo) pairs."""
        return self.footnotes.items()

    def values(self):
        """Return FootnoteInfo objects."""
        return self.footnotes.values()

    def keys(self):
        """Return footnote IDs (with carets)."""
        return self.footnotes.keys()
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def extract_footnote_references(content: str, markdown_parser: Markdown | None = None) -> list[str]:
    """
    Extract all footnote reference IDs used in the content.

    This finds all FootnoteRef elements (e.g., [^123] in the text) as opposed
    to FootnoteDef elements which are the definitions.

    Args:
        content: The markdown content to parse
        markdown_parser: Optional custom markdown parser

    Returns:
        List of unique footnote IDs that are referenced (with the ^)
    """
    if markdown_parser is None:
        markdown_parser = flowmark_markdown(line_wrap_by_sentence(is_markdown=True))

    document = markdown_parser.parse(_normalize_footnotes_in_markdown(content))

    # Dict keys preserve first-seen order while deduplicating references.
    ordered_ids: dict[str, None] = {}

    def record_ref(element: Any) -> None:
        if isinstance(element, footnote.FootnoteRef):
            ordered_ids.setdefault(f"^{element.label}", None)

    comprehensive_transform_tree(document, record_ref)
    return list(ordered_ids)
|