justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +44 -2
- justhtml/__main__.py +45 -9
- justhtml/constants.py +12 -0
- justhtml/errors.py +8 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +54 -35
- justhtml/parser.py +105 -38
- justhtml/sanitize.py +511 -282
- justhtml/selector.py +3 -1
- justhtml/serialize.py +398 -72
- justhtml/tokenizer.py +121 -21
- justhtml/tokens.py +21 -3
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +247 -190
- justhtml/treebuilder_modes.py +108 -102
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/METADATA +28 -7
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +1 -1
- justhtml-0.24.0.dist-info/RECORD +0 -24
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.24.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/node.py
CHANGED
|
@@ -3,12 +3,10 @@ from __future__ import annotations
|
|
|
3
3
|
from typing import TYPE_CHECKING, Any
|
|
4
4
|
from urllib.parse import quote
|
|
5
5
|
|
|
6
|
-
from .sanitize import sanitize
|
|
7
6
|
from .selector import query
|
|
8
7
|
from .serialize import to_html
|
|
9
8
|
|
|
10
9
|
if TYPE_CHECKING:
|
|
11
|
-
from .sanitize import SanitizationPolicy
|
|
12
10
|
from .tokens import Doctype
|
|
13
11
|
|
|
14
12
|
|
|
@@ -192,6 +190,7 @@ class SimpleDomNode:
|
|
|
192
190
|
"_origin_col",
|
|
193
191
|
"_origin_line",
|
|
194
192
|
"_origin_pos",
|
|
193
|
+
"_source_html",
|
|
195
194
|
"attrs",
|
|
196
195
|
"children",
|
|
197
196
|
"data",
|
|
@@ -209,6 +208,7 @@ class SimpleDomNode:
|
|
|
209
208
|
_origin_pos: int | None
|
|
210
209
|
_origin_line: int | None
|
|
211
210
|
_origin_col: int | None
|
|
211
|
+
_source_html: str | None
|
|
212
212
|
|
|
213
213
|
def __init__(
|
|
214
214
|
self,
|
|
@@ -220,6 +220,7 @@ class SimpleDomNode:
|
|
|
220
220
|
self.name = name
|
|
221
221
|
self.parent = None
|
|
222
222
|
self.data = data
|
|
223
|
+
self._source_html = None
|
|
223
224
|
self._origin_pos = None
|
|
224
225
|
self._origin_line = None
|
|
225
226
|
self._origin_col = None
|
|
@@ -271,12 +272,9 @@ class SimpleDomNode:
|
|
|
271
272
|
indent: int = 0,
|
|
272
273
|
indent_size: int = 2,
|
|
273
274
|
pretty: bool = True,
|
|
274
|
-
*,
|
|
275
|
-
safe: bool = True,
|
|
276
|
-
policy: SanitizationPolicy | None = None,
|
|
277
275
|
) -> str:
|
|
278
276
|
"""Convert node to HTML string."""
|
|
279
|
-
return to_html(self, indent, indent_size, pretty=pretty, safe=safe, policy=policy)
|
|
277
|
+
return to_html(self, indent, indent_size, pretty=pretty)
|
|
280
278
|
|
|
281
279
|
def query(self, selector: str) -> list[Any]:
|
|
282
280
|
"""
|
|
@@ -312,39 +310,27 @@ class SimpleDomNode:
|
|
|
312
310
|
self,
|
|
313
311
|
separator: str = " ",
|
|
314
312
|
strip: bool = True,
|
|
315
|
-
*,
|
|
316
|
-
safe: bool = True,
|
|
317
|
-
policy: SanitizationPolicy | None = None,
|
|
318
313
|
) -> str:
|
|
319
314
|
"""Return the concatenated text of this node's descendants.
|
|
320
315
|
|
|
321
316
|
- `separator` controls how text nodes are joined (default: a single space).
|
|
322
317
|
- `strip=True` strips each text node and drops empty segments.
|
|
323
|
-
- `safe=True` sanitizes untrusted HTML before extracting text.
|
|
324
|
-
- `policy` overrides the default sanitization policy.
|
|
325
|
-
|
|
326
318
|
Template element contents are included via `template_content`.
|
|
327
319
|
"""
|
|
328
|
-
node: Any = sanitize(self, policy=policy) if safe else self
|
|
320
|
+
node: Any = self
|
|
329
321
|
parts: list[str] = []
|
|
330
322
|
_to_text_collect(node, parts, strip=strip)
|
|
331
323
|
if not parts:
|
|
332
324
|
return ""
|
|
333
325
|
return separator.join(parts)
|
|
334
326
|
|
|
335
|
-
def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
|
|
327
|
+
def to_markdown(self) -> str:
|
|
336
328
|
"""Return a GitHub Flavored Markdown representation of this subtree.
|
|
337
329
|
|
|
338
330
|
This is a pragmatic HTML->Markdown converter intended for readability.
|
|
339
331
|
- Tables and images are preserved as raw HTML.
|
|
340
332
|
- Unknown elements fall back to rendering their children.
|
|
341
333
|
"""
|
|
342
|
-
if safe:
|
|
343
|
-
node = sanitize(self, policy=policy)
|
|
344
|
-
builder = _MarkdownBuilder()
|
|
345
|
-
_to_markdown_walk(node, builder, preserve_whitespace=False, list_depth=0)
|
|
346
|
-
return builder.finish()
|
|
347
|
-
|
|
348
334
|
builder = _MarkdownBuilder()
|
|
349
335
|
_to_markdown_walk(self, builder, preserve_whitespace=False, list_depth=0)
|
|
350
336
|
return builder.finish()
|
|
@@ -405,22 +391,25 @@ class SimpleDomNode:
|
|
|
405
391
|
"""Return True if this node has children."""
|
|
406
392
|
return bool(self.children)
|
|
407
393
|
|
|
408
|
-
def clone_node(self, deep: bool = False) -> SimpleDomNode:
|
|
394
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> SimpleDomNode:
|
|
409
395
|
"""
|
|
410
396
|
Clone this node.
|
|
411
397
|
|
|
412
398
|
Args:
|
|
413
399
|
deep: If True, recursively clone children.
|
|
400
|
+
override_attrs: Optional dictionary to use as attributes for the clone.
|
|
414
401
|
|
|
415
402
|
Returns:
|
|
416
403
|
A new node that is a copy of this node.
|
|
417
404
|
"""
|
|
405
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else None)
|
|
418
406
|
clone = SimpleDomNode(
|
|
419
407
|
self.name,
|
|
420
|
-
|
|
408
|
+
attrs,
|
|
421
409
|
self.data,
|
|
422
410
|
self.namespace,
|
|
423
411
|
)
|
|
412
|
+
clone._source_html = self._source_html
|
|
424
413
|
clone._origin_pos = self._origin_pos
|
|
425
414
|
clone._origin_line = self._origin_line
|
|
426
415
|
clone._origin_col = self._origin_col
|
|
@@ -431,11 +420,25 @@ class SimpleDomNode:
|
|
|
431
420
|
|
|
432
421
|
|
|
433
422
|
class ElementNode(SimpleDomNode):
|
|
434
|
-
__slots__ = (
|
|
423
|
+
__slots__ = (
|
|
424
|
+
"_end_tag_end",
|
|
425
|
+
"_end_tag_present",
|
|
426
|
+
"_end_tag_start",
|
|
427
|
+
"_self_closing",
|
|
428
|
+
"_start_tag_end",
|
|
429
|
+
"_start_tag_start",
|
|
430
|
+
"template_content",
|
|
431
|
+
)
|
|
435
432
|
|
|
436
433
|
template_content: SimpleDomNode | None
|
|
437
434
|
children: list[Any]
|
|
438
435
|
attrs: dict[str, str | None]
|
|
436
|
+
_start_tag_start: int | None
|
|
437
|
+
_start_tag_end: int | None
|
|
438
|
+
_end_tag_start: int | None
|
|
439
|
+
_end_tag_end: int | None
|
|
440
|
+
_end_tag_present: bool
|
|
441
|
+
_self_closing: bool
|
|
439
442
|
|
|
440
443
|
def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
|
|
441
444
|
self.name = name
|
|
@@ -445,15 +448,30 @@ class ElementNode(SimpleDomNode):
|
|
|
445
448
|
self.children = []
|
|
446
449
|
self.attrs = attrs if attrs is not None else {}
|
|
447
450
|
self.template_content = None
|
|
451
|
+
self._source_html = None
|
|
448
452
|
self._origin_pos = None
|
|
449
453
|
self._origin_line = None
|
|
450
454
|
self._origin_col = None
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
455
|
+
self._start_tag_start = None
|
|
456
|
+
self._start_tag_end = None
|
|
457
|
+
self._end_tag_start = None
|
|
458
|
+
self._end_tag_end = None
|
|
459
|
+
self._end_tag_present = False
|
|
460
|
+
self._self_closing = False
|
|
461
|
+
|
|
462
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> ElementNode:
|
|
463
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
|
|
464
|
+
clone = ElementNode(self.name, attrs, self.namespace)
|
|
465
|
+
clone._source_html = self._source_html
|
|
454
466
|
clone._origin_pos = self._origin_pos
|
|
455
467
|
clone._origin_line = self._origin_line
|
|
456
468
|
clone._origin_col = self._origin_col
|
|
469
|
+
clone._start_tag_start = self._start_tag_start
|
|
470
|
+
clone._start_tag_end = self._start_tag_end
|
|
471
|
+
clone._end_tag_start = self._end_tag_start
|
|
472
|
+
clone._end_tag_end = self._end_tag_end
|
|
473
|
+
clone._end_tag_present = self._end_tag_present
|
|
474
|
+
clone._self_closing = self._self_closing
|
|
457
475
|
if deep:
|
|
458
476
|
for child in self.children:
|
|
459
477
|
clone.append_child(child.clone_node(deep=True))
|
|
@@ -476,16 +494,24 @@ class TemplateNode(ElementNode):
|
|
|
476
494
|
else:
|
|
477
495
|
self.template_content = None
|
|
478
496
|
|
|
479
|
-
def clone_node(self, deep: bool = False) -> TemplateNode:
|
|
497
|
+
def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> TemplateNode:
|
|
498
|
+
attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
|
|
480
499
|
clone = TemplateNode(
|
|
481
500
|
self.name,
|
|
482
|
-
|
|
501
|
+
attrs,
|
|
483
502
|
None,
|
|
484
503
|
self.namespace,
|
|
485
504
|
)
|
|
505
|
+
clone._source_html = self._source_html
|
|
486
506
|
clone._origin_pos = self._origin_pos
|
|
487
507
|
clone._origin_line = self._origin_line
|
|
488
508
|
clone._origin_col = self._origin_col
|
|
509
|
+
clone._start_tag_start = self._start_tag_start
|
|
510
|
+
clone._start_tag_end = self._start_tag_end
|
|
511
|
+
clone._end_tag_start = self._end_tag_start
|
|
512
|
+
clone._end_tag_end = self._end_tag_end
|
|
513
|
+
clone._end_tag_present = self._end_tag_present
|
|
514
|
+
clone._self_closing = self._self_closing
|
|
489
515
|
if deep:
|
|
490
516
|
if self.template_content:
|
|
491
517
|
clone.template_content = self.template_content.clone_node(deep=True)
|
|
@@ -542,15 +568,8 @@ class TextNode:
|
|
|
542
568
|
self,
|
|
543
569
|
separator: str = " ",
|
|
544
570
|
strip: bool = True,
|
|
545
|
-
*,
|
|
546
|
-
safe: bool = True,
|
|
547
|
-
policy: SanitizationPolicy | None = None,
|
|
548
571
|
) -> str:
|
|
549
|
-
# Parameters are accepted for API consistency; they don't affect leaf nodes.
|
|
550
572
|
_ = separator
|
|
551
|
-
_ = safe
|
|
552
|
-
_ = policy
|
|
553
|
-
|
|
554
573
|
if self.data is None:
|
|
555
574
|
return ""
|
|
556
575
|
if strip:
|
justhtml/parser.py
CHANGED
|
@@ -7,12 +7,14 @@ from typing import TYPE_CHECKING, Any
|
|
|
7
7
|
from .context import FragmentContext
|
|
8
8
|
from .encoding import decode_html
|
|
9
9
|
from .tokenizer import Tokenizer, TokenizerOpts
|
|
10
|
+
from .transforms import apply_compiled_transforms, compile_transforms
|
|
10
11
|
from .treebuilder import TreeBuilder
|
|
11
12
|
|
|
12
13
|
if TYPE_CHECKING:
|
|
13
14
|
from .node import SimpleDomNode
|
|
14
15
|
from .sanitize import SanitizationPolicy
|
|
15
16
|
from .tokens import ParseError
|
|
17
|
+
from .transforms import TransformSpec
|
|
16
18
|
|
|
17
19
|
|
|
18
20
|
class StrictModeError(SyntaxError):
|
|
@@ -53,6 +55,8 @@ class JustHTML:
|
|
|
53
55
|
self,
|
|
54
56
|
html: str | bytes | bytearray | memoryview | None,
|
|
55
57
|
*,
|
|
58
|
+
safe: bool = True,
|
|
59
|
+
policy: SanitizationPolicy | None = None,
|
|
56
60
|
collect_errors: bool = False,
|
|
57
61
|
track_node_locations: bool = False,
|
|
58
62
|
debug: bool = False,
|
|
@@ -63,6 +67,7 @@ class JustHTML:
|
|
|
63
67
|
strict: bool = False,
|
|
64
68
|
tokenizer_opts: TokenizerOpts | None = None,
|
|
65
69
|
tree_builder: TreeBuilder | None = None,
|
|
70
|
+
transforms: list[TransformSpec] | None = None,
|
|
66
71
|
) -> None:
|
|
67
72
|
if fragment_context is not None:
|
|
68
73
|
fragment = True
|
|
@@ -70,6 +75,29 @@ class JustHTML:
|
|
|
70
75
|
if fragment and fragment_context is None:
|
|
71
76
|
fragment_context = FragmentContext("div")
|
|
72
77
|
|
|
78
|
+
track_tag_spans = False
|
|
79
|
+
has_sanitize_transform = False
|
|
80
|
+
needs_escape_incomplete_tags = False
|
|
81
|
+
if transforms:
|
|
82
|
+
from .sanitize import DEFAULT_POLICY # noqa: PLC0415
|
|
83
|
+
from .transforms import Sanitize # noqa: PLC0415
|
|
84
|
+
|
|
85
|
+
for t in transforms:
|
|
86
|
+
if isinstance(t, Sanitize):
|
|
87
|
+
has_sanitize_transform = True
|
|
88
|
+
effective = t.policy or DEFAULT_POLICY
|
|
89
|
+
if effective.disallowed_tag_handling == "escape":
|
|
90
|
+
track_tag_spans = True
|
|
91
|
+
needs_escape_incomplete_tags = True
|
|
92
|
+
break
|
|
93
|
+
|
|
94
|
+
# If we will auto-sanitize (safe=True and no Sanitize in transforms),
|
|
95
|
+
# escape-mode tag reconstruction may require tracking tag spans.
|
|
96
|
+
if safe and not has_sanitize_transform and policy is not None:
|
|
97
|
+
if policy.disallowed_tag_handling == "escape":
|
|
98
|
+
track_tag_spans = True
|
|
99
|
+
needs_escape_incomplete_tags = True
|
|
100
|
+
|
|
73
101
|
self.debug = bool(debug)
|
|
74
102
|
self.fragment_context = fragment_context
|
|
75
103
|
self.encoding = None
|
|
@@ -91,8 +119,11 @@ class JustHTML:
|
|
|
91
119
|
fragment_context=fragment_context,
|
|
92
120
|
iframe_srcdoc=iframe_srcdoc,
|
|
93
121
|
collect_errors=should_collect,
|
|
122
|
+
track_tag_spans=track_tag_spans,
|
|
94
123
|
)
|
|
95
124
|
opts = tokenizer_opts or TokenizerOpts()
|
|
125
|
+
if needs_escape_incomplete_tags:
|
|
126
|
+
opts.emit_bogus_markup_as_text = True
|
|
96
127
|
|
|
97
128
|
# For RAWTEXT fragment contexts, set initial tokenizer state and rawtext tag
|
|
98
129
|
if fragment_context and not fragment_context.namespace:
|
|
@@ -109,6 +140,7 @@ class JustHTML:
|
|
|
109
140
|
opts,
|
|
110
141
|
collect_errors=should_collect,
|
|
111
142
|
track_node_locations=bool(track_node_locations),
|
|
143
|
+
track_tag_positions=bool(track_node_locations) or track_tag_spans,
|
|
112
144
|
)
|
|
113
145
|
# Link tokenizer to tree_builder for position info
|
|
114
146
|
self.tree_builder.tokenizer = self.tokenizer
|
|
@@ -116,11 +148,73 @@ class JustHTML:
|
|
|
116
148
|
self.tokenizer.run(html_str)
|
|
117
149
|
self.root = self.tree_builder.finish()
|
|
118
150
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
151
|
+
transform_errors: list[ParseError] = []
|
|
152
|
+
|
|
153
|
+
# Apply transforms after parse.
|
|
154
|
+
# Safety model: when safe=True, the in-memory tree is sanitized exactly once
|
|
155
|
+
# during construction by ensuring a Sanitize transform runs.
|
|
156
|
+
if transforms or safe:
|
|
157
|
+
from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY # noqa: PLC0415
|
|
158
|
+
from .transforms import Sanitize # noqa: PLC0415
|
|
159
|
+
|
|
160
|
+
final_transforms: list[TransformSpec] = list(transforms or [])
|
|
161
|
+
|
|
162
|
+
# Normalize explicit Sanitize() transforms to use the same default policy
|
|
163
|
+
# choice as the old safe-output sanitizer (document vs fragment).
|
|
164
|
+
if final_transforms:
|
|
165
|
+
default_mode_policy = DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY
|
|
166
|
+
for i, t in enumerate(final_transforms):
|
|
167
|
+
if isinstance(t, Sanitize) and t.policy is None:
|
|
168
|
+
final_transforms[i] = Sanitize(
|
|
169
|
+
policy=default_mode_policy, enabled=t.enabled, callback=t.callback, report=t.report
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# Auto-append a final Sanitize step only if the user didn't include
|
|
173
|
+
# Sanitize anywhere in their transform list.
|
|
174
|
+
if safe and not any(isinstance(t, Sanitize) for t in final_transforms):
|
|
175
|
+
effective_policy = (
|
|
176
|
+
policy
|
|
177
|
+
if policy is not None
|
|
178
|
+
else (DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY)
|
|
179
|
+
)
|
|
180
|
+
# Avoid stale collected errors on reused policy objects.
|
|
181
|
+
if effective_policy.unsafe_handling == "collect":
|
|
182
|
+
effective_policy.reset_collected_security_errors()
|
|
183
|
+
final_transforms.append(Sanitize(policy=effective_policy))
|
|
184
|
+
|
|
185
|
+
if final_transforms:
|
|
186
|
+
compiled_transforms = compile_transforms(tuple(final_transforms))
|
|
187
|
+
apply_compiled_transforms(self.root, compiled_transforms, errors=transform_errors)
|
|
188
|
+
|
|
189
|
+
# Merge collected security errors into the document error list.
|
|
190
|
+
# This mirrors the old behavior where safe output could feed
|
|
191
|
+
# security findings into doc.errors.
|
|
192
|
+
for t in final_transforms:
|
|
193
|
+
if isinstance(t, Sanitize):
|
|
194
|
+
t_policy = t.policy
|
|
195
|
+
if t_policy is not None and t_policy.unsafe_handling == "collect":
|
|
196
|
+
transform_errors.extend(t_policy.collected_security_errors())
|
|
197
|
+
|
|
198
|
+
if should_collect:
|
|
199
|
+
# Merge errors from both tokenizer and tree builder.
|
|
200
|
+
# Public API: users expect errors to be ordered by input position.
|
|
201
|
+
merged_errors = self.tokenizer.errors + self.tree_builder.errors + transform_errors
|
|
202
|
+
self.errors = self._sorted_errors(merged_errors)
|
|
203
|
+
else:
|
|
204
|
+
self.errors = transform_errors
|
|
205
|
+
|
|
206
|
+
# In strict mode, raise on first error
|
|
207
|
+
if strict and self.errors:
|
|
208
|
+
raise StrictModeError(self.errors[0])
|
|
209
|
+
|
|
210
|
+
def query(self, selector: str) -> list[Any]:
|
|
211
|
+
"""Query the document using a CSS selector. Delegates to root.query()."""
|
|
212
|
+
return self.root.query(selector)
|
|
213
|
+
|
|
214
|
+
@staticmethod
|
|
215
|
+
def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
|
|
216
|
+
indexed_errors = enumerate(errors)
|
|
217
|
+
return [
|
|
124
218
|
e
|
|
125
219
|
for _, e in sorted(
|
|
126
220
|
indexed_errors,
|
|
@@ -132,56 +226,29 @@ class JustHTML:
|
|
|
132
226
|
)
|
|
133
227
|
]
|
|
134
228
|
|
|
135
|
-
# In strict mode, raise on first error
|
|
136
|
-
if strict and self.errors:
|
|
137
|
-
raise StrictModeError(self.errors[0])
|
|
138
|
-
|
|
139
|
-
def query(self, selector: str) -> list[Any]:
|
|
140
|
-
"""Query the document using a CSS selector. Delegates to root.query()."""
|
|
141
|
-
return self.root.query(selector)
|
|
142
|
-
|
|
143
229
|
def to_html(
|
|
144
230
|
self,
|
|
145
231
|
pretty: bool = True,
|
|
146
232
|
indent_size: int = 2,
|
|
147
|
-
*,
|
|
148
|
-
safe: bool = True,
|
|
149
|
-
policy: SanitizationPolicy | None = None,
|
|
150
233
|
) -> str:
|
|
151
234
|
"""Serialize the document to HTML.
|
|
152
235
|
|
|
153
|
-
|
|
154
|
-
- `policy` overrides the default sanitization policy.
|
|
236
|
+
Sanitization (when enabled) happens during construction.
|
|
155
237
|
"""
|
|
156
238
|
return self.root.to_html(
|
|
157
239
|
indent=0,
|
|
158
240
|
indent_size=indent_size,
|
|
159
241
|
pretty=pretty,
|
|
160
|
-
safe=safe,
|
|
161
|
-
policy=policy,
|
|
162
242
|
)
|
|
163
243
|
|
|
164
244
|
def to_text(
|
|
165
245
|
self,
|
|
166
246
|
separator: str = " ",
|
|
167
247
|
strip: bool = True,
|
|
168
|
-
*,
|
|
169
|
-
safe: bool = True,
|
|
170
|
-
policy: SanitizationPolicy | None = None,
|
|
171
248
|
) -> str:
|
|
172
|
-
"""Return the document's concatenated text.
|
|
173
|
-
|
|
174
|
-
- `safe=True` sanitizes untrusted content before text extraction.
|
|
175
|
-
- `policy` overrides the default sanitization policy.
|
|
249
|
+
"""Return the document's concatenated text."""
|
|
250
|
+
return self.root.to_text(separator=separator, strip=strip)
|
|
176
251
|
|
|
177
|
-
|
|
178
|
-
"""
|
|
179
|
-
return self.root.to_text(separator=separator, strip=strip, safe=safe, policy=policy)
|
|
180
|
-
|
|
181
|
-
def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
|
|
182
|
-
"""Return a GitHub Flavored Markdown representation.
|
|
183
|
-
|
|
184
|
-
- `safe=True` sanitizes untrusted content before conversion.
|
|
185
|
-
- `policy` overrides the default sanitization policy.
|
|
186
|
-
"""
|
|
187
|
-
return self.root.to_markdown(safe=safe, policy=policy)
|
|
252
|
+
def to_markdown(self) -> str:
|
|
253
|
+
"""Return a GitHub Flavored Markdown representation."""
|
|
254
|
+
return self.root.to_markdown()
|