tensorbored 2.21.0rc1769983804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorbored/__init__.py +112 -0
- tensorbored/_vendor/__init__.py +0 -0
- tensorbored/_vendor/bleach/__init__.py +125 -0
- tensorbored/_vendor/bleach/_vendor/__init__.py +0 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/__init__.py +35 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_ihatexml.py +289 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_inputstream.py +918 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_tokenizer.py +1735 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_trie/__init__.py +5 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_trie/_base.py +40 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_trie/py.py +67 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_utils.py +159 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/constants.py +2946 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/__init__.py +0 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/alphabeticalattributes.py +29 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/base.py +12 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/inject_meta_charset.py +73 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/lint.py +93 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/optionaltags.py +207 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/sanitizer.py +916 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/whitespace.py +38 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/html5parser.py +2795 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/serializer.py +409 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treeadapters/__init__.py +30 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treeadapters/genshi.py +54 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treeadapters/sax.py +50 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/__init__.py +88 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/base.py +417 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/dom.py +239 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/etree.py +343 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/etree_lxml.py +392 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/__init__.py +154 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/base.py +252 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/dom.py +43 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/etree.py +131 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/etree_lxml.py +215 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/genshi.py +69 -0
- tensorbored/_vendor/bleach/_vendor/parse.py +1078 -0
- tensorbored/_vendor/bleach/callbacks.py +32 -0
- tensorbored/_vendor/bleach/html5lib_shim.py +757 -0
- tensorbored/_vendor/bleach/linkifier.py +633 -0
- tensorbored/_vendor/bleach/parse_shim.py +1 -0
- tensorbored/_vendor/bleach/sanitizer.py +638 -0
- tensorbored/_vendor/bleach/six_shim.py +19 -0
- tensorbored/_vendor/webencodings/__init__.py +342 -0
- tensorbored/_vendor/webencodings/labels.py +231 -0
- tensorbored/_vendor/webencodings/mklabels.py +59 -0
- tensorbored/_vendor/webencodings/x_user_defined.py +325 -0
- tensorbored/assets.py +36 -0
- tensorbored/auth.py +102 -0
- tensorbored/backend/__init__.py +0 -0
- tensorbored/backend/application.py +604 -0
- tensorbored/backend/auth_context_middleware.py +38 -0
- tensorbored/backend/client_feature_flags.py +113 -0
- tensorbored/backend/empty_path_redirect.py +46 -0
- tensorbored/backend/event_processing/__init__.py +0 -0
- tensorbored/backend/event_processing/data_ingester.py +276 -0
- tensorbored/backend/event_processing/data_provider.py +535 -0
- tensorbored/backend/event_processing/directory_loader.py +142 -0
- tensorbored/backend/event_processing/directory_watcher.py +272 -0
- tensorbored/backend/event_processing/event_accumulator.py +950 -0
- tensorbored/backend/event_processing/event_file_inspector.py +463 -0
- tensorbored/backend/event_processing/event_file_loader.py +292 -0
- tensorbored/backend/event_processing/event_multiplexer.py +521 -0
- tensorbored/backend/event_processing/event_util.py +68 -0
- tensorbored/backend/event_processing/io_wrapper.py +223 -0
- tensorbored/backend/event_processing/plugin_asset_util.py +104 -0
- tensorbored/backend/event_processing/plugin_event_accumulator.py +721 -0
- tensorbored/backend/event_processing/plugin_event_multiplexer.py +522 -0
- tensorbored/backend/event_processing/reservoir.py +266 -0
- tensorbored/backend/event_processing/tag_types.py +29 -0
- tensorbored/backend/experiment_id.py +71 -0
- tensorbored/backend/experimental_plugin.py +51 -0
- tensorbored/backend/http_util.py +263 -0
- tensorbored/backend/json_util.py +70 -0
- tensorbored/backend/path_prefix.py +67 -0
- tensorbored/backend/process_graph.py +74 -0
- tensorbored/backend/security_validator.py +202 -0
- tensorbored/compat/__init__.py +69 -0
- tensorbored/compat/proto/__init__.py +0 -0
- tensorbored/compat/proto/allocation_description_pb2.py +35 -0
- tensorbored/compat/proto/api_def_pb2.py +82 -0
- tensorbored/compat/proto/attr_value_pb2.py +80 -0
- tensorbored/compat/proto/cluster_pb2.py +58 -0
- tensorbored/compat/proto/config_pb2.py +271 -0
- tensorbored/compat/proto/coordination_config_pb2.py +45 -0
- tensorbored/compat/proto/cost_graph_pb2.py +87 -0
- tensorbored/compat/proto/cpp_shape_inference_pb2.py +70 -0
- tensorbored/compat/proto/debug_pb2.py +65 -0
- tensorbored/compat/proto/event_pb2.py +149 -0
- tensorbored/compat/proto/full_type_pb2.py +74 -0
- tensorbored/compat/proto/function_pb2.py +157 -0
- tensorbored/compat/proto/graph_debug_info_pb2.py +111 -0
- tensorbored/compat/proto/graph_pb2.py +41 -0
- tensorbored/compat/proto/histogram_pb2.py +39 -0
- tensorbored/compat/proto/meta_graph_pb2.py +254 -0
- tensorbored/compat/proto/node_def_pb2.py +61 -0
- tensorbored/compat/proto/op_def_pb2.py +81 -0
- tensorbored/compat/proto/resource_handle_pb2.py +48 -0
- tensorbored/compat/proto/rewriter_config_pb2.py +93 -0
- tensorbored/compat/proto/rpc_options_pb2.py +35 -0
- tensorbored/compat/proto/saved_object_graph_pb2.py +193 -0
- tensorbored/compat/proto/saver_pb2.py +38 -0
- tensorbored/compat/proto/step_stats_pb2.py +116 -0
- tensorbored/compat/proto/struct_pb2.py +144 -0
- tensorbored/compat/proto/summary_pb2.py +111 -0
- tensorbored/compat/proto/tensor_description_pb2.py +38 -0
- tensorbored/compat/proto/tensor_pb2.py +68 -0
- tensorbored/compat/proto/tensor_shape_pb2.py +46 -0
- tensorbored/compat/proto/tfprof_log_pb2.py +307 -0
- tensorbored/compat/proto/trackable_object_graph_pb2.py +90 -0
- tensorbored/compat/proto/types_pb2.py +105 -0
- tensorbored/compat/proto/variable_pb2.py +62 -0
- tensorbored/compat/proto/verifier_config_pb2.py +38 -0
- tensorbored/compat/proto/versions_pb2.py +35 -0
- tensorbored/compat/tensorflow_stub/__init__.py +38 -0
- tensorbored/compat/tensorflow_stub/app.py +124 -0
- tensorbored/compat/tensorflow_stub/compat/__init__.py +131 -0
- tensorbored/compat/tensorflow_stub/compat/v1/__init__.py +20 -0
- tensorbored/compat/tensorflow_stub/dtypes.py +692 -0
- tensorbored/compat/tensorflow_stub/error_codes.py +169 -0
- tensorbored/compat/tensorflow_stub/errors.py +507 -0
- tensorbored/compat/tensorflow_stub/flags.py +124 -0
- tensorbored/compat/tensorflow_stub/io/__init__.py +17 -0
- tensorbored/compat/tensorflow_stub/io/gfile.py +1011 -0
- tensorbored/compat/tensorflow_stub/pywrap_tensorflow.py +285 -0
- tensorbored/compat/tensorflow_stub/tensor_shape.py +1035 -0
- tensorbored/context.py +129 -0
- tensorbored/data/__init__.py +0 -0
- tensorbored/data/grpc_provider.py +365 -0
- tensorbored/data/ingester.py +46 -0
- tensorbored/data/proto/__init__.py +0 -0
- tensorbored/data/proto/data_provider_pb2.py +517 -0
- tensorbored/data/proto/data_provider_pb2_grpc.py +374 -0
- tensorbored/data/provider.py +1365 -0
- tensorbored/data/server_ingester.py +301 -0
- tensorbored/data_compat.py +159 -0
- tensorbored/dataclass_compat.py +224 -0
- tensorbored/default.py +124 -0
- tensorbored/errors.py +130 -0
- tensorbored/lazy.py +99 -0
- tensorbored/main.py +48 -0
- tensorbored/main_lib.py +62 -0
- tensorbored/manager.py +487 -0
- tensorbored/notebook.py +441 -0
- tensorbored/plugin_util.py +266 -0
- tensorbored/plugins/__init__.py +0 -0
- tensorbored/plugins/audio/__init__.py +0 -0
- tensorbored/plugins/audio/audio_plugin.py +229 -0
- tensorbored/plugins/audio/metadata.py +69 -0
- tensorbored/plugins/audio/plugin_data_pb2.py +37 -0
- tensorbored/plugins/audio/summary.py +230 -0
- tensorbored/plugins/audio/summary_v2.py +124 -0
- tensorbored/plugins/base_plugin.py +367 -0
- tensorbored/plugins/core/__init__.py +0 -0
- tensorbored/plugins/core/core_plugin.py +981 -0
- tensorbored/plugins/custom_scalar/__init__.py +0 -0
- tensorbored/plugins/custom_scalar/custom_scalars_plugin.py +320 -0
- tensorbored/plugins/custom_scalar/layout_pb2.py +85 -0
- tensorbored/plugins/custom_scalar/metadata.py +35 -0
- tensorbored/plugins/custom_scalar/summary.py +79 -0
- tensorbored/plugins/debugger_v2/__init__.py +0 -0
- tensorbored/plugins/debugger_v2/debug_data_multiplexer.py +631 -0
- tensorbored/plugins/debugger_v2/debug_data_provider.py +634 -0
- tensorbored/plugins/debugger_v2/debugger_v2_plugin.py +504 -0
- tensorbored/plugins/distribution/__init__.py +0 -0
- tensorbored/plugins/distribution/compressor.py +158 -0
- tensorbored/plugins/distribution/distributions_plugin.py +116 -0
- tensorbored/plugins/distribution/metadata.py +19 -0
- tensorbored/plugins/graph/__init__.py +0 -0
- tensorbored/plugins/graph/graph_util.py +129 -0
- tensorbored/plugins/graph/graphs_plugin.py +336 -0
- tensorbored/plugins/graph/keras_util.py +328 -0
- tensorbored/plugins/graph/metadata.py +42 -0
- tensorbored/plugins/histogram/__init__.py +0 -0
- tensorbored/plugins/histogram/histograms_plugin.py +144 -0
- tensorbored/plugins/histogram/metadata.py +63 -0
- tensorbored/plugins/histogram/plugin_data_pb2.py +34 -0
- tensorbored/plugins/histogram/summary.py +234 -0
- tensorbored/plugins/histogram/summary_v2.py +292 -0
- tensorbored/plugins/hparams/__init__.py +14 -0
- tensorbored/plugins/hparams/_keras.py +93 -0
- tensorbored/plugins/hparams/api.py +130 -0
- tensorbored/plugins/hparams/api_pb2.py +208 -0
- tensorbored/plugins/hparams/backend_context.py +606 -0
- tensorbored/plugins/hparams/download_data.py +158 -0
- tensorbored/plugins/hparams/error.py +26 -0
- tensorbored/plugins/hparams/get_experiment.py +71 -0
- tensorbored/plugins/hparams/hparams_plugin.py +206 -0
- tensorbored/plugins/hparams/hparams_util_pb2.py +69 -0
- tensorbored/plugins/hparams/json_format_compat.py +38 -0
- tensorbored/plugins/hparams/list_metric_evals.py +57 -0
- tensorbored/plugins/hparams/list_session_groups.py +1040 -0
- tensorbored/plugins/hparams/metadata.py +125 -0
- tensorbored/plugins/hparams/metrics.py +41 -0
- tensorbored/plugins/hparams/plugin_data_pb2.py +69 -0
- tensorbored/plugins/hparams/summary.py +205 -0
- tensorbored/plugins/hparams/summary_v2.py +597 -0
- tensorbored/plugins/image/__init__.py +0 -0
- tensorbored/plugins/image/images_plugin.py +232 -0
- tensorbored/plugins/image/metadata.py +65 -0
- tensorbored/plugins/image/plugin_data_pb2.py +34 -0
- tensorbored/plugins/image/summary.py +159 -0
- tensorbored/plugins/image/summary_v2.py +130 -0
- tensorbored/plugins/mesh/__init__.py +14 -0
- tensorbored/plugins/mesh/mesh_plugin.py +292 -0
- tensorbored/plugins/mesh/metadata.py +152 -0
- tensorbored/plugins/mesh/plugin_data_pb2.py +37 -0
- tensorbored/plugins/mesh/summary.py +251 -0
- tensorbored/plugins/mesh/summary_v2.py +214 -0
- tensorbored/plugins/metrics/__init__.py +0 -0
- tensorbored/plugins/metrics/metadata.py +17 -0
- tensorbored/plugins/metrics/metrics_plugin.py +623 -0
- tensorbored/plugins/pr_curve/__init__.py +0 -0
- tensorbored/plugins/pr_curve/metadata.py +75 -0
- tensorbored/plugins/pr_curve/plugin_data_pb2.py +34 -0
- tensorbored/plugins/pr_curve/pr_curves_plugin.py +241 -0
- tensorbored/plugins/pr_curve/summary.py +574 -0
- tensorbored/plugins/profile_redirect/__init__.py +0 -0
- tensorbored/plugins/profile_redirect/profile_redirect_plugin.py +49 -0
- tensorbored/plugins/projector/__init__.py +67 -0
- tensorbored/plugins/projector/metadata.py +26 -0
- tensorbored/plugins/projector/projector_config_pb2.py +54 -0
- tensorbored/plugins/projector/projector_plugin.py +795 -0
- tensorbored/plugins/projector/tf_projector_plugin/index.js +32 -0
- tensorbored/plugins/projector/tf_projector_plugin/projector_binary.html +524 -0
- tensorbored/plugins/projector/tf_projector_plugin/projector_binary.js +15536 -0
- tensorbored/plugins/scalar/__init__.py +0 -0
- tensorbored/plugins/scalar/metadata.py +60 -0
- tensorbored/plugins/scalar/plugin_data_pb2.py +34 -0
- tensorbored/plugins/scalar/scalars_plugin.py +181 -0
- tensorbored/plugins/scalar/summary.py +109 -0
- tensorbored/plugins/scalar/summary_v2.py +124 -0
- tensorbored/plugins/text/__init__.py +0 -0
- tensorbored/plugins/text/metadata.py +62 -0
- tensorbored/plugins/text/plugin_data_pb2.py +34 -0
- tensorbored/plugins/text/summary.py +114 -0
- tensorbored/plugins/text/summary_v2.py +124 -0
- tensorbored/plugins/text/text_plugin.py +288 -0
- tensorbored/plugins/wit_redirect/__init__.py +0 -0
- tensorbored/plugins/wit_redirect/wit_redirect_plugin.py +49 -0
- tensorbored/program.py +910 -0
- tensorbored/summary/__init__.py +35 -0
- tensorbored/summary/_output.py +124 -0
- tensorbored/summary/_tf/__init__.py +14 -0
- tensorbored/summary/_tf/summary/__init__.py +178 -0
- tensorbored/summary/_writer.py +105 -0
- tensorbored/summary/v1.py +51 -0
- tensorbored/summary/v2.py +25 -0
- tensorbored/summary/writer/__init__.py +13 -0
- tensorbored/summary/writer/event_file_writer.py +291 -0
- tensorbored/summary/writer/record_writer.py +50 -0
- tensorbored/util/__init__.py +0 -0
- tensorbored/util/encoder.py +116 -0
- tensorbored/util/grpc_util.py +311 -0
- tensorbored/util/img_mime_type_detector.py +40 -0
- tensorbored/util/io_util.py +20 -0
- tensorbored/util/lazy_tensor_creator.py +110 -0
- tensorbored/util/op_evaluator.py +104 -0
- tensorbored/util/platform_util.py +20 -0
- tensorbored/util/tb_logging.py +24 -0
- tensorbored/util/tensor_util.py +617 -0
- tensorbored/util/timing.py +122 -0
- tensorbored/version.py +21 -0
- tensorbored/webfiles.zip +0 -0
- tensorbored-2.21.0rc1769983804.dist-info/METADATA +49 -0
- tensorbored-2.21.0rc1769983804.dist-info/RECORD +271 -0
- tensorbored-2.21.0rc1769983804.dist-info/WHEEL +5 -0
- tensorbored-2.21.0rc1769983804.dist-info/entry_points.txt +6 -0
- tensorbored-2.21.0rc1769983804.dist-info/licenses/LICENSE +739 -0
- tensorbored-2.21.0rc1769983804.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,757 @@
|
|
|
1
|
+
# flake8: noqa
|
|
2
|
+
"""
|
|
3
|
+
Shim module between Bleach and html5lib. This makes it easier to upgrade the
|
|
4
|
+
html5lib library without having to change a lot of code.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
import string
|
|
9
|
+
import warnings
|
|
10
|
+
|
|
11
|
+
# ignore html5lib deprecation warnings to use bleach; we are bleach
|
|
12
|
+
# apply before we import submodules that import html5lib
|
|
13
|
+
warnings.filterwarnings(
|
|
14
|
+
"ignore",
|
|
15
|
+
message="html5lib's sanitizer is deprecated",
|
|
16
|
+
category=DeprecationWarning,
|
|
17
|
+
module="bleach._vendor.html5lib",
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from tensorbored._vendor.bleach._vendor.html5lib import ( # noqa: E402 module level import not at top of file
|
|
21
|
+
HTMLParser,
|
|
22
|
+
getTreeWalker,
|
|
23
|
+
)
|
|
24
|
+
from tensorbored._vendor.bleach._vendor.html5lib import (
|
|
25
|
+
constants,
|
|
26
|
+
) # noqa: E402 module level import not at top of file
|
|
27
|
+
from tensorbored._vendor.bleach._vendor.html5lib.constants import ( # noqa: E402 module level import not at top of file
|
|
28
|
+
namespaces,
|
|
29
|
+
prefixes,
|
|
30
|
+
)
|
|
31
|
+
from tensorbored._vendor.bleach._vendor.html5lib.constants import (
|
|
32
|
+
_ReparseException as ReparseException,
|
|
33
|
+
) # noqa: E402 module level import not at top of file
|
|
34
|
+
from tensorbored._vendor.bleach._vendor.html5lib.filters.base import (
|
|
35
|
+
Filter,
|
|
36
|
+
) # noqa: E402 module level import not at top of file
|
|
37
|
+
from tensorbored._vendor.bleach._vendor.html5lib.filters.sanitizer import (
|
|
38
|
+
allowed_protocols,
|
|
39
|
+
allowed_css_properties,
|
|
40
|
+
allowed_svg_properties,
|
|
41
|
+
attr_val_is_uri,
|
|
42
|
+
svg_attr_val_allows_ref,
|
|
43
|
+
svg_allow_local_href,
|
|
44
|
+
) # noqa: E402 module level import not at top of file
|
|
45
|
+
from tensorbored._vendor.bleach._vendor.html5lib.filters.sanitizer import (
|
|
46
|
+
Filter as SanitizerFilter,
|
|
47
|
+
) # noqa: E402 module level import not at top of file
|
|
48
|
+
from tensorbored._vendor.bleach._vendor.html5lib._inputstream import (
|
|
49
|
+
HTMLInputStream,
|
|
50
|
+
) # noqa: E402 module level import not at top of file
|
|
51
|
+
from tensorbored._vendor.bleach._vendor.html5lib.serializer import (
|
|
52
|
+
escape,
|
|
53
|
+
HTMLSerializer,
|
|
54
|
+
) # noqa: E402 module level import not at top of file
|
|
55
|
+
from tensorbored._vendor.bleach._vendor.html5lib._tokenizer import (
|
|
56
|
+
attributeMap,
|
|
57
|
+
HTMLTokenizer,
|
|
58
|
+
) # noqa: E402 module level import not at top of file
|
|
59
|
+
from tensorbored._vendor.bleach._vendor.html5lib._trie import (
|
|
60
|
+
Trie,
|
|
61
|
+
) # noqa: E402 module level import not at top of file
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
#: Map of entity name to expanded entity
ENTITIES = constants.entities

#: Trie of html entity string -> character representation
ENTITIES_TRIE = Trie(ENTITIES)

#: Token type constants--these never change
TAG_TOKEN_TYPES = {
    constants.tokenTypes[name] for name in ("StartTag", "EndTag", "EmptyTag")
}
TAG_TOKEN_TYPE_START = constants.tokenTypes["StartTag"]
TAG_TOKEN_TYPE_END = constants.tokenTypes["EndTag"]
TAG_TOKEN_TYPE_CHARACTERS = constants.tokenTypes["Characters"]
TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
#: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
#: https://html.spec.whatwg.org/multipage/indices.html#elements-3
HTML_TAGS = frozenset(
    (
        "a abbr address area article aside audio b base bdi bdo blockquote "
        "body br button canvas caption cite code col colgroup data datalist "
        "dd del details dfn dialog div dl dt em embed fieldset figcaption "
        "figure footer form h1 h2 h3 h4 h5 h6 head header hgroup hr html i "
        "iframe img input ins kbd keygen label legend li link map mark menu "
        "meta meter nav noscript object ol optgroup option output p param "
        "picture pre progress q rp rt ruby s samp script section select "
        "slot small source span strong style sub summary sup table tbody td "
        "template textarea tfoot th thead time title tr track u ul var "
        "video wbr"
    ).split()
)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
#: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
#: from mozilla on 2019.07.11
#: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
HTML_TAGS_BLOCK_LEVEL = frozenset(
    (
        "address article aside blockquote details dialog dd div dl dt "
        "fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6 header "
        "hgroup hr li main nav ol p pre section table ul"
    ).split()
)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
class InputStreamWithMemory:
    """An HTMLInputStream wrapper that records characters since the last ``<``.

    Everything the tokenizer consumes after the most recent open-tag ``<`` is
    accumulated in an internal buffer so the original, unnormalized text of a
    tag-like construct can be recovered later via :meth:`get_tag`.
    """

    def __init__(self, inner_stream):
        """
        :arg inner_stream: the html5lib ``HTMLInputStream`` to wrap
        """
        self._inner_stream = inner_stream
        # Pure pass-throughs; bind the inner stream's bound methods directly.
        self.reset = self._inner_stream.reset
        self.position = self._inner_stream.position
        # Characters consumed since the last start_tag() call.
        self._tag_chars = []

    @property
    def errors(self):
        # Delegate to the wrapped stream's error list.
        return self._inner_stream.errors

    @property
    def charEncoding(self):
        # Delegate to the wrapped stream's encoding info.
        return self._inner_stream.charEncoding

    @property
    def changeEncoding(self):
        # Expose the wrapped stream's bound changeEncoding callable.
        return self._inner_stream.changeEncoding

    def char(self):
        """Consume one character, remembering it in the tag buffer."""
        next_char = self._inner_stream.char()
        # The inner stream returns None (falsy) at EOF; don't record that.
        if next_char:
            self._tag_chars.append(next_char)
        return next_char

    def charsUntil(self, characters, opposite=False):
        """Consume a run of characters, remembering them in the tag buffer."""
        consumed = self._inner_stream.charsUntil(characters, opposite=opposite)
        self._tag_chars.extend(consumed)
        return consumed

    def unget(self, char):
        """Push a character back and drop it from the tag buffer."""
        if self._tag_chars:
            self._tag_chars.pop()
        return self._inner_stream.unget(char)

    def get_tag(self):
        """Return the stream history since the last ``<``.

        Since the buffer starts at the last ``<`` as seen by
        ``tagOpenState()``, everything from that point to when this method is
        called is the "tag" being tokenized.
        """
        return "".join(self._tag_chars)

    def start_tag(self):
        """Reset the stream history to just ``<``.

        Called by ``tagOpenState()`` whenever a ``<`` marks an open tag; the
        buffer restarts from that character.
        """
        self._tag_chars = ["<"]
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
class BleachHTMLTokenizer(HTMLTokenizer):
    """Tokenizer that doesn't consume character entities"""

    def __init__(self, consume_entities=False, **kwargs):
        """
        :arg consume_entities: whether to consume character entities like the
            stock html5lib tokenizer (True) or leave them untouched in the
            output (False, the default)
        :arg kwargs: forwarded to ``HTMLTokenizer.__init__``
        """
        super().__init__(**kwargs)

        self.consume_entities = consume_entities

        # Wrap the stream with one that remembers the history
        self.stream = InputStreamWithMemory(self.stream)

        # Remember the last token emitted; needed for block element spacing
        self.emitted_last_token = None

    def __iter__(self):
        """Yield tokens, repairing certain html5lib ParseError sequences.

        ParseError tokens are held back one step so the *following* token can
        be inspected; for a few known error cases the pair is rewritten
        (e.g. converted to a Characters token) so the sanitizer escapes the
        text instead of html5lib mangling it.
        """
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag or a pseudo-tag or
                    # something that html5lib wants to turn into a malformed
                    # comment which Bleach clean() will drop so we interfere
                    # with the token stream to handle it more correctly.
                    #
                    # If this is an allowed tag, it's malformed and we just let
                    # the html5lib parser deal with it--we don't enter into this
                    # block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    # No special case applies: release the buffered error token
                    # unchanged, then the current token.
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        # The stream ended while we were still holding a ParseError token.
        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that abruptly ends, but we should treat that like
                # character data
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            elif last_error_token["data"] in (
                "duplicate-attribute",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
                "expected-end-of-tag-but-got-eof",
            ):
                # Handle the case where the text being parsed ends with <
                # followed by characters and then space and then:
                #
                # * more characters
                # * more characters repeated with a space between (e.g. "abc abc")
                # * more characters and then a space and then an EOF (e.g. "abc def ")
                #
                # These cases are treated as a tag name followed by an
                # attribute that abruptly ends, but we should treat that like
                # character data instead.
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            else:
                yield last_error_token

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        """Consume a character entity, or pass it through untouched.

        When ``consume_entities`` is False this is a no-op that re-emits the
        already-consumed ``&`` so entities survive tokenization verbatim.
        """
        # If this tokenizer is set to consume entities, then we can let the
        # superclass do its thing.
        if self.consume_entities:
            return super().consumeEntity(allowedChar, fromAttribute)

        # If this tokenizer is set to not consume entities, then we don't want
        # to consume and convert them, so this overrides the html5lib tokenizer's
        # consumeEntity so that it's now a no-op.
        #
        # However, when that gets called, it's consumed an &, so we put that back in
        # the stream.
        if fromAttribute:
            self.currentToken["data"][-1][1] += "&"

        else:
            self.tokenQueue.append({"type": TAG_TOKEN_TYPE_CHARACTERS, "data": "&"})

    def tagOpenState(self):
        """Enter tag-open state, restarting the stream-history buffer."""
        # This state marks a < that is either a StartTag, EndTag, EmptyTag,
        # or ParseError. In all cases, we want to drop any stream history
        # we've collected so far and we do that by calling start_tag() on
        # the input stream wrapper.
        self.stream.start_tag()
        return super().tagOpenState()

    def emitCurrentToken(self):
        """Emit the current token, stripping or escaping disallowed tags.

        Tags not in ``self.parser.tags`` are converted to a Characters token:
        an empty string or newline when stripping, or the original raw tag
        text (from the stream buffer) when escaping.
        """
        token = self.currentToken

        if (
            self.parser.tags is not None
            and token["type"] in TAG_TOKEN_TYPES
            and token["name"].lower() not in self.parser.tags
        ):
            # If this is a start/end/empty tag for a tag that's not in our
            # allowed list, then it gets stripped or escaped. In both of these
            # cases it gets converted to a Characters token.
            if self.parser.strip:
                if (
                    self.emitted_last_token
                    and token["type"] == TAG_TOKEN_TYPE_START
                    and token["name"].lower() in HTML_TAGS_BLOCK_LEVEL
                ):
                    # If this is a block level tag we're stripping, we drop it
                    # for a newline because that's what a browser would parse
                    # it as
                    new_data = "\n"
                else:
                    # For all other things being stripped, we throw in an empty
                    # string token
                    new_data = ""

            else:
                # If we're escaping the token, we want to escape the exact
                # original string. Since tokenizing also normalizes data
                # and this is a tag-like thing, we've lost some information.
                # So we go back through the stream to get the original
                # string and use that.
                new_data = self.stream.get_tag()

            new_token = {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": new_data}

            self.currentToken = self.emitted_last_token = new_token
            self.tokenQueue.append(new_token)
            self.state = self.dataState
            return

        self.emitted_last_token = self.currentToken
        super().emitCurrentToken()
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
class BleachHTMLParser(HTMLParser):
    """Parser that uses BleachHTMLTokenizer"""

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
        :arg consume_entities: whether to consume entities (default behavior) or
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
        # Normalize the allowed tag names to lowercase once up front; None
        # means "don't filter on tag names at all".
        if tags is None:
            self.tags = None
        else:
            self.tags = frozenset(tag.lower() for tag in tags)
        self.consume_entities = consume_entities
        self.strip = strip
        super().__init__(**kwargs)

    def _parse(
        self, stream, innerHTML=False, container="div", scripting=True, **kwargs
    ):
        # set scripting=True to parse <noscript> as though JS is enabled to
        # match the expected context in browsers
        #
        # https://html.spec.whatwg.org/multipage/scripting.html#the-noscript-element
        #
        # Override HTMLParser so we can swap out the tokenizer for our own.
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = BleachHTMLTokenizer(
            stream=stream, consume_entities=self.consume_entities, parser=self, **kwargs
        )
        self.reset()

        # A ReparseException signals that the parse must restart from a clean
        # state; reset and run the main loop one more time.
        try:
            self.mainLoop()
        except ReparseException:
            self.reset()
            self.mainLoop()
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def convert_entity(value):
    """Convert an entity (minus the & and ; part) into what it represents

    This handles numeric, hex, and text entities.

    :arg value: the string (minus the ``&`` and ``;`` part) to convert

    :returns: unicode character or None if it's an ambiguous ampersand that
        doesn't match a character entity

    """
    if value[0] == "#":
        # Numeric entity: "#NNN" (decimal) or "#xNNN" / "#XNNN" (hex)
        if len(value) < 2:
            return None

        if value[1] in ("x", "X"):
            # hex-encoded code point
            int_as_string, base = value[2:], 16
        else:
            # decimal code point
            int_as_string, base = value[1:], 10

        if int_as_string == "":
            return None

        try:
            code_point = int(int_as_string, base)
        except ValueError:
            # Malformed digits for this base (e.g. "#q")--treat it like any
            # other non-matching entity instead of raising, per the
            # documented contract of returning None
            return None

        # Reject NUL (0) and anything beyond the Unicode code-point range
        if 0 < code_point < 0x110000:
            return chr(code_point)
        else:
            return None

    return ENTITIES.get(value, None)
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
def convert_entities(text):
    """Converts all found entities in the text

    :arg text: the text to convert entities in

    :returns: unicode text with converted entities

    """
    # Fast path: no ampersand means there can be no entities
    if "&" not in text:
        return text

    new_text = []
    for part in next_possible_entity(text):
        if not part:
            continue

        if part.startswith("&"):
            entity = match_entity(part)
            if entity is not None:
                converted = convert_entity(entity)

                # If it's not an ambiguous ampersand, then replace with the
                # unicode character. Otherwise, we leave the entity in.
                if converted is not None:
                    new_text.append(converted)
                    # Skip past "&" + entity + ";" and keep whatever follows
                    remainder = part[len(entity) + 2 :]
                    # FIX: the original tested ``if part:`` which is always
                    # truthy here (empty parts were skipped above); test the
                    # remainder so we don't append empty strings
                    if remainder:
                        new_text.append(remainder)
                    continue

        new_text.append(part)

    return "".join(new_text)
|
|
603
|
+
|
|
604
|
+
|
|
605
|
+
def match_entity(stream):
    """Returns first entity in stream or None if no entity exists

    Note: For Bleach purposes, entities must start with a "&" and end with a
    ";". This ignores ambiguous character entities that have no ";" at the end.

    :arg stream: the character stream

    :returns: the entity string without "&" or ";" if it's a valid character
        entity; ``None`` otherwise

    """
    if stream[0] != "&":
        raise ValueError('Stream should begin with "&"')

    # Drop the leading "&" and work on a mutable list of characters
    chars = list(stream[1:])
    candidate = ""
    terminators = "<&=;" + string.whitespace

    # Numeric entities: "#digits" or "#x"/"#X" followed by hex digits
    if chars and chars[0] == "#":
        candidate = "#"
        chars.pop(0)

        if chars and chars[0] in ("x", "X"):
            digits = "0123456789abcdefABCDEF"
            candidate += chars.pop(0)
        else:
            digits = "0123456789"

        # FIXME(willkg): Do we want to make sure these are valid number
        # entities? This doesn't do that currently.
        while chars and chars[0] not in terminators:
            ch = chars.pop(0)
            if ch not in digits:
                break
            candidate += ch

        # Valid only if something was collected and a ";" terminates it
        if candidate and chars and chars[0] == ";":
            return candidate
        return None

    # Named character entities: extend while still a prefix of a known entity
    while chars and chars[0] not in terminators:
        candidate += chars.pop(0)
        if not ENTITIES_TRIE.has_keys_with_prefix(candidate):
            # No known entity starts this way, so this can't be an entity
            return None

    if candidate and chars and chars[0] == ";":
        return candidate

    return None
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
# Splits on "&"; the capturing group makes re.split keep the "&" delimiters
# in the result list.
AMP_SPLIT_RE = re.compile("(&)")


def next_possible_entity(text):
    """Takes a text and generates a list of possible entities

    :arg text: the text to look at

    :returns: generator where each part (except the first) starts with an
        "&"

    """
    pieces = AMP_SPLIT_RE.split(text)
    # Everything before the first "&" passes through untouched (possibly "")
    yield pieces[0]
    # The split alternates text / "&" / text / "&" / ...; re-attach each "&"
    # to the text chunk that follows it
    for chunk in pieces[2::2]:
        yield "&" + chunk
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
class BleachHTMLSerializer(HTMLSerializer):
    """HTMLSerializer that undoes & -> &amp; in attributes and sets
    escape_rcdata to True
    """

    # per the HTMLSerializer.__init__ docstring:
    #
    # Whether to escape characters that need to be
    # escaped within normal elements within rcdata elements such as
    # style.
    #
    escape_rcdata = True

    def escape_base_amp(self, stoken):
        """Escapes just bare & in HTML attribute values"""
        # First, undo escaping of &. We need to do this because html5lib's
        # HTMLSerializer expected the tokenizer to consume all the character
        # entities and convert them to their respective characters, but the
        # BleachHTMLTokenizer doesn't do that. For example, this fixes
        # &amp;entity; back to &entity; .
        #
        # FIX: the source had the no-op ``stoken.replace("&", "&")``--the
        # "&amp;" literal was mangled by entity decoding; restore it.
        stoken = stoken.replace("&amp;", "&")

        # However, we do want all bare & that are not marking character
        # entities to be changed to &amp;, so let's do that carefully here.
        for part in next_possible_entity(stoken):
            if not part:
                continue

            if part.startswith("&"):
                entity = match_entity(part)
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
                    part = part[len(entity) + 2 :]
                    if part:
                        yield part
                    continue

            # FIX: restore the mangled "&amp;" literal here as well
            yield part.replace("&", "&amp;")

    def serialize(self, treewalker, encoding=None):
        """Wrap HTMLSerializer.serialize and convert & to &amp; in attribute values

        Note that this converts & to &amp; in attribute values where the & isn't
        already part of an unambiguous character entity.

        """
        # Small state machine over the serialized token stream: we only
        # rewrite ampersands in unquoted attribute values (tokens that follow
        # an "=" inside a tag).
        in_tag = False
        after_equals = False

        for stoken in super().serialize(treewalker, encoding):
            if in_tag:
                if stoken == ">":
                    in_tag = False

                elif after_equals:
                    if stoken != '"':
                        yield from self.escape_base_amp(stoken)

                        after_equals = False
                        continue

                elif stoken == "=":
                    after_equals = True

                yield stoken
            else:
                if stoken.startswith("<"):
                    in_tag = True
                yield stoken
|