tensorbored-2.21.0rc1769983804-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tensorbored/__init__.py +112 -0
- tensorbored/_vendor/__init__.py +0 -0
- tensorbored/_vendor/bleach/__init__.py +125 -0
- tensorbored/_vendor/bleach/_vendor/__init__.py +0 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/__init__.py +35 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_ihatexml.py +289 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_inputstream.py +918 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_tokenizer.py +1735 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_trie/__init__.py +5 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_trie/_base.py +40 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_trie/py.py +67 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/_utils.py +159 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/constants.py +2946 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/__init__.py +0 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/alphabeticalattributes.py +29 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/base.py +12 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/inject_meta_charset.py +73 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/lint.py +93 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/optionaltags.py +207 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/sanitizer.py +916 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/filters/whitespace.py +38 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/html5parser.py +2795 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/serializer.py +409 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treeadapters/__init__.py +30 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treeadapters/genshi.py +54 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treeadapters/sax.py +50 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/__init__.py +88 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/base.py +417 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/dom.py +239 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/etree.py +343 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treebuilders/etree_lxml.py +392 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/__init__.py +154 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/base.py +252 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/dom.py +43 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/etree.py +131 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/etree_lxml.py +215 -0
- tensorbored/_vendor/bleach/_vendor/html5lib/treewalkers/genshi.py +69 -0
- tensorbored/_vendor/bleach/_vendor/parse.py +1078 -0
- tensorbored/_vendor/bleach/callbacks.py +32 -0
- tensorbored/_vendor/bleach/html5lib_shim.py +757 -0
- tensorbored/_vendor/bleach/linkifier.py +633 -0
- tensorbored/_vendor/bleach/parse_shim.py +1 -0
- tensorbored/_vendor/bleach/sanitizer.py +638 -0
- tensorbored/_vendor/bleach/six_shim.py +19 -0
- tensorbored/_vendor/webencodings/__init__.py +342 -0
- tensorbored/_vendor/webencodings/labels.py +231 -0
- tensorbored/_vendor/webencodings/mklabels.py +59 -0
- tensorbored/_vendor/webencodings/x_user_defined.py +325 -0
- tensorbored/assets.py +36 -0
- tensorbored/auth.py +102 -0
- tensorbored/backend/__init__.py +0 -0
- tensorbored/backend/application.py +604 -0
- tensorbored/backend/auth_context_middleware.py +38 -0
- tensorbored/backend/client_feature_flags.py +113 -0
- tensorbored/backend/empty_path_redirect.py +46 -0
- tensorbored/backend/event_processing/__init__.py +0 -0
- tensorbored/backend/event_processing/data_ingester.py +276 -0
- tensorbored/backend/event_processing/data_provider.py +535 -0
- tensorbored/backend/event_processing/directory_loader.py +142 -0
- tensorbored/backend/event_processing/directory_watcher.py +272 -0
- tensorbored/backend/event_processing/event_accumulator.py +950 -0
- tensorbored/backend/event_processing/event_file_inspector.py +463 -0
- tensorbored/backend/event_processing/event_file_loader.py +292 -0
- tensorbored/backend/event_processing/event_multiplexer.py +521 -0
- tensorbored/backend/event_processing/event_util.py +68 -0
- tensorbored/backend/event_processing/io_wrapper.py +223 -0
- tensorbored/backend/event_processing/plugin_asset_util.py +104 -0
- tensorbored/backend/event_processing/plugin_event_accumulator.py +721 -0
- tensorbored/backend/event_processing/plugin_event_multiplexer.py +522 -0
- tensorbored/backend/event_processing/reservoir.py +266 -0
- tensorbored/backend/event_processing/tag_types.py +29 -0
- tensorbored/backend/experiment_id.py +71 -0
- tensorbored/backend/experimental_plugin.py +51 -0
- tensorbored/backend/http_util.py +263 -0
- tensorbored/backend/json_util.py +70 -0
- tensorbored/backend/path_prefix.py +67 -0
- tensorbored/backend/process_graph.py +74 -0
- tensorbored/backend/security_validator.py +202 -0
- tensorbored/compat/__init__.py +69 -0
- tensorbored/compat/proto/__init__.py +0 -0
- tensorbored/compat/proto/allocation_description_pb2.py +35 -0
- tensorbored/compat/proto/api_def_pb2.py +82 -0
- tensorbored/compat/proto/attr_value_pb2.py +80 -0
- tensorbored/compat/proto/cluster_pb2.py +58 -0
- tensorbored/compat/proto/config_pb2.py +271 -0
- tensorbored/compat/proto/coordination_config_pb2.py +45 -0
- tensorbored/compat/proto/cost_graph_pb2.py +87 -0
- tensorbored/compat/proto/cpp_shape_inference_pb2.py +70 -0
- tensorbored/compat/proto/debug_pb2.py +65 -0
- tensorbored/compat/proto/event_pb2.py +149 -0
- tensorbored/compat/proto/full_type_pb2.py +74 -0
- tensorbored/compat/proto/function_pb2.py +157 -0
- tensorbored/compat/proto/graph_debug_info_pb2.py +111 -0
- tensorbored/compat/proto/graph_pb2.py +41 -0
- tensorbored/compat/proto/histogram_pb2.py +39 -0
- tensorbored/compat/proto/meta_graph_pb2.py +254 -0
- tensorbored/compat/proto/node_def_pb2.py +61 -0
- tensorbored/compat/proto/op_def_pb2.py +81 -0
- tensorbored/compat/proto/resource_handle_pb2.py +48 -0
- tensorbored/compat/proto/rewriter_config_pb2.py +93 -0
- tensorbored/compat/proto/rpc_options_pb2.py +35 -0
- tensorbored/compat/proto/saved_object_graph_pb2.py +193 -0
- tensorbored/compat/proto/saver_pb2.py +38 -0
- tensorbored/compat/proto/step_stats_pb2.py +116 -0
- tensorbored/compat/proto/struct_pb2.py +144 -0
- tensorbored/compat/proto/summary_pb2.py +111 -0
- tensorbored/compat/proto/tensor_description_pb2.py +38 -0
- tensorbored/compat/proto/tensor_pb2.py +68 -0
- tensorbored/compat/proto/tensor_shape_pb2.py +46 -0
- tensorbored/compat/proto/tfprof_log_pb2.py +307 -0
- tensorbored/compat/proto/trackable_object_graph_pb2.py +90 -0
- tensorbored/compat/proto/types_pb2.py +105 -0
- tensorbored/compat/proto/variable_pb2.py +62 -0
- tensorbored/compat/proto/verifier_config_pb2.py +38 -0
- tensorbored/compat/proto/versions_pb2.py +35 -0
- tensorbored/compat/tensorflow_stub/__init__.py +38 -0
- tensorbored/compat/tensorflow_stub/app.py +124 -0
- tensorbored/compat/tensorflow_stub/compat/__init__.py +131 -0
- tensorbored/compat/tensorflow_stub/compat/v1/__init__.py +20 -0
- tensorbored/compat/tensorflow_stub/dtypes.py +692 -0
- tensorbored/compat/tensorflow_stub/error_codes.py +169 -0
- tensorbored/compat/tensorflow_stub/errors.py +507 -0
- tensorbored/compat/tensorflow_stub/flags.py +124 -0
- tensorbored/compat/tensorflow_stub/io/__init__.py +17 -0
- tensorbored/compat/tensorflow_stub/io/gfile.py +1011 -0
- tensorbored/compat/tensorflow_stub/pywrap_tensorflow.py +285 -0
- tensorbored/compat/tensorflow_stub/tensor_shape.py +1035 -0
- tensorbored/context.py +129 -0
- tensorbored/data/__init__.py +0 -0
- tensorbored/data/grpc_provider.py +365 -0
- tensorbored/data/ingester.py +46 -0
- tensorbored/data/proto/__init__.py +0 -0
- tensorbored/data/proto/data_provider_pb2.py +517 -0
- tensorbored/data/proto/data_provider_pb2_grpc.py +374 -0
- tensorbored/data/provider.py +1365 -0
- tensorbored/data/server_ingester.py +301 -0
- tensorbored/data_compat.py +159 -0
- tensorbored/dataclass_compat.py +224 -0
- tensorbored/default.py +124 -0
- tensorbored/errors.py +130 -0
- tensorbored/lazy.py +99 -0
- tensorbored/main.py +48 -0
- tensorbored/main_lib.py +62 -0
- tensorbored/manager.py +487 -0
- tensorbored/notebook.py +441 -0
- tensorbored/plugin_util.py +266 -0
- tensorbored/plugins/__init__.py +0 -0
- tensorbored/plugins/audio/__init__.py +0 -0
- tensorbored/plugins/audio/audio_plugin.py +229 -0
- tensorbored/plugins/audio/metadata.py +69 -0
- tensorbored/plugins/audio/plugin_data_pb2.py +37 -0
- tensorbored/plugins/audio/summary.py +230 -0
- tensorbored/plugins/audio/summary_v2.py +124 -0
- tensorbored/plugins/base_plugin.py +367 -0
- tensorbored/plugins/core/__init__.py +0 -0
- tensorbored/plugins/core/core_plugin.py +981 -0
- tensorbored/plugins/custom_scalar/__init__.py +0 -0
- tensorbored/plugins/custom_scalar/custom_scalars_plugin.py +320 -0
- tensorbored/plugins/custom_scalar/layout_pb2.py +85 -0
- tensorbored/plugins/custom_scalar/metadata.py +35 -0
- tensorbored/plugins/custom_scalar/summary.py +79 -0
- tensorbored/plugins/debugger_v2/__init__.py +0 -0
- tensorbored/plugins/debugger_v2/debug_data_multiplexer.py +631 -0
- tensorbored/plugins/debugger_v2/debug_data_provider.py +634 -0
- tensorbored/plugins/debugger_v2/debugger_v2_plugin.py +504 -0
- tensorbored/plugins/distribution/__init__.py +0 -0
- tensorbored/plugins/distribution/compressor.py +158 -0
- tensorbored/plugins/distribution/distributions_plugin.py +116 -0
- tensorbored/plugins/distribution/metadata.py +19 -0
- tensorbored/plugins/graph/__init__.py +0 -0
- tensorbored/plugins/graph/graph_util.py +129 -0
- tensorbored/plugins/graph/graphs_plugin.py +336 -0
- tensorbored/plugins/graph/keras_util.py +328 -0
- tensorbored/plugins/graph/metadata.py +42 -0
- tensorbored/plugins/histogram/__init__.py +0 -0
- tensorbored/plugins/histogram/histograms_plugin.py +144 -0
- tensorbored/plugins/histogram/metadata.py +63 -0
- tensorbored/plugins/histogram/plugin_data_pb2.py +34 -0
- tensorbored/plugins/histogram/summary.py +234 -0
- tensorbored/plugins/histogram/summary_v2.py +292 -0
- tensorbored/plugins/hparams/__init__.py +14 -0
- tensorbored/plugins/hparams/_keras.py +93 -0
- tensorbored/plugins/hparams/api.py +130 -0
- tensorbored/plugins/hparams/api_pb2.py +208 -0
- tensorbored/plugins/hparams/backend_context.py +606 -0
- tensorbored/plugins/hparams/download_data.py +158 -0
- tensorbored/plugins/hparams/error.py +26 -0
- tensorbored/plugins/hparams/get_experiment.py +71 -0
- tensorbored/plugins/hparams/hparams_plugin.py +206 -0
- tensorbored/plugins/hparams/hparams_util_pb2.py +69 -0
- tensorbored/plugins/hparams/json_format_compat.py +38 -0
- tensorbored/plugins/hparams/list_metric_evals.py +57 -0
- tensorbored/plugins/hparams/list_session_groups.py +1040 -0
- tensorbored/plugins/hparams/metadata.py +125 -0
- tensorbored/plugins/hparams/metrics.py +41 -0
- tensorbored/plugins/hparams/plugin_data_pb2.py +69 -0
- tensorbored/plugins/hparams/summary.py +205 -0
- tensorbored/plugins/hparams/summary_v2.py +597 -0
- tensorbored/plugins/image/__init__.py +0 -0
- tensorbored/plugins/image/images_plugin.py +232 -0
- tensorbored/plugins/image/metadata.py +65 -0
- tensorbored/plugins/image/plugin_data_pb2.py +34 -0
- tensorbored/plugins/image/summary.py +159 -0
- tensorbored/plugins/image/summary_v2.py +130 -0
- tensorbored/plugins/mesh/__init__.py +14 -0
- tensorbored/plugins/mesh/mesh_plugin.py +292 -0
- tensorbored/plugins/mesh/metadata.py +152 -0
- tensorbored/plugins/mesh/plugin_data_pb2.py +37 -0
- tensorbored/plugins/mesh/summary.py +251 -0
- tensorbored/plugins/mesh/summary_v2.py +214 -0
- tensorbored/plugins/metrics/__init__.py +0 -0
- tensorbored/plugins/metrics/metadata.py +17 -0
- tensorbored/plugins/metrics/metrics_plugin.py +623 -0
- tensorbored/plugins/pr_curve/__init__.py +0 -0
- tensorbored/plugins/pr_curve/metadata.py +75 -0
- tensorbored/plugins/pr_curve/plugin_data_pb2.py +34 -0
- tensorbored/plugins/pr_curve/pr_curves_plugin.py +241 -0
- tensorbored/plugins/pr_curve/summary.py +574 -0
- tensorbored/plugins/profile_redirect/__init__.py +0 -0
- tensorbored/plugins/profile_redirect/profile_redirect_plugin.py +49 -0
- tensorbored/plugins/projector/__init__.py +67 -0
- tensorbored/plugins/projector/metadata.py +26 -0
- tensorbored/plugins/projector/projector_config_pb2.py +54 -0
- tensorbored/plugins/projector/projector_plugin.py +795 -0
- tensorbored/plugins/projector/tf_projector_plugin/index.js +32 -0
- tensorbored/plugins/projector/tf_projector_plugin/projector_binary.html +524 -0
- tensorbored/plugins/projector/tf_projector_plugin/projector_binary.js +15536 -0
- tensorbored/plugins/scalar/__init__.py +0 -0
- tensorbored/plugins/scalar/metadata.py +60 -0
- tensorbored/plugins/scalar/plugin_data_pb2.py +34 -0
- tensorbored/plugins/scalar/scalars_plugin.py +181 -0
- tensorbored/plugins/scalar/summary.py +109 -0
- tensorbored/plugins/scalar/summary_v2.py +124 -0
- tensorbored/plugins/text/__init__.py +0 -0
- tensorbored/plugins/text/metadata.py +62 -0
- tensorbored/plugins/text/plugin_data_pb2.py +34 -0
- tensorbored/plugins/text/summary.py +114 -0
- tensorbored/plugins/text/summary_v2.py +124 -0
- tensorbored/plugins/text/text_plugin.py +288 -0
- tensorbored/plugins/wit_redirect/__init__.py +0 -0
- tensorbored/plugins/wit_redirect/wit_redirect_plugin.py +49 -0
- tensorbored/program.py +910 -0
- tensorbored/summary/__init__.py +35 -0
- tensorbored/summary/_output.py +124 -0
- tensorbored/summary/_tf/__init__.py +14 -0
- tensorbored/summary/_tf/summary/__init__.py +178 -0
- tensorbored/summary/_writer.py +105 -0
- tensorbored/summary/v1.py +51 -0
- tensorbored/summary/v2.py +25 -0
- tensorbored/summary/writer/__init__.py +13 -0
- tensorbored/summary/writer/event_file_writer.py +291 -0
- tensorbored/summary/writer/record_writer.py +50 -0
- tensorbored/util/__init__.py +0 -0
- tensorbored/util/encoder.py +116 -0
- tensorbored/util/grpc_util.py +311 -0
- tensorbored/util/img_mime_type_detector.py +40 -0
- tensorbored/util/io_util.py +20 -0
- tensorbored/util/lazy_tensor_creator.py +110 -0
- tensorbored/util/op_evaluator.py +104 -0
- tensorbored/util/platform_util.py +20 -0
- tensorbored/util/tb_logging.py +24 -0
- tensorbored/util/tensor_util.py +617 -0
- tensorbored/util/timing.py +122 -0
- tensorbored/version.py +21 -0
- tensorbored/webfiles.zip +0 -0
- tensorbored-2.21.0rc1769983804.dist-info/METADATA +49 -0
- tensorbored-2.21.0rc1769983804.dist-info/RECORD +271 -0
- tensorbored-2.21.0rc1769983804.dist-info/WHEEL +5 -0
- tensorbored-2.21.0rc1769983804.dist-info/entry_points.txt +6 -0
- tensorbored-2.21.0rc1769983804.dist-info/licenses/LICENSE +739 -0
- tensorbored-2.21.0rc1769983804.dist-info/top_level.txt +1 -0
tensorbored/_vendor/bleach/linkifier.py
@@ -0,0 +1,633 @@
+import re
+
+from urllib.parse import quote
+
+from tensorbored._vendor.bleach import callbacks as linkify_callbacks
+from tensorbored._vendor.bleach import html5lib_shim
+
+
+#: List of default callbacks
+DEFAULT_CALLBACKS = [linkify_callbacks.nofollow]
+
+
+TLDS = """ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az
+       ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat
+       cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk
+       dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg
+       gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il
+       im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp
+       kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk
+       ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne
+       net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn post
+       pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl
+       sm sn so sr ss st su sv sx sy sz tc td tel tf tg th tj tk tl tm tn to
+       tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws
+       xn xxx ye yt yu za zm zw""".split()
+
+# Make sure that .com doesn't get matched by .co first
+TLDS.reverse()
+
+
+def build_url_re(tlds=TLDS, protocols=html5lib_shim.allowed_protocols):
+    """Builds the url regex used by linkifier
+
+    If you want a different set of tlds or allowed protocols, pass those in
+    and stomp on the existing ``url_re``::
+
+        from bleach import linkifier
+
+        my_url_re = linkifier.build_url_re(my_tlds_list, my_protocols)
+
+        linker = LinkifyFilter(url_re=my_url_re)
+
+    """
+    return re.compile(
+        r"""\(*  # Match any opening parentheses.
+    \b(?<![@.])(?:(?:{0}):/{{0,3}}(?:(?:\w+:)?\w+@)?)?  # http://
+    ([\w-]+\.)+(?:{1})(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
+    (?:[/?][^\s\{{\}}\|\\\^`<>"]*)?
+        # /path/zz (excluding "unsafe" chars from RFC 3986,
+        # except for # and ~, which happen in practice)
+    """.format(
+            "|".join(sorted(protocols)), "|".join(sorted(tlds))
+        ),
+        re.IGNORECASE | re.VERBOSE | re.UNICODE,
+    )
+
+
+URL_RE = build_url_re()
+
+
+PROTO_RE = re.compile(r"^[\w-]+:/{0,3}", re.IGNORECASE)
+
+
+def build_email_re(tlds=TLDS):
+    """Builds the email regex used by linkifier
+
+    If you want a different set of tlds, pass those in and stomp on the existing ``email_re``::
+
+        from bleach import linkifier
+
+        my_email_re = linkifier.build_email_re(my_tlds_list)
+
+        linker = LinkifyFilter(email_re=my_email_re)
+
+    """
+    # open and closing braces doubled below for format string
+    return re.compile(
+        r"""(?<!//)
+        (([-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+
+            (\.[-!#$%&'*+/=?^_`{{}}|~0-9A-Z]+)*  # dot-atom
+        |^"([\001-\010\013\014\016-\037!#-\[\]-\177]
+            |\\[\001-\011\013\014\016-\177])*"  # quoted-string
+        )@(?:[A-Z0-9](?:[A-Z0-9-]{{0,61}}[A-Z0-9])?\.)+(?:{0}))  # domain
+        """.format(
+            "|".join(tlds)
+        ),
+        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
+    )
+
+
+EMAIL_RE = build_email_re()
+
+
+class Linker:
+    """Convert URL-like strings in an HTML fragment to links
+
+    This function converts strings that look like URLs, domain names and email
+    addresses in text that may be an HTML fragment to links, while preserving:
+
+    1. links already in the string
+    2. urls found in attributes
+    3. email addresses
+
+    linkify does a best-effort approach and tries to recover from bad
+    situations due to crazy text.
+
+    """
+
+    def __init__(
+        self,
+        callbacks=DEFAULT_CALLBACKS,
+        skip_tags=None,
+        parse_email=False,
+        url_re=URL_RE,
+        email_re=EMAIL_RE,
+        recognized_tags=html5lib_shim.HTML_TAGS,
+    ):
+        """Creates a Linker instance
+
+        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
+            linkifying contents of ``pre`` tags; ``None`` means you don't
+            want linkify to skip any tags
+
+        :arg bool parse_email: whether or not to linkify email addresses
+
+        :arg url_re: url matching regex
+
+        :arg email_re: email matching regex
+
+        :arg set recognized_tags: the set of tags that linkify knows about;
+            everything else gets escaped
+
+        :returns: linkified text as unicode
+
+        """
+        self.callbacks = callbacks
+        self.skip_tags = skip_tags
+        self.parse_email = parse_email
+        self.url_re = url_re
+        self.email_re = email_re
+
+        # Create a parser/tokenizer that allows all HTML tags and escapes
+        # anything not in that list.
+        self.parser = html5lib_shim.BleachHTMLParser(
+            tags=frozenset(recognized_tags),
+            strip=False,
+            consume_entities=False,
+            namespaceHTMLElements=False,
+        )
+        self.walker = html5lib_shim.getTreeWalker("etree")
+        self.serializer = html5lib_shim.BleachHTMLSerializer(
+            quote_attr_values="always",
+            omit_optional_tags=False,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
+            # linkify does not sanitize
+            sanitize=False,
+            # linkify preserves attr order
+            alphabetical_attributes=False,
+        )
+
+    def linkify(self, text):
+        """Linkify specified text
+
+        :arg str text: the text to add links to
+
+        :returns: linkified text as unicode
+
+        :raises TypeError: if ``text`` is not a text type
+
+        """
+        if not isinstance(text, str):
+            raise TypeError("argument must be of text type")
+
+        if not text:
+            return ""
+
+        dom = self.parser.parseFragment(text)
+        filtered = LinkifyFilter(
+            source=self.walker(dom),
+            callbacks=self.callbacks,
+            skip_tags=self.skip_tags,
+            parse_email=self.parse_email,
+            url_re=self.url_re,
+            email_re=self.email_re,
+        )
+        return self.serializer.render(filtered)
+
+
+class LinkifyFilter(html5lib_shim.Filter):
+    """html5lib filter that linkifies text
+
+    This will do the following:
+
+    * convert email addresses into links
+    * convert urls into links
+    * edit existing links by running them through callbacks--the default is to
+      add a ``rel="nofollow"``
+
+    This filter can be used anywhere html5lib filters can be used.
+
+    """
+
+    def __init__(
+        self,
+        source,
+        callbacks=DEFAULT_CALLBACKS,
+        skip_tags=None,
+        parse_email=False,
+        url_re=URL_RE,
+        email_re=EMAIL_RE,
+    ):
+        """Creates a LinkifyFilter instance
+
+        :arg source: stream as an html5lib TreeWalker
+
+        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
+            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``
+
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
+            linkifying contents of ``pre`` tags
+
+        :arg bool parse_email: whether or not to linkify email addresses
+
+        :arg url_re: url matching regex
+
+        :arg email_re: email matching regex
+
+        """
+        super().__init__(source)
+
+        self.callbacks = callbacks or []
+        self.skip_tags = skip_tags or {}
+        self.parse_email = parse_email
+
+        self.url_re = url_re
+        self.email_re = email_re
+
+    def apply_callbacks(self, attrs, is_new):
+        """Given an attrs dict and an is_new bool, runs through callbacks
+
+        Callbacks can return an adjusted attrs dict or ``None``. In the case of
+        ``None``, we stop going through callbacks and return that and the link
+        gets dropped.
+
+        :arg dict attrs: map of ``(namespace, name)`` -> ``value``
+
+        :arg bool is_new: whether or not this link was added by linkify
+
+        :returns: adjusted attrs dict or ``None``
+
+        """
+        for cb in self.callbacks:
+            attrs = cb(attrs, is_new)
+            if attrs is None:
+                return None
+        return attrs
+
+    def extract_character_data(self, token_list):
+        """Extracts and squashes character sequences in a token stream"""
+        # FIXME(willkg): This is a terrible idea. What it does is drop all the
+        # tags from the token list and merge the Characters and SpaceCharacters
+        # tokens into a single text.
+        #
+        # So something like this::
+        #
+        #     "<span>" "<b>" "some text" "</b>" "</span>"
+        #
+        # gets converted to "some text".
+        #
+        # This gets used to figure out the ``_text`` fauxttribute value for
+        # linkify callables.
+        #
+        # I'm not really sure how else to support that ``_text`` fauxttribute and
+        # maintain some modicum of backwards compatibility with previous versions
+        # of Bleach.
+
+        out = []
+        for token in token_list:
+            token_type = token["type"]
+            if token_type in ["Characters", "SpaceCharacters"]:
+                out.append(token["data"])
+
+        return "".join(out)
+
+    def handle_email_addresses(self, src_iter):
+        """Handle email addresses in character tokens"""
+        for token in src_iter:
+            if token["type"] == "Characters":
+                text = token["data"]
+                new_tokens = []
+                end = 0
+
+                # For each email address we find in the text
+                for match in self.email_re.finditer(text):
+                    if match.start() > end:
+                        new_tokens.append(
+                            {"type": "Characters", "data": text[end : match.start()]}
+                        )
+
+                    # URL-encode the "local-part" according to RFC6068
+                    parts = match.group(0).split("@")
+                    parts[0] = quote(parts[0])
+                    address = "@".join(parts)
+
+                    # Run attributes through the callbacks to see what we
+                    # should do with this match
+                    attrs = {
+                        (None, "href"): "mailto:%s" % address,
+                        "_text": match.group(0),
+                    }
+                    attrs = self.apply_callbacks(attrs, True)
+
+                    if attrs is None:
+                        # Just add the text--but not as a link
+                        new_tokens.append(
+                            {"type": "Characters", "data": match.group(0)}
+                        )
+
+                    else:
+                        # Add an "a" tag for the new link
+                        _text = attrs.pop("_text", "")
+                        new_tokens.extend(
+                            [
+                                {"type": "StartTag", "name": "a", "data": attrs},
+                                {"type": "Characters", "data": str(_text)},
+                                {"type": "EndTag", "name": "a"},
+                            ]
+                        )
+                    end = match.end()
+
+                if new_tokens:
+                    # Yield the adjusted set of tokens and then continue
+                    # through the loop
+                    if end < len(text):
+                        new_tokens.append({"type": "Characters", "data": text[end:]})
+
+                    yield from new_tokens
+
+                    continue
+
+            yield token
+
+    def strip_non_url_bits(self, fragment):
+        """Strips non-url bits from the url
+
+        This accounts for over-eager matching by the regex.
+
+        """
+        prefix = suffix = ""
+
+        while fragment:
+            # Try removing ( from the beginning and, if it's balanced, from the
+            # end, too
+            if fragment.startswith("("):
+                prefix = prefix + "("
+                fragment = fragment[1:]
+
+                if fragment.endswith(")"):
+                    suffix = ")" + suffix
+                    fragment = fragment[:-1]
+                continue
+
+            # Now try extraneous things from the end. For example, sometimes we
+            # pick up ) at the end of a url, but the url is in a parenthesized
+            # phrase like:
+            #
+            #     "i looked at the site (at http://example.com)"
+
+            if fragment.endswith(")") and "(" not in fragment:
+                fragment = fragment[:-1]
+                suffix = ")" + suffix
+                continue
+
+            # Handle commas
+            if fragment.endswith(","):
+                fragment = fragment[:-1]
+                suffix = "," + suffix
+                continue
+
+            # Handle periods
+            if fragment.endswith("."):
+                fragment = fragment[:-1]
+                suffix = "." + suffix
+                continue
+
+            # Nothing matched, so we're done
+            break
+
+        return fragment, prefix, suffix
+
+    def handle_links(self, src_iter):
+        """Handle links in character tokens"""
+        in_a = False  # happens, if parse_email=True and if a mail was found
+        for token in src_iter:
+            if in_a:
+                if token["type"] == "EndTag" and token["name"] == "a":
+                    in_a = False
+                yield token
+                continue
+            elif token["type"] == "StartTag" and token["name"] == "a":
+                in_a = True
+                yield token
+                continue
+            if token["type"] == "Characters":
+                text = token["data"]
+                new_tokens = []
+                end = 0
+
+                for match in self.url_re.finditer(text):
+                    if match.start() > end:
+                        new_tokens.append(
+                            {"type": "Characters", "data": text[end : match.start()]}
+                        )
+
+                    url = match.group(0)
+                    prefix = suffix = ""
+
+                    # Sometimes we pick up too much in the url match, so look for
+                    # bits we should drop and remove them from the match
+                    url, prefix, suffix = self.strip_non_url_bits(url)
+
+                    # If there's no protocol, add one
+                    if PROTO_RE.search(url):
+                        href = url
+                    else:
+                        href = "http://%s" % url
+
+                    attrs = {(None, "href"): href, "_text": url}
+                    attrs = self.apply_callbacks(attrs, True)
+
+                    if attrs is None:
+                        # Just add the text
+                        new_tokens.append(
+                            {"type": "Characters", "data": prefix + url + suffix}
+                        )
+
+                    else:
+                        # Add the "a" tag!
+                        if prefix:
+                            new_tokens.append({"type": "Characters", "data": prefix})
+
+                        _text = attrs.pop("_text", "")
+                        new_tokens.extend(
+                            [
+                                {"type": "StartTag", "name": "a", "data": attrs},
+                                {"type": "Characters", "data": str(_text)},
+                                {"type": "EndTag", "name": "a"},
+                            ]
+                        )
+
+                        if suffix:
+                            new_tokens.append({"type": "Characters", "data": suffix})
+
+                    end = match.end()
+
+                if new_tokens:
+                    # Yield the adjusted set of tokens and then continue
+                    # through the loop
+                    if end < len(text):
+                        new_tokens.append({"type": "Characters", "data": text[end:]})
+
+                    yield from new_tokens
+
+                    continue
+
+            yield token
+
+    def handle_a_tag(self, token_buffer):
+        """Handle the "a" tag
+
+        This could adjust the link or drop it altogether depending on what the
+        callbacks return.
+
+        This yields the new set of tokens.
+
+        """
+        a_token = token_buffer[0]
+        if a_token["data"]:
+            attrs = a_token["data"]
+        else:
+            attrs = {}
+        text = self.extract_character_data(token_buffer)
+        attrs["_text"] = text
+
+        attrs = self.apply_callbacks(attrs, False)
+
+        if attrs is None:
+            # We're dropping the "a" tag and everything else and replacing
+            # it with character data. So emit that token.
+            yield {"type": "Characters", "data": text}
+
+        else:
+            new_text = attrs.pop("_text", "")
+            a_token["data"] = attrs
+
+            if text == new_text:
+                # The callbacks didn't change the text, so we yield the new "a"
+                # token, then whatever else was there, then the end "a" token
+                yield a_token
+                yield from token_buffer[1:]
+
+            else:
+                # If the callbacks changed the text, then we're going to drop
+                # all the tokens between the start and end "a" tags and replace
+                # it with the new text
+                yield a_token
+                yield {"type": "Characters", "data": str(new_text)}
+                yield token_buffer[-1]
+
+    def extract_entities(self, token):
+        """Handles Characters tokens with entities
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: generator of tokens
+
+        """
+        data = token.get("data", "")
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            yield token
+            return
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerfilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        yield from new_tokens
+
+    def __iter__(self):
+        in_a = False
+        in_skip_tag = None
+
+        token_buffer = []
+
+        for token in super().__iter__():
+            if in_a:
+                # Handle the case where we're in an "a" tag--we want to buffer tokens
+                # until we hit an end "a" tag.
+                if token["type"] == "EndTag" and token["name"] == "a":
+                    # Add the end tag to the token buffer and then handle them
+                    # and yield anything returned
+                    token_buffer.append(token)
+                    yield from self.handle_a_tag(token_buffer)
+
+                    # Clear "a" related state and continue since we've yielded all
+                    # the tokens we're going to yield
+                    in_a = False
+                    token_buffer = []
+                else:
+                    token_buffer.extend(list(self.extract_entities(token)))
+                continue
+
+            if token["type"] in ["StartTag", "EmptyTag"]:
+                if token["name"] in self.skip_tags:
+                    # Skip tags start a "special mode" where we don't linkify
+                    # anything until the end tag.
+                    in_skip_tag = token["name"]
+
+                elif token["name"] == "a":
+                    # The "a" tag is special--we switch to a slurp mode and
+                    # slurp all the tokens until the end "a" tag and then
+                    # figure out what to do with them there.
+                    in_a = True
+                    token_buffer.append(token)
+
+                    # We buffer the start tag, so we don't want to yield it,
+                    # yet
+                    continue
+
+            elif in_skip_tag and self.skip_tags:
+                # NOTE(willkg): We put this clause here since in_a and
+                # switching in and out of in_a takes precedence.
+                if token["type"] == "EndTag" and token["name"] == in_skip_tag:
+                    in_skip_tag = None
+
+            elif not in_a and not in_skip_tag and token["type"] == "Characters":
+                new_stream = iter([token])
+                if self.parse_email:
+                    new_stream = self.handle_email_addresses(new_stream)
+
+                new_stream = self.handle_links(new_stream)
+
+                for new_token in new_stream:
+                    yield from self.extract_entities(new_token)
+
+                # We've already yielded this token, so continue
+                continue
+
+            yield token
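For orientation (an editorial sketch, not code shipped in the wheel): the module above mirrors bleach's public linkifier API, so, assuming the vendored import path shown in the file listing, a caller could compile a url regex restricted to a custom TLD list and hand it to ``Linker`` like this::

    from tensorbored._vendor.bleach import linkifier

    # Restrict matching to two TLDs; build_url_re alternates whatever list
    # it is given (protocols default to html5lib_shim.allowed_protocols).
    my_url_re = linkifier.build_url_re(tlds=["com", "org"])

    # Linker wires the BleachHTMLParser, LinkifyFilter, and serializer
    # together; parse_email=True also turns addresses into mailto: links.
    linker = linkifier.Linker(url_re=my_url_re, parse_email=True)

    html = linker.linkify("docs at example.com, questions to admin@example.org")
    # With the default nofollow callback, example.com becomes roughly
    # <a href="http://example.com" rel="nofollow">example.com</a>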
tensorbored/_vendor/bleach/parse_shim.py
@@ -0,0 +1 @@
+from tensorbored._vendor.bleach._vendor.parse import urlparse  # noqa
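One more editorial note on the callback contract documented in ``apply_callbacks`` above: each callback receives the attrs dict (keyed by ``(namespace, name)`` tuples plus the ``_text`` fauxttribute) and an ``is_new`` flag, and returning ``None`` drops the link entirely. A hypothetical callback that rejects newly created non-https links while forcing the rest to open in a new tab might look like (a sketch, not part of the package)::

    from tensorbored._vendor.bleach import linkifier

    def https_only_new_tab(attrs, new=False):
        # attrs maps (namespace, name) tuples to values; the href lives
        # under the (None, "href") key.
        href = attrs.get((None, "href"), "")
        if new and not href.startswith("https:"):
            return None  # returning None tells LinkifyFilter to drop the link
        attrs[(None, "target")] = "_blank"
        return attrs

    linker = linkifier.Linker(callbacks=[https_only_new_tab])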