wovnrb 1.0.13 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/lib/wovnrb.rb +7 -0
- data/lib/wovnrb/html_replacers/replacer_base.rb +2 -1
- data/lib/wovnrb/html_replacers/unified_values/dst_swapping_targets_creator.rb +76 -0
- data/lib/wovnrb/html_replacers/unified_values/element_category.rb +242 -0
- data/lib/wovnrb/html_replacers/unified_values/node_swapping_targets_creator.rb +134 -0
- data/lib/wovnrb/html_replacers/unified_values/text_replacer.rb +35 -0
- data/lib/wovnrb/html_replacers/unified_values/text_scraper.rb +152 -0
- data/lib/wovnrb/html_replacers/unified_values/values_stack.rb +65 -0
- data/lib/wovnrb/lang.rb +6 -1
- data/lib/wovnrb/services/value_agent.rb +9 -0
- data/lib/wovnrb/store.rb +2 -9
- data/lib/wovnrb/version.rb +1 -1
- data/test/fixtures/unified_values/site_html/simple_actual.html +96 -0
- data/test/fixtures/unified_values/site_html/simple_expected.json +251 -0
- data/test/fixtures/unified_values/site_html/wovn.io_actual.html +686 -0
- data/test/fixtures/unified_values/site_html/wovn.io_expected.json +543 -0
- data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_actual.html +1024 -0
- data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_expected.json +3345 -0
- data/test/fixtures/unified_values/small_html/block_inside_inline_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/block_inside_inline_expected.json +22 -0
- data/test/fixtures/unified_values/small_html/br_tag_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/br_tag_expected.json +12 -0
- data/test/fixtures/unified_values/small_html/comment_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/comment_tag_expected.json +10 -0
- data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_actual.html +7 -0
- data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_expected.json +11 -0
- data/test/fixtures/unified_values/small_html/deep_nested_block_actual.html +14 -0
- data/test/fixtures/unified_values/small_html/deep_nested_block_expected.json +8 -0
- data/test/fixtures/unified_values/small_html/deep_nested_inline_actual.html +20 -0
- data/test/fixtures/unified_values/small_html/deep_nested_inline_expected.json +20 -0
- data/test/fixtures/unified_values/small_html/empty_tag_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/empty_tag_expected.json +12 -0
- data/test/fixtures/unified_values/small_html/empty_text_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/empty_text_expected.json +1 -0
- data/test/fixtures/unified_values/small_html/ignore_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/ignore_tag_expected.json +16 -0
- data/test/fixtures/unified_values/small_html/ignored_class_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/ignored_class_expected.json +13 -0
- data/test/fixtures/unified_values/small_html/img_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/img_expected.json +23 -0
- data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_expected.json +16 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_expected.json +12 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_expected.json +14 -0
- data/test/fixtures/unified_values/small_html/option_tag_actual.html +9 -0
- data/test/fixtures/unified_values/small_html/option_tag_expected.json +13 -0
- data/test/fixtures/unified_values/small_html/text_different_inline_each_other_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/text_different_inline_each_other_expected.json +22 -0
- data/test/fixtures/unified_values/small_html/text_in_svg_actual.html +9 -0
- data/test/fixtures/unified_values/small_html/text_in_svg_expected.json +8 -0
- data/test/fixtures/unified_values/small_html/text_with_html_entity_actual.html +6 -0
- data/test/fixtures/unified_values/small_html/text_with_html_entity_expected.json +8 -0
- data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_expected.json +24 -0
- data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_expected.json +14 -0
- data/test/fixtures/unified_values/small_html/wovn_ignore_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/wovn_ignore_expected.json +13 -0
- data/test/lib/html_replacers/unified_values/dst_swapping_targets_creator_test.rb +137 -0
- data/test/lib/html_replacers/unified_values/element_category_test.rb +49 -0
- data/test/lib/html_replacers/unified_values/node_swapping_targets_creator_test.rb +137 -0
- data/test/lib/html_replacers/unified_values/text_replacer_test.rb +270 -0
- data/test/lib/html_replacers/unified_values/text_scraper_test.rb +121 -0
- data/test/lib/html_replacers/unified_values/values_stack_test.rb +122 -0
- data/test/lib/lang_test.rb +59 -1
- data/test/lib/services/value_agent_test.rb +32 -0
- data/test/test_helper.rb +18 -2
- data/wovnrb.gemspec +1 -0
- metadata +134 -7
- data/spec/spec_helper.rb +0 -2
- data/spec/wovnrb_spec.rb +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e67d9a0a4d2f447222580c5d4e0a00995f8027fa
|
4
|
+
data.tar.gz: e8544f392fdf10c6c5101914cd7b721ed2e6c4e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f69d4ccb222bdfb97368d26c3585aaeb433c71d7b1a58b294d8b18cd098c8222149a1b41bb7912de82c88f7594e6d152600ca4e39167ed2f6987769761e2140c
|
7
|
+
data.tar.gz: 3e4ddf14d6c59a6eee023d878744c40a468874351ea27fe6138038506cef78529618950cf60c6af9670174dacd3ac98e12d70c2b306f54b927793cd7dec10bc7
|
data/Rakefile
CHANGED
data/lib/wovnrb.rb
CHANGED
@@ -3,6 +3,7 @@ require 'wovnrb/store'
|
|
3
3
|
require 'wovnrb/headers'
|
4
4
|
require 'wovnrb/lang'
|
5
5
|
require 'nokogumbo'
|
6
|
+
require 'active_support'
|
6
7
|
#require 'dom'
|
7
8
|
require 'json'
|
8
9
|
require 'wovnrb/helpers/nokogumbo_helper'
|
@@ -15,6 +16,12 @@ require 'wovnrb/html_replacers/meta_replacer'
|
|
15
16
|
require 'wovnrb/html_replacers/input_replacer'
|
16
17
|
require 'wovnrb/html_replacers/image_replacer'
|
17
18
|
require 'wovnrb/html_replacers/script_replacer'
|
19
|
+
require 'wovnrb/html_replacers/unified_values/text_replacer'
|
20
|
+
require 'wovnrb/html_replacers/unified_values/text_scraper'
|
21
|
+
require 'wovnrb/html_replacers/unified_values/values_stack'
|
22
|
+
require 'wovnrb/html_replacers/unified_values/element_category'
|
23
|
+
require 'wovnrb/html_replacers/unified_values/dst_swapping_targets_creator'
|
24
|
+
require 'wovnrb/html_replacers/unified_values/node_swapping_targets_creator'
|
18
25
|
require 'wovnrb/railtie' if defined?(Rails)
|
19
26
|
require 'wovnrb/version'
|
20
27
|
|
@@ -0,0 +1,76 @@
|
|
1
|
+
module Wovnrb
  module UnifiedValues
    # Annotates every translated value of a text index with the ordered list
    # of plain-text fragments ("swapping targets") found in its `data` markup.
    #
    # `text_index` has the following shape:
    #
    #   {
    #     "<span>apple is a good</span>foods" =>
    #       { "ja" =>
    #         [{ "xpath" => "/html/body/div", "data" => "りんごは<span>おいしい</span>たべものです" }] },
    #     "click<a>here</a>" =>
    #       { "ja" =>
    #         [{ "xpath" => "/html/body/div", "data" => "<a>こちら</a>をクリックしてください" }] }
    #   }
    class DstSwappingTargetsCreator
      # @param text_index [Hash] source text => { lang => [value hashes] }
      def initialize(text_index)
        @text_index = text_index
      end

      # Adds a 'swapping_targets' entry to every value hash, in place:
      #
      #   {
      #     "<span>apple is a good</span>foods" =>
      #       { "ja" =>
      #         [{ "xpath" => "/html/body/div",
      #            "data" => "りんごは<span>おいしい</span>たべものです",
      #            "swapping_targets" => ["りんごは", "おいしい", "たべものです"] }] },
      #     "click<a>here</a>" =>
      #       { "ja" =>
      #         [{ "xpath" => "/html/body/div",
      #            "data" => "<a>こちら</a>をクリックしてください",
      #            "swapping_targets" => ["", "こちら", "をクリックしてください"] }] }
      #   }
      #
      # Each value gets its own mold, so the targets of one language/value
      # never leak into the next one.
      #
      # @return [Hash] the (mutated) text index
      def run!
        @text_index.each_value do |translations|
          translations.each_value do |values|
            values.each do |value|
              value['swapping_targets'] = remove_tag_element(build_mold(value['data']))
            end
          end
        end
      end

      private

      # Splits markup into an alternating [text, tag, text, tag, ..., text]
      # list, padding with '' wherever a text slot would otherwise be missing.
      def build_mold(data)
        mold = []

        # The capture group makes split keep the tags as separate items.
        data.split(/(<.+?>)/).each do |fragment|
          mold.push('') if mold.size.even? && fragment.start_with?('<')
          mold.push(fragment)
        end

        # split drops trailing empty strings, so restore the text slot that
        # follows a trailing tag.
        mold.push('') if /\A<.+?>\z/ =~ mold.last
        mold
      end

      # Returns the text items of `mold`, dropping tags and everything between
      # a tag carrying `wovn-ignore` and its matching end tag.
      def remove_tag_element(mold)
        end_tag_of_wovn_ignore = nil
        swapping_targets = []

        mold.each do |value|
          if end_tag_of_wovn_ignore.nil? && value =~ /\A<.*wovn-ignore>\z/
            # "<span wovn-ignore>" -> "</span>"
            end_tag_of_wovn_ignore = "</#{value.gsub(' wovn-ignore', '')[1..-1]}"
            next
          end

          end_tag_of_wovn_ignore = nil if value == end_tag_of_wovn_ignore

          next unless end_tag_of_wovn_ignore.nil?
          next if /\A<.+?>\z/ =~ value

          swapping_targets << value
        end

        swapping_targets
      end
    end
  end
end
|
@@ -0,0 +1,242 @@
|
|
1
|
+
require 'set'

module Wovnrb
  module UnifiedValues
    # Classifies HTML element names into the groups used when scraping and
    # swapping text: inline, empty (void inline), ignored, skipped and block.
    module ElementCategory
      # Content categories.
      FLOW_CONTENT = 1
      PALPABLE_CONTENT = 2
      PHRASING_CONTENT = 3
      EMBEDDED_CONTENT = 4
      INTERACTIVE_CONTENT = 5
      METADATA_CONTENT = 6
      HEADING_CONTENT = 7
      SECTIONING_CONTENT = 8
      SECTIONING_ROOT = 9
      FORM_ASSOCIATED_ELEMENT = 10
      SCRIPT_SUPPORTING_ELEMENT = 11
      SVG_ELEMENT = 12

      # Element groups. The values are only used as identifiers.
      DOCUMENT_ELEMENT = 100
      DOCUMENT_METADATA = 101
      # NOTE(review): 1010 breaks the 100..112 sequence (102 is unused); it is
      # unique, so behavior is unaffected - confirm before renumbering.
      SECTION = 1010
      GROUPING_CONTENT = 103
      TEXT_LEVEL_SEMANTICS = 104
      EDITS = 105
      EMBEDDED_CONTENTS = 106
      LINKS = 107
      TABULAR_DATA = 108
      FORMS = 109
      INTERACTIVE_ELEMENTS = 110
      SCRIPTING = 111
      ESCAPABLE_RAW_TEXT_ELEMENT = 112

      RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS = 200
      RESTRICTED_TO_CHILD_OF_GROUPING = 201
      RESTRICTED_TO_CHILD_OF_EMBEDDED = 202
      RESTRICTED_TO_CHILD_OF_TABULAR = 203
      RESTRICTED_TO_CHILD_OF_FORM = 204
      RESTRICTED_TO_CHILD_OF_INTERACTIVE = 205

      RESTRICTION_PARENT_ELEMENT = 300
      FORM_OWNER = 301

      # References describing every element:
      #   https://www.w3.org/TR/html5/
      #   https://dev.w3.org/html5/html-author/#the-elements
      #   https://html.spec.whatwg.org/#elements-2
      #
      # Reference for the default display of elements:
      #   https://html.spec.whatwg.org/#rendering

      # This constant is internal. It SHOULD NOT be used outside this module.
      #
      # NOTE(review): 'canvas' lists EMBEDDED_CONTENTS (the group) where the
      # other entries would list EMBEDDED_CONTENT (the category); this puts
      # canvas into IGNORE_ELEMENTS. Left as is - confirm whether intended.
      CONTENT_TYPES = {
        'a' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'abbr' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'address' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'area' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT],
        'article' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'aside' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'audio' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'b' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'base' => [DOCUMENT_METADATA, METADATA_CONTENT],
        'bb' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT],
        'bdi' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'bdo' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'blockquote' => [GROUPING_CONTENT, FLOW_CONTENT, SECTIONING_ROOT, PALPABLE_CONTENT],
        'body' => [SECTION, SECTIONING_ROOT],
        'br' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT],
        'button' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'canvas' => [SCRIPTING, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENTS, PALPABLE_CONTENT],
        'caption' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'cite' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'code' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'col' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'colgroup' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'data' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'datalist' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'dd' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'del' => [EDITS, FLOW_CONTENT, PHRASING_CONTENT],
        'details' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, SECTIONING_ROOT, INTERACTIVE_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'dfn' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'dialog' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, SECTIONING_ROOT],
        'div' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'dl' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'dt' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'em' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'embed' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'fieldset' => [FORMS, FLOW_CONTENT, SECTIONING_ROOT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'figcaption' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'figure' => [GROUPING_CONTENT, FLOW_CONTENT, SECTIONING_ROOT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'footer' => [SECTION, FLOW_CONTENT, PALPABLE_CONTENT],
        'form' => [FORMS, FLOW_CONTENT, PALPABLE_CONTENT, FORM_OWNER],
        'h1' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h2' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h3' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h4' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h5' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h6' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'head' => [DOCUMENT_METADATA],
        'header' => [SECTIONING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'hgroup' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'hr' => [GROUPING_CONTENT, FLOW_CONTENT],
        'html' => [DOCUMENT_ELEMENT],
        'i' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'iframe' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'img' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, FORM_ASSOCIATED_ELEMENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'input' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'ins' => [EDITS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'kbd' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'label' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'legend' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM],
        'li' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'link' => [DOCUMENT_METADATA, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT],
        'main' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'map' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'mark' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'menu' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, INTERACTIVE_CONTENT],
        'meta' => [DOCUMENT_METADATA, METADATA_CONTENT],
        'meter' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'nav' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'noscript' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT],
        'object' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'ol' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'optgroup' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM, RESTRICTION_PARENT_ELEMENT],
        'option' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM],
        'output' => [FLOW_CONTENT, PHRASING_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'p' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'param' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
        'picture' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT],
        'pre' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'progress' => [FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'q' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'rb' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'rp' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'rt' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'rtc' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'ruby' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        's' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'samp' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'script' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT, SCRIPT_SUPPORTING_ELEMENT],
        'section' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'select' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'slot' => [SCRIPTING, FLOW_CONTENT, PHRASING_CONTENT],
        'small' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'source' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
        'span' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'strong' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'style' => [DOCUMENT_METADATA, METADATA_CONTENT, FLOW_CONTENT],
        'sub' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'summary' => [INTERACTIVE_ELEMENTS, RESTRICTED_TO_CHILD_OF_INTERACTIVE],
        'sup' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'svg' => [SVG_ELEMENT],
        'table' => [TABULAR_DATA, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'tbody' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'td' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'template' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT, SCRIPT_SUPPORTING_ELEMENT],
        'textarea' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, ESCAPABLE_RAW_TEXT_ELEMENT],
        'tfoot' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'th' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'thead' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'time' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'title' => [DOCUMENT_METADATA, METADATA_CONTENT, ESCAPABLE_RAW_TEXT_ELEMENT],
        'tr' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'track' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
        'u' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'ul' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'var' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'video' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'wbr' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT]
      }.freeze

      # NOTE: An autonomous "custom element" belongs to
      #       [FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT]

      VOID_ELEMENTS = Set.new(%w[area base br col embed hr img input link meta param source track wbr]).freeze

      # Elements treated as inline, void elements included. The rules are
      # evaluated in order; the first one that matches decides.
      INLINE_ELEMENTS_INCLUDING_VOID = Set.new(CONTENT_TYPES.select do |_name, types|
        # Split all table related elements.
        next false if types.include?(TABULAR_DATA)

        # RESTRICTION_PARENT_ELEMENT creates a group for a specific purpose.
        next false if types.include?(RESTRICTION_PARENT_ELEMENT)

        # Ignore meta elements.
        next false if types.include?(METADATA_CONTENT)

        # Text-level contents decorate text, like span.
        next true if types.include?(TEXT_LEVEL_SEMANTICS)

        # Media is divided into its own block, but the content inside media
        # is not divided.
        next types.include?(RESTRICTED_TO_CHILD_OF_EMBEDDED) if types.include?(EMBEDDED_CONTENTS)

        if types.include?(FORMS)
          # Divide form elements that create a section, the "FORM" itself, and
          # contents inputted by the user; the rest is grouped as FORM.
          next [SECTIONING_ROOT, FORM_OWNER, ESCAPABLE_RAW_TEXT_ELEMENT].none? { |type| types.include?(type) }
        end

        false
      end.keys).freeze

      INLINE_ELEMENTS = (INLINE_ELEMENTS_INCLUDING_VOID - VOID_ELEMENTS).freeze
      EMPTY_ELEMENTS = (INLINE_ELEMENTS_INCLUDING_VOID & VOID_ELEMENTS).freeze

      # IGNORE_ELEMENTS doesn't scrape contents inside the tags.
      IGNORE_ELEMENTS = Set.new(CONTENT_TYPES.select do |name, types|
        # meta should be ignored but requires swapping.
        name == 'meta' ||
          (types.include?(EMBEDDED_CONTENTS) && !types.include?(RESTRICTED_TO_CHILD_OF_EMBEDDED))
      end.keys).freeze

      # SKIP_ELEMENTS doesn't scrape and swap contents.
      SKIP_ELEMENTS = Set.new(CONTENT_TYPES.select do |name, types|
        # title is iterated to get its content; meta is handled by IGNORE_ELEMENTS.
        name != 'title' && name != 'meta' && types.include?(METADATA_CONTENT)
      end.keys).freeze

      # SKIP_ELEMENTS_WITHOUT_ATTRIBUTES doesn't scrape and swap contents, but
      # it scrapes and swaps the tag's attributes like `placeholder`.
      SKIP_ELEMENTS_WITHOUT_ATTRIBUTES = Set.new(CONTENT_TYPES.select do |name, types|
        # title is iterated to get its content; other escapable raw text
        # holds contents inputted by the user.
        name != 'title' && types.include?(ESCAPABLE_RAW_TEXT_ELEMENT)
      end.keys).freeze

      # Every known element that falls into none of the groups above.
      BLOCK_ELEMENTS = (Set.new(CONTENT_TYPES.keys) - INLINE_ELEMENTS - EMPTY_ELEMENTS - IGNORE_ELEMENTS - SKIP_ELEMENTS - SKIP_ELEMENTS_WITHOUT_ATTRIBUTES).freeze
    end
  end
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
module Wovnrb
  module UnifiedValues
    # Annotates every entry of `nodes_info` with the ordered list of text
    # nodes ("swapping targets") whose content can be swapped.
    #
    # `nodes_info` has the following shape (an element appears once at its
    # opening position and once at its closing position):
    #
    #   [
    #     { :dst => "an<span>apple is a good</span>foods",
    #       :nodes => [
    #         (Text "an"),
    #         (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
    #         (Text "apple is a good"),
    #         (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
    #         (Text "\n foods\n \n \n")
    #       ] }
    #   ]
    class NodeSwappingTargetsCreator
      # @param nodes_info [Array<Hash>] entries holding a :nodes array
      def initialize(nodes_info)
        @nodes_info = nodes_info
      end

      # Adds :swapping_targets to every entry, in place:
      #
      #   [
      #     { :dst => "an<span>apple is a good</span>foods",
      #       :nodes => [ ...as above... ],
      #       :swapping_targets => [
      #         (Text "an"),
      #         (Text "apple is a good"),
      #         (Text "\n foods\n \n \n")
      #       ] }
      #   ]
      #
      # Where a text slot is missing (two adjacent elements, or an element at
      # the very start or end) a dummy empty text node is inserted so that the
      # targets line up with those derived from the translated markup.
      #
      # @return [Array<Hash>] the (mutated) nodes_info
      def run!
        @nodes_info.each do |node_info|
          mold = []

          node_info[:nodes].each do |node|
            # Even positions are text slots; fill the gap before an element.
            mold.push(create_dummy_empty_text_node(next_node: node)) if mold.size.even? && node.element?
            mold.push(node)
          end

          # An empty :nodes list leaves nothing to pad.
          last_node = mold.last
          mold.push(create_dummy_empty_text_node(previous_node: last_node)) if last_node && last_node.element?

          node_info[:swapping_targets] = remove_tag_element(mold)
        end
      end

      # Returns the text nodes of `mold`, dropping elements and everything
      # between the two occurrences of an element carrying `wovn-ignore`.
      #
      # @param mold [Array] nodes responding to #attributes and #text?
      # @return [Array] text nodes to swap
      def remove_tag_element(mold)
        id_of_tag_with_wovn_ignore = nil
        swapping_targets = []

        mold.each do |node|
          if id_of_tag_with_wovn_ignore.nil? && node.attributes.key?('wovn-ignore')
            # Remember the element; its second occurrence closes the range.
            id_of_tag_with_wovn_ignore = node.object_id
            next
          end

          id_of_tag_with_wovn_ignore = nil if node.object_id == id_of_tag_with_wovn_ignore

          swapping_targets << node if id_of_tag_with_wovn_ignore.nil? && node.text?
        end

        swapping_targets
      end

      # Builds a placeholder for a missing text slot.
      #
      # @param option [Hash] `{ next_node: node }` or `{ previous_node: node }`
      def create_dummy_empty_text_node(option)
        # Splat explicitly: since Ruby 3.0 a positional Hash is no longer
        # converted into keyword arguments.
        DummyEmpryTextNode.new(**option)
      end

      # Stand-in for an empty text node that does not exist in the document.
      # It is anchored to a neighbouring node and only materializes a real
      # node when non-empty content is assigned.
      class DummyEmpryTextNode
        attr_reader :name

        def initialize(next_node: nil, previous_node: nil)
          @name = 'text'
          @next_node = next_node
          @previous_node = previous_node
          @added_empty_text = nil
        end

        def text?
          true
        end

        def content
          ''
        end

        def to_s
          content
        end

        def attributes
          {}
        end

        # `try` comes from ActiveSupport, which the gem requires.
        def parent
          @next_node.try(:parent) || @previous_node.try(:parent)
        end

        def document
          @next_node.try(:document) || @previous_node.try(:document)
        end

        # Delegates to the materialized node; does nothing before non-empty
        # content has been assigned.
        def add_previous_sibling(comment_node)
          @added_empty_text.try(:add_previous_sibling, comment_node)
        end

        # Inserts `text` next to the anchor node and remembers the inserted
        # sibling. Assigning '' is a no-op.
        def content=(text)
          return if text == ''

          if @next_node
            @next_node.add_previous_sibling(text)
            @added_empty_text = @next_node.previous
          elsif @previous_node
            @previous_node.add_next_sibling(text)
            @added_empty_text = @previous_node.next
          end
        end
      end

      # Correctly spelled alias; the original name is kept for compatibility.
      DummyEmptyTextNode = DummyEmpryTextNode
    end
  end
end
|