wovnrb 1.0.13 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Rakefile +1 -1
- data/lib/wovnrb.rb +7 -0
- data/lib/wovnrb/html_replacers/replacer_base.rb +2 -1
- data/lib/wovnrb/html_replacers/unified_values/dst_swapping_targets_creator.rb +76 -0
- data/lib/wovnrb/html_replacers/unified_values/element_category.rb +242 -0
- data/lib/wovnrb/html_replacers/unified_values/node_swapping_targets_creator.rb +134 -0
- data/lib/wovnrb/html_replacers/unified_values/text_replacer.rb +35 -0
- data/lib/wovnrb/html_replacers/unified_values/text_scraper.rb +152 -0
- data/lib/wovnrb/html_replacers/unified_values/values_stack.rb +65 -0
- data/lib/wovnrb/lang.rb +6 -1
- data/lib/wovnrb/services/value_agent.rb +9 -0
- data/lib/wovnrb/store.rb +2 -9
- data/lib/wovnrb/version.rb +1 -1
- data/test/fixtures/unified_values/site_html/simple_actual.html +96 -0
- data/test/fixtures/unified_values/site_html/simple_expected.json +251 -0
- data/test/fixtures/unified_values/site_html/wovn.io_actual.html +686 -0
- data/test/fixtures/unified_values/site_html/wovn.io_expected.json +543 -0
- data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_actual.html +1024 -0
- data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_expected.json +3345 -0
- data/test/fixtures/unified_values/small_html/block_inside_inline_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/block_inside_inline_expected.json +22 -0
- data/test/fixtures/unified_values/small_html/br_tag_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/br_tag_expected.json +12 -0
- data/test/fixtures/unified_values/small_html/comment_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/comment_tag_expected.json +10 -0
- data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_actual.html +7 -0
- data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_expected.json +11 -0
- data/test/fixtures/unified_values/small_html/deep_nested_block_actual.html +14 -0
- data/test/fixtures/unified_values/small_html/deep_nested_block_expected.json +8 -0
- data/test/fixtures/unified_values/small_html/deep_nested_inline_actual.html +20 -0
- data/test/fixtures/unified_values/small_html/deep_nested_inline_expected.json +20 -0
- data/test/fixtures/unified_values/small_html/empty_tag_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/empty_tag_expected.json +12 -0
- data/test/fixtures/unified_values/small_html/empty_text_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/empty_text_expected.json +1 -0
- data/test/fixtures/unified_values/small_html/ignore_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/ignore_tag_expected.json +16 -0
- data/test/fixtures/unified_values/small_html/ignored_class_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/ignored_class_expected.json +13 -0
- data/test/fixtures/unified_values/small_html/img_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/img_expected.json +23 -0
- data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_expected.json +16 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_expected.json +12 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_expected.json +14 -0
- data/test/fixtures/unified_values/small_html/option_tag_actual.html +9 -0
- data/test/fixtures/unified_values/small_html/option_tag_expected.json +13 -0
- data/test/fixtures/unified_values/small_html/text_different_inline_each_other_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/text_different_inline_each_other_expected.json +22 -0
- data/test/fixtures/unified_values/small_html/text_in_svg_actual.html +9 -0
- data/test/fixtures/unified_values/small_html/text_in_svg_expected.json +8 -0
- data/test/fixtures/unified_values/small_html/text_with_html_entity_actual.html +6 -0
- data/test/fixtures/unified_values/small_html/text_with_html_entity_expected.json +8 -0
- data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_expected.json +24 -0
- data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_actual.html +12 -0
- data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_expected.json +14 -0
- data/test/fixtures/unified_values/small_html/wovn_ignore_actual.html +10 -0
- data/test/fixtures/unified_values/small_html/wovn_ignore_expected.json +13 -0
- data/test/lib/html_replacers/unified_values/dst_swapping_targets_creator_test.rb +137 -0
- data/test/lib/html_replacers/unified_values/element_category_test.rb +49 -0
- data/test/lib/html_replacers/unified_values/node_swapping_targets_creator_test.rb +137 -0
- data/test/lib/html_replacers/unified_values/text_replacer_test.rb +270 -0
- data/test/lib/html_replacers/unified_values/text_scraper_test.rb +121 -0
- data/test/lib/html_replacers/unified_values/values_stack_test.rb +122 -0
- data/test/lib/lang_test.rb +59 -1
- data/test/lib/services/value_agent_test.rb +32 -0
- data/test/test_helper.rb +18 -2
- data/wovnrb.gemspec +1 -0
- metadata +134 -7
- data/spec/spec_helper.rb +0 -2
- data/spec/wovnrb_spec.rb +0 -7
@@ -0,0 +1,22 @@
|
|
1
|
+
[
|
2
|
+
{
|
3
|
+
"xpath": "/html/body/div/text()",
|
4
|
+
"srcs": [
|
5
|
+
"<span>",
|
6
|
+
"<a>",
|
7
|
+
"text value1",
|
8
|
+
"</a>",
|
9
|
+
"</span>"
|
10
|
+
]
|
11
|
+
},
|
12
|
+
{
|
13
|
+
"xpath": "/html/body/div/text()[2]",
|
14
|
+
"srcs": [
|
15
|
+
"<strong>",
|
16
|
+
"<em>",
|
17
|
+
"text value1",
|
18
|
+
"</em>",
|
19
|
+
"</strong>"
|
20
|
+
]
|
21
|
+
}
|
22
|
+
]
|
@@ -0,0 +1,24 @@
|
|
1
|
+
[
|
2
|
+
{
|
3
|
+
"xpath": "/html/body/div/text()",
|
4
|
+
"srcs": [
|
5
|
+
"<span>",
|
6
|
+
"<a>",
|
7
|
+
"text value1",
|
8
|
+
"</a>"
|
9
|
+
]
|
10
|
+
},
|
11
|
+
{
|
12
|
+
"xpath": "/html/body/div/span/unknown/text()",
|
13
|
+
"srcs": [
|
14
|
+
"text value2"
|
15
|
+
]
|
16
|
+
},
|
17
|
+
{
|
18
|
+
"xpath": "/html/body/div/text()[2]",
|
19
|
+
"srcs": [
|
20
|
+
"text value3",
|
21
|
+
"</span>"
|
22
|
+
]
|
23
|
+
}
|
24
|
+
]
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
module Wovnrb
  module UnifiedValues
    # Tests for DstSwappingTargetsCreator#run!.
    #
    # Every case builds a text index holding a single 'en' translation under
    # the '' key, runs the creator on it, and checks the 'swapping_targets'
    # list that ends up on that translation entry.
    class DstSwappingTargetsCreatorTest < WovnMiniTest
      # Text before, inside and after a tag each become one target.
      def test_run
        assert_swapping_targets(%w[a b c], 'a<a>b</a>c')
      end

      # Surrounding spaces are kept in the targets.
      def test_run_with_data_with_spaces
        assert_swapping_targets([' a ', ' b ', ' c '], ' a <a> b </a> c ')
      end

      # Data beginning with a tag yields an empty leading target.
      def test_run_with_data_stated_by_tag
        assert_swapping_targets(['', 'b', 'c'], '<a>b</a>c')
      end

      # Data ending with a tag yields an empty trailing target.
      def test_run_with_data_ended_by_tag
        assert_swapping_targets(['a', 'b', ''], 'a<a>b</a>')
      end

      # An empty tag yields an empty target between its neighbours.
      def test_run_with_data_with_no_content_inside_tag
        assert_swapping_targets(['a', '', 'c'], 'a<a></a>c')
      end

      # A lone empty tag yields three empty targets.
      def test_run_with_data_with_tag_only
        assert_swapping_targets(['', '', ''], '<a></a>')
      end

      # Plain text yields itself as the only target.
      def test_run_with_data_without_tag
        assert_swapping_targets(['a'], 'a')
      end

      # The text inside a wovn-ignore tag is not listed as a target.
      def test_run_with_data_with_wovn_ignore
        assert_swapping_targets(%w[a c], 'a<a wovn-ignore>b</a>c')
      end

      # A <br> splits the text into two targets.
      def test_run_with_data_with_closing_tag
        assert_swapping_targets(%w[a bc], 'a<br>bc')
      end

      # A <br> nested inside a paired tag splits that tag's text as well.
      def test_run_with_data_with_both_closing_tag_and_no_closing_tag
        assert_swapping_targets(%w[a b c d], 'a<a>b<br>c</a>d')
      end

      private

      # Wraps +data+ in a one-entry text index, runs the creator on it and
      # asserts that the entry's 'swapping_targets' equals +expected+.
      def assert_swapping_targets(expected, data)
        text_index = { '' => { 'en' => [{ 'data' => data }] } }

        DstSwappingTargetsCreator.new(text_index).run!
        assert_equal(expected, text_index['']['en'][0]['swapping_targets'])
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
module Wovnrb
  module UnifiedValues
    # Pins the tag-name lists exposed by ElementCategory so that any change
    # to a category has to be made deliberately.
    class ElementCategoryTest < WovnMiniTest
      # The overall table is expected to hold exactly 116 entries.
      def test_contents
        assert_equal(116, ElementCategory::CONTENT_TYPES.size)
      end

      def test_inline_elements
        inline_tags = %w[a abbr b bdi bdo button cite code data dfn em i kbd label legend mark meter option q rb rp rt rtc s samp small span strong sub sup time u var]
        assert_same_elements(inline_tags, ElementCategory::INLINE_ELEMENTS.to_a)
      end

      def test_empty_elements
        empty_tags = %w[br param source track wbr input]
        assert_same_elements(empty_tags, ElementCategory::EMPTY_ELEMENTS.to_a)
      end

      def test_ignore_elements
        ignore_tags = %w[area audio canvas embed iframe img map meta object picture video]
        assert_same_elements(ignore_tags, ElementCategory::IGNORE_ELEMENTS.to_a)
      end

      def test_skip_elements
        skip_tags = %w[base link noscript script style template]
        assert_same_elements(skip_tags, ElementCategory::SKIP_ELEMENTS.to_a)
      end

      def test_skip_elements_without_attributes
        skip_without_attribute_tags = %w[textarea]
        assert_same_elements(skip_without_attribute_tags, ElementCategory::SKIP_ELEMENTS_WITHOUT_ATTRIBUTES.to_a)
      end

      def test_block_elements
        block_tags = %w[address article aside bb blockquote body caption col colgroup datalist dd del details dialog div dl dt fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6 head header hgroup hr html ins li main menu nav ol optgroup output p pre progress ruby section select slot summary svg table tbody td tfoot th thead title tr ul]
        assert_same_elements(block_tags, ElementCategory::BLOCK_ELEMENTS.to_a)
      end

      # Both the combined collection and the plain sum of the category sizes
      # must match the size of CONTENT_TYPES.
      # NOTE(review): this detects a tag listed in two categories only if `+`
      # de-duplicates (as Set#+ does) - confirm the constants are Sets.
      def test_no_duplication
        categories = [
          ElementCategory::INLINE_ELEMENTS,
          ElementCategory::EMPTY_ELEMENTS,
          ElementCategory::IGNORE_ELEMENTS,
          ElementCategory::SKIP_ELEMENTS,
          ElementCategory::SKIP_ELEMENTS_WITHOUT_ATTRIBUTES,
          ElementCategory::BLOCK_ELEMENTS
        ]
        combined = categories.reduce(:+)
        summed_size = categories.map(&:size).reduce(:+)

        assert_equal(combined.size, ElementCategory::CONTENT_TYPES.size)
        assert_equal(summed_size, ElementCategory::CONTENT_TYPES.size)
      end
    end
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
module Wovnrb
  module UnifiedValues
    # Tests for NodeSwappingTargetsCreator#run!.
    #
    # Each case scrapes an HTML snippet with TextScraper, hands the first
    # scraped node list to the creator and checks the string form of the
    # :swapping_targets that run! stores next to it.
    class NodeSwappingTargetsCreatorTest < WovnMiniTest
      def test_run
        nodes_info = scrape_nodes_info('a<a>b</a>c')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a b c], target_strings(nodes_info))
      end

      # Leading tag: an empty target comes first. The second half checks that
      # content written to a dummy created with next_node: is readable through
      # that node's `previous`.
      def test_run_with_data_stated_by_tag
        nodes_info = scrape_nodes_info('<a>b</a>c')
        nodes = nodes_info[0][:nodes]

        NodeSwappingTargetsCreator.new(nodes_info).run!
        dummy = NodeSwappingTargetsCreator.new('').create_dummy_empty_text_node(next_node: nodes[0])

        assert_equal(['', 'b', 'c'], target_strings(nodes_info))

        text_for_dummy = 'a'
        dummy.content = text_for_dummy
        assert_equal(text_for_dummy, nodes[0].previous.to_s)
      end

      # Trailing tag: an empty target comes last. The second half checks that
      # content written to a dummy created with previous_node: is readable
      # through that node's `next`.
      def test_run_with_data_ended_by_tag
        nodes_info = scrape_nodes_info('a<a>b</a>')
        nodes = nodes_info[0][:nodes]

        NodeSwappingTargetsCreator.new(nodes_info).run!
        dummy = NodeSwappingTargetsCreator.new('').create_dummy_empty_text_node(previous_node: nodes[-1])

        assert_equal(['a', 'b', ''], target_strings(nodes_info))

        text_for_dummy = 'a'
        dummy.content = text_for_dummy
        assert_equal(text_for_dummy, nodes[-1].next.to_s)
      end

      # Empty tag: an empty target sits between the two surrounding texts.
      def test_run_with_data_with_no_content_inside_tag
        nodes_info = scrape_nodes_info('a<a></a>c')
        nodes = nodes_info[0][:nodes]

        NodeSwappingTargetsCreator.new(nodes_info).run!
        dummy = NodeSwappingTargetsCreator.new('').create_dummy_empty_text_node(next_node: nodes[2])

        assert_equal(['a', '', 'c'], target_strings(nodes_info))

        text_for_dummy = 'a'
        dummy.content = text_for_dummy
        assert_equal(text_for_dummy, nodes[2].previous.to_s)
      end

      def test_run_with_data_without_tag
        nodes_info = scrape_nodes_info('a')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(['a'], target_strings(nodes_info))
      end

      # The text inside a wovn-ignore tag is not listed as a target.
      def test_run_with_data_with_wovn_ignore
        nodes_info = scrape_nodes_info('a<a wovn-ignore>b</a>c')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a c], target_strings(nodes_info))
      end

      # A <br> splits the text into two targets.
      def test_run_with_data_with_closing_tag
        nodes_info = scrape_nodes_info('a<br>bc')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a bc], target_strings(nodes_info))
      end

      # A <br> nested inside a paired tag splits that tag's text as well.
      def test_run_with_data_with_both_closing_tag_and_no_closing_tag
        nodes_info = scrape_nodes_info('a<a>b<br>c</a>d')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a b c d], target_strings(nodes_info))
      end

      private

      # Parses +html+, scrapes it with an empty Set passed to TextScraper,
      # and wraps the first result's :nodes in the one-element array shape
      # the creator is given.
      def scrape_nodes_info(html)
        scraped = TextScraper.new(Set.new).run(Nokogiri::HTML5(html))
        [{ nodes: scraped.first[:nodes] }]
      end

      # String form of every swapping target stored on the first entry.
      def target_strings(nodes_info)
        nodes_info[0][:swapping_targets].map(&:to_s)
      end
    end
  end
end
|