wovnrb 1.0.13 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +1 -1
  3. data/lib/wovnrb.rb +7 -0
  4. data/lib/wovnrb/html_replacers/replacer_base.rb +2 -1
  5. data/lib/wovnrb/html_replacers/unified_values/dst_swapping_targets_creator.rb +76 -0
  6. data/lib/wovnrb/html_replacers/unified_values/element_category.rb +242 -0
  7. data/lib/wovnrb/html_replacers/unified_values/node_swapping_targets_creator.rb +134 -0
  8. data/lib/wovnrb/html_replacers/unified_values/text_replacer.rb +35 -0
  9. data/lib/wovnrb/html_replacers/unified_values/text_scraper.rb +152 -0
  10. data/lib/wovnrb/html_replacers/unified_values/values_stack.rb +65 -0
  11. data/lib/wovnrb/lang.rb +6 -1
  12. data/lib/wovnrb/services/value_agent.rb +9 -0
  13. data/lib/wovnrb/store.rb +2 -9
  14. data/lib/wovnrb/version.rb +1 -1
  15. data/test/fixtures/unified_values/site_html/simple_actual.html +96 -0
  16. data/test/fixtures/unified_values/site_html/simple_expected.json +251 -0
  17. data/test/fixtures/unified_values/site_html/wovn.io_actual.html +686 -0
  18. data/test/fixtures/unified_values/site_html/wovn.io_expected.json +543 -0
  19. data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_actual.html +1024 -0
  20. data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_expected.json +3345 -0
  21. data/test/fixtures/unified_values/small_html/block_inside_inline_actual.html +12 -0
  22. data/test/fixtures/unified_values/small_html/block_inside_inline_expected.json +22 -0
  23. data/test/fixtures/unified_values/small_html/br_tag_actual.html +10 -0
  24. data/test/fixtures/unified_values/small_html/br_tag_expected.json +12 -0
  25. data/test/fixtures/unified_values/small_html/comment_tag_actual.html +12 -0
  26. data/test/fixtures/unified_values/small_html/comment_tag_expected.json +10 -0
  27. data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_actual.html +7 -0
  28. data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_expected.json +11 -0
  29. data/test/fixtures/unified_values/small_html/deep_nested_block_actual.html +14 -0
  30. data/test/fixtures/unified_values/small_html/deep_nested_block_expected.json +8 -0
  31. data/test/fixtures/unified_values/small_html/deep_nested_inline_actual.html +20 -0
  32. data/test/fixtures/unified_values/small_html/deep_nested_inline_expected.json +20 -0
  33. data/test/fixtures/unified_values/small_html/empty_tag_actual.html +10 -0
  34. data/test/fixtures/unified_values/small_html/empty_tag_expected.json +12 -0
  35. data/test/fixtures/unified_values/small_html/empty_text_actual.html +12 -0
  36. data/test/fixtures/unified_values/small_html/empty_text_expected.json +1 -0
  37. data/test/fixtures/unified_values/small_html/ignore_tag_actual.html +12 -0
  38. data/test/fixtures/unified_values/small_html/ignore_tag_expected.json +16 -0
  39. data/test/fixtures/unified_values/small_html/ignored_class_actual.html +10 -0
  40. data/test/fixtures/unified_values/small_html/ignored_class_expected.json +13 -0
  41. data/test/fixtures/unified_values/small_html/img_actual.html +12 -0
  42. data/test/fixtures/unified_values/small_html/img_expected.json +23 -0
  43. data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_actual.html +10 -0
  44. data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_expected.json +16 -0
  45. data/test/fixtures/unified_values/small_html/nested_text_value_actual.html +10 -0
  46. data/test/fixtures/unified_values/small_html/nested_text_value_expected.json +12 -0
  47. data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_actual.html +10 -0
  48. data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_expected.json +14 -0
  49. data/test/fixtures/unified_values/small_html/option_tag_actual.html +9 -0
  50. data/test/fixtures/unified_values/small_html/option_tag_expected.json +13 -0
  51. data/test/fixtures/unified_values/small_html/text_different_inline_each_other_actual.html +10 -0
  52. data/test/fixtures/unified_values/small_html/text_different_inline_each_other_expected.json +22 -0
  53. data/test/fixtures/unified_values/small_html/text_in_svg_actual.html +9 -0
  54. data/test/fixtures/unified_values/small_html/text_in_svg_expected.json +8 -0
  55. data/test/fixtures/unified_values/small_html/text_with_html_entity_actual.html +6 -0
  56. data/test/fixtures/unified_values/small_html/text_with_html_entity_expected.json +8 -0
  57. data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_actual.html +12 -0
  58. data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_expected.json +24 -0
  59. data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_actual.html +12 -0
  60. data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_expected.json +14 -0
  61. data/test/fixtures/unified_values/small_html/wovn_ignore_actual.html +10 -0
  62. data/test/fixtures/unified_values/small_html/wovn_ignore_expected.json +13 -0
  63. data/test/lib/html_replacers/unified_values/dst_swapping_targets_creator_test.rb +137 -0
  64. data/test/lib/html_replacers/unified_values/element_category_test.rb +49 -0
  65. data/test/lib/html_replacers/unified_values/node_swapping_targets_creator_test.rb +137 -0
  66. data/test/lib/html_replacers/unified_values/text_replacer_test.rb +270 -0
  67. data/test/lib/html_replacers/unified_values/text_scraper_test.rb +121 -0
  68. data/test/lib/html_replacers/unified_values/values_stack_test.rb +122 -0
  69. data/test/lib/lang_test.rb +59 -1
  70. data/test/lib/services/value_agent_test.rb +32 -0
  71. data/test/test_helper.rb +18 -2
  72. data/wovnrb.gemspec +1 -0
  73. metadata +134 -7
  74. data/spec/spec_helper.rb +0 -2
  75. data/spec/wovnrb_spec.rb +0 -7
@@ -0,0 +1,22 @@
1
+ [
2
+ {
3
+ "xpath": "/html/body/div/text()",
4
+ "srcs": [
5
+ "<span>",
6
+ "<a>",
7
+ "text value1",
8
+ "</a>",
9
+ "</span>"
10
+ ]
11
+ },
12
+ {
13
+ "xpath": "/html/body/div/text()[2]",
14
+ "srcs": [
15
+ "<strong>",
16
+ "<em>",
17
+ "text value1",
18
+ "</em>",
19
+ "</strong>"
20
+ ]
21
+ }
22
+ ]
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head></head>
3
+ <body>
4
+ <svg width="1px" height="1px" xmlns="http://www.w3.org/2000/svg">
5
+ <text x="1" y="1" font-size="30">text value!</text>
6
+ <use href="#test-sym" x="0" y="1" width="1" height="1"/>
7
+ </svg>
8
+ </body>
9
+ </html>
@@ -0,0 +1,8 @@
1
+ [
2
+ {
3
+ "xpath": "/html/body/svg/text/text()",
4
+ "srcs": [
5
+ "text value!"
6
+ ]
7
+ }
8
+ ]
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <head></head>
3
+ <body>
4
+ <p>This is &lt;a&gt; text value!</p>
5
+ </body>
6
+ </html>
@@ -0,0 +1,8 @@
1
+ [
2
+ {
3
+ "xpath": "/html/body/p/text()",
4
+ "srcs": [
5
+ "This is &lt;a&gt; text value!"
6
+ ]
7
+ }
8
+ ]
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head></head>
3
+ <body>
4
+ <div>
5
+ <span>
6
+ <a href="#">text value1</a>
7
+ <unknown class="emoticon">text value2</unknown>
8
+ text value3
9
+ </span>
10
+ </div>
11
+ </body>
12
+ </html>
@@ -0,0 +1,24 @@
1
+ [
2
+ {
3
+ "xpath": "/html/body/div/text()",
4
+ "srcs": [
5
+ "<span>",
6
+ "<a>",
7
+ "text value1",
8
+ "</a>"
9
+ ]
10
+ },
11
+ {
12
+ "xpath": "/html/body/div/span/unknown/text()",
13
+ "srcs": [
14
+ "text value2"
15
+ ]
16
+ },
17
+ {
18
+ "xpath": "/html/body/div/text()[2]",
19
+ "srcs": [
20
+ "text value3",
21
+ "</span>"
22
+ ]
23
+ }
24
+ ]
@@ -0,0 +1,12 @@
1
+ <html>
2
+ <head></head>
3
+ <body>
4
+ <li>
5
+ <a href="#">
6
+ <span><span><img src='sample.jpeg'></span></span>
7
+ <span>text value1</span>
8
+ <span>text value2</span>
9
+ </a>
10
+ </li>
11
+ </body>
12
+ </html>
@@ -0,0 +1,14 @@
1
+ [
2
+ {
3
+ "xpath": "/html/body/li/text()[2]",
4
+ "srcs": [
5
+ "<span>",
6
+ "text value1",
7
+ "</span>",
8
+ "<span>",
9
+ "text value2",
10
+ "</span>",
11
+ "</a>"
12
+ ]
13
+ }
14
+ ]
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <head></head>
3
+ <body>
4
+ <div>
5
+ <span>
6
+ This is a <b wovn-ignore>complex</b> text value!
7
+ </span>
8
+ </div>
9
+ </body>
10
+ </html>
@@ -0,0 +1,13 @@
1
+ [
2
+ {
3
+ "xpath": "/html/body/div/text()",
4
+ "srcs": [
5
+ "<span>",
6
+ "This is a",
7
+ "<b wovn-ignore>",
8
+ "</b>",
9
+ "text value!",
10
+ "</span>"
11
+ ]
12
+ }
13
+ ]
@@ -0,0 +1,137 @@
require 'test_helper'

module Wovnrb
  module UnifiedValues
    # Checks the 'swapping_targets' list that DstSwappingTargetsCreator#run!
    # stores on a translation entry, for a range of 'data' shapes.
    class DstSwappingTargetsCreatorTest < WovnMiniTest
      # Text before, inside and after a tag each become one target.
      def test_run
        assert_swapping_targets(%w[a b c], 'a<a>b</a>c')
      end

      # Surrounding whitespace is expected to be kept in every target.
      def test_run_with_data_with_spaces
        assert_swapping_targets([' a ', ' b ', ' c '], ' a <a> b </a> c ')
      end

      # Data beginning with a tag is expected to yield an empty leading target.
      def test_run_with_data_stated_by_tag
        assert_swapping_targets(['', 'b', 'c'], '<a>b</a>c')
      end

      # Data finishing with a tag is expected to yield an empty trailing target.
      def test_run_with_data_ended_by_tag
        assert_swapping_targets(['a', 'b', ''], 'a<a>b</a>')
      end

      # A tag without content is expected to yield an empty middle target.
      def test_run_with_data_with_no_content_inside_tag
        assert_swapping_targets(['a', '', 'c'], 'a<a></a>c')
      end

      # A lone empty tag is expected to yield three empty targets.
      def test_run_with_data_with_tag_only
        assert_swapping_targets(['', '', ''], '<a></a>')
      end

      # Plain text is expected to yield itself as the only target.
      def test_run_with_data_without_tag
        assert_swapping_targets(['a'], 'a')
      end

      # Text inside a wovn-ignore tag is expected to be left out of the targets.
      def test_run_with_data_with_wovn_ignore
        assert_swapping_targets(%w[a c], 'a<a wovn-ignore>b</a>c')
      end

      # A <br> is expected to split the text into two targets.
      def test_run_with_data_with_closing_tag
        assert_swapping_targets(%w[a bc], 'a<br>bc')
      end

      # A <br> nested inside an <a> pair is expected to give four targets.
      def test_run_with_data_with_both_closing_tag_and_no_closing_tag
        assert_swapping_targets(%w[a b c d], 'a<a>b<br>c</a>d')
      end

      private

      # Wraps +data+ in a single-entry text index (empty source key, 'en'
      # language), runs the creator on it, and compares the resulting
      # 'swapping_targets' of that entry with +expected+.
      def assert_swapping_targets(expected, data)
        text_index = {
          '' => {
            'en' => [
              { 'data' => data }
            ]
          }
        }

        DstSwappingTargetsCreator.new(text_index).run!

        # Read back through the index, exactly where run! is expected to write.
        assert_equal(expected, text_index['']['en'][0]['swapping_targets'])
      end
    end
  end
end
@@ -0,0 +1,49 @@
require 'test_helper'

module Wovnrb
  module UnifiedValues
    # Pins the membership of each ElementCategory tag collection and checks
    # that the collections together account for every CONTENT_TYPES entry.
    class ElementCategoryTest < WovnMiniTest
      def test_contents
        assert_equal(116, ElementCategory::CONTENT_TYPES.size)
      end

      def test_inline_elements
        inline_tags = %w[a abbr b bdi bdo button cite code data dfn em i kbd label legend mark meter option q rb rp rt rtc s samp small span strong sub sup time u var]
        assert_same_elements(inline_tags, ElementCategory::INLINE_ELEMENTS.to_a)
      end

      def test_empty_elements
        empty_tags = %w[br param source track wbr input]
        assert_same_elements(empty_tags, ElementCategory::EMPTY_ELEMENTS.to_a)
      end

      def test_ignore_elements
        ignore_tags = %w[area audio canvas embed iframe img map meta object picture video]
        assert_same_elements(ignore_tags, ElementCategory::IGNORE_ELEMENTS.to_a)
      end

      def test_skip_elements
        skip_tags = %w[base link noscript script style template]
        assert_same_elements(skip_tags, ElementCategory::SKIP_ELEMENTS.to_a)
      end

      def test_skip_elements_without_attributes
        skip_without_attribute_tags = %w[textarea]
        assert_same_elements(skip_without_attribute_tags, ElementCategory::SKIP_ELEMENTS_WITHOUT_ATTRIBUTES.to_a)
      end

      def test_block_elements
        block_tags = %w[address article aside bb blockquote body caption col colgroup datalist dd del details dialog div dl dt fieldset figcaption figure footer form h1 h2 h3 h4 h5 h6 head header hgroup hr html ins li main menu nav ol optgroup output p pre progress ruby section select slot summary svg table tbody td tfoot th thead title tr ul]
        assert_same_elements(block_tags, ElementCategory::BLOCK_ELEMENTS.to_a)
      end

      # Both the combined collection and the sum of the individual sizes must
      # match the CONTENT_TYPES size.
      def test_no_duplication
        categories = [
          ElementCategory::INLINE_ELEMENTS,
          ElementCategory::EMPTY_ELEMENTS,
          ElementCategory::IGNORE_ELEMENTS,
          ElementCategory::SKIP_ELEMENTS,
          ElementCategory::SKIP_ELEMENTS_WITHOUT_ATTRIBUTES,
          ElementCategory::BLOCK_ELEMENTS
        ]
        # Left fold with +, i.e. the same a + b + c + ... chain spelled out.
        combined = categories.reduce(:+)
        summed_sizes = categories.sum(&:size)
        content_type_count = ElementCategory::CONTENT_TYPES.size

        assert_equal(combined.size, content_type_count)
        assert_equal(summed_sizes, content_type_count)
      end
    end
  end
end
@@ -0,0 +1,137 @@
require 'test_helper'

module Wovnrb
  module UnifiedValues
    # Checks the :swapping_targets that NodeSwappingTargetsCreator#run! stores
    # on a nodes_info entry built from nodes scraped by TextScraper.
    class NodeSwappingTargetsCreatorTest < WovnMiniTest
      def test_run
        nodes_info = scrape_nodes_info('a<a>b</a>c')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a b c], swapping_target_strings(nodes_info))
      end

      def test_run_with_data_stated_by_tag
        nodes_info = scrape_nodes_info('<a>b</a>c')
        nodes = nodes_info[0][:nodes]

        NodeSwappingTargetsCreator.new(nodes_info).run!
        # Created after run!, in front of the first scraped node.
        dummy = NodeSwappingTargetsCreator.new('').create_dummy_empty_text_node(next_node: nodes[0])

        assert_equal(['', 'b', 'c'], swapping_target_strings(nodes_info))

        # Writing to the dummy must be visible as the first node's previous sibling.
        dummy_text = 'a'
        dummy.content = dummy_text
        assert_equal(dummy_text, nodes[0].previous.to_s)
      end

      def test_run_with_data_ended_by_tag
        nodes_info = scrape_nodes_info('a<a>b</a>')
        nodes = nodes_info[0][:nodes]

        NodeSwappingTargetsCreator.new(nodes_info).run!
        # Created after run!, behind the last scraped node.
        dummy = NodeSwappingTargetsCreator.new('').create_dummy_empty_text_node(previous_node: nodes[-1])

        assert_equal(['a', 'b', ''], swapping_target_strings(nodes_info))

        # Writing to the dummy must be visible as the last node's next sibling.
        dummy_text = 'a'
        dummy.content = dummy_text
        assert_equal(dummy_text, nodes[-1].next.to_s)
      end

      def test_run_with_data_with_no_content_inside_tag
        nodes_info = scrape_nodes_info('a<a></a>c')
        nodes = nodes_info[0][:nodes]

        NodeSwappingTargetsCreator.new(nodes_info).run!
        # Created after run!, in front of the third scraped node.
        dummy = NodeSwappingTargetsCreator.new('').create_dummy_empty_text_node(next_node: nodes[2])

        assert_equal(['a', '', 'c'], swapping_target_strings(nodes_info))

        # Writing to the dummy must be visible as that node's previous sibling.
        dummy_text = 'a'
        dummy.content = dummy_text
        assert_equal(dummy_text, nodes[2].previous.to_s)
      end

      def test_run_with_data_without_tag
        nodes_info = scrape_nodes_info('a')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(['a'], swapping_target_strings(nodes_info))
      end

      def test_run_with_data_with_wovn_ignore
        nodes_info = scrape_nodes_info('a<a wovn-ignore>b</a>c')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a c], swapping_target_strings(nodes_info))
      end

      def test_run_with_data_with_closing_tag
        nodes_info = scrape_nodes_info('a<br>bc')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a bc], swapping_target_strings(nodes_info))
      end

      def test_run_with_data_with_both_closing_tag_and_no_closing_tag
        nodes_info = scrape_nodes_info('a<a>b<br>c</a>d')

        NodeSwappingTargetsCreator.new(nodes_info).run!
        assert_equal(%w[a b c d], swapping_target_strings(nodes_info))
      end

      private

      # Parses +html+ with Nokogiri::HTML5, scrapes it with a TextScraper built
      # from an empty Set, and wraps the first result's :nodes in the
      # single-entry array-of-hashes form handed to NodeSwappingTargetsCreator.
      def scrape_nodes_info(html)
        scraped = TextScraper.new(Set.new).run(Nokogiri::HTML5(html))
        [
          {
            nodes: scraped.first[:nodes]
          }
        ]
      end

      # String form of each swapping target stored on the first entry.
      def swapping_target_strings(nodes_info)
        nodes_info[0][:swapping_targets].map(&:to_s)
      end
    end
  end
end