wovnrb 1.0.13 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +1 -1
  3. data/lib/wovnrb.rb +7 -0
  4. data/lib/wovnrb/html_replacers/replacer_base.rb +2 -1
  5. data/lib/wovnrb/html_replacers/unified_values/dst_swapping_targets_creator.rb +76 -0
  6. data/lib/wovnrb/html_replacers/unified_values/element_category.rb +242 -0
  7. data/lib/wovnrb/html_replacers/unified_values/node_swapping_targets_creator.rb +134 -0
  8. data/lib/wovnrb/html_replacers/unified_values/text_replacer.rb +35 -0
  9. data/lib/wovnrb/html_replacers/unified_values/text_scraper.rb +152 -0
  10. data/lib/wovnrb/html_replacers/unified_values/values_stack.rb +65 -0
  11. data/lib/wovnrb/lang.rb +6 -1
  12. data/lib/wovnrb/services/value_agent.rb +9 -0
  13. data/lib/wovnrb/store.rb +2 -9
  14. data/lib/wovnrb/version.rb +1 -1
  15. data/test/fixtures/unified_values/site_html/simple_actual.html +96 -0
  16. data/test/fixtures/unified_values/site_html/simple_expected.json +251 -0
  17. data/test/fixtures/unified_values/site_html/wovn.io_actual.html +686 -0
  18. data/test/fixtures/unified_values/site_html/wovn.io_expected.json +543 -0
  19. data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_actual.html +1024 -0
  20. data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_expected.json +3345 -0
  21. data/test/fixtures/unified_values/small_html/block_inside_inline_actual.html +12 -0
  22. data/test/fixtures/unified_values/small_html/block_inside_inline_expected.json +22 -0
  23. data/test/fixtures/unified_values/small_html/br_tag_actual.html +10 -0
  24. data/test/fixtures/unified_values/small_html/br_tag_expected.json +12 -0
  25. data/test/fixtures/unified_values/small_html/comment_tag_actual.html +12 -0
  26. data/test/fixtures/unified_values/small_html/comment_tag_expected.json +10 -0
  27. data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_actual.html +7 -0
  28. data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_expected.json +11 -0
  29. data/test/fixtures/unified_values/small_html/deep_nested_block_actual.html +14 -0
  30. data/test/fixtures/unified_values/small_html/deep_nested_block_expected.json +8 -0
  31. data/test/fixtures/unified_values/small_html/deep_nested_inline_actual.html +20 -0
  32. data/test/fixtures/unified_values/small_html/deep_nested_inline_expected.json +20 -0
  33. data/test/fixtures/unified_values/small_html/empty_tag_actual.html +10 -0
  34. data/test/fixtures/unified_values/small_html/empty_tag_expected.json +12 -0
  35. data/test/fixtures/unified_values/small_html/empty_text_actual.html +12 -0
  36. data/test/fixtures/unified_values/small_html/empty_text_expected.json +1 -0
  37. data/test/fixtures/unified_values/small_html/ignore_tag_actual.html +12 -0
  38. data/test/fixtures/unified_values/small_html/ignore_tag_expected.json +16 -0
  39. data/test/fixtures/unified_values/small_html/ignored_class_actual.html +10 -0
  40. data/test/fixtures/unified_values/small_html/ignored_class_expected.json +13 -0
  41. data/test/fixtures/unified_values/small_html/img_actual.html +12 -0
  42. data/test/fixtures/unified_values/small_html/img_expected.json +23 -0
  43. data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_actual.html +10 -0
  44. data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_expected.json +16 -0
  45. data/test/fixtures/unified_values/small_html/nested_text_value_actual.html +10 -0
  46. data/test/fixtures/unified_values/small_html/nested_text_value_expected.json +12 -0
  47. data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_actual.html +10 -0
  48. data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_expected.json +14 -0
  49. data/test/fixtures/unified_values/small_html/option_tag_actual.html +9 -0
  50. data/test/fixtures/unified_values/small_html/option_tag_expected.json +13 -0
  51. data/test/fixtures/unified_values/small_html/text_different_inline_each_other_actual.html +10 -0
  52. data/test/fixtures/unified_values/small_html/text_different_inline_each_other_expected.json +22 -0
  53. data/test/fixtures/unified_values/small_html/text_in_svg_actual.html +9 -0
  54. data/test/fixtures/unified_values/small_html/text_in_svg_expected.json +8 -0
  55. data/test/fixtures/unified_values/small_html/text_with_html_entity_actual.html +6 -0
  56. data/test/fixtures/unified_values/small_html/text_with_html_entity_expected.json +8 -0
  57. data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_actual.html +12 -0
  58. data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_expected.json +24 -0
  59. data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_actual.html +12 -0
  60. data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_expected.json +14 -0
  61. data/test/fixtures/unified_values/small_html/wovn_ignore_actual.html +10 -0
  62. data/test/fixtures/unified_values/small_html/wovn_ignore_expected.json +13 -0
  63. data/test/lib/html_replacers/unified_values/dst_swapping_targets_creator_test.rb +137 -0
  64. data/test/lib/html_replacers/unified_values/element_category_test.rb +49 -0
  65. data/test/lib/html_replacers/unified_values/node_swapping_targets_creator_test.rb +137 -0
  66. data/test/lib/html_replacers/unified_values/text_replacer_test.rb +270 -0
  67. data/test/lib/html_replacers/unified_values/text_scraper_test.rb +121 -0
  68. data/test/lib/html_replacers/unified_values/values_stack_test.rb +122 -0
  69. data/test/lib/lang_test.rb +59 -1
  70. data/test/lib/services/value_agent_test.rb +32 -0
  71. data/test/test_helper.rb +18 -2
  72. data/wovnrb.gemspec +1 -0
  73. metadata +134 -7
  74. data/spec/spec_helper.rb +0 -2
  75. data/spec/wovnrb_spec.rb +0 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc6f97af8d6ae409297a399b58c35570e267ca11
4
- data.tar.gz: 46437e1724675d82d2618f82a622e75a2641be89
3
+ metadata.gz: e67d9a0a4d2f447222580c5d4e0a00995f8027fa
4
+ data.tar.gz: e8544f392fdf10c6c5101914cd7b721ed2e6c4e9
5
5
  SHA512:
6
- metadata.gz: 09b1f83b01bd0ea0ea754ce5bc4cc545b19d43c62280f6041bdb725d360cdcd42e4d35adc280e770f852d36c32fb3ffa69a223a9c192b2e48455ecfd92e12669
7
- data.tar.gz: 6121d05cd6d8710e5ac8825b926ce4ced7bbf0f1f5238aa8c29661c77a42d6037ef931098301626f74aac5b2082946973029e86d5f632067fc40c8f878ab8c2f
6
+ metadata.gz: f69d4ccb222bdfb97368d26c3585aaeb433c71d7b1a58b294d8b18cd098c8222149a1b41bb7912de82c88f7594e6d152600ca4e39167ed2f6987769761e2140c
7
+ data.tar.gz: 3e4ddf14d6c59a6eee023d878744c40a468874351ea27fe6138038506cef78529618950cf60c6af9670174dacd3ac98e12d70c2b306f54b927793cd7dec10bc7
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ require 'pry'
5
5
 
6
6
  Rake::TestTask.new do |t|
7
7
  t.libs << 'test'
8
- t.test_files = FileList['test/*/*.rb'] | FileList['test/*/*/*.rb']
8
+ t.test_files = FileList['test/**/*.rb']
9
9
  t.options = '-p'
10
10
  end
11
11
 
data/lib/wovnrb.rb CHANGED
@@ -3,6 +3,7 @@ require 'wovnrb/store'
3
3
  require 'wovnrb/headers'
4
4
  require 'wovnrb/lang'
5
5
  require 'nokogumbo'
6
+ require 'active_support'
6
7
  #require 'dom'
7
8
  require 'json'
8
9
  require 'wovnrb/helpers/nokogumbo_helper'
@@ -15,6 +16,12 @@ require 'wovnrb/html_replacers/meta_replacer'
15
16
  require 'wovnrb/html_replacers/input_replacer'
16
17
  require 'wovnrb/html_replacers/image_replacer'
17
18
  require 'wovnrb/html_replacers/script_replacer'
19
+ require 'wovnrb/html_replacers/unified_values/text_replacer'
20
+ require 'wovnrb/html_replacers/unified_values/text_scraper'
21
+ require 'wovnrb/html_replacers/unified_values/values_stack'
22
+ require 'wovnrb/html_replacers/unified_values/element_category'
23
+ require 'wovnrb/html_replacers/unified_values/dst_swapping_targets_creator'
24
+ require 'wovnrb/html_replacers/unified_values/node_swapping_targets_creator'
18
25
  require 'wovnrb/railtie' if defined?(Rails)
19
26
  require 'wovnrb/version'
20
27
 
@@ -1,7 +1,8 @@
1
1
  module Wovnrb
2
2
  class ReplacerBase
3
- def initialize(store)
3
+ def initialize(store, ignored_class_set = [])
4
4
  @store = store
5
+ @ignored_class_set = Set.new(ignored_class_set)
5
6
  end
6
7
 
7
8
  def replace(dom, lang)
@@ -0,0 +1,76 @@
1
+ module Wovnrb
2
+ module UnifiedValues
3
+ class DstSwappingTargetsCreator
4
+ # NOTE: `text_index` is the format like below
5
+ #
6
+ # {
7
+ # "<span>apple is a good</span>foods"=>
8
+ # {"ja" =>
9
+ # [{"xpath"=>"/html/body/div", "data"=>"りんごは<span>おいしい</span>たべものです"}]
10
+ # },
11
+ # "click<a>here</a>"=>
12
+ # {"ja" =>
13
+ # [{"xpath"=>"/html/body/div", "data"=>"<a>こちら</a>をクリックしてください"}]
14
+ # }
15
+ # }
16
+
17
+ def initialize(text_index)
18
+ @text_index = text_index
19
+ end
20
+
21
+ # NOTE: `run!` adds swapping targets to each entry, like below
22
+ #
23
+ # {
24
+ # "<span>apple is a good</span>foods"=>
25
+ # {"ja" =>
26
+ # [{"xpath"=>"/html/body/div", "data"=>"りんごは<span>おいしい</span>たべものです", 'swapping_targets'=>["りんごは", "おいしい", "たべものです"]}]
27
+ # },
28
+ # "click<a>here</a>"=>
29
+ # {"ja" =>
30
+ # [{"xpath"=>"/html/body/div", "data"=>"<a>こちら</a>をクリックしてください", 'swapping_targets'=>["", "こちら", "をクリックしてください"]}]
31
+ # }
32
+ # }
33
+
34
+ def run!
35
+ @text_index.each do |_, v|
36
+ mold = []
37
+ v.values.each do |values|
38
+ values.each do |value|
39
+ value['data'].split(/(<.+?>)/).each_with_index do |data, _index|
40
+ mold_size = mold.size
41
+ mold.push('') if mold_size.even? && data.start_with?('<')
42
+ mold.push(data)
43
+ end
44
+
45
+ mold.push('') if /\A<.+?>\z/ =~ mold.last
46
+
47
+ value['swapping_targets'] = remove_tag_element(mold)
48
+ end
49
+ end
50
+ end
51
+ end
52
+
53
+ private
54
+
55
+ def remove_tag_element(mold)
56
+ end_tag_of_wovn_ignore = nil
57
+ swapping_targets = []
58
+
59
+ mold.each do |value|
60
+ if end_tag_of_wovn_ignore.nil? && value =~ /\A<.*wovn-ignore>\z/
61
+ end_tag_of_wovn_ignore = "</#{value.gsub(' wovn-ignore', '')[1..-1]}"
62
+ next
63
+ end
64
+
65
+ end_tag_of_wovn_ignore = nil if value == end_tag_of_wovn_ignore
66
+
67
+ if end_tag_of_wovn_ignore.nil? && /\A<.+?>\z/ !~ value
68
+ swapping_targets << value
69
+ end
70
+ end
71
+
72
+ swapping_targets
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,242 @@
1
+ module Wovnrb
2
+ module UnifiedValues
3
+ module ElementCategory
4
+ FLOW_CONTENT = 1
5
+ PALPABLE_CONTENT = 2
6
+ PHRASING_CONTENT = 3
7
+ EMBEDDED_CONTENT = 4
8
+ INTERACTIVE_CONTENT = 5
9
+ METADATA_CONTENT = 6
10
+ HEADING_CONTENT = 7
11
+ SECTIONING_CONTENT = 8
12
+ SECTIONING_ROOT = 9
13
+ FORM_ASSOCIATED_ELEMENT = 10
14
+ SCRIPT_SUPPORTING_ELEMENT = 11
15
+ SVG_ELEMENT = 12
16
+
17
+ DOCUMENT_ELEMENT = 100
18
+ DOCUMENT_METADATA = 101
19
+ SECTION = 1010
20
+ GROUPING_CONTENT = 103
21
+ TEXT_LEVEL_SEMANTICS = 104
22
+ EDITS = 105
23
+ EMBEDDED_CONTENTS = 106
24
+ LINKS = 107
25
+ TABULAR_DATA = 108
26
+ FORMS = 109
27
+ INTERACTIVE_ELEMENTS = 110
28
+ SCRIPTING = 111
29
+ ESCAPABLE_RAW_TEXT_ELEMENT = 112
30
+
31
+ RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS = 200
32
+ RESTRICTED_TO_CHILD_OF_GROUPING = 201
33
+ RESTRICTED_TO_CHILD_OF_EMBEDDED = 202
34
+ RESTRICTED_TO_CHILD_OF_TABULAR = 203
35
+ RESTRICTED_TO_CHILD_OF_FORM = 204
36
+ RESTRICTED_TO_CHILD_OF_INTERACTIVE = 205
37
+
38
+ RESTRICTION_PARENT_ELEMENT = 300
39
+ FORM_OWNER = 301
40
+
41
+ # The references below describe every element.
42
+ # https://www.w3.org/TR/html5/
43
+ # https://dev.w3.org/html5/html-author/#the-elements
44
+ # https://html.spec.whatwg.org/#elements-2
45
+ #
46
+ # The reference below describes the default display of elements.
47
+ # https://html.spec.whatwg.org/#rendering
48
+
49
+ # This constant is for internal use. It SHOULD NOT be used outside this module.
50
+ CONTENT_TYPES = {
51
+ 'a' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
52
+ 'abbr' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
53
+ 'address' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
54
+ 'area' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT],
55
+ 'article' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
56
+ 'aside' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
57
+ 'audio' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
58
+ 'b' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
59
+ 'base' => [DOCUMENT_METADATA, METADATA_CONTENT],
60
+ 'bb' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT],
61
+ 'bdi' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
62
+ 'bdo' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
63
+ 'blockquote' => [GROUPING_CONTENT, FLOW_CONTENT, SECTIONING_ROOT, PALPABLE_CONTENT],
64
+ 'body' => [SECTION, SECTIONING_ROOT],
65
+ 'br' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT],
66
+ 'button' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
67
+ 'canvas' => [SCRIPTING, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENTS, PALPABLE_CONTENT],
68
+ 'caption' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
69
+ 'cite' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
70
+ 'code' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
71
+ 'col' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
72
+ 'colgroup' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
73
+ 'data' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
74
+ 'datalist' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, RESTRICTION_PARENT_ELEMENT],
75
+ 'dd' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
76
+ 'del' => [EDITS, FLOW_CONTENT, PHRASING_CONTENT],
77
+ 'details' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, SECTIONING_ROOT, INTERACTIVE_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
78
+ 'dfn' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
79
+ 'dialog' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, SECTIONING_ROOT],
80
+ 'div' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
81
+ 'dl' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
82
+ 'dt' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
83
+ 'em' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
84
+ 'embed' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
85
+ 'fieldset' => [FORMS, FLOW_CONTENT, SECTIONING_ROOT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
86
+ 'figcaption' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
87
+ 'figure' => [GROUPING_CONTENT, FLOW_CONTENT, SECTIONING_ROOT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
88
+ 'footer' => [SECTION, FLOW_CONTENT, PALPABLE_CONTENT],
89
+ 'form' => [FORMS, FLOW_CONTENT, PALPABLE_CONTENT, FORM_OWNER],
90
+ 'h1' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
91
+ 'h2' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
92
+ 'h3' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
93
+ 'h4' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
94
+ 'h5' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
95
+ 'h6' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
96
+ 'head' => [DOCUMENT_METADATA],
97
+ 'header' => [SECTIONING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
98
+ 'hgroup' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
99
+ 'hr' => [GROUPING_CONTENT, FLOW_CONTENT],
100
+ 'html' => [DOCUMENT_ELEMENT],
101
+ 'i' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
102
+ 'iframe' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
103
+ 'img' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, FORM_ASSOCIATED_ELEMENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
104
+ 'input' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
105
+ 'ins' => [EDITS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
106
+ 'kbd' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
107
+ 'label' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
108
+ 'legend' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM],
109
+ 'li' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
110
+ 'link' => [DOCUMENT_METADATA, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT],
111
+ 'main' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
112
+ 'map' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
113
+ 'mark' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
114
+ 'menu' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, INTERACTIVE_CONTENT],
115
+ 'meta' => [DOCUMENT_METADATA, METADATA_CONTENT],
116
+ 'meter' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
117
+ 'nav' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
118
+ 'noscript' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT],
119
+ 'object' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
120
+ 'ol' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
121
+ 'optgroup' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM, RESTRICTION_PARENT_ELEMENT],
122
+ 'option' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM],
123
+ 'output' => [FLOW_CONTENT, PHRASING_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
124
+ 'p' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
125
+ 'param' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
126
+ 'picture' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT],
127
+ 'pre' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
128
+ 'progress' => [FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
129
+ 'q' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
130
+ 'rb' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
131
+ 'rp' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
132
+ 'rt' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
133
+ 'rtc' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
134
+ 'ruby' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
135
+ 's' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
136
+ 'samp' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
137
+ 'script' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT, SCRIPT_SUPPORTING_ELEMENT],
138
+ 'section' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
139
+ 'select' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
140
+ 'slot' => [SCRIPTING, FLOW_CONTENT, PHRASING_CONTENT],
141
+ 'small' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
142
+ 'source' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
143
+ 'span' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
144
+ 'strong' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
145
+ 'style' => [DOCUMENT_METADATA, METADATA_CONTENT, FLOW_CONTENT],
146
+ 'sub' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
147
+ 'summary' => [INTERACTIVE_ELEMENTS, RESTRICTED_TO_CHILD_OF_INTERACTIVE],
148
+ 'sup' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
149
+ 'svg' => [SVG_ELEMENT],
150
+ 'table' => [TABULAR_DATA, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
151
+ 'tbody' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
152
+ 'td' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
153
+ 'template' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT, SCRIPT_SUPPORTING_ELEMENT],
154
+ 'textarea' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, ESCAPABLE_RAW_TEXT_ELEMENT],
155
+ 'tfoot' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
156
+ 'th' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
157
+ 'thead' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
158
+ 'time' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
159
+ 'title' => [DOCUMENT_METADATA, METADATA_CONTENT, ESCAPABLE_RAW_TEXT_ELEMENT],
160
+ 'tr' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
161
+ 'track' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
162
+ 'u' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
163
+ 'ul' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
164
+ 'var' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
165
+ 'video' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
166
+ 'wbr' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT]
167
+ }.freeze
168
+
169
+ # NOTE: Autonomous "custom element" belongs to
170
+ # [FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT]
171
+
172
+ VOID_ELEMENTS = Set.new(%w[area base br col embed hr img input link meta param source track wbr]).freeze
173
+
174
+ INLINE_ELEMENTS_INCLUDING_VOID = Set.new(CONTENT_TYPES.select do |_name, types|
175
+ # Split all table related elements.
176
+ next false if types.include?(TABULAR_DATA)
177
+
178
+ # RESTRICTION_PARENT_ELEMENT creates group for specific purpose.
179
+ next false if types.include?(RESTRICTION_PARENT_ELEMENT)
180
+
181
+ # Ignore meta elements
182
+ next false if types.include?(METADATA_CONTENT)
183
+
184
+ # Text-level contents are for decorating text like span
185
+ next true if types.include?(TEXT_LEVEL_SEMANTICS)
186
+
187
+ # Divide media to block
188
+ if types.include?(EMBEDDED_CONTENTS)
189
+ # Not divide content inside media
190
+ next true if types.include?(RESTRICTED_TO_CHILD_OF_EMBEDDED)
191
+ next false
192
+ end
193
+
194
+ if types.include?(FORMS)
195
+ # Divide form element creating section
196
+ next false if types.include?(SECTIONING_ROOT)
197
+
198
+ # Divide "FORM"
199
+ next false if types.include?(FORM_OWNER)
200
+
201
+ # Ignore contents inputted by user
202
+ next false if types.include?(ESCAPABLE_RAW_TEXT_ELEMENT)
203
+
204
+ # Otherwise are grouped as FORM
205
+ next true
206
+ end
207
+
208
+ false
209
+ end.keys).freeze
210
+
211
+ INLINE_ELEMENTS = (INLINE_ELEMENTS_INCLUDING_VOID - VOID_ELEMENTS).freeze
212
+ EMPTY_ELEMENTS = (INLINE_ELEMENTS_INCLUDING_VOID & VOID_ELEMENTS).freeze
213
+
214
+ # IGNORE_ELEMENTS doesn't scrape contents inside the tags
215
+ IGNORE_ELEMENTS = Set.new(CONTENT_TYPES.select do |name, types|
216
+ # meta should be ignored but require swapping.
217
+ next true if name == 'meta'
218
+ next true if types.include?(EMBEDDED_CONTENTS) && !types.include?(RESTRICTED_TO_CHILD_OF_EMBEDDED)
219
+ end.keys).freeze
220
+
221
+ # SKIP_ELEMENTS doesn't scrape and swap contents
222
+ SKIP_ELEMENTS = Set.new(CONTENT_TYPES.select do |name, types|
223
+ # iterate title to get content
224
+ next false if name == 'title'
225
+ next false if name == 'meta'
226
+
227
+ next true if types.include?(METADATA_CONTENT)
228
+ end.keys).freeze
229
+
230
+ # SKIP_ELEMENTS_WITHOUT_ATTRIBUTES doesn't scrape and swap contents, but it does scrape and swap the tag's attributes like `placeholder`
231
+ SKIP_ELEMENTS_WITHOUT_ATTRIBUTES = Set.new(CONTENT_TYPES.select do |name, types|
232
+ # iterate title to get content
233
+ next false if name == 'title'
234
+
235
+ # Ignore contents inputted by user
236
+ next true if types.include?(ESCAPABLE_RAW_TEXT_ELEMENT)
237
+ end.keys).freeze
238
+
239
+ BLOCK_ELEMENTS = (Set.new(CONTENT_TYPES.keys) - INLINE_ELEMENTS - EMPTY_ELEMENTS - IGNORE_ELEMENTS - SKIP_ELEMENTS - SKIP_ELEMENTS_WITHOUT_ATTRIBUTES).freeze
240
+ end
241
+ end
242
+ end
@@ -0,0 +1,134 @@
1
+ module Wovnrb
2
+ module UnifiedValues
3
+ class NodeSwappingTargetsCreator
4
+ # NOTE: `nodes_info` is the format like below
5
+ #
6
+ # [
7
+ # {:dst=>"an<span>apple is a good</span>foods",
8
+ # :nodes=>
9
+ # [
10
+ # (Text "an"),
11
+ # (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
12
+ # (Text "apple is a good"),
13
+ # (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
14
+ # (Text "\n foods\n \n \n")]}
15
+ # ]
16
+ # }
17
+ # ]
18
+ def initialize(nodes_info)
19
+ @nodes_info = nodes_info
20
+ end
21
+
22
+ # NOTE: `run!` adds swapping_targets to each entry, like below
23
+ #
24
+ # [
25
+ # {:dst=>"an<span>apple is a good</span>foods",
26
+ # :nodes=>
27
+ # [
28
+ # (Text "an"),
29
+ # (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
30
+ # (Text "apple is a good"),
31
+ # (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
32
+ # (Text "\n foods\n \n \n")]}
33
+ # ]
34
+ # :swapping_targets=>
35
+ # [
36
+ # (Text "an"),
37
+ # (Text "apple is a good"),
38
+ # (Text "\n foods\n \n \n")]}
39
+ # ]
40
+ # }
41
+ # ]
42
+
43
+ def run!
44
+ @nodes_info.each do |node_info|
45
+ mold = []
46
+ node_info[:nodes].each do |node|
47
+ mold_size = mold.size
48
+ mold.push create_dummy_empty_text_node(next_node: node) if mold_size.even? && node.element?
49
+ mold.push node
50
+ end
51
+
52
+ mold.push create_dummy_empty_text_node(previous_node: mold.last) if mold.last.element?
53
+ node_info[:swapping_targets] = remove_tag_element(mold)
54
+ end
55
+ end
56
+
57
+ def remove_tag_element(mold)
58
+ id_of_tag_with_wovn_ignore = nil
59
+ swapping_targets = []
60
+ mold.each do |node|
61
+ if id_of_tag_with_wovn_ignore.nil? && node.attributes.keys.include?('wovn-ignore')
62
+ id_of_tag_with_wovn_ignore = node.object_id
63
+ next
64
+ end
65
+
66
+ if node.object_id == id_of_tag_with_wovn_ignore
67
+ id_of_tag_with_wovn_ignore = nil
68
+ end
69
+
70
+ if id_of_tag_with_wovn_ignore.nil? && node.text?
71
+ swapping_targets << node
72
+ end
73
+ end
74
+
75
+ swapping_targets
76
+ end
77
+
78
+ def create_dummy_empty_text_node(option)
79
+ DummyEmpryTextNode.new(option)
80
+ end
81
+
82
+ class DummyEmpryTextNode
83
+ attr_reader :name
84
+
85
+ def initialize(next_node: nil, previous_node: nil)
86
+ @name = 'text'
87
+ @next_node = next_node
88
+ @previous_node = previous_node
89
+ @added_empty_text = nil
90
+ end
91
+
92
+ def text?
93
+ true
94
+ end
95
+
96
+ def content
97
+ ''
98
+ end
99
+
100
+ def to_s
101
+ content
102
+ end
103
+
104
+ def attributes
105
+ {}
106
+ end
107
+
108
+ def parent
109
+ @next_node.try(:parent) || @previous_node.try(:parent)
110
+ end
111
+
112
+ def document
113
+ @next_node.try(:document) || @previous_node.try(:document)
114
+ end
115
+
116
+ def add_previous_sibling(comment_node)
117
+ @added_empty_text.try(:add_previous_sibling, comment_node)
118
+ end
119
+
120
+ def content=(text)
121
+ return if text == ''
122
+
123
+ if @next_node
124
+ @next_node.add_previous_sibling(text)
125
+ @added_empty_text = @next_node.previous
126
+ elsif @previous_node
127
+ @previous_node.add_next_sibling(text)
128
+ @added_empty_text = @previous_node.next
129
+ end
130
+ end
131
+ end
132
+ end
133
+ end
134
+ end