wovnrb 1.0.13 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/Rakefile +1 -1
  3. data/lib/wovnrb.rb +7 -0
  4. data/lib/wovnrb/html_replacers/replacer_base.rb +2 -1
  5. data/lib/wovnrb/html_replacers/unified_values/dst_swapping_targets_creator.rb +76 -0
  6. data/lib/wovnrb/html_replacers/unified_values/element_category.rb +242 -0
  7. data/lib/wovnrb/html_replacers/unified_values/node_swapping_targets_creator.rb +134 -0
  8. data/lib/wovnrb/html_replacers/unified_values/text_replacer.rb +35 -0
  9. data/lib/wovnrb/html_replacers/unified_values/text_scraper.rb +152 -0
  10. data/lib/wovnrb/html_replacers/unified_values/values_stack.rb +65 -0
  11. data/lib/wovnrb/lang.rb +6 -1
  12. data/lib/wovnrb/services/value_agent.rb +9 -0
  13. data/lib/wovnrb/store.rb +2 -9
  14. data/lib/wovnrb/version.rb +1 -1
  15. data/test/fixtures/unified_values/site_html/simple_actual.html +96 -0
  16. data/test/fixtures/unified_values/site_html/simple_expected.json +251 -0
  17. data/test/fixtures/unified_values/site_html/wovn.io_actual.html +686 -0
  18. data/test/fixtures/unified_values/site_html/wovn.io_expected.json +543 -0
  19. data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_actual.html +1024 -0
  20. data/test/fixtures/unified_values/site_html/www.yahoo.co.jp_expected.json +3345 -0
  21. data/test/fixtures/unified_values/small_html/block_inside_inline_actual.html +12 -0
  22. data/test/fixtures/unified_values/small_html/block_inside_inline_expected.json +22 -0
  23. data/test/fixtures/unified_values/small_html/br_tag_actual.html +10 -0
  24. data/test/fixtures/unified_values/small_html/br_tag_expected.json +12 -0
  25. data/test/fixtures/unified_values/small_html/comment_tag_actual.html +12 -0
  26. data/test/fixtures/unified_values/small_html/comment_tag_expected.json +10 -0
  27. data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_actual.html +7 -0
  28. data/test/fixtures/unified_values/small_html/complex_text_with_html_entity_expected.json +11 -0
  29. data/test/fixtures/unified_values/small_html/deep_nested_block_actual.html +14 -0
  30. data/test/fixtures/unified_values/small_html/deep_nested_block_expected.json +8 -0
  31. data/test/fixtures/unified_values/small_html/deep_nested_inline_actual.html +20 -0
  32. data/test/fixtures/unified_values/small_html/deep_nested_inline_expected.json +20 -0
  33. data/test/fixtures/unified_values/small_html/empty_tag_actual.html +10 -0
  34. data/test/fixtures/unified_values/small_html/empty_tag_expected.json +12 -0
  35. data/test/fixtures/unified_values/small_html/empty_text_actual.html +12 -0
  36. data/test/fixtures/unified_values/small_html/empty_text_expected.json +1 -0
  37. data/test/fixtures/unified_values/small_html/ignore_tag_actual.html +12 -0
  38. data/test/fixtures/unified_values/small_html/ignore_tag_expected.json +16 -0
  39. data/test/fixtures/unified_values/small_html/ignored_class_actual.html +10 -0
  40. data/test/fixtures/unified_values/small_html/ignored_class_expected.json +13 -0
  41. data/test/fixtures/unified_values/small_html/img_actual.html +12 -0
  42. data/test/fixtures/unified_values/small_html/img_expected.json +23 -0
  43. data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_actual.html +10 -0
  44. data/test/fixtures/unified_values/small_html/nested_and_complex_wovn_ignore_expected.json +16 -0
  45. data/test/fixtures/unified_values/small_html/nested_text_value_actual.html +10 -0
  46. data/test/fixtures/unified_values/small_html/nested_text_value_expected.json +12 -0
  47. data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_actual.html +10 -0
  48. data/test/fixtures/unified_values/small_html/nested_text_value_mixed_plan_text_expected.json +14 -0
  49. data/test/fixtures/unified_values/small_html/option_tag_actual.html +9 -0
  50. data/test/fixtures/unified_values/small_html/option_tag_expected.json +13 -0
  51. data/test/fixtures/unified_values/small_html/text_different_inline_each_other_actual.html +10 -0
  52. data/test/fixtures/unified_values/small_html/text_different_inline_each_other_expected.json +22 -0
  53. data/test/fixtures/unified_values/small_html/text_in_svg_actual.html +9 -0
  54. data/test/fixtures/unified_values/small_html/text_in_svg_expected.json +8 -0
  55. data/test/fixtures/unified_values/small_html/text_with_html_entity_actual.html +6 -0
  56. data/test/fixtures/unified_values/small_html/text_with_html_entity_expected.json +8 -0
  57. data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_actual.html +12 -0
  58. data/test/fixtures/unified_values/small_html/unknown_or_custom_tag_expected.json +24 -0
  59. data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_actual.html +12 -0
  60. data/test/fixtures/unified_values/small_html/unnecessay_top_end_tag_expected.json +14 -0
  61. data/test/fixtures/unified_values/small_html/wovn_ignore_actual.html +10 -0
  62. data/test/fixtures/unified_values/small_html/wovn_ignore_expected.json +13 -0
  63. data/test/lib/html_replacers/unified_values/dst_swapping_targets_creator_test.rb +137 -0
  64. data/test/lib/html_replacers/unified_values/element_category_test.rb +49 -0
  65. data/test/lib/html_replacers/unified_values/node_swapping_targets_creator_test.rb +137 -0
  66. data/test/lib/html_replacers/unified_values/text_replacer_test.rb +270 -0
  67. data/test/lib/html_replacers/unified_values/text_scraper_test.rb +121 -0
  68. data/test/lib/html_replacers/unified_values/values_stack_test.rb +122 -0
  69. data/test/lib/lang_test.rb +59 -1
  70. data/test/lib/services/value_agent_test.rb +32 -0
  71. data/test/test_helper.rb +18 -2
  72. data/wovnrb.gemspec +1 -0
  73. metadata +134 -7
  74. data/spec/spec_helper.rb +0 -2
  75. data/spec/wovnrb_spec.rb +0 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc6f97af8d6ae409297a399b58c35570e267ca11
4
- data.tar.gz: 46437e1724675d82d2618f82a622e75a2641be89
3
+ metadata.gz: e67d9a0a4d2f447222580c5d4e0a00995f8027fa
4
+ data.tar.gz: e8544f392fdf10c6c5101914cd7b721ed2e6c4e9
5
5
  SHA512:
6
- metadata.gz: 09b1f83b01bd0ea0ea754ce5bc4cc545b19d43c62280f6041bdb725d360cdcd42e4d35adc280e770f852d36c32fb3ffa69a223a9c192b2e48455ecfd92e12669
7
- data.tar.gz: 6121d05cd6d8710e5ac8825b926ce4ced7bbf0f1f5238aa8c29661c77a42d6037ef931098301626f74aac5b2082946973029e86d5f632067fc40c8f878ab8c2f
6
+ metadata.gz: f69d4ccb222bdfb97368d26c3585aaeb433c71d7b1a58b294d8b18cd098c8222149a1b41bb7912de82c88f7594e6d152600ca4e39167ed2f6987769761e2140c
7
+ data.tar.gz: 3e4ddf14d6c59a6eee023d878744c40a468874351ea27fe6138038506cef78529618950cf60c6af9670174dacd3ac98e12d70c2b306f54b927793cd7dec10bc7
data/Rakefile CHANGED
@@ -5,7 +5,7 @@ require 'pry'
5
5
 
6
6
  Rake::TestTask.new do |t|
7
7
  t.libs << 'test'
8
- t.test_files = FileList['test/*/*.rb'] | FileList['test/*/*/*.rb']
8
+ t.test_files = FileList['test/**/*.rb']
9
9
  t.options = '-p'
10
10
  end
11
11
 
data/lib/wovnrb.rb CHANGED
@@ -3,6 +3,7 @@ require 'wovnrb/store'
3
3
  require 'wovnrb/headers'
4
4
  require 'wovnrb/lang'
5
5
  require 'nokogumbo'
6
+ require 'active_support'
6
7
  #require 'dom'
7
8
  require 'json'
8
9
  require 'wovnrb/helpers/nokogumbo_helper'
@@ -15,6 +16,12 @@ require 'wovnrb/html_replacers/meta_replacer'
15
16
  require 'wovnrb/html_replacers/input_replacer'
16
17
  require 'wovnrb/html_replacers/image_replacer'
17
18
  require 'wovnrb/html_replacers/script_replacer'
19
+ require 'wovnrb/html_replacers/unified_values/text_replacer'
20
+ require 'wovnrb/html_replacers/unified_values/text_scraper'
21
+ require 'wovnrb/html_replacers/unified_values/values_stack'
22
+ require 'wovnrb/html_replacers/unified_values/element_category'
23
+ require 'wovnrb/html_replacers/unified_values/dst_swapping_targets_creator'
24
+ require 'wovnrb/html_replacers/unified_values/node_swapping_targets_creator'
18
25
  require 'wovnrb/railtie' if defined?(Rails)
19
26
  require 'wovnrb/version'
20
27
 
@@ -1,7 +1,8 @@
1
1
  module Wovnrb
2
2
  class ReplacerBase
3
- def initialize(store)
3
+ def initialize(store, ignored_class_set = [])
4
4
  @store = store
5
+ @ignored_class_set = Set.new(ignored_class_set)
5
6
  end
6
7
 
7
8
  def replace(dom, lang)
@@ -0,0 +1,76 @@
module Wovnrb
  module UnifiedValues
    # Annotates every translated value of a text index with the list of plain
    # text pieces ("swapping targets") found in its 'data' string.
    #
    # NOTE: `text_index` is the format like below
    #
    #   {
    #     "<span>apple is a good</span>foods"=>
    #       {"ja" =>
    #         [{"xpath"=>"/html/body/div", "data"=>"りんごは<span>おいしい</span>たべものです"}]
    #       },
    #     "click<a>here</a>"=>
    #       {"ja" =>
    #         [{"xpath"=>"/html/body/div", "data"=>"<a>こちら</a>をクリックしてください"}]
    #       }
    #   }
    class DstSwappingTargetsCreator
      # Splits a translation into text and tag pieces; the capture group keeps
      # the tags in the result.
      TAG_SPLITTER = /(<.+?>)/

      # Matches a piece that consists of exactly one tag.
      WHOLE_TAG = /\A<.+?>\z/

      # @param text_index [Hash] source text => { lang => [value hashes] };
      #   every value hash must carry a 'data' string.
      def initialize(text_index)
        @text_index = text_index
      end

      # NOTE: `run!` adds 'swapping_targets' to each value hash, like below
      #
      #   {
      #     "<span>apple is a good</span>foods"=>
      #       {"ja" =>
      #         [{"xpath"=>"/html/body/div",
      #           "data"=>"りんごは<span>おいしい</span>たべものです",
      #           "swapping_targets"=>["りんごは", "おいしい", "たべものです"]}]
      #       },
      #     "click<a>here</a>"=>
      #       {"ja" =>
      #         [{"xpath"=>"/html/body/div",
      #           "data"=>"<a>こちら</a>をクリックしてください",
      #           "swapping_targets"=>["", "こちら", "をクリックしてください"]}]
      #       }
      #   }
      #
      # The value hashes are mutated in place. Each value gets its own list:
      # pieces of one value never leak into the targets of another value or
      # another language.
      #
      # @return [Hash] the text index that was passed to the constructor
      def run!
        @text_index.each_value do |translations|
          translations.each_value do |values|
            values.each do |value|
              value['swapping_targets'] = remove_tag_element(build_mold(value['data']))
            end
          end
        end
      end

      private

      # Builds the alternating "text, tag, text, tag, ..., text" list for one
      # translation string, inserting empty strings where a text slot is missing.
      def build_mold(data)
        mold = []

        data.split(TAG_SPLITTER).each do |piece|
          # A tag may only sit in an odd slot. Only pad for real tags so text
          # that merely starts with '<' does not get a spurious empty target.
          mold.push('') if mold.size.even? && WHOLE_TAG =~ piece
          mold.push(piece)
        end

        # String#split drops trailing empty strings, so a translation ending
        # with a tag needs its final (empty) text slot restored.
        mold.push('') if WHOLE_TAG =~ mold.last

        mold
      end

      # Returns the text pieces of `mold`, dropping all tags and everything
      # between a tag ending in "wovn-ignore>" and its matching end tag.
      def remove_tag_element(mold)
        end_tag_of_wovn_ignore = nil
        swapping_targets = []

        mold.each do |value|
          if end_tag_of_wovn_ignore.nil? && value =~ /\A<.*wovn-ignore>\z/
            # "<span wovn-ignore>" => "</span>"
            end_tag_of_wovn_ignore = "</#{value.gsub(' wovn-ignore', '')[1..-1]}"
            next
          end

          end_tag_of_wovn_ignore = nil if value == end_tag_of_wovn_ignore

          if end_tag_of_wovn_ignore.nil? && WHOLE_TAG !~ value
            swapping_targets << value
          end
        end

        swapping_targets
      end
    end
  end
end
@@ -0,0 +1,242 @@
# Set is used while the constants below are being defined, so it must be
# available as soon as this file is loaded on its own.
require 'set'

module Wovnrb
  module UnifiedValues
    # Classifies HTML element names into the groups used when splitting a page
    # into text values: inline, empty (void inline), block, ignored and skipped.
    #
    # The integer constants are opaque tags; the code in this module only tests
    # them for membership in the per-element lists of CONTENT_TYPES.
    module ElementCategory
      FLOW_CONTENT = 1
      PALPABLE_CONTENT = 2
      PHRASING_CONTENT = 3
      EMBEDDED_CONTENT = 4
      INTERACTIVE_CONTENT = 5
      METADATA_CONTENT = 6
      HEADING_CONTENT = 7
      SECTIONING_CONTENT = 8
      SECTIONING_ROOT = 9
      FORM_ASSOCIATED_ELEMENT = 10
      SCRIPT_SUPPORTING_ELEMENT = 11
      SVG_ELEMENT = 12

      DOCUMENT_ELEMENT = 100
      DOCUMENT_METADATA = 101
      # NOTE(review): 1010 breaks the 100..112 sequence and was presumably meant
      # to be 102. Within this module only uniqueness matters, so it is left
      # untouched - confirm nobody relies on the value before changing it.
      SECTION = 1010
      GROUPING_CONTENT = 103
      TEXT_LEVEL_SEMANTICS = 104
      EDITS = 105
      EMBEDDED_CONTENTS = 106
      LINKS = 107
      TABULAR_DATA = 108
      FORMS = 109
      INTERACTIVE_ELEMENTS = 110
      SCRIPTING = 111
      ESCAPABLE_RAW_TEXT_ELEMENT = 112

      RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS = 200
      RESTRICTED_TO_CHILD_OF_GROUPING = 201
      RESTRICTED_TO_CHILD_OF_EMBEDDED = 202
      RESTRICTED_TO_CHILD_OF_TABULAR = 203
      RESTRICTED_TO_CHILD_OF_FORM = 204
      RESTRICTED_TO_CHILD_OF_INTERACTIVE = 205

      RESTRICTION_PARENT_ELEMENT = 300
      FORM_OWNER = 301

      # The references below describe every element.
      # https://www.w3.org/TR/html5/
      # https://dev.w3.org/html5/html-author/#the-elements
      # https://html.spec.whatwg.org/#elements-2
      #
      # The reference below describes the default display of elements.
      # https://html.spec.whatwg.org/#rendering

      # Element name => list of category tags.
      # This constant is for internal use. It SHOULD NOT be used outside this module.
      CONTENT_TYPES = {
        'a' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'abbr' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'address' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'area' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT],
        'article' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'aside' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'audio' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'b' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'base' => [DOCUMENT_METADATA, METADATA_CONTENT],
        'bb' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT],
        'bdi' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'bdo' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'blockquote' => [GROUPING_CONTENT, FLOW_CONTENT, SECTIONING_ROOT, PALPABLE_CONTENT],
        'body' => [SECTION, SECTIONING_ROOT],
        'br' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT],
        'button' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'canvas' => [SCRIPTING, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENTS, PALPABLE_CONTENT],
        'caption' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'cite' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'code' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'col' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'colgroup' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'data' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'datalist' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'dd' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'del' => [EDITS, FLOW_CONTENT, PHRASING_CONTENT],
        'details' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, SECTIONING_ROOT, INTERACTIVE_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'dfn' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'dialog' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, SECTIONING_ROOT],
        'div' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'dl' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'dt' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'em' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'embed' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'fieldset' => [FORMS, FLOW_CONTENT, SECTIONING_ROOT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'figcaption' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'figure' => [GROUPING_CONTENT, FLOW_CONTENT, SECTIONING_ROOT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'footer' => [SECTION, FLOW_CONTENT, PALPABLE_CONTENT],
        'form' => [FORMS, FLOW_CONTENT, PALPABLE_CONTENT, FORM_OWNER],
        'h1' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h2' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h3' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h4' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h5' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'h6' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'head' => [DOCUMENT_METADATA],
        'header' => [SECTIONING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'hgroup' => [SECTION, FLOW_CONTENT, HEADING_CONTENT, PALPABLE_CONTENT],
        'hr' => [GROUPING_CONTENT, FLOW_CONTENT],
        'html' => [DOCUMENT_ELEMENT],
        'i' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'iframe' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'img' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, FORM_ASSOCIATED_ELEMENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'input' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'ins' => [EDITS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'kbd' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'label' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'legend' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM],
        'li' => [GROUPING_CONTENT, RESTRICTED_TO_CHILD_OF_GROUPING],
        'link' => [DOCUMENT_METADATA, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT],
        'main' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'map' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'mark' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'menu' => [INTERACTIVE_ELEMENTS, FLOW_CONTENT, INTERACTIVE_CONTENT],
        'meta' => [DOCUMENT_METADATA, METADATA_CONTENT],
        'meter' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'nav' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'noscript' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT],
        'object' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'ol' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'optgroup' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM, RESTRICTION_PARENT_ELEMENT],
        'option' => [FORMS, RESTRICTED_TO_CHILD_OF_FORM],
        'output' => [FLOW_CONTENT, PHRASING_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT],
        'p' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'param' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
        'picture' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT],
        'pre' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT],
        'progress' => [FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'q' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'rb' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'rp' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'rt' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'rtc' => [TEXT_LEVEL_SEMANTICS, RESTRICTED_TO_CHILD_OF_TEXT_LEVEL_SEMANTICS],
        'ruby' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        's' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'samp' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'script' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT, SCRIPT_SUPPORTING_ELEMENT],
        'section' => [SECTION, FLOW_CONTENT, SECTIONING_CONTENT, PALPABLE_CONTENT],
        'select' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'slot' => [SCRIPTING, FLOW_CONTENT, PHRASING_CONTENT],
        'small' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'source' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
        'span' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'strong' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'style' => [DOCUMENT_METADATA, METADATA_CONTENT, FLOW_CONTENT],
        'sub' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'summary' => [INTERACTIVE_ELEMENTS, RESTRICTED_TO_CHILD_OF_INTERACTIVE],
        'sup' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'svg' => [SVG_ELEMENT],
        'table' => [TABULAR_DATA, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'tbody' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'td' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'template' => [SCRIPTING, METADATA_CONTENT, FLOW_CONTENT, PHRASING_CONTENT, SCRIPT_SUPPORTING_ELEMENT],
        'textarea' => [FORMS, FLOW_CONTENT, PHRASING_CONTENT, INTERACTIVE_CONTENT, FORM_ASSOCIATED_ELEMENT, PALPABLE_CONTENT, ESCAPABLE_RAW_TEXT_ELEMENT],
        'tfoot' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'th' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR],
        'thead' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'time' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'title' => [DOCUMENT_METADATA, METADATA_CONTENT, ESCAPABLE_RAW_TEXT_ELEMENT],
        'tr' => [TABULAR_DATA, RESTRICTED_TO_CHILD_OF_TABULAR, RESTRICTION_PARENT_ELEMENT],
        'track' => [EMBEDDED_CONTENTS, RESTRICTED_TO_CHILD_OF_EMBEDDED],
        'u' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'ul' => [GROUPING_CONTENT, FLOW_CONTENT, PALPABLE_CONTENT, RESTRICTION_PARENT_ELEMENT],
        'var' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT],
        'video' => [EMBEDDED_CONTENTS, FLOW_CONTENT, PHRASING_CONTENT, EMBEDDED_CONTENT, INTERACTIVE_CONTENT, PALPABLE_CONTENT],
        'wbr' => [TEXT_LEVEL_SEMANTICS, FLOW_CONTENT, PHRASING_CONTENT]
      }.freeze

      # NOTE: Autonomous "custom element" belongs to
      #       [FLOW_CONTENT, PHRASING_CONTENT, PALPABLE_CONTENT]

      VOID_ELEMENTS = Set.new(%w[area base br col embed hr img input link meta param source track wbr]).freeze

      # Elements treated as inline. The rules are evaluated top to bottom and
      # the first matching rule decides.
      INLINE_ELEMENTS_INCLUDING_VOID = Set.new(CONTENT_TYPES.select do |_name, types|
        # Split all table related elements.
        next false if types.include?(TABULAR_DATA)

        # RESTRICTION_PARENT_ELEMENT creates group for specific purpose.
        next false if types.include?(RESTRICTION_PARENT_ELEMENT)

        # Ignore meta elements
        next false if types.include?(METADATA_CONTENT)

        # Text-level contents are for decorating text like span
        next true if types.include?(TEXT_LEVEL_SEMANTICS)

        # Divide media to block
        if types.include?(EMBEDDED_CONTENTS)
          # Not divide content inside media
          next true if types.include?(RESTRICTED_TO_CHILD_OF_EMBEDDED)
          next false
        end

        if types.include?(FORMS)
          # Divide form element creating section
          next false if types.include?(SECTIONING_ROOT)

          # Divide "FORM"
          next false if types.include?(FORM_OWNER)

          # Ignore contents inputted by user
          next false if types.include?(ESCAPABLE_RAW_TEXT_ELEMENT)

          # Otherwise are grouped as FORM
          next true
        end

        false
      end.keys).freeze

      # Inline elements that can have content, and inline elements that cannot.
      INLINE_ELEMENTS = (INLINE_ELEMENTS_INCLUDING_VOID - VOID_ELEMENTS).freeze
      EMPTY_ELEMENTS = (INLINE_ELEMENTS_INCLUDING_VOID & VOID_ELEMENTS).freeze

      # IGNORE_ELEMENTS doesn't scrape contents inside the tags
      IGNORE_ELEMENTS = Set.new(CONTENT_TYPES.select do |name, types|
        # meta should be ignored but require swapping.
        next true if name == 'meta'
        next true if types.include?(EMBEDDED_CONTENTS) && !types.include?(RESTRICTED_TO_CHILD_OF_EMBEDDED)
      end.keys).freeze

      # SKIP_ELEMENTS doesn't scrape and swap contents
      SKIP_ELEMENTS = Set.new(CONTENT_TYPES.select do |name, types|
        # iterate title to get content
        next false if name == 'title'
        next false if name == 'meta'

        next true if types.include?(METADATA_CONTENT)
      end.keys).freeze

      # SKIP_ELEMENTS_WITHOUT_ATTRIBUTES doesn't scrape and swap contents,
      # but it scrapes and swaps the tag's attributes like `placeholder`.
      SKIP_ELEMENTS_WITHOUT_ATTRIBUTES = Set.new(CONTENT_TYPES.select do |name, types|
        # iterate title to get content
        next false if name == 'title'

        # Ignore contents inputted by user
        next true if types.include?(ESCAPABLE_RAW_TEXT_ELEMENT)
      end.keys).freeze

      # Every known element that falls into none of the groups above.
      BLOCK_ELEMENTS = (Set.new(CONTENT_TYPES.keys) - INLINE_ELEMENTS - EMPTY_ELEMENTS - IGNORE_ELEMENTS - SKIP_ELEMENTS - SKIP_ELEMENTS_WITHOUT_ATTRIBUTES).freeze
    end
  end
end
@@ -0,0 +1,134 @@
module Wovnrb
  module UnifiedValues
    # Derives, for every entry of `nodes_info`, the list of text nodes
    # ("swapping targets") from the entry's flat node list.
    #
    # NOTE: `nodes_info` is the format like below
    #
    #   [
    #     {:dst=>"an<span>apple is a good</span>foods",
    #      :nodes=>
    #       [
    #         (Text "an"),
    #         (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
    #         (Text "apple is a good"),
    #         (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
    #         (Text "\n foods\n \n \n")
    #       ]
    #     }
    #   ]
    #
    # An element appears twice in :nodes (the same object at its opening and
    # closing position), as the example shows.
    class NodeSwappingTargetsCreator
      # @param nodes_info [Array<Hash>] entries carrying a :nodes list; nodes
      #   must respond to `element?`, `text?` and `attributes`.
      def initialize(nodes_info)
        @nodes_info = nodes_info
      end

      # NOTE: `run!` adds :swapping_targets to each entry, like below
      #
      #   [
      #     {:dst=>"an<span>apple is a good</span>foods",
      #      :nodes=>
      #       [
      #         (Text "an"),
      #         (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
      #         (Text "apple is a good"),
      #         (Element:0x13e84e1334 { name = "span", children = [ #(Text "apple is a good")] }),
      #         (Text "\n foods\n \n \n")
      #       ],
      #      :swapping_targets=>
      #       [
      #         (Text "an"),
      #         (Text "apple is a good"),
      #         (Text "\n foods\n \n \n")
      #       ]
      #     }
      #   ]
      #
      # Where a text slot is missing (an element at the very start or end, or
      # two elements in a row) an empty placeholder text node is inserted.
      # An entry with an empty :nodes list gets an empty :swapping_targets list.
      #
      # @return [Array<Hash>] the nodes_info that was passed to the constructor
      def run!
        @nodes_info.each do |node_info|
          mold = []
          node_info[:nodes].each do |node|
            # Even slots must hold text; pad when an element shows up there.
            mold.push create_dummy_empty_text_node(next_node: node) if mold.size.even? && node.element?
            mold.push node
          end

          # Guard against an empty node list, where there is no last node.
          last_node = mold.last
          mold.push create_dummy_empty_text_node(previous_node: last_node) if last_node && last_node.element?
          node_info[:swapping_targets] = remove_tag_element(mold)
        end
      end

      # Returns the text nodes of `mold`, dropping elements and everything
      # between the two occurrences of an element carrying a `wovn-ignore`
      # attribute.
      def remove_tag_element(mold)
        id_of_tag_with_wovn_ignore = nil
        swapping_targets = []
        mold.each do |node|
          if id_of_tag_with_wovn_ignore.nil? && node.attributes.key?('wovn-ignore')
            id_of_tag_with_wovn_ignore = node.object_id
            next
          end

          # The same element object showing up again marks the end of the
          # ignored range.
          if node.object_id == id_of_tag_with_wovn_ignore
            id_of_tag_with_wovn_ignore = nil
          end

          if id_of_tag_with_wovn_ignore.nil? && node.text?
            swapping_targets << node
          end
        end

        swapping_targets
      end

      # Builds a placeholder text node.
      #
      # @param option [Hash] `{ next_node: node }` or `{ previous_node: node }`
      def create_dummy_empty_text_node(option)
        # Splat explicitly: since Ruby 3.0 a positional Hash is no longer
        # converted to keyword arguments.
        DummyEmpryTextNode.new(**option)
      end

      # Stand-in for a missing (empty) text node next to an element. It answers
      # the node methods used on swapping targets and only inserts something
      # next to its neighbour once a non-empty content is assigned.
      class DummyEmpryTextNode
        attr_reader :name

        def initialize(next_node: nil, previous_node: nil)
          @name = 'text'
          @next_node = next_node
          @previous_node = previous_node
          @added_empty_text = nil
        end

        def text?
          true
        end

        def content
          ''
        end

        def to_s
          content
        end

        def attributes
          {}
        end

        # Parent of the neighbouring node, or nil when there is none.
        # Plain nil guards are used so this does not depend on ActiveSupport's
        # Object#try extension being loaded.
        def parent
          (@next_node && @next_node.parent) || (@previous_node && @previous_node.parent)
        end

        # Document of the neighbouring node, or nil when there is none.
        def document
          (@next_node && @next_node.document) || (@previous_node && @previous_node.document)
        end

        # Forwards to the node created by `content=`; does nothing (returns
        # nil) when no content has been assigned yet.
        def add_previous_sibling(comment_node)
          @added_empty_text && @added_empty_text.add_previous_sibling(comment_node)
        end

        # Inserts `text` next to the neighbouring node and remembers the
        # sibling found there afterwards. An empty string is a no-op.
        def content=(text)
          return if text == ''

          if @next_node
            @next_node.add_previous_sibling(text)
            @added_empty_text = @next_node.previous
          elsif @previous_node
            @previous_node.add_next_sibling(text)
            @added_empty_text = @previous_node.next
          end
        end
      end

      # Correctly spelled alias; the original (misspelled) name stays the
      # primary one for backward compatibility.
      DummyEmptyTextNode = DummyEmpryTextNode
    end
  end
end