hairaito 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4e3cd6878a89daabfb2fdf0d3a9ca661fc238566
4
- data.tar.gz: 237ee8e16ee95f4e20ac67bd301500a3eb7deb81
3
+ metadata.gz: 739c486eff2c0dd13f4d31cc67937efd6bc3fabf
4
+ data.tar.gz: ab89ded036459096b7b8993c7102b23caad3fe3a
5
5
  SHA512:
6
- metadata.gz: 762a12095a72d152e6b07d80c48ec8294f4077c4e769222d0e770bb3eaa2b7a4c0c4f5176502ddd57a5c6727c6aaaa073d70384623ed808afdcccd3ccc500e6a
7
- data.tar.gz: 69e4d996242c807cd4fbd42e445809b6af8d64fbc3bf6d84bb519b787a7bd1a27887137177bd4f2e4295b3e06ace6967cfc7dd9c2edf7456093506f7231c241f
6
+ metadata.gz: 854720af08d6228b2235795188ce59e78e5228f74907c770be7b9469b167d91a887fa0fd46a20c8f0f7d287fa347e6bff4f71347b7d449f79b990b2388be9f2e
7
+ data.tar.gz: 8b0c6bd2cc94c6b2bc9cd851fb3fe719f048f0122816ddae58a6949757874b1ef78cba42a2a461c8cbee02ad35320517ce245c601862cd3368ce56feb4953863
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # Hairaito
2
2
 
3
+ Extends Nokogiri with text snippet highlighting. It works like the jquery-highlight plugin, but for Ruby and Nokogiri.
4
+
3
5
  ## Installation
4
6
 
5
7
  Add this line to your application's Gemfile:
@@ -16,6 +18,68 @@ Or install it yourself as:
16
18
 
17
19
  ## Usage
18
20
 
21
+ Hairaito adds a _highlight_ method to Nokogiri::XML::Document.
22
+
23
+ Example:
24
+
25
+ ```
26
+ doc = Nokogiri::XML('<body>abc def ghi</body>')
27
+ doc.highlight(['def'])
28
+ doc.to_html # => '<body>abc <span class="snippet-part snippet-start" data-snippet-id="0">def</span> ghi</body>'
29
+ ```
30
+
31
+ There are several options for highlighting customization:
32
+
33
+ ```
34
+ {
35
+ highlight_base: {
36
+ selector: 'body', # Highlighting is applied inside the element matched by this selector
37
+ content_wrapper: '', # Highlighting base content can be wrapped by this tag
38
+ content_wrapper_class: 'highlighting-base', # Class for wrapper above
39
+ },
40
+ snippet: {
41
+ part_wrapper: 'span', # Found snippet parts will be wrapped with this tag
42
+ part_wrapper_class: 'snippet-part', # Class for wrapper above
43
+ starting_part_class: 'snippet-start', # Extra class for the wrapper above; added only to the first part of each found snippet
44
+ },
45
+ numeration: {
46
+ attr: 'data-snippet-id', # All parts of a single snippet get the same numeration value in this attribute
47
+ prefix: '', # Prefix, that will be added to each numeration value
48
+ suffix: '', # Suffix, that will be added to each numeration value
49
+ start_with: 0, # Starting point for numeration increment
50
+ },
51
+ boundaries: {
52
+ whole_words_only: true, # If true, only whole words will be found
53
+ inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt), # Tags that are not treated as word boundaries
54
+ word_parts: '[а-яА-ЯёЁa-zA-Z\d]', # Characters that are treated as parts of a word
55
+ },
56
+ }
57
+ ```
58
+
59
+ Example:
60
+
61
+ ```
62
+ doc = Nokogiri::XML('<body>abc def ghi abcdefghi</body>')
63
+ options = {
64
+ highlight_base: {
65
+ content_wrapper: 'div',
66
+ },
67
+ snippet: {
68
+ starting_part_class: 'start',
69
+ part_wrapper_class: 'part',
70
+ },
71
+ numeration: {
72
+ attr: 'data-id',
73
+ prefix: 'snippet_'
74
+ },
75
+ boundaries: {
76
+ whole_words_only: false,
77
+ }
78
+ }
79
+ doc.highlight(['abc'], options)
80
+ doc.to_html # => '<body><div class="highlighting-base"><span class="part start" data-id="snippet_0">abc</span> def ghi <span class="part start" data-id="snippet_1">abc</span>defghi</div></body>'
81
+ ```
82
+
19
83
  ## Contributing
20
84
 
21
85
  1. Fork it ( https://github.com/dmazilov/hairaito/fork )
@@ -3,80 +3,92 @@ module Hairaito
3
3
  module XML
4
4
  module Document
5
5
 
6
+ # Highlights text snippets in document
7
+ #
8
+ # @param snippets [Array<String>] text variants to be highlighted
9
+ # @param options [Hash] custom highlighting options
10
+ # @return [Nokogiri::XML::Document] self document for chaining
6
11
  def highlight(snippets, options = {})
7
- highlighting_default_options(options)
8
- snippets.each do |snippet|
9
- highlighting_base.traverse_by_text(snippet).each do |snippet_container|
10
- to_wrap = []
11
- start_index = snippet_container.text().index(snippet)
12
-
13
- start_node, start_inner_index = snippet_container.text_node_by_position(start_index)
12
+ highlighting_defaults(options)
13
+ snippet_parts_to_wrap = []
14
+ prepare_snippets(snippets).each do |snippet|
15
+ highlighting_base.traverse_by_text(snippet, @hl_opts[:boundaries]) do |snippet_container, snippet_offset|
16
+ start_node, start_inner_index = snippet_container.text_node_by_position(snippet_offset.first)
14
17
  start_range = start_node.text_range_by_index(start_inner_index, snippet.length)
15
- to_wrap << [start_node, start_range]
18
+ snippet_parts_to_wrap << {part: start_node, range: start_range, starting: true}
16
19
 
17
20
  # If start node contains only part of snippet
18
21
  if snippet.length > start_range.size
19
- end_node, end_inner_index = snippet_container.text_node_by_position(start_index + snippet.length - 1)
22
+ end_node, end_inner_index = snippet_container.text_node_by_position(snippet_offset.last - 1)
20
23
  end_range = end_node.text_range_by_index(end_inner_index)
21
- to_wrap += snippet_container.text_nodes_between(start_node, end_node).map do |node|
22
- [node, 0..(node.text.length - 1)]
24
+ snippet_parts_to_wrap += snippet_container.text_nodes_between(start_node, end_node).map do |node|
25
+ {part: node, range: 0..(node.text.length - 1)}
23
26
  end
24
- to_wrap << [end_node, end_range]
25
- end
26
-
27
- to_wrap.each do |node_data|
28
- node_data.first.highlight_by_range(node_data.last)
27
+ snippet_parts_to_wrap << {part: end_node, range: end_range}
29
28
  end
30
-
31
- snippet_container['class'] = "#{snippet_container['class']} #{@hl_opts[:snippet_container_class]}"
32
29
  end
33
30
  end
34
- numerate_highlighted_snippets if @hl_opts[:numerate]
35
- to_html
36
- end
37
-
38
- def highlight_snippet_part(text)
39
- if @hl_opts[:snippet_part_wrapper].blank?
40
- raise ArgumentError.new('Snippet part wrapper tag is not specified!')
31
+ snippet_parts_to_wrap.group_by{|part_data| part_data[:part]}.each do |part, parts_collection|
32
+ part.highlight_by_ranges(parts_collection.map{|p| p.except(:part)}, @hl_opts)
41
33
  end
42
- wrapper = create_element("#{@hl_opts[:snippet_part_wrapper]}", class: "#{@hl_opts[:snippet_part_wrapper_class]}")
43
- wrapper.content = text
44
- wrapper
34
+ numerate_snippet_parts if @hl_opts[:numeration][:attr].present?
35
+ self
45
36
  end
46
37
 
47
38
  private
48
39
 
49
- def highlighting_default_options(options)
40
+ def highlighting_defaults(options)
41
+ @hl_base = nil
50
42
  @hl_opts = {
51
- base_selector: 'body',
52
- base_content_wrapper: '',
53
- base_content_wrapper_class: 'highlighting-base',
54
- snippet_container_class: 'highlighted-snippet',
55
- snippet_part_wrapper: 'span',
56
- snippet_part_wrapper_class: 'highlighted-snippet-part',
57
- numerate: true,
58
- numeration_attr: 'data-snippet-id',
59
- numeration_prefix: '',
60
- numeration_suffix: '',
61
- }.merge(options)
43
+ highlight_base: {
44
+ selector: 'body',
45
+ content_wrapper: '',
46
+ content_wrapper_class: 'highlighting-base',
47
+ },
48
+ snippet: {
49
+ part_wrapper: 'span',
50
+ part_wrapper_class: 'snippet-part',
51
+ starting_part_class: 'snippet-start',
52
+ },
53
+ numeration: {
54
+ attr: 'data-snippet-id',
55
+ prefix: '',
56
+ suffix: '',
57
+ start_with: 0,
58
+ },
59
+ boundaries: {},
60
+ }.deep_merge(options).with_indifferent_access
62
61
  end
63
62
 
64
63
  def highlighting_base
65
- base = at(@hl_opts[:base_selector])
64
+ return @hl_base if @hl_base.present?
65
+ base = at(@hl_opts[:highlight_base][:selector])
66
66
  raise ArgumentError.new('Document does not contain highlighting base element!') if base.blank?
67
- if @hl_opts[:base_content_wrapper].present?
68
- wrapper = create_element("#{@hl_opts[:base_content_wrapper]}", class: "#{@hl_opts[:base_content_wrapper_class]}")
67
+ if @hl_opts[:highlight_base][:content_wrapper].present?
68
+ wrapper = create_element("#{@hl_opts[:highlight_base][:content_wrapper]}",
69
+ class: "#{@hl_opts[:highlight_base][:content_wrapper_class]}")
69
70
  base.children.each{|child| child.parent = wrapper}
70
71
  wrapper.parent = base
71
- return wrapper
72
+ @hl_base = wrapper
73
+ else
74
+ @hl_base = base
72
75
  end
73
- base
76
+ @hl_base
77
+ end
78
+
79
+ # Longer snippets must go first due to situations with snippets overlapping
80
+ # Example: ['abc', 'abcdef'],
81
+ # without sorting this produces highlighting artifacts like shorter snippet duplication in result nodes
82
+ def prepare_snippets(snippets)
83
+ snippets.uniq.sort_by{|snippet| snippet.length}.reverse
74
84
  end
75
85
 
76
- def numerate_highlighted_snippets
77
- css(".#{@hl_opts[:snippet_container_class]}").each_with_index do |snippet_container, index|
78
- snippet_container[@hl_opts[:numeration_attr]] =
79
- "#{@hl_opts[:numeration_prefix]}#{index}#{@hl_opts[:numeration_prefix]}"
86
+ def numerate_snippet_parts
87
+ selector = @hl_opts[:snippet][:part_wrapper_class].gsub(/\s+/, ' ').split(' ').map{|cl| ".#{cl}"}.join('')
88
+ index = @hl_opts[:numeration][:start_with] - 1
89
+ css(selector).each do |part|
90
+ index += 1 if part[:class].split(' ').include?(@hl_opts[:snippet][:starting_part_class])
91
+ part[@hl_opts[:numeration][:attr]] = "#{@hl_opts[:numeration][:prefix]}#{index}#{@hl_opts[:numeration][:suffix]}"
80
92
  end
81
93
  end
82
94
 
@@ -3,6 +3,7 @@ module Hairaito
3
3
  module XML
4
4
  module Node
5
5
 
6
+ # @return [Nokogiri::XML::NodeSet] all text nodes that have self as an ancestor
6
7
  def text_nodes
7
8
  result_nodes = []
8
9
  traverse do |node|
@@ -12,6 +13,17 @@ module Hairaito
12
13
  ::Nokogiri::XML::NodeSet.new(document, result_nodes)
13
14
  end
14
15
 
16
+ # @return [Nokogiri::XML::Node] first text node within self node
17
+ def first_text_node
18
+ traverse do |node|
19
+ return node if node.text?
20
+ end
21
+ nil
22
+ end
23
+
24
+ # @param start_node [Nokogiri::XML::Node] left boundary
25
+ # @param end_node [Nokogiri::XML::Node] right boundary
26
+ # @return [Nokogiri::XML::NodeSet] all text nodes located between the specified boundaries
15
27
  def text_nodes_between(start_node, end_node)
16
28
  nodes = text_nodes
17
29
  indexes = [nodes.index(start_node), nodes.index(end_node)]
@@ -22,18 +34,83 @@ module Hairaito
22
34
  ::Nokogiri::XML::NodeSet.new(document, result_nodes)
23
35
  end
24
36
 
25
- def traverse_by_text(text, exclude_ancestors = true)
26
- excluded = []
27
- result_nodes = []
28
- traverse do |node|
29
- next if node.is_a?(::Nokogiri::XML::Text)
30
- next if node.in?(excluded)
31
- if node.text.include?(text)
32
- result_nodes << node
33
- excluded += node.ancestors if exclude_ancestors
37
+ # @param base [Nokogiri::XML::Node] root element for search
38
+ # @return [Nokogiri::XML::Node, nil] previous text node within base node or nil if it doesn't exist
39
+ def previous_text(base = document)
40
+ first_text_node = text_nodes.first
41
+ base_text_nodes = base.text_nodes
42
+ if (index = base_text_nodes.index(first_text_node)).blank?
43
+ raise ArgumentError.new('Base must contain self node!')
44
+ end
45
+ return if index == 0
46
+ base_text_nodes[index - 1]
47
+ end
48
+
49
+ # @param base [Nokogiri::XML::Node] root element for search
50
+ # @return [Nokogiri::XML::Node, nil] next text node within base node or nil if it doesn't exist
51
+ def next_text(base = document)
52
+ first_text_node = text_nodes.last
53
+ base_text_nodes = base.text_nodes
54
+ if (index = base_text_nodes.index(first_text_node)).blank?
55
+ raise ArgumentError.new('Base must contain self node!')
56
+ end
57
+ return if index == base_text_nodes.count - 1
58
+ base_text_nodes[index + 1]
59
+ end
60
+
61
+ # Yields for each match of specified string in child nodes recursively
62
+ #
63
+ # @yieldparam node [Nokogiri::XML::Node] child node that contains the specified string
64
+ # @yieldparam offset [Array] child text inner offset
65
+ # @param string [String] text for matching
66
+ # @param options [Hash] @see #traverse_by_text_defaults
67
+ # @return [Nokogiri::XML::Node] self node for chaining
68
+ def traverse_by_text(string, options = {}, &block)
69
+ traverse_by_text_defaults(options)
70
+ traverse do |current_node|
71
+ next if current_node.text?
72
+
73
+ offset_types = @tbt_opts[:whole_words_only] ? [:inner_word, :boundary_word] : [:simple]
74
+ inner_offsets, boundary_offsets = current_node.matched_offsets(string, offset_types, @tbt_opts)
75
+
76
+ # Check words bordered with current inline tag if current node has boundary words
77
+ # abc<span>def<span> or <span>def</span>ghi or abc<span>def</span>ghi
78
+ if current_node.name.in?(@tbt_opts[:inline_tags]) && self != current_node
79
+ if boundary_offsets.try(:first).try(:first) == 0
80
+ previous_node = current_node.previous_text(self)
81
+ boundary_offsets.shift if previous_node.try(:matched_offsets, :any, :ending_word, @tbt_opts).present?
82
+ end
83
+ if boundary_offsets.try(:last).try(:first) == 0
84
+ next_node = current_node.next_text(self)
85
+ boundary_offsets.pop if next_node.try(:matched_offsets, :any, :beginning_word, @tbt_opts).present?
86
+ end
87
+ end
88
+
89
+ offsets = (inner_offsets + (boundary_offsets || [])).sort_by{|offset| offset.first}
90
+ if offsets.any?
91
+ offsets.each {|offset| yield(current_node, offset)} if block_given?
92
+ if current_node != self
93
+ # Excludes processed offsets from all ancestors
94
+ ([current_node] + current_node.ancestors).each do |node|
95
+ pos = node.position_by_text_node(current_node.first_text_node)
96
+ # Shifts all offsets according to node inner position and excludes from future processing
97
+ node.exclude_offsets(offsets.map{|offset| [offset.first + pos, offset.last + pos]})
98
+ # Reaches highlighting base
99
+ break if node == self
100
+ end
101
+ end
34
102
  end
35
103
  end
36
- result_nodes
104
+ self
105
+ end
106
+
107
+ def position_by_text_node(text_node)
108
+ nodes = text_nodes
109
+ if (index = nodes.index(text_node)) < 0
110
+ raise ArgumentError.new('Self node must contain text_node!')
111
+ end
112
+ return 0 if index == 0
113
+ nodes[0..index - 1].map{|node| node.text}.join('').length
37
114
  end
38
115
 
39
116
  def text_node_by_position(in_text_position)
@@ -48,11 +125,23 @@ module Hairaito
48
125
  raise ArgumentError.new('Inner index is out of range!')
49
126
  end
50
127
 
51
- def highlight_by_range(range)
52
- prefix = range.first > 0 ? text[0..(range.first - 1)]: ''
53
- suffix = range.last < text.length - 1 ? text[(range.last + 1)..(text.length - 1)]: ''
54
- for_wrapping = text[range]
55
- new_contents = "#{prefix}#{document.highlight_snippet_part(for_wrapping)}#{suffix}"
128
+ def highlight_by_ranges(ranges, options)
129
+ if options[:snippet][:part_wrapper].blank?
130
+ raise ArgumentError.new('Snippet part wrapper tag is not specified!')
131
+ end
132
+ parts = []
133
+ ranges = ranges.sort_by{|r| r[:range].first}
134
+ ranges.each_with_index do |range_data, index|
135
+ range = range_data[:range]
136
+ parts << (range.first > 0 ? text[0..(range.first - 1)]: '') if index == 0
137
+ snippet_class = range_data[:starting] ? "#{options[:snippet][:starting_part_class]}" : ''
138
+ wrapper = document.create_element("#{options[:snippet][:part_wrapper]}", class: "#{options[:snippet][:part_wrapper_class]} #{snippet_class}")
139
+ wrapper.content = text[range]
140
+ parts << wrapper.to_s
141
+ parts << text[(range.last + 1)..(ranges[index + 1][:range].first - 1)] if index < ranges.count - 1
142
+ parts << (range.last < text.length - 1 ? text[(range.last + 1)..(text.length - 1)]: '') if index == ranges.count - 1
143
+ end
144
+ new_contents = parts.join('')
56
145
  replace(new_contents)
57
146
  end
58
147
 
@@ -60,6 +149,63 @@ module Hairaito
60
149
  demand_length.present? ? index..[text.length - 1, index + demand_length - 1].min : 0..index
61
150
  end
62
151
 
152
+ # @return [Array] self node offsets that were already processed
153
+ def excluded_offsets
154
+ @excluded_offsets ||= []
155
+ end
156
+
157
+ # @param offsets [Array] self node offsets to be excluded in the future processing
158
+ def exclude_offsets(offsets)
159
+ @excluded_offsets ||= []
160
+ @excluded_offsets += offsets
161
+ end
162
+
163
+ def matched_offsets(string, types, options)
164
+ types = [types] unless types.is_a?(Array)
165
+ offsets = []
166
+ types.each do |type|
167
+ offsets << text.to_enum(:scan, build_regexp(string, type, options)).map do
168
+ offset = Regexp.last_match.offset(:text)
169
+ # Only one highlighting per position
170
+ offset unless overlapped_offsets?(excluded_offsets, offset)
171
+ end.compact || []
172
+ end
173
+ return *offsets
174
+ end
175
+
176
+ private
177
+
178
+ def traverse_by_text_defaults(options)
179
+ @tbt_opts = {
180
+ whole_words_only: true,
181
+ inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt),
182
+ word_parts: '[а-яА-ЯёЁa-zA-Z\d]',
183
+ }.deep_merge(options).with_indifferent_access
184
+ end
185
+
186
+ def build_regexp(string, type = :simple, options)
187
+ string = '.+' if string == :any
188
+ case type.to_sym
189
+ when :simple
190
+ return /(?<text>#{string})/
191
+ when :inner_word
192
+ return /(?<!#{options[:word_parts]}|\A)(?<text>#{string})(?!#{options[:word_parts]}|\Z)/
193
+ when :beginning_word
194
+ return /\A(?<text>#{string})(?!#{options[:word_parts]})/
195
+ when :ending_word
196
+ return /(?<!#{options[:word_parts]})(?<text>#{string})\Z/
197
+ when :boundary_word
198
+ return /(\A(?<text>#{string})(?!#{options[:word_parts]}))|((?<!#{options[:word_parts]})(?<text>#{string})\Z)|(\A(?<text>#{string})\Z)/
199
+ end
200
+ end
201
+
202
+ def overlapped_offsets?(offsets_collection, offset_for_check)
203
+ offsets_collection.each do |offset|
204
+ return true if (offset_for_check.first...offset_for_check.last).overlaps?(offset.first...offset.last)
205
+ end
206
+ false
207
+ end
208
+
63
209
  end
64
210
  end
65
211
  end
@@ -1,3 +1,3 @@
1
1
  module Hairaito
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hairaito
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Denis Mazilov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-24 00:00:00.000000000 Z
11
+ date: 2016-03-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler