hairaito 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4e3cd6878a89daabfb2fdf0d3a9ca661fc238566
4
- data.tar.gz: 237ee8e16ee95f4e20ac67bd301500a3eb7deb81
3
+ metadata.gz: 739c486eff2c0dd13f4d31cc67937efd6bc3fabf
4
+ data.tar.gz: ab89ded036459096b7b8993c7102b23caad3fe3a
5
5
  SHA512:
6
- metadata.gz: 762a12095a72d152e6b07d80c48ec8294f4077c4e769222d0e770bb3eaa2b7a4c0c4f5176502ddd57a5c6727c6aaaa073d70384623ed808afdcccd3ccc500e6a
7
- data.tar.gz: 69e4d996242c807cd4fbd42e445809b6af8d64fbc3bf6d84bb519b787a7bd1a27887137177bd4f2e4295b3e06ace6967cfc7dd9c2edf7456093506f7231c241f
6
+ metadata.gz: 854720af08d6228b2235795188ce59e78e5228f74907c770be7b9469b167d91a887fa0fd46a20c8f0f7d287fa347e6bff4f71347b7d449f79b990b2388be9f2e
7
+ data.tar.gz: 8b0c6bd2cc94c6b2bc9cd851fb3fe719f048f0122816ddae58a6949757874b1ef78cba42a2a461c8cbee02ad35320517ce245c601862cd3368ce56feb4953863
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # Hairaito
2
2
 
3
+ Extends Nokogiri with text snippet highlighting. It works like the jquery-highlight plugin, but for Ruby and Nokogiri.
4
+
3
5
  ## Installation
4
6
 
5
7
  Add this line to your application's Gemfile:
@@ -16,6 +18,68 @@ Or install it yourself as:
16
18
 
17
19
  ## Usage
18
20
 
21
+ Hairaito adds a _highlight_ method to Nokogiri::XML::Document.
22
+
23
+ Example:
24
+
25
+ ```
26
+ doc = Nokogiri::XML('<body>abc def ghi</body>')
27
+ doc.highlight(['def'])
28
+ doc.to_html # => '<body>abc <span class="snippet-part snippet-start" data-snippet-id="0">def</span> ghi</body>'
29
+ ```
30
+
31
+ There are several options for highlighting customization:
32
+
33
+ ```
34
+ {
35
+ highlight_base: {
36
+ selector: 'body', # Highlighting will be launched at this selector
37
+ content_wrapper: '', # Highlighting base content can be wrapped by this tag
38
+ content_wrapper_class: 'highlighting-base', # Class for wrapper above
39
+ },
40
+ snippet: {
41
+ part_wrapper: 'span', # Found snippet parts will be wrapped with this tag
42
+ part_wrapper_class: 'snippet-part', # Class for wrapper above
43
+ starting_part_class: 'snippet-start', # Class for wrapper above, is added only for first part per found snippet
44
+ },
45
+ numeration: {
46
+ attr: 'data-snippet-id', # Snippet parts of single snippet will have same numeration value in this attribute
47
+ prefix: '', # Prefix, that will be added to each numeration value
48
+ suffix: '', # Suffix, that will be added to each numeration value
49
+ start_with: 0, # Starting point for numeration increment
50
+ },
51
+ boundaries: {
52
+ whole_words_only: true, # If true, only whole words will be found
53
+ inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt), # Tags, that aren't considered as word boundary
54
+ word_parts: '[а-яА-ЯёЁa-zA-Z\d]', # Characters, that are considered as word part
55
+ },
56
+ }
57
+ ```
58
+
59
+ Example:
60
+
61
+ ```
62
+ doc = Nokogiri::XML('<body>abc def ghi abcdefghi</body>')
63
+ options = {
64
+ highlight_base: {
65
+ content_wrapper: 'div',
66
+ },
67
+ snippet: {
68
+ starting_part_class: 'start',
69
+ part_wrapper_class: 'part',
70
+ },
71
+ numeration: {
72
+ attr: 'data-id',
73
+ prefix: 'snippet_'
74
+ },
75
+ boundaries: {
76
+ whole_words_only: false,
77
+ }
78
+ }
79
+ doc.highlight(['abc'], options)
80
+ doc.to_html # => '<body><div class="highlighting-base"><span class="part start" data-id="snippet_0">abc</span> def ghi <span class="part start" data-id="snippet_1">abc</span>defghi</div></body>'
81
+ ```
82
+
19
83
  ## Contributing
20
84
 
21
85
  1. Fork it ( https://github.com/dmazilov/hairaito/fork )
@@ -3,80 +3,92 @@ module Hairaito
3
3
  module XML
4
4
  module Document
5
5
 
6
+ # Highlights text snippets in document
7
+ #
8
+ # @param snippets [Array<String>] text variants to be highlighted
9
+ # @param options [Hash] custom highlighting options
10
+ # @return [Nokogiri::XML::Document] self document for chaining
6
11
  def highlight(snippets, options = {})
7
- highlighting_default_options(options)
8
- snippets.each do |snippet|
9
- highlighting_base.traverse_by_text(snippet).each do |snippet_container|
10
- to_wrap = []
11
- start_index = snippet_container.text().index(snippet)
12
-
13
- start_node, start_inner_index = snippet_container.text_node_by_position(start_index)
12
+ highlighting_defaults(options)
13
+ snippet_parts_to_wrap = []
14
+ prepare_snippets(snippets).each do |snippet|
15
+ highlighting_base.traverse_by_text(snippet, @hl_opts[:boundaries]) do |snippet_container, snippet_offset|
16
+ start_node, start_inner_index = snippet_container.text_node_by_position(snippet_offset.first)
14
17
  start_range = start_node.text_range_by_index(start_inner_index, snippet.length)
15
- to_wrap << [start_node, start_range]
18
+ snippet_parts_to_wrap << {part: start_node, range: start_range, starting: true}
16
19
 
17
20
  # If start node contains only part of snippet
18
21
  if snippet.length > start_range.size
19
- end_node, end_inner_index = snippet_container.text_node_by_position(start_index + snippet.length - 1)
22
+ end_node, end_inner_index = snippet_container.text_node_by_position(snippet_offset.last - 1)
20
23
  end_range = end_node.text_range_by_index(end_inner_index)
21
- to_wrap += snippet_container.text_nodes_between(start_node, end_node).map do |node|
22
- [node, 0..(node.text.length - 1)]
24
+ snippet_parts_to_wrap += snippet_container.text_nodes_between(start_node, end_node).map do |node|
25
+ {part: node, range: 0..(node.text.length - 1)}
23
26
  end
24
- to_wrap << [end_node, end_range]
25
- end
26
-
27
- to_wrap.each do |node_data|
28
- node_data.first.highlight_by_range(node_data.last)
27
+ snippet_parts_to_wrap << {part: end_node, range: end_range}
29
28
  end
30
-
31
- snippet_container['class'] = "#{snippet_container['class']} #{@hl_opts[:snippet_container_class]}"
32
29
  end
33
30
  end
34
- numerate_highlighted_snippets if @hl_opts[:numerate]
35
- to_html
36
- end
37
-
38
- def highlight_snippet_part(text)
39
- if @hl_opts[:snippet_part_wrapper].blank?
40
- raise ArgumentError.new('Snippet part wrapper tag is not specified!')
31
+ snippet_parts_to_wrap.group_by{|part_data| part_data[:part]}.each do |part, parts_collection|
32
+ part.highlight_by_ranges(parts_collection.map{|p| p.except(:part)}, @hl_opts)
41
33
  end
42
- wrapper = create_element("#{@hl_opts[:snippet_part_wrapper]}", class: "#{@hl_opts[:snippet_part_wrapper_class]}")
43
- wrapper.content = text
44
- wrapper
34
+ numerate_snippet_parts if @hl_opts[:numeration][:attr].present?
35
+ self
45
36
  end
46
37
 
47
38
  private
48
39
 
49
- def highlighting_default_options(options)
40
+ def highlighting_defaults(options)
41
+ @hl_base = nil
50
42
  @hl_opts = {
51
- base_selector: 'body',
52
- base_content_wrapper: '',
53
- base_content_wrapper_class: 'highlighting-base',
54
- snippet_container_class: 'highlighted-snippet',
55
- snippet_part_wrapper: 'span',
56
- snippet_part_wrapper_class: 'highlighted-snippet-part',
57
- numerate: true,
58
- numeration_attr: 'data-snippet-id',
59
- numeration_prefix: '',
60
- numeration_suffix: '',
61
- }.merge(options)
43
+ highlight_base: {
44
+ selector: 'body',
45
+ content_wrapper: '',
46
+ content_wrapper_class: 'highlighting-base',
47
+ },
48
+ snippet: {
49
+ part_wrapper: 'span',
50
+ part_wrapper_class: 'snippet-part',
51
+ starting_part_class: 'snippet-start',
52
+ },
53
+ numeration: {
54
+ attr: 'data-snippet-id',
55
+ prefix: '',
56
+ suffix: '',
57
+ start_with: 0,
58
+ },
59
+ boundaries: {},
60
+ }.deep_merge(options).with_indifferent_access
62
61
  end
63
62
 
64
63
  def highlighting_base
65
- base = at(@hl_opts[:base_selector])
64
+ return @hl_base if @hl_base.present?
65
+ base = at(@hl_opts[:highlight_base][:selector])
66
66
  raise ArgumentError.new('Document does not contain highlighting base element!') if base.blank?
67
- if @hl_opts[:base_content_wrapper].present?
68
- wrapper = create_element("#{@hl_opts[:base_content_wrapper]}", class: "#{@hl_opts[:base_content_wrapper_class]}")
67
+ if @hl_opts[:highlight_base][:content_wrapper].present?
68
+ wrapper = create_element("#{@hl_opts[:highlight_base][:content_wrapper]}",
69
+ class: "#{@hl_opts[:highlight_base][:content_wrapper_class]}")
69
70
  base.children.each{|child| child.parent = wrapper}
70
71
  wrapper.parent = base
71
- return wrapper
72
+ @hl_base = wrapper
73
+ else
74
+ @hl_base = base
72
75
  end
73
- base
76
+ @hl_base
77
+ end
78
+
79
+ # Longer snippets must go first due to situations with snippets overlapping
80
+ # Example: ['abc', 'abcdef'],
81
+ # without sorting this produces highlighting artifacts like shorter snippet duplication in result nodes
82
+ def prepare_snippets(snippets)
83
+ snippets.uniq.sort_by{|snippet| snippet.length}.reverse
74
84
  end
75
85
 
76
- def numerate_highlighted_snippets
77
- css(".#{@hl_opts[:snippet_container_class]}").each_with_index do |snippet_container, index|
78
- snippet_container[@hl_opts[:numeration_attr]] =
79
- "#{@hl_opts[:numeration_prefix]}#{index}#{@hl_opts[:numeration_prefix]}"
86
+ def numerate_snippet_parts
87
+ selector = @hl_opts[:snippet][:part_wrapper_class].gsub(/\s+/, ' ').split(' ').map{|cl| ".#{cl}"}.join('')
88
+ index = @hl_opts[:numeration][:start_with] - 1
89
+ css(selector).each do |part|
90
+ index += 1 if part[:class].split(' ').include?(@hl_opts[:snippet][:starting_part_class])
91
+ part[@hl_opts[:numeration][:attr]] = "#{@hl_opts[:numeration][:prefix]}#{index}#{@hl_opts[:numeration][:suffix]}"
80
92
  end
81
93
  end
82
94
 
@@ -3,6 +3,7 @@ module Hairaito
3
3
  module XML
4
4
  module Node
5
5
 
6
+ # @return [Nokogiri::XML::NodeSet] all text nodes that have self as an ancestor
6
7
  def text_nodes
7
8
  result_nodes = []
8
9
  traverse do |node|
@@ -12,6 +13,17 @@ module Hairaito
12
13
  ::Nokogiri::XML::NodeSet.new(document, result_nodes)
13
14
  end
14
15
 
16
+ # @return [Nokogiri::XML::Node] first text node within self node
17
+ def first_text_node
18
+ traverse do |node|
19
+ return node if node.text?
20
+ end
21
+ nil
22
+ end
23
+
24
+ # @param start_node [Nokogiri::XML::Node] left boundary
25
+ # @param end_node [Nokogiri::XML::Node] right boundary
26
+ # @return [Nokogiri::XML::NodeSet] all text nodes located between the specified boundaries
15
27
  def text_nodes_between(start_node, end_node)
16
28
  nodes = text_nodes
17
29
  indexes = [nodes.index(start_node), nodes.index(end_node)]
@@ -22,18 +34,83 @@ module Hairaito
22
34
  ::Nokogiri::XML::NodeSet.new(document, result_nodes)
23
35
  end
24
36
 
25
- def traverse_by_text(text, exclude_ancestors = true)
26
- excluded = []
27
- result_nodes = []
28
- traverse do |node|
29
- next if node.is_a?(::Nokogiri::XML::Text)
30
- next if node.in?(excluded)
31
- if node.text.include?(text)
32
- result_nodes << node
33
- excluded += node.ancestors if exclude_ancestors
37
+ # @param base [Nokogiri::XML::Node] root element for search
38
+ # @return [Nokogiri::XML::Node, nil] previous text node within base node or nil if it doesn't exist
39
+ def previous_text(base = document)
40
+ first_text_node = text_nodes.first
41
+ base_text_nodes = base.text_nodes
42
+ if (index = base_text_nodes.index(first_text_node)).blank?
43
+ raise ArgumentError.new('Base must contain self node!')
44
+ end
45
+ return if index == 0
46
+ base_text_nodes[index - 1]
47
+ end
48
+
49
+ # @param base [Nokogiri::XML::Node] root element for search
50
+ # @return [Nokogiri::XML::Node, nil] next text node within base node or nil if it doesn't exist
51
+ def next_text(base = document)
52
+ first_text_node = text_nodes.last
53
+ base_text_nodes = base.text_nodes
54
+ if (index = base_text_nodes.index(first_text_node)).blank?
55
+ raise ArgumentError.new('Base must contain self node!')
56
+ end
57
+ return if index == base_text_nodes.count - 1
58
+ base_text_nodes[index + 1]
59
+ end
60
+
61
+ # Yields for each match of specified string in child nodes recursively
62
+ #
63
+ # @yieldparam node [Nokogiri::XML::Node] child node containing the specified string
64
+ # @yieldparam offset [Array] child text inner offset
65
+ # @param string [String] text for matching
66
+ # @param options [Hash] @see #traverse_by_text_defaults
67
+ # @return [Nokogiri::XML::Node] self node for chaining
68
+ def traverse_by_text(string, options = {}, &block)
69
+ traverse_by_text_defaults(options)
70
+ traverse do |current_node|
71
+ next if current_node.text?
72
+
73
+ offset_types = @tbt_opts[:whole_words_only] ? [:inner_word, :boundary_word] : [:simple]
74
+ inner_offsets, boundary_offsets = current_node.matched_offsets(string, offset_types, @tbt_opts)
75
+
76
+ # Check words bordered with current inline tag if current node has boundary words
77
+ # abc<span>def<span> or <span>def</span>ghi or abc<span>def</span>ghi
78
+ if current_node.name.in?(@tbt_opts[:inline_tags]) && self != current_node
79
+ if boundary_offsets.try(:first).try(:first) == 0
80
+ previous_node = current_node.previous_text(self)
81
+ boundary_offsets.shift if previous_node.try(:matched_offsets, :any, :ending_word, @tbt_opts).present?
82
+ end
83
+ if boundary_offsets.try(:last).try(:first) == 0
84
+ next_node = current_node.next_text(self)
85
+ boundary_offsets.pop if next_node.try(:matched_offsets, :any, :beginning_word, @tbt_opts).present?
86
+ end
87
+ end
88
+
89
+ offsets = (inner_offsets + (boundary_offsets || [])).sort_by{|offset| offset.first}
90
+ if offsets.any?
91
+ offsets.each {|offset| yield(current_node, offset)} if block_given?
92
+ if current_node != self
93
+ # Excludes processed offsets from all ancestors
94
+ ([current_node] + current_node.ancestors).each do |node|
95
+ pos = node.position_by_text_node(current_node.first_text_node)
96
+ # Shifts all offsets according to node inner position and excludes from future processing
97
+ node.exclude_offsets(offsets.map{|offset| [offset.first + pos, offset.last + pos]})
98
+ # Reaches highlighting base
99
+ break if node == self
100
+ end
101
+ end
34
102
  end
35
103
  end
36
- result_nodes
104
+ self
105
+ end
106
+
107
+ def position_by_text_node(text_node)
108
+ nodes = text_nodes
109
+ if (index = nodes.index(text_node)) < 0
110
+ raise ArgumentError.new('Self node must contain text_node!')
111
+ end
112
+ return 0 if index == 0
113
+ nodes[0..index - 1].map{|node| node.text}.join('').length
37
114
  end
38
115
 
39
116
  def text_node_by_position(in_text_position)
@@ -48,11 +125,23 @@ module Hairaito
48
125
  raise ArgumentError.new('Inner index is out of range!')
49
126
  end
50
127
 
51
- def highlight_by_range(range)
52
- prefix = range.first > 0 ? text[0..(range.first - 1)]: ''
53
- suffix = range.last < text.length - 1 ? text[(range.last + 1)..(text.length - 1)]: ''
54
- for_wrapping = text[range]
55
- new_contents = "#{prefix}#{document.highlight_snippet_part(for_wrapping)}#{suffix}"
128
+ def highlight_by_ranges(ranges, options)
129
+ if options[:snippet][:part_wrapper].blank?
130
+ raise ArgumentError.new('Snippet part wrapper tag is not specified!')
131
+ end
132
+ parts = []
133
+ ranges = ranges.sort_by{|r| r[:range].first}
134
+ ranges.each_with_index do |range_data, index|
135
+ range = range_data[:range]
136
+ parts << (range.first > 0 ? text[0..(range.first - 1)]: '') if index == 0
137
+ snippet_class = range_data[:starting] ? "#{options[:snippet][:starting_part_class]}" : ''
138
+ wrapper = document.create_element("#{options[:snippet][:part_wrapper]}", class: "#{options[:snippet][:part_wrapper_class]} #{snippet_class}")
139
+ wrapper.content = text[range]
140
+ parts << wrapper.to_s
141
+ parts << text[(range.last + 1)..(ranges[index + 1][:range].first - 1)] if index < ranges.count - 1
142
+ parts << (range.last < text.length - 1 ? text[(range.last + 1)..(text.length - 1)]: '') if index == ranges.count - 1
143
+ end
144
+ new_contents = parts.join('')
56
145
  replace(new_contents)
57
146
  end
58
147
 
@@ -60,6 +149,63 @@ module Hairaito
60
149
  demand_length.present? ? index..[text.length - 1, index + demand_length - 1].min : 0..index
61
150
  end
62
151
 
152
+ # @return [Array] offsets of self node that have already been processed
153
+ def excluded_offsets
154
+ @excluded_offsets ||= []
155
+ end
156
+
157
+ # @param offsets [Array] self node offsets to be excluded in the future processing
158
+ def exclude_offsets(offsets)
159
+ @excluded_offsets ||= []
160
+ @excluded_offsets += offsets
161
+ end
162
+
163
+ def matched_offsets(string, types, options)
164
+ types = [types] unless types.is_a?(Array)
165
+ offsets = []
166
+ types.each do |type|
167
+ offsets << text.to_enum(:scan, build_regexp(string, type, options)).map do
168
+ offset = Regexp.last_match.offset(:text)
169
+ # Only one highlighting per position
170
+ offset unless overlapped_offsets?(excluded_offsets, offset)
171
+ end.compact || []
172
+ end
173
+ return *offsets
174
+ end
175
+
176
+ private
177
+
178
+ def traverse_by_text_defaults(options)
179
+ @tbt_opts = {
180
+ whole_words_only: true,
181
+ inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt),
182
+ word_parts: '[а-яА-ЯёЁa-zA-Z\d]',
183
+ }.deep_merge(options).with_indifferent_access
184
+ end
185
+
186
+ def build_regexp(string, type = :simple, options)
187
+ string = '.+' if string == :any
188
+ case type.to_sym
189
+ when :simple
190
+ return /(?<text>#{string})/
191
+ when :inner_word
192
+ return /(?<!#{options[:word_parts]}|\A)(?<text>#{string})(?!#{options[:word_parts]}|\Z)/
193
+ when :beginning_word
194
+ return /\A(?<text>#{string})(?!#{options[:word_parts]})/
195
+ when :ending_word
196
+ return /(?<!#{options[:word_parts]})(?<text>#{string})\Z/
197
+ when :boundary_word
198
+ return /(\A(?<text>#{string})(?!#{options[:word_parts]}))|((?<!#{options[:word_parts]})(?<text>#{string})\Z)|(\A(?<text>#{string})\Z)/
199
+ end
200
+ end
201
+
202
+ def overlapped_offsets?(offsets_collection, offset_for_check)
203
+ offsets_collection.each do |offset|
204
+ return true if (offset_for_check.first...offset_for_check.last).overlaps?(offset.first...offset.last)
205
+ end
206
+ false
207
+ end
208
+
63
209
  end
64
210
  end
65
211
  end
@@ -1,3 +1,3 @@
1
1
  module Hairaito
2
- VERSION = '0.0.2'
2
+ VERSION = '0.0.3'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hairaito
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Denis Mazilov
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-24 00:00:00.000000000 Z
11
+ date: 2016-03-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler