hairaito 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +64 -0
- data/lib/hairaito/nokogiri/xml/document.rb +61 -49
- data/lib/hairaito/nokogiri/xml/node.rb +161 -15
- data/lib/hairaito/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 739c486eff2c0dd13f4d31cc67937efd6bc3fabf
|
4
|
+
data.tar.gz: ab89ded036459096b7b8993c7102b23caad3fe3a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 854720af08d6228b2235795188ce59e78e5228f74907c770be7b9469b167d91a887fa0fd46a20c8f0f7d287fa347e6bff4f71347b7d449f79b990b2388be9f2e
|
7
|
+
data.tar.gz: 8b0c6bd2cc94c6b2bc9cd851fb3fe719f048f0122816ddae58a6949757874b1ef78cba42a2a461c8cbee02ad35320517ce245c601862cd3368ce56feb4953863
|
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# Hairaito
|
2
2
|
|
3
|
+
Extends Nokogiri with text snippet highlighting. It works like the jquery-highlight plugin, but for Ruby and Nokogiri.
|
4
|
+
|
3
5
|
## Installation
|
4
6
|
|
5
7
|
Add this line to your application's Gemfile:
|
@@ -16,6 +18,68 @@ Or install it yourself as:
|
|
16
18
|
|
17
19
|
## Usage
|
18
20
|
|
21
|
+
Hairaito adds a _highlight_ method to Nokogiri::XML::Document.
|
22
|
+
|
23
|
+
Example:
|
24
|
+
|
25
|
+
```
|
26
|
+
doc = Nokogiri::XML('<body>abc def ghi</body>')
|
27
|
+
doc.highlight(['def'])
|
28
|
+
doc.to_html # => '<body>abc <span class="snippet-part snippet-start" data-snippet-id="0">def</span> ghi</body>'
|
29
|
+
```
|
30
|
+
|
31
|
+
There are several options for highlighting customization:
|
32
|
+
|
33
|
+
```
|
34
|
+
{
|
35
|
+
highlight_base: {
|
36
|
+
selector: 'body', # Highlighting is applied within the element matched by this selector
|
37
|
+
content_wrapper: '', # Highlighting base content can be wrapped by this tag
|
38
|
+
content_wrapper_class: 'highlighting-base', # Class for wrapper above
|
39
|
+
},
|
40
|
+
snippet: {
|
41
|
+
part_wrapper: 'span', # Found snippet parts will be wrapped with this tag
|
42
|
+
part_wrapper_class: 'snippet-part', # Class for wrapper above
|
43
|
+
starting_part_class: 'snippet-start', # Additional class for the wrapper above, added only to the first part of each found snippet
|
44
|
+
},
|
45
|
+
numeration: {
|
46
|
+
attr: 'data-snippet-id', # All parts of a single snippet get the same numeration value in this attribute
|
47
|
+
prefix: '', # Prefix that will be added to each numeration value
|
48
|
+
suffix: '', # Suffix that will be added to each numeration value
|
49
|
+
start_with: 0, # Starting point for numeration increment
|
50
|
+
},
|
51
|
+
boundaries: {
|
52
|
+
whole_words_only: true, # If true, only whole words will be found
|
53
|
+
inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt), # Tags that aren't treated as word boundaries
|
54
|
+
word_parts: '[а-яА-ЯёЁa-zA-Z\d]', # Characters that are considered part of a word
|
55
|
+
},
|
56
|
+
}
|
57
|
+
```
|
58
|
+
|
59
|
+
Example:
|
60
|
+
|
61
|
+
```
|
62
|
+
doc = Nokogiri::XML('<body>abc def ghi abcdefghi</body>')
|
63
|
+
options = {
|
64
|
+
highlight_base: {
|
65
|
+
content_wrapper: 'div',
|
66
|
+
},
|
67
|
+
snippet: {
|
68
|
+
starting_part_class: 'start',
|
69
|
+
part_wrapper_class: 'part',
|
70
|
+
},
|
71
|
+
numeration: {
|
72
|
+
attr: 'data-id',
|
73
|
+
prefix: 'snippet_'
|
74
|
+
},
|
75
|
+
boundaries: {
|
76
|
+
whole_words_only: false,
|
77
|
+
}
|
78
|
+
}
|
79
|
+
doc.highlight(['abc'], options)
|
80
|
+
doc.to_html # => '<body><div class="highlighting-base"><span class="part start" data-id="snippet_0">abc</span> def ghi <span class="part start" data-id="snippet_1">abc</span>defghi</div></body>'
|
81
|
+
```
|
82
|
+
|
19
83
|
## Contributing
|
20
84
|
|
21
85
|
1. Fork it ( https://github.com/dmazilov/hairaito/fork )
|
@@ -3,80 +3,92 @@ module Hairaito
|
|
3
3
|
module XML
|
4
4
|
module Document
|
5
5
|
|
6
|
+
# Highlights text snippets in document
|
7
|
+
#
|
8
|
+
# @param snippets [Array<String>] text variants to be highlighted
|
9
|
+
# @param options [Hash] custom highlighting options
|
10
|
+
# @return [Nokogiri::XML::Document] self document for chaining
|
6
11
|
def highlight(snippets, options = {})
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
start_node, start_inner_index = snippet_container.text_node_by_position(start_index)
|
12
|
+
highlighting_defaults(options)
|
13
|
+
snippet_parts_to_wrap = []
|
14
|
+
prepare_snippets(snippets).each do |snippet|
|
15
|
+
highlighting_base.traverse_by_text(snippet, @hl_opts[:boundaries]) do |snippet_container, snippet_offset|
|
16
|
+
start_node, start_inner_index = snippet_container.text_node_by_position(snippet_offset.first)
|
14
17
|
start_range = start_node.text_range_by_index(start_inner_index, snippet.length)
|
15
|
-
|
18
|
+
snippet_parts_to_wrap << {part: start_node, range: start_range, starting: true}
|
16
19
|
|
17
20
|
# If start node contains only part of snippet
|
18
21
|
if snippet.length > start_range.size
|
19
|
-
end_node, end_inner_index = snippet_container.text_node_by_position(
|
22
|
+
end_node, end_inner_index = snippet_container.text_node_by_position(snippet_offset.last - 1)
|
20
23
|
end_range = end_node.text_range_by_index(end_inner_index)
|
21
|
-
|
22
|
-
|
24
|
+
snippet_parts_to_wrap += snippet_container.text_nodes_between(start_node, end_node).map do |node|
|
25
|
+
{part: node, range: 0..(node.text.length - 1)}
|
23
26
|
end
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
to_wrap.each do |node_data|
|
28
|
-
node_data.first.highlight_by_range(node_data.last)
|
27
|
+
snippet_parts_to_wrap << {part: end_node, range: end_range}
|
29
28
|
end
|
30
|
-
|
31
|
-
snippet_container['class'] = "#{snippet_container['class']} #{@hl_opts[:snippet_container_class]}"
|
32
29
|
end
|
33
30
|
end
|
34
|
-
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
def highlight_snippet_part(text)
|
39
|
-
if @hl_opts[:snippet_part_wrapper].blank?
|
40
|
-
raise ArgumentError.new('Snippet part wrapper tag is not specified!')
|
31
|
+
snippet_parts_to_wrap.group_by{|part_data| part_data[:part]}.each do |part, parts_collection|
|
32
|
+
part.highlight_by_ranges(parts_collection.map{|p| p.except(:part)}, @hl_opts)
|
41
33
|
end
|
42
|
-
|
43
|
-
|
44
|
-
wrapper
|
34
|
+
numerate_snippet_parts if @hl_opts[:numeration][:attr].present?
|
35
|
+
self
|
45
36
|
end
|
46
37
|
|
47
38
|
private
|
48
39
|
|
49
|
-
def
|
40
|
+
def highlighting_defaults(options)
|
41
|
+
@hl_base = nil
|
50
42
|
@hl_opts = {
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
43
|
+
highlight_base: {
|
44
|
+
selector: 'body',
|
45
|
+
content_wrapper: '',
|
46
|
+
content_wrapper_class: 'highlighting-base',
|
47
|
+
},
|
48
|
+
snippet: {
|
49
|
+
part_wrapper: 'span',
|
50
|
+
part_wrapper_class: 'snippet-part',
|
51
|
+
starting_part_class: 'snippet-start',
|
52
|
+
},
|
53
|
+
numeration: {
|
54
|
+
attr: 'data-snippet-id',
|
55
|
+
prefix: '',
|
56
|
+
suffix: '',
|
57
|
+
start_with: 0,
|
58
|
+
},
|
59
|
+
boundaries: {},
|
60
|
+
}.deep_merge(options).with_indifferent_access
|
62
61
|
end
|
63
62
|
|
64
63
|
def highlighting_base
|
65
|
-
|
64
|
+
return @hl_base if @hl_base.present?
|
65
|
+
base = at(@hl_opts[:highlight_base][:selector])
|
66
66
|
raise ArgumentError.new('Document does not contain highlighting base element!') if base.blank?
|
67
|
-
if @hl_opts[:
|
68
|
-
wrapper = create_element("#{@hl_opts[:
|
67
|
+
if @hl_opts[:highlight_base][:content_wrapper].present?
|
68
|
+
wrapper = create_element("#{@hl_opts[:highlight_base][:content_wrapper]}",
|
69
|
+
class: "#{@hl_opts[:highlight_base][:content_wrapper_class]}")
|
69
70
|
base.children.each{|child| child.parent = wrapper}
|
70
71
|
wrapper.parent = base
|
71
|
-
|
72
|
+
@hl_base = wrapper
|
73
|
+
else
|
74
|
+
@hl_base = base
|
72
75
|
end
|
73
|
-
|
76
|
+
@hl_base
|
77
|
+
end
|
78
|
+
|
79
|
+
# Longer snippets must go first due to situations with snippets overlapping
|
80
|
+
# Example: ['abc', 'abcdef'],
|
81
|
+
# without sorting this produces highlighting artifacts like shorter snippet duplication in result nodes
|
82
|
+
def prepare_snippets(snippets)
|
83
|
+
snippets.uniq.sort_by{|snippet| snippet.length}.reverse
|
74
84
|
end
|
75
85
|
|
76
|
-
def
|
77
|
-
|
78
|
-
|
79
|
-
|
86
|
+
def numerate_snippet_parts
|
87
|
+
selector = @hl_opts[:snippet][:part_wrapper_class].gsub(/\s+/, ' ').split(' ').map{|cl| ".#{cl}"}.join('')
|
88
|
+
index = @hl_opts[:numeration][:start_with] - 1
|
89
|
+
css(selector).each do |part|
|
90
|
+
index += 1 if part[:class].split(' ').include?(@hl_opts[:snippet][:starting_part_class])
|
91
|
+
part[@hl_opts[:numeration][:attr]] = "#{@hl_opts[:numeration][:prefix]}#{index}#{@hl_opts[:numeration][:suffix]}"
|
80
92
|
end
|
81
93
|
end
|
82
94
|
|
@@ -3,6 +3,7 @@ module Hairaito
|
|
3
3
|
module XML
|
4
4
|
module Node
|
5
5
|
|
6
|
+
# @return [Nokogiri::XML::NodeSet] all text nodes that have self as an ancestor
|
6
7
|
def text_nodes
|
7
8
|
result_nodes = []
|
8
9
|
traverse do |node|
|
@@ -12,6 +13,17 @@ module Hairaito
|
|
12
13
|
::Nokogiri::XML::NodeSet.new(document, result_nodes)
|
13
14
|
end
|
14
15
|
|
16
|
+
# @return [Nokogiri::XML::Node] first text node within self node
|
17
|
+
def first_text_node
|
18
|
+
traverse do |node|
|
19
|
+
return node if node.text?
|
20
|
+
end
|
21
|
+
nil
|
22
|
+
end
|
23
|
+
|
24
|
+
# @param start_node [Nokogiri::XML::Node] left boundary
|
25
|
+
# @param end_node [Nokogiri::XML::Node] right boundary
|
26
|
+
# @return [Nokogiri::XML::NodeSet] all text nodes located between the specified boundaries
|
15
27
|
def text_nodes_between(start_node, end_node)
|
16
28
|
nodes = text_nodes
|
17
29
|
indexes = [nodes.index(start_node), nodes.index(end_node)]
|
@@ -22,18 +34,83 @@ module Hairaito
|
|
22
34
|
::Nokogiri::XML::NodeSet.new(document, result_nodes)
|
23
35
|
end
|
24
36
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
37
|
+
# @param base [Nokogiri::XML::Node] root element for search
|
38
|
+
# @return [Nokogiri::XML::Node, nil] previous text node within base node or nil if it doesn't exist
|
39
|
+
def previous_text(base = document)
|
40
|
+
first_text_node = text_nodes.first
|
41
|
+
base_text_nodes = base.text_nodes
|
42
|
+
if (index = base_text_nodes.index(first_text_node)).blank?
|
43
|
+
raise ArgumentError.new('Base must contain self node!')
|
44
|
+
end
|
45
|
+
return if index == 0
|
46
|
+
base_text_nodes[index - 1]
|
47
|
+
end
|
48
|
+
|
49
|
+
# @param base [Nokogiri::XML::Node] root element for search
|
50
|
+
# @return [Nokogiri::XML::Node, nil] next text node within base node or nil if it doesn't exist
|
51
|
+
def next_text(base = document)
|
52
|
+
first_text_node = text_nodes.last
|
53
|
+
base_text_nodes = base.text_nodes
|
54
|
+
if (index = base_text_nodes.index(first_text_node)).blank?
|
55
|
+
raise ArgumentError.new('Base must contain self node!')
|
56
|
+
end
|
57
|
+
return if index == base_text_nodes.count - 1
|
58
|
+
base_text_nodes[index + 1]
|
59
|
+
end
|
60
|
+
|
61
|
+
# Yields for each match of specified string in child nodes recursively
|
62
|
+
#
|
63
|
+
# @yieldparam node [Nokogiri::XML::Node] child node containing the specified string
|
64
|
+
# @yieldparam offset [Array] child text inner offset
|
65
|
+
# @param string [String] text for matching
|
66
|
+
# @param options [Hash] @see #traverse_by_text_defaults
|
67
|
+
# @return [Nokogiri::XML::Node] self node for chaining
|
68
|
+
def traverse_by_text(string, options = {}, &block)
|
69
|
+
traverse_by_text_defaults(options)
|
70
|
+
traverse do |current_node|
|
71
|
+
next if current_node.text?
|
72
|
+
|
73
|
+
offset_types = @tbt_opts[:whole_words_only] ? [:inner_word, :boundary_word] : [:simple]
|
74
|
+
inner_offsets, boundary_offsets = current_node.matched_offsets(string, offset_types, @tbt_opts)
|
75
|
+
|
76
|
+
# Check words bordered with current inline tag if current node has boundary words
|
77
|
+
# abc<span>def</span> or <span>def</span>ghi or abc<span>def</span>ghi
|
78
|
+
if current_node.name.in?(@tbt_opts[:inline_tags]) && self != current_node
|
79
|
+
if boundary_offsets.try(:first).try(:first) == 0
|
80
|
+
previous_node = current_node.previous_text(self)
|
81
|
+
boundary_offsets.shift if previous_node.try(:matched_offsets, :any, :ending_word, @tbt_opts).present?
|
82
|
+
end
|
83
|
+
if boundary_offsets.try(:last).try(:first) == 0
|
84
|
+
next_node = current_node.next_text(self)
|
85
|
+
boundary_offsets.pop if next_node.try(:matched_offsets, :any, :beginning_word, @tbt_opts).present?
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
offsets = (inner_offsets + (boundary_offsets || [])).sort_by{|offset| offset.first}
|
90
|
+
if offsets.any?
|
91
|
+
offsets.each {|offset| yield(current_node, offset)} if block_given?
|
92
|
+
if current_node != self
|
93
|
+
# Excludes processed offsets from all ancestors
|
94
|
+
([current_node] + current_node.ancestors).each do |node|
|
95
|
+
pos = node.position_by_text_node(current_node.first_text_node)
|
96
|
+
# Shifts all offsets according to node inner position and excludes from future processing
|
97
|
+
node.exclude_offsets(offsets.map{|offset| [offset.first + pos, offset.last + pos]})
|
98
|
+
# Reaches highlighting base
|
99
|
+
break if node == self
|
100
|
+
end
|
101
|
+
end
|
34
102
|
end
|
35
103
|
end
|
36
|
-
|
104
|
+
self
|
105
|
+
end
|
106
|
+
|
107
|
+
def position_by_text_node(text_node)
|
108
|
+
nodes = text_nodes
|
109
|
+
if (index = nodes.index(text_node)) < 0
|
110
|
+
raise ArgumentError.new('Self node must contain text_node!')
|
111
|
+
end
|
112
|
+
return 0 if index == 0
|
113
|
+
nodes[0..index - 1].map{|node| node.text}.join('').length
|
37
114
|
end
|
38
115
|
|
39
116
|
def text_node_by_position(in_text_position)
|
@@ -48,11 +125,23 @@ module Hairaito
|
|
48
125
|
raise ArgumentError.new('Inner index is out of range!')
|
49
126
|
end
|
50
127
|
|
51
|
-
def
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
128
|
+
def highlight_by_ranges(ranges, options)
|
129
|
+
if options[:snippet][:part_wrapper].blank?
|
130
|
+
raise ArgumentError.new('Snippet part wrapper tag is not specified!')
|
131
|
+
end
|
132
|
+
parts = []
|
133
|
+
ranges = ranges.sort_by{|r| r[:range].first}
|
134
|
+
ranges.each_with_index do |range_data, index|
|
135
|
+
range = range_data[:range]
|
136
|
+
parts << (range.first > 0 ? text[0..(range.first - 1)]: '') if index == 0
|
137
|
+
snippet_class = range_data[:starting] ? "#{options[:snippet][:starting_part_class]}" : ''
|
138
|
+
wrapper = document.create_element("#{options[:snippet][:part_wrapper]}", class: "#{options[:snippet][:part_wrapper_class]} #{snippet_class}")
|
139
|
+
wrapper.content = text[range]
|
140
|
+
parts << wrapper.to_s
|
141
|
+
parts << text[(range.last + 1)..(ranges[index + 1][:range].first - 1)] if index < ranges.count - 1
|
142
|
+
parts << (range.last < text.length - 1 ? text[(range.last + 1)..(text.length - 1)]: '') if index == ranges.count - 1
|
143
|
+
end
|
144
|
+
new_contents = parts.join('')
|
56
145
|
replace(new_contents)
|
57
146
|
end
|
58
147
|
|
@@ -60,6 +149,63 @@ module Hairaito
|
|
60
149
|
demand_length.present? ? index..[text.length - 1, index + demand_length - 1].min : 0..index
|
61
150
|
end
|
62
151
|
|
152
|
+
# @return [Array] offsets of self node that were already processed
|
153
|
+
def excluded_offsets
|
154
|
+
@excluded_offsets ||= []
|
155
|
+
end
|
156
|
+
|
157
|
+
# @param offsets [Array] self node offsets to be excluded from future processing
|
158
|
+
def exclude_offsets(offsets)
|
159
|
+
@excluded_offsets ||= []
|
160
|
+
@excluded_offsets += offsets
|
161
|
+
end
|
162
|
+
|
163
|
+
def matched_offsets(string, types, options)
|
164
|
+
types = [types] unless types.is_a?(Array)
|
165
|
+
offsets = []
|
166
|
+
types.each do |type|
|
167
|
+
offsets << text.to_enum(:scan, build_regexp(string, type, options)).map do
|
168
|
+
offset = Regexp.last_match.offset(:text)
|
169
|
+
# Only one highlighting per position
|
170
|
+
offset unless overlapped_offsets?(excluded_offsets, offset)
|
171
|
+
end.compact || []
|
172
|
+
end
|
173
|
+
return *offsets
|
174
|
+
end
|
175
|
+
|
176
|
+
private
|
177
|
+
|
178
|
+
def traverse_by_text_defaults(options)
|
179
|
+
@tbt_opts = {
|
180
|
+
whole_words_only: true,
|
181
|
+
inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt),
|
182
|
+
word_parts: '[а-яА-ЯёЁa-zA-Z\d]',
|
183
|
+
}.deep_merge(options).with_indifferent_access
|
184
|
+
end
|
185
|
+
|
186
|
+
def build_regexp(string, type = :simple, options)
|
187
|
+
string = '.+' if string == :any
|
188
|
+
case type.to_sym
|
189
|
+
when :simple
|
190
|
+
return /(?<text>#{string})/
|
191
|
+
when :inner_word
|
192
|
+
return /(?<!#{options[:word_parts]}|\A)(?<text>#{string})(?!#{options[:word_parts]}|\Z)/
|
193
|
+
when :beginning_word
|
194
|
+
return /\A(?<text>#{string})(?!#{options[:word_parts]})/
|
195
|
+
when :ending_word
|
196
|
+
return /(?<!#{options[:word_parts]})(?<text>#{string})\Z/
|
197
|
+
when :boundary_word
|
198
|
+
return /(\A(?<text>#{string})(?!#{options[:word_parts]}))|((?<!#{options[:word_parts]})(?<text>#{string})\Z)|(\A(?<text>#{string})\Z)/
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def overlapped_offsets?(offsets_collection, offset_for_check)
|
203
|
+
offsets_collection.each do |offset|
|
204
|
+
return true if (offset_for_check.first...offset_for_check.last).overlaps?(offset.first...offset.last)
|
205
|
+
end
|
206
|
+
false
|
207
|
+
end
|
208
|
+
|
63
209
|
end
|
64
210
|
end
|
65
211
|
end
|
data/lib/hairaito/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hairaito
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Denis Mazilov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|