hairaito 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +64 -0
- data/lib/hairaito/nokogiri/xml/document.rb +61 -49
- data/lib/hairaito/nokogiri/xml/node.rb +161 -15
- data/lib/hairaito/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 739c486eff2c0dd13f4d31cc67937efd6bc3fabf
|
4
|
+
data.tar.gz: ab89ded036459096b7b8993c7102b23caad3fe3a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 854720af08d6228b2235795188ce59e78e5228f74907c770be7b9469b167d91a887fa0fd46a20c8f0f7d287fa347e6bff4f71347b7d449f79b990b2388be9f2e
|
7
|
+
data.tar.gz: 8b0c6bd2cc94c6b2bc9cd851fb3fe719f048f0122816ddae58a6949757874b1ef78cba42a2a461c8cbee02ad35320517ce245c601862cd3368ce56feb4953863
|
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# Hairaito
|
2
2
|
|
3
|
+
Extends Nokogiri with text snippet highlighting. It works like the jquery-highlight plugin, but for Ruby and Nokogiri.
|
4
|
+
|
3
5
|
## Installation
|
4
6
|
|
5
7
|
Add this line to your application's Gemfile:
|
@@ -16,6 +18,68 @@ Or install it yourself as:
|
|
16
18
|
|
17
19
|
## Usage
|
18
20
|
|
21
|
+
Hairaito adds a _highlight_ method to Nokogiri::XML::Document.
|
22
|
+
|
23
|
+
Example:
|
24
|
+
|
25
|
+
```
|
26
|
+
doc = Nokogiri::XML('<body>abc def ghi</body>')
|
27
|
+
doc.highlight(['def'])
|
28
|
+
doc.to_html # => '<body>abc <span class="snippet-part snippet-start" data-snippet-id="0">def</span> ghi</body>'
|
29
|
+
```
|
30
|
+
|
31
|
+
There are several options for highlighting customization:
|
32
|
+
|
33
|
+
```
|
34
|
+
{
|
35
|
+
highlight_base: {
|
36
|
+
selector: 'body', # Highlighting will be launched at this selector
|
37
|
+
content_wrapper: '', # Highlighting base content can be wrapped by this tag
|
38
|
+
content_wrapper_class: 'highlighting-base', # Class for wrapper above
|
39
|
+
},
|
40
|
+
snippet: {
|
41
|
+
part_wrapper: 'span', # Found snippet parts will be wrapped with this tag
|
42
|
+
part_wrapper_class: 'snippet-part', # Class for wrapper above
|
43
|
+
starting_part_class: 'snippet-start', # Class for wrapper above, is added only for first part per found snippet
|
44
|
+
},
|
45
|
+
numeration: {
|
46
|
+
attr: 'data-snippet-id', # Snippet parts of single snippet will have same numeration value in this attribute
|
47
|
+
prefix: '', # Prefix, that will be added to each numeration value
|
48
|
+
suffix: '', # Suffix, that will be added to each numeration value
|
49
|
+
start_with: 0, # Starting point for numeration increment
|
50
|
+
},
|
51
|
+
boundaries: {
|
52
|
+
whole_words_only: true, # If true, only whole words will be found
|
53
|
+
inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt), # Tags, that aren't considered as word boundary
|
54
|
+
word_parts: '[а-яА-ЯёЁa-zA-Z\d]', # Characters, that are considered as word part
|
55
|
+
},
|
56
|
+
}
|
57
|
+
```
|
58
|
+
|
59
|
+
Example:
|
60
|
+
|
61
|
+
```
|
62
|
+
doc = Nokogiri::XML('<body>abc def ghi abcdefghi</body>')
|
63
|
+
options = {
|
64
|
+
highlight_base: {
|
65
|
+
content_wrapper: 'div',
|
66
|
+
},
|
67
|
+
snippet: {
|
68
|
+
starting_part_class: 'start',
|
69
|
+
part_wrapper_class: 'part',
|
70
|
+
},
|
71
|
+
numeration: {
|
72
|
+
attr: 'data-id',
|
73
|
+
prefix: 'snippet_'
|
74
|
+
},
|
75
|
+
boundaries: {
|
76
|
+
whole_words_only: false,
|
77
|
+
}
|
78
|
+
}
|
79
|
+
doc.highlight(['abc'], options)
|
80
|
+
doc.to_html # => '<body><div class="highlighting-base"><span class="part start" data-id="snippet_0">abc</span> def ghi <span class="part start" data-id="snippet_1">abc</span>defghi</div></body>'
|
81
|
+
```
|
82
|
+
|
19
83
|
## Contributing
|
20
84
|
|
21
85
|
1. Fork it ( https://github.com/dmazilov/hairaito/fork )
|
@@ -3,80 +3,92 @@ module Hairaito
|
|
3
3
|
module XML
|
4
4
|
module Document
|
5
5
|
|
6
|
+
# Highlights text snippets in document
|
7
|
+
#
|
8
|
+
# @param snippets [Array<String>] text variants to be highlighted
|
9
|
+
# @param options [Hash] custom highlighting options
|
10
|
+
# @return [Nokogiri::XML::Document] self document for chaining
|
6
11
|
def highlight(snippets, options = {})
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
start_node, start_inner_index = snippet_container.text_node_by_position(start_index)
|
12
|
+
highlighting_defaults(options)
|
13
|
+
snippet_parts_to_wrap = []
|
14
|
+
prepare_snippets(snippets).each do |snippet|
|
15
|
+
highlighting_base.traverse_by_text(snippet, @hl_opts[:boundaries]) do |snippet_container, snippet_offset|
|
16
|
+
start_node, start_inner_index = snippet_container.text_node_by_position(snippet_offset.first)
|
14
17
|
start_range = start_node.text_range_by_index(start_inner_index, snippet.length)
|
15
|
-
|
18
|
+
snippet_parts_to_wrap << {part: start_node, range: start_range, starting: true}
|
16
19
|
|
17
20
|
# If start node contains only part of snippet
|
18
21
|
if snippet.length > start_range.size
|
19
|
-
end_node, end_inner_index = snippet_container.text_node_by_position(
|
22
|
+
end_node, end_inner_index = snippet_container.text_node_by_position(snippet_offset.last - 1)
|
20
23
|
end_range = end_node.text_range_by_index(end_inner_index)
|
21
|
-
|
22
|
-
|
24
|
+
snippet_parts_to_wrap += snippet_container.text_nodes_between(start_node, end_node).map do |node|
|
25
|
+
{part: node, range: 0..(node.text.length - 1)}
|
23
26
|
end
|
24
|
-
|
25
|
-
end
|
26
|
-
|
27
|
-
to_wrap.each do |node_data|
|
28
|
-
node_data.first.highlight_by_range(node_data.last)
|
27
|
+
snippet_parts_to_wrap << {part: end_node, range: end_range}
|
29
28
|
end
|
30
|
-
|
31
|
-
snippet_container['class'] = "#{snippet_container['class']} #{@hl_opts[:snippet_container_class]}"
|
32
29
|
end
|
33
30
|
end
|
34
|
-
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
def highlight_snippet_part(text)
|
39
|
-
if @hl_opts[:snippet_part_wrapper].blank?
|
40
|
-
raise ArgumentError.new('Snippet part wrapper tag is not specified!')
|
31
|
+
snippet_parts_to_wrap.group_by{|part_data| part_data[:part]}.each do |part, parts_collection|
|
32
|
+
part.highlight_by_ranges(parts_collection.map{|p| p.except(:part)}, @hl_opts)
|
41
33
|
end
|
42
|
-
|
43
|
-
|
44
|
-
wrapper
|
34
|
+
numerate_snippet_parts if @hl_opts[:numeration][:attr].present?
|
35
|
+
self
|
45
36
|
end
|
46
37
|
|
47
38
|
private
|
48
39
|
|
49
|
-
def
|
40
|
+
def highlighting_defaults(options)
|
41
|
+
@hl_base = nil
|
50
42
|
@hl_opts = {
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
43
|
+
highlight_base: {
|
44
|
+
selector: 'body',
|
45
|
+
content_wrapper: '',
|
46
|
+
content_wrapper_class: 'highlighting-base',
|
47
|
+
},
|
48
|
+
snippet: {
|
49
|
+
part_wrapper: 'span',
|
50
|
+
part_wrapper_class: 'snippet-part',
|
51
|
+
starting_part_class: 'snippet-start',
|
52
|
+
},
|
53
|
+
numeration: {
|
54
|
+
attr: 'data-snippet-id',
|
55
|
+
prefix: '',
|
56
|
+
suffix: '',
|
57
|
+
start_with: 0,
|
58
|
+
},
|
59
|
+
boundaries: {},
|
60
|
+
}.deep_merge(options).with_indifferent_access
|
62
61
|
end
|
63
62
|
|
64
63
|
def highlighting_base
|
65
|
-
|
64
|
+
return @hl_base if @hl_base.present?
|
65
|
+
base = at(@hl_opts[:highlight_base][:selector])
|
66
66
|
raise ArgumentError.new('Document does not contain highlighting base element!') if base.blank?
|
67
|
-
if @hl_opts[:
|
68
|
-
wrapper = create_element("#{@hl_opts[:
|
67
|
+
if @hl_opts[:highlight_base][:content_wrapper].present?
|
68
|
+
wrapper = create_element("#{@hl_opts[:highlight_base][:content_wrapper]}",
|
69
|
+
class: "#{@hl_opts[:highlight_base][:content_wrapper_class]}")
|
69
70
|
base.children.each{|child| child.parent = wrapper}
|
70
71
|
wrapper.parent = base
|
71
|
-
|
72
|
+
@hl_base = wrapper
|
73
|
+
else
|
74
|
+
@hl_base = base
|
72
75
|
end
|
73
|
-
|
76
|
+
@hl_base
|
77
|
+
end
|
78
|
+
|
79
|
+
# Longer snippets must go first due to situations with snippets overlapping
|
80
|
+
# Example: ['abc', 'abcdef'],
|
81
|
+
# without sorting this produces highlighting artifacts like shorter snippet duplication in result nodes
|
82
|
+
def prepare_snippets(snippets)
|
83
|
+
snippets.uniq.sort_by{|snippet| snippet.length}.reverse
|
74
84
|
end
|
75
85
|
|
76
|
-
def
|
77
|
-
|
78
|
-
|
79
|
-
|
86
|
+
def numerate_snippet_parts
|
87
|
+
selector = @hl_opts[:snippet][:part_wrapper_class].gsub(/\s+/, ' ').split(' ').map{|cl| ".#{cl}"}.join('')
|
88
|
+
index = @hl_opts[:numeration][:start_with] - 1
|
89
|
+
css(selector).each do |part|
|
90
|
+
index += 1 if part[:class].split(' ').include?(@hl_opts[:snippet][:starting_part_class])
|
91
|
+
part[@hl_opts[:numeration][:attr]] = "#{@hl_opts[:numeration][:prefix]}#{index}#{@hl_opts[:numeration][:suffix]}"
|
80
92
|
end
|
81
93
|
end
|
82
94
|
|
@@ -3,6 +3,7 @@ module Hairaito
|
|
3
3
|
module XML
|
4
4
|
module Node
|
5
5
|
|
6
|
+
# @return [Nokogiri::XML::NodeSet] all text nodes that have self as an ancestor
|
6
7
|
def text_nodes
|
7
8
|
result_nodes = []
|
8
9
|
traverse do |node|
|
@@ -12,6 +13,17 @@ module Hairaito
|
|
12
13
|
::Nokogiri::XML::NodeSet.new(document, result_nodes)
|
13
14
|
end
|
14
15
|
|
16
|
+
# @return [Nokogiri::XML::Node] first text node within self node
|
17
|
+
def first_text_node
|
18
|
+
traverse do |node|
|
19
|
+
return node if node.text?
|
20
|
+
end
|
21
|
+
nil
|
22
|
+
end
|
23
|
+
|
24
|
+
# @param start_node [Nokogiri::XML::Node] left boundary
|
25
|
+
# @param end_node [Nokogiri::XML::Node] right boundary
|
26
|
+
# @return [Nokogiri::XML::NodeSet] all text nodes located between the specified boundaries
|
15
27
|
def text_nodes_between(start_node, end_node)
|
16
28
|
nodes = text_nodes
|
17
29
|
indexes = [nodes.index(start_node), nodes.index(end_node)]
|
@@ -22,18 +34,83 @@ module Hairaito
|
|
22
34
|
::Nokogiri::XML::NodeSet.new(document, result_nodes)
|
23
35
|
end
|
24
36
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
37
|
+
# @param base [Nokogiri::XML::Node] root element for search
|
38
|
+
# @return [Nokogiri::XML::Node, nil] previous text node within base node or nil if it doesn't exist
|
39
|
+
def previous_text(base = document)
|
40
|
+
first_text_node = text_nodes.first
|
41
|
+
base_text_nodes = base.text_nodes
|
42
|
+
if (index = base_text_nodes.index(first_text_node)).blank?
|
43
|
+
raise ArgumentError.new('Base must contain self node!')
|
44
|
+
end
|
45
|
+
return if index == 0
|
46
|
+
base_text_nodes[index - 1]
|
47
|
+
end
|
48
|
+
|
49
|
+
# @param base [Nokogiri::XML::Node] root element for search
|
50
|
+
# @return [Nokogiri::XML::Node, nil] next text node within base node or nil if it doesn't exist
|
51
|
+
def next_text(base = document)
|
52
|
+
first_text_node = text_nodes.last
|
53
|
+
base_text_nodes = base.text_nodes
|
54
|
+
if (index = base_text_nodes.index(first_text_node)).blank?
|
55
|
+
raise ArgumentError.new('Base must contain self node!')
|
56
|
+
end
|
57
|
+
return if index == base_text_nodes.count - 1
|
58
|
+
base_text_nodes[index + 1]
|
59
|
+
end
|
60
|
+
|
61
|
+
# Yields for each match of specified string in child nodes recursively
|
62
|
+
#
|
63
|
+
# @yieldparam node [Nokogiri::XML::Node] child node that contains the specified string
|
64
|
+
# @yieldparam offset [Array] child text inner offset
|
65
|
+
# @param string [String] text for matching
|
66
|
+
# @param options [Hash] @see #traverse_by_text_defaults
|
67
|
+
# @return [Nokogiri::XML::Node] self node for chaining
|
68
|
+
def traverse_by_text(string, options = {}, &block)
|
69
|
+
traverse_by_text_defaults(options)
|
70
|
+
traverse do |current_node|
|
71
|
+
next if current_node.text?
|
72
|
+
|
73
|
+
offset_types = @tbt_opts[:whole_words_only] ? [:inner_word, :boundary_word] : [:simple]
|
74
|
+
inner_offsets, boundary_offsets = current_node.matched_offsets(string, offset_types, @tbt_opts)
|
75
|
+
|
76
|
+
# Check words bordered with current inline tag if current node has boundary words
|
77
|
+
# abc<span>def</span> or <span>def</span>ghi or abc<span>def</span>ghi
|
78
|
+
if current_node.name.in?(@tbt_opts[:inline_tags]) && self != current_node
|
79
|
+
if boundary_offsets.try(:first).try(:first) == 0
|
80
|
+
previous_node = current_node.previous_text(self)
|
81
|
+
boundary_offsets.shift if previous_node.try(:matched_offsets, :any, :ending_word, @tbt_opts).present?
|
82
|
+
end
|
83
|
+
if boundary_offsets.try(:last).try(:first) == 0
|
84
|
+
next_node = current_node.next_text(self)
|
85
|
+
boundary_offsets.pop if next_node.try(:matched_offsets, :any, :beginning_word, @tbt_opts).present?
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
offsets = (inner_offsets + (boundary_offsets || [])).sort_by{|offset| offset.first}
|
90
|
+
if offsets.any?
|
91
|
+
offsets.each {|offset| yield(current_node, offset)} if block_given?
|
92
|
+
if current_node != self
|
93
|
+
# Excludes processed offsets from all ancestors
|
94
|
+
([current_node] + current_node.ancestors).each do |node|
|
95
|
+
pos = node.position_by_text_node(current_node.first_text_node)
|
96
|
+
# Shifts all offsets according to node inner position and excludes from future processing
|
97
|
+
node.exclude_offsets(offsets.map{|offset| [offset.first + pos, offset.last + pos]})
|
98
|
+
# Reaches highlighting base
|
99
|
+
break if node == self
|
100
|
+
end
|
101
|
+
end
|
34
102
|
end
|
35
103
|
end
|
36
|
-
|
104
|
+
self
|
105
|
+
end
|
106
|
+
|
107
|
+
def position_by_text_node(text_node)
|
108
|
+
nodes = text_nodes
|
109
|
+
if (index = nodes.index(text_node)) < 0
|
110
|
+
raise ArgumentError.new('Self node must contain text_node!')
|
111
|
+
end
|
112
|
+
return 0 if index == 0
|
113
|
+
nodes[0..index - 1].map{|node| node.text}.join('').length
|
37
114
|
end
|
38
115
|
|
39
116
|
def text_node_by_position(in_text_position)
|
@@ -48,11 +125,23 @@ module Hairaito
|
|
48
125
|
raise ArgumentError.new('Inner index is out of range!')
|
49
126
|
end
|
50
127
|
|
51
|
-
def
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
128
|
+
def highlight_by_ranges(ranges, options)
|
129
|
+
if options[:snippet][:part_wrapper].blank?
|
130
|
+
raise ArgumentError.new('Snippet part wrapper tag is not specified!')
|
131
|
+
end
|
132
|
+
parts = []
|
133
|
+
ranges = ranges.sort_by{|r| r[:range].first}
|
134
|
+
ranges.each_with_index do |range_data, index|
|
135
|
+
range = range_data[:range]
|
136
|
+
parts << (range.first > 0 ? text[0..(range.first - 1)]: '') if index == 0
|
137
|
+
snippet_class = range_data[:starting] ? "#{options[:snippet][:starting_part_class]}" : ''
|
138
|
+
wrapper = document.create_element("#{options[:snippet][:part_wrapper]}", class: "#{options[:snippet][:part_wrapper_class]} #{snippet_class}")
|
139
|
+
wrapper.content = text[range]
|
140
|
+
parts << wrapper.to_s
|
141
|
+
parts << text[(range.last + 1)..(ranges[index + 1][:range].first - 1)] if index < ranges.count - 1
|
142
|
+
parts << (range.last < text.length - 1 ? text[(range.last + 1)..(text.length - 1)]: '') if index == ranges.count - 1
|
143
|
+
end
|
144
|
+
new_contents = parts.join('')
|
56
145
|
replace(new_contents)
|
57
146
|
end
|
58
147
|
|
@@ -60,6 +149,63 @@ module Hairaito
|
|
60
149
|
demand_length.present? ? index..[text.length - 1, index + demand_length - 1].min : 0..index
|
61
150
|
end
|
62
151
|
|
152
|
+
# @return [Array] offsets of self node that were already processed
|
153
|
+
def excluded_offsets
|
154
|
+
@excluded_offsets ||= []
|
155
|
+
end
|
156
|
+
|
157
|
+
# @param offsets [Array] self node offsets to be excluded in the future processing
|
158
|
+
def exclude_offsets(offsets)
|
159
|
+
@excluded_offsets ||= []
|
160
|
+
@excluded_offsets += offsets
|
161
|
+
end
|
162
|
+
|
163
|
+
def matched_offsets(string, types, options)
|
164
|
+
types = [types] unless types.is_a?(Array)
|
165
|
+
offsets = []
|
166
|
+
types.each do |type|
|
167
|
+
offsets << text.to_enum(:scan, build_regexp(string, type, options)).map do
|
168
|
+
offset = Regexp.last_match.offset(:text)
|
169
|
+
# Only one highlighting per position
|
170
|
+
offset unless overlapped_offsets?(excluded_offsets, offset)
|
171
|
+
end.compact || []
|
172
|
+
end
|
173
|
+
return *offsets
|
174
|
+
end
|
175
|
+
|
176
|
+
private
|
177
|
+
|
178
|
+
def traverse_by_text_defaults(options)
|
179
|
+
@tbt_opts = {
|
180
|
+
whole_words_only: true,
|
181
|
+
inline_tags: %w(a b i s u basefont big em font img label small span strike strong sub sup tt),
|
182
|
+
word_parts: '[а-яА-ЯёЁa-zA-Z\d]',
|
183
|
+
}.deep_merge(options).with_indifferent_access
|
184
|
+
end
|
185
|
+
|
186
|
+
def build_regexp(string, type = :simple, options)
|
187
|
+
string = '.+' if string == :any
|
188
|
+
case type.to_sym
|
189
|
+
when :simple
|
190
|
+
return /(?<text>#{string})/
|
191
|
+
when :inner_word
|
192
|
+
return /(?<!#{options[:word_parts]}|\A)(?<text>#{string})(?!#{options[:word_parts]}|\Z)/
|
193
|
+
when :beginning_word
|
194
|
+
return /\A(?<text>#{string})(?!#{options[:word_parts]})/
|
195
|
+
when :ending_word
|
196
|
+
return /(?<!#{options[:word_parts]})(?<text>#{string})\Z/
|
197
|
+
when :boundary_word
|
198
|
+
return /(\A(?<text>#{string})(?!#{options[:word_parts]}))|((?<!#{options[:word_parts]})(?<text>#{string})\Z)|(\A(?<text>#{string})\Z)/
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def overlapped_offsets?(offsets_collection, offset_for_check)
|
203
|
+
offsets_collection.each do |offset|
|
204
|
+
return true if (offset_for_check.first...offset_for_check.last).overlaps?(offset.first...offset.last)
|
205
|
+
end
|
206
|
+
false
|
207
|
+
end
|
208
|
+
|
63
209
|
end
|
64
210
|
end
|
65
211
|
end
|
data/lib/hairaito/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hairaito
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Denis Mazilov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|