algolia_html_extractor 2.2.0 → 2.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/algolia_html_extractor.rb +73 -72
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
|
4
|
+
data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
|
7
|
+
data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8
|
@@ -3,20 +3,80 @@ require 'digest/md5'
|
|
3
3
|
|
4
4
|
# Extract content from an HTML page in the form of items with associated
|
5
5
|
# hierarchy data
|
6
|
-
|
7
|
-
def
|
8
|
-
@dom = Nokogiri::HTML(input)
|
6
|
+
module AlgoliaHTMLExtractor
|
7
|
+
def self.run(input, options: {})
|
9
8
|
default_options = {
|
10
9
|
css_selector: 'p'
|
11
10
|
}
|
12
|
-
|
11
|
+
options = default_options.merge(options)
|
12
|
+
|
13
|
+
heading_selector = 'h1,h2,h3,h4,h5,h6'
|
14
|
+
# We select all nodes that match either the headings or the elements to
|
15
|
+
# extract. This will allow us to loop over it in order it appears in the DOM
|
16
|
+
all_selector = "#{heading_selector},#{options[:css_selector]}"
|
17
|
+
|
18
|
+
items = []
|
19
|
+
current_hierarchy = {
|
20
|
+
lvl0: nil,
|
21
|
+
lvl1: nil,
|
22
|
+
lvl2: nil,
|
23
|
+
lvl3: nil,
|
24
|
+
lvl4: nil,
|
25
|
+
lvl5: nil
|
26
|
+
}
|
27
|
+
current_position = 0 # Position of the DOM node in the tree
|
28
|
+
current_lvl = nil # Current closest hierarchy level
|
29
|
+
current_anchor = nil # Current closest anchor
|
30
|
+
|
31
|
+
dom = Nokogiri::HTML(input)
|
32
|
+
dom.css(all_selector).each do |node|
|
33
|
+
# If it's a heading, we update our current hierarchy
|
34
|
+
if node.matches?(heading_selector)
|
35
|
+
# Which level heading is it?
|
36
|
+
current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
|
37
|
+
# Update this level, and set all the following ones to nil
|
38
|
+
current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
|
39
|
+
(current_lvl + 1..6).each do |lvl|
|
40
|
+
current_hierarchy["lvl#{lvl}".to_sym] = nil
|
41
|
+
end
|
42
|
+
# Update the anchor, if the new heading has one
|
43
|
+
new_anchor = extract_anchor(node)
|
44
|
+
current_anchor = new_anchor if new_anchor
|
45
|
+
end
|
46
|
+
|
47
|
+
# Stop if node is not to be extracted
|
48
|
+
next unless node.matches?(options[:css_selector])
|
49
|
+
|
50
|
+
# Stop if node is empty
|
51
|
+
content = extract_text(node)
|
52
|
+
next if content.empty?
|
53
|
+
|
54
|
+
item = {
|
55
|
+
html: extract_html(node),
|
56
|
+
content: content,
|
57
|
+
tag_name: extract_tag_name(node),
|
58
|
+
hierarchy: current_hierarchy.clone,
|
59
|
+
anchor: current_anchor,
|
60
|
+
node: node,
|
61
|
+
weight: {
|
62
|
+
position: current_position,
|
63
|
+
heading: heading_weight(current_lvl)
|
64
|
+
}
|
65
|
+
}
|
66
|
+
item[:objectID] = uuid(item)
|
67
|
+
items << item
|
68
|
+
|
69
|
+
current_position += 1
|
70
|
+
end
|
71
|
+
|
72
|
+
items
|
13
73
|
end
|
14
74
|
|
15
75
|
# Returns the outer HTML of a given node
|
16
76
|
#
|
17
77
|
# eg.
|
18
78
|
# <p>foo</p> => <p>foo</p>
|
19
|
-
def extract_html(node)
|
79
|
+
def self.extract_html(node)
|
20
80
|
node.to_s.strip
|
21
81
|
end
|
22
82
|
|
@@ -24,7 +84,7 @@ class AlgoliaHTMLExtractor
|
|
24
84
|
#
|
25
85
|
# eg.
|
26
86
|
# <p>foo</p> => foo
|
27
|
-
def extract_text(node)
|
87
|
+
def self.extract_text(node)
|
28
88
|
node.content
|
29
89
|
end
|
30
90
|
|
@@ -32,7 +92,7 @@ class AlgoliaHTMLExtractor
|
|
32
92
|
#
|
33
93
|
# eg
|
34
94
|
# <p>foo</p> => p
|
35
|
-
def extract_tag_name(node)
|
95
|
+
def self.extract_tag_name(node)
|
36
96
|
node.name.downcase
|
37
97
|
end
|
38
98
|
|
@@ -42,7 +102,7 @@ class AlgoliaHTMLExtractor
|
|
42
102
|
# <h1 name="anchor">Foo</h1> => anchor
|
43
103
|
# <h1 id="anchor">Foo</h1> => anchor
|
44
104
|
# <h1><a name="anchor">Foo</a></h1> => anchor
|
45
|
-
def extract_anchor(node)
|
105
|
+
def self.extract_anchor(node)
|
46
106
|
anchor = node.attr('name') || node.attr('id') || nil
|
47
107
|
return anchor unless anchor.nil?
|
48
108
|
|
@@ -55,7 +115,10 @@ class AlgoliaHTMLExtractor
|
|
55
115
|
|
56
116
|
##
|
57
117
|
# Generate a unique identifier for the item
|
58
|
-
def uuid(item)
|
118
|
+
def self.uuid(item)
|
119
|
+
# We don't use the objectID as part of the hash algorithm
|
120
|
+
|
121
|
+
item.delete(:objectID)
|
59
122
|
# We first get all the keys of the object, sorted alphabetically...
|
60
123
|
ordered_keys = item.keys.sort
|
61
124
|
|
@@ -74,71 +137,9 @@ class AlgoliaHTMLExtractor
|
|
74
137
|
##
|
75
138
|
# Get a relative numeric value of the importance of the heading
|
76
139
|
# 100 for top level, then -10 per heading
|
77
|
-
def heading_weight(heading_level)
|
140
|
+
def self.heading_weight(heading_level)
|
78
141
|
weight = 100
|
79
142
|
return weight if heading_level.nil?
|
80
143
|
weight - ((heading_level + 1) * 10)
|
81
144
|
end
|
82
|
-
|
83
|
-
def extract
|
84
|
-
heading_selector = 'h1,h2,h3,h4,h5,h6'
|
85
|
-
# We select all nodes that match either the headings or the elements to
|
86
|
-
# extract. This will allow us to loop over it in order it appears in the DOM
|
87
|
-
all_selector = "#{heading_selector},#{@options[:css_selector]}"
|
88
|
-
|
89
|
-
items = []
|
90
|
-
current_hierarchy = {
|
91
|
-
lvl0: nil,
|
92
|
-
lvl1: nil,
|
93
|
-
lvl2: nil,
|
94
|
-
lvl3: nil,
|
95
|
-
lvl4: nil,
|
96
|
-
lvl5: nil
|
97
|
-
}
|
98
|
-
current_position = 0 # Position of the DOM node in the tree
|
99
|
-
current_lvl = nil # Current closest hierarchy level
|
100
|
-
current_anchor = nil # Current closest anchor
|
101
|
-
|
102
|
-
@dom.css(all_selector).each do |node|
|
103
|
-
# If it's a heading, we update our current hierarchy
|
104
|
-
if node.matches?(heading_selector)
|
105
|
-
# Which level heading is it?
|
106
|
-
current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
|
107
|
-
# Update this level, and set all the following ones to nil
|
108
|
-
current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
|
109
|
-
(current_lvl + 1..6).each do |lvl|
|
110
|
-
current_hierarchy["lvl#{lvl}".to_sym] = nil
|
111
|
-
end
|
112
|
-
# Update the anchor, if the new heading has one
|
113
|
-
new_anchor = extract_anchor(node)
|
114
|
-
current_anchor = new_anchor if new_anchor
|
115
|
-
end
|
116
|
-
|
117
|
-
# Stop if node is not to be extracted
|
118
|
-
next unless node.matches?(@options[:css_selector])
|
119
|
-
|
120
|
-
# Stop if node is empty
|
121
|
-
content = extract_text(node)
|
122
|
-
next if content.empty?
|
123
|
-
|
124
|
-
item = {
|
125
|
-
html: extract_html(node),
|
126
|
-
content: content,
|
127
|
-
tag_name: extract_tag_name(node),
|
128
|
-
hierarchy: current_hierarchy.clone,
|
129
|
-
anchor: current_anchor,
|
130
|
-
node: node,
|
131
|
-
weight: {
|
132
|
-
position: current_position,
|
133
|
-
heading: heading_weight(current_lvl)
|
134
|
-
}
|
135
|
-
}
|
136
|
-
item[:objectID] = uuid(item)
|
137
|
-
items << item
|
138
|
-
|
139
|
-
current_position += 1
|
140
|
-
end
|
141
|
-
|
142
|
-
items
|
143
|
-
end
|
144
145
|
end
|
data/lib/version.rb
CHANGED