algolia_html_extractor 2.2.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/algolia_html_extractor.rb +73 -72
- data/lib/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
|
4
|
+
data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
|
7
|
+
data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8
|
@@ -3,20 +3,80 @@ require 'digest/md5'
|
|
3
3
|
|
4
4
|
# Extract content from an HTML page in the form of items with associated
|
5
5
|
# hierarchy data
|
6
|
-
|
7
|
-
def
|
8
|
-
@dom = Nokogiri::HTML(input)
|
6
|
+
module AlgoliaHTMLExtractor
|
7
|
+
def self.run(input, options: {})
|
9
8
|
default_options = {
|
10
9
|
css_selector: 'p'
|
11
10
|
}
|
12
|
-
|
11
|
+
options = default_options.merge(options)
|
12
|
+
|
13
|
+
heading_selector = 'h1,h2,h3,h4,h5,h6'
|
14
|
+
# We select all nodes that match either the headings or the elements to
|
15
|
+
# extract. This will allow us to loop over it in order it appears in the DOM
|
16
|
+
all_selector = "#{heading_selector},#{options[:css_selector]}"
|
17
|
+
|
18
|
+
items = []
|
19
|
+
current_hierarchy = {
|
20
|
+
lvl0: nil,
|
21
|
+
lvl1: nil,
|
22
|
+
lvl2: nil,
|
23
|
+
lvl3: nil,
|
24
|
+
lvl4: nil,
|
25
|
+
lvl5: nil
|
26
|
+
}
|
27
|
+
current_position = 0 # Position of the DOM node in the tree
|
28
|
+
current_lvl = nil # Current closest hierarchy level
|
29
|
+
current_anchor = nil # Current closest anchor
|
30
|
+
|
31
|
+
dom = Nokogiri::HTML(input)
|
32
|
+
dom.css(all_selector).each do |node|
|
33
|
+
# If it's a heading, we update our current hierarchy
|
34
|
+
if node.matches?(heading_selector)
|
35
|
+
# Which level heading is it?
|
36
|
+
current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
|
37
|
+
# Update this level, and set all the following ones to nil
|
38
|
+
current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
|
39
|
+
(current_lvl + 1..6).each do |lvl|
|
40
|
+
current_hierarchy["lvl#{lvl}".to_sym] = nil
|
41
|
+
end
|
42
|
+
# Update the anchor, if the new heading has one
|
43
|
+
new_anchor = extract_anchor(node)
|
44
|
+
current_anchor = new_anchor if new_anchor
|
45
|
+
end
|
46
|
+
|
47
|
+
# Stop if node is not to be extracted
|
48
|
+
next unless node.matches?(options[:css_selector])
|
49
|
+
|
50
|
+
# Stop if node is empty
|
51
|
+
content = extract_text(node)
|
52
|
+
next if content.empty?
|
53
|
+
|
54
|
+
item = {
|
55
|
+
html: extract_html(node),
|
56
|
+
content: content,
|
57
|
+
tag_name: extract_tag_name(node),
|
58
|
+
hierarchy: current_hierarchy.clone,
|
59
|
+
anchor: current_anchor,
|
60
|
+
node: node,
|
61
|
+
weight: {
|
62
|
+
position: current_position,
|
63
|
+
heading: heading_weight(current_lvl)
|
64
|
+
}
|
65
|
+
}
|
66
|
+
item[:objectID] = uuid(item)
|
67
|
+
items << item
|
68
|
+
|
69
|
+
current_position += 1
|
70
|
+
end
|
71
|
+
|
72
|
+
items
|
13
73
|
end
|
14
74
|
|
15
75
|
# Returns the outer HTML of a given node
|
16
76
|
#
|
17
77
|
# eg.
|
18
78
|
# <p>foo</p> => <p>foo</p>
|
19
|
-
def extract_html(node)
|
79
|
+
def self.extract_html(node)
|
20
80
|
node.to_s.strip
|
21
81
|
end
|
22
82
|
|
@@ -24,7 +84,7 @@ class AlgoliaHTMLExtractor
|
|
24
84
|
#
|
25
85
|
# eg.
|
26
86
|
# <p>foo</p> => foo
|
27
|
-
def extract_text(node)
|
87
|
+
def self.extract_text(node)
|
28
88
|
node.content
|
29
89
|
end
|
30
90
|
|
@@ -32,7 +92,7 @@ class AlgoliaHTMLExtractor
|
|
32
92
|
#
|
33
93
|
# eg
|
34
94
|
# <p>foo</p> => p
|
35
|
-
def extract_tag_name(node)
|
95
|
+
def self.extract_tag_name(node)
|
36
96
|
node.name.downcase
|
37
97
|
end
|
38
98
|
|
@@ -42,7 +102,7 @@ class AlgoliaHTMLExtractor
|
|
42
102
|
# <h1 name="anchor">Foo</h1> => anchor
|
43
103
|
# <h1 id="anchor">Foo</h1> => anchor
|
44
104
|
# <h1><a name="anchor">Foo</a></h1> => anchor
|
45
|
-
def extract_anchor(node)
|
105
|
+
def self.extract_anchor(node)
|
46
106
|
anchor = node.attr('name') || node.attr('id') || nil
|
47
107
|
return anchor unless anchor.nil?
|
48
108
|
|
@@ -55,7 +115,10 @@ class AlgoliaHTMLExtractor
|
|
55
115
|
|
56
116
|
##
|
57
117
|
# Generate a unique identifier for the item
|
58
|
-
def uuid(item)
|
118
|
+
def self.uuid(item)
|
119
|
+
# We don't use the objectID as part of the hash algorithm
|
120
|
+
|
121
|
+
item.delete(:objectID)
|
59
122
|
# We first get all the keys of the object, sorted alphabetically...
|
60
123
|
ordered_keys = item.keys.sort
|
61
124
|
|
@@ -74,71 +137,9 @@ class AlgoliaHTMLExtractor
|
|
74
137
|
##
|
75
138
|
# Get a relative numeric value of the importance of the heading
|
76
139
|
# 100 for top level, then -10 per heading
|
77
|
-
def heading_weight(heading_level)
|
140
|
+
def self.heading_weight(heading_level)
|
78
141
|
weight = 100
|
79
142
|
return weight if heading_level.nil?
|
80
143
|
weight - ((heading_level + 1) * 10)
|
81
144
|
end
|
82
|
-
|
83
|
-
def extract
|
84
|
-
heading_selector = 'h1,h2,h3,h4,h5,h6'
|
85
|
-
# We select all nodes that match either the headings or the elements to
|
86
|
-
# extract. This will allow us to loop over it in order it appears in the DOM
|
87
|
-
all_selector = "#{heading_selector},#{@options[:css_selector]}"
|
88
|
-
|
89
|
-
items = []
|
90
|
-
current_hierarchy = {
|
91
|
-
lvl0: nil,
|
92
|
-
lvl1: nil,
|
93
|
-
lvl2: nil,
|
94
|
-
lvl3: nil,
|
95
|
-
lvl4: nil,
|
96
|
-
lvl5: nil
|
97
|
-
}
|
98
|
-
current_position = 0 # Position of the DOM node in the tree
|
99
|
-
current_lvl = nil # Current closest hierarchy level
|
100
|
-
current_anchor = nil # Current closest anchor
|
101
|
-
|
102
|
-
@dom.css(all_selector).each do |node|
|
103
|
-
# If it's a heading, we update our current hierarchy
|
104
|
-
if node.matches?(heading_selector)
|
105
|
-
# Which level heading is it?
|
106
|
-
current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1
|
107
|
-
# Update this level, and set all the following ones to nil
|
108
|
-
current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node)
|
109
|
-
(current_lvl + 1..6).each do |lvl|
|
110
|
-
current_hierarchy["lvl#{lvl}".to_sym] = nil
|
111
|
-
end
|
112
|
-
# Update the anchor, if the new heading has one
|
113
|
-
new_anchor = extract_anchor(node)
|
114
|
-
current_anchor = new_anchor if new_anchor
|
115
|
-
end
|
116
|
-
|
117
|
-
# Stop if node is not to be extracted
|
118
|
-
next unless node.matches?(@options[:css_selector])
|
119
|
-
|
120
|
-
# Stop if node is empty
|
121
|
-
content = extract_text(node)
|
122
|
-
next if content.empty?
|
123
|
-
|
124
|
-
item = {
|
125
|
-
html: extract_html(node),
|
126
|
-
content: content,
|
127
|
-
tag_name: extract_tag_name(node),
|
128
|
-
hierarchy: current_hierarchy.clone,
|
129
|
-
anchor: current_anchor,
|
130
|
-
node: node,
|
131
|
-
weight: {
|
132
|
-
position: current_position,
|
133
|
-
heading: heading_weight(current_lvl)
|
134
|
-
}
|
135
|
-
}
|
136
|
-
item[:objectID] = uuid(item)
|
137
|
-
items << item
|
138
|
-
|
139
|
-
current_position += 1
|
140
|
-
end
|
141
|
-
|
142
|
-
items
|
143
|
-
end
|
144
145
|
end
|
data/lib/version.rb
CHANGED