algolia_html_extractor 2.2.1 → 2.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/algolia_html_extractor.rb +19 -10
- data/lib/version.rb +2 -1
- metadata +4 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3efc015000ed0122ea70417eb8a48a0f6c0ccde6
|
4
|
+
data.tar.gz: 6923bced71dfc762f8cf734efa96229ed55c46a4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d352d4b97686377e1869a48b6aa2f97900ae0e958eed630ef4cf103ffce63fac468e44a84e21bfcfb5beb89355a7d14a4974030ba75399778101a75e01d5955d
|
7
|
+
data.tar.gz: 3d7316d8e791a8d9fff538726e2b92de9e6a811190fd68fda3840e79e53f8db5bb4ed6b9752c772ee0a32431fb207cb27b57bfd6f6b1b603772f7cb62b6fdf52
|
@@ -4,16 +4,24 @@ require 'digest/md5'
|
|
4
4
|
# Extract content from an HTML page in the form of items with associated
|
5
5
|
# hierarchy data
|
6
6
|
module AlgoliaHTMLExtractor
|
7
|
-
|
7
|
+
# Extractor options, applying default options when none set
|
8
|
+
def self.default_options(options)
|
8
9
|
default_options = {
|
9
|
-
css_selector: 'p'
|
10
|
+
css_selector: 'p',
|
11
|
+
heading_selector: 'h1,h2,h3,h4,h5,h6'
|
10
12
|
}
|
11
|
-
|
13
|
+
default_options.merge(options)
|
14
|
+
end
|
12
15
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
16
|
+
# Getting a list of HTML nodes from an input and a CSS selector
|
17
|
+
def self.css(input, selector)
|
18
|
+
Nokogiri::HTML(input).css(selector)
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.run(input, options: {})
|
22
|
+
options = default_options(options)
|
23
|
+
heading_selector = options[:heading_selector]
|
24
|
+
css_selector = options[:css_selector]
|
17
25
|
|
18
26
|
items = []
|
19
27
|
current_hierarchy = {
|
@@ -28,8 +36,9 @@ module AlgoliaHTMLExtractor
|
|
28
36
|
current_lvl = nil # Current closest hierarchy level
|
29
37
|
current_anchor = nil # Current closest anchor
|
30
38
|
|
31
|
-
|
32
|
-
|
39
|
+
# We select all nodes that match either the headings or the elements to
|
40
|
+
# extract. This will allow us to loop over it in order it appears in the DOM
|
41
|
+
css(input, "#{heading_selector},#{css_selector}").each do |node|
|
33
42
|
# If it's a heading, we update our current hierarchy
|
34
43
|
if node.matches?(heading_selector)
|
35
44
|
# Which level heading is it?
|
@@ -45,7 +54,7 @@ module AlgoliaHTMLExtractor
|
|
45
54
|
end
|
46
55
|
|
47
56
|
# Stop if node is not to be extracted
|
48
|
-
next unless node.matches?(
|
57
|
+
next unless node.matches?(css_selector)
|
49
58
|
|
50
59
|
# Stop if node is empty
|
51
60
|
content = extract_text(node)
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: algolia_html_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.1
|
4
|
+
version: 2.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Carry
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 1.8.2
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 1.8.2
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: coveralls
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,20 +136,6 @@ dependencies:
|
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '4.6'
|
139
|
-
- !ruby/object:Gem::Dependency
|
140
|
-
name: jeweler
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
142
|
-
requirements:
|
143
|
-
- - "~>"
|
144
|
-
- !ruby/object:Gem::Version
|
145
|
-
version: '2.0'
|
146
|
-
type: :development
|
147
|
-
prerelease: false
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
149
|
-
requirements:
|
150
|
-
- - "~>"
|
151
|
-
- !ruby/object:Gem::Version
|
152
|
-
version: '2.0'
|
153
139
|
- !ruby/object:Gem::Dependency
|
154
140
|
name: rspec
|
155
141
|
requirement: !ruby/object:Gem::Requirement
|