algolia_html_extractor 2.2.1 → 2.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
4
- data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
3
+ metadata.gz: 3efc015000ed0122ea70417eb8a48a0f6c0ccde6
4
+ data.tar.gz: 6923bced71dfc762f8cf734efa96229ed55c46a4
5
5
  SHA512:
6
- metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
7
- data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8
6
+ metadata.gz: d352d4b97686377e1869a48b6aa2f97900ae0e958eed630ef4cf103ffce63fac468e44a84e21bfcfb5beb89355a7d14a4974030ba75399778101a75e01d5955d
7
+ data.tar.gz: 3d7316d8e791a8d9fff538726e2b92de9e6a811190fd68fda3840e79e53f8db5bb4ed6b9752c772ee0a32431fb207cb27b57bfd6f6b1b603772f7cb62b6fdf52
@@ -4,16 +4,24 @@ require 'digest/md5'
4
4
  # Extract content from an HTML page in the form of items with associated
5
5
  # hierarchy data
6
6
  module AlgoliaHTMLExtractor
7
- def self.run(input, options: {})
7
+ # Extractor options, applying default options when none set
8
+ def self.default_options(options)
8
9
  default_options = {
9
- css_selector: 'p'
10
+ css_selector: 'p',
11
+ heading_selector: 'h1,h2,h3,h4,h5,h6'
10
12
  }
11
- options = default_options.merge(options)
13
+ default_options.merge(options)
14
+ end
12
15
 
13
- heading_selector = 'h1,h2,h3,h4,h5,h6'
14
- # We select all nodes that match either the headings or the elements to
15
- # extract. This will allow us to loop over it in order it appears in the DOM
16
- all_selector = "#{heading_selector},#{options[:css_selector]}"
16
+ # Getting a list of HTML nodes from an input and a CSS selector
17
+ def self.css(input, selector)
18
+ Nokogiri::HTML(input).css(selector)
19
+ end
20
+
21
+ def self.run(input, options: {})
22
+ options = default_options(options)
23
+ heading_selector = options[:heading_selector]
24
+ css_selector = options[:css_selector]
17
25
 
18
26
  items = []
19
27
  current_hierarchy = {
@@ -28,8 +36,9 @@ module AlgoliaHTMLExtractor
28
36
  current_lvl = nil # Current closest hierarchy level
29
37
  current_anchor = nil # Current closest anchor
30
38
 
31
- dom = Nokogiri::HTML(input)
32
- dom.css(all_selector).each do |node|
39
+ # We select all nodes that match either the headings or the elements to
40
+ # extract. This will allow us to loop over it in order it appears in the DOM
41
+ css(input, "#{heading_selector},#{css_selector}").each do |node|
33
42
  # If it's a heading, we update our current hierarchy
34
43
  if node.matches?(heading_selector)
35
44
  # Which level heading is it?
@@ -45,7 +54,7 @@ module AlgoliaHTMLExtractor
45
54
  end
46
55
 
47
56
  # Stop if node is not to be extracted
48
- next unless node.matches?(options[:css_selector])
57
+ next unless node.matches?(css_selector)
49
58
 
50
59
  # Stop if node is empty
51
60
  content = extract_text(node)
@@ -1,5 +1,6 @@
1
1
  # Expose gem version
2
2
  # rubocop:disable Style/SingleLineMethods
3
3
  class AlgoliaHTMLExtractorVersion
4
- def self.to_s; '2.2.1' end
4
+ def self.to_s; '2.2.2' end
5
5
  end
6
+ # rubocop:enable Style/SingleLineMethods
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: algolia_html_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
4
+ version: 2.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-19 00:00:00.000000000 Z
11
+ date: 2018-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.8'
47
+ version: 1.8.2
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.8'
54
+ version: 1.8.2
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: coveralls
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -136,20 +136,6 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '4.6'
139
- - !ruby/object:Gem::Dependency
140
- name: jeweler
141
- requirement: !ruby/object:Gem::Requirement
142
- requirements:
143
- - - "~>"
144
- - !ruby/object:Gem::Version
145
- version: '2.0'
146
- type: :development
147
- prerelease: false
148
- version_requirements: !ruby/object:Gem::Requirement
149
- requirements:
150
- - - "~>"
151
- - !ruby/object:Gem::Version
152
- version: '2.0'
153
139
  - !ruby/object:Gem::Dependency
154
140
  name: rspec
155
141
  requirement: !ruby/object:Gem::Requirement