algolia_html_extractor 2.2.1 → 2.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 49c3023566226660bc3508fd06b177d27d7ad331
4
- data.tar.gz: 9c1c0df0b0217dae12946d69bea0b885a3477e38
3
+ metadata.gz: 3efc015000ed0122ea70417eb8a48a0f6c0ccde6
4
+ data.tar.gz: 6923bced71dfc762f8cf734efa96229ed55c46a4
5
5
  SHA512:
6
- metadata.gz: ccc601b3a3499fb7dcc25c366409ff007ad3660518d77de085aa2cc44fba89ef8d2fefd727c869e955a9a9762add797fb1b3aaf51f780cc4fb24d1a677c9ec87
7
- data.tar.gz: 06e3b58f8074d40361294d4115ed35cf592667d2ffbf8d170cdce8384ea4a673827f566361002b42e0c8ab88453242758ba39dd44e740511f493a25770547bf8
6
+ metadata.gz: d352d4b97686377e1869a48b6aa2f97900ae0e958eed630ef4cf103ffce63fac468e44a84e21bfcfb5beb89355a7d14a4974030ba75399778101a75e01d5955d
7
+ data.tar.gz: 3d7316d8e791a8d9fff538726e2b92de9e6a811190fd68fda3840e79e53f8db5bb4ed6b9752c772ee0a32431fb207cb27b57bfd6f6b1b603772f7cb62b6fdf52
@@ -4,16 +4,24 @@ require 'digest/md5'
4
4
  # Extract content from an HTML page in the form of items with associated
5
5
  # hierarchy data
6
6
  module AlgoliaHTMLExtractor
7
- def self.run(input, options: {})
7
+ # Extractor options, applying default options when none set
8
+ def self.default_options(options)
8
9
  default_options = {
9
- css_selector: 'p'
10
+ css_selector: 'p',
11
+ heading_selector: 'h1,h2,h3,h4,h5,h6'
10
12
  }
11
- options = default_options.merge(options)
13
+ default_options.merge(options)
14
+ end
12
15
 
13
- heading_selector = 'h1,h2,h3,h4,h5,h6'
14
- # We select all nodes that match either the headings or the elements to
15
- # extract. This will allow us to loop over it in order it appears in the DOM
16
- all_selector = "#{heading_selector},#{options[:css_selector]}"
16
+ # Getting a list of HTML nodes from an input and a CSS selector
17
+ def self.css(input, selector)
18
+ Nokogiri::HTML(input).css(selector)
19
+ end
20
+
21
+ def self.run(input, options: {})
22
+ options = default_options(options)
23
+ heading_selector = options[:heading_selector]
24
+ css_selector = options[:css_selector]
17
25
 
18
26
  items = []
19
27
  current_hierarchy = {
@@ -28,8 +36,9 @@ module AlgoliaHTMLExtractor
28
36
  current_lvl = nil # Current closest hierarchy level
29
37
  current_anchor = nil # Current closest anchor
30
38
 
31
- dom = Nokogiri::HTML(input)
32
- dom.css(all_selector).each do |node|
39
+ # We select all nodes that match either the headings or the elements to
40
+ # extract. This will allow us to loop over it in order it appears in the DOM
41
+ css(input, "#{heading_selector},#{css_selector}").each do |node|
33
42
  # If it's a heading, we update our current hierarchy
34
43
  if node.matches?(heading_selector)
35
44
  # Which level heading is it?
@@ -45,7 +54,7 @@ module AlgoliaHTMLExtractor
45
54
  end
46
55
 
47
56
  # Stop if node is not to be extracted
48
- next unless node.matches?(options[:css_selector])
57
+ next unless node.matches?(css_selector)
49
58
 
50
59
  # Stop if node is empty
51
60
  content = extract_text(node)
@@ -1,5 +1,6 @@
1
1
  # Expose gem version
2
2
  # rubocop:disable Style/SingleLineMethods
3
3
  class AlgoliaHTMLExtractorVersion
4
- def self.to_s; '2.2.1' end
4
+ def self.to_s; '2.2.2' end
5
5
  end
6
+ # rubocop:enable Style/SingleLineMethods
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: algolia_html_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.1
4
+ version: 2.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-19 00:00:00.000000000 Z
11
+ date: 2018-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -44,14 +44,14 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.8'
47
+ version: 1.8.2
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.8'
54
+ version: 1.8.2
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: coveralls
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -136,20 +136,6 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: '4.6'
139
- - !ruby/object:Gem::Dependency
140
- name: jeweler
141
- requirement: !ruby/object:Gem::Requirement
142
- requirements:
143
- - - "~>"
144
- - !ruby/object:Gem::Version
145
- version: '2.0'
146
- type: :development
147
- prerelease: false
148
- version_requirements: !ruby/object:Gem::Requirement
149
- requirements:
150
- - - "~>"
151
- - !ruby/object:Gem::Version
152
- version: '2.0'
153
139
  - !ruby/object:Gem::Dependency
154
140
  name: rspec
155
141
  requirement: !ruby/object:Gem::Requirement