algolia_html_extractor 2.5.1 → 2.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/algolia_html_extractor.rb +9 -1
- data/lib/version.rb +3 -1
- metadata +10 -25
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ed81071c2031fcf2aab6a94b3e7e71b6348b00fd839638de3688a50209dde639
|
4
|
+
data.tar.gz: 5ccba85a1982abae971c624ce889aa11c5973e205c56adba7a351c9cf0baf902
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38afb9eec2aba6c22a10caf459979587ac3ddd3771b5ed45ca22e039f8e589b45f82a3997bd289b302dae41e8afb0a102b5fa2e12248b4627c32973712a302d6
|
7
|
+
data.tar.gz: b2a3f64fd9c20d6f5d0e7e823db5ce069071417a37a9bc114ef293a50db8d4cc8e7911afc811b413178fa2c863aeb68d30f5a64b9c0d2d691ab66916038c54ad
|
data/lib/algolia_html_extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'digest/md5'
|
3
5
|
|
@@ -8,7 +10,8 @@ module AlgoliaHTMLExtractor
|
|
8
10
|
def self.default_options(options)
|
9
11
|
default_options = {
|
10
12
|
css_selector: 'p',
|
11
|
-
heading_selector: 'h1,h2,h3,h4,h5,h6'
|
13
|
+
heading_selector: 'h1,h2,h3,h4,h5,h6',
|
14
|
+
tags_to_exclude: ''
|
12
15
|
}
|
13
16
|
default_options.merge(options)
|
14
17
|
end
|
@@ -22,6 +25,7 @@ module AlgoliaHTMLExtractor
|
|
22
25
|
options = default_options(options)
|
23
26
|
heading_selector = options[:heading_selector]
|
24
27
|
css_selector = options[:css_selector]
|
28
|
+
tags_to_exclude = options[:tags_to_exclude]
|
25
29
|
|
26
30
|
items = []
|
27
31
|
current_hierarchy = {
|
@@ -56,6 +60,9 @@ module AlgoliaHTMLExtractor
|
|
56
60
|
# Stop if node is not to be extracted
|
57
61
|
next unless node.matches?(css_selector)
|
58
62
|
|
63
|
+
# Removing excluded child from the node
|
64
|
+
node.search(tags_to_exclude).each(&:remove) unless tags_to_exclude.empty?
|
65
|
+
|
59
66
|
# Stop if node is empty
|
60
67
|
content = extract_text(node)
|
61
68
|
next if content.empty?
|
@@ -148,6 +155,7 @@ module AlgoliaHTMLExtractor
|
|
148
155
|
def self.heading_weight(heading_level)
|
149
156
|
weight = 100
|
150
157
|
return weight if heading_level.nil?
|
158
|
+
|
151
159
|
weight - ((heading_level + 1) * 10)
|
152
160
|
end
|
153
161
|
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: algolia_html_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.1
|
4
|
+
version: 2.6.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Carry
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: awesome_print
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.6'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '1.6'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: json
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -42,16 +28,16 @@ dependencies:
|
|
42
28
|
name: nokogiri
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
44
30
|
requirements:
|
45
|
-
- - "
|
31
|
+
- - ">="
|
46
32
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.
|
33
|
+
version: 1.10.4
|
48
34
|
type: :runtime
|
49
35
|
prerelease: false
|
50
36
|
version_requirements: !ruby/object:Gem::Requirement
|
51
37
|
requirements:
|
52
|
-
- - "
|
38
|
+
- - ">="
|
53
39
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.
|
40
|
+
version: 1.10.4
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: coveralls
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -194,7 +180,7 @@ homepage: https://github.com/algolia/html-extractor
|
|
194
180
|
licenses:
|
195
181
|
- MIT
|
196
182
|
metadata: {}
|
197
|
-
post_install_message:
|
183
|
+
post_install_message:
|
198
184
|
rdoc_options: []
|
199
185
|
require_paths:
|
200
186
|
- lib
|
@@ -209,9 +195,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
209
195
|
- !ruby/object:Gem::Version
|
210
196
|
version: '0'
|
211
197
|
requirements: []
|
212
|
-
|
213
|
-
|
214
|
-
signing_key:
|
198
|
+
rubygems_version: 3.1.2
|
199
|
+
signing_key:
|
215
200
|
specification_version: 4
|
216
201
|
summary: Convert HTML content into Algolia records
|
217
202
|
test_files: []
|