algolia_html_extractor 2.5.1 → 2.6.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/algolia_html_extractor.rb +9 -1
- data/lib/version.rb +3 -1
- metadata +10 -25
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ed81071c2031fcf2aab6a94b3e7e71b6348b00fd839638de3688a50209dde639
|
4
|
+
data.tar.gz: 5ccba85a1982abae971c624ce889aa11c5973e205c56adba7a351c9cf0baf902
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 38afb9eec2aba6c22a10caf459979587ac3ddd3771b5ed45ca22e039f8e589b45f82a3997bd289b302dae41e8afb0a102b5fa2e12248b4627c32973712a302d6
|
7
|
+
data.tar.gz: b2a3f64fd9c20d6f5d0e7e823db5ce069071417a37a9bc114ef293a50db8d4cc8e7911afc811b413178fa2c863aeb68d30f5a64b9c0d2d691ab66916038c54ad
|
data/lib/algolia_html_extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'digest/md5'
|
3
5
|
|
@@ -8,7 +10,8 @@ module AlgoliaHTMLExtractor
|
|
8
10
|
def self.default_options(options)
|
9
11
|
default_options = {
|
10
12
|
css_selector: 'p',
|
11
|
-
heading_selector: 'h1,h2,h3,h4,h5,h6'
|
13
|
+
heading_selector: 'h1,h2,h3,h4,h5,h6',
|
14
|
+
tags_to_exclude: ''
|
12
15
|
}
|
13
16
|
default_options.merge(options)
|
14
17
|
end
|
@@ -22,6 +25,7 @@ module AlgoliaHTMLExtractor
|
|
22
25
|
options = default_options(options)
|
23
26
|
heading_selector = options[:heading_selector]
|
24
27
|
css_selector = options[:css_selector]
|
28
|
+
tags_to_exclude = options[:tags_to_exclude]
|
25
29
|
|
26
30
|
items = []
|
27
31
|
current_hierarchy = {
|
@@ -56,6 +60,9 @@ module AlgoliaHTMLExtractor
|
|
56
60
|
# Stop if node is not to be extracted
|
57
61
|
next unless node.matches?(css_selector)
|
58
62
|
|
63
|
+
# Removing excluded child from the node
|
64
|
+
node.search(tags_to_exclude).each(&:remove) unless tags_to_exclude.empty?
|
65
|
+
|
59
66
|
# Stop if node is empty
|
60
67
|
content = extract_text(node)
|
61
68
|
next if content.empty?
|
@@ -148,6 +155,7 @@ module AlgoliaHTMLExtractor
|
|
148
155
|
def self.heading_weight(heading_level)
|
149
156
|
weight = 100
|
150
157
|
return weight if heading_level.nil?
|
158
|
+
|
151
159
|
weight - ((heading_level + 1) * 10)
|
152
160
|
end
|
153
161
|
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: algolia_html_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.1
|
4
|
+
version: 2.6.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Carry
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: awesome_print
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.6'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '1.6'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: json
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -42,16 +28,16 @@ dependencies:
|
|
42
28
|
name: nokogiri
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
44
30
|
requirements:
|
45
|
-
- - "
|
31
|
+
- - ">="
|
46
32
|
- !ruby/object:Gem::Version
|
47
|
-
version: 1.
|
33
|
+
version: 1.10.4
|
48
34
|
type: :runtime
|
49
35
|
prerelease: false
|
50
36
|
version_requirements: !ruby/object:Gem::Requirement
|
51
37
|
requirements:
|
52
|
-
- - "
|
38
|
+
- - ">="
|
53
39
|
- !ruby/object:Gem::Version
|
54
|
-
version: 1.
|
40
|
+
version: 1.10.4
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: coveralls
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -194,7 +180,7 @@ homepage: https://github.com/algolia/html-extractor
|
|
194
180
|
licenses:
|
195
181
|
- MIT
|
196
182
|
metadata: {}
|
197
|
-
post_install_message:
|
183
|
+
post_install_message:
|
198
184
|
rdoc_options: []
|
199
185
|
require_paths:
|
200
186
|
- lib
|
@@ -209,9 +195,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
209
195
|
- !ruby/object:Gem::Version
|
210
196
|
version: '0'
|
211
197
|
requirements: []
|
212
|
-
|
213
|
-
|
214
|
-
signing_key:
|
198
|
+
rubygems_version: 3.1.2
|
199
|
+
signing_key:
|
215
200
|
specification_version: 4
|
216
201
|
summary: Convert HTML content into Algolia records
|
217
202
|
test_files: []
|