algolia_html_extractor 2.5.2 → 2.6.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/algolia_html_extractor.rb +9 -1
- data/lib/version.rb +3 -1
- metadata +8 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1496286d1762c4231f9f38e42549b0128c53d1ff5cf86ffc5b0da04c3c9934bb
|
4
|
+
data.tar.gz: 6af72f32fa6e8e86064677688a2268a02a1ad2a8bdd9fc44146f2e72f527eb3f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 70bdd12e5e15d62cf9c5a5c992180b2491c090cdcca4e671ff8039c7ae91c91817fd38dcf6a4b54e09f59082dfa2157cd4ca44513cf83c64aad8e1f8c771a258
|
7
|
+
data.tar.gz: b5111d1f7fbc948a1186daf0c2c3760cb2a6ea0a7b1b64ae677f96f4ff849a8e99a30f31efb75807e1d509f131c236f6c750a4f94384180df8b70c0736b3c2c1
|
data/lib/algolia_html_extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'digest/md5'
|
3
5
|
|
@@ -8,7 +10,8 @@ module AlgoliaHTMLExtractor
|
|
8
10
|
def self.default_options(options)
|
9
11
|
default_options = {
|
10
12
|
css_selector: 'p',
|
11
|
-
heading_selector: 'h1,h2,h3,h4,h5,h6'
|
13
|
+
heading_selector: 'h1,h2,h3,h4,h5,h6',
|
14
|
+
tags_to_exclude: ''
|
12
15
|
}
|
13
16
|
default_options.merge(options)
|
14
17
|
end
|
@@ -22,6 +25,7 @@ module AlgoliaHTMLExtractor
|
|
22
25
|
options = default_options(options)
|
23
26
|
heading_selector = options[:heading_selector]
|
24
27
|
css_selector = options[:css_selector]
|
28
|
+
tags_to_exclude = options[:tags_to_exclude]
|
25
29
|
|
26
30
|
items = []
|
27
31
|
current_hierarchy = {
|
@@ -56,6 +60,9 @@ module AlgoliaHTMLExtractor
|
|
56
60
|
# Stop if node is not to be extracted
|
57
61
|
next unless node.matches?(css_selector)
|
58
62
|
|
63
|
+
# Removing excluded child from the node
|
64
|
+
node.search(tags_to_exclude).each(&:remove) unless tags_to_exclude.empty?
|
65
|
+
|
59
66
|
# Stop if node is empty
|
60
67
|
content = extract_text(node)
|
61
68
|
next if content.empty?
|
@@ -148,6 +155,7 @@ module AlgoliaHTMLExtractor
|
|
148
155
|
def self.heading_weight(heading_level)
|
149
156
|
weight = 100
|
150
157
|
return weight if heading_level.nil?
|
158
|
+
|
151
159
|
weight - ((heading_level + 1) * 10)
|
152
160
|
end
|
153
161
|
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: algolia_html_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.2
|
4
|
+
version: 2.6.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Carry
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: '1.10'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
40
|
+
version: '1.10'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: coveralls
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -180,7 +180,7 @@ homepage: https://github.com/algolia/html-extractor
|
|
180
180
|
licenses:
|
181
181
|
- MIT
|
182
182
|
metadata: {}
|
183
|
-
post_install_message:
|
183
|
+
post_install_message:
|
184
184
|
rdoc_options: []
|
185
185
|
require_paths:
|
186
186
|
- lib
|
@@ -195,9 +195,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
195
195
|
- !ruby/object:Gem::Version
|
196
196
|
version: '0'
|
197
197
|
requirements: []
|
198
|
-
|
199
|
-
|
200
|
-
signing_key:
|
198
|
+
rubygems_version: 3.1.2
|
199
|
+
signing_key:
|
201
200
|
specification_version: 4
|
202
201
|
summary: Convert HTML content into Algolia records
|
203
202
|
test_files: []
|