algolia_html_extractor 2.5.2 → 2.6.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/algolia_html_extractor.rb +9 -1
- data/lib/version.rb +3 -1
- metadata +8 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1496286d1762c4231f9f38e42549b0128c53d1ff5cf86ffc5b0da04c3c9934bb
|
4
|
+
data.tar.gz: 6af72f32fa6e8e86064677688a2268a02a1ad2a8bdd9fc44146f2e72f527eb3f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 70bdd12e5e15d62cf9c5a5c992180b2491c090cdcca4e671ff8039c7ae91c91817fd38dcf6a4b54e09f59082dfa2157cd4ca44513cf83c64aad8e1f8c771a258
|
7
|
+
data.tar.gz: b5111d1f7fbc948a1186daf0c2c3760cb2a6ea0a7b1b64ae677f96f4ff849a8e99a30f31efb75807e1d509f131c236f6c750a4f94384180df8b70c0736b3c2c1
|
data/lib/algolia_html_extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'nokogiri'
|
2
4
|
require 'digest/md5'
|
3
5
|
|
@@ -8,7 +10,8 @@ module AlgoliaHTMLExtractor
|
|
8
10
|
def self.default_options(options)
|
9
11
|
default_options = {
|
10
12
|
css_selector: 'p',
|
11
|
-
heading_selector: 'h1,h2,h3,h4,h5,h6'
|
13
|
+
heading_selector: 'h1,h2,h3,h4,h5,h6',
|
14
|
+
tags_to_exclude: ''
|
12
15
|
}
|
13
16
|
default_options.merge(options)
|
14
17
|
end
|
@@ -22,6 +25,7 @@ module AlgoliaHTMLExtractor
|
|
22
25
|
options = default_options(options)
|
23
26
|
heading_selector = options[:heading_selector]
|
24
27
|
css_selector = options[:css_selector]
|
28
|
+
tags_to_exclude = options[:tags_to_exclude]
|
25
29
|
|
26
30
|
items = []
|
27
31
|
current_hierarchy = {
|
@@ -56,6 +60,9 @@ module AlgoliaHTMLExtractor
|
|
56
60
|
# Stop if node is not to be extracted
|
57
61
|
next unless node.matches?(css_selector)
|
58
62
|
|
63
|
+
# Removing excluded child from the node
|
64
|
+
node.search(tags_to_exclude).each(&:remove) unless tags_to_exclude.empty?
|
65
|
+
|
59
66
|
# Stop if node is empty
|
60
67
|
content = extract_text(node)
|
61
68
|
next if content.empty?
|
@@ -148,6 +155,7 @@ module AlgoliaHTMLExtractor
|
|
148
155
|
def self.heading_weight(heading_level)
|
149
156
|
weight = 100
|
150
157
|
return weight if heading_level.nil?
|
158
|
+
|
151
159
|
weight - ((heading_level + 1) * 10)
|
152
160
|
end
|
153
161
|
end
|
data/lib/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: algolia_html_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.5.2
|
4
|
+
version: 2.6.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tim Carry
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|
@@ -30,14 +30,14 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 1.
|
33
|
+
version: '1.10'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: 1.
|
40
|
+
version: '1.10'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: coveralls
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -180,7 +180,7 @@ homepage: https://github.com/algolia/html-extractor
|
|
180
180
|
licenses:
|
181
181
|
- MIT
|
182
182
|
metadata: {}
|
183
|
-
post_install_message:
|
183
|
+
post_install_message:
|
184
184
|
rdoc_options: []
|
185
185
|
require_paths:
|
186
186
|
- lib
|
@@ -195,9 +195,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
195
195
|
- !ruby/object:Gem::Version
|
196
196
|
version: '0'
|
197
197
|
requirements: []
|
198
|
-
|
199
|
-
|
200
|
-
signing_key:
|
198
|
+
rubygems_version: 3.1.2
|
199
|
+
signing_key:
|
201
200
|
specification_version: 4
|
202
201
|
summary: Convert HTML content into Algolia records
|
203
202
|
test_files: []
|