boilerpipe-ruby 0.4.3 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/boilerpipe-ruby.gemspec +1 -1
- data/lib/boilerpipe.rb +1 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -7
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 835e5122c287082ef39874ee0a3365f08910a7a3f70b76c45dc264e8e1301edc
|
4
|
+
data.tar.gz: edff0860c01277adfc703e453e22b1d24c2db1817cfd4ad6800dfc1a6c0f339f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cbeb85f9dd74c930e9999e850d2abca6fd3e9470971461a978295de640195192f633a71a19a20b96824fb5becec2e7841e7f3e257b83b90550c935b6d16ff7b
|
7
|
+
data.tar.gz: 393803c95416f77a467724cd0b451961dcbb9c6e544c6d22a4f71853a041e5f2c32513ffd3f43e9733de43624fce31dad18b9cb7d0c37bd3cd722a6fdf223481
|
data/CHANGELOG.md
CHANGED
data/boilerpipe-ruby.gemspec
CHANGED
@@ -22,6 +22,6 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
23
|
spec.add_development_dependency 'rake', '>= 12.3.3'
|
24
24
|
spec.add_development_dependency 'rickshaw', '~> 0.5.0'
|
25
|
-
spec.add_development_dependency 'rspec', '~> 3.
|
25
|
+
spec.add_development_dependency 'rspec', '~> 3.10'
|
26
26
|
spec.add_runtime_dependency 'nokogiri', '~> 1.10'
|
27
27
|
end
|
data/lib/boilerpipe.rb
CHANGED
@@ -40,6 +40,7 @@ require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
|
40
40
|
require 'boilerpipe/labels/default'
|
41
41
|
require 'boilerpipe/labels/label_action'
|
42
42
|
|
43
|
+
require 'boilerpipe/sax/preprocessor'
|
43
44
|
require 'boilerpipe/sax/html_content_handler'
|
44
45
|
require 'boilerpipe/sax/boilerpipe_html_parser'
|
45
46
|
require 'boilerpipe/sax/tag_action_map'
|
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb
CHANGED
@@ -1,16 +1,11 @@
|
|
1
1
|
module Boilerpipe::SAX
|
2
2
|
class BoilerpipeHTMLParser
|
3
3
|
def self.parse(text)
|
4
|
-
#
|
5
|
-
text.
|
6
|
-
|
7
|
-
# nokogiri uses libxml for mri and nekohtml for jruby
|
8
|
-
# mri doesn't remove &nbsp; when missing the semicolon
|
9
|
-
text.gsub!(/(&nbsp) /, '\1; ')
|
4
|
+
# strip out tags that cause issues
|
5
|
+
text = Preprocessor.strip(text)
|
10
6
|
|
11
7
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
12
8
|
text = Nokogiri::HTML(text).to_html
|
13
|
-
|
14
9
|
handler = HTMLContentHandler.new
|
15
10
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
16
11
|
noko_parser.parse(text)
|
data/lib/boilerpipe/sax/preprocessor.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Boilerpipe::SAX
|
2
|
+
class Preprocessor
|
3
|
+
def self.strip(text)
|
4
|
+
# script bug - delete script tags
|
5
|
+
text = text.gsub(/\<script.+?<\/script>/im, '')
|
6
|
+
# nokogiri uses libxml for mri and nekohtml for jruby
|
7
|
+
# mri doesn't remove &nbsp; when missing the semicolon
|
8
|
+
text.gsub(/(&nbsp) /, '\1; ')
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.3
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -58,14 +58,14 @@ dependencies:
|
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '3.
|
61
|
+
version: '3.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '3.
|
68
|
+
version: '3.10'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,6 +136,7 @@ files:
|
|
136
136
|
- lib/boilerpipe/labels/label_action.rb
|
137
137
|
- lib/boilerpipe/sax/boilerpipe_html_parser.rb
|
138
138
|
- lib/boilerpipe/sax/html_content_handler.rb
|
139
|
+
- lib/boilerpipe/sax/preprocessor.rb
|
139
140
|
- lib/boilerpipe/sax/tag_action_map.rb
|
140
141
|
- lib/boilerpipe/sax/tag_actions/anchor_text.rb
|
141
142
|
- lib/boilerpipe/sax/tag_actions/block_level.rb
|