boilerpipe-ruby 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/boilerpipe-ruby.gemspec +1 -1
- data/lib/boilerpipe.rb +1 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -7
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/version.rb +1 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 835e5122c287082ef39874ee0a3365f08910a7a3f70b76c45dc264e8e1301edc
|
4
|
+
data.tar.gz: edff0860c01277adfc703e453e22b1d24c2db1817cfd4ad6800dfc1a6c0f339f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cbeb85f9dd74c930e9999e850d2abca6fd3e9470971461a978295de640195192f633a71a19a20b96824fb5becec2e7841e7f3e257b83b90550c935b6d16ff7b
|
7
|
+
data.tar.gz: 393803c95416f77a467724cd0b451961dcbb9c6e544c6d22a4f71853a041e5f2c32513ffd3f43e9733de43624fce31dad18b9cb7d0c37bd3cd722a6fdf223481
|
data/CHANGELOG.md
CHANGED
data/boilerpipe-ruby.gemspec
CHANGED
@@ -22,6 +22,6 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
23
|
spec.add_development_dependency 'rake', '>= 12.3.3'
|
24
24
|
spec.add_development_dependency 'rickshaw', '~> 0.5.0'
|
25
|
-
spec.add_development_dependency 'rspec', '~> 3.
|
25
|
+
spec.add_development_dependency 'rspec', '~> 3.10'
|
26
26
|
spec.add_runtime_dependency 'nokogiri', '~> 1.10'
|
27
27
|
end
|
data/lib/boilerpipe.rb
CHANGED
@@ -40,6 +40,7 @@ require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
|
40
40
|
require 'boilerpipe/labels/default'
|
41
41
|
require 'boilerpipe/labels/label_action'
|
42
42
|
|
43
|
+
require 'boilerpipe/sax/preprocessor'
|
43
44
|
require 'boilerpipe/sax/html_content_handler'
|
44
45
|
require 'boilerpipe/sax/boilerpipe_html_parser'
|
45
46
|
require 'boilerpipe/sax/tag_action_map'
|
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb
CHANGED
@@ -1,16 +1,11 @@
|
|
1
1
|
module Boilerpipe::SAX
|
2
2
|
class BoilerpipeHTMLParser
|
3
3
|
def self.parse(text)
|
4
|
-
#
|
5
|
-
text.
|
6
|
-
|
7
|
-
# nokogiri uses libxml for mri and nekohtml for jruby
|
8
|
-
# mri doesn't remove &nbsp; when missing the semicolon
|
9
|
-
text.gsub!(/(&nbsp) /, '\1; ')
|
4
|
+
# strip out tags that cause issues
|
5
|
+
text = Preprocessor.strip(text)
|
10
6
|
|
11
7
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
12
8
|
text = Nokogiri::HTML(text).to_html
|
13
|
-
|
14
9
|
handler = HTMLContentHandler.new
|
15
10
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
16
11
|
noko_parser.parse(text)
|
data/lib/boilerpipe/sax/preprocessor.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Boilerpipe::SAX
|
2
|
+
class Preprocessor
|
3
|
+
def self.strip(text)
|
4
|
+
# script bug - delete script tags
|
5
|
+
text = text.gsub(/\<script.+?<\/script>/im, '')
|
6
|
+
# nokogiri uses libxml for mri and nekohtml for jruby
|
7
|
+
# mri doesn't remove &nbsp; when missing the semicolon
|
8
|
+
text.gsub(/(&nbsp) /, '\1; ')
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.3
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -58,14 +58,14 @@ dependencies:
|
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '3.
|
61
|
+
version: '3.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '3.
|
68
|
+
version: '3.10'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -136,6 +136,7 @@ files:
|
|
136
136
|
- lib/boilerpipe/labels/label_action.rb
|
137
137
|
- lib/boilerpipe/sax/boilerpipe_html_parser.rb
|
138
138
|
- lib/boilerpipe/sax/html_content_handler.rb
|
139
|
+
- lib/boilerpipe/sax/preprocessor.rb
|
139
140
|
- lib/boilerpipe/sax/tag_action_map.rb
|
140
141
|
- lib/boilerpipe/sax/tag_actions/anchor_text.rb
|
141
142
|
- lib/boilerpipe/sax/tag_actions/block_level.rb
|