boilerpipe-ruby 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da8bc0b8d74eea14b73e61812bbeba5fef75e8bae2330739e49b28e26f73d14d
4
- data.tar.gz: 68fee529b501210cf3278eb2b045b09e6d27c7846355b7d430c05e60f39088e2
3
+ metadata.gz: 835e5122c287082ef39874ee0a3365f08910a7a3f70b76c45dc264e8e1301edc
4
+ data.tar.gz: edff0860c01277adfc703e453e22b1d24c2db1817cfd4ad6800dfc1a6c0f339f
5
5
  SHA512:
6
- metadata.gz: 4202afab2a01ae588977fde351dfacc29551634077e794d028282666c43d3aeb09adf425d60608dc694c36f2fd8ed034ef89ba41ee07e6ad23f426c19d740931
7
- data.tar.gz: a0dc75a0c5384e1eaf8b50dfb92bc9294b41f38e8b0e0cceb9e9a6aafdf436629df2b3420f61a33523a08d6788c02ade799cdf8fe29db4338c766c7b01523704
6
+ metadata.gz: 7cbeb85f9dd74c930e9999e850d2abca6fd3e9470971461a978295de640195192f633a71a19a20b96824fb5becec2e7841e7f3e257b83b90550c935b6d16ff7b
7
+ data.tar.gz: 393803c95416f77a467724cd0b451961dcbb9c6e544c6d22a4f71853a041e5f2c32513ffd3f43e9733de43624fce31dad18b9cb7d0c37bd3cd722a6fdf223481
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ # 0.4.4 / 2021-02-13
2
+ * Do a better job of stripping out script tags
3
+
1
4
  # 0.4.3 / 2020-07-18
2
5
 
3
6
  * update deps
@@ -22,6 +22,6 @@ Gem::Specification.new do |spec|
22
22
  spec.add_development_dependency 'bundler', '~> 2.0'
23
23
  spec.add_development_dependency 'rake', '>= 12.3.3'
24
24
  spec.add_development_dependency 'rickshaw', '~> 0.5.0'
25
- spec.add_development_dependency 'rspec', '~> 3.9'
25
+ spec.add_development_dependency 'rspec', '~> 3.10'
26
26
  spec.add_runtime_dependency 'nokogiri', '~> 1.10'
27
27
  end
data/lib/boilerpipe.rb CHANGED
@@ -40,6 +40,7 @@ require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
40
40
  require 'boilerpipe/labels/default'
41
41
  require 'boilerpipe/labels/label_action'
42
42
 
43
+ require 'boilerpipe/sax/preprocessor'
43
44
  require 'boilerpipe/sax/html_content_handler'
44
45
  require 'boilerpipe/sax/boilerpipe_html_parser'
45
46
  require 'boilerpipe/sax/tag_action_map'
@@ -1,16 +1,11 @@
1
1
  module Boilerpipe::SAX
2
2
  class BoilerpipeHTMLParser
3
3
  def self.parse(text)
4
- # script bug - delete script tags
5
- text.gsub!(/\<script>.+?<\/script>/i, '')
6
-
7
- # nokogiri uses libxml for mri and nekohtml for jruby
8
- # mri doesn't remove &nbsp; when missing the semicolon
9
- text.gsub!(/(&nbsp) /, '\1; ')
4
+ # strip out tags that cause issues
5
+ text = Preprocessor.strip(text)
10
6
 
11
7
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
12
8
  text = Nokogiri::HTML(text).to_html
13
-
14
9
  handler = HTMLContentHandler.new
15
10
  noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
16
11
  noko_parser.parse(text)
@@ -0,0 +1,11 @@
1
+ module Boilerpipe::SAX
2
+ class Preprocessor
3
+ def self.strip(text)
4
+ # script bug - delete script tags
5
+ text = text.gsub(/\<script.+?<\/script>/im, '')
6
+ # nokogiri uses libxml for mri and nekohtml for jruby
7
+ # mri doesn't remove &nbsp; when missing the semicolon
8
+ text.gsub(/(&nbsp) /, '\1; ')
9
+ end
10
+ end
11
+ end
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.4.3'
2
+ VERSION = '0.4.4'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-22 00:00:00.000000000 Z
11
+ date: 2021-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -58,14 +58,14 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '3.9'
61
+ version: '3.10'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '3.9'
68
+ version: '3.10'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -136,6 +136,7 @@ files:
136
136
  - lib/boilerpipe/labels/label_action.rb
137
137
  - lib/boilerpipe/sax/boilerpipe_html_parser.rb
138
138
  - lib/boilerpipe/sax/html_content_handler.rb
139
+ - lib/boilerpipe/sax/preprocessor.rb
139
140
  - lib/boilerpipe/sax/tag_action_map.rb
140
141
  - lib/boilerpipe/sax/tag_actions/anchor_text.rb
141
142
  - lib/boilerpipe/sax/tag_actions/block_level.rb