boilerpipe-ruby 0.4.3 → 0.4.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da8bc0b8d74eea14b73e61812bbeba5fef75e8bae2330739e49b28e26f73d14d
4
- data.tar.gz: 68fee529b501210cf3278eb2b045b09e6d27c7846355b7d430c05e60f39088e2
3
+ metadata.gz: 835e5122c287082ef39874ee0a3365f08910a7a3f70b76c45dc264e8e1301edc
4
+ data.tar.gz: edff0860c01277adfc703e453e22b1d24c2db1817cfd4ad6800dfc1a6c0f339f
5
5
  SHA512:
6
- metadata.gz: 4202afab2a01ae588977fde351dfacc29551634077e794d028282666c43d3aeb09adf425d60608dc694c36f2fd8ed034ef89ba41ee07e6ad23f426c19d740931
7
- data.tar.gz: a0dc75a0c5384e1eaf8b50dfb92bc9294b41f38e8b0e0cceb9e9a6aafdf436629df2b3420f61a33523a08d6788c02ade799cdf8fe29db4338c766c7b01523704
6
+ metadata.gz: 7cbeb85f9dd74c930e9999e850d2abca6fd3e9470971461a978295de640195192f633a71a19a20b96824fb5becec2e7841e7f3e257b83b90550c935b6d16ff7b
7
+ data.tar.gz: 393803c95416f77a467724cd0b451961dcbb9c6e544c6d22a4f71853a041e5f2c32513ffd3f43e9733de43624fce31dad18b9cb7d0c37bd3cd722a6fdf223481
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ # 0.4.4 / 2021-02-13
2
+ * Do a better job of stripping out script tags
3
+
1
4
  # 0.4.3 / 2020-07-18
2
5
 
3
6
  * update deps
@@ -22,6 +22,6 @@ Gem::Specification.new do |spec|
22
22
  spec.add_development_dependency 'bundler', '~> 2.0'
23
23
  spec.add_development_dependency 'rake', '>= 12.3.3'
24
24
  spec.add_development_dependency 'rickshaw', '~> 0.5.0'
25
- spec.add_development_dependency 'rspec', '~> 3.9'
25
+ spec.add_development_dependency 'rspec', '~> 3.10'
26
26
  spec.add_runtime_dependency 'nokogiri', '~> 1.10'
27
27
  end
data/lib/boilerpipe.rb CHANGED
@@ -40,6 +40,7 @@ require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
40
40
  require 'boilerpipe/labels/default'
41
41
  require 'boilerpipe/labels/label_action'
42
42
 
43
+ require 'boilerpipe/sax/preprocessor'
43
44
  require 'boilerpipe/sax/html_content_handler'
44
45
  require 'boilerpipe/sax/boilerpipe_html_parser'
45
46
  require 'boilerpipe/sax/tag_action_map'
@@ -1,16 +1,11 @@
1
1
  module Boilerpipe::SAX
2
2
  class BoilerpipeHTMLParser
3
3
  def self.parse(text)
4
- # script bug - delete script tags
5
- text.gsub!(/\<script>.+?<\/script>/i, '')
6
-
7
- # nokogiri uses libxml for mri and nekohtml for jruby
8
- # mri doesn't remove &nbsp; when missing the semicolon
9
- text.gsub!(/(&nbsp) /, '\1; ')
4
+ # strip out tags that cause issues
5
+ text = Preprocessor.strip(text)
10
6
 
11
7
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
12
8
  text = Nokogiri::HTML(text).to_html
13
-
14
9
  handler = HTMLContentHandler.new
15
10
  noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
16
11
  noko_parser.parse(text)
@@ -0,0 +1,11 @@
1
+ module Boilerpipe::SAX
2
+ class Preprocessor
3
+ def self.strip(text)
4
+ # script bug - delete script tags
5
+ text = text.gsub(/\<script.+?<\/script>/im, '')
6
+ # nokogiri uses libxml for mri and nekohtml for jruby
7
+ # mri doesn't remove &nbsp; when missing the semicolon
8
+ text.gsub(/(&nbsp) /, '\1; ')
9
+ end
10
+ end
11
+ end
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.4.3'
2
+ VERSION = '0.4.4'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-22 00:00:00.000000000 Z
11
+ date: 2021-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -58,14 +58,14 @@ dependencies:
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '3.9'
61
+ version: '3.10'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '3.9'
68
+ version: '3.10'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -136,6 +136,7 @@ files:
136
136
  - lib/boilerpipe/labels/label_action.rb
137
137
  - lib/boilerpipe/sax/boilerpipe_html_parser.rb
138
138
  - lib/boilerpipe/sax/html_content_handler.rb
139
+ - lib/boilerpipe/sax/preprocessor.rb
139
140
  - lib/boilerpipe/sax/tag_action_map.rb
140
141
  - lib/boilerpipe/sax/tag_actions/anchor_text.rb
141
142
  - lib/boilerpipe/sax/tag_actions/block_level.rb