boilerpipe-ruby 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +28 -1
- data/Dockerfile +14 -0
- data/README.md +13 -4
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +4 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
- data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
- data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +28 -25
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks trailing headlines TextBlocks that have the label :#HEADING
|
3
2
|
# as boilerplate. Trailing means they are marked content and are
|
4
3
|
# below any other content block.
|
@@ -6,7 +5,6 @@
|
|
6
5
|
module Boilerpipe::Filters
|
7
6
|
class TrailingHeadlineToBoilerplateFilter
|
8
7
|
def self.process(doc)
|
9
|
-
|
10
8
|
doc.text_blocks.each do |tb|
|
11
9
|
next unless tb.is_content?
|
12
10
|
|
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
|
|
19
17
|
|
20
18
|
doc
|
21
19
|
end
|
22
|
-
|
23
20
|
end
|
24
21
|
end
|
@@ -1,20 +1,11 @@
|
|
1
|
-
require 'nokogiri'
|
2
1
|
module Boilerpipe::SAX
|
3
2
|
class BoilerpipeHTMLParser
|
4
3
|
def self.parse(text)
|
5
|
-
|
6
|
-
|
7
|
-
text.gsub!(/\<script>.+?<\/script>/i, '')
|
8
|
-
|
9
|
-
# nokogiri uses libxml for mri and nekohtml for jruby
|
10
|
-
# mri doesn't remove &nbsp when missing the semicolon
|
11
|
-
text.gsub!(/(&nbsp) /, '\1; ')
|
12
|
-
|
4
|
+
# strip out tags that cause issues
|
5
|
+
text = Preprocessor.strip(text)
|
13
6
|
|
14
7
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
15
8
|
text = Nokogiri::HTML(text).to_html
|
16
|
-
|
17
|
-
|
18
9
|
handler = HTMLContentHandler.new
|
19
10
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
20
11
|
noko_parser.parse(text)
|
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'set'
|
3
|
-
|
4
1
|
module Boilerpipe::SAX
|
5
2
|
class HTMLContentHandler < Nokogiri::XML::SAX::Document
|
6
3
|
attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
|
7
4
|
|
8
|
-
attr_accessor :in_anchor_tag, :token_buffer
|
5
|
+
attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
|
9
6
|
ANCHOR_TEXT_START = "$\ue00a<"
|
10
7
|
ANCHOR_TEXT_END = ">\ue00a$"
|
11
8
|
|
@@ -20,7 +17,7 @@ module Boilerpipe::SAX
|
|
20
17
|
@flush = false
|
21
18
|
@block_tag_level = -1
|
22
19
|
|
23
|
-
@
|
20
|
+
@in_body_tag = 0
|
24
21
|
@in_anchor_tag = 0
|
25
22
|
@in_ignorable_element = 0
|
26
23
|
@in_anchor_text = false
|
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
|
|
34
31
|
@label_stacks << nil
|
35
32
|
tag = name.upcase.intern
|
36
33
|
|
37
|
-
|
38
34
|
tag_action = @tag_actions[tag]
|
39
35
|
if tag_action
|
40
36
|
@tag_level += 1 if tag_action.changes_tag_level?
|
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
|
|
51
47
|
def characters(text)
|
52
48
|
flush_block if @flush
|
53
49
|
|
54
|
-
return if
|
50
|
+
return if in_ignorable_element?
|
55
51
|
return if text.empty?
|
56
52
|
|
57
53
|
# replace all whitespace with simple space
|
58
54
|
text.gsub!(/\s+/, ' ')
|
59
55
|
|
60
56
|
# trim whitespace
|
61
|
-
started_with_whitespace = text
|
62
|
-
ended_with_whitespace = text
|
57
|
+
started_with_whitespace = text =~ /^\s/
|
58
|
+
ended_with_whitespace = text =~ /\s$/
|
63
59
|
text.strip!
|
64
60
|
|
65
61
|
# add a single space if the block was only whitespace
|
@@ -96,9 +92,15 @@ module Boilerpipe::SAX
|
|
96
92
|
@label_stacks.pop
|
97
93
|
end
|
98
94
|
|
95
|
+
def not_in_body_tag?
|
96
|
+
@in_body_tag == 0
|
97
|
+
end
|
98
|
+
|
99
99
|
def flush_block
|
100
100
|
@flush = false
|
101
|
-
|
101
|
+
|
102
|
+
# set title
|
103
|
+
if not_in_body_tag?
|
102
104
|
@title = @token_buffer.strip if :TITLE == @last_start_tag
|
103
105
|
clear_buffers
|
104
106
|
return
|
@@ -158,10 +160,10 @@ module Boilerpipe::SAX
|
|
158
160
|
end
|
159
161
|
|
160
162
|
text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
163
|
+
num_words,
|
164
|
+
num_linked_words,
|
165
|
+
num_words_in_wrapped_lines,
|
166
|
+
num_wrapped_lines, @offset_blocks)
|
165
167
|
|
166
168
|
@offset_blocks += 1
|
167
169
|
clear_buffers
|
@@ -187,10 +189,10 @@ module Boilerpipe::SAX
|
|
187
189
|
# \p{No} -- a numeric character of other type
|
188
190
|
|
189
191
|
def is_word?(word)
|
190
|
-
|
192
|
+
word =~ VALID_WORD_CHARACTER
|
191
193
|
end
|
192
194
|
|
193
|
-
#public void flushBlock() {
|
195
|
+
# public void flushBlock() {
|
194
196
|
# int numWords = 0;
|
195
197
|
# int numLinkedWords = 0;
|
196
198
|
# int numWrappedLines = 0;
|
@@ -198,22 +200,23 @@ module Boilerpipe::SAX
|
|
198
200
|
# final int maxLineLength = 80;
|
199
201
|
# int numTokens = 0;
|
200
202
|
# int numWordsCurrentLine = 0;
|
201
|
-
#}
|
203
|
+
# }
|
202
204
|
|
203
205
|
def increase_in_ignorable_element!
|
204
206
|
@in_ignorable_element += 1
|
205
207
|
end
|
206
208
|
|
209
|
+
# should we prevent less than zero here?
|
207
210
|
def decrease_in_ignorable_element!
|
208
211
|
@in_ignorable_element -= 1
|
209
212
|
end
|
210
213
|
|
211
|
-
def
|
212
|
-
@
|
214
|
+
def enter_body_tag!
|
215
|
+
@in_body_tag += 1
|
213
216
|
end
|
214
217
|
|
215
|
-
def
|
216
|
-
@
|
218
|
+
def exit_body_tag!
|
219
|
+
@in_body_tag -= 1
|
217
220
|
end
|
218
221
|
|
219
222
|
def in_ignorable_element?
|
@@ -224,7 +227,6 @@ module Boilerpipe::SAX
|
|
224
227
|
@in_anchor_tag > 0
|
225
228
|
end
|
226
229
|
|
227
|
-
|
228
230
|
def add_text_block(text_block)
|
229
231
|
@label_stacks.each do |stack|
|
230
232
|
next unless stack
|
@@ -239,6 +241,7 @@ module Boilerpipe::SAX
|
|
239
241
|
# append space if last character wasn't already one
|
240
242
|
def append_space
|
241
243
|
return if @sb_last_was_whitespace
|
244
|
+
|
242
245
|
@sb_last_was_whitespace = true
|
243
246
|
|
244
247
|
@text_buffer << ' '
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Boilerpipe::SAX
|
2
|
+
class Preprocessor
|
3
|
+
def self.strip(text)
|
4
|
+
# script bug - delete script tags
|
5
|
+
text = text.gsub(/\<script.+?<\/script>/im, '')
|
6
|
+
# nokogiri uses libxml for mri and nekohtml for jruby
|
7
|
+
# mri doesn't remove &nbsp when missing the semicolon
|
8
|
+
text.gsub(/(&nbsp) /, '\1; ')
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
|
|
2
2
|
class AnchorText
|
3
3
|
# Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
|
4
4
|
# There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
|
5
|
-
|
5
|
+
# * encounters such nestings, a SAXException is thrown.
|
6
6
|
def start(handler, name, attrs)
|
7
7
|
if handler.in_anchor_tag?
|
8
8
|
handler.in_anchor_tag += 1
|
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
|
|
42
42
|
# - dunno about nokogiri???????
|
43
43
|
# as nested A elements are not allowed per specification, we
|
44
44
|
# are probably reaching this branch due to a bug in the XML parser
|
45
|
-
#puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
45
|
+
# puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
46
46
|
end_tag(handler, name)
|
47
47
|
end
|
48
48
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Explicitly marks this tag a simple "block-level" element,
|
3
|
+
# which always generates whitespace
|
4
4
|
class BlockLevel
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
true
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
# for block-level elements, which triggers some LabelAction on
|
3
|
-
# the generated TextBlock.
|
2
|
+
# for block-level elements, which triggers some LabelAction on
|
3
|
+
# the generated TextBlock.
|
4
4
|
class BlockTagLabel
|
5
5
|
def initialize(label_action)
|
6
6
|
@label_action = label_action
|
@@ -1,16 +1,16 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Marks this tag the body element (this should usually only
|
3
|
+
# be set for the <BODY> tag).
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
7
|
-
handler.
|
7
|
+
handler.enter_body_tag!
|
8
8
|
false
|
9
9
|
end
|
10
10
|
|
11
11
|
def end_tag(handler, name)
|
12
12
|
handler.flush_block
|
13
|
-
handler.
|
13
|
+
handler.exit_body_tag!
|
14
14
|
false
|
15
15
|
end
|
16
16
|
|
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
|
|
10
10
|
rel = m[1]
|
11
11
|
val = m[2].to_i # absolute
|
12
12
|
size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
|
13
|
-
handler.font_size_stack <<
|
13
|
+
handler.font_size_stack << size
|
14
14
|
else
|
15
15
|
handler.font_size_stack << nil
|
16
16
|
end
|
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def relative(font_size_stack, rel, val)
|
30
|
-
prev_size = font_size_stack.reverse_each.find{|s| s
|
30
|
+
prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
|
31
31
|
prev_size = 3 if prev_size.nil?
|
32
32
|
|
33
33
|
size = if rel == '+'
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,71 +16,71 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 12.3.3
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 12.3.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rickshaw
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.5.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.5.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '3.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '3.10'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: 1.
|
75
|
+
version: '1.10'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
83
|
-
description: A pure ruby implementation of the boilerpipe algorithm
|
82
|
+
version: '1.10'
|
83
|
+
description: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
84
84
|
email:
|
85
85
|
- "<gregory.ostermayr@gmail.com>"
|
86
86
|
executables: []
|
@@ -88,9 +88,11 @@ extensions: []
|
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
90
|
- ".circleci/config.yml"
|
91
|
+
- ".dockerignore"
|
91
92
|
- ".gitignore"
|
92
93
|
- ".rspec"
|
93
94
|
- CHANGELOG.md
|
95
|
+
- Dockerfile
|
94
96
|
- Gemfile
|
95
97
|
- LICENSE.txt
|
96
98
|
- README.md
|
@@ -98,6 +100,7 @@ files:
|
|
98
100
|
- bin/console
|
99
101
|
- bin/setup
|
100
102
|
- boilerpipe-ruby.gemspec
|
103
|
+
- boilerpipe_flow.md
|
101
104
|
- lib/boilerpipe.rb
|
102
105
|
- lib/boilerpipe/document/text_block.rb
|
103
106
|
- lib/boilerpipe/document/text_document.rb
|
@@ -133,6 +136,7 @@ files:
|
|
133
136
|
- lib/boilerpipe/labels/label_action.rb
|
134
137
|
- lib/boilerpipe/sax/boilerpipe_html_parser.rb
|
135
138
|
- lib/boilerpipe/sax/html_content_handler.rb
|
139
|
+
- lib/boilerpipe/sax/preprocessor.rb
|
136
140
|
- lib/boilerpipe/sax/tag_action_map.rb
|
137
141
|
- lib/boilerpipe/sax/tag_actions/anchor_text.rb
|
138
142
|
- lib/boilerpipe/sax/tag_actions/block_level.rb
|
@@ -151,7 +155,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
|
|
151
155
|
licenses:
|
152
156
|
- Apache 2.0
|
153
157
|
metadata: {}
|
154
|
-
post_install_message:
|
158
|
+
post_install_message:
|
155
159
|
rdoc_options: []
|
156
160
|
require_paths:
|
157
161
|
- lib
|
@@ -166,9 +170,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
170
|
- !ruby/object:Gem::Version
|
167
171
|
version: '0'
|
168
172
|
requirements: []
|
169
|
-
|
170
|
-
|
171
|
-
signing_key:
|
173
|
+
rubygems_version: 3.0.8
|
174
|
+
signing_key:
|
172
175
|
specification_version: 4
|
173
|
-
summary: A pure ruby
|
176
|
+
summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
174
177
|
test_files: []
|