boilerpipe-ruby 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +28 -1
- data/Dockerfile +14 -0
- data/README.md +13 -4
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +4 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
- data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
- data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +28 -25
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks trailing headlines TextBlocks that have the label :#HEADING
|
3
2
|
# as boilerplate. Trailing means they are marked content and are
|
4
3
|
# below any other content block.
|
@@ -6,7 +5,6 @@
|
|
6
5
|
module Boilerpipe::Filters
|
7
6
|
class TrailingHeadlineToBoilerplateFilter
|
8
7
|
def self.process(doc)
|
9
|
-
|
10
8
|
doc.text_blocks.each do |tb|
|
11
9
|
next unless tb.is_content?
|
12
10
|
|
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
|
|
19
17
|
|
20
18
|
doc
|
21
19
|
end
|
22
|
-
|
23
20
|
end
|
24
21
|
end
|
@@ -1,20 +1,11 @@
|
|
1
|
-
require 'nokogiri'
|
2
1
|
module Boilerpipe::SAX
|
3
2
|
class BoilerpipeHTMLParser
|
4
3
|
def self.parse(text)
|
5
|
-
|
6
|
-
|
7
|
-
text.gsub!(/\<script>.+?<\/script>/i, '')
|
8
|
-
|
9
|
-
# nokogiri uses libxml for mri and nekohtml for jruby
|
10
|
-
# mri doesn't remove &nbsp; when missing the semicolon
|
11
|
-
text.gsub!(/(&nbsp) /, '\1; ')
|
12
|
-
|
4
|
+
# strip out tags that cause issues
|
5
|
+
text = Preprocessor.strip(text)
|
13
6
|
|
14
7
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
15
8
|
text = Nokogiri::HTML(text).to_html
|
16
|
-
|
17
|
-
|
18
9
|
handler = HTMLContentHandler.new
|
19
10
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
20
11
|
noko_parser.parse(text)
|
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'set'
|
3
|
-
|
4
1
|
module Boilerpipe::SAX
|
5
2
|
class HTMLContentHandler < Nokogiri::XML::SAX::Document
|
6
3
|
attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
|
7
4
|
|
8
|
-
attr_accessor :in_anchor_tag, :token_buffer
|
5
|
+
attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
|
9
6
|
ANCHOR_TEXT_START = "$\ue00a<"
|
10
7
|
ANCHOR_TEXT_END = ">\ue00a$"
|
11
8
|
|
@@ -20,7 +17,7 @@ module Boilerpipe::SAX
|
|
20
17
|
@flush = false
|
21
18
|
@block_tag_level = -1
|
22
19
|
|
23
|
-
@
|
20
|
+
@in_body_tag = 0
|
24
21
|
@in_anchor_tag = 0
|
25
22
|
@in_ignorable_element = 0
|
26
23
|
@in_anchor_text = false
|
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
|
|
34
31
|
@label_stacks << nil
|
35
32
|
tag = name.upcase.intern
|
36
33
|
|
37
|
-
|
38
34
|
tag_action = @tag_actions[tag]
|
39
35
|
if tag_action
|
40
36
|
@tag_level += 1 if tag_action.changes_tag_level?
|
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
|
|
51
47
|
def characters(text)
|
52
48
|
flush_block if @flush
|
53
49
|
|
54
|
-
return if
|
50
|
+
return if in_ignorable_element?
|
55
51
|
return if text.empty?
|
56
52
|
|
57
53
|
# replace all whitespace with simple space
|
58
54
|
text.gsub!(/\s+/, ' ')
|
59
55
|
|
60
56
|
# trim whitespace
|
61
|
-
started_with_whitespace = text
|
62
|
-
ended_with_whitespace = text
|
57
|
+
started_with_whitespace = text =~ /^\s/
|
58
|
+
ended_with_whitespace = text =~ /\s$/
|
63
59
|
text.strip!
|
64
60
|
|
65
61
|
# add a single space if the block was only whitespace
|
@@ -96,9 +92,15 @@ module Boilerpipe::SAX
|
|
96
92
|
@label_stacks.pop
|
97
93
|
end
|
98
94
|
|
95
|
+
def not_in_body_tag?
|
96
|
+
@in_body_tag == 0
|
97
|
+
end
|
98
|
+
|
99
99
|
def flush_block
|
100
100
|
@flush = false
|
101
|
-
|
101
|
+
|
102
|
+
# set title
|
103
|
+
if not_in_body_tag?
|
102
104
|
@title = @token_buffer.strip if :TITLE == @last_start_tag
|
103
105
|
clear_buffers
|
104
106
|
return
|
@@ -158,10 +160,10 @@ module Boilerpipe::SAX
|
|
158
160
|
end
|
159
161
|
|
160
162
|
text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
163
|
+
num_words,
|
164
|
+
num_linked_words,
|
165
|
+
num_words_in_wrapped_lines,
|
166
|
+
num_wrapped_lines, @offset_blocks)
|
165
167
|
|
166
168
|
@offset_blocks += 1
|
167
169
|
clear_buffers
|
@@ -187,10 +189,10 @@ module Boilerpipe::SAX
|
|
187
189
|
# \p{No} -- a numeric character of other type
|
188
190
|
|
189
191
|
def is_word?(word)
|
190
|
-
|
192
|
+
word =~ VALID_WORD_CHARACTER
|
191
193
|
end
|
192
194
|
|
193
|
-
#public void flushBlock() {
|
195
|
+
# public void flushBlock() {
|
194
196
|
# int numWords = 0;
|
195
197
|
# int numLinkedWords = 0;
|
196
198
|
# int numWrappedLines = 0;
|
@@ -198,22 +200,23 @@ module Boilerpipe::SAX
|
|
198
200
|
# final int maxLineLength = 80;
|
199
201
|
# int numTokens = 0;
|
200
202
|
# int numWordsCurrentLine = 0;
|
201
|
-
#}
|
203
|
+
# }
|
202
204
|
|
203
205
|
def increase_in_ignorable_element!
|
204
206
|
@in_ignorable_element += 1
|
205
207
|
end
|
206
208
|
|
209
|
+
# should we prevent less than zero here?
|
207
210
|
def decrease_in_ignorable_element!
|
208
211
|
@in_ignorable_element -= 1
|
209
212
|
end
|
210
213
|
|
211
|
-
def
|
212
|
-
@
|
214
|
+
def enter_body_tag!
|
215
|
+
@in_body_tag += 1
|
213
216
|
end
|
214
217
|
|
215
|
-
def
|
216
|
-
@
|
218
|
+
def exit_body_tag!
|
219
|
+
@in_body_tag -= 1
|
217
220
|
end
|
218
221
|
|
219
222
|
def in_ignorable_element?
|
@@ -224,7 +227,6 @@ module Boilerpipe::SAX
|
|
224
227
|
@in_anchor_tag > 0
|
225
228
|
end
|
226
229
|
|
227
|
-
|
228
230
|
def add_text_block(text_block)
|
229
231
|
@label_stacks.each do |stack|
|
230
232
|
next unless stack
|
@@ -239,6 +241,7 @@ module Boilerpipe::SAX
|
|
239
241
|
# append space if last character wasn't already one
|
240
242
|
def append_space
|
241
243
|
return if @sb_last_was_whitespace
|
244
|
+
|
242
245
|
@sb_last_was_whitespace = true
|
243
246
|
|
244
247
|
@text_buffer << ' '
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Boilerpipe::SAX
|
2
|
+
class Preprocessor
|
3
|
+
def self.strip(text)
|
4
|
+
# script bug - delete script tags
|
5
|
+
text = text.gsub(/\<script.+?<\/script>/im, '')
|
6
|
+
# nokogiri uses libxml for mri and nekohtml for jruby
|
7
|
+
# mri doesn't remove &nbsp; when missing the semicolon
|
8
|
+
text.gsub(/(&nbsp) /, '\1; ')
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
|
|
2
2
|
class AnchorText
|
3
3
|
# Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
|
4
4
|
# There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
|
5
|
-
|
5
|
+
# * encounters such nestings, a SAXException is thrown.
|
6
6
|
def start(handler, name, attrs)
|
7
7
|
if handler.in_anchor_tag?
|
8
8
|
handler.in_anchor_tag += 1
|
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
|
|
42
42
|
# - dunno about nokogiri???????
|
43
43
|
# as nested A elements are not allowed per specification, we
|
44
44
|
# are probably reaching this branch due to a bug in the XML parser
|
45
|
-
#puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
45
|
+
# puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
46
46
|
end_tag(handler, name)
|
47
47
|
end
|
48
48
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Explicitly marks this tag a simple "block-level" element,
|
3
|
+
# which always generates whitespace
|
4
4
|
class BlockLevel
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
true
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
# for block-level elements, which triggers some LabelAction on
|
3
|
-
# the generated TextBlock.
|
2
|
+
# for block-level elements, which triggers some LabelAction on
|
3
|
+
# the generated TextBlock.
|
4
4
|
class BlockTagLabel
|
5
5
|
def initialize(label_action)
|
6
6
|
@label_action = label_action
|
@@ -1,16 +1,16 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Marks this tag the body element (this should usually only
|
3
|
+
# be set for the <BODY> tag).
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
7
|
-
handler.
|
7
|
+
handler.enter_body_tag!
|
8
8
|
false
|
9
9
|
end
|
10
10
|
|
11
11
|
def end_tag(handler, name)
|
12
12
|
handler.flush_block
|
13
|
-
handler.
|
13
|
+
handler.exit_body_tag!
|
14
14
|
false
|
15
15
|
end
|
16
16
|
|
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
|
|
10
10
|
rel = m[1]
|
11
11
|
val = m[2].to_i # absolute
|
12
12
|
size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
|
13
|
-
handler.font_size_stack <<
|
13
|
+
handler.font_size_stack << size
|
14
14
|
else
|
15
15
|
handler.font_size_stack << nil
|
16
16
|
end
|
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def relative(font_size_stack, rel, val)
|
30
|
-
prev_size = font_size_stack.reverse_each.find{|s| s
|
30
|
+
prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
|
31
31
|
prev_size = 3 if prev_size.nil?
|
32
32
|
|
33
33
|
size = if rel == '+'
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,71 +16,71 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 12.3.3
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 12.3.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rickshaw
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.5.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.5.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '3.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '3.10'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: 1.
|
75
|
+
version: '1.10'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
83
|
-
description: A pure ruby implementation of the boilerpipe algorithm
|
82
|
+
version: '1.10'
|
83
|
+
description: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
84
84
|
email:
|
85
85
|
- "<gregory.ostermayr@gmail.com>"
|
86
86
|
executables: []
|
@@ -88,9 +88,11 @@ extensions: []
|
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
90
|
- ".circleci/config.yml"
|
91
|
+
- ".dockerignore"
|
91
92
|
- ".gitignore"
|
92
93
|
- ".rspec"
|
93
94
|
- CHANGELOG.md
|
95
|
+
- Dockerfile
|
94
96
|
- Gemfile
|
95
97
|
- LICENSE.txt
|
96
98
|
- README.md
|
@@ -98,6 +100,7 @@ files:
|
|
98
100
|
- bin/console
|
99
101
|
- bin/setup
|
100
102
|
- boilerpipe-ruby.gemspec
|
103
|
+
- boilerpipe_flow.md
|
101
104
|
- lib/boilerpipe.rb
|
102
105
|
- lib/boilerpipe/document/text_block.rb
|
103
106
|
- lib/boilerpipe/document/text_document.rb
|
@@ -133,6 +136,7 @@ files:
|
|
133
136
|
- lib/boilerpipe/labels/label_action.rb
|
134
137
|
- lib/boilerpipe/sax/boilerpipe_html_parser.rb
|
135
138
|
- lib/boilerpipe/sax/html_content_handler.rb
|
139
|
+
- lib/boilerpipe/sax/preprocessor.rb
|
136
140
|
- lib/boilerpipe/sax/tag_action_map.rb
|
137
141
|
- lib/boilerpipe/sax/tag_actions/anchor_text.rb
|
138
142
|
- lib/boilerpipe/sax/tag_actions/block_level.rb
|
@@ -151,7 +155,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
|
|
151
155
|
licenses:
|
152
156
|
- Apache 2.0
|
153
157
|
metadata: {}
|
154
|
-
post_install_message:
|
158
|
+
post_install_message:
|
155
159
|
rdoc_options: []
|
156
160
|
require_paths:
|
157
161
|
- lib
|
@@ -166,9 +170,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
170
|
- !ruby/object:Gem::Version
|
167
171
|
version: '0'
|
168
172
|
requirements: []
|
169
|
-
|
170
|
-
|
171
|
-
signing_key:
|
173
|
+
rubygems_version: 3.0.8
|
174
|
+
signing_key:
|
172
175
|
specification_version: 4
|
173
|
-
summary: A pure ruby
|
176
|
+
summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
174
177
|
test_files: []
|