boilerpipe-ruby 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +27 -6
- data/Rakefile +8 -0
- data/boilerpipe-ruby.gemspec +10 -9
- data/lib/boilerpipe.rb +30 -0
- data/lib/boilerpipe/document/text_block.rb +113 -0
- data/lib/boilerpipe/document/text_document.rb +44 -0
- data/lib/boilerpipe/errors.rb +1 -0
- data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
- data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
- data/lib/boilerpipe/labels/default.rb +17 -0
- data/lib/boilerpipe/labels/label_action.rb +17 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
- data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
- data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
- data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
- data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
- data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
- data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
- data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
- data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
- data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- data/stuff.txt +4 -0
- metadata +61 -15
@@ -0,0 +1,17 @@
|
|
1
|
+
module Boilerpipe
  module Labels
    # Bundles a list of labels so that they can be handed to a text block in
    # a single call.
    class LabelAction
      # The labels this action carries.
      attr_reader :labels

      # @param labels [Array] labels to carry; a fresh empty array when omitted
      def initialize(labels = [])
        @labels = labels
      end

      # Passes the stored labels to +text_block.add_labels+.
      #
      # @param text_block [#add_labels] object that receives the labels
      # @return whatever +text_block.add_labels+ returns
      def add_to(text_block)
        text_block.add_labels(@labels)
      end

      # @return [String] the labels joined with commas
      def to_s
        @labels.join(',')
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
module Boilerpipe
  module SAX
    # Cleans up raw HTML and runs it through HTMLContentHandler to build a
    # text document.
    class BoilerpipeHTMLParser
      # A whole <script> element: the opening tag may carry attributes and the
      # body may be empty or span several lines (hence the /m flag).
      SCRIPT_ELEMENT = %r{<script\b[^>]*>.*?</script\s*>}im

      # "&nbsp" written without its terminating semicolon, followed by a space.
      UNTERMINATED_NBSP = /(&nbsp) /

      # Parses +text+ as HTML and returns the document produced by the handler.
      #
      # @param text [String] raw HTML
      # @return the value of HTMLContentHandler#text_document
      def self.parse(text)
        # script bug - delete script tags before parsing
        text = text.gsub(SCRIPT_ELEMENT, '')

        # nokogiri uses libxml for mri and nekohtml for jruby;
        # mri doesn't remove &nbsp when missing the semicolon, so add it
        text = text.gsub(UNTERMINATED_NBSP, '\1; ')

        # use nokogiri to fix any bad tags, errors - keep experimenting with this
        text = Nokogiri::HTML(text).to_html

        handler = HTMLContentHandler.new
        Nokogiri::HTML::SAX::Parser.new(handler).parse(text)
        handler.text_document
      end
    end
  end
end
|
@@ -0,0 +1,275 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
module Boilerpipe
  module SAX
    # SAX document handler that collects character data between tags into
    # buffers and, whenever a flush is requested, turns the buffered text into
    # a TextBlock. Tag-specific behaviour is delegated to the tag actions
    # returned by TagActionMap.tag_actions.
    class HTMLContentHandler < Nokogiri::XML::SAX::Document
      attr_reader :in_ignorable_element, :label_stacks, :last_start_tag

      attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack

      # Marker tokens written into the token buffer around anchor text.
      ANCHOR_TEXT_START = "$\ue00a<"
      ANCHOR_TEXT_END = ">\ue00a$"

      # unicode regex - categories
      # \p{L} -- Letter
      # \p{Nd} -- a decimal digit
      # \p{Nl} -- a letterlike numeric character
      # \p{No} -- a numeric character of other type
      VALID_WORD_CHARACTER = /[\p{L}\p{Nd}\p{Nl}\p{No}]/

      def initialize
        @label_stacks = []
        @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
        @tag_level = 0
        @sb_last_was_whitespace = false
        # dup'ed so the buffers stay mutable even under frozen string literals
        @text_buffer = ''.dup
        @token_buffer = ''.dup
        @offset_blocks = 0
        @flush = false
        @block_tag_level = -1

        @in_body = 0
        @in_anchor_tag = 0
        @in_ignorable_element = 0
        @in_anchor_text = false
        @font_size_stack = []
        @last_start_tag = ''
        @last_end_tag = nil
        @last_event = nil
        @title = nil
        @text_blocks = []
      end

      # SAX callback for an opening tag. Pushes a placeholder onto the label
      # stacks, then lets the tag's action (if one is registered) decide
      # whether the tag level changes and whether a flush is needed. Tags
      # without an action always raise the tag level and request a flush.
      def start_element(name, attrs = [])
        @label_stacks << nil
        tag = name.upcase.intern

        tag_action = @tag_actions[tag]
        if tag_action
          @tag_level += 1 if tag_action.changes_tag_level?
          # non-short-circuit `|` so the action always runs
          @flush = tag_action.start(self, name, attrs) | @flush
        else
          @tag_level += 1
          @flush = true
        end

        @last_event = :START_TAG
        @last_start_tag = tag
      end

      # SAX callback for character data. Collapses whitespace runs, remembers
      # whether the text had leading/trailing whitespace, and appends the
      # trimmed text (with single separating spaces) to the buffers. Text
      # inside ignorable elements is dropped.
      def characters(text)
        flush_block if @flush

        return if @in_ignorable_element != 0
        return if text.empty?

        # replace all whitespace with simple space; work on a copy so the
        # string handed in by the parser is never mutated
        normalized = text.gsub(/\s+/, ' ')

        # after normalisation any leading/trailing whitespace is a single space
        started_with_whitespace = normalized.start_with?(' ')
        ended_with_whitespace = normalized.end_with?(' ')
        normalized = normalized.strip

        # add a single space if the block was only whitespace
        if normalized.empty?
          append_space
          @last_event = :WHITESPACE
          return
        end

        # set block levels
        @block_tag_level = @tag_level if @block_tag_level == -1

        append_space if started_with_whitespace
        append_text(normalized)
        append_space if ended_with_whitespace

        @last_event = :CHARACTERS
      end

      # SAX callback for a closing tag. Mirrors start_element: consults the
      # tag action, lowers the tag level when appropriate, flushes if
      # requested and pops this element's entry from the label stacks.
      def end_element(name)
        tag = name.upcase.intern
        tag_action = @tag_actions[tag]
        if tag_action
          @flush = tag_action.end_tag(self, name) | @flush
        else
          @flush = true
        end

        @tag_level -= 1 if tag_action.nil? || tag_action.changes_tag_level?
        flush_block if @flush

        @last_event = :END_TAG
        @last_end_tag = tag
        @label_stacks.pop
      end

      # Converts the buffered text into a TextBlock and resets the buffers.
      #
      # While outside the body (@in_body == 0) nothing is emitted; the buffer
      # is only captured as the title when the last start tag was TITLE.
      # Empty buffers, or a buffer holding a single whitespace character, do
      # not produce a block.
      def flush_block
        @flush = false

        if @in_body == 0
          @title = @token_buffer.strip if :TITLE == @last_start_tag
          clear_buffers
          return
        end

        # clear out if empty or just a space
        case @token_buffer.size
        when 0
          return
        when 1
          clear_buffers if @sb_last_was_whitespace
          return
        end

        num_tokens = 0
        num_words = 0
        num_words_current_line = 0
        num_wrapped_lines = 0
        num_linked_words = 0
        # NOTE(review): the Java reference implementation this was ported from
        # starts with `currentLineLength = -1` ("don't count the first space");
        # this port starts at 0 -- confirm which is intended before changing.
        current_line_length = 0
        max_line_length = 80

        tokens = ::Boilerpipe::UnicodeTokenizer.tokenize(@token_buffer)
        tokens.each do |token|
          if ANCHOR_TEXT_START == token
            @in_anchor_text = true
          elsif ANCHOR_TEXT_END == token
            @in_anchor_text = false
          elsif is_word?(token)
            num_tokens += 1
            num_words += 1
            num_words_current_line += 1
            num_linked_words += 1 if @in_anchor_text
            token_length = token.size
            current_line_length += token_length + 1

            # simulate wrapping at max_line_length characters
            if current_line_length > max_line_length
              num_wrapped_lines += 1
              current_line_length = token_length
              num_words_current_line = 1
            end
          else
            num_tokens += 1
          end
        end

        return if num_tokens == 0

        if num_wrapped_lines == 0
          num_words_in_wrapped_lines = num_words
          num_wrapped_lines = 1
        else
          num_words_in_wrapped_lines = num_words - num_words_current_line
        end

        text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
                                                           num_words,
                                                           num_linked_words,
                                                           num_words_in_wrapped_lines,
                                                           num_wrapped_lines,
                                                           @offset_blocks)

        @offset_blocks += 1
        clear_buffers
        text_block.set_tag_level(@block_tag_level)
        add_text_block(text_block)
        @block_tag_level = -1
      end

      # Flushes any pending text and wraps the collected title and blocks in
      # a TextDocument.
      def text_document
        flush_block
        ::Boilerpipe::Document::TextDocument.new(@title, @text_blocks)
      end

      # @return [Integer] current size of the token buffer
      def token_buffer_size
        @token_buffer.size
      end

      # Truthy (a match index) when +word+ contains at least one letter or
      # numeric character, nil otherwise.
      def is_word?(word)
        word =~ VALID_WORD_CHARACTER
      end

      def increase_in_ignorable_element!
        @in_ignorable_element += 1
      end

      def decrease_in_ignorable_element!
        @in_ignorable_element -= 1
      end

      def increase_in_body!
        @in_body += 1
      end

      def decrease_in_body!
        @in_body -= 1
      end

      def in_ignorable_element?
        @in_ignorable_element > 0
      end

      def in_anchor_tag?
        @in_anchor_tag > 0
      end

      # Applies every label action currently on the label stacks to
      # +text_block+ and records the block.
      def add_text_block(text_block)
        @label_stacks.each do |stack|
          next unless stack

          stack.each do |label_action|
            # go through LabelAction#add_to so the labels are added one by
            # one (add_labels) instead of the whole array as a single label
            label_action.add_to(text_block) if label_action
          end
        end
        @text_blocks << text_block
      end

      # append space if last character wasn't already one
      def append_space
        return if @sb_last_was_whitespace

        @sb_last_was_whitespace = true

        @text_buffer << ' '
        @token_buffer << ' '
      end

      # Appends +text+ to both the text and the token buffer.
      def append_text(text)
        @sb_last_was_whitespace = false
        @text_buffer << text
        @token_buffer << text
      end

      # Appends +token+ to the token buffer only.
      def append_token(token)
        @token_buffer << token
      end

      # Adds +label_action+ to the label stack of the innermost open element,
      # replacing the nil placeholder with a fresh array on first use.
      def add_label_action(label_action)
        label_stack = @label_stacks.last
        unless label_stack
          label_stack = []
          @label_stacks.pop
          @label_stacks << label_stack
        end
        label_stack << label_action
      end

      private

      # Replaces both buffers with fresh, mutable empty strings.
      def clear_buffers
        @token_buffer = ''.dup
        @text_buffer = ''.dup
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Boilerpipe
  module SAX
    # Lookup table from upper-case tag name symbols to the tag action that
    # handles them.
    class TagActionMap
      # Builds the table. Every call returns a new Hash holding newly created
      # tag action instances.
      #
      # @return [Hash{Symbol => Object}] tag symbol => tag action
      def self.tag_actions
        labels = ::Boilerpipe::Labels
        {
          STYLE: TagActions::IgnorableElement.new,
          SCRIPT: TagActions::IgnorableElement.new,
          OPTION: TagActions::IgnorableElement.new,
          OBJECT: TagActions::IgnorableElement.new,
          EMBED: TagActions::IgnorableElement.new,
          APPLET: TagActions::IgnorableElement.new,
          LINK: TagActions::IgnorableElement.new,

          A: TagActions::AnchorText.new,
          BODY: TagActions::Body.new,

          STRIKE: TagActions::InlineNoWhitespace.new,
          U: TagActions::InlineNoWhitespace.new,
          B: TagActions::InlineNoWhitespace.new,
          I: TagActions::InlineNoWhitespace.new,
          EM: TagActions::InlineNoWhitespace.new,
          STRONG: TagActions::InlineNoWhitespace.new,
          SPAN: TagActions::InlineNoWhitespace.new,

          # New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
          SUP: TagActions::InlineNoWhitespace.new,

          # New in 1.2
          CODE: TagActions::InlineNoWhitespace.new,
          TT: TagActions::InlineNoWhitespace.new,
          SUB: TagActions::InlineNoWhitespace.new,
          VAR: TagActions::InlineNoWhitespace.new,

          ABBR: TagActions::InlineWhitespace.new,
          ACRONYM: TagActions::InlineWhitespace.new,
          FONT: TagActions::InlineNoWhitespace.new,

          # added in 1.1.1
          NOSCRIPT: TagActions::IgnorableElement.new,

          # New in 1.3
          LI: TagActions::BlockTagLabel.new(labels::LabelAction.new([:LI])),
          H1: TagActions::BlockTagLabel.new(labels::LabelAction.new([:H1, :HEADING])),
          H2: TagActions::BlockTagLabel.new(labels::LabelAction.new([:H2, :HEADING])),
          H3: TagActions::BlockTagLabel.new(labels::LabelAction.new([:H3, :HEADING]))
        }
      end
    end
  end
end
|
51
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Boilerpipe
  module SAX
    module TagActions
      # Marks this tag as "anchor" (this should usually only be set for the
      # <A> tag). Anchor tags may not be nested. There is a bug in certain
      # versions of NekoHTML which still allows nested tags; when such a
      # nesting is encountered this action tries to recover from it.
      class AnchorText
        # Called for an opening anchor tag. Increments the handler's anchor
        # depth and, for a non-nested anchor outside ignorable elements,
        # writes the anchor-start marker into the token buffer.
        #
        # @return [false] an anchor never requests a flush
        def start(handler, name, attrs)
          already_in_anchor = handler.in_anchor_tag?
          handler.in_anchor_tag += 1

          if already_in_anchor
            nested_anchor_tag_error_recovering(handler, name)
            # explicit false keeps the return value boolean on every path
            return false
          end

          append_anchor_text_start(handler) unless handler.in_ignorable_element?
          false
        end

        # Called for a closing anchor tag. Decrements the anchor depth and
        # writes the anchor-end marker once the outermost anchor is closed
        # (and we are not inside an ignorable element).
        #
        # @return [false] an anchor never requests a flush
        def end_tag(handler, name)
          handler.in_anchor_tag -= 1
          append_anchor_text_end(handler) unless handler.in_anchor_tag? || handler.in_ignorable_element?
          false
        end

        def changes_tag_level?
          true
        end

        # Appends a separating space, the start marker and a trailing space.
        def append_anchor_text_start(handler)
          handler.append_space
          handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START)
          handler.append_token(' ')
        end

        # Appends a separating space, the end marker and a trailing space.
        def append_anchor_text_end(handler)
          handler.append_space
          handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END)
          handler.append_token(' ')
        end

        # Recovery path for nested anchors: treats the nested opening tag as
        # if the anchor had been closed.
        def nested_anchor_tag_error_recovering(handler, name)
          # - dunno about nokogiri???????
          # as nested A elements are not allowed per specification, we
          # are probably reaching this branch due to a bug in the XML parser
          # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
          end_tag(handler, name)
        end

        # The original, misspelled method name is kept for existing callers.
        alias nested_achor_tag_error_recovering nested_anchor_tag_error_recovering
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Boilerpipe
  module SAX
    module TagActions
      # Explicitly marks this tag a simple "block-level" element,
      # which always generates whitespace
      class BlockLevel
        # @return [true] always requests a flush on the opening tag
        def start(handler, name, attrs)
          true
        end

        # @return [true] always requests a flush on the closing tag
        def end_tag(handler, name)
          true
        end

        # @return [true] this tag counts towards the tag level
        def changes_tag_level?
          true
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Boilerpipe
  module SAX
    module TagActions
      # for block-level elements, which triggers some LabelAction on
      # the generated TextBlock.
      class BlockTagLabel
        # @param label_action the action registered with the handler each
        #   time the tag is opened
        def initialize(label_action)
          @label_action = label_action
        end

        # Registers the label action with the handler.
        #
        # @return [true] always requests a flush on the opening tag
        def start(handler, name, attrs)
          handler.add_label_action(@label_action)
          true
        end

        # @return [true] always requests a flush on the closing tag
        def end_tag(handler, name)
          true
        end

        # @return [true] this tag counts towards the tag level
        def changes_tag_level?
          true
        end
      end
    end
  end
end
|