boilerpipe-ruby 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +27 -6
  4. data/Rakefile +8 -0
  5. data/boilerpipe-ruby.gemspec +10 -9
  6. data/lib/boilerpipe.rb +30 -0
  7. data/lib/boilerpipe/document/text_block.rb +113 -0
  8. data/lib/boilerpipe/document/text_document.rb +44 -0
  9. data/lib/boilerpipe/errors.rb +1 -0
  10. data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
  11. data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
  12. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
  13. data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
  14. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
  15. data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
  16. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
  17. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
  18. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
  19. data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
  20. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
  21. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
  22. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
  23. data/lib/boilerpipe/labels/default.rb +17 -0
  24. data/lib/boilerpipe/labels/label_action.rb +17 -0
  25. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
  26. data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
  27. data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
  28. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
  29. data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
  30. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
  31. data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
  32. data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
  33. data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
  34. data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
  35. data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
  36. data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
  37. data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
  38. data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
  39. data/lib/boilerpipe/version.rb +1 -1
  40. data/stuff.txt +4 -0
  41. metadata +61 -15
@@ -0,0 +1,17 @@
1
module Boilerpipe::Labels
  # Bundles a collection of labels so that they can be applied to a text
  # block in a single step.
  class LabelAction
    attr_reader :labels

    # labels - the labels this action applies (an empty Array by default).
    def initialize(labels = [])
      @labels = labels
    end

    # Passes the stored labels to text_block through its add_labels method.
    def add_to(text_block)
      text_block.add_labels(labels)
    end

    # Returns the stored labels joined with commas.
    def to_s
      labels.join(',')
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'nokogiri'
2
module Boilerpipe::SAX
  # Turns raw HTML text into the text document built by HTMLContentHandler.
  class BoilerpipeHTMLParser
    # Matches one complete <script> element:
    #   * `[^>]*` accepts attributes on the opening tag,
    #   * `m` lets `.` cross newlines so multi-line scripts are matched,
    #   * `.*?` (rather than `.+?`) lets an empty <script></script> match on
    #     its own instead of running on to the next closing script tag and
    #     swallowing the markup in between.
    SCRIPT_ELEMENT = %r{<script\b[^>]*>.*?</script\s*>}im

    # Matches an "&nbsp" entity that lost its semicolon and is followed by a
    # space.
    UNTERMINATED_NBSP = /(&nbsp) /

    # Parses +text+ (an HTML string) and returns handler.text_document.
    #
    # The input string itself is not modified; every clean-up step works on
    # a copy.
    def self.parse(text)
      # script bug - delete script elements before parsing
      text = text.gsub(SCRIPT_ELEMENT, '')

      # nokogiri uses libxml for mri and nekohtml for jruby
      # mri doesn't remove &nbsp; when missing the semicolon
      text = text.gsub(UNTERMINATED_NBSP, '\1; ')

      # use nokogiri to fix any bad tags, errors - keep experimenting with this
      text = Nokogiri::HTML(text).to_html

      handler = HTMLContentHandler.new
      noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
      noko_parser.parse(text)
      handler.text_document
    end
  end
end
@@ -0,0 +1,275 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
4
module Boilerpipe::SAX
  # SAX document handler that collects character data into text blocks.
  #
  # Text is accumulated in two buffers: @text_buffer holds the plain text of
  # the current block, @token_buffer holds the same text plus the anchor
  # markers that tag actions insert through append_token. Whenever a flush is
  # requested, flush_block turns the buffers into a TextBlock carrying word
  # and link statistics. text_document returns the final result.
  class HTMLContentHandler < Nokogiri::XML::SAX::Document
    attr_reader :in_ignorable_element, :label_stacks, :last_start_tag

    attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack

    # Marker tokens written into the token buffer around anchor text.
    ANCHOR_TEXT_START = "$\ue00a<"
    ANCHOR_TEXT_END = ">\ue00a$"

    # unicode regex - categories
    # \p{L} -- Letter
    # \p{Nd} -- a decimal digit
    # \p{Nl} -- a letterlike numeric character
    # \p{No} -- a numeric character of other type
    VALID_WORD_CHARACTER = /[\p{L}\p{Nd}\p{Nl}\p{No}]/

    def initialize
      @label_stacks = []
      @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
      @tag_level = 0
      @sb_last_was_whitespace = false
      @text_buffer = ''
      @token_buffer = ''
      @offset_blocks = 0
      @flush = false
      @block_tag_level = -1

      @in_body = 0
      @in_anchor_tag = 0
      @in_ignorable_element = 0
      @in_anchor_text = false
      @font_size_stack = []
      @last_start_tag = ''
      @last_end_tag = nil
      @last_event = nil
      @title = nil
      @text_blocks = []
    end

    # SAX callback for an opening tag.
    #
    # Pushes an empty slot onto the label stacks, then lets the tag action
    # registered for the tag decide whether the current block must be
    # flushed. Tags without a registered action always raise the tag level
    # and request a flush.
    def start_element(name, attrs = [])
      @label_stacks << nil
      tag = name.upcase.intern

      tag_action = @tag_actions[tag]
      if tag_action
        @tag_level += 1 if tag_action.changes_tag_level?
        @flush = tag_action.start(self, name, attrs) | @flush
      else
        @tag_level += 1
        @flush = true
      end

      @last_event = :START_TAG
      @last_start_tag = tag
    end

    # SAX callback for character data.
    #
    # Runs of whitespace are collapsed to one space and the text is appended
    # to the buffers, with at most one separating space on either side.
    # The +text+ argument is left untouched; all work happens on a copy.
    def characters(text)
      flush_block if @flush

      return if @in_ignorable_element != 0
      return if text.empty?

      # replace all whitespace with simple space (gsub returns a new string,
      # so the caller's string is never mutated)
      collapsed = text.gsub(/\s+/, ' ')

      # after collapsing, any leading/trailing whitespace is a single space
      started_with_whitespace = collapsed.start_with?(' ')
      ended_with_whitespace = collapsed.end_with?(' ')
      collapsed.strip!

      # add a single space if the block was only whitespace
      if collapsed.empty?
        append_space
        @last_event = :WHITESPACE
        return
      end

      # remember the tag level at which this block's text started
      @block_tag_level = @tag_level if @block_tag_level == -1

      append_space if started_with_whitespace
      append_text(collapsed)
      append_space if ended_with_whitespace

      @last_event = :CHARACTERS
    end

    # SAX callback for a closing tag; mirrors start_element and pops the
    # label slot that start_element pushed.
    def end_element(name)
      tag = name.upcase.intern
      tag_action = @tag_actions[tag]
      if tag_action
        @flush = tag_action.end_tag(self, name) | @flush
      else
        @flush = true
      end

      @tag_level -= 1 if tag_action.nil? || tag_action.changes_tag_level?
      flush_block if @flush

      @last_event = :END_TAG
      @last_end_tag = tag
      @label_stacks.pop
    end

    # Converts the buffered text into a TextBlock and resets the buffers.
    #
    # Outside of <body> nothing is emitted; the buffer is only used to
    # capture the title when the last start tag was TITLE.
    def flush_block
      @flush = false
      if @in_body == 0
        @title = @token_buffer.strip if :TITLE == @last_start_tag
        clear_buffers
        return
      end

      # nothing buffered at all
      return if @token_buffer.empty?

      # a lone space carries no content: drop it. A lone non-space character
      # is real content and must fall through so it becomes its own block
      # instead of leaking into the next one.
      if @token_buffer.size == 1 && @sb_last_was_whitespace
        clear_buffers
        return
      end

      num_tokens = 0
      num_words = 0
      num_words_current_line = 0
      num_wrapped_lines = 0
      num_linked_words = 0
      # start at -1 so the "+ 1" separator added for the first word is not
      # counted; this matches the restart value used after a wrap, which is
      # the bare token length with no leading separator.
      current_line_length = -1
      max_line_length = 80

      tokens = ::Boilerpipe::UnicodeTokenizer.tokenize(@token_buffer)
      tokens.each do |token|
        if ANCHOR_TEXT_START == token
          @in_anchor_text = true
        elsif ANCHOR_TEXT_END == token
          @in_anchor_text = false
        else
          num_tokens += 1
          next unless is_word?(token)

          num_words += 1
          num_words_current_line += 1
          num_linked_words += 1 if @in_anchor_text
          token_length = token.size
          current_line_length += token_length + 1

          if current_line_length > max_line_length
            num_wrapped_lines += 1
            current_line_length = token_length
            num_words_current_line = 1
          end
        end
      end

      return if num_tokens == 0

      if num_wrapped_lines == 0
        # everything fit on one line: treat it as a single wrapped line
        num_words_in_wrapped_lines = num_words
        num_wrapped_lines = 1
      else
        # words on the unfinished last line are not counted
        num_words_in_wrapped_lines = num_words - num_words_current_line
      end

      text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
                                                         num_words,
                                                         num_linked_words,
                                                         num_words_in_wrapped_lines,
                                                         num_wrapped_lines,
                                                         @offset_blocks)

      @offset_blocks += 1
      clear_buffers
      text_block.set_tag_level(@block_tag_level)
      add_text_block(text_block)
      @block_tag_level = -1
    end

    # Flushes any pending text and returns a TextDocument built from the
    # captured title and the collected text blocks.
    def text_document
      flush_block
      ::Boilerpipe::Document::TextDocument.new(@title, @text_blocks)
    end

    def token_buffer_size
      @token_buffer.size
    end

    # Truthy when +word+ contains at least one letter or numeric character
    # (returns the match index from =~, or nil).
    def is_word?(word)
      word =~ VALID_WORD_CHARACTER
    end

    def increase_in_ignorable_element!
      @in_ignorable_element += 1
    end

    def decrease_in_ignorable_element!
      @in_ignorable_element -= 1
    end

    def increase_in_body!
      @in_body += 1
    end

    def decrease_in_body!
      @in_body -= 1
    end

    def in_ignorable_element?
      @in_ignorable_element > 0
    end

    def in_anchor_tag?
      @in_anchor_tag > 0
    end

    # Applies every label action currently on the label stacks to
    # +text_block+ and stores the block.
    def add_text_block(text_block)
      @label_stacks.each do |stack|
        next unless stack

        stack.each do |label_action|
          # NOTE(review): this passes the whole labels collection to
          # add_label, whereas LabelAction#add_to uses add_labels. Confirm
          # against TextBlock which of the two is intended.
          text_block.add_label(label_action.labels) if label_action
        end
      end
      @text_blocks << text_block
    end

    # append space if last character wasn't already one
    def append_space
      return if @sb_last_was_whitespace
      @sb_last_was_whitespace = true

      @text_buffer << ' '
      @token_buffer << ' '
    end

    # Appends +text+ to both buffers and records that the buffers no longer
    # end in whitespace.
    def append_text(text)
      @sb_last_was_whitespace = false
      @text_buffer << text
      @token_buffer << text
    end

    # Appends +token+ to the token buffer only (used for anchor markers, so
    # they never show up in the block's text).
    def append_token(token)
      @token_buffer << token
    end

    # Registers +label_action+ in the label slot of the element that is
    # currently open, creating the slot's array on first use.
    def add_label_action(label_action)
      label_stack = @label_stacks.last
      if label_stack.nil?
        label_stack = []
        @label_stacks.pop
        @label_stacks << label_stack
      end
      label_stack << label_action
    end

    private

    def clear_buffers
      @token_buffer = ''
      @text_buffer = ''
    end
  end
end
@@ -0,0 +1,51 @@
1
module Boilerpipe::SAX
  # Lookup table from upper-case tag symbol to the tag action that handles it.
  class TagActionMap
    # Builds and returns a new Hash on every call; each key gets its own
    # freshly constructed tag action instance.
    def self.tag_actions
      labels = ::Boilerpipe::Labels
      ignorable = TagActions::IgnorableElement
      inline = TagActions::InlineNoWhitespace
      inline_ws = TagActions::InlineWhitespace
      # block-level tag that attaches the given labels to generated blocks
      labelled = lambda do |*tags|
        TagActions::BlockTagLabel.new(labels::LabelAction.new(tags))
      end

      {
        STYLE: ignorable.new,
        SCRIPT: ignorable.new,
        OPTION: ignorable.new,
        OBJECT: ignorable.new,
        EMBED: ignorable.new,
        APPLET: ignorable.new,
        LINK: ignorable.new,

        A: TagActions::AnchorText.new,
        BODY: TagActions::Body.new,

        STRIKE: inline.new,
        U: inline.new,
        B: inline.new,
        I: inline.new,
        EM: inline.new,
        STRONG: inline.new,
        SPAN: inline.new,

        # New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
        SUP: inline.new,

        # New in 1.2
        CODE: inline.new,
        TT: inline.new,
        SUB: inline.new,
        VAR: inline.new,

        ABBR: inline_ws.new,
        ACRONYM: inline_ws.new,
        FONT: inline.new,

        # added in 1.1.1
        NOSCRIPT: ignorable.new,

        # New in 1.3
        LI: labelled.call(:LI),
        H1: labelled.call(:H1, :HEADING),
        H2: labelled.call(:H2, :HEADING),
        H3: labelled.call(:H3, :HEADING)
      }
    end
  end
end
51
+
@@ -0,0 +1,49 @@
1
module Boilerpipe::SAX::TagActions
  # Marks a tag as "anchor" (this should usually only be set for the <A> tag).
  #
  # Anchor tags may not be nested. If a nested anchor is encountered anyway
  # (for example because of a parser bug), the nesting counter is rolled
  # back and no additional anchor marker is written; no error is raised.
  class AnchorText
    # Called for an opening anchor tag.
    #
    # Writes the anchor-start marker into the handler's token buffer unless
    # the handler is already inside an anchor or inside an ignorable element.
    # Always returns false (an anchor never forces a block flush).
    def start(handler, name, attrs)
      nested = handler.in_anchor_tag?
      handler.in_anchor_tag += 1

      if nested
        nested_achor_tag_error_recovering(handler, name)
        # explicit false keeps the return value consistent with the
        # non-nested path
        return false
      end

      append_anchor_text_start(handler) unless handler.in_ignorable_element?
      false
    end

    # Called for a closing anchor tag.
    #
    # Writes the anchor-end marker once the outermost anchor is closed and
    # the handler is not inside an ignorable element. Always returns false.
    def end_tag(handler, name)
      handler.in_anchor_tag -= 1
      append_anchor_text_end(handler) unless handler.in_anchor_tag? || handler.in_ignorable_element?
      false
    end

    def changes_tag_level?
      true
    end

    # Appends a separating space, the anchor-start marker and a trailing
    # space token to the handler's buffers.
    def append_anchor_text_start(handler)
      handler.append_space
      handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START)
      handler.append_token(' ')
    end

    # Appends a separating space, the anchor-end marker and a trailing
    # space token to the handler's buffers.
    def append_anchor_text_end(handler)
      handler.append_space
      handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END)
      handler.append_token(' ')
    end

    # Recovery path for a nested anchor: as nested A elements are not allowed
    # per specification, reaching this is probably due to a bug in the HTML
    # parser. The nested opening tag is immediately treated as closed.
    def nested_achor_tag_error_recovering(handler, name)
      end_tag(handler, name)
    end
  end
end
@@ -0,0 +1,17 @@
1
module Boilerpipe::SAX::TagActions
  # Tag action for a plain "block-level" element: both the opening and the
  # closing tag report true, and the tag counts towards the tag level.
  class BlockLevel
    # Opening tag: always returns true.
    def start(handler, name, attrs)
      true
    end

    # Closing tag: always returns true.
    def end_tag(handler, name)
      true
    end

    # Block-level tags take part in tag-level counting.
    def changes_tag_level?
      true
    end
  end
end
@@ -0,0 +1,22 @@
1
module Boilerpipe::SAX::TagActions
  # Tag action for block-level elements that register a LabelAction with the
  # handler when the element opens.
  class BlockTagLabel
    # label_action - the action handed to the handler on every opening tag.
    def initialize(label_action)
      @label_action = label_action
    end

    # Opening tag: registers the label action with the handler, then
    # returns true.
    def start(handler, name, attrs)
      handler.add_label_action(@label_action)
      true
    end

    # Closing tag: always returns true.
    def end_tag(handler, name)
      true
    end

    # These tags take part in tag-level counting.
    def changes_tag_level?
      true
    end
  end
end
+ end