boilerpipe-ruby 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +27 -6
  4. data/Rakefile +8 -0
  5. data/boilerpipe-ruby.gemspec +10 -9
  6. data/lib/boilerpipe.rb +30 -0
  7. data/lib/boilerpipe/document/text_block.rb +113 -0
  8. data/lib/boilerpipe/document/text_document.rb +44 -0
  9. data/lib/boilerpipe/errors.rb +1 -0
  10. data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
  11. data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
  12. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
  13. data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
  14. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
  15. data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
  16. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
  17. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
  18. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
  19. data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
  20. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
  21. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
  22. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
  23. data/lib/boilerpipe/labels/default.rb +17 -0
  24. data/lib/boilerpipe/labels/label_action.rb +17 -0
  25. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
  26. data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
  27. data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
  28. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
  29. data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
  30. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
  31. data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
  32. data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
  33. data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
  34. data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
  35. data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
  36. data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
  37. data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
  38. data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
  39. data/lib/boilerpipe/version.rb +1 -1
  40. data/stuff.txt +4 -0
  41. metadata +61 -15
@@ -0,0 +1,17 @@
1
module Boilerpipe::Labels
  # Carries a list of labels so that they can be attached to a text block
  # in a single call.
  class LabelAction
    # The labels this action was built with.
    attr_reader :labels

    # labels - the labels to carry; an empty list when omitted.
    def initialize(labels = [])
      @labels = labels
    end

    # Passes the stored labels to the text block's +add_labels+ method and
    # returns whatever that call returns.
    def add_to(text_block)
      text_block.add_labels(labels)
    end

    # Renders the labels as one comma-separated string.
    def to_s
      labels.join(',')
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'nokogiri'
2
module Boilerpipe::SAX
  # Entry point that turns raw HTML into a text document by running
  # HTMLContentHandler over Nokogiri's SAX parser.
  class BoilerpipeHTMLParser
    # A complete <script> element: opening tag (attributes allowed), content
    # that may span several lines or be empty, and the closing tag.
    SCRIPT_ELEMENT = %r{<script\b[^>]*>.*?</script\s*>}mi

    # Parses +text+ (an HTML string) and returns the result of the content
    # handler's +text_document+ once the SAX run has finished.
    def self.parse(text)
      # script bug - delete script elements before parsing.
      # The pattern has to cope with attributes on the opening tag and with
      # multi-line bodies, otherwise most real-world scripts survive.
      text = text.gsub(SCRIPT_ELEMENT, '')

      # nokogiri uses libxml for mri and nekohtml for jruby
      # mri doesn't remove &nbsp; when missing the semicolon
      text = text.gsub(/(&nbsp) /, '\1; ')

      # use nokogiri to fix any bad tags, errors - keep experimenting with this
      text = Nokogiri::HTML(text).to_html

      handler = HTMLContentHandler.new
      noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
      noko_parser.parse(text)
      handler.text_document
    end
  end
end
@@ -0,0 +1,275 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
4
module Boilerpipe::SAX
  # SAX event handler that buffers the character data of an HTML document
  # and cuts it into Boilerpipe::Document::TextBlock objects whenever a tag
  # action (or a tag without a registered action) requests a flush.
  class HTMLContentHandler < Nokogiri::XML::SAX::Document
    attr_reader :in_ignorable_element, :label_stacks, :last_start_tag

    attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack

    # Marker tokens written into the token buffer (never the text buffer)
    # around anchor text; flush_block uses them to count linked words.
    ANCHOR_TEXT_START = "$\ue00a<"
    ANCHOR_TEXT_END = ">\ue00a$"

    # A token is a word when it contains at least one of these characters.
    # unicode regex - categories
    # \p{L} -- Letter
    # \p{Nd} -- a decimal digit
    # \p{Nl} -- a letterlike numeric character
    # \p{No} -- a numeric character of other type
    VALID_WORD_CHARACTER = /[\p{L}\p{Nd}\p{Nl}\p{No}]/

    def initialize
      @label_stacks = []
      @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
      @tag_level = 0
      @sb_last_was_whitespace = false
      @text_buffer = ''
      @token_buffer = ''
      @offset_blocks = 0
      @flush = false
      @block_tag_level = -1

      @in_body = 0
      @in_anchor_tag = 0
      @in_ignorable_element = 0
      @in_anchor_text = false
      @font_size_stack = []
      @last_start_tag = ''
      @last_end_tag = nil
      @last_event = nil
      @title = nil
      @text_blocks = []
    end

    # SAX callback for an opening tag. Pushes an empty slot on the label
    # stack, then lets the tag's action decide whether a flush is needed.
    # Tags without a registered action raise the tag level and force a flush.
    def start_element(name, attrs = [])
      @label_stacks << nil
      tag = name.upcase.intern

      tag_action = @tag_actions[tag]
      if tag_action
        @tag_level += 1 if tag_action.changes_tag_level?
        @flush = tag_action.start(self, name, attrs) | @flush
      else
        @tag_level += 1
        @flush = true
      end

      @last_event = :START_TAG
      @last_start_tag = tag
    end

    # SAX callback for character data. Performs a pending flush, then (unless
    # inside an ignorable element) appends the whitespace-normalised text to
    # the buffers.
    def characters(text)
      flush_block if @flush

      return if @in_ignorable_element != 0
      return if text.empty?

      # Collapse every whitespace run to one space. Work on a copy so the
      # string handed in by the parser is left untouched.
      normalized = text.gsub(/\s+/, ' ')

      # After normalisation, leading/trailing whitespace can only be a space.
      started_with_whitespace = normalized.start_with?(' ')
      ended_with_whitespace = normalized.end_with?(' ')
      trimmed = normalized.strip

      # add a single space if the block was only whitespace
      if trimmed.empty?
        append_space
        @last_event = :WHITESPACE
        return
      end

      # remember the tag level at which this block's first text appeared
      @block_tag_level = @tag_level if @block_tag_level == -1

      append_space if started_with_whitespace
      append_text(trimmed)
      append_space if ended_with_whitespace

      @last_event = :CHARACTERS
    end

    # SAX callback for a closing tag. Mirrors start_element: consults the tag
    # action, lowers the tag level where appropriate, flushes if requested
    # and pops this element's slot off the label stack.
    def end_element(name)
      tag = name.upcase.intern
      tag_action = @tag_actions[tag]
      if tag_action
        @flush = tag_action.end_tag(self, name) | @flush
      else
        @flush = true
      end

      @tag_level -= 1 if tag_action.nil? || tag_action.changes_tag_level?
      flush_block if @flush

      @last_event = :END_TAG
      @last_end_tag = tag
      @label_stacks.pop
    end

    # Turns the buffered text into a TextBlock and resets the buffers.
    #
    # Outside <body> nothing is emitted; the buffer only supplies the title
    # when the last opened tag was TITLE. Inside <body> the token buffer is
    # tokenized to count words, linked words and wrapped lines (80 columns).
    def flush_block
      @flush = false
      if @in_body == 0
        @title = @token_buffer.strip if :TITLE == @last_start_tag
        clear_buffers
        return
      end

      return if @token_buffer.empty?

      # A buffer holding nothing but one separator space has no content.
      # A single non-whitespace character, however, is a real block and must
      # fall through; returning here would leave it in the buffers and glue
      # it onto the following block.
      if @token_buffer.size == 1 && @sb_last_was_whitespace
        clear_buffers
        return
      end

      num_tokens = 0
      num_words = 0
      num_words_current_line = 0
      num_wrapped_lines = 0
      num_linked_words = 0
      # Start at -1 so the "+ 1" separator added per word does not count a
      # space in front of the first word ("don't count the first space").
      # This matches the reset to token_length after a wrap.
      current_line_length = -1
      max_line_length = 80

      tokens = ::Boilerpipe::UnicodeTokenizer.tokenize(@token_buffer)
      tokens.each do |token|
        if ANCHOR_TEXT_START == token
          @in_anchor_text = true
        elsif ANCHOR_TEXT_END == token
          @in_anchor_text = false
        elsif is_word?(token)
          num_tokens += 1
          num_words += 1
          num_words_current_line += 1
          num_linked_words += 1 if @in_anchor_text
          token_length = token.size
          current_line_length += token_length + 1

          if current_line_length > max_line_length
            num_wrapped_lines += 1
            current_line_length = token_length
            num_words_current_line = 1
          end
        else
          num_tokens += 1
        end
      end

      return if num_tokens == 0

      if num_wrapped_lines == 0
        # nothing wrapped: treat the whole block as one line
        num_words_in_wrapped_lines = num_words
        num_wrapped_lines = 1
      else
        # words on the unfinished last line are not part of a wrapped line
        num_words_in_wrapped_lines = num_words - num_words_current_line
      end

      text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
                                                         num_words,
                                                         num_linked_words,
                                                         num_words_in_wrapped_lines,
                                                         num_wrapped_lines, @offset_blocks)

      @offset_blocks += 1
      clear_buffers
      text_block.set_tag_level(@block_tag_level)
      add_text_block(text_block)
      @block_tag_level = -1
    end

    # Flushes whatever is still buffered and wraps the title and the
    # collected text blocks in a TextDocument.
    def text_document
      flush_block
      ::Boilerpipe::Document::TextDocument.new(@title, @text_blocks)
    end

    def token_buffer_size
      @token_buffer.size
    end

    # Truthy (the match index) when +word+ contains a letter or digit as
    # defined by VALID_WORD_CHARACTER, nil otherwise.
    def is_word?(word)
      word =~ VALID_WORD_CHARACTER
    end

    def increase_in_ignorable_element!
      @in_ignorable_element += 1
    end

    def decrease_in_ignorable_element!
      @in_ignorable_element -= 1
    end

    def increase_in_body!
      @in_body += 1
    end

    def decrease_in_body!
      @in_body -= 1
    end

    def in_ignorable_element?
      @in_ignorable_element > 0
    end

    def in_anchor_tag?
      @in_anchor_tag > 0
    end

    # Applies the labels of every label action on the stack of open elements
    # to +text_block+ and stores the block.
    def add_text_block(text_block)
      @label_stacks.each do |stack|
        next unless stack

        stack.each do |label_action|
          next unless label_action

          # label_action.labels is a list, so use the plural add_labels, the
          # same call LabelAction#add_to makes.
          # NOTE(review): TextBlock is not visible from this file; confirm
          # that add_labels adds each element of the list individually.
          text_block.add_labels(label_action.labels)
        end
      end
      @text_blocks << text_block
    end

    # append space if last character wasn't already one
    def append_space
      return if @sb_last_was_whitespace
      @sb_last_was_whitespace = true

      @text_buffer << ' '
      @token_buffer << ' '
    end

    # Appends +text+ to both the text and the token buffer.
    def append_text(text)
      @sb_last_was_whitespace = false
      @text_buffer << text
      @token_buffer << text
    end

    # Appends +token+ to the token buffer only (used for the anchor markers,
    # which must not show up in the extracted text).
    def append_token(token)
      @token_buffer << token
    end

    # Registers +label_action+ for the element that is currently open,
    # replacing the nil placeholder pushed by start_element with a list on
    # first use.
    def add_label_action(label_action)
      label_stack = @label_stacks.last
      if label_stack.nil?
        label_stack = []
        @label_stacks.pop
        @label_stacks << label_stack
      end
      label_stack << label_action
    end

    private

    def clear_buffers
      @token_buffer = ''
      @text_buffer = ''
    end
  end
end
@@ -0,0 +1,51 @@
1
module Boilerpipe::SAX
  # Supplies the table that maps upper-cased tag name symbols to the tag
  # action object handling that tag.
  class TagActionMap
    # Builds and returns a new Hash on every call; each key gets its own
    # freshly created action instance.
    def self.tag_actions
      labels = ::Boilerpipe::Labels
      actions = {}

      [:STYLE, :SCRIPT, :OPTION, :OBJECT, :EMBED, :APPLET, :LINK].each do |tag|
        actions[tag] = TagActions::IgnorableElement.new
      end

      actions[:A] = TagActions::AnchorText.new
      actions[:BODY] = TagActions::Body.new

      [:STRIKE, :U, :B, :I, :EM, :STRONG, :SPAN].each do |tag|
        actions[tag] = TagActions::InlineNoWhitespace.new
      end

      # New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
      actions[:SUP] = TagActions::InlineNoWhitespace.new

      # New in 1.2
      [:CODE, :TT, :SUB, :VAR].each do |tag|
        actions[tag] = TagActions::InlineNoWhitespace.new
      end

      [:ABBR, :ACRONYM].each do |tag|
        actions[tag] = TagActions::InlineWhitespace.new
      end
      actions[:FONT] = TagActions::InlineNoWhitespace.new

      # added in 1.1.1
      actions[:NOSCRIPT] = TagActions::IgnorableElement.new

      # New in 1.3
      [
        [:LI, [:LI]],
        [:H1, [:H1, :HEADING]],
        [:H2, [:H2, :HEADING]],
        [:H3, [:H3, :HEADING]]
      ].each do |tag, tag_labels|
        actions[tag] = TagActions::BlockTagLabel.new(labels::LabelAction.new(tag_labels))
      end

      actions
    end
  end
end
51
+
@@ -0,0 +1,49 @@
1
module Boilerpipe::SAX::TagActions
  # Marks this tag as "anchor" (this should usually only be set for the <A>
  # tag). Anchor tags may not be nested. If a nested anchor is encountered
  # anyway (e.g. because of a parser bug), the handler's anchor counter is
  # rebalanced and processing continues; no exception is raised.
  class AnchorText
    # Called for the opening tag. Increments the handler's anchor counter
    # and, for a non-nested anchor outside ignorable elements, writes the
    # anchor start marker. Always returns false (no flush requested).
    def start(handler, name, attrs)
      if handler.in_anchor_tag?
        handler.in_anchor_tag += 1
        nested_achor_tag_error_recovering(handler, name)
        # return a real boolean, like every other path of start/end_tag
        return false
      end

      handler.in_anchor_tag += 1
      append_anchor_text_start(handler) unless handler.in_ignorable_element?
      false
    end

    # Called for the closing tag. Decrements the anchor counter and writes
    # the anchor end marker once the outermost anchor is closed. Always
    # returns false (no flush requested).
    def end_tag(handler, name)
      # A close without an open anchor (for example the outer </a> left over
      # after nested-anchor recovery) must not push the counter below zero
      # or emit a second end marker.
      return false unless handler.in_anchor_tag?

      handler.in_anchor_tag -= 1
      append_anchor_text_end(handler) unless handler.in_anchor_tag? || handler.in_ignorable_element?
      false
    end

    def changes_tag_level?
      true
    end

    # Writes a separating space followed by the anchor start marker token.
    def append_anchor_text_start(handler)
      handler.append_space
      handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START)
      handler.append_token(' ')
    end

    # Writes a separating space followed by the anchor end marker token.
    def append_anchor_text_end(handler)
      handler.append_space
      handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END)
      handler.append_token(' ')
    end

    # Undoes the counter increment made for a nested <A> by treating it as
    # immediately closed.
    def nested_achor_tag_error_recovering(handler, name)
      # - dunno about nokogiri???????
      # as nested A elements are not allowed per specification, we
      # are probably reaching this branch due to a bug in the XML parser
      #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
      end_tag(handler, name)
    end
  end
end
@@ -0,0 +1,17 @@
1
module Boilerpipe::SAX::TagActions
  # Tag action for a plain "block-level" element: both the opening and the
  # closing tag ask the handler to flush, and the tag counts towards the
  # tag level.
  class BlockLevel
    # Opening tag: always requests a flush.
    def start(handler, name, attrs)
      true
    end

    # Closing tag: always requests a flush.
    def end_tag(handler, name)
      true
    end

    # Block-level tags take part in tag-level counting.
    def changes_tag_level?
      true
    end
  end
end
@@ -0,0 +1,22 @@
1
module Boilerpipe::SAX::TagActions
  # Tag action for block-level elements that additionally register a label
  # action with the handler, so that it is applied to the generated
  # text block.
  class BlockTagLabel
    # label_action - the action handed to the handler on every opening tag.
    def initialize(label_action)
      @label_action = label_action
    end

    # Opening tag: registers the label action, then requests a flush.
    def start(handler, name, attrs)
      handler.add_label_action(@label_action)
      true
    end

    # Closing tag: requests a flush.
    def end_tag(handler, name)
      true
    end

    # These tags take part in tag-level counting.
    def changes_tag_level?
      true
    end
  end
end