boilerpipe-ruby 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +27 -6
  4. data/Rakefile +8 -0
  5. data/boilerpipe-ruby.gemspec +10 -9
  6. data/lib/boilerpipe.rb +30 -0
  7. data/lib/boilerpipe/document/text_block.rb +113 -0
  8. data/lib/boilerpipe/document/text_document.rb +44 -0
  9. data/lib/boilerpipe/errors.rb +1 -0
  10. data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
  11. data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
  12. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
  13. data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
  14. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
  15. data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
  16. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
  17. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
  18. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
  19. data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
  20. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
  21. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
  22. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
  23. data/lib/boilerpipe/labels/default.rb +17 -0
  24. data/lib/boilerpipe/labels/label_action.rb +17 -0
  25. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
  26. data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
  27. data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
  28. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
  29. data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
  30. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
  31. data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
  32. data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
  33. data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
  34. data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
  35. data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
  36. data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
  37. data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
  38. data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
  39. data/lib/boilerpipe/version.rb +1 -1
  40. data/stuff.txt +4 -0
  41. metadata +61 -15
@@ -0,0 +1,121 @@
1
+ # encoding: utf-8
2
+ require 'set'
3
+
4
+ # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
5
+ # some heuristics which are quite specific to the news domain.
6
+
7
+ # we create a list of potential titles from the page title
8
+ # then we look at every text block and if the text block
9
+ # contains a potential title - we set that text block label as :TITLE
10
+
11
module Boilerpipe
  module Filters
    # Labels the first TextBlock whose normalized text equals one of the
    # candidate titles derived from the page title with :TITLE.
    class DocumentTitleMatchClassifier
      # Punctuation removed from a block's text before the second lookup.
      # The hyphen is escaped on purpose: an unescaped ".-:" is a character
      # range that also covers "/" and the digits 0-9.
      REMOVE_CHARACTERS = /[?!.\-:]+/

      # Separators used to pick the longest part of the title.
      TITLE_SEPARATORS = /[ ]*[|»,:()-][ ]*/

      # Set of normalized (downcased, stripped) candidate title strings.
      attr_reader :potential_titles

      # title - the page title String, or nil. A nil or blank title yields
      #         an empty candidate set.
      def initialize(title)
        @potential_titles = Set.new
        generate_potential_titles(title)
      end

      # Adds :TITLE to the first text block matching a candidate title, either
      # directly or after REMOVE_CHARACTERS has been stripped from its text.
      # Only one block is labelled. Returns doc.
      def process(doc)
        return doc if @potential_titles.empty?

        doc.text_blocks.each do |tb|
          text = normalize(tb.text)
          without_punctuation = text.gsub(REMOVE_CHARACTERS, '').strip

          next unless @potential_titles.member?(text) ||
                      @potential_titles.member?(without_punctuation)

          tb.add_label :TITLE
          break
        end

        doc
      end

      private

      # Replaces non-breaking spaces with plain spaces, drops apostrophes,
      # strips surrounding whitespace and downcases.
      # The non-breaking space must be written in a double-quoted literal;
      # in single quotes the escape is six literal characters.
      def normalize(text)
        text.tr("\u00a0", ' ').delete("'").strip.downcase
      end

      # Fills @potential_titles with the normalized title and its variants.
      def generate_potential_titles(title)
        return if title.nil?

        title = normalize(title)
        # A blank title would register "" as a candidate and label empty blocks.
        return if title.empty?

        @potential_titles << title

        # Narrower separator sets (|»-  |»:  |»:()  |»:()-) were tried
        # previously and disabled as unnecessary.
        # A separator set containing \u00a0 is pointless: it is replaced above.
        longest = longest_part(title, TITLE_SEPARATORS)
        @potential_titles << longest if longest

        add_potential_titles(title, /[ ]+[|][ ]+/, 4)
        add_potential_titles(title, /[ ]+[-][ ]+/, 4)

        @potential_titles << title.sub(/ - [^-]+$/, '') # remove right of -
        @potential_titles << title.sub(/^[^-]+ - /, '') # remove left of -
      end

      # Splits title on regex and returns the stripped part that has more
      # words, or more characters, than the best part seen so far. Parts
      # containing ".com" are skipped. Returns nil when the split yields
      # exactly one part or no usable part remains.
      def longest_part(title, regex)
        parts = title.split(regex)
        return nil if parts.size == 1

        best_word_count = 0
        best = ''

        parts.each do |part|
          next if part.include?('.com')

          word_count = number_of_words(part)
          if word_count > best_word_count || part.size > best.size
            best_word_count = word_count
            best = part
          end
        end

        best.empty? ? nil : best.strip
      end

      # Adds every part of title (split on regex) that has at least min_words
      # words and does not contain ".com". Does nothing when the split yields
      # exactly one part.
      def add_potential_titles(title, regex, min_words)
        parts = title.split(regex)
        return if parts.size == 1

        parts.each do |part|
          next if part.include?('.com')

          @potential_titles << part if number_of_words(part) >= min_words
        end
      end

      # Counts the pieces of s separated by runs of spaces. Inside a character
      # class \b is the backspace character, not a word boundary.
      def number_of_words(s)
        s.split(/[\b ]+/).size
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
+
2
+ # Marks all TextBlocks "content" which are between the headline and the part that has
3
+ # already been marked content, if they are marked MIGHT_BE_CONTENT.
4
+ # This filter is quite specific to the news domain.
5
+ # Used downstream of KeepLargestBlockFilter, since that is what sets MIGHT_BE_CONTENT.
6
+
7
+
8
module Boilerpipe
  module Filters
    # Promotes :MIGHT_BE_CONTENT blocks lying between the title block and the
    # first content block to content.
    class ExpandTitleToContentFilter
      # Finds the last :TITLE block at or before the first content block, then
      # sets content = true on every :MIGHT_BE_CONTENT block from the title up
      # to (not including) that content block. Returns doc; nothing is changed
      # when there is no title, no content, or the first content block is not
      # after the title.
      def self.process(doc)
        tbs = doc.text_blocks
        title = nil
        content_start = nil

        tbs.each_with_index do |tb, i|
          # The title check comes first so a block that is both :TITLE and
          # content yields title == content_start, which is then rejected.
          title = i if tb.has_label?(:TITLE)
          next unless tb.is_content?

          content_start = i
          break # nothing after the first content block affects the result
        end

        return doc if no_title_with_subsequent_content?(content_start, title)

        tbs[title...content_start].each do |tb|
          tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
        end

        doc
      end

      # True when either index is nil or the content block does not come
      # strictly after the title block.
      def self.no_title_with_subsequent_content?(content_start, title)
        title.nil? || content_start.nil? || content_start <= title
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Boilerpipe::Filters
  # Base class providing a word-count helper to heuristic filters.
  class HeuristicFilterBase
    # Returns tb.num_words when tb.text_density is at least
    # min_text_density (default 9.0); otherwise returns 0.
    def self.num_full_text_words(tb, min_text_density = 9.0)
      return tb.num_words if tb.text_density >= min_text_density

      0
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # Marks all blocks as "non-content" that occur after blocks that have been
2
+ # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
+ # number of words in content blocks occur before this mark (default: 60).
4
+ # This can be used in conjunction with an upstream TerminatingBlocksFinder.
5
+
6
module Boilerpipe::Filters
  # Demotes every block from an effective :INDICATES_END_OF_TEXT marker onward.
  class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
    # Walks the text blocks in order, accumulating full-text words of content
    # blocks. Once a block labelled :INDICATES_END_OF_TEXT is reached with at
    # least min_num_words accumulated, that block and all later blocks get
    # content = false. Returns doc.
    def self.process(doc, min_num_words = 60)
      word_total = 0
      cutoff_reached = false

      doc.text_blocks.each do |block|
        marks_end = block.has_label?(:INDICATES_END_OF_TEXT)
        word_total += num_full_text_words(block) if block.is_content?
        cutoff_reached ||= (marks_end && word_total >= min_num_words)
        block.content = false if cutoff_reached
      end

      doc
    end
  end
end
@@ -0,0 +1,62 @@
1
+
2
+ # Keeps the largest TextBlock only (by the number of words). In case of
3
+ # more than one block with the same number of words, the first block is chosen.
4
+ # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
5
+ #
6
+ # Note that, by default, only TextBlocks marked as "content" are taken into
7
+ # consideration.
8
+
9
module Boilerpipe
  module Filters
    # Keeps only the content block with the most words; every other block is
    # marked non-content and labelled :MIGHT_BE_CONTENT. Optionally re-enables
    # neighbouring blocks on the same tag level as the largest block.
    class KeepLargestBlockFilter
      # expand_to_same_level_text - when true, neighbours on the largest
      #                             block's tag level are turned back on.
      # min_words                 - minimum num_words a neighbour needs.
      def initialize(expand_to_same_level_text, min_words)
        @expand_to_same_level_text = expand_to_same_level_text
        @min_words = min_words
      end

      INSTANCE = KeepLargestBlockFilter.new(false, 0)
      INSTANCE_EXPAND_TO_SAME_TAGLEVEL = KeepLargestBlockFilter.new(true, 0)
      INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS = KeepLargestBlockFilter.new(true, 150)

      # Applies the filter to doc.text_blocks and returns doc.
      # Documents with fewer than two blocks are returned untouched.
      def process(doc)
        tbs = doc.text_blocks
        return doc if tbs.size < 2

        # nil when no block is currently marked as content
        largest_block = tbs.select(&:is_content?).max_by(&:num_words)

        tbs.each do |tb|
          # identity comparison: only the selected object itself is kept
          if tb.equal?(largest_block)
            tb.content = true
            tb.add_label :VERY_LIKELY_CONTENT
          else
            tb.content = false
            tb.add_label :MIGHT_BE_CONTENT
          end
        end

        # Without a largest block there is no tag level to expand from;
        # asking nil for its tag_level would raise NoMethodError.
        return doc unless @expand_to_same_level_text && largest_block

        level = largest_block.tag_level
        n = tbs.index { |tb| tb.equal?(largest_block) }

        # expand blocks to the left, nearest first
        expand_tag_level(tbs[0...n].reverse, level, @min_words)

        # expand blocks to the right
        expand_tag_level(tbs[(n + 1)..-1], level, @min_words)

        doc
      end

      # Walks tbs in order and sets content = true on blocks whose tag_level
      # equals level and whose num_words is at least min_words. Stops at the
      # first block whose tag_level is lower than level; deeper blocks are
      # skipped.
      def expand_tag_level(tbs, level, min_words)
        tbs.each do |tb|
          break if tb.tag_level < level
          next unless tb.tag_level == level

          tb.content = true if tb.num_words >= min_words
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+
2
+ # Marks all blocks as content that:
3
+ # are on the same tag-level as very likely main content
4
+ # (usually the level of the largest block)
5
+ # have a significant number of words, currently: at least 100
6
+ # Used downstream of KeepLargestBlockFilter
7
+
8
module Boilerpipe
  module Filters
    # Marks large non-content blocks as content when they sit on the same tag
    # level as the block labelled :VERY_LIKELY_CONTENT.
    class LargeBlockSameTagLevelToContentFilter
      # doc       - object exposing text_blocks.
      # min_words - minimum num_words a block needs to be promoted
      #             (default 100).
      #
      # Uses the tag_level of the first block that is content and labelled
      # :VERY_LIKELY_CONTENT. Returns doc; nothing changes when no such block
      # exists.
      def self.process(doc, min_words = 100)
        anchor = doc.text_blocks.find do |tb|
          tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
        end
        return doc if anchor.nil?

        tag_level = anchor.tag_level

        doc.text_blocks.each do |tb|
          next if tb.is_content?

          tb.content = true if tb.num_words >= min_words && tb.tag_level == tag_level
        end

        doc
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # Marks nested list-item blocks after the end of the main content as content.
2
+ # Used downstream of keep_largest_block_filter.
3
+
4
module Boilerpipe::Filters
  # Turns list-item blocks that directly follow the :VERY_LIKELY_CONTENT block,
  # and are nested deeper than it, into content.
  class ListAtEndFilter
    MAX = 99999999

    # Scans the text blocks in order. A content block labelled
    # :VERY_LIKELY_CONTENT sets the reference tag level; following blocks that
    # qualify as nested list items get content = true, and the first block
    # that does not qualify resets the reference level to MAX. Returns doc.
    def self.process(doc)
      reference_level = MAX

      doc.text_blocks.each do |block|
        if block.is_content? && block.has_label?(:VERY_LIKELY_CONTENT)
          reference_level = block.tag_level
          next
        end

        if nested_list_item?(block, reference_level)
          block.content = true
        else
          reference_level = MAX
        end
      end

      doc
    end

    # True for a link-free :LI block labelled :MIGHT_BE_CONTENT whose
    # tag_level is greater than reference_level.
    def self.nested_list_item?(block, reference_level)
      block.tag_level > reference_level &&
        block.has_label?(:MIGHT_BE_CONTENT) &&
        block.has_label?(:LI) &&
        block.link_density == 0
    end
    private_class_method :nested_list_item?
  end
end
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
+
3
+ # Classifies TextBlocks as content/not-content through rules that have been determined
4
+ # using the C4.8 machine learning algorithm, as described in the paper
5
+ # "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
6
+ # using number of words per block and link density per block.
7
+
8
module Boilerpipe
  module Filters
    # Sets content on every text block from word-count and link-density rules
    # applied to the block and its two neighbours.
    class NumWordsRulesClassifier
      # Classifies each block of doc.text_blocks using its previous and next
      # block; the first and last block get the TextBlock.empty_start
      # placeholder as the missing neighbour. Returns doc.
      def self.process(doc)
        empty = Boilerpipe::Document::TextBlock.empty_start
        text_blocks = [empty] + doc.text_blocks + [empty]

        text_blocks.each_cons(3) do |prev, current, nxt|
          current.content = classify(prev, current, nxt)
        end

        doc
      end

      # NOTE: a bare `private` does not apply to `def self.` methods, so this
      # method has always been publicly callable; it is left public so that
      # existing callers keep working.
      #
      # Returns true when current should be treated as content, else false.
      def self.classify(prev, current, nxt)
        return false if current.link_density > 0.333333

        if prev.link_density <= 0.555556
          current.num_words > 16 || nxt.num_words > 15 || prev.num_words > 4
        else
          current.num_words > 40 || nxt.num_words > 17
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # encoding: utf-8
2
+
3
+ # Finds blocks which are potentially indicating the end of an article
4
+ # text and marks them with INDICATES_END_OF_TEXT. This can be used
5
+ # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
6
+
7
+
8
module Boilerpipe
  module Filters
    # Labels short blocks that look like the end of an article (comment
    # sections, rating prompts, ...) with :INDICATES_END_OF_TEXT.
    class TerminatingBlocksFinder
      # Phrases the (downcased) block text may start with.
      PREFIXES = [
        'comments',
        '© reuters',
        'please rate this',
        'post a comment'
      ].freeze

      # "<number> comments" / "<number> users responded in", anchored with \A
      # so that only the very start of the text matches; ^ would also match
      # after any newline inside the block.
      COUNTED_PREFIX = /\A\d+ (comments|users responded in)/

      # Phrases the (downcased) block text may contain anywhere.
      # TODO: add and test 'leave a reply', 'leave a comment',
      # 'show comments' and 'share this:' (candidates must be lower case,
      # because they are compared against downcased text).
      FRAGMENTS = [
        'what you think...',
        'add your comment',
        'add comment',
        'reader views',
        'have your say',
        'reader comments',
        'rätta artikeln'
      ].freeze

      # Text that must match the whole (downcased) block.
      CLOSED_FEEDBACK = 'thanks for your comments - this feedback is now closed'.freeze

      # Inspects every block with fewer than 15 words. The text is stripped
      # first so surrounding whitespace cannot defeat the prefix checks.
      # Blocks of at least 8 characters are labelled when finds_match? accepts
      # them; shorter blocks are labelled when they consist purely of link
      # text reading "comment" in any letter case. Returns doc.
      def self.process(doc)
        doc.text_blocks.each do |tb|
          next unless tb.num_words < 15

          text = tb.text.strip
          if text.length >= 8
            tb.labels << :INDICATES_END_OF_TEXT if finds_match?(text.downcase)
          elsif tb.link_density == 1.0 && text.downcase == 'comment'
            tb.labels << :INDICATES_END_OF_TEXT
          end
        end

        doc
      end

      # text - already downcased block text.
      #
      # Returns true when text matches one of the known terminating phrases,
      # false otherwise.
      def self.finds_match?(text)
        text.start_with?(*PREFIXES) ||
          !(text =~ COUNTED_PREFIX).nil? ||
          FRAGMENTS.any? { |fragment| text.include?(fragment) } ||
          text == CLOSED_FEEDBACK
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+
2
+ # Marks trailing headline TextBlocks (those that have the label :HEADING)
3
+ # as boilerplate. Trailing means they are marked content and are
4
+ # below any other content block.
5
+
6
module Boilerpipe
  module Filters
    # Demotes trailing headings: content blocks labelled :HEADING that have no
    # other content block after them.
    class TrailingHeadlineToBoilerplateFilter
      # Walks the text blocks from the END of the document backwards, skipping
      # non-content blocks. Each content block labelled :HEADING gets
      # content = false; the walk stops at the first content block that is not
      # a heading. Iterating forwards would strip leading headings instead.
      # Returns doc.
      def self.process(doc)
        doc.text_blocks.reverse_each do |tb|
          next unless tb.is_content?
          break unless tb.has_label?(:HEADING)

          tb.content = false
        end

        doc
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Boilerpipe
  module Labels
    # Default label symbols attached to text blocks.
    module Default
      # The known labels. They used to be listed as bare symbol literals,
      # which are no-op statements; keeping them in a frozen constant makes
      # the list available at runtime.
      ALL = %i[
        TITLE
        ARTICLE_METADATA
        INDICATES_END_OF_TEXT
        MIGHT_BE_CONTENT
        VERY_LIKELY_CONTENT
        STRICTLY_NOT_CONTENT
        HR
        LI
        HEADING
        H1
        H2
        H3
      ].freeze

      MARKUP_PREFIX = '<'.freeze
    end
  end
end