boilerpipe-ruby 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +27 -6
- data/Rakefile +8 -0
- data/boilerpipe-ruby.gemspec +10 -9
- data/lib/boilerpipe.rb +30 -0
- data/lib/boilerpipe/document/text_block.rb +113 -0
- data/lib/boilerpipe/document/text_document.rb +44 -0
- data/lib/boilerpipe/errors.rb +1 -0
- data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
- data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
- data/lib/boilerpipe/labels/default.rb +17 -0
- data/lib/boilerpipe/labels/label_action.rb +17 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
- data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
- data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
- data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
- data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
- data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
- data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
- data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
- data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
- data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- data/stuff.txt +4 -0
- metadata +61 -15
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
require 'set'
|
|
3
|
+
|
|
4
|
+
# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
|
|
5
|
+
# some heuristics which are quite specific to the news domain.
|
|
6
|
+
|
|
7
|
+
# we create a list of potential titles from the page title
|
|
8
|
+
# then we look at every text block and if the text block
|
|
9
|
+
# contains a potential title - we set that text block label as :TITLE
|
|
10
|
+
|
|
11
|
+
module Boilerpipe::Filters
|
|
12
|
+
class DocumentTitleMatchClassifier
  # Non-breaking space. Must be a double-quoted literal: in single quotes
  # '\u00a0' is the six literal characters backslash, u, 0, 0, a, 0.
  NBSP = "\u00a0"

  # Punctuation removed from a block's text before the second title lookup.
  # The hyphen is escaped so it is a literal '-'; an unescaped `.-:` is a
  # character range that also covers '/' and the digits 0-9.
  REMOVE_CHARACTERS = /[?!.\-:]+/

  attr_reader :potential_titles

  # Builds the set of candidate title strings from the page title.
  # A nil title leaves the set empty.
  def initialize(title)
    @potential_titles = Set.new
    generate_potential_titles(title)
  end

  # Adds the :TITLE label to the first text block whose normalized text
  # (or normalized text with punctuation removed) is one of the potential
  # titles. Stops after the first match. Always returns doc.
  def process(doc)
    return doc if @potential_titles.empty?

    doc.text_blocks.each do |tb|
      text = normalize(tb.text)
      without_punctuation = text.gsub(REMOVE_CHARACTERS, '').strip

      next unless @potential_titles.member?(text) ||
                  @potential_titles.member?(without_punctuation)

      tb.add_label :TITLE
      break
    end

    doc
  end

  private

  # Replaces non-breaking spaces with plain spaces, drops apostrophes,
  # trims surrounding whitespace and lower-cases the result.
  def normalize(text)
    text.gsub(NBSP, ' ').gsub("'", '').strip.downcase
  end

  # Fills @potential_titles with the normalized title and several
  # substrings of it obtained by splitting on common separators.
  def generate_potential_titles(title)
    return if title.nil?

    title = normalize(title)
    @potential_titles << title

    # Variants of this call with smaller separator sets were disabled by the
    # original author as unnecessary; only the widest set is used. A variant
    # that also split on the non-breaking space was dropped because those
    # are already replaced during normalization.
    longest = longest_part(title, /[ ]*[|»,:()-][ ]*/)
    @potential_titles << longest if longest

    add_potential_titles(title, /[ ]+[|][ ]+/, 4)
    add_potential_titles(title, /[ ]+[-][ ]+/, 4)

    @potential_titles << title.sub(/ - [^-]+$/, '') # remove right of -
    @potential_titles << title.sub(/^[^-]+ - /, '') # remove left of -
  end

  # Splits title on regex and returns the stripped "longest" part, skipping
  # parts containing ".com". A part replaces the current pick when it has
  # more words or more characters than it. Returns nil when the split
  # produces exactly one part or no usable part is found.
  def longest_part(title, regex)
    parts = title.split regex
    return nil if parts.size == 1

    longest_num_words = 0
    longest_part = ''

    parts.each do |part|
      next if part =~ /[.]com/
      num_words = number_of_words(part)

      if num_words > longest_num_words || part.size > longest_part.size
        longest_num_words = num_words
        longest_part = part
      end
    end

    longest_part.empty? ? nil : longest_part.strip
  end

  # Splits title on regex and adds every part with at least min_words
  # words to @potential_titles, skipping parts containing ".com".
  # Does nothing when the split produces exactly one part.
  def add_potential_titles(title, regex, min_words)
    parts = title.split regex
    return if parts.size == 1

    parts.each do |part|
      next if part =~ /[.]com/
      num_words = number_of_words(part)

      @potential_titles << part if num_words >= min_words
    end
  end

  # Counts the pieces obtained by splitting on runs of spaces
  # (inside a character class, \b is the backspace character).
  def number_of_words(s)
    s.split(/[\b ]+/).size
  end

end
|
|
121
|
+
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
|
|
2
|
+
# Marks all TextBlocks "content" which are between the headline and the part that has
|
|
3
|
+
# already been marked content, if they are marked MIGHT_BE_CONTENT.
|
|
4
|
+
# This filter is quite specific to the news domain.
|
|
5
|
+
# used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
module Boilerpipe::Filters
|
|
9
|
+
class ExpandTitleToContentFilter
  # Promotes blocks labelled :MIGHT_BE_CONTENT to content when they sit
  # between the last :TITLE block seen before the first content block and
  # that first content block itself. Returns doc in every case.
  def self.process(doc)
    blocks = doc.text_blocks
    title_index = nil
    first_content_index = nil

    # Scan up to (and including) the first content block, remembering the
    # most recent title position on the way.
    blocks.each_with_index do |block, index|
      title_index = index if block.has_label?(:TITLE)

      if block.is_content?
        first_content_index = index
        break
      end
    end

    return doc if no_title_with_subsequent_content?(first_content_index, title_index)

    blocks[title_index...first_content_index].each do |block|
      block.content = true if block.has_label?(:MIGHT_BE_CONTENT)
    end

    doc
  end

  # True when there is nothing to expand: no title was found, no content
  # block was found, or the first content block is not after the title.
  def self.no_title_with_subsequent_content?(content_start, title)
    return true if title.nil? || content_start.nil?

    content_start <= title
  end
end
|
|
43
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Marks all blocks as "non-content" that occur after blocks that have been
|
|
2
|
+
# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
|
|
3
|
+
# number of words in content blocks occur before this mark (default: 60).
|
|
4
|
+
# This can be used in conjunction with an upstream TerminatingBlocksFinder.
|
|
5
|
+
|
|
6
|
+
module Boilerpipe::Filters
|
|
7
|
+
class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
  # Walks the text blocks in order and, once an :INDICATES_END_OF_TEXT block
  # is reached after at least min_num_words words of content have been
  # counted, marks that block and every following block as non-content.
  # Word counts come from num_full_text_words, which is not defined in
  # this class. Returns doc.
  def self.process(doc, min_num_words=60)
    words_so_far = 0
    past_end = false

    doc.text_blocks.each do |block|
      marks_end = block.has_label?(:INDICATES_END_OF_TEXT)
      words_so_far += num_full_text_words(block) if block.is_content?

      # The end marker only counts once enough content has been seen;
      # after that the flag stays set for the rest of the document.
      past_end = true if marks_end && words_so_far >= min_num_words

      block.content = false if past_end
    end

    doc
  end
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
|
|
2
|
+
# Keeps the largest TextBlock only (by the number of words). In case of
|
|
3
|
+
# more than one block with the same number of words, the first block is chosen.
|
|
4
|
+
# All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
|
|
5
|
+
#
|
|
6
|
+
# Note that, by default, only TextBlocks marked as "content" are taken into
|
|
7
|
+
# consideration.
|
|
8
|
+
|
|
9
|
+
module Boilerpipe::Filters
|
|
10
|
+
class KeepLargestBlockFilter

  # expand_to_same_level_text - when truthy, neighbouring blocks on the same
  #                             tag level as the largest block are also kept.
  # min_words                 - minimum word count a neighbouring block needs
  #                             in order to be kept during that expansion.
  def initialize(expand_to_same_level_text, min_words)
    @expand_to_same_level_text = expand_to_same_level_text
    @min_words = min_words
  end

  INSTANCE = KeepLargestBlockFilter.new(false, 0)
  INSTANCE_EXPAND_TO_SAME_TAGLEVEL = KeepLargestBlockFilter.new(true, 0)
  INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS = KeepLargestBlockFilter.new(true, 150)

  # Keeps only the content block with the most words: it is marked content
  # and labelled :VERY_LIKELY_CONTENT, every other block is marked
  # non-content and labelled :MIGHT_BE_CONTENT. With expansion enabled,
  # blocks around the largest one that share its tag level are re-marked as
  # content. Documents with fewer than two blocks are left untouched.
  #
  # Returns doc, like the other filters in this namespace.
  def process(doc)
    tbs = doc.text_blocks
    return doc if tbs.size < 2

    # find tb with the most words; nil when no block is marked content
    largest_block = tbs.select(&:is_content?).max_by(&:num_words)
    level = (@expand_to_same_level_text && largest_block) ? largest_block.tag_level : -1

    # set labels for text blocks
    tbs.each do |tb|
      if tb == largest_block
        tb.content = true
        tb.add_label :VERY_LIKELY_CONTENT
      else
        tb.content = false
        tb.add_label :MIGHT_BE_CONTENT
      end
    end

    n = tbs.index(largest_block)
    if @expand_to_same_level_text && n
      # expand blocks to the left, nearest block first
      expand_tag_level(tbs[0...n].reverse, level, @min_words)

      # expand blocks to the right
      expand_tag_level(tbs[n+1..-1], level, @min_words)
    end

    doc
  end

  # Walks tbs in the given order and marks blocks on exactly `level` as
  # content when they have at least min_words words. Deeper blocks are
  # skipped; the walk stops at the first block above `level`.
  def expand_tag_level(tbs, level, min_words)
    tbs.each do |tb|
      if tb.tag_level < level
        break
      elsif tb.tag_level == level
        tb.content = true if tb.num_words >= min_words
      end
    end
  end

end
|
|
62
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
|
|
2
|
+
# Marks all blocks as content that:
|
|
3
|
+
# are on the same tag-level as very likely main content
|
|
4
|
+
# (usually the level of the largest block)
|
|
5
|
+
# have a significant number of words, currently: at least 100
|
|
6
|
+
# Used downstream of KeepLargestBlockFilter
|
|
7
|
+
|
|
8
|
+
module Boilerpipe::Filters
|
|
9
|
+
class LargeBlockSameTagLevelToContentFilter
  # Finds the first content block labelled :VERY_LIKELY_CONTENT and marks
  # every non-content block with at least 100 words on that block's tag
  # level as content. Returns doc unchanged when no such block exists.
  def self.process(doc)
    anchor = doc.text_blocks.find do |block|
      block.is_content? && block.has_label?(:VERY_LIKELY_CONTENT)
    end
    return doc if anchor.nil?

    anchor_level = anchor.tag_level

    doc.text_blocks.each do |block|
      promote = !block.is_content? &&
                block.num_words >= 100 &&
                block.tag_level == anchor_level
      block.content = true if promote
    end

    doc
  end
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Marks nested list-item blocks after the end of the main content as content.
|
|
2
|
+
# Used downstream of keep_largest_block_filter.
|
|
3
|
+
|
|
4
|
+
module Boilerpipe::Filters
|
|
5
|
+
class ListAtEndFilter
  # Sentinel tag level meaning "no main-content block directly precedes".
  MAX = 99999999

  # Marks list-item blocks as content when they directly follow a content
  # block labelled :VERY_LIKELY_CONTENT (or a run of such list items), are
  # nested deeper than that block, carry both :MIGHT_BE_CONTENT and :LI and
  # have a link density of zero. Any other block ends the run. Returns doc.
  def self.process(doc)
    anchor_level = MAX

    doc.text_blocks.each do |block|
      if block.is_content? && block.has_label?(:VERY_LIKELY_CONTENT)
        anchor_level = block.tag_level
        next
      end

      nested_plain_item = block.tag_level > anchor_level &&
                          block.has_label?(:MIGHT_BE_CONTENT) &&
                          block.has_label?(:LI) &&
                          block.link_density == 0

      if nested_plain_item
        block.content = true
      else
        # run is broken; nothing can exceed the sentinel level
        anchor_level = MAX
      end
    end

    doc
  end
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
# Classifies TextBlocks as content/not-content through rules that have been determined
|
|
4
|
+
# using the C4.8 machine learning algorithm, as described in the paper
|
|
5
|
+
# "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
|
|
6
|
+
# using number of words per block and link density per block.
|
|
7
|
+
|
|
8
|
+
module Boilerpipe::Filters
|
|
9
|
+
class NumWordsRulesClassifier
  # Sets the content flag of every text block from the word counts and link
  # densities of the block itself and its two neighbours. The first and last
  # block use TextBlock.empty_start as their missing neighbour. Returns doc.
  def self.process(doc)
    sentinel = Boilerpipe::Document::TextBlock.empty_start
    padded = [sentinel].concat(doc.text_blocks).push(sentinel)

    padded.each_cons(3) do |before, block, after|
      block.content = classify(before, block, after)
    end

    doc
  end

  # Decision rules for a single block; returns true for content.
  # Note: this is a singleton method, so a bare `private` would not hide
  # it; it is callable as NumWordsRulesClassifier.classify.
  def self.classify(prev, current, nxt)
    return false if current.link_density > 0.333333

    if prev.link_density <= 0.555556
      current.num_words > 16 || nxt.num_words > 15 || prev.num_words > 4
    else
      current.num_words > 40 || nxt.num_words > 17
    end
  end
end
|
|
42
|
+
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
# Finds blocks which are potentially indicating the end of an article
|
|
4
|
+
# text and marks them with INDICATES_END_OF_TEXT. This can be used
|
|
5
|
+
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
module Boilerpipe::Filters
|
|
9
|
+
class TerminatingBlocksFinder
  # Labels short blocks that look like the start of a comments or feedback
  # section with :INDICATES_END_OF_TEXT. Only blocks with fewer than 15
  # words are inspected. Surrounding whitespace is ignored when testing the
  # text, so a block such as "  Comments (3)\n" is still recognised.
  # Returns doc.
  def self.process(doc)
    doc.text_blocks.each do |tb|
      next unless tb.num_words < 15

      text = tb.text.strip
      if text.length >= 8 && finds_match?(text.downcase)
        tb.labels << :INDICATES_END_OF_TEXT
      # NOTE(review): this compares the non-downcased text with lower-case
      # 'comment'; confirm whether "Comment" should match as well.
      elsif tb.link_density == 1.0 && text == 'comment'
        tb.labels << :INDICATES_END_OF_TEXT
      end
    end

    doc
  end

  # Returns a truthy value when the (already lower-cased) text starts with
  # or contains one of the known end-of-article phrases.
  #
  # The numeric-prefix pattern is anchored with \A so it only matches at the
  # very start of the text, like the start_with? checks around it; ^ would
  # also match after any newline inside the text.
  #
  # TODO add these and test:
  #   text.include?('leave a reply')
  #   text.include?('leave a comment')
  #   text.include?('show comments')
  #   text.include?('Share this:')
  def self.finds_match?(text)
    text.start_with?('comments') ||
      text =~ /\A\d+ (comments|users responded in)/ || # starts with number
      text.start_with?('© reuters') ||
      text.start_with?('please rate this') ||
      text.start_with?('post a comment') ||
      text.include?('what you think...') ||
      text.include?('add your comment') ||
      text.include?('add comment') ||
      text.include?('reader views') ||
      text.include?('have your say') ||
      text.include?('reader comments') ||
      text.include?('rätta artikeln') ||
      text == 'thanks for your comments - this feedback is now closed'
  end
end
|
|
44
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
|
|
2
|
+
# Marks trailing headline TextBlocks (those that have the label :HEADING)
|
|
3
|
+
# as boilerplate. Trailing means they are marked content and are
|
|
4
|
+
# below any other content block.
|
|
5
|
+
|
|
6
|
+
module Boilerpipe::Filters
|
|
7
|
+
class TrailingHeadlineToBoilerplateFilter
  # Marks trailing headlines as non-content. Starting from the END of the
  # document, every content block labelled :HEADING is demoted until the
  # first content block without that label is reached; non-content blocks
  # in between are skipped. Headlines above other content are therefore
  # left alone. Returns doc.
  def self.process(doc)
    # Walk backwards: "trailing" means below every other content block.
    doc.text_blocks.reverse_each do |tb|
      next unless tb.is_content?

      # the last real content block ends the trailing run
      break unless tb.has_label? :HEADING

      tb.content = false
    end

    doc
  end
end
|
|
24
|
+
end
|