boilerpipe-ruby 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +28 -1
- data/Dockerfile +14 -0
- data/README.md +13 -4
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +4 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
- data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
- data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +28 -25
@@ -1,9 +1,7 @@
|
|
1
|
-
|
2
|
-
# Removes TextBlocks which have explicitly been marked as "not content".
|
1
|
+
# Removes TextBlocks which have explicitly been marked as "not content".
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class BoilerplateBlockFilter
|
6
|
-
|
7
5
|
def initialize(label)
|
8
6
|
@label_to_keep = label
|
9
7
|
end
|
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
|
|
21
19
|
doc.replace_text_blocks!(combined)
|
22
20
|
doc
|
23
21
|
end
|
24
|
-
|
25
22
|
end
|
26
23
|
end
|
@@ -1,10 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# A full-text extractor trained on http://krdwrd.org/
|
2
|
+
# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
|
3
|
+
# Works well with SimpleEstimator, too.
|
4
4
|
|
5
5
|
module Boilerpipe::Filters
|
6
6
|
class CanolaClassifier
|
7
|
-
|
8
7
|
def self.process(doc)
|
9
8
|
return doc if doc.text_blocks.size < 1
|
10
9
|
|
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
|
|
22
21
|
def self.classify(prev, current, nxt)
|
23
22
|
current.link_density > 0 && nxt.num_words > 11 \
|
24
23
|
|| current.num_words > 19 \
|
25
|
-
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (
|
24
|
+
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
|
26
25
|
end
|
27
26
|
end
|
28
27
|
end
|
@@ -5,9 +5,8 @@
|
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class DensityRulesClassifier
|
8
|
-
|
9
8
|
def self.process(doc)
|
10
|
-
#return doc if doc.text_blocks.size < 2
|
9
|
+
# return doc if doc.text_blocks.size < 2
|
11
10
|
|
12
11
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
12
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
|
|
26
25
|
if prev.link_density <= 0.555556
|
27
26
|
if current.text_density <= 9
|
28
27
|
return true if nxt.text_density > 10
|
28
|
+
|
29
29
|
return prev.text_density <= 4 ? false : true
|
30
30
|
else
|
31
31
|
return nxt.text_density == 0 ? false : true
|
32
32
|
end
|
33
33
|
else
|
34
34
|
return false if nxt.text_density <= 11
|
35
|
+
|
35
36
|
true
|
36
37
|
end
|
37
38
|
end
|
@@ -1,12 +1,9 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
|
2
|
+
# some heuristics which are quite specific to the news domain.
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# we create a list of potential titles from the page title
|
8
|
-
# then we look at every text block and if the text block
|
9
|
-
# contains a potential title - we set that text block label as :TITLE
|
4
|
+
# we create a list of potential titles from the page title
|
5
|
+
# then we look at every text block and if the text block
|
6
|
+
# contains a potential title - we set that text block label as :TITLE
|
10
7
|
|
11
8
|
module Boilerpipe::Filters
|
12
9
|
class DocumentTitleMatchClassifier
|
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
|
|
55
52
|
@potential_titles << title
|
56
53
|
|
57
54
|
# unnecessary
|
58
|
-
#p = longest_part(title, /[ ]*[|»-][ ]*/)
|
59
|
-
|
55
|
+
# p = longest_part(title, /[ ]*[|»-][ ]*/)
|
56
|
+
# @potential_titles << p if p
|
60
57
|
|
61
|
-
#p = longest_part(title, /[ ]*[|»:][ ]*/)
|
62
|
-
|
58
|
+
# p = longest_part(title, /[ ]*[|»:][ ]*/)
|
59
|
+
# @potential_titles << p if p
|
63
60
|
|
64
|
-
#p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
65
|
-
|
61
|
+
# p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
62
|
+
# @potential_titles << p if p
|
66
63
|
|
67
|
-
#p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
68
|
-
|
64
|
+
# p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
65
|
+
# @potential_titles << p if p
|
69
66
|
|
70
67
|
p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
|
71
68
|
@potential_titles << p if p
|
72
69
|
|
73
70
|
# we replace \u00a0 so why check for it?
|
74
|
-
#p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
75
|
-
|
71
|
+
# p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
72
|
+
# @potential_titles << p if p
|
76
73
|
|
77
74
|
add_potential_titles(title, /[ ]+[|][ ]+/, 4)
|
78
75
|
add_potential_titles(title, /[ ]+[-][ ]+/, 4)
|
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
|
|
90
87
|
|
91
88
|
parts.each do |part|
|
92
89
|
next if part =~ /[.]com/
|
90
|
+
|
93
91
|
num_words = number_of_words(part)
|
94
92
|
|
95
93
|
if num_words > longest_num_words || part.size > longest_part.size
|
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
|
|
107
105
|
|
108
106
|
parts.each do |part|
|
109
107
|
next if part =~ /[.]com/
|
108
|
+
|
110
109
|
num_words = number_of_words(part)
|
111
110
|
|
112
111
|
@potential_titles << part if num_words >= min_words
|
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
|
|
116
115
|
def number_of_words(s)
|
117
116
|
s.split(/[\b ]+/).size
|
118
117
|
end
|
119
|
-
|
120
118
|
end
|
121
119
|
end
|
@@ -1,43 +1,30 @@
|
|
1
|
-
|
2
1
|
# Marks all TextBlocks "content" which are between the headline and the part that has
|
3
2
|
# already been marked content, if they are marked MIGHT_BE_CONTENT.
|
4
3
|
# This filter is quite specific to the news domain.
|
5
4
|
# used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
|
6
5
|
|
7
|
-
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class ExpandTitleToContentFilter
|
10
8
|
def self.process(doc)
|
11
9
|
tbs = doc.text_blocks
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
# titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
|
16
|
-
# title = tbs.index(titles.last)
|
17
|
-
# content_start = tbs.find_index(&:is_content?)
|
11
|
+
title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
|
12
|
+
title_idx = tbs.index(title)
|
18
13
|
|
19
|
-
|
20
|
-
title = nil
|
21
|
-
content_start = nil
|
14
|
+
content_start = tbs.find_index(&:is_content?)
|
22
15
|
|
23
|
-
|
24
|
-
title = i if content_start.nil? && tb.has_label?(:TITLE)
|
25
|
-
content_start = i if content_start.nil? && tb.is_content?
|
26
|
-
i += 1
|
27
|
-
end
|
16
|
+
return doc if no_title_with_subsequent_content?(content_start, title_idx)
|
28
17
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
|
33
|
-
end
|
18
|
+
tbs.slice(title_idx...content_start)
|
19
|
+
.select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
|
20
|
+
.each{ |tb| tb.content = true }
|
34
21
|
|
35
22
|
doc
|
36
23
|
end
|
37
24
|
|
38
|
-
def self.no_title_with_subsequent_content?(content_start,
|
39
|
-
title
|
25
|
+
def self.no_title_with_subsequent_content?(content_start, title_idx)
|
26
|
+
# title has to start before content
|
27
|
+
title_idx.nil? || content_start.nil? || title_idx >= content_start
|
40
28
|
end
|
41
|
-
|
42
29
|
end
|
43
30
|
end
|
@@ -1,12 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# Marks all blocks as "non-content" that occur after blocks that have been
|
2
|
+
# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
|
3
|
+
# number of words in content blocks occur before this mark (default: 60).
|
4
|
+
# This can be used in conjunction with an upstream TerminatingBlocksFinder.
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
|
8
|
-
|
9
|
-
def self.process(doc, min_num_words=60)
|
8
|
+
def self.process(doc, min_num_words = 60)
|
10
9
|
found_end_of_text = false
|
11
10
|
num_words = 0
|
12
11
|
|
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
|
|
19
18
|
|
20
19
|
doc
|
21
20
|
end
|
22
|
-
|
23
21
|
end
|
24
22
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Keeps the largest TextBlock only (by the number of words). In case of
|
3
2
|
# more than one block with the same number of words, the first block is chosen.
|
4
3
|
# All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class KeepLargestBlockFilter
|
11
|
-
|
12
10
|
def initialize(expand_to_same_level_text, min_words)
|
13
11
|
@expand_to_same_level_text = expand_to_same_level_text
|
14
12
|
@min_words = min_words
|
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
|
|
43
41
|
expand_tag_level(tbs[0...n].reverse, level, @min_words)
|
44
42
|
|
45
43
|
# expand blocks to the right
|
46
|
-
expand_tag_level(tbs[n+1..-1], level, @min_words)
|
44
|
+
expand_tag_level(tbs[n + 1..-1], level, @min_words)
|
47
45
|
end
|
48
46
|
end
|
49
47
|
|
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
|
|
57
55
|
end
|
58
56
|
end
|
59
57
|
end
|
60
|
-
|
61
58
|
end
|
62
59
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks all blocks as content that:
|
3
2
|
# are on the same tag-level as very likely main content
|
4
3
|
# (usually the level of the largest block)
|
@@ -7,23 +6,22 @@
|
|
7
6
|
|
8
7
|
module Boilerpipe::Filters
|
9
8
|
class LargeBlockSameTagLevelToContentFilter
|
10
|
-
|
11
9
|
def self.process(doc)
|
12
|
-
|
13
10
|
largest = doc.text_blocks.find do |tb|
|
14
11
|
tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
15
12
|
end
|
16
13
|
|
17
14
|
return doc if largest.nil?
|
15
|
+
|
18
16
|
tag_level = largest.tag_level
|
19
17
|
|
20
18
|
doc.text_blocks.each do |tb|
|
21
19
|
next if tb.is_content?
|
20
|
+
|
22
21
|
tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
|
23
22
|
end
|
24
23
|
|
25
24
|
doc
|
26
25
|
end
|
27
|
-
|
28
26
|
end
|
29
27
|
end
|
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
|
|
11
11
|
doc.text_blocks.each do |tb|
|
12
12
|
if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
13
13
|
tag_level = tb.tag_level
|
14
|
-
elsif
|
14
|
+
elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
|
15
15
|
tb.content = true
|
16
16
|
else
|
17
17
|
tag_level = MAX
|
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
|
|
20
20
|
|
21
21
|
doc
|
22
22
|
end
|
23
|
-
|
24
23
|
end
|
25
24
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
|
1
|
+
# Marks all blocks as content.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class MarkEverythingContentFilter
|
5
|
-
|
6
5
|
def self.process(doc)
|
7
6
|
doc.text_blocks.each do |tb|
|
8
7
|
tb.content = true if tb.is_not_content?
|
9
8
|
end
|
10
9
|
doc
|
11
10
|
end
|
12
|
-
|
13
11
|
end
|
14
12
|
end
|
@@ -8,30 +8,27 @@
|
|
8
8
|
|
9
9
|
module Boilerpipe::Filters
|
10
10
|
class MinClauseWordsFilter
|
11
|
-
|
12
|
-
def self.process(doc, min_words=5)
|
13
|
-
|
11
|
+
def self.process(doc, min_words = 5)
|
14
12
|
doc.text_blocks.each do |tb|
|
15
13
|
next if tb.is_not_content?
|
16
14
|
|
17
15
|
clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
|
16
|
+
hasClause = false
|
18
17
|
tb.text.scan(clause_delimiter).each do |possible_clause|
|
19
|
-
|
20
|
-
break
|
21
|
-
else
|
22
|
-
tb.content = false
|
23
|
-
end
|
18
|
+
hasClause |= is_clause? possible_clause
|
24
19
|
end
|
20
|
+
|
21
|
+
tb.content = false unless hasClause
|
25
22
|
end
|
26
23
|
|
27
24
|
doc
|
28
25
|
end
|
29
26
|
|
30
|
-
def self.is_clause?(text, min_words=5)
|
31
|
-
|
27
|
+
def self.is_clause?(text, min_words = 5)
|
28
|
+
return false if text.nil?
|
29
|
+
|
32
30
|
whitespace = /[ \n\r]+/
|
33
31
|
text.scan(whitespace).size >= min_words
|
34
32
|
end
|
35
|
-
|
36
33
|
end
|
37
34
|
end
|
@@ -1,16 +1,14 @@
|
|
1
|
-
|
2
1
|
# Keeps only those content blocks which contain at least k words.
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class MinWordsFilter
|
6
|
-
|
7
5
|
def self.process(min_words, doc)
|
8
6
|
doc.text_blocks.each do |tb|
|
9
7
|
next if tb.is_not_content?
|
8
|
+
|
10
9
|
tb.content = false if tb.num_words < min_words
|
11
10
|
end
|
12
11
|
doc
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Classifies TextBlocks as content/not-content through rules that have been determined
|
4
2
|
# using the C4.8 machine learning algorithm, as described in the paper
|
5
3
|
# "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
|
@@ -7,7 +5,6 @@
|
|
7
5
|
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class NumWordsRulesClassifier
|
10
|
-
|
11
8
|
def self.process(doc)
|
12
9
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
10
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
|
|
37
34
|
|
38
35
|
false
|
39
36
|
end
|
40
|
-
|
41
37
|
end
|
42
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class SimpleBlockFusionProcessor
|
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
doc.replace_text_blocks!(
|
20
|
+
doc.replace_text_blocks!(tbs - blocks_to_remove)
|
21
21
|
doc
|
22
22
|
end
|
23
23
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Splits TextBlocks at paragraph boundaries.
|
3
2
|
#
|
4
3
|
# NOTE: This is not fully supported (i.e., it will break highlighting support via
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class SplitParagraphBlocksFilter
|
11
|
-
|
12
10
|
def self.process(doc)
|
13
11
|
tbs = doc.text_blocks
|
14
12
|
new_blocks = []
|
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
|
|
35
33
|
doc.replace_text_blocks!(new_blocks) if changes
|
36
34
|
doc
|
37
35
|
end
|
38
|
-
|
39
36
|
end
|
40
37
|
end
|
@@ -1,15 +1,13 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Finds blocks which are potentially indicating the end of an article
|
4
2
|
# text and marks them with INDICATES_END_OF_TEXT. This can be used
|
5
3
|
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.
|
6
4
|
|
7
|
-
|
8
5
|
module Boilerpipe::Filters
|
9
6
|
class TerminatingBlocksFinder
|
10
7
|
def self.process(doc)
|
11
8
|
doc.text_blocks.each do |tb|
|
12
9
|
next unless tb.num_words < 15
|
10
|
+
|
13
11
|
if tb.text.length >= 8 && finds_match?(tb.text.downcase)
|
14
12
|
tb.labels << :INDICATES_END_OF_TEXT
|
15
13
|
elsif tb.link_density == 1.0 && tb.text == 'comment'
|
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
|
|
29
27
|
text.include?('what you think...') ||
|
30
28
|
text.include?('add your comment') ||
|
31
29
|
text.include?('add comment') ||
|
32
|
-
#TODO add this and test
|
33
|
-
#text.include?('leave a reply') ||
|
34
|
-
#text.include?('leave a comment') ||
|
35
|
-
#text.include?('show comments') ||
|
36
|
-
#text.include?('Share this:') ||
|
30
|
+
# TODO add this and test
|
31
|
+
# text.include?('leave a reply') ||
|
32
|
+
# text.include?('leave a comment') ||
|
33
|
+
# text.include?('show comments') ||
|
34
|
+
# text.include?('Share this:') ||
|
37
35
|
text.include?('reader views') ||
|
38
36
|
text.include?('have your say') ||
|
39
37
|
text.include?('reader comments') ||
|