boilerpipe-ruby 0.3.0 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +30 -1
- data/Dockerfile +14 -0
- data/README.md +15 -4
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +9 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
- data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
- data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +33 -25
@@ -0,0 +1,20 @@
|
|
1
|
+
# A full-text extractor which keeps everything: every text block is marked as
|
2
|
+
# content, and then content blocks with fewer than `min` words are marked as
|
3
|
+
# non-content again.
|
4
|
+
|
5
|
+
module Boilerpipe::Extractors
|
6
|
+
class KeepEverythingWithKMinWordsExtractor
|
7
|
+
def self.text(min, contents)
|
8
|
+
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
|
9
|
+
::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process min, doc
|
10
|
+
doc.content
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.process(min, doc)
|
14
|
+
::Boilerpipe::Filters::SimpleBlockFusionProcessor.process doc
|
15
|
+
::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
|
16
|
+
::Boilerpipe::Filters::MinWordsFilter.process min, doc
|
17
|
+
doc
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -1,11 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
1
|
+
# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
|
2
|
+
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
4
3
|
|
5
4
|
module Boilerpipe::Filters
|
6
5
|
class BlockProximityFusion
|
7
|
-
|
8
|
-
|
9
6
|
def initialize(max_blocks_distance, content_only, same_tag_level_only)
|
10
7
|
@max_blocks_distance = max_blocks_distance
|
11
8
|
@content_only = content_only
|
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
|
|
13
10
|
end
|
14
11
|
|
15
12
|
MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
|
16
|
-
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(
|
17
|
-
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(
|
13
|
+
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
|
14
|
+
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
|
18
15
|
MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
|
19
16
|
|
20
17
|
def process(doc)
|
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
|
|
22
19
|
return false if text_blocks.size < 2
|
23
20
|
|
24
21
|
prev_block = if @content_only
|
25
|
-
text_blocks.find{ |tb| tb.is_content? }
|
22
|
+
text_blocks.find { |tb| tb.is_content? }
|
26
23
|
else
|
27
24
|
text_blocks.first
|
28
25
|
end
|
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
|
|
46
43
|
ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
|
47
44
|
ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
|
48
45
|
|
49
|
-
if
|
46
|
+
if ok
|
50
47
|
prev_block.merge_next(tb)
|
51
48
|
blocks_to_remove << tb
|
52
49
|
else
|
53
50
|
prev_block = tb
|
54
51
|
end
|
55
52
|
end
|
56
|
-
|
57
53
|
end
|
58
|
-
doc.replace_text_blocks!(
|
54
|
+
doc.replace_text_blocks!(text_blocks - blocks_to_remove)
|
59
55
|
doc
|
60
56
|
end
|
61
|
-
|
62
57
|
end
|
63
58
|
end
|
@@ -1,9 +1,7 @@
|
|
1
|
-
|
2
|
-
# Removes TextBlocks which have explicitly been marked as "not content".
|
1
|
+
# Removes TextBlocks which have explicitly been marked as "not content".
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class BoilerplateBlockFilter
|
6
|
-
|
7
5
|
def initialize(label)
|
8
6
|
@label_to_keep = label
|
9
7
|
end
|
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
|
|
21
19
|
doc.replace_text_blocks!(combined)
|
22
20
|
doc
|
23
21
|
end
|
24
|
-
|
25
22
|
end
|
26
23
|
end
|
@@ -1,10 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# A full-text extractor trained on http://krdwrd.org/
|
2
|
+
# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
|
3
|
+
# Works well with SimpleEstimator, too.
|
4
4
|
|
5
5
|
module Boilerpipe::Filters
|
6
6
|
class CanolaClassifier
|
7
|
-
|
8
7
|
def self.process(doc)
|
9
8
|
return doc if doc.text_blocks.size < 1
|
10
9
|
|
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
|
|
22
21
|
def self.classify(prev, current, nxt)
|
23
22
|
current.link_density > 0 && nxt.num_words > 11 \
|
24
23
|
|| current.num_words > 19 \
|
25
|
-
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (
|
24
|
+
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
|
26
25
|
end
|
27
26
|
end
|
28
27
|
end
|
@@ -5,9 +5,8 @@
|
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class DensityRulesClassifier
|
8
|
-
|
9
8
|
def self.process(doc)
|
10
|
-
#return doc if doc.text_blocks.size < 2
|
9
|
+
# return doc if doc.text_blocks.size < 2
|
11
10
|
|
12
11
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
12
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
|
|
26
25
|
if prev.link_density <= 0.555556
|
27
26
|
if current.text_density <= 9
|
28
27
|
return true if nxt.text_density > 10
|
28
|
+
|
29
29
|
return prev.text_density <= 4 ? false : true
|
30
30
|
else
|
31
31
|
return nxt.text_density == 0 ? false : true
|
32
32
|
end
|
33
33
|
else
|
34
34
|
return false if nxt.text_density <= 11
|
35
|
+
|
35
36
|
true
|
36
37
|
end
|
37
38
|
end
|
@@ -1,12 +1,9 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
|
2
|
+
# some heuristics which are quite specific to the news domain.
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# we create a list of potential titles from the page title
|
8
|
-
# then we look at every text block and if the text block
|
9
|
-
# contains a potential title - we set that text block label as :TITLE
|
4
|
+
# we create a list of potential titles from the page title
|
5
|
+
# then we look at every text block and if the text block
|
6
|
+
# contains a potential title - we set that text block label as :TITLE
|
10
7
|
|
11
8
|
module Boilerpipe::Filters
|
12
9
|
class DocumentTitleMatchClassifier
|
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
|
|
55
52
|
@potential_titles << title
|
56
53
|
|
57
54
|
# unnecessary
|
58
|
-
#p = longest_part(title, /[ ]*[|»-][ ]*/)
|
59
|
-
|
55
|
+
# p = longest_part(title, /[ ]*[|»-][ ]*/)
|
56
|
+
# @potential_titles << p if p
|
60
57
|
|
61
|
-
#p = longest_part(title, /[ ]*[|»:][ ]*/)
|
62
|
-
|
58
|
+
# p = longest_part(title, /[ ]*[|»:][ ]*/)
|
59
|
+
# @potential_titles << p if p
|
63
60
|
|
64
|
-
#p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
65
|
-
|
61
|
+
# p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
62
|
+
# @potential_titles << p if p
|
66
63
|
|
67
|
-
#p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
68
|
-
|
64
|
+
# p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
65
|
+
# @potential_titles << p if p
|
69
66
|
|
70
67
|
p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
|
71
68
|
@potential_titles << p if p
|
72
69
|
|
73
70
|
# we replace \u00a0 so why check for it?
|
74
|
-
#p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
75
|
-
|
71
|
+
# p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
72
|
+
# @potential_titles << p if p
|
76
73
|
|
77
74
|
add_potential_titles(title, /[ ]+[|][ ]+/, 4)
|
78
75
|
add_potential_titles(title, /[ ]+[-][ ]+/, 4)
|
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
|
|
90
87
|
|
91
88
|
parts.each do |part|
|
92
89
|
next if part =~ /[.]com/
|
90
|
+
|
93
91
|
num_words = number_of_words(part)
|
94
92
|
|
95
93
|
if num_words > longest_num_words || part.size > longest_part.size
|
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
|
|
107
105
|
|
108
106
|
parts.each do |part|
|
109
107
|
next if part =~ /[.]com/
|
108
|
+
|
110
109
|
num_words = number_of_words(part)
|
111
110
|
|
112
111
|
@potential_titles << part if num_words >= min_words
|
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
|
|
116
115
|
def number_of_words(s)
|
117
116
|
s.split(/[\b ]+/).size
|
118
117
|
end
|
119
|
-
|
120
118
|
end
|
121
119
|
end
|
@@ -1,10 +1,8 @@
|
|
1
|
-
|
2
1
|
# Marks all TextBlocks "content" which are between the headline and the part that has
|
3
2
|
# already been marked content, if they are marked MIGHT_BE_CONTENT.
|
4
3
|
# This filter is quite specific to the news domain.
|
5
4
|
# used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
|
6
5
|
|
7
|
-
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class ExpandTitleToContentFilter
|
10
8
|
def self.process(doc)
|
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
|
|
38
36
|
def self.no_title_with_subsequent_content?(content_start, title)
|
39
37
|
title.nil? || content_start.nil? || content_start <= title
|
40
38
|
end
|
41
|
-
|
42
39
|
end
|
43
40
|
end
|
@@ -1,12 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# Marks all blocks as "non-content" that occur after blocks that have been
|
2
|
+
# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
|
3
|
+
# number of words in content blocks occur before this mark (default: 60).
|
4
|
+
# This can be used in conjunction with an upstream TerminatingBlocksFinder.
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
|
8
|
-
|
9
|
-
def self.process(doc, min_num_words=60)
|
8
|
+
def self.process(doc, min_num_words = 60)
|
10
9
|
found_end_of_text = false
|
11
10
|
num_words = 0
|
12
11
|
|
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
|
|
19
18
|
|
20
19
|
doc
|
21
20
|
end
|
22
|
-
|
23
21
|
end
|
24
22
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Keeps the largest TextBlock only (by the number of words). In case of
|
3
2
|
# more than one block with the same number of words, the first block is chosen.
|
4
3
|
# All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class KeepLargestBlockFilter
|
11
|
-
|
12
10
|
def initialize(expand_to_same_level_text, min_words)
|
13
11
|
@expand_to_same_level_text = expand_to_same_level_text
|
14
12
|
@min_words = min_words
|
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
|
|
43
41
|
expand_tag_level(tbs[0...n].reverse, level, @min_words)
|
44
42
|
|
45
43
|
# expand blocks to the right
|
46
|
-
expand_tag_level(tbs[n+1..-1], level, @min_words)
|
44
|
+
expand_tag_level(tbs[n + 1..-1], level, @min_words)
|
47
45
|
end
|
48
46
|
end
|
49
47
|
|
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
|
|
57
55
|
end
|
58
56
|
end
|
59
57
|
end
|
60
|
-
|
61
58
|
end
|
62
59
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks all blocks as content that:
|
3
2
|
# are on the same tag-level as very likely main content
|
4
3
|
# (usually the level of the largest block)
|
@@ -7,23 +6,22 @@
|
|
7
6
|
|
8
7
|
module Boilerpipe::Filters
|
9
8
|
class LargeBlockSameTagLevelToContentFilter
|
10
|
-
|
11
9
|
def self.process(doc)
|
12
|
-
|
13
10
|
largest = doc.text_blocks.find do |tb|
|
14
11
|
tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
15
12
|
end
|
16
13
|
|
17
14
|
return doc if largest.nil?
|
15
|
+
|
18
16
|
tag_level = largest.tag_level
|
19
17
|
|
20
18
|
doc.text_blocks.each do |tb|
|
21
19
|
next if tb.is_content?
|
20
|
+
|
22
21
|
tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
|
23
22
|
end
|
24
23
|
|
25
24
|
doc
|
26
25
|
end
|
27
|
-
|
28
26
|
end
|
29
27
|
end
|
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
|
|
11
11
|
doc.text_blocks.each do |tb|
|
12
12
|
if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
13
13
|
tag_level = tb.tag_level
|
14
|
-
elsif
|
14
|
+
elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
|
15
15
|
tb.content = true
|
16
16
|
else
|
17
17
|
tag_level = MAX
|
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
|
|
20
20
|
|
21
21
|
doc
|
22
22
|
end
|
23
|
-
|
24
23
|
end
|
25
24
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
|
1
|
+
# Marks all blocks as content.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class MarkEverythingContentFilter
|
5
|
-
|
6
5
|
def self.process(doc)
|
7
6
|
doc.text_blocks.each do |tb|
|
8
7
|
tb.content = true if tb.is_not_content?
|
9
8
|
end
|
10
9
|
doc
|
11
10
|
end
|
12
|
-
|
13
11
|
end
|
14
12
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
#
|
2
|
+
# Keeps only blocks that have at least one segment fragment ("clause") with at least k
|
3
|
+
# words (default: 5).
|
4
|
+
#
|
5
|
+
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
|
6
|
+
#
|
7
|
+
# See also: SplitParagraphBlocksFilter
|
8
|
+
|
9
|
+
module Boilerpipe::Filters
|
10
|
+
class MinClauseWordsFilter
|
11
|
+
def self.process(doc, min_words = 5)
|
12
|
+
doc.text_blocks.each do |tb|
|
13
|
+
next if tb.is_not_content?
|
14
|
+
|
15
|
+
clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
|
16
|
+
hasClause = false
|
17
|
+
tb.text.scan(clause_delimiter).each do |possible_clause|
|
18
|
+
hasClause |= is_clause? possible_clause
|
19
|
+
end
|
20
|
+
|
21
|
+
tb.content = false unless hasClause
|
22
|
+
end
|
23
|
+
|
24
|
+
doc
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.is_clause?(text, min_words = 5)
|
28
|
+
return false if text.nil?
|
29
|
+
|
30
|
+
whitespace = /[ \n\r]+/
|
31
|
+
text.scan(whitespace).size >= min_words
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# Keeps only those content blocks which contain at least k words.
|
2
|
+
|
3
|
+
module Boilerpipe::Filters
|
4
|
+
class MinWordsFilter
|
5
|
+
def self.process(min_words, doc)
|
6
|
+
doc.text_blocks.each do |tb|
|
7
|
+
next if tb.is_not_content?
|
8
|
+
|
9
|
+
tb.content = false if tb.num_words < min_words
|
10
|
+
end
|
11
|
+
doc
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Classifies TextBlocks as content/not-content through rules that have been determined
|
4
2
|
# using the C4.8 machine learning algorithm, as described in the paper
|
5
3
|
# "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
|
@@ -7,7 +5,6 @@
|
|
7
5
|
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class NumWordsRulesClassifier
|
10
|
-
|
11
8
|
def self.process(doc)
|
12
9
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
10
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
|
|
37
34
|
|
38
35
|
false
|
39
36
|
end
|
40
|
-
|
41
37
|
end
|
42
38
|
end
|