boilerpipe-ruby 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +28 -1
- data/Dockerfile +14 -0
- data/README.md +13 -4
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +4 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
- data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
- data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +28 -25
@@ -1,9 +1,7 @@
|
|
1
|
-
|
2
|
-
# Removes TextBlocks which have explicitly been marked as "not content".
|
1
|
+
# Removes TextBlocks which have explicitly been marked as "not content".
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class BoilerplateBlockFilter
|
6
|
-
|
7
5
|
def initialize(label)
|
8
6
|
@label_to_keep = label
|
9
7
|
end
|
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
|
|
21
19
|
doc.replace_text_blocks!(combined)
|
22
20
|
doc
|
23
21
|
end
|
24
|
-
|
25
22
|
end
|
26
23
|
end
|
@@ -1,10 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# A full-text extractor trained on http://krdwrd.org/
|
2
|
+
# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
|
3
|
+
# Works well with SimpleEstimator, too.
|
4
4
|
|
5
5
|
module Boilerpipe::Filters
|
6
6
|
class CanolaClassifier
|
7
|
-
|
8
7
|
def self.process(doc)
|
9
8
|
return doc if doc.text_blocks.size < 1
|
10
9
|
|
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
|
|
22
21
|
def self.classify(prev, current, nxt)
|
23
22
|
current.link_density > 0 && nxt.num_words > 11 \
|
24
23
|
|| current.num_words > 19 \
|
25
|
-
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (
|
24
|
+
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
|
26
25
|
end
|
27
26
|
end
|
28
27
|
end
|
@@ -5,9 +5,8 @@
|
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class DensityRulesClassifier
|
8
|
-
|
9
8
|
def self.process(doc)
|
10
|
-
#return doc if doc.text_blocks.size < 2
|
9
|
+
# return doc if doc.text_blocks.size < 2
|
11
10
|
|
12
11
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
12
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
|
|
26
25
|
if prev.link_density <= 0.555556
|
27
26
|
if current.text_density <= 9
|
28
27
|
return true if nxt.text_density > 10
|
28
|
+
|
29
29
|
return prev.text_density <= 4 ? false : true
|
30
30
|
else
|
31
31
|
return nxt.text_density == 0 ? false : true
|
32
32
|
end
|
33
33
|
else
|
34
34
|
return false if nxt.text_density <= 11
|
35
|
+
|
35
36
|
true
|
36
37
|
end
|
37
38
|
end
|
@@ -1,12 +1,9 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
|
2
|
+
# some heuristics which are quite specific to the news domain.
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# we create a list of potential titles from the page title
|
8
|
-
# then we look at every text block and if the text block
|
9
|
-
# contains a potential title - we set that text block label as :TITLE
|
4
|
+
# we create a list of potential titles from the page title
|
5
|
+
# then we look at every text block and if the text block
|
6
|
+
# contains a potential title - we set that text block label as :TITLE
|
10
7
|
|
11
8
|
module Boilerpipe::Filters
|
12
9
|
class DocumentTitleMatchClassifier
|
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
|
|
55
52
|
@potential_titles << title
|
56
53
|
|
57
54
|
# unnecessary
|
58
|
-
#p = longest_part(title, /[ ]*[|»-][ ]*/)
|
59
|
-
|
55
|
+
# p = longest_part(title, /[ ]*[|»-][ ]*/)
|
56
|
+
# @potential_titles << p if p
|
60
57
|
|
61
|
-
#p = longest_part(title, /[ ]*[|»:][ ]*/)
|
62
|
-
|
58
|
+
# p = longest_part(title, /[ ]*[|»:][ ]*/)
|
59
|
+
# @potential_titles << p if p
|
63
60
|
|
64
|
-
#p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
65
|
-
|
61
|
+
# p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
62
|
+
# @potential_titles << p if p
|
66
63
|
|
67
|
-
#p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
68
|
-
|
64
|
+
# p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
65
|
+
# @potential_titles << p if p
|
69
66
|
|
70
67
|
p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
|
71
68
|
@potential_titles << p if p
|
72
69
|
|
73
70
|
# we replace \u00a0 so why check for it?
|
74
|
-
#p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
75
|
-
|
71
|
+
# p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
72
|
+
# @potential_titles << p if p
|
76
73
|
|
77
74
|
add_potential_titles(title, /[ ]+[|][ ]+/, 4)
|
78
75
|
add_potential_titles(title, /[ ]+[-][ ]+/, 4)
|
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
|
|
90
87
|
|
91
88
|
parts.each do |part|
|
92
89
|
next if part =~ /[.]com/
|
90
|
+
|
93
91
|
num_words = number_of_words(part)
|
94
92
|
|
95
93
|
if num_words > longest_num_words || part.size > longest_part.size
|
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
|
|
107
105
|
|
108
106
|
parts.each do |part|
|
109
107
|
next if part =~ /[.]com/
|
108
|
+
|
110
109
|
num_words = number_of_words(part)
|
111
110
|
|
112
111
|
@potential_titles << part if num_words >= min_words
|
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
|
|
116
115
|
def number_of_words(s)
|
117
116
|
s.split(/[\b ]+/).size
|
118
117
|
end
|
119
|
-
|
120
118
|
end
|
121
119
|
end
|
@@ -1,43 +1,30 @@
|
|
1
|
-
|
2
1
|
# Marks all TextBlocks "content" which are between the headline and the part that has
|
3
2
|
# already been marked content, if they are marked MIGHT_BE_CONTENT.
|
4
3
|
# This filter is quite specific to the news domain.
|
5
4
|
# used downstream of KeepLargetBlockFilter since that's what sets MIGHT_BE_CONTENT
|
6
5
|
|
7
|
-
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class ExpandTitleToContentFilter
|
10
8
|
def self.process(doc)
|
11
9
|
tbs = doc.text_blocks
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
# titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
|
16
|
-
# title = tbs.index(titles.last)
|
17
|
-
# content_start = tbs.find_index(&:is_content?)
|
11
|
+
title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
|
12
|
+
title_idx = tbs.index(title)
|
18
13
|
|
19
|
-
|
20
|
-
title = nil
|
21
|
-
content_start = nil
|
14
|
+
content_start = tbs.find_index(&:is_content?)
|
22
15
|
|
23
|
-
|
24
|
-
title = i if content_start.nil? && tb.has_label?(:TITLE)
|
25
|
-
content_start = i if content_start.nil? && tb.is_content?
|
26
|
-
i += 1
|
27
|
-
end
|
16
|
+
return doc if no_title_with_subsequent_content?(content_start, title_idx)
|
28
17
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
|
33
|
-
end
|
18
|
+
tbs.slice(title_idx...content_start)
|
19
|
+
.select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
|
20
|
+
.each{ |tb| tb.content = true }
|
34
21
|
|
35
22
|
doc
|
36
23
|
end
|
37
24
|
|
38
|
-
def self.no_title_with_subsequent_content?(content_start,
|
39
|
-
title
|
25
|
+
def self.no_title_with_subsequent_content?(content_start, title_idx)
|
26
|
+
# title has to start before content
|
27
|
+
title_idx.nil? || content_start.nil? || title_idx >= content_start
|
40
28
|
end
|
41
|
-
|
42
29
|
end
|
43
30
|
end
|
@@ -1,12 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# Marks all blocks as "non-content" that occur after blocks that have been
|
2
|
+
# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
|
3
|
+
# number of words in content blocks occur before this mark (default: 60).
|
4
|
+
# This can be used in conjunction with an upstream TerminatingBlocksFinder.
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
|
8
|
-
|
9
|
-
def self.process(doc, min_num_words=60)
|
8
|
+
def self.process(doc, min_num_words = 60)
|
10
9
|
found_end_of_text = false
|
11
10
|
num_words = 0
|
12
11
|
|
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
|
|
19
18
|
|
20
19
|
doc
|
21
20
|
end
|
22
|
-
|
23
21
|
end
|
24
22
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Keeps the largest TextBlock only (by the number of words). In case of
|
3
2
|
# more than one block with the same number of words, the first block is chosen.
|
4
3
|
# All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class KeepLargestBlockFilter
|
11
|
-
|
12
10
|
def initialize(expand_to_same_level_text, min_words)
|
13
11
|
@expand_to_same_level_text = expand_to_same_level_text
|
14
12
|
@min_words = min_words
|
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
|
|
43
41
|
expand_tag_level(tbs[0...n].reverse, level, @min_words)
|
44
42
|
|
45
43
|
# expand blocks to the right
|
46
|
-
expand_tag_level(tbs[n+1..-1], level, @min_words)
|
44
|
+
expand_tag_level(tbs[n + 1..-1], level, @min_words)
|
47
45
|
end
|
48
46
|
end
|
49
47
|
|
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
|
|
57
55
|
end
|
58
56
|
end
|
59
57
|
end
|
60
|
-
|
61
58
|
end
|
62
59
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks all blocks as content that:
|
3
2
|
# are on the same tag-level as very likely main content
|
4
3
|
# (usually the level of the largest block)
|
@@ -7,23 +6,22 @@
|
|
7
6
|
|
8
7
|
module Boilerpipe::Filters
|
9
8
|
class LargeBlockSameTagLevelToContentFilter
|
10
|
-
|
11
9
|
def self.process(doc)
|
12
|
-
|
13
10
|
largest = doc.text_blocks.find do |tb|
|
14
11
|
tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
15
12
|
end
|
16
13
|
|
17
14
|
return doc if largest.nil?
|
15
|
+
|
18
16
|
tag_level = largest.tag_level
|
19
17
|
|
20
18
|
doc.text_blocks.each do |tb|
|
21
19
|
next if tb.is_content?
|
20
|
+
|
22
21
|
tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
|
23
22
|
end
|
24
23
|
|
25
24
|
doc
|
26
25
|
end
|
27
|
-
|
28
26
|
end
|
29
27
|
end
|
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
|
|
11
11
|
doc.text_blocks.each do |tb|
|
12
12
|
if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
13
13
|
tag_level = tb.tag_level
|
14
|
-
elsif
|
14
|
+
elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
|
15
15
|
tb.content = true
|
16
16
|
else
|
17
17
|
tag_level = MAX
|
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
|
|
20
20
|
|
21
21
|
doc
|
22
22
|
end
|
23
|
-
|
24
23
|
end
|
25
24
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
|
1
|
+
# Marks all blocks as content.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class MarkEverythingContentFilter
|
5
|
-
|
6
5
|
def self.process(doc)
|
7
6
|
doc.text_blocks.each do |tb|
|
8
7
|
tb.content = true if tb.is_not_content?
|
9
8
|
end
|
10
9
|
doc
|
11
10
|
end
|
12
|
-
|
13
11
|
end
|
14
12
|
end
|
@@ -8,30 +8,27 @@
|
|
8
8
|
|
9
9
|
module Boilerpipe::Filters
|
10
10
|
class MinClauseWordsFilter
|
11
|
-
|
12
|
-
def self.process(doc, min_words=5)
|
13
|
-
|
11
|
+
def self.process(doc, min_words = 5)
|
14
12
|
doc.text_blocks.each do |tb|
|
15
13
|
next if tb.is_not_content?
|
16
14
|
|
17
15
|
clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
|
16
|
+
hasClause = false
|
18
17
|
tb.text.scan(clause_delimiter).each do |possible_clause|
|
19
|
-
|
20
|
-
break
|
21
|
-
else
|
22
|
-
tb.content = false
|
23
|
-
end
|
18
|
+
hasClause |= is_clause? possible_clause
|
24
19
|
end
|
20
|
+
|
21
|
+
tb.content = false unless hasClause
|
25
22
|
end
|
26
23
|
|
27
24
|
doc
|
28
25
|
end
|
29
26
|
|
30
|
-
def self.is_clause?(text, min_words=5)
|
31
|
-
|
27
|
+
def self.is_clause?(text, min_words = 5)
|
28
|
+
return false if text.nil?
|
29
|
+
|
32
30
|
whitespace = /[ \n\r]+/
|
33
31
|
text.scan(whitespace).size >= min_words
|
34
32
|
end
|
35
|
-
|
36
33
|
end
|
37
34
|
end
|
@@ -1,16 +1,14 @@
|
|
1
|
-
|
2
1
|
# Keeps only those content blocks which contain at least k words.
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class MinWordsFilter
|
6
|
-
|
7
5
|
def self.process(min_words, doc)
|
8
6
|
doc.text_blocks.each do |tb|
|
9
7
|
next if tb.is_not_content?
|
8
|
+
|
10
9
|
tb.content = false if tb.num_words < min_words
|
11
10
|
end
|
12
11
|
doc
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Classifies TextBlocks as content/not-content through rules that have been determined
|
4
2
|
# using the C4.8 machine learning algorithm, as described in the paper
|
5
3
|
# "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
|
@@ -7,7 +5,6 @@
|
|
7
5
|
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class NumWordsRulesClassifier
|
10
|
-
|
11
8
|
def self.process(doc)
|
12
9
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
10
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
|
|
37
34
|
|
38
35
|
false
|
39
36
|
end
|
40
|
-
|
41
37
|
end
|
42
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class SimpleBlockFusionProcessor
|
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
doc.replace_text_blocks!(
|
20
|
+
doc.replace_text_blocks!(tbs - blocks_to_remove)
|
21
21
|
doc
|
22
22
|
end
|
23
23
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Splits TextBlocks at paragraph boundaries.
|
3
2
|
#
|
4
3
|
# NOTE: This is not fully supported (i.e., it will break highlighting support via
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class SplitParagraphBlocksFilter
|
11
|
-
|
12
10
|
def self.process(doc)
|
13
11
|
tbs = doc.text_blocks
|
14
12
|
new_blocks = []
|
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
|
|
35
33
|
doc.replace_text_blocks!(new_blocks) if changes
|
36
34
|
doc
|
37
35
|
end
|
38
|
-
|
39
36
|
end
|
40
37
|
end
|
@@ -1,15 +1,13 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Finds blocks which are potentially indicating the end of an article
|
4
2
|
# text and marks them with INDICATES_END_OF_TEXT. This can be used
|
5
3
|
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.
|
6
4
|
|
7
|
-
|
8
5
|
module Boilerpipe::Filters
|
9
6
|
class TerminatingBlocksFinder
|
10
7
|
def self.process(doc)
|
11
8
|
doc.text_blocks.each do |tb|
|
12
9
|
next unless tb.num_words < 15
|
10
|
+
|
13
11
|
if tb.text.length >= 8 && finds_match?(tb.text.downcase)
|
14
12
|
tb.labels << :INDICATES_END_OF_TEXT
|
15
13
|
elsif tb.link_density == 1.0 && tb.text == 'comment'
|
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
|
|
29
27
|
text.include?('what you think...') ||
|
30
28
|
text.include?('add your comment') ||
|
31
29
|
text.include?('add comment') ||
|
32
|
-
#TODO add this and test
|
33
|
-
#text.include?('leave a reply') ||
|
34
|
-
#text.include?('leave a comment') ||
|
35
|
-
#text.include?('show comments') ||
|
36
|
-
#text.include?('Share this:') ||
|
30
|
+
# TODO add this and test
|
31
|
+
# text.include?('leave a reply') ||
|
32
|
+
# text.include?('leave a comment') ||
|
33
|
+
# text.include?('show comments') ||
|
34
|
+
# text.include?('Share this:') ||
|
37
35
|
text.include?('reader views') ||
|
38
36
|
text.include?('have your say') ||
|
39
37
|
text.include?('reader comments') ||
|