boilerpipe-ruby 0.2.0 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +34 -1
- data/Dockerfile +14 -0
- data/README.md +32 -7
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +14 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
- data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
- data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
- data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
- data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +38 -25
@@ -0,0 +1,15 @@
|
|
1
|
+
module Boilerpipe::Extractors
  # Extractor whose whole pipeline is a single CanolaClassifier pass.
  class CanolaExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the pipeline on the
      # resulting document and returns that document's +content+.
      def text(contents)
        parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::CanolaExtractor.process(parsed)
        parsed.content
      end

      # Applies CanolaClassifier to +doc+ and hands the same +doc+ back so
      # calls can be chained.
      def process(doc)
        doc.tap { |d| ::Boilerpipe::Filters::CanolaClassifier.process(d) }
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Marks all blocks as content.
|
2
|
+
|
3
|
+
module Boilerpipe::Extractors
  # Extractor whose pipeline consists solely of MarkEverythingContentFilter.
  class KeepEverythingExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the pipeline on the
      # resulting document and returns that document's +content+.
      def text(contents)
        parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::KeepEverythingExtractor.process(parsed)
        parsed.content
      end

      # Runs MarkEverythingContentFilter over +doc+ and returns the same +doc+.
      def process(doc)
        ::Boilerpipe::Filters::MarkEverythingContentFilter.process(doc)
        doc
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# A full-text extractor which fuses blocks, marks every block as content and
|
2
|
+
# then applies MinWordsFilter with the given minimum word count (presumably
|
3
|
+
# dropping blocks that have fewer words than that minimum).
|
4
|
+
|
5
|
+
module Boilerpipe::Extractors
  # Extractor that fuses blocks, marks every block as content and then
  # applies MinWordsFilter with a caller-supplied minimum.
  class KeepEverythingWithKMinWordsExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the pipeline with the
      # word threshold +min+ and returns the resulting document's +content+.
      def text(min, contents)
        parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process(min, parsed)
        parsed.content
      end

      # Runs the three filters in order on +doc+ and returns the same +doc+.
      # +min+ is forwarded untouched to MinWordsFilter.
      def process(min, doc)
        ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process(doc)
        ::Boilerpipe::Filters::MarkEverythingContentFilter.process(doc)
        ::Boilerpipe::Filters::MinWordsFilter.process(min, doc)
        doc
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Boilerpipe::Extractors
  # Extractor that classifies blocks by word-count rules, fuses neighbouring
  # blocks and then applies the shared KeepLargestBlockFilter instance.
  class LargestContentExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the pipeline on the
      # resulting document and returns that document's +content+.
      def text(contents)
        parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::LargestContentExtractor.process(parsed)
        parsed.content
      end

      # Runs the three pipeline stages in order on +doc+ and returns the same
      # +doc+. The second and third stages are preconfigured filter instances
      # (constants), not classes.
      def process(doc)
        ::Boilerpipe::Filters::NumWordsRulesClassifier.process(doc)
        ::Boilerpipe::Filters::BlockProximityFusion::MAX_DISTANCE_1.process(doc)
        ::Boilerpipe::Filters::KeepLargestBlockFilter::INSTANCE.process(doc)
        doc
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Boilerpipe::Extractors
  # Extractor whose whole pipeline is a single NumWordsRulesClassifier pass.
  class NumWordsRulesExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the pipeline on the
      # resulting document and returns that document's +content+.
      def text(contents)
        parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::NumWordsRulesExtractor.process(parsed)
        parsed.content
      end

      # Applies NumWordsRulesClassifier to +doc+ and returns the same +doc+.
      def process(doc)
        doc.tap { |d| ::Boilerpipe::Filters::NumWordsRulesClassifier.process(d) }
      end
    end
  end
end
|
@@ -1,11 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
1
|
+
# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
|
2
|
+
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
4
3
|
|
5
4
|
module Boilerpipe::Filters
|
6
5
|
class BlockProximityFusion
|
7
|
-
|
8
|
-
|
9
6
|
def initialize(max_blocks_distance, content_only, same_tag_level_only)
|
10
7
|
@max_blocks_distance = max_blocks_distance
|
11
8
|
@content_only = content_only
|
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
|
|
13
10
|
end
|
14
11
|
|
15
12
|
MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
|
16
|
-
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(
|
17
|
-
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(
|
13
|
+
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
|
14
|
+
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
|
18
15
|
MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
|
19
16
|
|
20
17
|
def process(doc)
|
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
|
|
22
19
|
return false if text_blocks.size < 2
|
23
20
|
|
24
21
|
prev_block = if @content_only
|
25
|
-
text_blocks.find{ |tb| tb.is_content? }
|
22
|
+
text_blocks.find { |tb| tb.is_content? }
|
26
23
|
else
|
27
24
|
text_blocks.first
|
28
25
|
end
|
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
|
|
46
43
|
ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
|
47
44
|
ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
|
48
45
|
|
49
|
-
if
|
46
|
+
if ok
|
50
47
|
prev_block.merge_next(tb)
|
51
48
|
blocks_to_remove << tb
|
52
49
|
else
|
53
50
|
prev_block = tb
|
54
51
|
end
|
55
52
|
end
|
56
|
-
|
57
53
|
end
|
58
|
-
doc.replace_text_blocks!(
|
54
|
+
doc.replace_text_blocks!(text_blocks - blocks_to_remove)
|
59
55
|
doc
|
60
56
|
end
|
61
|
-
|
62
57
|
end
|
63
58
|
end
|
@@ -1,9 +1,7 @@
|
|
1
|
-
|
2
|
-
# Removes TextBlocks which have explicitly been marked as "not content".
|
1
|
+
# Removes TextBlocks which have explicitly been marked as "not content".
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class BoilerplateBlockFilter
|
6
|
-
|
7
5
|
def initialize(label)
|
8
6
|
@label_to_keep = label
|
9
7
|
end
|
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
|
|
21
19
|
doc.replace_text_blocks!(combined)
|
22
20
|
doc
|
23
21
|
end
|
24
|
-
|
25
22
|
end
|
26
23
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# A full-text extractor trained on http://krdwrd.org/
|
2
|
+
# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
|
3
|
+
# Works well with SimpleEstimator, too.
|
4
|
+
|
5
|
+
module Boilerpipe::Filters
  # Sets the content flag of every text block from the word counts and link
  # densities of the block itself and of its two neighbours.
  class CanolaClassifier
    # Classifies each block of +doc+ in place and returns +doc+.
    # A document without text blocks is returned untouched.
    def self.process(doc)
      return doc if doc.text_blocks.size.zero?

      # Pad both ends with the same placeholder block so that the first and
      # the last real block also sit in the middle of a three-block window.
      sentinel = Boilerpipe::Document::TextBlock.empty_start
      padded = [sentinel] + doc.text_blocks + [sentinel]

      padded.each_cons(3) do |before, block, after|
        block.content = classify(before, block, after)
      end

      doc
    end

    # Returns true when +current+ should count as content, false otherwise.
    # The rules are tried in order and the first one that applies decides.
    def self.classify(prev, current, nxt)
      return true if current.link_density > 0 && nxt.num_words > 11
      return true if current.num_words > 19
      return false unless nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0

      current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19
    end
  end
end
|
@@ -5,9 +5,8 @@
|
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class DensityRulesClassifier
|
8
|
-
|
9
8
|
def self.process(doc)
|
10
|
-
#return doc if doc.text_blocks.size < 2
|
9
|
+
# return doc if doc.text_blocks.size < 2
|
11
10
|
|
12
11
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
12
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
|
|
26
25
|
if prev.link_density <= 0.555556
|
27
26
|
if current.text_density <= 9
|
28
27
|
return true if nxt.text_density > 10
|
28
|
+
|
29
29
|
return prev.text_density <= 4 ? false : true
|
30
30
|
else
|
31
31
|
return nxt.text_density == 0 ? false : true
|
32
32
|
end
|
33
33
|
else
|
34
34
|
return false if nxt.text_density <= 11
|
35
|
+
|
35
36
|
true
|
36
37
|
end
|
37
38
|
end
|
@@ -1,12 +1,9 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
|
2
|
+
# some heuristics which are quite specific to the news domain.
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# we create a list of potential titles from the page title
|
8
|
-
# then we look at every text block and if the text block
|
9
|
-
# contains a potential title - we set that text block label as :TITLE
|
4
|
+
# we create a list of potential titles from the page title
|
5
|
+
# then we look at every text block and if the text block
|
6
|
+
# contains a potential title - we set that text block label as :TITLE
|
10
7
|
|
11
8
|
module Boilerpipe::Filters
|
12
9
|
class DocumentTitleMatchClassifier
|
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
|
|
55
52
|
@potential_titles << title
|
56
53
|
|
57
54
|
# unnecessary
|
58
|
-
#p = longest_part(title, /[ ]*[|»-][ ]*/)
|
59
|
-
|
55
|
+
# p = longest_part(title, /[ ]*[|»-][ ]*/)
|
56
|
+
# @potential_titles << p if p
|
60
57
|
|
61
|
-
#p = longest_part(title, /[ ]*[|»:][ ]*/)
|
62
|
-
|
58
|
+
# p = longest_part(title, /[ ]*[|»:][ ]*/)
|
59
|
+
# @potential_titles << p if p
|
63
60
|
|
64
|
-
#p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
65
|
-
|
61
|
+
# p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
62
|
+
# @potential_titles << p if p
|
66
63
|
|
67
|
-
#p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
68
|
-
|
64
|
+
# p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
65
|
+
# @potential_titles << p if p
|
69
66
|
|
70
67
|
p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
|
71
68
|
@potential_titles << p if p
|
72
69
|
|
73
70
|
# we replace \u00a0 so why check for it?
|
74
|
-
#p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
75
|
-
|
71
|
+
# p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
72
|
+
# @potential_titles << p if p
|
76
73
|
|
77
74
|
add_potential_titles(title, /[ ]+[|][ ]+/, 4)
|
78
75
|
add_potential_titles(title, /[ ]+[-][ ]+/, 4)
|
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
|
|
90
87
|
|
91
88
|
parts.each do |part|
|
92
89
|
next if part =~ /[.]com/
|
90
|
+
|
93
91
|
num_words = number_of_words(part)
|
94
92
|
|
95
93
|
if num_words > longest_num_words || part.size > longest_part.size
|
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
|
|
107
105
|
|
108
106
|
parts.each do |part|
|
109
107
|
next if part =~ /[.]com/
|
108
|
+
|
110
109
|
num_words = number_of_words(part)
|
111
110
|
|
112
111
|
@potential_titles << part if num_words >= min_words
|
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
|
|
116
115
|
def number_of_words(s)
|
117
116
|
s.split(/[\b ]+/).size
|
118
117
|
end
|
119
|
-
|
120
118
|
end
|
121
119
|
end
|
@@ -1,10 +1,8 @@
|
|
1
|
-
|
2
1
|
# Marks all TextBlocks "content" which are between the headline and the part that has
|
3
2
|
# already been marked content, if they are marked MIGHT_BE_CONTENT.
|
4
3
|
# This filter is quite specific to the news domain.
|
5
4
|
# used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
|
6
5
|
|
7
|
-
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class ExpandTitleToContentFilter
|
10
8
|
def self.process(doc)
|
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
|
|
38
36
|
def self.no_title_with_subsequent_content?(content_start, title)
|
39
37
|
title.nil? || content_start.nil? || content_start <= title
|
40
38
|
end
|
41
|
-
|
42
39
|
end
|
43
40
|
end
|
@@ -1,12 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# Marks all blocks as "non-content" that occur after blocks that have been
|
2
|
+
# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
|
3
|
+
# number of words in content blocks occur before this mark (default: 60).
|
4
|
+
# This can be used in conjunction with an upstream TerminatingBlocksFinder.
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
|
8
|
-
|
9
|
-
def self.process(doc, min_num_words=60)
|
8
|
+
def self.process(doc, min_num_words = 60)
|
10
9
|
found_end_of_text = false
|
11
10
|
num_words = 0
|
12
11
|
|
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
|
|
19
18
|
|
20
19
|
doc
|
21
20
|
end
|
22
|
-
|
23
21
|
end
|
24
22
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Keeps the largest TextBlock only (by the number of words). In case of
|
3
2
|
# more than one block with the same number of words, the first block is chosen.
|
4
3
|
# All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class KeepLargestBlockFilter
|
11
|
-
|
12
10
|
def initialize(expand_to_same_level_text, min_words)
|
13
11
|
@expand_to_same_level_text = expand_to_same_level_text
|
14
12
|
@min_words = min_words
|
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
|
|
43
41
|
expand_tag_level(tbs[0...n].reverse, level, @min_words)
|
44
42
|
|
45
43
|
# expand blocks to the right
|
46
|
-
expand_tag_level(tbs[n+1..-1], level, @min_words)
|
44
|
+
expand_tag_level(tbs[n + 1..-1], level, @min_words)
|
47
45
|
end
|
48
46
|
end
|
49
47
|
|
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
|
|
57
55
|
end
|
58
56
|
end
|
59
57
|
end
|
60
|
-
|
61
58
|
end
|
62
59
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks all blocks as content that:
|
3
2
|
# are on the same tag-level as very likely main content
|
4
3
|
# (usually the level of the largest block)
|
@@ -7,23 +6,22 @@
|
|
7
6
|
|
8
7
|
module Boilerpipe::Filters
|
9
8
|
class LargeBlockSameTagLevelToContentFilter
|
10
|
-
|
11
9
|
def self.process(doc)
|
12
|
-
|
13
10
|
largest = doc.text_blocks.find do |tb|
|
14
11
|
tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
15
12
|
end
|
16
13
|
|
17
14
|
return doc if largest.nil?
|
15
|
+
|
18
16
|
tag_level = largest.tag_level
|
19
17
|
|
20
18
|
doc.text_blocks.each do |tb|
|
21
19
|
next if tb.is_content?
|
20
|
+
|
22
21
|
tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
|
23
22
|
end
|
24
23
|
|
25
24
|
doc
|
26
25
|
end
|
27
|
-
|
28
26
|
end
|
29
27
|
end
|
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
|
|
11
11
|
doc.text_blocks.each do |tb|
|
12
12
|
if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
13
13
|
tag_level = tb.tag_level
|
14
|
-
elsif
|
14
|
+
elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
|
15
15
|
tb.content = true
|
16
16
|
else
|
17
17
|
tag_level = MAX
|
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
|
|
20
20
|
|
21
21
|
doc
|
22
22
|
end
|
23
|
-
|
24
23
|
end
|
25
24
|
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
#
|
2
|
+
# Keeps only blocks that have at least one segment fragment ("clause") with at least k
|
3
|
+
# words (default: 5).
|
4
|
+
#
|
5
|
+
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
|
6
|
+
#
|
7
|
+
# See also: SplitParagraphBlocksFilter
|
8
|
+
|
9
|
+
module Boilerpipe::Filters
  # Demotes content blocks that contain no sufficiently long clause.
  #
  # A clause is a run of letters, digits, spaces and non-breaking spaces that
  # ends in one or more of , . : ; ! ? followed by whitespace or end of line.
  class MinClauseWordsFilter
    # Matches one clause, including its closing punctuation and any
    # whitespace directly after it.
    CLAUSE_PATTERN = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/

    # A run of spaces / line breaks; used to estimate the length of a clause.
    WHITESPACE_RUN = /[ \n\r]+/

    # Walks over the text blocks of +doc+ and sets +content+ to false on every
    # content block that has no clause accepted by +is_clause?+.
    # Blocks that are already non-content are left alone.
    #
    # doc       - document responding to +text_blocks+.
    # min_words - threshold handed to +is_clause?+ (default: 5).
    #
    # Returns +doc+.
    def self.process(doc, min_words = 5)
      doc.text_blocks.each do |tb|
        next if tb.is_not_content?

        clauses = tb.text.scan(CLAUSE_PATTERN)
        # Forward the caller's threshold; relying on the default of
        # +is_clause?+ here would silently ignore a custom +min_words+.
        tb.content = false unless clauses.any? { |clause| is_clause?(clause, min_words) }
      end

      doc
    end

    # Returns true when +text+ contains at least +min_words+ whitespace runs.
    # A nil +text+ is never a clause.
    #
    # NOTE(review): this counts whitespace runs, not words, so the whitespace
    # that trails a clause is counted as well -- confirm this is intended.
    def self.is_clause?(text, min_words = 5)
      return false if text.nil?

      text.scan(WHITESPACE_RUN).size >= min_words
    end
  end
end
|