boilerpipe-ruby 0.2.0 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +34 -1
- data/Dockerfile +14 -0
- data/README.md +32 -7
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +14 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
- data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
- data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
- data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
- data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +38 -25
@@ -0,0 +1,14 @@
|
|
1
|
+
# Keeps only those content blocks which contain at least k words.
|
2
|
+
|
3
|
+
module Boilerpipe::Filters
|
4
|
+
class MinWordsFilter
|
5
|
+
def self.process(min_words, doc)
|
6
|
+
doc.text_blocks.each do |tb|
|
7
|
+
next if tb.is_not_content?
|
8
|
+
|
9
|
+
tb.content = false if tb.num_words < min_words
|
10
|
+
end
|
11
|
+
doc
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Classifies TextBlocks as content/not-content through rules that have been determined
|
4
2
|
# using the C4.8 machine learning algorithm, as described in the paper
|
5
3
|
# "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
|
@@ -7,7 +5,6 @@
|
|
7
5
|
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class NumWordsRulesClassifier
|
10
|
-
|
11
8
|
def self.process(doc)
|
12
9
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
10
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
|
|
37
34
|
|
38
35
|
false
|
39
36
|
end
|
40
|
-
|
41
37
|
end
|
42
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class SimpleBlockFusionProcessor
|
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
doc.replace_text_blocks!(
|
20
|
+
doc.replace_text_blocks!(tbs - blocks_to_remove)
|
21
21
|
doc
|
22
22
|
end
|
23
23
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Splits TextBlocks at paragraph boundaries.
|
2
|
+
#
|
3
|
+
# NOTE: This is not fully supported (i.e., it will break highlighting support via
|
4
|
+
# #getContainedTextElements()), but this one probably is necessary for some other filters.
|
5
|
+
#
|
6
|
+
# see MinClauseWordsFilter
|
7
|
+
|
8
|
+
module Boilerpipe::Filters
|
9
|
+
class SplitParagraphBlocksFilter
|
10
|
+
def self.process(doc)
|
11
|
+
tbs = doc.text_blocks
|
12
|
+
new_blocks = []
|
13
|
+
changes = false
|
14
|
+
tbs.each do |tb|
|
15
|
+
paragraphs = tb.text.split(/[\n\r]+/)
|
16
|
+
|
17
|
+
if paragraphs.size < 2
|
18
|
+
new_blocks << tb
|
19
|
+
next
|
20
|
+
end
|
21
|
+
|
22
|
+
is_content = tb.is_content?
|
23
|
+
labels = tb.labels
|
24
|
+
paragraphs.each do |paragraph|
|
25
|
+
tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
|
26
|
+
tbP.content = is_content
|
27
|
+
tbP.add_labels(labels)
|
28
|
+
new_blocks << tbP
|
29
|
+
changes = true
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
doc.replace_text_blocks!(new_blocks) if changes
|
34
|
+
doc
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -1,15 +1,13 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Finds blocks which are potentially indicating the end of an article
|
4
2
|
# text and marks them with INDICATES_END_OF_TEXT. This can be used
|
5
3
|
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.
|
6
4
|
|
7
|
-
|
8
5
|
module Boilerpipe::Filters
|
9
6
|
class TerminatingBlocksFinder
|
10
7
|
def self.process(doc)
|
11
8
|
doc.text_blocks.each do |tb|
|
12
9
|
next unless tb.num_words < 15
|
10
|
+
|
13
11
|
if tb.text.length >= 8 && finds_match?(tb.text.downcase)
|
14
12
|
tb.labels << :INDICATES_END_OF_TEXT
|
15
13
|
elsif tb.link_density == 1.0 && tb.text == 'comment'
|
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
|
|
29
27
|
text.include?('what you think...') ||
|
30
28
|
text.include?('add your comment') ||
|
31
29
|
text.include?('add comment') ||
|
32
|
-
#TODO add this and test
|
33
|
-
#text.include?('leave a reply') ||
|
34
|
-
#text.include?('leave a comment') ||
|
35
|
-
#text.include?('show comments') ||
|
36
|
-
#text.include?('Share this:') ||
|
30
|
+
# TODO add this and test
|
31
|
+
# text.include?('leave a reply') ||
|
32
|
+
# text.include?('leave a comment') ||
|
33
|
+
# text.include?('show comments') ||
|
34
|
+
# text.include?('Share this:') ||
|
37
35
|
text.include?('reader views') ||
|
38
36
|
text.include?('have your say') ||
|
39
37
|
text.include?('reader comments') ||
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks trailing headlines TextBlocks that have the label :#HEADING
|
3
2
|
# as boilerplate. Trailing means they are marked content and are
|
4
3
|
# below any other content block.
|
@@ -6,7 +5,6 @@
|
|
6
5
|
module Boilerpipe::Filters
|
7
6
|
class TrailingHeadlineToBoilerplateFilter
|
8
7
|
def self.process(doc)
|
9
|
-
|
10
8
|
doc.text_blocks.each do |tb|
|
11
9
|
next unless tb.is_content?
|
12
10
|
|
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
|
|
19
17
|
|
20
18
|
doc
|
21
19
|
end
|
22
|
-
|
23
20
|
end
|
24
21
|
end
|
@@ -1,20 +1,16 @@
|
|
1
|
-
require 'nokogiri'
|
2
1
|
module Boilerpipe::SAX
|
3
2
|
class BoilerpipeHTMLParser
|
4
3
|
def self.parse(text)
|
5
|
-
|
6
|
-
|
7
|
-
text = text.gsub(/\<script>.+?<\/script>/i, '')
|
4
|
+
# script bug - delete script tags
|
5
|
+
text.gsub!(/\<script>.+?<\/script>/i, '')
|
8
6
|
|
9
7
|
# nokogiri uses libxml for mri and nekohtml for jruby
|
10
8
|
# mri doesn't remove &nbsp when missing the semicolon
|
11
|
-
text
|
12
|
-
|
9
|
+
text.gsub!(/(&nbsp) /, '\1; ')
|
13
10
|
|
14
11
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
15
12
|
text = Nokogiri::HTML(text).to_html
|
16
13
|
|
17
|
-
|
18
14
|
handler = HTMLContentHandler.new
|
19
15
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
20
16
|
noko_parser.parse(text)
|
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'set'
|
3
|
-
|
4
1
|
module Boilerpipe::SAX
|
5
2
|
class HTMLContentHandler < Nokogiri::XML::SAX::Document
|
6
3
|
attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
|
7
4
|
|
8
|
-
attr_accessor :in_anchor_tag, :token_buffer
|
5
|
+
attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
|
9
6
|
ANCHOR_TEXT_START = "$\ue00a<"
|
10
7
|
ANCHOR_TEXT_END = ">\ue00a$"
|
11
8
|
|
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
|
|
34
31
|
@label_stacks << nil
|
35
32
|
tag = name.upcase.intern
|
36
33
|
|
37
|
-
|
38
34
|
tag_action = @tag_actions[tag]
|
39
35
|
if tag_action
|
40
36
|
@tag_level += 1 if tag_action.changes_tag_level?
|
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
|
|
51
47
|
def characters(text)
|
52
48
|
flush_block if @flush
|
53
49
|
|
54
|
-
return if
|
50
|
+
return if in_ignorable_element?
|
55
51
|
return if text.empty?
|
56
52
|
|
57
53
|
# replace all whitespace with simple space
|
58
54
|
text.gsub!(/\s+/, ' ')
|
59
55
|
|
60
56
|
# trim whitespace
|
61
|
-
started_with_whitespace = text
|
62
|
-
ended_with_whitespace = text
|
57
|
+
started_with_whitespace = text =~ /^\s/
|
58
|
+
ended_with_whitespace = text =~ /\s$/
|
63
59
|
text.strip!
|
64
60
|
|
65
61
|
# add a single space if the block was only whitespace
|
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
|
|
158
154
|
end
|
159
155
|
|
160
156
|
text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
157
|
+
num_words,
|
158
|
+
num_linked_words,
|
159
|
+
num_words_in_wrapped_lines,
|
160
|
+
num_wrapped_lines, @offset_blocks)
|
165
161
|
|
166
162
|
@offset_blocks += 1
|
167
163
|
clear_buffers
|
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
|
|
187
183
|
# \p{No} -- a numeric character of other type
|
188
184
|
|
189
185
|
def is_word?(word)
|
190
|
-
|
186
|
+
word =~ VALID_WORD_CHARACTER
|
191
187
|
end
|
192
188
|
|
193
|
-
#public void flushBlock() {
|
189
|
+
# public void flushBlock() {
|
194
190
|
# int numWords = 0;
|
195
191
|
# int numLinkedWords = 0;
|
196
192
|
# int numWrappedLines = 0;
|
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
|
|
198
194
|
# final int maxLineLength = 80;
|
199
195
|
# int numTokens = 0;
|
200
196
|
# int numWordsCurrentLine = 0;
|
201
|
-
#}
|
197
|
+
# }
|
202
198
|
|
203
199
|
def increase_in_ignorable_element!
|
204
200
|
@in_ignorable_element += 1
|
205
201
|
end
|
206
202
|
|
203
|
+
# should we prevent less than zero here?
|
207
204
|
def decrease_in_ignorable_element!
|
208
205
|
@in_ignorable_element -= 1
|
209
206
|
end
|
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
|
|
224
221
|
@in_anchor_tag > 0
|
225
222
|
end
|
226
223
|
|
227
|
-
|
228
224
|
def add_text_block(text_block)
|
229
225
|
@label_stacks.each do |stack|
|
230
226
|
next unless stack
|
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
|
|
239
235
|
# append space if last character wasn't already one
|
240
236
|
def append_space
|
241
237
|
return if @sb_last_was_whitespace
|
238
|
+
|
242
239
|
@sb_last_was_whitespace = true
|
243
240
|
|
244
241
|
@text_buffer << ' '
|
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
|
|
2
2
|
class AnchorText
|
3
3
|
# Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
|
4
4
|
# There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
|
5
|
-
|
5
|
+
# encounters such nestings, a SAXException is thrown.
|
6
6
|
def start(handler, name, attrs)
|
7
7
|
if handler.in_anchor_tag?
|
8
8
|
handler.in_anchor_tag += 1
|
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
|
|
42
42
|
# - dunno about nokogiri???????
|
43
43
|
# as nested A elements are not allowed per specification, we
|
44
44
|
# are probably reaching this branch due to a bug in the XML parser
|
45
|
-
#puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
45
|
+
# puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
46
46
|
end_tag(handler, name)
|
47
47
|
end
|
48
48
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Explicitly marks this tag as a simple "block-level" element,
|
3
|
+
# which always generates whitespace
|
4
4
|
class BlockLevel
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
true
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
# for block-level elements, which triggers some LabelAction on
|
3
|
-
# the generated TextBlock.
|
2
|
+
# for block-level elements, which triggers some LabelAction on
|
3
|
+
# the generated TextBlock.
|
4
4
|
class BlockTagLabel
|
5
5
|
def initialize(label_action)
|
6
6
|
@label_action = label_action
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Marks this tag as the body element (this should usually only
|
3
|
+
# be set for the <BODY> tag).
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
|
|
10
10
|
rel = m[1]
|
11
11
|
val = m[2].to_i # absolute
|
12
12
|
size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
|
13
|
-
handler.font_size_stack <<
|
13
|
+
handler.font_size_stack << size
|
14
14
|
else
|
15
15
|
handler.font_size_stack << nil
|
16
16
|
end
|
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def relative(font_size_stack, rel, val)
|
30
|
-
prev_size = font_size_stack.reverse_each.find{|s| s
|
30
|
+
prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
|
31
31
|
prev_size = 3 if prev_size.nil?
|
32
32
|
|
33
33
|
size = if rel == '+'
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,71 +16,71 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 12.3.3
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 12.3.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rickshaw
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.5.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.5.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '3.9'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '3.9'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: 1.
|
75
|
+
version: '1.10'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
83
|
-
description: A pure ruby implementation of the boilerpipe algorithm
|
82
|
+
version: '1.10'
|
83
|
+
description: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
84
84
|
email:
|
85
85
|
- "<gregory.ostermayr@gmail.com>"
|
86
86
|
executables: []
|
@@ -88,9 +88,11 @@ extensions: []
|
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
90
|
- ".circleci/config.yml"
|
91
|
+
- ".dockerignore"
|
91
92
|
- ".gitignore"
|
92
93
|
- ".rspec"
|
93
94
|
- CHANGELOG.md
|
95
|
+
- Dockerfile
|
94
96
|
- Gemfile
|
95
97
|
- LICENSE.txt
|
96
98
|
- README.md
|
@@ -98,14 +100,22 @@ files:
|
|
98
100
|
- bin/console
|
99
101
|
- bin/setup
|
100
102
|
- boilerpipe-ruby.gemspec
|
103
|
+
- boilerpipe_flow.md
|
101
104
|
- lib/boilerpipe.rb
|
102
105
|
- lib/boilerpipe/document/text_block.rb
|
103
106
|
- lib/boilerpipe/document/text_document.rb
|
104
107
|
- lib/boilerpipe/errors.rb
|
105
108
|
- lib/boilerpipe/extractors/article_extractor.rb
|
109
|
+
- lib/boilerpipe/extractors/article_sentence_extractor.rb
|
110
|
+
- lib/boilerpipe/extractors/canola_extractor.rb
|
106
111
|
- lib/boilerpipe/extractors/default_extractor.rb
|
112
|
+
- lib/boilerpipe/extractors/keep_everything_extractor.rb
|
113
|
+
- lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
|
114
|
+
- lib/boilerpipe/extractors/largest_content_extractor.rb
|
115
|
+
- lib/boilerpipe/extractors/num_words_rules_extractor.rb
|
107
116
|
- lib/boilerpipe/filters/block_proximity_fusion.rb
|
108
117
|
- lib/boilerpipe/filters/boilerplate_block_filter.rb
|
118
|
+
- lib/boilerpipe/filters/canola_classifier.rb
|
109
119
|
- lib/boilerpipe/filters/density_rules_classifier.rb
|
110
120
|
- lib/boilerpipe/filters/document_title_match_classifier.rb
|
111
121
|
- lib/boilerpipe/filters/expand_title_to_content_filter.rb
|
@@ -114,8 +124,12 @@ files:
|
|
114
124
|
- lib/boilerpipe/filters/keep_largest_block_filter.rb
|
115
125
|
- lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
|
116
126
|
- lib/boilerpipe/filters/list_at_end_filter.rb
|
127
|
+
- lib/boilerpipe/filters/mark_everything_content_filter.rb
|
128
|
+
- lib/boilerpipe/filters/min_clause_words_filter.rb
|
129
|
+
- lib/boilerpipe/filters/min_words_filter.rb
|
117
130
|
- lib/boilerpipe/filters/num_words_rules_classifier.rb
|
118
131
|
- lib/boilerpipe/filters/simple_block_fusion_processor.rb
|
132
|
+
- lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
|
119
133
|
- lib/boilerpipe/filters/terminating_blocks_finder.rb
|
120
134
|
- lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
|
121
135
|
- lib/boilerpipe/labels/default.rb
|
@@ -140,7 +154,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
|
|
140
154
|
licenses:
|
141
155
|
- Apache 2.0
|
142
156
|
metadata: {}
|
143
|
-
post_install_message:
|
157
|
+
post_install_message:
|
144
158
|
rdoc_options: []
|
145
159
|
require_paths:
|
146
160
|
- lib
|
@@ -155,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
155
169
|
- !ruby/object:Gem::Version
|
156
170
|
version: '0'
|
157
171
|
requirements: []
|
158
|
-
|
159
|
-
|
160
|
-
signing_key:
|
172
|
+
rubygems_version: 3.0.8
|
173
|
+
signing_key:
|
161
174
|
specification_version: 4
|
162
|
-
summary: A pure ruby
|
175
|
+
summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
163
176
|
test_files: []
|