boilerpipe-ruby 0.2.0 → 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +34 -1
- data/Dockerfile +14 -0
- data/README.md +32 -7
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +14 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
- data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
- data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
- data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
- data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +38 -25
@@ -0,0 +1,14 @@
|
|
1
|
+
# Keeps only those content blocks which contain at least k words.
|
2
|
+
|
3
|
+
module Boilerpipe::Filters
|
4
|
+
class MinWordsFilter
|
5
|
+
def self.process(min_words, doc)
|
6
|
+
doc.text_blocks.each do |tb|
|
7
|
+
next if tb.is_not_content?
|
8
|
+
|
9
|
+
tb.content = false if tb.num_words < min_words
|
10
|
+
end
|
11
|
+
doc
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Classifies TextBlocks as content/not-content through rules that have been determined
|
4
2
|
# using the C4.8 machine learning algorithm, as described in the paper
|
5
3
|
# "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
|
@@ -7,7 +5,6 @@
|
|
7
5
|
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class NumWordsRulesClassifier
|
10
|
-
|
11
8
|
def self.process(doc)
|
12
9
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
10
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
|
|
37
34
|
|
38
35
|
false
|
39
36
|
end
|
40
|
-
|
41
37
|
end
|
42
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class SimpleBlockFusionProcessor
|
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
doc.replace_text_blocks!(
|
20
|
+
doc.replace_text_blocks!(tbs - blocks_to_remove)
|
21
21
|
doc
|
22
22
|
end
|
23
23
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Splits TextBlocks at paragraph boundaries.
|
2
|
+
#
|
3
|
+
# NOTE: This is not fully supported (i.e., it will break highlighting support via
|
4
|
+
# #getContainedTextElements()), but this one probably is necessary for some other filters.
|
5
|
+
#
|
6
|
+
# see MinClauseWordsFilter
|
7
|
+
|
8
|
+
module Boilerpipe::Filters
|
9
|
+
class SplitParagraphBlocksFilter
|
10
|
+
def self.process(doc)
|
11
|
+
tbs = doc.text_blocks
|
12
|
+
new_blocks = []
|
13
|
+
changes = false
|
14
|
+
tbs.each do |tb|
|
15
|
+
paragraphs = tb.text.split(/[\n\r]+/)
|
16
|
+
|
17
|
+
if paragraphs.size < 2
|
18
|
+
new_blocks << tb
|
19
|
+
next
|
20
|
+
end
|
21
|
+
|
22
|
+
is_content = tb.is_content?
|
23
|
+
labels = tb.labels
|
24
|
+
paragraphs.each do |paragraph|
|
25
|
+
tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
|
26
|
+
tbP.content = is_content
|
27
|
+
tbP.add_labels(labels)
|
28
|
+
new_blocks << tbP
|
29
|
+
changes = true
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
doc.replace_text_blocks!(new_blocks) if changes
|
34
|
+
doc
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -1,15 +1,13 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Finds blocks which are potentially indicating the end of an article
|
4
2
|
# text and marks them with INDICATES_END_OF_TEXT. This can be used
|
5
3
|
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.
|
6
4
|
|
7
|
-
|
8
5
|
module Boilerpipe::Filters
|
9
6
|
class TerminatingBlocksFinder
|
10
7
|
def self.process(doc)
|
11
8
|
doc.text_blocks.each do |tb|
|
12
9
|
next unless tb.num_words < 15
|
10
|
+
|
13
11
|
if tb.text.length >= 8 && finds_match?(tb.text.downcase)
|
14
12
|
tb.labels << :INDICATES_END_OF_TEXT
|
15
13
|
elsif tb.link_density == 1.0 && tb.text == 'comment'
|
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
|
|
29
27
|
text.include?('what you think...') ||
|
30
28
|
text.include?('add your comment') ||
|
31
29
|
text.include?('add comment') ||
|
32
|
-
#TODO add this and test
|
33
|
-
#text.include?('leave a reply') ||
|
34
|
-
#text.include?('leave a comment') ||
|
35
|
-
#text.include?('show comments') ||
|
36
|
-
#text.include?('Share this:') ||
|
30
|
+
# TODO add this and test
|
31
|
+
# text.include?('leave a reply') ||
|
32
|
+
# text.include?('leave a comment') ||
|
33
|
+
# text.include?('show comments') ||
|
34
|
+
# text.include?('Share this:') ||
|
37
35
|
text.include?('reader views') ||
|
38
36
|
text.include?('have your say') ||
|
39
37
|
text.include?('reader comments') ||
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks trailing headlines TextBlocks that have the label :#HEADING
|
3
2
|
# as boilerplate. Trailing means they are marked content and are
|
4
3
|
# below any other content block.
|
@@ -6,7 +5,6 @@
|
|
6
5
|
module Boilerpipe::Filters
|
7
6
|
class TrailingHeadlineToBoilerplateFilter
|
8
7
|
def self.process(doc)
|
9
|
-
|
10
8
|
doc.text_blocks.each do |tb|
|
11
9
|
next unless tb.is_content?
|
12
10
|
|
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
|
|
19
17
|
|
20
18
|
doc
|
21
19
|
end
|
22
|
-
|
23
20
|
end
|
24
21
|
end
|
@@ -1,20 +1,16 @@
|
|
1
|
-
require 'nokogiri'
|
2
1
|
module Boilerpipe::SAX
|
3
2
|
class BoilerpipeHTMLParser
|
4
3
|
def self.parse(text)
|
5
|
-
|
6
|
-
|
7
|
-
text = text.gsub(/\<script>.+?<\/script>/i, '')
|
4
|
+
# script bug - delete script tags
|
5
|
+
text.gsub!(/\<script>.+?<\/script>/i, '')
|
8
6
|
|
9
7
|
# nokogiri uses libxml for mri and nekohtml for jruby
|
10
8
|
# mri doesn't remove &nbsp when missing the semicolon
|
11
|
-
text
|
12
|
-
|
9
|
+
text.gsub!(/(&nbsp) /, '\1; ')
|
13
10
|
|
14
11
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
15
12
|
text = Nokogiri::HTML(text).to_html
|
16
13
|
|
17
|
-
|
18
14
|
handler = HTMLContentHandler.new
|
19
15
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
20
16
|
noko_parser.parse(text)
|
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'set'
|
3
|
-
|
4
1
|
module Boilerpipe::SAX
|
5
2
|
class HTMLContentHandler < Nokogiri::XML::SAX::Document
|
6
3
|
attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
|
7
4
|
|
8
|
-
attr_accessor :in_anchor_tag, :token_buffer
|
5
|
+
attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
|
9
6
|
ANCHOR_TEXT_START = "$\ue00a<"
|
10
7
|
ANCHOR_TEXT_END = ">\ue00a$"
|
11
8
|
|
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
|
|
34
31
|
@label_stacks << nil
|
35
32
|
tag = name.upcase.intern
|
36
33
|
|
37
|
-
|
38
34
|
tag_action = @tag_actions[tag]
|
39
35
|
if tag_action
|
40
36
|
@tag_level += 1 if tag_action.changes_tag_level?
|
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
|
|
51
47
|
def characters(text)
|
52
48
|
flush_block if @flush
|
53
49
|
|
54
|
-
return if
|
50
|
+
return if in_ignorable_element?
|
55
51
|
return if text.empty?
|
56
52
|
|
57
53
|
# replace all whitespace with simple space
|
58
54
|
text.gsub!(/\s+/, ' ')
|
59
55
|
|
60
56
|
# trim whitespace
|
61
|
-
started_with_whitespace = text
|
62
|
-
ended_with_whitespace = text
|
57
|
+
started_with_whitespace = text =~ /^\s/
|
58
|
+
ended_with_whitespace = text =~ /\s$/
|
63
59
|
text.strip!
|
64
60
|
|
65
61
|
# add a single space if the block was only whitespace
|
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
|
|
158
154
|
end
|
159
155
|
|
160
156
|
text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
157
|
+
num_words,
|
158
|
+
num_linked_words,
|
159
|
+
num_words_in_wrapped_lines,
|
160
|
+
num_wrapped_lines, @offset_blocks)
|
165
161
|
|
166
162
|
@offset_blocks += 1
|
167
163
|
clear_buffers
|
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
|
|
187
183
|
# \p{No} -- a numeric character of other type
|
188
184
|
|
189
185
|
def is_word?(word)
|
190
|
-
|
186
|
+
word =~ VALID_WORD_CHARACTER
|
191
187
|
end
|
192
188
|
|
193
|
-
#public void flushBlock() {
|
189
|
+
# public void flushBlock() {
|
194
190
|
# int numWords = 0;
|
195
191
|
# int numLinkedWords = 0;
|
196
192
|
# int numWrappedLines = 0;
|
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
|
|
198
194
|
# final int maxLineLength = 80;
|
199
195
|
# int numTokens = 0;
|
200
196
|
# int numWordsCurrentLine = 0;
|
201
|
-
#}
|
197
|
+
# }
|
202
198
|
|
203
199
|
def increase_in_ignorable_element!
|
204
200
|
@in_ignorable_element += 1
|
205
201
|
end
|
206
202
|
|
203
|
+
# should we prevent less than zero here?
|
207
204
|
def decrease_in_ignorable_element!
|
208
205
|
@in_ignorable_element -= 1
|
209
206
|
end
|
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
|
|
224
221
|
@in_anchor_tag > 0
|
225
222
|
end
|
226
223
|
|
227
|
-
|
228
224
|
def add_text_block(text_block)
|
229
225
|
@label_stacks.each do |stack|
|
230
226
|
next unless stack
|
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
|
|
239
235
|
# append space if last character wasn't already one
|
240
236
|
def append_space
|
241
237
|
return if @sb_last_was_whitespace
|
238
|
+
|
242
239
|
@sb_last_was_whitespace = true
|
243
240
|
|
244
241
|
@text_buffer << ' '
|
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
|
|
2
2
|
class AnchorText
|
3
3
|
# Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
|
4
4
|
# There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
|
5
|
-
|
5
|
+
# * encounters such nestings, a SAXException is thrown.
|
6
6
|
def start(handler, name, attrs)
|
7
7
|
if handler.in_anchor_tag?
|
8
8
|
handler.in_anchor_tag += 1
|
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
|
|
42
42
|
# - dunno about nokogiri???????
|
43
43
|
# as nested A elements are not allowed per specification, we
|
44
44
|
# are probably reaching this branch due to a bug in the XML parser
|
45
|
-
#puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
45
|
+
# puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
46
46
|
end_tag(handler, name)
|
47
47
|
end
|
48
48
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Explicitly marks this tag a simple "block-level" element,
|
3
|
+
# which always generates whitespace
|
4
4
|
class BlockLevel
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
true
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
# for block-level elements, which triggers some LabelAction on
|
3
|
-
# the generated TextBlock.
|
2
|
+
# for block-level elements, which triggers some LabelAction on
|
3
|
+
# the generated TextBlock.
|
4
4
|
class BlockTagLabel
|
5
5
|
def initialize(label_action)
|
6
6
|
@label_action = label_action
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Marks this tag the body element (this should usually only
|
3
|
+
# be set for the <BODY> tag).
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
|
|
10
10
|
rel = m[1]
|
11
11
|
val = m[2].to_i # absolute
|
12
12
|
size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
|
13
|
-
handler.font_size_stack <<
|
13
|
+
handler.font_size_stack << size
|
14
14
|
else
|
15
15
|
handler.font_size_stack << nil
|
16
16
|
end
|
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def relative(font_size_stack, rel, val)
|
30
|
-
prev_size = font_size_stack.reverse_each.find{|s| s
|
30
|
+
prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
|
31
31
|
prev_size = 3 if prev_size.nil?
|
32
32
|
|
33
33
|
size = if rel == '+'
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,71 +16,71 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 12.3.3
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 12.3.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rickshaw
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.5.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.5.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '3.9'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '3.9'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: 1.
|
75
|
+
version: '1.10'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
83
|
-
description: A pure ruby implementation of the boilerpipe algorithm
|
82
|
+
version: '1.10'
|
83
|
+
description: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
84
84
|
email:
|
85
85
|
- "<gregory.ostermayr@gmail.com>"
|
86
86
|
executables: []
|
@@ -88,9 +88,11 @@ extensions: []
|
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
90
|
- ".circleci/config.yml"
|
91
|
+
- ".dockerignore"
|
91
92
|
- ".gitignore"
|
92
93
|
- ".rspec"
|
93
94
|
- CHANGELOG.md
|
95
|
+
- Dockerfile
|
94
96
|
- Gemfile
|
95
97
|
- LICENSE.txt
|
96
98
|
- README.md
|
@@ -98,14 +100,22 @@ files:
|
|
98
100
|
- bin/console
|
99
101
|
- bin/setup
|
100
102
|
- boilerpipe-ruby.gemspec
|
103
|
+
- boilerpipe_flow.md
|
101
104
|
- lib/boilerpipe.rb
|
102
105
|
- lib/boilerpipe/document/text_block.rb
|
103
106
|
- lib/boilerpipe/document/text_document.rb
|
104
107
|
- lib/boilerpipe/errors.rb
|
105
108
|
- lib/boilerpipe/extractors/article_extractor.rb
|
109
|
+
- lib/boilerpipe/extractors/article_sentence_extractor.rb
|
110
|
+
- lib/boilerpipe/extractors/canola_extractor.rb
|
106
111
|
- lib/boilerpipe/extractors/default_extractor.rb
|
112
|
+
- lib/boilerpipe/extractors/keep_everything_extractor.rb
|
113
|
+
- lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
|
114
|
+
- lib/boilerpipe/extractors/largest_content_extractor.rb
|
115
|
+
- lib/boilerpipe/extractors/num_words_rules_extractor.rb
|
107
116
|
- lib/boilerpipe/filters/block_proximity_fusion.rb
|
108
117
|
- lib/boilerpipe/filters/boilerplate_block_filter.rb
|
118
|
+
- lib/boilerpipe/filters/canola_classifier.rb
|
109
119
|
- lib/boilerpipe/filters/density_rules_classifier.rb
|
110
120
|
- lib/boilerpipe/filters/document_title_match_classifier.rb
|
111
121
|
- lib/boilerpipe/filters/expand_title_to_content_filter.rb
|
@@ -114,8 +124,12 @@ files:
|
|
114
124
|
- lib/boilerpipe/filters/keep_largest_block_filter.rb
|
115
125
|
- lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
|
116
126
|
- lib/boilerpipe/filters/list_at_end_filter.rb
|
127
|
+
- lib/boilerpipe/filters/mark_everything_content_filter.rb
|
128
|
+
- lib/boilerpipe/filters/min_clause_words_filter.rb
|
129
|
+
- lib/boilerpipe/filters/min_words_filter.rb
|
117
130
|
- lib/boilerpipe/filters/num_words_rules_classifier.rb
|
118
131
|
- lib/boilerpipe/filters/simple_block_fusion_processor.rb
|
132
|
+
- lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
|
119
133
|
- lib/boilerpipe/filters/terminating_blocks_finder.rb
|
120
134
|
- lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
|
121
135
|
- lib/boilerpipe/labels/default.rb
|
@@ -140,7 +154,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
|
|
140
154
|
licenses:
|
141
155
|
- Apache 2.0
|
142
156
|
metadata: {}
|
143
|
-
post_install_message:
|
157
|
+
post_install_message:
|
144
158
|
rdoc_options: []
|
145
159
|
require_paths:
|
146
160
|
- lib
|
@@ -155,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
155
169
|
- !ruby/object:Gem::Version
|
156
170
|
version: '0'
|
157
171
|
requirements: []
|
158
|
-
|
159
|
-
|
160
|
-
signing_key:
|
172
|
+
rubygems_version: 3.0.8
|
173
|
+
signing_key:
|
161
174
|
specification_version: 4
|
162
|
-
summary: A pure ruby
|
175
|
+
summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
163
176
|
test_files: []
|