boilerpipe-ruby 0.3.0 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +30 -1
- data/Dockerfile +14 -0
- data/README.md +15 -4
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +9 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
- data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
- data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +33 -25
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class SimpleBlockFusionProcessor
|
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
doc.replace_text_blocks!(
|
20
|
+
doc.replace_text_blocks!(tbs - blocks_to_remove)
|
21
21
|
doc
|
22
22
|
end
|
23
23
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# Splits TextBlocks at paragraph boundaries.
|
2
|
+
#
|
3
|
+
# NOTE: This is not fully supported (i.e., it will break highlighting support via
|
4
|
+
# #getContainedTextElements()), but this one probably is necessary for some other filters.
|
5
|
+
#
|
6
|
+
# see MinClauseWordsFilter
|
7
|
+
|
8
|
+
module Boilerpipe::Filters
|
9
|
+
class SplitParagraphBlocksFilter
|
10
|
+
def self.process(doc)
|
11
|
+
tbs = doc.text_blocks
|
12
|
+
new_blocks = []
|
13
|
+
changes = false
|
14
|
+
tbs.each do |tb|
|
15
|
+
paragraphs = tb.text.split(/[\n\r]+/)
|
16
|
+
|
17
|
+
if paragraphs.size < 2
|
18
|
+
new_blocks << tb
|
19
|
+
next
|
20
|
+
end
|
21
|
+
|
22
|
+
is_content = tb.is_content?
|
23
|
+
labels = tb.labels
|
24
|
+
paragraphs.each do |paragraph|
|
25
|
+
tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
|
26
|
+
tbP.content = is_content
|
27
|
+
tbP.add_labels(labels)
|
28
|
+
new_blocks << tbP
|
29
|
+
changes = true
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
doc.replace_text_blocks!(new_blocks) if changes
|
34
|
+
doc
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -1,15 +1,13 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Finds blocks which are potentially indicating the end of an article
|
4
2
|
# text and marks them with INDICATES_END_OF_TEXT. This can be used
|
5
3
|
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.
|
6
4
|
|
7
|
-
|
8
5
|
module Boilerpipe::Filters
|
9
6
|
class TerminatingBlocksFinder
|
10
7
|
def self.process(doc)
|
11
8
|
doc.text_blocks.each do |tb|
|
12
9
|
next unless tb.num_words < 15
|
10
|
+
|
13
11
|
if tb.text.length >= 8 && finds_match?(tb.text.downcase)
|
14
12
|
tb.labels << :INDICATES_END_OF_TEXT
|
15
13
|
elsif tb.link_density == 1.0 && tb.text == 'comment'
|
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
|
|
29
27
|
text.include?('what you think...') ||
|
30
28
|
text.include?('add your comment') ||
|
31
29
|
text.include?('add comment') ||
|
32
|
-
#TODO add this and test
|
33
|
-
#text.include?('leave a reply') ||
|
34
|
-
#text.include?('leave a comment') ||
|
35
|
-
#text.include?('show comments') ||
|
36
|
-
#text.include?('Share this:') ||
|
30
|
+
# TODO add this and test
|
31
|
+
# text.include?('leave a reply') ||
|
32
|
+
# text.include?('leave a comment') ||
|
33
|
+
# text.include?('show comments') ||
|
34
|
+
# text.include?('Share this:') ||
|
37
35
|
text.include?('reader views') ||
|
38
36
|
text.include?('have your say') ||
|
39
37
|
text.include?('reader comments') ||
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks trailing headlines TextBlocks that have the label :#HEADING
|
3
2
|
# as boilerplate. Trailing means they are marked content and are
|
4
3
|
# below any other content block.
|
@@ -6,7 +5,6 @@
|
|
6
5
|
module Boilerpipe::Filters
|
7
6
|
class TrailingHeadlineToBoilerplateFilter
|
8
7
|
def self.process(doc)
|
9
|
-
|
10
8
|
doc.text_blocks.each do |tb|
|
11
9
|
next unless tb.is_content?
|
12
10
|
|
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
|
|
19
17
|
|
20
18
|
doc
|
21
19
|
end
|
22
|
-
|
23
20
|
end
|
24
21
|
end
|
@@ -1,20 +1,11 @@
|
|
1
|
-
require 'nokogiri'
|
2
1
|
module Boilerpipe::SAX
|
3
2
|
class BoilerpipeHTMLParser
|
4
3
|
def self.parse(text)
|
5
|
-
|
6
|
-
|
7
|
-
text = text.gsub(/\<script>.+?<\/script>/i, '')
|
8
|
-
|
9
|
-
# nokogiri uses libxml for mri and nekohtml for jruby
|
10
|
-
# mri doesn't remove when missing the semicolon
|
11
|
-
text = text.gsub(/( ) /, '\1; ')
|
12
|
-
|
4
|
+
# strip out tags that cause issues
|
5
|
+
text = Preprocessor.strip(text)
|
13
6
|
|
14
7
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
15
8
|
text = Nokogiri::HTML(text).to_html
|
16
|
-
|
17
|
-
|
18
9
|
handler = HTMLContentHandler.new
|
19
10
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
20
11
|
noko_parser.parse(text)
|
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'set'
|
3
|
-
|
4
1
|
module Boilerpipe::SAX
|
5
2
|
class HTMLContentHandler < Nokogiri::XML::SAX::Document
|
6
3
|
attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
|
7
4
|
|
8
|
-
attr_accessor :in_anchor_tag, :token_buffer
|
5
|
+
attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
|
9
6
|
ANCHOR_TEXT_START = "$\ue00a<"
|
10
7
|
ANCHOR_TEXT_END = ">\ue00a$"
|
11
8
|
|
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
|
|
34
31
|
@label_stacks << nil
|
35
32
|
tag = name.upcase.intern
|
36
33
|
|
37
|
-
|
38
34
|
tag_action = @tag_actions[tag]
|
39
35
|
if tag_action
|
40
36
|
@tag_level += 1 if tag_action.changes_tag_level?
|
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
|
|
51
47
|
def characters(text)
|
52
48
|
flush_block if @flush
|
53
49
|
|
54
|
-
return if
|
50
|
+
return if in_ignorable_element?
|
55
51
|
return if text.empty?
|
56
52
|
|
57
53
|
# replace all whitespace with simple space
|
58
54
|
text.gsub!(/\s+/, ' ')
|
59
55
|
|
60
56
|
# trim whitespace
|
61
|
-
started_with_whitespace = text
|
62
|
-
ended_with_whitespace = text
|
57
|
+
started_with_whitespace = text =~ /^\s/
|
58
|
+
ended_with_whitespace = text =~ /\s$/
|
63
59
|
text.strip!
|
64
60
|
|
65
61
|
# add a single space if the block was only whitespace
|
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
|
|
158
154
|
end
|
159
155
|
|
160
156
|
text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
157
|
+
num_words,
|
158
|
+
num_linked_words,
|
159
|
+
num_words_in_wrapped_lines,
|
160
|
+
num_wrapped_lines, @offset_blocks)
|
165
161
|
|
166
162
|
@offset_blocks += 1
|
167
163
|
clear_buffers
|
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
|
|
187
183
|
# \p{No} -- a numeric character of other type
|
188
184
|
|
189
185
|
def is_word?(word)
|
190
|
-
|
186
|
+
word =~ VALID_WORD_CHARACTER
|
191
187
|
end
|
192
188
|
|
193
|
-
#public void flushBlock() {
|
189
|
+
# public void flushBlock() {
|
194
190
|
# int numWords = 0;
|
195
191
|
# int numLinkedWords = 0;
|
196
192
|
# int numWrappedLines = 0;
|
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
|
|
198
194
|
# final int maxLineLength = 80;
|
199
195
|
# int numTokens = 0;
|
200
196
|
# int numWordsCurrentLine = 0;
|
201
|
-
#}
|
197
|
+
# }
|
202
198
|
|
203
199
|
def increase_in_ignorable_element!
|
204
200
|
@in_ignorable_element += 1
|
205
201
|
end
|
206
202
|
|
203
|
+
# should we prevent less than zero here?
|
207
204
|
def decrease_in_ignorable_element!
|
208
205
|
@in_ignorable_element -= 1
|
209
206
|
end
|
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
|
|
224
221
|
@in_anchor_tag > 0
|
225
222
|
end
|
226
223
|
|
227
|
-
|
228
224
|
def add_text_block(text_block)
|
229
225
|
@label_stacks.each do |stack|
|
230
226
|
next unless stack
|
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
|
|
239
235
|
# append space if last character wasn't already one
|
240
236
|
def append_space
|
241
237
|
return if @sb_last_was_whitespace
|
238
|
+
|
242
239
|
@sb_last_was_whitespace = true
|
243
240
|
|
244
241
|
@text_buffer << ' '
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Boilerpipe::SAX
|
2
|
+
class Preprocessor
|
3
|
+
def self.strip(text)
|
4
|
+
# script bug - delete script tags
|
5
|
+
text = text.gsub(/\<script.+?<\/script>/im, '')
|
6
|
+
# nokogiri uses libxml for mri and nekohtml for jruby
|
7
|
+
# mri doesn't remove when missing the semicolon
|
8
|
+
text.gsub(/( ) /, '\1; ')
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
|
|
2
2
|
class AnchorText
|
3
3
|
# Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
|
4
4
|
# There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
|
5
|
-
|
5
|
+
# * encounters such nestings, a SAXException is thrown.
|
6
6
|
def start(handler, name, attrs)
|
7
7
|
if handler.in_anchor_tag?
|
8
8
|
handler.in_anchor_tag += 1
|
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
|
|
42
42
|
# - dunno about nokogiri???????
|
43
43
|
# as nested A elements are not allowed per specification, we
|
44
44
|
# are probably reaching this branch due to a bug in the XML parser
|
45
|
-
#puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
45
|
+
# puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
46
46
|
end_tag(handler, name)
|
47
47
|
end
|
48
48
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Explicitly marks this tag a simple "block-level" element,
|
3
|
+
# which always generates whitespace
|
4
4
|
class BlockLevel
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
true
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
# for block-level elements, which triggers some LabelAction on
|
3
|
-
# the generated TextBlock.
|
2
|
+
# for block-level elements, which triggers some LabelAction on
|
3
|
+
# the generated TextBlock.
|
4
4
|
class BlockTagLabel
|
5
5
|
def initialize(label_action)
|
6
6
|
@label_action = label_action
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Marks this tag the body element (this should usually only
|
3
|
+
# be set for the <BODY> tag).
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
|
|
10
10
|
rel = m[1]
|
11
11
|
val = m[2].to_i # absolute
|
12
12
|
size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
|
13
|
-
handler.font_size_stack <<
|
13
|
+
handler.font_size_stack << size
|
14
14
|
else
|
15
15
|
handler.font_size_stack << nil
|
16
16
|
end
|
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def relative(font_size_stack, rel, val)
|
30
|
-
prev_size = font_size_stack.reverse_each.find{|s| s
|
30
|
+
prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
|
31
31
|
prev_size = 3 if prev_size.nil?
|
32
32
|
|
33
33
|
size = if rel == '+'
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,71 +16,71 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 12.3.3
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 12.3.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rickshaw
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.5.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.5.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '3.10'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '3.10'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- - "
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: 1.
|
75
|
+
version: '1.10'
|
76
76
|
type: :runtime
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- - "
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
83
|
-
description: A pure ruby implementation of the boilerpipe algorithm
|
82
|
+
version: '1.10'
|
83
|
+
description: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
84
84
|
email:
|
85
85
|
- "<gregory.ostermayr@gmail.com>"
|
86
86
|
executables: []
|
@@ -88,9 +88,11 @@ extensions: []
|
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
90
|
- ".circleci/config.yml"
|
91
|
+
- ".dockerignore"
|
91
92
|
- ".gitignore"
|
92
93
|
- ".rspec"
|
93
94
|
- CHANGELOG.md
|
95
|
+
- Dockerfile
|
94
96
|
- Gemfile
|
95
97
|
- LICENSE.txt
|
96
98
|
- README.md
|
@@ -98,14 +100,17 @@ files:
|
|
98
100
|
- bin/console
|
99
101
|
- bin/setup
|
100
102
|
- boilerpipe-ruby.gemspec
|
103
|
+
- boilerpipe_flow.md
|
101
104
|
- lib/boilerpipe.rb
|
102
105
|
- lib/boilerpipe/document/text_block.rb
|
103
106
|
- lib/boilerpipe/document/text_document.rb
|
104
107
|
- lib/boilerpipe/errors.rb
|
105
108
|
- lib/boilerpipe/extractors/article_extractor.rb
|
109
|
+
- lib/boilerpipe/extractors/article_sentence_extractor.rb
|
106
110
|
- lib/boilerpipe/extractors/canola_extractor.rb
|
107
111
|
- lib/boilerpipe/extractors/default_extractor.rb
|
108
112
|
- lib/boilerpipe/extractors/keep_everything_extractor.rb
|
113
|
+
- lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
|
109
114
|
- lib/boilerpipe/extractors/largest_content_extractor.rb
|
110
115
|
- lib/boilerpipe/extractors/num_words_rules_extractor.rb
|
111
116
|
- lib/boilerpipe/filters/block_proximity_fusion.rb
|
@@ -120,14 +125,18 @@ files:
|
|
120
125
|
- lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
|
121
126
|
- lib/boilerpipe/filters/list_at_end_filter.rb
|
122
127
|
- lib/boilerpipe/filters/mark_everything_content_filter.rb
|
128
|
+
- lib/boilerpipe/filters/min_clause_words_filter.rb
|
129
|
+
- lib/boilerpipe/filters/min_words_filter.rb
|
123
130
|
- lib/boilerpipe/filters/num_words_rules_classifier.rb
|
124
131
|
- lib/boilerpipe/filters/simple_block_fusion_processor.rb
|
132
|
+
- lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
|
125
133
|
- lib/boilerpipe/filters/terminating_blocks_finder.rb
|
126
134
|
- lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
|
127
135
|
- lib/boilerpipe/labels/default.rb
|
128
136
|
- lib/boilerpipe/labels/label_action.rb
|
129
137
|
- lib/boilerpipe/sax/boilerpipe_html_parser.rb
|
130
138
|
- lib/boilerpipe/sax/html_content_handler.rb
|
139
|
+
- lib/boilerpipe/sax/preprocessor.rb
|
131
140
|
- lib/boilerpipe/sax/tag_action_map.rb
|
132
141
|
- lib/boilerpipe/sax/tag_actions/anchor_text.rb
|
133
142
|
- lib/boilerpipe/sax/tag_actions/block_level.rb
|
@@ -146,7 +155,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
|
|
146
155
|
licenses:
|
147
156
|
- Apache 2.0
|
148
157
|
metadata: {}
|
149
|
-
post_install_message:
|
158
|
+
post_install_message:
|
150
159
|
rdoc_options: []
|
151
160
|
require_paths:
|
152
161
|
- lib
|
@@ -161,9 +170,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
161
170
|
- !ruby/object:Gem::Version
|
162
171
|
version: '0'
|
163
172
|
requirements: []
|
164
|
-
|
165
|
-
|
166
|
-
signing_key:
|
173
|
+
rubygems_version: 3.0.8
|
174
|
+
signing_key:
|
167
175
|
specification_version: 4
|
168
|
-
summary: A pure ruby
|
176
|
+
summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
169
177
|
test_files: []
|