boilerpipe-ruby 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +27 -6
- data/Rakefile +8 -0
- data/boilerpipe-ruby.gemspec +10 -9
- data/lib/boilerpipe.rb +30 -0
- data/lib/boilerpipe/document/text_block.rb +113 -0
- data/lib/boilerpipe/document/text_document.rb +44 -0
- data/lib/boilerpipe/errors.rb +1 -0
- data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
- data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
- data/lib/boilerpipe/labels/default.rb +17 -0
- data/lib/boilerpipe/labels/label_action.rb +17 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
- data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
- data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
- data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
- data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
- data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
- data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
- data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
- data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
- data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- data/stuff.txt +4 -0
- metadata +61 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5aed62a42276e97a3e40126609fc51bf624b091f
|
4
|
+
data.tar.gz: 357afb75e661083011013b0ebdca88103e4a188e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a64ef6c16a1f1aa5dc44f60c0c94f0ee8cd02876a549e6433b19db617207e64c258534f6b708bece4c0749c7fee50ec15f114c64cb8ee993b154ede4764c2e2b
|
7
|
+
data.tar.gz: d9d52e256767a6553a7d19fe6ad8711610cc3fa7beea5bef7fdc86c28d25a9f5ab00e3486aff7942251376907322f11f643c879d96a136b3e9a969fca8c23a25
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,15 +1,29 @@
|
|
1
|
-
# Boilerpipe
|
1
|
+
# Boilerpipe
|
2
2
|
|
3
|
-
|
3
|
+
A pure Ruby implementation of the boilerpipe algorithm.
|
4
4
|
|
5
|
-
|
5
|
+
This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
|
6
|
+
|
7
|
+
I went directly to the original author's GitHub repository, https://github.com/kohlschutter/boilerpipe, and forked that code base here: https://github.com/gregors/boilerpipe.
|
8
|
+
|
9
|
+
I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-web.appspot.com), but depending on the time of day the API goes down because it exceeds its hosting plan. I also checked out some gems that use JRuby, but I ran into all kinds of dependency and bug issues. So I made some tweaks on my fork and created a new [jruby-boilerpipe gem](https://rubygems.org/gems/jruby-boilerpipe).
|
10
|
+
|
11
|
+
This solution works great if you're using JRuby, but I wanted a pure Ruby solution to use on MRI. Open vim - start coding...
|
12
|
+
|
13
|
+
I've only got the ArticleExtractor working, but the others should follow quickly, as the ArticleExtractor definitely has the most code behind it...
|
14
|
+
|
15
|
+
Presently the following Extractors are implemented:
|
16
|
+
* [x] ArticleExtractor
|
17
|
+
* [ ] DefaultExtractor
|
18
|
+
* [ ] LargestContentExtractor
|
19
|
+
* [ ] KeepEverythingExtractor
|
6
20
|
|
7
21
|
## Installation
|
8
22
|
|
9
23
|
Add this line to your application's Gemfile:
|
10
24
|
|
11
25
|
```ruby
|
12
|
-
gem 'boilerpipe-ruby'
|
26
|
+
gem 'boilerpipe-ruby', require: 'boilerpipe'
|
13
27
|
```
|
14
28
|
|
15
29
|
And then execute:
|
@@ -22,7 +36,14 @@ Or install it yourself as:
|
|
22
36
|
|
23
37
|
## Usage
|
24
38
|
|
25
|
-
|
39
|
+
gregors$ irb
|
40
|
+
> require 'boilerpipe'
|
41
|
+
=> true
|
42
|
+
> require 'open-uri'
|
43
|
+
=> true
|
44
|
+
> content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
|
45
|
+
> output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
|
46
|
+
=> "Always Squash and Rebase your Git Commits"
|
26
47
|
|
27
48
|
## Development
|
28
49
|
|
@@ -32,5 +53,5 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
32
53
|
|
33
54
|
## Contributing
|
34
55
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
56
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/gregors/boilerpipe-ruby.
|
36
57
|
|
data/Rakefile
CHANGED
@@ -4,3 +4,11 @@ require "rspec/core/rake_task"
|
|
4
4
|
RSpec::Core::RakeTask.new(:spec)
|
5
5
|
|
6
6
|
task :default => :spec
|
7
|
+
|
8
|
+
|
9
|
+
# Downloads the forked boilerpipe jar into spec/sanity_checks/jars/.
desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
task :download_boilerpipe_jar do
  jar_dir = 'spec/sanity_checks/jars/'
  jar_url = 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'

  FileUtils.mkdir_p jar_dir

  # Block form restores the previous working directory when it finishes, so
  # any task that runs later in the same rake process is not left inside the
  # jars directory.
  Dir.chdir(jar_dir) do
    # `sh` aborts the task when wget exits non-zero; the backtick form
    # discarded the exit status, so a failed download went unnoticed.
    sh 'wget', jar_url
  end
end
|
data/boilerpipe-ruby.gemspec
CHANGED
@@ -4,21 +4,22 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'boilerpipe/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
7
|
+
spec.name = 'boilerpipe-ruby'
|
8
8
|
spec.version = Boilerpipe::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
9
|
+
spec.authors = ['Gregory Ostermayr']
|
10
|
+
spec.email = ['<gregory.ostermayr@gmail.com>']
|
11
11
|
|
12
|
-
spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm
|
13
|
-
spec.description = %q{A pure ruby implementation of the boilerpipe
|
14
|
-
spec.homepage =
|
12
|
+
spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
|
13
|
+
spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
|
14
|
+
spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
|
15
15
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
17
17
|
spec.bindir = "exe"
|
18
18
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_development_dependency
|
22
|
-
spec.add_development_dependency
|
23
|
-
spec.add_development_dependency
|
21
|
+
spec.add_development_dependency 'bundler', '~> 1.11'
|
22
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
23
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
24
|
+
spec.add_runtime_dependency 'nokogiri', '1.6.6.2'
|
24
25
|
end
|
data/lib/boilerpipe.rb
CHANGED
@@ -1,2 +1,32 @@
|
|
1
1
|
require 'boilerpipe/version'
|
2
2
|
require 'boilerpipe/util/unicode_tokenizer'
|
3
|
+
require 'boilerpipe/document/text_document'
|
4
|
+
require 'boilerpipe/document/text_block'
|
5
|
+
require 'boilerpipe/extractors/article_extractor'
|
6
|
+
require 'boilerpipe/filters/block_proximity_fusion'
|
7
|
+
require 'boilerpipe/filters/boilerplate_block_filter'
|
8
|
+
require 'boilerpipe/filters/document_title_match_classifier'
|
9
|
+
require 'boilerpipe/filters/expand_title_to_content_filter'
|
10
|
+
require 'boilerpipe/filters/heuristic_filter_base'
|
11
|
+
require 'boilerpipe/filters/ignore_blocks_after_content_filter'
|
12
|
+
require 'boilerpipe/filters/keep_largest_block_filter'
|
13
|
+
require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
|
14
|
+
require 'boilerpipe/filters/list_at_end_filter'
|
15
|
+
require 'boilerpipe/filters/num_words_rules_classifier'
|
16
|
+
require 'boilerpipe/filters/terminating_blocks_finder'
|
17
|
+
require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
18
|
+
require 'boilerpipe/labels/default'
|
19
|
+
require 'boilerpipe/labels/label_action'
|
20
|
+
require 'boilerpipe/sax/html_content_handler'
|
21
|
+
require 'boilerpipe/sax/boilerpipe_html_parser'
|
22
|
+
require 'boilerpipe/sax/tag_action_map'
|
23
|
+
require 'boilerpipe/sax/tag_actions/chained'
|
24
|
+
require 'boilerpipe/sax/tag_actions/ignorable_element'
|
25
|
+
require 'boilerpipe/sax/tag_actions/anchor_text'
|
26
|
+
require 'boilerpipe/sax/tag_actions/body'
|
27
|
+
require 'boilerpipe/sax/tag_actions/inline_whitespace'
|
28
|
+
require 'boilerpipe/sax/tag_actions/inline_no_whitespace'
|
29
|
+
require 'boilerpipe/sax/tag_actions/block_level'
|
30
|
+
require 'boilerpipe/sax/tag_actions/font'
|
31
|
+
require 'boilerpipe/sax/tag_actions/inline_tag_label'
|
32
|
+
require 'boilerpipe/sax/tag_actions/block_tag_label'
|
@@ -0,0 +1,113 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Boilerpipe
|
4
|
+
module Document
|
5
|
+
# A run of text from a document together with the word/line counters and the
# densities derived from them, a content flag, a set of labels and a tag level.
class TextBlock

  #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

  attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
    :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
    :link_density, :labels, :tag_level, :num_full_text_words

  attr_accessor :content

  # text                       - the block's text
  # num_words                  - word count of the block
  # num_words_in_anchor_text   - how many of those words are anchor text
  # num_words_in_wrapped_lines - words in wrapped lines (0 means "use num_words")
  # num_wrapped_lines          - number of wrapped lines
  # offset_blocks              - used as both the start and the end block offset
  def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=0, offset_blocks=0)
    @labels = Set.new
    @text = text
    @num_words = num_words
    @num_words_in_anchor_text = num_words_in_anchor_text
    @num_words_in_wrapped_lines = num_words_in_wrapped_lines
    @num_wrapped_lines = num_wrapped_lines
    @num_full_text_words = 0
    @offset_blocks_start = offset_blocks
    @offset_blocks_end = offset_blocks
    @content = false
    @tag_level = 0

    init_densities
  end

  # Returns an empty block whose start and end offsets are both -1.
  def self.empty_start
    new('', 0, 0, 0, 0, -1)
  end

  def set_tag_level(level)
    @tag_level = level
  end

  def is_content?
    @content
  end

  def is_not_content?
    !is_content?
  end

  def add_label(label)
    @labels << label
  end

  def add_labels(labels)
    labels.each { |label| add_label(label) }
  end

  def has_label?(label)
    @labels.include?(label)
  end

  def remove_label(label)
    @labels.delete(label)
  end

  # Folds +other+ into this block: texts are joined with a newline, counters
  # are summed, the offset range is widened to cover both blocks, densities
  # are recomputed, the content flag is OR-ed, labels are unioned and the
  # smaller of the two tag levels is kept.
  def merge_next(other)
    @text = "#{@text}\n#{other.text}"
    @num_words += other.num_words
    @num_words_in_anchor_text += other.num_words_in_anchor_text
    @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
    @num_wrapped_lines += other.num_wrapped_lines
    @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
    @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
    init_densities
    @content |= other.is_content?

    @num_full_text_words += other.num_full_text_words

    # Set#merge adds the elements without touching +other.labels+, so no
    # defensive copy is needed; @labels is always a Set after initialize.
    @labels.merge(other.labels) if other.labels

    @tag_level = [@tag_level, other.tag_level].min
  end

  # Debug representation: offsets, tag level, counters, link density, the
  # CONTENT/BOILERPLATE flag, the labels (or 'null' when there are none) and
  # finally the text on its own line.
  def to_s
    #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
    labels = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"
    "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
  end

  # Copying a block is not supported yet.
  #
  # Raises NotImplementedError. (`throw` is the catch/throw control-flow
  # primitive and would surface as an UncaughtThrowError instead.)
  def clone
    raise NotImplementedError, 'TextBlock#clone is not implemented'
  end

  private

  # Recomputes text_density (words per wrapped line) and link_density (share
  # of words that are anchor text). A zero wrapped-line word count is treated
  # as "one line holding all words" before dividing.
  def init_densities
    if @num_words_in_wrapped_lines == 0
      @num_words_in_wrapped_lines = @num_words
      @num_wrapped_lines = 1
    end
    @text_density = @num_words_in_wrapped_lines / @num_wrapped_lines.to_f
    @link_density = @num_words == 0 ? 0.0 : @num_words_in_anchor_text / @num_words.to_f
  end
end
|
112
|
+
end
|
113
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Boilerpipe
|
2
|
+
module Document
|
3
|
+
# A titled, ordered collection of text blocks.
class TextDocument
  attr_reader :text_blocks
  attr_accessor :title

  # title       - the document title
  # text_blocks - the blocks making up the document, in order
  def initialize(title, text_blocks)
    @text_blocks = text_blocks
    @title = title
  end

  # Returns the text of the content blocks only.
  def content
    text(true, false)
  end

  # Concatenates the text of the selected blocks, each followed by a newline.
  #
  # include_content    - include blocks whose is_content? is truthy
  # include_noncontent - include all other blocks
  #
  # Returns a new String ("" when nothing is selected).
  def text(include_content, include_noncontent)
    @text_blocks.each_with_object(''.dup) do |text_block, out|
      wanted = text_block.is_content? ? include_content : include_noncontent
      next unless wanted

      # Double quotes matter: '\n' in single quotes is a literal backslash
      # followed by "n", not a line break.
      out << text_block.text << "\n"
    end
  end

  # Swaps in a new list of blocks.
  def replace_text_blocks!(new_blocks)
    @text_blocks = new_blocks
  end

  # Returns every block's to_s, joined by newlines.
  def debug_s
    @text_blocks.map(&:to_s).join("\n")
  end
  alias_method :debug_string, :debug_s

end
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
# Error type for boilerpipe processing failures (going by its name; nothing
# visible here raises it). It derives from StandardError, so a bare `rescue`
# catches it.
class BoilerPipeProcessingError < StandardError
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Boilerpipe::Extractors
|
2
|
+
# Runs a fixed sequence of filters over a parsed document and returns the
# text of whatever is still flagged as content at the end.
class ArticleExtractor
  class << self
    # Parses +contents+ with BoilerpipeHTMLParser and returns the extracted
    # article text. (Presumably +contents+ is an HTML string - confirm
    # against the parser.)
    def text(contents)
      parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
      ::Boilerpipe::Extractors::ArticleExtractor.process(parsed)
    end

    # Applies each filter to +doc+ in order and returns doc.content.
    # Every step receives the same document object; the order is significant.
    def process(doc)
      headline = doc.title
      stage = ::Boilerpipe::Filters

      # flag blocks that signal the end of the text (:INDICATES_END_OF_TEXT)
      stage::TerminatingBlocksFinder.process(doc)

      # flag blocks that match the document title
      stage::DocumentTitleMatchClassifier.new(headline).process(doc)

      # classify blocks as content / non-content with the boilerpipe algorithm
      stage::NumWordsRulesClassifier.process(doc)

      # everything after INDICATES_END_OF_TEXT becomes non-content
      stage::IgnoreBlocksAfterContentFilter.process(doc)

      # HEADING blocks that trail existing content become non-content
      stage::TrailingHeadlineToBoilerplateFilter.process(doc)

      # fuse neighbouring blocks
      stage::BlockProximityFusion::MAX_DISTANCE_1.process(doc)

      # drop the non-content blocks
      stage::BoilerplateBlockFilter::INSTANCE_KEEP_TITLE.process(doc)

      # fuse neighbouring blocks again, this time only at equal tag level
      stage::BlockProximityFusion::MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc)

      # keep only the largest block as content
      stage::KeepLargestBlockFilter::INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc)

      # mark blocks between the headline and the existing content as content
      stage::ExpandTitleToContentFilter.process(doc)

      # text-heavy blocks at the tag level of the largest content block
      # become content as well
      stage::LargeBlockSameTagLevelToContentFilter.process(doc)

      # nested list-item blocks after the main content become content
      stage::ListAtEndFilter.process(doc)

      doc.content
    end
  end
end
|
52
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
|
2
|
+
# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
|
3
|
+
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
4
|
+
|
5
|
+
module Boilerpipe::Filters
|
6
|
+
# Merges a content block into the block before it when the two are close
# enough (measured in block offsets) and pass the configured restrictions.
class BlockProximityFusion

  # max_blocks_distance - largest allowed gap, in blocks, between the end of
  #                       the previous block and the start of the current one
  # content_only        - only merge when the previous block is content too
  # same_tag_level_only - only merge blocks that share a tag level
  def initialize(max_blocks_distance, content_only, same_tag_level_only)
    @max_blocks_distance = max_blocks_distance
    @content_only = content_only
    @same_tag_level_only = same_tag_level_only
  end

  MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
  MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
  MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
  MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)

  # Walks the document's blocks once, merging where allowed, and stores the
  # surviving blocks back on the document.
  #
  # Returns false when there are fewer than two blocks, or when content_only
  # is set and no block is content; otherwise returns +doc+.
  def process(doc)
    text_blocks = doc.text_blocks
    return false if text_blocks.size < 2

    # In content-only mode the scan starts at the first content block.
    start_index = @content_only ? text_blocks.index { |tb| tb.is_content? } : 0
    return false if start_index.nil?

    prev_block = text_blocks[start_index]
    kept = text_blocks[0..start_index]

    text_blocks[(start_index + 1)..-1].each do |tb|
      if tb.is_content? && mergeable?(prev_block, tb)
        # tb is absorbed into prev_block and therefore not kept.
        prev_block.merge_next(tb)
      else
        # Any block that is not merged becomes the new reference block. This
        # includes the "gap too large" case; without it later gaps would be
        # measured from a stale block that is no longer adjacent.
        kept << tb
        prev_block = tb
      end
    end

    doc.replace_text_blocks!(kept)
    doc
  end

  private

  # True when content block +tb+ may be merged into +prev_block+.
  def mergeable?(prev_block, tb)
    gap = tb.offset_blocks_start - prev_block.offset_blocks_end - 1
    return false if gap > @max_blocks_distance
    return false if @content_only && (prev_block.is_not_content? || tb.is_not_content?)
    return false if @same_tag_level_only && prev_block.tag_level != tb.tag_level

    true
  end

end
|
63
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
# Removes TextBlocks which have explicitly been marked as "not content".
|
3
|
+
|
4
|
+
module Boilerpipe::Filters
|
5
|
+
# Drops non-content blocks from a document, optionally sparing those that
# carry one particular label.
class BoilerplateBlockFilter

  # label - non-content blocks carrying this label are kept; pass nil to
  #         remove every non-content block
  def initialize(label)
    @label_to_keep = label
  end

  INSTANCE_KEEP_TITLE = BoilerplateBlockFilter.new(:TITLE)

  # Replaces the document's blocks with the ones that survive the filter and
  # returns +doc+. The survivors go into a new array; the array the document
  # held before is left unmodified.
  def process(doc)
    survivors = doc.text_blocks.reject { |tb| boilerplate?(tb) }
    doc.replace_text_blocks!(survivors)
    doc
  end

  private

  # True when +tb+ should be removed: it is not content and is not protected
  # by the configured label. The check uses the configured label rather than
  # a fixed :TITLE, so filters built with another label behave as requested.
  def boilerplate?(tb)
    tb.is_not_content? && (@label_to_keep.nil? || !tb.has_label?(@label_to_keep))
  end

end
|
26
|
+
end
|