boilerpipe-ruby 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +27 -6
- data/Rakefile +8 -0
- data/boilerpipe-ruby.gemspec +10 -9
- data/lib/boilerpipe.rb +30 -0
- data/lib/boilerpipe/document/text_block.rb +113 -0
- data/lib/boilerpipe/document/text_document.rb +44 -0
- data/lib/boilerpipe/errors.rb +1 -0
- data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
- data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
- data/lib/boilerpipe/labels/default.rb +17 -0
- data/lib/boilerpipe/labels/label_action.rb +17 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
- data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
- data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
- data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
- data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
- data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
- data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
- data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
- data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
- data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- data/stuff.txt +4 -0
- metadata +61 -15
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5aed62a42276e97a3e40126609fc51bf624b091f
|
|
4
|
+
data.tar.gz: 357afb75e661083011013b0ebdca88103e4a188e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a64ef6c16a1f1aa5dc44f60c0c94f0ee8cd02876a549e6433b19db617207e64c258534f6b708bece4c0749c7fee50ec15f114c64cb8ee993b154ede4764c2e2b
|
|
7
|
+
data.tar.gz: d9d52e256767a6553a7d19fe6ad8711610cc3fa7beea5bef7fdc86c28d25a9f5ab00e3486aff7942251376907322f11f643c879d96a136b3e9a969fca8c23a25
|
data/.gitignore
CHANGED
data/README.md
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
|
-
# Boilerpipe
|
|
1
|
+
# Boilerpipe
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
A pure Ruby implementation of the boilerpipe algorithm.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
|
|
6
|
+
|
|
7
|
+
I went directly to the original author's github https://github.com/kohlschutter/boilerpipe and forked that code base here https://github.com/gregors/boilerpipe.
|
|
8
|
+
|
|
9
|
+
I saw other gems making use of boilerpipe via the [free API](http://boilerpipe-web.appspot.com), but depending on the time of day the API goes down due to exceeding the hosting plan. I also checked out some gems making use of JRuby, but I had all kinds of dependency and bug issues. So I made some tweaks on my fork and created a new [jruby-boilerpipe gem](https://rubygems.org/gems/jruby-boilerpipe).
|
|
10
|
+
|
|
11
|
+
This solution works great if you're using JRuby, but I wanted a pure Ruby solution to use on MRI. Open vim - start coding...
|
|
12
|
+
|
|
13
|
+
I've only got the ArticleExtractor working, but the others should follow quickly, as the ArticleExtractor definitely has the most code behind it...
|
|
14
|
+
|
|
15
|
+
Presently the following extractors are implemented:
|
|
16
|
+
* [x] ArticleExtractor
|
|
17
|
+
* [ ] DefaultExtractor
|
|
18
|
+
* [ ] LargestContentExtractor
|
|
19
|
+
* [ ] KeepEverythingExtractor
|
|
6
20
|
|
|
7
21
|
## Installation
|
|
8
22
|
|
|
9
23
|
Add this line to your application's Gemfile:
|
|
10
24
|
|
|
11
25
|
```ruby
|
|
12
|
-
gem 'boilerpipe-ruby'
|
|
26
|
+
gem 'boilerpipe-ruby', require: 'boilerpipe'
|
|
13
27
|
```
|
|
14
28
|
|
|
15
29
|
And then execute:
|
|
@@ -22,7 +36,14 @@ Or install it yourself as:
|
|
|
22
36
|
|
|
23
37
|
## Usage
|
|
24
38
|
|
|
25
|
-
|
|
39
|
+
gregors$ irb
|
|
40
|
+
> require 'boilerpipe'
|
|
41
|
+
=> true
|
|
42
|
+
> require 'open-uri'
|
|
43
|
+
=> true
|
|
44
|
+
> content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
|
|
45
|
+
> output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
|
|
46
|
+
=> "Always Squash and Rebase your Git Commits"
|
|
26
47
|
|
|
27
48
|
## Development
|
|
28
49
|
|
|
@@ -32,5 +53,5 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
|
32
53
|
|
|
33
54
|
## Contributing
|
|
34
55
|
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
|
56
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/gregors/boilerpipe-ruby.
|
|
36
57
|
|
data/Rakefile
CHANGED
|
@@ -4,3 +4,11 @@ require "rspec/core/rake_task"
|
|
|
4
4
|
RSpec::Core::RakeTask.new(:spec)
|
|
5
5
|
|
|
6
6
|
task :default => :spec
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
task :download_boilerpipe_jar do
  jar_dir = 'spec/sanity_checks/jars/'
  jar_url = 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'

  FileUtils.mkdir_p jar_dir

  # Block form of chdir: the working directory is restored when the block
  # exits, so tasks that run after this one still start from the project root.
  Dir.chdir(jar_dir) do
    # `sh` (Rake's FileUtils extension) raises when wget exits non-zero, so a
    # failed download aborts the task instead of passing silently.
    sh 'wget', jar_url
  end
end
|
data/boilerpipe-ruby.gemspec
CHANGED
|
@@ -4,21 +4,22 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
|
4
4
|
require 'boilerpipe/version'
|
|
5
5
|
|
|
6
6
|
Gem::Specification.new do |spec|
|
|
7
|
-
spec.name =
|
|
7
|
+
spec.name = 'boilerpipe-ruby'
|
|
8
8
|
spec.version = Boilerpipe::VERSION
|
|
9
|
-
spec.authors = [
|
|
10
|
-
spec.email = [
|
|
9
|
+
spec.authors = ['Gregory Ostermayr']
|
|
10
|
+
spec.email = ['<gregory.ostermayr@gmail.com>']
|
|
11
11
|
|
|
12
|
-
spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm
|
|
13
|
-
spec.description = %q{A pure ruby implementation of the boilerpipe
|
|
14
|
-
spec.homepage =
|
|
12
|
+
spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
|
|
13
|
+
spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
|
|
14
|
+
spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
|
|
15
15
|
|
|
16
16
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
|
17
17
|
spec.bindir = "exe"
|
|
18
18
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
19
19
|
spec.require_paths = ["lib"]
|
|
20
20
|
|
|
21
|
-
spec.add_development_dependency
|
|
22
|
-
spec.add_development_dependency
|
|
23
|
-
spec.add_development_dependency
|
|
21
|
+
spec.add_development_dependency 'bundler', '~> 1.11'
|
|
22
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
|
23
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
|
24
|
+
spec.add_runtime_dependency 'nokogiri', '1.6.6.2'
|
|
24
25
|
end
|
data/lib/boilerpipe.rb
CHANGED
|
@@ -1,2 +1,32 @@
|
|
|
1
1
|
require 'boilerpipe/version'
|
|
2
2
|
require 'boilerpipe/util/unicode_tokenizer'
|
|
3
|
+
require 'boilerpipe/document/text_document'
|
|
4
|
+
require 'boilerpipe/document/text_block'
|
|
5
|
+
require 'boilerpipe/extractors/article_extractor'
|
|
6
|
+
require 'boilerpipe/filters/block_proximity_fusion'
|
|
7
|
+
require 'boilerpipe/filters/boilerplate_block_filter'
|
|
8
|
+
require 'boilerpipe/filters/document_title_match_classifier'
|
|
9
|
+
require 'boilerpipe/filters/expand_title_to_content_filter'
|
|
10
|
+
require 'boilerpipe/filters/heuristic_filter_base'
|
|
11
|
+
require 'boilerpipe/filters/ignore_blocks_after_content_filter'
|
|
12
|
+
require 'boilerpipe/filters/keep_largest_block_filter'
|
|
13
|
+
require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
|
|
14
|
+
require 'boilerpipe/filters/list_at_end_filter'
|
|
15
|
+
require 'boilerpipe/filters/num_words_rules_classifier'
|
|
16
|
+
require 'boilerpipe/filters/terminating_blocks_finder'
|
|
17
|
+
require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
|
18
|
+
require 'boilerpipe/labels/default'
|
|
19
|
+
require 'boilerpipe/labels/label_action'
|
|
20
|
+
require 'boilerpipe/sax/html_content_handler'
|
|
21
|
+
require 'boilerpipe/sax/boilerpipe_html_parser'
|
|
22
|
+
require 'boilerpipe/sax/tag_action_map'
|
|
23
|
+
require 'boilerpipe/sax/tag_actions/chained'
|
|
24
|
+
require 'boilerpipe/sax/tag_actions/ignorable_element'
|
|
25
|
+
require 'boilerpipe/sax/tag_actions/anchor_text'
|
|
26
|
+
require 'boilerpipe/sax/tag_actions/body'
|
|
27
|
+
require 'boilerpipe/sax/tag_actions/inline_whitespace'
|
|
28
|
+
require 'boilerpipe/sax/tag_actions/inline_no_whitespace'
|
|
29
|
+
require 'boilerpipe/sax/tag_actions/block_level'
|
|
30
|
+
require 'boilerpipe/sax/tag_actions/font'
|
|
31
|
+
require 'boilerpipe/sax/tag_actions/inline_tag_label'
|
|
32
|
+
require 'boilerpipe/sax/tag_actions/block_tag_label'
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
require 'set'
|
|
2
|
+
|
|
3
|
+
module Boilerpipe
  module Document
    # A block of text plus the counters kept for it: word counts, wrapped-line
    # counts, anchor-text word count, block offsets, tag level, a set of labels
    # and a content flag. Text and link densities are derived from the counters.
    class TextBlock
      # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)

      attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
                  :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
                  :link_density, :labels, :tag_level, :num_full_text_words

      attr_accessor :content

      # @param text [String] the block's text
      # @param num_words [Integer] number of words in the block
      # @param num_words_in_anchor_text [Integer] words that are inside anchor text
      # @param num_words_in_wrapped_lines [Integer] words inside wrapped lines
      # @param num_wrapped_lines [Integer] number of wrapped lines
      # @param offset_blocks [Integer] used as both the start and the end offset
      def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 0, offset_blocks = 0)
        @labels = Set.new
        @text = text
        @num_words = num_words
        @num_words_in_anchor_text = num_words_in_anchor_text
        @num_words_in_wrapped_lines = num_words_in_wrapped_lines
        @num_wrapped_lines = num_wrapped_lines
        @num_full_text_words = 0
        @offset_blocks_start = offset_blocks
        @offset_blocks_end = offset_blocks
        @content = false
        @tag_level = 0

        init_densities
      end

      # Builds an empty block positioned at offset -1.
      def self.empty_start
        new('', 0, 0, 0, 0, -1)
      end

      def set_tag_level(level)
        @tag_level = level
      end

      def is_content?
        @content
      end

      def is_not_content?
        !is_content?
      end

      def add_label(label)
        @labels << label
      end

      def add_labels(labels)
        labels.each { |label| add_label(label) }
      end

      def has_label?(label)
        @labels.include?(label)
      end

      def remove_label(label)
        @labels.delete(label)
      end

      # Folds +other+ into this block: texts are joined with a newline, the
      # counters are summed, the offset range is widened to cover both blocks,
      # densities are recomputed, the content flag is OR-ed, labels are unioned
      # and the smaller tag level wins.
      #
      # @param other [TextBlock] the block to absorb; it is not modified
      def merge_next(other)
        @text = "#{@text}\n#{other.text}"
        @num_words += other.num_words
        @num_words_in_anchor_text += other.num_words_in_anchor_text
        @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
        @num_wrapped_lines += other.num_wrapped_lines
        @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
        @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
        init_densities
        @content |= other.is_content?

        @num_full_text_words += other.num_full_text_words

        # @labels is always a Set (created in #initialize), so a plain union is
        # enough; Set#merge copies the elements and leaves other.labels untouched.
        @labels.merge(other.labels) if other.labels

        @tag_level = [@tag_level, other.tag_level].min
      end

      # One-line summary of offsets, tag level, counters and link density,
      # followed by CONTENT/BOILERPLATE, the labels ("null" when there are
      # none) and, on the next line, the block text.
      def to_s
        label_list = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"
        "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{label_list}\n#{text}"
      end

      # Copying a block is not supported.
      #
      # @raise [NotImplementedError] always
      def clone
        # `raise`, not `throw`: throw without a matching catch surfaces as an
        # UncaughtThrowError instead of the intended NotImplementedError.
        raise NotImplementedError, 'TextBlock#clone is not implemented'
      end

      private

      # Recomputes text_density (words in wrapped lines per wrapped line) and
      # link_density (anchor-text words per word, 0.0 for a block with no words).
      def init_densities
        # No wrapped-line words recorded: count the whole block as one line.
        if @num_words_in_wrapped_lines == 0
          @num_words_in_wrapped_lines = @num_words
          @num_wrapped_lines = 1
        end
        @text_density = @num_words_in_wrapped_lines / @num_wrapped_lines.to_f
        @link_density = @num_words == 0 ? 0.0 : @num_words_in_anchor_text / @num_words.to_f
      end
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
module Boilerpipe
  module Document
    # A titled, ordered list of text blocks.
    class TextDocument
      attr_reader :text_blocks
      attr_accessor :title

      # @param title [Object] the document title (stored as given)
      # @param text_blocks [Array] blocks responding to #is_content?, #text and #to_s
      def initialize(title, text_blocks)
        @text_blocks = text_blocks
        @title = title
      end

      # Text of the content blocks only.
      #
      # @return [String]
      def content
        text(true, false)
      end

      # Concatenates the text of the selected blocks, each followed by a
      # newline character.
      #
      # @param include_content [Boolean] include blocks whose #is_content? is truthy
      # @param include_noncontent [Boolean] include blocks whose #is_content? is falsy
      # @return [String]
      def text(include_content, include_noncontent)
        @text_blocks.each_with_object(''.dup) do |text_block, out|
          wanted = text_block.is_content? ? include_content : include_noncontent
          next unless wanted

          # Double-quoted so this is a real newline; '\n' in single quotes is
          # the two characters backslash and n.
          out << text_block.text << "\n"
        end
      end

      # Swaps in a new list of blocks.
      def replace_text_blocks!(new_blocks)
        @text_blocks = new_blocks
      end

      # The #to_s of every block, joined by newlines.
      def debug_s
        @text_blocks.map(&:to_s).join("\n")
      end
      alias_method :debug_string, :debug_s
    end
  end
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Custom error class derived from StandardError.
# NOTE(review): presumably raised when boilerpipe processing fails; no raise
# site is visible in this file, so confirm against the callers.
class BoilerPipeProcessingError < StandardError
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module Boilerpipe::Extractors
  # Runs a parsed document through a fixed sequence of boilerpipe filters and
  # returns the text that is still marked as content afterwards.
  class ArticleExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser and hands the resulting
      # document to .process.
      def text(contents)
        parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::ArticleExtractor.process(parsed)
      end

      # Applies the filter pipeline to +doc+ in order (each step works on the
      # state left by the previous one) and returns doc.content.
      def process(doc)
        title = doc.title
        f = ::Boilerpipe::Filters

        # Label blocks that signal the end of the text (:INDICATES_END_OF_TEXT).
        f::TerminatingBlocksFinder.process(doc)

        # Label blocks that match the document title.
        f::DocumentTitleMatchClassifier.new(title).process(doc)

        # Classify blocks as content / non-content with the boilerpipe algorithm.
        f::NumWordsRulesClassifier.process(doc)

        # Everything after an INDICATES_END_OF_TEXT block becomes non-content.
        f::IgnoreBlocksAfterContentFilter.process(doc)

        # HEADING blocks that trail the existing content become non-content.
        f::TrailingHeadlineToBoilerplateFilter.process(doc)

        # Fuse neighbouring text blocks.
        f::BlockProximityFusion::MAX_DISTANCE_1.process(doc)

        # Drop the non-content blocks.
        f::BoilerplateBlockFilter::INSTANCE_KEEP_TITLE.process(doc)

        # Fuse neighbouring blocks that share the same tag level.
        f::BlockProximityFusion::MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc)

        # Keep only the largest text block as content.
        f::KeepLargestBlockFilter::INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc)

        # Blocks between the headline and the part that is already content
        # are marked as content.
        f::ExpandTitleToContentFilter.process(doc)

        # Text-heavy blocks at the same tag level as the largest current
        # content block are marked as additional content.
        f::LargeBlockSameTagLevelToContentFilter.process(doc)

        # Nested list-item blocks after the end of the main content are
        # marked as content.
        f::ListAtEndFilter.process(doc)

        doc.content
      end
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
|
|
2
|
+
# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
|
|
3
|
+
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
|
4
|
+
|
|
5
|
+
module Boilerpipe
  module Filters
    # Merges a content block into the block before it when the two are at most
    # a configured number of blocks apart, optionally requiring that both are
    # content and/or share the same tag level.
    class BlockProximityFusion
      # @param max_blocks_distance [Integer] largest allowed gap (in blocks)
      # @param content_only [Boolean] only fuse when both blocks are content
      # @param same_tag_level_only [Boolean] only fuse blocks with equal tag level
      def initialize(max_blocks_distance, content_only, same_tag_level_only)
        @max_blocks_distance = max_blocks_distance
        @content_only = content_only
        @same_tag_level_only = same_tag_level_only
      end

      MAX_DISTANCE_1 = new(1, false, false)
      MAX_DISTANCE_1_SAME_TAGLEVEL = new(1, false, true)
      MAX_DISTANCE_1_CONTENT_ONLY = new(1, true, false)
      MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = new(1, true, true)

      # Fuses blocks of +doc+ and replaces its block list with the survivors.
      #
      # @param doc [#text_blocks, #replace_text_blocks!] the document to process
      # @return [false, Object] false when there is nothing to do (fewer than two
      #   blocks, or content-only mode without any content block), otherwise +doc+
      def process(doc)
        text_blocks = doc.text_blocks
        return false if text_blocks.size < 2

        # In content-only mode, scanning starts at the first content block.
        start_index = @content_only ? text_blocks.index(&:is_content?) : 0
        return false if start_index.nil?

        prev_block = text_blocks[start_index]
        fused = []

        text_blocks.drop(start_index + 1).each do |tb|
          if tb.is_content? && fusable?(prev_block, tb)
            prev_block.merge_next(tb)
            fused << tb
          else
            # Always move on to the current block when no merge happened,
            # including when the gap was too large; otherwise every later block
            # would be measured against a stale predecessor and never fuse.
            prev_block = tb
          end
        end

        doc.replace_text_blocks!(text_blocks - fused)
        doc
      end

      private

      # True when +tb+ may be merged into +prev_block+ under this filter's
      # distance, content-only and tag-level settings.
      def fusable?(prev_block, tb)
        gap = tb.offset_blocks_start - prev_block.offset_blocks_end - 1
        return false if gap > @max_blocks_distance
        return false if @content_only && (prev_block.is_not_content? || tb.is_not_content?)
        return false if @same_tag_level_only && prev_block.tag_level != tb.tag_level

        true
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
|
|
2
|
+
# Removes TextBlocks which have explicitly been marked as "not content".
|
|
3
|
+
|
|
4
|
+
module Boilerpipe
  module Filters
    # Removes the blocks that are not content, optionally sparing non-content
    # blocks that carry a given label.
    class BoilerplateBlockFilter
      # @param label [Object, nil] label whose non-content blocks are kept;
      #   nil means every non-content block is removed
      def initialize(label)
        @label_to_keep = label
      end

      INSTANCE_KEEP_TITLE = new(:TITLE)

      # Prunes the non-content blocks from +doc+.
      #
      # @param doc [#text_blocks, #replace_text_blocks!] the document to filter
      # @return [Object] +doc+
      def process(doc)
        # delete_if prunes doc.text_blocks in place and returns that same array.
        remaining = doc.text_blocks.delete_if { |tb| tb.is_not_content? && !spared?(tb) }
        doc.replace_text_blocks!(remaining)
        doc
      end

      private

      # True when +tb+ carries the configured label. The configured label is
      # used here rather than a hard-coded :TITLE, so instances built with
      # another label behave as requested.
      def spared?(tb)
        !@label_to_keep.nil? && tb.has_label?(@label_to_keep)
      end
    end
  end
end
|