boilerpipe-ruby 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +28 -1
- data/Dockerfile +14 -0
- data/README.md +13 -4
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +9 -9
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +4 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -5
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
- data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
- data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
- data/lib/boilerpipe/sax/preprocessor.rb +11 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +28 -25
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7fec2bd11d29c4b5d14f70e10fcac76beb95c61e25fbe5cac15b82e8c64fbf69
|
4
|
+
data.tar.gz: 766ea373235462c3678cc2487c647d6211fd2fc066626d5c10ab7e4d31f303ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be90614a1c2efa29356e9b3b255a5e5d4374474fd6b711d4ed9ab575c4ab8466a1d6903c23de46276133d1621727dea8525422e49608185c1e6294af4f6e0f54
|
7
|
+
data.tar.gz: 5b368e59ced5b794b8e2033b632de67a09bf43e3c45070e96a66ce695bddb5966130e37d9862179a9c989d5166645e182bd2973c179cb8f369c7c4942e238f30
|
data/.circleci/config.yml
CHANGED
@@ -6,48 +6,30 @@ version: 2
|
|
6
6
|
jobs:
|
7
7
|
build:
|
8
8
|
docker:
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
- image: circleci/ruby:2.5.5-node-browsers
|
10
|
+
|
12
11
|
# Specify service dependencies here if necessary
|
13
12
|
# CircleCI maintains a library of pre-built images
|
14
13
|
# documented at https://circleci.com/docs/2.0/circleci-images/
|
15
|
-
# - image: circleci/postgres:9.4
|
16
14
|
|
17
15
|
working_directory: ~/repo
|
18
16
|
|
19
17
|
steps:
|
20
18
|
- checkout
|
21
19
|
|
22
|
-
|
23
|
-
- restore_cache:
|
24
|
-
keys:
|
25
|
-
- v1-dependencies-{{ checksum "Gemfile.lock" }}
|
26
|
-
# fallback to using the latest cache if no exact match is found
|
27
|
-
- v1-dependencies-
|
28
|
-
|
20
|
+
- run: gem install bundler
|
29
21
|
- run:
|
30
22
|
name: install dependencies
|
31
23
|
command: |
|
32
|
-
bundle install --jobs=4 --retry=3
|
24
|
+
bundle install --jobs=4 --retry=3
|
33
25
|
|
34
|
-
- save_cache:
|
35
|
-
paths:
|
36
|
-
- ./vendor/bundle
|
37
|
-
key: v1-dependencies-{{ checksum "Gemfile.lock" }}
|
38
|
-
|
39
|
-
# Database setup
|
40
|
-
#- run: bundle exec rake db:create
|
41
|
-
#- run: bundle exec rake db:schema:load
|
42
|
-
|
43
|
-
# run tests!
|
44
26
|
- run:
|
45
27
|
name: run tests
|
46
28
|
command: |
|
47
29
|
mkdir /tmp/test-results
|
48
30
|
TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
|
49
|
-
|
50
|
-
rspec --format progress "spec"
|
31
|
+
|
32
|
+
bundle exec rspec --format progress "spec"
|
51
33
|
|
52
34
|
# collect reports
|
53
35
|
- store_test_results:
|
data/.dockerignore
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,30 @@
|
|
1
|
+
# 0.5.0 / 2021-02-15
|
2
|
+
* internal refactoring for clarity
|
3
|
+
|
4
|
+
# 0.4.4 / 2021-02-13
|
5
|
+
* Do a better job of stripping out script tags
|
6
|
+
|
7
|
+
# 0.4.3 / 2020-07-18
|
8
|
+
|
9
|
+
* update deps
|
10
|
+
|
11
|
+
# 0.4.2 / 2020-03-11
|
12
|
+
|
13
|
+
* update deps
|
14
|
+
|
15
|
+
# 0.4.1 / 2019-07-04
|
16
|
+
|
17
|
+
* Fix bug in min_clause_words_filter (used in article_sentence_extractor)
|
18
|
+
* Allow tests to run in Docker
|
19
|
+
* Update circle to continue to work
|
20
|
+
* Add architecture flow
|
21
|
+
* Code formatting
|
22
|
+
* Add min words filter specs
|
23
|
+
* Add label action specs
|
24
|
+
* Add missing test case to ignorable element spec
|
25
|
+
* Add merge_next case to text block spec
|
26
|
+
* Dry up includes
|
27
|
+
|
1
28
|
# 0.4.0 / 2017-09-15
|
2
29
|
|
3
30
|
* Add KeepEverythingWithMinKWords Extractor
|
@@ -22,4 +49,4 @@
|
|
22
49
|
|
23
50
|
# 0.1.0 / 2017-09-08
|
24
51
|
|
25
|
-
* Add Article Extractor
|
52
|
+
* Add Article Extractor
|
data/Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
From ruby:2.5
|
2
|
+
RUN gem install bundler
|
3
|
+
COPY *gemspec /usr/src/app/
|
4
|
+
COPY Gemfile /usr/src/app/
|
5
|
+
COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
|
6
|
+
COPY bin /usr/src/app/
|
7
|
+
COPY bin/* /usr/src/app/bin/
|
8
|
+
|
9
|
+
WORKDIR /usr/src/app
|
10
|
+
RUN bin/setup
|
11
|
+
|
12
|
+
COPY . /usr/src/app/
|
13
|
+
|
14
|
+
CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]
|
data/README.md
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
# Boilerpipe
|
2
2
|
|
3
|
+
[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main)
|
4
|
+
[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
|
5
|
+
|
3
6
|
A pure Ruby implementation of the boilerpipe algorithm.
|
4
7
|
|
5
8
|
This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
|
@@ -10,6 +13,8 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
|
|
10
13
|
|
11
14
|
This solution works great if you're using JRuby, but I wanted a pure Ruby solution to use on MRI. Open vim - start coding...
|
12
15
|
|
16
|
+
Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
|
17
|
+
|
13
18
|
# TLDR
|
14
19
|
|
15
20
|
Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
|
@@ -24,9 +29,6 @@ Presently the following Extractors are implemented
|
|
24
29
|
* [x] LargestContentExtractor
|
25
30
|
* [x] NumWordsRulesExtractor
|
26
31
|
|
27
|
-
[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
|
28
|
-
|
29
|
-
[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
|
30
32
|
|
31
33
|
## Installation
|
32
34
|
|
@@ -69,7 +71,14 @@ Or install it yourself as:
|
|
69
71
|
|
70
72
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
71
73
|
|
72
|
-
To install this gem onto your local machine, run `bundle exec rake install`.
|
74
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
75
|
+
|
76
|
+
### Running Tests on Docker
|
77
|
+
|
78
|
+
The image's default command runs the test suite:
|
79
|
+
|
80
|
+
docker build -t boilerpipe .
|
81
|
+
docker run -it --rm boilerpipe
|
73
82
|
|
74
83
|
## Contributing
|
75
84
|
|
data/Rakefile
CHANGED
@@ -1,14 +1,13 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rspec/core/rake_task'
|
3
3
|
|
4
4
|
RSpec::Core::RakeTask.new(:spec)
|
5
5
|
|
6
6
|
task :default => :spec
|
7
7
|
|
8
|
-
|
9
8
|
desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
|
10
9
|
task :download_boilerpipe_jar do
|
11
10
|
FileUtils.mkdir_p 'spec/sanity_checks/jars/'
|
12
11
|
Dir.chdir 'spec/sanity_checks/jars/'
|
13
|
-
|
12
|
+
`wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
|
14
13
|
end
|
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'boilerpipe'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "boilerpipe"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start
|
data/boilerpipe-ruby.gemspec
CHANGED
@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['<gregory.ostermayr@gmail.com>']
|
11
11
|
spec.license = 'Apache 2.0'
|
12
12
|
|
13
|
-
spec.summary = %q{A pure ruby
|
14
|
-
spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
|
13
|
+
spec.summary = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
|
14
|
+
spec.description = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
|
15
15
|
spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
|
16
16
|
|
17
17
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
18
|
+
spec.bindir = 'exe'
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
21
21
|
|
22
|
-
spec.add_development_dependency 'bundler', '~>
|
23
|
-
spec.add_development_dependency 'rake', '
|
24
|
-
spec.add_development_dependency '
|
25
|
-
spec.add_development_dependency '
|
26
|
-
spec.add_runtime_dependency 'nokogiri', '
|
22
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
|
+
spec.add_development_dependency 'rake', '>= 12.3.3'
|
24
|
+
spec.add_development_dependency 'rickshaw', '~> 0.5.0'
|
25
|
+
spec.add_development_dependency 'rspec', '~> 3.10'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '~> 1.10'
|
27
27
|
end
|
data/boilerpipe_flow.md
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
```
|
2
|
+
raw html
|
3
|
+
|
|
4
|
+
|
|
5
|
+
sax input -> sax parser(html parser) -> HTML Content handler -> tokenizer ---------
|
6
|
+
|
|
7
|
+
-------------------------------------<------------------------------------<------|
|
8
|
+
| | |
|
9
|
+
text blocks text blocks text blocks
|
10
|
+
| | |
|
11
|
+
| | |
|
12
|
+
-----------------------------
|
13
|
+
|
|
14
|
+
|
|
15
|
+
text document
|
16
|
+
|
|
17
|
+
|
|
18
|
+
filter
|
19
|
+
|
|
20
|
+
filter
|
21
|
+
|
|
22
|
+
filter
|
23
|
+
|
|
24
|
+
filter
|
25
|
+
|
|
26
|
+
filter
|
27
|
+
|
|
28
|
+
filter
|
29
|
+
|
|
30
|
+
filter
|
31
|
+
|
|
32
|
+
filter
|
33
|
+
|
|
34
|
+
filter
|
35
|
+
|
|
36
|
+
|
|
37
|
+
text document
|
38
|
+
|
|
39
|
+
outputs extracted text
|
40
|
+
```
|
data/lib/boilerpipe.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'set'
|
3
|
+
|
1
4
|
require 'boilerpipe/version'
|
2
5
|
|
3
6
|
require 'boilerpipe/util/unicode_tokenizer'
|
@@ -37,6 +40,7 @@ require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
|
37
40
|
require 'boilerpipe/labels/default'
|
38
41
|
require 'boilerpipe/labels/label_action'
|
39
42
|
|
43
|
+
require 'boilerpipe/sax/preprocessor'
|
40
44
|
require 'boilerpipe/sax/html_content_handler'
|
41
45
|
require 'boilerpipe/sax/boilerpipe_html_parser'
|
42
46
|
require 'boilerpipe/sax/tag_action_map'
|
@@ -1,10 +1,7 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module Boilerpipe
|
4
2
|
module Document
|
5
3
|
class TextBlock
|
6
|
-
|
7
|
-
#EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
|
4
|
+
# EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
|
8
5
|
|
9
6
|
attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
|
10
7
|
:num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
|
@@ -12,7 +9,7 @@ module Boilerpipe
|
|
12
9
|
|
13
10
|
attr_accessor :content
|
14
11
|
|
15
|
-
def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
|
12
|
+
def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
|
16
13
|
@labels = Set.new
|
17
14
|
@text = text
|
18
15
|
@num_words = num_words
|
@@ -32,9 +29,9 @@ module Boilerpipe
|
|
32
29
|
new('', 0, 0, 0, 0, -1)
|
33
30
|
end
|
34
31
|
|
35
|
-
|
36
|
-
|
37
|
-
|
32
|
+
def set_tag_level(level)
|
33
|
+
@tag_level = level
|
34
|
+
end
|
38
35
|
|
39
36
|
def is_content?
|
40
37
|
@content
|
@@ -68,8 +65,8 @@ module Boilerpipe
|
|
68
65
|
@num_words_in_anchor_text += other.num_words_in_anchor_text
|
69
66
|
@num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
|
70
67
|
@num_wrapped_lines += other.num_wrapped_lines
|
71
|
-
@offset_blocks_start = [@offset_blocks_start
|
72
|
-
@offset_blocks_end = [@offset_blocks_end
|
68
|
+
@offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
|
69
|
+
@offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
|
73
70
|
init_densities
|
74
71
|
@content |= other.is_content?
|
75
72
|
|
@@ -87,10 +84,10 @@ module Boilerpipe
|
|
87
84
|
end
|
88
85
|
|
89
86
|
def to_s
|
90
|
-
#"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
|
87
|
+
# "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
|
91
88
|
labels = 'null'
|
92
89
|
if !@labels.empty?
|
93
|
-
labels ="[#{
|
90
|
+
labels = "[#{@labels.to_a.join(',')}]"
|
94
91
|
end
|
95
92
|
"[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
|
96
93
|
end
|
@@ -100,6 +97,7 @@ module Boilerpipe
|
|
100
97
|
end
|
101
98
|
|
102
99
|
private
|
100
|
+
|
103
101
|
def init_densities
|
104
102
|
if @num_words_in_wrapped_lines == 0
|
105
103
|
@num_words_in_wrapped_lines = @num_words
|
@@ -19,14 +19,14 @@ module Boilerpipe
|
|
19
19
|
case text_block.is_content?
|
20
20
|
when true
|
21
21
|
next unless include_content
|
22
|
-
s << text_block.text
|
23
|
-
s << "\n"
|
24
22
|
when false
|
25
23
|
next unless include_noncontent
|
26
|
-
s << text_block.text
|
27
|
-
s << "\n"
|
28
24
|
end
|
25
|
+
|
26
|
+
s << text_block.text
|
27
|
+
s << "\n"
|
29
28
|
end
|
29
|
+
|
30
30
|
s
|
31
31
|
end
|
32
32
|
|
@@ -38,7 +38,6 @@ module Boilerpipe
|
|
38
38
|
@text_blocks.map(&:to_s).join("\n")
|
39
39
|
end
|
40
40
|
alias_method :debug_string, :debug_s
|
41
|
-
|
42
41
|
end
|
43
42
|
end
|
44
43
|
end
|
@@ -1,11 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
1
|
+
# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
|
2
|
+
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
4
3
|
|
5
4
|
module Boilerpipe::Filters
|
6
5
|
class BlockProximityFusion
|
7
|
-
|
8
|
-
|
9
6
|
def initialize(max_blocks_distance, content_only, same_tag_level_only)
|
10
7
|
@max_blocks_distance = max_blocks_distance
|
11
8
|
@content_only = content_only
|
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
|
|
13
10
|
end
|
14
11
|
|
15
12
|
MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
|
16
|
-
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(
|
17
|
-
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(
|
13
|
+
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
|
14
|
+
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
|
18
15
|
MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
|
19
16
|
|
20
17
|
def process(doc)
|
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
|
|
22
19
|
return false if text_blocks.size < 2
|
23
20
|
|
24
21
|
prev_block = if @content_only
|
25
|
-
text_blocks.find{ |tb| tb.is_content? }
|
22
|
+
text_blocks.find { |tb| tb.is_content? }
|
26
23
|
else
|
27
24
|
text_blocks.first
|
28
25
|
end
|
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
|
|
46
43
|
ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
|
47
44
|
ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
|
48
45
|
|
49
|
-
if
|
46
|
+
if ok
|
50
47
|
prev_block.merge_next(tb)
|
51
48
|
blocks_to_remove << tb
|
52
49
|
else
|
53
50
|
prev_block = tb
|
54
51
|
end
|
55
52
|
end
|
56
|
-
|
57
53
|
end
|
58
|
-
doc.replace_text_blocks!(
|
54
|
+
doc.replace_text_blocks!(text_blocks - blocks_to_remove)
|
59
55
|
doc
|
60
56
|
end
|
61
|
-
|
62
57
|
end
|
63
58
|
end
|