boilerpipe-ruby 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +28 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +13 -4
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +4 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
  15. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  16. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
  17. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
  18. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
  19. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  20. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  21. data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
  22. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  23. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  24. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
  25. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  26. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  27. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  28. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  29. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  30. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
  31. data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
  32. data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
  33. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  34. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  35. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
  36. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  37. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  38. data/lib/boilerpipe/labels/label_action.rb +1 -1
  39. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
  40. data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
  41. data/lib/boilerpipe/sax/preprocessor.rb +11 -0
  42. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  43. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  44. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
  47. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  48. data/lib/boilerpipe/version.rb +1 -1
  49. metadata +28 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
4
- data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
2
+ SHA256:
3
+ metadata.gz: 7fec2bd11d29c4b5d14f70e10fcac76beb95c61e25fbe5cac15b82e8c64fbf69
4
+ data.tar.gz: 766ea373235462c3678cc2487c647d6211fd2fc066626d5c10ab7e4d31f303ad
5
5
  SHA512:
6
- metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
7
- data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
6
+ metadata.gz: be90614a1c2efa29356e9b3b255a5e5d4374474fd6b711d4ed9ab575c4ab8466a1d6903c23de46276133d1621727dea8525422e49608185c1e6294af4f6e0f54
7
+ data.tar.gz: 5b368e59ced5b794b8e2033b632de67a09bf43e3c45070e96a66ce695bddb5966130e37d9862179a9c989d5166645e182bd2973c179cb8f369c7c4942e238f30
data/.circleci/config.yml CHANGED
@@ -6,48 +6,30 @@ version: 2
6
6
  jobs:
7
7
  build:
8
8
  docker:
9
- # specify the version you desire here
10
- - image: circleci/ruby:2.4.1-node-browsers
11
-
9
+ - image: circleci/ruby:2.5.5-node-browsers
10
+
12
11
  # Specify service dependencies here if necessary
13
12
  # CircleCI maintains a library of pre-built images
14
13
  # documented at https://circleci.com/docs/2.0/circleci-images/
15
- # - image: circleci/postgres:9.4
16
14
 
17
15
  working_directory: ~/repo
18
16
 
19
17
  steps:
20
18
  - checkout
21
19
 
22
- # Download and cache dependencies
23
- - restore_cache:
24
- keys:
25
- - v1-dependencies-{{ checksum "Gemfile.lock" }}
26
- # fallback to using the latest cache if no exact match is found
27
- - v1-dependencies-
28
-
20
+ - run: gem install bundler
29
21
  - run:
30
22
  name: install dependencies
31
23
  command: |
32
- bundle install --jobs=4 --retry=3 --path vendor/bundle
24
+ bundle install --jobs=4 --retry=3
33
25
 
34
- - save_cache:
35
- paths:
36
- - ./vendor/bundle
37
- key: v1-dependencies-{{ checksum "Gemfile.lock" }}
38
-
39
- # Database setup
40
- #- run: bundle exec rake db:create
41
- #- run: bundle exec rake db:schema:load
42
-
43
- # run tests!
44
26
  - run:
45
27
  name: run tests
46
28
  command: |
47
29
  mkdir /tmp/test-results
48
30
  TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
49
-
50
- rspec --format progress "spec"
31
+
32
+ bundle exec rspec --format progress "spec"
51
33
 
52
34
  # collect reports
53
35
  - store_test_results:
data/.dockerignore ADDED
@@ -0,0 +1,7 @@
1
+ .git
2
+ .gitignore
3
+ log/*
4
+ tmp/*
5
+ *.swp
6
+ *.swo
7
+ Gemfile.lock
data/CHANGELOG.md CHANGED
@@ -1,3 +1,30 @@
1
+ # 0.5.0 / 2021-02-15
2
+ * internal refactoring for clarity
3
+
4
+ # 0.4.4 / 2021-02-13
5
+ * Do a better job of stripping out script tags
6
+
7
+ # 0.4.3 / 2020-07-18
8
+
9
+ * update deps
10
+
11
+ # 0.4.2 / 2020-03-11
12
+
13
+ * update deps
14
+
15
+ # 0.4.1 / 2019-07-04
16
+
17
+ * Fix bug in min_clause_words_filter ( used in article_sentence_extractor )
18
+ * Allow tests to run in Docker
19
+ * Update circle to continue to work
20
+ * Add architecture flow
21
+ * Code formatting
22
+ * Add min words filter specs
23
+ * Add label action specs
24
+ * Add missing test case to ignorable element spec
25
+ * Add merge_next case to text block spec
26
+ * Dry up includes
27
+
1
28
  # 0.4.0 / 2017-09-15
2
29
 
3
30
  * Add KeepEverythingWithMinKWords Extractor
@@ -22,4 +49,4 @@
22
49
 
23
50
  # 0.1.0 / 2017-09-08
24
51
 
25
- * Add Article Extractor
52
+ * Add Article Extractor
data/Dockerfile ADDED
@@ -0,0 +1,14 @@
1
+ From ruby:2.5
2
+ RUN gem install bundler
3
+ COPY *gemspec /usr/src/app/
4
+ COPY Gemfile /usr/src/app/
5
+ COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
6
+ COPY bin /usr/src/app/
7
+ COPY bin/* /usr/src/app/bin/
8
+
9
+ WORKDIR /usr/src/app
10
+ RUN bin/setup
11
+
12
+ COPY . /usr/src/app/
13
+
14
+ CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Boilerpipe
2
2
 
3
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main)
4
+ [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
5
+
3
6
  A pure ruby implementation of the boilerpipe algorithm.
4
7
 
5
8
  This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
@@ -10,6 +13,8 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
10
13
 
11
14
  This solution works great if you're using JRuby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
12
15
 
16
+ Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
17
+
13
18
  # TLDR
14
19
 
15
20
  Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
@@ -24,9 +29,6 @@ Presently the follow Extractors are implemented
24
29
  * [x] LargestContentExtractor
25
30
  * [x] NumWordsRulesExtractor
26
31
 
27
- [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
28
-
29
- [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
30
32
 
31
33
  ## Installation
32
34
 
@@ -69,7 +71,14 @@ Or install it yourself as:
69
71
 
70
72
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
71
73
 
72
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
74
+ To install this gem onto your local machine, run `bundle exec rake install`.
75
+
76
+ ### Running Tests on Docker
77
+
78
+ The default run command will run the tests
79
+
80
+ docker build -t boilerpipe .
81
+ docker run -it --rm boilerpipe
73
82
 
74
83
  ## Contributing
75
84
 
data/Rakefile CHANGED
@@ -1,14 +1,13 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task :default => :spec
7
7
 
8
-
9
8
  desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
10
9
  task :download_boilerpipe_jar do
11
10
  FileUtils.mkdir_p 'spec/sanity_checks/jars/'
12
11
  Dir.chdir 'spec/sanity_checks/jars/'
13
- `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
12
+ `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
14
13
  end
data/bin/console CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "boilerpipe"
3
+ require 'bundler/setup'
4
+ require 'boilerpipe'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "boilerpipe"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
13
+ require 'irb'
14
14
  IRB.start
@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['<gregory.ostermayr@gmail.com>']
11
11
  spec.license = 'Apache 2.0'
12
12
 
13
- spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
14
- spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
13
+ spec.summary = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
14
+ spec.description = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
15
15
  spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
16
16
 
17
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
21
21
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
23
- spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
- spec.add_development_dependency 'rickshaw', '~> 0.4.0'
26
- spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
22
+ spec.add_development_dependency 'bundler', '~> 2.0'
23
+ spec.add_development_dependency 'rake', '>= 12.3.3'
24
+ spec.add_development_dependency 'rickshaw', '~> 0.5.0'
25
+ spec.add_development_dependency 'rspec', '~> 3.10'
26
+ spec.add_runtime_dependency 'nokogiri', '~> 1.10'
27
27
  end
@@ -0,0 +1,40 @@
1
+ ```
2
+ raw html
3
+ |
4
+ |
5
+ sax input -> sax parser(html parser) -> HTML Content handler -> tokenizer ---------
6
+ |
7
+ -------------------------------------<------------------------------------<------|
8
+ | | |
9
+ text blocks text blocks text blocks
10
+ | | |
11
+ | | |
12
+ -----------------------------
13
+ |
14
+ |
15
+ text document
16
+ |
17
+ |
18
+ filter
19
+ |
20
+ filter
21
+ |
22
+ filter
23
+ |
24
+ filter
25
+ |
26
+ filter
27
+ |
28
+ filter
29
+ |
30
+ filter
31
+ |
32
+ filter
33
+ |
34
+ filter
35
+ |
36
+ |
37
+ text document
38
+ |
39
+ outputs extracted text
40
+ ```
data/lib/boilerpipe.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
1
4
  require 'boilerpipe/version'
2
5
 
3
6
  require 'boilerpipe/util/unicode_tokenizer'
@@ -37,6 +40,7 @@ require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
37
40
  require 'boilerpipe/labels/default'
38
41
  require 'boilerpipe/labels/label_action'
39
42
 
43
+ require 'boilerpipe/sax/preprocessor'
40
44
  require 'boilerpipe/sax/html_content_handler'
41
45
  require 'boilerpipe/sax/boilerpipe_html_parser'
42
46
  require 'boilerpipe/sax/tag_action_map'
@@ -1,10 +1,7 @@
1
- require 'set'
2
-
3
1
  module Boilerpipe
4
2
  module Document
5
3
  class TextBlock
6
-
7
- #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
4
+ # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
8
5
 
9
6
  attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
10
7
  :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
@@ -12,7 +9,7 @@ module Boilerpipe
12
9
 
13
10
  attr_accessor :content
14
11
 
15
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
12
+ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
16
13
  @labels = Set.new
17
14
  @text = text
18
15
  @num_words = num_words
@@ -32,9 +29,9 @@ module Boilerpipe
32
29
  new('', 0, 0, 0, 0, -1)
33
30
  end
34
31
 
35
- def set_tag_level(level)
36
- @tag_level = level
37
- end
32
+ def set_tag_level(level)
33
+ @tag_level = level
34
+ end
38
35
 
39
36
  def is_content?
40
37
  @content
@@ -68,8 +65,8 @@ module Boilerpipe
68
65
  @num_words_in_anchor_text += other.num_words_in_anchor_text
69
66
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
70
67
  @num_wrapped_lines += other.num_wrapped_lines
71
- @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
72
- @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
68
+ @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
69
+ @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
73
70
  init_densities
74
71
  @content |= other.is_content?
75
72
 
@@ -87,10 +84,10 @@ module Boilerpipe
87
84
  end
88
85
 
89
86
  def to_s
90
- #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
87
+ # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
91
88
  labels = 'null'
92
89
  if !@labels.empty?
93
- labels ="[#{ @labels.to_a.join(',')}]"
90
+ labels = "[#{@labels.to_a.join(',')}]"
94
91
  end
95
92
  "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
96
93
  end
@@ -100,6 +97,7 @@ module Boilerpipe
100
97
  end
101
98
 
102
99
  private
100
+
103
101
  def init_densities
104
102
  if @num_words_in_wrapped_lines == 0
105
103
  @num_words_in_wrapped_lines = @num_words
@@ -19,14 +19,14 @@ module Boilerpipe
19
19
  case text_block.is_content?
20
20
  when true
21
21
  next unless include_content
22
- s << text_block.text
23
- s << "\n"
24
22
  when false
25
23
  next unless include_noncontent
26
- s << text_block.text
27
- s << "\n"
28
24
  end
25
+
26
+ s << text_block.text
27
+ s << "\n"
29
28
  end
29
+
30
30
  s
31
31
  end
32
32
 
@@ -38,7 +38,6 @@ module Boilerpipe
38
38
  @text_blocks.map(&:to_s).join("\n")
39
39
  end
40
40
  alias_method :debug_string, :debug_s
41
-
42
41
  end
43
42
  end
44
43
  end
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class CanolaExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::CanolaExtractor.process doc
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class DefaultExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::DefaultExtractor.process doc
@@ -1,4 +1,4 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Extractors
4
4
  class KeepEverythingExtractor
@@ -1,4 +1,3 @@
1
-
2
1
  # A full-text extractor which extracts the largest text component of a page.
3
2
  # For news articles, it may perform better than the DefaultExtractor, but
4
3
  # usually worse than ArticleExtractor.
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class NumWordsRulesExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
@@ -1,11 +1,8 @@
1
-
2
- # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
3
- # probably makes sense only in cases where an upstream filter already has removed some blocks.
1
+ # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
2
+ # probably makes sense only in cases where an upstream filter already has removed some blocks.
4
3
 
5
4
  module Boilerpipe::Filters
6
5
  class BlockProximityFusion
7
-
8
-
9
6
  def initialize(max_blocks_distance, content_only, same_tag_level_only)
10
7
  @max_blocks_distance = max_blocks_distance
11
8
  @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
13
10
  end
14
11
 
15
12
  MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
16
- MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
17
- MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
13
+ MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
14
+ MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
18
15
  MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
19
16
 
20
17
  def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
22
19
  return false if text_blocks.size < 2
23
20
 
24
21
  prev_block = if @content_only
25
- text_blocks.find{ |tb| tb.is_content? }
22
+ text_blocks.find { |tb| tb.is_content? }
26
23
  else
27
24
  text_blocks.first
28
25
  end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
46
43
  ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
47
44
  ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
48
45
 
49
- if ok
46
+ if ok
50
47
  prev_block.merge_next(tb)
51
48
  blocks_to_remove << tb
52
49
  else
53
50
  prev_block = tb
54
51
  end
55
52
  end
56
-
57
53
  end
58
- doc.replace_text_blocks!( text_blocks - blocks_to_remove )
54
+ doc.replace_text_blocks!(text_blocks - blocks_to_remove)
59
55
  doc
60
56
  end
61
-
62
57
  end
63
58
  end