boilerpipe-ruby 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +28 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +13 -4
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +4 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
  15. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  16. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
  17. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
  18. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
  19. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  20. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  21. data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
  22. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  23. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  24. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
  25. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  26. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  27. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  28. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  29. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  30. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
  31. data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
  32. data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
  33. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  34. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  35. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
  36. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  37. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  38. data/lib/boilerpipe/labels/label_action.rb +1 -1
  39. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
  40. data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
  41. data/lib/boilerpipe/sax/preprocessor.rb +11 -0
  42. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  43. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  44. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
  47. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  48. data/lib/boilerpipe/version.rb +1 -1
  49. metadata +28 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
4
- data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
2
+ SHA256:
3
+ metadata.gz: 7fec2bd11d29c4b5d14f70e10fcac76beb95c61e25fbe5cac15b82e8c64fbf69
4
+ data.tar.gz: 766ea373235462c3678cc2487c647d6211fd2fc066626d5c10ab7e4d31f303ad
5
5
  SHA512:
6
- metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
7
- data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
6
+ metadata.gz: be90614a1c2efa29356e9b3b255a5e5d4374474fd6b711d4ed9ab575c4ab8466a1d6903c23de46276133d1621727dea8525422e49608185c1e6294af4f6e0f54
7
+ data.tar.gz: 5b368e59ced5b794b8e2033b632de67a09bf43e3c45070e96a66ce695bddb5966130e37d9862179a9c989d5166645e182bd2973c179cb8f369c7c4942e238f30
data/.circleci/config.yml CHANGED
@@ -6,48 +6,30 @@ version: 2
6
6
  jobs:
7
7
  build:
8
8
  docker:
9
- # specify the version you desire here
10
- - image: circleci/ruby:2.4.1-node-browsers
11
-
9
+ - image: circleci/ruby:2.5.5-node-browsers
10
+
12
11
  # Specify service dependencies here if necessary
13
12
  # CircleCI maintains a library of pre-built images
14
13
  # documented at https://circleci.com/docs/2.0/circleci-images/
15
- # - image: circleci/postgres:9.4
16
14
 
17
15
  working_directory: ~/repo
18
16
 
19
17
  steps:
20
18
  - checkout
21
19
 
22
- # Download and cache dependencies
23
- - restore_cache:
24
- keys:
25
- - v1-dependencies-{{ checksum "Gemfile.lock" }}
26
- # fallback to using the latest cache if no exact match is found
27
- - v1-dependencies-
28
-
20
+ - run: gem install bundler
29
21
  - run:
30
22
  name: install dependencies
31
23
  command: |
32
- bundle install --jobs=4 --retry=3 --path vendor/bundle
24
+ bundle install --jobs=4 --retry=3
33
25
 
34
- - save_cache:
35
- paths:
36
- - ./vendor/bundle
37
- key: v1-dependencies-{{ checksum "Gemfile.lock" }}
38
-
39
- # Database setup
40
- #- run: bundle exec rake db:create
41
- #- run: bundle exec rake db:schema:load
42
-
43
- # run tests!
44
26
  - run:
45
27
  name: run tests
46
28
  command: |
47
29
  mkdir /tmp/test-results
48
30
  TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
49
-
50
- rspec --format progress "spec"
31
+
32
+ bundle exec rspec --format progress "spec"
51
33
 
52
34
  # collect reports
53
35
  - store_test_results:
data/.dockerignore ADDED
@@ -0,0 +1,7 @@
1
+ .git
2
+ .gitignore
3
+ log/*
4
+ tmp/*
5
+ *.swp
6
+ *.swo
7
+ Gemfile.lock
data/CHANGELOG.md CHANGED
@@ -1,3 +1,30 @@
1
+ # 0.5.0 / 2021-02-15
2
+ * internal refactoring for clarity
3
+
4
+ # 0.4.4 / 2021-02-13
5
+ * Do a better job of stripping out script tags
6
+
7
+ # 0.4.3 / 2020-07-18
8
+
9
+ * update deps
10
+
11
+ # 0.4.2 / 2020-03-11
12
+
13
+ * update deps
14
+
15
+ # 0.4.1 / 2019-07-04
16
+
17
+ * Fix bug in min_clause_words_filter ( used in article_sentence_extractor )
18
+ * Allow tests to run in Docker
19
+ * Update circle to continue to work
20
+ * Add architecture flow
21
+ * Code formatting
22
+ * Add min words filter specs
23
+ * Add label action specs
24
+ * Add missing test case to ignorable element spec
25
+ * Add merge_next case to text block spec
26
+ * Dry up includes
27
+
1
28
  # 0.4.0 / 2017-09-15
2
29
 
3
30
  * Add KeepEverythingWithMinKWords Extractor
@@ -22,4 +49,4 @@
22
49
 
23
50
  # 0.1.0 / 2017-09-08
24
51
 
25
- * Add Article Extractor
52
+ * Add Article Extractor
data/Dockerfile ADDED
@@ -0,0 +1,14 @@
1
+ From ruby:2.5
2
+ RUN gem install bundler
3
+ COPY *gemspec /usr/src/app/
4
+ COPY Gemfile /usr/src/app/
5
+ COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
6
+ COPY bin /usr/src/app/
7
+ COPY bin/* /usr/src/app/bin/
8
+
9
+ WORKDIR /usr/src/app
10
+ RUN bin/setup
11
+
12
+ COPY . /usr/src/app/
13
+
14
+ CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Boilerpipe
2
2
 
3
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main)
4
+ [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
5
+
3
6
  A pure ruby implementation of the boilerpipe algorithm.
4
7
 
5
8
  This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
@@ -10,6 +13,8 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
10
13
 
11
14
  This solution works great if you're using JRuby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
12
15
 
16
+ Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
17
+
13
18
  # TLDR
14
19
 
15
20
  Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
@@ -24,9 +29,6 @@ Presently the follow Extractors are implemented
24
29
  * [x] LargestContentExtractor
25
30
  * [x] NumWordsRulesExtractor
26
31
 
27
- [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
28
-
29
- [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
30
32
 
31
33
  ## Installation
32
34
 
@@ -69,7 +71,14 @@ Or install it yourself as:
69
71
 
70
72
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
71
73
 
72
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
74
+ To install this gem onto your local machine, run `bundle exec rake install`.
75
+
76
+ ### Running Tests on Docker
77
+
78
+ The default run command will run the tests
79
+
80
+ docker build -t boilerpipe .
81
+ docker run -it --rm boilerpipe
73
82
 
74
83
  ## Contributing
75
84
 
data/Rakefile CHANGED
@@ -1,14 +1,13 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task :default => :spec
7
7
 
8
-
9
8
  desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
10
9
  task :download_boilerpipe_jar do
11
10
  FileUtils.mkdir_p 'spec/sanity_checks/jars/'
12
11
  Dir.chdir 'spec/sanity_checks/jars/'
13
- `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
12
+ `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
14
13
  end
data/bin/console CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "boilerpipe"
3
+ require 'bundler/setup'
4
+ require 'boilerpipe'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "boilerpipe"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
13
+ require 'irb'
14
14
  IRB.start
@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['<gregory.ostermayr@gmail.com>']
11
11
  spec.license = 'Apache 2.0'
12
12
 
13
- spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
14
- spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
13
+ spec.summary = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
14
+ spec.description = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
15
15
  spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
16
16
 
17
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
21
21
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
23
- spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
- spec.add_development_dependency 'rickshaw', '~> 0.4.0'
26
- spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
22
+ spec.add_development_dependency 'bundler', '~> 2.0'
23
+ spec.add_development_dependency 'rake', '>= 12.3.3'
24
+ spec.add_development_dependency 'rickshaw', '~> 0.5.0'
25
+ spec.add_development_dependency 'rspec', '~> 3.10'
26
+ spec.add_runtime_dependency 'nokogiri', '~> 1.10'
27
27
  end
@@ -0,0 +1,40 @@
1
+ ```
2
+ raw html
3
+ |
4
+ |
5
+ sax input -> sax parser(html parser) -> HTML Content handler -> tokenizer ---------
6
+ |
7
+ -------------------------------------<------------------------------------<------|
8
+ | | |
9
+ text blocks text blocks text blocks
10
+ | | |
11
+ | | |
12
+ -----------------------------
13
+ |
14
+ |
15
+ text document
16
+ |
17
+ |
18
+ filter
19
+ |
20
+ filter
21
+ |
22
+ filter
23
+ |
24
+ filter
25
+ |
26
+ filter
27
+ |
28
+ filter
29
+ |
30
+ filter
31
+ |
32
+ filter
33
+ |
34
+ filter
35
+ |
36
+ |
37
+ text document
38
+ |
39
+ outputs extracted text
40
+ ```
data/lib/boilerpipe.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
1
4
  require 'boilerpipe/version'
2
5
 
3
6
  require 'boilerpipe/util/unicode_tokenizer'
@@ -37,6 +40,7 @@ require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
37
40
  require 'boilerpipe/labels/default'
38
41
  require 'boilerpipe/labels/label_action'
39
42
 
43
+ require 'boilerpipe/sax/preprocessor'
40
44
  require 'boilerpipe/sax/html_content_handler'
41
45
  require 'boilerpipe/sax/boilerpipe_html_parser'
42
46
  require 'boilerpipe/sax/tag_action_map'
@@ -1,10 +1,7 @@
1
- require 'set'
2
-
3
1
  module Boilerpipe
4
2
  module Document
5
3
  class TextBlock
6
-
7
- #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
4
+ # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
8
5
 
9
6
  attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
10
7
  :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
@@ -12,7 +9,7 @@ module Boilerpipe
12
9
 
13
10
  attr_accessor :content
14
11
 
15
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
12
+ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
16
13
  @labels = Set.new
17
14
  @text = text
18
15
  @num_words = num_words
@@ -32,9 +29,9 @@ module Boilerpipe
32
29
  new('', 0, 0, 0, 0, -1)
33
30
  end
34
31
 
35
- def set_tag_level(level)
36
- @tag_level = level
37
- end
32
+ def set_tag_level(level)
33
+ @tag_level = level
34
+ end
38
35
 
39
36
  def is_content?
40
37
  @content
@@ -68,8 +65,8 @@ module Boilerpipe
68
65
  @num_words_in_anchor_text += other.num_words_in_anchor_text
69
66
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
70
67
  @num_wrapped_lines += other.num_wrapped_lines
71
- @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
72
- @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
68
+ @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
69
+ @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
73
70
  init_densities
74
71
  @content |= other.is_content?
75
72
 
@@ -87,10 +84,10 @@ module Boilerpipe
87
84
  end
88
85
 
89
86
  def to_s
90
- #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
87
+ # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
91
88
  labels = 'null'
92
89
  if !@labels.empty?
93
- labels ="[#{ @labels.to_a.join(',')}]"
90
+ labels = "[#{@labels.to_a.join(',')}]"
94
91
  end
95
92
  "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
96
93
  end
@@ -100,6 +97,7 @@ module Boilerpipe
100
97
  end
101
98
 
102
99
  private
100
+
103
101
  def init_densities
104
102
  if @num_words_in_wrapped_lines == 0
105
103
  @num_words_in_wrapped_lines = @num_words
@@ -19,14 +19,14 @@ module Boilerpipe
19
19
  case text_block.is_content?
20
20
  when true
21
21
  next unless include_content
22
- s << text_block.text
23
- s << "\n"
24
22
  when false
25
23
  next unless include_noncontent
26
- s << text_block.text
27
- s << "\n"
28
24
  end
25
+
26
+ s << text_block.text
27
+ s << "\n"
29
28
  end
29
+
30
30
  s
31
31
  end
32
32
 
@@ -38,7 +38,6 @@ module Boilerpipe
38
38
  @text_blocks.map(&:to_s).join("\n")
39
39
  end
40
40
  alias_method :debug_string, :debug_s
41
-
42
41
  end
43
42
  end
44
43
  end
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class CanolaExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::CanolaExtractor.process doc
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class DefaultExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::DefaultExtractor.process doc
@@ -1,4 +1,4 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Extractors
4
4
  class KeepEverythingExtractor
@@ -1,4 +1,3 @@
1
-
2
1
  # A full-text extractor which extracts the largest text component of a page.
3
2
  # For news articles, it may perform better than the DefaultExtractor, but
4
3
  # usually worse than ArticleExtractor.
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class NumWordsRulesExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
@@ -1,11 +1,8 @@
1
-
2
- # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
3
- # probably makes sense only in cases where an upstream filter already has removed some blocks.
1
+ # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
2
+ # probably makes sense only in cases where an upstream filter already has removed some blocks.
4
3
 
5
4
  module Boilerpipe::Filters
6
5
  class BlockProximityFusion
7
-
8
-
9
6
  def initialize(max_blocks_distance, content_only, same_tag_level_only)
10
7
  @max_blocks_distance = max_blocks_distance
11
8
  @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
13
10
  end
14
11
 
15
12
  MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
16
- MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
17
- MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
13
+ MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
14
+ MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
18
15
  MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
19
16
 
20
17
  def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
22
19
  return false if text_blocks.size < 2
23
20
 
24
21
  prev_block = if @content_only
25
- text_blocks.find{ |tb| tb.is_content? }
22
+ text_blocks.find { |tb| tb.is_content? }
26
23
  else
27
24
  text_blocks.first
28
25
  end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
46
43
  ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
47
44
  ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
48
45
 
49
- if ok
46
+ if ok
50
47
  prev_block.merge_next(tb)
51
48
  blocks_to_remove << tb
52
49
  else
53
50
  prev_block = tb
54
51
  end
55
52
  end
56
-
57
53
  end
58
- doc.replace_text_blocks!( text_blocks - blocks_to_remove )
54
+ doc.replace_text_blocks!(text_blocks - blocks_to_remove)
59
55
  doc
60
56
  end
61
-
62
57
  end
63
58
  end