boilerpipe-ruby 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +13 -0
  5. data/Dockerfile +14 -0
  6. data/README.md +12 -3
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +6 -6
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +3 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -3
  14. data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
  15. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  16. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
  17. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
  18. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
  19. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  20. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  21. data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
  22. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  23. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  24. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
  25. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  26. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  27. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  28. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  29. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  30. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
  31. data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
  32. data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
  33. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  34. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  35. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
  36. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  37. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  38. data/lib/boilerpipe/labels/label_action.rb +1 -1
  39. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +1 -5
  40. data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
  41. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  42. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  43. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  44. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  47. data/lib/boilerpipe/version.rb +1 -1
  48. metadata +16 -14
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
4
- data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
2
+ SHA256:
3
+ metadata.gz: 65756911038bd486a08337188c3275ebd0c0c65e4b902edbd6b6667dda422740
4
+ data.tar.gz: ff83632d9cea8e4a0ede8609115e8282a856d2d7728c801e64ecec39a9399857
5
5
  SHA512:
6
- metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
7
- data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
6
+ metadata.gz: e5e902c81cea26252c41bc4b96d0faebe6682a0dc5ae2c09397762ef4c5a7f244c0c100f87923863b308d2ef9b5ecc732e674d2ca801e4087f99031d46776034
7
+ data.tar.gz: 6788183a0a4c9d01c764d17537c52edaf9d32b93fb42da8a013f9f0b14f6a4f757a8d3b5f77b73fef59f62fcf988a723abf7fe8305f5626b0337838c4eb31c7d
@@ -6,48 +6,30 @@ version: 2
6
6
  jobs:
7
7
  build:
8
8
  docker:
9
- # specify the version you desire here
10
- - image: circleci/ruby:2.4.1-node-browsers
11
-
9
+ - image: circleci/ruby:2.5.5-node-browsers
10
+
12
11
  # Specify service dependencies here if necessary
13
12
  # CircleCI maintains a library of pre-built images
14
13
  # documented at https://circleci.com/docs/2.0/circleci-images/
15
- # - image: circleci/postgres:9.4
16
14
 
17
15
  working_directory: ~/repo
18
16
 
19
17
  steps:
20
18
  - checkout
21
19
 
22
- # Download and cache dependencies
23
- - restore_cache:
24
- keys:
25
- - v1-dependencies-{{ checksum "Gemfile.lock" }}
26
- # fallback to using the latest cache if no exact match is found
27
- - v1-dependencies-
28
-
20
+ - run: gem install bundler
29
21
  - run:
30
22
  name: install dependencies
31
23
  command: |
32
- bundle install --jobs=4 --retry=3 --path vendor/bundle
24
+ bundle install --jobs=4 --retry=3
33
25
 
34
- - save_cache:
35
- paths:
36
- - ./vendor/bundle
37
- key: v1-dependencies-{{ checksum "Gemfile.lock" }}
38
-
39
- # Database setup
40
- #- run: bundle exec rake db:create
41
- #- run: bundle exec rake db:schema:load
42
-
43
- # run tests!
44
26
  - run:
45
27
  name: run tests
46
28
  command: |
47
29
  mkdir /tmp/test-results
48
30
  TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
49
-
50
- rspec --format progress "spec"
31
+
32
+ bundle exec rspec --format progress "spec"
51
33
 
52
34
  # collect reports
53
35
  - store_test_results:
@@ -0,0 +1,7 @@
1
+ .git
2
+ .gitignore
3
+ log/*
4
+ tmp/*
5
+ *.swp
6
+ *.swo
7
+ Gemfile.lock
@@ -1,3 +1,16 @@
1
+ # 0.4.1 / 2019-07-04
2
+
3
+ * Fix bug in min_clause_words_filter (used in article_sentence_extractor)
4
+ * Allow tests to run in Docker
5
+ * Update circle to continue to work
6
+ * Add architecture flow
7
+ * Code formatting
8
+ * Add min words filter specs
9
+ * Add label action specs
10
+ * Add missing test case to ignorable element spec
11
+ * Add merge_next case to text block spec
12
+ * Dry up includes
13
+
1
14
  # 0.4.0 / 2017-09-15
2
15
 
3
16
  * Add KeepEverythingWithMinKWords Extractor
@@ -0,0 +1,14 @@
1
+ From ruby:2.5
2
+ RUN gem install bundler
3
+ COPY *gemspec /usr/src/app/
4
+ COPY Gemfile /usr/src/app/
5
+ COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
6
+ COPY bin /usr/src/app/
7
+ COPY bin/* /usr/src/app/bin/
8
+
9
+ WORKDIR /usr/src/app
10
+ RUN bin/setup
11
+
12
+ COPY . /usr/src/app/
13
+
14
+ CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Boilerpipe
2
2
 
3
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
4
+ [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
5
+
3
6
  A pure ruby implementation of the boilerpipe algorithm.
4
7
 
5
8
  This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
@@ -10,6 +13,8 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
10
13
 
11
14
  This solution works great if you're using Jruby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
12
15
 
16
+ Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
17
+
13
18
  # TLDR
14
19
 
15
20
  Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
@@ -24,9 +29,6 @@ Presently the follow Extractors are implemented
24
29
  * [x] LargestContentExtractor
25
30
  * [x] NumWordsRulesExtractor
26
31
 
27
- [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
28
-
29
- [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
30
32
 
31
33
  ## Installation
32
34
 
@@ -71,6 +73,13 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
71
73
 
72
74
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
73
75
 
76
+ ### Running Tests on Docker
77
+
78
+ The default run command will run the tests
79
+
80
+ docker build -t boilerpipe .
81
+ docker run -it --rm boilerpipe
82
+
74
83
  ## Contributing
75
84
 
76
85
  Bug reports and pull requests are welcome on GitHub at https://github.com/gregors/boilerpipe-ruby.
data/Rakefile CHANGED
@@ -1,14 +1,13 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task :default => :spec
7
7
 
8
-
9
8
  desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
10
9
  task :download_boilerpipe_jar do
11
10
  FileUtils.mkdir_p 'spec/sanity_checks/jars/'
12
11
  Dir.chdir 'spec/sanity_checks/jars/'
13
- `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
12
+ `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
14
13
  end
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "boilerpipe"
3
+ require 'bundler/setup'
4
+ require 'boilerpipe'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "boilerpipe"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
13
+ require 'irb'
14
14
  IRB.start
@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['<gregory.ostermayr@gmail.com>']
11
11
  spec.license = 'Apache 2.0'
12
12
 
13
- spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
14
- spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
13
+ spec.summary = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
14
+ spec.description = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
15
15
  spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
16
16
 
17
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
21
21
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
22
+ spec.add_development_dependency 'bundler', '~> 2.0'
23
23
  spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
24
  spec.add_development_dependency 'rickshaw', '~> 0.4.0'
25
+ spec.add_development_dependency 'rspec', '~> 3.0'
26
26
  spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
27
27
  end
@@ -0,0 +1,40 @@
1
+ ```
2
+ raw html
3
+ |
4
+ |
5
+ sax input -> sax parser(html parser) -> HTML Content handler -> tokenizer ---------
6
+ |
7
+ -------------------------------------<------------------------------------<------|
8
+ | | |
9
+ text blocks text blocks text blocks
10
+ | | |
11
+ | | |
12
+ -----------------------------
13
+ |
14
+ |
15
+ text document
16
+ |
17
+ |
18
+ filter
19
+ |
20
+ filter
21
+ |
22
+ filter
23
+ |
24
+ filter
25
+ |
26
+ filter
27
+ |
28
+ filter
29
+ |
30
+ filter
31
+ |
32
+ filter
33
+ |
34
+ filter
35
+ |
36
+ |
37
+ text document
38
+ |
39
+ outputs extracted text
40
+ ```
@@ -1,3 +1,6 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
1
4
  require 'boilerpipe/version'
2
5
 
3
6
  require 'boilerpipe/util/unicode_tokenizer'
@@ -1,10 +1,7 @@
1
- require 'set'
2
-
3
1
  module Boilerpipe
4
2
  module Document
5
3
  class TextBlock
6
-
7
- #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
4
+ # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
8
5
 
9
6
  attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
10
7
  :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
@@ -12,7 +9,7 @@ module Boilerpipe
12
9
 
13
10
  attr_accessor :content
14
11
 
15
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
12
+ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
16
13
  @labels = Set.new
17
14
  @text = text
18
15
  @num_words = num_words
@@ -32,9 +29,9 @@ module Boilerpipe
32
29
  new('', 0, 0, 0, 0, -1)
33
30
  end
34
31
 
35
- def set_tag_level(level)
36
- @tag_level = level
37
- end
32
+ def set_tag_level(level)
33
+ @tag_level = level
34
+ end
38
35
 
39
36
  def is_content?
40
37
  @content
@@ -68,8 +65,8 @@ module Boilerpipe
68
65
  @num_words_in_anchor_text += other.num_words_in_anchor_text
69
66
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
70
67
  @num_wrapped_lines += other.num_wrapped_lines
71
- @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
72
- @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
68
+ @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
69
+ @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
73
70
  init_densities
74
71
  @content |= other.is_content?
75
72
 
@@ -87,10 +84,10 @@ module Boilerpipe
87
84
  end
88
85
 
89
86
  def to_s
90
- #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
87
+ # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
91
88
  labels = 'null'
92
89
  if !@labels.empty?
93
- labels ="[#{ @labels.to_a.join(',')}]"
90
+ labels = "[#{@labels.to_a.join(',')}]"
94
91
  end
95
92
  "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
96
93
  end
@@ -100,6 +97,7 @@ module Boilerpipe
100
97
  end
101
98
 
102
99
  private
100
+
103
101
  def init_densities
104
102
  if @num_words_in_wrapped_lines == 0
105
103
  @num_words_in_wrapped_lines = @num_words
@@ -19,12 +19,14 @@ module Boilerpipe
19
19
  case text_block.is_content?
20
20
  when true
21
21
  next unless include_content
22
+
22
23
  s << text_block.text
23
24
  s << "\n"
24
25
  when false
25
26
  next unless include_noncontent
26
- s << text_block.text
27
- s << "\n"
27
+
28
+ s << text_block.text
29
+ s << "\n"
28
30
  end
29
31
  end
30
32
  s
@@ -38,7 +40,6 @@ module Boilerpipe
38
40
  @text_blocks.map(&:to_s).join("\n")
39
41
  end
40
42
  alias_method :debug_string, :debug_s
41
-
42
43
  end
43
44
  end
44
45
  end
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class CanolaExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::CanolaExtractor.process doc
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class DefaultExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::DefaultExtractor.process doc
@@ -1,4 +1,4 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Extractors
4
4
  class KeepEverythingExtractor
@@ -1,4 +1,3 @@
1
-
2
1
  # A full-text extractor which extracts the largest text component of a page.
3
2
  # For news articles, it may perform better than the DefaultExtractor, but
4
3
  # usually worse than ArticleExtractor.
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class NumWordsRulesExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
@@ -1,11 +1,8 @@
1
-
2
- # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
3
- # probably makes sense only in cases where an upstream filter already has removed some blocks.
1
+ # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
2
+ # probably makes sense only in cases where an upstream filter already has removed some blocks.
4
3
 
5
4
  module Boilerpipe::Filters
6
5
  class BlockProximityFusion
7
-
8
-
9
6
  def initialize(max_blocks_distance, content_only, same_tag_level_only)
10
7
  @max_blocks_distance = max_blocks_distance
11
8
  @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
13
10
  end
14
11
 
15
12
  MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
16
- MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
17
- MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
13
+ MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
14
+ MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
18
15
  MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
19
16
 
20
17
  def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
22
19
  return false if text_blocks.size < 2
23
20
 
24
21
  prev_block = if @content_only
25
- text_blocks.find{ |tb| tb.is_content? }
22
+ text_blocks.find { |tb| tb.is_content? }
26
23
  else
27
24
  text_blocks.first
28
25
  end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
46
43
  ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
47
44
  ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
48
45
 
49
- if ok
46
+ if ok
50
47
  prev_block.merge_next(tb)
51
48
  blocks_to_remove << tb
52
49
  else
53
50
  prev_block = tb
54
51
  end
55
52
  end
56
-
57
53
  end
58
- doc.replace_text_blocks!( text_blocks - blocks_to_remove )
54
+ doc.replace_text_blocks!(text_blocks - blocks_to_remove)
59
55
  doc
60
56
  end
61
-
62
57
  end
63
58
  end
@@ -1,9 +1,7 @@
1
-
2
- # Removes TextBlocks which have explicitly been marked as "not content".
1
+ # Removes TextBlocks which have explicitly been marked as "not content".
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class BoilerplateBlockFilter
6
-
7
5
  def initialize(label)
8
6
  @label_to_keep = label
9
7
  end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
21
19
  doc.replace_text_blocks!(combined)
22
20
  doc
23
21
  end
24
-
25
22
  end
26
23
  end
@@ -1,10 +1,9 @@
1
- # A full-text extractor trained on http://krdwrd.org/
2
- # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
- # Works well with SimpleEstimator, too.
1
+ # A full-text extractor trained on http://krdwrd.org/
2
+ # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
+ # Works well with SimpleEstimator, too.
4
4
 
5
5
  module Boilerpipe::Filters
6
6
  class CanolaClassifier
7
-
8
7
  def self.process(doc)
9
8
  return doc if doc.text_blocks.size < 1
10
9
 
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
22
21
  def self.classify(prev, current, nxt)
23
22
  current.link_density > 0 && nxt.num_words > 11 \
24
23
  || current.num_words > 19 \
25
- || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
24
+ || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
26
25
  end
27
26
  end
28
27
  end
@@ -5,9 +5,8 @@
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class DensityRulesClassifier
8
-
9
8
  def self.process(doc)
10
- #return doc if doc.text_blocks.size < 2
9
+ # return doc if doc.text_blocks.size < 2
11
10
 
12
11
  empty = Boilerpipe::Document::TextBlock.empty_start
13
12
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
26
25
  if prev.link_density <= 0.555556
27
26
  if current.text_density <= 9
28
27
  return true if nxt.text_density > 10
28
+
29
29
  return prev.text_density <= 4 ? false : true
30
30
  else
31
31
  return nxt.text_density == 0 ? false : true
32
32
  end
33
33
  else
34
34
  return false if nxt.text_density <= 11
35
+
35
36
  true
36
37
  end
37
38
  end
@@ -1,12 +1,9 @@
1
- # encoding: utf-8
2
- require 'set'
1
+ # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
2
+ # some heuristics which are quite specific to the news domain.
3
3
 
4
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
5
- # some heuristics which are quite specific to the news domain.
6
-
7
- # we create a list of potential titles from the page title
8
- # then we look at every text block and if the text block
9
- # contains a potential title - we set that text block label as :TITLE
4
+ # we create a list of potential titles from the page title
5
+ # then we look at every text block and if the text block
6
+ # contains a potential title - we set that text block label as :TITLE
10
7
 
11
8
  module Boilerpipe::Filters
12
9
  class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
55
52
  @potential_titles << title
56
53
 
57
54
  # unnecessary
58
- #p = longest_part(title, /[ ]*[|»-][ ]*/)
59
- #@potential_titles << p if p
55
+ # p = longest_part(title, /[ ]*[|»-][ ]*/)
56
+ # @potential_titles << p if p
60
57
 
61
- #p = longest_part(title, /[ ]*[|»:][ ]*/)
62
- #@potential_titles << p if p
58
+ # p = longest_part(title, /[ ]*[|»:][ ]*/)
59
+ # @potential_titles << p if p
63
60
 
64
- #p = longest_part(title, /[ ]*[|»:()][ ]*/)
65
- #@potential_titles << p if p
61
+ # p = longest_part(title, /[ ]*[|»:()][ ]*/)
62
+ # @potential_titles << p if p
66
63
 
67
- #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
68
- #@potential_titles << p if p
64
+ # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
65
+ # @potential_titles << p if p
69
66
 
70
67
  p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
71
68
  @potential_titles << p if p
72
69
 
73
70
  # we replace \u00a0 so why check for it?
74
- #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
75
- #@potential_titles << p if p
71
+ # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
72
+ # @potential_titles << p if p
76
73
 
77
74
  add_potential_titles(title, /[ ]+[|][ ]+/, 4)
78
75
  add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
90
87
 
91
88
  parts.each do |part|
92
89
  next if part =~ /[.]com/
90
+
93
91
  num_words = number_of_words(part)
94
92
 
95
93
  if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
107
105
 
108
106
  parts.each do |part|
109
107
  next if part =~ /[.]com/
108
+
110
109
  num_words = number_of_words(part)
111
110
 
112
111
  @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
116
115
  def number_of_words(s)
117
116
  s.split(/[\b ]+/).size
118
117
  end
119
-
120
118
  end
121
119
  end
@@ -1,10 +1,8 @@
1
-
2
1
  # Marks all TextBlocks "content" which are between the headline and the part that has
3
2
  # already been marked content, if they are marked MIGHT_BE_CONTENT.
4
3
  # This filter is quite specific to the news domain.
5
4
  # used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
6
5
 
7
-
8
6
  module Boilerpipe::Filters
9
7
  class ExpandTitleToContentFilter
10
8
  def self.process(doc)
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
38
36
  def self.no_title_with_subsequent_content?(content_start, title)
39
37
  title.nil? || content_start.nil? || content_start <= title
40
38
  end
41
-
42
39
  end
43
40
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::Filters
2
2
  class HeuristicFilterBase
3
- def self.num_full_text_words(tb, min_text_density=9.0)
3
+ def self.num_full_text_words(tb, min_text_density = 9.0)
4
4
  tb.text_density >= min_text_density ? tb.num_words : 0
5
5
  end
6
6
  end
@@ -1,12 +1,11 @@
1
- # Marks all blocks as "non-content" that occur after blocks that have been
2
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
- # number of words in content blocks occur before this mark (default: 60).
4
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
1
+ # Marks all blocks as "non-content" that occur after blocks that have been
2
+ # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
+ # number of words in content blocks occur before this mark (default: 60).
4
+ # This can be used in conjunction with an upstream TerminatingBlocksFinder.
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
8
-
9
- def self.process(doc, min_num_words=60)
8
+ def self.process(doc, min_num_words = 60)
10
9
  found_end_of_text = false
11
10
  num_words = 0
12
11
 
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
19
18
 
20
19
  doc
21
20
  end
22
-
23
21
  end
24
22
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Keeps the largest TextBlock only (by the number of words). In case of
3
2
  # more than one block with the same number of words, the first block is chosen.
4
3
  # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class KeepLargestBlockFilter
11
-
12
10
  def initialize(expand_to_same_level_text, min_words)
13
11
  @expand_to_same_level_text = expand_to_same_level_text
14
12
  @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
43
41
  expand_tag_level(tbs[0...n].reverse, level, @min_words)
44
42
 
45
43
  # expand blocks to the right
46
- expand_tag_level(tbs[n+1..-1], level, @min_words)
44
+ expand_tag_level(tbs[n + 1..-1], level, @min_words)
47
45
  end
48
46
  end
49
47
 
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
57
55
  end
58
56
  end
59
57
  end
60
-
61
58
  end
62
59
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks all blocks as content that:
3
2
  # are on the same tag-level as very likely main content
4
3
  # (usually the level of the largest block)
@@ -7,23 +6,22 @@
7
6
 
8
7
  module Boilerpipe::Filters
9
8
  class LargeBlockSameTagLevelToContentFilter
10
-
11
9
  def self.process(doc)
12
-
13
10
  largest = doc.text_blocks.find do |tb|
14
11
  tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
15
12
  end
16
13
 
17
14
  return doc if largest.nil?
15
+
18
16
  tag_level = largest.tag_level
19
17
 
20
18
  doc.text_blocks.each do |tb|
21
19
  next if tb.is_content?
20
+
22
21
  tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
23
22
  end
24
23
 
25
24
  doc
26
25
  end
27
-
28
26
  end
29
27
  end
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
11
11
  doc.text_blocks.each do |tb|
12
12
  if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
13
13
  tag_level = tb.tag_level
14
- elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
14
+ elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
15
15
  tb.content = true
16
16
  else
17
17
  tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
20
20
 
21
21
  doc
22
22
  end
23
-
24
23
  end
25
24
  end
@@ -1,14 +1,12 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class MarkEverythingContentFilter
5
-
6
5
  def self.process(doc)
7
6
  doc.text_blocks.each do |tb|
8
7
  tb.content = true if tb.is_not_content?
9
8
  end
10
9
  doc
11
10
  end
12
-
13
11
  end
14
12
  end
@@ -8,30 +8,27 @@
8
8
 
9
9
  module Boilerpipe::Filters
10
10
  class MinClauseWordsFilter
11
-
12
- def self.process(doc, min_words=5)
13
-
11
+ def self.process(doc, min_words = 5)
14
12
  doc.text_blocks.each do |tb|
15
13
  next if tb.is_not_content?
16
14
 
17
15
  clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
16
+ hasClause = false
18
17
  tb.text.scan(clause_delimiter).each do |possible_clause|
19
- if is_clause? possible_clause
20
- break
21
- else
22
- tb.content = false
23
- end
18
+ hasClause |= is_clause? possible_clause
24
19
  end
20
+
21
+ tb.content = false unless hasClause
25
22
  end
26
23
 
27
24
  doc
28
25
  end
29
26
 
30
- def self.is_clause?(text, min_words=5)
31
- return false if text.nil?
27
+ def self.is_clause?(text, min_words = 5)
28
+ return false if text.nil?
29
+
32
30
  whitespace = /[ \n\r]+/
33
31
  text.scan(whitespace).size >= min_words
34
32
  end
35
-
36
33
  end
37
34
  end
@@ -1,16 +1,14 @@
1
-
2
1
  # Keeps only those content blocks which contain at least k words.
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class MinWordsFilter
6
-
7
5
  def self.process(min_words, doc)
8
6
  doc.text_blocks.each do |tb|
9
7
  next if tb.is_not_content?
8
+
10
9
  tb.content = false if tb.num_words < min_words
11
10
  end
12
11
  doc
13
12
  end
14
-
15
13
  end
16
14
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  # Classifies TextBlocks as content/not-content through rules that have been determined
4
2
  # using the C4.8 machine learning algorithm, as described in the paper
5
3
  # "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
7
5
 
8
6
  module Boilerpipe::Filters
9
7
  class NumWordsRulesClassifier
10
-
11
8
  def self.process(doc)
12
9
  empty = Boilerpipe::Document::TextBlock.empty_start
13
10
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
37
34
 
38
35
  false
39
36
  end
40
-
41
37
  end
42
38
  end
@@ -1,4 +1,4 @@
1
- # Merges two subsequent blocks if their text densities are equal.
1
+ # Merges two subsequent blocks if their text densities are equal.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
17
17
  end
18
18
  end
19
19
 
20
- doc.replace_text_blocks!( tbs - blocks_to_remove )
20
+ doc.replace_text_blocks!(tbs - blocks_to_remove)
21
21
  doc
22
22
  end
23
23
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Splits TextBlocks at paragraph boundaries.
3
2
  #
4
3
  # NOTE: This is not fully supported (i.e., it will break highlighting support via
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class SplitParagraphBlocksFilter
11
-
12
10
  def self.process(doc)
13
11
  tbs = doc.text_blocks
14
12
  new_blocks = []
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
35
33
  doc.replace_text_blocks!(new_blocks) if changes
36
34
  doc
37
35
  end
38
-
39
36
  end
40
37
  end
@@ -1,15 +1,13 @@
1
- # encoding: utf-8
2
-
3
1
  # Finds blocks which are potentially indicating the end of an article
4
2
  # text and marks them with INDICATES_END_OF_TEXT. This can be used
5
3
  # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
6
4
 
7
-
8
5
  module Boilerpipe::Filters
9
6
  class TerminatingBlocksFinder
10
7
  def self.process(doc)
11
8
  doc.text_blocks.each do |tb|
12
9
  next unless tb.num_words < 15
10
+
13
11
  if tb.text.length >= 8 && finds_match?(tb.text.downcase)
14
12
  tb.labels << :INDICATES_END_OF_TEXT
15
13
  elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
29
27
  text.include?('what you think...') ||
30
28
  text.include?('add your comment') ||
31
29
  text.include?('add comment') ||
32
- #TODO add this and test
33
- #text.include?('leave a reply') ||
34
- #text.include?('leave a comment') ||
35
- #text.include?('show comments') ||
36
- #text.include?('Share this:') ||
30
+ # TODO add this and test
31
+ # text.include?('leave a reply') ||
32
+ # text.include?('leave a comment') ||
33
+ # text.include?('show comments') ||
34
+ # text.include?('Share this:') ||
37
35
  text.include?('reader views') ||
38
36
  text.include?('have your say') ||
39
37
  text.include?('reader comments') ||
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks trailing headlines TextBlocks that have the label :#HEADING
3
2
  # as boilerplate. Trailing means they are marked content and are
4
3
  # below any other content block.
@@ -6,7 +5,6 @@
6
5
  module Boilerpipe::Filters
7
6
  class TrailingHeadlineToBoilerplateFilter
8
7
  def self.process(doc)
9
-
10
8
  doc.text_blocks.each do |tb|
11
9
  next unless tb.is_content?
12
10
 
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
19
17
 
20
18
  doc
21
19
  end
22
-
23
20
  end
24
21
  end
@@ -2,7 +2,7 @@ module Boilerpipe::Labels
2
2
  class LabelAction
3
3
  attr_reader :labels
4
4
 
5
- def initialize(labels=[])
5
+ def initialize(labels = [])
6
6
  @labels = labels
7
7
  end
8
8
 
@@ -1,20 +1,16 @@
1
- require 'nokogiri'
2
1
  module Boilerpipe::SAX
3
2
  class BoilerpipeHTMLParser
4
3
  def self.parse(text)
5
-
6
- #script bug - delete script tags
4
+ # script bug - delete script tags
7
5
  text.gsub!(/\<script>.+?<\/script>/i, '')
8
6
 
9
7
  # nokogiri uses libxml for mri and nekohtml for jruby
10
8
  # mri doesn't remove &nbsp; when missing the semicolon
11
9
  text.gsub!(/(&nbsp) /, '\1; ')
12
10
 
13
-
14
11
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
15
12
  text = Nokogiri::HTML(text).to_html
16
13
 
17
-
18
14
  handler = HTMLContentHandler.new
19
15
  noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
20
16
  noko_parser.parse(text)
@@ -1,11 +1,8 @@
1
- require 'nokogiri'
2
- require 'set'
3
-
4
1
  module Boilerpipe::SAX
5
2
  class HTMLContentHandler < Nokogiri::XML::SAX::Document
6
3
  attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
7
4
 
8
- attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
5
+ attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
9
6
  ANCHOR_TEXT_START = "$\ue00a<"
10
7
  ANCHOR_TEXT_END = ">\ue00a$"
11
8
 
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
34
31
  @label_stacks << nil
35
32
  tag = name.upcase.intern
36
33
 
37
-
38
34
  tag_action = @tag_actions[tag]
39
35
  if tag_action
40
36
  @tag_level += 1 if tag_action.changes_tag_level?
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
51
47
  def characters(text)
52
48
  flush_block if @flush
53
49
 
54
- return if @in_ignorable_element != 0
50
+ return if in_ignorable_element?
55
51
  return if text.empty?
56
52
 
57
53
  # replace all whitespace with simple space
58
54
  text.gsub!(/\s+/, ' ')
59
55
 
60
56
  # trim whitespace
61
- started_with_whitespace = text =~ /^\s/
62
- ended_with_whitespace = text =~ /\s$/
57
+ started_with_whitespace = text =~ /^\s/
58
+ ended_with_whitespace = text =~ /\s$/
63
59
  text.strip!
64
60
 
65
61
  # add a single space if the block was only whitespace
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
158
154
  end
159
155
 
160
156
  text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
161
- num_words,
162
- num_linked_words,
163
- num_words_in_wrapped_lines,
164
- num_wrapped_lines, @offset_blocks)
157
+ num_words,
158
+ num_linked_words,
159
+ num_words_in_wrapped_lines,
160
+ num_wrapped_lines, @offset_blocks)
165
161
 
166
162
  @offset_blocks += 1
167
163
  clear_buffers
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
187
183
  # \p{No} -- a numeric character of other type
188
184
 
189
185
  def is_word?(word)
190
- word =~ VALID_WORD_CHARACTER
186
+ word =~ VALID_WORD_CHARACTER
191
187
  end
192
188
 
193
- #public void flushBlock() {
189
+ # public void flushBlock() {
194
190
  # int numWords = 0;
195
191
  # int numLinkedWords = 0;
196
192
  # int numWrappedLines = 0;
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
198
194
  # final int maxLineLength = 80;
199
195
  # int numTokens = 0;
200
196
  # int numWordsCurrentLine = 0;
201
- #}
197
+ # }
202
198
 
203
199
  def increase_in_ignorable_element!
204
200
  @in_ignorable_element += 1
205
201
  end
206
202
 
203
+ # should we prevent less than zero here?
207
204
  def decrease_in_ignorable_element!
208
205
  @in_ignorable_element -= 1
209
206
  end
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
224
221
  @in_anchor_tag > 0
225
222
  end
226
223
 
227
-
228
224
  def add_text_block(text_block)
229
225
  @label_stacks.each do |stack|
230
226
  next unless stack
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
239
235
  # append space if last character wasn't already one
240
236
  def append_space
241
237
  return if @sb_last_was_whitespace
238
+
242
239
  @sb_last_was_whitespace = true
243
240
 
244
241
  @text_buffer << ' '
@@ -48,4 +48,3 @@ module Boilerpipe::SAX
48
48
  end
49
49
  end
50
50
  end
51
-
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
2
2
  class AnchorText
3
3
  # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
4
4
  # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
5
- #* encounters such nestings, a SAXException is thrown.
5
+ # * encounters such nestings, a SAXException is thrown.
6
6
  def start(handler, name, attrs)
7
7
  if handler.in_anchor_tag?
8
8
  handler.in_anchor_tag += 1
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
42
42
  # - dunno about nokogiri???????
43
43
  # as nested A elements are not allowed per specification, we
44
44
  # are probably reaching this branch due to a bug in the XML parser
45
- #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
45
+ # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
46
46
  end_tag(handler, name)
47
47
  end
48
48
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Explicitly marks this tag a simple "block-level" element,
3
- # which always generates whitespace
2
+ # Explicitly marks this tag a simple "block-level" element,
3
+ # which always generates whitespace
4
4
  class BlockLevel
5
5
  def start(handler, name, attrs)
6
6
  true
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # for block-level elements, which triggers some LabelAction on
3
- # the generated TextBlock.
2
+ # for block-level elements, which triggers some LabelAction on
3
+ # the generated TextBlock.
4
4
  class BlockTagLabel
5
5
  def initialize(label_action)
6
6
  @label_action = label_action
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Marks this tag the body element (this should usually only
3
- # be set for the <BODY> tag).
2
+ # Marks this tag the body element (this should usually only
3
+ # be set for the <BODY> tag).
4
4
  class Body
5
5
  def start(handler, name, attrs)
6
6
  handler.flush_block
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
10
10
  rel = m[1]
11
11
  val = m[2].to_i # absolute
12
12
  size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
13
- handler.font_size_stack << size
13
+ handler.font_size_stack << size
14
14
  else
15
15
  handler.font_size_stack << nil
16
16
  end
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
27
27
  end
28
28
 
29
29
  def relative(font_size_stack, rel, val)
30
- prev_size = font_size_stack.reverse_each.find{|s| s != nil}
30
+ prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
31
31
  prev_size = 3 if prev_size.nil?
32
32
 
33
33
  size = if rel == '+'
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.4.0'
2
+ VERSION = '0.4.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-15 00:00:00.000000000 Z
11
+ date: 2019-07-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.11'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.11'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,33 +39,33 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: rspec
42
+ name: rickshaw
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: 0.4.0
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: 0.4.0
55
55
  - !ruby/object:Gem::Dependency
56
- name: rickshaw
56
+ name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: 0.4.0
61
+ version: '3.0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: 0.4.0
68
+ version: '3.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -80,7 +80,7 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: 1.6.6.2
83
- description: A pure ruby implementation of the boilerpipe algorithm
83
+ description: A pure ruby implementation of the boilerpipe web content extraction algorithm
84
84
  email:
85
85
  - "<gregory.ostermayr@gmail.com>"
86
86
  executables: []
@@ -88,9 +88,11 @@ extensions: []
88
88
  extra_rdoc_files: []
89
89
  files:
90
90
  - ".circleci/config.yml"
91
+ - ".dockerignore"
91
92
  - ".gitignore"
92
93
  - ".rspec"
93
94
  - CHANGELOG.md
95
+ - Dockerfile
94
96
  - Gemfile
95
97
  - LICENSE.txt
96
98
  - README.md
@@ -98,6 +100,7 @@ files:
98
100
  - bin/console
99
101
  - bin/setup
100
102
  - boilerpipe-ruby.gemspec
103
+ - boilerpipe_flow.md
101
104
  - lib/boilerpipe.rb
102
105
  - lib/boilerpipe/document/text_block.rb
103
106
  - lib/boilerpipe/document/text_document.rb
@@ -166,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
169
  - !ruby/object:Gem::Version
167
170
  version: '0'
168
171
  requirements: []
169
- rubyforge_project:
170
- rubygems_version: 2.6.12
172
+ rubygems_version: 3.0.1
171
173
  signing_key:
172
174
  specification_version: 4
173
- summary: A pure ruby implemenation of the boilerpipe algorithm
175
+ summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
174
176
  test_files: []