boilerpipe-ruby 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48) hide show
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +13 -0
  5. data/Dockerfile +14 -0
  6. data/README.md +12 -3
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +6 -6
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +3 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -3
  14. data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
  15. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  16. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
  17. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
  18. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
  19. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  20. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  21. data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
  22. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  23. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  24. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
  25. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  26. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  27. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  28. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  29. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  30. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
  31. data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
  32. data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
  33. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  34. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  35. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
  36. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  37. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  38. data/lib/boilerpipe/labels/label_action.rb +1 -1
  39. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +1 -5
  40. data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
  41. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  42. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  43. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  44. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  47. data/lib/boilerpipe/version.rb +1 -1
  48. metadata +16 -14
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
4
- data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
2
+ SHA256:
3
+ metadata.gz: 65756911038bd486a08337188c3275ebd0c0c65e4b902edbd6b6667dda422740
4
+ data.tar.gz: ff83632d9cea8e4a0ede8609115e8282a856d2d7728c801e64ecec39a9399857
5
5
  SHA512:
6
- metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
7
- data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
6
+ metadata.gz: e5e902c81cea26252c41bc4b96d0faebe6682a0dc5ae2c09397762ef4c5a7f244c0c100f87923863b308d2ef9b5ecc732e674d2ca801e4087f99031d46776034
7
+ data.tar.gz: 6788183a0a4c9d01c764d17537c52edaf9d32b93fb42da8a013f9f0b14f6a4f757a8d3b5f77b73fef59f62fcf988a723abf7fe8305f5626b0337838c4eb31c7d
@@ -6,48 +6,30 @@ version: 2
6
6
  jobs:
7
7
  build:
8
8
  docker:
9
- # specify the version you desire here
10
- - image: circleci/ruby:2.4.1-node-browsers
11
-
9
+ - image: circleci/ruby:2.5.5-node-browsers
10
+
12
11
  # Specify service dependencies here if necessary
13
12
  # CircleCI maintains a library of pre-built images
14
13
  # documented at https://circleci.com/docs/2.0/circleci-images/
15
- # - image: circleci/postgres:9.4
16
14
 
17
15
  working_directory: ~/repo
18
16
 
19
17
  steps:
20
18
  - checkout
21
19
 
22
- # Download and cache dependencies
23
- - restore_cache:
24
- keys:
25
- - v1-dependencies-{{ checksum "Gemfile.lock" }}
26
- # fallback to using the latest cache if no exact match is found
27
- - v1-dependencies-
28
-
20
+ - run: gem install bundler
29
21
  - run:
30
22
  name: install dependencies
31
23
  command: |
32
- bundle install --jobs=4 --retry=3 --path vendor/bundle
24
+ bundle install --jobs=4 --retry=3
33
25
 
34
- - save_cache:
35
- paths:
36
- - ./vendor/bundle
37
- key: v1-dependencies-{{ checksum "Gemfile.lock" }}
38
-
39
- # Database setup
40
- #- run: bundle exec rake db:create
41
- #- run: bundle exec rake db:schema:load
42
-
43
- # run tests!
44
26
  - run:
45
27
  name: run tests
46
28
  command: |
47
29
  mkdir /tmp/test-results
48
30
  TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
49
-
50
- rspec --format progress "spec"
31
+
32
+ bundle exec rspec --format progress "spec"
51
33
 
52
34
  # collect reports
53
35
  - store_test_results:
@@ -0,0 +1,7 @@
1
+ .git
2
+ .gitignore
3
+ log/*
4
+ tmp/*
5
+ *.swp
6
+ *.swo
7
+ Gemfile.lock
@@ -1,3 +1,16 @@
1
+ # 0.4.1 / 2019-07-04
2
+
3
+ * Fix bug in min_clause_words_filter ( used in article_sentence_extractor )
4
+ * Allow tests to run in Docker
5
+ * Update circle to continue to work
6
+ * Add architecture flow
7
+ * Code formatting
8
+ * Add min words filter specs
9
+ * Add label action specs
10
+ * Add missing test case to ignorable element spec
11
+ * Add merge_next case to text block spec
12
+ * Dry up includes
13
+
1
14
  # 0.4.0 / 2017-09-15
2
15
 
3
16
  * Add KeepEverythingWithMinKWords Extractor
@@ -0,0 +1,14 @@
1
+ From ruby:2.5
2
+ RUN gem install bundler
3
+ COPY *gemspec /usr/src/app/
4
+ COPY Gemfile /usr/src/app/
5
+ COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
6
+ COPY bin /usr/src/app/
7
+ COPY bin/* /usr/src/app/bin/
8
+
9
+ WORKDIR /usr/src/app
10
+ RUN bin/setup
11
+
12
+ COPY . /usr/src/app/
13
+
14
+ CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Boilerpipe
2
2
 
3
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
4
+ [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
5
+
3
6
  A pure ruby implementation of the boilerpipe algorithm.
4
7
 
5
8
  This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
@@ -10,6 +13,8 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
10
13
 
11
14
  This solution works great if you're using Jruby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
12
15
 
16
+ Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
17
+
13
18
  # TLDR
14
19
 
15
20
  Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
@@ -24,9 +29,6 @@ Presently the follow Extractors are implemented
24
29
  * [x] LargestContentExtractor
25
30
  * [x] NumWordsRulesExtractor
26
31
 
27
- [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
28
-
29
- [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
30
32
 
31
33
  ## Installation
32
34
 
@@ -71,6 +73,13 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
71
73
 
72
74
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
73
75
 
76
+ ### Running Tests on Docker
77
+
78
+ The default run command will run the tests
79
+
80
+ docker build -t boilerpipe .
81
+ docker run -it --rm boilerpipe
82
+
74
83
  ## Contributing
75
84
 
76
85
  Bug reports and pull requests are welcome on GitHub at https://github.com/gregors/boilerpipe-ruby.
data/Rakefile CHANGED
@@ -1,14 +1,13 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task :default => :spec
7
7
 
8
-
9
8
  desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
10
9
  task :download_boilerpipe_jar do
11
10
  FileUtils.mkdir_p 'spec/sanity_checks/jars/'
12
11
  Dir.chdir 'spec/sanity_checks/jars/'
13
- `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
12
+ `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
14
13
  end
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "boilerpipe"
3
+ require 'bundler/setup'
4
+ require 'boilerpipe'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "boilerpipe"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
13
+ require 'irb'
14
14
  IRB.start
@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['<gregory.ostermayr@gmail.com>']
11
11
  spec.license = 'Apache 2.0'
12
12
 
13
- spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
14
- spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
13
+ spec.summary = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
14
+ spec.description = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
15
15
  spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
16
16
 
17
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
21
21
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
22
+ spec.add_development_dependency 'bundler', '~> 2.0'
23
23
  spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
24
  spec.add_development_dependency 'rickshaw', '~> 0.4.0'
25
+ spec.add_development_dependency 'rspec', '~> 3.0'
26
26
  spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
27
27
  end
@@ -0,0 +1,40 @@
1
+ ```
2
+ raw html
3
+ |
4
+ |
5
+ sax input -> sax parser(html parser) -> HTML Content handler -> tokenizer ---------
6
+ |
7
+ -------------------------------------<------------------------------------<------|
8
+ | | |
9
+ text blocks text blocks text blocks
10
+ | | |
11
+ | | |
12
+ -----------------------------
13
+ |
14
+ |
15
+ text document
16
+ |
17
+ |
18
+ filter
19
+ |
20
+ filter
21
+ |
22
+ filter
23
+ |
24
+ filter
25
+ |
26
+ filter
27
+ |
28
+ filter
29
+ |
30
+ filter
31
+ |
32
+ filter
33
+ |
34
+ filter
35
+ |
36
+ |
37
+ text document
38
+ |
39
+ outputs extracted text
40
+ ```
@@ -1,3 +1,6 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
1
4
  require 'boilerpipe/version'
2
5
 
3
6
  require 'boilerpipe/util/unicode_tokenizer'
@@ -1,10 +1,7 @@
1
- require 'set'
2
-
3
1
  module Boilerpipe
4
2
  module Document
5
3
  class TextBlock
6
-
7
- #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
4
+ # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
8
5
 
9
6
  attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
10
7
  :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
@@ -12,7 +9,7 @@ module Boilerpipe
12
9
 
13
10
  attr_accessor :content
14
11
 
15
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
12
+ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
16
13
  @labels = Set.new
17
14
  @text = text
18
15
  @num_words = num_words
@@ -32,9 +29,9 @@ module Boilerpipe
32
29
  new('', 0, 0, 0, 0, -1)
33
30
  end
34
31
 
35
- def set_tag_level(level)
36
- @tag_level = level
37
- end
32
+ def set_tag_level(level)
33
+ @tag_level = level
34
+ end
38
35
 
39
36
  def is_content?
40
37
  @content
@@ -68,8 +65,8 @@ module Boilerpipe
68
65
  @num_words_in_anchor_text += other.num_words_in_anchor_text
69
66
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
70
67
  @num_wrapped_lines += other.num_wrapped_lines
71
- @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
72
- @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
68
+ @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
69
+ @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
73
70
  init_densities
74
71
  @content |= other.is_content?
75
72
 
@@ -87,10 +84,10 @@ module Boilerpipe
87
84
  end
88
85
 
89
86
  def to_s
90
- #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
87
+ # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
91
88
  labels = 'null'
92
89
  if !@labels.empty?
93
- labels ="[#{ @labels.to_a.join(',')}]"
90
+ labels = "[#{@labels.to_a.join(',')}]"
94
91
  end
95
92
  "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
96
93
  end
@@ -100,6 +97,7 @@ module Boilerpipe
100
97
  end
101
98
 
102
99
  private
100
+
103
101
  def init_densities
104
102
  if @num_words_in_wrapped_lines == 0
105
103
  @num_words_in_wrapped_lines = @num_words
@@ -19,12 +19,14 @@ module Boilerpipe
19
19
  case text_block.is_content?
20
20
  when true
21
21
  next unless include_content
22
+
22
23
  s << text_block.text
23
24
  s << "\n"
24
25
  when false
25
26
  next unless include_noncontent
26
- s << text_block.text
27
- s << "\n"
27
+
28
+ s << text_block.text
29
+ s << "\n"
28
30
  end
29
31
  end
30
32
  s
@@ -38,7 +40,6 @@ module Boilerpipe
38
40
  @text_blocks.map(&:to_s).join("\n")
39
41
  end
40
42
  alias_method :debug_string, :debug_s
41
-
42
43
  end
43
44
  end
44
45
  end
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class CanolaExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::CanolaExtractor.process doc
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class DefaultExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::DefaultExtractor.process doc
@@ -1,4 +1,4 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Extractors
4
4
  class KeepEverythingExtractor
@@ -1,4 +1,3 @@
1
-
2
1
  # A full-text extractor which extracts the largest text component of a page.
3
2
  # For news articles, it may perform better than the DefaultExtractor, but
4
3
  # usually worse than ArticleExtractor.
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class NumWordsRulesExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
@@ -1,11 +1,8 @@
1
-
2
- # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
3
- # probably makes sense only in cases where an upstream filter already has removed some blocks.
1
+ # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
2
+ # probably makes sense only in cases where an upstream filter already has removed some blocks.
4
3
 
5
4
  module Boilerpipe::Filters
6
5
  class BlockProximityFusion
7
-
8
-
9
6
  def initialize(max_blocks_distance, content_only, same_tag_level_only)
10
7
  @max_blocks_distance = max_blocks_distance
11
8
  @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
13
10
  end
14
11
 
15
12
  MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
16
- MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
17
- MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
13
+ MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
14
+ MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
18
15
  MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
19
16
 
20
17
  def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
22
19
  return false if text_blocks.size < 2
23
20
 
24
21
  prev_block = if @content_only
25
- text_blocks.find{ |tb| tb.is_content? }
22
+ text_blocks.find { |tb| tb.is_content? }
26
23
  else
27
24
  text_blocks.first
28
25
  end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
46
43
  ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
47
44
  ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
48
45
 
49
- if ok
46
+ if ok
50
47
  prev_block.merge_next(tb)
51
48
  blocks_to_remove << tb
52
49
  else
53
50
  prev_block = tb
54
51
  end
55
52
  end
56
-
57
53
  end
58
- doc.replace_text_blocks!( text_blocks - blocks_to_remove )
54
+ doc.replace_text_blocks!(text_blocks - blocks_to_remove)
59
55
  doc
60
56
  end
61
-
62
57
  end
63
58
  end
@@ -1,9 +1,7 @@
1
-
2
- # Removes TextBlocks which have explicitly been marked as "not content".
1
+ # Removes TextBlocks which have explicitly been marked as "not content".
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class BoilerplateBlockFilter
6
-
7
5
  def initialize(label)
8
6
  @label_to_keep = label
9
7
  end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
21
19
  doc.replace_text_blocks!(combined)
22
20
  doc
23
21
  end
24
-
25
22
  end
26
23
  end
@@ -1,10 +1,9 @@
1
- # A full-text extractor trained on http://krdwrd.org/
2
- # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
- # Works well with SimpleEstimator, too.
1
+ # A full-text extractor trained on http://krdwrd.org/
2
+ # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
+ # Works well with SimpleEstimator, too.
4
4
 
5
5
  module Boilerpipe::Filters
6
6
  class CanolaClassifier
7
-
8
7
  def self.process(doc)
9
8
  return doc if doc.text_blocks.size < 1
10
9
 
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
22
21
  def self.classify(prev, current, nxt)
23
22
  current.link_density > 0 && nxt.num_words > 11 \
24
23
  || current.num_words > 19 \
25
- || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
24
+ || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
26
25
  end
27
26
  end
28
27
  end
@@ -5,9 +5,8 @@
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class DensityRulesClassifier
8
-
9
8
  def self.process(doc)
10
- #return doc if doc.text_blocks.size < 2
9
+ # return doc if doc.text_blocks.size < 2
11
10
 
12
11
  empty = Boilerpipe::Document::TextBlock.empty_start
13
12
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
26
25
  if prev.link_density <= 0.555556
27
26
  if current.text_density <= 9
28
27
  return true if nxt.text_density > 10
28
+
29
29
  return prev.text_density <= 4 ? false : true
30
30
  else
31
31
  return nxt.text_density == 0 ? false : true
32
32
  end
33
33
  else
34
34
  return false if nxt.text_density <= 11
35
+
35
36
  true
36
37
  end
37
38
  end
@@ -1,12 +1,9 @@
1
- # encoding: utf-8
2
- require 'set'
1
+ # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
2
+ # some heuristics which are quite specific to the news domain.
3
3
 
4
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
5
- # some heuristics which are quite specific to the news domain.
6
-
7
- # we create a list of potential titles from the page title
8
- # then we look at every text block and if the text block
9
- # contains a potential title - we set that text block label as :TITLE
4
+ # we create a list of potential titles from the page title
5
+ # then we look at every text block and if the text block
6
+ # contains a potential title - we set that text block label as :TITLE
10
7
 
11
8
  module Boilerpipe::Filters
12
9
  class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
55
52
  @potential_titles << title
56
53
 
57
54
  # unnecessary
58
- #p = longest_part(title, /[ ]*[|»-][ ]*/)
59
- #@potential_titles << p if p
55
+ # p = longest_part(title, /[ ]*[|»-][ ]*/)
56
+ # @potential_titles << p if p
60
57
 
61
- #p = longest_part(title, /[ ]*[|»:][ ]*/)
62
- #@potential_titles << p if p
58
+ # p = longest_part(title, /[ ]*[|»:][ ]*/)
59
+ # @potential_titles << p if p
63
60
 
64
- #p = longest_part(title, /[ ]*[|»:()][ ]*/)
65
- #@potential_titles << p if p
61
+ # p = longest_part(title, /[ ]*[|»:()][ ]*/)
62
+ # @potential_titles << p if p
66
63
 
67
- #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
68
- #@potential_titles << p if p
64
+ # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
65
+ # @potential_titles << p if p
69
66
 
70
67
  p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
71
68
  @potential_titles << p if p
72
69
 
73
70
  # we replace \u00a0 so why check for it?
74
- #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
75
- #@potential_titles << p if p
71
+ # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
72
+ # @potential_titles << p if p
76
73
 
77
74
  add_potential_titles(title, /[ ]+[|][ ]+/, 4)
78
75
  add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
90
87
 
91
88
  parts.each do |part|
92
89
  next if part =~ /[.]com/
90
+
93
91
  num_words = number_of_words(part)
94
92
 
95
93
  if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
107
105
 
108
106
  parts.each do |part|
109
107
  next if part =~ /[.]com/
108
+
110
109
  num_words = number_of_words(part)
111
110
 
112
111
  @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
116
115
  def number_of_words(s)
117
116
  s.split(/[\b ]+/).size
118
117
  end
119
-
120
118
  end
121
119
  end
@@ -1,10 +1,8 @@
1
-
2
1
  # Marks all TextBlocks "content" which are between the headline and the part that has
3
2
  # already been marked content, if they are marked MIGHT_BE_CONTENT.
4
3
  # This filter is quite specific to the news domain.
5
4
  # used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
6
5
 
7
-
8
6
  module Boilerpipe::Filters
9
7
  class ExpandTitleToContentFilter
10
8
  def self.process(doc)
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
38
36
  def self.no_title_with_subsequent_content?(content_start, title)
39
37
  title.nil? || content_start.nil? || content_start <= title
40
38
  end
41
-
42
39
  end
43
40
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::Filters
2
2
  class HeuristicFilterBase
3
- def self.num_full_text_words(tb, min_text_density=9.0)
3
+ def self.num_full_text_words(tb, min_text_density = 9.0)
4
4
  tb.text_density >= min_text_density ? tb.num_words : 0
5
5
  end
6
6
  end
@@ -1,12 +1,11 @@
1
- # Marks all blocks as "non-content" that occur after blocks that have been
2
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
- # number of words in content blocks occur before this mark (default: 60).
4
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
1
+ # Marks all blocks as "non-content" that occur after blocks that have been
2
+ # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
+ # number of words in content blocks occur before this mark (default: 60).
4
+ # This can be used in conjunction with an upstream TerminatingBlocksFinder.
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
8
-
9
- def self.process(doc, min_num_words=60)
8
+ def self.process(doc, min_num_words = 60)
10
9
  found_end_of_text = false
11
10
  num_words = 0
12
11
 
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
19
18
 
20
19
  doc
21
20
  end
22
-
23
21
  end
24
22
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Keeps the largest TextBlock only (by the number of words). In case of
3
2
  # more than one block with the same number of words, the first block is chosen.
4
3
  # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class KeepLargestBlockFilter
11
-
12
10
  def initialize(expand_to_same_level_text, min_words)
13
11
  @expand_to_same_level_text = expand_to_same_level_text
14
12
  @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
43
41
  expand_tag_level(tbs[0...n].reverse, level, @min_words)
44
42
 
45
43
  # expand blocks to the right
46
- expand_tag_level(tbs[n+1..-1], level, @min_words)
44
+ expand_tag_level(tbs[n + 1..-1], level, @min_words)
47
45
  end
48
46
  end
49
47
 
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
57
55
  end
58
56
  end
59
57
  end
60
-
61
58
  end
62
59
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks all blocks as content that:
3
2
  # are on the same tag-level as very likely main content
4
3
  # (usually the level of the largest block)
@@ -7,23 +6,22 @@
7
6
 
8
7
  module Boilerpipe::Filters
9
8
  class LargeBlockSameTagLevelToContentFilter
10
-
11
9
  def self.process(doc)
12
-
13
10
  largest = doc.text_blocks.find do |tb|
14
11
  tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
15
12
  end
16
13
 
17
14
  return doc if largest.nil?
15
+
18
16
  tag_level = largest.tag_level
19
17
 
20
18
  doc.text_blocks.each do |tb|
21
19
  next if tb.is_content?
20
+
22
21
  tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
23
22
  end
24
23
 
25
24
  doc
26
25
  end
27
-
28
26
  end
29
27
  end
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
11
11
  doc.text_blocks.each do |tb|
12
12
  if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
13
13
  tag_level = tb.tag_level
14
- elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
14
+ elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
15
15
  tb.content = true
16
16
  else
17
17
  tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
20
20
 
21
21
  doc
22
22
  end
23
-
24
23
  end
25
24
  end
@@ -1,14 +1,12 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class MarkEverythingContentFilter
5
-
6
5
  def self.process(doc)
7
6
  doc.text_blocks.each do |tb|
8
7
  tb.content = true if tb.is_not_content?
9
8
  end
10
9
  doc
11
10
  end
12
-
13
11
  end
14
12
  end
@@ -8,30 +8,27 @@
8
8
 
9
9
  module Boilerpipe::Filters
10
10
  class MinClauseWordsFilter
11
-
12
- def self.process(doc, min_words=5)
13
-
11
+ def self.process(doc, min_words = 5)
14
12
  doc.text_blocks.each do |tb|
15
13
  next if tb.is_not_content?
16
14
 
17
15
  clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
16
+ hasClause = false
18
17
  tb.text.scan(clause_delimiter).each do |possible_clause|
19
- if is_clause? possible_clause
20
- break
21
- else
22
- tb.content = false
23
- end
18
+ hasClause |= is_clause? possible_clause
24
19
  end
20
+
21
+ tb.content = false unless hasClause
25
22
  end
26
23
 
27
24
  doc
28
25
  end
29
26
 
30
- def self.is_clause?(text, min_words=5)
31
- return false if text.nil?
27
+ def self.is_clause?(text, min_words = 5)
28
+ return false if text.nil?
29
+
32
30
  whitespace = /[ \n\r]+/
33
31
  text.scan(whitespace).size >= min_words
34
32
  end
35
-
36
33
  end
37
34
  end
@@ -1,16 +1,14 @@
1
-
2
1
  # Keeps only those content blocks which contain at least k words.
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class MinWordsFilter
6
-
7
5
  def self.process(min_words, doc)
8
6
  doc.text_blocks.each do |tb|
9
7
  next if tb.is_not_content?
8
+
10
9
  tb.content = false if tb.num_words < min_words
11
10
  end
12
11
  doc
13
12
  end
14
-
15
13
  end
16
14
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  # Classifies TextBlocks as content/not-content through rules that have been determined
4
2
  # using the C4.8 machine learning algorithm, as described in the paper
5
3
  # "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
7
5
 
8
6
  module Boilerpipe::Filters
9
7
  class NumWordsRulesClassifier
10
-
11
8
  def self.process(doc)
12
9
  empty = Boilerpipe::Document::TextBlock.empty_start
13
10
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
37
34
 
38
35
  false
39
36
  end
40
-
41
37
  end
42
38
  end
@@ -1,4 +1,4 @@
1
- # Merges two subsequent blocks if their text densities are equal.
1
+ # Merges two subsequent blocks if their text densities are equal.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
17
17
  end
18
18
  end
19
19
 
20
- doc.replace_text_blocks!( tbs - blocks_to_remove )
20
+ doc.replace_text_blocks!(tbs - blocks_to_remove)
21
21
  doc
22
22
  end
23
23
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Splits TextBlocks at paragraph boundaries.
3
2
  #
4
3
  # NOTE: This is not fully supported (i.e., it will break highlighting support via
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class SplitParagraphBlocksFilter
11
-
12
10
  def self.process(doc)
13
11
  tbs = doc.text_blocks
14
12
  new_blocks = []
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
35
33
  doc.replace_text_blocks!(new_blocks) if changes
36
34
  doc
37
35
  end
38
-
39
36
  end
40
37
  end
@@ -1,15 +1,13 @@
1
- # encoding: utf-8
2
-
3
1
  # Finds blocks which are potentially indicating the end of an article
4
2
  # text and marks them with INDICATES_END_OF_TEXT. This can be used
5
3
  # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
6
4
 
7
-
8
5
  module Boilerpipe::Filters
9
6
  class TerminatingBlocksFinder
10
7
  def self.process(doc)
11
8
  doc.text_blocks.each do |tb|
12
9
  next unless tb.num_words < 15
10
+
13
11
  if tb.text.length >= 8 && finds_match?(tb.text.downcase)
14
12
  tb.labels << :INDICATES_END_OF_TEXT
15
13
  elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
29
27
  text.include?('what you think...') ||
30
28
  text.include?('add your comment') ||
31
29
  text.include?('add comment') ||
32
- #TODO add this and test
33
- #text.include?('leave a reply') ||
34
- #text.include?('leave a comment') ||
35
- #text.include?('show comments') ||
36
- #text.include?('Share this:') ||
30
+ # TODO add this and test
31
+ # text.include?('leave a reply') ||
32
+ # text.include?('leave a comment') ||
33
+ # text.include?('show comments') ||
34
+ # text.include?('Share this:') ||
37
35
  text.include?('reader views') ||
38
36
  text.include?('have your say') ||
39
37
  text.include?('reader comments') ||
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks trailing headlines TextBlocks that have the label :#HEADING
3
2
  # as boilerplate. Trailing means they are marked content and are
4
3
  # below any other content block.
@@ -6,7 +5,6 @@
6
5
  module Boilerpipe::Filters
7
6
  class TrailingHeadlineToBoilerplateFilter
8
7
  def self.process(doc)
9
-
10
8
  doc.text_blocks.each do |tb|
11
9
  next unless tb.is_content?
12
10
 
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
19
17
 
20
18
  doc
21
19
  end
22
-
23
20
  end
24
21
  end
@@ -2,7 +2,7 @@ module Boilerpipe::Labels
2
2
  class LabelAction
3
3
  attr_reader :labels
4
4
 
5
- def initialize(labels=[])
5
+ def initialize(labels = [])
6
6
  @labels = labels
7
7
  end
8
8
 
@@ -1,20 +1,16 @@
1
- require 'nokogiri'
2
1
  module Boilerpipe::SAX
3
2
  class BoilerpipeHTMLParser
4
3
  def self.parse(text)
5
-
6
- #script bug - delete script tags
4
+ # script bug - delete script tags
7
5
  text.gsub!(/\<script>.+?<\/script>/i, '')
8
6
 
9
7
  # nokogiri uses libxml for mri and nekohtml for jruby
10
8
  # mri doesn't remove &nbsp; when missing the semicolon
11
9
  text.gsub!(/(&nbsp) /, '\1; ')
12
10
 
13
-
14
11
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
15
12
  text = Nokogiri::HTML(text).to_html
16
13
 
17
-
18
14
  handler = HTMLContentHandler.new
19
15
  noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
20
16
  noko_parser.parse(text)
@@ -1,11 +1,8 @@
1
- require 'nokogiri'
2
- require 'set'
3
-
4
1
  module Boilerpipe::SAX
5
2
  class HTMLContentHandler < Nokogiri::XML::SAX::Document
6
3
  attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
7
4
 
8
- attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
5
+ attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
9
6
  ANCHOR_TEXT_START = "$\ue00a<"
10
7
  ANCHOR_TEXT_END = ">\ue00a$"
11
8
 
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
34
31
  @label_stacks << nil
35
32
  tag = name.upcase.intern
36
33
 
37
-
38
34
  tag_action = @tag_actions[tag]
39
35
  if tag_action
40
36
  @tag_level += 1 if tag_action.changes_tag_level?
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
51
47
  def characters(text)
52
48
  flush_block if @flush
53
49
 
54
- return if @in_ignorable_element != 0
50
+ return if in_ignorable_element?
55
51
  return if text.empty?
56
52
 
57
53
  # replace all whitespace with simple space
58
54
  text.gsub!(/\s+/, ' ')
59
55
 
60
56
  # trim whitespace
61
- started_with_whitespace = text =~ /^\s/
62
- ended_with_whitespace = text =~ /\s$/
57
+ started_with_whitespace = text =~ /^\s/
58
+ ended_with_whitespace = text =~ /\s$/
63
59
  text.strip!
64
60
 
65
61
  # add a single space if the block was only whitespace
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
158
154
  end
159
155
 
160
156
  text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
161
- num_words,
162
- num_linked_words,
163
- num_words_in_wrapped_lines,
164
- num_wrapped_lines, @offset_blocks)
157
+ num_words,
158
+ num_linked_words,
159
+ num_words_in_wrapped_lines,
160
+ num_wrapped_lines, @offset_blocks)
165
161
 
166
162
  @offset_blocks += 1
167
163
  clear_buffers
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
187
183
  # \p{No} -- a numeric character of other type
188
184
 
189
185
  def is_word?(word)
190
- word =~ VALID_WORD_CHARACTER
186
+ word =~ VALID_WORD_CHARACTER
191
187
  end
192
188
 
193
- #public void flushBlock() {
189
+ # public void flushBlock() {
194
190
  # int numWords = 0;
195
191
  # int numLinkedWords = 0;
196
192
  # int numWrappedLines = 0;
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
198
194
  # final int maxLineLength = 80;
199
195
  # int numTokens = 0;
200
196
  # int numWordsCurrentLine = 0;
201
- #}
197
+ # }
202
198
 
203
199
  def increase_in_ignorable_element!
204
200
  @in_ignorable_element += 1
205
201
  end
206
202
 
203
+ # should we prevent less than zero here?
207
204
  def decrease_in_ignorable_element!
208
205
  @in_ignorable_element -= 1
209
206
  end
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
224
221
  @in_anchor_tag > 0
225
222
  end
226
223
 
227
-
228
224
  def add_text_block(text_block)
229
225
  @label_stacks.each do |stack|
230
226
  next unless stack
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
239
235
  # append space if last character wasn't already one
240
236
  def append_space
241
237
  return if @sb_last_was_whitespace
238
+
242
239
  @sb_last_was_whitespace = true
243
240
 
244
241
  @text_buffer << ' '
@@ -48,4 +48,3 @@ module Boilerpipe::SAX
48
48
  end
49
49
  end
50
50
  end
51
-
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
2
2
  class AnchorText
3
3
  # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
4
4
  # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
5
- #* encounters such nestings, a SAXException is thrown.
5
+ # * encounters such nestings, a SAXException is thrown.
6
6
  def start(handler, name, attrs)
7
7
  if handler.in_anchor_tag?
8
8
  handler.in_anchor_tag += 1
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
42
42
  # - dunno about nokogiri???????
43
43
  # as nested A elements are not allowed per specification, we
44
44
  # are probably reaching this branch due to a bug in the XML parser
45
- #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
45
+ # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
46
46
  end_tag(handler, name)
47
47
  end
48
48
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Explicitly marks this tag a simple "block-level" element,
3
- # which always generates whitespace
2
+ # Explicitly marks this tag a simple "block-level" element,
3
+ # which always generates whitespace
4
4
  class BlockLevel
5
5
  def start(handler, name, attrs)
6
6
  true
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # for block-level elements, which triggers some LabelAction on
3
- # the generated TextBlock.
2
+ # for block-level elements, which triggers some LabelAction on
3
+ # the generated TextBlock.
4
4
  class BlockTagLabel
5
5
  def initialize(label_action)
6
6
  @label_action = label_action
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Marks this tag the body element (this should usually only
3
- # be set for the <BODY> tag).
2
+ # Marks this tag the body element (this should usually only
3
+ # be set for the <BODY> tag).
4
4
  class Body
5
5
  def start(handler, name, attrs)
6
6
  handler.flush_block
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
10
10
  rel = m[1]
11
11
  val = m[2].to_i # absolute
12
12
  size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
13
- handler.font_size_stack << size
13
+ handler.font_size_stack << size
14
14
  else
15
15
  handler.font_size_stack << nil
16
16
  end
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
27
27
  end
28
28
 
29
29
  def relative(font_size_stack, rel, val)
30
- prev_size = font_size_stack.reverse_each.find{|s| s != nil}
30
+ prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
31
31
  prev_size = 3 if prev_size.nil?
32
32
 
33
33
  size = if rel == '+'
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.4.0'
2
+ VERSION = '0.4.1'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-15 00:00:00.000000000 Z
11
+ date: 2019-07-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.11'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.11'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -39,33 +39,33 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: rspec
42
+ name: rickshaw
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: 0.4.0
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: 0.4.0
55
55
  - !ruby/object:Gem::Dependency
56
- name: rickshaw
56
+ name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: 0.4.0
61
+ version: '3.0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: 0.4.0
68
+ version: '3.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -80,7 +80,7 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: 1.6.6.2
83
- description: A pure ruby implementation of the boilerpipe algorithm
83
+ description: A pure ruby implementation of the boilerpipe web content extraction algorithm
84
84
  email:
85
85
  - "<gregory.ostermayr@gmail.com>"
86
86
  executables: []
@@ -88,9 +88,11 @@ extensions: []
88
88
  extra_rdoc_files: []
89
89
  files:
90
90
  - ".circleci/config.yml"
91
+ - ".dockerignore"
91
92
  - ".gitignore"
92
93
  - ".rspec"
93
94
  - CHANGELOG.md
95
+ - Dockerfile
94
96
  - Gemfile
95
97
  - LICENSE.txt
96
98
  - README.md
@@ -98,6 +100,7 @@ files:
98
100
  - bin/console
99
101
  - bin/setup
100
102
  - boilerpipe-ruby.gemspec
103
+ - boilerpipe_flow.md
101
104
  - lib/boilerpipe.rb
102
105
  - lib/boilerpipe/document/text_block.rb
103
106
  - lib/boilerpipe/document/text_document.rb
@@ -166,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
169
  - !ruby/object:Gem::Version
167
170
  version: '0'
168
171
  requirements: []
169
- rubyforge_project:
170
- rubygems_version: 2.6.12
172
+ rubygems_version: 3.0.1
171
173
  signing_key:
172
174
  specification_version: 4
173
- summary: A pure ruby implemenation of the boilerpipe algorithm
175
+ summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
174
176
  test_files: []