boilerpipe-ruby 0.2.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +34 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +32 -7
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +14 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
  15. data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
  16. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  17. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
  18. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
  19. data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
  20. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
  21. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  22. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  23. data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
  24. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  25. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  26. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
  27. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  28. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  29. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  30. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  31. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  32. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
  33. data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
  34. data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
  35. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  36. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  37. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
  38. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  39. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  40. data/lib/boilerpipe/labels/label_action.rb +1 -1
  41. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
  42. data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
  43. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  44. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  47. data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
  48. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  49. data/lib/boilerpipe/version.rb +1 -1
  50. metadata +38 -25
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
4
- data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
2
+ SHA256:
3
+ metadata.gz: da8bc0b8d74eea14b73e61812bbeba5fef75e8bae2330739e49b28e26f73d14d
4
+ data.tar.gz: 68fee529b501210cf3278eb2b045b09e6d27c7846355b7d430c05e60f39088e2
5
5
  SHA512:
6
- metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
7
- data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
6
+ metadata.gz: 4202afab2a01ae588977fde351dfacc29551634077e794d028282666c43d3aeb09adf425d60608dc694c36f2fd8ed034ef89ba41ee07e6ad23f426c19d740931
7
+ data.tar.gz: a0dc75a0c5384e1eaf8b50dfb92bc9294b41f38e8b0e0cceb9e9a6aafdf436629df2b3420f61a33523a08d6788c02ade799cdf8fe29db4338c766c7b01523704
@@ -6,48 +6,30 @@ version: 2
6
6
  jobs:
7
7
  build:
8
8
  docker:
9
- # specify the version you desire here
10
- - image: circleci/ruby:2.4.1-node-browsers
11
-
9
+ - image: circleci/ruby:2.5.5-node-browsers
10
+
12
11
  # Specify service dependencies here if necessary
13
12
  # CircleCI maintains a library of pre-built images
14
13
  # documented at https://circleci.com/docs/2.0/circleci-images/
15
- # - image: circleci/postgres:9.4
16
14
 
17
15
  working_directory: ~/repo
18
16
 
19
17
  steps:
20
18
  - checkout
21
19
 
22
- # Download and cache dependencies
23
- - restore_cache:
24
- keys:
25
- - v1-dependencies-{{ checksum "Gemfile.lock" }}
26
- # fallback to using the latest cache if no exact match is found
27
- - v1-dependencies-
28
-
20
+ - run: gem install bundler
29
21
  - run:
30
22
  name: install dependencies
31
23
  command: |
32
- bundle install --jobs=4 --retry=3 --path vendor/bundle
24
+ bundle install --jobs=4 --retry=3
33
25
 
34
- - save_cache:
35
- paths:
36
- - ./vendor/bundle
37
- key: v1-dependencies-{{ checksum "Gemfile.lock" }}
38
-
39
- # Database setup
40
- #- run: bundle exec rake db:create
41
- #- run: bundle exec rake db:schema:load
42
-
43
- # run tests!
44
26
  - run:
45
27
  name: run tests
46
28
  command: |
47
29
  mkdir /tmp/test-results
48
30
  TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
49
-
50
- rspec --format progress "spec"
31
+
32
+ bundle exec rspec --format progress "spec"
51
33
 
52
34
  # collect reports
53
35
  - store_test_results:
@@ -0,0 +1,7 @@
1
+ .git
2
+ .gitignore
3
+ log/*
4
+ tmp/*
5
+ *.swp
6
+ *.swo
7
+ Gemfile.lock
@@ -1,3 +1,36 @@
1
+ # 0.4.3 / 2020-07-18
2
+
3
+ * update deps
4
+
5
+ # 0.4.2 / 2020-03-11
6
+
7
+ * update deps
8
+
9
+ # 0.4.1 / 2019-07-04
10
+
11
+ * Fix bug in min_clause_words_filter ( used in article_sentence_extractor )
12
+ * Allow tests to run in Docker
13
+ * Update circle to continue to work
14
+ * Add architecture flow
15
+ * Code formatting
16
+ * Add min words filter specs
17
+ * Add label action specs
18
+ * Add missing test case to ignorable element spec
19
+ * Add merge_next case to text block spec
20
+ * Dry up includes
21
+
22
+ # 0.4.0 / 2017-09-15
23
+
24
+ * Add KeepEverythingWithMinKWords Extractor
25
+ * Add ArticleSentence Extractor
26
+
27
+ # 0.3.0 / 2017-09-12
28
+
29
+ * Add LargestContent Extractor
30
+ * Add KeepEverything Extractor
31
+ * Add NumWordsRules Extractor
32
+ * Add Canola Extractor
33
+
1
34
  # 0.2.0 / 2017-09-11
2
35
 
3
36
  * Add Default Extractor
@@ -10,4 +43,4 @@
10
43
 
11
44
  # 0.1.0 / 2017-09-08
12
45
 
13
- * Add Article Extractor
46
+ * Add Article Extractor
@@ -0,0 +1,14 @@
1
+ From ruby:2.5
2
+ RUN gem install bundler
3
+ COPY *gemspec /usr/src/app/
4
+ COPY Gemfile /usr/src/app/
5
+ COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
6
+ COPY bin /usr/src/app/
7
+ COPY bin/* /usr/src/app/bin/
8
+
9
+ WORKDIR /usr/src/app
10
+ RUN bin/setup
11
+
12
+ COPY . /usr/src/app/
13
+
14
+ CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]
data/README.md CHANGED
@@ -1,5 +1,8 @@
1
1
  # Boilerpipe
2
2
 
3
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
4
+ [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
5
+
3
6
  A pure ruby implementation of the boilerpipe algorithm.
4
7
 
5
8
  This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
@@ -10,15 +13,22 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
10
13
 
11
14
  This solution works great if you're using Jruby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
12
15
 
13
- I've only got the ArticleExtractor working but the others should be following quickly as the ArticleExtractor definitley has the most code behind it...
16
+ Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
17
+
18
+ # TLDR
19
+
20
+ Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
14
21
 
15
22
  Presently the following Extractors are implemented
16
23
  * [x] ArticleExtractor
24
+ * [x] ArticleSentenceExtractor
25
+ * [x] CanolaExtractor
17
26
  * [x] DefaultExtractor
18
- * [ ] LargestContentExtractor
19
- * [ ] KeepEverythingExtractor
27
+ * [x] KeepEverythingExtractor
28
+ * [x] KeepEverythingWithMinKWordsExtractor
29
+ * [x] LargestContentExtractor
30
+ * [x] NumWordsRulesExtractor
20
31
 
21
- [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
22
32
 
23
33
  ## Installation
24
34
 
@@ -44,16 +54,31 @@ Or install it yourself as:
44
54
  > require 'open-uri'
45
55
  => true
46
56
  > content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
47
- > output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
57
+
58
+ > Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
48
59
  => "Always Squash and Rebase your Git Commits"
49
- > output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
60
+
61
+ > Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
50
62
  => "Posted on\nWhat is the squash rebase workf"
63
+
64
+ > Boilerpipe::Extractors::LargestContentExtractor.text(content).slice(0, 40)
65
+ => "git push origin master\nWhy should you ad"
66
+
67
+ > Boilerpipe::Extractors::KeepEverythingExtractor.text(content).slice(0..40)
68
+ => "Toggle Navigation\nCarbon Five\nAbout\nWork\n"
51
69
 
52
70
  ## Development
53
71
 
54
72
  After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
55
73
 
56
- To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
74
+ To install this gem onto your local machine, run `bundle exec rake install`.
75
+
76
+ ### Running Tests on Docker
77
+
78
+ The default run command will run the tests
79
+
80
+ docker build -t boilerpipe .
81
+ docker run -it --rm boilerpipe
57
82
 
58
83
  ## Contributing
59
84
 
data/Rakefile CHANGED
@@ -1,14 +1,13 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
6
6
  task :default => :spec
7
7
 
8
-
9
8
  desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
10
9
  task :download_boilerpipe_jar do
11
10
  FileUtils.mkdir_p 'spec/sanity_checks/jars/'
12
11
  Dir.chdir 'spec/sanity_checks/jars/'
13
- `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
12
+ `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
14
13
  end
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "bundler/setup"
4
- require "boilerpipe"
3
+ require 'bundler/setup'
4
+ require 'boilerpipe'
5
5
 
6
6
  # You can add fixtures and/or initialization code here to make experimenting
7
7
  # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "boilerpipe"
10
10
  # require "pry"
11
11
  # Pry.start
12
12
 
13
- require "irb"
13
+ require 'irb'
14
14
  IRB.start
@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ['<gregory.ostermayr@gmail.com>']
11
11
  spec.license = 'Apache 2.0'
12
12
 
13
- spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
14
- spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
13
+ spec.summary = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
14
+ spec.description = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
15
15
  spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
16
16
 
17
17
  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
- spec.bindir = "exe"
18
+ spec.bindir = 'exe'
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
- spec.require_paths = ["lib"]
20
+ spec.require_paths = ['lib']
21
21
 
22
- spec.add_development_dependency 'bundler', '~> 1.11'
23
- spec.add_development_dependency 'rake', '~> 10.0'
24
- spec.add_development_dependency 'rspec', '~> 3.0'
25
- spec.add_development_dependency 'rickshaw', '~> 0.4.0'
26
- spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
22
+ spec.add_development_dependency 'bundler', '~> 2.0'
23
+ spec.add_development_dependency 'rake', '>= 12.3.3'
24
+ spec.add_development_dependency 'rickshaw', '~> 0.5.0'
25
+ spec.add_development_dependency 'rspec', '~> 3.9'
26
+ spec.add_runtime_dependency 'nokogiri', '~> 1.10'
27
27
  end
@@ -0,0 +1,40 @@
1
+ ```
2
+ raw html
3
+ |
4
+ |
5
+ sax input -> sax parser(html parser) -> HTML Content handler -> tokenizer ---------
6
+ |
7
+ -------------------------------------<------------------------------------<------|
8
+ | | |
9
+ text blocks text blocks text blocks
10
+ | | |
11
+ | | |
12
+ -----------------------------
13
+ |
14
+ |
15
+ text document
16
+ |
17
+ |
18
+ filter
19
+ |
20
+ filter
21
+ |
22
+ filter
23
+ |
24
+ filter
25
+ |
26
+ filter
27
+ |
28
+ filter
29
+ |
30
+ filter
31
+ |
32
+ filter
33
+ |
34
+ filter
35
+ |
36
+ |
37
+ text document
38
+ |
39
+ outputs extracted text
40
+ ```
@@ -1,3 +1,6 @@
1
+ require 'nokogiri'
2
+ require 'set'
3
+
1
4
  require 'boilerpipe/version'
2
5
 
3
6
  require 'boilerpipe/util/unicode_tokenizer'
@@ -6,10 +9,17 @@ require 'boilerpipe/document/text_document'
6
9
  require 'boilerpipe/document/text_block'
7
10
 
8
11
  require 'boilerpipe/extractors/article_extractor'
12
+ require 'boilerpipe/extractors/article_sentence_extractor'
13
+ require 'boilerpipe/extractors/canola_extractor'
9
14
  require 'boilerpipe/extractors/default_extractor'
15
+ require 'boilerpipe/extractors/keep_everything_extractor'
16
+ require 'boilerpipe/extractors/keep_everything_with_k_min_words_extractor'
17
+ require 'boilerpipe/extractors/largest_content_extractor'
18
+ require 'boilerpipe/extractors/num_words_rules_extractor'
10
19
 
11
20
  require 'boilerpipe/filters/block_proximity_fusion'
12
21
  require 'boilerpipe/filters/boilerplate_block_filter'
22
+ require 'boilerpipe/filters/canola_classifier'
13
23
  require 'boilerpipe/filters/density_rules_classifier'
14
24
  require 'boilerpipe/filters/document_title_match_classifier'
15
25
  require 'boilerpipe/filters/expand_title_to_content_filter'
@@ -18,8 +28,12 @@ require 'boilerpipe/filters/ignore_blocks_after_content_filter'
18
28
  require 'boilerpipe/filters/keep_largest_block_filter'
19
29
  require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
20
30
  require 'boilerpipe/filters/list_at_end_filter'
31
+ require 'boilerpipe/filters/mark_everything_content_filter'
32
+ require 'boilerpipe/filters/min_clause_words_filter'
33
+ require 'boilerpipe/filters/min_words_filter'
21
34
  require 'boilerpipe/filters/num_words_rules_classifier'
22
35
  require 'boilerpipe/filters/simple_block_fusion_processor'
36
+ require 'boilerpipe/filters/split_paragraph_blocks_filter'
23
37
  require 'boilerpipe/filters/terminating_blocks_finder'
24
38
  require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
25
39
 
@@ -1,10 +1,7 @@
1
- require 'set'
2
-
3
1
  module Boilerpipe
4
2
  module Document
5
3
  class TextBlock
6
-
7
- #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
4
+ # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
8
5
 
9
6
  attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
10
7
  :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
@@ -12,7 +9,7 @@ module Boilerpipe
12
9
 
13
10
  attr_accessor :content
14
11
 
15
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
12
+ def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
16
13
  @labels = Set.new
17
14
  @text = text
18
15
  @num_words = num_words
@@ -32,9 +29,9 @@ module Boilerpipe
32
29
  new('', 0, 0, 0, 0, -1)
33
30
  end
34
31
 
35
- def set_tag_level(level)
36
- @tag_level = level
37
- end
32
+ def set_tag_level(level)
33
+ @tag_level = level
34
+ end
38
35
 
39
36
  def is_content?
40
37
  @content
@@ -68,8 +65,8 @@ module Boilerpipe
68
65
  @num_words_in_anchor_text += other.num_words_in_anchor_text
69
66
  @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
70
67
  @num_wrapped_lines += other.num_wrapped_lines
71
- @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
72
- @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
68
+ @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
69
+ @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
73
70
  init_densities
74
71
  @content |= other.is_content?
75
72
 
@@ -87,10 +84,10 @@ module Boilerpipe
87
84
  end
88
85
 
89
86
  def to_s
90
- #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
87
+ # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
91
88
  labels = 'null'
92
89
  if !@labels.empty?
93
- labels ="[#{ @labels.to_a.join(',')}]"
90
+ labels = "[#{@labels.to_a.join(',')}]"
94
91
  end
95
92
  "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
96
93
  end
@@ -100,6 +97,7 @@ module Boilerpipe
100
97
  end
101
98
 
102
99
  private
100
+
103
101
  def init_densities
104
102
  if @num_words_in_wrapped_lines == 0
105
103
  @num_words_in_wrapped_lines = @num_words
@@ -19,14 +19,14 @@ module Boilerpipe
19
19
  case text_block.is_content?
20
20
  when true
21
21
  next unless include_content
22
- s << text_block.text
23
- s << "\n"
24
22
  when false
25
23
  next unless include_noncontent
26
- s << text_block.text
27
- s << "\n"
28
24
  end
25
+
26
+ s << text_block.text
27
+ s << "\n"
29
28
  end
29
+
30
30
  s
31
31
  end
32
32
 
@@ -38,7 +38,6 @@ module Boilerpipe
38
38
  @text_blocks.map(&:to_s).join("\n")
39
39
  end
40
40
  alias_method :debug_string, :debug_s
41
-
42
41
  end
43
42
  end
44
43
  end
@@ -0,0 +1,17 @@
1
+ # A full-text extractor which is tuned towards extracting sentences from news articles.
2
+
3
+ module Boilerpipe::Extractors
4
+ class ArticleSentenceExtractor
5
+ def self.text(contents)
6
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
7
+ ::Boilerpipe::Extractors::ArticleSentenceExtractor.process(doc)
8
+ doc.content
9
+ end
10
+
11
+ def self.process(doc)
12
+ ::Boilerpipe::Extractors::ArticleExtractor.process doc
13
+ ::Boilerpipe::Filters::SplitParagraphBlocksFilter.process doc
14
+ ::Boilerpipe::Filters::MinClauseWordsFilter.process doc
15
+ end
16
+ end
17
+ end