boilerpipe-ruby 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 64511e224c1ec186c5c7cde62568dd0ba8cf1005
4
- data.tar.gz: 455a8bac8eaadda62706d8f507854c6f6ccb6dba
3
+ metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
4
+ data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
5
5
  SHA512:
6
- metadata.gz: 8f2f769627e15ca0b8293122143304ef397ed789567cff61ceb48f45a657336442711c17d29d25fa8cfdf0dbcfd557030063e7b2a132550921ce601e2573e71f
7
- data.tar.gz: 630f46c4d3a6e71933be0ba5f3bc98dc97fd34d10fb05d9e656e8f5f2f20fb4c28197122ef349398c79de7e57c795721307a5dadc7b4a796f907b2904d4816d0
6
+ metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
7
+ data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
@@ -1,3 +1,8 @@
1
+ # 0.4.0 / 2017-09-15
2
+
3
+ * Add KeepEverythingWithMinKWords Extractor
4
+ * Add ArticleSentence Extractor
5
+
1
6
  # 0.3.0 / 2017-09-12
2
7
 
3
8
  * Add LargestContent Extractor
data/README.md CHANGED
@@ -16,16 +16,18 @@ Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor -
16
16
 
17
17
  Presently the following Extractors are implemented
18
18
  * [x] ArticleExtractor
19
- * [ ] ArticleSentenceExtractor
19
+ * [x] ArticleSentenceExtractor
20
20
  * [x] CanolaExtractor
21
21
  * [x] DefaultExtractor
22
22
  * [x] KeepEverythingExtractor
23
- * [ ] KeepEverythingWithMinKWordsExtractor
23
+ * [x] KeepEverythingWithMinKWordsExtractor
24
24
  * [x] LargestContentExtractor
25
25
  * [x] NumWordsRulesExtractor
26
26
 
27
27
  [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
28
28
 
29
+ [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
30
+
29
31
  ## Installation
30
32
 
31
33
  Add this line to your application's Gemfile:
@@ -6,9 +6,11 @@ require 'boilerpipe/document/text_document'
6
6
  require 'boilerpipe/document/text_block'
7
7
 
8
8
  require 'boilerpipe/extractors/article_extractor'
9
+ require 'boilerpipe/extractors/article_sentence_extractor'
9
10
  require 'boilerpipe/extractors/canola_extractor'
10
11
  require 'boilerpipe/extractors/default_extractor'
11
12
  require 'boilerpipe/extractors/keep_everything_extractor'
13
+ require 'boilerpipe/extractors/keep_everything_with_k_min_words_extractor'
12
14
  require 'boilerpipe/extractors/largest_content_extractor'
13
15
  require 'boilerpipe/extractors/num_words_rules_extractor'
14
16
 
@@ -24,8 +26,11 @@ require 'boilerpipe/filters/keep_largest_block_filter'
24
26
  require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
25
27
  require 'boilerpipe/filters/list_at_end_filter'
26
28
  require 'boilerpipe/filters/mark_everything_content_filter'
29
+ require 'boilerpipe/filters/min_clause_words_filter'
30
+ require 'boilerpipe/filters/min_words_filter'
27
31
  require 'boilerpipe/filters/num_words_rules_classifier'
28
32
  require 'boilerpipe/filters/simple_block_fusion_processor'
33
+ require 'boilerpipe/filters/split_paragraph_blocks_filter'
29
34
  require 'boilerpipe/filters/terminating_blocks_finder'
30
35
  require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
31
36
 
@@ -0,0 +1,17 @@
# Full-text extractor tuned towards pulling sentences out of news articles.
#
# It reuses the ArticleExtractor pipeline and then applies
# SplitParagraphBlocksFilter followed by MinClauseWordsFilter.

module Boilerpipe
  module Extractors
    class ArticleSentenceExtractor
      class << self
        # Parses +contents+ with BoilerpipeHTMLParser, sends the resulting
        # document through .process and returns the document's #content.
        def text(contents)
          parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
          ::Boilerpipe::Extractors::ArticleSentenceExtractor.process(parsed)
          parsed.content
        end

        # Runs the three processing stages on +doc+, in order. The value of
        # the last stage (MinClauseWordsFilter.process) is what gets returned.
        def process(doc)
          ::Boilerpipe::Extractors::ArticleExtractor.process(doc)
          ::Boilerpipe::Filters::SplitParagraphBlocksFilter.process(doc)
          ::Boilerpipe::Filters::MinClauseWordsFilter.process(doc)
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
# A full-text extractor that runs SimpleBlockFusionProcessor and
# MarkEverythingContentFilter over the document and then applies
# MinWordsFilter, which flags every content block with fewer than +min+
# words as non-content.
#
# NOTE(review): presumably MarkEverythingContentFilter marks all blocks as
# content, so the net effect is "keep everything with at least +min+ words" -
# confirm against that filter.

module Boilerpipe
  module Extractors
    class KeepEverythingWithKMinWordsExtractor
      # Parses +contents+ (HTML), runs .process with the +min+ word threshold
      # and returns the processed document's #content.
      def self.text(min, contents)
        doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        process(min, doc)
        doc.content
      end

      # Applies the filter pipeline to +doc+ in place.
      #
      # min - minimum number of words handed to MinWordsFilter.
      # doc - the parsed document to process.
      #
      # Returns +doc+.
      def self.process(min, doc)
        ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process(doc)
        ::Boilerpipe::Filters::MarkEverythingContentFilter.process(doc)
        ::Boilerpipe::Filters::MinWordsFilter.process(min, doc)
        doc
      end
    end

    # The README and changelog call this extractor
    # KeepEverythingWithMinKWordsExtractor; expose that spelling as well so
    # the documented name resolves. The guard avoids a redefinition warning
    # when the file is loaded twice.
    unless const_defined?(:KeepEverythingWithMinKWordsExtractor, false)
      KeepEverythingWithMinKWordsExtractor = KeepEverythingWithKMinWordsExtractor
    end
  end
end
@@ -0,0 +1,37 @@
#
# Keeps only blocks that have at least one text fragment ("clause") with at
# least k words (default: 5). A clause candidate is a run of letters, digits
# and spaces that ends in one or more delimiters (, . : ; ! ?) followed by
# whitespace or the end of a line.
#
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
#

module Boilerpipe
  module Filters
    class MinClauseWordsFilter
      # Matches one clause candidate, including its trailing delimiter(s) and
      # any whitespace that follows them.
      CLAUSE_DELIMITER = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/.freeze

      # Separator used when counting the words of a candidate.
      WHITESPACE = /[ \n\r]+/.freeze

      # Flags every content block of +doc+ as non-content unless at least one
      # of its clause candidates has +min_words+ words or more.
      #
      # doc       - document responding to #text_blocks; each block must
      #             respond to #is_not_content?, #text and #content=.
      # min_words - minimum number of words a clause needs (default: 5).
      #
      # Blocks that are already non-content are left untouched. A block
      # without any delimiter-terminated candidate has no clause at all and
      # is therefore flagged as non-content too.
      #
      # Returns +doc+.
      def self.process(doc, min_words = 5)
        doc.text_blocks.each do |tb|
          next if tb.is_not_content?

          candidates = tb.text.scan(CLAUSE_DELIMITER)
          # Decide only after looking at every candidate: one qualifying
          # clause anywhere in the block is enough to keep it.
          has_clause = candidates.any? { |candidate| is_clause?(candidate, min_words) }
          tb.content = false unless has_clause
        end

        doc
      end

      # Returns true when +text+ contains at least +min_words+ words.
      #
      # Words are counted on the stripped text, so whitespace captured before
      # or after the clause does not influence the result. nil is never a
      # clause.
      def self.is_clause?(text, min_words = 5)
        return false if text.nil?

        text.strip.split(WHITESPACE).size >= min_words
      end
    end
  end
end
@@ -0,0 +1,16 @@
# Flags content blocks with fewer than k words as non-content, so only
# content blocks with at least k words remain.

module Boilerpipe
  module Filters
    class MinWordsFilter
      class << self
        # Walks the text blocks of +doc+ and sets content to false on every
        # content block whose #num_words is below +min_words+. Blocks that
        # are already non-content are skipped. Returns +doc+.
        def process(min_words, doc)
          doc.text_blocks.each do |block|
            unless block.is_not_content?
              block.content = false if block.num_words < min_words
            end
          end

          doc
        end
      end
    end
  end
end
@@ -0,0 +1,40 @@
# Splits TextBlocks at paragraph boundaries (runs of \n / \r).
#
# NOTE: This is not fully supported (i.e., it will break highlighting support via
# #getContainedTextElements()), but this one probably is necessary for some other filters.
#
# see MinClauseWordsFilter

module Boilerpipe
  module Filters
    class SplitParagraphBlocksFilter
      # One or more consecutive line-break characters separate paragraphs.
      PARAGRAPH_BREAK = /[\n\r]+/.freeze

      # Replaces every text block of +doc+ that spans several paragraphs with
      # one new TextBlock per paragraph. Each new block receives the content
      # flag and the labels of the block it was cut from. Blocks with fewer
      # than two paragraphs are carried over unchanged.
      #
      # doc - document responding to #text_blocks and #replace_text_blocks!.
      #
      # The document's block list is only replaced when at least one block
      # was actually split. Returns +doc+.
      def self.process(doc)
        new_blocks = []
        changed = false

        doc.text_blocks.each do |tb|
          # Text that starts with a line break makes String#split return an
          # empty leading element; drop it so no empty TextBlock is created.
          paragraphs = tb.text.split(PARAGRAPH_BREAK).reject(&:empty?)

          if paragraphs.size < 2
            new_blocks << tb
            next
          end

          is_content = tb.is_content?
          labels = tb.labels
          paragraphs.each do |paragraph|
            piece = ::Boilerpipe::Document::TextBlock.new(paragraph)
            piece.content = is_content
            piece.add_labels(labels)
            new_blocks << piece
          end
          changed = true
        end

        doc.replace_text_blocks!(new_blocks) if changed
        doc
      end
    end
  end
end
@@ -4,11 +4,11 @@ module Boilerpipe::SAX
4
4
  def self.parse(text)
5
5
 
6
6
  #script bug - delete script tags
7
- text = text.gsub(/\<script>.+?<\/script>/i, '')
7
+ text.gsub!(/\<script>.+?<\/script>/i, '')
8
8
 
9
9
  # nokogiri uses libxml for mri and nekohtml for jruby
10
10
  # mri doesn't remove &nbsp; when missing the semicolon
11
- text = text.gsub(/(&nbsp) /, '\1; ')
11
+ text.gsub!(/(&nbsp) /, '\1; ')
12
12
 
13
13
 
14
14
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.3.0'
2
+ VERSION = '0.4.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-12 00:00:00.000000000 Z
11
+ date: 2017-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -103,9 +103,11 @@ files:
103
103
  - lib/boilerpipe/document/text_document.rb
104
104
  - lib/boilerpipe/errors.rb
105
105
  - lib/boilerpipe/extractors/article_extractor.rb
106
+ - lib/boilerpipe/extractors/article_sentence_extractor.rb
106
107
  - lib/boilerpipe/extractors/canola_extractor.rb
107
108
  - lib/boilerpipe/extractors/default_extractor.rb
108
109
  - lib/boilerpipe/extractors/keep_everything_extractor.rb
110
+ - lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
109
111
  - lib/boilerpipe/extractors/largest_content_extractor.rb
110
112
  - lib/boilerpipe/extractors/num_words_rules_extractor.rb
111
113
  - lib/boilerpipe/filters/block_proximity_fusion.rb
@@ -120,8 +122,11 @@ files:
120
122
  - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
121
123
  - lib/boilerpipe/filters/list_at_end_filter.rb
122
124
  - lib/boilerpipe/filters/mark_everything_content_filter.rb
125
+ - lib/boilerpipe/filters/min_clause_words_filter.rb
126
+ - lib/boilerpipe/filters/min_words_filter.rb
123
127
  - lib/boilerpipe/filters/num_words_rules_classifier.rb
124
128
  - lib/boilerpipe/filters/simple_block_fusion_processor.rb
129
+ - lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
125
130
  - lib/boilerpipe/filters/terminating_blocks_finder.rb
126
131
  - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
127
132
  - lib/boilerpipe/labels/default.rb