boilerpipe-ruby 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 64511e224c1ec186c5c7cde62568dd0ba8cf1005
4
- data.tar.gz: 455a8bac8eaadda62706d8f507854c6f6ccb6dba
3
+ metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
4
+ data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
5
5
  SHA512:
6
- metadata.gz: 8f2f769627e15ca0b8293122143304ef397ed789567cff61ceb48f45a657336442711c17d29d25fa8cfdf0dbcfd557030063e7b2a132550921ce601e2573e71f
7
- data.tar.gz: 630f46c4d3a6e71933be0ba5f3bc98dc97fd34d10fb05d9e656e8f5f2f20fb4c28197122ef349398c79de7e57c795721307a5dadc7b4a796f907b2904d4816d0
6
+ metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
7
+ data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
@@ -1,3 +1,8 @@
1
+ # 0.4.0 / 2017-09-15
2
+
3
+ * Add KeepEverythingWithMinKWords Extractor
4
+ * Add ArticleSentence Extractor
5
+
1
6
  # 0.3.0 / 2017-09-12
2
7
 
3
8
  * Add LargestContent Extractor
data/README.md CHANGED
@@ -16,16 +16,18 @@ Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor -
16
16
 
17
17
  Presently the following Extractors are implemented
18
18
  * [x] ArticleExtractor
19
- * [ ] ArticleSentenceExtractor
19
+ * [x] ArticleSentenceExtractor
20
20
  * [x] CanolaExtractor
21
21
  * [x] DefaultExtractor
22
22
  * [x] KeepEverythingExtractor
23
- * [ ] KeepEverythingWithMinKWordsExtractor
23
+ * [x] KeepEverythingWithMinKWordsExtractor
24
24
  * [x] LargestContentExtractor
25
25
  * [x] NumWordsRulesExtractor
26
26
 
27
27
  [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
28
28
 
29
+ [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
30
+
29
31
  ## Installation
30
32
 
31
33
  Add this line to your application's Gemfile:
@@ -6,9 +6,11 @@ require 'boilerpipe/document/text_document'
6
6
  require 'boilerpipe/document/text_block'
7
7
 
8
8
  require 'boilerpipe/extractors/article_extractor'
9
+ require 'boilerpipe/extractors/article_sentence_extractor'
9
10
  require 'boilerpipe/extractors/canola_extractor'
10
11
  require 'boilerpipe/extractors/default_extractor'
11
12
  require 'boilerpipe/extractors/keep_everything_extractor'
13
+ require 'boilerpipe/extractors/keep_everything_with_k_min_words_extractor'
12
14
  require 'boilerpipe/extractors/largest_content_extractor'
13
15
  require 'boilerpipe/extractors/num_words_rules_extractor'
14
16
 
@@ -24,8 +26,11 @@ require 'boilerpipe/filters/keep_largest_block_filter'
24
26
  require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
25
27
  require 'boilerpipe/filters/list_at_end_filter'
26
28
  require 'boilerpipe/filters/mark_everything_content_filter'
29
+ require 'boilerpipe/filters/min_clause_words_filter'
30
+ require 'boilerpipe/filters/min_words_filter'
27
31
  require 'boilerpipe/filters/num_words_rules_classifier'
28
32
  require 'boilerpipe/filters/simple_block_fusion_processor'
33
+ require 'boilerpipe/filters/split_paragraph_blocks_filter'
29
34
  require 'boilerpipe/filters/terminating_blocks_finder'
30
35
  require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
31
36
 
@@ -0,0 +1,17 @@
1
# Full-text extractor tuned towards pulling sentences out of news articles.
#
# The pipeline is: ArticleExtractor, then SplitParagraphBlocksFilter, then
# MinClauseWordsFilter (called without an explicit minimum, so that filter's
# own default applies).
module Boilerpipe::Extractors
  class ArticleSentenceExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the pipeline on the
      # resulting document and returns that document's +content+.
      # NOTE(review): +contents+ is presumably an HTML string - confirm
      # against the parser.
      def text(contents)
        document = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::ArticleSentenceExtractor.process(document)
        document.content
      end

      # Applies the three processing steps to +doc+ in order. The return
      # value is whatever the final MinClauseWordsFilter call returns.
      def process(doc)
        ::Boilerpipe::Extractors::ArticleExtractor.process(doc)
        ::Boilerpipe::Filters::SplitParagraphBlocksFilter.process(doc)
        ::Boilerpipe::Filters::MinClauseWordsFilter.process(doc)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+
2
# A full-text extractor that runs SimpleBlockFusionProcessor and
# MarkEverythingContentFilter over the document and then applies
# MinWordsFilter, which un-marks content blocks with fewer than +min+ words.
#
# NOTE(review): judging by their names, the first two steps fuse neighbouring
# blocks and flag every block as content - confirm against those filters.
module Boilerpipe::Extractors
  class KeepEverythingWithKMinWordsExtractor
    class << self
      # Parses +contents+ with BoilerpipeHTMLParser, runs the pipeline with
      # the given +min+ word threshold and returns the document's +content+.
      def text(min, contents)
        document = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
        ::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process(min, document)
        document.content
      end

      # Runs the three filters on +doc+ in order and returns +doc+ itself.
      def process(min, doc)
        ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process(doc)
        ::Boilerpipe::Filters::MarkEverythingContentFilter.process(doc)
        ::Boilerpipe::Filters::MinWordsFilter.process(min, doc)
        doc
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
#
# Keeps only content blocks that have at least one sentence fragment
# ("clause") with at least k words (default: 5). A clause is a run of
# letters, digits and spaces that ends in one or more of , . : ; ! ?
#
# Blocks whose text contains no such delimiter-terminated clause are marked as
# non-content as well, since they cannot satisfy the "at least one clause"
# requirement.
#
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
#
# Nested module definitions are used so this file loads even when the
# Boilerpipe namespace has not been defined yet.
module Boilerpipe
  module Filters
    class MinClauseWordsFilter
      # Candidate clause: word characters/spaces, trailing punctuation, then
      # whitespace or end of line.
      CLAUSE_PATTERN = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/.freeze

      # Separator used when counting the words of a clause.
      WHITESPACE_PATTERN = /[ \n\r]+/.freeze

      # Marks every content block of +doc+ as non-content unless at least one
      # of its clauses has +min_words+ words or more.
      #
      # doc       - object responding to +text_blocks+; each block must respond
      #             to +is_not_content?+, +text+ and +content=+.
      # min_words - minimum number of words a clause needs (default: 5).
      #
      # Returns +doc+.
      def self.process(doc, min_words = 5)
        doc.text_blocks.each do |tb|
          next if tb.is_not_content?

          candidates = tb.text.scan(CLAUSE_PATTERN)
          # Decide once per block, after looking at all candidates: a short
          # leading clause must not disqualify a block that also contains a
          # long one.
          has_clause = candidates.any? { |candidate| is_clause?(candidate, min_words) }
          tb.content = false unless has_clause
        end

        doc
      end

      # Returns true when +text+ contains at least +min_words+ words.
      # Words are counted directly (rather than counting whitespace runs) so
      # the answer does not depend on whether +text+ has trailing whitespace.
      # A nil +text+ is never a clause.
      def self.is_clause?(text, min_words = 5)
        return false if text.nil?

        text.split(WHITESPACE_PATTERN).count { |word| !word.empty? } >= min_words
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+
2
# Un-marks content blocks that contain fewer than k words; blocks that are
# already non-content are left untouched.
module Boilerpipe::Filters
  class MinWordsFilter
    # min_words - minimum +num_words+ a content block needs to stay content.
    # doc       - document whose +text_blocks+ are inspected and updated in place.
    #
    # Returns +doc+.
    def self.process(min_words, doc)
      doc.text_blocks.each do |block|
        # Only blocks currently flagged as content are measured.
        too_short = !block.is_not_content? && block.num_words < min_words
        block.content = false if too_short
      end

      doc
    end
  end
end
@@ -0,0 +1,40 @@
1
+
2
# Splits TextBlocks at paragraph boundaries (runs of \n / \r).
#
# NOTE: This is not fully supported (i.e., it will break highlighting support
# via #getContainedTextElements()), but it is probably necessary for some
# other filters.
#
# see MinClauseWordsFilter
module Boilerpipe::Filters
  class SplitParagraphBlocksFilter
    # Rebuilds the block list of +doc+: a block whose text holds two or more
    # paragraphs is replaced by one new TextBlock per paragraph, each carrying
    # the original block's content flag and labels. Blocks with fewer than two
    # paragraphs are kept as they are. The document's blocks are only
    # replaced when at least one block was split.
    #
    # Returns +doc+.
    def self.process(doc)
      split_happened = false

      rebuilt = doc.text_blocks.flat_map do |block|
        pieces = block.text.split(/[\n\r]+/)
        next [block] if pieces.size < 2

        split_happened = true
        content_flag = block.is_content?
        labels = block.labels

        pieces.map do |piece|
          ::Boilerpipe::Document::TextBlock.new(piece).tap do |piece_block|
            piece_block.content = content_flag
            piece_block.add_labels(labels)
          end
        end
      end

      doc.replace_text_blocks!(rebuilt) if split_happened
      doc
    end
  end
end
@@ -4,11 +4,11 @@ module Boilerpipe::SAX
4
4
  def self.parse(text)
5
5
 
6
6
  #script bug - delete script tags
7
- text = text.gsub(/\<script>.+?<\/script>/i, '')
7
+ text.gsub!(/\<script>.+?<\/script>/i, '')
8
8
 
9
9
  # nokogiri uses libxml for mri and nekohtml for jruby
10
10
  # mri doesn't remove &nbsp; when missing the semicolon
11
- text = text.gsub(/(&nbsp) /, '\1; ')
11
+ text.gsub!(/(&nbsp) /, '\1; ')
12
12
 
13
13
 
14
14
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.3.0'
2
+ VERSION = '0.4.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-12 00:00:00.000000000 Z
11
+ date: 2017-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -103,9 +103,11 @@ files:
103
103
  - lib/boilerpipe/document/text_document.rb
104
104
  - lib/boilerpipe/errors.rb
105
105
  - lib/boilerpipe/extractors/article_extractor.rb
106
+ - lib/boilerpipe/extractors/article_sentence_extractor.rb
106
107
  - lib/boilerpipe/extractors/canola_extractor.rb
107
108
  - lib/boilerpipe/extractors/default_extractor.rb
108
109
  - lib/boilerpipe/extractors/keep_everything_extractor.rb
110
+ - lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
109
111
  - lib/boilerpipe/extractors/largest_content_extractor.rb
110
112
  - lib/boilerpipe/extractors/num_words_rules_extractor.rb
111
113
  - lib/boilerpipe/filters/block_proximity_fusion.rb
@@ -120,8 +122,11 @@ files:
120
122
  - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
121
123
  - lib/boilerpipe/filters/list_at_end_filter.rb
122
124
  - lib/boilerpipe/filters/mark_everything_content_filter.rb
125
+ - lib/boilerpipe/filters/min_clause_words_filter.rb
126
+ - lib/boilerpipe/filters/min_words_filter.rb
123
127
  - lib/boilerpipe/filters/num_words_rules_classifier.rb
124
128
  - lib/boilerpipe/filters/simple_block_fusion_processor.rb
129
+ - lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
125
130
  - lib/boilerpipe/filters/terminating_blocks_finder.rb
126
131
  - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
127
132
  - lib/boilerpipe/labels/default.rb