boilerpipe-ruby 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
4
- data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
3
+ metadata.gz: 64511e224c1ec186c5c7cde62568dd0ba8cf1005
4
+ data.tar.gz: 455a8bac8eaadda62706d8f507854c6f6ccb6dba
5
5
  SHA512:
6
- metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
7
- data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
6
+ metadata.gz: 8f2f769627e15ca0b8293122143304ef397ed789567cff61ceb48f45a657336442711c17d29d25fa8cfdf0dbcfd557030063e7b2a132550921ce601e2573e71f
7
+ data.tar.gz: 630f46c4d3a6e71933be0ba5f3bc98dc97fd34d10fb05d9e656e8f5f2f20fb4c28197122ef349398c79de7e57c795721307a5dadc7b4a796f907b2904d4816d0
@@ -1,3 +1,10 @@
1
+ # 0.3.0 / 2017-09-12
2
+
3
+ * Add LargestContent Extractor
4
+ * Add KeepEverything Extractor
5
+ * Add NumWordsRules Extractor
6
+ * Add Canola Extractor
7
+
1
8
  # 0.2.0 / 2017-09-11
2
9
 
3
10
  * Add Default Extractor
data/README.md CHANGED
@@ -10,13 +10,19 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
10
10
 
11
11
  This solution works great if you're using JRuby but I wanted a pure Ruby solution to use on MRI. Open vim - start coding...
12
12
 
13
- I've only got the ArticleExtractor working but the others should be following quickly as the ArticleExtractor definitely has the most code behind it...
13
+ # TLDR
14
+
15
+ Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
14
16
 
15
17
  Presently the following Extractors are implemented
16
18
  * [x] ArticleExtractor
19
+ * [ ] ArticleSentenceExtractor
20
+ * [x] CanolaExtractor
17
21
  * [x] DefaultExtractor
18
- * [ ] LargestContentExtractor
19
- * [ ] KeepEverythingExtractor
22
+ * [x] KeepEverythingExtractor
23
+ * [ ] KeepEverythingWithMinKWordsExtractor
24
+ * [x] LargestContentExtractor
25
+ * [x] NumWordsRulesExtractor
20
26
 
21
27
  [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
22
28
 
@@ -44,10 +50,18 @@ Or install it yourself as:
44
50
  > require 'open-uri'
45
51
  => true
46
52
  > content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
47
- > output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
53
+
54
+ > Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
48
55
  => "Always Squash and Rebase your Git Commits"
49
- > output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
56
+
57
+ > Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
50
58
  => "Posted on\nWhat is the squash rebase workf"
59
+
60
+ > Boilerpipe::Extractors::LargestContentExtractor.text(content).slice(0, 40)
61
+ => "git push origin master\nWhy should you ad"
62
+
63
+ > Boilerpipe::Extractors::KeepEverythingExtractor.text(content).slice(0..40)
64
+ => "Toggle Navigation\nCarbon Five\nAbout\nWork\n"
51
65
 
52
66
  ## Development
53
67
 
@@ -6,10 +6,15 @@ require 'boilerpipe/document/text_document'
6
6
  require 'boilerpipe/document/text_block'
7
7
 
8
8
  require 'boilerpipe/extractors/article_extractor'
9
+ require 'boilerpipe/extractors/canola_extractor'
9
10
  require 'boilerpipe/extractors/default_extractor'
11
+ require 'boilerpipe/extractors/keep_everything_extractor'
12
+ require 'boilerpipe/extractors/largest_content_extractor'
13
+ require 'boilerpipe/extractors/num_words_rules_extractor'
10
14
 
11
15
  require 'boilerpipe/filters/block_proximity_fusion'
12
16
  require 'boilerpipe/filters/boilerplate_block_filter'
17
+ require 'boilerpipe/filters/canola_classifier'
13
18
  require 'boilerpipe/filters/density_rules_classifier'
14
19
  require 'boilerpipe/filters/document_title_match_classifier'
15
20
  require 'boilerpipe/filters/expand_title_to_content_filter'
@@ -18,6 +23,7 @@ require 'boilerpipe/filters/ignore_blocks_after_content_filter'
18
23
  require 'boilerpipe/filters/keep_largest_block_filter'
19
24
  require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
20
25
  require 'boilerpipe/filters/list_at_end_filter'
26
+ require 'boilerpipe/filters/mark_everything_content_filter'
21
27
  require 'boilerpipe/filters/num_words_rules_classifier'
22
28
  require 'boilerpipe/filters/simple_block_fusion_processor'
23
29
  require 'boilerpipe/filters/terminating_blocks_finder'
@@ -0,0 +1,16 @@
1
+ module Boilerpipe::Extractors
2
+ class CanolaExtractor
3
+
4
+ def self.text(contents)
5
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
+ ::Boilerpipe::Extractors::CanolaExtractor.process doc
7
+ doc.content
8
+ end
9
+
10
+ def self.process(doc)
11
+ ::Boilerpipe::Filters::CanolaClassifier.process doc
12
+
13
+ doc
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,16 @@
1
+ # Marks all blocks as content.
2
+
3
+ module Boilerpipe::Extractors
4
+ class KeepEverythingExtractor
5
+ def self.text(contents)
6
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
7
+ ::Boilerpipe::Extractors::KeepEverythingExtractor.process doc
8
+ doc.content
9
+ end
10
+
11
+ def self.process(doc)
12
+ ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
13
+ doc
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,18 @@
1
+ module Boilerpipe::Extractors
2
+ class LargestContentExtractor
3
+ def self.text(contents)
4
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
5
+ ::Boilerpipe::Extractors::LargestContentExtractor.process doc
6
+ doc.content
7
+ end
8
+
9
+ def self.process(doc)
10
+ filters = ::Boilerpipe::Filters
11
+ filters::NumWordsRulesClassifier.process doc
12
+ filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
13
+ filters::KeepLargestBlockFilter::INSTANCE.process doc
14
+
15
+ doc
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,15 @@
1
+ module Boilerpipe::Extractors
2
+ class NumWordsRulesExtractor
3
+
4
+ def self.text(contents)
5
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
+ ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
7
+ doc.content
8
+ end
9
+
10
+ def self.process(doc)
11
+ ::Boilerpipe::Filters::NumWordsRulesClassifier.process doc
12
+ doc
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,28 @@
1
+ # A full-text extractor trained on http://krdwrd.org/
2
+ # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
+ # Works well with SimpleEstimator, too.
4
+
5
+ module Boilerpipe::Filters
6
+ class CanolaClassifier
7
+
8
+ def self.process(doc)
9
+ return doc if doc.text_blocks.size < 1
10
+
11
+ empty = Boilerpipe::Document::TextBlock.empty_start
12
+ text_blocks = [empty] + doc.text_blocks + [empty]
13
+
14
+ text_blocks.each_cons(3) do |slice|
15
+ prev, current, nxt = *slice
16
+ current.content = classify(prev, current, nxt)
17
+ end
18
+
19
+ doc
20
+ end
21
+
22
+ def self.classify(prev, current, nxt)
23
+ current.link_density > 0 && nxt.num_words > 11 \
24
+ || current.num_words > 19 \
25
+ || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,14 @@
1
+ # Marks all blocks as content.
2
+
3
+ module Boilerpipe::Filters
4
+ class MarkEverythingContentFilter
5
+
6
+ def self.process(doc)
7
+ doc.text_blocks.each do |tb|
8
+ tb.content = true if tb.is_not_content?
9
+ end
10
+ doc
11
+ end
12
+
13
+ end
14
+ end
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.2.0'
2
+ VERSION = '0.3.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-11 00:00:00.000000000 Z
11
+ date: 2017-09-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -103,9 +103,14 @@ files:
103
103
  - lib/boilerpipe/document/text_document.rb
104
104
  - lib/boilerpipe/errors.rb
105
105
  - lib/boilerpipe/extractors/article_extractor.rb
106
+ - lib/boilerpipe/extractors/canola_extractor.rb
106
107
  - lib/boilerpipe/extractors/default_extractor.rb
108
+ - lib/boilerpipe/extractors/keep_everything_extractor.rb
109
+ - lib/boilerpipe/extractors/largest_content_extractor.rb
110
+ - lib/boilerpipe/extractors/num_words_rules_extractor.rb
107
111
  - lib/boilerpipe/filters/block_proximity_fusion.rb
108
112
  - lib/boilerpipe/filters/boilerplate_block_filter.rb
113
+ - lib/boilerpipe/filters/canola_classifier.rb
109
114
  - lib/boilerpipe/filters/density_rules_classifier.rb
110
115
  - lib/boilerpipe/filters/document_title_match_classifier.rb
111
116
  - lib/boilerpipe/filters/expand_title_to_content_filter.rb
@@ -114,6 +119,7 @@ files:
114
119
  - lib/boilerpipe/filters/keep_largest_block_filter.rb
115
120
  - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
116
121
  - lib/boilerpipe/filters/list_at_end_filter.rb
122
+ - lib/boilerpipe/filters/mark_everything_content_filter.rb
117
123
  - lib/boilerpipe/filters/num_words_rules_classifier.rb
118
124
  - lib/boilerpipe/filters/simple_block_fusion_processor.rb
119
125
  - lib/boilerpipe/filters/terminating_blocks_finder.rb