boilerpipe-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8b4fe90788ec566674b1a076d7e86c0467d1b805
4
- data.tar.gz: 762c1e5e23730ddc0fe4b3d4096c92939b715dc7
3
+ metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
4
+ data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
5
5
  SHA512:
6
- metadata.gz: f185331100d3533326800cf2ee610ea2b962d4ff6971b89a0b05f1443bea490f730f78cd17e04b421f2f8fc4009252fac1ed90476350f9b68879f963111d345e
7
- data.tar.gz: 08313ed988aecb0a69e0479d3605f3dfb922928a605136a5a23d44d3fd5d4073550f035fee7861aa9b8280874faa55ec6fc63393f9643e7ea11150a2fb2b4261
6
+ metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
7
+ data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
@@ -0,0 +1,57 @@
1
+ # Ruby CircleCI 2.0 configuration file
2
+ #
3
+ # Check https://circleci.com/docs/2.0/language-ruby/ for more details
4
+ #
5
+ version: 2
6
+ jobs:
7
+ build:
8
+ docker:
9
+ # specify the version you desire here
10
+ - image: circleci/ruby:2.4.1-node-browsers
11
+
12
+ # Specify service dependencies here if necessary
13
+ # CircleCI maintains a library of pre-built images
14
+ # documented at https://circleci.com/docs/2.0/circleci-images/
15
+ # - image: circleci/postgres:9.4
16
+
17
+ working_directory: ~/repo
18
+
19
+ steps:
20
+ - checkout
21
+
22
+ # Download and cache dependencies
23
+ - restore_cache:
24
+ keys:
25
+ - v1-dependencies-{{ checksum "Gemfile.lock" }}
26
+ # fallback to using the latest cache if no exact match is found
27
+ - v1-dependencies-
28
+
29
+ - run:
30
+ name: install dependencies
31
+ command: |
32
+ bundle install --jobs=4 --retry=3 --path vendor/bundle
33
+
34
+ - save_cache:
35
+ paths:
36
+ - ./vendor/bundle
37
+ key: v1-dependencies-{{ checksum "Gemfile.lock" }}
38
+
39
+ # Database setup
40
+ #- run: bundle exec rake db:create
41
+ #- run: bundle exec rake db:schema:load
42
+
43
+ # run tests!
44
+ - run:
45
+ name: run tests
46
+ command: |
47
+ mkdir /tmp/test-results
48
+ TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
49
+
50
+ rspec --format progress "spec"
51
+
52
+ # collect reports
53
+ - store_test_results:
54
+ path: /tmp/test-results
55
+ - store_artifacts:
56
+ path: /tmp/test-results
57
+ destination: test-results
data/.gitignore CHANGED
@@ -1,6 +1,5 @@
1
1
  /.bundle/
2
2
  /.yardoc
3
- /Gemfile.lock
4
3
  /_yardoc/
5
4
  /coverage/
6
5
  /doc/
@@ -0,0 +1,13 @@
1
+ # 0.2.0 / 2017-09-11
2
+
3
+ * Add Default Extractor
4
+ * Tweak dependency to use Nokogiri 1.6.6.2 or newer
5
+ * Add Apache 2.0 license to reflect original work by Christian Kohlschütter
6
+
7
+ # 0.1.1 / 2017-09-11
8
+
9
+ * bugfix new line character escaping bug
10
+
11
+ # 0.1.0 / 2017-09-08
12
+
13
+ * Add Article Extractor
@@ -0,0 +1,33 @@
1
+ boilerpipe
2
+
3
+ Copyright (c) 2009, 2014 Christian Kohlschütter
4
+
5
+ The author licenses this file to You under the Apache License, Version 2.0
6
+ (the "License"); you may not use this file except in compliance with
7
+ the License. You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ ================================================================================
18
+
19
+ Reimplementation of the original Java-based boilerpipe library
20
+
21
+ Modifications Copyright (c) 2017 Gregory Ostermayr
22
+
23
+ The author licenses this file to You under the Apache License, Version 2.0
24
+ (the "License"); you may not use this file except in compliance with
25
+ the License. You may obtain a copy of the License at
26
+
27
+ http://www.apache.org/licenses/LICENSE-2.0
28
+
29
+ Unless required by applicable law or agreed to in writing, software
30
+ distributed under the License is distributed on an "AS IS" BASIS,
31
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32
+ See the License for the specific language governing permissions and
33
+ limitations under the License.
data/README.md CHANGED
@@ -14,10 +14,12 @@ I've only got the ArticleExtractor working but the others should be following qu
14
14
 
15
15
  Presently the following Extractors are implemented
16
16
  * [x] ArticleExtractor
17
- * [ ] DefaultExtractor
17
+ * [x] DefaultExtractor
18
18
  * [ ] LargestContentExtractor
19
19
  * [ ] KeepEverythingExtractor
20
20
 
21
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
22
+
21
23
  ## Installation
22
24
 
23
25
  Add this line to your application's Gemfile:
@@ -44,6 +46,8 @@ Or install it yourself as:
44
46
  > content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
45
47
  > output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
46
48
  => "Always Squash and Rebase your Git Commits"
49
+ > output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
50
+ => "Posted on\nWhat is the squash rebase workf"
47
51
 
48
52
  ## Development
49
53
 
@@ -8,6 +8,7 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Boilerpipe::VERSION
9
9
  spec.authors = ['Gregory Ostermayr']
10
10
  spec.email = ['<gregory.ostermayr@gmail.com>']
11
+ spec.license = 'Apache 2.0'
11
12
 
12
13
  spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
13
14
  spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
@@ -21,5 +22,6 @@ Gem::Specification.new do |spec|
21
22
  spec.add_development_dependency 'bundler', '~> 1.11'
22
23
  spec.add_development_dependency 'rake', '~> 10.0'
23
24
  spec.add_development_dependency 'rspec', '~> 3.0'
24
- spec.add_runtime_dependency 'nokogiri', '1.6.6.2'
25
+ spec.add_development_dependency 'rickshaw', '~> 0.4.0'
26
+ spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
25
27
  end
@@ -1,10 +1,16 @@
1
1
  require 'boilerpipe/version'
2
+
2
3
  require 'boilerpipe/util/unicode_tokenizer'
4
+
3
5
  require 'boilerpipe/document/text_document'
4
6
  require 'boilerpipe/document/text_block'
7
+
5
8
  require 'boilerpipe/extractors/article_extractor'
9
+ require 'boilerpipe/extractors/default_extractor'
10
+
6
11
  require 'boilerpipe/filters/block_proximity_fusion'
7
12
  require 'boilerpipe/filters/boilerplate_block_filter'
13
+ require 'boilerpipe/filters/density_rules_classifier'
8
14
  require 'boilerpipe/filters/document_title_match_classifier'
9
15
  require 'boilerpipe/filters/expand_title_to_content_filter'
10
16
  require 'boilerpipe/filters/heuristic_filter_base'
@@ -13,10 +19,13 @@ require 'boilerpipe/filters/keep_largest_block_filter'
13
19
  require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
14
20
  require 'boilerpipe/filters/list_at_end_filter'
15
21
  require 'boilerpipe/filters/num_words_rules_classifier'
22
+ require 'boilerpipe/filters/simple_block_fusion_processor'
16
23
  require 'boilerpipe/filters/terminating_blocks_finder'
17
24
  require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
25
+
18
26
  require 'boilerpipe/labels/default'
19
27
  require 'boilerpipe/labels/label_action'
28
+
20
29
  require 'boilerpipe/sax/html_content_handler'
21
30
  require 'boilerpipe/sax/boilerpipe_html_parser'
22
31
  require 'boilerpipe/sax/tag_action_map'
@@ -12,7 +12,7 @@ module Boilerpipe
12
12
 
13
13
  attr_accessor :content
14
14
 
15
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=0, offset_blocks=0)
15
+ def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
16
16
  @labels = Set.new
17
17
  @text = text
18
18
  @num_words = num_words
@@ -3,6 +3,7 @@ module Boilerpipe::Extractors
3
3
  def self.text(contents)
4
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
5
5
  ::Boilerpipe::Extractors::ArticleExtractor.process(doc)
6
+ doc.content
6
7
  end
7
8
 
8
9
  def self.process(doc)
@@ -46,7 +47,7 @@ module Boilerpipe::Extractors
46
47
  # Marks nested list-item blocks after the end of the main content as content.
47
48
  filters::ListAtEndFilter.process doc
48
49
 
49
- doc.content
50
+ doc
50
51
  end
51
52
  end
52
53
  end
@@ -0,0 +1,24 @@
1
+ module Boilerpipe::Extractors
2
+ class DefaultExtractor
3
+ # Parses contents with BoilerpipeHTMLParser, runs .process on the result and returns doc.content.
4
+ def self.text(contents)
5
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
+ ::Boilerpipe::Extractors::DefaultExtractor.process doc
7
+ doc.content
8
+ end
9
+ # Runs the three filters below on doc, in this order, and returns the same doc object.
10
+ def self.process(doc)
11
+ filters = ::Boilerpipe::Filters
12
+ # merge adjacent blocks whose text_density values are equal
13
+ filters::SimpleBlockFusionProcessor.process doc
14
+
15
+ # merge text blocks next to each other (the MAX_DISTANCE_1 variant of BlockProximityFusion)
16
+ filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
17
+
18
+ # marks text blocks as content / non-content using the boilerpipe density rules
19
+ filters::DensityRulesClassifier.process doc
20
+
21
+ doc
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,39 @@
1
+ # Classifies TextBlocks as content/not-content through rules that have been determined
2
+ # using the C4.8 machine learning algorithm, as described in the paper
3
+ # "Boilerplate Detection using Shallow Text Features", particularly using text densities and link
4
+ # densities.
5
+
6
+ module Boilerpipe::Filters
7
+ class DensityRulesClassifier
8
+ # Sets content on every text block of doc to the result of classify(prev, current, nxt); returns doc.
9
+ def self.process(doc)
10
+ #return doc if doc.text_blocks.size < 2
11
+ # Pad both ends with the same empty_start block so the first and last real blocks have neighbours.
12
+ empty = Boilerpipe::Document::TextBlock.empty_start
13
+ text_blocks = [empty] + doc.text_blocks + [empty]
14
+ # each_cons(3) visits every real block exactly once as the middle element of a 3-block window.
15
+ text_blocks.each_cons(3) do |slice|
16
+ prev, current, nxt = *slice
17
+ current.content = classify(prev, current, nxt)
18
+ end
19
+
20
+ doc
21
+ end
22
+ # Returns true (content) or false, decided from the link/text densities of current and its two neighbours.
23
+ def self.classify(prev, current, nxt)
24
+ return false if current.link_density > 0.333333
25
+
26
+ if prev.link_density <= 0.555556
27
+ if current.text_density <= 9
28
+ return true if nxt.text_density > 10
29
+ return prev.text_density <= 4 ? false : true
30
+ else
31
+ return nxt.text_density == 0 ? false : true
32
+ end
33
+ else
34
+ return false if nxt.text_density <= 11
35
+ true
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,24 @@
1
+ # Merges consecutive text blocks into the preceding block while their text densities are equal.
2
+
3
+ module Boilerpipe::Filters
4
+ class SimpleBlockFusionProcessor
5
+ def self.process(doc)
6
+ tbs = doc.text_blocks
7
+ return doc if tbs.size < 2
8
+ # tb1 is the block being merged into; it only advances to tb when the two densities differ.
9
+ blocks_to_remove = []
10
+ tb1 = tbs.first
11
+ tbs.drop(1).each do |tb|
12
+ if tb1.text_density == tb.text_density
13
+ tb1.merge_next(tb)
14
+ blocks_to_remove << tb
15
+ else
16
+ tb1 = tb
17
+ end
18
+ end
19
+ # Replace the document's blocks with the list minus every block that was merged into tb1.
20
+ doc.replace_text_blocks!( tbs - blocks_to_remove )
21
+ doc
22
+ end
23
+ end
24
+ end
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
@@ -52,18 +52,32 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rickshaw
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.4.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.4.0
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: nokogiri
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
- - - '='
73
+ - - ">="
60
74
  - !ruby/object:Gem::Version
61
75
  version: 1.6.6.2
62
76
  type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - '='
80
+ - - ">="
67
81
  - !ruby/object:Gem::Version
68
82
  version: 1.6.6.2
69
83
  description: A pure ruby implementation of the boilerpipe algorithm
@@ -73,9 +87,12 @@ executables: []
73
87
  extensions: []
74
88
  extra_rdoc_files: []
75
89
  files:
90
+ - ".circleci/config.yml"
76
91
  - ".gitignore"
77
92
  - ".rspec"
93
+ - CHANGELOG.md
78
94
  - Gemfile
95
+ - LICENSE.txt
79
96
  - README.md
80
97
  - Rakefile
81
98
  - bin/console
@@ -86,8 +103,10 @@ files:
86
103
  - lib/boilerpipe/document/text_document.rb
87
104
  - lib/boilerpipe/errors.rb
88
105
  - lib/boilerpipe/extractors/article_extractor.rb
106
+ - lib/boilerpipe/extractors/default_extractor.rb
89
107
  - lib/boilerpipe/filters/block_proximity_fusion.rb
90
108
  - lib/boilerpipe/filters/boilerplate_block_filter.rb
109
+ - lib/boilerpipe/filters/density_rules_classifier.rb
91
110
  - lib/boilerpipe/filters/document_title_match_classifier.rb
92
111
  - lib/boilerpipe/filters/expand_title_to_content_filter.rb
93
112
  - lib/boilerpipe/filters/heuristic_filter_base.rb
@@ -96,6 +115,7 @@ files:
96
115
  - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
97
116
  - lib/boilerpipe/filters/list_at_end_filter.rb
98
117
  - lib/boilerpipe/filters/num_words_rules_classifier.rb
118
+ - lib/boilerpipe/filters/simple_block_fusion_processor.rb
99
119
  - lib/boilerpipe/filters/terminating_blocks_finder.rb
100
120
  - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
101
121
  - lib/boilerpipe/labels/default.rb
@@ -117,7 +137,8 @@ files:
117
137
  - lib/boilerpipe/version.rb
118
138
  - stuff.txt
119
139
  homepage: https://github.com/gregors/boilerpipe-ruby
120
- licenses: []
140
+ licenses:
141
+ - Apache 2.0
121
142
  metadata: {}
122
143
  post_install_message:
123
144
  rdoc_options: []