boilerpipe-ruby 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8b4fe90788ec566674b1a076d7e86c0467d1b805
4
- data.tar.gz: 762c1e5e23730ddc0fe4b3d4096c92939b715dc7
3
+ metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
4
+ data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
5
5
  SHA512:
6
- metadata.gz: f185331100d3533326800cf2ee610ea2b962d4ff6971b89a0b05f1443bea490f730f78cd17e04b421f2f8fc4009252fac1ed90476350f9b68879f963111d345e
7
- data.tar.gz: 08313ed988aecb0a69e0479d3605f3dfb922928a605136a5a23d44d3fd5d4073550f035fee7861aa9b8280874faa55ec6fc63393f9643e7ea11150a2fb2b4261
6
+ metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
7
+ data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
@@ -0,0 +1,57 @@
1
+ # Ruby CircleCI 2.0 configuration file
2
+ #
3
+ # Check https://circleci.com/docs/2.0/language-ruby/ for more details
4
+ #
5
+ version: 2
6
+ jobs:
7
+ build:
8
+ docker:
9
+ # specify the version you desire here
10
+ - image: circleci/ruby:2.4.1-node-browsers
11
+
12
+ # Specify service dependencies here if necessary
13
+ # CircleCI maintains a library of pre-built images
14
+ # documented at https://circleci.com/docs/2.0/circleci-images/
15
+ # - image: circleci/postgres:9.4
16
+
17
+ working_directory: ~/repo
18
+
19
+ steps:
20
+ - checkout
21
+
22
+ # Download and cache dependencies
23
+ - restore_cache:
24
+ keys:
25
+ - v1-dependencies-{{ checksum "Gemfile.lock" }}
26
+ # fallback to using the latest cache if no exact match is found
27
+ - v1-dependencies-
28
+
29
+ - run:
30
+ name: install dependencies
31
+ command: |
32
+ bundle install --jobs=4 --retry=3 --path vendor/bundle
33
+
34
+ - save_cache:
35
+ paths:
36
+ - ./vendor/bundle
37
+ key: v1-dependencies-{{ checksum "Gemfile.lock" }}
38
+
39
+ # Database setup
40
+ #- run: bundle exec rake db:create
41
+ #- run: bundle exec rake db:schema:load
42
+
43
+ # run tests!
44
+ - run:
45
+ name: run tests
46
+ command: |
47
+ mkdir /tmp/test-results
48
+ TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
49
+
50
+ rspec --format progress "spec"
51
+
52
+ # collect reports
53
+ - store_test_results:
54
+ path: /tmp/test-results
55
+ - store_artifacts:
56
+ path: /tmp/test-results
57
+ destination: test-results
data/.gitignore CHANGED
@@ -1,6 +1,5 @@
1
1
  /.bundle/
2
2
  /.yardoc
3
- /Gemfile.lock
4
3
  /_yardoc/
5
4
  /coverage/
6
5
  /doc/
@@ -0,0 +1,13 @@
1
+ # 0.2.0 / 2017-09-11
2
+
3
+ * Add Default Extractor
4
+ * Tweak dependency to use Nokogiri 1.6.6.2 or newer
5
+ * Add Apache 2.0 license to reflect original work by Christian Kohlschütter
6
+
7
+ # 0.1.1 / 2017-09-11
8
+
9
+ * Fix newline character escaping bug
10
+
11
+ # 0.1.0 / 2017-09-08
12
+
13
+ * Add Article Extractor
@@ -0,0 +1,33 @@
1
+ boilerpipe
2
+
3
+ Copyright (c) 2009, 2014 Christian Kohlschütter
4
+
5
+ The author licenses this file to You under the Apache License, Version 2.0
6
+ (the "License"); you may not use this file except in compliance with
7
+ the License. You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ ================================================================================
18
+
19
+ Reimplementation of the original Java-based boilerpipe library
20
+
21
+ Modifications Copyright (c) 2017 Gregory Ostermayr
22
+
23
+ The author licenses this file to You under the Apache License, Version 2.0
24
+ (the "License"); you may not use this file except in compliance with
25
+ the License. You may obtain a copy of the License at
26
+
27
+ http://www.apache.org/licenses/LICENSE-2.0
28
+
29
+ Unless required by applicable law or agreed to in writing, software
30
+ distributed under the License is distributed on an "AS IS" BASIS,
31
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32
+ See the License for the specific language governing permissions and
33
+ limitations under the License.
data/README.md CHANGED
@@ -14,10 +14,12 @@ I've only got the ArticleExtractor working but the others should be following qu
14
14
 
15
15
  Presently the following Extractors are implemented
16
16
  * [x] ArticleExtractor
17
- * [ ] DefaultExtractor
17
+ * [x] DefaultExtractor
18
18
  * [ ] LargestContentExtractor
19
19
  * [ ] KeepEverythingExtractor
20
20
 
21
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
22
+
21
23
  ## Installation
22
24
 
23
25
  Add this line to your application's Gemfile:
@@ -44,6 +46,8 @@ Or install it yourself as:
44
46
  > content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
45
47
  > output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
46
48
  => "Always Squash and Rebase your Git Commits"
49
+ > output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
50
+ => "Posted on\nWhat is the squash rebase workf"
47
51
 
48
52
  ## Development
49
53
 
@@ -8,6 +8,7 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Boilerpipe::VERSION
9
9
  spec.authors = ['Gregory Ostermayr']
10
10
  spec.email = ['<gregory.ostermayr@gmail.com>']
11
+ spec.license = 'Apache 2.0'
11
12
 
12
13
  spec.summary = %q{A pure ruby implementation of the boilerpipe algorithm}
13
14
  spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
@@ -21,5 +22,6 @@ Gem::Specification.new do |spec|
21
22
  spec.add_development_dependency 'bundler', '~> 1.11'
22
23
  spec.add_development_dependency 'rake', '~> 10.0'
23
24
  spec.add_development_dependency 'rspec', '~> 3.0'
24
- spec.add_runtime_dependency 'nokogiri', '1.6.6.2'
25
+ spec.add_development_dependency 'rickshaw', '~> 0.4.0'
26
+ spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
25
27
  end
@@ -1,10 +1,16 @@
1
1
  require 'boilerpipe/version'
2
+
2
3
  require 'boilerpipe/util/unicode_tokenizer'
4
+
3
5
  require 'boilerpipe/document/text_document'
4
6
  require 'boilerpipe/document/text_block'
7
+
5
8
  require 'boilerpipe/extractors/article_extractor'
9
+ require 'boilerpipe/extractors/default_extractor'
10
+
6
11
  require 'boilerpipe/filters/block_proximity_fusion'
7
12
  require 'boilerpipe/filters/boilerplate_block_filter'
13
+ require 'boilerpipe/filters/density_rules_classifier'
8
14
  require 'boilerpipe/filters/document_title_match_classifier'
9
15
  require 'boilerpipe/filters/expand_title_to_content_filter'
10
16
  require 'boilerpipe/filters/heuristic_filter_base'
@@ -13,10 +19,13 @@ require 'boilerpipe/filters/keep_largest_block_filter'
13
19
  require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
14
20
  require 'boilerpipe/filters/list_at_end_filter'
15
21
  require 'boilerpipe/filters/num_words_rules_classifier'
22
+ require 'boilerpipe/filters/simple_block_fusion_processor'
16
23
  require 'boilerpipe/filters/terminating_blocks_finder'
17
24
  require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
25
+
18
26
  require 'boilerpipe/labels/default'
19
27
  require 'boilerpipe/labels/label_action'
28
+
20
29
  require 'boilerpipe/sax/html_content_handler'
21
30
  require 'boilerpipe/sax/boilerpipe_html_parser'
22
31
  require 'boilerpipe/sax/tag_action_map'
@@ -12,7 +12,7 @@ module Boilerpipe
12
12
 
13
13
  attr_accessor :content
14
14
 
15
- def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=0, offset_blocks=0)
15
+ def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
16
16
  @labels = Set.new
17
17
  @text = text
18
18
  @num_words = num_words
@@ -3,6 +3,7 @@ module Boilerpipe::Extractors
3
3
  def self.text(contents)
4
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
5
5
  ::Boilerpipe::Extractors::ArticleExtractor.process(doc)
6
+ doc.content
6
7
  end
7
8
 
8
9
  def self.process(doc)
@@ -46,7 +47,7 @@ module Boilerpipe::Extractors
46
47
  # Marks nested list-item blocks after the end of the main content as content.
47
48
  filters::ListAtEndFilter.process doc
48
49
 
49
- doc.content
50
+ doc
50
51
  end
51
52
  end
52
53
  end
@@ -0,0 +1,24 @@
1
+ module Boilerpipe::Extractors
2
+ class DefaultExtractor
3
+
4
+ def self.text(contents)
5
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
+ ::Boilerpipe::Extractors::DefaultExtractor.process doc
7
+ doc.content
8
+ end
9
+
10
+ def self.process(doc)
11
+ filters = ::Boilerpipe::Filters
12
+ # merge adjacent blocks with equal text_density
13
+ filters::SimpleBlockFusionProcessor.process doc
14
+
15
+ # merge text blocks next to each other
16
+ filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
17
+
18
+ # marks text blocks as content / non-content using the boilerpipe algorithm
19
+ filters::DensityRulesClassifier.process doc
20
+
21
+ doc
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,39 @@
1
+ # Classifies TextBlocks as content/not-content through rules that have been determined
2
+ # using the C4.8 machine learning algorithm, as described in the paper
3
+ # "Boilerplate Detection using Shallow Text Features", particularly using text densities and link
4
+ # densities.
5
+
6
+ module Boilerpipe::Filters
7
+ class DensityRulesClassifier
8
+
9
+ def self.process(doc)
10
+ #return doc if doc.text_blocks.size < 2
11
+
12
+ empty = Boilerpipe::Document::TextBlock.empty_start
13
+ text_blocks = [empty] + doc.text_blocks + [empty]
14
+
15
+ text_blocks.each_cons(3) do |slice|
16
+ prev, current, nxt = *slice
17
+ current.content = classify(prev, current, nxt)
18
+ end
19
+
20
+ doc
21
+ end
22
+
23
+ def self.classify(prev, current, nxt)
24
+ return false if current.link_density > 0.333333
25
+
26
+ if prev.link_density <= 0.555556
27
+ if current.text_density <= 9
28
+ return true if nxt.text_density > 10
29
+ return prev.text_density <= 4 ? false : true
30
+ else
31
+ return nxt.text_density == 0 ? false : true
32
+ end
33
+ else
34
+ return false if nxt.text_density <= 11
35
+ true
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,24 @@
1
+ # Merges two subsequent blocks if their text densities are equal.
2
+
3
+ module Boilerpipe::Filters
4
+ class SimpleBlockFusionProcessor
5
+ def self.process(doc)
6
+ tbs = doc.text_blocks
7
+ return doc if tbs.size < 2
8
+
9
+ blocks_to_remove = []
10
+ tb1 = tbs.first
11
+ tbs.drop(1).each do |tb|
12
+ if tb1.text_density == tb.text_density
13
+ tb1.merge_next(tb)
14
+ blocks_to_remove << tb
15
+ else
16
+ tb1 = tb
17
+ end
18
+ end
19
+
20
+ doc.replace_text_blocks!( tbs - blocks_to_remove )
21
+ doc
22
+ end
23
+ end
24
+ end
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.1.1'
2
+ VERSION = '0.2.0'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
@@ -52,18 +52,32 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rickshaw
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.4.0
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.4.0
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: nokogiri
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
- - - '='
73
+ - - ">="
60
74
  - !ruby/object:Gem::Version
61
75
  version: 1.6.6.2
62
76
  type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - '='
80
+ - - ">="
67
81
  - !ruby/object:Gem::Version
68
82
  version: 1.6.6.2
69
83
  description: A pure ruby implementation of the boilerpipe algorithm
@@ -73,9 +87,12 @@ executables: []
73
87
  extensions: []
74
88
  extra_rdoc_files: []
75
89
  files:
90
+ - ".circleci/config.yml"
76
91
  - ".gitignore"
77
92
  - ".rspec"
93
+ - CHANGELOG.md
78
94
  - Gemfile
95
+ - LICENSE.txt
79
96
  - README.md
80
97
  - Rakefile
81
98
  - bin/console
@@ -86,8 +103,10 @@ files:
86
103
  - lib/boilerpipe/document/text_document.rb
87
104
  - lib/boilerpipe/errors.rb
88
105
  - lib/boilerpipe/extractors/article_extractor.rb
106
+ - lib/boilerpipe/extractors/default_extractor.rb
89
107
  - lib/boilerpipe/filters/block_proximity_fusion.rb
90
108
  - lib/boilerpipe/filters/boilerplate_block_filter.rb
109
+ - lib/boilerpipe/filters/density_rules_classifier.rb
91
110
  - lib/boilerpipe/filters/document_title_match_classifier.rb
92
111
  - lib/boilerpipe/filters/expand_title_to_content_filter.rb
93
112
  - lib/boilerpipe/filters/heuristic_filter_base.rb
@@ -96,6 +115,7 @@ files:
96
115
  - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
97
116
  - lib/boilerpipe/filters/list_at_end_filter.rb
98
117
  - lib/boilerpipe/filters/num_words_rules_classifier.rb
118
+ - lib/boilerpipe/filters/simple_block_fusion_processor.rb
99
119
  - lib/boilerpipe/filters/terminating_blocks_finder.rb
100
120
  - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
101
121
  - lib/boilerpipe/labels/default.rb
@@ -117,7 +137,8 @@ files:
117
137
  - lib/boilerpipe/version.rb
118
138
  - stuff.txt
119
139
  homepage: https://github.com/gregors/boilerpipe-ruby
120
- licenses: []
140
+ licenses:
141
+ - Apache 2.0
121
142
  metadata: {}
122
143
  post_install_message:
123
144
  rdoc_options: []