boilerpipe-ruby 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.circleci/config.yml +57 -0
- data/.gitignore +0 -1
- data/CHANGELOG.md +13 -0
- data/LICENSE.txt +33 -0
- data/README.md +5 -1
- data/boilerpipe-ruby.gemspec +3 -1
- data/lib/boilerpipe.rb +9 -0
- data/lib/boilerpipe/document/text_block.rb +1 -1
- data/lib/boilerpipe/extractors/article_extractor.rb +2 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +24 -0
- data/lib/boilerpipe/filters/density_rules_classifier.rb +39 -0
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +24 -0
- data/lib/boilerpipe/version.rb +1 -1
- metadata +25 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
|
4
|
+
data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
|
7
|
+
data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# Ruby CircleCI 2.0 configuration file
|
2
|
+
#
|
3
|
+
# Check https://circleci.com/docs/2.0/language-ruby/ for more details
|
4
|
+
#
|
5
|
+
version: 2
|
6
|
+
jobs:
|
7
|
+
build:
|
8
|
+
docker:
|
9
|
+
# specify the version you desire here
|
10
|
+
- image: circleci/ruby:2.4.1-node-browsers
|
11
|
+
|
12
|
+
# Specify service dependencies here if necessary
|
13
|
+
# CircleCI maintains a library of pre-built images
|
14
|
+
# documented at https://circleci.com/docs/2.0/circleci-images/
|
15
|
+
# - image: circleci/postgres:9.4
|
16
|
+
|
17
|
+
working_directory: ~/repo
|
18
|
+
|
19
|
+
steps:
|
20
|
+
- checkout
|
21
|
+
|
22
|
+
# Download and cache dependencies
|
23
|
+
- restore_cache:
|
24
|
+
keys:
|
25
|
+
- v1-dependencies-{{ checksum "Gemfile.lock" }}
|
26
|
+
# fallback to using the latest cache if no exact match is found
|
27
|
+
- v1-dependencies-
|
28
|
+
|
29
|
+
- run:
|
30
|
+
name: install dependencies
|
31
|
+
command: |
|
32
|
+
bundle install --jobs=4 --retry=3 --path vendor/bundle
|
33
|
+
|
34
|
+
- save_cache:
|
35
|
+
paths:
|
36
|
+
- ./vendor/bundle
|
37
|
+
key: v1-dependencies-{{ checksum "Gemfile.lock" }}
|
38
|
+
|
39
|
+
# Database setup
|
40
|
+
#- run: bundle exec rake db:create
|
41
|
+
#- run: bundle exec rake db:schema:load
|
42
|
+
|
43
|
+
# run tests!
|
44
|
+
- run:
|
45
|
+
name: run tests
|
46
|
+
command: |
|
47
|
+
mkdir /tmp/test-results
|
48
|
+
TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
|
49
|
+
|
50
|
+
rspec --format progress "spec"
|
51
|
+
|
52
|
+
# collect reports
|
53
|
+
- store_test_results:
|
54
|
+
path: /tmp/test-results
|
55
|
+
- store_artifacts:
|
56
|
+
path: /tmp/test-results
|
57
|
+
destination: test-results
|
data/.gitignore
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# 0.2.0 / 2017-09-11
|
2
|
+
|
3
|
+
* Add Default Extractor
|
4
|
+
* Tweak dependency to use Nokogiri 1.6.6.2 or newer
|
5
|
+
* Add Apache 2.0 license to reflect original work by Christian Kohlschütter
|
6
|
+
|
7
|
+
# 0.1.1 / 2017-09-11
|
8
|
+
|
9
|
+
* Fix newline character escaping bug
|
10
|
+
|
11
|
+
# 0.1.0 / 2017-09-08
|
12
|
+
|
13
|
+
* Add Article Extractor
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
boilerpipe
|
2
|
+
|
3
|
+
Copyright (c) 2009, 2014 Christian Kohlschütter
|
4
|
+
|
5
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
6
|
+
(the "License"); you may not use this file except in compliance with
|
7
|
+
the License. You may obtain a copy of the License at
|
8
|
+
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
See the License for the specific language governing permissions and
|
15
|
+
limitations under the License.
|
16
|
+
|
17
|
+
================================================================================
|
18
|
+
|
19
|
+
Reimplementation of the original Java-based boilerpipe library
|
20
|
+
|
21
|
+
Modifications Copyright (c) 2017 Gregory Ostermayr
|
22
|
+
|
23
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
24
|
+
(the "License"); you may not use this file except in compliance with
|
25
|
+
the License. You may obtain a copy of the License at
|
26
|
+
|
27
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
28
|
+
|
29
|
+
Unless required by applicable law or agreed to in writing, software
|
30
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
31
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
32
|
+
See the License for the specific language governing permissions and
|
33
|
+
limitations under the License.
|
data/README.md
CHANGED
@@ -14,10 +14,12 @@ I've only got the ArticleExtractor working but the others should be following qu
|
|
14
14
|
|
15
15
|
Presently the following Extractors are implemented
|
16
16
|
* [x] ArticleExtractor
|
17
|
-
* [
|
17
|
+
* [x] DefaultExtractor
|
18
18
|
* [ ] LargestContentExtractor
|
19
19
|
* [ ] KeepEverythingExtractor
|
20
20
|
|
21
|
+
[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
|
22
|
+
|
21
23
|
## Installation
|
22
24
|
|
23
25
|
Add this line to your application's Gemfile:
|
@@ -44,6 +46,8 @@ Or install it yourself as:
|
|
44
46
|
> content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
|
45
47
|
> output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
|
46
48
|
=> "Always Squash and Rebase your Git Commits"
|
49
|
+
> output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
|
50
|
+
=> "Posted on\nWhat is the squash rebase workf"
|
47
51
|
|
48
52
|
## Development
|
49
53
|
|
data/boilerpipe-ruby.gemspec
CHANGED
@@ -8,6 +8,7 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Boilerpipe::VERSION
|
9
9
|
spec.authors = ['Gregory Ostermayr']
|
10
10
|
spec.email = ['<gregory.ostermayr@gmail.com>']
|
11
|
+
spec.license = 'Apache 2.0'
|
11
12
|
|
12
13
|
spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
|
13
14
|
spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
|
@@ -21,5 +22,6 @@ Gem::Specification.new do |spec|
|
|
21
22
|
spec.add_development_dependency 'bundler', '~> 1.11'
|
22
23
|
spec.add_development_dependency 'rake', '~> 10.0'
|
23
24
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
24
|
-
spec.
|
25
|
+
spec.add_development_dependency 'rickshaw', '~> 0.4.0'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
|
25
27
|
end
|
data/lib/boilerpipe.rb
CHANGED
@@ -1,10 +1,16 @@
|
|
1
1
|
require 'boilerpipe/version'
|
2
|
+
|
2
3
|
require 'boilerpipe/util/unicode_tokenizer'
|
4
|
+
|
3
5
|
require 'boilerpipe/document/text_document'
|
4
6
|
require 'boilerpipe/document/text_block'
|
7
|
+
|
5
8
|
require 'boilerpipe/extractors/article_extractor'
|
9
|
+
require 'boilerpipe/extractors/default_extractor'
|
10
|
+
|
6
11
|
require 'boilerpipe/filters/block_proximity_fusion'
|
7
12
|
require 'boilerpipe/filters/boilerplate_block_filter'
|
13
|
+
require 'boilerpipe/filters/density_rules_classifier'
|
8
14
|
require 'boilerpipe/filters/document_title_match_classifier'
|
9
15
|
require 'boilerpipe/filters/expand_title_to_content_filter'
|
10
16
|
require 'boilerpipe/filters/heuristic_filter_base'
|
@@ -13,10 +19,13 @@ require 'boilerpipe/filters/keep_largest_block_filter'
|
|
13
19
|
require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
|
14
20
|
require 'boilerpipe/filters/list_at_end_filter'
|
15
21
|
require 'boilerpipe/filters/num_words_rules_classifier'
|
22
|
+
require 'boilerpipe/filters/simple_block_fusion_processor'
|
16
23
|
require 'boilerpipe/filters/terminating_blocks_finder'
|
17
24
|
require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
25
|
+
|
18
26
|
require 'boilerpipe/labels/default'
|
19
27
|
require 'boilerpipe/labels/label_action'
|
28
|
+
|
20
29
|
require 'boilerpipe/sax/html_content_handler'
|
21
30
|
require 'boilerpipe/sax/boilerpipe_html_parser'
|
22
31
|
require 'boilerpipe/sax/tag_action_map'
|
@@ -12,7 +12,7 @@ module Boilerpipe
|
|
12
12
|
|
13
13
|
attr_accessor :content
|
14
14
|
|
15
|
-
def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=
|
15
|
+
def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
|
16
16
|
@labels = Set.new
|
17
17
|
@text = text
|
18
18
|
@num_words = num_words
|
@@ -3,6 +3,7 @@ module Boilerpipe::Extractors
|
|
3
3
|
def self.text(contents)
|
4
4
|
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
|
5
5
|
::Boilerpipe::Extractors::ArticleExtractor.process(doc)
|
6
|
+
doc.content
|
6
7
|
end
|
7
8
|
|
8
9
|
def self.process(doc)
|
@@ -46,7 +47,7 @@ module Boilerpipe::Extractors
|
|
46
47
|
# Marks nested list-item blocks after the end of the main content as content.
|
47
48
|
filters::ListAtEndFilter.process doc
|
48
49
|
|
49
|
-
doc
|
50
|
+
doc
|
50
51
|
end
|
51
52
|
end
|
52
53
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Boilerpipe::Extractors
|
2
|
+
class DefaultExtractor
|
3
|
+
|
4
|
+
def self.text(contents)
|
5
|
+
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
|
6
|
+
::Boilerpipe::Extractors::DefaultExtractor.process doc
|
7
|
+
doc.content
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.process(doc)
|
11
|
+
filters = ::Boilerpipe::Filters
|
12
|
+
# merge adjacent blocks with equal text_density
|
13
|
+
filters::SimpleBlockFusionProcessor.process doc
|
14
|
+
|
15
|
+
# merge text blocks next to each other
|
16
|
+
filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
|
17
|
+
|
18
|
+
# marks text blocks as content / non-content using boilerpipe alg
|
19
|
+
filters::DensityRulesClassifier.process doc
|
20
|
+
|
21
|
+
doc
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Classifies TextBlocks as content/not-content through rules that have been determined
|
2
|
+
# using the C4.8 machine learning algorithm, as described in the paper
|
3
|
+
# "Boilerplate Detection using Shallow Text Features", particularly using text densities and link
|
4
|
+
# densities.
|
5
|
+
|
6
|
+
module Boilerpipe::Filters
|
7
|
+
class DensityRulesClassifier
|
8
|
+
|
9
|
+
def self.process(doc)
|
10
|
+
#return doc if doc.text_blocks.size < 2
|
11
|
+
|
12
|
+
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
|
+
text_blocks = [empty] + doc.text_blocks + [empty]
|
14
|
+
|
15
|
+
text_blocks.each_cons(3) do |slice|
|
16
|
+
prev, current, nxt = *slice
|
17
|
+
current.content = classify(prev, current, nxt)
|
18
|
+
end
|
19
|
+
|
20
|
+
doc
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.classify(prev, current, nxt)
|
24
|
+
return false if current.link_density > 0.333333
|
25
|
+
|
26
|
+
if prev.link_density <= 0.555556
|
27
|
+
if current.text_density <= 9
|
28
|
+
return true if nxt.text_density > 10
|
29
|
+
return prev.text_density <= 4 ? false : true
|
30
|
+
else
|
31
|
+
return nxt.text_density == 0 ? false : true
|
32
|
+
end
|
33
|
+
else
|
34
|
+
return false if nxt.text_density <= 11
|
35
|
+
true
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
|
+
|
3
|
+
module Boilerpipe::Filters
|
4
|
+
class SimpleBlockFusionProcessor
|
5
|
+
def self.process(doc)
|
6
|
+
tbs = doc.text_blocks
|
7
|
+
return doc if tbs.size < 2
|
8
|
+
|
9
|
+
blocks_to_remove = []
|
10
|
+
tb1 = tbs.first
|
11
|
+
tbs.drop(1).each do |tb|
|
12
|
+
if tb1.text_density == tb.text_density
|
13
|
+
tb1.merge_next(tb)
|
14
|
+
blocks_to_remove << tb
|
15
|
+
else
|
16
|
+
tb1 = tb
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
doc.replace_text_blocks!( tbs - blocks_to_remove )
|
21
|
+
doc
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
@@ -52,18 +52,32 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rickshaw
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.4.0
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.4.0
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: nokogiri
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
|
-
- -
|
73
|
+
- - ">="
|
60
74
|
- !ruby/object:Gem::Version
|
61
75
|
version: 1.6.6.2
|
62
76
|
type: :runtime
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- -
|
80
|
+
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: 1.6.6.2
|
69
83
|
description: A pure ruby implementation of the boilerpipe algorithm
|
@@ -73,9 +87,12 @@ executables: []
|
|
73
87
|
extensions: []
|
74
88
|
extra_rdoc_files: []
|
75
89
|
files:
|
90
|
+
- ".circleci/config.yml"
|
76
91
|
- ".gitignore"
|
77
92
|
- ".rspec"
|
93
|
+
- CHANGELOG.md
|
78
94
|
- Gemfile
|
95
|
+
- LICENSE.txt
|
79
96
|
- README.md
|
80
97
|
- Rakefile
|
81
98
|
- bin/console
|
@@ -86,8 +103,10 @@ files:
|
|
86
103
|
- lib/boilerpipe/document/text_document.rb
|
87
104
|
- lib/boilerpipe/errors.rb
|
88
105
|
- lib/boilerpipe/extractors/article_extractor.rb
|
106
|
+
- lib/boilerpipe/extractors/default_extractor.rb
|
89
107
|
- lib/boilerpipe/filters/block_proximity_fusion.rb
|
90
108
|
- lib/boilerpipe/filters/boilerplate_block_filter.rb
|
109
|
+
- lib/boilerpipe/filters/density_rules_classifier.rb
|
91
110
|
- lib/boilerpipe/filters/document_title_match_classifier.rb
|
92
111
|
- lib/boilerpipe/filters/expand_title_to_content_filter.rb
|
93
112
|
- lib/boilerpipe/filters/heuristic_filter_base.rb
|
@@ -96,6 +115,7 @@ files:
|
|
96
115
|
- lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
|
97
116
|
- lib/boilerpipe/filters/list_at_end_filter.rb
|
98
117
|
- lib/boilerpipe/filters/num_words_rules_classifier.rb
|
118
|
+
- lib/boilerpipe/filters/simple_block_fusion_processor.rb
|
99
119
|
- lib/boilerpipe/filters/terminating_blocks_finder.rb
|
100
120
|
- lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
|
101
121
|
- lib/boilerpipe/labels/default.rb
|
@@ -117,7 +137,8 @@ files:
|
|
117
137
|
- lib/boilerpipe/version.rb
|
118
138
|
- stuff.txt
|
119
139
|
homepage: https://github.com/gregors/boilerpipe-ruby
|
120
|
-
licenses:
|
140
|
+
licenses:
|
141
|
+
- Apache 2.0
|
121
142
|
metadata: {}
|
122
143
|
post_install_message:
|
123
144
|
rdoc_options: []
|