boilerpipe-ruby 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +57 -0
- data/.gitignore +0 -1
- data/CHANGELOG.md +13 -0
- data/LICENSE.txt +33 -0
- data/README.md +5 -1
- data/boilerpipe-ruby.gemspec +3 -1
- data/lib/boilerpipe.rb +9 -0
- data/lib/boilerpipe/document/text_block.rb +1 -1
- data/lib/boilerpipe/extractors/article_extractor.rb +2 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +24 -0
- data/lib/boilerpipe/filters/density_rules_classifier.rb +39 -0
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +24 -0
- data/lib/boilerpipe/version.rb +1 -1
- metadata +25 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
|
4
|
+
data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
|
7
|
+
data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# Ruby CircleCI 2.0 configuration file
|
2
|
+
#
|
3
|
+
# Check https://circleci.com/docs/2.0/language-ruby/ for more details
|
4
|
+
#
|
5
|
+
version: 2
|
6
|
+
jobs:
|
7
|
+
build:
|
8
|
+
docker:
|
9
|
+
# specify the version you desire here
|
10
|
+
- image: circleci/ruby:2.4.1-node-browsers
|
11
|
+
|
12
|
+
# Specify service dependencies here if necessary
|
13
|
+
# CircleCI maintains a library of pre-built images
|
14
|
+
# documented at https://circleci.com/docs/2.0/circleci-images/
|
15
|
+
# - image: circleci/postgres:9.4
|
16
|
+
|
17
|
+
working_directory: ~/repo
|
18
|
+
|
19
|
+
steps:
|
20
|
+
- checkout
|
21
|
+
|
22
|
+
# Download and cache dependencies
|
23
|
+
- restore_cache:
|
24
|
+
keys:
|
25
|
+
- v1-dependencies-{{ checksum "Gemfile.lock" }}
|
26
|
+
# fallback to using the latest cache if no exact match is found
|
27
|
+
- v1-dependencies-
|
28
|
+
|
29
|
+
- run:
|
30
|
+
name: install dependencies
|
31
|
+
command: |
|
32
|
+
bundle install --jobs=4 --retry=3 --path vendor/bundle
|
33
|
+
|
34
|
+
- save_cache:
|
35
|
+
paths:
|
36
|
+
- ./vendor/bundle
|
37
|
+
key: v1-dependencies-{{ checksum "Gemfile.lock" }}
|
38
|
+
|
39
|
+
# Database setup
|
40
|
+
#- run: bundle exec rake db:create
|
41
|
+
#- run: bundle exec rake db:schema:load
|
42
|
+
|
43
|
+
# run tests!
|
44
|
+
- run:
|
45
|
+
name: run tests
|
46
|
+
command: |
|
47
|
+
mkdir /tmp/test-results
|
48
|
+
TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
|
49
|
+
|
50
|
+
rspec --format progress "spec"
|
51
|
+
|
52
|
+
# collect reports
|
53
|
+
- store_test_results:
|
54
|
+
path: /tmp/test-results
|
55
|
+
- store_artifacts:
|
56
|
+
path: /tmp/test-results
|
57
|
+
destination: test-results
|
data/.gitignore
CHANGED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# 0.2.0 / 2017-09-11
|
2
|
+
|
3
|
+
* Add Default Extractor
|
4
|
+
* Tweak dependency to use Nokogiri 1.6.6.2 or newer
|
5
|
+
* Add Apache 2.0 license to reflect original work by Christian Kohlschütter
|
6
|
+
|
7
|
+
# 0.1.1 / 2017-09-11
|
8
|
+
|
9
|
+
* bugfix new line character escaping bug
|
10
|
+
|
11
|
+
# 0.1.0 / 2017-09-08
|
12
|
+
|
13
|
+
* Add Article Extractor
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
boilerpipe
|
2
|
+
|
3
|
+
Copyright (c) 2009, 2014 Christian Kohlschütter
|
4
|
+
|
5
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
6
|
+
(the "License"); you may not use this file except in compliance with
|
7
|
+
the License. You may obtain a copy of the License at
|
8
|
+
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
See the License for the specific language governing permissions and
|
15
|
+
limitations under the License.
|
16
|
+
|
17
|
+
================================================================================
|
18
|
+
|
19
|
+
Reimplementation of the original Java-based boilerpipe library
|
20
|
+
|
21
|
+
Modifications Copyright (c) 2017 Gregory Ostermayr
|
22
|
+
|
23
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
24
|
+
(the "License"); you may not use this file except in compliance with
|
25
|
+
the License. You may obtain a copy of the License at
|
26
|
+
|
27
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
28
|
+
|
29
|
+
Unless required by applicable law or agreed to in writing, software
|
30
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
31
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
32
|
+
See the License for the specific language governing permissions and
|
33
|
+
limitations under the License.
|
data/README.md
CHANGED
@@ -14,10 +14,12 @@ I've only got the ArticleExtractor working but the others should be following qu
|
|
14
14
|
|
15
15
|
Presently the following extractors are implemented:
|
16
16
|
* [x] ArticleExtractor
|
17
|
-
* [ ] DefaultExtractor
|
17
|
+
* [x] DefaultExtractor
|
18
18
|
* [ ] LargestContentExtractor
|
19
19
|
* [ ] KeepEverythingExtractor
|
20
20
|
|
21
|
+
[CircleCI build status](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
|
22
|
+
|
21
23
|
## Installation
|
22
24
|
|
23
25
|
Add this line to your application's Gemfile:
|
@@ -44,6 +46,8 @@ Or install it yourself as:
|
|
44
46
|
> content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
|
45
47
|
> output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
|
46
48
|
=> "Always Squash and Rebase your Git Commits"
|
49
|
+
> output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
|
50
|
+
=> "Posted on\nWhat is the squash rebase workf"
|
47
51
|
|
48
52
|
## Development
|
49
53
|
|
data/boilerpipe-ruby.gemspec
CHANGED
@@ -8,6 +8,7 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.version = Boilerpipe::VERSION
|
9
9
|
spec.authors = ['Gregory Ostermayr']
|
10
10
|
spec.email = ['<gregory.ostermayr@gmail.com>']
|
11
|
+
spec.license = 'Apache 2.0'
|
11
12
|
|
12
13
|
spec.summary = %q{A pure ruby implemenation of the boilerpipe algorithm}
|
13
14
|
spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
|
@@ -21,5 +22,6 @@ Gem::Specification.new do |spec|
|
|
21
22
|
spec.add_development_dependency 'bundler', '~> 1.11'
|
22
23
|
spec.add_development_dependency 'rake', '~> 10.0'
|
23
24
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
24
|
-
spec.
|
25
|
+
spec.add_development_dependency 'rickshaw', '~> 0.4.0'
|
26
|
+
spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
|
25
27
|
end
|
data/lib/boilerpipe.rb
CHANGED
@@ -1,10 +1,16 @@
|
|
1
1
|
require 'boilerpipe/version'
|
2
|
+
|
2
3
|
require 'boilerpipe/util/unicode_tokenizer'
|
4
|
+
|
3
5
|
require 'boilerpipe/document/text_document'
|
4
6
|
require 'boilerpipe/document/text_block'
|
7
|
+
|
5
8
|
require 'boilerpipe/extractors/article_extractor'
|
9
|
+
require 'boilerpipe/extractors/default_extractor'
|
10
|
+
|
6
11
|
require 'boilerpipe/filters/block_proximity_fusion'
|
7
12
|
require 'boilerpipe/filters/boilerplate_block_filter'
|
13
|
+
require 'boilerpipe/filters/density_rules_classifier'
|
8
14
|
require 'boilerpipe/filters/document_title_match_classifier'
|
9
15
|
require 'boilerpipe/filters/expand_title_to_content_filter'
|
10
16
|
require 'boilerpipe/filters/heuristic_filter_base'
|
@@ -13,10 +19,13 @@ require 'boilerpipe/filters/keep_largest_block_filter'
|
|
13
19
|
require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
|
14
20
|
require 'boilerpipe/filters/list_at_end_filter'
|
15
21
|
require 'boilerpipe/filters/num_words_rules_classifier'
|
22
|
+
require 'boilerpipe/filters/simple_block_fusion_processor'
|
16
23
|
require 'boilerpipe/filters/terminating_blocks_finder'
|
17
24
|
require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
25
|
+
|
18
26
|
require 'boilerpipe/labels/default'
|
19
27
|
require 'boilerpipe/labels/label_action'
|
28
|
+
|
20
29
|
require 'boilerpipe/sax/html_content_handler'
|
21
30
|
require 'boilerpipe/sax/boilerpipe_html_parser'
|
22
31
|
require 'boilerpipe/sax/tag_action_map'
|
@@ -12,7 +12,7 @@ module Boilerpipe
|
|
12
12
|
|
13
13
|
attr_accessor :content
|
14
14
|
|
15
|
-
def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=
|
15
|
+
def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
|
16
16
|
@labels = Set.new
|
17
17
|
@text = text
|
18
18
|
@num_words = num_words
|
@@ -3,6 +3,7 @@ module Boilerpipe::Extractors
|
|
3
3
|
def self.text(contents)
|
4
4
|
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
|
5
5
|
::Boilerpipe::Extractors::ArticleExtractor.process(doc)
|
6
|
+
doc.content
|
6
7
|
end
|
7
8
|
|
8
9
|
def self.process(doc)
|
@@ -46,7 +47,7 @@ module Boilerpipe::Extractors
|
|
46
47
|
# Marks nested list-item blocks after the end of the main content as content.
|
47
48
|
filters::ListAtEndFilter.process doc
|
48
49
|
|
49
|
-
doc
|
50
|
+
doc
|
50
51
|
end
|
51
52
|
end
|
52
53
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Boilerpipe::Extractors
|
2
|
+
class DefaultExtractor
|
3
|
+
|
4
|
+
def self.text(contents)
|
5
|
+
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
|
6
|
+
::Boilerpipe::Extractors::DefaultExtractor.process doc
|
7
|
+
doc.content
|
8
|
+
end
|
9
|
+
|
10
|
+
def self.process(doc)
|
11
|
+
filters = ::Boilerpipe::Filters
|
12
|
+
# merge adjacent blocks with equal text_density
|
13
|
+
filters::SimpleBlockFusionProcessor.process doc
|
14
|
+
|
15
|
+
# merge text blocks next to each other
|
16
|
+
filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
|
17
|
+
|
18
|
+
# marks text blocks as content / non-content using boilerpipe alg
|
19
|
+
filters::DensityRulesClassifier.process doc
|
20
|
+
|
21
|
+
doc
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Classifies TextBlocks as content/not-content through rules that have been determined
|
2
|
+
# using the C4.8 machine learning algorithm, as described in the paper
|
3
|
+
# "Boilerplate Detection using Shallow Text Features", particularly using text densities and link
|
4
|
+
# densities.
|
5
|
+
|
6
|
+
module Boilerpipe::Filters
|
7
|
+
class DensityRulesClassifier
|
8
|
+
|
9
|
+
def self.process(doc)
|
10
|
+
#return doc if doc.text_blocks.size < 2
|
11
|
+
|
12
|
+
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
|
+
text_blocks = [empty] + doc.text_blocks + [empty]
|
14
|
+
|
15
|
+
text_blocks.each_cons(3) do |slice|
|
16
|
+
prev, current, nxt = *slice
|
17
|
+
current.content = classify(prev, current, nxt)
|
18
|
+
end
|
19
|
+
|
20
|
+
doc
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.classify(prev, current, nxt)
|
24
|
+
return false if current.link_density > 0.333333
|
25
|
+
|
26
|
+
if prev.link_density <= 0.555556
|
27
|
+
if current.text_density <= 9
|
28
|
+
return true if nxt.text_density > 10
|
29
|
+
return prev.text_density <= 4 ? false : true
|
30
|
+
else
|
31
|
+
return nxt.text_density == 0 ? false : true
|
32
|
+
end
|
33
|
+
else
|
34
|
+
return false if nxt.text_density <= 11
|
35
|
+
true
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
|
+
|
3
|
+
module Boilerpipe::Filters
|
4
|
+
class SimpleBlockFusionProcessor
|
5
|
+
def self.process(doc)
|
6
|
+
tbs = doc.text_blocks
|
7
|
+
return doc if tbs.size < 2
|
8
|
+
|
9
|
+
blocks_to_remove = []
|
10
|
+
tb1 = tbs.first
|
11
|
+
tbs.drop(1).each do |tb|
|
12
|
+
if tb1.text_density == tb.text_density
|
13
|
+
tb1.merge_next(tb)
|
14
|
+
blocks_to_remove << tb
|
15
|
+
else
|
16
|
+
tb1 = tb
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
doc.replace_text_blocks!( tbs - blocks_to_remove )
|
21
|
+
doc
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
@@ -52,18 +52,32 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rickshaw
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.4.0
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 0.4.0
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: nokogiri
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
|
-
- -
|
73
|
+
- - ">="
|
60
74
|
- !ruby/object:Gem::Version
|
61
75
|
version: 1.6.6.2
|
62
76
|
type: :runtime
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- -
|
80
|
+
- - ">="
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: 1.6.6.2
|
69
83
|
description: A pure ruby implementation of the boilerpipe algorithm
|
@@ -73,9 +87,12 @@ executables: []
|
|
73
87
|
extensions: []
|
74
88
|
extra_rdoc_files: []
|
75
89
|
files:
|
90
|
+
- ".circleci/config.yml"
|
76
91
|
- ".gitignore"
|
77
92
|
- ".rspec"
|
93
|
+
- CHANGELOG.md
|
78
94
|
- Gemfile
|
95
|
+
- LICENSE.txt
|
79
96
|
- README.md
|
80
97
|
- Rakefile
|
81
98
|
- bin/console
|
@@ -86,8 +103,10 @@ files:
|
|
86
103
|
- lib/boilerpipe/document/text_document.rb
|
87
104
|
- lib/boilerpipe/errors.rb
|
88
105
|
- lib/boilerpipe/extractors/article_extractor.rb
|
106
|
+
- lib/boilerpipe/extractors/default_extractor.rb
|
89
107
|
- lib/boilerpipe/filters/block_proximity_fusion.rb
|
90
108
|
- lib/boilerpipe/filters/boilerplate_block_filter.rb
|
109
|
+
- lib/boilerpipe/filters/density_rules_classifier.rb
|
91
110
|
- lib/boilerpipe/filters/document_title_match_classifier.rb
|
92
111
|
- lib/boilerpipe/filters/expand_title_to_content_filter.rb
|
93
112
|
- lib/boilerpipe/filters/heuristic_filter_base.rb
|
@@ -96,6 +115,7 @@ files:
|
|
96
115
|
- lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
|
97
116
|
- lib/boilerpipe/filters/list_at_end_filter.rb
|
98
117
|
- lib/boilerpipe/filters/num_words_rules_classifier.rb
|
118
|
+
- lib/boilerpipe/filters/simple_block_fusion_processor.rb
|
99
119
|
- lib/boilerpipe/filters/terminating_blocks_finder.rb
|
100
120
|
- lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
|
101
121
|
- lib/boilerpipe/labels/default.rb
|
@@ -117,7 +137,8 @@ files:
|
|
117
137
|
- lib/boilerpipe/version.rb
|
118
138
|
- stuff.txt
|
119
139
|
homepage: https://github.com/gregors/boilerpipe-ruby
|
120
|
-
licenses:
|
140
|
+
licenses:
|
141
|
+
- Apache 2.0
|
121
142
|
metadata: {}
|
122
143
|
post_install_message:
|
123
144
|
rdoc_options: []
|