boilerpipe-ruby 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +4 -2
- data/lib/boilerpipe.rb +5 -0
- data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +21 -0
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +37 -0
- data/lib/boilerpipe/filters/min_words_filter.rb +16 -0
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +40 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
|
4
|
+
data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
|
7
|
+
data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -16,16 +16,18 @@ Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor -
|
|
16
16
|
|
17
17
|
Presently the follow Extractors are implemented
|
18
18
|
* [x] ArticleExtractor
|
19
|
-
* [ ] ArticleSentenceExtractor
|
19
|
+
* [x] ArticleSentenceExtractor
|
20
20
|
* [x] CanolaExtractor
|
21
21
|
* [x] DefaultExtractor
|
22
22
|
* [x] KeepEverythingExtractor
|
23
|
-
* [ ] KeepEverythingWithMinKWordsExtractor
|
23
|
+
* [x] KeepEverythingWithMinKWordsExtractor
|
24
24
|
* [x] LargestContentExtractor
|
25
25
|
* [x] NumWordsRulesExtractor
|
26
26
|
|
27
27
|
[](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
|
28
28
|
|
29
|
+
[](https://badge.fury.io/rb/boilerpipe-ruby)
|
30
|
+
|
29
31
|
## Installation
|
30
32
|
|
31
33
|
Add this line to your application's Gemfile:
|
data/lib/boilerpipe.rb
CHANGED
@@ -6,9 +6,11 @@ require 'boilerpipe/document/text_document'
|
|
6
6
|
require 'boilerpipe/document/text_block'
|
7
7
|
|
8
8
|
require 'boilerpipe/extractors/article_extractor'
|
9
|
+
require 'boilerpipe/extractors/article_sentence_extractor'
|
9
10
|
require 'boilerpipe/extractors/canola_extractor'
|
10
11
|
require 'boilerpipe/extractors/default_extractor'
|
11
12
|
require 'boilerpipe/extractors/keep_everything_extractor'
|
13
|
+
require 'boilerpipe/extractors/keep_everything_with_k_min_words_extractor'
|
12
14
|
require 'boilerpipe/extractors/largest_content_extractor'
|
13
15
|
require 'boilerpipe/extractors/num_words_rules_extractor'
|
14
16
|
|
@@ -24,8 +26,11 @@ require 'boilerpipe/filters/keep_largest_block_filter'
|
|
24
26
|
require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
|
25
27
|
require 'boilerpipe/filters/list_at_end_filter'
|
26
28
|
require 'boilerpipe/filters/mark_everything_content_filter'
|
29
|
+
require 'boilerpipe/filters/min_clause_words_filter'
|
30
|
+
require 'boilerpipe/filters/min_words_filter'
|
27
31
|
require 'boilerpipe/filters/num_words_rules_classifier'
|
28
32
|
require 'boilerpipe/filters/simple_block_fusion_processor'
|
33
|
+
require 'boilerpipe/filters/split_paragraph_blocks_filter'
|
29
34
|
require 'boilerpipe/filters/terminating_blocks_finder'
|
30
35
|
require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
|
31
36
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# A full-text extractor which is tuned towards extracting sentences from news articles.
|
2
|
+
|
3
|
+
module Boilerpipe::Extractors
|
4
|
+
class ArticleSentenceExtractor
|
5
|
+
def self.text(contents)
|
6
|
+
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
|
7
|
+
::Boilerpipe::Extractors::ArticleSentenceExtractor.process(doc)
|
8
|
+
doc.content
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.process(doc)
|
12
|
+
::Boilerpipe::Extractors::ArticleExtractor.process doc
|
13
|
+
::Boilerpipe::Filters::SplitParagraphBlocksFilter.process doc
|
14
|
+
::Boilerpipe::Filters::MinClauseWordsFilter.process doc
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
# A full-text extractor which extracts the largest text component of a page.
|
3
|
+
# For news articles, it may perform better than the DefaultExtractor, but
|
4
|
+
# usually worse than ArticleExtractor.
|
5
|
+
|
6
|
+
module Boilerpipe::Extractors
|
7
|
+
class KeepEverythingWithKMinWordsExtractor
|
8
|
+
def self.text(min, contents)
|
9
|
+
doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
|
10
|
+
::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process min, doc
|
11
|
+
doc.content
|
12
|
+
end
|
13
|
+
|
14
|
+
def self.process(min, doc)
|
15
|
+
::Boilerpipe::Filters::SimpleBlockFusionProcessor.process doc
|
16
|
+
::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
|
17
|
+
::Boilerpipe::Filters::MinWordsFilter.process min, doc
|
18
|
+
doc
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#
|
2
|
+
# Keeps only blocks that have at least one segment fragment ("clause") with at least k
|
3
|
+
# words (default: 5).
|
4
|
+
#
|
5
|
+
# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
|
6
|
+
#
|
7
|
+
# SplitParagraphBlocksFilter
|
8
|
+
|
9
|
+
module Boilerpipe::Filters
|
10
|
+
class MinClauseWordsFilter
|
11
|
+
|
12
|
+
def self.process(doc, min_words=5)
|
13
|
+
|
14
|
+
doc.text_blocks.each do |tb|
|
15
|
+
next if tb.is_not_content?
|
16
|
+
|
17
|
+
clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
|
18
|
+
tb.text.scan(clause_delimiter).each do |possible_clause|
|
19
|
+
if is_clause? possible_clause
|
20
|
+
break
|
21
|
+
else
|
22
|
+
tb.content = false
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
doc
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.is_clause?(text, min_words=5)
|
31
|
+
return false if text.nil?
|
32
|
+
whitespace = /[ \n\r]+/
|
33
|
+
text.scan(whitespace).size >= min_words
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
|
2
|
+
# Keeps only those content blocks which contain at least k words.
|
3
|
+
|
4
|
+
module Boilerpipe::Filters
|
5
|
+
class MinWordsFilter
|
6
|
+
|
7
|
+
def self.process(min_words, doc)
|
8
|
+
doc.text_blocks.each do |tb|
|
9
|
+
next if tb.is_not_content?
|
10
|
+
tb.content = false if tb.num_words < min_words
|
11
|
+
end
|
12
|
+
doc
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
|
2
|
+
# Splits TextBlocks at paragraph boundaries.
|
3
|
+
#
|
4
|
+
# NOTE: This is not fully supported (i.e., it will break highlighting support via
|
5
|
+
# #getContainedTextElements()), but this one probably is necessary for some other filters.
|
6
|
+
#
|
7
|
+
# see MinClauseWordsFilter
|
8
|
+
|
9
|
+
module Boilerpipe::Filters
|
10
|
+
class SplitParagraphBlocksFilter
|
11
|
+
|
12
|
+
def self.process(doc)
|
13
|
+
tbs = doc.text_blocks
|
14
|
+
new_blocks = []
|
15
|
+
changes = false
|
16
|
+
tbs.each do |tb|
|
17
|
+
paragraphs = tb.text.split(/[\n\r]+/)
|
18
|
+
|
19
|
+
if paragraphs.size < 2
|
20
|
+
new_blocks << tb
|
21
|
+
next
|
22
|
+
end
|
23
|
+
|
24
|
+
is_content = tb.is_content?
|
25
|
+
labels = tb.labels
|
26
|
+
paragraphs.each do |paragraph|
|
27
|
+
tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
|
28
|
+
tbP.content = is_content
|
29
|
+
tbP.add_labels(labels)
|
30
|
+
new_blocks << tbP
|
31
|
+
changes = true
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
doc.replace_text_blocks!(new_blocks) if changes
|
36
|
+
doc
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
end
|
@@ -4,11 +4,11 @@ module Boilerpipe::SAX
|
|
4
4
|
def self.parse(text)
|
5
5
|
|
6
6
|
#script bug - delete script tags
|
7
|
-
text
|
7
|
+
text.gsub!(/\<script>.+?<\/script>/i, '')
|
8
8
|
|
9
9
|
# nokogiri uses libxml for mri and nekohtml for jruby
|
10
10
|
# mri doesn't remove &nbsp when missing the semicolon
|
11
|
-
text
|
11
|
+
text.gsub!(/(&nbsp) /, '\1; ')
|
12
12
|
|
13
13
|
|
14
14
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-09-
|
11
|
+
date: 2017-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -103,9 +103,11 @@ files:
|
|
103
103
|
- lib/boilerpipe/document/text_document.rb
|
104
104
|
- lib/boilerpipe/errors.rb
|
105
105
|
- lib/boilerpipe/extractors/article_extractor.rb
|
106
|
+
- lib/boilerpipe/extractors/article_sentence_extractor.rb
|
106
107
|
- lib/boilerpipe/extractors/canola_extractor.rb
|
107
108
|
- lib/boilerpipe/extractors/default_extractor.rb
|
108
109
|
- lib/boilerpipe/extractors/keep_everything_extractor.rb
|
110
|
+
- lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
|
109
111
|
- lib/boilerpipe/extractors/largest_content_extractor.rb
|
110
112
|
- lib/boilerpipe/extractors/num_words_rules_extractor.rb
|
111
113
|
- lib/boilerpipe/filters/block_proximity_fusion.rb
|
@@ -120,8 +122,11 @@ files:
|
|
120
122
|
- lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
|
121
123
|
- lib/boilerpipe/filters/list_at_end_filter.rb
|
122
124
|
- lib/boilerpipe/filters/mark_everything_content_filter.rb
|
125
|
+
- lib/boilerpipe/filters/min_clause_words_filter.rb
|
126
|
+
- lib/boilerpipe/filters/min_words_filter.rb
|
123
127
|
- lib/boilerpipe/filters/num_words_rules_classifier.rb
|
124
128
|
- lib/boilerpipe/filters/simple_block_fusion_processor.rb
|
129
|
+
- lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
|
125
130
|
- lib/boilerpipe/filters/terminating_blocks_finder.rb
|
126
131
|
- lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
|
127
132
|
- lib/boilerpipe/labels/default.rb
|