boilerpipe-ruby 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +27 -6
- data/Rakefile +8 -0
- data/boilerpipe-ruby.gemspec +10 -9
- data/lib/boilerpipe.rb +30 -0
- data/lib/boilerpipe/document/text_block.rb +113 -0
- data/lib/boilerpipe/document/text_document.rb +44 -0
- data/lib/boilerpipe/errors.rb +1 -0
- data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
- data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
- data/lib/boilerpipe/labels/default.rb +17 -0
- data/lib/boilerpipe/labels/label_action.rb +17 -0
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
- data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
- data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
- data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
- data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
- data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
- data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
- data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
- data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
- data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
- data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- data/stuff.txt +4 -0
- metadata +61 -15
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module Boilerpipe::SAX::TagActions
  # Tag action for the element that encloses the document body
  # (normally this is only registered for the <BODY> tag).
  #
  # Both callbacks first call +handler.flush_block+ and then adjust the
  # handler's in-body state; both return +false+.
  class Body
    # Start-tag callback: flushes the handler's current block, then calls
    # +handler.increase_in_body!+. +name+ and +attrs+ are not used.
    def start(handler, name, attrs)
      flush_then(handler) { |h| h.increase_in_body! }
    end

    # End-tag callback: flushes the handler's current block, then calls
    # +handler.decrease_in_body!+. +name+ is not used.
    def end_tag(handler, name)
      flush_then(handler) { |h| h.decrease_in_body! }
    end

    # This action always reports that it changes the tag level.
    def changes_tag_level?
      true
    end

    private

    # Flushes +handler+, yields it so the caller can update the in-body
    # state, and returns the +false+ that both callbacks hand back.
    def flush_then(handler)
      handler.flush_block
      yield handler
      false
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module Boilerpipe::SAX::TagActions
  # Combines two tag actions so that a single tag triggers both of them.
  class Chained
    # t1, t2 - the two tag actions to run, in that order.
    def initialize(t1, t2)
      @t1 = t1
      @t2 = t2
    end

    # Forwards the start-tag event to both actions and combines their
    # results with the non-short-circuiting +|+ operator, so the second
    # action runs regardless of what the first one returned.
    def start(handler, name, attrs)
      first_result = @t1.start(handler, name, attrs)
      second_result = @t2.start(handler, name, attrs)
      first_result | second_result
    end

    # Forwards the end-tag event to both actions; see #start for how the
    # two results are combined.
    def end_tag(handler, name)
      first_result = @t1.end_tag(handler, name)
      second_result = @t2.end_tag(handler, name)
      first_result | second_result
    end

    # Truthy when either action changes the tag level. The second action is
    # only consulted when the first one answers with a falsy value.
    def changes_tag_level?
      first_answer = @t1.changes_tag_level?
      first_answer || @t2.changes_tag_level?
    end
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
module Boilerpipe::SAX::TagActions
  # Special TagAction for the <FONT> tag, which keeps track of
  # the absolute and relative font size.
  #
  # Every start tag pushes exactly one entry onto +handler.font_size_stack+
  # (an Integer size, or +nil+ when the size is missing or unparsable) and
  # every end tag pops one entry, so the stack stays balanced with the tags.
  class Font
    # A complete +size+ value: an optional '+' or '-' sign followed by exactly
    # one digit. The pattern is anchored at both ends so that values such as
    # "12" or "3px" are rejected (and recorded as nil) instead of being
    # silently matched on their first digit.
    FONT_SIZE = /\A([+-]?)([0-9])\z/

    # Start-tag callback.
    #
    # handler - object exposing +font_size_stack+.
    # name    - tag name (unused).
    # attrs   - attribute lookup; only attrs['size'] is read.
    #
    # Pushes the resolved size (or nil) onto the handler's font size stack
    # and returns false.
    def start(handler, name, attrs)
      # Regexp#match returns nil for a nil argument, so a missing size
      # attribute takes the same path as an unparsable one.
      m = FONT_SIZE.match(attrs['size'])

      size =
        if m.nil?
          nil
        elsif m[1].empty?
          m[2].to_i # no sign: absolute size
        else
          relative(handler.font_size_stack, m[1], m[2].to_i)
        end

      handler.font_size_stack << size
      false
    end

    # End-tag callback: discards the entry pushed by the matching #start.
    # Returns false.
    def end_tag(handler, name)
      handler.font_size_stack.pop
      false
    end

    # The font tag never changes the tag level.
    def changes_tag_level?
      false
    end

    # Resolves a signed size against the most recently pushed non-nil entry
    # of +font_size_stack+, falling back to 3 when there is none
    # (NOTE(review): presumably the HTML default font size - confirm).
    #
    # rel - '+' adds +val+ to the previous size; anything else subtracts it.
    # val - Integer magnitude of the change.
    #
    # Returns the resulting Integer size.
    def relative(font_size_stack, rel, val)
      prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
      prev_size = 3 if prev_size.nil?

      if rel == '+'
        prev_size + val
      else
        prev_size - val
      end
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
module Boilerpipe::SAX::TagActions
  # Tag action for elements whose content should not be processed.
  class IgnorableElement
    # Marks this tag as "ignorable", i.e. all its inner content is silently skipped.
    #
    # Calls handler.increase_in_ignorable_element! and returns true.
    # name and attrs are not used.
    def start(handler, name, attrs)
      handler.increase_in_ignorable_element!
      true
    end

    # Counterpart of #start for the closing tag: calls
    # handler.decrease_in_ignorable_element! and returns true.
    # name is not used.
    def end_tag(handler, name)
      handler.decrease_in_ignorable_element!
      true
    end

    # This action always reports that it changes the tag level.
    def changes_tag_level?
      true
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
module Boilerpipe::SAX::TagActions
  # Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block.
  #
  # None of the callbacks touch the handler; each one just returns false.
  class InlineNoWhitespace
    # Start-tag callback: does nothing and returns false.
    def start(handler, name, attrs)
      false
    end

    # End-tag callback: does nothing and returns false.
    def end_tag(handler, name)
      false
    end

    # This action never changes the tag level.
    def changes_tag_level?
      false
    end
  end
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
module Boilerpipe::SAX::TagActions
  # Tag action for inline elements that should trigger a LabelAction on the
  # generated TextBlock.
  class InlineTagLabel
    # label_action - the action handed to the handler on every start tag.
    def initialize(label_action)
      @label_action = label_action
    end

    # Start-tag callback: appends a space via the handler, then registers
    # the stored label action with it. Returns false.
    def start(handler, name, attrs)
      pad(handler)
      handler.add_label_action(@label_action)
      false
    end

    # End-tag callback: appends a space via the handler. Returns false.
    def end_tag(handler, name)
      pad(handler)
      false
    end

    # This action never changes the tag level.
    def changes_tag_level?
      false
    end

    private

    # Asks the handler to append a space; shared by both callbacks.
    def pad(handler)
      handler.append_space
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
module Boilerpipe::SAX::TagActions
  # Tag action for a plain "inline" element: it asks the handler to append
  # whitespace on both the opening and the closing tag, but does not start a
  # new block.
  class InlineWhitespace
    # Start-tag callback: appends a space via the handler and returns false.
    # name and attrs are not used.
    def start(handler, name, attrs)
      append_space_and_continue(handler)
    end

    # End-tag callback: appends a space via the handler and returns false.
    # name is not used.
    def end_tag(handler, name)
      append_space_and_continue(handler)
    end

    # This action never changes the tag level.
    def changes_tag_level?
      false
    end

    private

    # Shared body of both callbacks: one handler.append_space call followed
    # by the false return value.
    def append_space_and_continue(handler)
      handler.append_space
      false
    end
  end
end
|
|
@@ -2,9 +2,9 @@ module Boilerpipe
|
|
|
2
2
|
class UnicodeTokenizer
|
|
3
3
|
INVISIBLE_SEPARATOR = "\u2063"
|
|
4
4
|
WORD_BOUNDARY = Regexp.new('\b')
|
|
5
|
-
NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)
|
|
5
|
+
NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)\/])[\u2063]*")
|
|
6
6
|
|
|
7
|
-
# replace word boundaries with 'invisible separator'
|
|
7
|
+
# replace word boundaries with 'invisible separator'
|
|
8
8
|
# strip invisible separators from non-word boundaries
|
|
9
9
|
# replace spaces or invisible separators with a single space
|
|
10
10
|
# trim
|
data/lib/boilerpipe/version.rb
CHANGED
data/stuff.txt
ADDED
metadata
CHANGED
|
@@ -1,58 +1,72 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: boilerpipe-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0
|
|
4
|
+
version: 0.1.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gregory Ostermayr
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2017-09-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: bundler
|
|
14
15
|
requirement: !ruby/object:Gem::Requirement
|
|
15
16
|
requirements:
|
|
16
17
|
- - "~>"
|
|
17
18
|
- !ruby/object:Gem::Version
|
|
18
19
|
version: '1.11'
|
|
19
|
-
name: bundler
|
|
20
|
-
prerelease: false
|
|
21
20
|
type: :development
|
|
21
|
+
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '1.11'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: rake
|
|
28
29
|
requirement: !ruby/object:Gem::Requirement
|
|
29
30
|
requirements:
|
|
30
31
|
- - "~>"
|
|
31
32
|
- !ruby/object:Gem::Version
|
|
32
33
|
version: '10.0'
|
|
33
|
-
name: rake
|
|
34
|
-
prerelease: false
|
|
35
34
|
type: :development
|
|
35
|
+
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
38
|
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '10.0'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: rspec
|
|
42
43
|
requirement: !ruby/object:Gem::Requirement
|
|
43
44
|
requirements:
|
|
44
45
|
- - "~>"
|
|
45
46
|
- !ruby/object:Gem::Version
|
|
46
47
|
version: '3.0'
|
|
47
|
-
name: rspec
|
|
48
|
-
prerelease: false
|
|
49
48
|
type: :development
|
|
49
|
+
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '3.0'
|
|
55
|
-
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: nokogiri
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - '='
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: 1.6.6.2
|
|
62
|
+
type: :runtime
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - '='
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: 1.6.6.2
|
|
69
|
+
description: A pure ruby implementation of the boilerpipe algorithm
|
|
56
70
|
email:
|
|
57
71
|
- "<gregory.ostermayr@gmail.com>"
|
|
58
72
|
executables: []
|
|
@@ -68,12 +82,44 @@ files:
|
|
|
68
82
|
- bin/setup
|
|
69
83
|
- boilerpipe-ruby.gemspec
|
|
70
84
|
- lib/boilerpipe.rb
|
|
85
|
+
- lib/boilerpipe/document/text_block.rb
|
|
86
|
+
- lib/boilerpipe/document/text_document.rb
|
|
87
|
+
- lib/boilerpipe/errors.rb
|
|
88
|
+
- lib/boilerpipe/extractors/article_extractor.rb
|
|
89
|
+
- lib/boilerpipe/filters/block_proximity_fusion.rb
|
|
90
|
+
- lib/boilerpipe/filters/boilerplate_block_filter.rb
|
|
91
|
+
- lib/boilerpipe/filters/document_title_match_classifier.rb
|
|
92
|
+
- lib/boilerpipe/filters/expand_title_to_content_filter.rb
|
|
93
|
+
- lib/boilerpipe/filters/heuristic_filter_base.rb
|
|
94
|
+
- lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb
|
|
95
|
+
- lib/boilerpipe/filters/keep_largest_block_filter.rb
|
|
96
|
+
- lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
|
|
97
|
+
- lib/boilerpipe/filters/list_at_end_filter.rb
|
|
98
|
+
- lib/boilerpipe/filters/num_words_rules_classifier.rb
|
|
99
|
+
- lib/boilerpipe/filters/terminating_blocks_finder.rb
|
|
100
|
+
- lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
|
|
101
|
+
- lib/boilerpipe/labels/default.rb
|
|
102
|
+
- lib/boilerpipe/labels/label_action.rb
|
|
103
|
+
- lib/boilerpipe/sax/boilerpipe_html_parser.rb
|
|
104
|
+
- lib/boilerpipe/sax/html_content_handler.rb
|
|
105
|
+
- lib/boilerpipe/sax/tag_action_map.rb
|
|
106
|
+
- lib/boilerpipe/sax/tag_actions/anchor_text.rb
|
|
107
|
+
- lib/boilerpipe/sax/tag_actions/block_level.rb
|
|
108
|
+
- lib/boilerpipe/sax/tag_actions/block_tag_label.rb
|
|
109
|
+
- lib/boilerpipe/sax/tag_actions/body.rb
|
|
110
|
+
- lib/boilerpipe/sax/tag_actions/chained.rb
|
|
111
|
+
- lib/boilerpipe/sax/tag_actions/font.rb
|
|
112
|
+
- lib/boilerpipe/sax/tag_actions/ignorable_element.rb
|
|
113
|
+
- lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb
|
|
114
|
+
- lib/boilerpipe/sax/tag_actions/inline_tag_label.rb
|
|
115
|
+
- lib/boilerpipe/sax/tag_actions/inline_whitespace.rb
|
|
71
116
|
- lib/boilerpipe/util/unicode_tokenizer.rb
|
|
72
117
|
- lib/boilerpipe/version.rb
|
|
118
|
+
- stuff.txt
|
|
73
119
|
homepage: https://github.com/gregors/boilerpipe-ruby
|
|
74
120
|
licenses: []
|
|
75
121
|
metadata: {}
|
|
76
|
-
post_install_message:
|
|
122
|
+
post_install_message:
|
|
77
123
|
rdoc_options: []
|
|
78
124
|
require_paths:
|
|
79
125
|
- lib
|
|
@@ -88,9 +134,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
88
134
|
- !ruby/object:Gem::Version
|
|
89
135
|
version: '0'
|
|
90
136
|
requirements: []
|
|
91
|
-
rubyforge_project:
|
|
92
|
-
rubygems_version: 2.
|
|
93
|
-
signing_key:
|
|
137
|
+
rubyforge_project:
|
|
138
|
+
rubygems_version: 2.6.12
|
|
139
|
+
signing_key:
|
|
94
140
|
specification_version: 4
|
|
95
|
-
summary: A pure ruby implemenation of the boilerpipe algorithm
|
|
141
|
+
summary: A pure ruby implemenation of the boilerpipe algorithm
|
|
96
142
|
test_files: []
|