boilerpipe-ruby 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/README.md +27 -6
  4. data/Rakefile +8 -0
  5. data/boilerpipe-ruby.gemspec +10 -9
  6. data/lib/boilerpipe.rb +30 -0
  7. data/lib/boilerpipe/document/text_block.rb +113 -0
  8. data/lib/boilerpipe/document/text_document.rb +44 -0
  9. data/lib/boilerpipe/errors.rb +1 -0
  10. data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
  11. data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
  12. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
  13. data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
  14. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
  15. data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
  16. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
  17. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
  18. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
  19. data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
  20. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
  21. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
  22. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
  23. data/lib/boilerpipe/labels/default.rb +17 -0
  24. data/lib/boilerpipe/labels/label_action.rb +17 -0
  25. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
  26. data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
  27. data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
  28. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
  29. data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
  30. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
  31. data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
  32. data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
  33. data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
  34. data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
  35. data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
  36. data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
  37. data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
  38. data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
  39. data/lib/boilerpipe/version.rb +1 -1
  40. data/stuff.txt +4 -0
  41. metadata +61 -15
@@ -0,0 +1,21 @@
1
module Boilerpipe::SAX::TagActions
  # Tag action for the element that delimits the document body
  # (this should usually only be registered for the <BODY> tag).
  #
  # Both callbacks first flush the handler's pending block and then adjust
  # the handler's body counter.
  class Body
    # Invoked for the opening tag.
    #
    # @param handler [#flush_block, #increase_in_body!] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @param attrs [Object] tag attributes (not used by this action)
    # @return [false] always
    #   NOTE(review): presumably the result tells the handler whether it
    #   still has to flush - confirm against the content handler.
    def start(handler, name, attrs)
      flush_and_adjust(handler, :increase_in_body!)
    end

    # Invoked for the closing tag.
    #
    # @param handler [#flush_block, #decrease_in_body!] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @return [false] always
    def end_tag(handler, name)
      flush_and_adjust(handler, :decrease_in_body!)
    end

    # @return [true] this action always reports a tag-level change
    def changes_tag_level?
      true
    end

    private

    # Flushes the pending block, then sends +counter_update+ to the handler.
    # The flush has to come first, exactly as in both public callbacks.
    def flush_and_adjust(handler, counter_update)
      handler.flush_block
      handler.public_send(counter_update)
      false
    end
  end
end
@@ -0,0 +1,20 @@
1
module Boilerpipe::SAX::TagActions
  # Combines two tag actions into one: every callback is forwarded to the
  # first action and then to the second, and the two results are OR-ed.
  class Chained
    # @param t1 [#start, #end_tag, #changes_tag_level?] action invoked first
    # @param t2 [#start, #end_tag, #changes_tag_level?] action invoked second
    def initialize(t1, t2)
      @t1 = t1
      @t2 = t2
    end

    # Runs +start+ on both actions.
    #
    # Both results are computed before they are combined, so the second
    # action is always invoked, whatever the first one returned.
    #
    # @return [Object] result of the first action | result of the second
    def start(handler, name, attrs)
      first_result = @t1.start(handler, name, attrs)
      second_result = @t2.start(handler, name, attrs)
      first_result | second_result
    end

    # Runs +end_tag+ on both actions; like #start, the second action is
    # always invoked.
    #
    # @return [Object] result of the first action | result of the second
    def end_tag(handler, name)
      first_result = @t1.end_tag(handler, name)
      second_result = @t2.end_tag(handler, name)
      first_result | second_result
    end

    # Reports a tag-level change as soon as either action does. The second
    # action is only consulted when the first one answers falsy.
    def changes_tag_level?
      first_answer = @t1.changes_tag_level?
      first_answer || @t2.changes_tag_level?
    end
  end
end
@@ -0,0 +1,40 @@
1
module Boilerpipe::SAX::TagActions
  # Special TagAction for the <FONT> tag, which keeps track of
  # the absolute and relative font size.
  #
  # Every start tag pushes exactly one entry onto the handler's
  # +font_size_stack+ (a size, or nil when no size could be read) and every
  # end tag pops one entry again.
  class Font
    # An optional sign followed by a single digit.
    # NOTE(review): the pattern is unanchored and captures one digit only, so
    # the first digit found anywhere in the value wins (size="12" is read
    # as 1) - confirm this is intended.
    FONT_SIZE = /([\+\-]?)([0-9])/

    # Base size used for a relative size when no earlier stack entry
    # carries a size.
    DEFAULT_FONT_SIZE = 3

    # Pushes the size described by the tag's +size+ attribute, or nil when
    # the attribute value does not match FONT_SIZE (Regexp#match returns nil
    # for a nil argument, so a nil attribute value also ends up here).
    #
    # @param handler [#font_size_stack] handler owning the size stack
    # @param name [Object] tag name (not used by this action)
    # @param attrs [#[]] tag attributes, looked up with the key 'size'
    # @return [false] always
    def start(handler, name, attrs)
      m = FONT_SIZE.match(attrs['size'])
      size =
        if m
          rel = m[1]
          val = m[2].to_i
          # No sign: the value is absolute. Otherwise it is an offset from
          # the most recent known size.
          rel.empty? ? val : relative(handler.font_size_stack, rel, val)
        end
      handler.font_size_stack << size
      false
    end

    # Drops the entry pushed by the matching #start.
    #
    # @return [false] always
    def end_tag(handler, name)
      handler.font_size_stack.pop
      false
    end

    # @return [false] this action never reports a tag-level change
    def changes_tag_level?
      false
    end

    # Resolves a signed size against the most recently pushed non-nil entry
    # of +font_size_stack+, falling back to DEFAULT_FONT_SIZE.
    #
    # @param font_size_stack [#reverse_each] sizes pushed so far, oldest first
    # @param rel [String] '+' adds +val+; anything else subtracts it
    # @param val [Integer] magnitude of the offset
    # @return [Integer] the resulting size
    def relative(font_size_stack, rel, val)
      prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
      prev_size = DEFAULT_FONT_SIZE if prev_size.nil?

      rel == '+' ? prev_size + val : prev_size - val
    end
  end
end
@@ -0,0 +1,18 @@
1
module Boilerpipe::SAX::TagActions
  # Marks this tag as "ignorable", i.e. all its inner content is silently
  # skipped.
  #
  # The action itself only adjusts the handler's ignorable-element counter:
  # it is increased on the opening tag and decreased on the closing tag.
  class IgnorableElement
    # Invoked for the opening tag.
    #
    # @param handler [#increase_in_ignorable_element!] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @param attrs [Object] tag attributes (not used by this action)
    # @return [true] always
    def start(handler, name, attrs)
      handler.increase_in_ignorable_element!
      true
    end

    # Invoked for the closing tag.
    #
    # @param handler [#decrease_in_ignorable_element!] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @return [true] always
    def end_tag(handler, name)
      handler.decrease_in_ignorable_element!
      true
    end

    # @return [true] this action always reports a tag-level change
    def changes_tag_level?
      true
    end
  end
end
@@ -0,0 +1,16 @@
1
module Boilerpipe::SAX::TagActions
  # Marks this tag a simple "inline" element, which neither generates
  # whitespace, nor a new block.
  #
  # The action never touches the handler: every callback just answers false.
  class InlineNoWhitespace
    # Invoked for the opening tag; does nothing.
    #
    # @return [false] always
    def start(handler, name, attrs)
      false
    end

    # Invoked for the closing tag; does nothing.
    #
    # @return [false] always
    def end_tag(handler, name)
      false
    end

    # @return [false] this action never reports a tag-level change
    def changes_tag_level?
      false
    end
  end
end
@@ -0,0 +1,24 @@
1
module Boilerpipe::SAX::TagActions
  # Tag action for inline elements, which triggers some LabelAction on the
  # generated TextBlock.
  #
  # A space is appended on both the opening and the closing tag; the label
  # action is handed to the handler on the opening tag only.
  class InlineTagLabel
    # @param label_action [Object] passed unchanged to the handler's
    #   +add_label_action+ on every opening tag
    def initialize(label_action)
      @label_action = label_action
    end

    # Invoked for the opening tag: appends a space first, then registers the
    # label action.
    #
    # @param handler [#append_space, #add_label_action] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @param attrs [Object] tag attributes (not used by this action)
    # @return [false] always
    def start(handler, name, attrs)
      handler.append_space
      handler.add_label_action(@label_action)
      false
    end

    # Invoked for the closing tag: appends a space.
    #
    # @param handler [#append_space] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @return [false] always
    def end_tag(handler, name)
      handler.append_space
      false
    end

    # @return [false] this action never reports a tag-level change
    def changes_tag_level?
      false
    end
  end
end
@@ -0,0 +1,18 @@
1
module Boilerpipe::SAX::TagActions
  # Marks this tag a simple "inline" element, which generates whitespace,
  # but no new block.
  #
  # Both callbacks ask the handler to append a space and answer false.
  class InlineWhitespace
    # Invoked for the opening tag.
    #
    # @param handler [#append_space] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @param attrs [Object] tag attributes (not used by this action)
    # @return [false] always
    def start(handler, name, attrs)
      handler.append_space
      false
    end

    # Invoked for the closing tag.
    #
    # @param handler [#append_space] handler driving the parse
    # @param name [Object] tag name (not used by this action)
    # @return [false] always
    def end_tag(handler, name)
      handler.append_space
      false
    end

    # @return [false] this action never reports a tag-level change
    def changes_tag_level?
      false
    end
  end
end
@@ -2,9 +2,9 @@ module Boilerpipe
2
2
  class UnicodeTokenizer
3
3
  INVISIBLE_SEPARATOR = "\u2063"
4
4
  WORD_BOUNDARY = Regexp.new('\b')
5
- NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)/])[\u2063]*")
5
+ NOT_WORD_BOUNDARY = Regexp.new("[\u2063]*([\\\"'\\.,\\!\\@\\-\\:\\;\\$\\?\\(\\)\/])[\u2063]*")
6
6
 
7
- # replace word boundaries with 'invisible separator'
7
+ # replace word boundaries with 'invisible separator'
8
8
  # strip invisible separators from non-word boundaries
9
9
  # replace spaces or invisible separators with a single space
10
10
  # trim
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = "0.0.1"
2
+ VERSION = '0.1.0'
3
3
  end
data/stuff.txt ADDED
@@ -0,0 +1,4 @@
1
+ 1. self closing tags mess the sax parser up - like img tags which aren't self closing - maybe look at using cyberneko to balance and fix up html???
2
+
3
+ get rid of
4
+ img, br, pre,
metadata CHANGED
@@ -1,58 +1,72 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-03-13 00:00:00.000000000 Z
11
+ date: 2017-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
+ name: bundler
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
17
  - - "~>"
17
18
  - !ruby/object:Gem::Version
18
19
  version: '1.11'
19
- name: bundler
20
- prerelease: false
21
20
  type: :development
21
+ prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.11'
27
27
  - !ruby/object:Gem::Dependency
28
+ name: rake
28
29
  requirement: !ruby/object:Gem::Requirement
29
30
  requirements:
30
31
  - - "~>"
31
32
  - !ruby/object:Gem::Version
32
33
  version: '10.0'
33
- name: rake
34
- prerelease: false
35
34
  type: :development
35
+ prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '10.0'
41
41
  - !ruby/object:Gem::Dependency
42
+ name: rspec
42
43
  requirement: !ruby/object:Gem::Requirement
43
44
  requirements:
44
45
  - - "~>"
45
46
  - !ruby/object:Gem::Version
46
47
  version: '3.0'
47
- name: rspec
48
- prerelease: false
49
48
  type: :development
49
+ prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '3.0'
55
- description: A pure ruby implementation of the boilerpipe algorith - in progress
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 1.6.6.2
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 1.6.6.2
69
+ description: A pure ruby implementation of the boilerpipe algorithm
56
70
  email:
57
71
  - "<gregory.ostermayr@gmail.com>"
58
72
  executables: []
@@ -68,12 +82,44 @@ files:
68
82
  - bin/setup
69
83
  - boilerpipe-ruby.gemspec
70
84
  - lib/boilerpipe.rb
85
+ - lib/boilerpipe/document/text_block.rb
86
+ - lib/boilerpipe/document/text_document.rb
87
+ - lib/boilerpipe/errors.rb
88
+ - lib/boilerpipe/extractors/article_extractor.rb
89
+ - lib/boilerpipe/filters/block_proximity_fusion.rb
90
+ - lib/boilerpipe/filters/boilerplate_block_filter.rb
91
+ - lib/boilerpipe/filters/document_title_match_classifier.rb
92
+ - lib/boilerpipe/filters/expand_title_to_content_filter.rb
93
+ - lib/boilerpipe/filters/heuristic_filter_base.rb
94
+ - lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb
95
+ - lib/boilerpipe/filters/keep_largest_block_filter.rb
96
+ - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
97
+ - lib/boilerpipe/filters/list_at_end_filter.rb
98
+ - lib/boilerpipe/filters/num_words_rules_classifier.rb
99
+ - lib/boilerpipe/filters/terminating_blocks_finder.rb
100
+ - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
101
+ - lib/boilerpipe/labels/default.rb
102
+ - lib/boilerpipe/labels/label_action.rb
103
+ - lib/boilerpipe/sax/boilerpipe_html_parser.rb
104
+ - lib/boilerpipe/sax/html_content_handler.rb
105
+ - lib/boilerpipe/sax/tag_action_map.rb
106
+ - lib/boilerpipe/sax/tag_actions/anchor_text.rb
107
+ - lib/boilerpipe/sax/tag_actions/block_level.rb
108
+ - lib/boilerpipe/sax/tag_actions/block_tag_label.rb
109
+ - lib/boilerpipe/sax/tag_actions/body.rb
110
+ - lib/boilerpipe/sax/tag_actions/chained.rb
111
+ - lib/boilerpipe/sax/tag_actions/font.rb
112
+ - lib/boilerpipe/sax/tag_actions/ignorable_element.rb
113
+ - lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb
114
+ - lib/boilerpipe/sax/tag_actions/inline_tag_label.rb
115
+ - lib/boilerpipe/sax/tag_actions/inline_whitespace.rb
71
116
  - lib/boilerpipe/util/unicode_tokenizer.rb
72
117
  - lib/boilerpipe/version.rb
118
+ - stuff.txt
73
119
  homepage: https://github.com/gregors/boilerpipe-ruby
74
120
  licenses: []
75
121
  metadata: {}
76
- post_install_message:
122
+ post_install_message:
77
123
  rdoc_options: []
78
124
  require_paths:
79
125
  - lib
@@ -88,9 +134,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
134
  - !ruby/object:Gem::Version
89
135
  version: '0'
90
136
  requirements: []
91
- rubyforge_project:
92
- rubygems_version: 2.4.8
93
- signing_key:
137
+ rubyforge_project:
138
+ rubygems_version: 2.6.12
139
+ signing_key:
94
140
  specification_version: 4
95
- summary: A pure ruby implemenation of the boilerpipe algorithm - in progress
141
+ summary: A pure ruby implemenation of the boilerpipe algorithm
96
142
  test_files: []