boilerpipe-ruby 0.4.4 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 835e5122c287082ef39874ee0a3365f08910a7a3f70b76c45dc264e8e1301edc
4
- data.tar.gz: edff0860c01277adfc703e453e22b1d24c2db1817cfd4ad6800dfc1a6c0f339f
3
+ metadata.gz: 7fec2bd11d29c4b5d14f70e10fcac76beb95c61e25fbe5cac15b82e8c64fbf69
4
+ data.tar.gz: 766ea373235462c3678cc2487c647d6211fd2fc066626d5c10ab7e4d31f303ad
5
5
  SHA512:
6
- metadata.gz: 7cbeb85f9dd74c930e9999e850d2abca6fd3e9470971461a978295de640195192f633a71a19a20b96824fb5becec2e7841e7f3e257b83b90550c935b6d16ff7b
7
- data.tar.gz: 393803c95416f77a467724cd0b451961dcbb9c6e544c6d22a4f71853a041e5f2c32513ffd3f43e9733de43624fce31dad18b9cb7d0c37bd3cd722a6fdf223481
6
+ metadata.gz: be90614a1c2efa29356e9b3b255a5e5d4374474fd6b711d4ed9ab575c4ab8466a1d6903c23de46276133d1621727dea8525422e49608185c1e6294af4f6e0f54
7
+ data.tar.gz: 5b368e59ced5b794b8e2033b632de67a09bf43e3c45070e96a66ce695bddb5966130e37d9862179a9c989d5166645e182bd2973c179cb8f369c7c4942e238f30
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ # 0.5.0 / 2021-02-15
2
+ * internal refactoring for clarity
3
+
1
4
  # 0.4.4 / 2021-02-13
2
5
  * Do a better job of stripping out script tags
3
6
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Boilerpipe
2
2
 
3
- [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
3
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main)
4
4
  [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
5
5
 
6
6
  A pure ruby implementation of the boilerpipe algorithm.
@@ -8,33 +8,23 @@ module Boilerpipe::Filters
8
8
  def self.process(doc)
9
9
  tbs = doc.text_blocks
10
10
 
11
- # slower and more ruby-like
12
- # comeback and let's do some benchmarking
13
- # titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
14
- # title = tbs.index(titles.last)
15
- # content_start = tbs.find_index(&:is_content?)
11
+ title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
12
+ title_idx = tbs.index(title)
16
13
 
17
- i = 0
18
- title = nil
19
- content_start = nil
14
+ content_start = tbs.find_index(&:is_content?)
20
15
 
21
- tbs.each do |tb|
22
- title = i if content_start.nil? && tb.has_label?(:TITLE)
23
- content_start = i if content_start.nil? && tb.is_content?
24
- i += 1
25
- end
16
+ return doc if no_title_with_subsequent_content?(content_start, title_idx)
26
17
 
27
- return doc if no_title_with_subsequent_content?(content_start, title)
28
-
29
- tbs.slice(title...content_start).each do |tb|
30
- tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
31
- end
18
+ tbs.slice(title_idx...content_start)
19
+ .select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
20
+ .each{ |tb| tb.content = true }
32
21
 
33
22
  doc
34
23
  end
35
24
 
36
- def self.no_title_with_subsequent_content?(content_start, title)
37
- title.nil? || content_start.nil? || content_start <= title
25
+ def self.no_title_with_subsequent_content?(content_start, title_idx)
26
+ # title has to start before content
27
+ title_idx.nil? || content_start.nil? || title_idx >= content_start
38
28
  end
39
29
  end
40
30
  end
@@ -17,7 +17,7 @@ module Boilerpipe::SAX
17
17
  @flush = false
18
18
  @block_tag_level = -1
19
19
 
20
- @in_body = 0
20
+ @in_body_tag = 0
21
21
  @in_anchor_tag = 0
22
22
  @in_ignorable_element = 0
23
23
  @in_anchor_text = false
@@ -92,9 +92,15 @@ module Boilerpipe::SAX
92
92
  @label_stacks.pop
93
93
  end
94
94
 
95
+ def not_in_body_tag?
96
+ @in_body_tag == 0
97
+ end
98
+
95
99
  def flush_block
96
100
  @flush = false
97
- if @in_body == 0
101
+
102
+ # set title
103
+ if not_in_body_tag?
98
104
  @title = @token_buffer.strip if :TITLE == @last_start_tag
99
105
  clear_buffers
100
106
  return
@@ -205,12 +211,12 @@ module Boilerpipe::SAX
205
211
  @in_ignorable_element -= 1
206
212
  end
207
213
 
208
- def increase_in_body!
209
- @in_body += 1
214
+ def enter_body_tag!
215
+ @in_body_tag += 1
210
216
  end
211
217
 
212
- def decrease_in_body!
213
- @in_body -= 1
218
+ def exit_body_tag!
219
+ @in_body_tag -= 1
214
220
  end
215
221
 
216
222
  def in_ignorable_element?
@@ -4,13 +4,13 @@ module Boilerpipe::SAX::TagActions
4
4
  class Body
5
5
  def start(handler, name, attrs)
6
6
  handler.flush_block
7
- handler.increase_in_body!
7
+ handler.enter_body_tag!
8
8
  false
9
9
  end
10
10
 
11
11
  def end_tag(handler, name)
12
12
  handler.flush_block
13
- handler.decrease_in_body!
13
+ handler.exit_body_tag!
14
14
  false
15
15
  end
16
16
 
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.4.4'
2
+ VERSION = '0.5.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-02-13 00:00:00.000000000 Z
11
+ date: 2021-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler