boilerpipe-ruby 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 835e5122c287082ef39874ee0a3365f08910a7a3f70b76c45dc264e8e1301edc
4
- data.tar.gz: edff0860c01277adfc703e453e22b1d24c2db1817cfd4ad6800dfc1a6c0f339f
3
+ metadata.gz: 7fec2bd11d29c4b5d14f70e10fcac76beb95c61e25fbe5cac15b82e8c64fbf69
4
+ data.tar.gz: 766ea373235462c3678cc2487c647d6211fd2fc066626d5c10ab7e4d31f303ad
5
5
  SHA512:
6
- metadata.gz: 7cbeb85f9dd74c930e9999e850d2abca6fd3e9470971461a978295de640195192f633a71a19a20b96824fb5becec2e7841e7f3e257b83b90550c935b6d16ff7b
7
- data.tar.gz: 393803c95416f77a467724cd0b451961dcbb9c6e544c6d22a4f71853a041e5f2c32513ffd3f43e9733de43624fce31dad18b9cb7d0c37bd3cd722a6fdf223481
6
+ metadata.gz: be90614a1c2efa29356e9b3b255a5e5d4374474fd6b711d4ed9ab575c4ab8466a1d6903c23de46276133d1621727dea8525422e49608185c1e6294af4f6e0f54
7
+ data.tar.gz: 5b368e59ced5b794b8e2033b632de67a09bf43e3c45070e96a66ce695bddb5966130e37d9862179a9c989d5166645e182bd2973c179cb8f369c7c4942e238f30
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ # 0.5.0 / 2021-02-15
2
+ * internal refactoring for clarity
3
+
1
4
  # 0.4.4 / 2021-02-13
2
5
  * Do a better job of stripping out script tags
3
6
 
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Boilerpipe
2
2
 
3
- [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
3
+ [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main)
4
4
  [![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
5
5
 
6
6
  A pure ruby implementation of the boilerpipe algorithm.
@@ -8,33 +8,23 @@ module Boilerpipe::Filters
8
8
  def self.process(doc)
9
9
  tbs = doc.text_blocks
10
10
 
11
- # slower and more ruby-like
12
- # comeback and let's do some benchmarking
13
- # titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
14
- # title = tbs.index(titles.last)
15
- # content_start = tbs.find_index(&:is_content?)
11
+ title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
12
+ title_idx = tbs.index(title)
16
13
 
17
- i = 0
18
- title = nil
19
- content_start = nil
14
+ content_start = tbs.find_index(&:is_content?)
20
15
 
21
- tbs.each do |tb|
22
- title = i if content_start.nil? && tb.has_label?(:TITLE)
23
- content_start = i if content_start.nil? && tb.is_content?
24
- i += 1
25
- end
16
+ return doc if no_title_with_subsequent_content?(content_start, title_idx)
26
17
 
27
- return doc if no_title_with_subsequent_content?(content_start, title)
28
-
29
- tbs.slice(title...content_start).each do |tb|
30
- tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
31
- end
18
+ tbs.slice(title_idx...content_start)
19
+ .select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
20
+ .each{ |tb| tb.content = true }
32
21
 
33
22
  doc
34
23
  end
35
24
 
36
- def self.no_title_with_subsequent_content?(content_start, title)
37
- title.nil? || content_start.nil? || content_start <= title
25
+ def self.no_title_with_subsequent_content?(content_start, title_idx)
26
+ # title has to start before content
27
+ title_idx.nil? || content_start.nil? || title_idx >= content_start
38
28
  end
39
29
  end
40
30
  end
@@ -17,7 +17,7 @@ module Boilerpipe::SAX
17
17
  @flush = false
18
18
  @block_tag_level = -1
19
19
 
20
- @in_body = 0
20
+ @in_body_tag = 0
21
21
  @in_anchor_tag = 0
22
22
  @in_ignorable_element = 0
23
23
  @in_anchor_text = false
@@ -92,9 +92,15 @@ module Boilerpipe::SAX
92
92
  @label_stacks.pop
93
93
  end
94
94
 
95
+ def not_in_body_tag?
96
+ @in_body_tag == 0
97
+ end
98
+
95
99
  def flush_block
96
100
  @flush = false
97
- if @in_body == 0
101
+
102
+ # set title
103
+ if not_in_body_tag?
98
104
  @title = @token_buffer.strip if :TITLE == @last_start_tag
99
105
  clear_buffers
100
106
  return
@@ -205,12 +211,12 @@ module Boilerpipe::SAX
205
211
  @in_ignorable_element -= 1
206
212
  end
207
213
 
208
- def increase_in_body!
209
- @in_body += 1
214
+ def enter_body_tag!
215
+ @in_body_tag += 1
210
216
  end
211
217
 
212
- def decrease_in_body!
213
- @in_body -= 1
218
+ def exit_body_tag!
219
+ @in_body_tag -= 1
214
220
  end
215
221
 
216
222
  def in_ignorable_element?
@@ -4,13 +4,13 @@ module Boilerpipe::SAX::TagActions
4
4
  class Body
5
5
  def start(handler, name, attrs)
6
6
  handler.flush_block
7
- handler.increase_in_body!
7
+ handler.enter_body_tag!
8
8
  false
9
9
  end
10
10
 
11
11
  def end_tag(handler, name)
12
12
  handler.flush_block
13
- handler.decrease_in_body!
13
+ handler.exit_body_tag!
14
14
  false
15
15
  end
16
16
 
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.4.4'
2
+ VERSION = '0.5.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.4
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-02-13 00:00:00.000000000 Z
11
+ date: 2021-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler