boilerpipe-ruby 0.4.4 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7fec2bd11d29c4b5d14f70e10fcac76beb95c61e25fbe5cac15b82e8c64fbf69
|
4
|
+
data.tar.gz: 766ea373235462c3678cc2487c647d6211fd2fc066626d5c10ab7e4d31f303ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be90614a1c2efa29356e9b3b255a5e5d4374474fd6b711d4ed9ab575c4ab8466a1d6903c23de46276133d1621727dea8525422e49608185c1e6294af4f6e0f54
|
7
|
+
data.tar.gz: 5b368e59ced5b794b8e2033b632de67a09bf43e3c45070e96a66ce695bddb5966130e37d9862179a9c989d5166645e182bd2973c179cb8f369c7c4942e238f30
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Boilerpipe
|
2
2
|
|
3
|
-
[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/
|
3
|
+
[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main)
|
4
4
|
[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
|
5
5
|
|
6
6
|
A pure ruby implementation of the boilerpipe algorithm.
|
@@ -8,33 +8,23 @@ module Boilerpipe::Filters
|
|
8
8
|
def self.process(doc)
|
9
9
|
tbs = doc.text_blocks
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
# titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
|
14
|
-
# title = tbs.index(titles.last)
|
15
|
-
# content_start = tbs.find_index(&:is_content?)
|
11
|
+
title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
|
12
|
+
title_idx = tbs.index(title)
|
16
13
|
|
17
|
-
|
18
|
-
title = nil
|
19
|
-
content_start = nil
|
14
|
+
content_start = tbs.find_index(&:is_content?)
|
20
15
|
|
21
|
-
|
22
|
-
title = i if content_start.nil? && tb.has_label?(:TITLE)
|
23
|
-
content_start = i if content_start.nil? && tb.is_content?
|
24
|
-
i += 1
|
25
|
-
end
|
16
|
+
return doc if no_title_with_subsequent_content?(content_start, title_idx)
|
26
17
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
|
31
|
-
end
|
18
|
+
tbs.slice(title_idx...content_start)
|
19
|
+
.select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
|
20
|
+
.each{ |tb| tb.content = true }
|
32
21
|
|
33
22
|
doc
|
34
23
|
end
|
35
24
|
|
36
|
-
def self.no_title_with_subsequent_content?(content_start,
|
37
|
-
title
|
25
|
+
def self.no_title_with_subsequent_content?(content_start, title_idx)
|
26
|
+
# title has to start before content
|
27
|
+
title_idx.nil? || content_start.nil? || title_idx >= content_start
|
38
28
|
end
|
39
29
|
end
|
40
30
|
end
|
@@ -17,7 +17,7 @@ module Boilerpipe::SAX
|
|
17
17
|
@flush = false
|
18
18
|
@block_tag_level = -1
|
19
19
|
|
20
|
-
@
|
20
|
+
@in_body_tag = 0
|
21
21
|
@in_anchor_tag = 0
|
22
22
|
@in_ignorable_element = 0
|
23
23
|
@in_anchor_text = false
|
@@ -92,9 +92,15 @@ module Boilerpipe::SAX
|
|
92
92
|
@label_stacks.pop
|
93
93
|
end
|
94
94
|
|
95
|
+
def not_in_body_tag?
|
96
|
+
@in_body_tag == 0
|
97
|
+
end
|
98
|
+
|
95
99
|
def flush_block
|
96
100
|
@flush = false
|
97
|
-
|
101
|
+
|
102
|
+
# set title
|
103
|
+
if not_in_body_tag?
|
98
104
|
@title = @token_buffer.strip if :TITLE == @last_start_tag
|
99
105
|
clear_buffers
|
100
106
|
return
|
@@ -205,12 +211,12 @@ module Boilerpipe::SAX
|
|
205
211
|
@in_ignorable_element -= 1
|
206
212
|
end
|
207
213
|
|
208
|
-
def
|
209
|
-
@
|
214
|
+
def enter_body_tag!
|
215
|
+
@in_body_tag += 1
|
210
216
|
end
|
211
217
|
|
212
|
-
def
|
213
|
-
@
|
218
|
+
def exit_body_tag!
|
219
|
+
@in_body_tag -= 1
|
214
220
|
end
|
215
221
|
|
216
222
|
def in_ignorable_element?
|
@@ -4,13 +4,13 @@ module Boilerpipe::SAX::TagActions
|
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
7
|
-
handler.
|
7
|
+
handler.enter_body_tag!
|
8
8
|
false
|
9
9
|
end
|
10
10
|
|
11
11
|
def end_tag(handler, name)
|
12
12
|
handler.flush_block
|
13
|
-
handler.
|
13
|
+
handler.exit_body_tag!
|
14
14
|
false
|
15
15
|
end
|
16
16
|
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-
|
11
|
+
date: 2021-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|