boilerpipe-ruby 0.4.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7fec2bd11d29c4b5d14f70e10fcac76beb95c61e25fbe5cac15b82e8c64fbf69
|
4
|
+
data.tar.gz: 766ea373235462c3678cc2487c647d6211fd2fc066626d5c10ab7e4d31f303ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be90614a1c2efa29356e9b3b255a5e5d4374474fd6b711d4ed9ab575c4ab8466a1d6903c23de46276133d1621727dea8525422e49608185c1e6294af4f6e0f54
|
7
|
+
data.tar.gz: 5b368e59ced5b794b8e2033b632de67a09bf43e3c45070e96a66ce695bddb5966130e37d9862179a9c989d5166645e182bd2973c179cb8f369c7c4942e238f30
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Boilerpipe
|
2
2
|
|
3
|
-
[](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/main)
|
4
4
|
[](https://badge.fury.io/rb/boilerpipe-ruby)
|
5
5
|
|
6
6
|
A pure ruby implementation of the boilerpipe algorithm.
|
@@ -8,33 +8,23 @@ module Boilerpipe::Filters
|
|
8
8
|
def self.process(doc)
|
9
9
|
tbs = doc.text_blocks
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
# titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
|
14
|
-
# title = tbs.index(titles.last)
|
15
|
-
# content_start = tbs.find_index(&:is_content?)
|
11
|
+
title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
|
12
|
+
title_idx = tbs.index(title)
|
16
13
|
|
17
|
-
|
18
|
-
title = nil
|
19
|
-
content_start = nil
|
14
|
+
content_start = tbs.find_index(&:is_content?)
|
20
15
|
|
21
|
-
|
22
|
-
title = i if content_start.nil? && tb.has_label?(:TITLE)
|
23
|
-
content_start = i if content_start.nil? && tb.is_content?
|
24
|
-
i += 1
|
25
|
-
end
|
16
|
+
return doc if no_title_with_subsequent_content?(content_start, title_idx)
|
26
17
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
|
31
|
-
end
|
18
|
+
tbs.slice(title_idx...content_start)
|
19
|
+
.select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
|
20
|
+
.each{ |tb| tb.content = true }
|
32
21
|
|
33
22
|
doc
|
34
23
|
end
|
35
24
|
|
36
|
-
def self.no_title_with_subsequent_content?(content_start,
|
37
|
-
title
|
25
|
+
def self.no_title_with_subsequent_content?(content_start, title_idx)
|
26
|
+
# title has to start before content
|
27
|
+
title_idx.nil? || content_start.nil? || title_idx >= content_start
|
38
28
|
end
|
39
29
|
end
|
40
30
|
end
|
@@ -17,7 +17,7 @@ module Boilerpipe::SAX
|
|
17
17
|
@flush = false
|
18
18
|
@block_tag_level = -1
|
19
19
|
|
20
|
-
@
|
20
|
+
@in_body_tag = 0
|
21
21
|
@in_anchor_tag = 0
|
22
22
|
@in_ignorable_element = 0
|
23
23
|
@in_anchor_text = false
|
@@ -92,9 +92,15 @@ module Boilerpipe::SAX
|
|
92
92
|
@label_stacks.pop
|
93
93
|
end
|
94
94
|
|
95
|
+
def not_in_body_tag?
|
96
|
+
@in_body_tag == 0
|
97
|
+
end
|
98
|
+
|
95
99
|
def flush_block
|
96
100
|
@flush = false
|
97
|
-
|
101
|
+
|
102
|
+
# set title
|
103
|
+
if not_in_body_tag?
|
98
104
|
@title = @token_buffer.strip if :TITLE == @last_start_tag
|
99
105
|
clear_buffers
|
100
106
|
return
|
@@ -205,12 +211,12 @@ module Boilerpipe::SAX
|
|
205
211
|
@in_ignorable_element -= 1
|
206
212
|
end
|
207
213
|
|
208
|
-
def
|
209
|
-
@
|
214
|
+
def enter_body_tag!
|
215
|
+
@in_body_tag += 1
|
210
216
|
end
|
211
217
|
|
212
|
-
def
|
213
|
-
@
|
218
|
+
def exit_body_tag!
|
219
|
+
@in_body_tag -= 1
|
214
220
|
end
|
215
221
|
|
216
222
|
def in_ignorable_element?
|
@@ -4,13 +4,13 @@ module Boilerpipe::SAX::TagActions
|
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
7
|
-
handler.
|
7
|
+
handler.enter_body_tag!
|
8
8
|
false
|
9
9
|
end
|
10
10
|
|
11
11
|
def end_tag(handler, name)
|
12
12
|
handler.flush_block
|
13
|
-
handler.
|
13
|
+
handler.exit_body_tag!
|
14
14
|
false
|
15
15
|
end
|
16
16
|
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-02-
|
11
|
+
date: 2021-02-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|