text_sentencer 0.2.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/text_sentencer +13 -16
- data/lib/text_sentencer/string_scan_offset.rb +9 -0
- data/lib/text_sentencer/text_sentencer.rb +43 -48
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0be05e5130a03bd0c189b112d1d26c610844a4c9
|
4
|
+
data.tar.gz: 16ea016fb63066dd617c7b4ac39887bbacba40af
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c07b216029b059b7f9bc2dcff01278801933fb95d7a201f19d95951e52329f66c311ebf26bdbd910a2c57a2347adef6f366361f5e71643a735e8bc1775c4b61c
|
7
|
+
data.tar.gz: eeb161448cdd65860686248281c6438e0b9e096529956ae175faab0a41bfa40cd3a101017b9aa5367099d4ad39fd7bc972fde48beae88e3f3dcb2c500715ae22
|
data/bin/text_sentencer
CHANGED
@@ -3,6 +3,7 @@ require 'json'
|
|
3
3
|
require 'text_sentencer'
|
4
4
|
|
5
5
|
config_filename = nil
|
6
|
+
output_mode = :sentences
|
6
7
|
|
7
8
|
## command line option processing
|
8
9
|
require 'optparse'
|
@@ -13,6 +14,10 @@ optparse = OptionParser.new do |opts|
|
|
13
14
|
config_filename = c
|
14
15
|
end
|
15
16
|
|
17
|
+
opts.on('-j', '--json_output', 'outputs the result in JSON.') do
|
18
|
+
output_mode = :json
|
19
|
+
end
|
20
|
+
|
16
21
|
opts.on('-h', '--help', 'displays this screen.') do
|
17
22
|
puts opts
|
18
23
|
exit
|
@@ -28,21 +33,13 @@ end
|
|
28
33
|
sentencer = TextSentencer.new(config)
|
29
34
|
|
30
35
|
text = ARGF.read
|
31
|
-
|
32
|
-
## Preprocessing
|
33
|
-
# It should be removed later
|
34
|
-
text.gsub!(/ +/, ' ')
|
35
|
-
text.gsub!(/\n+/, "\n")
|
36
|
-
text.gsub!(/\t+/, "\t")
|
37
|
-
text.gsub!(/\n /, "\n")
|
38
|
-
text.gsub!(/ \n/, "\n")
|
39
|
-
text.gsub!(/\t /, "\t")
|
40
|
-
text.gsub!(/ \t/, "\t")
|
41
|
-
text.gsub!(/\n\t/, "\n")
|
42
|
-
text.gsub!(/\t\n/, "\n")
|
43
|
-
|
44
36
|
annotations = sentencer.annotate(text)
|
45
|
-
|
46
|
-
|
47
|
-
|
37
|
+
|
38
|
+
if output_mode == :json
|
39
|
+
puts JSON.pretty_generate(annotations)
|
40
|
+
else
|
41
|
+
annotations[:denotations].each do |d|
|
42
|
+
span = d[:span]
|
43
|
+
puts text[span[:begin]...span[:end]]
|
44
|
+
end
|
48
45
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_sentencer/string_scan_offset'
|
2
3
|
require 'pp'
|
3
4
|
|
4
5
|
class TextSentencer
|
@@ -6,14 +7,10 @@ class TextSentencer
|
|
6
7
|
|
7
8
|
DEFAULT_RULES = {
|
8
9
|
# All the positions of new line characters always take sentence break.
|
9
|
-
|
10
|
-
"\n"
|
11
|
-
],
|
10
|
+
break_pattern: "([ \t]*\n+)+[ \t]*",
|
12
11
|
|
13
12
|
# All the positions of space and tab characters are candidates for sentence breaks.
|
14
|
-
|
15
|
-
" ", "\t"
|
16
|
-
],
|
13
|
+
candidate_pattern: "[ \t]+",
|
17
14
|
|
18
15
|
# First, positive rules are applied to the break candidates to make initial segmentations.
|
19
16
|
positive_rules: [
|
@@ -49,75 +46,73 @@ class TextSentencer
|
|
49
46
|
def initialize(rules = nil)
|
50
47
|
rules ||= DEFAULT_RULES
|
51
48
|
@rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
|
52
|
-
@rules[:
|
53
|
-
@rules[:
|
49
|
+
@rules[:break_pattern] ||= ""
|
50
|
+
@rules[:candidate_pattern] ||= ""
|
54
51
|
@rules[:positive_rules] ||= []
|
55
52
|
@rules[:negative_rules] ||= []
|
56
53
|
end
|
57
54
|
|
58
55
|
def annotate(text)
|
59
|
-
return nil if text.nil?
|
56
|
+
return nil if text.nil?
|
57
|
+
|
60
58
|
sentences = segment(text)
|
61
59
|
denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
|
62
|
-
|
60
|
+
{text:text, denotations:denotations}
|
63
61
|
end
|
64
62
|
|
65
63
|
private
|
66
64
|
|
67
65
|
def segment(text)
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
breaks = []
|
66
|
+
breaks = if @rules[:break_pattern].empty?
|
67
|
+
[]
|
68
|
+
else
|
69
|
+
text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
|
70
|
+
end
|
74
71
|
|
75
|
-
|
76
|
-
|
72
|
+
candidates = if @rules[:candidate_pattern].empty?
|
73
|
+
[]
|
74
|
+
else
|
75
|
+
text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
|
76
|
+
end
|
77
77
|
|
78
|
-
#
|
79
|
-
|
78
|
+
# breaks take precedence
|
79
|
+
candidates -= breaks
|
80
80
|
|
81
|
-
|
81
|
+
candidates.each do |c|
|
82
|
+
last_end, next_begin = c
|
82
83
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
|
87
|
-
pbreaks << l
|
88
|
-
break
|
89
|
-
end
|
90
|
-
end
|
91
|
-
elsif @rules[:break_characters].include?(text[l])
|
92
|
-
breaks << l
|
84
|
+
if (last_end == 0) || (next_begin == text.length)
|
85
|
+
breaks << c
|
86
|
+
next
|
93
87
|
end
|
94
|
-
end
|
95
88
|
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
89
|
+
last_text = text[0...last_end]
|
90
|
+
next_text = text[next_begin..-1]
|
91
|
+
|
92
|
+
@rules[:positive_rules].each do |p|
|
93
|
+
if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
|
94
|
+
break_p = true
|
95
|
+
@rules[:negative_rules].each do |n|
|
96
|
+
if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
|
97
|
+
break_p = false
|
98
|
+
break
|
99
|
+
end
|
100
|
+
end
|
101
|
+
breaks << c if break_p
|
101
102
|
break
|
102
103
|
end
|
103
104
|
end
|
104
105
|
end
|
105
|
-
|
106
|
+
|
106
107
|
breaks.sort!
|
107
108
|
|
108
109
|
sentences = []
|
109
|
-
lastbreak =
|
110
|
+
lastbreak = 0
|
110
111
|
breaks.each do |b|
|
111
|
-
sentences
|
112
|
-
lastbreak = b
|
112
|
+
sentences << [lastbreak, b[0]] if b[0] > lastbreak
|
113
|
+
lastbreak = b[1]
|
113
114
|
end
|
114
|
-
sentences
|
115
|
-
|
116
|
-
## filter out empty segments
|
117
|
-
sentences.delete_if {|b, e| text[b...e] !~ /[a-zA-Z0-9]/}
|
118
|
-
|
119
|
-
## adjust offsets for the in text
|
120
|
-
sentences.collect!{|b, e| [b + start, e + start]}
|
115
|
+
sentences << [lastbreak, text.length] if lastbreak < text.length
|
121
116
|
|
122
117
|
sentences
|
123
118
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_sentencer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: TextSentencer is a simple rule-based system for segmenting a text block
|
14
14
|
into sentences.
|
@@ -20,6 +20,7 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- bin/text_sentencer
|
22
22
|
- lib/text_sentencer.rb
|
23
|
+
- lib/text_sentencer/string_scan_offset.rb
|
23
24
|
- lib/text_sentencer/text_sentencer.rb
|
24
25
|
homepage: https://github.com/jdkim/text_sentencer
|
25
26
|
licenses:
|