text_sentencer 0.2.0 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/text_sentencer +26 -28
- data/lib/text_sentencer/string_scan_offset.rb +9 -0
- data/lib/text_sentencer/text_sentencer.rb +113 -146
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
|
4
|
+
data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
|
7
|
+
data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
|
data/bin/text_sentencer
CHANGED
@@ -1,47 +1,45 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'json'
|
2
3
|
require 'text_sentencer'
|
3
4
|
|
4
5
|
config_filename = nil
|
6
|
+
output_mode = :sentences
|
5
7
|
|
6
8
|
## command line option processing
|
7
9
|
require 'optparse'
|
8
10
|
optparse = OptionParser.new do |opts|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
11
|
+
opts.banner = "Usage: text_sentencer [options]"
|
12
|
+
|
13
|
+
opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
|
14
|
+
config_filename = c
|
15
|
+
end
|
16
|
+
|
17
|
+
opts.on('-j', '--json_output', 'outputs the result in JSON.') do
|
18
|
+
output_mode = :json
|
19
|
+
end
|
20
|
+
|
21
|
+
opts.on('-h', '--help', 'displays this screen.') do
|
22
|
+
puts opts
|
23
|
+
exit
|
24
|
+
end
|
19
25
|
end
|
20
26
|
|
21
27
|
optparse.parse!
|
22
28
|
|
23
|
-
config = if config_filename
|
24
|
-
JSON.parse File.read(config_filename)
|
29
|
+
config = if config_filename && File.file?(config_filename)
|
30
|
+
JSON.parse File.read(config_filename)
|
25
31
|
end
|
26
32
|
|
27
33
|
sentencer = TextSentencer.new(config)
|
28
34
|
|
29
35
|
text = ARGF.read
|
30
|
-
|
31
|
-
## Preprocessing
|
32
|
-
# It should be removed later
|
33
|
-
text.gsub!(/ +/, ' ')
|
34
|
-
text.gsub!(/\n+/, "\n")
|
35
|
-
text.gsub!(/\t+/, "\t")
|
36
|
-
text.gsub!(/\n /, "\n")
|
37
|
-
text.gsub!(/ \n/, "\n")
|
38
|
-
text.gsub!(/\t /, "\t")
|
39
|
-
text.gsub!(/ \t/, "\t")
|
40
|
-
text.gsub!(/\n\t/, "\n")
|
41
|
-
text.gsub!(/\t\n/, "\n")
|
42
|
-
|
43
36
|
annotations = sentencer.annotate(text)
|
44
|
-
|
45
|
-
|
46
|
-
puts
|
37
|
+
|
38
|
+
if output_mode == :json
|
39
|
+
puts JSON.pretty_generate(annotations)
|
40
|
+
else
|
41
|
+
annotations[:denotations].each do |d|
|
42
|
+
span = d[:span]
|
43
|
+
puts text[span[:begin]...span[:end]]
|
44
|
+
end
|
47
45
|
end
|
@@ -1,150 +1,117 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'text_sentencer/string_scan_offset'
|
3
|
+
require 'pp'
|
2
4
|
|
3
5
|
class TextSentencer
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
## adjust offsets for the in text
|
116
|
-
sentences.collect!{|b, e| [b + start, e + start]}
|
117
|
-
|
118
|
-
sentences
|
119
|
-
end
|
120
|
-
end
|
121
|
-
|
122
|
-
if __FILE__ == $0
|
123
|
-
rules = {
|
124
|
-
break_candidates: [
|
125
|
-
" ", "\t"
|
126
|
-
],
|
127
|
-
|
128
|
-
break_characters: [
|
129
|
-
"\n"
|
130
|
-
],
|
131
|
-
|
132
|
-
positive_rules: [
|
133
|
-
['[\.!?]', '[0-9A-Z]'],
|
134
|
-
['[:]', '[0-9]'],
|
135
|
-
['[:]', '[A-Z][a-z]']
|
136
|
-
],
|
137
|
-
|
138
|
-
negative_rules: []
|
139
|
-
}
|
140
|
-
|
141
|
-
sentencer = TextSentencer.new
|
142
|
-
|
143
|
-
text = ''
|
144
|
-
ARGF.each do |line|
|
145
|
-
text += line
|
146
|
-
end
|
147
|
-
|
148
|
-
sen_so = sentencer.annotate(text)
|
149
|
-
p(sen_so)
|
6
|
+
## default rules
|
7
|
+
|
8
|
+
DEFAULT_RULES = {
|
9
|
+
# Every position of a newline character is always treated as a sentence break.
|
10
|
+
break_pattern: "([ \t]*\n+)+[ \t]*",
|
11
|
+
|
12
|
+
# All the positions of space and tab characters are candidates for a sentence break.
|
13
|
+
candidate_pattern: "[ \t]+",
|
14
|
+
|
15
|
+
# First, positive rules are applied to the break candidates to make initial segmentations.
|
16
|
+
positive_rules: [
|
17
|
+
['[.!?]', '[0-9A-Z]'],
|
18
|
+
['[:]', '[0-9]'],
|
19
|
+
['[:]', '[A-Z][a-z]']
|
20
|
+
],
|
21
|
+
|
22
|
+
# Then, negative rules are applied to cancel some initial segmentations.
|
23
|
+
negative_rules: [
|
24
|
+
# Titles before names
|
25
|
+
['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
|
26
|
+
|
27
|
+
# Titles usually before names, but ..
|
28
|
+
['(Sr|Jr)\.', '[A-Z][a-z]'],
|
29
|
+
|
30
|
+
# Single letter abbreviations, e.g. middle name
|
31
|
+
# ['\b[A-Z]\.', '[A-Z][a-z]'],
|
32
|
+
|
33
|
+
# Abbreviations, e.g. middle name
|
34
|
+
['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
|
35
|
+
|
36
|
+
# Frequent abbreviations that will never appear at the end of a sentence
|
37
|
+
['(cf|vs)\.', ''],
|
38
|
+
['e\.g\.', ''],
|
39
|
+
['i\.e\.', ''],
|
40
|
+
|
41
|
+
# Others
|
42
|
+
['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
|
43
|
+
]
|
44
|
+
}
|
45
|
+
|
46
|
+
def initialize(rules = nil)
|
47
|
+
rules ||= DEFAULT_RULES
|
48
|
+
@rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
|
49
|
+
@rules[:break_pattern] ||= ""
|
50
|
+
@rules[:candidate_pattern] ||= ""
|
51
|
+
@rules[:positive_rules] ||= []
|
52
|
+
@rules[:negative_rules] ||= []
|
53
|
+
end
|
54
|
+
|
55
|
+
def annotate(text)
|
56
|
+
return nil if text.nil?
|
57
|
+
|
58
|
+
sentences = segment(text)
|
59
|
+
denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
|
60
|
+
{text:text, denotations:denotations}
|
61
|
+
end
|
62
|
+
|
63
|
+
def segment(text)
|
64
|
+
breaks = if @rules[:break_pattern].empty?
|
65
|
+
[]
|
66
|
+
else
|
67
|
+
text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
|
68
|
+
end
|
69
|
+
|
70
|
+
candidates = if @rules[:candidate_pattern].empty?
|
71
|
+
[]
|
72
|
+
else
|
73
|
+
text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
|
74
|
+
end
|
75
|
+
|
76
|
+
# breaks take precedence
|
77
|
+
candidates -= breaks
|
78
|
+
|
79
|
+
candidates.each do |c|
|
80
|
+
last_end, next_begin = c
|
81
|
+
|
82
|
+
if (last_end == 0) || (next_begin == text.length)
|
83
|
+
breaks << c
|
84
|
+
next
|
85
|
+
end
|
86
|
+
|
87
|
+
last_text = text[0...last_end]
|
88
|
+
next_text = text[next_begin..-1]
|
89
|
+
|
90
|
+
@rules[:positive_rules].each do |p|
|
91
|
+
if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
|
92
|
+
break_p = true
|
93
|
+
@rules[:negative_rules].each do |n|
|
94
|
+
if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
|
95
|
+
break_p = false
|
96
|
+
break
|
97
|
+
end
|
98
|
+
end
|
99
|
+
breaks << c if break_p
|
100
|
+
break
|
101
|
+
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
105
|
+
breaks.sort!
|
106
|
+
|
107
|
+
sentences = []
|
108
|
+
lastbreak = 0
|
109
|
+
breaks.each do |b|
|
110
|
+
sentences << [lastbreak, b[0]] if b[0] > lastbreak
|
111
|
+
lastbreak = b[1]
|
112
|
+
end
|
113
|
+
sentences << [lastbreak, text.length] if lastbreak < text.length
|
114
|
+
|
115
|
+
sentences
|
116
|
+
end
|
150
117
|
end
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_sentencer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-17 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: TextSentencer is a simple rule-based system for segmenting
|
14
|
-
|
13
|
+
description: TextSentencer is a simple rule-based system for segmenting text into
|
14
|
+
sentences.
|
15
15
|
email: jindong.kim@gmail.com
|
16
16
|
executables:
|
17
17
|
- text_sentencer
|
@@ -20,8 +20,9 @@ extra_rdoc_files: []
|
|
20
20
|
files:
|
21
21
|
- bin/text_sentencer
|
22
22
|
- lib/text_sentencer.rb
|
23
|
+
- lib/text_sentencer/string_scan_offset.rb
|
23
24
|
- lib/text_sentencer/text_sentencer.rb
|
24
|
-
homepage: http://
|
25
|
+
homepage: http://bionlp.dbcls.jp/text_sentencer
|
25
26
|
licenses:
|
26
27
|
- MIT
|
27
28
|
metadata: {}
|
@@ -40,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
40
41
|
- !ruby/object:Gem::Version
|
41
42
|
version: '0'
|
42
43
|
requirements: []
|
43
|
-
|
44
|
-
rubygems_version: 2.4.8
|
44
|
+
rubygems_version: 3.2.3
|
45
45
|
signing_key:
|
46
46
|
specification_version: 4
|
47
47
|
summary: A simple, rule-based script to find sentence boundaries in text.
|