text_sentencer 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/text_sentencer +47 -0
- data/lib/text_sentencer/text_sentencer.rb +101 -20
- metadata +9 -8
- data/lib/text_sentencer/rules.rb +0 -33
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
|
4
|
+
data.tar.gz: 0255396f5d925c06023111bca178157b56765586
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
|
7
|
+
data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
|
data/bin/text_sentencer
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'text_sentencer'
|
3
|
+
|
4
|
+
config_filename = nil
|
5
|
+
|
6
|
+
## command line option processing
|
7
|
+
require 'optparse'
|
8
|
+
optparse = OptionParser.new do |opts|
|
9
|
+
opts.banner = "Usage: text_sentencer [options]"
|
10
|
+
|
11
|
+
opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
|
12
|
+
config_filename = f
|
13
|
+
end
|
14
|
+
|
15
|
+
opts.on('-h', '--help', 'displays this screen.') do
|
16
|
+
puts opts
|
17
|
+
exit
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
optparse.parse!
|
22
|
+
|
23
|
+
config = if config_filename
|
24
|
+
JSON.parse File.read(config_filename) if File.file?(config_filename)
|
25
|
+
end
|
26
|
+
|
27
|
+
sentencer = TextSentencer.new(config)
|
28
|
+
|
29
|
+
text = ARGF.read
|
30
|
+
|
31
|
+
## Preprocessing
|
32
|
+
# It should be removed later
|
33
|
+
text.gsub!(/ +/, ' ')
|
34
|
+
text.gsub!(/\n+/, "\n")
|
35
|
+
text.gsub!(/\t+/, "\t")
|
36
|
+
text.gsub!(/\n /, "\n")
|
37
|
+
text.gsub!(/ \n/, "\n")
|
38
|
+
text.gsub!(/\t /, "\t")
|
39
|
+
text.gsub!(/ \t/, "\t")
|
40
|
+
text.gsub!(/\n\t/, "\n")
|
41
|
+
text.gsub!(/\t\n/, "\n")
|
42
|
+
|
43
|
+
annotations = sentencer.annotate(text)
|
44
|
+
annotations[:denotations].each do |d|
|
45
|
+
span = d[:span]
|
46
|
+
puts text[span[:begin]...span[:end]]
|
47
|
+
end
|
@@ -1,44 +1,105 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
require 'text_sentencer/rules'
|
3
2
|
|
4
|
-
|
3
|
+
class TextSentencer
|
4
|
+
## default rules
|
5
5
|
|
6
|
-
|
7
|
-
|
6
|
+
# All the positions of space and tab characters are candidates for a sentence break.
|
7
|
+
BREAK_CANDIDATES = [
|
8
|
+
" ", "\t"
|
9
|
+
]
|
10
|
+
|
11
|
+
# Every position of a new line character is always a sentence break.
|
12
|
+
BREAK_CHARACTERS = [
|
13
|
+
"\n"
|
14
|
+
]
|
15
|
+
|
16
|
+
# First, positive rules are applied to the break candidates to make initial segmentations.
|
17
|
+
POSITIVE_RULES = [
|
18
|
+
['[\.!?]', '[0-9A-Z]'],
|
19
|
+
['[:]', '[0-9]'],
|
20
|
+
['[:]', '[A-Z][a-z]']
|
21
|
+
]
|
22
|
+
|
23
|
+
# Then, negative rules are applied to cancel some initial segmentations.
|
24
|
+
NEGATIVE_RULES = [
|
25
|
+
# Titles before names
|
26
|
+
['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
|
27
|
+
|
28
|
+
# Titles usually before names, but ..
|
29
|
+
['(Sr|Jr)\.', '[A-Z][a-z]'],
|
30
|
+
|
31
|
+
# Single letter abbreviations, e.g. middle name
|
32
|
+
# ['\b[A-Z]\.', '[A-Z][a-z]'],
|
33
|
+
|
34
|
+
# Abbreviations, e.g. middle name
|
35
|
+
['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
|
36
|
+
|
37
|
+
# Frequent abbreviations that will never appear at the end of a sentence
|
38
|
+
['(cf|vs)\.', ''],
|
39
|
+
['e\.g\.', ''],
|
40
|
+
['i\.e\.', ''],
|
41
|
+
|
42
|
+
# Others
|
43
|
+
['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
|
44
|
+
]
|
45
|
+
|
46
|
+
def initialize(rules = {})
|
47
|
+
rules ||= {}
|
48
|
+
@break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
|
49
|
+
@break_characters = rules[:break_characters] || BREAK_CHARACTERS
|
50
|
+
@positive_rules = rules[:positive_rules] || POSITIVE_RULES
|
51
|
+
@negative_rules = rules[:negative_rules] || NEGATIVE_RULES
|
52
|
+
end
|
53
|
+
|
54
|
+
def annotate(text)
|
55
|
+
return nil if text.nil? || text.empty?
|
56
|
+
sentences = segment(text)
|
57
|
+
denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
|
58
|
+
denotations.empty? ? {text:text} : {text:text, denotations:denotations}
|
59
|
+
end
|
60
|
+
|
61
|
+
private
|
62
|
+
|
63
|
+
def segment(text)
|
8
64
|
original_text = text
|
9
65
|
text = original_text.strip
|
10
66
|
start = original_text.index(text)
|
11
67
|
|
12
|
-
|
13
|
-
|
68
|
+
# sentence breaks
|
69
|
+
breaks = []
|
70
|
+
|
71
|
+
# breaks by positive rules
|
72
|
+
pbreaks = []
|
73
|
+
|
74
|
+
# canceled breaks by negative rules
|
75
|
+
nbreaks = []
|
76
|
+
|
14
77
|
for l in 0..text.length
|
15
78
|
|
16
|
-
|
17
|
-
|
79
|
+
## apply the positive rules to the places of break candidates
|
80
|
+
if @break_candidates.include?(text[l])
|
18
81
|
POSITIVE_RULES.each do |r|
|
19
82
|
if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
|
20
83
|
pbreaks << l
|
21
84
|
break
|
22
85
|
end
|
23
86
|
end
|
24
|
-
|
25
|
-
|
87
|
+
elsif @break_characters.include?(text[l])
|
88
|
+
breaks << l
|
26
89
|
end
|
27
90
|
end
|
28
91
|
|
29
|
-
## apply the negative rules to the places of
|
30
|
-
nbreaks = [] # breaks by negative rules
|
92
|
+
## apply the negative rules to the places of break candidates
|
31
93
|
pbreaks.each do |l|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
break
|
37
|
-
end
|
94
|
+
NEGATIVE_RULES.each do |r|
|
95
|
+
if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
|
96
|
+
nbreaks << l
|
97
|
+
break
|
38
98
|
end
|
39
99
|
end
|
40
100
|
end
|
41
|
-
breaks
|
101
|
+
breaks += pbreaks - nbreaks
|
102
|
+
breaks.sort!
|
42
103
|
|
43
104
|
sentences = []
|
44
105
|
lastbreak = -1
|
@@ -59,11 +120,31 @@ module TextSentencer
|
|
59
120
|
end
|
60
121
|
|
61
122
|
if __FILE__ == $0
|
123
|
+
rules = {
|
124
|
+
break_candidates: [
|
125
|
+
" ", "\t"
|
126
|
+
],
|
127
|
+
|
128
|
+
break_characters: [
|
129
|
+
"\n"
|
130
|
+
],
|
131
|
+
|
132
|
+
positive_rules: [
|
133
|
+
['[\.!?]', '[0-9A-Z]'],
|
134
|
+
['[:]', '[0-9]'],
|
135
|
+
['[:]', '[A-Z][a-z]']
|
136
|
+
],
|
137
|
+
|
138
|
+
negative_rules: []
|
139
|
+
}
|
140
|
+
|
141
|
+
sentencer = TextSentencer.new
|
142
|
+
|
62
143
|
text = ''
|
63
144
|
ARGF.each do |line|
|
64
145
|
text += line
|
65
146
|
end
|
66
147
|
|
67
|
-
sen_so =
|
148
|
+
sen_so = sentencer.annotate(text)
|
68
149
|
p(sen_so)
|
69
150
|
end
|
metadata
CHANGED
@@ -1,24 +1,25 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_sentencer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-07-20 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
14
|
-
|
13
|
+
description: TextSentencer is a simple rule-based system for segmenting a text block
|
14
|
+
into sentences.
|
15
15
|
email: jindong.kim@gmail.com
|
16
|
-
executables:
|
16
|
+
executables:
|
17
|
+
- text_sentencer
|
17
18
|
extensions: []
|
18
19
|
extra_rdoc_files: []
|
19
20
|
files:
|
21
|
+
- bin/text_sentencer
|
20
22
|
- lib/text_sentencer.rb
|
21
|
-
- lib/text_sentencer/rules.rb
|
22
23
|
- lib/text_sentencer/text_sentencer.rb
|
23
24
|
homepage: http://rubygems.org/gems/text_sentencer
|
24
25
|
licenses:
|
@@ -40,8 +41,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
40
41
|
version: '0'
|
41
42
|
requirements: []
|
42
43
|
rubyforge_project:
|
43
|
-
rubygems_version: 2.
|
44
|
+
rubygems_version: 2.4.8
|
44
45
|
signing_key:
|
45
46
|
specification_version: 4
|
46
|
-
summary:
|
47
|
+
summary: A simple, rule-based script to find sentence boundaries in text.
|
47
48
|
test_files: []
|
data/lib/text_sentencer/rules.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
module TextSentencer; end unless defined? TextSentencer
|
2
|
-
|
3
|
-
# All the positions of whitespace characters are candidates for a sentence boundary.
|
4
|
-
|
5
|
-
# First, positive rules are applied to make initial segmentations.
|
6
|
-
TextSentencer::POSITIVE_RULES = [
|
7
|
-
['[\.!?]', '[0-9A-Z]'],
|
8
|
-
['[:]', '[0-9]'],
|
9
|
-
['[:]', '[A-Z][a-z]']
|
10
|
-
]
|
11
|
-
|
12
|
-
# Then, negative rules are applied to cancel some initial segmentations.
|
13
|
-
TextSentencer::NEGATIVE_RULES = [
|
14
|
-
# Titles before names
|
15
|
-
['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
|
16
|
-
|
17
|
-
# Titles usually before names, but ..
|
18
|
-
['(Sr|Jr)\.', '[A-Z][a-z]'],
|
19
|
-
|
20
|
-
# Single letter abbreviations, e.g. middle name
|
21
|
-
# ['\b[A-Z]\.', '[A-Z][a-z]'],
|
22
|
-
|
23
|
-
# Abbreviations, e.g. middle name
|
24
|
-
['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
|
25
|
-
|
26
|
-
# Frequent abbreviations that will never appear in the end of a sentence
|
27
|
-
['(cf|vs)\.', ''],
|
28
|
-
['e\.g\.', ''],
|
29
|
-
['i\.e\.', ''],
|
30
|
-
|
31
|
-
# Others
|
32
|
-
['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
|
33
|
-
]
|