text_sentencer 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/text_sentencer +5 -4
- data/lib/text_sentencer/text_sentencer.rb +54 -80
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7b479744fc24ea87a1629ab886c39a16900d7ba8
|
4
|
+
data.tar.gz: 82a7d16ab9aff0a44fabfbc1b57a6367d9c7a979
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1573ea7f89f0a96d37de04724792c18f0f34f646f0e6fc005605c521903fb49cf6890dd9826fcf6e1a229ace0a8369fb2c36a61b0fe5444874d06fc1323a36b2
|
7
|
+
data.tar.gz: 7154af7ce5f9ce392b926b448f82e0b035345559c206cc0ca13d65204235df22e20c197668f71f61bf005b9a26d24f1697799842a577666dba87a6e41bd8c227
|
data/bin/text_sentencer
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'json'
|
2
3
|
require 'text_sentencer'
|
3
4
|
|
4
5
|
config_filename = nil
|
@@ -8,8 +9,8 @@ require 'optparse'
|
|
8
9
|
optparse = OptionParser.new do |opts|
|
9
10
|
opts.banner = "Usage: text_sentencer [options]"
|
10
11
|
|
11
|
-
opts.on('-c', '--config', 'specifies the configuration JSON file.') do |
|
12
|
-
config_filename =
|
12
|
+
opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
|
13
|
+
config_filename = c
|
13
14
|
end
|
14
15
|
|
15
16
|
opts.on('-h', '--help', 'displays this screen.') do
|
@@ -20,8 +21,8 @@ end
|
|
20
21
|
|
21
22
|
optparse.parse!
|
22
23
|
|
23
|
-
config = if config_filename
|
24
|
-
JSON.parse File.read(config_filename)
|
24
|
+
config = if config_filename && File.file?(config_filename)
|
25
|
+
JSON.parse File.read(config_filename)
|
25
26
|
end
|
26
27
|
|
27
28
|
sentencer = TextSentencer.new(config)
|
@@ -1,54 +1,58 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
+
require 'pp'
|
2
3
|
|
3
4
|
class TextSentencer
|
4
5
|
## default rules
|
5
6
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
@
|
51
|
-
@
|
7
|
+
DEFAULT_RULES = {
|
8
|
+
# Every newline character always forces a sentence break.
|
9
|
+
break_characters: [
|
10
|
+
"\n"
|
11
|
+
],
|
12
|
+
|
13
|
+
# All the positions of space and tab characters are candidates for a sentence break.
|
14
|
+
break_candidates: [
|
15
|
+
" ", "\t"
|
16
|
+
],
|
17
|
+
|
18
|
+
# First, positive rules are applied to the break candidates to make initial segmentations.
|
19
|
+
positive_rules: [
|
20
|
+
['[.!?]', '[0-9A-Z]'],
|
21
|
+
['[:]', '[0-9]'],
|
22
|
+
['[:]', '[A-Z][a-z]']
|
23
|
+
],
|
24
|
+
|
25
|
+
# Then, negative rules are applied to cancel some initial segmentations.
|
26
|
+
negative_rules: [
|
27
|
+
# Titles before names
|
28
|
+
['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
|
29
|
+
|
30
|
+
# Titles usually before names, but ..
|
31
|
+
['(Sr|Jr)\.', '[A-Z][a-z]'],
|
32
|
+
|
33
|
+
# Single letter abbreviations, e.g. middle name
|
34
|
+
# ['\b[A-Z]\.', '[A-Z][a-z]'],
|
35
|
+
|
36
|
+
# Abbreviations, e.g. middle name
|
37
|
+
['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
|
38
|
+
|
39
|
+
# Frequent abbreviations that will never appear at the end of a sentence
|
40
|
+
['(cf|vs)\.', ''],
|
41
|
+
['e\.g\.', ''],
|
42
|
+
['i\.e\.', ''],
|
43
|
+
|
44
|
+
# Others
|
45
|
+
['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
|
46
|
+
]
|
47
|
+
}
|
48
|
+
|
49
|
+
def initialize(rules = nil)
|
50
|
+
rules ||= DEFAULT_RULES
|
51
|
+
@rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
|
52
|
+
@rules[:break_characters] ||= []
|
53
|
+
@rules[:break_candidates] ||= []
|
54
|
+
@rules[:positive_rules] ||= []
|
55
|
+
@rules[:negative_rules] ||= []
|
52
56
|
end
|
53
57
|
|
54
58
|
def annotate(text)
|
@@ -77,21 +81,21 @@ class TextSentencer
|
|
77
81
|
for l in 0..text.length
|
78
82
|
|
79
83
|
## apply the positive rules to the places of break candidates
|
80
|
-
if @break_candidates.include?(text[l])
|
81
|
-
|
84
|
+
if @rules[:break_candidates].include?(text[l])
|
85
|
+
@rules[:positive_rules].each do |r|
|
82
86
|
if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
|
83
87
|
pbreaks << l
|
84
88
|
break
|
85
89
|
end
|
86
90
|
end
|
87
|
-
elsif @break_characters.include?(text[l])
|
91
|
+
elsif @rules[:break_characters].include?(text[l])
|
88
92
|
breaks << l
|
89
93
|
end
|
90
94
|
end
|
91
95
|
|
92
96
|
## apply the negative rules to the places of break candidates
|
93
97
|
pbreaks.each do |l|
|
94
|
-
|
98
|
+
@rules[:negative_rules].each do |r|
|
95
99
|
if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
|
96
100
|
nbreaks << l
|
97
101
|
break
|
@@ -118,33 +122,3 @@ class TextSentencer
|
|
118
122
|
sentences
|
119
123
|
end
|
120
124
|
end
|
121
|
-
|
122
|
-
if __FILE__ == $0
|
123
|
-
rules = {
|
124
|
-
break_candidates: [
|
125
|
-
" ", "\t"
|
126
|
-
],
|
127
|
-
|
128
|
-
break_characters: [
|
129
|
-
"\n"
|
130
|
-
],
|
131
|
-
|
132
|
-
positive_rules: [
|
133
|
-
['[\.!?]', '[0-9A-Z]'],
|
134
|
-
['[:]', '[0-9]'],
|
135
|
-
['[:]', '[A-Z][a-z]']
|
136
|
-
],
|
137
|
-
|
138
|
-
negative_rules: []
|
139
|
-
}
|
140
|
-
|
141
|
-
sentencer = TextSentencer.new
|
142
|
-
|
143
|
-
text = ''
|
144
|
-
ARGF.each do |line|
|
145
|
-
text += line
|
146
|
-
end
|
147
|
-
|
148
|
-
sen_so = sentencer.annotate(text)
|
149
|
-
p(sen_so)
|
150
|
-
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_sentencer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: TextSentencer is a simple rule-based system for segmenting a text block
|
14
14
|
into sentences.
|
@@ -21,7 +21,7 @@ files:
|
|
21
21
|
- bin/text_sentencer
|
22
22
|
- lib/text_sentencer.rb
|
23
23
|
- lib/text_sentencer/text_sentencer.rb
|
24
|
-
homepage:
|
24
|
+
homepage: https://github.com/jdkim/text_sentencer
|
25
25
|
licenses:
|
26
26
|
- MIT
|
27
27
|
metadata: {}
|