text_sentencer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/text_sentencer.rb +1 -0
- data/lib/text_sentencer/rules.rb +33 -0
- data/lib/text_sentencer/text_sentencer.rb +69 -0
- metadata +47 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1ec3b6b9d2b3596397c952d6e905c75f4667a73d
|
4
|
+
data.tar.gz: f094203047168fae9a682d1344904c301c9b62ae
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1142ded7a5be0e72cb840f8f0f6b294196b4718f1e793935304068c6dd4763eedcd55db26ed70e24cb7477d5b3f3fcb24f4c0c57b9a6751a325e4a26c429d553
|
7
|
+
data.tar.gz: 44ef4ff3c18c18623b340a32331153da3f24e3e992f7b481d682d05ec1778e05f75f85ce1992182a8c6cc383d28a3e14e0d553d3f2df20853928400da73a13a0
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'text_sentencer/text_sentencer'
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module TextSentencer; end unless defined? TextSentencer
|
2
|
+
|
3
|
+
# Every whitespace position is a candidate sentence boundary.
|
4
|
+
|
5
|
+
# First, positive rules are applied to make the initial segmentations.
|
6
|
+
TextSentencer::POSITIVE_RULES = [
|
7
|
+
['[\.!?]', '[0-9A-Z]'],
|
8
|
+
['[:]', '[0-9]'],
|
9
|
+
['[:]', '[A-Z][a-z]']
|
10
|
+
]
|
11
|
+
|
12
|
+
# Then, negative rules are applied to cancel some initial segmentations.
|
13
|
+
TextSentencer::NEGATIVE_RULES = [
|
14
|
+
# Titles before names
|
15
|
+
['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
|
16
|
+
|
17
|
+
# Titles usually before names, but ..
|
18
|
+
['(Sr|Jr)\.', '[A-Z][a-z]'],
|
19
|
+
|
20
|
+
# Single letter abbreviations, e.g. middle name
|
21
|
+
# ['\b[A-Z]\.', '[A-Z][a-z]'],
|
22
|
+
|
23
|
+
# Abbreviations starting with a capital letter, e.g. a middle initial
|
24
|
+
['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
|
25
|
+
|
26
|
+
# Frequent abbreviations that will never appear in the end of a sentence
|
27
|
+
['(cf|vs)\.', ''],
|
28
|
+
['e\.g\.', ''],
|
29
|
+
['i\.e\.', ''],
|
30
|
+
|
31
|
+
# Others
|
32
|
+
['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
|
33
|
+
]
|
@@ -0,0 +1,69 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'text_sentencer/rules'
|
3
|
+
|
4
|
+
module TextSentencer; end unless defined? TextSentencer
|
5
|
+
|
6
|
+
module TextSentencer
  # Finds the sentences of +text+ and returns their character offsets.
  #
  # Every newline is a sentence break. A space is a break when one of
  # POSITIVE_RULES matches around it and none of NEGATIVE_RULES does. Each
  # rule is a pair of regexp sources: the first must match at the end of the
  # text before the space, the second at the start of the text after it.
  #
  # @param text [String] the text to segment
  # @return [Array<Array(Integer, Integer)>] one [begin, end] pair per
  #   sentence, as character offsets into the original (unstripped) +text+;
  #   +end+ is exclusive, so <tt>text[b...e]</tt> is the sentence
  def self.segment(text)
    stripped = text.strip
    # Offset of the stripped text inside the original, used to shift results.
    offset = text.index(stripped)

    # Compile each rule once per call instead of once per rule per space.
    positive = compile_rules(POSITIVE_RULES)
    negative = compile_rules(NEGATIVE_RULES)

    breaks = []
    stripped.each_char.with_index do |char, pos|
      case char
      when "\n"
        # Newlines always split and are never cancelled by negative rules.
        breaks << pos
      when ' '
        before = stripped[0...pos]
        after = stripped[(pos + 1)..-1]
        next unless rule_matches?(positive, before, after)

        breaks << pos unless rule_matches?(negative, before, after)
      end
    end

    # Turn break positions into [begin, end) spans over the stripped text.
    spans = []
    from = 0
    breaks.each do |brk|
      spans << [from, brk]
      from = brk + 1
    end
    spans << [from, stripped.length]

    # Drop segments without any letter or digit. [[:alnum:]] is used rather
    # than [a-zA-Z0-9] so that sentences written purely in non-ASCII scripts
    # are not discarded as "empty".
    spans
      .select { |b, e| stripped[b...e] =~ /[[:alnum:]]/ }
      .map { |b, e| [b + offset, e + offset] }
  end

  # Compiles [before, after] regexp-source pairs into anchored Regexp pairs.
  def self.compile_rules(rules)
    rules.map { |before, after| [/#{before}\Z/, /\A#{after}/] }
  end

  # True when any compiled rule matches both sides of a candidate boundary.
  def self.rule_matches?(rules, before, after)
    rules.any? { |left, right| before =~ left && after =~ right }
  end

  private_class_method :compile_rules, :rule_matches?
end
|
60
|
+
|
61
|
+
# Command-line entry point: reads all input from ARGF (files named on the
# command line, or standard input), segments it, and prints the resulting
# list of [begin, end] offset pairs.
if __FILE__ == $0
  text = ''
  # Append in place; `text += line` would allocate a new string per line.
  ARGF.each do |line|
    text << line
  end

  sen_so = TextSentencer.segment(text)
  p(sen_so)
end
|
metadata
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_sentencer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jin-Dong Kim
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-04-11 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Text sentencer finds sentence boundaries of a given text. It is a simple,
|
14
|
+
rule-based system.
|
15
|
+
email: jindong.kim@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/text_sentencer.rb
|
21
|
+
- lib/text_sentencer/rules.rb
|
22
|
+
- lib/text_sentencer/text_sentencer.rb
|
23
|
+
homepage: http://rubygems.org/gems/text_sentencer
|
24
|
+
licenses:
|
25
|
+
- MIT
|
26
|
+
metadata: {}
|
27
|
+
post_install_message:
|
28
|
+
rdoc_options: []
|
29
|
+
require_paths:
|
30
|
+
- lib
|
31
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
32
|
+
requirements:
|
33
|
+
- - ">="
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
requirements: []
|
42
|
+
rubyforge_project:
|
43
|
+
rubygems_version: 2.2.2
|
44
|
+
signing_key:
|
45
|
+
specification_version: 4
|
46
|
+
summary: To find sentences in text.
|
47
|
+
test_files: []
|