text_sentencer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1ec3b6b9d2b3596397c952d6e905c75f4667a73d
4
+ data.tar.gz: f094203047168fae9a682d1344904c301c9b62ae
5
+ SHA512:
6
+ metadata.gz: 1142ded7a5be0e72cb840f8f0f6b294196b4718f1e793935304068c6dd4763eedcd55db26ed70e24cb7477d5b3f3fcb24f4c0c57b9a6751a325e4a26c429d553
7
+ data.tar.gz: 44ef4ff3c18c18623b340a32331153da3f24e3e992f7b481d682d05ec1778e05f75f85ce1992182a8c6cc383d28a3e14e0d553d3f2df20853928400da73a13a0
@@ -0,0 +1 @@
1
+ require 'text_sentencer/text_sentencer'
@@ -0,0 +1,33 @@
1
+ module TextSentencer; end unless defined? TextSentencer
2
+
3
+ # All the positions of whitespace characters are candidates for sentence boundaries.
4
+
5
+ # First, positive rules are applied to make the initial segmentation. Each rule is a [before, after] pair of regexp sources: a space is a break candidate when the text before it ends with 'before' and the text after it starts with 'after'.
6
+ TextSentencer::POSITIVE_RULES = [
7
+ ['[\.!?]', '[0-9A-Z]'], # sentence-final punctuation followed by a digit or a capital letter
8
+ ['[:]', '[0-9]'], # colon followed by a digit
9
+ ['[:]', '[A-Z][a-z]'] # colon followed by a capital letter and then a lowercase letter
10
+ ]
11
+
12
+ # Then, negative rules are applied to cancel some of the initial segmentations. Each rule is a [before, after] pair of regexp sources: a candidate break at a space is cancelled when the text before it ends with 'before' and the text after it starts with 'after'. An empty 'after' accepts any following text.
13
+ TextSentencer::NEGATIVE_RULES = [
14
+ # Titles before names
15
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
16
+
17
+ # Titles usually before names, but .. (NOTE(review): the original remark trails off; the rule only cancels a break when a capital + lowercase letter follows)
18
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
19
+
20
+ # Single letter abbreviations, e.g. middle name (rule currently disabled)
21
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
22
+
23
+ # Abbreviations, e.g. middle name: any word made of one capital letter plus optional lowercase letters, followed by a period
24
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
25
+
26
+ # Frequent abbreviations that will never appear at the end of a sentence
27
+ ['(cf|vs)\.', ''],
28
+ ['e\.g\.', ''],
29
+ ['i\.e\.', ''],
30
+
31
+ # Others (presumably Section/Chapter/Figure/Equation references) followed by a digit or a capital letter
32
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
33
+ ]
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_sentencer/rules'
3
+
4
+ module TextSentencer; end unless defined? TextSentencer
5
+
6
module TextSentencer
  # Finds the sentences of +text+ and returns their character offsets.
  #
  # Leading and trailing whitespace is ignored. Within the remaining text,
  # every newline is a sentence break. A space is a sentence break when at
  # least one rule of POSITIVE_RULES fires at it and no rule of
  # NEGATIVE_RULES does. A rule is a [before, after] pair of regexp sources;
  # it fires at a space when the text preceding the space ends with +before+
  # and the text following it starts with +after+.
  #
  # Segments that contain no ASCII letter or digit are dropped.
  #
  # @param text [String] the text to segment
  # @return [Array<Array<Integer>>] one [begin, end] pair per sentence, as
  #   character offsets into the original (unstripped) +text+; +end+ is
  #   exclusive
  def TextSentencer.segment(text)
    original_text = text
    text = original_text.strip
    start = original_text.index(text)

    # Compile every rule exactly once per call instead of once per rule per
    # space character. "\Z" is kept (rather than "\z") so that a prefix that
    # ends with a newline still matches, as before.
    compile = lambda do |rules|
      rules.map { |r| [/#{r[0]}\Z/, /\A#{r[1]}/] }
    end
    positive = compile.call(POSITIVE_RULES)
    negative = compile.call(NEGATIVE_RULES)

    # True when any of the compiled +rules+ fires at position +pos+.
    # The prefix and suffix are sliced once per position, not once per rule.
    fires_at = lambda do |rules, pos|
      head = text[0...pos]
      tail = text[pos + 1..-1]
      rules.any? { |before, after| head =~ before && tail =~ after }
    end

    ## apply the positive rules to the places of space and newline characters
    pbreaks = [] # breaks by positive rules
    text.each_char.with_index do |char, pos|
      case char
      when ' ' # space
        pbreaks << pos if fires_at.call(positive, pos)
      when "\n" # newline
        pbreaks << pos
      end
    end

    ## apply the negative rules to the places of space characters
    ## (breaks at newlines are never cancelled)
    breaks = pbreaks.reject do |pos|
      text[pos] == ' ' && fires_at.call(negative, pos)
    end

    ## turn the break positions into [begin, end) spans
    bounds = [-1] + breaks + [text.length]
    sentences = bounds.each_cons(2).map { |prev, nxt| [prev + 1, nxt] }

    ## filter out empty segments
    sentences = sentences.select { |b, e| text[b...e] =~ /[a-zA-Z0-9]/ }

    ## adjust offsets for the whitespace stripped from the beginning of the text
    sentences.map { |b, e| [b + start, e + start] }
  end
end
60
+
61
+ if __FILE__ == $0 # only when this file is executed directly as a script
62
+ text = ''
63
+ ARGF.each do |line| # ARGF: the files named on the command line, or standard input
64
+ text += line
65
+ end
66
+
67
+ sen_so = TextSentencer.segment(text) # array of [begin, end] offsets, one pair per sentence
68
+ p(sen_so)
69
+ end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_sentencer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jin-Dong Kim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-11 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Text sentencer finds sentence boundaries of a given text. It is a simple,
14
+ rule-based system.
15
+ email: jindong.kim@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/text_sentencer.rb
21
+ - lib/text_sentencer/rules.rb
22
+ - lib/text_sentencer/text_sentencer.rb
23
+ homepage: http://rubygems.org/gems/text_sentencer
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 2.2.2
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: To find sentences in text.
47
+ test_files: []