text_sentencer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1ec3b6b9d2b3596397c952d6e905c75f4667a73d
4
+ data.tar.gz: f094203047168fae9a682d1344904c301c9b62ae
5
+ SHA512:
6
+ metadata.gz: 1142ded7a5be0e72cb840f8f0f6b294196b4718f1e793935304068c6dd4763eedcd55db26ed70e24cb7477d5b3f3fcb24f4c0c57b9a6751a325e4a26c429d553
7
+ data.tar.gz: 44ef4ff3c18c18623b340a32331153da3f24e3e992f7b481d682d05ec1778e05f75f85ce1992182a8c6cc383d28a3e14e0d553d3f2df20853928400da73a13a0
@@ -0,0 +1 @@
1
+ require 'text_sentencer/text_sentencer'
@@ -0,0 +1,33 @@
1
module TextSentencer; end unless defined? TextSentencer

# Every whitespace position is a candidate sentence boundary.
#
# Each rule is a pair of regular-expression sources, [left, right]:
# the left pattern is matched against the end of the text preceding the
# whitespace, the right pattern against the beginning of the text following it.
# An empty right pattern matches any following text.

# First, positive rules are applied to make the initial segmentation.
TextSentencer::POSITIVE_RULES = [
  ['[\.!?]', '[0-9A-Z]'],
  ['[:]', '[0-9]'],
  ['[:]', '[A-Z][a-z]']
]

# Then, negative rules are applied to cancel some of the initial boundaries.
TextSentencer::NEGATIVE_RULES = [
  # Titles before names
  ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],

  # Titles usually before names, but ..
  ['(Sr|Jr)\.', '[A-Z][a-z]'],

  # Single-letter abbreviations, e.g. a middle initial
  # ['\b[A-Z]\.', '[A-Z][a-z]'],

  # Capitalized abbreviations, e.g. a middle name
  ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],

  # Frequent abbreviations that never appear at the end of a sentence.
  # The leading \b keeps the rule from firing on ordinary words that merely
  # end in "cf" or "vs" (e.g. "revs."), which would swallow a real boundary.
  ['\b(cf|vs)\.', ''],
  ['e\.g\.', ''],
  ['i\.e\.', ''],

  # Others
  ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
]
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_sentencer/rules'
3
+
4
# Rule-based sentence segmentation.
module TextSentencer
  # Splits +text+ into sentences.
  #
  # Leading and trailing whitespace is ignored. Every newline is a sentence
  # boundary. A space is a boundary when at least one of POSITIVE_RULES
  # matches around it and none of NEGATIVE_RULES does. Segments that contain
  # no letter or digit are discarded.
  #
  # @param text [String] the text to segment
  # @return [Array<Array(Integer, Integer)>] one [begin, end] pair per
  #   sentence; character offsets into the original +text+, end exclusive
  def self.segment(text)
    original_text = text
    text = original_text.strip
    # Offset of the stripped text inside the original, used to map spans back.
    start = original_text.index(text)

    # Compile the rule patterns once per call rather than at every space.
    positive = compile_rules(POSITIVE_RULES)
    negative = compile_rules(NEGATIVE_RULES)

    breaks = []
    text.each_char.with_index do |char, pos|
      case char
      when "\n"
        # Newlines always split, regardless of the rules.
        breaks << pos
      when ' '
        left = text[0...pos]
        right = text[(pos + 1)..-1]
        if rule_applies?(positive, left, right) && !rule_applies?(negative, left, right)
          breaks << pos
        end
      end
    end

    # Turn the boundary positions into [begin, end) spans.
    spans = []
    span_begin = 0
    breaks.each do |pos|
      spans << [span_begin, pos]
      span_begin = pos + 1
    end
    spans << [span_begin, text.length]

    # Drop segments without any letter or digit. [[:alnum:]] is used instead
    # of [a-zA-Z0-9] so that sentences written in non-ASCII scripts are kept.
    spans = spans.select { |b, e| text[b...e] =~ /[[:alnum:]]/ }

    # Shift the offsets so they refer to the original, unstripped text.
    spans.map { |b, e| [b + start, e + start] }
  end

  # Compiles [left, right] pattern sources into anchored Regexp pairs.
  def self.compile_rules(rules)
    rules.map { |left, right| [/#{left}\Z/, /\A#{right}/] }
  end

  # True when some rule matches both the end of +left+ and the start of +right+.
  def self.rule_applies?(rules, left, right)
    rules.any? { |left_re, right_re| left =~ left_re && right =~ right_re }
  end

  private_class_method :compile_rules, :rule_applies?
end
60
+
61
# Command-line entry point: when this file is run directly, read all input
# (the files named in ARGV, or standard input) and print the sentence spans.
if __FILE__ == $0
  # Read everything in one call instead of growing a string with += per line,
  # which allocates a new string on every iteration. to_s guards against a
  # nil result so segment always receives a String.
  text = ARGF.read.to_s

  p(TextSentencer.segment(text))
end
metadata ADDED
@@ -0,0 +1,47 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_sentencer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jin-Dong Kim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-04-11 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Text sentencer finds sentence boundaries of a given text. It is a simple,
14
+ rule-based system.
15
+ email: jindong.kim@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/text_sentencer.rb
21
+ - lib/text_sentencer/rules.rb
22
+ - lib/text_sentencer/text_sentencer.rb
23
+ homepage: http://rubygems.org/gems/text_sentencer
24
+ licenses:
25
+ - MIT
26
+ metadata: {}
27
+ post_install_message:
28
+ rdoc_options: []
29
+ require_paths:
30
+ - lib
31
+ required_ruby_version: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ requirements: []
42
+ rubyforge_project:
43
+ rubygems_version: 2.2.2
44
+ signing_key:
45
+ specification_version: 4
46
+ summary: To find sentences in text.
47
+ test_files: []