text_sentencer 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1ec3b6b9d2b3596397c952d6e905c75f4667a73d
4
- data.tar.gz: f094203047168fae9a682d1344904c301c9b62ae
3
+ metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
4
+ data.tar.gz: 0255396f5d925c06023111bca178157b56765586
5
5
  SHA512:
6
- metadata.gz: 1142ded7a5be0e72cb840f8f0f6b294196b4718f1e793935304068c6dd4763eedcd55db26ed70e24cb7477d5b3f3fcb24f4c0c57b9a6751a325e4a26c429d553
7
- data.tar.gz: 44ef4ff3c18c18623b340a32331153da3f24e3e992f7b481d682d05ec1778e05f75f85ce1992182a8c6cc383d28a3e14e0d553d3f2df20853928400da73a13a0
6
+ metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
7
+ data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_sentencer'
3
+
4
+ config_filename = nil
5
+
6
+ ## command line option processing
7
+ require 'optparse'
8
+ optparse = OptionParser.new do |opts|
9
+ opts.banner = "Usage: text_sentencer [options]"
10
+
11
+ opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
12
+ config_filename = f
13
+ end
14
+
15
+ opts.on('-h', '--help', 'displays this screen.') do
16
+ puts opts
17
+ exit
18
+ end
19
+ end
20
+
21
+ optparse.parse!
22
+
23
+ config = if config_filename
24
+ JSON.parse File.read(config_filename) if File.file?(config_filename)
25
+ end
26
+
27
+ sentencer = TextSentencer.new(config)
28
+
29
+ text = ARGF.read
30
+
31
+ ## Preprocessing
32
+ # It should be removed later
33
+ text.gsub!(/ +/, ' ')
34
+ text.gsub!(/\n+/, "\n")
35
+ text.gsub!(/\t+/, "\t")
36
+ text.gsub!(/\n /, "\n")
37
+ text.gsub!(/ \n/, "\n")
38
+ text.gsub!(/\t /, "\t")
39
+ text.gsub!(/ \t/, "\t")
40
+ text.gsub!(/\n\t/, "\n")
41
+ text.gsub!(/\t\n/, "\n")
42
+
43
+ annotations = sentencer.annotate(text)
44
+ annotations[:denotations].each do |d|
45
+ span = d[:span]
46
+ puts text[span[:begin]...span[:end]]
47
+ end
@@ -1,44 +1,105 @@
1
1
  #!/usr/bin/env ruby
2
- require 'text_sentencer/rules'
3
2
 
4
- module TextSentencer; end unless defined? TextSentencer
3
+ class TextSentencer
4
+ ## default rules
5
5
 
6
- module TextSentencer
7
- def TextSentencer.segment(text)
6
+ # All the positions of space and tab characters are candidates for sentence breaks.
7
+ BREAK_CANDIDATES = [
8
+ " ", "\t"
9
+ ]
10
+
11
+ # All the positions of newline characters are always treated as sentence breaks.
12
+ BREAK_CHARACTERS = [
13
+ "\n"
14
+ ]
15
+
16
+ # First, positive rules are applied to the break candidates to make initial segmentations.
17
+ POSITIVE_RULES = [
18
+ ['[\.!?]', '[0-9A-Z]'],
19
+ ['[:]', '[0-9]'],
20
+ ['[:]', '[A-Z][a-z]']
21
+ ]
22
+
23
+ # Then, negative rules are applied to cancel some initial segmentations.
24
+ NEGATIVE_RULES = [
25
+ # Titles before names
26
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
27
+
28
+ # Titles usually before names, but ..
29
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
30
+
31
+ # Single letter abbreviations, e.g. middle name
32
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
33
+
34
+ # Abbreviations, e.g. middle name
35
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
36
+
37
+ # Frequent abbreviations that will never appear at the end of a sentence
38
+ ['(cf|vs)\.', ''],
39
+ ['e\.g\.', ''],
40
+ ['i\.e\.', ''],
41
+
42
+ # Others
43
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
44
+ ]
45
+
46
+ def initialize(rules = {})
47
+ rules ||= {}
48
+ @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
49
+ @break_characters = rules[:break_characters] || BREAK_CHARACTERS
50
+ @positive_rules = rules[:positive_rules] || POSITIVE_RULES
51
+ @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
52
+ end
53
+
54
+ def annotate(text)
55
+ return nil if text.nil? || text.empty?
56
+ sentences = segment(text)
57
+ denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
+ denotations.empty? ? {text:text} : {text:text, denotations:denotations}
59
+ end
60
+
61
+ private
62
+
63
+ def segment(text)
8
64
  original_text = text
9
65
  text = original_text.strip
10
66
  start = original_text.index(text)
11
67
 
12
- ## apply the positive rules to the places of space and newline characters
13
- pbreaks = [] # breaks by positive rules
68
+ # sentence breaks
69
+ breaks = []
70
+
71
+ # breaks by positive rules
72
+ pbreaks = []
73
+
74
+ # canceled breaks by negative rules
75
+ nbreaks = []
76
+
14
77
  for l in 0..text.length
15
78
 
16
- case text[l]
17
- when ' ' # space
79
+ ## apply the positive rules to the places of break candidates
80
+ if @break_candidates.include?(text[l])
18
81
  POSITIVE_RULES.each do |r|
19
82
  if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
20
83
  pbreaks << l
21
84
  break
22
85
  end
23
86
  end
24
- when "\n" # newline
25
- pbreaks << l
87
+ elsif @break_characters.include?(text[l])
88
+ breaks << l
26
89
  end
27
90
  end
28
91
 
29
- ## apply the negative rules to the places of space characters
30
- nbreaks = [] # breaks by negative rules
92
+ ## apply the negative rules to the places of break candidates
31
93
  pbreaks.each do |l|
32
- if text[l] == ' '
33
- NEGATIVE_RULES.each do |r|
34
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
35
- nbreaks << l
36
- break
37
- end
94
+ NEGATIVE_RULES.each do |r|
95
+ if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
96
+ nbreaks << l
97
+ break
38
98
  end
39
99
  end
40
100
  end
41
- breaks = pbreaks - nbreaks
101
+ breaks += pbreaks - nbreaks
102
+ breaks.sort!
42
103
 
43
104
  sentences = []
44
105
  lastbreak = -1
@@ -59,11 +120,31 @@ module TextSentencer
59
120
  end
60
121
 
61
122
  if __FILE__ == $0
123
+ rules = {
124
+ break_candidates: [
125
+ " ", "\t"
126
+ ],
127
+
128
+ break_characters: [
129
+ "\n"
130
+ ],
131
+
132
+ positive_rules: [
133
+ ['[\.!?]', '[0-9A-Z]'],
134
+ ['[:]', '[0-9]'],
135
+ ['[:]', '[A-Z][a-z]']
136
+ ],
137
+
138
+ negative_rules: []
139
+ }
140
+
141
+ sentencer = TextSentencer.new
142
+
62
143
  text = ''
63
144
  ARGF.each do |line|
64
145
  text += line
65
146
  end
66
147
 
67
- sen_so = TextSentencer.segment(text)
148
+ sen_so = sentencer.annotate(text)
68
149
  p(sen_so)
69
150
  end
metadata CHANGED
@@ -1,24 +1,25 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-11 00:00:00.000000000 Z
11
+ date: 2017-07-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Text sentencer finds sentence boundaries of a given text. It is a simple,
14
- rule-based system.
13
+ description: TextSentencer is a simple rule-based system for segmenting a text block
14
+ into sentences.
15
15
  email: jindong.kim@gmail.com
16
- executables: []
16
+ executables:
17
+ - text_sentencer
17
18
  extensions: []
18
19
  extra_rdoc_files: []
19
20
  files:
21
+ - bin/text_sentencer
20
22
  - lib/text_sentencer.rb
21
- - lib/text_sentencer/rules.rb
22
23
  - lib/text_sentencer/text_sentencer.rb
23
24
  homepage: http://rubygems.org/gems/text_sentencer
24
25
  licenses:
@@ -40,8 +41,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
40
41
  version: '0'
41
42
  requirements: []
42
43
  rubyforge_project:
43
- rubygems_version: 2.2.2
44
+ rubygems_version: 2.4.8
44
45
  signing_key:
45
46
  specification_version: 4
46
- summary: To find sentences in text.
47
+ summary: A simple, rule-based script to find sentence boundaries in text.
47
48
  test_files: []
@@ -1,33 +0,0 @@
1
- module TextSentencer; end unless defined? TextSentencer
2
-
3
- # All the positions of whitespace characters are candiate of sentence boundary.
4
-
5
- # First, positive rules are applied to find make initial segmantations.
6
- TextSentencer::POSITIVE_RULES = [
7
- ['[\.!?]', '[0-9A-Z]'],
8
- ['[:]', '[0-9]'],
9
- ['[:]', '[A-Z][a-z]']
10
- ]
11
-
12
- # Then, negative rules are applied to cancel some initial segmentations.
13
- TextSentencer::NEGATIVE_RULES = [
14
- # Titles before names
15
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
16
-
17
- # Titles usually before names, but ..
18
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
19
-
20
- # Single letter abbriveations, e.g. middle name
21
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
22
-
23
- # Abbriveations, e.g. middle name
24
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
25
-
26
- # Frequent abbreviations that will never appear in the end of a sentence
27
- ['(cf|vs)\.', ''],
28
- ['e\.g\.', ''],
29
- ['i\.e\.', ''],
30
-
31
- # Others
32
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
33
- ]