text_sentencer 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1ec3b6b9d2b3596397c952d6e905c75f4667a73d
4
- data.tar.gz: f094203047168fae9a682d1344904c301c9b62ae
3
+ metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
4
+ data.tar.gz: 0255396f5d925c06023111bca178157b56765586
5
5
  SHA512:
6
- metadata.gz: 1142ded7a5be0e72cb840f8f0f6b294196b4718f1e793935304068c6dd4763eedcd55db26ed70e24cb7477d5b3f3fcb24f4c0c57b9a6751a325e4a26c429d553
7
- data.tar.gz: 44ef4ff3c18c18623b340a32331153da3f24e3e992f7b481d682d05ec1778e05f75f85ce1992182a8c6cc383d28a3e14e0d553d3f2df20853928400da73a13a0
6
+ metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
7
+ data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ require 'text_sentencer'
3
+
4
+ config_filename = nil
5
+
6
+ ## command line option processing
7
+ require 'optparse'
8
+ optparse = OptionParser.new do |opts|
9
+ opts.banner = "Usage: text_sentencer [options]"
10
+
11
+ opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
12
+ config_filename = f
13
+ end
14
+
15
+ opts.on('-h', '--help', 'displays this screen.') do
16
+ puts opts
17
+ exit
18
+ end
19
+ end
20
+
21
+ optparse.parse!
22
+
23
+ config = if config_filename
24
+ JSON.parse File.read(config_filename) if File.file?(config_filename)
25
+ end
26
+
27
+ sentencer = TextSentencer.new(config)
28
+
29
+ text = ARGF.read
30
+
31
+ ## Preprocessing
32
+ # It should be removed later
33
+ text.gsub!(/ +/, ' ')
34
+ text.gsub!(/\n+/, "\n")
35
+ text.gsub!(/\t+/, "\t")
36
+ text.gsub!(/\n /, "\n")
37
+ text.gsub!(/ \n/, "\n")
38
+ text.gsub!(/\t /, "\t")
39
+ text.gsub!(/ \t/, "\t")
40
+ text.gsub!(/\n\t/, "\n")
41
+ text.gsub!(/\t\n/, "\n")
42
+
43
+ annotations = sentencer.annotate(text)
44
+ annotations[:denotations].each do |d|
45
+ span = d[:span]
46
+ puts text[span[:begin]...span[:end]]
47
+ end
@@ -1,44 +1,105 @@
1
1
  #!/usr/bin/env ruby
2
- require 'text_sentencer/rules'
3
2
 
4
- module TextSentencer; end unless defined? TextSentencer
3
+ class TextSentencer
4
+ ## default rules
5
5
 
6
- module TextSentencer
7
- def TextSentencer.segment(text)
6
+ # All the positions of space and tab characters are candidates for a sentence break.
7
+ BREAK_CANDIDATES = [
8
+ " ", "\t"
9
+ ]
10
+
11
+ # Every position of a newline character is always treated as a sentence break.
12
+ BREAK_CHARACTERS = [
13
+ "\n"
14
+ ]
15
+
16
+ # First, positive rules are applied to the break candidates to make initial segmentations.
17
+ POSITIVE_RULES = [
18
+ ['[\.!?]', '[0-9A-Z]'],
19
+ ['[:]', '[0-9]'],
20
+ ['[:]', '[A-Z][a-z]']
21
+ ]
22
+
23
+ # Then, negative rules are applied to cancel some initial segmentations.
24
+ NEGATIVE_RULES = [
25
+ # Titles before names
26
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
27
+
28
+ # Titles usually before names, but ..
29
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
30
+
31
+ # Single letter abbreviations, e.g. middle name
32
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
33
+
34
+ # Abbreviations, e.g. middle name
35
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
36
+
37
+ # Frequent abbreviations that will never appear at the end of a sentence
38
+ ['(cf|vs)\.', ''],
39
+ ['e\.g\.', ''],
40
+ ['i\.e\.', ''],
41
+
42
+ # Others
43
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
44
+ ]
45
+
46
+ def initialize(rules = {})
47
+ rules ||= {}
48
+ @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
49
+ @break_characters = rules[:break_characters] || BREAK_CHARACTERS
50
+ @positive_rules = rules[:positive_rules] || POSITIVE_RULES
51
+ @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
52
+ end
53
+
54
+ def annotate(text)
55
+ return nil if text.nil? || text.empty?
56
+ sentences = segment(text)
57
+ denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
+ denotations.empty? ? {text:text} : {text:text, denotations:denotations}
59
+ end
60
+
61
+ private
62
+
63
+ def segment(text)
8
64
  original_text = text
9
65
  text = original_text.strip
10
66
  start = original_text.index(text)
11
67
 
12
- ## apply the positive rules to the places of space and newline characters
13
- pbreaks = [] # breaks by positive rules
68
+ # sentence breaks
69
+ breaks = []
70
+
71
+ # breaks by positive rules
72
+ pbreaks = []
73
+
74
+ # canceled breaks by negative rules
75
+ nbreaks = []
76
+
14
77
  for l in 0..text.length
15
78
 
16
- case text[l]
17
- when ' ' # space
79
+ ## apply the positive rules to the places of break candidates
80
+ if @break_candidates.include?(text[l])
18
81
  POSITIVE_RULES.each do |r|
19
82
  if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
20
83
  pbreaks << l
21
84
  break
22
85
  end
23
86
  end
24
- when "\n" # newline
25
- pbreaks << l
87
+ elsif @break_characters.include?(text[l])
88
+ breaks << l
26
89
  end
27
90
  end
28
91
 
29
- ## apply the negative rules to the places of space characters
30
- nbreaks = [] # breaks by negative rules
92
+ ## apply the negative rules to the places of break candidates
31
93
  pbreaks.each do |l|
32
- if text[l] == ' '
33
- NEGATIVE_RULES.each do |r|
34
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
35
- nbreaks << l
36
- break
37
- end
94
+ NEGATIVE_RULES.each do |r|
95
+ if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
96
+ nbreaks << l
97
+ break
38
98
  end
39
99
  end
40
100
  end
41
- breaks = pbreaks - nbreaks
101
+ breaks += pbreaks - nbreaks
102
+ breaks.sort!
42
103
 
43
104
  sentences = []
44
105
  lastbreak = -1
@@ -59,11 +120,31 @@ module TextSentencer
59
120
  end
60
121
 
61
122
  if __FILE__ == $0
123
+ rules = {
124
+ break_candidates: [
125
+ " ", "\t"
126
+ ],
127
+
128
+ break_characters: [
129
+ "\n"
130
+ ],
131
+
132
+ positive_rules: [
133
+ ['[\.!?]', '[0-9A-Z]'],
134
+ ['[:]', '[0-9]'],
135
+ ['[:]', '[A-Z][a-z]']
136
+ ],
137
+
138
+ negative_rules: []
139
+ }
140
+
141
+ sentencer = TextSentencer.new
142
+
62
143
  text = ''
63
144
  ARGF.each do |line|
64
145
  text += line
65
146
  end
66
147
 
67
- sen_so = TextSentencer.segment(text)
148
+ sen_so = sentencer.annotate(text)
68
149
  p(sen_so)
69
150
  end
metadata CHANGED
@@ -1,24 +1,25 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-11 00:00:00.000000000 Z
11
+ date: 2017-07-20 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: Text sentencer finds sentence boundaries of a given text. It is a simple,
14
- rule-based system.
13
+ description: TextSentencer is a simple rule-based system for segmenting a text block
14
+ into sentences.
15
15
  email: jindong.kim@gmail.com
16
- executables: []
16
+ executables:
17
+ - text_sentencer
17
18
  extensions: []
18
19
  extra_rdoc_files: []
19
20
  files:
21
+ - bin/text_sentencer
20
22
  - lib/text_sentencer.rb
21
- - lib/text_sentencer/rules.rb
22
23
  - lib/text_sentencer/text_sentencer.rb
23
24
  homepage: http://rubygems.org/gems/text_sentencer
24
25
  licenses:
@@ -40,8 +41,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
40
41
  version: '0'
41
42
  requirements: []
42
43
  rubyforge_project:
43
- rubygems_version: 2.2.2
44
+ rubygems_version: 2.4.8
44
45
  signing_key:
45
46
  specification_version: 4
46
- summary: To find sentences in text.
47
+ summary: A simple, rule-based script to find sentence boundaries in text.
47
48
  test_files: []
@@ -1,33 +0,0 @@
1
- module TextSentencer; end unless defined? TextSentencer
2
-
3
- # All the positions of whitespace characters are candiate of sentence boundary.
4
-
5
- # First, positive rules are applied to find make initial segmantations.
6
- TextSentencer::POSITIVE_RULES = [
7
- ['[\.!?]', '[0-9A-Z]'],
8
- ['[:]', '[0-9]'],
9
- ['[:]', '[A-Z][a-z]']
10
- ]
11
-
12
- # Then, negative rules are applied to cancel some initial segmentations.
13
- TextSentencer::NEGATIVE_RULES = [
14
- # Titles before names
15
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
16
-
17
- # Titles usually before names, but ..
18
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
19
-
20
- # Single letter abbriveations, e.g. middle name
21
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
22
-
23
- # Abbriveations, e.g. middle name
24
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
25
-
26
- # Frequent abbreviations that will never appear in the end of a sentence
27
- ['(cf|vs)\.', ''],
28
- ['e\.g\.', ''],
29
- ['i\.e\.', ''],
30
-
31
- # Others
32
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
33
- ]