text_sentencer 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
4
- data.tar.gz: 0255396f5d925c06023111bca178157b56765586
3
+ metadata.gz: 7b479744fc24ea87a1629ab886c39a16900d7ba8
4
+ data.tar.gz: 82a7d16ab9aff0a44fabfbc1b57a6367d9c7a979
5
5
  SHA512:
6
- metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
7
- data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
6
+ metadata.gz: 1573ea7f89f0a96d37de04724792c18f0f34f646f0e6fc005605c521903fb49cf6890dd9826fcf6e1a229ace0a8369fb2c36a61b0fe5444874d06fc1323a36b2
7
+ data.tar.gz: 7154af7ce5f9ce392b926b448f82e0b035345559c206cc0ca13d65204235df22e20c197668f71f61bf005b9a26d24f1697799842a577666dba87a6e41bd8c227
data/bin/text_sentencer CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'json'
2
3
  require 'text_sentencer'
3
4
 
4
5
  config_filename = nil
@@ -8,8 +9,8 @@ require 'optparse'
8
9
  optparse = OptionParser.new do |opts|
9
10
  opts.banner = "Usage: text_sentencer [options]"
10
11
 
11
- opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
12
- config_filename = f
12
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
13
+ config_filename = c
13
14
  end
14
15
 
15
16
  opts.on('-h', '--help', 'displays this screen.') do
@@ -20,8 +21,8 @@ end
20
21
 
21
22
  optparse.parse!
22
23
 
23
- config = if config_filename
24
- JSON.parse File.read(config_filename) if File.file?(config_filename)
24
+ config = if config_filename && File.file?(config_filename)
25
+ JSON.parse File.read(config_filename)
25
26
  end
26
27
 
27
28
  sentencer = TextSentencer.new(config)
@@ -1,54 +1,58 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'pp'
2
3
 
3
4
  class TextSentencer
4
5
  ## default rules
5
6
 
6
- # All the positions of space and tab characters are candiates of sentence break.
7
- BREAK_CANDIDATES = [
8
- " ", "\t"
9
- ]
10
-
11
- # All the positions of new line characters always take sentence break.
12
- BREAK_CHARACTERS = [
13
- "\n"
14
- ]
15
-
16
- # First, positive rules are applied to the break candidates to make initial segmantations.
17
- POSITIVE_RULES = [
18
- ['[\.!?]', '[0-9A-Z]'],
19
- ['[:]', '[0-9]'],
20
- ['[:]', '[A-Z][a-z]']
21
- ]
22
-
23
- # Then, negative rules are applied to cancel some initial segmentations.
24
- NEGATIVE_RULES = [
25
- # Titles before names
26
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
27
-
28
- # Titles usually before names, but ..
29
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
30
-
31
- # Single letter abbriveations, e.g. middle name
32
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
33
-
34
- # Abbriveations, e.g. middle name
35
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
36
-
37
- # Frequent abbreviations that will never appear in the end of a sentence
38
- ['(cf|vs)\.', ''],
39
- ['e\.g\.', ''],
40
- ['i\.e\.', ''],
41
-
42
- # Others
43
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
44
- ]
45
-
46
- def initialize(rules = {})
47
- rules ||= {}
48
- @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
49
- @break_characters = rules[:break_characters] || BREAK_CHARACTERS
50
- @positive_rules = rules[:positive_rules] || POSITIVE_RULES
51
- @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
7
+ DEFAULT_RULES = {
8
+ # All the positions of new line characters always take sentence break.
9
+ break_characters: [
10
+ "\n"
11
+ ],
12
+
13
+ # All the positions of space and tab characters are candidates of sentence break.
14
+ break_candidates: [
15
+ " ", "\t"
16
+ ],
17
+
18
+ # First, positive rules are applied to the break candidates to make initial segmentations.
19
+ positive_rules: [
20
+ ['[.!?]', '[0-9A-Z]'],
21
+ ['[:]', '[0-9]'],
22
+ ['[:]', '[A-Z][a-z]']
23
+ ],
24
+
25
+ # Then, negative rules are applied to cancel some initial segmentations.
26
+ negative_rules: [
27
+ # Titles before names
28
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
29
+
30
+ # Titles usually before names, but ..
31
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
32
+
33
+ # Single letter abbreviations, e.g. middle name
34
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
35
+
36
+ # Abbreviations, e.g. middle name
37
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
38
+
39
+ # Frequent abbreviations that will never appear in the end of a sentence
40
+ ['(cf|vs)\.', ''],
41
+ ['e\.g\.', ''],
42
+ ['i\.e\.', ''],
43
+
44
+ # Others
45
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
46
+ ]
47
+ }
48
+
49
+ def initialize(rules = nil)
50
+ rules ||= DEFAULT_RULES
51
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
52
+ @rules[:break_characters] ||= []
53
+ @rules[:break_candidates] ||= []
54
+ @rules[:positive_rules] ||= []
55
+ @rules[:negative_rules] ||= []
52
56
  end
53
57
 
54
58
  def annotate(text)
@@ -77,21 +81,21 @@ class TextSentencer
77
81
  for l in 0..text.length
78
82
 
79
83
  ## apply the positive rules to the places of break candidates
80
- if @break_candidates.include?(text[l])
81
- POSITIVE_RULES.each do |r|
84
+ if @rules[:break_candidates].include?(text[l])
85
+ @rules[:positive_rules].each do |r|
82
86
  if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
83
87
  pbreaks << l
84
88
  break
85
89
  end
86
90
  end
87
- elsif @break_characters.include?(text[l])
91
+ elsif @rules[:break_characters].include?(text[l])
88
92
  breaks << l
89
93
  end
90
94
  end
91
95
 
92
96
  ## apply the negative rules to the places of break candidates
93
97
  pbreaks.each do |l|
94
- NEGATIVE_RULES.each do |r|
98
+ @rules[:negative_rules].each do |r|
95
99
  if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
96
100
  nbreaks << l
97
101
  break
@@ -118,33 +122,3 @@ class TextSentencer
118
122
  sentences
119
123
  end
120
124
  end
121
-
122
- if __FILE__ == $0
123
- rules = {
124
- break_candidates: [
125
- " ", "\t"
126
- ],
127
-
128
- break_characters: [
129
- "\n"
130
- ],
131
-
132
- positive_rules: [
133
- ['[\.!?]', '[0-9A-Z]'],
134
- ['[:]', '[0-9]'],
135
- ['[:]', '[A-Z][a-z]']
136
- ],
137
-
138
- negative_rules: []
139
- }
140
-
141
- sentencer = TextSentencer.new
142
-
143
- text = ''
144
- ARGF.each do |line|
145
- text += line
146
- end
147
-
148
- sen_so = sentencer.annotate(text)
149
- p(sen_so)
150
- end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-20 00:00:00.000000000 Z
11
+ date: 2017-07-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting a text block
14
14
  into sentences.
@@ -21,7 +21,7 @@ files:
21
21
  - bin/text_sentencer
22
22
  - lib/text_sentencer.rb
23
23
  - lib/text_sentencer/text_sentencer.rb
24
- homepage: http://rubygems.org/gems/text_sentencer
24
+ homepage: https://github.com/jdkim/text_sentencer
25
25
  licenses:
26
26
  - MIT
27
27
  metadata: {}