text_sentencer 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
4
- data.tar.gz: 0255396f5d925c06023111bca178157b56765586
3
+ metadata.gz: 7b479744fc24ea87a1629ab886c39a16900d7ba8
4
+ data.tar.gz: 82a7d16ab9aff0a44fabfbc1b57a6367d9c7a979
5
5
  SHA512:
6
- metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
7
- data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
6
+ metadata.gz: 1573ea7f89f0a96d37de04724792c18f0f34f646f0e6fc005605c521903fb49cf6890dd9826fcf6e1a229ace0a8369fb2c36a61b0fe5444874d06fc1323a36b2
7
+ data.tar.gz: 7154af7ce5f9ce392b926b448f82e0b035345559c206cc0ca13d65204235df22e20c197668f71f61bf005b9a26d24f1697799842a577666dba87a6e41bd8c227
data/bin/text_sentencer CHANGED
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'json'
2
3
  require 'text_sentencer'
3
4
 
4
5
  config_filename = nil
@@ -8,8 +9,8 @@ require 'optparse'
8
9
  optparse = OptionParser.new do |opts|
9
10
  opts.banner = "Usage: text_sentencer [options]"
10
11
 
11
- opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
12
- config_filename = f
12
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
13
+ config_filename = c
13
14
  end
14
15
 
15
16
  opts.on('-h', '--help', 'displays this screen.') do
@@ -20,8 +21,8 @@ end
20
21
 
21
22
  optparse.parse!
22
23
 
23
- config = if config_filename
24
- JSON.parse File.read(config_filename) if File.file?(config_filename)
24
+ config = if config_filename && File.file?(config_filename)
25
+ JSON.parse File.read(config_filename)
25
26
  end
26
27
 
27
28
  sentencer = TextSentencer.new(config)
@@ -1,54 +1,58 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'pp'
2
3
 
3
4
  class TextSentencer
4
5
  ## default rules
5
6
 
6
- # All the positions of space and tab characters are candiates of sentence break.
7
- BREAK_CANDIDATES = [
8
- " ", "\t"
9
- ]
10
-
11
- # All the positions of new line characters always take sentence break.
12
- BREAK_CHARACTERS = [
13
- "\n"
14
- ]
15
-
16
- # First, positive rules are applied to the break candidates to make initial segmantations.
17
- POSITIVE_RULES = [
18
- ['[\.!?]', '[0-9A-Z]'],
19
- ['[:]', '[0-9]'],
20
- ['[:]', '[A-Z][a-z]']
21
- ]
22
-
23
- # Then, negative rules are applied to cancel some initial segmentations.
24
- NEGATIVE_RULES = [
25
- # Titles before names
26
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
27
-
28
- # Titles usually before names, but ..
29
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
30
-
31
- # Single letter abbriveations, e.g. middle name
32
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
33
-
34
- # Abbriveations, e.g. middle name
35
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
36
-
37
- # Frequent abbreviations that will never appear in the end of a sentence
38
- ['(cf|vs)\.', ''],
39
- ['e\.g\.', ''],
40
- ['i\.e\.', ''],
41
-
42
- # Others
43
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
44
- ]
45
-
46
- def initialize(rules = {})
47
- rules ||= {}
48
- @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
49
- @break_characters = rules[:break_characters] || BREAK_CHARACTERS
50
- @positive_rules = rules[:positive_rules] || POSITIVE_RULES
51
- @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
7
+ DEFAULT_RULES = {
8
+ # All the positions of new line characters always take sentence break.
9
+ break_characters: [
10
+ "\n"
11
+ ],
12
+
13
+ # All the positions of space and tab characters are candiates of sentence break.
14
+ break_candidates: [
15
+ " ", "\t"
16
+ ],
17
+
18
+ # First, positive rules are applied to the break candidates to make initial segmantations.
19
+ positive_rules: [
20
+ ['[.!?]', '[0-9A-Z]'],
21
+ ['[:]', '[0-9]'],
22
+ ['[:]', '[A-Z][a-z]']
23
+ ],
24
+
25
+ # Then, negative rules are applied to cancel some initial segmentations.
26
+ negative_rules: [
27
+ # Titles before names
28
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
29
+
30
+ # Titles usually before names, but ..
31
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
32
+
33
+ # Single letter abbriveations, e.g. middle name
34
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
35
+
36
+ # Abbriveations, e.g. middle name
37
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
38
+
39
+ # Frequent abbreviations that will never appear in the end of a sentence
40
+ ['(cf|vs)\.', ''],
41
+ ['e\.g\.', ''],
42
+ ['i\.e\.', ''],
43
+
44
+ # Others
45
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
46
+ ]
47
+ }
48
+
49
+ def initialize(rules = nil)
50
+ rules ||= DEFAULT_RULES
51
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
52
+ @rules[:break_characters] ||= []
53
+ @rules[:break_candidates] ||= []
54
+ @rules[:positive_rules] ||= []
55
+ @rules[:negative_rules] ||= []
52
56
  end
53
57
 
54
58
  def annotate(text)
@@ -77,21 +81,21 @@ class TextSentencer
77
81
  for l in 0..text.length
78
82
 
79
83
  ## apply the positive rules to the places of break candidates
80
- if @break_candidates.include?(text[l])
81
- POSITIVE_RULES.each do |r|
84
+ if @rules[:break_candidates].include?(text[l])
85
+ @rules[:positive_rules].each do |r|
82
86
  if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
83
87
  pbreaks << l
84
88
  break
85
89
  end
86
90
  end
87
- elsif @break_characters.include?(text[l])
91
+ elsif @rules[:break_characters].include?(text[l])
88
92
  breaks << l
89
93
  end
90
94
  end
91
95
 
92
96
  ## apply the negative rules to the places of break candidates
93
97
  pbreaks.each do |l|
94
- NEGATIVE_RULES.each do |r|
98
+ @rules[:negative_rules].each do |r|
95
99
  if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
96
100
  nbreaks << l
97
101
  break
@@ -118,33 +122,3 @@ class TextSentencer
118
122
  sentences
119
123
  end
120
124
  end
121
-
122
- if __FILE__ == $0
123
- rules = {
124
- break_candidates: [
125
- " ", "\t"
126
- ],
127
-
128
- break_characters: [
129
- "\n"
130
- ],
131
-
132
- positive_rules: [
133
- ['[\.!?]', '[0-9A-Z]'],
134
- ['[:]', '[0-9]'],
135
- ['[:]', '[A-Z][a-z]']
136
- ],
137
-
138
- negative_rules: []
139
- }
140
-
141
- sentencer = TextSentencer.new
142
-
143
- text = ''
144
- ARGF.each do |line|
145
- text += line
146
- end
147
-
148
- sen_so = sentencer.annotate(text)
149
- p(sen_so)
150
- end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-20 00:00:00.000000000 Z
11
+ date: 2017-07-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting a text block
14
14
  into sentences.
@@ -21,7 +21,7 @@ files:
21
21
  - bin/text_sentencer
22
22
  - lib/text_sentencer.rb
23
23
  - lib/text_sentencer/text_sentencer.rb
24
- homepage: http://rubygems.org/gems/text_sentencer
24
+ homepage: https://github.com/jdkim/text_sentencer
25
25
  licenses:
26
26
  - MIT
27
27
  metadata: {}