text_sentencer 0.2.0 → 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
4
- data.tar.gz: 0255396f5d925c06023111bca178157b56765586
2
+ SHA256:
3
+ metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
4
+ data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
5
5
  SHA512:
6
- metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
7
- data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
6
+ metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
7
+ data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
@@ -1,47 +1,45 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'json'
2
3
  require 'text_sentencer'
3
4
 
4
5
  config_filename = nil
6
+ output_mode = :sentences
5
7
 
6
8
  ## command line option processing
7
9
  require 'optparse'
8
10
  optparse = OptionParser.new do |opts|
9
- opts.banner = "Usage: text_sentencer [options]"
10
-
11
- opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
12
- config_filename = f
13
- end
14
-
15
- opts.on('-h', '--help', 'displays this screen.') do
16
- puts opts
17
- exit
18
- end
11
+ opts.banner = "Usage: text_sentencer [options]"
12
+
13
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
+ config_filename = c
15
+ end
16
+
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
21
+ opts.on('-h', '--help', 'displays this screen.') do
22
+ puts opts
23
+ exit
24
+ end
19
25
  end
20
26
 
21
27
  optparse.parse!
22
28
 
23
- config = if config_filename
24
- JSON.parse File.read(config_filename) if File.file?(config_filename)
29
+ config = if config_filename && File.file?(config_filename)
30
+ JSON.parse File.read(config_filename)
25
31
  end
26
32
 
27
33
  sentencer = TextSentencer.new(config)
28
34
 
29
35
  text = ARGF.read
30
-
31
- ## Preprocessing
32
- # It should be removed later
33
- text.gsub!(/ +/, ' ')
34
- text.gsub!(/\n+/, "\n")
35
- text.gsub!(/\t+/, "\t")
36
- text.gsub!(/\n /, "\n")
37
- text.gsub!(/ \n/, "\n")
38
- text.gsub!(/\t /, "\t")
39
- text.gsub!(/ \t/, "\t")
40
- text.gsub!(/\n\t/, "\n")
41
- text.gsub!(/\t\n/, "\n")
42
-
43
36
  annotations = sentencer.annotate(text)
44
- annotations[:denotations].each do |d|
45
- span = d[:span]
46
- puts text[span[:begin]...span[:end]]
37
+
38
+ if output_mode == :json
39
+ puts JSON.pretty_generate(annotations)
40
+ else
41
+ annotations[:denotations].each do |d|
42
+ span = d[:span]
43
+ puts text[span[:begin]...span[:end]]
44
+ end
47
45
  end
@@ -0,0 +1,9 @@
1
+ class String
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,150 +1,117 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_sentencer/string_scan_offset'
3
+ require 'pp'
2
4
 
3
5
  class TextSentencer
4
- ## default rules
5
-
6
- # All the positions of space and tab characters are candidates of sentence break.
7
- BREAK_CANDIDATES = [
8
- " ", "\t"
9
- ]
10
-
11
- # All the positions of new line characters always take sentence break.
12
- BREAK_CHARACTERS = [
13
- "\n"
14
- ]
15
-
16
- # First, positive rules are applied to the break candidates to make initial segmentations.
17
- POSITIVE_RULES = [
18
- ['[\.!?]', '[0-9A-Z]'],
19
- ['[:]', '[0-9]'],
20
- ['[:]', '[A-Z][a-z]']
21
- ]
22
-
23
- # Then, negative rules are applied to cancel some initial segmentations.
24
- NEGATIVE_RULES = [
25
- # Titles before names
26
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
27
-
28
- # Titles usually before names, but ..
29
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
30
-
31
- # Single letter abbreviations, e.g. middle name
32
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
33
-
34
- # Abbreviations, e.g. middle name
35
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
36
-
37
- # Frequent abbreviations that will never appear in the end of a sentence
38
- ['(cf|vs)\.', ''],
39
- ['e\.g\.', ''],
40
- ['i\.e\.', ''],
41
-
42
- # Others
43
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
44
- ]
45
-
46
- def initialize(rules = {})
47
- rules ||= {}
48
- @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
49
- @break_characters = rules[:break_characters] || BREAK_CHARACTERS
50
- @positive_rules = rules[:positive_rules] || POSITIVE_RULES
51
- @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
52
- end
53
-
54
- def annotate(text)
55
- return nil if text.nil? || text.empty?
56
- sentences = segment(text)
57
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
- denotations.empty? ? {text:text} : {text:text, denotations:denotations}
59
- end
60
-
61
- private
62
-
63
- def segment(text)
64
- original_text = text
65
- text = original_text.strip
66
- start = original_text.index(text)
67
-
68
- # sentence breaks
69
- breaks = []
70
-
71
- # breaks by positive rules
72
- pbreaks = []
73
-
74
- # canceled breaks by negative rules
75
- nbreaks = []
76
-
77
- for l in 0..text.length
78
-
79
- ## apply the positive rules to the places of break candidates
80
- if @break_candidates.include?(text[l])
81
- POSITIVE_RULES.each do |r|
82
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
83
- pbreaks << l
84
- break
85
- end
86
- end
87
- elsif @break_characters.include?(text[l])
88
- breaks << l
89
- end
90
- end
91
-
92
- ## apply the negative rules to the places of break candidates
93
- pbreaks.each do |l|
94
- NEGATIVE_RULES.each do |r|
95
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
96
- nbreaks << l
97
- break
98
- end
99
- end
100
- end
101
- breaks += pbreaks - nbreaks
102
- breaks.sort!
103
-
104
- sentences = []
105
- lastbreak = -1
106
- breaks.each do |b|
107
- sentences.push([lastbreak+1, b])
108
- lastbreak = b
109
- end
110
- sentences.push([lastbreak+1, text.length])
111
-
112
- ## filter out empty segments
113
- sentences.delete_if {|b, e| text[b...e] !~ /[a-zA-Z0-9]/}
114
-
115
- ## adjust offsets to positions in the original (unstripped) text
116
- sentences.collect!{|b, e| [b + start, e + start]}
117
-
118
- sentences
119
- end
120
- end
121
-
122
- if __FILE__ == $0
123
- rules = {
124
- break_candidates: [
125
- " ", "\t"
126
- ],
127
-
128
- break_characters: [
129
- "\n"
130
- ],
131
-
132
- positive_rules: [
133
- ['[\.!?]', '[0-9A-Z]'],
134
- ['[:]', '[0-9]'],
135
- ['[:]', '[A-Z][a-z]']
136
- ],
137
-
138
- negative_rules: []
139
- }
140
-
141
- sentencer = TextSentencer.new
142
-
143
- text = ''
144
- ARGF.each do |line|
145
- text += line
146
- end
147
-
148
- sen_so = sentencer.annotate(text)
149
- p(sen_so)
6
+ ## default rules
7
+
8
+ DEFAULT_RULES = {
9
+ # All the positions of new line characters always take sentence break.
10
+ break_pattern: "([ \t]*\n+)+[ \t]*",
11
+
12
+ # All the positions of space and tab characters are candidates of sentence break.
13
+ candidate_pattern: "[ \t]+",
14
+
15
+ # First, positive rules are applied to the break candidates to make initial segmentations.
16
+ positive_rules: [
17
+ ['[.!?]', '[0-9A-Z]'],
18
+ ['[:]', '[0-9]'],
19
+ ['[:]', '[A-Z][a-z]']
20
+ ],
21
+
22
+ # Then, negative rules are applied to cancel some initial segmentations.
23
+ negative_rules: [
24
+ # Titles before names
25
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
+
27
+ # Titles usually before names, but ..
28
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
+
30
+ # Single letter abbreviations, e.g. middle name
31
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
+
33
+ # Abbreviations, e.g. middle name
34
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
+
36
+ # Frequent abbreviations that will never appear in the end of a sentence
37
+ ['(cf|vs)\.', ''],
38
+ ['e\.g\.', ''],
39
+ ['i\.e\.', ''],
40
+
41
+ # Others
42
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
+ ]
44
+ }
45
+
46
+ def initialize(rules = nil)
47
+ rules ||= DEFAULT_RULES
48
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
+ @rules[:break_pattern] ||= ""
50
+ @rules[:candidate_pattern] ||= ""
51
+ @rules[:positive_rules] ||= []
52
+ @rules[:negative_rules] ||= []
53
+ end
54
+
55
+ def annotate(text)
56
+ return nil if text.nil?
57
+
58
+ sentences = segment(text)
59
+ denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
+ {text:text, denotations:denotations}
61
+ end
62
+
63
+ def segment(text)
64
+ breaks = if @rules[:break_pattern].empty?
65
+ []
66
+ else
67
+ text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
+ end
69
+
70
+ candidates = if @rules[:candidate_pattern].empty?
71
+ []
72
+ else
73
+ text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
+ end
75
+
76
+ # breaks take precedence
77
+ candidates -= breaks
78
+
79
+ candidates.each do |c|
80
+ last_end, next_begin = c
81
+
82
+ if (last_end == 0) || (next_begin == text.length)
83
+ breaks << c
84
+ next
85
+ end
86
+
87
+ last_text = text[0...last_end]
88
+ next_text = text[next_begin..-1]
89
+
90
+ @rules[:positive_rules].each do |p|
91
+ if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
+ break_p = true
93
+ @rules[:negative_rules].each do |n|
94
+ if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
+ break_p = false
96
+ break
97
+ end
98
+ end
99
+ breaks << c if break_p
100
+ break
101
+ end
102
+ end
103
+ end
104
+
105
+ breaks.sort!
106
+
107
+ sentences = []
108
+ lastbreak = 0
109
+ breaks.each do |b|
110
+ sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
+ lastbreak = b[1]
112
+ end
113
+ sentences << [lastbreak, text.length] if lastbreak < text.length
114
+
115
+ sentences
116
+ end
150
117
  end
metadata CHANGED
@@ -1,17 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-20 00:00:00.000000000 Z
11
+ date: 2021-01-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: TextSentencer is a simple rule-based system for segmenting a text block
14
- into sentences.
13
+ description: TextSentencer is a simple rule-based system for segmenting text into
14
+ sentences.
15
15
  email: jindong.kim@gmail.com
16
16
  executables:
17
17
  - text_sentencer
@@ -20,8 +20,9 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - bin/text_sentencer
22
22
  - lib/text_sentencer.rb
23
+ - lib/text_sentencer/string_scan_offset.rb
23
24
  - lib/text_sentencer/text_sentencer.rb
24
- homepage: http://rubygems.org/gems/text_sentencer
25
+ homepage: http://bionlp.dbcls.jp/text_sentencer
25
26
  licenses:
26
27
  - MIT
27
28
  metadata: {}
@@ -40,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
40
41
  - !ruby/object:Gem::Version
41
42
  version: '0'
42
43
  requirements: []
43
- rubyforge_project:
44
- rubygems_version: 2.4.8
44
+ rubygems_version: 3.2.3
45
45
  signing_key:
46
46
  specification_version: 4
47
47
  summary: A simple, rule-based script to find sentence boundaries in text.