text_sentencer 0.2.0 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
4
- data.tar.gz: 0255396f5d925c06023111bca178157b56765586
2
+ SHA256:
3
+ metadata.gz: b8dda6323a11356aa2f6c78fa0281658ea677f09fbf0bfb19b19330f732e72c1
4
+ data.tar.gz: a41e5cc5ae3e1e294e9ba4921993e2d98003d74435fd5bc099c6c6fbf188554a
5
5
  SHA512:
6
- metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
7
- data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
6
+ metadata.gz: b39ec569c1e988f4e7936385395ff1e34c15cb7721ffba6fb2453f633db360f092b5225d6ec4603b2beb391e2660291e9d48e912359eb33db9d2ccd1906d806f
7
+ data.tar.gz: 2780a4c3f6ac1fc7f0ab06e658a293b7e9a7d3f5a2df0a0904fa20aea187857c9f3bad7cc7adfb6f1eba4c57897f433c03d77a958559e09225b60c154c9488b3
@@ -1,47 +1,45 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'json'
2
3
  require 'text_sentencer'
3
4
 
4
5
  config_filename = nil
6
+ output_mode = :sentences
5
7
 
6
8
  ## command line option processing
7
9
  require 'optparse'
8
10
  optparse = OptionParser.new do |opts|
9
- opts.banner = "Usage: text_sentencer [options]"
10
-
11
- opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
12
- config_filename = f
13
- end
14
-
15
- opts.on('-h', '--help', 'displays this screen.') do
16
- puts opts
17
- exit
18
- end
11
+ opts.banner = "Usage: text_sentencer [options]"
12
+
13
+ opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
14
+ config_filename = c
15
+ end
16
+
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
21
+ opts.on('-h', '--help', 'displays this screen.') do
22
+ puts opts
23
+ exit
24
+ end
19
25
  end
20
26
 
21
27
  optparse.parse!
22
28
 
23
- config = if config_filename
24
- JSON.parse File.read(config_filename) if File.file?(config_filename)
29
+ config = if config_filename && File.file?(config_filename)
30
+ JSON.parse File.read(config_filename)
25
31
  end
26
32
 
27
33
  sentencer = TextSentencer.new(config)
28
34
 
29
35
  text = ARGF.read
30
-
31
- ## Preprocessing
32
- # It should be removed later
33
- text.gsub!(/ +/, ' ')
34
- text.gsub!(/\n+/, "\n")
35
- text.gsub!(/\t+/, "\t")
36
- text.gsub!(/\n /, "\n")
37
- text.gsub!(/ \n/, "\n")
38
- text.gsub!(/\t /, "\t")
39
- text.gsub!(/ \t/, "\t")
40
- text.gsub!(/\n\t/, "\n")
41
- text.gsub!(/\t\n/, "\n")
42
-
43
36
  annotations = sentencer.annotate(text)
44
- annotations[:denotations].each do |d|
45
- span = d[:span]
46
- puts text[span[:begin]...span[:end]]
37
+
38
+ if output_mode == :json
39
+ puts JSON.pretty_generate(annotations)
40
+ else
41
+ annotations[:denotations].each do |d|
42
+ span = d[:span]
43
+ puts text[span[:begin]...span[:end]]
44
+ end
47
45
  end
@@ -0,0 +1,9 @@
1
+ class String
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,150 +1,117 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_sentencer/string_scan_offset'
3
+ require 'pp'
2
4
 
3
5
  class TextSentencer
4
- ## default rules
5
-
6
- # All the positions of space and tab characters are candiates of sentence break.
7
- BREAK_CANDIDATES = [
8
- " ", "\t"
9
- ]
10
-
11
- # All the positions of new line characters always take sentence break.
12
- BREAK_CHARACTERS = [
13
- "\n"
14
- ]
15
-
16
- # First, positive rules are applied to the break candidates to make initial segmantations.
17
- POSITIVE_RULES = [
18
- ['[\.!?]', '[0-9A-Z]'],
19
- ['[:]', '[0-9]'],
20
- ['[:]', '[A-Z][a-z]']
21
- ]
22
-
23
- # Then, negative rules are applied to cancel some initial segmentations.
24
- NEGATIVE_RULES = [
25
- # Titles before names
26
- ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
27
-
28
- # Titles usually before names, but ..
29
- ['(Sr|Jr)\.', '[A-Z][a-z]'],
30
-
31
- # Single letter abbriveations, e.g. middle name
32
- # ['\b[A-Z]\.', '[A-Z][a-z]'],
33
-
34
- # Abbriveations, e.g. middle name
35
- ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
36
-
37
- # Frequent abbreviations that will never appear in the end of a sentence
38
- ['(cf|vs)\.', ''],
39
- ['e\.g\.', ''],
40
- ['i\.e\.', ''],
41
-
42
- # Others
43
- ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
44
- ]
45
-
46
- def initialize(rules = {})
47
- rules ||= {}
48
- @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
49
- @break_characters = rules[:break_characters] || BREAK_CHARACTERS
50
- @positive_rules = rules[:positive_rules] || POSITIVE_RULES
51
- @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
52
- end
53
-
54
- def annotate(text)
55
- return nil if text.nil? || text.empty?
56
- sentences = segment(text)
57
- denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
58
- denotations.empty? ? {text:text} : {text:text, denotations:denotations}
59
- end
60
-
61
- private
62
-
63
- def segment(text)
64
- original_text = text
65
- text = original_text.strip
66
- start = original_text.index(text)
67
-
68
- # sentence breaks
69
- breaks = []
70
-
71
- # breaks by positive rules
72
- pbreaks = []
73
-
74
- # canceled breaks by negative rules
75
- nbreaks = []
76
-
77
- for l in 0..text.length
78
-
79
- ## apply the positive rules to the places of break candidates
80
- if @break_candidates.include?(text[l])
81
- POSITIVE_RULES.each do |r|
82
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
83
- pbreaks << l
84
- break
85
- end
86
- end
87
- elsif @break_characters.include?(text[l])
88
- breaks << l
89
- end
90
- end
91
-
92
- ## apply the negative rules to the places of break candidates
93
- pbreaks.each do |l|
94
- NEGATIVE_RULES.each do |r|
95
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
96
- nbreaks << l
97
- break
98
- end
99
- end
100
- end
101
- breaks += pbreaks - nbreaks
102
- breaks.sort!
103
-
104
- sentences = []
105
- lastbreak = -1
106
- breaks.each do |b|
107
- sentences.push([lastbreak+1, b])
108
- lastbreak = b
109
- end
110
- sentences.push([lastbreak+1, text.length])
111
-
112
- ## filter out empty segments
113
- sentences.delete_if {|b, e| text[b...e] !~ /[a-zA-Z0-9]/}
114
-
115
- ## adjust offsets for the in text
116
- sentences.collect!{|b, e| [b + start, e + start]}
117
-
118
- sentences
119
- end
120
- end
121
-
122
- if __FILE__ == $0
123
- rules = {
124
- break_candidates: [
125
- " ", "\t"
126
- ],
127
-
128
- break_characters: [
129
- "\n"
130
- ],
131
-
132
- positive_rules: [
133
- ['[\.!?]', '[0-9A-Z]'],
134
- ['[:]', '[0-9]'],
135
- ['[:]', '[A-Z][a-z]']
136
- ],
137
-
138
- negative_rules: []
139
- }
140
-
141
- sentencer = TextSentencer.new
142
-
143
- text = ''
144
- ARGF.each do |line|
145
- text += line
146
- end
147
-
148
- sen_so = sentencer.annotate(text)
149
- p(sen_so)
6
+ ## default rules
7
+
8
+ DEFAULT_RULES = {
9
+ # All the positions of new line characters always take sentence break.
10
+ break_pattern: "([ \t]*\n+)+[ \t]*",
11
+
12
+ # All the positions of space and tab characters are candidates of sentence break.
13
+ candidate_pattern: "[ \t]+",
14
+
15
+ # First, positive rules are applied to the break candidates to make initial segmentations.
16
+ positive_rules: [
17
+ ['[.!?]', '[0-9A-Z]'],
18
+ ['[:]', '[0-9]'],
19
+ ['[:]', '[A-Z][a-z]']
20
+ ],
21
+
22
+ # Then, negative rules are applied to cancel some initial segmentations.
23
+ negative_rules: [
24
+ # Titles before names
25
+ ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
26
+
27
+ # Titles usually before names, but ..
28
+ ['(Sr|Jr)\.', '[A-Z][a-z]'],
29
+
30
+ # Single letter abbreviations, e.g. middle name
31
+ # ['\b[A-Z]\.', '[A-Z][a-z]'],
32
+
33
+ # Abbreviations, e.g. middle name
34
+ ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
35
+
36
+ # Frequent abbreviations that will never appear in the end of a sentence
37
+ ['(cf|vs)\.', ''],
38
+ ['e\.g\.', ''],
39
+ ['i\.e\.', ''],
40
+
41
+ # Others
42
+ ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
43
+ ]
44
+ }
45
+
46
+ def initialize(rules = nil)
47
+ rules ||= DEFAULT_RULES
48
+ @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
49
+ @rules[:break_pattern] ||= ""
50
+ @rules[:candidate_pattern] ||= ""
51
+ @rules[:positive_rules] ||= []
52
+ @rules[:negative_rules] ||= []
53
+ end
54
+
55
+ def annotate(text)
56
+ return nil if text.nil?
57
+
58
+ sentences = segment(text)
59
+ denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
60
+ {text:text, denotations:denotations}
61
+ end
62
+
63
+ def segment(text)
64
+ breaks = if @rules[:break_pattern].empty?
65
+ []
66
+ else
67
+ text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
68
+ end
69
+
70
+ candidates = if @rules[:candidate_pattern].empty?
71
+ []
72
+ else
73
+ text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
74
+ end
75
+
76
+ # breaks take precedence
77
+ candidates -= breaks
78
+
79
+ candidates.each do |c|
80
+ last_end, next_begin = c
81
+
82
+ if (last_end == 0) || (next_begin == text.length)
83
+ breaks << c
84
+ next
85
+ end
86
+
87
+ last_text = text[0...last_end]
88
+ next_text = text[next_begin..-1]
89
+
90
+ @rules[:positive_rules].each do |p|
91
+ if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
92
+ break_p = true
93
+ @rules[:negative_rules].each do |n|
94
+ if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
95
+ break_p = false
96
+ break
97
+ end
98
+ end
99
+ breaks << c if break_p
100
+ break
101
+ end
102
+ end
103
+ end
104
+
105
+ breaks.sort!
106
+
107
+ sentences = []
108
+ lastbreak = 0
109
+ breaks.each do |b|
110
+ sentences << [lastbreak, b[0]] if b[0] > lastbreak
111
+ lastbreak = b[1]
112
+ end
113
+ sentences << [lastbreak, text.length] if lastbreak < text.length
114
+
115
+ sentences
116
+ end
150
117
  end
metadata CHANGED
@@ -1,17 +1,17 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 1.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-20 00:00:00.000000000 Z
11
+ date: 2021-01-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: TextSentencer is a simple rule-based system for segmenting a text block
14
- into sentences.
13
+ description: TextSentencer is a simple rule-based system for segmenting text into
14
+ sentences.
15
15
  email: jindong.kim@gmail.com
16
16
  executables:
17
17
  - text_sentencer
@@ -20,8 +20,9 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - bin/text_sentencer
22
22
  - lib/text_sentencer.rb
23
+ - lib/text_sentencer/string_scan_offset.rb
23
24
  - lib/text_sentencer/text_sentencer.rb
24
- homepage: http://rubygems.org/gems/text_sentencer
25
+ homepage: http://bionlp.dbcls.jp/text_sentencer
25
26
  licenses:
26
27
  - MIT
27
28
  metadata: {}
@@ -40,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
40
41
  - !ruby/object:Gem::Version
41
42
  version: '0'
42
43
  requirements: []
43
- rubyforge_project:
44
- rubygems_version: 2.4.8
44
+ rubygems_version: 3.2.3
45
45
  signing_key:
46
46
  specification_version: 4
47
47
  summary: A simple, rule-based script to find sentence boundaries in text.