text_sentencer 0.2.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7b479744fc24ea87a1629ab886c39a16900d7ba8
4
- data.tar.gz: 82a7d16ab9aff0a44fabfbc1b57a6367d9c7a979
3
+ metadata.gz: 0be05e5130a03bd0c189b112d1d26c610844a4c9
4
+ data.tar.gz: 16ea016fb63066dd617c7b4ac39887bbacba40af
5
5
  SHA512:
6
- metadata.gz: 1573ea7f89f0a96d37de04724792c18f0f34f646f0e6fc005605c521903fb49cf6890dd9826fcf6e1a229ace0a8369fb2c36a61b0fe5444874d06fc1323a36b2
7
- data.tar.gz: 7154af7ce5f9ce392b926b448f82e0b035345559c206cc0ca13d65204235df22e20c197668f71f61bf005b9a26d24f1697799842a577666dba87a6e41bd8c227
6
+ metadata.gz: c07b216029b059b7f9bc2dcff01278801933fb95d7a201f19d95951e52329f66c311ebf26bdbd910a2c57a2347adef6f366361f5e71643a735e8bc1775c4b61c
7
+ data.tar.gz: eeb161448cdd65860686248281c6438e0b9e096529956ae175faab0a41bfa40cd3a101017b9aa5367099d4ad39fd7bc972fde48beae88e3f3dcb2c500715ae22
data/bin/text_sentencer CHANGED
@@ -3,6 +3,7 @@ require 'json'
3
3
  require 'text_sentencer'
4
4
 
5
5
  config_filename = nil
6
+ output_mode = :sentences
6
7
 
7
8
  ## command line option processing
8
9
  require 'optparse'
@@ -13,6 +14,10 @@ optparse = OptionParser.new do |opts|
13
14
  config_filename = c
14
15
  end
15
16
 
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
16
21
  opts.on('-h', '--help', 'displays this screen.') do
17
22
  puts opts
18
23
  exit
@@ -28,21 +33,13 @@ end
28
33
  sentencer = TextSentencer.new(config)
29
34
 
30
35
  text = ARGF.read
31
-
32
- ## Preprocessing
33
- # It should be removed later
34
- text.gsub!(/ +/, ' ')
35
- text.gsub!(/\n+/, "\n")
36
- text.gsub!(/\t+/, "\t")
37
- text.gsub!(/\n /, "\n")
38
- text.gsub!(/ \n/, "\n")
39
- text.gsub!(/\t /, "\t")
40
- text.gsub!(/ \t/, "\t")
41
- text.gsub!(/\n\t/, "\n")
42
- text.gsub!(/\t\n/, "\n")
43
-
44
36
  annotations = sentencer.annotate(text)
45
- annotations[:denotations].each do |d|
46
- span = d[:span]
47
- puts text[span[:begin]...span[:end]]
37
+
38
+ if output_mode == :json
39
+ puts JSON.pretty_generate(annotations)
40
+ else
41
+ annotations[:denotations].each do |d|
42
+ span = d[:span]
43
+ puts text[span[:begin]...span[:end]]
44
+ end
48
45
  end
@@ -0,0 +1,9 @@
1
+ class String
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_sentencer/string_scan_offset'
2
3
  require 'pp'
3
4
 
4
5
  class TextSentencer
@@ -6,14 +7,10 @@ class TextSentencer
6
7
 
7
8
  DEFAULT_RULES = {
8
9
  # A sentence break is always placed at every new line character.
9
- break_characters: [
10
- "\n"
11
- ],
10
+ break_pattern: "([ \t]*\n+)+[ \t]*",
12
11
 
13
12
  # All the positions of space and tab characters are candidates for a sentence break.
14
- break_candidates: [
15
- " ", "\t"
16
- ],
13
+ candidate_pattern: "[ \t]+",
17
14
 
18
15
  # First, positive rules are applied to the break candidates to make initial segmentations.
19
16
  positive_rules: [
@@ -49,75 +46,73 @@ class TextSentencer
49
46
  def initialize(rules = nil)
50
47
  rules ||= DEFAULT_RULES
51
48
  @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
52
- @rules[:break_characters] ||= []
53
- @rules[:break_candidates] ||= []
49
+ @rules[:break_pattern] ||= ""
50
+ @rules[:candidate_pattern] ||= ""
54
51
  @rules[:positive_rules] ||= []
55
52
  @rules[:negative_rules] ||= []
56
53
  end
57
54
 
58
55
  def annotate(text)
59
- return nil if text.nil? || text.empty?
56
+ return nil if text.nil?
57
+
60
58
  sentences = segment(text)
61
59
  denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
62
- denotations.empty? ? {text:text} : {text:text, denotations:denotations}
60
+ {text:text, denotations:denotations}
63
61
  end
64
62
 
65
63
  private
66
64
 
67
65
  def segment(text)
68
- original_text = text
69
- text = original_text.strip
70
- start = original_text.index(text)
71
-
72
- # sentence breaks
73
- breaks = []
66
+ breaks = if @rules[:break_pattern].empty?
67
+ []
68
+ else
69
+ text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
70
+ end
74
71
 
75
- # breaks by positive rules
76
- pbreaks = []
72
+ candidates = if @rules[:candidate_pattern].empty?
73
+ []
74
+ else
75
+ text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
76
+ end
77
77
 
78
- # canceled breaks by negative rules
79
- nbreaks = []
78
+ # breaks take precedence
79
+ candidates -= breaks
80
80
 
81
- for l in 0..text.length
81
+ candidates.each do |c|
82
+ last_end, next_begin = c
82
83
 
83
- ## apply the positive rules to the places of break candidates
84
- if @rules[:break_candidates].include?(text[l])
85
- @rules[:positive_rules].each do |r|
86
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
87
- pbreaks << l
88
- break
89
- end
90
- end
91
- elsif @rules[:break_characters].include?(text[l])
92
- breaks << l
84
+ if (last_end == 0) || (next_begin == text.length)
85
+ breaks << c
86
+ next
93
87
  end
94
- end
95
88
 
96
- ## apply the negative rules to the places of break candidates
97
- pbreaks.each do |l|
98
- @rules[:negative_rules].each do |r|
99
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
100
- nbreaks << l
89
+ last_text = text[0...last_end]
90
+ next_text = text[next_begin..-1]
91
+
92
+ @rules[:positive_rules].each do |p|
93
+ if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
94
+ break_p = true
95
+ @rules[:negative_rules].each do |n|
96
+ if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
97
+ break_p = false
98
+ break
99
+ end
100
+ end
101
+ breaks << c if break_p
101
102
  break
102
103
  end
103
104
  end
104
105
  end
105
- breaks += pbreaks - nbreaks
106
+
106
107
  breaks.sort!
107
108
 
108
109
  sentences = []
109
- lastbreak = -1
110
+ lastbreak = 0
110
111
  breaks.each do |b|
111
- sentences.push([lastbreak+1, b])
112
- lastbreak = b
112
+ sentences << [lastbreak, b[0]] if b[0] > lastbreak
113
+ lastbreak = b[1]
113
114
  end
114
- sentences.push([lastbreak+1, text.length])
115
-
116
- ## filter out empty segments
117
- sentences.delete_if {|b, e| text[b...e] !~ /[a-zA-Z0-9]/}
118
-
119
- ## adjust offsets for the in text
120
- sentences.collect!{|b, e| [b + start, e + start]}
115
+ sentences << [lastbreak, text.length] if lastbreak < text.length
121
116
 
122
117
  sentences
123
118
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-21 00:00:00.000000000 Z
11
+ date: 2017-07-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting a text block
14
14
  into sentences.
@@ -20,6 +20,7 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - bin/text_sentencer
22
22
  - lib/text_sentencer.rb
23
+ - lib/text_sentencer/string_scan_offset.rb
23
24
  - lib/text_sentencer/text_sentencer.rb
24
25
  homepage: https://github.com/jdkim/text_sentencer
25
26
  licenses: