text_sentencer 0.2.1 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7b479744fc24ea87a1629ab886c39a16900d7ba8
4
- data.tar.gz: 82a7d16ab9aff0a44fabfbc1b57a6367d9c7a979
3
+ metadata.gz: 0be05e5130a03bd0c189b112d1d26c610844a4c9
4
+ data.tar.gz: 16ea016fb63066dd617c7b4ac39887bbacba40af
5
5
  SHA512:
6
- metadata.gz: 1573ea7f89f0a96d37de04724792c18f0f34f646f0e6fc005605c521903fb49cf6890dd9826fcf6e1a229ace0a8369fb2c36a61b0fe5444874d06fc1323a36b2
7
- data.tar.gz: 7154af7ce5f9ce392b926b448f82e0b035345559c206cc0ca13d65204235df22e20c197668f71f61bf005b9a26d24f1697799842a577666dba87a6e41bd8c227
6
+ metadata.gz: c07b216029b059b7f9bc2dcff01278801933fb95d7a201f19d95951e52329f66c311ebf26bdbd910a2c57a2347adef6f366361f5e71643a735e8bc1775c4b61c
7
+ data.tar.gz: eeb161448cdd65860686248281c6438e0b9e096529956ae175faab0a41bfa40cd3a101017b9aa5367099d4ad39fd7bc972fde48beae88e3f3dcb2c500715ae22
data/bin/text_sentencer CHANGED
@@ -3,6 +3,7 @@ require 'json'
3
3
  require 'text_sentencer'
4
4
 
5
5
  config_filename = nil
6
+ output_mode = :sentences
6
7
 
7
8
  ## command line option processing
8
9
  require 'optparse'
@@ -13,6 +14,10 @@ optparse = OptionParser.new do |opts|
13
14
  config_filename = c
14
15
  end
15
16
 
17
+ opts.on('-j', '--json_output', 'outputs the result in JSON.') do
18
+ output_mode = :json
19
+ end
20
+
16
21
  opts.on('-h', '--help', 'displays this screen.') do
17
22
  puts opts
18
23
  exit
@@ -28,21 +33,13 @@ end
28
33
  sentencer = TextSentencer.new(config)
29
34
 
30
35
  text = ARGF.read
31
-
32
- ## Preprocessing
33
- # It should be removed later
34
- text.gsub!(/ +/, ' ')
35
- text.gsub!(/\n+/, "\n")
36
- text.gsub!(/\t+/, "\t")
37
- text.gsub!(/\n /, "\n")
38
- text.gsub!(/ \n/, "\n")
39
- text.gsub!(/\t /, "\t")
40
- text.gsub!(/ \t/, "\t")
41
- text.gsub!(/\n\t/, "\n")
42
- text.gsub!(/\t\n/, "\n")
43
-
44
36
  annotations = sentencer.annotate(text)
45
- annotations[:denotations].each do |d|
46
- span = d[:span]
47
- puts text[span[:begin]...span[:end]]
37
+
38
+ if output_mode == :json
39
+ puts JSON.pretty_generate(annotations)
40
+ else
41
+ annotations[:denotations].each do |d|
42
+ span = d[:span]
43
+ puts text[span[:begin]...span[:end]]
44
+ end
48
45
  end
@@ -0,0 +1,9 @@
1
+ class String
2
+ def scan_offset(regex)
3
+ Enumerator.new do |y|
4
+ self.scan(regex) do
5
+ y << Regexp.last_match
6
+ end
7
+ end
8
+ end
9
+ end
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ require 'text_sentencer/string_scan_offset'
2
3
  require 'pp'
3
4
 
4
5
  class TextSentencer
@@ -6,14 +7,10 @@ class TextSentencer
6
7
 
7
8
  DEFAULT_RULES = {
8
9
  # Every position of a newline character is always treated as a sentence break.
9
- break_characters: [
10
- "\n"
11
- ],
10
+ break_pattern: "([ \t]*\n+)+[ \t]*",
12
11
 
13
12
  # All the positions of space and tab characters are candidates for a sentence break.
14
- break_candidates: [
15
- " ", "\t"
16
- ],
13
+ candidate_pattern: "[ \t]+",
17
14
 
18
15
  # First, positive rules are applied to the break candidates to make initial segmentations.
19
16
  positive_rules: [
@@ -49,75 +46,73 @@ class TextSentencer
49
46
  def initialize(rules = nil)
50
47
  rules ||= DEFAULT_RULES
51
48
  @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
52
- @rules[:break_characters] ||= []
53
- @rules[:break_candidates] ||= []
49
+ @rules[:break_pattern] ||= ""
50
+ @rules[:candidate_pattern] ||= ""
54
51
  @rules[:positive_rules] ||= []
55
52
  @rules[:negative_rules] ||= []
56
53
  end
57
54
 
58
55
  def annotate(text)
59
- return nil if text.nil? || text.empty?
56
+ return nil if text.nil?
57
+
60
58
  sentences = segment(text)
61
59
  denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
62
- denotations.empty? ? {text:text} : {text:text, denotations:denotations}
60
+ {text:text, denotations:denotations}
63
61
  end
64
62
 
65
63
  private
66
64
 
67
65
  def segment(text)
68
- original_text = text
69
- text = original_text.strip
70
- start = original_text.index(text)
71
-
72
- # sentence breaks
73
- breaks = []
66
+ breaks = if @rules[:break_pattern].empty?
67
+ []
68
+ else
69
+ text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
70
+ end
74
71
 
75
- # breaks by positive rules
76
- pbreaks = []
72
+ candidates = if @rules[:candidate_pattern].empty?
73
+ []
74
+ else
75
+ text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
76
+ end
77
77
 
78
- # canceled breaks by negative rules
79
- nbreaks = []
78
+ # breaks take precedence
79
+ candidates -= breaks
80
80
 
81
- for l in 0..text.length
81
+ candidates.each do |c|
82
+ last_end, next_begin = c
82
83
 
83
- ## apply the positive rules to the places of break candidates
84
- if @rules[:break_candidates].include?(text[l])
85
- @rules[:positive_rules].each do |r|
86
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
87
- pbreaks << l
88
- break
89
- end
90
- end
91
- elsif @rules[:break_characters].include?(text[l])
92
- breaks << l
84
+ if (last_end == 0) || (next_begin == text.length)
85
+ breaks << c
86
+ next
93
87
  end
94
- end
95
88
 
96
- ## apply the negative rules to the places of break candidates
97
- pbreaks.each do |l|
98
- @rules[:negative_rules].each do |r|
99
- if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
100
- nbreaks << l
89
+ last_text = text[0...last_end]
90
+ next_text = text[next_begin..-1]
91
+
92
+ @rules[:positive_rules].each do |p|
93
+ if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
94
+ break_p = true
95
+ @rules[:negative_rules].each do |n|
96
+ if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
97
+ break_p = false
98
+ break
99
+ end
100
+ end
101
+ breaks << c if break_p
101
102
  break
102
103
  end
103
104
  end
104
105
  end
105
- breaks += pbreaks - nbreaks
106
+
106
107
  breaks.sort!
107
108
 
108
109
  sentences = []
109
- lastbreak = -1
110
+ lastbreak = 0
110
111
  breaks.each do |b|
111
- sentences.push([lastbreak+1, b])
112
- lastbreak = b
112
+ sentences << [lastbreak, b[0]] if b[0] > lastbreak
113
+ lastbreak = b[1]
113
114
  end
114
- sentences.push([lastbreak+1, text.length])
115
-
116
- ## filter out empty segments
117
- sentences.delete_if {|b, e| text[b...e] !~ /[a-zA-Z0-9]/}
118
-
119
- ## adjust offsets for the in text
120
- sentences.collect!{|b, e| [b + start, e + start]}
115
+ sentences << [lastbreak, text.length] if lastbreak < text.length
121
116
 
122
117
  sentences
123
118
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_sentencer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jin-Dong Kim
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-07-21 00:00:00.000000000 Z
11
+ date: 2017-07-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: TextSentencer is a simple rule-based system for segmenting a text block
14
14
  into sentences.
@@ -20,6 +20,7 @@ extra_rdoc_files: []
20
20
  files:
21
21
  - bin/text_sentencer
22
22
  - lib/text_sentencer.rb
23
+ - lib/text_sentencer/string_scan_offset.rb
23
24
  - lib/text_sentencer/text_sentencer.rb
24
25
  homepage: https://github.com/jdkim/text_sentencer
25
26
  licenses: