punkt-segmenter 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
1
module Punkt
  # A single word token together with the annotations the segmenter
  # attaches to it (abbreviation, ellipsis, sentence break, ...).
  class Token

    attr_accessor :token, :type, :period_final
    attr_accessor :paragraph_start, :line_start
    attr_accessor :sentence_break, :abbr, :ellipsis

    # token   - the raw token text.
    # options - optional annotations; only the keys :paragraph_start,
    #           :line_start, :sentence_break, :abbr and :ellipsis are used,
    #           every other key is silently ignored.
    def initialize(token, options = {})
      valid_options = [:paragraph_start, :line_start, :sentence_break, :abbr, :ellipsis]

      @token        = token
      # The type is the downcased token; numeric tokens collapse into one type.
      @type         = UnicodeUtils.downcase(token).gsub(/^-?[\.,]?\d[\d,\.-]*\.?$/, '##number##')
      @period_final = token.end_with?('.')

      # Every annotation starts out as nil, then recognised options are applied.
      valid_options.each { |name| instance_variable_set("@#{name}", nil) }
      options.each do |key, value|
        next unless valid_options.include?(key)
        instance_variable_set("@#{key}", value)
      end
    end

    # The type with one trailing period removed; a type of a single
    # character is returned untouched.
    def type_without_period
      if @type.size > 1 && @type.end_with?('.')
        @type.chop
      else
        @type
      end
    end

    # The type without its trailing period, but only when this token is
    # marked as a sentence break; otherwise the plain type.
    def type_without_sentence_period
      return @type unless @sentence_break
      type_without_period
    end

    # True when the first character of the token is uppercase.
    def first_upper?
      UnicodeUtils.uppercase_char?(@token[0])
    end

    # True when the first character of the token is lowercase.
    def first_lower?
      UnicodeUtils.lowercase_char?(@token[0])
    end

    # Returns :lower, :upper or :none depending on the first character.
    def first_case
      if first_lower?
        :lower
      elsif first_upper?
        :upper
      else
        :none
      end
    end

    # Whether the raw token ended with a period when it was created.
    def ends_with_period?
      @period_final
    end

    # True for tokens made only of two or more periods.
    def is_ellipsis?
      !!(@token =~ /^\.\.+$/)
    end

    # True when the type was replaced by the numeric placeholder.
    def is_number?
      @type.start_with?("##number##")
    end

    # True for a single letter followed by a period.
    def is_initial?
      !!(@token =~ /^[^\W\d]\.$/)
    end

    # True when the token consists of letters (or underscores) only.
    def is_alpha?
      !!(@token =~ /^[^\W\d]+$/)
    end

    # True when the type contains at least one letter (or underscore).
    def is_non_punctuation?
      !!(@type =~ /[^\W\d]/)
    end

    # The token followed by one marker per set annotation:
    # <A> abbreviation, <E> ellipsis, <S> sentence break.
    def to_s
      markers = [[@abbr, '<A>'], [@ellipsis, '<E>'], [@sentence_break, '<S>']]
      markers.reduce(@token) { |text, (flag, tag)| flag ? text + tag : text }
    end

    def inspect
      "<#{to_s}>"
    end
  end
end
@@ -0,0 +1,304 @@
1
+ require "punkt-segmenter/frequency_distribution"
2
+
3
module Punkt
  # Collects statistics from training text and derives the Punkt
  # parameters from them: abbreviation types, collocations, frequent
  # sentence starters and orthographic contexts.
  #
  # NOTE(review): @parameters, @language_vars, @token_class and the helpers
  # tokenize_words, annotate_first_pass and pair_each are not defined in
  # this class; they are presumably provided by Base - confirm there.
  class Trainer < Base

    # cut-off value whether a 'token' is an abbreviation
    ABBREV = 0.3

    # allows the disabling of the abbreviation penalty heuristic, which
    # exponentially disadvantages words that are found at times without a
    # final period.
    IGNORE_ABBREV_PENALTY = false

    # upper cut-off for Mikheev's (2002) abbreviation detection algorithm
    ABBREV_BACKOFF = 5

    # minimal log-likelihood value that two tokens need to be considered
    # as a collocation
    COLLOCATION = 7.88

    # minimal log-likelihood value that a token requires to be considered
    # as a frequent sentence starter
    SENT_STARTER = 30

    # this includes as potential collocations all word pairs where the first
    # word ends in a period. It may be useful in corpora where there is a lot
    # of variation that makes abbreviations like Mr difficult to identify.
    INCLUDE_ALL_COLLOCS = true # TODO: set to false

    # this includes as potential collocations all word pairs where the first
    # word is an abbreviation. Such collocations override the orthographic
    # heuristic, but not the sentence starter heuristic. This is overridden by
    # INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
    # and ordinals are considered.
    INCLUDE_ABBREV_COLLOCS = false

    # this sets a minimum bound on the number of times a bigram needs to
    # appear before it can be considered a collocation, in addition to log
    # likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is true.
    MIN_COLLOC_FREQ = 1

    # language_vars - language specific settings (forwarded to Base).
    # token_class   - class used to wrap raw token strings.
    def initialize(language_vars = Punkt::LanguageVars.new,
                   token_class   = Punkt::Token)

      super(language_vars, token_class)

      @type_fdist             = Probability::FrequencyDistribution.new
      @collocation_fdist      = Probability::FrequencyDistribution.new
      @sentence_starter_fdist = Probability::FrequencyDistribution.new
      @period_tokens_count    = 0
      @sentence_break_count   = 0
      @finalized              = false
    end

    # Feeds more training data into the trainer.
    #
    # text_or_tokens - either a String (tokenized with tokenize_words) or an
    #                  Array of token strings (each wrapped in the token class).
    #
    # Raises ArgumentError for any other kind of input instead of failing
    # later with an unrelated error on nil.
    def train(text_or_tokens)
      tokens =
        if text_or_tokens.kind_of?(String)
          tokenize_words(text_or_tokens)
        elsif text_or_tokens.kind_of?(Array)
          text_or_tokens.map { |t| @token_class.new(t) }
        else
          raise ArgumentError,
                "expected a String or an Array of tokens, got #{text_or_tokens.class}"
        end

      train_tokens(tokens)
    end

    # Returns the trained parameters, finalizing the training first when
    # data was added since the last finalization.
    def parameters
      finalize_training unless @finalized
      @parameters
    end

    # Recomputes the sentence starters and collocations from the collected
    # frequency distributions and stores them in the parameters.
    def finalize_training
      @parameters.clear_sentence_starters
      find_sentence_starters do |type, _ll|
        @parameters.sentence_starters << type
      end

      @parameters.clear_collocations
      find_collocations do |types, _ll|
        @parameters.collocations << [types[0], types[1]]
      end

      @finalized = true
    end

  private

    # Updates all statistics with the given tokens.
    def train_tokens(tokens)
      # New data invalidates previously finalized sentence starters and
      # collocations; without this reset #parameters would keep returning
      # stale results after a second call to #train.
      @finalized = false

      tokens.each do |token|
        @type_fdist << token.type
        @period_tokens_count += 1 if token.ends_with_period?
      end

      unique_types = Set.new(tokens.map { |t| t.type })

      reclassify_abbreviation_types(unique_types) do |abbr, score, is_add|
        if score >= ABBREV
          @parameters.abbreviation_types << abbr if is_add
        else
          @parameters.abbreviation_types.delete(abbr) unless is_add
        end
      end

      tokens = annotate_first_pass(tokens)

      get_orthography_data(tokens)

      tokens.each { |token| @sentence_break_count += 1 if token.sentence_break }

      pair_each(tokens) do |tok1, tok2|
        # Only pairs whose first token ends in a period are of interest;
        # a missing second token is skipped as well.
        next if !tok1.ends_with_period? || !tok2

        if is_rare_abbreviation_type?(tok1, tok2)
          @parameters.abbreviation_types << tok1.type_without_period
        end

        if is_potential_sentence_starter?(tok2, tok1)
          @sentence_starter_fdist << tok2.type
        end

        if is_potential_collocation?(tok1, tok2)
          @collocation_fdist << [tok1.type_without_period, tok2.type_without_sentence_period]
        end
      end
    end

    # Scores every candidate type and yields (type_without_period, score,
    # is_add). is_add is true when the type is a candidate for becoming an
    # abbreviation and false when a known abbreviation is being re-checked.
    def reclassify_abbreviation_types(types)
      types.each do |type|
        # types without any letter and the numeric placeholder are skipped
        next if (type =~ /[^\W\d]/).nil? || type == "##number##"

        if type.end_with?(".")
          next if @parameters.abbreviation_types.include?(type)
          candidate = type.chop
          is_add    = true
        else
          next unless @parameters.abbreviation_types.include?(type)
          candidate = type
          is_add    = false
        end

        # the removed final period counts as one period
        periods_count     = candidate.count(".") + 1
        non_periods_count = candidate.size - periods_count + 1

        with_periods_count    = @type_fdist[candidate + "."]
        without_periods_count = @type_fdist[candidate]

        ll = dunning_log_likelihood(with_periods_count + without_periods_count,
                                    @period_tokens_count,
                                    with_periods_count,
                                    @type_fdist.N)

        f_length  = Math.exp(-non_periods_count)
        f_periods = periods_count
        # The penalty is a multiplicative factor, so "no penalty" must be 1;
        # a factor of 0 would wipe out the score of every candidate.
        f_penalty = IGNORE_ABBREV_PENALTY ? 1 : non_periods_count**(-without_periods_count).to_f

        score = ll * f_length * f_periods * f_penalty

        yield(candidate, score, is_add)
      end
    end

    # Log-likelihood ratio comparing the observed rate p1 = count_b / n
    # against the fixed rate 0.99, for count_ab hits out of count_a trials.
    def dunning_log_likelihood(count_a, count_b, count_ab, n)
      p1 = count_b.to_f / n
      p2 = 0.99

      null_hypo = (count_ab.to_f * Math.log(p1) +
                   (count_a - count_ab) * Math.log(1.0 - p1))
      alt_hypo  = (count_ab.to_f * Math.log(p2) +
                   (count_a - count_ab) * Math.log(1.0 - p2))

      likelihood = null_hypo - alt_hypo

      -2.0 * likelihood
    end

    # Records for every token type in which context (:initial, :internal,
    # :unknown) and with which first-letter case it was seen.
    def get_orthography_data(tokens)
      context = :internal

      tokens.each do |aug_token|
        context = :initial if aug_token.paragraph_start && context != :unknown
        context = :unknown if aug_token.line_start && context == :internal

        type = aug_token.type_without_sentence_period

        flag = Punkt::ORTHO_MAP[[context, aug_token.first_case]] || 0
        # NOTE(review): 0 is truthy in Ruby, so a zero flag is recorded too.
        # Kept unchanged because the effect depends on
        # add_orthographic_context, which is defined outside this class.
        @parameters.add_orthographic_context(type, flag) if flag

        # the context of the following token depends on the current one
        if aug_token.sentence_break
          context = !(aug_token.is_number? || aug_token.is_initial?) ? :initial : :unknown
        elsif aug_token.ellipsis || aug_token.abbr
          context = :unknown
        else
          context = :internal
        end
      end
    end

    # Decides whether current_token, so far treated as a sentence break and
    # seen fewer than ABBREV_BACKOFF times, should be taken as an
    # abbreviation because of the token that follows it.
    def is_rare_abbreviation_type?(current_token, next_token)
      return false if current_token.abbr || !current_token.sentence_break

      type = current_token.type_without_sentence_period

      count = @type_fdist[type] + @type_fdist[type.chop]
      return false if @parameters.abbreviation_types.include?(type) || count >= ABBREV_BACKOFF

      # followed by sentence-internal punctuation
      return true if @language_vars.internal_punctuation.include?(next_token.token[0])

      if next_token.first_lower?
        type2   = next_token.type_without_sentence_period
        context = @parameters.orthographic_context[type2]
        # The following word is lowercase here although it has been seen
        # capitalized at a sentence start and never capitalized inside a
        # sentence, so the period did not start a new sentence.
        return true if (context & Punkt::ORTHO_BEG_UC != 0) && (context & Punkt::ORTHO_MID_UC == 0)
      end

      false
    end

    # A purely alphabetic token following a sentence break that was not
    # caused by a number or an initial.
    def is_potential_sentence_starter?(current_token, previous_token)
      previous_token.sentence_break &&
        !(previous_token.is_number? || previous_token.is_initial?) &&
        current_token.is_alpha?
    end

    # Whether the pair (tok1, tok2) should be counted as a collocation
    # candidate. Both tokens must contain at least one letter.
    def is_potential_collocation?(tok1, tok2)
      candidate = INCLUDE_ALL_COLLOCS ||
                  (INCLUDE_ABBREV_COLLOCS && tok1.abbr) ||
                  # the first token is the one that must be a number or an
                  # initial (see the INCLUDE_ABBREV_COLLOCS comment above)
                  (tok1.sentence_break && (tok1.is_number? || tok1.is_initial?))

      candidate && tok1.is_non_punctuation? && tok2.is_non_punctuation?
    end

    # Yields (type, log_likelihood) for every type that starts sentences
    # significantly more often than expected.
    def find_sentence_starters
      @sentence_starter_fdist.each do |type, type_at_break_count|
        next if !type

        type_count = @type_fdist[type] + @type_fdist[type + "."]

        # inconsistent counts, nothing meaningful can be computed
        next if type_count < type_at_break_count

        ll = col_log_likelihood(@sentence_break_count,
                                type_count,
                                type_at_break_count,
                                @type_fdist.N)

        if (ll >= SENT_STARTER &&
            @type_fdist.N.to_f / @sentence_break_count > type_count.to_f / type_at_break_count)
          yield(type, ll)
        end
      end
    end

    # Log-likelihood ratio for the co-occurrence of two events that were
    # seen count_a and count_b times, count_ab times together, out of n.
    def col_log_likelihood(count_a, count_b, count_ab, n)
      p_all = 1.0 * count_b / n
      p1    = 1.0 * count_ab / count_a
      p2    = 1.0 * (count_b - count_ab) / (n - count_a)

      summand1 = (count_ab * Math.log(p_all) +
                  (count_a - count_ab) * Math.log(1.0 - p_all))

      summand2 = ((count_b - count_ab) * Math.log(p_all) +
                  (n - count_a - count_b + count_ab) * Math.log(1.0 - p_all))

      # the guarded cases would evaluate log(0)
      if count_a == count_ab
        summand3 = 0
      else
        summand3 = (count_ab * Math.log(p1) +
                    (count_a - count_ab) * Math.log(1.0 - p1))
      end

      if count_b == count_ab
        summand4 = 0
      else
        summand4 = ((count_b - count_ab) * Math.log(p2) +
                    (n - count_a - count_b + count_ab) * Math.log(1.0 - p2))
      end

      likelihood = summand1 + summand2 - summand3 - summand4

      -2.0 * likelihood
    end

    # Yields ([type1, type2], log_likelihood) for every counted pair that
    # qualifies as a collocation.
    def find_collocations
      @collocation_fdist.each do |types, col_count|
        type1, type2 = types

        next if type1.nil? || type2.nil?
        next if @parameters.sentence_starters.include?(type2)

        type1_count = @type_fdist[type1] + @type_fdist[type1 + "."]
        type2_count = @type_fdist[type2] + @type_fdist[type2 + "."]

        if (type1_count > 1 && type2_count > 1 &&
            MIN_COLLOC_FREQ < col_count &&
            col_count <= [type1_count, type2_count].min)

          ll = col_log_likelihood(type1_count, type2_count,
                                  col_count, @type_fdist.N)

          if (ll >= COLLOCATION &&
              @type_fdist.N.to_f / type1_count > type2_count.to_f / col_count)
            yield([type1, type2], ll)
          end
        end
      end
    end

  end
end
@@ -0,0 +1,17 @@
1
# Gem specification for punkt-segmenter.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = "punkt-segmenter"
  spec.version  = "0.9.0"
  spec.platform = Gem::Platform::RUBY
  spec.summary  = "Ruby port of the NLTK Punkt sentence segmentation algorithm"

  # Author
  spec.author   = "Luis Cipriani"
  spec.email    = "lfcipriani@talleye.com"
  spec.homepage = "http://github.com/lfcipriani/punkt-segmenter"

  # Packaged files
  spec.require_paths = %w[lib]
  spec.files         = Dir.glob("{lib/**/*.rb,README.md,LICENSE.txt,test/**/*.rb,Rakefile,*.gemspec,script/*}")

  # Runtime dependencies
  spec.add_runtime_dependency 'unicode_utils', '>= 1.0.0'

  # Development dependencies
  %w[cover_me ruby-debug19].each do |gem_name|
    spec.add_development_dependency gem_name
  end
end
@@ -0,0 +1,7 @@
1
#!/usr/bin/env ruby
# File: script/console
#
# Starts an irb session with the punkt-segmenter library already required.
# Every command-line argument is forwarded to irb unchanged.

# On Windows builds (mswin / mingw) irb is started through its batch file.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

# Absolute path, so that -r does not depend on the current directory or on
# the load path.
lib = File.expand_path('../lib/punkt-segmenter.rb', File.dirname(__FILE__))
puts "Loading punkt-segmenter env"

# Arguments are handed over as a list: several arguments stay separate
# (they used to be glued together) and no shell re-interprets paths that
# contain spaces or special characters.
exec(irb, '-r', lib, '--simple-prompt', *ARGV)
@@ -0,0 +1,118 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../test_helper')
2
+
3
# Unit tests for Probability::FrequencyDistribution.
#
# Assertions follow the Test::Unit convention assert_equal(expected, actual)
# so that failure messages report the two values the right way round.
class Probability::FrequencyDistributionTest < Test::Unit::TestCase

  # Sample data: "one" occurs 4 times, "two" 3 times and "three" twice.
  def setup
    @words     = %w(two one three one one three two one two)
    @freq_dist = Probability::FrequencyDistribution.new
  end

  def test_increment_count_on_given_sample
    @words.each { |word| @freq_dist << word }

    assert_equal 4, @freq_dist["one"]
    assert_equal 3, @freq_dist["two"]
    assert_equal 2, @freq_dist["three"]
    assert_equal 9, @freq_dist.N
  end

  def test_increment_count_on_given_sample_for_count_different_than_1
    @words.each { |word| @freq_dist.inc(word, 2) }

    assert_equal 8,  @freq_dist["one"]
    assert_equal 6,  @freq_dist["two"]
    assert_equal 4,  @freq_dist["three"]
    assert_equal 18, @freq_dist.N
  end

  def test_direct_count_attribution
    @freq_dist["one"]   = 10
    @freq_dist["two"]   = 20
    @freq_dist["three"] = 30

    assert_equal 10, @freq_dist["one"]
    assert_equal 20, @freq_dist["two"]
    assert_equal 30, @freq_dist["three"]
    assert_equal 60, @freq_dist.N
  end

  def test_get_sample_frequencies
    @words.each { |word| @freq_dist << word }

    total = @freq_dist.frequency_of("one") +
            @freq_dist.frequency_of("two") +
            @freq_dist.frequency_of("three")

    # The relative frequencies of all samples must add up to 1; a small
    # delta absorbs floating point error without hiding real deviations.
    assert_in_delta 1.0, total, 1e-9
  end

  def test_get_sample_with_maximum_ocurrences
    @words.each { |word| @freq_dist << word }

    assert_equal "one", @freq_dist.max
  end

  def test_merge_frequency_distribution
    @words.each { |word| @freq_dist << word }
    merged = @freq_dist.merge(@freq_dist)

    assert_equal 8,  merged["one"]
    assert_equal 6,  merged["two"]
    assert_equal 4,  merged["three"]
    assert_equal 18, merged.N

    # 9 samples in the receiver plus 18 in the merged distribution
    assert_equal 27, @freq_dist.merge!(merged).N
  end

  def test_get_keys_ordered_by_frequency_desc
    @words.each { |word| @freq_dist << word }

    assert_equal "one",   @freq_dist.keys.first
    assert_equal "two",   @freq_dist.keys[1]
    assert_equal "three", @freq_dist.keys.last
  end

  def test_get_values_ordered_by_frequency_desc
    @words.each { |word| @freq_dist << word }

    assert_equal 4, @freq_dist.values.first
    assert_equal 3, @freq_dist.values[1]
    assert_equal 2, @freq_dist.values.last
  end

  def test_iterators_must_order_by_frequency_desc
    @words.each { |word| @freq_dist << word }

    pairs = []
    @freq_dist.each { |sample, value| pairs << [sample, value] }
    assert_equal @freq_dist.items, pairs

    keys = []
    @freq_dist.each_key { |key| keys << key }
    assert_equal @freq_dist.keys, keys

    values = []
    @freq_dist.each_value { |value| values << value }
    assert_equal @freq_dist.values, values
  end

  def test_removing_samples
    @words.each { |word| @freq_dist << word }

    # delete returns the count of the removed sample
    assert_equal 4, @freq_dist.delete("one")
    assert_equal 5, @freq_dist.N

    # delete_if is expected to raise a RuntimeError
    assert_raise(RuntimeError) { @freq_dist.delete_if { |_sample, value| value == 2 } }
  end

  def test_features_with_empty_distribution
    assert_equal 0, @freq_dist["a sample"]
    assert_equal 0, @freq_dist.N
    assert_equal 0, @freq_dist.frequency_of("a sample")
    assert_nil @freq_dist.max
  end
end
118
+