llt-tokenizer 0.0.1

@@ -0,0 +1,362 @@
+ require 'array_scanner'
+ require 'llt/core'
+ require 'llt/constants/abbreviations'
+ require 'llt/core_extensions/array'
+ require 'llt/db_handler/prometheus'
+ require 'llt/helpers/metrical'
+
+ module LLT
+   class Tokenizer
+     require 'llt/token'
+     require 'llt/tokenizer/worker'
+
+     include Core::Serviceable
+     include Constants::Abbreviations
+     include Helpers::Metrical
+
+     uses_db { DbHandler::Prometheus.new }
+
+     attr_reader :default_options
+
+     def self.default_options
+       {
+         shifting: true,
+         enclitics_marker: '-',
+         merging: true,
+         indexing: true,
+       }
+     end
+
+     def tokenize(text, add_to: nil, **options)
+       raise ArgumentError.new("The argument passed must be a String") unless text.is_a?(String)
+       return [] if text.empty?
+
+       setup(text, options)
+
+       find_abbreviations_and_join_strings
+       split_enklitika_and_change_their_position
+       merge_what_needs_merging if @merging # quam diu => quamdiu
+       tokens = create_tokens
+
+       add_to << tokens if add_to.respond_to?(:<<)
+       tokens
+     end
+
+     def setup(text, options = {}, worker = [])
+       @text = text
+       evaluate_metrical_presence(@text)
+       @enclitics_marker = parse_option(:enclitics_marker, options)
+       @merging = parse_option(:merging, options)
+       @shifting = parse_option(:shifting, options)
+       @indexing = parse_option(:indexing, options)
+       @worker = setup_worker(worker)
+       @shift_range = shift_range(@shifting)
+     end
+     PUNCTUATION = /([\.\?,!;\-:"'”\(\)\[\]†]|<\/?.+?>)\1*/
+
+     # This is here for two reasons:
+     # 1) easier test setup, when a preliminary result shall be
+     #    evaluated further
+     #
+     # 2) more importantly, it adds a level of indirection when the
+     #    given text holds metrical information: a substitute
+     #    implementation for the worker array is used, but only if
+     #    it's needed - which should perform better when no metrics
+     #    are involved (the default case)
+     def setup_worker(worker)
+       if worker.any?
+         worker
+       else
+         elements = @text.gsub(PUNCTUATION, ' \0 ').split
+         put_xml_attributes_back_together(elements)
+         if metrical?
+           Worker.new(elements, @enclitics_marker)
+         else
+           elements
+         end
+       end
+     end
+
+     def shift_range(shifting_enabled)
+       shifting_enabled ? 0 : 1
+     end
+
+     def put_xml_attributes_back_together(elements)
+       # elements could be like this
+       # ['<tag', 'attr1="val"', 'attr2="val">']
+       # and we want the complete xml tag back together
+       as = ArrayScanner.new(elements)
+       loop do
+         last = as.look_behind
+         if last && last.start_with?('<') && !last.end_with?('>')
+           if as.current.match(/\w+=".*"$|>/)
+             last << ' ' << as.current
+             elements.delete_at(as.pos)
+             # we don't need to forward, as we delete an element anyway
+             next
+           end
+         else
+           as.forward(1)
+         end
+         break if as.eoa?
+       end
+     end
+
+
+     ######################
+
+     # covers abbreviated Roman praenomina like Ti. in Ti. Claudius Nero
+     # covers Roman date expressions like a. d. V. Kal. Apr.
+     ABBREVIATIONS = /^(#{ALL_ABBRS_PIPED})$/
+
+     # %w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }
+
+     def find_abbreviations_and_join_strings
+       arr = []
+       @worker.each_with_index do |e, i|
+         n = @worker[i + 1]
+         if e =~ ABBREVIATIONS && n == "."
+           @worker[i + 1] = n.prepend(e)
+           arr << (i - arr.size)
+         end
+       end
+
+       arr.each { |i| @worker.delete_at(i) }
+     end
+
+     ######################
+
+     WORDS_ENDING_WITH_QUE = /^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i # neque taken out!
+     WORDS_ENDING_WITH_NE  = /^(omne|sine|bene|paene)$/i
+     WORDS_ENDING_WITH_VE  = /^(sive|neve)$/i
+
+     # laetusque to -que laetus
+     # in eoque to -que in eo
+     # honestumne to -ne honestum
+     #
+     # but
+     #
+     # uterque, institutione, sive et al. remain untouched
+
+     ENCLITICS = %w{ que ne ve c }
+     def split_enklitika_and_change_their_position
+       split_with_force
+       split_nec
+       make_frequent_corrections
+     end
+
+     def split_with_force
+       # uses brute force at first
+       # the restrictor regexps handle only obvious cases
+
+       # don't use c here atm
+       ENCLITICS[0..-2].each do |encl|
+         split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}"))
+       end
+     end
+
+     def split_enklitikon(encl, restrictors)
+       # needs a word character in front - a bare ne itself stays intact
+       regexp = /(?<=\w)#{encl}$/
+
+       indices = []
+       @worker.each_with_index do |token, i|
+         if token.match(regexp) && restrictors !~ token
+           token.slice!(regexp)
+           indices << (i + indices.size + @shift_range)
+         end
+       end
+
+       indices.each { |i| @worker.insert(i, enclitic(encl)) }
+     end
+
+     def enclitic(val)
+       "#{@enclitics_marker}#{val}"
+     end
+
+     def split_nec
+       indices = []
+       @worker.each_with_index do |token, i|
+         if token == 'nec'
+           token.slice!(-1)
+           indices << (i + indices.size + @shift_range)
+         end
+       end
+
+       indices.each { |i| @worker.insert(i, enclitic('c')) }
+     end
+
+     def make_frequent_corrections
+       # uses db lookups
+       # TODO 27.11.13 14:15 by LFDM
+       #   Implement caching here
+       ne_corrections
+       que_corrections
+       ve_corrections
+     end
+
+     def que_corrections
+       # this is only needed in rare cases like in eoque,
+       # which needs a shift to -que in eo
+       if @shifting
+         to_be_shifted_que_indices.each do |i|
+           @worker.insert(i - 1, @worker.delete_at(i))
+         end
+       end
+     end
+
+     def to_be_shifted_que_indices
+       # double shifts would probably fail, but they should never happen
+       @worker.each_with_index.each_with_object([]) do |(element, index), accumulator|
+         accumulator << index if is_que?(element) && led_by_preposition?(index)
+       end
+     end
+
+     def is_que?(element)
+       element == enclitic('que')
+     end
+
+     def led_by_preposition?(index)
+       @worker[index - 1] =~ /^(in|ad|ob)$/i # and others
+     end
+
+     def ne_corrections
+       corrections = []
+       @worker.each_with_index do |w, i|
+         if w == enclitic('ne')
+           orig_el = original_word(i)
+
+           entries = []
+           entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/          # actio-ne ratio-ne
+           entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/ # Plato-ne Cicero-ne Solo-ne
+           entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /d?i$/  # fortitudi-ne ratio-ne libidi-ne homi-ne
+           entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /mi$/   # flumi-ne agmi-ne
+           entries += lookup(orig_el + "n", :adjective, :stem)                  # communis commune
+
+           if entries.any?(&:third_decl_with_possible_ne_abl?)
+             corrections << i - corrections.size
+           end
+         end
+       end
+
+       reverse_splittings(corrections)
+     end
+
+     def ve_corrections
+       corrections = []
+       @worker.each_with_index do |w, i|
+         if w == enclitic('ve')
+           orig_el = original_word(i)
+
+           entries = []
+           entries += lookup(orig_el + 'v', :adjective, :stem, 1)
+           entries += lookup(orig_el + 'v', :adjective, :stem, 3)
+           entries += lookup(orig_el + 'v', :noun, :stem, [2, 5])
+           entries += lookup(orig_el + 've', :verb, :pr, 2)
+           entries += lookup(orig_el + 'v', :verb, :pr, [3, 5]) # not sure if such a word of 5 exists
+
+           if entries.any?
+             corrections << i - corrections.size
+           end
+         end
+       end
+
+       reverse_splittings(corrections)
+     end
+
+     def original_word(i)
+       # there are two possible scenarios at this point
+       # with shifting enabled:
+       #           i      i + 1
+       #   arma    que    virum
+       # with shifting disabled:
+       #           i - 1  i
+       #   arma    virum  que
+       @worker[i + (@shifting ? 1 : -1)]
+     end
+
+     def lookup(string, type, column, inflection_class = 3)
+       string = (type == :persona ? string : string.downcase)
+       query = {
+         type: type, stem_type: column, stem: string,
+         restrictions: { type: :inflection_class, values: Array(inflection_class) }
+       }
+       @db.look_up_stem(query)
+     end
+
+     def reverse_splittings(indices)
+       indices.each do |i|
+         # the original word needs to be retrieved before the enclitic
+         # is deleted from the worker
+         ow = original_word(i)
+         splitted = @worker.delete_at(i).delete(@enclitics_marker)
+         ow << splitted
+       end
+     end
+
+     ######################
+
+     MERGE_WORDS = [ %w{ quam diu }, ['non', /null.{1,4}$/] ]
+
+     # quam diu to quamdiu
+     def merge_what_needs_merging
+       to_delete = []
+       @worker.each_overlapping_pair.each_with_index do |pair, i|
+         merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair)
+       end
+       to_delete.each { |i| @worker.delete_at(i) }
+     end
+
+     def is_a_mergable_pair?(x, y)
+       # x, i.e. quam in quamdiu, needs to be downcased, as it could be
+       # in a sentence's first position
+       MERGE_WORDS.any? { |a, b| a === x.downcase && b === y }
+     end
+
+     def merge_words(pair, i, to_delete)
+       pair.first << pair.last
+       to_delete << (i + 1 - to_delete.size)
+     end
+
+     ######################
+
+     ABBR_NAME_WITH_DOT       = /^(#{NAMES_PIPED})\.$/
+     ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
+     PUNCT_ITSELF             = Regexp.new(PUNCTUATION.source + '$')
+     XML_TAG                  = /<\/?.+?>/
+
+     def create_tokens
+       # the call to #to_a retrieves (and aligns) optional metrical data
+       reset_id
+       @worker.to_a.map! do |el|
+         case el
+         when XML_TAG then Token::XmlTag.new(el)
+         when ABBR_NAME_WITH_DOT then raise_id and Token::Filler.new(el, @id)
+         when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id)
+         when PUNCT_ITSELF then raise_id and Token::Punctuation.new(el, @id)
+         else raise_id and Token::Word.new(el, @id)
+         end
+       end
+     end
+
+     def reset_id
+       @id = (@indexing ? 0 : nil)
+     end
+
+     def raise_id
+       if @indexing
+         @id += 1
+       else
+         # needs to return true, because this is used as the first part
+         # of an and construction
+         true
+       end
+     end
+
+     def preliminary
+       @worker.to_a
+     end
+   end
+ end
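
Taken together, the pipeline above can be driven as in the following minimal usage sketch. It is not part of the diff: it assumes a working llt-core service setup and a reachable Prometheus stem database (see uses_db above), and the outputs shown in the comments are illustrative assumptions about the tokens' string form.

    require 'llt/tokenizer'

    tokenizer = LLT::Tokenizer.new

    # With the defaults (shifting: true, enclitics_marker: '-',
    # merging: true, indexing: true) the enclitic -que is split off
    # and shifted in front of its host word:
    tokens = tokenizer.tokenize('arma virumque cano.')
    tokens.map(&:to_s) # => ["arma", "-que", "virum", "cano", "."] (assumed)

    # Options can be overridden per call; without shifting, the split
    # enclitic stays behind its host word:
    tokenizer.tokenize('arma virumque cano.', shifting: false)
    # => arma virum -que cano . (assumed)
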
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'llt/tokenizer/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "llt-tokenizer"
+   spec.version       = LLT::Tokenizer::VERSION
+   spec.authors       = ["LFDM"]
+   spec.email         = ["1986gh@gmail.com"]
+   spec.description   = %q{LLT's Tokenizer}
+   spec.summary       = %q{Breaks Latin sentences into tokens}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "rspec"
+   spec.add_development_dependency "simplecov", "~> 0.7"
+   # array_scanner is required at runtime by lib/llt/tokenizer.rb
+   spec.add_dependency "array_scanner"
+   spec.add_dependency "llt-core"
+   spec.add_dependency "llt-core_extensions"
+   spec.add_dependency "llt-db_handler"
+   spec.add_dependency "llt-helpers"
+ end
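
A consumer would pull the gem in through Bundler; a minimal Gemfile sketch (the version constraint is an assumption for illustration):

    source 'https://rubygems.org'

    gem 'llt-tokenizer', '~> 0.0.1'
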
@@ -0,0 +1,58 @@
+ ENV['RACK_ENV'] = 'test'
+
+ require 'spec_helper'
+ require 'llt/tokenizer/api'
+ require 'rack/test'
+
+ def app
+   Api
+ end
+
+ describe "tokenizer api" do
+   include Rack::Test::Methods
+
+   describe '/tokenize' do
+     context "with URI as input" do
+     end
+
+     let(:text) {{ text: "homo mittit." }}
+
+     context "with text as input" do
+       context "with accept header json" do
+         it "segments the given text" do
+           pending
+           get '/tokenize', text,
+               { "HTTP_ACCEPT" => "application/json" }
+           last_response.should be_ok
+           response = last_response.body
+           parsed_response = JSON.parse(response)
+           parsed_response.should have(3).items
+         end
+       end
+
+       context "with accept header xml" do
+         it "tokenizes the given text" do
+           get '/tokenize', text,
+               { "HTTP_ACCEPT" => "application/xml" }
+           last_response.should be_ok
+           body = last_response.body
+           body.should =~ /<w n="1">homo<\/w>/
+           body.should =~ /<w n="2">mittit<\/w>/
+           body.should =~ /<pc n="3">\.<\/pc>/
+         end
+
+         it "receives params for tokenization and markup" do
+           params = { indexing: false }.merge(text)
+
+           get '/tokenize', params,
+               { "HTTP_ACCEPT" => "application/xml" }
+           last_response.should be_ok
+           body = last_response.body
+           body.should =~ /<w>homo<\/w>/
+           body.should =~ /<w>mittit<\/w>/
+           body.should =~ /<pc>\.<\/pc>/
+         end
+       end
+     end
+   end
+ end
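
Outside the suite, the same endpoint can be poked at from a one-off script with rack/test. A hedged sketch, assuming Api is the Rack app that llt/tokenizer/api defines and that the XML envelope mirrors the expectations above:

    require 'rack/test'
    require 'llt/tokenizer/api'

    include Rack::Test::Methods

    def app
      Api
    end

    # Tokenizer options travel as plain query params next to the text;
    # with indexing: false the <w>/<pc> elements carry no n="..." attributes
    get '/tokenize', { text: 'homo mittit.', indexing: false },
        { 'HTTP_ACCEPT' => 'application/xml' }
    puts last_response.body # e.g. <w>homo</w><w>mittit</w><pc>.</pc> (assumed)
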