llt-tokenizer 0.0.1

@@ -0,0 +1,362 @@
+ require 'array_scanner'
+ require 'llt/core'
+ require 'llt/constants/abbreviations'
+ require 'llt/core_extensions/array'
+ require 'llt/db_handler/prometheus'
+ require 'llt/helpers/metrical'
+
+ module LLT
+   class Tokenizer
+     require 'llt/token'
+     require 'llt/tokenizer/worker'
+
+     include Core::Serviceable
+     include Constants::Abbreviations
+     include Helpers::Metrical
+
+     uses_db { DbHandler::Prometheus.new }
+
+     attr_reader :default_options
+
+     def self.default_options
+       {
+         shifting: true,
+         enclitics_marker: '-',
+         merging: true,
+         indexing: true,
+       }
+     end
+
+     def tokenize(text, add_to: nil, **options)
+       raise ArgumentError.new("The argument passed must be a String") unless text.is_a?(String)
+       return [] if text.empty?
+
+       setup(text, options)
+
+       find_abbreviations_and_join_strings
+       split_enklitika_and_change_their_position
+       merge_what_needs_merging if @merging # quam diu => quamdiu
+       tokens = create_tokens
+
+       add_to << tokens if add_to.respond_to?(:<<)
+       tokens
+     end
+
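+     # A minimal usage sketch, assuming the default options above and a
+     # working db connection (the printed shape depends on Token#to_s):
+     #
+     #   tokenizer = LLT::Tokenizer.new
+     #   tokens = tokenizer.tokenize('Arma virumque cano.')
+     #   tokens.map(&:to_s) # => %w{ Arma -que virum cano . }
+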
+     def setup(text, options = {}, worker = [])
+       @text = text
+       evaluate_metrical_presence(@text)
+       @enclitics_marker = parse_option(:enclitics_marker, options)
+       @merging  = parse_option(:merging, options)
+       @shifting = parse_option(:shifting, options)
+       @indexing = parse_option(:indexing, options)
+       @worker = setup_worker(worker)
+       @shift_range = shift_range(@shifting)
+     end
+
+     PUNCTUATION = /([\.\?,!;\-:"'”\(\)\[\]†]|<\/?.+?>)\1*/
+
+     # This is here for two reasons:
+     # 1) easier test setup, when a preliminary result shall be
+     #    evaluated further
+     #
+     # 2) more importantly, it adds a level of indirection when the
+     #    given text holds metrical information: a substitute
+     #    implementation of the worker array is used, but only when
+     #    it is actually needed - which should perform better when
+     #    no metrics are involved (the default case)
+     def setup_worker(worker)
+       if worker.any?
+         worker
+       else
+         elements = @text.gsub(PUNCTUATION, ' \0 ').split
+         put_xml_attributes_back_together(elements)
+         if metrical?
+           Worker.new(elements, @enclitics_marker)
+         else
+           elements
+         end
+       end
+     end
+
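+     # For illustration (a sketch, not part of the original source):
+     # the gsub above pads every punctuation mark or XML tag with
+     # spaces, so a plain #split isolates it as its own element:
+     #
+     #   'homo mittit.'.gsub(PUNCTUATION, ' \0 ').split
+     #   # => ["homo", "mittit", "."]
+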
+     def shift_range(shifting_enabled)
+       shifting_enabled ? 0 : 1
+     end
+
+     def put_xml_attributes_back_together(elements)
+       # elements could be like this
+       # ['<tag', 'attr1="val"', 'attr2="val">']
+       # and we want the complete xml tag back together
+       as = ArrayScanner.new(elements)
+       loop do
+         last = as.look_behind
+         if last && last.start_with?('<') && !last.end_with?('>') &&
+            as.current.match(/\w+=".*"$|>/)
+           last << ' ' << as.current
+           elements.delete_at(as.pos)
+           # we don't need to forward, as we delete an element anyway
+         else
+           as.forward(1)
+         end
+         break if as.eoa?
+       end
+     end
+
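+     # A sketch of the effect (assuming the ArrayScanner semantics
+     # used above):
+     #
+     #   elements = ['<tag', 'attr1="val"', 'attr2="val">', 'word']
+     #   put_xml_attributes_back_together(elements)
+     #   elements # => ['<tag attr1="val" attr2="val">', 'word']
+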
+     ######################
+
+     # covers abbreviated Roman praenomina like Ti. in Ti. Claudius Nero
+     # and Roman date expressions like a. d. V. Kal. Apr.
+     ABBREVIATIONS = /^(#{ALL_ABBRS_PIPED})$/
+
+     # %w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }
+     def find_abbreviations_and_join_strings
+       arr = []
+       @worker.each_with_index do |e, i|
+         n = @worker[i + 1]
+         if e =~ ABBREVIATIONS && n == "."
+           @worker[i + 1] = n.prepend(e)
+           arr << (i - arr.size)
+         end
+       end
+
+       arr.each { |i| @worker.delete_at(i) }
+     end
+
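+     # Worked through (a sketch, not in the original source):
+     #
+     #   %w{ Atque M . Cicero mittit }
+     #   # "M" matches ABBREVIATIONS and is glued onto the dot,
+     #   # then the leftover "M" slot is deleted:
+     #   # => %w{ Atque M. Cicero mittit }
+     #
+     # arr stores (i - arr.size) because every earlier deletion
+     # shifts the following indices one to the left.
+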
+     ######################
+
+     WORDS_ENDING_WITH_QUE = /^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i # neque taken out!
+     WORDS_ENDING_WITH_NE  = /^(omne|sine|bene|paene)$/i
+     WORDS_ENDING_WITH_VE  = /^(sive|neve)$/i
+
+     # laetusque to -que laetus
+     # in eoque to -que in eo
+     # honestumne to -ne honestum
+     #
+     # but
+     #
+     # uterque, institutione, sive et al. remain
+
+     ENCLITICS = %w{ que ne ve c }
+     def split_enklitika_and_change_their_position
+       split_with_force
+       split_nec
+       make_frequent_corrections
+     end
+
+     def split_with_force
+       # uses brute force at first
+       # the restrictor regexps handle only obvious cases
+
+       # don't use c here atm
+       ENCLITICS[0..-2].each do |encl|
+         split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}"))
+       end
+     end
+
+     def split_enklitikon(encl, restrictors)
+       # needs a word character in front - the bare word ne itself
+       # should stay intact
+       regexp = /(?<=\w)#{encl}$/
+
+       indices = []
+       @worker.each_with_index do |token, i|
+         if token.match(regexp) && restrictors !~ token
+           token.slice!(regexp)
+           indices << (i + indices.size + @shift_range)
+         end
+       end
+
+       indices.each { |i| @worker.insert(i, enclitic(encl)) }
+     end
+
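+     # By example (a sketch; -que with default shifting enabled):
+     #
+     #   @worker = %w{ laetus laetusque }
+     #   split_enklitikon('que', WORDS_ENDING_WITH_QUE)
+     #   @worker # => %w{ laetus -que laetus }
+     #
+     # (i + indices.size + @shift_range) compensates for elements
+     # already inserted (indices.size) and places the marker before
+     # the host word when shifting is on (@shift_range 0), or after
+     # it when shifting is off (@shift_range 1).
+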
+     def enclitic(val)
+       "#{@enclitics_marker}#{val}"
+     end
+
+     def split_nec
+       indices = []
+       @worker.each_with_index do |token, i|
+         if token == 'nec'
+           token.slice!(-1)
+           indices << (i + indices.size + @shift_range)
+         end
+       end
+
+       indices.each { |i| @worker.insert(i, enclitic('c')) }
+     end
+
+     def make_frequent_corrections
+       # uses db lookups
+       # TODO 27.11.13 14:15 by LFDM
+       # Implement caching here
+       ne_corrections
+       que_corrections
+       ve_corrections
+     end
+
+     def que_corrections
+       # this is only used in rare cases like in eoque,
+       # which needs a shift to -que in eo
+       if @shifting
+         to_be_shifted_que_indices.each do |i|
+           @worker.insert(i - 1, @worker.delete_at(i))
+         end
+       end
+     end
+
+     def to_be_shifted_que_indices
+       # a double shift would probably fail, but it should never occur
+       @worker.each_with_index.each_with_object([]) do |(element, index), accumulator|
+         accumulator << index if is_que?(element) && led_by_preposition?(index)
+       end
+     end
+
+     def is_que?(element)
+       element == enclitic('que')
+     end
+
+     def led_by_preposition?(index)
+       @worker[index - 1] =~ /^(in|ad|ob)$/i # and others
+     end
+
+     def ne_corrections
+       corrections = []
+       @worker.each_with_index do |w, i|
+         if w == enclitic('ne')
+           orig_el = original_word(i)
+
+           entries = []
+           entries += lookup(orig_el, :noun, :nom)           if orig_el =~ /io$/  # actio-ne ratio-ne
+           entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/   # Plato-ne Cicero-ne Solo-ne
+           entries += lookup(orig_el + "n", :noun, :stem)    if orig_el =~ /d?i$/ # fortitudi-ne ratio-ne libidi-ne homi-ne
+           entries += lookup(orig_el + "n", :noun, :stem)    if orig_el =~ /mi$/  # flumi-ne agmi-ne
+           entries += lookup(orig_el + "n", :adjective, :stem)                    # communis commune
+
+           if entries.any?(&:third_decl_with_possible_ne_abl?)
+             corrections << i - corrections.size
+           end
+         end
+       end
+
+       reverse_splittings(corrections)
+     end
+
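+     # A sketch of the net effect: "ratione" is first split by force
+     # into %w{ -ne ratio }, but the stem lookup identifies ratio /
+     # ration- as a third declension noun whose ablative ends in -ne,
+     # so the split is reversed and "ratione" is restored as one token.
+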
+     def ve_corrections
+       corrections = []
+       @worker.each_with_index do |w, i|
+         if w == enclitic('ve')
+           orig_el = original_word(i)
+
+           entries = []
+           entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
+           entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
+           entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 5])
+           entries += lookup(orig_el + 've', :verb,      :pr,   2)
+           entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists
+
+           if entries.any?
+             corrections << i - corrections.size
+           end
+         end
+       end
+
+       reverse_splittings(corrections)
+     end
+
+     def original_word(i)
+       # there are two possible scenarios at this point
+       # with shifting enabled:
+       #           i     i + 1
+       #   arma   que    virum
+       # with shifting disabled:
+       #          i - 1    i
+       #   arma   virum   que
+       @worker[i + (@shifting ? 1 : -1)]
+     end
+
+     def lookup(string, type, column, inflection_class = 3)
+       string = (type == :persona ? string : string.downcase)
+       query = {
+         type: type, stem_type: column, stem: string,
+         restrictions: { type: :inflection_class, values: Array(inflection_class) }
+       }
+       @db.look_up_stem(query)
+     end
+
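+     # e.g. lookup("ration", :noun, :stem) builds
+     #
+     #   { type: :noun, stem_type: :stem, stem: "ration",
+     #     restrictions: { type: :inflection_class, values: [3] } }
+     #
+     # and hands it to the Prometheus db handler.
+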
+     def reverse_splittings(indices)
+       indices.each do |i|
+         # the original word needs to be retrieved before the enclitic
+         # is deleted, as the deletion shifts positions in the worker
+         ow = original_word(i)
+         splitted = @worker.delete_at(i).delete(@enclitics_marker)
+         ow << splitted
+       end
+     end
+
+     ######################
+
+     MERGE_WORDS = [ %w{ quam diu }, ['non', /null.{1,4}$/] ]
+
+     # quam diu to quamdiu
+     def merge_what_needs_merging
+       to_delete = []
+       @worker.each_overlapping_pair.each_with_index do |pair, i|
+         merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair)
+       end
+       to_delete.each { |i| @worker.delete_at(i) }
+     end
+
+     def is_a_mergable_pair?(x, y)
+       # x, i.e. quam in quamdiu, needs to be downcased, as it could
+       # be in a sentence's first position
+       MERGE_WORDS.any? { |a, b| a === x.downcase && b === y }
+     end
+
+     def merge_words(pair, i, to_delete)
+       pair.first << pair.last
+       to_delete << (i + 1 - to_delete.size)
+     end
+
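+     # By example (a sketch): for @worker = %w{ Quam diu etiam },
+     # the pair ["Quam", "diu"] matches %w{ quam diu } (String#===
+     # is equality, Regexp#=== a match), "Quam" swallows "diu" to
+     # become "Quamdiu", and index 1 is queued for deletion:
+     #
+     #   %w{ Quam diu etiam } # => %w{ Quamdiu etiam }
+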
+     ######################
+
+     ABBR_NAME_WITH_DOT       = /^(#{NAMES_PIPED})\.$/
+     ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
+     PUNCT_ITSELF             = Regexp.new(PUNCTUATION.source + '$')
+     XML_TAG                  = /<\/?.+?>/
+
+     def create_tokens
+       # the call to #to_a retrieves (and aligns) optional metrical data
+       reset_id
+       @worker.to_a.map! do |el|
+         case el
+         when XML_TAG                  then Token::XmlTag.new(el)
+         when ABBR_NAME_WITH_DOT       then raise_id and Token::Filler.new(el, @id)
+         when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id)
+         when PUNCT_ITSELF             then raise_id and Token::Punctuation.new(el, @id)
+         else raise_id and Token::Word.new(el, @id)
+         end
+       end
+     end
+
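+     # For 'homo mittit.' this yields, sketched with indexing enabled:
+     #
+     #   [Token::Word("homo", 1), Token::Word("mittit", 2),
+     #    Token::Punctuation(".", 3)]
+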
+     def reset_id
+       @id = (@indexing ? 0 : nil)
+     end
+
+     def raise_id
+       if @indexing
+         @id += 1
+       else
+         # need to return true, because this is used as the first part
+         # of an and construction
+         true
+       end
+     end
+
+     def preliminary
+       @worker.to_a
+     end
+   end
+ end
@@ -0,0 +1,30 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'llt/tokenizer/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "llt-tokenizer"
+   spec.version       = LLT::Tokenizer::VERSION
+   spec.authors       = ["LFDM"]
+   spec.email         = ["1986gh@gmail.com"]
+   spec.description   = %q{LLT's Tokenizer}
+   spec.summary       = %q{Breaks Latin sentences into tokens}
+   spec.homepage      = ""
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "array_scanner"
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "rspec"
+   spec.add_development_dependency "simplecov", "~> 0.7"
+   spec.add_dependency "llt-core"
+   spec.add_dependency "llt-core_extensions"
+   spec.add_dependency "llt-db_handler"
+   spec.add_dependency "llt-helpers"
+ end
@@ -0,0 +1,58 @@
+ ENV['RACK_ENV'] = 'test'
+
+ require 'spec_helper'
+ require 'llt/tokenizer/api'
+ require 'rack/test'
+
+ def app
+   Api
+ end
+
+ describe "tokenizer api" do
+   include Rack::Test::Methods
+
+   describe '/tokenize' do
+     context "with URI as input" do
+     end
+
+     let(:text) { { text: "homo mittit." } }
+
+     context "with text as input" do
+       context "with accept header json" do
+         it "tokenizes the given text" do
+           pending
+           get '/tokenize', text,
+               { "HTTP_ACCEPT" => "application/json" }
+           last_response.should be_ok
+           response = last_response.body
+           parsed_response = JSON.parse(response)
+           parsed_response.should have(3).items
+         end
+       end
+
+       context "with accept header xml" do
+         it "tokenizes the given text" do
+           get '/tokenize', text,
+               { "HTTP_ACCEPT" => "application/xml" }
+           last_response.should be_ok
+           body = last_response.body
+           body.should =~ /<w n="1">homo<\/w>/
+           body.should =~ /<w n="2">mittit<\/w>/
+           body.should =~ /<pc n="3">\.<\/pc>/
+         end
+
+         it "receives params for tokenization and markup" do
+           params = { indexing: false }.merge(text)
+
+           get '/tokenize', params,
+               { "HTTP_ACCEPT" => "application/xml" }
+           last_response.should be_ok
+           body = last_response.body
+           body.should =~ /<w>homo<\/w>/
+           body.should =~ /<w>mittit<\/w>/
+           body.should =~ /<pc>\.<\/pc>/
+         end
+       end
+     end
+   end
+ end