llt-tokenizer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +22 -0
- data/README.md +97 -0
- data/Rakefile +6 -0
- data/lib/llt/token/filler.rb +31 -0
- data/lib/llt/token/punctuation.rb +36 -0
- data/lib/llt/token/word.rb +53 -0
- data/lib/llt/token/xml_tag.rb +24 -0
- data/lib/llt/token.rb +51 -0
- data/lib/llt/tokenizer/api.rb +20 -0
- data/lib/llt/tokenizer/version.rb +5 -0
- data/lib/llt/tokenizer/worker.rb +106 -0
- data/lib/llt/tokenizer.rb +362 -0
- data/llt-tokenizer.gemspec +30 -0
- data/spec/lib/llt/tokenizer/api_spec.rb +58 -0
- data/spec/lib/llt/tokenizer_spec.rb +361 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/matchers/tokenizer.rb +5 -0
- metadata +195 -0
@@ -0,0 +1,362 @@
|
|
1
|
+
require 'array_scanner'
|
2
|
+
require 'llt/core'
|
3
|
+
require 'llt/constants/abbreviations'
|
4
|
+
require 'llt/core_extensions/array'
|
5
|
+
require 'llt/db_handler/prometheus'
|
6
|
+
require 'llt/helpers/metrical'
|
7
|
+
|
8
|
+
module LLT
  # Tokenizer for Latin text.
  #
  # Splits a string into Token objects (words, punctuation, fillers, XML
  # tags), handling Latin-specific phenomena on the way:
  #   * abbreviations with a trailing dot (praenomina, Roman dates)
  #   * enclitics (-que, -ne, -ve, -c of "nec"), optionally shifted in
  #     front of their host word
  #   * merging of split compounds (quam diu => quamdiu)
  class Tokenizer
    require 'llt/token'
    require 'llt/tokenizer/worker'

    include Core::Serviceable
    include Constants::Abbreviations
    include Helpers::Metrical

    uses_db { DbHandler::Prometheus.new }

    attr_reader :default_options

    # Default tokenization options, overridable per #tokenize call.
    def self.default_options
      {
        shifting: true,         # move enclitics in front of their host
        enclitics_marker: '-',  # prefix that marks a split-off enclitic
        merging: true,          # rejoin pairs like "quam diu"
        indexing: true,         # number the resulting tokens
      }
    end

    # Tokenizes +text+ and returns an Array of Token objects.
    #
    # @param text [String] the input sentence(s); anything else raises
    # @param add_to [#<<] optional container the token array is pushed to
    #   (pushed as one element via #<< — NOTE(review): looks intentional,
    #   the receiver is presumably a sentence-like container; confirm)
    # @param options [Hash] overrides for .default_options
    # @raise [ArgumentError] when text is not a String
    def tokenize(text, add_to: nil, **options)
      raise ArgumentError, "The argument passed must be a String" unless text.is_a?(String)
      return [] if text.empty?

      setup(text, options)

      find_abbreviations_and_join_strings
      split_enklitika_and_change_their_position
      merge_what_needs_merging if @merging # quam diu => quamdiu
      tokens = create_tokens

      add_to << tokens if add_to.respond_to?(:<<)
      tokens
    end

    # Initializes per-run state. Public so tests can seed a preliminary
    # +worker+ array and evaluate intermediate stages.
    def setup(text, options = {}, worker = [])
      @text = text
      evaluate_metrical_presence(@text)
      @enclitics_marker = parse_option(:enclitics_marker, options)
      @merging          = parse_option(:merging, options)
      @shifting         = parse_option(:shifting, options)
      @indexing         = parse_option(:indexing, options)
      @worker = setup_worker(worker)
      @shift_range = shift_range(@shifting)
    end

    # Punctuation characters and XML tags; \1* also catches runs of the
    # same character (e.g. "..." or "--").
    PUNCTUATION = /([\.\?,!;\-:"'”\(\)\[\]†]|<\/?.+?>)\1*/

    # This is here for two reasons:
    # 1) easier test setup, when a preliminary result shall be further evaluated
    #
    # 2) more importantly adding a level of indirection, when
    #    the given text holds metrical information. It adds a
    #    substitute implementation for the worker array, but only
    #    if it's needed - which should perform better, when there
    #    are no metrics involved (the default case)
    def setup_worker(worker)
      if worker.any?
        worker
      else
        # pad punctuation with spaces, then split on whitespace
        elements = @text.gsub(PUNCTUATION, ' \0 ').split
        put_xml_attributes_back_together(elements)
        if metrical?
          Worker.new(elements, @enclitics_marker)
        else
          elements
        end
      end
    end

    # 0 when shifting (enclitic is inserted before the host word),
    # 1 when not (inserted after it).
    def shift_range(shifting_enabled)
      shifting_enabled ? 0 : 1
    end

    # Rejoins XML tags that the whitespace split tore apart.
    # elements could be like this
    # ['<tag', 'attr1="val"', 'attr1="val>']
    # and we want the complete xml tag back together
    def put_xml_attributes_back_together(elements)
      as = ArrayScanner.new(elements)
      loop do
        last = as.look_behind
        # was `&!` (non-short-circuit boolean &): same truth table here,
        # but && is idiomatic and short-circuits
        if last && last.start_with?('<') && !last.end_with?('>')
          if as.current.match(/\w+=".*"$|>/)
            last << ' ' << as.current
            elements.delete_at(as.pos)
            # we don't need to forward, as we delete an element anyway
            next
          end
        else
          as.forward(1)
        end
        break if as.eoa?
      end
    end


    ######################

    # covers abbreviated Roman praenomen like Ti. in Ti. Claudius Nero
    # covers Roman date expression like a. d. V. Kal. Apr.
    ABBREVIATIONS = /^(#{ALL_ABBRS_PIPED})$/

    # %w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }

    def find_abbreviations_and_join_strings
      arr = []
      @worker.each_with_index do |e, i|
        n = @worker[i + 1]
        if e =~ ABBREVIATIONS && n == "."
          @worker[i + 1] = n.prepend(e)
          # deletion shifts later indices left, so compensate by the
          # number of deletions already queued
          arr << (i - arr.size)
        end
      end

      arr.each { |i| @worker.delete_at(i) }
    end

    ######################

    # Words that merely *end* in an enclitic-looking syllable and must
    # not be split.
    WORDS_ENDING_WITH_QUE = /^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i # neque taken out!
    WORDS_ENDING_WITH_NE  = /^(omne|sine|bene|paene)$/i
    WORDS_ENDING_WITH_VE  = /^(sive|neve)$/i

    # laetusque to -que laetus
    # in eoque to -que in eo
    # honestumne to -ne honestum
    #
    # but
    #
    # uterque, institutione, sive et al. remain

    ENCLITICS = %w{ que ne ve c }.freeze
    def split_enklitika_and_change_their_position
      split_with_force
      split_nec
      make_frequent_corrections
    end

    def split_with_force
      # uses brute force at first
      # the restrictor regexps handle only obvious cases

      # don't use c here atm (nec is handled separately in #split_nec)
      ENCLITICS[0..-2].each do |encl|
        split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}"))
      end
    end

    # Cuts +encl+ off every token it matches (unless the token is one of
    # the +restrictors+) and inserts the marked enclitic before/after the
    # host depending on @shift_range.
    def split_enklitikon(encl, restrictors)
      # needs a word character in front - ne itself should be contained
      regexp = /(?<=\w)#{encl}$/

      indices = []
      @worker.each_with_index do |token, i|
        if token.match(regexp) && restrictors !~ token
          token.slice!(regexp)
          # each insertion shifts later indices right, hence + indices.size
          indices << (i + indices.size + @shift_range)
        end
      end

      indices.each { |i| @worker.insert(i, enclitic(encl)) }
    end

    # "que" => "-que" (with the configured marker)
    def enclitic(val)
      "#{@enclitics_marker}#{val}"
    end

    # nec => ne + -c
    def split_nec
      indices = []
      @worker.each_with_index do |token, i|
        if token == 'nec'
          token.slice!(-1)
          indices << (i + indices.size + @shift_range)
        end
      end

      indices.each { |i| @worker.insert(i, enclitic('c')) }
    end

    def make_frequent_corrections
      # uses db lookups
      # # TODO 27.11.13 14:15 by LFDM
      # Implement caching here
      ne_corrections
      que_corrections
      ve_corrections
    end

    def que_corrections
      # this is used in rare only in cases like in eoque
      # which needs a shift to -que in eo
      if @shifting
        to_be_shifted_que_indices.each do |i|
          @worker.insert(i - 1, @worker.delete_at(i))
        end
      end
    end

    def to_be_shifted_que_indices
      # double shifts would properly fail, but they might never happen
      @worker.each_with_index.each_with_object([]) do |(element, index), accumulator|
        accumulator << index if is_que?(element) && led_by_preposition?(index)
      end
    end

    def is_que?(element)
      element == enclitic('que')
    end

    def led_by_preposition?(index)
      @worker[index - 1] =~ /^(in|ad|ob)$/i # and others
    end

    # Undoes -ne splits that were really third-declension ablatives
    # (ratione, Platone, flumine ...), confirmed through db lookups.
    def ne_corrections
      corrections = []
      @worker.each_with_index do |w, i|
        if w == enclitic('ne')
          orig_el = original_word(i)

          entries = []
          entries += lookup(orig_el, :noun, :nom)              if orig_el =~ /io$/  # actio-ne ratio-ne
          entries += lookup(orig_el + "n", :persona, :stem)    if orig_el =~ /o$/   # Plato-ne Cicero-ne Solo-ne
          entries += lookup(orig_el + "n", :noun, :stem)       if orig_el =~ /d?i$/ # fortitudi-ne ratio-ne libidi-ne homi-ne
          entries += lookup(orig_el + "n", :noun, :stem)       if orig_el =~ /mi$/  # flumi-ne agmi-ne
          entries += lookup(orig_el + "n", :adjective, :stem)                       # communis commune

          if entries.any?(&:third_decl_with_possible_ne_abl?)
            corrections << i - corrections.size
          end
        end
      end

      reverse_splittings(corrections)
    end

    # Undoes -ve splits that were really part of the word itself
    # (sive-like cases caught earlier; here stems ending in v),
    # confirmed through db lookups.
    def ve_corrections
      corrections = []
      @worker.each_with_index do |w, i|
        if w == enclitic('ve')
          orig_el = original_word(i)

          entries = []
          entries += lookup(orig_el + 'v',  :adjective, :stem, 1)
          entries += lookup(orig_el + 'v',  :adjective, :stem, 3)
          entries += lookup(orig_el + 'v',  :noun,      :stem, [2, 5])
          entries += lookup(orig_el + 've', :verb,      :pr,   2)
          entries += lookup(orig_el + 'v',  :verb,      :pr,   [3, 5]) # not sure if such a word of 5 exists


          if entries.any?
            corrections << i - corrections.size
          end
        end
      end

      reverse_splittings(corrections)
    end

    # Host word of the enclitic at index i.
    def original_word(i)
      # there are two possible scenarios at this point
      # with shifting enabled:
      #   i   i + 1
      #   arma que virum
      # with shifting disabled:
      #   i - 1   i
      #   arma virum que
      @worker[i + (@shifting ? 1 : -1)]
    end

    # Stem lookup in the Prometheus db, restricted to the given
    # inflection class(es) (defaults to 3).
    def lookup(string, type, column, inflection_class = 3)
      # proper names keep their capitalization
      string = (type == :persona ? string : string.downcase)
      query  = {
        type: type, stem_type: column, stem: string,
        restrictions: { type: :inflection_class, values: Array(inflection_class) }
      }
      @db.look_up_stem(query)
    end

    # Glues previously split enclitics (marker stripped) back onto their
    # host words and removes them from the worker.
    def reverse_splittings(indices)
      indices.each do |i|
        # need to retrieve the orig word before the splitted var is
        # assigned, as it deletes something in the worker
        ow = original_word(i)
        splitted = @worker.delete_at(i).delete(@enclitics_marker)
        ow << splitted
      end
    end


    ######################

    MERGE_WORDS = [ %w{ quam diu }, ['non', /null.{1,4}$/] ].freeze

    # quam diu to quamdiu
    def merge_what_needs_merging
      to_delete = []
      @worker.each_overlapping_pair.each_with_index do |pair, i|
        merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair)
      end
      to_delete.each { |i| @worker.delete_at(i) }
    end

    def is_a_mergable_pair?(x, y)
      # x, i.e. quam in quamdiu, needs to be downcased, as it could be in a
      # sentence's first position
      # === so the second element may be a literal String or a Regexp
      MERGE_WORDS.any? { |a, b| a === x.downcase && b === y }
    end

    def merge_words(pair, i, to_delete)
      pair.first << pair.last
      # i + 1 is the second word of the pair; compensate for queued deletions
      to_delete << (i + 1 - to_delete.size)
    end

    ######################

    ABBR_NAME_WITH_DOT       = /^(#{NAMES_PIPED})\.$/
    ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
    PUNCT_ITSELF             = Regexp.new(PUNCTUATION.source + '$')
    XML_TAG                  = /<\/?.+?>/

    # Maps worker elements to Token instances; XML tags receive no id.
    def create_tokens
      # call #to_a is to retrieve (and align) optional metrical data
      reset_id
      @worker.to_a.map! do |el|
        case el
        when XML_TAG                  then Token::XmlTag.new(el)
        when ABBR_NAME_WITH_DOT       then raise_id and Token::Filler.new(el, @id)
        when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id)
        when PUNCT_ITSELF             then raise_id and Token::Punctuation.new(el, @id)
        else                               raise_id and Token::Word.new(el, @id)
        end
      end
    end

    # was `@id = (@indexing ? @id = 0 : nil)` - the inner assignment
    # was redundant
    def reset_id
      @id = (@indexing ? 0 : nil)
    end

    def raise_id
      if @indexing
        @id += 1
      else
        # need to return true because this is used as first part
        # of an and construction
        true
      end
    end

    # Exposes the intermediate worker state (for tests / debugging).
    def preliminary
      @worker.to_a
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'llt/tokenizer/version'

Gem::Specification.new do |spec|
  spec.name          = "llt-tokenizer"
  spec.version       = LLT::Tokenizer::VERSION
  spec.authors       = ["LFDM"]
  spec.email         = ["1986gh@gmail.com"]
  spec.description   = %q{LLT's Tokenizer}
  spec.summary       = %q{Breaks latin sentences into tokens}
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "simplecov", "~> 0.7"

  # array_scanner is required at runtime (lib/llt/tokenizer.rb does
  # `require 'array_scanner'`), so it must be a runtime dependency -
  # it was previously (and wrongly) listed under development dependencies,
  # which would break a plain `gem install llt-tokenizer`.
  spec.add_dependency "array_scanner"
  spec.add_dependency "llt-core"
  spec.add_dependency "llt-core_extensions"
  spec.add_dependency "llt-db_handler"
  spec.add_dependency "llt-helpers"
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
ENV['RACK_ENV'] = 'test'

require 'spec_helper'
require 'llt/tokenizer/api'
require 'rack/test'

# Rack::Test needs an #app method that returns the Rack app under test.
def app
  Api
end

describe "tokenizer api" do
  include Rack::Test::Methods

  describe '/tokenize' do
    # TODO: empty placeholder - no URI-based examples written yet
    context "with URI as input" do
    end

    let(:text) {{text: "homo mittit."}}

    context "with text as input" do
      context "with accept header json" do
        it "segments the given text" do
          pending
          get '/tokenize', text,
            {"HTTP_ACCEPT" => "application/json"}
          last_response.should be_ok
          response = last_response.body
          parsed_response = JSON.parse(response)
          parsed_response.should have(3).items
        end
      end

      context "with accept header xml" do
        # description typo fixed: "tokenize" -> "tokenizes"
        it "tokenizes the given text" do
          get '/tokenize', text,
            {"HTTP_ACCEPT" => "application/xml"}
          last_response.should be_ok
          body = last_response.body
          body.should =~ /<w n="1">homo<\/w>/
          body.should =~ /<w n="2">mittit<\/w>/
          body.should =~ /<pc n="3">\.<\/pc>/
        end

        it "receives params for tokenization and markup" do
          # indexing: false suppresses the n="..." attributes
          params = { indexing: false }.merge(text)

          get '/tokenize', params,
            {"HTTP_ACCEPT" => "application/xml"}
          last_response.should be_ok
          body = last_response.body
          body.should =~ /<w>homo<\/w>/
          body.should =~ /<w>mittit<\/w>/
          body.should =~ /<pc>\.<\/pc>/
        end
      end
    end
  end
end
|