llt-tokenizer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +2 -0
- data/.travis.yml +7 -0
- data/Gemfile +27 -0
- data/LICENSE.txt +22 -0
- data/README.md +97 -0
- data/Rakefile +6 -0
- data/lib/llt/token/filler.rb +31 -0
- data/lib/llt/token/punctuation.rb +36 -0
- data/lib/llt/token/word.rb +53 -0
- data/lib/llt/token/xml_tag.rb +24 -0
- data/lib/llt/token.rb +51 -0
- data/lib/llt/tokenizer/api.rb +20 -0
- data/lib/llt/tokenizer/version.rb +5 -0
- data/lib/llt/tokenizer/worker.rb +106 -0
- data/lib/llt/tokenizer.rb +362 -0
- data/llt-tokenizer.gemspec +30 -0
- data/spec/lib/llt/tokenizer/api_spec.rb +58 -0
- data/spec/lib/llt/tokenizer_spec.rb +361 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/matchers/tokenizer.rb +5 -0
- metadata +195 -0
require 'array_scanner'
require 'llt/core'
require 'llt/constants/abbreviations'
require 'llt/core_extensions/array'
require 'llt/db_handler/prometheus'
require 'llt/helpers/metrical'

module LLT
  # Breaks Latin text into Token objects (words, punctuation, fillers and
  # xml tags), handling abbreviations, enclitics (-que, -ne, -ve, -c) and
  # word merging (quam diu => quamdiu) along the way.
  class Tokenizer
    require 'llt/token'
    require 'llt/tokenizer/worker'

    include Core::Serviceable
    include Constants::Abbreviations
    include Helpers::Metrical

    uses_db { DbHandler::Prometheus.new }

    # NOTE(review): this instance reader is never written to by this file;
    # option values live in dedicated ivars (@merging etc.) set in #setup.
    # Presumably kept for Core::Serviceable compatibility — verify.
    attr_reader :default_options

    # Default tokenization options, merged with per-call options by
    # Core::Serviceable's parse_option.
    def self.default_options
      {
        shifting: true,
        enclitics_marker: '-',
        merging: true,
        indexing: true,
      }
    end

    # Tokenizes a String into an Array of Token objects.
    #
    # text    - the String to tokenize
    # add_to  - optional collector; receives the token array via #<< when it
    #           responds to it (e.g. an LLT sentence container)
    # options - overrides for self.class.default_options
    #
    # Raises ArgumentError when text is not a String.
    # Returns an empty Array for empty input.
    def tokenize(text, add_to: nil, **options)
      # Idiomatic raise: class plus message instead of ArgumentError.new(...)
      raise ArgumentError, "The argument passed must be a String" unless text.is_a?(String)
      return [] if text.empty?

      setup(text, options)

      find_abbreviations_and_join_strings
      split_enklitika_and_change_their_position
      merge_what_needs_merging if @merging # quam diu => quamdiu
      tokens = create_tokens

      add_to << tokens if add_to.respond_to?(:<<)
      tokens
    end

    # Prepares all per-run state. Public mainly to ease test setup:
    # a preliminary worker array can be injected directly.
    def setup(text, options = {}, worker = [])
      @text = text
      evaluate_metrical_presence(@text)
      @enclitics_marker = parse_option(:enclitics_marker, options)
      @merging          = parse_option(:merging, options)
      @shifting         = parse_option(:shifting, options)
      @indexing         = parse_option(:indexing, options)
      @worker = setup_worker(worker)
      @shift_range = shift_range(@shifting)
    end

    # Single punctuation characters (with repetitions) or complete xml tags.
    PUNCTUATION = /([\.\?,!;\-:"'”\(\)\[\]†]|<\/?.+?>)\1*/

    # This is here for two reasons:
    # 1) easier test setup, when a preliminary result shall be further evaluated
    #
    # 2) more importantly adding a level of indirection, when
    #    the given text holds metrical information. It adds a
    #    substitute implementation for the worker array, but only
    #    if it's needed - which should perform better, when there
    #    are no metrics involved (the default case)
    def setup_worker(worker)
      if worker.any?
        worker
      else
        # Pad punctuation with spaces, then split on whitespace.
        elements = @text.gsub(PUNCTUATION, ' \0 ').split
        put_xml_attributes_back_together(elements)
        if metrical?
          Worker.new(elements, @enclitics_marker)
        else
          elements
        end
      end
    end

    # Enclitics are inserted BEFORE their host word when shifting is on
    # (index i), AFTER it when shifting is off (index i + 1).
    def shift_range(shifting_enabled)
      shifting_enabled ? 0 : 1
    end

    # elements could be like this
    # ['<tag', 'attr1="val"', 'attr1="val>']
    # and we want the complete xml tag back together
    def put_xml_attributes_back_together(elements)
      as = ArrayScanner.new(elements)
      loop do
        last = as.look_behind
        # Fixed: was `last.start_with?('<') &! last.end_with?('>')` — the
        # obscure `& !` form that only worked through operator precedence.
        # Also fixed a potential endless loop: when the previous element is
        # an unterminated tag but the current element continues NEITHER with
        # an attribute nor a closing `>`, the original advanced nowhere and
        # spun forever; we now always move forward in that case.
        if last && last.start_with?('<') && !last.end_with?('>') &&
           as.current.match(/\w+=".*"$|>/)
          last << ' ' << as.current
          elements.delete_at(as.pos)
          # we don't need to forward, as we delete an element anyway
          next
        else
          as.forward(1)
        end
        break if as.eoa?
      end
    end


    ######################

    # covers abbreviated Roman praenomen like Ti. in Ti. Claudius Nero
    # covers Roman date expression like a. d. V. Kal. Apr.
    ABBREVIATIONS = /^(#{ALL_ABBRS_PIPED})$/

    # %w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }
    def find_abbreviations_and_join_strings
      arr = []
      @worker.each_with_index do |e, i|
        n = @worker[i + 1]
        if e =~ ABBREVIATIONS && n == "."
          # Mutates the following "." in place to "M." and remembers the
          # abbreviation's own slot for deletion. Earlier deletions shift
          # later indices left, hence the `- arr.size` adjustment.
          @worker[i + 1] = n.prepend(e)
          arr << (i - arr.size)
        end
      end

      arr.each { |i| @worker.delete_at(i) }
    end

    ######################

    WORDS_ENDING_WITH_QUE = /^([qc]u[ei].*que|qu[ao]que|itaque|atque|ut[er].*que|utcumque|plerumque|denique|undique)$/i # neque taken out!
    WORDS_ENDING_WITH_NE  = /^(omne|sine|bene|paene)$/i
    WORDS_ENDING_WITH_VE  = /^(sive|neve)$/i

    # laetusque to -que laetus
    # in eoque to -que in eo
    # honestumne to -ne honestum
    #
    # but
    #
    # uterque, institutione, sive et al. remain
    ENCLITICS = %w{ que ne ve c }

    def split_enklitika_and_change_their_position
      split_with_force
      split_nec
      make_frequent_corrections
    end

    def split_with_force
      # uses brute force at first
      # the restrictor regexps handle only obvious cases

      # don't use c here atm (handled separately by #split_nec)
      ENCLITICS[0..-2].each do |encl|
        split_enklitikon(encl, self.class.const_get("WORDS_ENDING_WITH_#{encl.upcase}"))
      end
    end

    # Cuts the enclitic `encl` off every worker token it terminates, unless
    # the token matches `restrictors` (words that genuinely end in que/ne/ve).
    def split_enklitikon(encl, restrictors)
      # needs a word character in front - ne itself should be contained
      regexp = /(?<=\w)#{encl}$/

      indices = []
      @worker.each_with_index do |token, i|
        if token.match(regexp) && restrictors !~ token
          token.slice!(regexp)
          # every insertion shifts subsequent positions right, hence
          # `+ indices.size`; @shift_range places -que before or after host
          indices << (i + indices.size + @shift_range)
        end
      end

      indices.each { |i| @worker.insert(i, enclitic(encl)) }
    end

    # "-que" etc., using the configured marker.
    def enclitic(val)
      "#{@enclitics_marker}#{val}"
    end

    # nec => ne + -c
    def split_nec
      indices = []
      @worker.each_with_index do |token, i|
        if token == 'nec'
          token.slice!(-1)
          indices << (i + indices.size + @shift_range)
        end
      end

      indices.each { |i| @worker.insert(i, enclitic('c')) }
    end

    def make_frequent_corrections
      # uses db lookups
      # # TODO 27.11.13 14:15 by LFDM
      # Implement caching here
      ne_corrections
      que_corrections
      ve_corrections
    end

    def que_corrections
      # this is used in rare only in cases like in eoque
      # which needs a shift to -que in eo
      if @shifting
        to_be_shifted_que_indices.each do |i|
          @worker.insert(i - 1, @worker.delete_at(i))
        end
      end
    end

    def to_be_shifted_que_indices
      # double shifts would properly fail, but they might never happen
      @worker.each_with_index.each_with_object([]) do |(element, index), accumulator|
        accumulator << index if is_que?(element) && led_by_preposition?(index)
      end
    end

    def is_que?(element)
      element == enclitic('que')
    end

    def led_by_preposition?(index)
      @worker[index - 1] =~ /^(in|ad|ob)$/i # and others
    end

    # Undoes -ne splits where the db proves the "ne" was really the ending of
    # a third-declension ablative (ratione, Platone, flumine, ...).
    def ne_corrections
      corrections = []
      @worker.each_with_index do |w, i|
        if w == enclitic('ne')
          orig_el = original_word(i)

          entries = []
          entries += lookup(orig_el, :noun, :nom) if orig_el =~ /io$/              # actio-ne ratio-ne
          entries += lookup(orig_el + "n", :persona, :stem) if orig_el =~ /o$/     # Plato-ne Cicero-ne Solo-ne
          entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /d?i$/      # fortitudi-ne ratio-ne libidi-ne homi-ne
          entries += lookup(orig_el + "n", :noun, :stem) if orig_el =~ /mi$/       # flumi-ne agmi-ne
          entries += lookup(orig_el + "n", :adjective, :stem)                      # communis commune

          if entries.any?(&:third_decl_with_possible_ne_abl?)
            # deletions in reverse_splittings shift later hits left
            corrections << i - corrections.size
          end
        end
      end

      reverse_splittings(corrections)
    end

    # Undoes -ve splits where the db knows a stem ending in v (or a verb in
    # ve), e.g. words that genuinely end in -ve and are not "word + -ve".
    def ve_corrections
      corrections = []
      @worker.each_with_index do |w, i|
        if w == enclitic('ve')
          orig_el = original_word(i)

          entries = []
          entries += lookup(orig_el + 'v', :adjective, :stem, 1)
          entries += lookup(orig_el + 'v', :adjective, :stem, 3)
          entries += lookup(orig_el + 'v', :noun, :stem, [2, 5])
          entries += lookup(orig_el + 've', :verb, :pr, 2)
          entries += lookup(orig_el + 'v', :verb, :pr, [3, 5]) # not sure if such a word of 5 exists


          if entries.any?
            corrections << i - corrections.size
          end
        end
      end

      reverse_splittings(corrections)
    end

    def original_word(i)
      # there are two possible scenarios at this point
      # with shifting enabled:
      #   i     i + 1
      #   que   arma
      # with shifting disabled:
      #   i - 1 i
      #   arma  que
      @worker[i + (@shifting ? 1 : -1)]
    end

    # Stem lookup in the Prometheus db. Personal names keep their case,
    # everything else is downcased. inflection_class defaults to 3.
    def lookup(string, type, column, inflection_class = 3)
      string = (type == :persona ? string : string.downcase)
      query = {
        type: type, stem_type: column, stem: string,
        restrictions: { type: :inflection_class, values: Array(inflection_class) }
      }
      @db.look_up_stem(query)
    end

    # Glues a previously split enclitic (marker stripped) back onto its host.
    def reverse_splittings(indices)
      indices.each do |i|
        # need to retrieve the orig word before the splitted var is
        # assigned, as it deletes something in the worker
        ow = original_word(i)
        splitted = @worker.delete_at(i).delete(@enclitics_marker)
        ow << splitted
      end
    end


    ######################

    MERGE_WORDS = [ %w{ quam diu }, ['non', /null.{1,4}$/] ]

    # quam diu to quamdiu
    def merge_what_needs_merging
      to_delete = []
      @worker.each_overlapping_pair.each_with_index do |pair, i|
        merge_words(pair, i, to_delete) if is_a_mergable_pair?(*pair)
      end
      to_delete.each { |i| @worker.delete_at(i) }
    end

    def is_a_mergable_pair?(x, y)
      # x, i.e. quam in quamdiu, needs to be downcased, as it could be in a
      # sentence's first position
      MERGE_WORDS.any? { |a, b| a === x.downcase && b === y }
    end

    def merge_words(pair, i, to_delete)
      pair.first << pair.last
      to_delete << (i + 1 - to_delete.size)
    end

    ######################

    ABBR_NAME_WITH_DOT       = /^(#{NAMES_PIPED})\.$/
    ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
    PUNCT_ITSELF = Regexp.new(PUNCTUATION.source + '$')
    XML_TAG = /<\/?.+?>/

    # Maps the worker's strings to Token instances; xml tags receive no id.
    def create_tokens
      # call #to_a is to retrieve (and align) optional metrical data
      reset_id
      @worker.to_a.map! do |el|
        case el
        when XML_TAG                  then Token::XmlTag.new(el)
        when ABBR_NAME_WITH_DOT       then raise_id and Token::Filler.new(el, @id)
        when ROMAN_DATE_EXPR_WITH_DOT then raise_id and Token::Filler.new(el, @id)
        when PUNCT_ITSELF             then raise_id and Token::Punctuation.new(el, @id)
        else                               raise_id and Token::Word.new(el, @id)
        end
      end
    end

    def reset_id
      # Fixed: was `@id = (@indexing ? @id = 0 : nil)` — the inner
      # assignment was redundant.
      @id = @indexing ? 0 : nil
    end

    def raise_id
      if @indexing
        @id += 1
      else
        # need to return true because this is used as first part
        # of an and construction
        true
      end
    end

    # Exposes the intermediate worker state, mainly for tests.
    def preliminary
      @worker.to_a
    end
  end
end
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'llt/tokenizer/version'

Gem::Specification.new do |spec|
  spec.name          = "llt-tokenizer"
  spec.version       = LLT::Tokenizer::VERSION
  spec.authors       = ["LFDM"]
  spec.email         = ["1986gh@gmail.com"]
  spec.description   = %q{LLT's Tokenizer}
  spec.summary       = %q{Breaks latin sentences into tokens}
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "simplecov", "~> 0.7"

  # Fixed: array_scanner was declared as a development dependency, but
  # lib/llt/tokenizer.rb requires it at runtime — installs of the gem
  # would break without it. It must be a runtime dependency.
  spec.add_dependency "array_scanner"
  spec.add_dependency "llt-core"
  spec.add_dependency "llt-core_extensions"
  spec.add_dependency "llt-db_handler"
  spec.add_dependency "llt-helpers"
end
ENV['RACK_ENV'] = 'test'

require 'spec_helper'
require 'llt/tokenizer/api'
require 'rack/test'

# Rack::Test entry point: the Sinatra-style Api app under test.
def app
  Api
end

describe "tokenizer api" do
  include Rack::Test::Methods

  describe '/tokenize' do
    # Placeholder for URI-based input — no examples yet.
    context "with URI as input" do
    end

    let(:text) {{text: "homo mittit."}}

    context "with text as input" do
      context "with accept header json" do
        it "segments the given text" do
          pending
          get '/tokenize', text,
            {"HTTP_ACCEPT" => "application/json"}
          last_response.should be_ok
          response = last_response.body
          parsed_response = JSON.parse(response)
          parsed_response.should have(3).items
        end
      end

      context "with accept header xml" do
        it "tokenize the given text" do
          get '/tokenize', text,
            {"HTTP_ACCEPT" => "application/xml"}
          last_response.should be_ok
          body = last_response.body
          # two words and the trailing period, each indexed
          body.should =~ /<w n="1">homo<\/w>/
          body.should =~ /<w n="2">mittit<\/w>/
          body.should =~ /<pc n="3">\.<\/pc>/
        end

        it "receives params for tokenization and markup" do
          params = { indexing: false }.merge(text)

          get '/tokenize', params,
            {"HTTP_ACCEPT" => "application/xml"}
          last_response.should be_ok
          body = last_response.body
          # indexing disabled: no n attributes expected
          body.should =~ /<w>homo<\/w>/
          body.should =~ /<w>mittit<\/w>/
          body.should =~ /<pc>\.<\/pc>/
        end
      end
    end
  end
end