llt-tokenizer 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +5 -0
- data/lib/llt/token/punctuation.rb +9 -0
- data/lib/llt/tokenizer.rb +36 -22
- data/lib/llt/tokenizer/version.rb +1 -1
- data/spec/lib/llt/token/punctuation_spec.rb +17 -0
- data/spec/lib/llt/tokenizer_spec.rb +77 -42
- metadata +64 -62
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 75d2abc5e72328a1b4ef2f224931c0656572e71d
+  data.tar.gz: b1b155c05e45b87cfec7f0eb4e1535d9f72783b5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1a7f8fbd9be93c7053fe601243d6ea4c3a603b0c1643a5f102727844ae06c6083acbeb805bdf74e870c5ceb9c92e9dfdd796b8b201a434f50ef50d532fb65a93
+  data.tar.gz: 37d6ab4e7e39a30b6165e61e0a4d341e4620de0a90521bb17ac8ce141356dbe2dc3a952d39d57dd1e4c90eba224f9077b2bc12ecd5d818c8c9372734c91593fa
data/Gemfile
CHANGED
@@ -12,7 +12,6 @@ gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler
 gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
 
 # Dependencies of db_handler
-gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
 gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
 
 platform :ruby do
data/README.md
CHANGED
@@ -51,6 +51,11 @@ The Tokenizer takes several options upon creation or a call to #tokenize:
   tokens.map(&:to_s)
   # => ["Arma", "virum", "--que", "cano", "."]
 
+  # splitting of enclitics can be disabled altogether
+  tokens = t.tokenize('Arma virumque cano.', splitting: false)
+  tokens.map(&:to_s)
+  # => ["Arma", "virumque", "cano", "."]
+
   # indexing determines if each token shall receive a consecutive id
   tokens = t.tokenize('Arma virumque cano.', indexing: true)
   tokens.first.id # => 1
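
Side note on the new option (a sketch, not part of the diff): per the "options on instance creation" spec further below, tokenize options can also be handed to LLT::Tokenizer.new once and then act as the instance defaults; assuming splitting is treated like the other options, that would look like:

    t = LLT::Tokenizer.new(splitting: false)   # db handler option omitted for brevity
    t.tokenize('Arma virumque cano.').map(&:to_s)
    # => ["Arma", "virumque", "cano", "."]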
data/lib/llt/token/punctuation.rb
CHANGED
@@ -1,14 +1,19 @@
+require 'xml_escape'
+
 module LLT
   class Token
     class Punctuation < Token
      xml_tag 'pc'
 
+      include XmlEscape
+
      attr_accessor :opening, :closing, :other
 
      def initialize(string, id = nil)
        super
        # this is part of an old interface that is mostly unused
        # some parts remain - find and delete em
+        @string = xml_decode(string)
        @opening = false
        @closing = false
        @other = false
@@ -31,6 +36,10 @@ module LLT
      def inspect
        "#{"Punctuation token:".yellow} #{@string}"
      end
+
+      def as_xml
+        xml_encode(@string)
+      end
    end
  end
 end
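
For illustration (a sketch, not part of the diff): with this change a Punctuation token decodes escaped xml characters on initialization and re-encodes them in #as_xml, exactly as the new spec further below exercises. Assuming require 'llt/tokenizer' loads the token classes:

    require 'llt/tokenizer'

    punct = LLT::Token::Punctuation.new('&amp;')
    punct.to_s   # => "&"      decoded for plain-text output
    punct.as_xml # => "&amp;"  re-encoded for xml output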
data/lib/llt/tokenizer.rb
CHANGED
@@ -24,6 +24,8 @@ module LLT
        enclitics_marker: '-',
        merging: true,
        indexing: true,
+        splitting: true,
+        xml: false,
      }
    end
 
@@ -34,7 +36,7 @@ module LLT
      setup(text, options)
 
      find_abbreviations_and_join_strings
-      split_enklitika_and_change_their_position
+      split_enklitika_and_change_their_position if @splitting
      merge_what_needs_merging if @merging # quam diu => quamdiu
      tokens = create_tokens
 
@@ -43,17 +45,20 @@ module LLT
    end
 
    def setup(text, options = {}, worker = [])
-      @text
+      @text = text
      evaluate_metrical_presence(@text)
      @enclitics_marker = parse_option(:enclitics_marker, options)
      @merging = parse_option(:merging, options)
      @shifting = parse_option(:shifting, options)
+      @splitting = parse_option(:splitting, options)
      @indexing = parse_option(:indexing, options)
+      @xml = parse_option(:xml, options)
      @worker = setup_worker(worker)
      @shift_range = shift_range(@shifting)
    end
 
-    PUNCTUATION =
+    PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>])\1*/
+    XML_TAG = /<\/?.+?>/
 
    # This is here for two reasons:
    # 1) easier test setup, when a preliminary result shall be further evaluated
@@ -64,16 +69,15 @@ module LLT
    # if it's needed - which should perform better, when there
    # are no metrics involved (the default case)
    def setup_worker(worker)
-      if worker.any?
-
+      return worker if worker.any?
+
+      elements = split_and_space_text
+      put_xml_attributes_back_together(elements) if @xml
+
+      if metrical?
+        Worker.new(elements, @enclitics_marker)
      else
-        elements
-        put_xml_attributes_back_together(elements)
-        if metrical?
-          Worker.new(elements, @enclitics_marker)
-        else
-          elements
-        end
+        elements
      end
    end
 
@@ -81,19 +85,23 @@ module LLT
      shifting_enabled ? 0 : 1
    end
 
+    def split_and_space_text
+      regex = @xml ? Regexp.union(XML_TAG, PUNCTUATION) : PUNCTUATION
+      @text.gsub(regex, ' \0 ').split
+    end
+
    def put_xml_attributes_back_together(elements)
-      # elements could be like this
-      # ['<tag', 'attr1="val"', 'attr1="val>']
-      # and we want the complete xml tag back together
      as = ArrayScanner.new(elements)
      loop do
-        last = as.look_behind
-        if
-
+        last = as.look_behind.to_s # catch nil
+        if open_xml_tag?(last)
+          number_of_xml_elements = as.peek_until do |el|
+            el.end_with?('>')
+          end.size + 1
+
+          number_of_xml_elements.times do
            last << ' ' << as.current
            elements.delete_at(as.pos)
-          # we don't need to forward, as we delete an element anyway
-          next
          end
        else
          as.forward(1)
@@ -102,12 +110,18 @@ module LLT
      end
    end
 
+    def open_xml_tag?(str)
+      str.start_with?('<') &! str.end_with?('>')
+    end
+
 
    ######################
 
    # covers abbreviated Roman praenomen like Ti. in Ti. Claudius Nero
    # covers Roman date expression like a. d. V. Kal. Apr.
    ABBREVIATIONS = /^(#{ALL_ABBRS_PIPED})$/
+    # covers a list of words which are abbreviated with a ' like satin' for satisne
+    APOSTROPHE_WORDS = /^(#{APOSTROPHES_PIPED})$/
 
    # %w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }
 
@@ -115,7 +129,7 @@ module LLT
      arr = []
      @worker.each_with_index do |e, i|
        n = @worker[i + 1]
-        if e =~ ABBREVIATIONS
+        if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS)
          @worker[i + 1] = n.prepend(e)
          arr << (i - arr.size)
        end
@@ -324,7 +338,7 @@ module LLT
 
    ABBR_NAME_WITH_DOT = /^(#{NAMES_PIPED})\.$/
    ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
-    PUNCT_ITSELF = Regexp.new(PUNCTUATION.source
+    PUNCT_ITSELF = Regexp.new("^(?:#{PUNCTUATION.source})$")
    XML_TAG = /<\/?.+?>/
 
    def create_tokens
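
To see what the new split_and_space_text step does, here is a standalone re-implementation (a sketch only; in the gem the method is private and operates on @text), using the two regexes introduced above:

    # Regexes copied from the diff above. split_and_space pads every match with
    # spaces (' \0 ' inserts the whole match) and then splits on whitespace.
    PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>])\1*/
    XML_TAG     = /<\/?.+?>/

    def split_and_space(text, xml: false)
      regex = xml ? Regexp.union(XML_TAG, PUNCTUATION) : PUNCTUATION
      text.gsub(regex, ' \0 ').split
    end

    split_and_space('Arma virumque cano.')
    # => ["Arma", "virumque", "cano", "."]

    split_and_space('<foreign lang="lat">Graeca</foreign> lingua est.', xml: true)
    # => ["<foreign", "lang=\"lat\">", "Graeca", "</foreign>", "lingua", "est", "."]

The space inside the attribute splits the opening tag into two elements, which is exactly what put_xml_attributes_back_together (guarded by the new open_xml_tag? helper) repairs before tokens are created.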
data/spec/lib/llt/token/punctuation_spec.rb
ADDED
@@ -0,0 +1,17 @@
+require 'spec_helper'
+
+describe LLT::Token::Punctuation do
+  describe "#initialize" do
+    it "normalizes escaped xml characters" do
+      punct = LLT::Token::Punctuation.new('&amp;')
+      punct.to_s.should == '&'
+    end
+  end
+
+  describe "#as_xml" do
+    it "overrides LLT::Core::Containable#as_xml to use xml encodings" do
+      punct = LLT::Token::Punctuation.new('&')
+      punct.as_xml.should == "&amp;"
+    end
+  end
+end
data/spec/lib/llt/tokenizer_spec.rb
CHANGED
@@ -18,10 +18,16 @@ describe LLT::Tokenizer do
      end
 
      it "handles all kinds of parens as well as cruces" do
-        txt = "Marcus (et Claudius) †amici† [sunt]."
+        txt = "<Marcus> (et Claudius) †amici† [sunt]."
        tokens = tokenizer.tokenize(txt)
-        tokens.should have(
-        tokens.map(&:to_s).should == %w{ Marcus ( et Claudius ) † amici † [ sunt ] . }
+        tokens.should have(14).items
+        tokens.map(&:to_s).should == %w{ < Marcus > ( et Claudius ) † amici † [ sunt ] . }
+      end
+
+      it "handles escaped xml characters" do
+        txt = "&amp; &quot; &apos; &gt; &lt; ;"
+        tokens = tokenizer.tokenize(txt)
+        tokens.should have(6).items
      end
 
      describe "takes an optional keyword argument add_to" do
@@ -110,19 +116,11 @@ describe LLT::Tokenizer do
        end
      end
    end
-
-    context "with embedded xml tags" do
-      it "doesn't break" do
-        txt = '<grc>text text</grc>'
-        tokens = tokenizer.tokenize(txt)
-        tokens.should have(4).items
-      end
-    end
  end
 end
 
 describe "#find_abbreviations_and_join_strings" do
-  describe "should bring back abbreviation dots" do
+  describe "should bring back abbreviation dots and apostrophes" do
    it "with names" do
      tokenizer.setup("", {}, %w{ Atque Sex . et M . Cicero . })
      tokenizer.find_abbreviations_and_join_strings
@@ -134,6 +132,12 @@ describe LLT::Tokenizer do
      tokenizer.find_abbreviations_and_join_strings
      tokenizer.preliminary.should == %w{ a. d. V Kal. Apr. }
    end
+
+    it "with apostrophe" do
+      tokenizer.setup("", {}, %w{ ' Apostrophi ' sunt : po ' min ' vin ' tun' scin ' potin ' satin ' })
+      tokenizer.find_abbreviations_and_join_strings
+      tokenizer.preliminary.should == %w{ ' Apostrophi ' sunt : po' min' vin' tun' scin' potin' satin' }
+    end
  end
 end
 
@@ -252,10 +256,10 @@ describe LLT::Tokenizer do
    end
 
    examples = {
-      "Word" => %w{ ita Marcus quoque -que },
+      "Word" => %w{ ita Marcus quoque -que po' },
      "Filler" => %w{ M. Sex. App. Ap. Tib. Ti. C. a. d. Kal. Ian. }, #I XI MMC }
      "XmlTag" => %w{ <grc> </grc> },
-      "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' }
+      "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' & < > &amp; &lt; &gt; &apos; &quot; }
    }
 
    examples.each do |klass, elements|
@@ -278,37 +282,11 @@ describe LLT::Tokenizer do
      tokens.map(&:id).should == [1, 2]
    end
 
-    it "can be disabled" do
+    it "id's can be disabled" do
      txt = 'Cano.'
      tokens = tokenizer.tokenize(txt, indexing: false)
      tokens.map(&:id).should == [nil, nil]
    end
-
-    it "doesn't count plain xml tags" do
-      txt = '<grc>text text</grc>'
-      tokens = tokenizer.tokenize(txt)
-      tokens.map(&:id).should == [nil, 1, 2, nil]
-    end
-
-    it "doesn't count xml tags when they come with attributes" do
-      txt = '<foreign lang="lat">Graeca</foreign> lingua est.'
-      tokens = tokenizer.tokenize(txt).map(&:to_s)
-      res = ['<foreign lang="lat">', 'Graeca', '</foreign>', 'lingua', 'est', '.']
-      tokens.should == res
-    end
-
-    it "handles nested xml as well" do
-      txt = '<l n="70"><foreign lang="lat">Graeca lingua est.</foreign></l>'
-      tokens = tokenizer.tokenize(txt).map(&:to_s)
-      res = ['<l n="70">', '<foreign lang="lat">', 'Graeca', 'lingua', 'est', '.', '</foreign>', '</l>']
-      tokens.should == res
-    end
-
-    it "handles text with broken off xml tags (the rest will e.g. be in another sentence)" do
-      txt = "<lg org=\"uniform\" sample=\"complete\"><l>quem vocet divum populus ruentis</l><l>imperi rebus?"
-      tokens = tokenizer.tokenize(txt)
-      tokens.should have(12).items
-    end
  end
 
  context "with options" do
@@ -346,11 +324,68 @@ describe LLT::Tokenizer do
        tokens.should == %w{ quam diu cano ? }
      end
    end
+
+    context "with disabled splitting" do
+      it "doesn't split enclitics" do
+        txt = 'arma virumque cano.'
+        opts = { splitting: false }
+        tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
+        tokens.should == %w{ arma virumque cano . }
+      end
+    end
+
+    context "with xml handling enabled" do
+      let(:xml_tokenizer) { LLT::Tokenizer.new(db: stub_db, xml: true) }
+
+      it "doesn't break when xml is embedded" do
+        txt = '<grc>text text</grc>'
+        tokens = xml_tokenizer.tokenize(txt)
+        tokens.should have(4).items
+      end
+
+      it "doesn't count plain xml tags" do
+        txt = '<grc>text text</grc>'
+        tokens = xml_tokenizer.tokenize(txt)
+        tokens.map(&:id).should == [nil, 1, 2, nil]
+      end
+
+      it "doesn't count xml tags when they come with attributes" do
+        txt = '<foreign lang="lat">Graeca</foreign> lingua est.'
+        tokens = xml_tokenizer.tokenize(txt).map(&:to_s)
+        res = ['<foreign lang="lat">', 'Graeca', '</foreign>', 'lingua', 'est', '.']
+        tokens.should == res
+      end
+
+      it "handles nested xml as well" do
+        txt = '<l n="70"><foreign lang="lat">Graeca lingua est.</foreign></l>'
+        tokens = xml_tokenizer.tokenize(txt).map(&:to_s)
+        res = ['<l n="70">', '<foreign lang="lat">', 'Graeca', 'lingua', 'est', '.', '</foreign>', '</l>']
+        tokens.should == res
+      end
+
+      it "handles text with broken off xml tags (the rest will e.g. be in another sentence)" do
+        txt = "<lg org=\"uniform\" sample=\"complete\"><l>quem vocet divum populus ruentis</l><l>imperi rebus?"
+        tokens = xml_tokenizer.tokenize(txt)
+        tokens.should have(12).items
+      end
+
+      it "doesn't fall with spaces inside of xml attributes" do
+        txt = '<test>veni vidi <bad att="a a a">vici</bad></test>'
+        tokens = xml_tokenizer.tokenize(txt)
+        tokens.should have(7).items
+      end
+
+      it "expects all text chevrons to be escaped, otherwise they are xml tags!" do
+        txt = '<test>&lt;veni&gt;</test>'
+        tokens = xml_tokenizer.tokenize(txt)
+        tokens.should have(5).item
+      end
+    end
  end
 end
 
  context "with options on instance creation" do
-    it "a new instance can receive options, which it will use as
+    it "a new instance can receive options, which it will use as its defaults" do
      custom_tok = LLT::Tokenizer.new(db: stub_db,
                                      shifting: false,
                                      enclitics_marker: '')
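
Taken together, the new specs describe how the xml option is meant to be used (a sketch, not part of the diff; a real tokenizer also needs its usual db handler, which the specs stub out as stub_db — my_db_handler below is a placeholder):

    t = LLT::Tokenizer.new(db: my_db_handler, xml: true)

    tokens = t.tokenize('<grc>text text</grc>')
    tokens.map(&:to_s) # => ["<grc>", "text", "text", "</grc>"]
    tokens.map(&:id)   # => [nil, 1, 2, nil]   xml tags are kept but receive no id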
metadata
CHANGED
@@ -1,141 +1,141 @@
 --- !ruby/object:Gem::Specification
 name: llt-tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - LFDM
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-01-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '1.3'
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.3'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: rake
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - "~>"
      - !ruby/object:Gem::Version
-        version: '
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: rspec
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: simplecov
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
-        version: '0
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simplecov
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.7'
-  prerelease: false
   type: :development
-- !ruby/object:Gem::Dependency
-  name: array_scanner
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - "~>"
      - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: array_scanner
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
   type: :runtime
-- !ruby/object:Gem::Dependency
-  name: llt-core
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-core
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
   type: :runtime
-- !ruby/object:Gem::Dependency
-  name: llt-core_extensions
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-core_extensions
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
   type: :runtime
-- !ruby/object:Gem::Dependency
-  name: llt-db_handler
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-db_handler
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
   type: :runtime
-- !ruby/object:Gem::Dependency
-  name: llt-helpers
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-helpers
   requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
   type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: LLT's Tokenizer
 email:
 - 1986gh@gmail.com
@@ -143,9 +143,9 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .rspec
-- .travis.yml
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -160,6 +160,7 @@ files:
 - lib/llt/tokenizer/version.rb
 - lib/llt/tokenizer/worker.rb
 - llt-tokenizer.gemspec
+- spec/lib/llt/token/punctuation_spec.rb
 - spec/lib/llt/tokenizer/api_spec.rb
 - spec/lib/llt/tokenizer_spec.rb
 - spec/spec_helper.rb
@@ -168,27 +169,28 @@ homepage: ''
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
-  - -
+  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
-  - -
+  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.
-signing_key:
+rubyforge_project:
+rubygems_version: 2.2.0
+signing_key:
 specification_version: 4
 summary: Breaks latin sentences into tokens
 test_files:
+- spec/lib/llt/token/punctuation_spec.rb
 - spec/lib/llt/tokenizer/api_spec.rb
 - spec/lib/llt/tokenizer_spec.rb
 - spec/spec_helper.rb