llt-tokenizer 0.0.2 → 0.0.3
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +5 -0
- data/lib/llt/token/punctuation.rb +9 -0
- data/lib/llt/tokenizer.rb +36 -22
- data/lib/llt/tokenizer/version.rb +1 -1
- data/spec/lib/llt/token/punctuation_spec.rb +17 -0
- data/spec/lib/llt/tokenizer_spec.rb +77 -42
- metadata +64 -62
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 75d2abc5e72328a1b4ef2f224931c0656572e71d
+  data.tar.gz: b1b155c05e45b87cfec7f0eb4e1535d9f72783b5
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1a7f8fbd9be93c7053fe601243d6ea4c3a603b0c1643a5f102727844ae06c6083acbeb805bdf74e870c5ceb9c92e9dfdd796b8b201a434f50ef50d532fb65a93
+  data.tar.gz: 37d6ab4e7e39a30b6165e61e0a4d341e4620de0a90521bb17ac8ce141356dbe2dc3a952d39d57dd1e4c90eba224f9077b2bc12ecd5d818c8c9372734c91593fa
data/Gemfile
CHANGED
@@ -12,7 +12,6 @@ gem 'llt-db_handler', git: 'git@github.com:latin-language-toolkit/llt-db_handler
 gem 'llt-helpers', git: 'git@github.com:latin-language-toolkit/llt-helpers.git'
 
 # Dependencies of db_handler
-gem 'llt-core_extensions', git: 'git@github.com:latin-language-toolkit/llt-core_extensions.git'
 gem 'llt-form_builder', git: 'git@github.com:latin-language-toolkit/llt-form_builder.git'
 
 platform :ruby do
data/README.md
CHANGED
@@ -51,6 +51,11 @@ The Tokenizer takes several options upon creation or a call to #tokenize:
     tokens.map(&:to_s)
     # => ["Arma", "virum", "--que", "cano", "."]
 
+    # splitting of enclitics can be disabled altogether
+    tokens = t.tokenize('Arma virumque cano.', splitting: false)
+    tokens.map(&:to_s)
+    # => ["Arma", "virumque", "cano", "."]
+
     # indexing determines if each token shall receive a consecutive id
     tokens = t.tokenize('Arma virumque cano.', indexing: true)
     tokens.first.id # => 1
data/lib/llt/token/punctuation.rb
CHANGED
@@ -1,14 +1,19 @@
+require 'xml_escape'
+
 module LLT
   class Token
     class Punctuation < Token
      xml_tag 'pc'
 
+      include XmlEscape
+
      attr_accessor :opening, :closing, :other
 
      def initialize(string, id = nil)
        super
        # this is part of an old interface that is mostly unused
        # some parts remain - find and delete em
+        @string = xml_decode(string)
        @opening = false
        @closing = false
        @other = false
@@ -31,6 +36,10 @@ module LLT
      def inspect
        "#{"Punctuation token:".yellow} #{@string}"
      end
+
+      def as_xml
+        xml_encode(@string)
+      end
    end
  end
 end
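In effect, Punctuation tokens now normalize escaped XML characters on input and re-escape them on output. A rough round-trip sketch, based on the new punctuation_spec further down (the xml_decode/xml_encode helpers come from the included XmlEscape module):

    punct = LLT::Token::Punctuation.new('&amp;')
    punct.to_s   # => "&"      (xml_decode normalizes the escaped input)
    punct.as_xml # => "&amp;"  (xml_encode restores the entity for XML output)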
data/lib/llt/tokenizer.rb
CHANGED
@@ -24,6 +24,8 @@ module LLT
        enclitics_marker: '-',
        merging: true,
        indexing: true,
+        splitting: true,
+        xml: false,
      }
    end
 
@@ -34,7 +36,7 @@ module LLT
      setup(text, options)
 
      find_abbreviations_and_join_strings
-      split_enklitika_and_change_their_position
+      split_enklitika_and_change_their_position if @splitting
      merge_what_needs_merging if @merging # quam diu => quamdiu
      tokens = create_tokens
 
@@ -43,17 +45,20 @@ module LLT
    end
 
    def setup(text, options = {}, worker = [])
-      @text
+      @text = text
      evaluate_metrical_presence(@text)
      @enclitics_marker = parse_option(:enclitics_marker, options)
      @merging = parse_option(:merging, options)
      @shifting = parse_option(:shifting, options)
+      @splitting = parse_option(:splitting, options)
      @indexing = parse_option(:indexing, options)
+      @xml = parse_option(:xml, options)
      @worker = setup_worker(worker)
      @shift_range = shift_range(@shifting)
    end
 
-    PUNCTUATION =
+    PUNCTUATION = /&(?:amp|quot|apos|lt|gt);|([\.\?,!;\-:"'”&\(\)\[\]†<>])\1*/
+    XML_TAG = /<\/?.+?>/
 
    # This is here for two reasons:
    # 1) easier test setup, when a preliminary result shall be further evaluated
@@ -64,16 +69,15 @@ module LLT
    # if it's needed - which should perform better, when there
    # are no metrics involved (the default case)
    def setup_worker(worker)
-      if worker.any?
-
+      return worker if worker.any?
+
+      elements = split_and_space_text
+      put_xml_attributes_back_together(elements) if @xml
+
+      if metrical?
+        Worker.new(elements, @enclitics_marker)
      else
-        elements
-        put_xml_attributes_back_together(elements)
-        if metrical?
-          Worker.new(elements, @enclitics_marker)
-        else
-          elements
-        end
+        elements
      end
    end
 
@@ -81,19 +85,23 @@ module LLT
      shifting_enabled ? 0 : 1
    end
 
+    def split_and_space_text
+      regex = @xml ? Regexp.union(XML_TAG, PUNCTUATION) : PUNCTUATION
+      @text.gsub(regex, ' \0 ').split
+    end
+
    def put_xml_attributes_back_together(elements)
-      # elements could be like this
-      # ['<tag', 'attr1="val"', 'attr1="val>']
-      # and we want the complete xml tag back together
      as = ArrayScanner.new(elements)
      loop do
-        last = as.look_behind
-        if
-
+        last = as.look_behind.to_s # catch nil
+        if open_xml_tag?(last)
+          number_of_xml_elements = as.peek_until do |el|
+            el.end_with?('>')
+          end.size + 1
+
+          number_of_xml_elements.times do
            last << ' ' << as.current
            elements.delete_at(as.pos)
-            # we don't need to forward, as we delete an element anyway
-            next
          end
        else
          as.forward(1)
@@ -102,12 +110,18 @@ module LLT
      end
    end
 
+    def open_xml_tag?(str)
+      str.start_with?('<') &! str.end_with?('>')
+    end
+
 
    ######################
 
    # covers abbreviated Roman praenomen like Ti. in Ti. Claudius Nero
    # covers Roman date expression like a. d. V. Kal. Apr.
    ABBREVIATIONS = /^(#{ALL_ABBRS_PIPED})$/
+    # covers a list of words which are abbreviated with a ' like satin' for satisne
+    APOSTROPHE_WORDS = /^(#{APOSTROPHES_PIPED})$/
 
    # %w{ Atque M . Cicero mittit } to %w{ Atque M. Cicero mittit }
 
@@ -115,7 +129,7 @@ module LLT
      arr = []
      @worker.each_with_index do |e, i|
        n = @worker[i + 1]
-        if e =~ ABBREVIATIONS
+        if (n == '.' && e =~ ABBREVIATIONS) || (n == "'" && e =~ APOSTROPHE_WORDS)
          @worker[i + 1] = n.prepend(e)
          arr << (i - arr.size)
        end
@@ -324,7 +338,7 @@ module LLT
 
    ABBR_NAME_WITH_DOT = /^(#{NAMES_PIPED})\.$/
    ROMAN_DATE_EXPR_WITH_DOT = /^(#{DATES_PIPED})\.$/
-    PUNCT_ITSELF = Regexp.new(PUNCTUATION.source
+    PUNCT_ITSELF = Regexp.new("^(?:#{PUNCTUATION.source})$")
    XML_TAG = /<\/?.+?>/
 
    def create_tokens
data/spec/lib/llt/token/punctuation_spec.rb
ADDED
@@ -0,0 +1,17 @@
+require 'spec_helper'
+
+describe LLT::Token::Punctuation do
+  describe "#initialize" do
+    it "normalizes escaped xml characters" do
+      punct = LLT::Token::Punctuation.new('&amp;')
+      punct.to_s.should == '&'
+    end
+  end
+
+  describe "#as_xml" do
+    it "overrides LLT::Core::Containable#as_xml to use xml encodings" do
+      punct = LLT::Token::Punctuation.new('&')
+      punct.as_xml.should == "&amp;"
+    end
+  end
+end
data/spec/lib/llt/tokenizer_spec.rb
CHANGED
@@ -18,10 +18,16 @@ describe LLT::Tokenizer do
    end
 
    it "handles all kinds of parens as well as cruces" do
-      txt = "Marcus (et Claudius) †amici† [sunt]."
+      txt = "<Marcus> (et Claudius) †amici† [sunt]."
      tokens = tokenizer.tokenize(txt)
-      tokens.should have(
-      tokens.map(&:to_s).should == %w{ Marcus ( et Claudius ) † amici † [ sunt ] . }
+      tokens.should have(14).items
+      tokens.map(&:to_s).should == %w{ < Marcus > ( et Claudius ) † amici † [ sunt ] . }
+    end
+
+    it "handles escaped xml characters" do
+      txt = "&amp; &quot; &apos; &gt; &lt; ;"
+      tokens = tokenizer.tokenize(txt)
+      tokens.should have(6).items
    end
 
    describe "takes an optional keyword argument add_to" do
@@ -110,19 +116,11 @@ describe LLT::Tokenizer do
          end
        end
      end
-
-      context "with embedded xml tags" do
-        it "doesn't break" do
-          txt = '<grc>text text</grc>'
-          tokens = tokenizer.tokenize(txt)
-          tokens.should have(4).items
-        end
-      end
    end
  end
 
  describe "#find_abbreviations_and_join_strings" do
-    describe "should bring back abbreviation dots" do
+    describe "should bring back abbreviation dots and apostrophes" do
      it "with names" do
        tokenizer.setup("", {}, %w{ Atque Sex . et M . Cicero . })
        tokenizer.find_abbreviations_and_join_strings
@@ -134,6 +132,12 @@ describe LLT::Tokenizer do
        tokenizer.find_abbreviations_and_join_strings
        tokenizer.preliminary.should == %w{ a. d. V Kal. Apr. }
      end
+
+      it "with apostrophe" do
+        tokenizer.setup("", {}, %w{ ' Apostrophi ' sunt : po ' min ' vin ' tun' scin ' potin ' satin ' })
+        tokenizer.find_abbreviations_and_join_strings
+        tokenizer.preliminary.should == %w{ ' Apostrophi ' sunt : po' min' vin' tun' scin' potin' satin' }
+      end
    end
  end
 
@@ -252,10 +256,10 @@ describe LLT::Tokenizer do
      end
 
      examples = {
-        "Word" => %w{ ita Marcus quoque -que },
+        "Word" => %w{ ita Marcus quoque -que po' },
        "Filler" => %w{ M. Sex. App. Ap. Tib. Ti. C. a. d. Kal. Ian. }, #I XI MMC }
        "XmlTag" => %w{ <grc> </grc> },
-        "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' }
+        "Punctuation" => %w{ , . ! ? † ( ) [ ] ... -- ” " ' & < > &amp; &lt; &gt; &apos; &quot; }
      }
 
      examples.each do |klass, elements|
@@ -278,37 +282,11 @@ describe LLT::Tokenizer do
        tokens.map(&:id).should == [1, 2]
      end
 
-      it "can be disabled" do
+      it "id's can be disabled" do
        txt = 'Cano.'
        tokens = tokenizer.tokenize(txt, indexing: false)
        tokens.map(&:id).should == [nil, nil]
      end
-
-      it "doesn't count plain xml tags" do
-        txt = '<grc>text text</grc>'
-        tokens = tokenizer.tokenize(txt)
-        tokens.map(&:id).should == [nil, 1, 2, nil]
-      end
-
-      it "doesn't count xml tags when they come with attributes" do
-        txt = '<foreign lang="lat">Graeca</foreign> lingua est.'
-        tokens = tokenizer.tokenize(txt).map(&:to_s)
-        res = ['<foreign lang="lat">', 'Graeca', '</foreign>', 'lingua', 'est', '.']
-        tokens.should == res
-      end
-
-      it "handles nested xml as well" do
-        txt = '<l n="70"><foreign lang="lat">Graeca lingua est.</foreign></l>'
-        tokens = tokenizer.tokenize(txt).map(&:to_s)
-        res = ['<l n="70">', '<foreign lang="lat">', 'Graeca', 'lingua', 'est', '.', '</foreign>', '</l>']
-        tokens.should == res
-      end
-
-      it "handles text with broken off xml tags (the rest will e.g. be in another sentence)" do
-        txt = "<lg org=\"uniform\" sample=\"complete\"><l>quem vocet divum populus ruentis</l><l>imperi rebus?"
-        tokens = tokenizer.tokenize(txt)
-        tokens.should have(12).items
-      end
    end
 
    context "with options" do
@@ -346,11 +324,68 @@ describe LLT::Tokenizer do
          tokens.should == %w{ quam diu cano ? }
        end
      end
+
+      context "with disabled splitting" do
+        it "doesn't split enclitics" do
+          txt = 'arma virumque cano.'
+          opts = { splitting: false }
+          tokens = tokenizer.tokenize(txt, opts).map(&:to_s)
+          tokens.should == %w{ arma virumque cano . }
+        end
+      end
+
+      context "with xml handling enabled" do
+        let(:xml_tokenizer) { LLT::Tokenizer.new(db: stub_db, xml: true) }
+
+        it "doesn't break when xml is embedded" do
+          txt = '<grc>text text</grc>'
+          tokens = xml_tokenizer.tokenize(txt)
+          tokens.should have(4).items
+        end
+
+        it "doesn't count plain xml tags" do
+          txt = '<grc>text text</grc>'
+          tokens = xml_tokenizer.tokenize(txt)
+          tokens.map(&:id).should == [nil, 1, 2, nil]
+        end
+
+        it "doesn't count xml tags when they come with attributes" do
+          txt = '<foreign lang="lat">Graeca</foreign> lingua est.'
+          tokens = xml_tokenizer.tokenize(txt).map(&:to_s)
+          res = ['<foreign lang="lat">', 'Graeca', '</foreign>', 'lingua', 'est', '.']
+          tokens.should == res
+        end
+
+        it "handles nested xml as well" do
+          txt = '<l n="70"><foreign lang="lat">Graeca lingua est.</foreign></l>'
+          tokens = xml_tokenizer.tokenize(txt).map(&:to_s)
+          res = ['<l n="70">', '<foreign lang="lat">', 'Graeca', 'lingua', 'est', '.', '</foreign>', '</l>']
+          tokens.should == res
+        end
+
+        it "handles text with broken off xml tags (the rest will e.g. be in another sentence)" do
+          txt = "<lg org=\"uniform\" sample=\"complete\"><l>quem vocet divum populus ruentis</l><l>imperi rebus?"
+          tokens = xml_tokenizer.tokenize(txt)
+          tokens.should have(12).items
+        end
+
+        it "doesn't fall with spaces inside of xml attributes" do
+          txt = '<test>veni vidi <bad att="a a a">vici</bad></test>'
+          tokens = xml_tokenizer.tokenize(txt)
+          tokens.should have(7).items
+        end
+
+        it "expects all text chevrons to be escaped, otherwise they are xml tags!" do
+          txt = '<test>&lt;veni&gt;</test>'
+          tokens = xml_tokenizer.tokenize(txt)
+          tokens.should have(5).item
+        end
+      end
    end
  end
 
  context "with options on instance creation" do
-    it "a new instance can receive options, which it will use as
+    it "a new instance can receive options, which it will use as its defaults" do
      custom_tok = LLT::Tokenizer.new(db: stub_db,
                                      shifting: false,
                                      enclitics_marker: '')
metadata
CHANGED
@@ -1,141 +1,141 @@
 --- !ruby/object:Gem::Specification
 name: llt-tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.3
 platform: ruby
 authors:
 - LFDM
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-01-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
  name: bundler
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '1.3'
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.3'
-  prerelease: false
  type: :development
-
-  name: rake
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - "~>"
      - !ruby/object:Gem::Version
-        version: '
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
  type: :development
-
-  name: rspec
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
  type: :development
-
-  name: simplecov
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
-        version: '0
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simplecov
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0.7'
-  prerelease: false
  type: :development
-
-  name: array_scanner
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - "~>"
      - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.7'
+- !ruby/object:Gem::Dependency
+  name: array_scanner
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
  type: :runtime
-
-  name: llt-core
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-core
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
  type: :runtime
-
-  name: llt-core_extensions
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-core_extensions
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
  type: :runtime
-
-  name: llt-db_handler
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-db_handler
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
  type: :runtime
-
-  name: llt-helpers
+  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
+- !ruby/object:Gem::Dependency
+  name: llt-helpers
  requirement: !ruby/object:Gem::Requirement
    requirements:
-    - -
+    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
-  prerelease: false
  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: LLT's Tokenizer
 email:
 - 1986gh@gmail.com
@@ -143,9 +143,9 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .rspec
-- .travis.yml
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -160,6 +160,7 @@ files:
 - lib/llt/tokenizer/version.rb
 - lib/llt/tokenizer/worker.rb
 - llt-tokenizer.gemspec
+- spec/lib/llt/token/punctuation_spec.rb
 - spec/lib/llt/tokenizer/api_spec.rb
 - spec/lib/llt/tokenizer_spec.rb
 - spec/spec_helper.rb
@@ -168,27 +169,28 @@ homepage: ''
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
-  - -
+  - - ">="
  - !ruby/object:Gem::Version
    version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
-  - -
+  - - ">="
  - !ruby/object:Gem::Version
    version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.
-signing_key:
+rubyforge_project:
+rubygems_version: 2.2.0
+signing_key:
 specification_version: 4
 summary: Breaks latin sentences into tokens
 test_files:
+- spec/lib/llt/token/punctuation_spec.rb
 - spec/lib/llt/tokenizer/api_spec.rb
 - spec/lib/llt/tokenizer_spec.rb
 - spec/spec_helper.rb