llt-tokenizer 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/llt/tokenizer/version.rb +1 -1
- data/lib/llt/tokenizer.rb +12 -6
- data/spec/lib/llt/tokenizer_spec.rb +12 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd490c0611dc9cd5ed2f2aad95631b500e7e7d35
|
4
|
+
data.tar.gz: fe8162ded4cecfda383f3919576ec19ca14f5a38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8a90a133ccd0c27fbbd20e64df0a8e560ad183010ae55716362e0909c737a6d5359c51a647da387ddade71a3bd50545381c899cd3fcea095001ef0e27c42483
|
7
|
+
data.tar.gz: 2d81412a66b473206d23aec3128d3321ff736f3a9d5ab8fca0349bcc347e8d692b3ee33d0fd2c67f87f031335113c1a54caef80b5aa0a4eaca4c5bd2320d6778
|
data/lib/llt/tokenizer.rb
CHANGED
@@ -169,7 +169,7 @@ module LLT
|
|
169
169
|
ENCLITICS = %w{ que ne ve c }
|
170
170
|
def split_enklitika_and_change_their_position
|
171
171
|
split_with_force
|
172
|
-
|
172
|
+
split_nec_and_oute
|
173
173
|
make_frequent_corrections
|
174
174
|
end
|
175
175
|
|
@@ -202,16 +202,22 @@ module LLT
|
|
202
202
|
"#{@enclitics_marker}#{val}"
|
203
203
|
end
|
204
204
|
|
205
|
-
def
|
206
|
-
|
205
|
+
def split_nec_and_oute
|
206
|
+
nec_indices = []
|
207
|
+
oute_indices = []
|
207
208
|
@worker.each_with_index do |token, i|
|
208
|
-
|
209
|
+
case token
|
210
|
+
when /^nec$/i
|
209
211
|
token.slice!(-1)
|
210
|
-
|
212
|
+
nec_indices << (i + nec_indices.size + @shift_range)
|
213
|
+
when /^οὐτε$/i
|
214
|
+
token.slice!(-2, 2)
|
215
|
+
oute_indices << (i + oute_indices.size + @shift_range)
|
211
216
|
end
|
212
217
|
end
|
213
218
|
|
214
|
-
|
219
|
+
nec_indices.each { |i| @worker.insert(i, enclitic('c')) }
|
220
|
+
oute_indices.each { |i| @worker.insert(i, enclitic('τε')) }
|
215
221
|
end
|
216
222
|
|
217
223
|
def make_frequent_corrections
|
@@ -260,6 +260,18 @@ describe LLT::Tokenizer do
|
|
260
260
|
end
|
261
261
|
end
|
262
262
|
end
|
263
|
+
|
264
|
+
context "when confronted with -τε" do
|
265
|
+
examples = {
|
266
|
+
'οὐτε' => '-τε οὐ'
|
267
|
+
}
|
268
|
+
|
269
|
+
examples.each do |example, expected|
|
270
|
+
it "transforms #{example} to #{expected}" do
|
271
|
+
enklitika_test(example).should be_transformed_to expected
|
272
|
+
end
|
273
|
+
end
|
274
|
+
end
|
263
275
|
end
|
264
276
|
|
265
277
|
describe "#merge_what_needs_merging" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LFDM
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -190,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
190
|
version: '0'
|
191
191
|
requirements: []
|
192
192
|
rubyforge_project:
|
193
|
-
rubygems_version: 2.2.
|
193
|
+
rubygems_version: 2.2.2
|
194
194
|
signing_key:
|
195
195
|
specification_version: 4
|
196
196
|
summary: Breaks latin sentences into tokens
|