llt-tokenizer 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/llt/tokenizer/version.rb +1 -1
- data/lib/llt/tokenizer.rb +12 -6
- data/spec/lib/llt/tokenizer_spec.rb +12 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cd490c0611dc9cd5ed2f2aad95631b500e7e7d35
|
4
|
+
data.tar.gz: fe8162ded4cecfda383f3919576ec19ca14f5a38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a8a90a133ccd0c27fbbd20e64df0a8e560ad183010ae55716362e0909c737a6d5359c51a647da387ddade71a3bd50545381c899cd3fcea095001ef0e27c42483
|
7
|
+
data.tar.gz: 2d81412a66b473206d23aec3128d3321ff736f3a9d5ab8fca0349bcc347e8d692b3ee33d0fd2c67f87f031335113c1a54caef80b5aa0a4eaca4c5bd2320d6778
|
data/lib/llt/tokenizer.rb
CHANGED
@@ -169,7 +169,7 @@ module LLT
|
|
169
169
|
ENCLITICS = %w{ que ne ve c }
|
170
170
|
def split_enklitika_and_change_their_position
|
171
171
|
split_with_force
|
172
|
-
|
172
|
+
split_nec_and_oute
|
173
173
|
make_frequent_corrections
|
174
174
|
end
|
175
175
|
|
@@ -202,16 +202,22 @@ module LLT
|
|
202
202
|
"#{@enclitics_marker}#{val}"
|
203
203
|
end
|
204
204
|
|
205
|
-
def
|
206
|
-
|
205
|
+
def split_nec_and_oute
|
206
|
+
nec_indices = []
|
207
|
+
oute_indices = []
|
207
208
|
@worker.each_with_index do |token, i|
|
208
|
-
|
209
|
+
case token
|
210
|
+
when /^nec$/i
|
209
211
|
token.slice!(-1)
|
210
|
-
|
212
|
+
nec_indices << (i + nec_indices.size + @shift_range)
|
213
|
+
when /^οὐτε$/i
|
214
|
+
token.slice!(-2, 2)
|
215
|
+
oute_indices << (i + oute_indices.size + @shift_range)
|
211
216
|
end
|
212
217
|
end
|
213
218
|
|
214
|
-
|
219
|
+
nec_indices.each { |i| @worker.insert(i, enclitic('c')) }
|
220
|
+
oute_indices.each { |i| @worker.insert(i, enclitic('τε')) }
|
215
221
|
end
|
216
222
|
|
217
223
|
def make_frequent_corrections
|
@@ -260,6 +260,18 @@ describe LLT::Tokenizer do
|
|
260
260
|
end
|
261
261
|
end
|
262
262
|
end
|
263
|
+
|
264
|
+
context "when confronted with -τε" do
|
265
|
+
examples = {
|
266
|
+
'οὐτε' => '-τε οὐ'
|
267
|
+
}
|
268
|
+
|
269
|
+
examples.each do |example, expected|
|
270
|
+
it "transforms #{example} to #{expected}" do
|
271
|
+
enklitika_test(example).should be_transformed_to expected
|
272
|
+
end
|
273
|
+
end
|
274
|
+
end
|
263
275
|
end
|
264
276
|
|
265
277
|
describe "#merge_what_needs_merging" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LFDM
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-08-
|
11
|
+
date: 2014-08-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -190,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
190
190
|
version: '0'
|
191
191
|
requirements: []
|
192
192
|
rubyforge_project:
|
193
|
-
rubygems_version: 2.2.
|
193
|
+
rubygems_version: 2.2.2
|
194
194
|
signing_key:
|
195
195
|
specification_version: 4
|
196
196
|
summary: Breaks latin sentences into tokens
|