proiel 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/proiel/tokenization.rb +4 -2
- data/lib/proiel/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 66a5e6fc550b5d017d0b49396ec931624cc9cc12
|
4
|
+
data.tar.gz: a89f0936a25eb4092728ac527f8816095f721818
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c956426f6bd2b6e90b9254a72f256d14937c99d0ca5183897b414fd17c6a88803082c0c88690b1ad695347b29cc1bfd86de2a933cfcbff453b4fb93ad4b9e623
|
7
|
+
data.tar.gz: 756f1beb29e9ec3174e9b65aa7a52428a6fa87f1d9db9bec18e112afb26ff372c05d333e96e2c60b889d4a8cd72540c315ecd0719bae2ba3e2a5d6e8bbe8b4b8
|
data/lib/proiel/tokenization.rb
CHANGED
@@ -59,6 +59,8 @@ module PROIEL
|
|
59
59
|
form and form.length > 1
|
60
60
|
end
|
61
61
|
|
62
|
+
WORD_PATTERN = /([^[\u{E000}-\u{F8FF}][[:word:]]]+)/
|
63
|
+
|
62
64
|
# Splits a token form using the tokenization patterns that apply for
|
63
65
|
# the specified language. Tokenization patterns must already have been
|
64
66
|
# loaded.
|
@@ -73,9 +75,9 @@ module PROIEL
|
|
73
75
|
raise ArgumentError, 'invalid language tag' unless language_tag.is_a?(String)
|
74
76
|
raise ArgumentError, 'invalid form' unless form.is_a?(String)
|
75
77
|
|
76
|
-
if form[
|
78
|
+
if form[WORD_PATTERN]
|
77
79
|
# Split on any non-word character like a space or punctuation
|
78
|
-
form.split(
|
80
|
+
form.split(WORD_PATTERN)
|
79
81
|
elsif @@regexes.key?(language_tag) and form[@@regexes[language_tag]]
|
80
82
|
# Apply language-specific pattern
|
81
83
|
form.match(@@regexes[language_tag]).captures
|
data/lib/proiel/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: proiel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Marius L. Jøhndal
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: json
|