pragmatic_tokenizer 3.0.7 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 0feac0e3ed363c652e85fda4dd631b6f221da44af13ef9da24e9f0be5a0cdd38
|
4
|
+
data.tar.gz: adc17ceadcf5749c1c9d1abce01f28658511a9bd251e25dda2ccf333f32cd355
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 99bbf93e63cf667703c2c386e51592873f9c70fa20df823d15ef69cf3714fef98c48d2428d451ce9c1433c3b1d23c5a317a2ecf1fc5ed6f9be04a66c8d268773
|
7
|
+
data.tar.gz: ddb68cee3ea6cf0b4d2bbe581c99a37d1ecb34de5a2c0703073cc53b27d6520dbf6d9bdd811f0bfcb244120c91290040f269eefeb002c3440d8c9fa55a2d9671
|
@@ -11,11 +11,37 @@ module PragmaticTokenizer
|
|
11
11
|
# why can't we directly reference constants from Languages::Common?
|
12
12
|
ALNUM_QUOTE = PragmaticTokenizer::Languages::Common::SingleQuotes::ALNUM_QUOTE
|
13
13
|
QUOTE_WORD = PragmaticTokenizer::Languages::Common::SingleQuotes::QUOTE_WORD
|
14
|
+
C_APOSTROPHE = /c'/i
|
15
|
+
J_APOSTROPHE = /j'/i
|
14
16
|
L_APOSTROPHE = /l'/i
|
17
|
+
D_APOSTROPHE = /d'/i
|
18
|
+
QU_APOSTROPHE = /qu'/i
|
19
|
+
N_APOSTROPHE = /n'/i
|
20
|
+
T_APOSTROPHE = /t'/i
|
21
|
+
M_APOSTROPHE = /m'/i
|
22
|
+
S_APOSTROPHE = /s'/i
|
23
|
+
QUELQU_APOSTROPHE = /quelqu'/i
|
24
|
+
JUSQU_APOSTROPHE = /jusqu'/i
|
25
|
+
LORSQU_APOSTROPHE = /lorsqu'/i
|
26
|
+
PUISQU_APOSTROPHE = /puisqu'/i
|
27
|
+
QUOIQU_APOSTROPHE = /quoiqu'/i
|
15
28
|
|
16
29
|
def handle_single_quotes(text)
|
17
30
|
replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
|
31
|
+
text.gsub!(C_APOSTROPHE, '\1 c' << replacement << ' ')
|
32
|
+
text.gsub!(J_APOSTROPHE, '\1 j' << replacement << ' ')
|
18
33
|
text.gsub!(L_APOSTROPHE, '\1 l' << replacement << ' ')
|
34
|
+
text.gsub!(D_APOSTROPHE, '\1 d' << replacement << ' ')
|
35
|
+
text.gsub!(QU_APOSTROPHE, '\1 qu' << replacement << ' ')
|
36
|
+
text.gsub!(N_APOSTROPHE, '\1 n' << replacement << ' ')
|
37
|
+
text.gsub!(T_APOSTROPHE, '\1 t' << replacement << ' ')
|
38
|
+
text.gsub!(M_APOSTROPHE, '\1 m' << replacement << ' ')
|
39
|
+
text.gsub!(S_APOSTROPHE, '\1 s' << replacement << ' ')
|
40
|
+
text.gsub!(QUELQU_APOSTROPHE, '\1 quelqu' << replacement << ' ')
|
41
|
+
text.gsub!(JUSQU_APOSTROPHE, '\1 jusqu' << replacement << ' ')
|
42
|
+
text.gsub!(LORSQU_APOSTROPHE, '\1 lorsqu' << replacement << ' ')
|
43
|
+
text.gsub!(PUISQU_APOSTROPHE, '\1 puisqu' << replacement << ' ')
|
44
|
+
text.gsub!(QUOIQU_APOSTROPHE, '\1 quoiqu' << replacement << ' ')
|
19
45
|
text.gsub!(ALNUM_QUOTE, '\1 ' << replacement << ' ')
|
20
46
|
text.gsub!(QUOTE_WORD, ' ' << replacement)
|
21
47
|
text
|
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
|
20
20
|
spec.add_runtime_dependency "unicode"
|
21
21
|
spec.add_development_dependency "bundler", "~> 1.9"
|
22
|
-
spec.add_development_dependency "rake", "
|
22
|
+
spec.add_development_dependency "rake", ">= 12.3.3"
|
23
23
|
spec.add_development_dependency "rspec"
|
24
24
|
spec.add_development_dependency "stackprof"
|
25
25
|
spec.add_development_dependency "rubocop"
|
@@ -3,11 +3,11 @@ require 'spec_helper'
|
|
3
3
|
describe PragmaticTokenizer do
|
4
4
|
context 'Language: French (fr)' do
|
5
5
|
it 'tokenizes a string #001' do
|
6
|
-
text = "
|
6
|
+
text = "D'art de l'univers, c'est un art"
|
7
7
|
pt = PragmaticTokenizer::Tokenizer.new(
|
8
8
|
language: 'fr'
|
9
9
|
)
|
10
|
-
expect(pt.tokenize(text)).to eq(["
|
10
|
+
expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'" ,"est", "un", "art"])
|
11
11
|
end
|
12
12
|
end
|
13
13
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0
|
4
|
+
version: 3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -42,16 +42,16 @@ dependencies:
|
|
42
42
|
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 12.3.3
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 12.3.3
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
169
169
|
version: '0'
|
170
170
|
requirements: []
|
171
171
|
rubyforge_project:
|
172
|
-
rubygems_version: 2.6
|
172
|
+
rubygems_version: 2.7.6
|
173
173
|
signing_key:
|
174
174
|
specification_version: 4
|
175
175
|
summary: A multilingual tokenizer
|