pragmatic_tokenizer 3.0.7 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a0941801b61a33578d7d5dcc6ffe25e5ce524767
4
- data.tar.gz: 611411f111648ec42d2cb23cf5c3be87e5a66510
2
+ SHA256:
3
+ metadata.gz: 0feac0e3ed363c652e85fda4dd631b6f221da44af13ef9da24e9f0be5a0cdd38
4
+ data.tar.gz: adc17ceadcf5749c1c9d1abce01f28658511a9bd251e25dda2ccf333f32cd355
5
5
  SHA512:
6
- metadata.gz: 4aaa482df0dcb555de46fe491d63da4ce70d222cb4b6392d1289d2d75e02e30039ccf1607c1b37c0741eb42616c051165ede5c3206eb05419da63fb7eda25564
7
- data.tar.gz: fa08a0a58961213b94f327bd9e53af12b70797d7e6a41551aecf30e1ba10537efec0711077a1440c712dd707bce873b54707cdc3a30c6592421fc9a7d83518b4
6
+ metadata.gz: 99bbf93e63cf667703c2c386e51592873f9c70fa20df823d15ef69cf3714fef98c48d2428d451ce9c1433c3b1d23c5a317a2ecf1fc5ed6f9be04a66c8d268773
7
+ data.tar.gz: ddb68cee3ea6cf0b4d2bbe581c99a37d1ecb34de5a2c0703073cc53b27d6520dbf6d9bdd811f0bfcb244120c91290040f269eefeb002c3440d8c9fa55a2d9671
@@ -11,11 +11,37 @@ module PragmaticTokenizer
11
11
  # why can't we directly reference constants from Languages::Common?
12
12
  ALNUM_QUOTE = PragmaticTokenizer::Languages::Common::SingleQuotes::ALNUM_QUOTE
13
13
  QUOTE_WORD = PragmaticTokenizer::Languages::Common::SingleQuotes::QUOTE_WORD
14
+ C_APOSTROPHE = /c'/i
15
+ J_APOSTROPHE = /j'/i
14
16
  L_APOSTROPHE = /l'/i
17
+ D_APOSTROPHE = /d'/i
18
+ QU_APOSTROPHE = /qu'/i
19
+ N_APOSTROPHE = /n'/i
20
+ T_APOSTROPHE = /t'/i
21
+ M_APOSTROPHE = /m'/i
22
+ S_APOSTROPHE = /s'/i
23
+ QUELQU_APOSTROPHE = /quelqu'/i
24
+ JUSQU_APOSTROPHE = /jusqu'/i
25
+ LORSQU_APOSTROPHE = /lorsqu'/i
26
+ PUISQU_APOSTROPHE = /puisqu'/i
27
+ QUOIQU_APOSTROPHE = /quoiqu'/i
15
28
 
16
29
  def handle_single_quotes(text)
17
30
  replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
31
+ text.gsub!(C_APOSTROPHE, '\1 c' << replacement << ' ')
32
+ text.gsub!(J_APOSTROPHE, '\1 j' << replacement << ' ')
18
33
  text.gsub!(L_APOSTROPHE, '\1 l' << replacement << ' ')
34
+ text.gsub!(D_APOSTROPHE, '\1 d' << replacement << ' ')
35
+ text.gsub!(QU_APOSTROPHE, '\1 qu' << replacement << ' ')
36
+ text.gsub!(N_APOSTROPHE, '\1 n' << replacement << ' ')
37
+ text.gsub!(T_APOSTROPHE, '\1 t' << replacement << ' ')
38
+ text.gsub!(M_APOSTROPHE, '\1 m' << replacement << ' ')
39
+ text.gsub!(S_APOSTROPHE, '\1 s' << replacement << ' ')
40
+ text.gsub!(QUELQU_APOSTROPHE, '\1 quelqu' << replacement << ' ')
41
+ text.gsub!(JUSQU_APOSTROPHE, '\1 jusqu' << replacement << ' ')
42
+ text.gsub!(LORSQU_APOSTROPHE, '\1 lorsqu' << replacement << ' ')
43
+ text.gsub!(PUISQU_APOSTROPHE, '\1 puisqu' << replacement << ' ')
44
+ text.gsub!(QUOIQU_APOSTROPHE, '\1 quoiqu' << replacement << ' ')
19
45
  text.gsub!(ALNUM_QUOTE, '\1 ' << replacement << ' ')
20
46
  text.gsub!(QUOTE_WORD, ' ' << replacement)
21
47
  text
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "3.0.7".freeze
2
+ VERSION = "3.1.0".freeze
3
3
  end
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
19
19
 
20
20
  spec.add_runtime_dependency "unicode"
21
21
  spec.add_development_dependency "bundler", "~> 1.9"
22
- spec.add_development_dependency "rake", "~> 10.0"
22
+ spec.add_development_dependency "rake", ">= 12.3.3"
23
23
  spec.add_development_dependency "rspec"
24
24
  spec.add_development_dependency "stackprof"
25
25
  spec.add_development_dependency "rubocop"
@@ -3,11 +3,11 @@ require 'spec_helper'
3
3
  describe PragmaticTokenizer do
4
4
  context 'Language: French (fr)' do
5
5
  it 'tokenizes a string #001' do
6
- text = "L'art de l'univers, c'est un art"
6
+ text = "D'art de l'univers, c'est un art"
7
7
  pt = PragmaticTokenizer::Tokenizer.new(
8
8
  language: 'fr'
9
9
  )
10
- expect(pt.tokenize(text)).to eq(["l'", "art", "de", "l'", "univers", ",", "c'est", "un", "art"])
10
+ expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'" ,"est", "un", "art"])
11
11
  end
12
12
  end
13
13
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.7
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-03-18 00:00:00.000000000 Z
11
+ date: 2020-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -42,16 +42,16 @@ dependencies:
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: 12.3.3
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: 12.3.3
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
169
169
  version: '0'
170
170
  requirements: []
171
171
  rubyforge_project:
172
- rubygems_version: 2.6.14
172
+ rubygems_version: 2.7.6
173
173
  signing_key:
174
174
  specification_version: 4
175
175
  summary: A multilingual tokenizer