pragmatic_tokenizer 3.0.7 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a0941801b61a33578d7d5dcc6ffe25e5ce524767
4
- data.tar.gz: 611411f111648ec42d2cb23cf5c3be87e5a66510
2
+ SHA256:
3
+ metadata.gz: 0feac0e3ed363c652e85fda4dd631b6f221da44af13ef9da24e9f0be5a0cdd38
4
+ data.tar.gz: adc17ceadcf5749c1c9d1abce01f28658511a9bd251e25dda2ccf333f32cd355
5
5
  SHA512:
6
- metadata.gz: 4aaa482df0dcb555de46fe491d63da4ce70d222cb4b6392d1289d2d75e02e30039ccf1607c1b37c0741eb42616c051165ede5c3206eb05419da63fb7eda25564
7
- data.tar.gz: fa08a0a58961213b94f327bd9e53af12b70797d7e6a41551aecf30e1ba10537efec0711077a1440c712dd707bce873b54707cdc3a30c6592421fc9a7d83518b4
6
+ metadata.gz: 99bbf93e63cf667703c2c386e51592873f9c70fa20df823d15ef69cf3714fef98c48d2428d451ce9c1433c3b1d23c5a317a2ecf1fc5ed6f9be04a66c8d268773
7
+ data.tar.gz: ddb68cee3ea6cf0b4d2bbe581c99a37d1ecb34de5a2c0703073cc53b27d6520dbf6d9bdd811f0bfcb244120c91290040f269eefeb002c3440d8c9fa55a2d9671
@@ -11,11 +11,37 @@ module PragmaticTokenizer
11
11
  # why can't we directly reference constants from Languages::Common?
12
12
  ALNUM_QUOTE = PragmaticTokenizer::Languages::Common::SingleQuotes::ALNUM_QUOTE
13
13
  QUOTE_WORD = PragmaticTokenizer::Languages::Common::SingleQuotes::QUOTE_WORD
14
+ C_APOSTROPHE = /c'/i
15
+ J_APOSTROPHE = /j'/i
14
16
  L_APOSTROPHE = /l'/i
17
+ D_APOSTROPHE = /d'/i
18
+ QU_APOSTROPHE = /qu'/i
19
+ N_APOSTROPHE = /n'/i
20
+ T_APOSTROPHE = /t'/i
21
+ M_APOSTROPHE = /m'/i
22
+ S_APOSTROPHE = /s'/i
23
+ QUELQU_APOSTROPHE = /quelqu'/i
24
+ JUSQU_APOSTROPHE = /jusqu'/i
25
+ LORSQU_APOSTROPHE = /lorsqu'/i
26
+ PUISQU_APOSTROPHE = /puisqu'/i
27
+ QUOIQU_APOSTROPHE = /quoiqu'/i
15
28
 
16
29
  def handle_single_quotes(text)
17
30
  replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
31
+ text.gsub!(C_APOSTROPHE, '\1 c' << replacement << ' ')
32
+ text.gsub!(J_APOSTROPHE, '\1 j' << replacement << ' ')
18
33
  text.gsub!(L_APOSTROPHE, '\1 l' << replacement << ' ')
34
+ text.gsub!(D_APOSTROPHE, '\1 d' << replacement << ' ')
35
+ text.gsub!(QU_APOSTROPHE, '\1 qu' << replacement << ' ')
36
+ text.gsub!(N_APOSTROPHE, '\1 n' << replacement << ' ')
37
+ text.gsub!(T_APOSTROPHE, '\1 t' << replacement << ' ')
38
+ text.gsub!(M_APOSTROPHE, '\1 m' << replacement << ' ')
39
+ text.gsub!(S_APOSTROPHE, '\1 s' << replacement << ' ')
40
+ text.gsub!(QUELQU_APOSTROPHE, '\1 quelqu' << replacement << ' ')
41
+ text.gsub!(JUSQU_APOSTROPHE, '\1 jusqu' << replacement << ' ')
42
+ text.gsub!(LORSQU_APOSTROPHE, '\1 lorsqu' << replacement << ' ')
43
+ text.gsub!(PUISQU_APOSTROPHE, '\1 puisqu' << replacement << ' ')
44
+ text.gsub!(QUOIQU_APOSTROPHE, '\1 quoiqu' << replacement << ' ')
19
45
  text.gsub!(ALNUM_QUOTE, '\1 ' << replacement << ' ')
20
46
  text.gsub!(QUOTE_WORD, ' ' << replacement)
21
47
  text
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "3.0.7".freeze
2
+ VERSION = "3.1.0".freeze
3
3
  end
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
19
19
 
20
20
  spec.add_runtime_dependency "unicode"
21
21
  spec.add_development_dependency "bundler", "~> 1.9"
22
- spec.add_development_dependency "rake", "~> 10.0"
22
+ spec.add_development_dependency "rake", ">= 12.3.3"
23
23
  spec.add_development_dependency "rspec"
24
24
  spec.add_development_dependency "stackprof"
25
25
  spec.add_development_dependency "rubocop"
@@ -3,11 +3,11 @@ require 'spec_helper'
3
3
  describe PragmaticTokenizer do
4
4
  context 'Language: French (fr)' do
5
5
  it 'tokenizes a string #001' do
6
- text = "L'art de l'univers, c'est un art"
6
+ text = "D'art de l'univers, c'est un art"
7
7
  pt = PragmaticTokenizer::Tokenizer.new(
8
8
  language: 'fr'
9
9
  )
10
- expect(pt.tokenize(text)).to eq(["l'", "art", "de", "l'", "univers", ",", "c'est", "un", "art"])
10
+ expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'" ,"est", "un", "art"])
11
11
  end
12
12
  end
13
13
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.7
4
+ version: 3.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-03-18 00:00:00.000000000 Z
11
+ date: 2020-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -42,16 +42,16 @@ dependencies:
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: 12.3.3
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: 12.3.3
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
169
169
  version: '0'
170
170
  requirements: []
171
171
  rubyforge_project:
172
- rubygems_version: 2.6.14
172
+ rubygems_version: 2.7.6
173
173
  signing_key:
174
174
  specification_version: 4
175
175
  summary: A multilingual tokenizer