pragmatic_tokenizer 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3b2988beef12450f5a7f653ec5c2e8db5e11efce
4
- data.tar.gz: 56d01b9c9ffef57ae3365f3519214318f7828a20
3
+ metadata.gz: f19baff424f25d8ba1fcf08001dfd6fc5ac4c54b
4
+ data.tar.gz: 645aaae817dd1a77c7fbfe506e5febe9746afbf7
5
5
  SHA512:
6
- metadata.gz: 938592c183b4bd1f2fca41554d74e17b85f608549cfb218476c99c84829fec15e7ac6b2dbc4ce69b60a5780de90e0878203b5bfd54cdfbd71bfd0daed57d15f9
7
- data.tar.gz: d767954ebcfd5003f1a58b0023307b93b22de30e08f491954511d1bcdafab4c3f23f5edb710f1d40515df98510bd63bcd9c427e4d2315f33b0d1641b726e998f
6
+ metadata.gz: c4a725af2141a66f0390258b36bc66a50994cc6f44e2db6b528b28e592df50baf1865d4445012dead0bf95a1347585acd2b93b22ba968ddf3a5fc08d4bcdaf79
7
+ data.tar.gz: 0a820f57952cdc0adb8973607b083bc0e14ba5006a7cf4ab5515f874c1a6f3bc5d1ce532b895c3011203ae74555480a7b81431906607ae26a376dcc5dd4a7d66
@@ -11,8 +11,8 @@ module PragmaticTokenizer
11
11
  text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
12
12
  text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
13
13
  text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
14
- text.gsub!(/l\'/, '\1 l\' \2') || text
15
- text.gsub!(/L\'/, '\1 L\' \2') || text
14
+ text.gsub!(/l\'/, '\1 l \2') || text
15
+ text.gsub!(/L\'/, '\1 L \2') || text
16
16
  end
17
17
  end
18
18
  end
@@ -22,7 +22,11 @@ module PragmaticTokenizer
22
22
  convert_sgl_quotes(text)
23
23
  shift_beginning_hyphen(text)
24
24
  shift_ending_hyphen(text)
25
- tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t) })
25
+ tokens = separate_full_stop(text.squeeze(' ')
26
+ .split
27
+ .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
28
+ .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
29
+ .map { |t| convert_sym_to_punct(t) })
26
30
  separate_other_ending_punc(tokens)
27
31
  end
28
32
 
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias