pragmatic_tokenizer 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3b2988beef12450f5a7f653ec5c2e8db5e11efce
4
- data.tar.gz: 56d01b9c9ffef57ae3365f3519214318f7828a20
3
+ metadata.gz: f19baff424f25d8ba1fcf08001dfd6fc5ac4c54b
4
+ data.tar.gz: 645aaae817dd1a77c7fbfe506e5febe9746afbf7
5
5
  SHA512:
6
- metadata.gz: 938592c183b4bd1f2fca41554d74e17b85f608549cfb218476c99c84829fec15e7ac6b2dbc4ce69b60a5780de90e0878203b5bfd54cdfbd71bfd0daed57d15f9
7
- data.tar.gz: d767954ebcfd5003f1a58b0023307b93b22de30e08f491954511d1bcdafab4c3f23f5edb710f1d40515df98510bd63bcd9c427e4d2315f33b0d1641b726e998f
6
+ metadata.gz: c4a725af2141a66f0390258b36bc66a50994cc6f44e2db6b528b28e592df50baf1865d4445012dead0bf95a1347585acd2b93b22ba968ddf3a5fc08d4bcdaf79
7
+ data.tar.gz: 0a820f57952cdc0adb8973607b083bc0e14ba5006a7cf4ab5515f874c1a6f3bc5d1ce532b895c3011203ae74555480a7b81431906607ae26a376dcc5dd4a7d66
@@ -11,8 +11,8 @@ module PragmaticTokenizer
11
11
  text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
12
12
  text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
13
13
  text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
14
- text.gsub!(/l\'/, '\1 l\' \2') || text
15
- text.gsub!(/L\'/, '\1 L\' \2') || text
14
+ text.gsub!(/l\'/, '\1 l \2') || text
15
+ text.gsub!(/L\'/, '\1 L \2') || text
16
16
  end
17
17
  end
18
18
  end
@@ -22,7 +22,11 @@ module PragmaticTokenizer
22
22
  convert_sgl_quotes(text)
23
23
  shift_beginning_hyphen(text)
24
24
  shift_ending_hyphen(text)
25
- tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t) })
25
+ tokens = separate_full_stop(text.squeeze(' ')
26
+ .split
27
+ .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
28
+ .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
29
+ .map { |t| convert_sym_to_punct(t) })
26
30
  separate_other_ending_punc(tokens)
27
31
  end
28
32
 
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias