pragmatic_tokenizer 2.2.1 → 3.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ee933fd568e4ccb0af6034488d9bd0ea15288d25
-  data.tar.gz: 90acbd94ecf4fb8f0ce3671cd1136f8ce309d676
+  metadata.gz: 241faea11370fc685c55a22eae88d9af30fa955c
+  data.tar.gz: 2daa6aa5bae004836538b4bd632067074782a87b
 SHA512:
-  metadata.gz: 896110659d02729735f16d9c740573c4cfb3367021d3e248d8e8421d64440d8f24c8a3020273f026eb5f7dfcb007f9e3ffa14f298c39988f0477b9142cbd9829
-  data.tar.gz: 69ca7908471f37a03ed9953f547fb36043d43c351b29b49c73995bfdcec361f6edaa2a67001a79b8e33b8c4b80d9d7f3b02277a1174b7bc28ebfe8a1715473b0
+  metadata.gz: 54f9fb11af6e42f4e35d6a878dae45e5fd0850793671ae7023dd8f8e17f7e307625f9b8497c5912b8c14312235150ad2cd19cf7f15fa693ac7cee427827677ef
+  data.tar.gz: 2ea45b90bfc8df8044ebab404e89e0936ce391e0d7e5206fc60e775ca65cb67af0fabe3ec787855b359ac0712bdece0eaefdb2dc9bcb0635c41fc0ad604a1bdf
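
checksums.yaml records SHA1 and SHA512 digests for the two archives packed inside the .gem file (metadata.gz and data.tar.gz). A minimal sketch of recomputing the SHA512 values with Ruby's standard library, assuming pragmatic_tokenizer-3.0.0.gem has already been extracted (a .gem is a plain tar archive) so both files sit in the current directory:

require 'digest'

# Recompute the SHA512 digests recorded above and compare the output
# against the "+" lines in checksums.yaml. Sketch only: assumes
# metadata.gz and data.tar.gz were extracted from the downloaded .gem.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file}: #{Digest::SHA512.file(file).hexdigest}"
end
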
@@ -23,7 +23,7 @@ module PragmaticTokenizer
         if downcase
           abbreviation = abbr[w]
         else
-          abbreviation = abbr[UnicodeCaseConverter::downcase(w)]
+          abbreviation = abbr[Unicode.downcase(w)]
         end
         unless abbreviation || w =~ /\A[a-z]\z/i ||
             w =~ /[a-z](?:\.[a-z])+\z/i
@@ -35,11 +35,11 @@ module PragmaticTokenizer
           cleaned_tokens << tokens[i]
         end
       if downcase
-        abbr_included = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
+        abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
       else
-        abbr_included = abbreviations.include?(UnicodeCaseConverter::downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
+        abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
       end
-      if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbr_included
+      if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
         cleaned_tokens[-1] = Regexp.last_match(1)
         cleaned_tokens.push '.'
       end
@@ -94,7 +94,7 @@ module PragmaticTokenizer
         if downcase
           token.split(/(\.)/)[0]
         else
-          UnicodeCaseConverter::downcase(token.split(/(\.)/)[0])
+          Unicode.downcase(token.split(/(\.)/)[0])
         end
       end

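The three hunks above adjust how a trailing full stop is detected: the period is split into its own token unless the final word is a known abbreviation, and the lookup now goes through Unicode.downcase. A minimal usage sketch of that behaviour through the public tokenizer (the exact output is this example's expectation under default options, not a quoted result):

require 'pragmatic_tokenizer'

# "Mrs." is in the abbreviation list, so its period stays attached;
# the sentence-final period after "Smith" is split into its own token.
pt = PragmaticTokenizer::Tokenizer.new(language: :en)
pt.tokenize("Hello Mrs. Smith.")
# expected (with the default downcase: true): ["hello", "mrs.", "smith", "."]
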
@@ -33,7 +33,7 @@ module PragmaticTokenizer
     private

     def remove_non_breaking_space!
-      gsub!(/\u{00A0}/, ''.freeze)
+      gsub!(/\u{00A0}/, ''.freeze)
     end

     # Shift commas off everything but numbers
@@ -1,12 +1,10 @@
 # -*- encoding : utf-8 -*-
-require 'set'
-require 'cgi'
 require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
 require 'pragmatic_tokenizer/full_stop_separator'
 require 'pragmatic_tokenizer/ending_punctuation_separator'
-require 'unicode_case_converter'
+require 'unicode'

 module PragmaticTokenizer
   class Tokenizer
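
The case-conversion backend moves from the unicode_case_converter gem to the unicode gem, so every UnicodeCaseConverter::downcase / ::capitalize call becomes Unicode.downcase / Unicode.capitalize. A minimal sketch of the replacement calls in isolation (the sample strings and expected results are this example's assumption, not taken from the gem's documentation):

require 'unicode'

# Unicode-aware case mapping that plain String#downcase (before Ruby 2.4)
# did not provide for non-ASCII characters.
Unicode.downcase("ÉCOLE")    # expected: "école"
Unicode.capitalize("étude")  # expected: "Étude"
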
@@ -193,7 +191,7 @@ module PragmaticTokenizer
       normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
       return token unless @contractions.key?(normalized)
       result = @contractions[normalized].split(SPACE_STRING)
-      result[0] = UnicodeCaseConverter::capitalize(result[0]) unless @downcase
+      result[0] = Unicode.capitalize(result[0]) unless @downcase
       result
     end

@@ -292,12 +290,12 @@ module PragmaticTokenizer
     end

     def chosen_case(token)
-      @downcase ? UnicodeCaseConverter::downcase(token) : token
+      @downcase ? Unicode.downcase(token) : token
     end

     def inverse_case(token)
-      @downcase ? token : UnicodeCaseConverter::downcase(token)
+      @downcase ? token : Unicode.downcase(token)
     end

   end
-end
+end
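
chosen_case and inverse_case are the internal helpers behind the tokenizer's downcase option; after the swap above they delegate to Unicode.downcase. A short usage sketch of that option through the public API (the listed tokens are what this example expects under those options, not quoted output):

require 'pragmatic_tokenizer'

# downcase: true (the default) lowercases every token via Unicode.downcase;
# downcase: false preserves the original casing.
PragmaticTokenizer::Tokenizer.new(downcase: true).tokenize("Hello WORLD")
# expected: ["hello", "world"]
PragmaticTokenizer::Tokenizer.new(downcase: false).tokenize("Hello WORLD")
# expected: ["Hello", "WORLD"]
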
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "2.2.1".freeze
+  VERSION = "3.0.0".freeze
 end
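
The version constant jumps a major version (2.2.1 → 3.0.0), so applications pinned with a pessimistic constraint will not pick the release up automatically. A sketch of the usual Gemfile update (the constraint shown is an illustrative choice, not a recommendation from the gem):

# Gemfile
gem 'pragmatic_tokenizer', '~> 3.0'
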
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]

-  spec.add_runtime_dependency "unicode_case_converter", "~> 1.0"
+  spec.add_runtime_dependency "unicode"
   spec.add_development_dependency "bundler", "~> 1.9"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec"
@@ -1335,27 +1335,6 @@ describe PragmaticTokenizer do
       )
       expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
     end
-
-    it 'removes non-breaking spaces' do
-      text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast    da hello."
-      pt = PragmaticTokenizer::Tokenizer.new(
-          language: :en,
-          filter_languages: [:en],
-          clean: true,
-          numbers: :none,
-          minimum_length: 3,
-          expand_contractions: true,
-          remove_stop_words: true,
-          punctuation: :none,
-          remove_emails: true,
-          remove_domains: true,
-          remove_urls: true,
-          hashtags: :remove,
-          mentions: :remove,
-          downcase: true
-      )
-      expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
-    end
   end
 end

metadata CHANGED
@@ -1,29 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 2.2.1
+  version: 3.0.0
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-16 00:00:00.000000000 Z
+date: 2016-02-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: unicode_case_converter
+  name: unicode
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement