pragmatic_tokenizer 2.2.1 → 3.0.0

This diff reflects the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and shows the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ee933fd568e4ccb0af6034488d9bd0ea15288d25
-  data.tar.gz: 90acbd94ecf4fb8f0ce3671cd1136f8ce309d676
+  metadata.gz: 241faea11370fc685c55a22eae88d9af30fa955c
+  data.tar.gz: 2daa6aa5bae004836538b4bd632067074782a87b
 SHA512:
-  metadata.gz: 896110659d02729735f16d9c740573c4cfb3367021d3e248d8e8421d64440d8f24c8a3020273f026eb5f7dfcb007f9e3ffa14f298c39988f0477b9142cbd9829
-  data.tar.gz: 69ca7908471f37a03ed9953f547fb36043d43c351b29b49c73995bfdcec361f6edaa2a67001a79b8e33b8c4b80d9d7f3b02277a1174b7bc28ebfe8a1715473b0
+  metadata.gz: 54f9fb11af6e42f4e35d6a878dae45e5fd0850793671ae7023dd8f8e17f7e307625f9b8497c5912b8c14312235150ad2cd19cf7f15fa693ac7cee427827677ef
+  data.tar.gz: 2ea45b90bfc8df8044ebab404e89e0936ce391e0d7e5206fc60e775ca65cb67af0fabe3ec787855b359ac0712bdece0eaefdb2dc9bcb0635c41fc0ad604a1bdf
@@ -23,7 +23,7 @@ module PragmaticTokenizer
         if downcase
           abbreviation = abbr[w]
         else
-          abbreviation = abbr[UnicodeCaseConverter::downcase(w)]
+          abbreviation = abbr[Unicode.downcase(w)]
         end
         unless abbreviation || w =~ /\A[a-z]\z/i ||
             w =~ /[a-z](?:\.[a-z])+\z/i
@@ -35,11 +35,11 @@ module PragmaticTokenizer
         cleaned_tokens << tokens[i]
       end
       if downcase
-        abbr_included = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
+        abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
       else
-        abbr_included = abbreviations.include?(UnicodeCaseConverter::downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
+        abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
       end
-      if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbr_included
+      if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
         cleaned_tokens[-1] = Regexp.last_match(1)
         cleaned_tokens.push '.'
       end
@@ -94,7 +94,7 @@ module PragmaticTokenizer
       if downcase
         token.split(/(\.)/)[0]
       else
-        UnicodeCaseConverter::downcase(token.split(/(\.)/)[0])
+        Unicode.downcase(token.split(/(\.)/)[0])
       end
     end

@@ -33,7 +33,7 @@ module PragmaticTokenizer
     private

     def remove_non_breaking_space!
-      gsub!(/\u{00A0}/, ''.freeze)
+      gsub!(/\u{00A0}/, ''.freeze)
     end

     # Shift commas off everything but numbers
@@ -1,12 +1,10 @@
 # -*- encoding : utf-8 -*-
-require 'set'
-require 'cgi'
 require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
 require 'pragmatic_tokenizer/full_stop_separator'
 require 'pragmatic_tokenizer/ending_punctuation_separator'
-require 'unicode_case_converter'
+require 'unicode'

 module PragmaticTokenizer
   class Tokenizer
@@ -193,7 +191,7 @@ module PragmaticTokenizer
       normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
       return token unless @contractions.key?(normalized)
       result = @contractions[normalized].split(SPACE_STRING)
-      result[0] = UnicodeCaseConverter::capitalize(result[0]) unless @downcase
+      result[0] = Unicode.capitalize(result[0]) unless @downcase
       result
     end

@@ -292,12 +290,12 @@ module PragmaticTokenizer
     end

     def chosen_case(token)
-      @downcase ? UnicodeCaseConverter::downcase(token) : token
+      @downcase ? Unicode.downcase(token) : token
     end

     def inverse_case(token)
-      @downcase ? token : UnicodeCaseConverter::downcase(token)
+      @downcase ? token : Unicode.downcase(token)
     end

   end
-end
+end
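
Taken together, these hunks swap the unicode_case_converter gem for the unicode gem: UnicodeCaseConverter::downcase and UnicodeCaseConverter::capitalize become Unicode.downcase and Unicode.capitalize. A minimal sketch of the new calls in isolation (the sample strings are illustrative, not taken from the gem's test suite):

# -*- encoding : utf-8 -*-
require 'unicode' # runtime dependency introduced in 3.0.0

# Unicode-aware case mapping, as used by chosen_case/inverse_case above.
# Plain String#downcase on the Rubies of this era only folds ASCII letters,
# which is why the tokenizer delegates to a helper gem.
puts Unicode.downcase("Österreich")  # => "österreich"
puts Unicode.capitalize("éclair")    # => "Éclair"
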
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "2.2.1".freeze
+  VERSION = "3.0.0".freeze
 end
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]

-  spec.add_runtime_dependency "unicode_case_converter", "~> 1.0"
+  spec.add_runtime_dependency "unicode"
   spec.add_development_dependency "bundler", "~> 1.9"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec"
@@ -1335,27 +1335,6 @@ describe PragmaticTokenizer do
       )
       expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
     end
-
-    it 'removes non-breaking spaces' do
-      text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast    da hello."
-      pt = PragmaticTokenizer::Tokenizer.new(
-          language: :en,
-          filter_languages: [:en],
-          clean: true,
-          numbers: :none,
-          minimum_length: 3,
-          expand_contractions: true,
-          remove_stop_words: true,
-          punctuation: :none,
-          remove_emails: true,
-          remove_domains: true,
-          remove_urls: true,
-          hashtags: :remove,
-          mentions: :remove,
-          downcase: true
-      )
-      expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
-    end
   end
 end

metadata CHANGED
@@ -1,29 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 2.2.1
+  version: 3.0.0
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-16 00:00:00.000000000 Z
+date: 2016-02-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: unicode_case_converter
+  name: unicode
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement