pragmatic_tokenizer 2.2.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +4 -4
- data/lib/pragmatic_tokenizer/post_processor.rb +1 -1
- data/lib/pragmatic_tokenizer/pre_processor.rb +1 -1
- data/lib/pragmatic_tokenizer/tokenizer.rb +5 -7
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -1
- data/spec/languages/english_spec.rb +0 -21
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 241faea11370fc685c55a22eae88d9af30fa955c
|
4
|
+
data.tar.gz: 2daa6aa5bae004836538b4bd632067074782a87b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 54f9fb11af6e42f4e35d6a878dae45e5fd0850793671ae7023dd8f8e17f7e307625f9b8497c5912b8c14312235150ad2cd19cf7f15fa693ac7cee427827677ef
|
7
|
+
data.tar.gz: 2ea45b90bfc8df8044ebab404e89e0936ce391e0d7e5206fc60e775ca65cb67af0fabe3ec787855b359ac0712bdece0eaefdb2dc9bcb0635c41fc0ad604a1bdf
|
@@ -23,7 +23,7 @@ module PragmaticTokenizer
|
|
23
23
|
if downcase
|
24
24
|
abbreviation = abbr[w]
|
25
25
|
else
|
26
|
-
abbreviation = abbr[
|
26
|
+
abbreviation = abbr[Unicode.downcase(w)]
|
27
27
|
end
|
28
28
|
unless abbreviation || w =~ /\A[a-z]\z/i ||
|
29
29
|
w =~ /[a-z](?:\.[a-z])+\z/i
|
@@ -35,11 +35,11 @@ module PragmaticTokenizer
|
|
35
35
|
cleaned_tokens << tokens[i]
|
36
36
|
end
|
37
37
|
if downcase
|
38
|
-
|
38
|
+
abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
|
39
39
|
else
|
40
|
-
|
40
|
+
abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
|
41
41
|
end
|
42
|
-
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !
|
42
|
+
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
|
43
43
|
cleaned_tokens[-1] = Regexp.last_match(1)
|
44
44
|
cleaned_tokens.push '.'
|
45
45
|
end
|
@@ -1,12 +1,10 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
-
require 'set'
|
3
|
-
require 'cgi'
|
4
2
|
require 'pragmatic_tokenizer/languages'
|
5
3
|
require 'pragmatic_tokenizer/pre_processor'
|
6
4
|
require 'pragmatic_tokenizer/post_processor'
|
7
5
|
require 'pragmatic_tokenizer/full_stop_separator'
|
8
6
|
require 'pragmatic_tokenizer/ending_punctuation_separator'
|
9
|
-
require '
|
7
|
+
require 'unicode'
|
10
8
|
|
11
9
|
module PragmaticTokenizer
|
12
10
|
class Tokenizer
|
@@ -193,7 +191,7 @@ module PragmaticTokenizer
|
|
193
191
|
normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
|
194
192
|
return token unless @contractions.key?(normalized)
|
195
193
|
result = @contractions[normalized].split(SPACE_STRING)
|
196
|
-
result[0] =
|
194
|
+
result[0] = Unicode.capitalize(result[0]) unless @downcase
|
197
195
|
result
|
198
196
|
end
|
199
197
|
|
@@ -292,12 +290,12 @@ module PragmaticTokenizer
|
|
292
290
|
end
|
293
291
|
|
294
292
|
def chosen_case(token)
|
295
|
-
@downcase ?
|
293
|
+
@downcase ? Unicode.downcase(token) : token
|
296
294
|
end
|
297
295
|
|
298
296
|
def inverse_case(token)
|
299
|
-
@downcase ? token :
|
297
|
+
@downcase ? token : Unicode.downcase(token)
|
300
298
|
end
|
301
299
|
|
302
300
|
end
|
303
|
-
end
|
301
|
+
end
|
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_runtime_dependency "
|
21
|
+
spec.add_runtime_dependency "unicode"
|
22
22
|
spec.add_development_dependency "bundler", "~> 1.9"
|
23
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
24
|
spec.add_development_dependency "rspec"
|
@@ -1335,27 +1335,6 @@ describe PragmaticTokenizer do
|
|
1335
1335
|
)
|
1336
1336
|
expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
|
1337
1337
|
end
|
1338
|
-
|
1339
|
-
it 'removes non-breaking spaces' do
|
1340
|
-
text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast da hello."
|
1341
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1342
|
-
language: :en,
|
1343
|
-
filter_languages: [:en],
|
1344
|
-
clean: true,
|
1345
|
-
numbers: :none,
|
1346
|
-
minimum_length: 3,
|
1347
|
-
expand_contractions: true,
|
1348
|
-
remove_stop_words: true,
|
1349
|
-
punctuation: :none,
|
1350
|
-
remove_emails: true,
|
1351
|
-
remove_domains: true,
|
1352
|
-
remove_urls: true,
|
1353
|
-
hashtags: :remove,
|
1354
|
-
mentions: :remove,
|
1355
|
-
downcase: true
|
1356
|
-
)
|
1357
|
-
expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
|
1358
|
-
end
|
1359
1338
|
end
|
1360
1339
|
end
|
1361
1340
|
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: unicode
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|