pragmatic_tokenizer 2.2.1 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +4 -4
- data/lib/pragmatic_tokenizer/post_processor.rb +1 -1
- data/lib/pragmatic_tokenizer/pre_processor.rb +1 -1
- data/lib/pragmatic_tokenizer/tokenizer.rb +5 -7
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -1
- data/spec/languages/english_spec.rb +0 -21
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 241faea11370fc685c55a22eae88d9af30fa955c
|
4
|
+
data.tar.gz: 2daa6aa5bae004836538b4bd632067074782a87b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 54f9fb11af6e42f4e35d6a878dae45e5fd0850793671ae7023dd8f8e17f7e307625f9b8497c5912b8c14312235150ad2cd19cf7f15fa693ac7cee427827677ef
|
7
|
+
data.tar.gz: 2ea45b90bfc8df8044ebab404e89e0936ce391e0d7e5206fc60e775ca65cb67af0fabe3ec787855b359ac0712bdece0eaefdb2dc9bcb0635c41fc0ad604a1bdf
|
data/lib/pragmatic_tokenizer/full_stop_separator.rb
CHANGED
@@ -23,7 +23,7 @@ module PragmaticTokenizer
|
|
23
23
|
if downcase
|
24
24
|
abbreviation = abbr[w]
|
25
25
|
else
|
26
|
-
abbreviation = abbr[
|
26
|
+
abbreviation = abbr[Unicode.downcase(w)]
|
27
27
|
end
|
28
28
|
unless abbreviation || w =~ /\A[a-z]\z/i ||
|
29
29
|
w =~ /[a-z](?:\.[a-z])+\z/i
|
@@ -35,11 +35,11 @@ module PragmaticTokenizer
|
|
35
35
|
cleaned_tokens << tokens[i]
|
36
36
|
end
|
37
37
|
if downcase
|
38
|
-
|
38
|
+
abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
|
39
39
|
else
|
40
|
-
|
40
|
+
abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
|
41
41
|
end
|
42
|
-
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !
|
42
|
+
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
|
43
43
|
cleaned_tokens[-1] = Regexp.last_match(1)
|
44
44
|
cleaned_tokens.push '.'
|
45
45
|
end
|
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -1,12 +1,10 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
-
require 'set'
|
3
|
-
require 'cgi'
|
4
2
|
require 'pragmatic_tokenizer/languages'
|
5
3
|
require 'pragmatic_tokenizer/pre_processor'
|
6
4
|
require 'pragmatic_tokenizer/post_processor'
|
7
5
|
require 'pragmatic_tokenizer/full_stop_separator'
|
8
6
|
require 'pragmatic_tokenizer/ending_punctuation_separator'
|
9
|
-
require '
|
7
|
+
require 'unicode'
|
10
8
|
|
11
9
|
module PragmaticTokenizer
|
12
10
|
class Tokenizer
|
@@ -193,7 +191,7 @@ module PragmaticTokenizer
|
|
193
191
|
normalized = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
|
194
192
|
return token unless @contractions.key?(normalized)
|
195
193
|
result = @contractions[normalized].split(SPACE_STRING)
|
196
|
-
result[0] =
|
194
|
+
result[0] = Unicode.capitalize(result[0]) unless @downcase
|
197
195
|
result
|
198
196
|
end
|
199
197
|
|
@@ -292,12 +290,12 @@ module PragmaticTokenizer
|
|
292
290
|
end
|
293
291
|
|
294
292
|
def chosen_case(token)
|
295
|
-
@downcase ?
|
293
|
+
@downcase ? Unicode.downcase(token) : token
|
296
294
|
end
|
297
295
|
|
298
296
|
def inverse_case(token)
|
299
|
-
@downcase ? token :
|
297
|
+
@downcase ? token : Unicode.downcase(token)
|
300
298
|
end
|
301
299
|
|
302
300
|
end
|
303
|
-
end
|
301
|
+
end
|
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_runtime_dependency "
|
21
|
+
spec.add_runtime_dependency "unicode"
|
22
22
|
spec.add_development_dependency "bundler", "~> 1.9"
|
23
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
24
|
spec.add_development_dependency "rspec"
|
data/spec/languages/english_spec.rb
CHANGED
@@ -1335,27 +1335,6 @@ describe PragmaticTokenizer do
|
|
1335
1335
|
)
|
1336
1336
|
expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
|
1337
1337
|
end
|
1338
|
-
|
1339
|
-
it 'removes non-breaking spaces' do
|
1340
|
-
text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast da hello."
|
1341
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1342
|
-
language: :en,
|
1343
|
-
filter_languages: [:en],
|
1344
|
-
clean: true,
|
1345
|
-
numbers: :none,
|
1346
|
-
minimum_length: 3,
|
1347
|
-
expand_contractions: true,
|
1348
|
-
remove_stop_words: true,
|
1349
|
-
punctuation: :none,
|
1350
|
-
remove_emails: true,
|
1351
|
-
remove_domains: true,
|
1352
|
-
remove_urls: true,
|
1353
|
-
hashtags: :remove,
|
1354
|
-
mentions: :remove,
|
1355
|
-
downcase: true
|
1356
|
-
)
|
1357
|
-
expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
|
1358
|
-
end
|
1359
1338
|
end
|
1360
1339
|
end
|
1361
1340
|
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-02-
|
11
|
+
date: 2016-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: unicode
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|