pragmatic_tokenizer 3.0.4 → 3.2.0
- checksums.yaml +5 -5
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +150 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +5 -6
- data/spec/languages/english_spec.rb +13 -0
- data/spec/languages/french_spec.rb +2 -2
- data/spec/performance_spec.rb +0 -1
- data/spec/spec_helper.rb +1 -1
- metadata +12 -12
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
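
The public entry point appears unchanged across this release: construct a PragmaticTokenizer::Tokenizer (optionally passing options such as language: or long_word_split:) and call #tokenize, exactly as the updated specs below do. A minimal sketch of that flow; the sample sentence and its expected output follow the conventions of the english_spec examples in this diff:

  require 'pragmatic_tokenizer'

  # Defaults: tokens are downcased and punctuation becomes its own token.
  pt = PragmaticTokenizer::Tokenizer.new
  pt.tokenize("Hello, world.")
  # => ["hello", ",", "world", "."]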
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -1,5 +1,4 @@
-
-lib = File.expand_path('../lib', __FILE__)
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'pragmatic_tokenizer/version'
 
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
   spec.authors       = ["Kevin S. Dias"]
   spec.email         = ["diasks2@gmail.com"]
 
-  spec.summary       =
-  spec.description   =
-  spec.homepage      =
+  spec.summary       = 'A multilingual tokenizer'
+  spec.description   = 'A multilingual tokenizer to split a string into tokens.'
+  spec.homepage      = 'https://github.com/diasks2/pragmatic_tokenizer'
 
   spec.files         = `git ls-files -z`.split("\x0")
   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -20,7 +19,7 @@ Gem::Specification.new do |spec|
 
   spec.add_runtime_dependency "unicode"
   spec.add_development_dependency "bundler", "~> 1.9"
-  spec.add_development_dependency "rake", "
+  spec.add_development_dependency "rake", ">= 12.3.3"
   spec.add_development_dependency "rspec"
   spec.add_development_dependency "stackprof"
   spec.add_development_dependency "rubocop"
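
The one functional dependency change is the rake floor of ">= 12.3.3", the version that fixed the rake command-injection advisory (CVE-2020-8130); it is a development dependency only, so runtime users are unaffected. To pull in the release, a Gemfile line along these lines would work (illustrative; unicode remains the sole runtime dependency):

  gem 'pragmatic_tokenizer', '~> 3.2'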
data/spec/languages/english_spec.rb
CHANGED
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
       expect(pt.tokenize(text)).to eq(["#ab-cd"])
     end
 
+    it 'tokenizes a string #015' do
+      text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
+    end
+
     it 'handles numbers with symbols 2' do
       text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
       pt = PragmaticTokenizer::Tokenizer.new
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
       )
       expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
     end
+    it 'tokenizes something with a slash' do
+      text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
+      pt = PragmaticTokenizer::Tokenizer.new(
+        long_word_split: 1
+      )
+      expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
+    end
   end
 
   context 'option (clean)' do
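
The new slash spec doubles as documentation for the long_word_split option: compound tokens joined by separators such as hyphens or slashes are broken apart once their parts pass the length threshold, so a threshold of 1 splits everything. Lifted directly from the spec above:

  pt = PragmaticTokenizer::Tokenizer.new(long_word_split: 1)
  pt.tokenize("EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity")
  # => ["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"]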
data/spec/languages/french_spec.rb
CHANGED
@@ -3,11 +3,11 @@ require 'spec_helper'
 describe PragmaticTokenizer do
   context 'Language: French (fr)' do
     it 'tokenizes a string #001' do
-      text = "
+      text = "D'art de l'univers, c'est un art"
       pt = PragmaticTokenizer::Tokenizer.new(
         language: 'fr'
       )
-      expect(pt.tokenize(text)).to eq(["
+      expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'", "est", "un", "art"])
     end
   end
 end
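
The rewritten French expectation pins down contraction handling: each elided word keeps its apostrophe and becomes a token of its own. Again lifted from the spec:

  pt = PragmaticTokenizer::Tokenizer.new(language: 'fr')
  pt.tokenize("D'art de l'univers, c'est un art")
  # => ["d'", "art", "de", "l'", "univers", ",", "c'", "est", "un", "art"]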
data/spec/performance_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,2 +1,2 @@
-$LOAD_PATH.unshift File.expand_path('
+$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
 require 'pragmatic_tokenizer'
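
Both path fixes above swap the __FILE__ idiom for Ruby 2.0's __dir__. Since __dir__ already names the containing directory while __FILE__ names the file itself, the __dir__ form needs one fewer '..' segment. A sketch with a hypothetical checkout at /repo (the truncated old line was presumably the classic __FILE__ form):

  # __FILE__ == "/repo/spec/spec_helper.rb", __dir__ == "/repo/spec"
  File.expand_path('../../lib', __FILE__)  # => "/repo/lib"  (old idiom)
  File.expand_path('../lib', __dir__)      # => "/repo/lib"  (new idiom)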
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0
+  version: 3.2.0
 platform: ruby
 authors:
 - Kevin S. Dias
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2020-11-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -42,16 +42,16 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
 - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
 - lib/pragmatic_tokenizer/languages/turkish.rb
 - lib/pragmatic_tokenizer/post_processor.rb
 - lib/pragmatic_tokenizer/pre_processor.rb
+- lib/pragmatic_tokenizer/regex.rb
 - lib/pragmatic_tokenizer/tokenizer.rb
 - lib/pragmatic_tokenizer/version.rb
 - pragmatic_tokenizer.gemspec
@@ -153,7 +153,7 @@ files:
 homepage: https://github.com/diasks2/pragmatic_tokenizer
 licenses: []
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -168,9 +168,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.
-signing_key:
+rubyforge_project:
+rubygems_version: 2.7.6
+signing_key:
 specification_version: 4
 summary: A multilingual tokenizer
 test_files:
data/lib/pragmatic_tokenizer/full_stop_separator.rb
DELETED
@@ -1,62 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates true full stops while ignoring
-  # periods that are part of an abbreviation
-  class FullStopSeparator
-
-    REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
-    REGEXP_ONLY_LETTERS  = /\A[a-z]\z/i
-    REGEXP_ABBREVIATION  = /[a-z](?:\.[a-z])+\z/i
-    DOT = '.'.freeze
-
-    def initialize(tokens:, abbreviations:, downcase:)
-      @tokens        = tokens
-      @abbreviations = abbreviations
-      @downcase      = downcase
-    end
-
-    def separate
-      create_cleaned_tokens
-      replace_last_token unless @cleaned_tokens.empty?
-      @cleaned_tokens
-    end
-
-    private
-
-    def create_cleaned_tokens
-      @cleaned_tokens = []
-      @tokens.each_with_index do |token, position|
-        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
-          match = Regexp.last_match(1)
-          if abbreviation?(match)
-            @cleaned_tokens += [match, DOT]
-            next
-          end
-        end
-        @cleaned_tokens << token
-      end
-    end
-
-    def abbreviation?(token)
-      !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
-    end
-
-    def defined_abbreviation?(token)
-      @abbreviations.include?(inverse_case(token))
-    end
-
-    def inverse_case(token)
-      @downcase ? token : Unicode.downcase(token)
-    end
-
-    def replace_last_token
-      last_token = @cleaned_tokens[-1]
-      return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
-      @cleaned_tokens[-1] = Regexp.last_match(1)
-      @cleaned_tokens << DOT
-    end
-
-  end
-end
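
FullStopSeparator's job, splitting a genuine sentence-final period off the last token while leaving known abbreviations intact, presumably moved into the reworked PostProcessor and the new consolidated regex.rb. For reference, a usage sketch of the removed pre-3.2 class, reconstructed from the code above (token values hypothetical):

  require 'set'

  separator = PragmaticTokenizer::FullStopSeparator.new(
    tokens:        ["mr.", "smith", "arrived."],  # already downcased upstream
    abbreviations: Set.new(["mr"]),               # listed abbreviations keep their dot
    downcase:      true
  )
  separator.separate
  # => ["mr.", "smith", "arrived", "."]  ("mr." survives; only the final dot is split off)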