pragmatic_tokenizer 3.0.4 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +150 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +5 -6
- data/spec/languages/english_spec.rb +13 -0
- data/spec/languages/french_spec.rb +2 -2
- data/spec/performance_spec.rb +0 -1
- data/spec/spec_helper.rb +1 -1
- metadata +12 -12
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -1,5 +1,4 @@
-
-lib = File.expand_path('../lib', __FILE__)
+lib = File.expand_path('lib', __dir__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'pragmatic_tokenizer/version'
 
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
   spec.authors       = ["Kevin S. Dias"]
   spec.email         = ["diasks2@gmail.com"]
 
-  spec.summary       =
-  spec.description   =
-  spec.homepage      =
+  spec.summary       = 'A multilingual tokenizer'
+  spec.description   = 'A multilingual tokenizer to split a string into tokens.'
+  spec.homepage      = 'https://github.com/diasks2/pragmatic_tokenizer'
 
   spec.files         = `git ls-files -z`.split("\x0")
   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -20,7 +19,7 @@ Gem::Specification.new do |spec|
 
   spec.add_runtime_dependency "unicode"
   spec.add_development_dependency "bundler", "~> 1.9"
-  spec.add_development_dependency "rake", "
+  spec.add_development_dependency "rake", ">= 12.3.3"
   spec.add_development_dependency "rspec"
   spec.add_development_dependency "stackprof"
   spec.add_development_dependency "rubocop"
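The only dependency change is the rake development floor moving to ">= 12.3.3", most likely to pull in the fix for rake's command-injection advisory (CVE-2020-8130), which affected rake < 12.3.3. For consumers, picking up this release is an ordinary version bump; a minimal Gemfile sketch (the constraint shown is just an example, not something this diff mandates):

# Gemfile: example pin for the 3.2 series of pragmatic_tokenizer
source 'https://rubygems.org'

gem 'pragmatic_tokenizer', '~> 3.2.0'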
data/spec/languages/english_spec.rb
CHANGED
@@ -88,6 +88,12 @@ describe PragmaticTokenizer do
       expect(pt.tokenize(text)).to eq(["#ab-cd"])
     end
 
+    it 'tokenizes a string #015' do
+      text = "In 2004, he co-founded Palantir Technologies, which offers platforms for finance companies and intelligence, defense, and law enforcement communities to integrate, visualize, and analyze the world's information."
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["in", "2004", ",", "he", "co-founded", "palantir", "technologies", ",", "which", "offers", "platforms", "for", "finance", "companies", "and", "intelligence", ",", "defense", ",", "and", "law", "enforcement", "communities", "to", "integrate", ",", "visualize", ",", "and", "analyze", "the", "world's", "information", "."])
+    end
+
     it 'handles numbers with symbols 2' do
       text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
       pt = PragmaticTokenizer::Tokenizer.new
@@ -543,6 +549,13 @@ describe PragmaticTokenizer do
       )
       expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
     end
+    it 'tokenizes something with a slash' do
+      text = "EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity"
+      pt = PragmaticTokenizer::Tokenizer.new(
+          long_word_split: 1
+      )
+      expect(pt.tokenize(text)).to eq(["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"])
+    end
   end
 
   context 'option (clean)' do
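The new slash test exercises the long_word_split option. A minimal sketch of the same behavior outside RSpec, using only the inputs and expected output from the spec above:

require 'pragmatic_tokenizer'

# long_word_split: 1 causes compounds joined by hyphens or slashes to be
# split into their parts (expected output taken from the spec above)
pt = PragmaticTokenizer::Tokenizer.new(long_word_split: 1)
pt.tokenize("EO/AA/M/F/Veterans/Disability/Sexual Orientation/Gender Identity")
# => ["eo", "aa", "m", "f", "veterans", "disability", "sexual", "orientation", "gender", "identity"]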
data/spec/languages/french_spec.rb
CHANGED
@@ -3,11 +3,11 @@ require 'spec_helper'
 describe PragmaticTokenizer do
   context 'Language: French (fr)' do
     it 'tokenizes a string #001' do
-      text = "
+      text = "D'art de l'univers, c'est un art"
       pt = PragmaticTokenizer::Tokenizer.new(
           language: 'fr'
       )
-      expect(pt.tokenize(text)).to eq(["
+      expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'", "est", "un", "art"])
     end
   end
 end
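For reference, the repaired French expectation as a standalone call (values mirror the spec above; nothing beyond it is assumed):

require 'pragmatic_tokenizer'

# In French mode, elisions such as d', l', and c' keep their apostrophe
# and are split from the following word
pt = PragmaticTokenizer::Tokenizer.new(language: 'fr')
pt.tokenize("D'art de l'univers, c'est un art")
# => ["d'", "art", "de", "l'", "univers", ",", "c'", "est", "un", "art"]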
data/spec/performance_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -1,2 +1,2 @@
-$LOAD_PATH.unshift File.expand_path('
+$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
 require 'pragmatic_tokenizer'
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.4
+  version: 3.2.0
 platform: ruby
 authors:
 - Kevin S. Dias
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2020-11-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -42,16 +42,16 @@ dependencies:
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - ">="
       - !ruby/object:Gem::Version
-        version:
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
 - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -140,6 +139,7 @@ files:
 - lib/pragmatic_tokenizer/languages/turkish.rb
 - lib/pragmatic_tokenizer/post_processor.rb
 - lib/pragmatic_tokenizer/pre_processor.rb
+- lib/pragmatic_tokenizer/regex.rb
 - lib/pragmatic_tokenizer/tokenizer.rb
 - lib/pragmatic_tokenizer/version.rb
 - pragmatic_tokenizer.gemspec
@@ -153,7 +153,7 @@ files:
 homepage: https://github.com/diasks2/pragmatic_tokenizer
 licenses: []
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -168,9 +168,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.
-signing_key:
+rubyforge_project:
+rubygems_version: 2.7.6
+signing_key:
 specification_version: 4
 summary: A multilingual tokenizer
 test_files:
data/lib/pragmatic_tokenizer/full_stop_separator.rb
DELETED
@@ -1,62 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates true full stops while ignoring
-  # periods that are part of an abbreviation
-  class FullStopSeparator
-
-    REGEXP_ENDS_WITH_DOT = /\A(.*\w)\.\z/
-    REGEXP_ONLY_LETTERS  = /\A[a-z]\z/i
-    REGEXP_ABBREVIATION  = /[a-z](?:\.[a-z])+\z/i
-    DOT                  = '.'.freeze
-
-    def initialize(tokens:, abbreviations:, downcase:)
-      @tokens        = tokens
-      @abbreviations = abbreviations
-      @downcase      = downcase
-    end
-
-    def separate
-      create_cleaned_tokens
-      replace_last_token unless @cleaned_tokens.empty?
-      @cleaned_tokens
-    end
-
-    private
-
-      def create_cleaned_tokens
-        @cleaned_tokens = []
-        @tokens.each_with_index do |token, position|
-          if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
-            match = Regexp.last_match(1)
-            if abbreviation?(match)
-              @cleaned_tokens += [match, DOT]
-              next
-            end
-          end
-          @cleaned_tokens << token
-        end
-      end
-
-      def abbreviation?(token)
-        !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
-      end
-
-      def defined_abbreviation?(token)
-        @abbreviations.include?(inverse_case(token))
-      end
-
-      def inverse_case(token)
-        @downcase ? token : Unicode.downcase(token)
-      end
-
-      def replace_last_token
-        last_token = @cleaned_tokens[-1]
-        return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
-        @cleaned_tokens[-1] = Regexp.last_match(1)
-        @cleaned_tokens << DOT
-      end
-
-  end
-
-end
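FullStopSeparator is gone as of this release; the hunk above shows the whole class as it stood in 3.0.4 (period handling now lives elsewhere in the tokenizer, plausibly the new regex.rb, though this diff alone does not confirm where). As a record of what the class did, here is a minimal usage sketch that runs against 3.0.4 only; the token and abbreviation values are invented for illustration:

require 'set'
require 'pragmatic_tokenizer' # 3.0.4, where the class still exists

# Splits the sentence-final period off the last token unless that token
# is a known abbreviation (per the deleted code above).
separator = PragmaticTokenizer::FullStopSeparator.new(
  tokens:        ["hello", "world."],
  abbreviations: Set.new(["mr", "dr"]),
  downcase:      true
)
separator.separate
# => ["hello", "world", "."]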