pragmatic_tokenizer 3.0.3 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/README.md +1 -1
- data/lib/pragmatic_tokenizer/languages.rb +26 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
- data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
- data/lib/pragmatic_tokenizer/regex.rb +149 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +82 -116
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +5 -6
- data/spec/languages/deutsch_spec.rb +1 -1
- data/spec/languages/english_spec.rb +52 -0
- data/spec/languages/french_spec.rb +2 -2
- data/spec/performance_spec.rb +1 -1
- data/spec/spec_helper.rb +1 -1
- metadata +8 -8
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -63
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path('../lib', __FILE__)
|
1
|
+
lib = File.expand_path('lib', __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
3
|
require 'pragmatic_tokenizer/version'
|
5
4
|
|
@@ -9,9 +8,9 @@ Gem::Specification.new do |spec|
|
|
9
8
|
spec.authors = ["Kevin S. Dias"]
|
10
9
|
spec.email = ["diasks2@gmail.com"]
|
11
10
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
11
|
+
spec.summary = 'A multilingual tokenizer'
|
12
|
+
spec.description = 'A multilingual tokenizer to split a string into tokens.'
|
13
|
+
spec.homepage = 'https://github.com/diasks2/pragmatic_tokenizer'
|
15
14
|
|
16
15
|
spec.files = `git ls-files -z`.split("\x0")
|
17
16
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
@@ -20,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
20
19
|
|
21
20
|
spec.add_runtime_dependency "unicode"
|
22
21
|
spec.add_development_dependency "bundler", "~> 1.9"
|
23
|
-
spec.add_development_dependency "rake", "
|
22
|
+
spec.add_development_dependency "rake", ">= 12.3.3"
|
24
23
|
spec.add_development_dependency "rspec"
|
25
24
|
spec.add_development_dependency "stackprof"
|
26
25
|
spec.add_development_dependency "rubocop"
|
@@ -175,7 +175,7 @@ describe PragmaticTokenizer do
|
|
175
175
|
remove_stop_words: true,
|
176
176
|
language: 'de'
|
177
177
|
)
|
178
|
-
expect(pt.tokenize(text)).to eq(["
|
178
|
+
expect(pt.tokenize(text)).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
|
179
179
|
end
|
180
180
|
|
181
181
|
it 'removes English and German stopwords' do
|
@@ -443,6 +443,17 @@ describe PragmaticTokenizer do
|
|
443
443
|
end
|
444
444
|
end
|
445
445
|
|
446
|
+
context 'option (downcase)' do
|
447
|
+
it 'does not downcase URLs' do
|
448
|
+
skip "NOT IMPLEMENTED"
|
449
|
+
text = "Here are some domains and urls GOOGLE.com http://test.com/UPPERCASE."
|
450
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
451
|
+
downcase: :true
|
452
|
+
)
|
453
|
+
expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "GOOGLE.com", "http://test.com/UPPERCASE", "."])
|
454
|
+
end
|
455
|
+
end
|
456
|
+
|
446
457
|
context 'option (domains)' do
|
447
458
|
it 'tokenizes a string #001' do
|
448
459
|
text = "Here are some domains and urls google.com https://www.google.com www.google.com."
|
@@ -485,6 +496,38 @@ describe PragmaticTokenizer do
|
|
485
496
|
end
|
486
497
|
|
487
498
|
context 'option (long_word_split)' do
|
499
|
+
it 'should not split twitter handles' do
|
500
|
+
text = "@john_doe"
|
501
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
502
|
+
long_word_split: 5
|
503
|
+
)
|
504
|
+
expect(pt.tokenize(text)).to eq(["@john_doe"])
|
505
|
+
end
|
506
|
+
|
507
|
+
it 'should not split emails' do
|
508
|
+
text = "john_doe@something.com"
|
509
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
510
|
+
long_word_split: 5
|
511
|
+
)
|
512
|
+
expect(pt.tokenize(text)).to eq(["john_doe@something.com"])
|
513
|
+
end
|
514
|
+
|
515
|
+
it 'should not split emails 2' do
|
516
|
+
text = "john_doe@something.com"
|
517
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
518
|
+
long_word_split: 5
|
519
|
+
)
|
520
|
+
expect(pt.tokenize(text)).to eq(["john_doe@something.com"])
|
521
|
+
end
|
522
|
+
|
523
|
+
it 'should not split urls' do
|
524
|
+
text = "http://test.com/some_path"
|
525
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
526
|
+
long_word_split: 5
|
527
|
+
)
|
528
|
+
expect(pt.tokenize(text)).to eq(["http://test.com/some_path"])
|
529
|
+
end
|
530
|
+
|
488
531
|
it 'tokenizes a string #001' do
|
489
532
|
text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
|
490
533
|
pt = PragmaticTokenizer::Tokenizer.new(
|
@@ -1041,6 +1084,15 @@ describe PragmaticTokenizer do
|
|
1041
1084
|
expect(pt.tokenize(text)).to eq(["short", "sentence", "explanations", "."])
|
1042
1085
|
end
|
1043
1086
|
|
1087
|
+
it 'removes stop words 2' do
|
1088
|
+
text = 'This is a short sentence with explanations and stop words i.e. is a stop word as so is e.g. I think.'
|
1089
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1090
|
+
language: 'en',
|
1091
|
+
remove_stop_words: true
|
1092
|
+
)
|
1093
|
+
expect(pt.tokenize(text)).to eq(["short", "sentence", "explanations", "."])
|
1094
|
+
end
|
1095
|
+
|
1044
1096
|
it 'removes user-supplied stop words' do
|
1045
1097
|
text = 'This is a short sentence with explanations and stop words.'
|
1046
1098
|
pt = PragmaticTokenizer::Tokenizer.new(
|
@@ -3,11 +3,11 @@ require 'spec_helper'
|
|
3
3
|
describe PragmaticTokenizer do
|
4
4
|
context 'Language: French (fr)' do
|
5
5
|
it 'tokenizes a string #001' do
|
6
|
-
text = "
|
6
|
+
text = "D'art de l'univers, c'est un art"
|
7
7
|
pt = PragmaticTokenizer::Tokenizer.new(
|
8
8
|
language: 'fr'
|
9
9
|
)
|
10
|
-
expect(pt.tokenize(text)).to eq(["
|
10
|
+
expect(pt.tokenize(text)).to eq(["d'", "art", "de", "l'", "univers", ",", "c'" ,"est", "un", "art"])
|
11
11
|
end
|
12
12
|
end
|
13
13
|
end
|
data/spec/performance_spec.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
1
|
require 'benchmark'
|
3
2
|
require 'spec_helper'
|
4
3
|
require 'stackprof'
|
@@ -29,6 +28,7 @@ describe PragmaticTokenizer do
|
|
29
28
|
# 24.2
|
30
29
|
# 23.2
|
31
30
|
# 11.6
|
31
|
+
# 12.0
|
32
32
|
# it 'is fast? (long strings)' do
|
33
33
|
# string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
|
34
34
|
# puts "LENGTH: #{string.length}"
|
data/spec/spec_helper.rb
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
$LOAD_PATH.unshift File.expand_path('
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
|
2
2
|
require 'pragmatic_tokenizer'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0
|
4
|
+
version: 3.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -42,16 +42,16 @@ dependencies:
|
|
42
42
|
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 12.3.3
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 12.3.3
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -111,7 +111,6 @@ files:
|
|
111
111
|
- README.md
|
112
112
|
- Rakefile
|
113
113
|
- lib/pragmatic_tokenizer.rb
|
114
|
-
- lib/pragmatic_tokenizer/full_stop_separator.rb
|
115
114
|
- lib/pragmatic_tokenizer/languages.rb
|
116
115
|
- lib/pragmatic_tokenizer/languages/arabic.rb
|
117
116
|
- lib/pragmatic_tokenizer/languages/bulgarian.rb
|
@@ -140,6 +139,7 @@ files:
|
|
140
139
|
- lib/pragmatic_tokenizer/languages/turkish.rb
|
141
140
|
- lib/pragmatic_tokenizer/post_processor.rb
|
142
141
|
- lib/pragmatic_tokenizer/pre_processor.rb
|
142
|
+
- lib/pragmatic_tokenizer/regex.rb
|
143
143
|
- lib/pragmatic_tokenizer/tokenizer.rb
|
144
144
|
- lib/pragmatic_tokenizer/version.rb
|
145
145
|
- pragmatic_tokenizer.gemspec
|
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
169
169
|
version: '0'
|
170
170
|
requirements: []
|
171
171
|
rubyforge_project:
|
172
|
-
rubygems_version: 2.
|
172
|
+
rubygems_version: 2.7.6
|
173
173
|
signing_key:
|
174
174
|
specification_version: 4
|
175
175
|
summary: A multilingual tokenizer
|
@@ -1,63 +0,0 @@
|
|
1
|
-
# -*- encoding : utf-8 -*-
|
2
|
-
|
3
|
-
module PragmaticTokenizer
|
4
|
-
# This class separates true full stops while ignoring
|
5
|
-
# periods that are part of an abbreviation
|
6
|
-
class FullStopSeparator
|
7
|
-
|
8
|
-
REGEXP_ENDS_WITH_DOT = /\A(.+)\.\z/
|
9
|
-
REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
|
10
|
-
REGEXP_UNKNOWN1 = /[a-z](?:\.[a-z])+\z/i
|
11
|
-
REGEXP_UNKNOWN2 = /\A(.*\w)\.\z/
|
12
|
-
DOT = '.'.freeze
|
13
|
-
|
14
|
-
def initialize(tokens:, abbreviations:, downcase:)
|
15
|
-
@tokens = tokens
|
16
|
-
@abbreviations = abbreviations
|
17
|
-
@downcase = downcase
|
18
|
-
end
|
19
|
-
|
20
|
-
def separate
|
21
|
-
create_cleaned_tokens
|
22
|
-
replace_last_token unless @cleaned_tokens.empty?
|
23
|
-
@cleaned_tokens
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
def create_cleaned_tokens
|
29
|
-
@cleaned_tokens = []
|
30
|
-
@tokens.each_with_index do |token, position|
|
31
|
-
if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
|
32
|
-
match = Regexp.last_match(1)
|
33
|
-
if unknown_method1(match)
|
34
|
-
@cleaned_tokens += [match, DOT]
|
35
|
-
next
|
36
|
-
end
|
37
|
-
end
|
38
|
-
@cleaned_tokens << token
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def unknown_method1(token)
|
43
|
-
!abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_UNKNOWN1
|
44
|
-
end
|
45
|
-
|
46
|
-
def abbreviation?(token)
|
47
|
-
@abbreviations.include?(inverse_case(token))
|
48
|
-
end
|
49
|
-
|
50
|
-
def inverse_case(token)
|
51
|
-
@downcase ? token : Unicode.downcase(token)
|
52
|
-
end
|
53
|
-
|
54
|
-
def replace_last_token
|
55
|
-
last_token = @cleaned_tokens[-1]
|
56
|
-
return if abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_UNKNOWN2
|
57
|
-
@cleaned_tokens[-1] = Regexp.last_match(1)
|
58
|
-
@cleaned_tokens << DOT
|
59
|
-
end
|
60
|
-
|
61
|
-
end
|
62
|
-
|
63
|
-
end
|