pragmatic_tokenizer 1.4.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
@@ -4,8 +4,9 @@ describe PragmaticTokenizer do
4
4
  context 'Language: French (fr)' do
5
5
  it 'tokenizes a string #001' do
6
6
  text = "L'art de l'univers, c'est un art"
7
- pt = PragmaticTokenizer::Tokenizer.new(text,
8
- language: 'fr'
7
+ pt = PragmaticTokenizer::Tokenizer.new(
8
+ text,
9
+ language: 'fr'
9
10
  )
10
11
  expect(pt.tokenize).to eq(["l'", "art", "de", "l'", "univers", ",", "c'est", "un", "art"])
11
12
  end
@@ -4,7 +4,6 @@ require 'spec_helper'
4
4
  require 'stackprof'
5
5
 
6
6
  describe PragmaticTokenizer do
7
-
8
7
  # Speed benchmarks tests
9
8
 
10
9
  # it 'is fast?' do
@@ -29,6 +28,8 @@ describe PragmaticTokenizer do
29
28
  # 26.8
30
29
  # 8.2
31
30
  # 9.6
31
+ # 23.25
32
+ # 24.2
32
33
  # it 'is fast? (long strings)' do
33
34
  # string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? 
Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
34
35
  # puts "LENGTH: #{string.length}"
@@ -36,12 +37,11 @@ describe PragmaticTokenizer do
36
37
  # PragmaticTokenizer::Tokenizer.new(string,
37
38
  # language: 'en',
38
39
  # clean: true,
39
- # remove_numbers: true,
40
40
  # minimum_length: 3,
41
41
  # expand_contractions: true,
42
42
  # remove_stop_words: true,
43
- # remove_roman_numerals: true,
44
- # punctuation: 'none'
43
+ # numbers: :none,
44
+ # punctuation: :none
45
45
  # ).tokenize
46
46
  # end
47
47
  # end
@@ -55,8 +55,8 @@ describe PragmaticTokenizer do
55
55
  # end
56
56
  end
57
57
 
58
- def benchmark(&block)
59
- block.call
60
- time = Benchmark.realtime { block.call }
58
+ def benchmark
59
+ yield
60
+ time = Benchmark.realtime { yield }
61
61
  puts "RUNTIME: #{time}"
62
62
  end
@@ -7,35 +7,35 @@ describe PragmaticTokenizer do
7
7
 
8
8
  describe '#initialize' do
9
9
  it 'raises an error if the text argument is nil' do
10
- lambda { expect(PragmaticTokenizer::Tokenizer.new(nil, language: 'en').tokenize).to raise_error }
10
+ -> { expect(PragmaticTokenizer::Tokenizer.new(nil, language: 'en').tokenize).to raise_error }
11
11
  end
12
12
 
13
13
  it 'raises an error if the text argument is empty' do
14
- lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en').tokenize).to raise_error }
14
+ -> { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en').tokenize).to raise_error }
15
15
  end
16
16
 
17
17
  it 'raises an error if minimum_length is not an Integer' do
18
- lambda { expect(PragmaticTokenizer::Tokenizer.new("heelo", minimum_length: "strawberry").tokenize).to raise_error }
18
+ -> { expect(PragmaticTokenizer::Tokenizer.new("heelo", minimum_length: "strawberry").tokenize).to raise_error }
19
19
  end
20
20
 
21
21
  it 'raises an error if long_word_split is not an Integer' do
22
- lambda { expect(PragmaticTokenizer::Tokenizer.new("heeloo", long_word_split: "yes!").tokenize).to raise_error }
22
+ -> { expect(PragmaticTokenizer::Tokenizer.new("heeloo", long_word_split: "yes!").tokenize).to raise_error }
23
23
  end
24
24
 
25
25
  it 'raises an error if the text is not a String' do
26
- lambda { expect(PragmaticTokenizer::Tokenizer.new(5).tokenize).to raise_error }
26
+ -> { expect(PragmaticTokenizer::Tokenizer.new(5).tokenize).to raise_error }
27
27
  end
28
28
 
29
29
  it "raises an error if the punctuation argument is not nil, 'all', 'semi', or 'none'" do
30
- lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', punctuation: 'world').tokenize).to raise_error }
30
+ -> { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', punctuation: 'world').tokenize).to raise_error }
31
31
  end
32
32
 
33
33
  it "raises an error if the numbers argument is not nil, 'all', 'semi', or 'none'" do
34
- lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', numbers: 'world').tokenize).to raise_error }
34
+ -> { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', numbers: 'world').tokenize).to raise_error }
35
35
  end
36
36
 
37
37
  it "raises an error if the mentions argument is not nil, 'all', 'semi', or 'none'" do
38
- lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', mentions: 'world').tokenize).to raise_error }
38
+ -> { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', mentions: 'world').tokenize).to raise_error }
39
39
  end
40
40
  end
41
41
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0
4
+ version: 1.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-22 00:00:00.000000000 Z
11
+ date: 2016-01-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  description: A multilingual tokenizer to split a string into tokens.
84
98
  email:
85
99
  - diasks2@gmail.com
@@ -89,6 +103,8 @@ extra_rdoc_files: []
89
103
  files:
90
104
  - ".gitignore"
91
105
  - ".rspec"
106
+ - ".rubocop.yml"
107
+ - ".rubocop_todo.yml"
92
108
  - ".travis.yml"
93
109
  - Gemfile
94
110
  - LICENSE.txt