pragmatic_tokenizer 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ require 'spec_helper'
2
+
3
+ describe PragmaticTokenizer do # French-language tokenization examples
4
+ context 'Language: French (fr)' do
5
+ it 'tokenizes a string #001' do # expected tokens below: "l'" split from the following word, "c'est" kept whole, comma separate, output lowercased
6
+ text = "L'art de l'univers, c'est un art"
7
+ pt = PragmaticTokenizer::Tokenizer.new(text,
8
+ language: 'fr'
9
+ )
10
+ expect(pt.tokenize).to eq(["l'", "art", "de", "l'", "univers", ",", "c'est", "un", "art"])
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,62 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'benchmark'
3
+ require 'spec_helper'
4
+ require 'stackprof'
5
+
6
+ describe PragmaticTokenizer do
7
+
8
+ # Speed benchmark tests
9
+
10
+ # it 'is fast?' do
11
+ # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? 
Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
12
+ # benchmark do
13
+ # 10.times do
14
+ # data = StackProf.run(mode: :cpu, interval: 1000) do
15
+ # PragmaticTokenizer::Tokenizer.new(string * 100,
16
+ # language: 'en',
17
+ # clean: true,
18
+ # remove_numbers: true,
19
+ # minimum_length: 3,
20
+ # expand_contractions: true,
21
+ # remove_stop_words: true
22
+ # ).tokenize
23
+ # end
24
+ # puts StackProf::Report.new(data).print_text
25
+ # end
26
+ # end
27
+ # end
28
+
29
+ # 26.8
30
+ # 8.2
31
+ # 9.6
32
+ # it 'is fast? (long strings)' do
33
+ # string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? 
Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
34
+ # puts "LENGTH: #{string.length}"
35
+ # benchmark do
36
+ # PragmaticTokenizer::Tokenizer.new(string,
37
+ # language: 'en',
38
+ # clean: true,
39
+ # remove_numbers: true,
40
+ # minimum_length: 3,
41
+ # expand_contractions: true,
42
+ # remove_stop_words: true,
43
+ # remove_roman_numerals: true,
44
+ # punctuation: 'none'
45
+ # ).tokenize
46
+ # end
47
+ # end
48
+
49
+ # it 'is the baseline' do
50
+ # string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? 
Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
51
+ # puts "LENGTH: #{string.length}"
52
+ # benchmark do
53
+ # string.split(' ')
54
+ # end
55
+ # end
56
+ end
57
+
58
+ def benchmark(&block) # Calls the given block twice and prints the wall-clock time of the second call.
59
+ block.call # first call is not timed (presumably a warm-up run)
60
+ time = Benchmark.realtime { block.call } # elapsed real time of the second call
61
+ puts "RUNTIME: #{time}"
62
+ end
@@ -0,0 +1,41 @@
1
+ require 'spec_helper'
2
+
3
+ describe PragmaticTokenizer do
4
+ it 'has a version number' do
5
+ expect(PragmaticTokenizer::VERSION).not_to be nil
6
+ end
7
+
8
+ describe '#initialize' do
9
+ it 'raises an error if the text argument is nil' do
10
+ lambda { expect(PragmaticTokenizer::Tokenizer.new(nil, language: 'en').tokenize).to raise_error }
11
+ end
12
+
13
+ it 'raises an error if the text argument is empty' do
14
+ lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en').tokenize).to raise_error }
15
+ end
16
+
17
+ it 'raises an error if minimum_length is not an Integer' do
18
+ lambda { expect(PragmaticTokenizer::Tokenizer.new("heelo", minimum_length: "strawberry").tokenize).to raise_error }
19
+ end
20
+
21
+ it 'raises an error if long_word_split is not an Integer' do
22
+ lambda { expect(PragmaticTokenizer::Tokenizer.new("heeloo", long_word_split: "yes!").tokenize).to raise_error }
23
+ end
24
+
25
+ it 'raises an error if the text is not a String' do
26
+ lambda { expect(PragmaticTokenizer::Tokenizer.new(5).tokenize).to raise_error }
27
+ end
28
+
29
+ it "raises an error if the punctuation argument is not nil, 'all', 'semi', or 'none'" do
30
+ lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', punctuation: 'world').tokenize).to raise_error }
31
+ end
32
+
33
+ it "raises an error if the numbers argument is not nil, 'all', 'semi', or 'none'" do
34
+ lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', numbers: 'world').tokenize).to raise_error }
35
+ end
36
+
37
+ it "raises an error if the mentions argument is not nil, 'all', 'semi', or 'none'" do
38
+ lambda { expect(PragmaticTokenizer::Tokenizer.new('', language: 'en', mentions: 'world').tokenize).to raise_error }
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,2 @@
1
+ $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2
+ require 'pragmatic_tokenizer'
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.1
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
- bindir: exe
9
+ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2016-01-22 00:00:00.000000000 Z
12
12
  dependencies:
@@ -94,8 +94,6 @@ files:
94
94
  - LICENSE.txt
95
95
  - README.md
96
96
  - Rakefile
97
- - bin/console
98
- - bin/setup
99
97
  - lib/pragmatic_tokenizer.rb
100
98
  - lib/pragmatic_tokenizer/ending_punctuation_separator.rb
101
99
  - lib/pragmatic_tokenizer/full_stop_separator.rb
@@ -130,6 +128,13 @@ files:
130
128
  - lib/pragmatic_tokenizer/tokenizer.rb
131
129
  - lib/pragmatic_tokenizer/version.rb
132
130
  - pragmatic_tokenizer.gemspec
131
+ - spec/languages/bulgarian_spec.rb
132
+ - spec/languages/deutsch_spec.rb
133
+ - spec/languages/english_spec.rb
134
+ - spec/languages/french_spec.rb
135
+ - spec/performance_spec.rb
136
+ - spec/pragmatic_tokenizer_spec.rb
137
+ - spec/spec_helper.rb
133
138
  homepage: https://github.com/diasks2/pragmatic_tokenizer
134
139
  licenses: []
135
140
  metadata: {}
@@ -153,4 +158,11 @@ rubygems_version: 2.4.1
153
158
  signing_key:
154
159
  specification_version: 4
155
160
  summary: A multilingual tokenizer
156
- test_files: []
161
+ test_files:
162
+ - spec/languages/bulgarian_spec.rb
163
+ - spec/languages/deutsch_spec.rb
164
+ - spec/languages/english_spec.rb
165
+ - spec/languages/french_spec.rb
166
+ - spec/performance_spec.rb
167
+ - spec/pragmatic_tokenizer_spec.rb
168
+ - spec/spec_helper.rb
data/bin/console DELETED
@@ -1,14 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require "bundler/setup"
4
- require "pragmatic_tokenizer"
5
-
6
- # You can add fixtures and/or initialization code here to make experimenting
7
- # with your gem easier. You can also use a different console, if you like.
8
-
9
- # (If you use this, don't forget to add pry to your Gemfile!)
10
- # require "pry"
11
- # Pry.start
12
-
13
- require "irb"
14
- IRB.start
data/bin/setup DELETED
@@ -1,7 +0,0 @@
1
- #!/bin/bash
2
- set -euo pipefail
3
- IFS=$'\n\t'
4
-
5
- bundle install
6
-
7
- # Do any other automated setup that you need to do here