gtokenizer 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/README.md +85 -0
- data/Rakefile +9 -0
- data/gtokenizer.gemspec +20 -0
- data/lib/gtokenizer.rb +11 -0
- data/lib/gtokenizer/version.rb +3 -0
- data/spec/gtokenizer_spec.rb +43 -0
- data/spec/spec_helper.rb +8 -0
- metadata +71 -0
data/.gitignore
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# GTokenizer
|
2
|
+
|
3
|
+
**GTokenizer** recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated _Science_ paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Install the gem via rubygems.
|
8
|
+
|
9
|
+
gem install gtokenizer
|
10
|
+
|
11
|
+
## Usage
|
12
|
+
|
13
|
+
You can extract tokens from a string using the `GTokenizer.parse` method.
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
require 'gtokenizer'
|
17
|
+
|
18
|
+
#---
|
19
|
+
|
20
|
+
GTokenizer.parse("I saw the man with the telescope")
|
21
|
+
#=> ["I", "saw", "the", "man", "with", "the", "telescope"]
|
22
|
+
```
|
23
|
+
|
24
|
+
## How does the tokenizer work?
|
25
|
+
|
26
|
+
In general, 1 word = 1 token.
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
GTokenizer.parse("Hello world")
|
30
|
+
#=> ["Hello", "world"]
|
31
|
+
```
|
32
|
+
|
33
|
+
Punctuation is usually tokenized separately.
|
34
|
+
|
35
|
+
```ruby
|
36
|
+
GTokenizer.parse("Hello world?")
|
37
|
+
#=> ["Hello", "world", "?"]
|
38
|
+
```
|
39
|
+
|
40
|
+
Hyphenated words thus create three tokens.
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
GTokenizer.parse("Good-day world")
|
44
|
+
#=> ["Good", "-", "day", "world"]
|
45
|
+
```
|
46
|
+
|
47
|
+
& and _ are not treated as separate tokens.
|
48
|
+
|
49
|
+
```ruby
|
50
|
+
GTokenizer.parse("HKEY_LOCAL_MACHINE AT&T")
|
51
|
+
#=> ["HKEY_LOCAL_MACHINE", "AT&T"]
|
52
|
+
```
|
53
|
+
|
54
|
+
Prices are treated as a single token.
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
GTokenizer.parse("$9.99")
|
58
|
+
#=> ["$9.99"]
|
59
|
+
```
|
60
|
+
|
61
|
+
For a fuller understanding of the mechanics of the tokenizer, see the specs, or Google's detailed outline in _Science_ (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
|
62
|
+
|
63
|
+
## License
|
64
|
+
|
65
|
+
GTokenizer is licensed under the [MIT License](http://creativecommons.org/licenses/MIT/):
|
66
|
+
|
67
|
+
Copyright (c) 2011 Alex Peattie (http://www.alexpeattie.com)
|
68
|
+
|
69
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
70
|
+
of this software and associated documentation files (the "Software"), to deal
|
71
|
+
in the Software without restriction, including without limitation the rights
|
72
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
73
|
+
copies of the Software, and to permit persons to whom the Software is
|
74
|
+
furnished to do so, subject to the following conditions:
|
75
|
+
|
76
|
+
The above copyright notice and this permission notice shall be included in
|
77
|
+
all copies or substantial portions of the Software.
|
78
|
+
|
79
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
80
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
81
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
82
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
83
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
84
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
85
|
+
THE SOFTWARE.
|
data/Rakefile
ADDED
data/gtokenizer.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for gtokenizer. The version constant lives in
# lib/gtokenizer/version.rb, so lib/ is pushed onto the load path first.
$:.push File.expand_path("../lib", __FILE__)
require "gtokenizer/version"

Gem::Specification.new do |s|
  s.name = "gtokenizer"
  s.version = GTokenizer::VERSION
  s.authors = ["Alex Peattie"]
  s.email = ["alexpeattie@gmail.com"]
  s.homepage = "https://github.com/alexpeattie/gtokenizer"
  s.summary = "A very simple string tokenizer, based on the one used by Google for their Google NGrams app (http://ngrams.googlelabs.com/)"
  s.description = "GTokenizer recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)"

  # rspec is only needed to run the specs, not at runtime.
  s.add_development_dependency "rspec"

  # File lists come from git's index, so only tracked files are packaged.
  # NOTE(review): this requires the gem to be built from inside the git repo.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
data/lib/gtokenizer.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require "gtokenizer/version"
|
2
|
+
|
3
|
+
module GTokenizer
  # Matches one token per scan, with the alternatives tried in order:
  #   1. a letter in A-G, J or X (either case) immediately followed by '#'
  #      (musical/language names such as "C#"), kept as one token
  #   2. an optionally-$-prefixed run of digits and dots ("1.5", "$9.99")
  #   3. a word of letters, digits, '&' or '_', optionally ending in "'s"
  #      and/or a run of '+' signs ("AT&T", "parents'" -> "parents", "C++")
  #   4. any single punctuation character as its own token
  PARSER_REGEX = /([A-GJXa-gjx]#|\$?[0-9.]+|[a-zA-Z&_0-9]+('s)?\+*|[\-.!@%^*()\[\]\={"'\\}|:;<,>?\/~`.\$#+])/

  # Tokenizes +input+ (a String) and returns an Array of token Strings.
  # Each scan match yields its capture groups; only the outer (first)
  # group holds the token text, so the inner ('s) group is discarded.
  def self.parse(input)
    input.scan(PARSER_REGEX).map(&:first)
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Specs for GTokenizer.parse. Fixed defective descriptions: the fourth
# example exercises ampersands (AT&T), not apostrophes, and "proceeding"
# was the wrong word for a character that *follows* the letters/sequence.
describe "GTokenizer" do
  it "tokenizes words" do
    tokens = GTokenizer::parse("I saw the man with the telescope")
    tokens.should == ["I", "saw", "the", "man", "with", "the", "telescope"]
  end

  it "tokenizes special characters" do
    tokens = GTokenizer::parse("mail@domain.com !%^*()[]\\-={\"}|:;<,>?\/~`.$#+")
    tokens.should == ["mail", "@", "domain", ".", "com", "!", "%", "^", "*", "(", ")", "[", "]", "\\", "-", "=", "{", "\"", "}", "|", ":", ";", "<", ",", ">", "?", "/", "~", "`", ".", "$", "#", "+"]
  end

  it "tokenizes apostrophes that don't precede an s" do
    tokens = GTokenizer::parse("It's obvious that the parents' meeting won't be useful")
    tokens.should == ["It's", "obvious", "that", "the", "parents", "'", "meeting", "won", "'", "t", "be", "useful"]
  end

  it "doesn't tokenize underscores and ampersands" do
    tokens = GTokenizer::parse("HKEY_LOCAL_MACHINE AT&T")
    tokens.should == ["HKEY_LOCAL_MACHINE", "AT&T"]
  end

  it "tokenizes numbers and prices as a single token" do
    tokens = GTokenizer::parse("1.5 feet of tape costs $1.99")
    tokens.should == ["1.5", "feet", "of", "tape", "costs", "$1.99"]
  end

  it "doesn't tokenize # following the letters a-g, j or x" do
    tokens = GTokenizer::parse("A# b# C# d# E# g# J# and X# but not Q#")
    tokens.should == ["A#", "b#", "C#", "d#", "E#", "g#", "J#", "and", "X#", "but", "not", "Q", "#"]
  end

  it "doesn't tokenize + following an alphanumeric sequence" do
    tokens = GTokenizer::parse("C++ or Na2+ but not 1+2")
    tokens.should == ["C++", "or", "Na2+", "but", "not", "1", "+", "2"]
  end

  it "tokenizes a string with a mixture of words, numbers, special characters" do
    tokens = GTokenizer::parse("Use C++, it's better than C# and it only costs $2.99!! Visit http://www.cplusplus.com/more_info_1 to learn more.")
    tokens.should == ["Use", "C++", ",", "it's", "better", "than", "C#", "and", "it", "only", "costs", "$2.99", "!", "!", "Visit", "http", ":", "/", "/", "www", ".", "cplusplus", ".", "com", "/", "more_info_1", "to", "learn", "more", "."]
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# Spec bootstrap: make the gem's lib/ directory and this spec directory
# loadable, then pull in rspec and the library under test.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'rspec'
require 'gtokenizer'

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gtokenizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Alex Peattie
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-07-02 00:00:00.000000000 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
requirement: &25955988 !ruby/object:Gem::Requirement
|
18
|
+
none: false
|
19
|
+
requirements:
|
20
|
+
- - ! '>='
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '0'
|
23
|
+
type: :development
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: *25955988
|
26
|
+
description: GTokenizer recreates the closed-source tokenization library used by Google
|
27
|
+
for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information
|
28
|
+
in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)
|
29
|
+
email:
|
30
|
+
- alexpeattie@gmail.com
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- .gitignore
|
36
|
+
- .rspec
|
37
|
+
- Gemfile
|
38
|
+
- README.md
|
39
|
+
- Rakefile
|
40
|
+
- gtokenizer.gemspec
|
41
|
+
- lib/gtokenizer.rb
|
42
|
+
- lib/gtokenizer/version.rb
|
43
|
+
- spec/gtokenizer_spec.rb
|
44
|
+
- spec/spec_helper.rb
|
45
|
+
has_rdoc: true
|
46
|
+
homepage: https://github.com/alexpeattie/gtokenizer
|
47
|
+
licenses: []
|
48
|
+
post_install_message:
|
49
|
+
rdoc_options: []
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ! '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ! '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubyforge_project:
|
66
|
+
rubygems_version: 1.5.2
|
67
|
+
signing_key:
|
68
|
+
specification_version: 3
|
69
|
+
summary: A very simple string tokenizer, based on the one used by Google for their
|
70
|
+
Google NGrams app (http://ngrams.googlelabs.com/)
|
71
|
+
test_files: []
|