gtokenizer 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
# Use HTTPS so the gem source cannot be tampered with in transit
# (rubygems.org redirects plain HTTP anyway).
source "https://rubygems.org"

# Specify your gem's dependencies in gtokenizer.gemspec
gemspec
@@ -0,0 +1,85 @@
1
+ # GTokenizer
2
+
3
+ **GTokenizer** recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated _Science_ paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
4
+
5
+ ## Installation
6
+
7
+ Install the gem via rubygems.
8
+
9
+ gem install gtokenizer
10
+
11
+ ## Usage
12
+
13
+ You can extract tokens from a string using the `GTokenizer.parse` method.
14
+
15
+ ```ruby
16
+ require 'gtokenizer'
17
+
18
+ #---
19
+
20
+ GTokenizer.parse("I saw the man with the telescope")
21
+ #=> ["I", "saw", "the", "man", "with", "the", "telescope"]
22
+ ```
23
+
24
+ ## How does the tokenizer work?
25
+
26
+ In general, 1 word = 1 token.
27
+
28
+ ```ruby
29
+ GTokenizer.parse("Hello world")
30
+ #=> ["Hello", "world"]
31
+ ```
32
+
33
+ Punctuation is usually tokenized separately.
34
+
35
+ ```ruby
36
+ GTokenizer.parse("Hello world?")
37
+ #=> ["Hello", "world", "?"]
38
+ ```
39
+
40
+ Hyphenated words thus create three tokens.
41
+
42
+ ```ruby
43
+ GTokenizer.parse("Good-day world")
44
+ #=> ["Good", "-", "day", "world"]
45
+ ```
46
+
47
+ & and _ are not treated as separate tokens.
48
+
49
+ ```ruby
50
+ GTokenizer.parse("HKEY_LOCAL_MACHINE AT&T")
51
+ #=> ["HKEY_LOCAL_MACHINE", "AT&T"]
52
+ ```
53
+
54
+ Prices are treated as a single token.
55
+
56
+ ```ruby
57
+ GTokenizer.parse("$9.99")
58
+ #=> ["$9.99"]
59
+ ```
60
+
61
+ For a fuller understanding of the mechanics of the tokenizer, see the specs, or Google's detailed outline in _Science_ (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
62
+
63
+ ## License
64
+
65
+ GTokenizer is licensed under the [MIT License](http://creativecommons.org/licenses/MIT/):
66
+
67
+ Copyright (c) 2011 Alex Peattie (http://www.alexpeattie.com)
68
+
69
+ Permission is hereby granted, free of charge, to any person obtaining a copy
70
+ of this software and associated documentation files (the "Software"), to deal
71
+ in the Software without restriction, including without limitation the rights
72
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
73
+ copies of the Software, and to permit persons to whom the Software is
74
+ furnished to do so, subject to the following conditions:
75
+
76
+ The above copyright notice and this permission notice shall be included in
77
+ all copies or substantial portions of the Software.
78
+
79
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
83
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
84
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
85
+ THE SOFTWARE.
@@ -0,0 +1,9 @@
require 'bundler/gem_tasks'

require 'rspec/core'
require 'rspec/core/rake_task'

# `rake spec` runs the RSpec suite; it is also the default task.
RSpec::Core::RakeTask.new(:spec) do |t|
  t.pattern = FileList['spec/**/*_spec.rb']
end

task :default => :spec
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "gtokenizer/version"

# Gem specification for gtokenizer. File lists are pulled from `git ls-files`
# so the packaged gem always matches the repository contents.
Gem::Specification.new do |s|
  s.name          = "gtokenizer"
  s.version       = GTokenizer::VERSION
  s.authors       = ["Alex Peattie"]
  s.email         = ["alexpeattie@gmail.com"]
  s.homepage      = "https://github.com/alexpeattie/gtokenizer"
  s.summary       = "A very simple string tokenizer, based on the one used by Google for their Google NGrams app (http://ngrams.googlelabs.com/)"
  s.description   = "GTokenizer recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)"

  s.add_development_dependency "rspec"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,11 @@
1
+ require "gtokenizer/version"
2
+
# GTokenizer splits a string into word/punctuation tokens, mimicking the
# tokenizer used for the Google NGrams corpus.
module GTokenizer
  # Alternatives, tried left to right:
  #   [A-GJXa-gjx]#          -- letter + '#' kept together (e.g. "C#", "x#")
  #   \$?[0-9.]+             -- numbers and prices as one token (e.g. "$9.99")
  #   [a-zA-Z&_0-9]+('s)?\+* -- words; '&' and '_' stay inside the word,
  #                             a possessive 's and trailing '+'s are kept
  #   [...]                  -- any single punctuation character
  PARSER_REGEX = /([A-GJXa-gjx]#|\$?[0-9.]+|[a-zA-Z&_0-9]+('s)?\+*|[\-.!@%^*()\[\]\={"'\\}|:;<,>?\/~`.\$#+])/

  # Tokenize +input+ (a String) and return the tokens as an Array of Strings.
  def self.parse(input)
    input.scan(PARSER_REGEX).map(&:first)
  end
end
@@ -0,0 +1,3 @@
module GTokenizer
  # Gem version; read by the gemspec at build time.
  VERSION = "1.0.0"
end
@@ -0,0 +1,43 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Behavioral specs for GTokenizer.parse, mirroring the tokenization rules
# described in the README and the Google NGrams paper.
describe "GTokenizer" do
  it "tokenizes words" do
    tokens = GTokenizer::parse("I saw the man with the telescope")
    tokens.should == ["I", "saw", "the", "man", "with", "the", "telescope"]
  end

  it "tokenizes special characters" do
    tokens = GTokenizer::parse("mail@domain.com !%^*()[]\\-={\"}|:;<,>?\/~`.$#+")
    tokens.should == ["mail", "@", "domain", ".", "com", "!", "%", "^", "*", "(", ")", "[", "]", "\\", "-", "=", "{", "\"", "}", "|", ":", ";", "<", ",", ">", "?", "/", "~", "`", ".", "$", "#", "+"]
  end

  it "tokenizes apostrophes that don't precede an s" do
    tokens = GTokenizer::parse("It's obvious that the parents' meeting won't be useful")
    tokens.should == ["It's", "obvious", "that", "the", "parents", "'", "meeting", "won", "'", "t", "be", "useful"]
  end

  # Description fixed: this example exercises ampersands (AT&T), not apostrophes.
  it "doesn't tokenize underscores and ampersands" do
    tokens = GTokenizer::parse("HKEY_LOCAL_MACHINE AT&T")
    tokens.should == ["HKEY_LOCAL_MACHINE", "AT&T"]
  end

  it "tokenizes numbers and prices as a single token" do
    tokens = GTokenizer::parse("1.5 feet of tape costs $1.99")
    tokens.should == ["1.5", "feet", "of", "tape", "costs", "$1.99"]
  end

  # Description fixed: the '#' comes after the letter ("following", not "proceeding").
  it "doesn't tokenize # following the letters a-g, j or x" do
    tokens = GTokenizer::parse("A# b# C# d# E# g# J# and X# but not Q#")
    tokens.should == ["A#", "b#", "C#", "d#", "E#", "g#", "J#", "and", "X#", "but", "not", "Q", "#"]
  end

  # Description fixed: the '+' comes after the sequence ("following", not "proceeding").
  it "doesn't tokenize + following an alphanumeric sequence" do
    tokens = GTokenizer::parse("C++ or Na2+ but not 1+2")
    tokens.should == ["C++", "or", "Na2+", "but", "not", "1", "+", "2"]
  end

  it "tokenizes a string with a mixture of words, numbers, special characters" do
    tokens = GTokenizer::parse("Use C++, it's better than C# and it only costs $2.99!! Visit http://www.cplusplus.com/more_info_1 to learn more.")
    tokens.should == ["Use", "C++", ",", "it's", "better", "than", "C#", "and", "it", "only", "costs", "$2.99", "!", "!", "Visit", "http", ":", "/", "/", "www", ".", "cplusplus", ".", "com", "/", "more_info_1", "to", "learn", "more", "."]
  end
end
@@ -0,0 +1,8 @@
# Put lib/ and spec/ on the load path so specs can `require 'gtokenizer'`.
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
$LOAD_PATH.unshift(lib_dir)
$LOAD_PATH.unshift(File.dirname(__FILE__))

require 'rspec'
require 'gtokenizer'

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each { |f| require f }
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gtokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Alex Peattie
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-07-02 00:00:00.000000000 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ requirement: &25955988 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *25955988
26
+ description: GTokenizer recreates the closed-source tokenization library used by Google
27
+ for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information
28
+ in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)
29
+ email:
30
+ - alexpeattie@gmail.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - .gitignore
36
+ - .rspec
37
+ - Gemfile
38
+ - README.md
39
+ - Rakefile
40
+ - gtokenizer.gemspec
41
+ - lib/gtokenizer.rb
42
+ - lib/gtokenizer/version.rb
43
+ - spec/gtokenizer_spec.rb
44
+ - spec/spec_helper.rb
45
+ has_rdoc: true
46
+ homepage: https://github.com/alexpeattie/gtokenizer
47
+ licenses: []
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ! '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 1.5.2
67
+ signing_key:
68
+ specification_version: 3
69
+ summary: A very simple string tokenizer, based on the one used by Google for their
70
+ Google NGrams app (http://ngrams.googlelabs.com/)
71
+ test_files: []