gtokenizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
# Use the secure (TLS) RubyGems endpoint; plain http is vulnerable to
# man-in-the-middle tampering of downloaded gems.
source "https://rubygems.org"

# Specify your gem's dependencies in gtokenizer.gemspec
gemspec
@@ -0,0 +1,85 @@
1
+ # GTokenizer
2
+
3
+ **GTokenizer** recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated _Science_ paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
4
+
5
+ ## Installation
6
+
7
+ Install the gem via rubygems.
8
+
9
+ gem install gtokenizer
10
+
11
+ ## Usage
12
+
13
+ You can extract tokens from a string using the `GTokenizer.parse` method.
14
+
15
+ ```ruby
16
+ require 'gtokenizer'
17
+
18
+ #---
19
+
20
+ GTokenizer.parse("I saw the man with the telescope")
21
+ #=> ["I", "saw", "the", "man", "with", "the", "telescope"]
22
+ ```
23
+
24
+ ## How does the tokenizer work?
25
+
26
+ In general, 1 word = 1 token.
27
+
28
+ ```ruby
29
+ GTokenizer.parse("Hello world")
30
+ #=> ["Hello", "world"]
31
+ ```
32
+
33
 + Punctuation is usually tokenized separately.
34
+
35
+ ```ruby
36
+ GTokenizer.parse("Hello world?")
37
+ #=> ["Hello", "world", "?"]
38
+ ```
39
+
40
+ Hyphenated words thus create three tokens.
41
+
42
+ ```ruby
43
+ GTokenizer.parse("Good-day world")
44
+ #=> ["Good", "-", "day", "world"]
45
+ ```
46
+
47
 + & and _ are not treated as separate tokens.
48
+
49
+ ```ruby
50
+ GTokenizer.parse("HKEY_LOCAL_MACHINE AT&T")
51
 + #=> ["HKEY_LOCAL_MACHINE", "AT&T"]
52
+ ```
53
+
54
+ Prices are treated as a single token.
55
+
56
+ ```ruby
57
+ GTokenizer.parse("$9.99")
58
+ #=> ["$9.99"]
59
+ ```
60
+
61
+ For a fuller understanding of the mechanics of the tokenizer, see the specs, or Google's detailed outline in _Science_ (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
62
+
63
+ ## License
64
+
65
+ GTokenizer is licensed under the [MIT License](http://creativecommons.org/licenses/MIT/):
66
+
67
+ Copyright (c) 2011 Alex Peattie (http://www.alexpeattie.com)
68
+
69
+ Permission is hereby granted, free of charge, to any person obtaining a copy
70
+ of this software and associated documentation files (the "Software"), to deal
71
+ in the Software without restriction, including without limitation the rights
72
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
73
+ copies of the Software, and to permit persons to whom the Software is
74
+ furnished to do so, subject to the following conditions:
75
+
76
+ The above copyright notice and this permission notice shall be included in
77
+ all copies or substantial portions of the Software.
78
+
79
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
83
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
84
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
85
+ THE SOFTWARE.
@@ -0,0 +1,9 @@
require 'bundler/gem_tasks'

require 'rspec/core'
require 'rspec/core/rake_task'

# `rake spec` runs the full RSpec suite; it is also the default task.
RSpec::Core::RakeTask.new(:spec) do |task|
  task.pattern = FileList['spec/**/*_spec.rb']
end

task :default => :spec
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "gtokenizer/version"

Gem::Specification.new do |s|
  s.name        = "gtokenizer"
  s.version     = GTokenizer::VERSION
  s.authors     = ["Alex Peattie"]
  s.email       = ["alexpeattie@gmail.com"]
  s.homepage    = "https://github.com/alexpeattie/gtokenizer"
  # Declare the license in the gem metadata; the README's LICENSE section
  # states MIT, but the published metadata had `licenses: []`.
  s.license     = "MIT"
  s.summary     = "A very simple string tokenizer, based on the one used by Google for their Google NGrams app (http://ngrams.googlelabs.com/)"
  s.description = "GTokenizer recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)"

  s.add_development_dependency "rspec"

  # Package everything tracked by git; tests and executables are derived the same way.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,11 @@
1
+ require "gtokenizer/version"
2
+
module GTokenizer
  # Token pattern modelled on Google's NGrams tokenizer. Alternatives, in order:
  #   1. musical-note style "#" suffixes (A-G, J, X, either case): "C#", "b#"
  #   2. numbers and prices: "1.5", "$9.99"
  #   3. word runs that may contain &/_/digits, an optional "'s", and a
  #      trailing run of "+": "AT&T", "it's", "C++"
  #   4. any single punctuation character as its own token
  PARSER_REGEX = /([A-GJXa-gjx]#|\$?[0-9.]+|[a-zA-Z&_0-9]+('s)?\+*|[\-.!@%^*()\[\]\={"'\\}|:;<,>?\/~`.\$#+])/

  # Tokenize +input+ (a String) and return an Array of token Strings.
  # Only the outermost capture of each match is kept; the inner ('s)
  # group exists purely for grouping.
  def self.parse(input)
    input.scan(PARSER_REGEX).map(&:first)
  end
end
@@ -0,0 +1,3 @@
module GTokenizer
  # Gem release version, referenced by gtokenizer.gemspec.
  VERSION = "1.0.0"
end
@@ -0,0 +1,43 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

describe "GTokenizer" do
  it "tokenizes words" do
    tokens = GTokenizer::parse("I saw the man with the telescope")
    tokens.should == ["I", "saw", "the", "man", "with", "the", "telescope"]
  end

  it "tokenizes special characters" do
    tokens = GTokenizer::parse("mail@domain.com !%^*()[]\\-={\"}|:;<,>?\/~`.$#+")
    tokens.should == ["mail", "@", "domain", ".", "com", "!", "%", "^", "*", "(", ")", "[", "]", "\\", "-", "=", "{", "\"", "}", "|", ":", ";", "<", ",", ">", "?", "/", "~", "`", ".", "$", "#", "+"]
  end

  it "tokenizes apostrophes that don't precede an s" do
    tokens = GTokenizer::parse("It's obvious that the parents' meeting won't be useful")
    tokens.should == ["It's", "obvious", "that", "the", "parents", "'", "meeting", "won", "'", "t", "be", "useful"]
  end

  # Description corrected: the input exercises ampersands and underscores,
  # not apostrophes.
  it "doesn't tokenize ampersands and underscores" do
    tokens = GTokenizer::parse("HKEY_LOCAL_MACHINE AT&T")
    tokens.should == ["HKEY_LOCAL_MACHINE", "AT&T"]
  end

  it "tokenizes numbers and prices as a single token" do
    tokens = GTokenizer::parse("1.5 feet of tape costs $1.99")
    tokens.should == ["1.5", "feet", "of", "tape", "costs", "$1.99"]
  end

  # Description corrected: "following", not "proceeding" — the # comes
  # after the letter.
  it "doesn't tokenize # following the letters a-g, j or x" do
    tokens = GTokenizer::parse("A# b# C# d# E# g# J# and X# but not Q#")
    tokens.should == ["A#", "b#", "C#", "d#", "E#", "g#", "J#", "and", "X#", "but", "not", "Q", "#"]
  end

  it "doesn't tokenize + following an alphanumeric sequence" do
    tokens = GTokenizer::parse("C++ or Na2+ but not 1+2")
    tokens.should == ["C++", "or", "Na2+", "but", "not", "1", "+", "2"]
  end

  it "tokenizes a string with a mixture of words, numbers, special characters" do
    tokens = GTokenizer::parse("Use C++, it's better than C# and it only costs $2.99!! Visit http://www.cplusplus.com/more_info_1 to learn more.")
    tokens.should == ["Use", "C++", ",", "it's", "better", "than", "C#", "and", "it", "only", "costs", "$2.99", "!", "!", "Visit", "http", ":", "/", "/", "www", ".", "cplusplus", ".", "com", "/", "more_info_1", "to", "learn", "more", "."]
  end
end
@@ -0,0 +1,8 @@
# Make the gem's lib/ directory and this spec directory requirable.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))

require 'rspec'
require 'gtokenizer'

# Load supporting files (custom matchers, macros, etc.)
# from ./support/ and its subdirectories.
Dir[File.join(File.dirname(__FILE__), 'support', '**', '*.rb')].each { |helper| require helper }
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gtokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Alex Peattie
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-07-02 00:00:00.000000000 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ requirement: &25955988 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *25955988
26
+ description: GTokenizer recreates the closed-source tokenization library used by Google
27
+ for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information
28
+ in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)
29
+ email:
30
+ - alexpeattie@gmail.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - .gitignore
36
+ - .rspec
37
+ - Gemfile
38
+ - README.md
39
+ - Rakefile
40
+ - gtokenizer.gemspec
41
+ - lib/gtokenizer.rb
42
+ - lib/gtokenizer/version.rb
43
+ - spec/gtokenizer_spec.rb
44
+ - spec/spec_helper.rb
45
+ has_rdoc: true
46
+ homepage: https://github.com/alexpeattie/gtokenizer
47
+ licenses: []
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ! '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 1.5.2
67
+ signing_key:
68
+ specification_version: 3
69
+ summary: A very simple string tokenizer, based on the one used by Google for their
70
+ Google NGrams app (http://ngrams.googlelabs.com/)
71
+ test_files: []