gtokenizer 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
# Use HTTPS so the gem source cannot be tampered with in transit
# (rubygems.org redirects plain HTTP anyway).
source "https://rubygems.org"

# Specify your gem's dependencies in gtokenizer.gemspec
gemspec
@@ -0,0 +1,85 @@
1
+ # GTokenizer
2
+
3
+ **GTokenizer** recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated _Science_ paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
4
+
5
+ ## Installation
6
+
7
+ Install the gem via rubygems.
8
+
9
+ gem install gtokenizer
10
+
11
+ ## Usage
12
+
13
+ You can extract tokens from a string using the `GTokenizer.parse` method.
14
+
15
+ ```ruby
16
+ require 'gtokenizer'
17
+
18
+ #---
19
+
20
+ GTokenizer.parse("I saw the man with the telescope")
21
+ #=> ["I", "saw", "the", "man", "with", "the", "telescope"]
22
+ ```
23
+
24
+ ## How does the tokenizer work?
25
+
26
+ In general, 1 word = 1 token.
27
+
28
+ ```ruby
29
+ GTokenizer.parse("Hello world")
30
+ #=> ["Hello", "world"]
31
+ ```
32
+
33
+ Punctuation is usually tokenized separately.
34
+
35
+ ```ruby
36
+ GTokenizer.parse("Hello world?")
37
+ #=> ["Hello", "world", "?"]
38
+ ```
39
+
40
+ Hyphenated words thus create three tokens.
41
+
42
+ ```ruby
43
+ GTokenizer.parse("Good-day world")
44
+ #=> ["Good", "-", "day", "world"]
45
+ ```
46
+
47
+ & and _ are not treated as separate tokens.
48
+
49
+ ```ruby
50
+ GTokenizer.parse("HKEY_LOCAL_MACHINE AT&T")
51
+ #=> ["HKEY_LOCAL_MACHINE", "AT&T"]
52
+ ```
53
+
54
+ Prices are treated as a single token.
55
+
56
+ ```ruby
57
+ GTokenizer.parse("$9.99")
58
+ #=> ["$9.99"]
59
+ ```
60
+
61
+ For a fuller understanding of the mechanics of the tokenizer, see the specs, or Google's detailed outline in _Science_ (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf).
62
+
63
+ ## License
64
+
65
+ GTokenizer is licensed under the [MIT License](http://creativecommons.org/licenses/MIT/):
66
+
67
+ Copyright (c) 2011 Alex Peattie (http://www.alexpeattie.com)
68
+
69
+ Permission is hereby granted, free of charge, to any person obtaining a copy
70
+ of this software and associated documentation files (the "Software"), to deal
71
+ in the Software without restriction, including without limitation the rights
72
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
73
+ copies of the Software, and to permit persons to whom the Software is
74
+ furnished to do so, subject to the following conditions:
75
+
76
+ The above copyright notice and this permission notice shall be included in
77
+ all copies or substantial portions of the Software.
78
+
79
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
80
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
81
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
82
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
83
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
84
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
85
+ THE SOFTWARE.
@@ -0,0 +1,9 @@
require 'bundler/gem_tasks'

require 'rspec/core'
require 'rspec/core/rake_task'

# `rake spec` runs the RSpec suite; it is also the default task.
RSpec::Core::RakeTask.new(:spec) do |t|
  t.pattern = FileList['spec/**/*_spec.rb']
end

task :default => :spec
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "gtokenizer/version"

# Gem specification for gtokenizer. File lists are pulled from `git ls-files`
# so the packaged gem always matches the repository contents.
Gem::Specification.new do |s|
  s.name          = "gtokenizer"
  s.version       = GTokenizer::VERSION
  s.authors       = ["Alex Peattie"]
  s.email         = ["alexpeattie@gmail.com"]
  s.homepage      = "https://github.com/alexpeattie/gtokenizer"
  s.summary       = "A very simple string tokenizer, based on the one used by Google for their Google NGrams app (http://ngrams.googlelabs.com/)"
  s.description   = "GTokenizer recreates the closed-source tokenization library used by Google for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)"

  s.add_development_dependency "rspec"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
@@ -0,0 +1,11 @@
1
+ require "gtokenizer/version"
2
+
# GTokenizer splits a string into word/punctuation tokens, mimicking the
# tokenizer used for the Google NGrams corpus.
module GTokenizer
  # Alternatives, tried left to right:
  #   [A-GJXa-gjx]#          -- letter + '#' kept together (e.g. "C#", "x#")
  #   \$?[0-9.]+             -- numbers and prices as one token (e.g. "$9.99")
  #   [a-zA-Z&_0-9]+('s)?\+* -- words; '&' and '_' stay inside the word,
  #                             a possessive 's and trailing '+'s are kept
  #   [...]                  -- any single punctuation character
  PARSER_REGEX = /([A-GJXa-gjx]#|\$?[0-9.]+|[a-zA-Z&_0-9]+('s)?\+*|[\-.!@%^*()\[\]\={"'\\}|:;<,>?\/~`.\$#+])/

  # Tokenize +input+ (a String) and return the tokens as an Array of Strings.
  def self.parse(input)
    input.scan(PARSER_REGEX).map(&:first)
  end
end
@@ -0,0 +1,3 @@
module GTokenizer
  # Gem version; read by the gemspec at build time.
  VERSION = "1.0.0"
end
@@ -0,0 +1,43 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Behavioral specs for GTokenizer.parse, mirroring the tokenization rules
# described in the README and the Google NGrams paper.
describe "GTokenizer" do
  it "tokenizes words" do
    tokens = GTokenizer::parse("I saw the man with the telescope")
    tokens.should == ["I", "saw", "the", "man", "with", "the", "telescope"]
  end

  it "tokenizes special characters" do
    tokens = GTokenizer::parse("mail@domain.com !%^*()[]\\-={\"}|:;<,>?\/~`.$#+")
    tokens.should == ["mail", "@", "domain", ".", "com", "!", "%", "^", "*", "(", ")", "[", "]", "\\", "-", "=", "{", "\"", "}", "|", ":", ";", "<", ",", ">", "?", "/", "~", "`", ".", "$", "#", "+"]
  end

  it "tokenizes apostrophes that don't precede an s" do
    tokens = GTokenizer::parse("It's obvious that the parents' meeting won't be useful")
    tokens.should == ["It's", "obvious", "that", "the", "parents", "'", "meeting", "won", "'", "t", "be", "useful"]
  end

  # Description fixed: this example exercises ampersands (AT&T), not apostrophes.
  it "doesn't tokenize underscores and ampersands" do
    tokens = GTokenizer::parse("HKEY_LOCAL_MACHINE AT&T")
    tokens.should == ["HKEY_LOCAL_MACHINE", "AT&T"]
  end

  it "tokenizes numbers and prices as a single token" do
    tokens = GTokenizer::parse("1.5 feet of tape costs $1.99")
    tokens.should == ["1.5", "feet", "of", "tape", "costs", "$1.99"]
  end

  # Description fixed: the '#' comes after the letter ("following", not "proceeding").
  it "doesn't tokenize # following the letters a-g, j or x" do
    tokens = GTokenizer::parse("A# b# C# d# E# g# J# and X# but not Q#")
    tokens.should == ["A#", "b#", "C#", "d#", "E#", "g#", "J#", "and", "X#", "but", "not", "Q", "#"]
  end

  # Description fixed: the '+' comes after the sequence ("following", not "proceeding").
  it "doesn't tokenize + following an alphanumeric sequence" do
    tokens = GTokenizer::parse("C++ or Na2+ but not 1+2")
    tokens.should == ["C++", "or", "Na2+", "but", "not", "1", "+", "2"]
  end

  it "tokenizes a string with a mixture of words, numbers, special characters" do
    tokens = GTokenizer::parse("Use C++, it's better than C# and it only costs $2.99!! Visit http://www.cplusplus.com/more_info_1 to learn more.")
    tokens.should == ["Use", "C++", ",", "it's", "better", "than", "C#", "and", "it", "only", "costs", "$2.99", "!", "!", "Visit", "http", ":", "/", "/", "www", ".", "cplusplus", ".", "com", "/", "more_info_1", "to", "learn", "more", "."]
  end
end
@@ -0,0 +1,8 @@
# Put lib/ and spec/ on the load path so specs can `require 'gtokenizer'`.
lib_dir = File.join(File.dirname(__FILE__), '..', 'lib')
$LOAD_PATH.unshift(lib_dir)
$LOAD_PATH.unshift(File.dirname(__FILE__))

require 'rspec'
require 'gtokenizer'

# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each { |f| require f }
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gtokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Alex Peattie
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-07-02 00:00:00.000000000 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ requirement: &25955988 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *25955988
26
+ description: GTokenizer recreates the closed-source tokenization library used by Google
27
+ for their Google NGrams app (http://ngrams.googlelabs.com/), based on the information
28
+ in the associated Science paper (http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf)
29
+ email:
30
+ - alexpeattie@gmail.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - .gitignore
36
+ - .rspec
37
+ - Gemfile
38
+ - README.md
39
+ - Rakefile
40
+ - gtokenizer.gemspec
41
+ - lib/gtokenizer.rb
42
+ - lib/gtokenizer/version.rb
43
+ - spec/gtokenizer_spec.rb
44
+ - spec/spec_helper.rb
45
+ has_rdoc: true
46
+ homepage: https://github.com/alexpeattie/gtokenizer
47
+ licenses: []
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ! '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 1.5.2
67
+ signing_key:
68
+ specification_version: 3
69
+ summary: A very simple string tokenizer, based on the one used by Google for their
70
+ Google NGrams app (http://ngrams.googlelabs.com/)
71
+ test_files: []