microsoft_ngram 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --colour
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org" # fetch gems over TLS rather than plain HTTP
2
+
3
+ # Runtime and development dependencies are declared in microsoft_ngram.gemspec
4
+ gemspec
@@ -0,0 +1,6 @@
1
+ === 1.0.0 / 2010-08-31
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
@@ -0,0 +1,8 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/microsoft_ngram
6
+ lib/microsoft_ngram.rb
7
+ spec/microsoft_ngram_spec.rb
8
+ examples/segment.rb
@@ -0,0 +1,117 @@
1
+ microsoft_ngram
2
+ ===============
3
+
4
+ This is a simple Ruby gem to access the Bing Ngram data. It's loosely based on Microsoft's Python library.
5
+ Source code at [github.com/willf/microsoft_ngram](http://github.com/willf/microsoft_ngram).
6
+
7
+ Installation
8
+ ------------
9
+
10
+ Email [webngram@microsoft.com](mailto:webngram@microsoft.com?subject=Token%20Request) and request a token.
11
+ When you get your token, add it to your .bashrc or .bash_profile:
12
+
13
+ export NGRAM_TOKEN="YOUR-TOKEN-HERE"
14
+
15
+ Then install the gem:
16
+
17
+ gem install microsoft_ngram
18
+
19
+ Usage
20
+ -----
21
+
22
+ To get a list of currently available models:
23
+
24
+ > Bing::Ngram.models
25
+ => ["bing-anchor/jun09/1", "bing-anchor/jun09/2", "bing-anchor/jun09/3", "bing-anchor/jun09/4", "bing-body/jun09/1", "bing-body/jun09/2", "bing-body/jun09/3", "bing-title/jun09/1", "bing-title/jun09/2", "bing-title/jun09/3", "bing-title/jun09/4", "bing-query/jun09/1", "bing-query/jun09/2", "bing-query/jun09/3"]
26
+
27
+ To see the default model:
28
+
29
+ > Bing::Ngram.new.model
30
+ => "bing-body/jun09/3"
31
+
32
+ Parameters to the initializer are:
33
+
34
+ :model => <i>string</i> (sets model)
35
+ :user_token => <i>string</i> (sets user token)
36
+ :debug => <i>boolean</i> (will show GET/POST calls)
37
+
38
+ So, to use the 2-gram title model:
39
+
40
+ > model = Bing::Ngram.new(:model => "bing-title/jun09/2")
41
+
42
+ To get a single joint probability, or multiple joint probabilities (If
43
+ you know you want multiple joint probabilities, it is better to ask
44
+ for several at once):
45
+
46
+ > Bing::Ngram.new.jps(['fish sticks', 'frog sticks'])
47
+ => [["fish sticks", -6.853792], ["frog sticks", -9.91852]]
48
+ > Bing::Ngram.new.jp("fish sticks")
49
+ => -6.853792
50
+
51
+ To get a single conditional probability, or multiple conditional probabilities (If you know you want multiple conditional probabilities, it is better to ask for several at once):
52
+
53
+ > Bing::Ngram.new.cp("fish sticks")
54
+ => -2.712575
55
+ > Bing::Ngram.new.cps(['fish sticks', 'frog sticks'])
56
+ => [["fish sticks", -2.712575], ["frog sticks", -4.788582]]
57
+
58
+ To yield the most probable next token using the default model:
59
+
60
+ > Bing::Ngram.new.generate("Microsoft Windows",5) {|x| puts x.join(' ')}
61
+ xp -0.6964428
62
+ vista -0.9242383
63
+ server -1.106876
64
+ 2000 -1.145312
65
+ currentversion -1.168404
66
+
67
+ To use the query model for the same thing:
68
+
69
+ > Bing::Ngram.new(:model => 'bing-query/jun09/3').generate("Microsoft Windows",5) {|x| puts x.join(' ')}
70
+ xp -0.5429792
71
+ </s> -1.062959
72
+ update -1.08291
73
+ vista -1.199022
74
+ installer -1.248958
75
+
76
+ Sample Script
77
+ -------------
78
+
79
+ ```ruby
80
+ require 'rubygems'
81
+ require 'microsoft_ngram'
82
+ l = []
83
+ Bing::Ngram.new(:model => "bing-body/apr10/5").generate('a bum',50){ |w,_| l << w }
84
+ l.join("; ")
85
+ ```
86
+
87
+ More Info
88
+ ---------
89
+
90
+ See the [REST API](http://web-ngram.research.microsoft.com/info/rest.html) and the
91
+ [terms of use](http://web-ngram.research.microsoft.com/info/TermsOfUse.htm) for accessing the Microsoft data.
92
+
93
+ License
94
+ -------
95
+
96
+ (The MIT License)
97
+
98
+ Copyright (c) 2010/2011
99
+
100
+ Permission is hereby granted, free of charge, to any person obtaining
101
+ a copy of this software and associated documentation files (the
102
+ 'Software'), to deal in the Software without restriction, including
103
+ without limitation the rights to use, copy, modify, merge, publish,
104
+ distribute, sublicense, and/or sell copies of the Software, and to
105
+ permit persons to whom the Software is furnished to do so, subject to
106
+ the following conditions:
107
+
108
+ The above copyright notice and this permission notice shall be
109
+ included in all copies or substantial portions of the Software.
110
+
111
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
112
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
113
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
114
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
115
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
116
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
117
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,11 @@
1
+ require 'bundler'
2
+ require "bundler/gem_tasks"
3
+ require 'rspec/core/rake_task'
4
+ require 'microsoft_ngram'
5
+
6
+ # bundler/gem_tasks (required above) already calls Bundler::GemHelper.install_tasks; calling it again here would register each gem task's action a second time.
7
+
8
+ RSpec::Core::RakeTask.new('spec')
9
+
10
+ # Run the specs when rake is invoked without a task name
11
+ task :default => :spec
File without changes
@@ -0,0 +1,77 @@
1
+ require File.dirname(__FILE__) + '/../lib/microsoft_ngram'
2
+ require 'rubygems'
3
+ require 'memoize'
4
+
5
+ # This code is based on Peter Norvig's chapter on "Natural Language Corpus Data" in
6
+ # Beautiful Data.
7
+
8
+ include Memoize
9
+
10
+ $bi_body_model = Bing::Ngram.new(:model => "bing-body/jun09/2", :debug=>false) # bigram model, queried by cPw
11
+ $uni_body_model = Bing::Ngram.new(:model => "bing-body/jun09/1", :debug=>false) # unigram model, queried by Pr
12
+ $magic_pr = -13.419954 # log-probability cutoff used by reasonable_splits: twice as uncommon as "kraig", the last word in the Bing 100k list
13
+
14
+ # Returns every [prefix, suffix] split of text whose prefix is 1 to max_len characters long (max_len defaults to text.size; the last suffix may be empty; empty text gives [])
15
+ def splits(text,max_len=text.size)
16
+ Range.new(0,[text.size,max_len].min-1).map{|i| [text[0..i],text[i+1..-1]]}
17
+ end
18
+
19
+ # Keeps just those splits (see splits) whose prefix has a unigram log
20
+ # probability, as returned by Pr, of at least $magic_pr
21
+ def reasonable_splits(text,max_len=text.size)
22
+ splits(text,max_len).find_all{|pre,suf| Pr(pre)>=$magic_pr}
23
+ end
24
+
25
+ # Get the unigram log probability of a token by querying $uni_body_model (one web-service lookup per call)
26
+ def Pr(str)
27
+ $uni_body_model.cp(str)
28
+ end
29
+
30
+ # Get the conditional log probability of word given the preceding token prev, by asking $bi_body_model about the phrase "prev word"
31
+ def cPw(word, prev)
32
+ $bi_body_model.cp([prev,word].join(' '))
33
+ end
34
+
35
+ # Prepend one token to a partial result: pr is a [log_prob, tokens] pair and the return value is [pfirst + log_prob, [first] + tokens]
36
+ def combine(pfirst, first, pr)
37
+ prem, rem = pr
38
+ return [pfirst+prem, [first]+rem]
39
+ end
40
+
41
+ # segment a text, assuming it is at the beginning of a sentence
42
+ # return a pair: the log probability, and the most probable segmentation
43
+ def segment2(text, prev="<s>")
44
+ #puts "segment2: #{text.inspect} prev: #{prev}"
45
+ return [0.0,[]] if (!text or text.size==0)
46
+ reasonable_splits(text).map{|first,rem| combine(cPw(first,prev), first, segment2(rem, first))}.max
47
+ end
48
+
49
+ # Return just the best segmentation of text: the token list from segment2, without its log probability
50
+ def segment(text)
51
+ segment2(text)[1]
52
+ end
53
+
54
+ # Memoize the helpers above (memoize comes from the included Memoize module) so that calls repeated with the same arguments during the recursion in segment2 are presumably served from a cache instead of hitting the web service again.
55
+ memoize :splits
56
+ memoize :reasonable_splits
57
+ memoize :Pr
58
+ memoize :cPw
59
+ memoize :segment2
60
+
61
+ # These are some Twitter hash tags which I segmented.
62
+ # > segment("bpcares")
63
+ # => ["bp", "cares"]
64
+ # > segment("Twitter")
65
+ # => ["Twitter"]
66
+ # > segment("writers")
67
+ # => ["writers"]
68
+ # > segment("iamwriting")
69
+ # => ["i", "am", "writing"]
70
+ # > segment("backchannel")
71
+ # => ["back", "channel"]
72
+ # > segment("tcot")
73
+ # => ["tcot"]
74
+ # > segment("vacationfallout")
75
+ # => ["vacation", "fall", "out"]
76
+
77
+
@@ -0,0 +1,98 @@
1
+ require "microsoft_ngram/version"
2
+ require "rest-client"
3
+
4
+ module Bing
5
+
6
+ class Ngram # Client for the web n-gram REST service at @@endpoint
7
+
8
+ @@endpoint = "http://web-ngram.research.microsoft.com/rest/lookup.svc/" # base URL; model name and operation are appended to it
9
+ @@models = nil # cached model list, filled in by self.models
10
+
11
+ def self.models() # Fetches the whitespace-separated model list from the endpoint, stores it in @@models and returns it; every call issues a new GET
12
+ @@models=RestClient.get(@@endpoint).split(/\s+/)
13
+ end
14
+
15
+ def self.defined_model?(model) # True if model is in the model list, which is fetched first if it has not been loaded yet
16
+ Bing::Ngram.models() if @@models == nil # cache the current models
17
+ @@models.include?(model)
18
+ end
19
+
20
+ attr_accessor :user_token # token sent as the u parameter with every request
21
+ # The current model name. Call Bing::Ngram.models for the available models.
22
+ attr_accessor :model
23
+ # Simple debug mode. If truthy, the request object of each GET/POST call is printed to $stderr.
24
+ attr_accessor :debug
25
+ # Ngram size, taken from the last /-separated component of the model name
26
+ attr_accessor :ngram_size
27
+
28
+ def initialize(args = {}) # args may use string or symbol keys: user_token, model, debug. Raises RuntimeError when no token is available or the model is not in the model list.
29
+ @user_token = args["user_token"] || args[:user_token] || ENV["NGRAM_TOKEN"]
30
+ unless @user_token
31
+ raise "Must provide user token as NGRAM_TOKEN env variable or as :user_token => token. To get a token, see http://web-ngram.research.microsoft.com/info/ "
32
+ end
33
+ # Default model (probably shouldn't change): the greatest name, by string comparison, among the listed models matching /body/
34
+ @model = args["model"] || args[:model] || Bing::Ngram.models().find_all{|x| x =~ /body/}.max
35
+ unless Bing::Ngram.defined_model?(@model)
36
+ raise "Invalid model: #{@model}. Valid models are #{@@models.join('; ')}"
37
+ end
38
+ @debug = (args["debug"] || args[:debug] || nil)
39
+ @ngram_size = @model.split(/\//)[-1].to_i
40
+ end
41
+
42
+ def get(op,phrase,args) # GETs <endpoint><model>/<op> with the token (u), the phrase (p) and everything in args as query parameters; the block hands back the response res
43
+ model = args["model"] || args[:model] || @model
44
+ RestClient.get(@@endpoint + model + '/' + op, {:params => {:u => @user_token, :p => phrase}.merge(args)}) do |res,req,result|
45
+ $stderr.puts req.inspect if @debug
46
+ res
47
+ end
48
+ end
49
+
50
+ def post(op,phrases,args) # POSTs the phrases, one per line, to <endpoint><model>/<op>?u=<token>; of args only the model entry is used
51
+ model = args["model"] || args[:model] || @model
52
+ RestClient.post(@@endpoint + model + '/' + op + "?u=#{@user_token}", phrases.join("\n")) do |res,req,result|
53
+ $stderr.puts req.inspect if @debug
54
+ res
55
+ end
56
+ end
57
+
58
+ def cp(phrase,args={}) # Result of the 'cp' (conditional probability) lookup for one phrase, as a Float
59
+ get('cp',phrase,args).to_f
60
+ end
61
+
62
+ def cps(phrases,args={}) # 'cp' lookup for several phrases in one POST; returns [phrase, Float] pairs in input order
63
+ phrases.zip(post('cp',phrases,args).split(/\s+/).map{|r| r.strip.to_f})
64
+ end
65
+
66
+ def jp(phrase,args={}) # Result of the 'jp' (joint probability) lookup for one phrase, as a Float
67
+ get('jp',phrase,args).to_f
68
+ end
69
+
70
+ def jps(phrases,args={}) # 'jp' lookup for several phrases in one POST; returns [phrase, Float] pairs in input order
71
+ phrases.zip(post('jp',phrases,args).split(/\s+/).map{|r| r.strip.to_f})
72
+ end
73
+
74
+ # Yields up to nstop [token, log-prob] pairs given the tokens in the phrase, asking for at most 1000 per 'gen' request and sending the returned cookie with the next request (presumably so the service continues where it left off).
75
+
76
+ def generate(phrase,nstop=2**32)
77
+ arg = {}
78
+ while true do
79
+ break if nstop <= 0
80
+ arg['n']=[1000,[0,nstop].max].min
81
+ result = get("gen",phrase,arg).split("\r\n")
82
+ break if result.size <= 2 # only the cookie and backoff lines came back: no more tokens
83
+ nstop -= (result.size - 2)
84
+ arg['cookie']=result[0]
85
+ backoff = result[1] # second response line; not used below
86
+ result[2..-1].each do |x|
87
+ pair = x.split(';')
88
+ yield [pair[0], pair[1].to_f]
89
+ end
90
+ end
91
+ end
92
+
93
+ # spell-checking example (edits1 is not defined in this file):
94
+ # Bing::Ngram.new(:debug=>nil,:model=>'bing-body/jun09/1').jps(edits1("appresiate").uniq).sort{|a,b| b[1] <=> a[1]}[0..30]
95
+
96
+ end
97
+
98
+ end
@@ -0,0 +1,3 @@
1
+ module MicrosoftNgram
2
+ VERSION = "0.0.2" # gem version, read by microsoft_ngram.gemspec as MicrosoftNgram::VERSION
3
+ end
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "microsoft_ngram/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "microsoft_ngram"
7
+ s.version = MicrosoftNgram::VERSION
8
+ s.authors = ["Will Fitzgerald", "Zeke Sikelianos"]
9
+ s.email = ["will@wordnik.com", "zeke@sikelianos.com"]
10
+ s.homepage = "http://developer.wordnik.com"
11
+ s.summary = %q{A simple wrapper for Bing's ngram API}
12
+ s.description = %q{A simple wrapper for Bing's ngram API}
13
+
14
+ s.rubyforge_project = "microsoft_ngram"
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {spec}/*`.split("\n")
18
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_paths = ["lib"]
20
+
21
+ # specify any dependencies here; for example:
22
+ s.add_development_dependency "hoe"
23
+ s.add_development_dependency 'rspec', '~> 2.8.0'
24
+ s.add_development_dependency 'autotest'
25
+
26
+ s.add_runtime_dependency "rest-client"
27
+ end
@@ -0,0 +1,61 @@
1
+ require 'spec_helper'
2
+
3
+ describe Bing::Ngram do # These examples call the real web service: Bing::Ngram.new needs a user token (e.g. NGRAM_TOKEN) and the lookups need network access
4
+
5
+ it "should return a list of models" do
6
+ Bing::Ngram.models.size.should > 0
7
+ end
8
+
9
+ it "should have a user token, or you're hosed" do
10
+ Bing::Ngram.new.user_token.should_not == nil
11
+ end
12
+
13
+ it "should have a default model" do
14
+ Bing::Ngram.new.model.should_not == nil
15
+ end
16
+
17
+ it "should retrieve a joint probability" do
18
+ Bing::Ngram.new.jp("fish sticks").should < 0
19
+ end
20
+
21
+ it 'should retrieve a list of joint probabilities' do
22
+ fish, frog = ["fish sticks", "frog sticks"]
23
+ fish_results, frog_results = Bing::Ngram.new.jps([fish, frog]) # each result is a [phrase, value] pair
24
+ fish_results[0].should == fish
25
+ fish_results[1].should < 0
26
+ frog_results[0].should == frog
27
+ frog_results[1].should < 0
28
+ fish_results[1].should > frog_results[1]
29
+ end
30
+
31
+ it "should retrieve a conditional probability" do
32
+ Bing::Ngram.new.cp("fish sticks").should < 0
33
+ end
34
+
35
+ it 'should retrieve a list of conditional probabilities' do
36
+ fish, frog = ["fish sticks", "frog sticks"]
37
+ fish_results, frog_results = Bing::Ngram.new.cps([fish, frog]) # each result is a [phrase, value] pair
38
+ fish_results[0].should == fish
39
+ fish_results[1].should < 0
40
+ frog_results[0].should == frog
41
+ frog_results[1].should < 0
42
+ fish_results[1].should > frog_results[1]
43
+ end
44
+
45
+ it 'should yield most probable next tokens' do
46
+ two_gram_stream = Bing::Ngram.models.find do |model| # pick a 2-gram body model from whatever the service currently lists
47
+ name, date, size = model.split(/\//)
48
+ name.include?("body") && size=="2"
49
+ end
50
+ two_gram_stream.should_not == nil
51
+ m = Bing::Ngram.new(:model => two_gram_stream)
52
+ count = 0
53
+ m.generate("the",10) do |word, log_prob|
54
+ count += 1
55
+ word.should_not == nil
56
+ log_prob.should < 0
57
+ end
58
+ count.should == 10 # generate was asked for 10 pairs and must yield exactly that many
59
+ end
60
+ end
61
+
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'rspec'
4
+ require 'microsoft_ngram'
5
+ require 'rest-client'
6
+
7
+ RSpec.configure do |config|
8
+ # No custom configuration yet; add optional RSpec settings here
9
+ end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: microsoft_ngram
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Will Fitzgerald
9
+ - Zeke Sikelianos
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-02-03 00:00:00.000000000Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ requirement: &70093440050860 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *70093440050860
26
+ - !ruby/object:Gem::Dependency
27
+ name: rspec
28
+ requirement: &70093440048640 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 2.8.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: *70093440048640
37
+ - !ruby/object:Gem::Dependency
38
+ name: autotest
39
+ requirement: &70093440048220 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ type: :development
46
+ prerelease: false
47
+ version_requirements: *70093440048220
48
+ - !ruby/object:Gem::Dependency
49
+ name: rest-client
50
+ requirement: &70093440047740 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ type: :runtime
57
+ prerelease: false
58
+ version_requirements: *70093440047740
59
+ description: A simple wrapper for Bing's ngram API
60
+ email:
61
+ - will@wordnik.com
62
+ - zeke@sikelianos.com
63
+ executables:
64
+ - microsoft_ngram
65
+ extensions: []
66
+ extra_rdoc_files: []
67
+ files:
68
+ - .gitignore
69
+ - .rspec
70
+ - Gemfile
71
+ - History.txt
72
+ - Manifest.txt
73
+ - README.md
74
+ - Rakefile
75
+ - bin/microsoft_ngram
76
+ - examples/segment.rb
77
+ - lib/microsoft_ngram.rb
78
+ - lib/microsoft_ngram/version.rb
79
+ - microsoft_ngram.gemspec
80
+ - spec/microsoft_ngram_spec.rb
81
+ - spec/spec_helper.rb
82
+ homepage: http://developer.wordnik.com
83
+ licenses: []
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ none: false
96
+ requirements:
97
+ - - ! '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubyforge_project: microsoft_ngram
102
+ rubygems_version: 1.8.12
103
+ signing_key:
104
+ specification_version: 3
105
+ summary: A simple wrapper for Bing's ngram API
106
+ test_files: []