microsoft_ngram 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --colour
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in microsoft_ngram.gemspec
4
+ gemspec
@@ -0,0 +1,6 @@
1
+ === 1.0.0 / 2010-08-31
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
@@ -0,0 +1,8 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/microsoft_ngram
6
+ lib/microsoft_ngram.rb
7
+ spec/microsoft_ngram_spec.rb
8
+ examples/segment.rb
@@ -0,0 +1,117 @@
1
+ microsoft_ngram
2
+ ===============
3
+
4
+ This is a simple ruby gem to access the Bing Ngram data. It's loosely based on Microsoft's Python library.
5
+ Source code at [github.com/willf/microsoft_ngram](http://github.com/willf/microsoft_ngram).
6
+
7
+ Installation
8
+ ------------
9
+
10
+ Email [webngram@microsoft.com](mailto:webngram@microsoft.com?subject=Token%20Request) and request a token.
11
+ When you get your token, add it to your .bashrc or .bash_profile:
12
+
13
+ export NGRAM_TOKEN="YOUR-TOKEN-HERE"
14
+
15
+ Then install the gem:
16
+
17
+ gem install microsoft_ngram
18
+
19
+ Usage
20
+ -----
21
+
22
+ To get a list of currently available models:
23
+
24
+ > Bing::Ngram.models
25
+ => ["bing-anchor/jun09/1", "bing-anchor/jun09/2", "bing-anchor/jun09/3", "bing-anchor/jun09/4", "bing-body/jun09/1", "bing-body/jun09/2", "bing-body/jun09/3", "bing-title/jun09/1", "bing-title/jun09/2", "bing-title/jun09/3", "bing-title/jun09/4", "bing-query/jun09/1", "bing-query/jun09/2", "bing-query/jun09/3"]
26
+
27
+ To see the default model:
28
+
29
+ > Bing::Ngram.new.model
30
+ => "bing-body/jun09/3"
31
+
32
+ Parameters to the initializer are:
33
+
34
+ :model => <i>string</i> (sets model)
35
+ :user_token => <i>string</i> (sets user token)
36
+ :debug => <i>boolean</i> (will show GET/POST calls)
37
+
38
+ So, to use the 2-gram title model:
39
+
40
+ > model = Bing::Ngram.new(:model => "bing-title/jun09/2")
41
+
42
+ To get a single joint probability, or multiple joint probabilities (If
43
+ you know you want multiple joint probabilities, it is better to ask
44
+ for several at once):
45
+
46
+ > Bing::Ngram.new.jps(['fish sticks', 'frog sticks'])
47
+ => [["fish sticks", -6.853792], ["frog sticks", -9.91852]]
48
+ > Bing::Ngram.new.jp("fish sticks")
49
+ => -6.853792
50
+
51
+ To get a single conditional probability, or multiple conditional probabilities (If you know you want multiple conditional probabilities, it is better to ask for several at once):
52
+
53
+ > Bing::Ngram.new.cp("fish sticks")
54
+ => -2.712575
55
+ > Bing::Ngram.new.cps(['fish sticks', 'frog sticks'])
56
+ => [["fish sticks", -2.712575], ["frog sticks", -4.788582]]
57
+
58
+ To yield the most probable next token using the default model:
59
+
60
+ > Bing::Ngram.new.generate("Microsoft Windows",5) {|x| puts x.join(' ')}
61
+ xp -0.6964428
62
+ vista -0.9242383
63
+ server -1.106876
64
+ 2000 -1.145312
65
+ currentversion -1.168404
66
+
67
+ To use the query model for the same thing:
68
+
69
+ > Bing::Ngram.new(:model => 'bing-query/jun09/3').generate("Microsoft Windows",5) {|x| puts x.join(' ')}
70
+ xp -0.5429792
71
+ </s> -1.062959
72
+ update -1.08291
73
+ vista -1.199022
74
+ installer -1.248958
75
+
76
+ Sample Script
77
+ -------------
78
+
79
+ ```ruby
80
+ require 'rubygems'
81
+ require 'microsoft_ngram'
82
+ l = []
83
+ Bing::Ngram.new(:model => "bing-body/apr10/5").generate('a bum',50){ |w,_| l << w }
84
+ l.join("; ")
85
+ ```
86
+
87
+ More Info
88
+ ---------
89
+
90
+ See the [REST API](http://web-ngram.research.microsoft.com/info/rest.html) and the
91
+ [terms of use](http://web-ngram.research.microsoft.com/info/TermsOfUse.htm) for accessing the Microsoft data.
92
+
93
+ License
94
+ -------
95
+
96
+ (The MIT License)
97
+
98
+ Copyright (c) 2010/2011
99
+
100
+ Permission is hereby granted, free of charge, to any person obtaining
101
+ a copy of this software and associated documentation files (the
102
+ 'Software'), to deal in the Software without restriction, including
103
+ without limitation the rights to use, copy, modify, merge, publish,
104
+ distribute, sublicense, and/or sell copies of the Software, and to
105
+ permit persons to whom the Software is furnished to do so, subject to
106
+ the following conditions:
107
+
108
+ The above copyright notice and this permission notice shall be
109
+ included in all copies or substantial portions of the Software.
110
+
111
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
112
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
113
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
114
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
115
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
116
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
117
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,11 @@
1
# Rake tasks for packaging the gem and running the specs.
require 'bundler'
# bundler/gem_tasks installs the gem build/install/release tasks when it is
# required, so no explicit Bundler::GemHelper.install_tasks call follows;
# a second call would register the same task actions again.
require "bundler/gem_tasks"
require 'rspec/core/rake_task'
require 'microsoft_ngram'

# Defines the `spec` task.
RSpec::Core::RakeTask.new('spec')

# Run the specs when rake is invoked without a task name.
task :default => :spec
File without changes
@@ -0,0 +1,77 @@
1
+ require File.dirname(__FILE__) + '/../lib/microsoft_ngram'
2
+ require 'rubygems'
3
+ require 'memoize'
4
+
5
+ # This code is based on Peter Norvig's chapter on "Natural Language Corpus Data" in
6
+ # Beautiful Data.
7
+
8
+ include Memoize
9
+
10
# Bigram and unigram body models used to score candidate segmentations.
# The client class is Bing::Ngram; MicrosoftNgram is only the module that
# holds the gem version and cannot be instantiated.
$bi_body_model  = Bing::Ngram.new(:model => "bing-body/jun09/2", :debug => false)
$uni_body_model = Bing::Ngram.new(:model => "bing-body/jun09/1", :debug => false)
# Unigram log-probability floor for a plausible token: twice as uncommon as
# "kraig", the last word in the Bing 100k list.
$magic_pr = -13.419954
13
+
14
# Returns every [prefix, suffix] split of +text+ whose prefix is between 1 and
# +max_len+ characters long (never longer than the text itself). When the
# prefix covers the whole string the suffix is the empty string.
def splits(text, max_len = text.size)
  longest_prefix = [text.size, max_len].min
  (0...longest_prefix).map { |i| [text[0..i], text[(i + 1)..-1]] }
end
18
+
19
# Keeps only the splits of +text+ (see splits) whose prefix has a unigram
# log probability of at least $magic_pr. Each prefix is scored through Pr.
def reasonable_splits(text, max_len = text.size)
  splits(text, max_len).select { |prefix, _suffix| Pr(prefix) >= $magic_pr }
end
24
+
25
# Unigram log probability of +str+, looked up through the global unigram
# body model's cp call.
def Pr(str)
  unigram_model = $uni_body_model
  unigram_model.cp(str)
end
29
+
30
# Conditional log probability of +word+ given the preceding token +prev+.
# The bigram model is queried with the phrase "prev word".
def cPw(word, prev)
  bigram = "#{prev} #{word}"
  $bi_body_model.cp(bigram)
end
34
+
35
# Prepends one token to a partial segmentation.
#
# pfirst - log probability of the token +first+
# first  - the token itself
# pr     - [log probability, tokens] pair for the remainder of the text
#
# Returns a new [log probability, tokens] pair: the two log probabilities
# added together and +first+ placed in front of the remaining tokens.
def combine(pfirst, first, pr)
  rest_log_prob, rest_tokens = pr
  [pfirst + rest_log_prob, [first] + rest_tokens]
end
40
+
41
# Segments +text+, assuming it follows the token +prev+ ("<s>" marks the
# beginning of a sentence).
#
# Returns a pair: the log probability and the most probable segmentation
# (an array of tokens). Nil or empty text yields [0.0, []].
def segment2(text, prev = "<s>")
  return [0.0, []] if !text || text.empty?

  candidates = reasonable_splits(text).map do |first, rem|
    combine(cPw(first, prev), first, segment2(rem, first))
  end
  # When no prefix clears the unigram floor there are no candidates and max
  # would be nil, which made the caller's combine fail on `pfirst + nil`.
  # Fall back to keeping the whole text as a single token.
  candidates.max || [cPw(text, prev), [text]]
end
48
+
49
# Returns only the token list of the best segmentation of +text+,
# discarding the log probability computed by segment2.
def segment(text)
  best = segment2(text)
  best.last
end
53
+
54
+ # We want to memoize a lot of things.
55
+ memoize :splits
56
+ memoize :reasonable_splits
57
+ memoize :Pr
58
+ memoize :cPw
59
+ memoize :segment2
60
+
61
+ # These are some Twitter hash tags which I segmented.
62
+ # > segment("bpcares")
63
+ # => ["bp", "cares"]
64
+ # > segment("Twitter")
65
+ # => ["Twitter"]
66
+ # > segment("writers")
67
+ # => ["writers"]
68
+ # > segment("iamwriting")
69
+ # => ["i", "am", "writing"]
70
+ # > segment("backchannel")
71
+ # => ["back", "channel"]
72
+ # > segment("tcot")
73
+ # => ["tcot"]
74
+ # > segment("vacationfallout")
75
+ # => ["vacation", "fall", "out"]
76
+
77
+
@@ -0,0 +1,98 @@
1
+ require "microsoft_ngram/version"
2
+ require "rest-client"
3
+
4
module Bing

  # Client for the Microsoft Web N-gram REST lookup service.
  #
  # An instance is bound to a user token and a default model. The per-call
  # +args+ hashes may override the model with a "model" / :model entry.
  class Ngram

    @@endpoint = "http://web-ngram.research.microsoft.com/rest/lookup.svc/"
    # Model list from the most recent fetch; nil until the first fetch.
    @@models = nil

    # Fetches the model names from the service endpoint, stores them in the
    # class-level cache and returns them as an array of strings.
    def self.models
      @@models = RestClient.get(@@endpoint).split(/\s+/)
    end

    # Returns true when +model+ is in the cached model list. The list is
    # fetched first if it has not been loaded yet.
    def self.defined_model?(model)
      Bing::Ngram.models if @@models.nil? # cache the current models
      @@models.include?(model)
    end

    attr_accessor :user_token
    # The model is the current model. See Bing::Ngram.models for available models
    attr_accessor :model
    # Simple debug mode. If truthy, each request is printed to $stderr
    attr_accessor :debug
    # Ngram size, taken from the last path segment of the model name
    attr_accessor :ngram_size

    # Builds a client. +args+ accepts string or symbol keys:
    #
    #   user_token - service token; falls back to ENV["NGRAM_TOKEN"]
    #   model      - model name; defaults to the greatest (in string order)
    #                available model whose name matches /body/
    #   debug      - when truthy, requests are echoed to $stderr
    #
    # Raises RuntimeError when no token is available or the model is not in
    # the service's model list.
    def initialize(args = {})
      @user_token = args["user_token"] || args[:user_token] || ENV["NGRAM_TOKEN"]
      unless @user_token
        raise "Must provide user token as NGRAM_TOKEN env variable or as :user_token => token. To get a token, see http://web-ngram.research.microsoft.com/info/ "
      end
      # probably shouldn't change
      @model = args["model"] || args[:model] || Bing::Ngram.models.select { |name| name =~ /body/ }.max
      unless Bing::Ngram.defined_model?(@model)
        raise "Invalid model: #{@model}. Valid models are #{@@models.join('; ')}"
      end
      @debug = args["debug"] || args[:debug] || nil
      @ngram_size = @model.split(/\//).last.to_i
    end

    # Issues a GET for operation +op+ (e.g. 'cp', 'jp', 'gen') on +phrase+ and
    # returns the response. Entries of +args+ other than the model are sent
    # as additional query parameters.
    def get(op, phrase, args)
      model = args["model"] || args[:model] || @model
      # The model already selects the URL path; it is a client-side option
      # and must not be repeated as a query parameter.
      extra = args.reject { |key, _value| key.to_s == "model" }
      params = { :u => @user_token, :p => phrase }.merge(extra)
      RestClient.get(@@endpoint + model + '/' + op, { :params => params }) do |res, req, _result|
        $stderr.puts req.inspect if @debug
        res
      end
    end

    # Issues a POST for operation +op+ with one phrase per line in the request
    # body and returns the response. Only the model entry of +args+ is used.
    def post(op, phrases, args)
      model = args["model"] || args[:model] || @model
      RestClient.post(@@endpoint + model + '/' + op + "?u=#{@user_token}", phrases.join("\n")) do |res, req, _result|
        $stderr.puts req.inspect if @debug
        res
      end
    end

    # Conditional log probability of +phrase+ as a Float.
    def cp(phrase, args = {})
      get('cp', phrase, args).to_f
    end

    # Conditional log probabilities for several phrases in one request.
    # Returns an array of [phrase, log_prob] pairs.
    def cps(phrases, args = {})
      phrases.zip(post('cp', phrases, args).split(/\s+/).map { |score| score.to_f })
    end

    # Joint log probability of +phrase+ as a Float.
    def jp(phrase, args = {})
      get('jp', phrase, args).to_f
    end

    # Joint log probabilities for several phrases in one request.
    # Returns an array of [phrase, log_prob] pairs.
    def jps(phrases, args = {})
      phrases.zip(post('jp', phrases, args).split(/\s+/).map { |score| score.to_f })
    end

    # Yield up to nstop token, log-prob pairs given the tokens in the phrase.
    #
    # Results are requested in batches of at most 1000. Without a block an
    # enumerator over the same pairs is returned instead of raising after the
    # first request.
    def generate(phrase, nstop = 2**32)
      return enum_for(:generate, phrase, nstop) unless block_given?

      arg = {}
      loop do
        break if nstop <= 0
        arg['n'] = [1000, [0, nstop].max].min
        result = get("gen", phrase, arg).split("\r\n")
        # The first two lines are not tokens; a response with nothing after
        # them means there is nothing more to yield.
        break if result.size <= 2
        # Line 0 is sent back as 'cookie' on the next request (presumably a
        # continuation marker). Line 1 was named `backoff` in the original
        # code and is not used.
        arg['cookie'] = result[0]
        # Never hand out more pairs than the caller asked for, even if the
        # service returns more rows than requested.
        rows = result[2..-1].first(nstop)
        nstop -= (result.size - 2)
        rows.each do |row|
          token, log_prob = row.split(';')
          yield [token, log_prob.to_f]
        end
      end
    end

    # spell-checking
    # Bing::Ngram.new(:debug=>nil,:model=>'bing-body/jun09/1').jps(edits1("appresiate").uniq).sort{|a,b| b[1] <=> a[1]}[0..30]

  end

end
@@ -0,0 +1,3 @@
1
# Namespace holding the gem's version number (read by the gemspec).
module MicrosoftNgram
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.2".freeze
end
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "microsoft_ngram/version"
4
+
5
# Gem specification for microsoft_ngram. File lists are taken from git, so
# the gem must be built from a git checkout.
Gem::Specification.new do |s|
  s.name        = "microsoft_ngram"
  s.version     = MicrosoftNgram::VERSION
  s.authors     = ["Will Fitzgerald", "Zeke Sikelianos"]
  s.email       = ["will@wordnik.com", "zeke@sikelianos.com"]
  s.homepage    = "http://developer.wordnik.com"
  s.summary     = %q{A simple wrapper for Bing's ngram API}
  s.description = %q{A simple wrapper for Bing's ngram API}

  s.rubyforge_project = "microsoft_ngram"

  s.files         = `git ls-files`.split("\n")
  # A brace group without a comma ("{spec}") is not expanded by the shell, so
  # the previous pattern matched nothing and test_files came out empty.
  s.test_files    = `git ls-files -- spec/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  s.add_development_dependency "hoe"
  s.add_development_dependency 'rspec', '~> 2.8.0'
  s.add_development_dependency 'autotest'

  s.add_runtime_dependency "rest-client"
end
@@ -0,0 +1,61 @@
1
+ require 'spec_helper'
2
+
3
# Integration specs for Bing::Ngram. Every example goes through the client,
# so they need a valid token (the examples build clients without passing
# one) and access to the service.
describe Bing::Ngram do

  # Shared checks for jps/cps results: each entry pairs the phrase with a
  # negative log probability, and the first phrase outscores the second.
  def check_scored_pairs(results, likely, unlikely)
    likely_results, unlikely_results = results
    likely_results[0].should == likely
    likely_results[1].should < 0
    unlikely_results[0].should == unlikely
    unlikely_results[1].should < 0
    likely_results[1].should > unlikely_results[1]
  end

  it "should return a list of models" do
    Bing::Ngram.models.size.should > 0
  end

  it "should have a user token, or you're hosed" do
    Bing::Ngram.new.user_token.should_not be_nil
  end

  it "should have a default model" do
    Bing::Ngram.new.model.should_not be_nil
  end

  it "should retrieve a joint probability" do
    Bing::Ngram.new.jp("fish sticks").should < 0
  end

  it 'should retrieve a list of joint probabilities' do
    fish, frog = ["fish sticks", "frog sticks"]
    check_scored_pairs(Bing::Ngram.new.jps([fish, frog]), fish, frog)
  end

  it "should retrieve a conditional probability" do
    Bing::Ngram.new.cp("fish sticks").should < 0
  end

  it 'should retrieve a list of conditional probabilities' do
    fish, frog = ["fish sticks", "frog sticks"]
    check_scored_pairs(Bing::Ngram.new.cps([fish, frog]), fish, frog)
  end

  it 'should yield most probable next tokens' do
    # Pick any 2-gram body model from whatever the service currently offers.
    two_gram_stream = Bing::Ngram.models.find do |model|
      name, _date, size = model.split(/\//)
      name.include?("body") && size == "2"
    end
    two_gram_stream.should_not be_nil
    m = Bing::Ngram.new(:model => two_gram_stream)
    count = 0
    m.generate("the", 10) do |word, log_prob|
      count += 1
      word.should_not be_nil
      log_prob.should < 0
    end
    count.should == 10
  end
end
61
+
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+ require 'rspec'
4
+ require 'microsoft_ngram'
5
+ require 'rest-client'
6
+
7
+ RSpec.configure do |config|
8
+ # some (optional) config here
9
+ end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: microsoft_ngram
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Will Fitzgerald
9
+ - Zeke Sikelianos
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2012-02-03 00:00:00.000000000Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ requirement: &70093440050860 !ruby/object:Gem::Requirement
18
+ none: false
19
+ requirements:
20
+ - - ! '>='
21
+ - !ruby/object:Gem::Version
22
+ version: '0'
23
+ type: :development
24
+ prerelease: false
25
+ version_requirements: *70093440050860
26
+ - !ruby/object:Gem::Dependency
27
+ name: rspec
28
+ requirement: &70093440048640 !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 2.8.0
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: *70093440048640
37
+ - !ruby/object:Gem::Dependency
38
+ name: autotest
39
+ requirement: &70093440048220 !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ type: :development
46
+ prerelease: false
47
+ version_requirements: *70093440048220
48
+ - !ruby/object:Gem::Dependency
49
+ name: rest-client
50
+ requirement: &70093440047740 !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ type: :runtime
57
+ prerelease: false
58
+ version_requirements: *70093440047740
59
+ description: A simple wrapper for Bing's ngram API
60
+ email:
61
+ - will@wordnik.com
62
+ - zeke@sikelianos.com
63
+ executables:
64
+ - microsoft_ngram
65
+ extensions: []
66
+ extra_rdoc_files: []
67
+ files:
68
+ - .gitignore
69
+ - .rspec
70
+ - Gemfile
71
+ - History.txt
72
+ - Manifest.txt
73
+ - README.md
74
+ - Rakefile
75
+ - bin/microsoft_ngram
76
+ - examples/segment.rb
77
+ - lib/microsoft_ngram.rb
78
+ - lib/microsoft_ngram/version.rb
79
+ - microsoft_ngram.gemspec
80
+ - spec/microsoft_ngram_spec.rb
81
+ - spec/spec_helper.rb
82
+ homepage: http://developer.wordnik.com
83
+ licenses: []
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ none: false
96
+ requirements:
97
+ - - ! '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubyforge_project: microsoft_ngram
102
+ rubygems_version: 1.8.12
103
+ signing_key:
104
+ specification_version: 3
105
+ summary: A simple wrapper for Bing's ngram API
106
+ test_files: []